
nrm2_vfpv3.S

/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/16 Saar
* BLASTEST : OK
* CTEST    : OK
* TEST     : OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define N       r0
#define X       r1
#define INC_X   r2

#define I       r12

#define X_PRE   512

/**************************************************************************************
* Macro definitions
**************************************************************************************/
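
/**************************************************************************************
* Register usage (documentation inferred from the code below):
*   d0 / s0         : scale -- largest |x| seen so far
*   d1 / s1         : ssq   -- scaled sum of squares; the result is scale * sqrt(ssq)
*   d2, d3 / s2, s3 : temporaries
*   d4, d5 / s4, s5 : element(s) loaded in the current iteration
*   d6 / s6         : constant 0.0        d7 / s7 : constant 1.0
*
* Per nonzero element x, each kernel applies the usual scaled update:
*   if scale >= |x| : ssq += (|x| / scale)^2
*   else            : ssq  = 1 + ssq * (scale / |x|)^2 ; scale = |x|
**************************************************************************************/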
#if !defined(COMPLEX)

#if defined(DOUBLE)

.macro KERNEL_F1

    vldmia.f64  X!, { d4 }
    vcmpe.f64   d4, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_F1_NEXT_\@ */
    vabs.f64    d4, d4
    vcmpe.f64   d0, d4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d4, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_F1_NEXT_\@ */
    vdiv.f64    d2, d0, d4      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d4          // scale = x

1:  /* KERNEL_F1_NEXT_\@: */

.endm

.macro KERNEL_F8

    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1

.endm

.macro KERNEL_S1

    vldmia.f64  X, { d4 }
    vcmpe.f64   d4, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         KERNEL_S1_NEXT
    vabs.f64    d4, d4
    vcmpe.f64   d0, d4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d4, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         KERNEL_S1_NEXT
    vdiv.f64    d2, d0, d4      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d4          // scale = x

KERNEL_S1_NEXT:

    add     X, X, INC_X

.endm

#else

.macro KERNEL_F1

    vldmia.f32  X!, { s4 }
    vcmpe.f32   s4, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_F1_NEXT_\@ */
    vabs.f32    s4, s4
    vcmpe.f32   s0, s4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s4, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_F1_NEXT_\@ */
    vdiv.f32    s2, s0, s4      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s4          // scale = x

1:  /* KERNEL_F1_NEXT_\@: */

.endm

.macro KERNEL_F8

    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1

.endm

.macro KERNEL_S1

    vldmia.f32  X, { s4 }
    vcmpe.f32   s4, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         KERNEL_S1_NEXT
    vabs.f32    s4, s4
    vcmpe.f32   s0, s4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s4, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         KERNEL_S1_NEXT
    vdiv.f32    s2, s0, s4      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s4          // scale = x

KERNEL_S1_NEXT:

    add     X, X, INC_X

.endm

#endif
#else

#if defined(DOUBLE)

.macro KERNEL_F1

    vldmia.f64  X!, { d4 - d5 }
    vcmpe.f64   d4, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_F1_NEXT_\@ */
    vabs.f64    d4, d4
    vcmpe.f64   d0, d4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d4, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_F1_NEXT_\@ */
    vdiv.f64    d2, d0, d4      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d4          // scale = x

1:  /* KERNEL_F1_NEXT_\@: */

    vcmpe.f64   d5, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         2f              /* KERNEL_F1_END_\@ */
    vabs.f64    d5, d5
    vcmpe.f64   d0, d5          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d5, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         2f              /* KERNEL_F1_END_\@ */
    vdiv.f64    d2, d0, d5      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d5          // scale = x

2:  /* KERNEL_F1_END_\@: */

.endm

.macro KERNEL_F8

    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1

.endm

.macro KERNEL_S1

    vldmia.f64  X, { d4 - d5 }
    vcmpe.f64   d4, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_S1_NEXT_\@ */
    vabs.f64    d4, d4
    vcmpe.f64   d0, d4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d4, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_S1_NEXT_\@ */
    vdiv.f64    d2, d0, d4      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d4          // scale = x

1:  /* KERNEL_S1_NEXT_\@: */

    vcmpe.f64   d5, d6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         2f              /* KERNEL_S1_END_\@ */
    vabs.f64    d5, d5
    vcmpe.f64   d0, d5          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f64  d2, d5, d0      // scale >= x ? x / scale
    vmlage.f64  d1, d2, d2      // ssq += ( x/scale ) * ( x/scale )
    bge         2f              /* KERNEL_S1_END_\@ */
    vdiv.f64    d2, d0, d5      // scale / x
    vmul.f64    d2, d2, d2      // ( scale / x ) * ( scale / x )
    vmul.f64    d3, d1, d2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f64    d1, d3, d7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f64    d0, d5          // scale = x

2:  /* KERNEL_S1_END_\@: */

    add     X, X, INC_X

.endm

#else

.macro KERNEL_F1

    vldmia.f32  X!, { s4 - s5 }
    vcmpe.f32   s4, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_F1_NEXT_\@ */
    vabs.f32    s4, s4
    vcmpe.f32   s0, s4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s4, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_F1_NEXT_\@ */
    vdiv.f32    s2, s0, s4      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s4          // scale = x

1:  /* KERNEL_F1_NEXT_\@: */

    vcmpe.f32   s5, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         2f              /* KERNEL_F1_END_\@ */
    vabs.f32    s5, s5
    vcmpe.f32   s0, s5          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s5, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         2f              /* KERNEL_F1_END_\@ */
    vdiv.f32    s2, s0, s5      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s5          // scale = x

2:  /* KERNEL_F1_END_\@: */

.endm

.macro KERNEL_F8

    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    pld     [ X, #X_PRE ]
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1
    KERNEL_F1

.endm

.macro KERNEL_S1

    vldmia.f32  X, { s4 - s5 }
    vcmpe.f32   s4, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         1f              /* KERNEL_S1_NEXT_\@ */
    vabs.f32    s4, s4
    vcmpe.f32   s0, s4          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s4, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         1f              /* KERNEL_S1_NEXT_\@ */
    vdiv.f32    s2, s0, s4      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s4          // scale = x

1:  /* KERNEL_S1_NEXT_\@: */

    vcmpe.f32   s5, s6          // compare with 0.0
    vmrs        APSR_nzcv, fpscr
    beq         2f              /* KERNEL_S1_END_\@ */
    vabs.f32    s5, s5
    vcmpe.f32   s0, s5          // compare with scale
    vmrs        APSR_nzcv, fpscr
    vdivge.f32  s2, s5, s0      // scale >= x ? x / scale
    vmlage.f32  s1, s2, s2      // ssq += ( x/scale ) * ( x/scale )
    bge         2f              /* KERNEL_S1_END_\@ */
    vdiv.f32    s2, s0, s5      // scale / x
    vmul.f32    s2, s2, s2      // ( scale / x ) * ( scale / x )
    vmul.f32    s3, s1, s2      // ssq * ( scale / x ) * ( scale / x )
    vadd.f32    s1, s3, s7      // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
    vmov.f32    s0, s5          // scale = x

2:  /* KERNEL_S1_END_\@: */

    add     X, X, INC_X

.endm

#endif

#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/

    PROLOGUE

    .align 5

#if defined(DOUBLE)
    movs            r12, #0
    vmov.f32        s0, r12     // scale=0.0
    vcvt.f64.f32    d0, s0
    vmov.f64        d1, #1.0    // ssq=1.0
    vmov.f64        d7, d1      // value 1.0
    vmov.f64        d6, d0      // value 0.0
#else
    movs            r12, #0
    vmov.f32        s0, r12     // scale=0.0
    vmov.f32        s1, #1.0    // ssq=1.0
    vmov.f32        s7, s1      // value 1.0
    vmov.f32        s6, s0      // value 0.0
#endif

    cmp     N, #0
    ble     nrm2_kernel_L999

    cmp     INC_X, #0
    beq     nrm2_kernel_L999

    cmp     INC_X, #1
    bne     nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:

    asrs    I, N, #3            // I = N / 8
    ble     nrm2_kernel_F1

nrm2_kernel_F8:

    KERNEL_F8

    subs    I, I, #1
    bne     nrm2_kernel_F8

nrm2_kernel_F1:

    ands    I, N, #7
    ble     nrm2_kernel_L999

nrm2_kernel_F10:

    KERNEL_F1

    subs    I, I, #1
    bne     nrm2_kernel_F10

    b       nrm2_kernel_L999

nrm2_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
    lsl     INC_X, INC_X, #4    // INC_X * SIZE * 2
#else
    lsl     INC_X, INC_X, #3    // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
    lsl     INC_X, INC_X, #3    // INC_X * SIZE
#else
    lsl     INC_X, INC_X, #2    // INC_X * SIZE
#endif

#endif

nrm2_kernel_S1:

    mov     I, N

    .align 5

nrm2_kernel_S10:

    KERNEL_S1

    subs    I, I, #1
    bne     nrm2_kernel_S10

nrm2_kernel_L999:

#if defined(DOUBLE)
    vsqrt.f64   d1, d1
    vmul.f64    d0, d0, d1
#else
    vsqrt.f32   s1, s1
    vmul.f32    s0, s0, s1
#endif

#if !defined(__ARM_PCS_VFP)
#if defined(DOUBLE)
    vmov    r0, r1, d0
#else
    vmov    r0, s0
#endif
#endif

    bx      lr

    EPILOGUE
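
For reference, below is a minimal C sketch (not part of the assembly file; the function name and prototype are illustrative, not OpenBLAS API) of the same scaled-sum-of-squares update that the KERNEL_F1/KERNEL_S1 macros implement for the real, double-precision case. The kernel receives roughly these arguments in r0-r2 (N, X, INC_X) and its result is scale * sqrt(ssq), which avoids intermediate overflow and underflow by always squaring ratios no larger than 1.

#include <math.h>

/* Illustrative sketch only: mirrors the per-element update done by the
 * KERNEL_F1/KERNEL_S1 macros above (real, double-precision case).
 * Function name and prototype are hypothetical. */
static double nrm2_sketch(long n, const double *x, long inc_x)
{
    double scale = 0.0;             /* d0: largest |x| seen so far      */
    double ssq   = 1.0;             /* d1: sum of (|x|/scale)^2         */

    if (n <= 0 || inc_x == 0)       /* same early exits as the PROLOGUE */
        return scale * sqrt(ssq);   /* = 0.0                            */

    for (long i = 0; i < n; i++, x += inc_x) {  /* add X, X, INC_X      */
        double xi = *x;
        if (xi != 0.0) {            /* beq: zero elements are skipped   */
            double ax = fabs(xi);   /* vabs                             */
            if (scale >= ax) {      /* vdivge/vmlage path               */
                double t = ax / scale;
                ssq += t * t;
            } else {                /* rescale ssq to the larger scale  */
                double t = scale / ax;
                ssq = 1.0 + ssq * t * t;
                scale = ax;
            }
        }
    }
    return scale * sqrt(ssq);       /* nrm2_kernel_L999                 */
}

The complex kernels apply exactly the same update twice per loaded pair (real part in d4/s4, imaginary part in d5/s5), and the strided KERNEL_S1 variant differs only in advancing X by the pre-scaled INC_X instead of post-incrementing the load.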