You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

nrm2_vfp.S 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/22 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
// Register aliases (ARM AAPCS: the C arguments arrive in r0-r2).
#define N r0      // element count
#define X r1      // pointer to the input vector
#define INC_X r2  // stride (in elements on entry; scaled to bytes for the strided path)
#define I r12     // scratch loop counter
#define X_PRE 512 // software-prefetch distance in bytes
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
// One scaled sum-of-squares step (LAPACK dlassq style), unit stride, double:
//   x = *X++; if (x != 0) {
//     if (scale >= |x|)  ssq += (|x|/scale)^2;
//     else             { ssq = 1 + ssq*(scale/|x|)^2; scale = |x|; }
//   }
// Register roles: d0 = scale, d1 = ssq, d4 = x, d6 = 0.0, d7 = 1.0, d2/d3 scratch.
// \@ makes the label unique per expansion (the macro is instantiated many times).
.macro KERNEL_F1
vldmia.f64 X!, { d4 }        // x = *X++ (post-increment by 8 bytes)
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr        // copy FP comparison flags to APSR
beq KERNEL_F1_NEXT_\@        // x == 0 contributes nothing; skip
vabs.f64 d4, d4              // x = |x|
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// Eight unit-stride double elements; prefetch every 4 elements (32 bytes each half).
.macro KERNEL_F8
pld [ X, #X_PRE ]            // prefetch ahead of the load stream
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// Strided variant of KERNEL_F1: X is not post-incremented by the load;
// INC_X (already in bytes here) is added at the end.
// Fixed (non-\@) labels are safe: this macro is expanded exactly once,
// inside the runtime loop at nrm2_kernel_S10.
.macro KERNEL_S1
vldmia.f64 X, { d4 }         // x = *X (no pointer update)
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT           // x == 0: nothing to accumulate
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT:
add X, X, INC_X              // advance to next element (byte stride)
.endm
#else
// Single-precision unit-stride step; same dlassq-style update as the double
// version.  Register roles: s0 = scale, s1 = ssq, s4 = x, s6 = 0.0, s7 = 1.0,
// s2/s3 scratch.  \@ gives a unique label per expansion.
.macro KERNEL_F1
vldmia.f32 X!, { s4 }        // x = *X++ (post-increment by 4 bytes)
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@        // x == 0: skip
vabs.f32 s4, s4              // x = |x|
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// Eight unit-stride float elements (32 bytes total); one prefetch suffices.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// Strided single-precision step; INC_X (bytes) added at the end.
// Fixed labels are safe: expanded exactly once (at nrm2_kernel_S10).
.macro KERNEL_S1
vldmia.f32 X, { s4 }         // x = *X (no pointer update)
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT:
add X, X, INC_X              // advance by byte stride
.endm
#endif
#else
#if defined(DOUBLE)
// Complex double, unit stride: load one element (d4 = re, d5 = im) and run the
// dlassq-style update once for the real part and once for the imaginary part.
// Register roles: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2/d3 scratch.
.macro KERNEL_F1
vldmia.f64 X!, { d4 - d5 }   // d4 = re, d5 = im; X += 16 bytes
// --- real part ---
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
// --- imaginary part ---
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_F1_END_\@:
.endm
// Eight complex double elements (128 bytes); prefetch every 2 elements (32 B).
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
.endm
// Strided complex double step: real part then imaginary part, then advance X
// by INC_X (bytes).  \@ labels used here, though this macro is also expanded
// only once in practice.
.macro KERNEL_S1
vldmia.f64 X, { d4 - d5 }    // d4 = re, d5 = im (no pointer update)
// --- real part ---
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT_\@:
// --- imaginary part ---
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X              // advance by byte stride (covers re+im)
.endm
#else
// Complex single, unit stride: s4 = re, s5 = im, each run through the
// scaled-ssq update.  s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0.
.macro KERNEL_F1
vldmia.f32 X!, { s4 - s5 }   // s4 = re, s5 = im; X += 8 bytes
// --- real part ---
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
// --- imaginary part ---
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_F1_END_\@:
.endm
// Eight complex float elements (64 bytes); prefetch every 4 elements (32 B).
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// Strided complex single step: real part then imaginary part, then advance X
// by INC_X (bytes).
.macro KERNEL_S1
vldmia.f32 X, { s4 - s5 }    // s4 = re, s5 = im (no pointer update)
// --- real part ---
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT_\@:
// --- imaginary part ---
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X              // advance by byte stride (covers re+im)
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
// Computes the Euclidean norm ||x||_2 = scale * sqrt(ssq) using the
// overflow/underflow-safe scaled accumulation implemented by the kernels above.
// In:  N = n (r0), X = x (r1), INC_X = inc_x in elements (r2)
// Out: result in d0/s0 (hard-float), mirrored to r0(/r1) for soft-float ABIs.
// Notes: n <= 0 or inc_x == 0 returns 0 (scale stays 0.0).
//        inc_x < 0 falls through to the strided path with a negative byte
//        stride — assumes the caller points X appropriately; TODO confirm.
PROLOGUE
b nrm2_begin
// Constant pool (in .text, reachable by vldr): 0.0 and the IEEE-754 bit
// pattern of 1.0 in the element precision.
#if defined(COMPLEX)
#if defined(DOUBLE)
znrm2_zero:
.word 0x00000000
.word 0x00000000
znrm2_one:
.word 0x00000000
.word 0x3ff00000             // high word of double 1.0 (little-endian layout)
#else
cnrm2_zero:
.word 0x00000000
cnrm2_one:
.word 0x3f800000             // float 1.0
#endif
#else
#if defined(DOUBLE)
dnrm2_zero:
.word 0x00000000
.word 0x00000000
dnrm2_one:
.word 0x00000000
.word 0x3ff00000             // high word of double 1.0
#else
snrm2_zero:
.word 0x00000000
snrm2_one:
.word 0x3f800000             // float 1.0
#endif
#endif
.align 5
nrm2_begin:
// Initialize: scale (d0/s0) = 0.0, ssq (d1/s1) = 1.0,
// and keep the constants 0.0 (d6/s6) and 1.0 (d7/s7) for the kernels.
#if defined(COMPLEX)
#if defined(DOUBLE)
vldr.64 d0 , znrm2_zero
vldr.64 d1 , znrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vldr.32 s0 , cnrm2_zero
vldr.32 s1 , cnrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif
#else
#if defined(DOUBLE)
vldr.64 d0 , dnrm2_zero
vldr.64 d1 , dnrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vldr.32 s0 , snrm2_zero
vldr.32 s1 , snrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif
#endif
cmp N, #0
ble nrm2_kernel_L999         // n <= 0: return 0
cmp INC_X, #0
beq nrm2_kernel_L999         // inc_x == 0: return 0
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN      // non-unit stride -> strided path
nrm2_kernel_F_BEGIN:
// Unit-stride path: unrolled by 8, remainder handled one element at a time.
asrs I, N, #3 // I = N / 8
ble nrm2_kernel_F1           // fewer than 8 elements: skip unrolled loop
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F1:
ands I, N, #7                // remainder = N % 8 (also sets flags)
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_S_BEGIN:
// Strided path: convert INC_X from elements to bytes
// (x2 for complex, x8/x4 for double/single element size).
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
nrm2_kernel_S1:
mov I, N
.align 5
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
// result = scale * sqrt(ssq)
#if defined(DOUBLE)
vsqrt.f64 d1, d1
vmul.f64 d0, d0, d1
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
// Soft-float ABI returns in core registers; hard-float (VFP) returns in d0/s0.
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE