You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

iamax_vfp.S 7.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/14 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Register and constant aliases used by the kernel below. */
  /* STACKSIZE is not referenced anywhere in the visible code. */
  36. #define STACKSIZE 256
  /* r0: element count on entry; also receives the return value at iamax_kernel_L999. */
  37. #define N r0
  /* r1: address of the next element to load. */
  38. #define X r1
  /* r2: stride; compared against 0 and 1 on entry, then scaled to bytes for the strided path. */
  39. #define INC_X r2
  /* r3: position of the best element found so far (0 until INIT_F/INIT_S sets it to 1). */
  40. #define INDEX r3
  /* r4: 1-based position of the element currently being examined; r4 is pushed/popped by the kernel. */
  41. #define Z r4
  /* r12: loop counter. */
  42. #define I r12
  /* Offset added to X in the pld prefetch hints of the unit-stride loop. */
  43. #define X_PRE 512
  44. /**************************************************************************************
  45. * Macro definitions
  46. **************************************************************************************/
  /* VABS(x0,x1): used by the real (non-COMPLEX) macros on each loaded value.
     With USE_ABS it expands to vabs.f64 / vabs.f32 (x0 = |x1|) depending on DOUBLE;
     without USE_ABS it expands to a plain nop and both arguments are ignored. */
  47. #if defined(USE_ABS)
  48. #if defined(DOUBLE)
  49. #define VABS(x0,x1) vabs.f64 x0, x1
  50. #else
  51. #define VABS(x0,x1) vabs.f32 x0, x1
  52. #endif
  53. #else
  54. #define VABS(x0,x1) nop
  55. #endif
  56. /*****************************************************************************************/
  /* MOVCOND / VMOVCOND: conditional moves issued after the candidate has been
     compared against the current best value.
     With USE_MIN they use the "lt" condition (candidate < best), otherwise "gt"
     (candidate > best). MOVCOND is the core-register form, VMOVCOND the VFP form
     in the precision selected by DOUBLE. */
  57. #if defined(USE_MIN)
  58. #define MOVCOND movlt
  59. #if defined(DOUBLE)
  60. #define VMOVCOND vmovlt.f64
  61. #else
  62. #define VMOVCOND vmovlt.f32
  63. #endif
  64. #else
  65. #define MOVCOND movgt
  66. #if defined(DOUBLE)
  67. #define VMOVCOND vmovgt.f64
  68. #else
  69. #define VMOVCOND vmovgt.f32
  70. #endif
  71. #endif
  72. /*****************************************************************************************/
  73. #if !defined(COMPLEX)
  74. #if defined(DOUBLE)
  /* Real, double-precision variants. d0 holds the best value so far, d4 the candidate. */
  /* INIT_F (unit stride): load the first element into d0 (the load post-increments X),
     apply VABS, and start both the position counter Z and the result INDEX at 1. */
  75. .macro INIT_F
  76. vldmia.f64 X!, { d0 }
  77. VABS( d0, d0 )
  78. mov Z, #1
  79. mov INDEX, Z
  80. .endm
  /* KERNEL_F1 (unit stride): load the next element into d4, advance Z, apply VABS and
     compare d4 with d0. vmrs copies the VFP comparison flags into the core flags so
     that VMOVCOND/MOVCOND can replace the best value and its index when the
     condition selected by USE_MIN holds. */
  81. .macro KERNEL_F1
  82. vldmia.f64 X!, { d4 }
  83. add Z, Z, #1
  84. VABS( d4, d4 )
  85. vcmpe.f64 d4, d0
  86. vmrs APSR_nzcv, fpscr
  87. VMOVCOND d0, d4
  88. MOVCOND INDEX, Z
  89. .endm
  /* INIT_S (strided): same as INIT_F, but the load leaves X unchanged and X is then
     advanced by INC_X, which iamax_kernel_S_BEGIN has already scaled to bytes. */
  90. .macro INIT_S
  91. vldmia.f64 X, { d0 }
  92. VABS( d0, d0 )
  93. mov Z, #1
  94. mov INDEX, Z
  95. add X, X, INC_X
  96. .endm
  /* KERNEL_S1 (strided): same as KERNEL_F1, with X advanced by INC_X at the end. */
  97. .macro KERNEL_S1
  98. vldmia.f64 X, { d4 }
  99. add Z, Z, #1
  100. VABS( d4, d4 )
  101. vcmpe.f64 d4, d0
  102. vmrs APSR_nzcv, fpscr
  103. VMOVCOND d0, d4
  104. MOVCOND INDEX, Z
  105. add X, X, INC_X
  106. .endm
  107. #else
  /* Real, single-precision variants. s0 holds the best value so far, s4 the candidate. */
  /* INIT_F (unit stride): load the first element into s0 (the load post-increments X),
     apply VABS, and start both the position counter Z and the result INDEX at 1. */
  108. .macro INIT_F
  109. vldmia.f32 X!, { s0 }
  110. VABS( s0, s0 )
  111. mov Z, #1
  112. mov INDEX, Z
  113. .endm
  /* KERNEL_F1 (unit stride): load the next element into s4, advance Z, apply VABS and
     compare s4 with s0. vmrs copies the VFP comparison flags into the core flags so
     that VMOVCOND/MOVCOND can replace the best value and its index when the
     condition selected by USE_MIN holds. */
  114. .macro KERNEL_F1
  115. vldmia.f32 X!, { s4 }
  116. add Z, Z, #1
  117. VABS( s4, s4 )
  118. vcmpe.f32 s4, s0
  119. vmrs APSR_nzcv, fpscr
  120. VMOVCOND s0, s4
  121. MOVCOND INDEX, Z
  122. .endm
  /* INIT_S (strided): same as INIT_F, but the load leaves X unchanged and X is then
     advanced by INC_X, which iamax_kernel_S_BEGIN has already scaled to bytes. */
  123. .macro INIT_S
  124. vldmia.f32 X, { s0 }
  125. VABS( s0, s0 )
  126. mov Z, #1
  127. mov INDEX, Z
  128. add X, X, INC_X
  129. .endm
  /* KERNEL_S1 (strided): same as KERNEL_F1, with X advanced by INC_X at the end. */
  130. .macro KERNEL_S1
  131. vldmia.f32 X, { s4 }
  132. add Z, Z, #1
  133. VABS( s4, s4 )
  134. vcmpe.f32 s4, s0
  135. vmrs APSR_nzcv, fpscr
  136. VMOVCOND s0, s4
  137. MOVCOND INDEX, Z
  138. add X, X, INC_X
  139. .endm
  140. #endif
  141. #else
  142. #if defined(DOUBLE)
  /* Complex, double-precision variants. Each element is two consecutive doubles
     (presumably the real and imaginary parts); the value compared is |a| + |b|.
     vabs is applied unconditionally here: VABS / USE_ABS is not consulted.
     d0 holds the best value so far, d4 the candidate. */
  /* INIT_F (unit stride): load the first pair into d0-d1 (the load post-increments X),
     reduce it to |d0| + |d1| in d0, and start Z and INDEX at 1. */
  143. .macro INIT_F
  144. vldmia.f64 X!, { d0 -d1 }
  145. vabs.f64 d0, d0
  146. vabs.f64 d1, d1
  147. vadd.f64 d0 , d0, d1
  148. mov Z, #1
  149. mov INDEX, Z
  150. .endm
  /* KERNEL_F1 (unit stride): load the next pair into d4-d5, advance Z, reduce to
     |d4| + |d5| in d4 and compare with d0. vmrs copies the VFP comparison flags into
     the core flags so that VMOVCOND/MOVCOND can replace the best value and its index
     when the condition selected by USE_MIN holds. */
  151. .macro KERNEL_F1
  152. vldmia.f64 X!, { d4 - d5 }
  153. add Z, Z, #1
  154. vabs.f64 d4, d4
  155. vabs.f64 d5, d5
  156. vadd.f64 d4 , d4, d5
  157. vcmpe.f64 d4, d0
  158. vmrs APSR_nzcv, fpscr
  159. VMOVCOND d0, d4
  160. MOVCOND INDEX, Z
  161. .endm
  /* INIT_S (strided): same as INIT_F, but the load leaves X unchanged and X is then
     advanced by INC_X, which iamax_kernel_S_BEGIN has already scaled to bytes. */
  162. .macro INIT_S
  163. vldmia.f64 X, { d0 -d1 }
  164. vabs.f64 d0, d0
  165. vabs.f64 d1, d1
  166. vadd.f64 d0 , d0, d1
  167. mov Z, #1
  168. mov INDEX, Z
  169. add X, X, INC_X
  170. .endm
  /* KERNEL_S1 (strided): same as KERNEL_F1, with X advanced by INC_X at the end. */
  171. .macro KERNEL_S1
  172. vldmia.f64 X, { d4 - d5 }
  173. add Z, Z, #1
  174. vabs.f64 d4, d4
  175. vabs.f64 d5, d5
  176. vadd.f64 d4 , d4, d5
  177. vcmpe.f64 d4, d0
  178. vmrs APSR_nzcv, fpscr
  179. VMOVCOND d0, d4
  180. MOVCOND INDEX, Z
  181. add X, X, INC_X
  182. .endm
  183. #else
  /* Complex, single-precision variants. Each element is two consecutive floats
     (presumably the real and imaginary parts); the value compared is |a| + |b|.
     vabs is applied unconditionally here: VABS / USE_ABS is not consulted.
     s0 holds the best value so far, s4 the candidate. */
  /* INIT_F (unit stride): load the first pair into s0-s1 (the load post-increments X),
     reduce it to |s0| + |s1| in s0, and start Z and INDEX at 1. */
  184. .macro INIT_F
  185. vldmia.f32 X!, { s0 -s1 }
  186. vabs.f32 s0, s0
  187. vabs.f32 s1, s1
  188. vadd.f32 s0 , s0, s1
  189. mov Z, #1
  190. mov INDEX, Z
  191. .endm
  /* KERNEL_F1 (unit stride): load the next pair into s4-s5, advance Z, reduce to
     |s4| + |s5| in s4 and compare with s0. vmrs copies the VFP comparison flags into
     the core flags so that VMOVCOND/MOVCOND can replace the best value and its index
     when the condition selected by USE_MIN holds. */
  192. .macro KERNEL_F1
  193. vldmia.f32 X!, { s4 - s5 }
  194. add Z, Z, #1
  195. vabs.f32 s4, s4
  196. vabs.f32 s5, s5
  197. vadd.f32 s4 , s4, s5
  198. vcmpe.f32 s4, s0
  199. vmrs APSR_nzcv, fpscr
  200. VMOVCOND s0, s4
  201. MOVCOND INDEX, Z
  202. .endm
  /* INIT_S (strided): same as INIT_F, but the load leaves X unchanged and X is then
     advanced by INC_X, which iamax_kernel_S_BEGIN has already scaled to bytes. */
  203. .macro INIT_S
  204. vldmia.f32 X, { s0 -s1 }
  205. vabs.f32 s0, s0
  206. vabs.f32 s1, s1
  207. vadd.f32 s0 , s0, s1
  208. mov Z, #1
  209. mov INDEX, Z
  210. add X, X, INC_X
  211. .endm
  /* KERNEL_S1 (strided): same as KERNEL_F1, with X advanced by INC_X at the end. */
  212. .macro KERNEL_S1
  213. vldmia.f32 X, { s4 - s5 }
  214. add Z, Z, #1
  215. vabs.f32 s4, s4
  216. vabs.f32 s5, s5
  217. vadd.f32 s4 , s4, s5
  218. vcmpe.f32 s4, s0
  219. vmrs APSR_nzcv, fpscr
  220. VMOVCOND s0, s4
  221. MOVCOND INDEX, Z
  222. add X, X, INC_X
  223. .endm
  224. #endif
  225. #endif
  226. /**************************************************************************************
  227. * End of macro definitions
  228. **************************************************************************************/
  /**************************************************************************************
  * Kernel entry point.
  *
  * In:   r0 (N)     number of elements
  *       r1 (X)     address of the first element
  *       r2 (INC_X) stride in elements
  * Out:  r0         1-based position of the selected element (greater-than search by
  *                  default, less-than with USE_MIN); 0 when N <= 0 or INC_X <= 0
  * Uses: r3 (INDEX), r4 (Z, saved/restored on the stack), r12 (I),
  *       s0/d0 and s4/d4 (plus s1/d1 and s5/d5 when COMPLEX), flags
  *
  * Candidates replace the current best only on a strict comparison, so the
  * earliest position wins when values are equal.
  *
  * Fix: a non-positive INC_X now returns 0. Previously only INC_X == 0 was rejected;
  * a negative INC_X entered the strided path, was scaled to a negative byte stride and
  * made the loop read memory located before X.
  **************************************************************************************/
  229. PROLOGUE
  230. .align 5
  /* Z lives in r4, so save it for the caller. */
  231. push {r4}
  232. movs r12, #0 // clear floating point register
  233. vmov s0, r12
  234. #if defined(DOUBLE)
  235. vcvt.f64.f32 d0, s0
  236. #endif
  /* Default result for every early exit below. */
  237. mov INDEX, #0
  238. cmp N, #0
  239. ble iamax_kernel_L999
  /* Reject zero AND negative strides (signed compare). */
  240. cmp INC_X, #0
  241. ble iamax_kernel_L999
  /* INC_X == 1 takes the unit-stride path, anything else the strided path. */
  242. cmp INC_X, #1
  243. bne iamax_kernel_S_BEGIN
  /* ---- Unit-stride path ---- */
  244. iamax_kernel_F_BEGIN:
  /* The first element becomes the initial best; N then counts the remaining elements. */
  245. INIT_F
  246. subs N, N , #1
  247. ble iamax_kernel_L999
  248. asrs I, N, #2 // I = N / 4
  249. ble iamax_kernel_F1
  250. .align 5
  /* Unrolled loop: each half handles 4 elements and consumes one count of I.
     The first half leaves the loop when I reaches 0, the second half loops back
     while I is non-zero. */
  251. iamax_kernel_F4:
  252. pld [ X, #X_PRE ]
  253. KERNEL_F1
  254. KERNEL_F1
  255. #if defined(COMPLEX) && defined(DOUBLE)
  256. pld [ X, #X_PRE ]
  257. #endif
  258. KERNEL_F1
  259. KERNEL_F1
  260. subs I, I, #1
  261. ble iamax_kernel_F1
  262. #if defined(COMPLEX) || defined(DOUBLE)
  263. pld [ X, #X_PRE ]
  264. #endif
  265. KERNEL_F1
  266. KERNEL_F1
  267. #if defined(COMPLEX) && defined(DOUBLE)
  268. pld [ X, #X_PRE ]
  269. #endif
  270. KERNEL_F1
  271. KERNEL_F1
  272. subs I, I, #1
  273. bne iamax_kernel_F4
  /* Tail: the N % 4 elements left over by the unrolled loop, one at a time. */
  274. iamax_kernel_F1:
  275. ands I, N, #3
  276. ble iamax_kernel_L999
  277. iamax_kernel_F10:
  278. KERNEL_F1
  279. subs I, I, #1
  280. bne iamax_kernel_F10
  281. b iamax_kernel_L999
  /* ---- Strided path ---- */
  282. iamax_kernel_S_BEGIN:
  /* Convert INC_X from elements to bytes for the selected element type. */
  283. #if defined(COMPLEX)
  284. #if defined(DOUBLE)
  285. lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
  286. #else
  287. lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
  288. #endif
  289. #else
  290. #if defined(DOUBLE)
  291. lsl INC_X, INC_X, #3 // INC_X * SIZE
  292. #else
  293. lsl INC_X, INC_X, #2 // INC_X * SIZE
  294. #endif
  295. #endif
  /* The first element becomes the initial best; N then counts the remaining elements. */
  296. INIT_S
  297. subs N, N , #1
  298. ble iamax_kernel_L999
  299. asrs I, N, #2 // I = N / 4
  300. ble iamax_kernel_S1
  301. .align 5
  /* Four elements per iteration, I iterations. */
  302. iamax_kernel_S4:
  303. KERNEL_S1
  304. KERNEL_S1
  305. KERNEL_S1
  306. KERNEL_S1
  307. subs I, I, #1
  308. bne iamax_kernel_S4
  /* Tail: the N % 4 elements left over by the unrolled loop, one at a time. */
  309. iamax_kernel_S1:
  310. ands I, N, #3
  311. ble iamax_kernel_L999
  312. iamax_kernel_S10:
  313. KERNEL_S1
  314. subs I, I, #1
  315. bne iamax_kernel_S10
  /* Common exit: return INDEX and restore r4. */
  316. iamax_kernel_L999:
  317. mov r0, INDEX // set return value
  318. pop {r4}
  319. bx lr
  320. EPILOGUE