You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum_vfp.S 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/11 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /*
   * Register roles and tuning constants used by the kernels in this file.
   *
   *   N      (r0)   element count; compared against zero and split into
   *                 a count of groups of four plus a remainder
   *   X      (r1)   running load address for the input vector
   *   INC_X  (r2)   increment; scaled to a byte stride before the strided loops
   *   I      (r12)  loop counter
   *   X_PRE         byte offset ahead of the load address used by the pld hints
   *   STACKSIZE     not referenced in the visible part of this file
   */
#define N               r0
#define X               r1
#define INC_X           r2
#define I               r12

#define X_PRE           512
#define STACKSIZE       256
  42. /**************************************************************************************
  43. * Macro definitions
  44. **************************************************************************************/
  45. #if !defined(COMPLEX)
  46. #if defined(DOUBLE)
  /*
   * KERNEL_F4 (real, double precision, contiguous data)
   *
   * Prefetches X_PRE bytes ahead, then loads four consecutive doubles and
   * advances the load address past them. The absolute values of the first
   * and third element are added to d0, those of the second and fourth to d1.
   * Scratch registers: d4-d7.
   */
  .macro KERNEL_F4
        pld             [X, #X_PRE]

        vldmia.f64      X!, {d4-d5}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        vabs.f64        d5, d5

        vldmia.f64      X!, {d6-d7}             // second pair is loaded before the first is fully accumulated
        vabs.f64        d6, d6
        vadd.f64        d1, d1, d5
        vabs.f64        d7, d7
        vadd.f64        d0, d0, d6
        vadd.f64        d1, d1, d7
  .endm
  /*
   * KERNEL_F1 (real, double precision, contiguous data)
   *
   * Loads one double, advances the load address past it and adds its
   * absolute value to d0. Scratch register: d4.
   */
  .macro KERNEL_F1
        vldmia.f64      X!, {d4}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
  .endm
  /*
   * KERNEL_S4 (real, double precision, strided data)
   *
   * Four identical steps: load one double from the current address without
   * write-back, add its absolute value to d0, then step the address by
   * INC_X. INC_X is added unchanged, so it must already be a byte stride.
   * Scratch register: d4.
   */
  .macro KERNEL_S4
        .rept   4
        vldmia.f64      X, {d4}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        add             X, X, INC_X
        .endr
  .endm
  /*
   * KERNEL_S1 (real, double precision, strided data)
   *
   * Loads one double from the current address, adds its absolute value to
   * d0 and steps the address by INC_X (used as a byte stride).
   * Scratch register: d4.
   */
  .macro KERNEL_S1
        vldmia.f64      X, {d4}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        add             X, X, INC_X
  .endm
  89. #else
  /*
   * KERNEL_F4 (real, single precision, contiguous data)
   *
   * Loads four consecutive floats and advances the load address past them.
   * The absolute values of the first and third element are added to s0,
   * those of the second and fourth to s1. This variant issues no prefetch
   * of its own. Scratch registers: s4-s7.
   */
  .macro KERNEL_F4
        vldmia.f32      X!, {s4-s5}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        vabs.f32        s5, s5

        vldmia.f32      X!, {s6-s7}             // second pair is loaded before the first is fully accumulated
        vabs.f32        s6, s6
        vadd.f32        s1, s1, s5
        vabs.f32        s7, s7
        vadd.f32        s0, s0, s6
        vadd.f32        s1, s1, s7
  .endm
  /*
   * KERNEL_F1 (real, single precision, contiguous data)
   *
   * Loads one float, advances the load address past it and adds its
   * absolute value to s0. Scratch register: s4.
   */
  .macro KERNEL_F1
        vldmia.f32      X!, {s4}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
  .endm
  /*
   * KERNEL_S4 (real, single precision, strided data)
   *
   * Four identical steps: load one float from the current address without
   * write-back, add its absolute value to s0, then step the address by
   * INC_X. INC_X is added unchanged, so it must already be a byte stride.
   * Scratch register: s4.
   */
  .macro KERNEL_S4
        .rept   4
        vldmia.f32      X, {s4}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        add             X, X, INC_X
        .endr
  .endm
  /*
   * KERNEL_S1 (real, single precision, strided data)
   *
   * Loads one float from the current address, adds its absolute value to
   * s0 and steps the address by INC_X (used as a byte stride).
   * Scratch register: s4.
   */
  .macro KERNEL_S1
        vldmia.f32      X, {s4}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        add             X, X, INC_X
  .endm
  131. #endif
  132. #else
  133. #if defined(DOUBLE)
  /*
   * KERNEL_F4 (COMPLEX branch, double precision, contiguous data)
   *
   * Two identical passes. Each pass prefetches X_PRE bytes ahead, loads four
   * consecutive doubles, advances the load address past them and adds the
   * absolute values of the first and third to d0 and of the second and
   * fourth to d1. Eight doubles are consumed per expansion in total.
   * Scratch registers: d4-d7.
   */
  .macro KERNEL_F4
        .rept   2
        pld             [X, #X_PRE]

        vldmia.f64      X!, {d4-d5}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        vabs.f64        d5, d5

        vldmia.f64      X!, {d6-d7}
        vabs.f64        d6, d6
        vadd.f64        d1, d1, d5
        vabs.f64        d7, d7
        vadd.f64        d0, d0, d6
        vadd.f64        d1, d1, d7
        .endr
  .endm
  /*
   * KERNEL_F1 (COMPLEX branch, double precision, contiguous data)
   *
   * Two identical steps, each loading one double with write-back and adding
   * its absolute value to d0. Two doubles are consumed per expansion
   * (presumably the two components of one complex element).
   * Scratch register: d4.
   */
  .macro KERNEL_F1
        .rept   2
        vldmia.f64      X!, {d4}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        .endr
  .endm
  /*
   * KERNEL_S4 (COMPLEX branch, double precision, strided data)
   *
   * Four identical steps: load two adjacent doubles from the current address
   * without write-back, add both absolute values to d0, then step the
   * address by INC_X. INC_X is added unchanged, so it must already be a
   * byte stride. Scratch registers: d4-d5.
   */
  .macro KERNEL_S4
        .rept   4
        vldmia.f64      X, {d4-d5}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        vabs.f64        d5, d5
        vadd.f64        d0, d0, d5
        add             X, X, INC_X
        .endr
  .endm
  /*
   * KERNEL_S1 (COMPLEX branch, double precision, strided data)
   *
   * Loads two adjacent doubles from the current address, adds both absolute
   * values to d0 and steps the address by INC_X (used as a byte stride).
   * Scratch registers: d4-d5.
   */
  .macro KERNEL_S1
        vldmia.f64      X, {d4-d5}
        vabs.f64        d4, d4
        vadd.f64        d0, d0, d4
        vabs.f64        d5, d5
        vadd.f64        d0, d0, d5
        add             X, X, INC_X
  .endm
  200. #else
  /*
   * KERNEL_F4 (COMPLEX branch, single precision, contiguous data)
   *
   * One prefetch X_PRE bytes ahead, followed by two identical passes. Each
   * pass loads four consecutive floats, advances the load address past them
   * and adds the absolute values of the first and third to s0 and of the
   * second and fourth to s1. Eight floats are consumed per expansion.
   * Scratch registers: s4-s7.
   */
  .macro KERNEL_F4
        pld             [X, #X_PRE]             // issued once, not once per pass

        .rept   2
        vldmia.f32      X!, {s4-s5}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        vabs.f32        s5, s5

        vldmia.f32      X!, {s6-s7}
        vabs.f32        s6, s6
        vadd.f32        s1, s1, s5
        vabs.f32        s7, s7
        vadd.f32        s0, s0, s6
        vadd.f32        s1, s1, s7
        .endr
  .endm
  /*
   * KERNEL_F1 (COMPLEX branch, single precision, contiguous data)
   *
   * Two identical steps, each loading one float with write-back and adding
   * its absolute value to s0. Two floats are consumed per expansion
   * (presumably the two components of one complex element).
   * Scratch register: s4.
   */
  .macro KERNEL_F1
        .rept   2
        vldmia.f32      X!, {s4}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        .endr
  .endm
  /*
   * KERNEL_S4 (COMPLEX branch, single precision, strided data)
   *
   * Four identical steps: load two adjacent floats from the current address
   * without write-back, add both absolute values to s0, then step the
   * address by INC_X. INC_X is added unchanged, so it must already be a
   * byte stride. Scratch registers: s4-s5.
   */
  .macro KERNEL_S4
        .rept   4
        vldmia.f32      X, {s4-s5}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        vabs.f32        s5, s5
        vadd.f32        s0, s0, s5
        add             X, X, INC_X
        .endr
  .endm
  /*
   * KERNEL_S1 (COMPLEX branch, single precision, strided data)
   *
   * Loads two adjacent floats from the current address, adds both absolute
   * values to s0 and steps the address by INC_X (used as a byte stride).
   * Scratch registers: s4-s5.
   */
  .macro KERNEL_S1
        vldmia.f32      X, {s4-s5}
        vabs.f32        s4, s4
        vadd.f32        s0, s0, s4
        vabs.f32        s5, s5
        vadd.f32        s0, s0, s5
        add             X, X, INC_X
  .endm
  266. #endif
  267. #endif
  268. /**************************************************************************************
  269. * End of macro definitions
  270. **************************************************************************************/
  /*
   * ASUM driver: sum of absolute values of a strided vector.
   *
   * In:    N     (r0)  number of elements
   *        X     (r1)  address of the first element
   *        INC_X (r2)  increment between elements, in elements
   * Out:   s0 (single) or d0 (double) holds the sum. When __ARM_PCS_VFP is
   *        not defined the result is additionally copied to r0 (single) or
   *        r0:r1 (double), presumably for the soft-float calling convention.
   * Clobb: r1, r2, r12, s0-s7 / d0-d7, condition flags.
   *
   * The result is zero when N <= 0 or INC_X <= 0. A negative increment is
   * rejected here because the strided loop would otherwise walk to
   * addresses below X.
   *
   * Two accumulators are kept (s0/s1 or d0/d1) and folded together at
   * asum_kernel_L999.
   */
  PROLOGUE

        .align 5

        movs    r12, #0                         // integer zero used to clear the accumulators
        vmov    s0, r12
        vmov    s1, r12
#if defined(DOUBLE)
        vcvt.f64.f32    d0, s0                  // convert the single-precision zeros into d0 and d1
        vcvt.f64.f32    d1, s1
#endif

        cmp     N, #0
        ble     asum_kernel_L999                // nothing to sum

        cmp     INC_X, #0
        ble     asum_kernel_L999                // zero or negative increment: result is 0

        cmp     INC_X, #1
        bne     asum_kernel_S_BEGIN             // non-unit increment uses the strided kernels

/* ---- contiguous data (increment of one) ---- */
asum_kernel_F_BEGIN:

        asrs    I, N, #2                        // I = N / 4
        beq     asum_kernel_F1                  // fewer than four elements; asrs leaves V untouched, so test Z only

        .align 5

asum_kernel_F4:

#if !defined(DOUBLE) && !defined(COMPLEX)
        pld     [ X, #X_PRE ]                   // the real single-precision KERNEL_F4 has no prefetch of its own
#endif
        KERNEL_F4                               // loop body is unrolled twice, with an exit between the copies
        subs    I, I, #1
        ble     asum_kernel_F1

        KERNEL_F4
        subs    I, I, #1
        bne     asum_kernel_F4

asum_kernel_F1:

        ands    I, N, #3                        // remainder after the groups of four
        beq     asum_kernel_L999                // ands leaves V untouched, so test Z only

asum_kernel_F10:

        KERNEL_F1
        subs    I, I, #1
        bne     asum_kernel_F10

        b       asum_kernel_L999

/* ---- strided data ---- */
asum_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
        lsl     INC_X, INC_X, #4                // INC_X * SIZE * 2
#else
        lsl     INC_X, INC_X, #3                // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
        lsl     INC_X, INC_X, #3                // INC_X * SIZE
#else
        lsl     INC_X, INC_X, #2                // INC_X * SIZE
#endif

#endif

        asrs    I, N, #2                        // I = N / 4
        beq     asum_kernel_S1

        .align 5

asum_kernel_S4:

        KERNEL_S4
        subs    I, I, #1
        bne     asum_kernel_S4

asum_kernel_S1:

        ands    I, N, #3                        // remainder after the groups of four
        beq     asum_kernel_L999

asum_kernel_S10:

        KERNEL_S1
        subs    I, I, #1
        bne     asum_kernel_S10

/* ---- common exit ---- */
asum_kernel_L999:

#if defined(DOUBLE)
        vadd.f64        d0, d0, d1              // fold both accumulators: set return value
#else
        vadd.f32        s0, s0, s1              // fold both accumulators: set return value
#endif

#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
        vmov    r0, s0                          // also hand the result back in a core register
#else
        vmov    r0, r1, d0                      // also hand the result back in a core register pair
#endif
#endif

        bx      lr

  EPILOGUE