You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

sum_vfp.S 7.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
  29. **************************************************************************************/
  30. #define ASSEMBLER
  31. #include "common.h"
  /* Symbolic names for the registers and constants used by this kernel. */
  #define STACKSIZE 256           /* not referenced by the code visible in this file */

  #define N       r0              /* element count */
  #define X       r1              /* pointer to the current element */
  #define INC_X   r2              /* stride; shifted into a byte stride on the strided path */
  #define I       r12             /* loop counter */

  #define X_PRE   512             /* byte offset used by the pld prefetches */
  38. /**************************************************************************************
  39. * Macro definitions
  40. **************************************************************************************/
  41. #if !defined(COMPLEX)
  42. #if defined(DOUBLE)
  /*
   * Contiguous path, real double precision.
   * Adds four consecutive doubles at X into the accumulators and moves X
   * past them: the 1st and 3rd value go to d0, the 2nd and 4th to d1.
   * Scratch: d4-d7.
   */
  .macro KERNEL_F4
          pld        [ X, #X_PRE ]            // prefetch X_PRE bytes ahead of X
          vldmia.f64 X!, { d4, d5 }
          vadd.f64   d0, d0, d4
          vldmia.f64 X!, { d6, d7 }
          vadd.f64   d1, d1, d5
          vadd.f64   d0, d0, d6
          vadd.f64   d1, d1, d7
  .endm
  /*
   * Contiguous path, real double precision, one element.
   * Loads the double at X with write-back and adds it to d0.
   * Scratch: d4.
   */
  .macro KERNEL_F1
          vldmia.f64 X!, { d4 }               // X now points at the next double
          vadd.f64   d0, d0, d4
  .endm
  /*
   * Strided path, real double precision, four elements.
   * Each step loads one double at X, adds it to d0 and then adds INC_X to X
   * unchanged, so INC_X must already hold a byte stride.
   * Scratch: d4.
   */
  .macro KERNEL_S4
    .rept 4
          vldmia.f64 X, { d4 }
          vadd.f64   d0, d0, d4
          add        X, X, INC_X
    .endr
  .endm
  /*
   * Strided path, real double precision, one element.
   * Loads the double at X, adds it to d0, then steps X by INC_X (added
   * unchanged, so it must already be a byte stride).
   * Scratch: d4.
   */
  .macro KERNEL_S1
          vldmia.f64 X, { d4 }
          vadd.f64   d0, d0, d4
          add        X, X, INC_X
  .endm
  75. #else
  /*
   * Contiguous path, real single precision.
   * Adds four consecutive floats at X into the accumulators and moves X
   * past them: the 1st and 3rd value go to s0, the 2nd and 4th to s1.
   * This variant issues no prefetch of its own.
   * Scratch: s4-s7.
   */
  .macro KERNEL_F4
          vldmia.f32 X!, { s4, s5 }
          vadd.f32   s0, s0, s4
          vldmia.f32 X!, { s6, s7 }
          vadd.f32   s1, s1, s5
          vadd.f32   s0, s0, s6
          vadd.f32   s1, s1, s7
  .endm
  /*
   * Contiguous path, real single precision, one element.
   * Loads the float at X with write-back and adds it to s0.
   * Scratch: s4.
   */
  .macro KERNEL_F1
          vldmia.f32 X!, { s4 }               // X now points at the next float
          vadd.f32   s0, s0, s4
  .endm
  /*
   * Strided path, real single precision, four elements.
   * Each step loads one float at X, adds it to s0 and then adds INC_X to X
   * unchanged, so INC_X must already hold a byte stride.
   * Scratch: s4.
   */
  .macro KERNEL_S4
    .rept 4
          vldmia.f32 X, { s4 }
          vadd.f32   s0, s0, s4
          add        X, X, INC_X
    .endr
  .endm
  /*
   * Strided path, real single precision, one element.
   * Loads the float at X, adds it to s0, then steps X by INC_X (added
   * unchanged, so it must already be a byte stride).
   * Scratch: s4.
   */
  .macro KERNEL_S1
          vldmia.f32 X, { s4 }
          vadd.f32   s0, s0, s4
          add        X, X, INC_X
  .endm
  107. #endif
  108. #else
  109. #if defined(DOUBLE)
  /*
   * Contiguous path, complex double precision.
   * Consumes eight consecutive doubles (four element pairs) at X in two
   * identical halves.  Each half prefetches, loads four doubles with
   * write-back and adds them alternately into d0 and d1.
   * Scratch: d4-d7.
   */
  .macro KERNEL_F4
    .rept 2
          pld        [ X, #X_PRE ]            // prefetch X_PRE bytes ahead of X
          vldmia.f64 X!, { d4, d5 }
          vadd.f64   d0, d0, d4
          vldmia.f64 X!, { d6, d7 }
          vadd.f64   d1, d1, d5
          vadd.f64   d0, d0, d6
          vadd.f64   d1, d1, d7
    .endr
  .endm
  /*
   * Contiguous path, complex double precision, one element pair.
   * Loads two consecutive doubles one at a time with write-back and adds
   * each of them to d0.
   * Scratch: d4.
   */
  .macro KERNEL_F1
    .rept 2
          vldmia.f64 X!, { d4 }
          vadd.f64   d0, d0, d4
    .endr
  .endm
  /*
   * Strided path, complex double precision, four element pairs.
   * Each step loads the two doubles at X, adds both to d0 and then adds
   * INC_X to X unchanged, so INC_X must already hold a byte stride.
   * Scratch: d4-d5.
   */
  .macro KERNEL_S4
    .rept 4
          vldmia.f64 X, { d4, d5 }
          vadd.f64   d0, d0, d4
          vadd.f64   d0, d0, d5
          add        X, X, INC_X
    .endr
  .endm
  /*
   * Strided path, complex double precision, one element pair.
   * Loads the two doubles at X, adds both to d0, then steps X by INC_X
   * (added unchanged, so it must already be a byte stride).
   * Scratch: d4-d5.
   */
  .macro KERNEL_S1
          vldmia.f64 X, { d4, d5 }
          vadd.f64   d0, d0, d4
          vadd.f64   d0, d0, d5
          add        X, X, INC_X
  .endm
  156. #else
  /*
   * Contiguous path, complex single precision.
   * Prefetches once, then consumes eight consecutive floats (four element
   * pairs) at X in two identical halves.  Each half loads four floats with
   * write-back and adds them alternately into s0 and s1.
   * Scratch: s4-s7.
   */
  .macro KERNEL_F4
          pld        [ X, #X_PRE ]            // prefetch X_PRE bytes ahead of X
    .rept 2
          vldmia.f32 X!, { s4, s5 }
          vadd.f32   s0, s0, s4
          vldmia.f32 X!, { s6, s7 }
          vadd.f32   s1, s1, s5
          vadd.f32   s0, s0, s6
          vadd.f32   s1, s1, s7
    .endr
  .endm
  /*
   * Contiguous path, complex single precision, one element pair.
   * Loads two consecutive floats one at a time with write-back and adds
   * each of them to s0.
   * Scratch: s4.
   */
  .macro KERNEL_F1
    .rept 2
          vldmia.f32 X!, { s4 }
          vadd.f32   s0, s0, s4
    .endr
  .endm
  /*
   * Strided path, complex single precision, four element pairs.
   * Each step loads the two floats at X, adds both to s0 and then adds
   * INC_X to X unchanged, so INC_X must already hold a byte stride.
   * Scratch: s4-s5.
   */
  .macro KERNEL_S4
    .rept 4
          vldmia.f32 X, { s4, s5 }
          vadd.f32   s0, s0, s4
          vadd.f32   s0, s0, s5
          add        X, X, INC_X
    .endr
  .endm
  /*
   * Strided path, complex single precision, one element pair.
   * Loads the two floats at X, adds both to s0, then steps X by INC_X
   * (added unchanged, so it must already be a byte stride).
   * Scratch: s4-s5.
   */
  .macro KERNEL_S1
          vldmia.f32 X, { s4, s5 }
          vadd.f32   s0, s0, s4
          vadd.f32   s0, s0, s5
          add        X, X, INC_X
  .endm
  202. #endif
  203. #endif
  204. /**************************************************************************************
  205. * End of macro definitions
  206. **************************************************************************************/
  /*
   * Vector sum kernel: adds up N elements starting at X and returns the total.
   * No absolute value is taken.
   *
   * In:   N     (r0)  element count; N <= 0 yields 0
   *       X     (r1)  address of the first element
   *       INC_X (r2)  stride in elements; 0 yields 0, 1 selects the contiguous
   *                   loops, any other value the strided loops
   * Out:  s0 (single) or d0 (double); when __ARM_PCS_VFP is not defined the
   *       result is also copied to r0 (single) or r0:r1 (double)
   * Uses: r12 (I) as loop counter, s0/s1 or d0/d1 as accumulators, plus the
   *       scratch registers of the KERNEL_* macros; flags are clobbered
   *
   * With COMPLEX defined the strided path scales INC_X by two values per
   * element, i.e. every element is a pair.
   *
   * The "is the count zero" branches after asrs/ands use beq rather than ble.
   * Those instructions leave the V flag untouched, so ble would consult a V
   * left over from the earlier cmp on INC_X.  N is known to be positive at
   * those points, which makes Z the only meaningful flag.
   *
   * NOTE(review): a negative INC_X is not rejected and takes the strided path
   * with a negative byte stride - confirm that callers never pass one.
   */
  PROLOGUE

          .align 5

          movs    r12, #0                 // clear floating point register
          vmov    s0, r12
          vmov    s1, r12
  #if defined(DOUBLE)
          vcvt.f64.f32 d0, s0             // widen the zeros: d0 = d1 = 0.0
          vcvt.f64.f32 d1, s1
  #endif

          cmp     N, #0
          ble     asum_kernel_L999        // nothing to add

          cmp     INC_X, #0
          beq     asum_kernel_L999        // zero stride: return 0

          cmp     INC_X, #1
          bne     asum_kernel_S_BEGIN     // non-unit stride

  asum_kernel_F_BEGIN:

          asrs    I, N, #2                // I = N / 4
          beq     asum_kernel_F1          // fewer than four elements

          .align 5

  asum_kernel_F4:

  #if !defined(DOUBLE) && !defined(COMPLEX)
          pld     [ X, #X_PRE ]           // this KERNEL_F4 variant has no pld of its own
  #endif
          KERNEL_F4
          subs    I, I, #1
          ble     asum_kernel_F1

          KERNEL_F4
          subs    I, I, #1
          bne     asum_kernel_F4

  asum_kernel_F1:

          ands    I, N, #3                // I = N % 4 leftover elements
          beq     asum_kernel_L999

  asum_kernel_F10:

          KERNEL_F1
          subs    I, I, #1
          bne     asum_kernel_F10

          b       asum_kernel_L999

  asum_kernel_S_BEGIN:

  #if defined(COMPLEX)

  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #4        // INC_X * SIZE * 2
  #else
          lsl     INC_X, INC_X, #3        // INC_X * SIZE * 2
  #endif

  #else

  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #3        // INC_X * SIZE
  #else
          lsl     INC_X, INC_X, #2        // INC_X * SIZE
  #endif

  #endif

          asrs    I, N, #2                // I = N / 4
          beq     asum_kernel_S1          // fewer than four elements

          .align 5

  asum_kernel_S4:

          KERNEL_S4
          subs    I, I, #1
          bne     asum_kernel_S4

  asum_kernel_S1:

          ands    I, N, #3                // I = N % 4 leftover elements
          beq     asum_kernel_L999

  asum_kernel_S10:

          KERNEL_S1
          subs    I, I, #1
          bne     asum_kernel_S10

  asum_kernel_L999:

  #if defined(DOUBLE)
          vadd.f64 d0, d0, d1             // set return value
  #else
          vadd.f32 s0, s0, s1             // set return value
  #endif

  #if !defined(__ARM_PCS_VFP)
  #if !defined(DOUBLE)
          vmov    r0, s0                  // copy the result into core registers
  #else
          vmov    r0, r1, d0              // copy the result into core registers
  #endif
  #endif

          bx      lr

  EPILOGUE