You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

axpy_vfp.S 9.2 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/14 Saar
  29. * BLASTEST : xOK
  30. * CTEST : xOK
  31. * TEST : xOK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  // Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE).
  #define STACKSIZE       256

  // Byte offset used by the pld prefetch hints in the unit-stride kernels.
  #define X_PRE           512

  // Stack-passed arguments, addressed relative to fp. The prologue pushes two
  // registers and then sets fp = sp + 8, so [fp, #0] is the first word that
  // was on the stack when the function was entered.
  #if defined(ARM_SOFTFP_ABI)
  // soft-float: X is also read from the stack
  #define OLD_X           [fp, #0]
  #define OLD_INC_X       [fp, #4]
  #define OLD_Y           [fp, #8]
  #define OLD_INC_Y       [fp, #12]
  #else
  // hard-float
  #define OLD_INC_X       [fp, #0]
  #define OLD_Y           [fp, #4]
  #define OLD_INC_Y       [fp, #8]
  #endif

  // Register roles used by the macros and by the kernel body.
  #define N               r0
  #define Y               r1
  #define INC_X           r2
  #define X               r3
  #define INC_Y           r4
  #define I               r12
  55. /**************************************************************************************
  56. * Macro definitions
  57. **************************************************************************************/
  58. /*****************************************************************************************/
  // Multiply-accumulate opcodes for the complex kernels. For one complex
  // element the kernels issue, in this order:
  //     FMAC_R1 yr, a0, xr        FMAC_R2 yr, a1, xi
  //     FMAC_I1 yi, a0, xi        FMAC_I2 yi, a1, xr
  // where a0/a1 are s0/s1 (d0/d1 for DOUBLE) -- presumably the real and
  // imaginary parts of alpha; confirm against the caller.
  // vmla adds the product to the destination, vmls subtracts it, hence:
  //     without CONJ:  yr += a0*xr - a1*xi      yi += a0*xi + a1*xr
  //     with CONJ:     yr += a0*xr + a1*xi      yi += a1*xr - a0*xi
  #if defined(DOUBLE)

  #define FMAC_R1 vmla.f64
  #define FMAC_I2 vmla.f64
  #if defined(CONJ)
  #define FMAC_R2 vmla.f64
  #define FMAC_I1 vmls.f64
  #else
  #define FMAC_R2 vmls.f64
  #define FMAC_I1 vmla.f64
  #endif

  #else // single precision

  #define FMAC_R1 vmla.f32
  #define FMAC_I2 vmla.f32
  #if defined(CONJ)
  #define FMAC_R2 vmla.f32
  #define FMAC_I1 vmls.f32
  #else
  #define FMAC_R2 vmls.f32
  #define FMAC_I1 vmla.f32
  #endif

  #endif
  84. #if !defined(COMPLEX)
  85. #if defined(DOUBLE)
  // Real double precision, unit stride: y[0..3] += d0 * x[0..3].
  // Four doubles are loaded from X (post-incremented) into d4-d7 and four
  // from Y into d8-d11; each updated value is then stored through Y with
  // writeback, so both pointers end up 32 bytes further on.
  // Prefetches X_PRE bytes ahead on both streams. Overwrites d4-d11.
  .macro KERNEL_F4
          pld             [X, #X_PRE]
          vldmia.f64      X!, {d4-d7}
          pld             [Y, #X_PRE]
          vldmia.f64      Y, {d8-d11}             // no writeback: the stores below advance Y

          vmla.f64        d8, d0, d4
          vstmia.f64      Y!, {d8}
          vmla.f64        d9, d0, d5
          vstmia.f64      Y!, {d9}
          vmla.f64        d10, d0, d6
          vstmia.f64      Y!, {d10}
          vmla.f64        d11, d0, d7
          vstmia.f64      Y!, {d11}
  .endm
  // Real double precision, unit stride, one element: y[0] += d0 * x[0].
  // X advances on the load, Y advances on the store (8 bytes each).
  // Overwrites d4 and d8.
  .macro KERNEL_F1
          vldmia.f64      X!, {d4}
          vldmia.f64      Y, {d8}
          vmla.f64        d8, d0, d4
          vstmia.f64      Y!, {d8}
  .endm
  // Real double precision, strided, one element: y[0] += d0 * x[0].
  // Neither access uses writeback; the pointers are stepped explicitly by
  // INC_X / INC_Y, which the caller has already scaled to bytes.
  // Overwrites d4 and d8.
  .macro KERNEL_S1
          vldmia.f64      X, {d4}
          vldmia.f64      Y, {d8}
          vmla.f64        d8, d0, d4
          vstmia.f64      Y, {d8}

          add             X, X, INC_X
          add             Y, Y, INC_Y
  .endm
  114. #else
  // Real single precision, unit stride: y[0..3] += s0 * x[0..3].
  // Four floats are loaded from X (post-incremented) into s4-s7 and four
  // from Y into s8-s11; each updated value is stored through Y with
  // writeback, so both pointers end up 16 bytes further on.
  // This variant issues no prefetch itself. Overwrites s4-s11.
  .macro KERNEL_F4
          vldmia.f32      X!, {s4-s7}
          vldmia.f32      Y, {s8-s11}             // no writeback: the stores below advance Y

          vmla.f32        s8, s0, s4
          vstmia.f32      Y!, {s8}
          vmla.f32        s9, s0, s5
          vstmia.f32      Y!, {s9}
          vmla.f32        s10, s0, s6
          vstmia.f32      Y!, {s10}
          vmla.f32        s11, s0, s7
          vstmia.f32      Y!, {s11}
  .endm
  // Real single precision, unit stride, one element: y[0] += s0 * x[0].
  // X advances on the load, Y advances on the store (4 bytes each).
  // Overwrites s4 and s8.
  .macro KERNEL_F1
          vldmia.f32      X!, {s4}
          vldmia.f32      Y, {s8}
          vmla.f32        s8, s0, s4
          vstmia.f32      Y!, {s8}
  .endm
  // Real single precision, strided, one element: y[0] += s0 * x[0].
  // Neither access uses writeback; the pointers are stepped explicitly by
  // INC_X / INC_Y, which the caller has already scaled to bytes.
  // Overwrites s4 and s8.
  .macro KERNEL_S1
          vldmia.f32      X, {s4}
          vldmia.f32      Y, {s8}
          vmla.f32        s8, s0, s4
          vstmia.f32      Y, {s8}

          add             X, X, INC_X
          add             Y, Y, INC_Y
  .endm
  141. #endif
  142. #else
  143. #if defined(DOUBLE)
  // Complex double precision, unit stride: updates four complex elements.
  // The work is two identical passes over two elements each, so it is
  // emitted with .rept. Per pass:
  //   d4,d5 / d6,d7   = (re, im) of two consecutive x elements
  //   d8,d9 / d10,d11 = (re, im) of the matching y elements
  // Each y component is updated through the FMAC_* opcodes with d0 and d1 as
  // the scalar operands and stored through Y with writeback.
  // X and Y both advance by 64 bytes in total. Overwrites d4-d11.
  .macro KERNEL_F4
  .rept 2
          pld             [X, #X_PRE]
          vldmia.f64      X!, {d4-d7}
          pld             [Y, #X_PRE]
          vldmia.f64      Y, {d8-d11}             // no writeback: the stores below advance Y

          // first element of the pair
          FMAC_R1         d8, d0, d4
          FMAC_R2         d8, d1, d5
          FMAC_I1         d9, d0, d5
          FMAC_I2         d9, d1, d4
          vstmia.f64      Y!, {d8}
          vstmia.f64      Y!, {d9}

          // second element of the pair
          FMAC_R1         d10, d0, d6
          FMAC_R2         d10, d1, d7
          FMAC_I1         d11, d0, d7
          FMAC_I2         d11, d1, d6
          vstmia.f64      Y!, {d10}
          vstmia.f64      Y!, {d11}
  .endr
  .endm
  // Complex double precision, unit stride, one element.
  // d4,d5 = (re, im) of x; d8,d9 = (re, im) of y, updated through the FMAC_*
  // opcodes with d0 and d1 as the scalar operands.
  // X advances on the load, Y on the two stores (16 bytes each).
  .macro KERNEL_F1
          vldmia.f64      X!, {d4-d5}
          vldmia.f64      Y, {d8-d9}

          FMAC_R1         d8, d0, d4
          FMAC_R2         d8, d1, d5
          FMAC_I1         d9, d0, d5
          FMAC_I2         d9, d1, d4

          vstmia.f64      Y!, {d8}
          vstmia.f64      Y!, {d9}
  .endm
  // Complex double precision, strided, one element.
  // d4,d5 = (re, im) of x; d8,d9 = (re, im) of y, updated through the FMAC_*
  // opcodes with d0 and d1 as the scalar operands and written back in place.
  // The pointers are then stepped by INC_X / INC_Y, which the caller has
  // already scaled to bytes.
  .macro KERNEL_S1
          vldmia.f64      X, {d4-d5}
          vldmia.f64      Y, {d8-d9}

          FMAC_R1         d8, d0, d4
          FMAC_R2         d8, d1, d5
          FMAC_I1         d9, d0, d5
          FMAC_I2         d9, d1, d4

          vstmia.f64      Y, {d8-d9}

          add             X, X, INC_X
          add             Y, Y, INC_Y
  .endm
  199. #else
  // Complex single precision, unit stride: updates four complex elements in
  // two passes of two elements each. Per pass:
  //   s4,s5 / s6,s7   = (re, im) of two consecutive x elements
  //   s8,s9 / s10,s11 = (re, im) of the matching y elements
  // Each y component is updated through the FMAC_* opcodes with s0 and s1 as
  // the scalar operands and stored through Y with writeback.
  // Only the first pass issues prefetch hints, so the passes are kept
  // unrolled. X and Y both advance by 32 bytes in total. Overwrites s4-s11.
  .macro KERNEL_F4
          // ---- elements 0 and 1 ----
          pld             [X, #X_PRE]
          vldmia.f32      X!, {s4-s7}
          pld             [Y, #X_PRE]
          vldmia.f32      Y, {s8-s11}             // no writeback: the stores below advance Y

          FMAC_R1         s8, s0, s4
          FMAC_R2         s8, s1, s5
          FMAC_I1         s9, s0, s5
          FMAC_I2         s9, s1, s4
          vstmia.f32      Y!, {s8}
          vstmia.f32      Y!, {s9}

          FMAC_R1         s10, s0, s6
          FMAC_R2         s10, s1, s7
          FMAC_I1         s11, s0, s7
          FMAC_I2         s11, s1, s6
          vstmia.f32      Y!, {s10}
          vstmia.f32      Y!, {s11}

          // ---- elements 2 and 3 ----
          vldmia.f32      X!, {s4-s7}
          vldmia.f32      Y, {s8-s11}

          FMAC_R1         s8, s0, s4
          FMAC_R2         s8, s1, s5
          FMAC_I1         s9, s0, s5
          FMAC_I2         s9, s1, s4
          vstmia.f32      Y!, {s8}
          vstmia.f32      Y!, {s9}

          FMAC_R1         s10, s0, s6
          FMAC_R2         s10, s1, s7
          FMAC_I1         s11, s0, s7
          FMAC_I2         s11, s1, s6
          vstmia.f32      Y!, {s10}
          vstmia.f32      Y!, {s11}
  .endm
  // Complex single precision, unit stride, one element.
  // s4,s5 = (re, im) of x; s8,s9 = (re, im) of y, updated through the FMAC_*
  // opcodes with s0 and s1 as the scalar operands.
  // X advances on the load, Y on the two stores (8 bytes each).
  .macro KERNEL_F1
          vldmia.f32      X!, {s4-s5}
          vldmia.f32      Y, {s8-s9}

          FMAC_R1         s8, s0, s4
          FMAC_R2         s8, s1, s5
          FMAC_I1         s9, s0, s5
          FMAC_I2         s9, s1, s4

          vstmia.f32      Y!, {s8}
          vstmia.f32      Y!, {s9}
  .endm
  // Complex single precision, strided, one element.
  // s4,s5 = (re, im) of x; s8,s9 = (re, im) of y, updated through the FMAC_*
  // opcodes with s0 and s1 as the scalar operands and written back in place.
  // The pointers are then stepped by INC_X / INC_Y, which the caller has
  // already scaled to bytes.
  .macro KERNEL_S1
          vldmia.f32      X, {s4-s5}
          vldmia.f32      Y, {s8-s9}

          FMAC_R1         s8, s0, s4
          FMAC_R2         s8, s1, s5
          FMAC_I1         s9, s0, s5
          FMAC_I2         s9, s1, s4

          vstmia.f32      Y, {s8-s9}

          add             X, X, INC_X
          add             Y, Y, INC_Y
  .endm
  253. #endif
  254. #endif
  255. /**************************************************************************************
  256. * End of macro definitions
  257. **************************************************************************************/
  // ---------------------------------------------------------------------------
  // AXPY kernel: y[i] += alpha * x[i] for N elements. An element is one
  // float/double, or a (re, im) pair when COMPLEX is defined; CONJ changes the
  // signs applied by the FMAC_* opcodes.
  //
  // In:   r0       = N (nothing is done when N <= 0)
  //       s0 / d0  = scalar operand read by the KERNEL_* macros; the complex
  //                  kernels also read s1 / d1
  //       r3       = X, except in the soft-float single precision build, where
  //                  r3 carries alpha and X is loaded from OLD_X
  //       OLD_INC_X, OLD_Y, OLD_INC_Y = fp-relative stack arguments;
  //                  increments are in elements and are scaled to bytes below
  // Out:  r0 = 0
  // Saves and restores r4, fp and d8-d15 (s8-s15 in single precision builds)
  // in a STACKSIZE-byte scratch frame.
  //
  // Paths: INC_X == 1 and INC_Y == 1 use the unit-stride kernels
  // (KERNEL_F4 / KERNEL_F1); every other combination, including zero and
  // negative increments, uses the strided kernel KERNEL_S1.
  // ---------------------------------------------------------------------------
  PROLOGUE

          .align 5

          push    {r4, fp}
          add     fp, sp, #8                      // fp = sp at entry: [fp, #0] is the first stack argument
          sub     sp, sp, #STACKSIZE              // reserve stack

  // NOTE(review): the soft-float path below only covers real single precision.
  // With DOUBLE nothing is moved into d0 and X is taken from r3 as is; with
  // COMPLEX s1 is never loaded. Confirm that ARM_SOFTFP_ABI builds never use
  // this file for those variants.
  #ifdef ARM_SOFTFP_ABI
  #ifndef DOUBLE
          vmov    s0, r3                          // move alpha to s0
          ldr     X, OLD_X
  #endif
  #endif

          ldr     INC_X, OLD_INC_X
          ldr     Y, OLD_Y
          ldr     INC_Y, OLD_INC_Y

          sub     r12, fp, #128                   // save area inside the reserved frame
  #if defined(DOUBLE)
          vstm    r12, {d8 - d15}                 // store floating point registers
  #else
          vstm    r12, {s8 - s15}                 // store floating point registers
  #endif

          cmp     N, #0
          ble     axpy_kernel_L999

          // Zero increments are not rejected. Reference BLAS AXPY still performs
          // N updates in that case: INC_X == 0 adds alpha * x[0] to every y
          // element and INC_Y == 0 accumulates every term into y[0]. The
          // strided loop does exactly that, because KERNEL_S1 reloads and
          // stores y for each element and a byte stride of 0 leaves the
          // pointer in place.

          cmp     INC_X, #1
          bne     axpy_kernel_S_BEGIN

          cmp     INC_Y, #1
          bne     axpy_kernel_S_BEGIN

  // ---- unit stride ----------------------------------------------------------
  axpy_kernel_F_BEGIN:

          // N > 0 here, so the quotient is never negative. asrs/ands leave the
          // V flag untouched, so the zero test uses beq instead of a signed
          // condition that would read a stale V.
          asrs    I, N, #2                        // I = N / 4
          beq     axpy_kernel_F1

          .align 5

  axpy_kernel_F4:

  #if !defined(COMPLEX) && !defined(DOUBLE)
          // the real single precision KERNEL_F4 has no prefetch of its own
          pld     [X, #X_PRE]
          pld     [Y, #X_PRE]
  #endif

          KERNEL_F4

          subs    I, I, #1
          ble     axpy_kernel_F1

          KERNEL_F4

          subs    I, I, #1
          bne     axpy_kernel_F4

  axpy_kernel_F1:

          ands    I, N, #3                        // I = N % 4
          beq     axpy_kernel_L999

  axpy_kernel_F10:

          KERNEL_F1

          subs    I, I, #1
          bne     axpy_kernel_F10

          b       axpy_kernel_L999

  // ---- general stride -------------------------------------------------------
  axpy_kernel_S_BEGIN:

          // convert the element increments into byte strides
  #if defined(COMPLEX)

  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #4                // INC_X * SIZE * 2
          lsl     INC_Y, INC_Y, #4                // INC_Y * SIZE * 2
  #else
          lsl     INC_X, INC_X, #3                // INC_X * SIZE * 2
          lsl     INC_Y, INC_Y, #3                // INC_Y * SIZE * 2
  #endif

  #else

  #if defined(DOUBLE)
          lsl     INC_X, INC_X, #3                // INC_X * SIZE
          lsl     INC_Y, INC_Y, #3                // INC_Y * SIZE
  #else
          lsl     INC_X, INC_X, #2                // INC_X * SIZE
          lsl     INC_Y, INC_Y, #2                // INC_Y * SIZE
  #endif

  #endif

          asrs    I, N, #2                        // I = N / 4
          beq     axpy_kernel_S1

          .align 5

  axpy_kernel_S4:

          KERNEL_S1
          KERNEL_S1
          KERNEL_S1
          KERNEL_S1

          subs    I, I, #1
          bne     axpy_kernel_S4

  axpy_kernel_S1:

          ands    I, N, #3                        // I = N % 4
          beq     axpy_kernel_L999

  axpy_kernel_S10:

          KERNEL_S1

          subs    I, I, #1
          bne     axpy_kernel_S10

  // ---- common exit ----------------------------------------------------------
  axpy_kernel_L999:

          sub     r3, fp, #128                    // same save area as in the prologue
  #if defined(DOUBLE)
          vldm    r3, {d8 - d15}                  // restore floating point registers
  #else
          vldm    r3, {s8 - s15}                  // restore floating point registers
  #endif

          mov     r0, #0                          // set return value
          sub     sp, fp, #8                      // drop the scratch frame
          pop     {r4, fp}
          bx      lr

  EPILOGUE