You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t_vfp.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/29 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes of stack reserved by the entry code below the pushed registers. */
  #define STACKSIZE 256

  /*
   * Stack-passed arguments, addressed from fp. The entry code sets
   * fp = sp + 28 right after pushing seven registers, so fp holds the
   * value sp had on entry.
   */
  #if defined(__ARM_PCS_VFP)
  /* Matrix pointer arrives in r3; alpha is expected in d0/d1 already. */
  #define OLD_LDA [fp, #0 ]
  #define X [fp, #4 ]
  #define OLD_INC_X [fp, #8 ]
  #define Y [fp, #12 ]
  #define OLD_INC_Y [fp, #16 ]
  #else
  /* Alpha and the matrix pointer are on the stack; the entry code loads them. */
  #define OLD_ALPHAR [fp, #0 ]
  #define OLD_ALPHAI [fp, #8 ]
  #define OLD_A_SOFTFP [fp, #16]
  #define OLD_LDA [fp, #20]
  #define X [fp, #24]
  #define OLD_INC_X [fp, #28]
  #define Y [fp, #32]
  #define OLD_INC_Y [fp, #36]
  #endif

  /* Incoming register arguments. */
  #define M r0
  #define OLD_N r1
  #define OLD_A r3

  /*
   * Working registers. AO1 shares r1 with OLD_N; the entry code spills
   * OLD_N to the N stack slot before AO1 is first loaded.
   */
  #define AO1 r1
  #define J r2
  #define AO2 r4
  #define XO r5
  #define YO r6
  #define LDA r7
  #define INC_X r8
  #define INC_Y r9
  #define I r12

  /*
   * Local slots below fp. FP_ZERO / FP_ZERO_0 name the low word and
   * FP_ZERO_1 the high word of an 8-byte zero that the entry code writes
   * and the INIT_* macros read back as a double.
   */
  #define FP_ZERO [fp, #-228]
  #define FP_ZERO_0 [fp, #-228]
  #define FP_ZERO_1 [fp, #-224]
  #define N [fp, #-252 ]
  #define A [fp, #-256 ]

  /* Presumably prefetch distances; not referenced in the visible code. */
  #define X_PRE 512
  #define A_PRE 512
  #define Y_PRE 32
  73. /**************************************************************************************
  74. * Macro definitions
  75. **************************************************************************************/
  /*
   * Instruction selection for the conjugation variants.
   * vmla.f64 adds the product to the destination, vmls.f64 subtracts it.
   *
   * KMAC_R / KMAC_I fold the products that involve the second double of a
   * matrix element into the first / second accumulator. Their choice
   * depends only on whether CONJ and XCONJ agree.
   */
  #if (defined(CONJ) && defined(XCONJ)) || (!defined(CONJ) && !defined(XCONJ))
  #define KMAC_R vmls.f64
  #define KMAC_I vmla.f64
  #else
  #define KMAC_R vmla.f64
  #define KMAC_I vmls.f64
  #endif

  /*
   * FMAC_* are used by the SAVE_* macros when scaling an accumulator pair
   * by d0/d1 and adding it to y. FMAC_R1 and FMAC_I2 always add; the
   * other two depend only on XCONJ.
   */
  #define FMAC_R1 vmla.f64
  #define FMAC_I2 vmla.f64
  #if defined(XCONJ)
  #define FMAC_R2 vmla.f64
  #define FMAC_I1 vmls.f64
  #else
  #define FMAC_R2 vmls.f64
  #define FMAC_I1 vmla.f64
  #endif
  /* Clear the four accumulators d12 - d15 used by the two-column unit-stride path. */
  .macro INIT_F2
        vldr            d12, FP_ZERO            // 0.0 from the stack slot
        vmov.f64        d13, d12
        vmov.f64        d14, d12
        vmov.f64        d15, d12
  .endm
  /* Four back-to-back KERNEL_F2X1 steps (unrolled body of the M / 4 loop). */
  .macro KERNEL_F2X4
        .rept   4
        KERNEL_F2X1
        .endr
  .endm
  /*
   * One step of the two-column unit-stride path.
   * Loads one pair of doubles from XO, AO1 and AO2 (all post-incremented)
   * and accumulates AO1 * x into d12/d13 and AO2 * x into d14/d15.
   * KMAC_R / KMAC_I pick add or subtract for the terms that use the
   * second double of the matrix element.
   * Overwrites d2 - d5, d8, d9.
   */
  .macro KERNEL_F2X1
        vldmia          XO!,  { d2 - d3 }       // x element
        vldmia          AO1!, { d4 - d5 }       // element of first column

        vmla.f64        d12, d4, d2
        vmla.f64        d13, d4, d3
        vldmia          AO2!, { d8 - d9 }       // element of second column
        KMAC_R          d12, d5, d3
        KMAC_I          d13, d5, d2

        vmla.f64        d14, d8, d2
        vmla.f64        d15, d8, d3
        KMAC_R          d14, d9, d3
        KMAC_I          d15, d9, d2
  .endm
  /*
   * Update two consecutive y elements in place (unit stride):
   * y0 gets the d12/d13 accumulator pair scaled by d0/d1, y1 gets d14/d15.
   * YO is advanced past both elements by the store. Overwrites d4 - d7.
   */
  .macro SAVE_F2
        vldmia          YO, { d4 - d7 }         // y0 in d4/d5, y1 in d6/d7

        FMAC_R1         d4, d0, d12
        FMAC_I1         d5, d0, d13
        FMAC_R2         d4, d1, d13
        FMAC_I2         d5, d1, d12

        FMAC_R1         d6, d0, d14
        FMAC_I1         d7, d0, d15
        FMAC_R2         d6, d1, d15
        FMAC_I2         d7, d1, d14

        vstmia          YO!, { d4 - d7 }
  .endm
  142. /************************************************************************************************/
  /* Clear the accumulator pair d12/d13 for the single-column unit-stride path. */
  .macro INIT_F1
        vldr            d12, FP_ZERO            // 0.0 from the stack slot
        vmov.f64        d13, d12
  .endm
  /* Four back-to-back KERNEL_F1X1 steps (unrolled body of the M / 4 loop). */
  .macro KERNEL_F1X4
        .rept   4
        KERNEL_F1X1
        .endr
  .endm
  /*
   * One step of the single-column unit-stride path.
   * Loads one pair of doubles from XO and AO1 (both post-incremented) and
   * accumulates their product into d12/d13. Overwrites d2 - d5.
   */
  .macro KERNEL_F1X1
        vldmia          XO!,  { d2 - d3 }       // x element
        vldmia          AO1!, { d4 - d5 }       // matrix element

        vmla.f64        d12, d4, d2
        vmla.f64        d13, d4, d3
        KMAC_R          d12, d5, d3
        KMAC_I          d13, d5, d2
  .endm
  /*
   * Update one y element in place (unit stride): adds the d12/d13
   * accumulator pair scaled by d0/d1, then advances YO past the element.
   * Overwrites d4, d5.
   */
  .macro SAVE_F1
        vldmia          YO, { d4 - d5 }

        FMAC_R1         d4, d0, d12
        FMAC_I1         d5, d0, d13
        FMAC_R2         d4, d1, d13
        FMAC_I2         d5, d1, d12

        vstmia          YO!, { d4 - d5 }
  .endm
  169. /************************************************************************************************/
  /* Clear the four accumulators d12 - d15 used by the two-column strided path. */
  .macro INIT_S2
        vldr            d12, FP_ZERO            // 0.0 from the stack slot
        vmov.f64        d13, d12
        vmov.f64        d14, d12
        vmov.f64        d15, d12
  .endm
  /* Four back-to-back KERNEL_S2X1 steps (unrolled body of the M / 4 loop). */
  .macro KERNEL_S2X4
        .rept   4
        KERNEL_S2X1
        .endr
  .endm
  /*
   * One step of the two-column strided path.
   * Same arithmetic as the unit-stride two-column step, but XO is read
   * without write-back and then advanced by INC_X, which the caller has
   * already converted to a byte offset. AO1 and AO2 are post-incremented.
   * Overwrites d2 - d5, d8, d9.
   */
  .macro KERNEL_S2X1
        vldmia          XO,   { d2 - d3 }       // x element
        vldmia          AO1!, { d4 - d5 }       // element of first column
        vldmia          AO2!, { d8 - d9 }       // element of second column

        vmla.f64        d12, d4, d2
        vmla.f64        d13, d4, d3
        KMAC_R          d12, d5, d3
        KMAC_I          d13, d5, d2

        vmla.f64        d14, d8, d2
        vmla.f64        d15, d8, d3
        KMAC_R          d14, d9, d3
        KMAC_I          d15, d9, d2

        add             XO, XO, INC_X           // step to the next x element
  .endm
  /*
   * Update two y elements that are INC_Y bytes apart:
   * the first gets the d12/d13 accumulator pair scaled by d0/d1, the
   * second gets d14/d15. YO is advanced by INC_Y after each element.
   * Overwrites d4 - d7.
   */
  .macro SAVE_S2
        // first element
        vldmia          YO, { d4 - d5 }
        FMAC_R1         d4, d0, d12
        FMAC_I1         d5, d0, d13
        FMAC_R2         d4, d1, d13
        FMAC_I2         d5, d1, d12
        vstmia          YO, { d4 - d5 }
        add             YO, YO, INC_Y

        // second element
        vldmia          YO, { d6 - d7 }
        FMAC_R1         d6, d0, d14
        FMAC_I1         d7, d0, d15
        FMAC_R2         d6, d1, d15
        FMAC_I2         d7, d1, d14
        vstmia          YO, { d6 - d7 }
        add             YO, YO, INC_Y
  .endm
  212. /************************************************************************************************/
  /* Clear the accumulator pair d12/d13 for the single-column strided path. */
  .macro INIT_S1
        vldr            d12, FP_ZERO            // 0.0 from the stack slot
        vmov.f64        d13, d12
  .endm
  /* Four back-to-back KERNEL_S1X1 steps (unrolled body of the M / 4 loop). */
  .macro KERNEL_S1X4
        .rept   4
        KERNEL_S1X1
        .endr
  .endm
  /*
   * One step of the single-column strided path.
   * XO is read without write-back and then advanced by INC_X (a byte
   * offset); AO1 is post-incremented. Accumulates into d12/d13.
   * Overwrites d2 - d5.
   */
  .macro KERNEL_S1X1
        vldmia          XO,   { d2 - d3 }       // x element
        vldmia          AO1!, { d4 - d5 }       // matrix element

        vmla.f64        d12, d4, d2
        vmla.f64        d13, d4, d3
        KMAC_R          d12, d5, d3
        KMAC_I          d13, d5, d2

        add             XO, XO, INC_X           // step to the next x element
  .endm
  /*
   * Update one y element in place: adds the d12/d13 accumulator pair
   * scaled by d0/d1, then advances YO by INC_Y (a byte offset).
   * Overwrites d4, d5.
   */
  .macro SAVE_S1
        vldmia          YO, { d4 - d5 }

        FMAC_R1         d4, d0, d12
        FMAC_I1         d5, d0, d13
        FMAC_R2         d4, d1, d13
        FMAC_I2         d5, d1, d12

        vstmia          YO, { d4 - d5 }
        add             YO, YO, INC_Y
  .endm
  241. /**************************************************************************************
  242. * End of macro definitions
  243. **************************************************************************************/
  /**************************************************************************************
  * Kernel entry (the symbol itself is emitted by PROLOGUE from common.h).
  *
  * For each of the N matrix columns, forms the dot product of that column
  * (M elements of two doubles each, columns LDA elements apart) with x and
  * adds it, scaled by d0/d1, to the matching y element. CONJ / XCONJ
  * select the sign variants through the KMAC_* and FMAC_* macros.
  *
  * In:   r0 = M, r1 = N, r3 = A (hard-float) or A on the stack (soft-float),
  *       d0/d1 = alpha (loaded from the stack in the soft-float case),
  *       LDA, X, INC_X, Y, INC_Y on the stack.
  * Out:  r0 = 0.
  * Does nothing when M <= 0, N <= 0, INC_X == 0 or INC_Y == 0.
  *
  * Every load, store and multiply-accumulate in this file works on
  * doubles, so the register save area and the byte scaling below are
  * written for 16-byte elements unconditionally.
  **************************************************************************************/
  PROLOGUE

        .align  5

        push    {r4 - r9, fp}
        add     fp, sp, #28                     // fp = value of sp on entry
        sub     sp, sp, #STACKSIZE              // reserve stack

        // The loop macros overwrite d8, d9 and d12 - d15, which the
        // procedure call standard requires a callee to preserve.
        // Always spill the full d8 - d15 range.
        sub     r12, fp, #192
        vstm    r12, { d8 - d15 }               // store floating point registers

        // Build the 8-byte zero that the INIT_* macros load as a double.
        movs    r12, #0
        str     r12, FP_ZERO
        str     r12, FP_ZERO_1

        cmp     M, #0
        ble     zgemvt_kernel_L999

        cmp     OLD_N, #0
        ble     zgemvt_kernel_L999

  #if !defined(__ARM_PCS_VFP)
        vldr    d0, OLD_ALPHAR
        vldr    d1, OLD_ALPHAI
        ldr     OLD_A, OLD_A_SOFTFP
  #endif

        str     OLD_A, A                        // current column base lives on the stack
        str     OLD_N, N                        // r1 is reused as AO1 from here on

        ldr     INC_X, OLD_INC_X
        ldr     INC_Y, OLD_INC_Y

        cmp     INC_X, #0
        beq     zgemvt_kernel_L999

        cmp     INC_Y, #0
        beq     zgemvt_kernel_L999

        ldr     LDA, OLD_LDA
        lsl     LDA, LDA, #4                    // LDA * 16 bytes per element

        cmp     INC_X, #1
        bne     zgemvt_kernel_S2_BEGIN

        cmp     INC_Y, #1
        bne     zgemvt_kernel_S2_BEGIN

  /*************************************************************************************************************/
  /* Unit-stride x and y                                                                                       */
  /*************************************************************************************************************/

  zgemvt_kernel_F2_BEGIN:

        ldr     YO, Y

        // M and N are known to be positive here, so the shifted and masked
        // counts below are never negative. asrs and ands do not update the
        // V flag, so the zero test is done with beq rather than ble.
        ldr     J, N
        asrs    J, J, #1                        // J = N / 2
        beq     zgemvt_kernel_F1_BEGIN

  zgemvt_kernel_F2X4:

        ldr     AO1, A
        add     AO2, AO1, LDA
        add     r3, AO2, LDA
        str     r3, A                           // next pair of columns

        ldr     XO, X

        INIT_F2

        asrs    I, M, #2                        // I = M / 4
        beq     zgemvt_kernel_F2X1

  zgemvt_kernel_F2X4_10:

        KERNEL_F2X4

        subs    I, I, #1
        bne     zgemvt_kernel_F2X4_10

  zgemvt_kernel_F2X1:

        ands    I, M, #3                        // I = M % 4
        beq     zgemvt_kernel_F2_END

  zgemvt_kernel_F2X1_10:

        KERNEL_F2X1

        subs    I, I, #1
        bne     zgemvt_kernel_F2X1_10

  zgemvt_kernel_F2_END:

        SAVE_F2

        subs    J, J, #1
        bne     zgemvt_kernel_F2X4

  zgemvt_kernel_F1_BEGIN:

        ldr     J, N
        ands    J, J, #1                        // one column left over?
        beq     zgemvt_kernel_L999

  zgemvt_kernel_F1X4:

        ldr     AO1, A

        ldr     XO, X

        INIT_F1

        asrs    I, M, #2                        // I = M / 4
        beq     zgemvt_kernel_F1X1

  zgemvt_kernel_F1X4_10:

        KERNEL_F1X4

        subs    I, I, #1
        bne     zgemvt_kernel_F1X4_10

  zgemvt_kernel_F1X1:

        ands    I, M, #3                        // I = M % 4
        beq     zgemvt_kernel_F1_END

  zgemvt_kernel_F1X1_10:

        KERNEL_F1X1

        subs    I, I, #1
        bne     zgemvt_kernel_F1X1_10

  zgemvt_kernel_F1_END:

        SAVE_F1

        b       zgemvt_kernel_L999

  /*************************************************************************************************************/
  /* Strided x and/or y                                                                                        */
  /*************************************************************************************************************/

  zgemvt_kernel_S2_BEGIN:

        lsl     INC_X, INC_X, #4                // INC_X * 16 bytes per element
        lsl     INC_Y, INC_Y, #4                // INC_Y * 16 bytes per element

        ldr     YO, Y

        ldr     J, N
        asrs    J, J, #1                        // J = N / 2
        beq     zgemvt_kernel_S1_BEGIN

  zgemvt_kernel_S2X4:

        ldr     AO1, A
        add     AO2, AO1, LDA
        add     r3, AO2, LDA
        str     r3, A                           // next pair of columns

        ldr     XO, X

        INIT_S2

        asrs    I, M, #2                        // I = M / 4
        beq     zgemvt_kernel_S2X1

  zgemvt_kernel_S2X4_10:

        KERNEL_S2X4

        subs    I, I, #1
        bne     zgemvt_kernel_S2X4_10

  zgemvt_kernel_S2X1:

        ands    I, M, #3                        // I = M % 4
        beq     zgemvt_kernel_S2_END

  zgemvt_kernel_S2X1_10:

        KERNEL_S2X1

        subs    I, I, #1
        bne     zgemvt_kernel_S2X1_10

  zgemvt_kernel_S2_END:

        SAVE_S2

        subs    J, J, #1
        bne     zgemvt_kernel_S2X4

  zgemvt_kernel_S1_BEGIN:

        ldr     J, N
        ands    J, J, #1                        // one column left over?
        beq     zgemvt_kernel_L999

  zgemvt_kernel_S1X4:

        ldr     AO1, A

        ldr     XO, X

        INIT_S1

        asrs    I, M, #2                        // I = M / 4
        beq     zgemvt_kernel_S1X1

  zgemvt_kernel_S1X4_10:

        KERNEL_S1X4

        subs    I, I, #1
        bne     zgemvt_kernel_S1X4_10

  zgemvt_kernel_S1X1:

        ands    I, M, #3                        // I = M % 4
        beq     zgemvt_kernel_S1_END

  zgemvt_kernel_S1X1_10:

        KERNEL_S1X1

        subs    I, I, #1
        bne     zgemvt_kernel_S1X1_10

  zgemvt_kernel_S1_END:

        SAVE_S1

  /*************************************************************************************************************/

  zgemvt_kernel_L999:

        sub     r3, fp, #192
        vldm    r3, { d8 - d15 }                // restore floating point registers

        mov     r0, #0                          // set return value

        sub     sp, fp, #28
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE