You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t_vfp.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/29 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes of scratch space the prologue reserves below the pushed core
     registers. */
  #define STACKSIZE 256

  /* Arguments passed on the stack.  The prologue pushes seven words and then
     sets fp = sp + 28, so fp equals the stack pointer on entry and [fp, #0]
     is the first stack-passed argument. */
  #define OLD_LDA [fp, #0 ]
  #define X [fp, #4 ]
  #define OLD_INC_X [fp, #8 ]
  #define Y [fp, #12 ]
  #define OLD_INC_Y [fp, #16 ]

  /* Spill slots inside the reserved scratch area.  N and A are written once
     in the prologue; A is then advanced by the two-column loops. */
  #define N [fp, #-252 ]
  #define A [fp, #-256 ]

  /* Arguments that arrive in core registers. */
  #define M r0
  #define OLD_N r1
  #define OLD_A r3

  /* Working registers.  AO1 shares r1 with OLD_N, which is why N is spilled
     to the stack before AO1 is first loaded. */
  #define AO1 r1
  #define J r2
  #define AO2 r4
  #define XO r5
  #define YO r6
  #define LDA r7
  #define INC_X r8
  #define INC_Y r9
  #define I r12

  /* Prefetch distances.  Not referenced by any code visible in this file. */
  #define X_PRE 512
  #define A_PRE 512
  #define Y_PRE 32
  59. /**************************************************************************************
  60. * Macro definitions
  61. **************************************************************************************/
  /* Opcode selection for the four CONJ / XCONJ combinations.
     fmacd accumulates +a*b into the destination, fnmacd accumulates -a*b.

     KMAC_R / KMAC_I are the cross terms of the complex product inside the
     kernels; they depend only on whether CONJ and XCONJ agree.
     FMAC_R2 / FMAC_I1 are used when scaling the accumulators into y; they
     depend only on XCONJ.  FMAC_R1 and FMAC_I2 are fmacd in every case. */
  #define FMAC_R1 fmacd
  #define FMAC_I2 fmacd

  #if (defined(CONJ) && defined(XCONJ)) || (!defined(CONJ) && !defined(XCONJ))
  #define KMAC_R fnmacd
  #define KMAC_I fmacd
  #else
  #define KMAC_R fmacd
  #define KMAC_I fnmacd
  #endif

  #if defined(XCONJ)
  #define FMAC_R2 fmacd
  #define FMAC_I1 fnmacd
  #else
  #define FMAC_R2 fnmacd
  #define FMAC_I1 fmacd
  #endif
  /* INIT_F2: clear the two complex accumulators d12:d13 and d14:d15.
     The previous x - x idiom yields NaN whenever the register still holds
     NaN or +/-Inf, so a real +0.0 is built from a core register instead.
     Clobbers I (every call site reloads I immediately afterwards).
     Flags are left untouched. */
  .macro INIT_F2
        mov     I, #0
        vmov    d12, I, I               /* d12 = +0.0 (all bits zero) */
        vmov.f64 d13, d12
        vmov.f64 d14, d12
        vmov.f64 d15, d12
  .endm
  /* KERNEL_F2X4: four back-to-back KERNEL_F2X1 steps (unit stride, two
     columns). */
  .macro KERNEL_F2X4
        .rept   4
        KERNEL_F2X1
        .endr
  .endm
  /* KERNEL_F2X1: one step over two columns with unit-stride x.
     Loads one complex x value (d2 = re, d3 = im) and one complex value from
     each of AO1 (d4, d5) and AO2 (d8, d9), post-incrementing all three
     pointers by 16 bytes.  Accumulates into d12:d13 (column AO1) and
     d14:d15 (column AO2); the sign of the cross terms comes from
     KMAC_R / KMAC_I. */
  .macro KERNEL_F2X1
        vldmia  XO! , { d2 - d3 }
        vldmia  AO1!, { d4 - d5 }
        vmla.f64 d12 , d4 , d2
        vmla.f64 d13 , d4 , d3
        vldmia  AO2!, { d8 - d9 }
        KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
        vmla.f64 d14 , d8 , d2
        vmla.f64 d15 , d8 , d3
        KMAC_R  d14 , d9 , d3
        KMAC_I  d15 , d9 , d2
  .endm
  /* SAVE_F2: fold both accumulators into two consecutive complex y values.
     Reads four doubles at YO, adds the accumulators scaled by d0 / d1, writes
     them back and advances YO by 32 bytes.
     NOTE(review): d0 and d1 are never written in this file; presumably they
     carry the real and imaginary parts of alpha from the caller - confirm. */
  .macro SAVE_F2
        vldmia  YO, { d4 - d7 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
        vstmia  YO!, { d4 - d7 }
  .endm
  128. /************************************************************************************************/
  /* INIT_F1: clear the single complex accumulator d12:d13.
     A real +0.0 is loaded rather than computing x - x, which would give NaN
     if the register still held NaN or +/-Inf.
     Clobbers I (every call site reloads I immediately afterwards).
     Flags are left untouched. */
  .macro INIT_F1
        mov     I, #0
        vmov    d12, I, I               /* d12 = +0.0 (all bits zero) */
        vmov.f64 d13, d12
  .endm
  /* KERNEL_F1X4: four back-to-back KERNEL_F1X1 steps (unit stride, one
     column). */
  .macro KERNEL_F1X4
        .rept   4
        KERNEL_F1X1
        .endr
  .endm
  /* KERNEL_F1X1: one step over a single column with unit-stride x.
     Loads one complex x value (d2, d3) and one complex value from AO1
     (d4, d5), post-incrementing both pointers by 16 bytes, and accumulates
     the product into d12:d13. */
  .macro KERNEL_F1X1
        vldmia  XO! , { d2 - d3 }
        vldmia  AO1!, { d4 - d5 }
        vmla.f64 d12 , d4 , d2
        vmla.f64 d13 , d4 , d3
        KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
  .endm
  /* SAVE_F1: fold accumulator d12:d13 into one complex y value.
     Reads two doubles at YO, adds the accumulator scaled by d0 / d1, writes
     them back and advances YO by 16 bytes. */
  .macro SAVE_F1
        vldmia  YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        vstmia  YO!, { d4 - d5 }
  .endm
  155. /************************************************************************************************/
  /* INIT_S2: clear the two complex accumulators d12:d13 and d14:d15 for the
     strided two-column path.
     A real +0.0 is loaded rather than computing x - x, which would give NaN
     if the register still held NaN or +/-Inf.
     Clobbers I (every call site reloads I immediately afterwards).
     Flags are left untouched. */
  .macro INIT_S2
        mov     I, #0
        vmov    d12, I, I               /* d12 = +0.0 (all bits zero) */
        vmov.f64 d13, d12
        vmov.f64 d14, d12
        vmov.f64 d15, d12
  .endm
  /* KERNEL_S2X4: four back-to-back KERNEL_S2X1 steps (strided x, two
     columns). */
  .macro KERNEL_S2X4
        .rept   4
        KERNEL_S2X1
        .endr
  .endm
  /* KERNEL_S2X1: one step over two columns with strided x.
     Same arithmetic as the unit-stride two-column step, except that XO is
     not post-incremented by the load; it is advanced by INC_X (already a
     byte offset) at the end. */
  .macro KERNEL_S2X1
        vldmia  XO , { d2 - d3 }
        vldmia  AO1!, { d4 - d5 }
        vldmia  AO2!, { d8 - d9 }
        vmla.f64 d12 , d4 , d2
        vmla.f64 d13 , d4 , d3
        KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
        vmla.f64 d14 , d8 , d2
        vmla.f64 d15 , d8 , d3
        KMAC_R  d14 , d9 , d3
        KMAC_I  d15 , d9 , d2
        add     XO, XO, INC_X
  .endm
  /* SAVE_S2: fold both accumulators into two y values that are INC_Y bytes
     apart.  Each half reads one complex value at YO, adds the accumulator
     scaled by d0 / d1, stores it back in place and then steps YO by INC_Y. */
  .macro SAVE_S2
        /* first y value <- accumulator d12:d13 */
        vldmia  YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        vstmia  YO, { d4 - d5 }
        add     YO, YO, INC_Y

        /* second y value <- accumulator d14:d15 */
        vldmia  YO, { d6 - d7 }
        FMAC_R1 d6 , d0 , d14
        FMAC_I1 d7 , d0 , d15
        FMAC_R2 d6 , d1 , d15
        FMAC_I2 d7 , d1 , d14
        vstmia  YO, { d6 - d7 }
        add     YO, YO, INC_Y
  .endm
  198. /************************************************************************************************/
  /* INIT_S1: clear the single complex accumulator d12:d13 for the strided
     one-column path.
     A real +0.0 is loaded rather than computing x - x, which would give NaN
     if the register still held NaN or +/-Inf.
     Clobbers I (every call site reloads I immediately afterwards).
     Flags are left untouched. */
  .macro INIT_S1
        mov     I, #0
        vmov    d12, I, I               /* d12 = +0.0 (all bits zero) */
        vmov.f64 d13, d12
  .endm
  /* KERNEL_S1X4: four back-to-back KERNEL_S1X1 steps (strided x, one
     column). */
  .macro KERNEL_S1X4
        .rept   4
        KERNEL_S1X1
        .endr
  .endm
  /* KERNEL_S1X1: one step over a single column with strided x.
     Loads one complex x value at XO without write-back, one complex value
     from AO1 with a 16-byte post-increment, accumulates the product into
     d12:d13 and then advances XO by INC_X bytes. */
  .macro KERNEL_S1X1
        vldmia  XO , { d2 - d3 }
        vldmia  AO1!, { d4 - d5 }
        vmla.f64 d12 , d4 , d2
        vmla.f64 d13 , d4 , d3
        KMAC_R  d12 , d5 , d3
        KMAC_I  d13 , d5 , d2
        add     XO, XO, INC_X
  .endm
  /* SAVE_S1: fold accumulator d12:d13 into one y value, store it back in
     place and step YO by INC_Y bytes. */
  .macro SAVE_S1
        vldmia  YO, { d4 - d5 }
        FMAC_R1 d4 , d0 , d12
        FMAC_I1 d5 , d0 , d13
        FMAC_R2 d4 , d1 , d13
        FMAC_I2 d5 , d1 , d12
        vstmia  YO, { d4 - d5 }
        add     YO, YO, INC_Y
  .endm
  227. /**************************************************************************************
  228. * End of macro definitions
  229. **************************************************************************************/
  /* Kernel entry point.
     In:   r0 = M, r1 = N, r3 = A; LDA, X, INC_X, Y, INC_Y on the stack
           (see the OLD_* / X / Y defines).  d0 / d1 are consumed by the
           SAVE_* macros and never written here.
     Out:  r0 = 0.
     Flow: for each column (two at a time, then an odd leftover) the inner
           loop runs M steps (four at a time, then M & 3 leftovers),
           accumulating into d12..d15, and SAVE_* folds the result into y.
           The F path is taken when INC_X == 1 and INC_Y == 1, the S path
           otherwise.
     Saves and restores r4-r9, fp and d8-d15. */
  PROLOGUE
        .align 5

        push    {r4 - r9 , fp}
        add     fp, sp, #28             // fp = sp on entry (seven words pushed)
        sub     sp, sp, #STACKSIZE      // reserve stack

        /* The kernels use d8, d9 and d12-d15 unconditionally, so the
           callee-saved bank d8-d15 is always preserved. */
        sub     r12, fp, #192
        vstm    r12, { d8 - d15 }       // store floating point registers

        cmp     M, #0
        ble     zgemvt_kernel_L999
        cmp     OLD_N, #0
        ble     zgemvt_kernel_L999

        str     OLD_A, A
        str     OLD_N, N

        ldr     INC_X , OLD_INC_X
        ldr     INC_Y , OLD_INC_Y

        cmp     INC_X, #0
        beq     zgemvt_kernel_L999
        cmp     INC_Y, #0
        beq     zgemvt_kernel_L999

        /* NOTE(review): the kernel macros always move 16-byte elements, so
           the non-DOUBLE shifts below look unused - confirm before relying
           on them. */
        ldr     LDA, OLD_LDA
  #if defined(DOUBLE)
        lsl     LDA, LDA, #4            // LDA * SIZE
  #else
        lsl     LDA, LDA, #3            // LDA * SIZE
  #endif

        cmp     INC_X, #1
        bne     zgemvt_kernel_S2_BEGIN
        cmp     INC_Y, #1
        bne     zgemvt_kernel_S2_BEGIN

  /* ---- unit-stride path ------------------------------------------------ */
  /* M and N are known to be > 0 here, so every asrs / ands result below is
     non-negative and "zero" is the only exit condition.  beq is used rather
     than ble because asrs and ands do not write the V flag. */

  zgemvt_kernel_F2_BEGIN:
        ldr     YO , Y
        ldr     J, N
        asrs    J, J, #1                // J = N / 2
        beq     zgemvt_kernel_F1_BEGIN

  zgemvt_kernel_F2X4:
        ldr     AO1, A
        add     AO2, AO1, LDA
        add     r3 , AO2, LDA
        str     r3 , A                  // A now points at the next column pair
        ldr     XO , X
        INIT_F2
        asrs    I, M, #2                // I = M / 4
        beq     zgemvt_kernel_F2X1

  zgemvt_kernel_F2X4_10:
        KERNEL_F2X4
        subs    I, I, #1
        bne     zgemvt_kernel_F2X4_10

  zgemvt_kernel_F2X1:
        ands    I, M , #3               // I = M % 4
        beq     zgemvt_kernel_F2_END

  zgemvt_kernel_F2X1_10:
        KERNEL_F2X1
        subs    I, I, #1
        bne     zgemvt_kernel_F2X1_10

  zgemvt_kernel_F2_END:
        SAVE_F2
        subs    J , J , #1
        bne     zgemvt_kernel_F2X4

  zgemvt_kernel_F1_BEGIN:
        ldr     J, N
        ands    J, J, #1                // odd leftover column?
        beq     zgemvt_kernel_L999

  zgemvt_kernel_F1X4:
        ldr     AO1, A
        ldr     XO , X
        INIT_F1
        asrs    I, M, #2                // I = M / 4
        beq     zgemvt_kernel_F1X1

  zgemvt_kernel_F1X4_10:
        KERNEL_F1X4
        subs    I, I, #1
        bne     zgemvt_kernel_F1X4_10

  zgemvt_kernel_F1X1:
        ands    I, M , #3               // I = M % 4
        beq     zgemvt_kernel_F1_END

  zgemvt_kernel_F1X1_10:
        KERNEL_F1X1
        subs    I, I, #1
        bne     zgemvt_kernel_F1X1_10

  zgemvt_kernel_F1_END:
        SAVE_F1
        b       zgemvt_kernel_L999

  /*************************************************************************************************************/
  /* ---- strided path ---------------------------------------------------- */

  zgemvt_kernel_S2_BEGIN:
  #if defined(DOUBLE)
        lsl     INC_X, INC_X, #4        // INC_X * SIZE
        lsl     INC_Y, INC_Y, #4        // INC_Y * SIZE
  #else
        lsl     INC_X, INC_X, #3        // INC_X * SIZE
        lsl     INC_Y, INC_Y, #3        // INC_Y * SIZE
  #endif
        ldr     YO , Y
        ldr     J, N
        asrs    J, J, #1                // J = N / 2
        beq     zgemvt_kernel_S1_BEGIN

  zgemvt_kernel_S2X4:
        ldr     AO1, A
        add     AO2, AO1, LDA
        add     r3 , AO2, LDA
        str     r3 , A                  // A now points at the next column pair
        ldr     XO , X
        INIT_S2
        asrs    I, M, #2                // I = M / 4
        beq     zgemvt_kernel_S2X1

  zgemvt_kernel_S2X4_10:
        KERNEL_S2X4
        subs    I, I, #1
        bne     zgemvt_kernel_S2X4_10

  zgemvt_kernel_S2X1:
        ands    I, M , #3               // I = M % 4
        beq     zgemvt_kernel_S2_END

  zgemvt_kernel_S2X1_10:
        KERNEL_S2X1
        subs    I, I, #1
        bne     zgemvt_kernel_S2X1_10

  zgemvt_kernel_S2_END:
        SAVE_S2
        subs    J , J , #1
        bne     zgemvt_kernel_S2X4

  zgemvt_kernel_S1_BEGIN:
        ldr     J, N
        ands    J, J, #1                // odd leftover column?
        beq     zgemvt_kernel_L999

  zgemvt_kernel_S1X4:
        ldr     AO1, A
        ldr     XO , X
        INIT_S1
        asrs    I, M, #2                // I = M / 4
        beq     zgemvt_kernel_S1X1

  zgemvt_kernel_S1X4_10:
        KERNEL_S1X4
        subs    I, I, #1
        bne     zgemvt_kernel_S1X4_10

  zgemvt_kernel_S1X1:
        ands    I, M , #3               // I = M % 4
        beq     zgemvt_kernel_S1_END

  zgemvt_kernel_S1X1_10:
        KERNEL_S1X1
        subs    I, I, #1
        bne     zgemvt_kernel_S1X1_10

  zgemvt_kernel_S1_END:
        SAVE_S1

  /*************************************************************************************************************/

  zgemvt_kernel_L999:
        sub     r3, fp, #192
        vldm    r3, { d8 - d15 }        // restore floating point registers

        mov     r0, #0                  // set return value
        sub     sp, fp, #28             // drop the scratch area
        pop     {r4 -r9 ,fp}
        bx      lr
  EPILOGUE