You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_t_vfpv3.S 13 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/18 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  36. #define STACKSIZE 256
  37. #if !defined(__ARM_PCS_VFP)
  38. #if !defined(DOUBLE)
  39. #define OLD_ALPHA r3
  40. #define OLD_A_SOFTFP [fp, #0 ]
  41. #define OLD_LDA [fp, #4 ]
  42. #define X [fp, #8 ]
  43. #define OLD_INC_X [fp, #12 ]
  44. #define Y [fp, #16 ]
  45. #define OLD_INC_Y [fp, #20 ]
  46. #else
  47. #define OLD_ALPHA [fp, #0 ]
  48. #define OLD_A_SOFTFP [fp, #8 ]
  49. #define OLD_LDA [fp, #12]
  50. #define X [fp, #16]
  51. #define OLD_INC_X [fp, #20]
  52. #define Y [fp, #24]
  53. #define OLD_INC_Y [fp, #28]
  54. #endif
  55. #else
  56. #define OLD_LDA [fp, #0 ]
  57. #define X [fp, #4 ]
  58. #define OLD_INC_X [fp, #8 ]
  59. #define Y [fp, #12 ]
  60. #define OLD_INC_Y [fp, #16 ]
  61. #endif
  62. #define OLD_A r3
  63. #define OLD_N r1
  64. #define M r0
  65. #define AO1 r1
  66. #define J r2
  67. #define AO2 r4
  68. #define XO r5
  69. #define YO r6
  70. #define LDA r7
  71. #define INC_X r8
  72. #define INC_Y r9
  73. #define I r12
  74. #define N [fp, #-252 ]
  75. #define A [fp, #-256 ]
  76. #define X_PRE 512
  77. #define A_PRE 512
  78. /**************************************************************************************
  79. * Macro definitions
  80. **************************************************************************************/
  81. #if defined(DOUBLE)
  82. .macro INIT_F2
  83. vsub.f64 d4 , d4 , d4
  84. vsub.f64 d5 , d5 , d5
  85. .endm
  86. .macro KERNEL_F2X4
  87. pld [ XO , #X_PRE ]
  88. vldmia.f64 XO! , { d28 - d31 }
  89. pld [ AO1 , #A_PRE ]
  90. vldmia.f64 AO1!, { d8 - d9 }
  91. pld [ AO2 , #A_PRE ]
  92. vldmia.f64 AO2!, { d16 - d17 }
  93. vmla.f64 d4 , d28 , d8
  94. vmla.f64 d5 , d28 , d16
  95. vldmia.f64 AO1!, { d10 - d11 }
  96. vmla.f64 d4 , d29 , d9
  97. vmla.f64 d5 , d29 , d17
  98. vldmia.f64 AO2!, { d18 - d19 }
  99. vmla.f64 d4 , d30, d10
  100. vmla.f64 d5 , d30, d18
  101. vmla.f64 d4 , d31, d11
  102. vmla.f64 d5 , d31, d19
  103. .endm
  104. .macro KERNEL_F2X1
  105. vldmia.f64 XO! , { d2 }
  106. vldmia.f64 AO1!, { d8 }
  107. vldmia.f64 AO2!, { d16 }
  108. vmla.f64 d4 , d2 , d8
  109. vmla.f64 d5 , d2 , d16
  110. .endm
  111. .macro SAVE_F2
  112. vldmia.f64 YO, { d24 - d25 }
  113. vmla.f64 d24, d0, d4
  114. vmla.f64 d25, d0, d5
  115. vstmia.f64 YO!, { d24 - d25 }
  116. .endm
  117. .macro INIT_S2
  118. vsub.f64 d4 , d4 , d4
  119. vsub.f64 d5 , d5 , d5
  120. .endm
  121. .macro KERNEL_S2X4
  122. pld [ AO1 , #A_PRE ]
  123. vldmia.f64 XO , { d28 }
  124. add XO, XO, INC_X
  125. vldmia.f64 AO1!, { d8 - d9 }
  126. pld [ AO2 , #A_PRE ]
  127. vldmia.f64 AO2!, { d16 - d17 }
  128. vmla.f64 d4 , d28 , d8
  129. vldmia.f64 XO , { d29 }
  130. add XO, XO, INC_X
  131. vmla.f64 d5 , d28 , d16
  132. vldmia.f64 AO1!, { d10 - d11 }
  133. vmla.f64 d4 , d29 , d9
  134. vldmia.f64 XO , { d30 }
  135. add XO, XO, INC_X
  136. vmla.f64 d5 , d29 , d17
  137. vldmia.f64 AO2!, { d18 - d19 }
  138. vmla.f64 d4 , d30, d10
  139. vldmia.f64 XO , { d31 }
  140. add XO, XO, INC_X
  141. vmla.f64 d5 , d30, d18
  142. vmla.f64 d4 , d31, d11
  143. vmla.f64 d5 , d31, d19
  144. .endm
  145. .macro KERNEL_S2X1
  146. vldmia.f64 XO , { d2 }
  147. vldmia.f64 AO1!, { d8 }
  148. add XO, XO, INC_X
  149. vldmia.f64 AO2!, { d16 }
  150. vmla.f64 d4 , d2 , d8
  151. vmla.f64 d5 , d2 , d16
  152. .endm
  153. .macro SAVE_S2
  154. vldmia.f64 YO, { d24 }
  155. vmla.f64 d24, d0, d4
  156. vstmia.f64 YO, { d24 }
  157. add YO, YO, INC_Y
  158. vldmia.f64 YO, { d24 }
  159. vmla.f64 d24, d0, d5
  160. vstmia.f64 YO, { d24 }
  161. add YO, YO, INC_Y
  162. .endm
  163. .macro INIT_F1
  164. vsub.f64 d4 , d4 , d4
  165. .endm
  166. .macro KERNEL_F1X4
  167. pld [ XO , #X_PRE ]
  168. vldmia.f64 XO! , { d28 - d31 }
  169. pld [ AO1 , #A_PRE ]
  170. vldmia.f64 AO1!, { d8 - d9 }
  171. vmla.f64 d4 , d28 , d8
  172. vldmia.f64 AO1!, { d10 - d11 }
  173. vmla.f64 d4 , d29 , d9
  174. vmla.f64 d4 , d30, d10
  175. vmla.f64 d4 , d31, d11
  176. .endm
  177. .macro KERNEL_F1X1
  178. vldmia.f64 XO! , { d2 }
  179. vldmia.f64 AO1!, { d8 }
  180. vmla.f64 d4 , d2 , d8
  181. .endm
  182. .macro SAVE_F1
  183. vldmia.f64 YO, { d24 }
  184. vmla.f64 d24, d0, d4
  185. vstmia.f64 YO!, { d24 }
  186. .endm
  187. .macro INIT_S1
  188. vsub.f64 d4 , d4 , d4
  189. .endm
  190. .macro KERNEL_S1X4
  191. pld [ AO1 , #A_PRE ]
  192. vldmia.f64 XO , { d28 }
  193. add XO, XO, INC_X
  194. vldmia.f64 AO1!, { d8 - d9 }
  195. vmla.f64 d4 , d28 , d8
  196. vldmia.f64 XO , { d29 }
  197. add XO, XO, INC_X
  198. vldmia.f64 AO1!, { d10 - d11 }
  199. vmla.f64 d4 , d29 , d9
  200. vldmia.f64 XO , { d30 }
  201. add XO, XO, INC_X
  202. vmla.f64 d4 , d30, d10
  203. vldmia.f64 XO , { d31 }
  204. add XO, XO, INC_X
  205. vmla.f64 d4 , d31, d11
  206. .endm
  207. .macro KERNEL_S1X1
  208. vldmia.f64 XO , { d2 }
  209. vldmia.f64 AO1!, { d8 }
  210. add XO, XO, INC_X
  211. vmla.f64 d4 , d2 , d8
  212. .endm
  213. .macro SAVE_S1
  214. vldmia.f64 YO, { d24 }
  215. vmla.f64 d24, d0, d4
  216. vstmia.f64 YO, { d24 }
  217. add YO, YO, INC_Y
  218. .endm
  219. #else /************************* SINGLE PRECISION *****************************************/
  220. .macro INIT_F2
  221. vsub.f32 s4 , s4 , s4
  222. vsub.f32 s5 , s5 , s5
  223. .endm
  224. .macro KERNEL_F2X4
  225. vldmia.f32 XO! , { s28 - s31 }
  226. vldmia.f32 AO1!, { s8 - s9 }
  227. vldmia.f32 AO2!, { s16 - s17 }
  228. vmla.f32 s4 , s28 , s8
  229. vmla.f32 s5 , s28 , s16
  230. vldmia.f32 AO1!, { s10 - s11 }
  231. vmla.f32 s4 , s29 , s9
  232. vmla.f32 s5 , s29 , s17
  233. vldmia.f32 AO2!, { s18 - s19 }
  234. vmla.f32 s4 , s30, s10
  235. vmla.f32 s5 , s30, s18
  236. vmla.f32 s4 , s31, s11
  237. vmla.f32 s5 , s31, s19
  238. .endm
  239. .macro KERNEL_F2X1
  240. vldmia.f32 XO! , { s2 }
  241. vldmia.f32 AO1!, { s8 }
  242. vldmia.f32 AO2!, { s16 }
  243. vmla.f32 s4 , s2 , s8
  244. vmla.f32 s5 , s2 , s16
  245. .endm
  246. .macro SAVE_F2
  247. vldmia.f32 YO, { s24 - s25 }
  248. vmla.f32 s24, s0, s4
  249. vmla.f32 s25, s0, s5
  250. vstmia.f32 YO!, { s24 - s25 }
  251. .endm
  252. .macro INIT_S2
  253. vsub.f32 s4 , s4 , s4
  254. vsub.f32 s5 , s5 , s5
  255. .endm
  256. .macro KERNEL_S2X4
  257. vldmia.f32 XO , { s28 }
  258. add XO, XO, INC_X
  259. vldmia.f32 AO1!, { s8 - s9 }
  260. vldmia.f32 AO2!, { s16 - s17 }
  261. vmla.f32 s4 , s28 , s8
  262. vldmia.f32 XO , { s29 }
  263. add XO, XO, INC_X
  264. vmla.f32 s5 , s28 , s16
  265. vldmia.f32 AO1!, { s10 - s11 }
  266. vmla.f32 s4 , s29 , s9
  267. vldmia.f32 XO , { s30 }
  268. add XO, XO, INC_X
  269. vmla.f32 s5 , s29 , s17
  270. vldmia.f32 AO2!, { s18 - s19 }
  271. vmla.f32 s4 , s30, s10
  272. vldmia.f32 XO , { s31 }
  273. add XO, XO, INC_X
  274. vmla.f32 s5 , s30, s18
  275. vmla.f32 s4 , s31, s11
  276. vmla.f32 s5 , s31, s19
  277. .endm
  278. .macro KERNEL_S2X1
  279. vldmia.f32 XO , { s2 }
  280. vldmia.f32 AO1!, { s8 }
  281. add XO, XO, INC_X
  282. vldmia.f32 AO2!, { s16 }
  283. vmla.f32 s4 , s2 , s8
  284. vmla.f32 s5 , s2 , s16
  285. .endm
  286. .macro SAVE_S2
  287. vldmia.f32 YO, { s24 }
  288. vmla.f32 s24, s0, s4
  289. vstmia.f32 YO, { s24 }
  290. add YO, YO, INC_Y
  291. vldmia.f32 YO, { s24 }
  292. vmla.f32 s24, s0, s5
  293. vstmia.f32 YO, { s24 }
  294. add YO, YO, INC_Y
  295. .endm
  296. .macro INIT_F1
  297. vsub.f32 s4 , s4 , s4
  298. .endm
  299. .macro KERNEL_F1X4
  300. vldmia.f32 XO! , { s28 - s31 }
  301. vldmia.f32 AO1!, { s8 - s9 }
  302. vmla.f32 s4 , s28 , s8
  303. vldmia.f32 AO1!, { s10 - s11 }
  304. vmla.f32 s4 , s29 , s9
  305. vmla.f32 s4 , s30, s10
  306. vmla.f32 s4 , s31, s11
  307. .endm
  308. .macro KERNEL_F1X1
  309. vldmia.f32 XO! , { s2 }
  310. vldmia.f32 AO1!, { s8 }
  311. vmla.f32 s4 , s2 , s8
  312. .endm
  313. .macro SAVE_F1
  314. vldmia.f32 YO, { s24 }
  315. vmla.f32 s24, s0, s4
  316. vstmia.f32 YO!, { s24 }
  317. .endm
  318. .macro INIT_S1
  319. vsub.f32 s4 , s4 , s4
  320. .endm
  321. .macro KERNEL_S1X4
  322. vldmia.f32 XO , { s28 }
  323. add XO, XO, INC_X
  324. vldmia.f32 AO1!, { s8 - s9 }
  325. vmla.f32 s4 , s28 , s8
  326. vldmia.f32 XO , { s29 }
  327. add XO, XO, INC_X
  328. vldmia.f32 AO1!, { s10 - s11 }
  329. vmla.f32 s4 , s29 , s9
  330. vldmia.f32 XO , { s30 }
  331. add XO, XO, INC_X
  332. vmla.f32 s4 , s30, s10
  333. vldmia.f32 XO , { s31 }
  334. add XO, XO, INC_X
  335. vmla.f32 s4 , s31, s11
  336. .endm
  337. .macro KERNEL_S1X1
  338. vldmia.f32 XO , { s2 }
  339. vldmia.f32 AO1!, { s8 }
  340. add XO, XO, INC_X
  341. vmla.f32 s4 , s2 , s8
  342. .endm
  343. .macro SAVE_S1
  344. vldmia.f32 YO, { s24 }
  345. vmla.f32 s24, s0, s4
  346. vstmia.f32 YO, { s24 }
  347. add YO, YO, INC_Y
  348. .endm
  349. #endif
  350. /**************************************************************************************
  351. * End of macro definitions
  352. **************************************************************************************/
  353. PROLOGUE
  354. .align 5
  355. push {r4 - r9 , fp}
  356. add fp, sp, #28
  357. sub sp, sp, #STACKSIZE // reserve stack
  358. sub r12, fp, #192
  359. #if defined(DOUBLE)
  360. vstm r12, { d8 - d15 } // store floating point registers
  361. #else
  362. vstm r12, { s8 - s31 } // store floating point registers
  363. #endif
  364. cmp M, #0
  365. ble gemvt_kernel_L999
  366. cmp OLD_N, #0
  367. ble gemvt_kernel_L999
  368. #if !defined(__ARM_PCS_VFP)
  369. #if !defined(DOUBLE)
  370. vmov s0, OLD_ALPHA
  371. #else
  372. vldr d0, OLD_ALPHA
  373. #endif
  374. ldr OLD_A, OLD_A_SOFTFP
  375. #endif
  376. str OLD_A, A
  377. str OLD_N, N
  378. ldr INC_X , OLD_INC_X
  379. ldr INC_Y , OLD_INC_Y
  380. cmp INC_X, #0
  381. beq gemvt_kernel_L999
  382. cmp INC_Y, #0
  383. beq gemvt_kernel_L999
  384. ldr LDA, OLD_LDA
  385. #if defined(DOUBLE)
  386. lsl LDA, LDA, #3 // LDA * SIZE
  387. #else
  388. lsl LDA, LDA, #2 // LDA * SIZE
  389. #endif
  390. cmp INC_X, #1
  391. bne gemvt_kernel_S2_BEGIN
  392. cmp INC_Y, #1
  393. bne gemvt_kernel_S2_BEGIN
  394. gemvt_kernel_F2_BEGIN:
  395. ldr YO , Y
  396. ldr J, N
  397. asrs J, J, #1 // J = N / 2
  398. ble gemvt_kernel_F1_BEGIN
  399. gemvt_kernel_F2X4:
  400. ldr AO1, A
  401. add AO2, AO1, LDA
  402. add r3 , AO2, LDA
  403. str r3 , A
  404. ldr XO , X
  405. INIT_F2
  406. asrs I, M, #2 // I = M / 4
  407. ble gemvt_kernel_F2X1
  408. gemvt_kernel_F2X4_10:
  409. KERNEL_F2X4
  410. subs I, I, #1
  411. bne gemvt_kernel_F2X4_10
  412. gemvt_kernel_F2X1:
  413. ands I, M , #3
  414. ble gemvt_kernel_F2_END
  415. gemvt_kernel_F2X1_10:
  416. KERNEL_F2X1
  417. subs I, I, #1
  418. bne gemvt_kernel_F2X1_10
  419. gemvt_kernel_F2_END:
  420. SAVE_F2
  421. subs J , J , #1
  422. bne gemvt_kernel_F2X4
  423. gemvt_kernel_F1_BEGIN:
  424. ldr J, N
  425. ands J, J, #1
  426. ble gemvt_kernel_L999
  427. gemvt_kernel_F1X4:
  428. ldr AO1, A
  429. ldr XO , X
  430. INIT_F1
  431. asrs I, M, #2 // I = M / 4
  432. ble gemvt_kernel_F1X1
  433. gemvt_kernel_F1X4_10:
  434. KERNEL_F1X4
  435. subs I, I, #1
  436. bne gemvt_kernel_F1X4_10
  437. gemvt_kernel_F1X1:
  438. ands I, M , #3
  439. ble gemvt_kernel_F1_END
  440. gemvt_kernel_F1X1_10:
  441. KERNEL_F1X1
  442. subs I, I, #1
  443. bne gemvt_kernel_F1X1_10
  444. gemvt_kernel_F1_END:
  445. SAVE_F1
  446. b gemvt_kernel_L999
  447. /*************************************************************************************************************/
  448. gemvt_kernel_S2_BEGIN:
  449. #if defined(DOUBLE)
  450. lsl INC_X, INC_X, #3 // INC_X * SIZE
  451. lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
  452. #else
  453. lsl INC_X, INC_X, #2 // INC_X * SIZE
  454. lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
  455. #endif
  456. ldr YO , Y
  457. ldr J, N
  458. asrs J, J, #1 // J = N / 2
  459. ble gemvt_kernel_S1_BEGIN
  460. gemvt_kernel_S2X4:
  461. ldr AO1, A
  462. add AO2, AO1, LDA
  463. add r3 , AO2, LDA
  464. str r3 , A
  465. ldr XO , X
  466. INIT_S2
  467. asrs I, M, #2 // I = M / 4
  468. ble gemvt_kernel_S2X1
  469. gemvt_kernel_S2X4_10:
  470. KERNEL_S2X4
  471. subs I, I, #1
  472. bne gemvt_kernel_S2X4_10
  473. gemvt_kernel_S2X1:
  474. ands I, M , #3
  475. ble gemvt_kernel_S2_END
  476. gemvt_kernel_S2X1_10:
  477. KERNEL_S2X1
  478. subs I, I, #1
  479. bne gemvt_kernel_S2X1_10
  480. gemvt_kernel_S2_END:
  481. SAVE_S2
  482. subs J , J , #1
  483. bne gemvt_kernel_S2X4
  484. gemvt_kernel_S1_BEGIN:
  485. ldr J, N
  486. ands J, J, #1
  487. ble gemvt_kernel_L999
  488. gemvt_kernel_S1X4:
  489. ldr AO1, A
  490. ldr XO , X
  491. INIT_S1
  492. asrs I, M, #2 // I = M / 4
  493. ble gemvt_kernel_S1X1
  494. gemvt_kernel_S1X4_10:
  495. KERNEL_S1X4
  496. subs I, I, #1
  497. bne gemvt_kernel_S1X4_10
  498. gemvt_kernel_S1X1:
  499. ands I, M , #3
  500. ble gemvt_kernel_S1_END
  501. gemvt_kernel_S1X1_10:
  502. KERNEL_S1X1
  503. subs I, I, #1
  504. bne gemvt_kernel_S1X1_10
  505. gemvt_kernel_S1_END:
  506. SAVE_S1
  507. /*************************************************************************************************************/
  508. gemvt_kernel_L999:
  509. sub r3, fp, #192
  510. #if defined(DOUBLE)
  511. vldm r3, { d8 - d15 } // restore floating point registers
  512. #else
  513. vldm r3, { s8 - s31 } // restore floating point registers
  514. #endif
  515. mov r0, #0 // set return value
  516. sub sp, fp, #28
  517. pop {r4 -r9 ,fp}
  518. bx lr
  519. EPILOGUE