You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t_vfpv3.S 12 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/18 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Frame layout, argument locations and register aliases used by the kernel below.
   * fp is set to the value sp had on entry (7 words are pushed, then fp = sp + 28),
   * so [fp, #0..#16] are the caller's stack arguments and negative offsets are locals. */
  36. #define STACKSIZE 256 /* bytes subtracted from sp for the local frame */
  37. #define OLD_LDA [fp, #0 ] /* stack argument 0: lda (in elements; scaled to bytes after loading) */
  38. #define X [fp, #4 ] /* stack argument 1: pointer to x */
  39. #define OLD_INC_X [fp, #8 ] /* stack argument 2: x increment */
  40. #define Y [fp, #12 ] /* stack argument 3: pointer to y */
  41. #define OLD_INC_Y [fp, #16 ] /* stack argument 4: y increment */
  42. #define OLD_A r3 /* incoming register argument: pointer to A (spilled to slot A) */
  43. #define OLD_N r1 /* incoming register argument: N (spilled to slot N) */
  44. #define M r0 /* incoming register argument: M, kept in r0 throughout */
  45. #define AO1 r1 /* first A stream pointer; reuses r1 once N has been spilled */
  46. #define J r2 /* outer loop counter */
  47. #define AO2 r4 /* second A stream pointer (AO1 + LDA) */
  48. #define XO r5 /* running x pointer */
  49. #define YO r6 /* running y pointer */
  50. #define LDA r7 /* lda in bytes */
  51. #define INC_X r8 /* x increment (scaled to bytes on the strided path) */
  52. #define INC_Y r9 /* y increment (scaled to bytes on the strided path) */
  53. #define I r12 /* inner loop counter */
  54. #define N [fp, #-252 ] /* local slot holding N */
  55. #define A [fp, #-256 ] /* local slot holding the current A pointer */
  56. #define X_PRE 512 /* pld offset in bytes ahead of XO */
  57. #define A_PRE 512 /* pld offset in bytes ahead of AO1 / AO2 */
  58. /**************************************************************************************
  59. * Macro definitions
  60. **************************************************************************************/
  61. #if defined(DOUBLE)
  /* INIT_F2 (double): clear the two dot-product accumulators d4 and d5.
   *
   * The accumulators are loaded with +0.0 through an integer register instead
   * of being computed as x - x. In IEEE-754 arithmetic NaN - NaN and Inf - Inf
   * are NaN, so a stale NaN/Inf left in d4/d5 (by the caller, or by the
   * previous pass through the column loop) would otherwise leak into this
   * result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_F2
          mov         r12, #0
          vmov        d4, r12, r12              // d4 = +0.0 (all bits zero)
          vmov        d5, r12, r12              // d5 = +0.0
  .endm
  /* KERNEL_F2X4 (double, contiguous x): one 4-element step for two A streams.
   * Reads x[0..3] from XO and four values from each of AO1 and AO2 (all three
   * pointers are post-incremented past what they read), then
   *     d4 += x[i] * AO1[i]      d5 += x[i] * AO2[i]      for i = 0..3.
   * Prefetches X_PRE / A_PRE bytes ahead of XO, AO1 and AO2.
   * Writes d4, d5, d8-d11, d16-d19, d28-d31. */
  .macro KERNEL_F2X4
          pld         [XO, #X_PRE]
          vldmia.f64  XO!, { d28-d31 }          // x[0..3]
          pld         [AO1, #A_PRE]
          vldmia.f64  AO1!, { d8, d9 }          // AO1[0..1]
          pld         [AO2, #A_PRE]
          vldmia.f64  AO2!, { d16, d17 }        // AO2[0..1]
          vmla.f64    d4, d28, d8
          vmla.f64    d5, d28, d16
          vldmia.f64  AO1!, { d10, d11 }        // AO1[2..3]
          vmla.f64    d4, d29, d9
          vmla.f64    d5, d29, d17
          vldmia.f64  AO2!, { d18, d19 }        // AO2[2..3]
          vmla.f64    d4, d30, d10
          vmla.f64    d5, d30, d18
          vmla.f64    d4, d31, d11
          vmla.f64    d5, d31, d19
  .endm
  /* KERNEL_F2X1 (double, contiguous x): single-element step for two A streams.
   * d4 += x * AO1[0], d5 += x * AO2[0]; XO, AO1 and AO2 each advance by one
   * double. Writes d2, d4, d5, d8, d16. */
  .macro KERNEL_F2X1
          vldmia.f64  XO!, { d2 }               // x
          vldmia.f64  AO1!, { d8 }
          vldmia.f64  AO2!, { d16 }
          vmla.f64    d4, d2, d8
          vmla.f64    d5, d2, d16
  .endm
  /* SAVE_F2 (double, contiguous y): fold both accumulators into y.
   *     y[0] += d0 * d4      y[1] += d0 * d5
   * YO is read without writeback and advanced by two doubles on the store.
   * d0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes d24, d25. */
  .macro SAVE_F2
          vldmia.f64  YO, { d24, d25 }          // current y[0], y[1]
          vmla.f64    d24, d0, d4
          vmla.f64    d25, d0, d5
          vstmia.f64  YO!, { d24, d25 }
  .endm
  /* INIT_S2 (double, strided path): clear the accumulators d4 and d5.
   *
   * Zero is loaded through an integer register rather than produced by
   * x - x, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in d4/d5 (from the caller or from the previous column
   * pair) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_S2
          mov         r12, #0
          vmov        d4, r12, r12              // d4 = +0.0 (all bits zero)
          vmov        d5, r12, r12              // d5 = +0.0
  .endm
  /* KERNEL_S2X4 (double, strided x): one 4-element step for two A streams.
   * x values are read one at a time from XO, which is stepped by INC_X after
   * each read; AO1 and AO2 are read contiguously with post-increment.
   *     d4 += x[i] * AO1[i]      d5 += x[i] * AO2[i]      for i = 0..3.
   * Prefetches A_PRE bytes ahead of AO1 and AO2.
   * Writes d4, d5, d8-d11, d16-d19, d28-d31. */
  .macro KERNEL_S2X4
          pld         [AO1, #A_PRE]
          vldmia.f64  XO, { d28 }               // x[0]
          add         XO, XO, INC_X
          vldmia.f64  AO1!, { d8, d9 }          // AO1[0..1]
          pld         [AO2, #A_PRE]
          vldmia.f64  AO2!, { d16, d17 }        // AO2[0..1]
          vmla.f64    d4, d28, d8
          vldmia.f64  XO, { d29 }               // x[1]
          add         XO, XO, INC_X
          vmla.f64    d5, d28, d16
          vldmia.f64  AO1!, { d10, d11 }        // AO1[2..3]
          vmla.f64    d4, d29, d9
          vldmia.f64  XO, { d30 }               // x[2]
          add         XO, XO, INC_X
          vmla.f64    d5, d29, d17
          vldmia.f64  AO2!, { d18, d19 }        // AO2[2..3]
          vmla.f64    d4, d30, d10
          vldmia.f64  XO, { d31 }               // x[3]
          add         XO, XO, INC_X
          vmla.f64    d5, d30, d18
          vmla.f64    d4, d31, d11
          vmla.f64    d5, d31, d19
  .endm
  /* KERNEL_S2X1 (double, strided x): single-element step for two A streams.
   * d4 += x * AO1[0], d5 += x * AO2[0]; XO steps by INC_X, AO1 and AO2 by
   * one double each. Writes d2, d4, d5, d8, d16. */
  .macro KERNEL_S2X1
          vldmia.f64  XO, { d2 }                // x
          vldmia.f64  AO1!, { d8 }
          add         XO, XO, INC_X
          vldmia.f64  AO2!, { d16 }
          vmla.f64    d4, d2, d8
          vmla.f64    d5, d2, d16
  .endm
  /* SAVE_S2 (double, strided y): fold both accumulators into two y elements
   * that lie INC_Y apart.
   *     *YO += d0 * d4 ; YO += INC_Y ; *YO += d0 * d5 ; YO += INC_Y
   * d0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes d24. */
  .macro SAVE_S2
          vldmia.f64  YO, { d24 }               // first y element
          vmla.f64    d24, d0, d4
          vstmia.f64  YO, { d24 }
          add         YO, YO, INC_Y

          vldmia.f64  YO, { d24 }               // second y element
          vmla.f64    d24, d0, d5
          vstmia.f64  YO, { d24 }
          add         YO, YO, INC_Y
  .endm
  /* INIT_F1 (double): clear the single dot-product accumulator d4.
   *
   * Zero is loaded through an integer register rather than produced by
   * d4 - d4, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in d4 (from the caller or from the preceding column-pair
   * loop) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_F1
          mov         r12, #0
          vmov        d4, r12, r12              // d4 = +0.0 (all bits zero)
  .endm
  /* KERNEL_F1X4 (double, contiguous x): one 4-element step for one A stream.
   *     d4 += x[i] * AO1[i]      for i = 0..3
   * XO and AO1 are post-incremented by four doubles. Prefetches X_PRE /
   * A_PRE bytes ahead of XO and AO1. Writes d4, d8-d11, d28-d31. */
  .macro KERNEL_F1X4
          pld         [XO, #X_PRE]
          vldmia.f64  XO!, { d28-d31 }          // x[0..3]
          pld         [AO1, #A_PRE]
          vldmia.f64  AO1!, { d8, d9 }          // AO1[0..1]
          vmla.f64    d4, d28, d8
          vldmia.f64  AO1!, { d10, d11 }        // AO1[2..3]
          vmla.f64    d4, d29, d9
          vmla.f64    d4, d30, d10
          vmla.f64    d4, d31, d11
  .endm
  /* KERNEL_F1X1 (double, contiguous x): d4 += x * AO1[0]; XO and AO1 each
   * advance by one double. Writes d2, d4, d8. */
  .macro KERNEL_F1X1
          vldmia.f64  XO!, { d2 }               // x
          vldmia.f64  AO1!, { d8 }
          vmla.f64    d4, d2, d8
  .endm
  /* SAVE_F1 (double, contiguous y): y[0] += d0 * d4, then YO advances by one
   * double. d0 is the scale factor; it is never written in this file, so it
   * is presumably the caller-supplied alpha (confirm against the caller).
   * Writes d24. */
  .macro SAVE_F1
          vldmia.f64  YO, { d24 }
          vmla.f64    d24, d0, d4
          vstmia.f64  YO!, { d24 }
  .endm
  /* INIT_S1 (double, strided path): clear the accumulator d4.
   *
   * Zero is loaded through an integer register rather than produced by
   * d4 - d4, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in d4 (from the caller or from the preceding column-pair
   * loop) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_S1
          mov         r12, #0
          vmov        d4, r12, r12              // d4 = +0.0 (all bits zero)
  .endm
  /* KERNEL_S1X4 (double, strided x): one 4-element step for one A stream.
   *     d4 += x[i] * AO1[i]      for i = 0..3
   * Each x value is read from XO, which then steps by INC_X; AO1 is read
   * contiguously with post-increment. Prefetches A_PRE bytes ahead of AO1.
   * Writes d4, d8-d11, d28-d31. */
  .macro KERNEL_S1X4
          pld         [AO1, #A_PRE]
          vldmia.f64  XO, { d28 }               // x[0]
          add         XO, XO, INC_X
          vldmia.f64  AO1!, { d8, d9 }          // AO1[0..1]
          vmla.f64    d4, d28, d8
          vldmia.f64  XO, { d29 }               // x[1]
          add         XO, XO, INC_X
          vldmia.f64  AO1!, { d10, d11 }        // AO1[2..3]
          vmla.f64    d4, d29, d9
          vldmia.f64  XO, { d30 }               // x[2]
          add         XO, XO, INC_X
          vmla.f64    d4, d30, d10
          vldmia.f64  XO, { d31 }               // x[3]
          add         XO, XO, INC_X
          vmla.f64    d4, d31, d11
  .endm
  /* KERNEL_S1X1 (double, strided x): d4 += x * AO1[0]; XO steps by INC_X and
   * AO1 by one double. Writes d2, d4, d8. */
  .macro KERNEL_S1X1
          vldmia.f64  XO, { d2 }                // x
          vldmia.f64  AO1!, { d8 }
          add         XO, XO, INC_X
          vmla.f64    d4, d2, d8
  .endm
  /* SAVE_S1 (double, strided y): *YO += d0 * d4, then YO steps by INC_Y.
   * d0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes d24. */
  .macro SAVE_S1
          vldmia.f64  YO, { d24 }
          vmla.f64    d24, d0, d4
          vstmia.f64  YO, { d24 }
          add         YO, YO, INC_Y
  .endm
  199. #else /************************* SINGLE PRECISION *****************************************/
  /* INIT_F2 (single): clear the two dot-product accumulators s4 and s5.
   *
   * The accumulators are loaded with +0.0 through an integer register instead
   * of being computed as x - x. In IEEE-754 arithmetic NaN - NaN and Inf - Inf
   * are NaN, so a stale NaN/Inf left in s4/s5 (by the caller, or by the
   * previous pass through the column loop) would otherwise leak into this
   * result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_F2
          mov         r12, #0
          vmov        s4, r12                   // s4 = +0.0 (all bits zero)
          vmov        s5, r12                   // s5 = +0.0
  .endm
  /* KERNEL_F2X4 (single, contiguous x): one 4-element step for two A streams.
   * Reads x[0..3] from XO and four values from each of AO1 and AO2 (all three
   * pointers are post-incremented past what they read), then
   *     s4 += x[i] * AO1[i]      s5 += x[i] * AO2[i]      for i = 0..3.
   * Writes s4, s5, s8-s11, s16-s19, s28-s31. */
  .macro KERNEL_F2X4
          vldmia.f32  XO!, { s28-s31 }          // x[0..3]
          vldmia.f32  AO1!, { s8, s9 }          // AO1[0..1]
          vldmia.f32  AO2!, { s16, s17 }        // AO2[0..1]
          vmla.f32    s4, s28, s8
          vmla.f32    s5, s28, s16
          vldmia.f32  AO1!, { s10, s11 }        // AO1[2..3]
          vmla.f32    s4, s29, s9
          vmla.f32    s5, s29, s17
          vldmia.f32  AO2!, { s18, s19 }        // AO2[2..3]
          vmla.f32    s4, s30, s10
          vmla.f32    s5, s30, s18
          vmla.f32    s4, s31, s11
          vmla.f32    s5, s31, s19
  .endm
  /* KERNEL_F2X1 (single, contiguous x): single-element step for two A streams.
   * s4 += x * AO1[0], s5 += x * AO2[0]; XO, AO1 and AO2 each advance by one
   * float. Writes s2, s4, s5, s8, s16. */
  .macro KERNEL_F2X1
          vldmia.f32  XO!, { s2 }               // x
          vldmia.f32  AO1!, { s8 }
          vldmia.f32  AO2!, { s16 }
          vmla.f32    s4, s2, s8
          vmla.f32    s5, s2, s16
  .endm
  /* SAVE_F2 (single, contiguous y): fold both accumulators into y.
   *     y[0] += s0 * s4      y[1] += s0 * s5
   * YO is read without writeback and advanced by two floats on the store.
   * s0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes s24, s25. */
  .macro SAVE_F2
          vldmia.f32  YO, { s24, s25 }          // current y[0], y[1]
          vmla.f32    s24, s0, s4
          vmla.f32    s25, s0, s5
          vstmia.f32  YO!, { s24, s25 }
  .endm
  /* INIT_S2 (single, strided path): clear the accumulators s4 and s5.
   *
   * Zero is loaded through an integer register rather than produced by
   * x - x, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in s4/s5 (from the caller or from the previous column
   * pair) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_S2
          mov         r12, #0
          vmov        s4, r12                   // s4 = +0.0 (all bits zero)
          vmov        s5, r12                   // s5 = +0.0
  .endm
  /* KERNEL_S2X4 (single, strided x): one 4-element step for two A streams.
   * x values are read one at a time from XO, which is stepped by INC_X after
   * each read; AO1 and AO2 are read contiguously with post-increment.
   *     s4 += x[i] * AO1[i]      s5 += x[i] * AO2[i]      for i = 0..3.
   * Writes s4, s5, s8-s11, s16-s19, s28-s31. */
  .macro KERNEL_S2X4
          vldmia.f32  XO, { s28 }               // x[0]
          add         XO, XO, INC_X
          vldmia.f32  AO1!, { s8, s9 }          // AO1[0..1]
          vldmia.f32  AO2!, { s16, s17 }        // AO2[0..1]
          vmla.f32    s4, s28, s8
          vldmia.f32  XO, { s29 }               // x[1]
          add         XO, XO, INC_X
          vmla.f32    s5, s28, s16
          vldmia.f32  AO1!, { s10, s11 }        // AO1[2..3]
          vmla.f32    s4, s29, s9
          vldmia.f32  XO, { s30 }               // x[2]
          add         XO, XO, INC_X
          vmla.f32    s5, s29, s17
          vldmia.f32  AO2!, { s18, s19 }        // AO2[2..3]
          vmla.f32    s4, s30, s10
          vldmia.f32  XO, { s31 }               // x[3]
          add         XO, XO, INC_X
          vmla.f32    s5, s30, s18
          vmla.f32    s4, s31, s11
          vmla.f32    s5, s31, s19
  .endm
  /* KERNEL_S2X1 (single, strided x): single-element step for two A streams.
   * s4 += x * AO1[0], s5 += x * AO2[0]; XO steps by INC_X, AO1 and AO2 by
   * one float each. Writes s2, s4, s5, s8, s16. */
  .macro KERNEL_S2X1
          vldmia.f32  XO, { s2 }                // x
          vldmia.f32  AO1!, { s8 }
          add         XO, XO, INC_X
          vldmia.f32  AO2!, { s16 }
          vmla.f32    s4, s2, s8
          vmla.f32    s5, s2, s16
  .endm
  /* SAVE_S2 (single, strided y): fold both accumulators into two y elements
   * that lie INC_Y apart.
   *     *YO += s0 * s4 ; YO += INC_Y ; *YO += s0 * s5 ; YO += INC_Y
   * s0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes s24. */
  .macro SAVE_S2
          vldmia.f32  YO, { s24 }               // first y element
          vmla.f32    s24, s0, s4
          vstmia.f32  YO, { s24 }
          add         YO, YO, INC_Y

          vldmia.f32  YO, { s24 }               // second y element
          vmla.f32    s24, s0, s5
          vstmia.f32  YO, { s24 }
          add         YO, YO, INC_Y
  .endm
  /* INIT_F1 (single): clear the single dot-product accumulator s4.
   *
   * Zero is loaded through an integer register rather than produced by
   * s4 - s4, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in s4 (from the caller or from the preceding column-pair
   * loop) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_F1
          mov         r12, #0
          vmov        s4, r12                   // s4 = +0.0 (all bits zero)
  .endm
  /* KERNEL_F1X4 (single, contiguous x): one 4-element step for one A stream.
   *     s4 += x[i] * AO1[i]      for i = 0..3
   * XO and AO1 are post-incremented by four floats.
   * Writes s4, s8-s11, s28-s31. */
  .macro KERNEL_F1X4
          vldmia.f32  XO!, { s28-s31 }          // x[0..3]
          vldmia.f32  AO1!, { s8, s9 }          // AO1[0..1]
          vmla.f32    s4, s28, s8
          vldmia.f32  AO1!, { s10, s11 }        // AO1[2..3]
          vmla.f32    s4, s29, s9
          vmla.f32    s4, s30, s10
          vmla.f32    s4, s31, s11
  .endm
  /* KERNEL_F1X1 (single, contiguous x): s4 += x * AO1[0]; XO and AO1 each
   * advance by one float. Writes s2, s4, s8. */
  .macro KERNEL_F1X1
          vldmia.f32  XO!, { s2 }               // x
          vldmia.f32  AO1!, { s8 }
          vmla.f32    s4, s2, s8
  .endm
  /* SAVE_F1 (single, contiguous y): y[0] += s0 * s4, then YO advances by one
   * float. s0 is the scale factor; it is never written in this file, so it
   * is presumably the caller-supplied alpha (confirm against the caller).
   * Writes s24. */
  .macro SAVE_F1
          vldmia.f32  YO, { s24 }
          vmla.f32    s24, s0, s4
          vstmia.f32  YO!, { s24 }
  .endm
  /* INIT_S1 (single, strided path): clear the accumulator s4.
   *
   * Zero is loaded through an integer register rather than produced by
   * s4 - s4, because NaN - NaN and Inf - Inf are NaN in IEEE-754 arithmetic:
   * a stale NaN/Inf in s4 (from the caller or from the preceding column-pair
   * loop) would otherwise contaminate this result.
   *
   * Clobbers r12 (the I loop counter). Every use of this macro in this file
   * is followed directly by an asrs that reloads I, so r12 is dead here. */
  .macro INIT_S1
          mov         r12, #0
          vmov        s4, r12                   // s4 = +0.0 (all bits zero)
  .endm
  /* KERNEL_S1X4 (single, strided x): one 4-element step for one A stream.
   *     s4 += x[i] * AO1[i]      for i = 0..3
   * Each x value is read from XO, which then steps by INC_X; AO1 is read
   * contiguously with post-increment. Writes s4, s8-s11, s28-s31. */
  .macro KERNEL_S1X4
          vldmia.f32  XO, { s28 }               // x[0]
          add         XO, XO, INC_X
          vldmia.f32  AO1!, { s8, s9 }          // AO1[0..1]
          vmla.f32    s4, s28, s8
          vldmia.f32  XO, { s29 }               // x[1]
          add         XO, XO, INC_X
          vldmia.f32  AO1!, { s10, s11 }        // AO1[2..3]
          vmla.f32    s4, s29, s9
          vldmia.f32  XO, { s30 }               // x[2]
          add         XO, XO, INC_X
          vmla.f32    s4, s30, s10
          vldmia.f32  XO, { s31 }               // x[3]
          add         XO, XO, INC_X
          vmla.f32    s4, s31, s11
  .endm
  /* KERNEL_S1X1 (single, strided x): s4 += x * AO1[0]; XO steps by INC_X and
   * AO1 by one float. Writes s2, s4, s8. */
  .macro KERNEL_S1X1
          vldmia.f32  XO, { s2 }                // x
          vldmia.f32  AO1!, { s8 }
          add         XO, XO, INC_X
          vmla.f32    s4, s2, s8
  .endm
  /* SAVE_S1 (single, strided y): *YO += s0 * s4, then YO steps by INC_Y.
   * s0 is the scale factor; it is never written in this file, so it is
   * presumably the caller-supplied alpha (confirm against the caller).
   * Writes s24. */
  .macro SAVE_S1
          vldmia.f32  YO, { s24 }
          vmla.f32    s24, s0, s4
          vstmia.f32  YO, { s24 }
          add         YO, YO, INC_Y
  .endm
  329. #endif
  330. /**************************************************************************************
  331. * End of macro definitions
  332. **************************************************************************************/
  /* Kernel body. The exported symbol and its directives come from the PROLOGUE /
   * EPILOGUE macros, which are not defined in this file (presumably common.h).
   *
   * Inputs as used here: r0 = M, r1 = N, r3 = A, and five stack words at entry
   * sp+0..+16 = lda, x, inc_x, y, inc_y. The SAVE_* macros scale by d0 / s0.
   *
   * Work done: for each of the N columns (start addresses LDA bytes apart) the
   * code accumulates sum_i A_col[i] * x[i] over M elements (M/4 unrolled steps
   * plus M%4 single steps) and adds d0/s0 times that sum to the matching y
   * element. Columns are handled two at a time, then one leftover if N is odd.
   * The F* labels are the path for inc_x == 1 and inc_y == 1; the S* labels are
   * the strided path. Returns 0 in r0. Nothing is done if M <= 0, N <= 0,
   * inc_x == 0 or inc_y == 0.
   *
   * NOTE(review): several "ble" branches follow asrs/ands, which leave the V flag
   * unchanged, so they depend on V from an earlier cmp/subs. This looks benign for
   * the values reachable here, but confirm before reordering any flag-setting
   * instruction. */
  333. PROLOGUE
  334. .align 5
  335. push {r4 - r9 , fp} // save the seven callee-saved core registers used below (28 bytes)
  336. add fp, sp, #28 // fp = sp at entry, i.e. address of the first stack argument
  337. sub sp, sp, #STACKSIZE // reserve stack
  338. sub r12, fp, #192 // r12 = base of the floating point save area
  339. #if defined(DOUBLE)
  340. vstm r12, { d8 - d15 } // store floating point registers
  341. #else
  342. vstm r12, { s8 - s31 } // store floating point registers
  343. #endif
  344. cmp M, #0
  345. ble gemvt_kernel_L999 // M <= 0: nothing to do
  346. cmp OLD_N, #0
  347. ble gemvt_kernel_L999 // N <= 0: nothing to do
  348. str OLD_A, A // spill A so r3 can be used as scratch
  349. str OLD_N, N // spill N so r1 can be reused as AO1
  350. ldr INC_X , OLD_INC_X
  351. ldr INC_Y , OLD_INC_Y
  352. cmp INC_X, #0
  353. beq gemvt_kernel_L999 // inc_x == 0: exit without touching y
  354. cmp INC_Y, #0
  355. beq gemvt_kernel_L999 // inc_y == 0: exit without touching y
  356. ldr LDA, OLD_LDA
  357. #if defined(DOUBLE)
  358. lsl LDA, LDA, #3 // LDA * SIZE
  359. #else
  360. lsl LDA, LDA, #2 // LDA * SIZE
  361. #endif
  362. cmp INC_X, #1
  363. bne gemvt_kernel_S2_BEGIN // non-unit inc_x: take the strided path
  364. cmp INC_Y, #1
  365. bne gemvt_kernel_S2_BEGIN // non-unit inc_y: take the strided path
  366. gemvt_kernel_F2_BEGIN: // unit-stride path, two columns per pass
  367. ldr YO , Y
  368. ldr J, N
  369. asrs J, J, #1 // J = N / 2
  370. ble gemvt_kernel_F1_BEGIN // no column pair: go to the single-column tail
  371. gemvt_kernel_F2X4: // top of the column-pair loop
  372. ldr AO1, A // AO1 = current column
  373. add AO2, AO1, LDA // AO2 = next column
  374. add r3 , AO2, LDA
  375. str r3 , A // A slot now points past this column pair
  376. ldr XO , X // restart x for this pair
  377. INIT_F2
  378. asrs I, M, #2 // I = M / 4
  379. ble gemvt_kernel_F2X1 // fewer than 4 rows: skip the unrolled loop
  380. gemvt_kernel_F2X4_10:
  381. KERNEL_F2X4
  382. subs I, I, #1
  383. bne gemvt_kernel_F2X4_10
  384. gemvt_kernel_F2X1:
  385. ands I, M , #3 // I = M % 4 leftover rows
  386. ble gemvt_kernel_F2_END
  387. gemvt_kernel_F2X1_10:
  388. KERNEL_F2X1
  389. subs I, I, #1
  390. bne gemvt_kernel_F2X1_10
  391. gemvt_kernel_F2_END:
  392. SAVE_F2
  393. subs J , J , #1
  394. bne gemvt_kernel_F2X4 // next column pair
  395. gemvt_kernel_F1_BEGIN: // unit-stride path, leftover column when N is odd
  396. ldr J, N
  397. ands J, J, #1 // J = N % 2
  398. ble gemvt_kernel_L999 // N even: done
  399. gemvt_kernel_F1X4:
  400. ldr AO1, A // last column (A slot was advanced by the pair loop)
  401. ldr XO , X
  402. INIT_F1
  403. asrs I, M, #2 // I = M / 4
  404. ble gemvt_kernel_F1X1
  405. gemvt_kernel_F1X4_10:
  406. KERNEL_F1X4
  407. subs I, I, #1
  408. bne gemvt_kernel_F1X4_10
  409. gemvt_kernel_F1X1:
  410. ands I, M , #3 // I = M % 4 leftover rows
  411. ble gemvt_kernel_F1_END
  412. gemvt_kernel_F1X1_10:
  413. KERNEL_F1X1
  414. subs I, I, #1
  415. bne gemvt_kernel_F1X1_10
  416. gemvt_kernel_F1_END:
  417. SAVE_F1
  418. b gemvt_kernel_L999 // skip over the strided path
  419. /*************************************************************************************************************/
  420. gemvt_kernel_S2_BEGIN: // strided path, two columns per pass
  421. #if defined(DOUBLE)
  422. lsl INC_X, INC_X, #3 // INC_X * SIZE
  423. lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
  424. #else
  425. lsl INC_X, INC_X, #2 // INC_X * SIZE
  426. lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
  427. #endif
  428. ldr YO , Y
  429. ldr J, N
  430. asrs J, J, #1 // J = N / 2
  431. ble gemvt_kernel_S1_BEGIN // no column pair: go to the single-column tail
  432. gemvt_kernel_S2X4: // top of the column-pair loop
  433. ldr AO1, A // AO1 = current column
  434. add AO2, AO1, LDA // AO2 = next column
  435. add r3 , AO2, LDA
  436. str r3 , A // A slot now points past this column pair
  437. ldr XO , X // restart x for this pair
  438. INIT_S2
  439. asrs I, M, #2 // I = M / 4
  440. ble gemvt_kernel_S2X1 // fewer than 4 rows: skip the unrolled loop
  441. gemvt_kernel_S2X4_10:
  442. KERNEL_S2X4
  443. subs I, I, #1
  444. bne gemvt_kernel_S2X4_10
  445. gemvt_kernel_S2X1:
  446. ands I, M , #3 // I = M % 4 leftover rows
  447. ble gemvt_kernel_S2_END
  448. gemvt_kernel_S2X1_10:
  449. KERNEL_S2X1
  450. subs I, I, #1
  451. bne gemvt_kernel_S2X1_10
  452. gemvt_kernel_S2_END:
  453. SAVE_S2
  454. subs J , J , #1
  455. bne gemvt_kernel_S2X4 // next column pair
  456. gemvt_kernel_S1_BEGIN: // strided path, leftover column when N is odd
  457. ldr J, N
  458. ands J, J, #1 // J = N % 2
  459. ble gemvt_kernel_L999 // N even: done
  460. gemvt_kernel_S1X4:
  461. ldr AO1, A // last column (A slot was advanced by the pair loop)
  462. ldr XO , X
  463. INIT_S1
  464. asrs I, M, #2 // I = M / 4
  465. ble gemvt_kernel_S1X1
  466. gemvt_kernel_S1X4_10:
  467. KERNEL_S1X4
  468. subs I, I, #1
  469. bne gemvt_kernel_S1X4_10
  470. gemvt_kernel_S1X1:
  471. ands I, M , #3 // I = M % 4 leftover rows
  472. ble gemvt_kernel_S1_END
  473. gemvt_kernel_S1X1_10:
  474. KERNEL_S1X1
  475. subs I, I, #1
  476. bne gemvt_kernel_S1X1_10
  477. gemvt_kernel_S1_END:
  478. SAVE_S1
  479. /*************************************************************************************************************/
  480. gemvt_kernel_L999: // common exit
  481. sub r3, fp, #192 // r3 = base of the floating point save area
  482. #if defined(DOUBLE)
  483. vldm r3, { d8 - d15 } // restore floating point registers
  484. #else
  485. vldm r3, { s8 - s31 } // restore floating point registers
  486. #endif
  487. mov r0, #0 // set return value
  488. sub sp, fp, #28 // drop the local frame; sp now points at the pushed registers
  489. pop {r4 -r9 ,fp}
  490. bx lr
  491. EPILOGUE