You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemv_t_vfp.S 13 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/25 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes of scratch stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
  #define STACKSIZE       256

  /*
   * Incoming arguments.
   * The prologue pushes seven words and then sets fp = sp + 28, so fp holds
   * the value sp had on entry and [fp, #0] is the first stack-passed word.
   * Which arguments live on the stack depends on the float ABI and precision.
   */
  #if defined(__ARM_PCS_VFP)
  /* Hard-float build: the prologue uses OLD_A (r3) as it arrives and never
   * loads alpha itself; only the trailing arguments are read from the stack. */
  #define OLD_LDA         [fp, #0]
  #define X               [fp, #4]
  #define OLD_INC_X       [fp, #8]
  #define Y               [fp, #12]
  #define OLD_INC_Y       [fp, #16]
  #elif defined(DOUBLE)
  /* Soft-float, double precision: alpha is read from the first stack slot
   * (vldr d0, OLD_ALPHA), followed by the matrix pointer. */
  #define OLD_ALPHA       [fp, #0]
  #define OLD_A_SOFTFP    [fp, #8]
  #define OLD_LDA         [fp, #12]
  #define X               [fp, #16]
  #define OLD_INC_X       [fp, #20]
  #define Y               [fp, #24]
  #define OLD_INC_Y       [fp, #28]
  #else
  /* Soft-float, single precision: alpha arrives in r3 and is moved to s0;
   * the matrix pointer is the first stack word. */
  #define OLD_ALPHA       r3
  #define OLD_A_SOFTFP    [fp, #0]
  #define OLD_LDA         [fp, #4]
  #define X               [fp, #8]
  #define OLD_INC_X       [fp, #12]
  #define Y               [fp, #16]
  #define OLD_INC_Y       [fp, #20]
  #endif

  /* Register-passed arguments as they arrive. */
  #define M               r0      // row count, kept in r0 throughout
  #define OLD_N           r1      // column count; spilled to the N slot before r1 is reused
  #define OLD_A           r3      // matrix pointer; spilled to the A slot

  /* Working registers. */
  #define AO1             r1      // read pointer into the first column being processed
  #define AO2             r4      // read pointer into the second column (AO1 + LDA)
  #define J               r2      // column loop counter
  #define I               r12     // row loop counter
  #define XO              r5      // read pointer into x
  #define YO              r6      // read/write pointer into y
  #define LDA             r7      // column stride, scaled to bytes by the prologue
  #define INC_X           r8      // x increment (scaled to bytes on the strided path)
  #define INC_Y           r9      // y increment (scaled to bytes on the strided path)

  /* Scratch slots inside the reserved stack area, addressed below fp. */
  #define FP_ZERO         [fp, #-228]     // 8 bytes of zero; source of 0.0 for the INIT_* macros
  #define FP_ZERO_0       [fp, #-228]     // low word of FP_ZERO
  #define FP_ZERO_1       [fp, #-224]     // high word of FP_ZERO
  #define N               [fp, #-252]     // saved column count
  #define A               [fp, #-256]     // pointer to the next unprocessed column of the matrix

  /* Byte offsets used by the pld prefetch hints. */
  #define X_PRE           512
  #define A_PRE           512
  81. /**************************************************************************************
  82. * Macro definitions
  83. **************************************************************************************/
  84. #if defined(DOUBLE)
  /*
   * INIT_F2 (double precision)
   * Clears the two running sums d2 and d3 used by the two-column kernels.
   * d2 is loaded from the FP_ZERO stack slot, which the prologue fills with
   * zero words; d3 is copied from d2.
   */
  .macro INIT_F2
          vldr            d2, FP_ZERO             // d2 = 0.0
          vmov.f64        d3, d2                  // d3 = 0.0
  .endm
  /*
   * KERNEL_F2X4 (double precision, unit stride)
   * One unrolled step over four rows of two columns:
   *   d12-d15 <- four x values at XO        (XO  post-incremented)
   *   d8-d11  <- four column values at AO1  (AO1 post-incremented)
   *   d4-d7   <- four column values at AO2  (AO2 post-incremented)
   *   d2 += x[k] * col1[k]   and   d3 += x[k] * col2[k]   for k = 0..3
   * The pld hints touch X_PRE / A_PRE bytes ahead of each stream.
   */
  .macro KERNEL_F2X4
          pld             [XO, #X_PRE]
          vldmia.f64      XO!, {d12-d15}          // x[0..3]
          pld             [AO1, #A_PRE]
          vldmia.f64      AO1!, {d8-d9}           // col1[0..1]
          pld             [AO2, #A_PRE]
          vldmia.f64      AO2!, {d4-d5}           // col2[0..1]
          vldmia.f64      AO1!, {d10-d11}         // col1[2..3]
          vldmia.f64      AO2!, {d6-d7}           // col2[2..3]

          vmla.f64        d2, d12, d8
          vmla.f64        d3, d12, d4
          vmla.f64        d2, d13, d9
          vmla.f64        d3, d13, d5
          vmla.f64        d2, d14, d10
          vmla.f64        d3, d14, d6
          vmla.f64        d2, d15, d11
          vmla.f64        d3, d15, d7
  .endm
  /*
   * KERNEL_F2X1 (double precision, unit stride)
   * Single-row step for two columns: loads one x value and one value from
   * each column (all three pointers post-incremented), then accumulates
   * d2 += x * col1 and d3 += x * col2.
   */
  .macro KERNEL_F2X1
          vldmia.f64      XO!, {d1}               // x
          vldmia.f64      AO1!, {d8}              // col1
          vldmia.f64      AO2!, {d4}              // col2
          vmla.f64        d2, d1, d8
          vmla.f64        d3, d1, d4
  .endm
  /*
   * SAVE_F2 (double precision, unit stride)
   * Folds the two running sums into two consecutive y elements:
   *   y[0] += d0 * d2,  y[1] += d0 * d3
   * where d0 is the scale factor (alpha).  YO is post-incremented past both
   * elements by the store.
   */
  .macro SAVE_F2
          vldmia.f64      YO, {d4-d5}             // current y[0], y[1]
          vmla.f64        d4, d0, d2
          vmla.f64        d5, d0, d3
          vstmia.f64      YO!, {d4-d5}            // write back and advance YO
  .endm
  /*
   * INIT_F1 (double precision)
   * Clears the single running sum d2 used by the one-column kernels by
   * loading it from the FP_ZERO stack slot (filled with zero words by the
   * prologue).
   *
   * Only d2 is initialised: KERNEL_F1X4, KERNEL_F1X1 and SAVE_F1 never read
   * d3, so the extra copy into d3 was dead.  This also matches the
   * single-precision INIT_F1, which sets s2 only.
   */
  .macro INIT_F1
          vldr            d2, FP_ZERO             // d2 = 0.0
  .endm
  /*
   * KERNEL_F1X4 (double precision, unit stride)
   * One unrolled step over four rows of a single column:
   *   d12-d15 <- four x values at XO        (XO  post-incremented)
   *   d8-d11  <- four column values at AO1  (AO1 post-incremented)
   *   d2 += x[k] * col[k]   for k = 0..3
   */
  .macro KERNEL_F1X4
          pld             [XO, #X_PRE]
          vldmia.f64      XO!, {d12-d15}          // x[0..3]
          pld             [AO1, #A_PRE]
          vldmia.f64      AO1!, {d8-d9}           // col[0..1]
          vldmia.f64      AO1!, {d10-d11}         // col[2..3]

          vmla.f64        d2, d12, d8
          vmla.f64        d2, d13, d9
          vmla.f64        d2, d14, d10
          vmla.f64        d2, d15, d11
  .endm
  /*
   * KERNEL_F1X1 (double precision, unit stride)
   * Single-row step for one column: d2 += x * col, with XO and AO1 both
   * post-incremented by the loads.
   */
  .macro KERNEL_F1X1
          vldmia.f64      XO!, {d1}               // x
          vldmia.f64      AO1!, {d8}              // col
          vmla.f64        d2, d1, d8
  .endm
  /*
   * SAVE_F1 (double precision, unit stride)
   * Folds the running sum into one y element: y[0] += d0 * d2, where d0 is
   * the scale factor (alpha).  YO is post-incremented by the store.
   */
  .macro SAVE_F1
          vldmia.f64      YO, {d4}                // current y[0]
          vmla.f64        d4, d0, d2
          vstmia.f64      YO!, {d4}               // write back and advance YO
  .endm
  /*
   * INIT_S2 (double precision)
   * Clears the two running sums d2 and d3 used by the strided two-column
   * kernels; the zero comes from the FP_ZERO stack slot written by the
   * prologue.
   */
  .macro INIT_S2
          vldr            d2, FP_ZERO             // d2 = 0.0
          vmov.f64        d3, d2                  // d3 = 0.0
  .endm
  /*
   * KERNEL_S2X4 (double precision, strided x)
   * One unrolled step over four rows of two columns.  The four x values are
   * fetched one at a time into d12-d15, stepping XO by INC_X (a byte count
   * on this path) after each; the columns are contiguous and are read
   * through AO1 / AO2 with post-increment.
   *   d2 += x[k] * col1[k]   and   d3 += x[k] * col2[k]   for k = 0..3
   */
  .macro KERNEL_S2X4
          vldr            d12, [XO]               // x[0]
          add             XO, XO, INC_X
          pld             [AO1, #A_PRE]
          vldmia.f64      AO1!, {d8-d9}           // col1[0..1]
          pld             [AO2, #A_PRE]
          vldmia.f64      AO2!, {d4-d5}           // col2[0..1]
          vldr            d13, [XO]               // x[1]
          add             XO, XO, INC_X
          vldmia.f64      AO1!, {d10-d11}         // col1[2..3]
          vldmia.f64      AO2!, {d6-d7}           // col2[2..3]
          vldr            d14, [XO]               // x[2]
          add             XO, XO, INC_X
          vldr            d15, [XO]               // x[3]
          add             XO, XO, INC_X

          vmla.f64        d2, d12, d8
          vmla.f64        d3, d12, d4
          vmla.f64        d2, d13, d9
          vmla.f64        d3, d13, d5
          vmla.f64        d2, d14, d10
          vmla.f64        d3, d14, d6
          vmla.f64        d2, d15, d11
          vmla.f64        d3, d15, d7
  .endm
  /*
   * KERNEL_S2X1 (double precision, strided x)
   * Single-row step for two columns: d2 += x * col1, d3 += x * col2.
   * AO1 and AO2 are post-incremented by their loads; XO is stepped by INC_X
   * bytes.
   */
  .macro KERNEL_S2X1
          vldr            d1, [XO]                // x
          vldmia.f64      AO1!, {d8}              // col1
          vldmia.f64      AO2!, {d4}              // col2
          vmla.f64        d2, d1, d8
          add             XO, XO, INC_X
          vmla.f64        d3, d1, d4
  .endm
  /*
   * SAVE_S2 (double precision, strided y)
   * Folds the two running sums into two y elements that are INC_Y bytes
   * apart:  y[0] += d0 * d2, then y[1] += d0 * d3 (d0 = scale factor alpha).
   * YO is advanced by INC_Y after each element.
   */
  .macro SAVE_S2
          vldr            d4, [YO]
          vmla.f64        d4, d0, d2
          vstr            d4, [YO]
          add             YO, YO, INC_Y

          vldr            d5, [YO]
          vmla.f64        d5, d0, d3
          vstr            d5, [YO]
          add             YO, YO, INC_Y
  .endm
  /*
   * INIT_S1 (double precision)
   * Clears the single running sum d2 used by the strided one-column kernels
   * by loading it from the FP_ZERO stack slot (filled with zero words by the
   * prologue).
   *
   * Only d2 is initialised: KERNEL_S1X4, KERNEL_S1X1 and SAVE_S1 never read
   * d3, so the extra copy into d3 was dead.  This also matches the
   * single-precision INIT_S1, which sets s2 only.
   */
  .macro INIT_S1
          vldr            d2, FP_ZERO             // d2 = 0.0
  .endm
  /*
   * KERNEL_S1X4 (double precision, strided x)
   * One unrolled step over four rows of a single column.  x values are
   * fetched one at a time into d12-d15 with XO stepped by INC_X bytes after
   * each; the column is read contiguously through AO1 (post-incremented).
   *   d2 += x[k] * col[k]   for k = 0..3
   */
  .macro KERNEL_S1X4
          vldr            d12, [XO]               // x[0]
          add             XO, XO, INC_X
          pld             [AO1, #A_PRE]
          vldmia.f64      AO1!, {d8-d9}           // col[0..1]
          vldr            d13, [XO]               // x[1]
          add             XO, XO, INC_X
          vldmia.f64      AO1!, {d10-d11}         // col[2..3]
          vldr            d14, [XO]               // x[2]
          add             XO, XO, INC_X
          vldr            d15, [XO]               // x[3]
          add             XO, XO, INC_X

          vmla.f64        d2, d12, d8
          vmla.f64        d2, d13, d9
          vmla.f64        d2, d14, d10
          vmla.f64        d2, d15, d11
  .endm
  /*
   * KERNEL_S1X1 (double precision, strided x)
   * Single-row step for one column: d2 += x * col.  AO1 is post-incremented
   * by its load; XO is stepped by INC_X bytes.
   */
  .macro KERNEL_S1X1
          vldr            d1, [XO]                // x
          vldmia.f64      AO1!, {d8}              // col
          vmla.f64        d2, d1, d8
          add             XO, XO, INC_X
  .endm
  /*
   * SAVE_S1 (double precision, strided y)
   * Folds the running sum into one y element: y[0] += d0 * d2 (d0 = scale
   * factor alpha), then advances YO by INC_Y bytes.
   */
  .macro SAVE_S1
          vldr            d4, [YO]
          vmla.f64        d4, d0, d2
          vstr            d4, [YO]
          add             YO, YO, INC_Y
  .endm
  224. #else /************************* SINGLE PRECISION *****************************************/
  /*
   * INIT_F2 (single precision)
   * Clears the two running sums s2 and s3 used by the two-column kernels.
   * s2 is loaded from the FP_ZERO stack slot, which the prologue fills with
   * zero words; s3 is copied from s2.
   */
  .macro INIT_F2
          vldr            s2, FP_ZERO             // s2 = 0.0f
          vmov.f32        s3, s2                  // s3 = 0.0f
  .endm
  /*
   * KERNEL_F2X4 (single precision, unit stride)
   * One unrolled step over four rows of two columns:
   *   s12-s15 <- four x values at XO        (XO  post-incremented)
   *   s8-s11  <- four column values at AO1  (AO1 post-incremented)
   *   s4-s7   <- four column values at AO2  (AO2 post-incremented)
   *   s2 += x[k] * col1[k]   and   s3 += x[k] * col2[k]   for k = 0..3
   * Unlike the double-precision variant, no pld hints are issued here.
   */
  .macro KERNEL_F2X4
          vldmia.f32      XO!, {s12-s15}          // x[0..3]
          vldmia.f32      AO1!, {s8-s9}           // col1[0..1]
          vldmia.f32      AO2!, {s4-s5}           // col2[0..1]
          vldmia.f32      AO1!, {s10-s11}         // col1[2..3]
          vldmia.f32      AO2!, {s6-s7}           // col2[2..3]

          vmla.f32        s2, s12, s8
          vmla.f32        s3, s12, s4
          vmla.f32        s2, s13, s9
          vmla.f32        s3, s13, s5
          vmla.f32        s2, s14, s10
          vmla.f32        s3, s14, s6
          vmla.f32        s2, s15, s11
          vmla.f32        s3, s15, s7
  .endm
  /*
   * KERNEL_F2X1 (single precision, unit stride)
   * Single-row step for two columns: loads one x value and one value from
   * each column (all three pointers post-incremented), then accumulates
   * s2 += x * col1 and s3 += x * col2.
   */
  .macro KERNEL_F2X1
          vldmia.f32      XO!, {s1}               // x
          vldmia.f32      AO1!, {s8}              // col1
          vldmia.f32      AO2!, {s4}              // col2
          vmla.f32        s2, s1, s8
          vmla.f32        s3, s1, s4
  .endm
  /*
   * SAVE_F2 (single precision, unit stride)
   * Folds the two running sums into two consecutive y elements:
   *   y[0] += s0 * s2,  y[1] += s0 * s3
   * where s0 is the scale factor (alpha).  YO is post-incremented past both
   * elements by the store.
   */
  .macro SAVE_F2
          vldmia.f32      YO, {s4-s5}             // current y[0], y[1]
          vmla.f32        s4, s0, s2
          vmla.f32        s5, s0, s3
          vstmia.f32      YO!, {s4-s5}            // write back and advance YO
  .endm
  /*
   * INIT_F1 (single precision)
   * Clears the single running sum s2 used by the one-column kernels by
   * loading it from the FP_ZERO stack slot (filled with zero words by the
   * prologue).
   */
  .macro INIT_F1
          vldr            s2, FP_ZERO             // s2 = 0.0f
  .endm
  /*
   * KERNEL_F1X4 (single precision, unit stride)
   * One unrolled step over four rows of a single column:
   *   s12-s15 <- four x values at XO        (XO  post-incremented)
   *   s8-s11  <- four column values at AO1  (AO1 post-incremented)
   *   s2 += x[k] * col[k]   for k = 0..3
   */
  .macro KERNEL_F1X4
          vldmia.f32      XO!, {s12-s15}          // x[0..3]
          vldmia.f32      AO1!, {s8-s9}           // col[0..1]
          vldmia.f32      AO1!, {s10-s11}         // col[2..3]

          vmla.f32        s2, s12, s8
          vmla.f32        s2, s13, s9
          vmla.f32        s2, s14, s10
          vmla.f32        s2, s15, s11
  .endm
  /*
   * KERNEL_F1X1 (single precision, unit stride)
   * Single-row step for one column: s2 += x * col, with XO and AO1 both
   * post-incremented by the loads.
   */
  .macro KERNEL_F1X1
          vldmia.f32      XO!, {s1}               // x
          vldmia.f32      AO1!, {s8}              // col
          vmla.f32        s2, s1, s8
  .endm
  /*
   * SAVE_F1 (single precision, unit stride)
   * Folds the running sum into one y element: y[0] += s0 * s2, where s0 is
   * the scale factor (alpha).  YO is post-incremented by the store.
   */
  .macro SAVE_F1
          vldmia.f32      YO, {s4}                // current y[0]
          vmla.f32        s4, s0, s2
          vstmia.f32      YO!, {s4}               // write back and advance YO
  .endm
  /*
   * INIT_S2 (single precision)
   * Clears the two running sums s2 and s3 used by the strided two-column
   * kernels; the zero comes from the FP_ZERO stack slot written by the
   * prologue.
   */
  .macro INIT_S2
          vldr            s2, FP_ZERO             // s2 = 0.0f
          vmov.f32        s3, s2                  // s3 = 0.0f
  .endm
  /*
   * KERNEL_S2X4 (single precision, strided x)
   * One unrolled step over four rows of two columns.  The four x values are
   * fetched one at a time into s12-s15, stepping XO by INC_X (a byte count
   * on this path) after each; the columns are contiguous and are read
   * through AO1 / AO2 with post-increment.
   *   s2 += x[k] * col1[k]   and   s3 += x[k] * col2[k]   for k = 0..3
   */
  .macro KERNEL_S2X4
          vldr            s12, [XO]               // x[0]
          add             XO, XO, INC_X
          vldmia.f32      AO1!, {s8-s9}           // col1[0..1]
          vldmia.f32      AO2!, {s4-s5}           // col2[0..1]
          vldr            s13, [XO]               // x[1]
          add             XO, XO, INC_X
          vldmia.f32      AO1!, {s10-s11}         // col1[2..3]
          vldmia.f32      AO2!, {s6-s7}           // col2[2..3]
          vldr            s14, [XO]               // x[2]
          add             XO, XO, INC_X
          vldr            s15, [XO]               // x[3]
          add             XO, XO, INC_X

          vmla.f32        s2, s12, s8
          vmla.f32        s3, s12, s4
          vmla.f32        s2, s13, s9
          vmla.f32        s3, s13, s5
          vmla.f32        s2, s14, s10
          vmla.f32        s3, s14, s6
          vmla.f32        s2, s15, s11
          vmla.f32        s3, s15, s7
  .endm
  /*
   * KERNEL_S2X1 (single precision, strided x)
   * Single-row step for two columns: s2 += x * col1, s3 += x * col2.
   * AO1 and AO2 are post-incremented by their loads; XO is stepped by INC_X
   * bytes.
   */
  .macro KERNEL_S2X1
          vldr            s1, [XO]                // x
          vldmia.f32      AO1!, {s8}              // col1
          vldmia.f32      AO2!, {s4}              // col2
          vmla.f32        s2, s1, s8
          add             XO, XO, INC_X
          vmla.f32        s3, s1, s4
  .endm
  /*
   * SAVE_S2 (single precision, strided y)
   * Folds the two running sums into two y elements that are INC_Y bytes
   * apart:  y[0] += s0 * s2, then y[1] += s0 * s3 (s0 = scale factor alpha).
   * YO is advanced by INC_Y after each element.
   */
  .macro SAVE_S2
          vldr            s4, [YO]
          vmla.f32        s4, s0, s2
          vstr            s4, [YO]
          add             YO, YO, INC_Y

          vldr            s5, [YO]
          vmla.f32        s5, s0, s3
          vstr            s5, [YO]
          add             YO, YO, INC_Y
  .endm
  /*
   * INIT_S1 (single precision)
   * Clears the single running sum s2 used by the strided one-column kernels
   * by loading it from the FP_ZERO stack slot (filled with zero words by the
   * prologue).
   */
  .macro INIT_S1
          vldr            s2, FP_ZERO             // s2 = 0.0f
  .endm
  /*
   * KERNEL_S1X4 (single precision, strided x)
   * One unrolled step over four rows of a single column.  x values are
   * fetched one at a time into s12-s15 with XO stepped by INC_X bytes after
   * each; the column is read contiguously through AO1 (post-incremented),
   * with a pld hint A_PRE bytes ahead.
   *   s2 += x[k] * col[k]   for k = 0..3
   */
  .macro KERNEL_S1X4
          vldr            s12, [XO]               // x[0]
          add             XO, XO, INC_X
          pld             [AO1, #A_PRE]
          vldmia.f32      AO1!, {s8-s9}           // col[0..1]
          vldr            s13, [XO]               // x[1]
          add             XO, XO, INC_X
          vldmia.f32      AO1!, {s10-s11}         // col[2..3]
          vldr            s14, [XO]               // x[2]
          add             XO, XO, INC_X
          vldr            s15, [XO]               // x[3]
          add             XO, XO, INC_X

          vmla.f32        s2, s12, s8
          vmla.f32        s2, s13, s9
          vmla.f32        s2, s14, s10
          vmla.f32        s2, s15, s11
  .endm
  /*
   * KERNEL_S1X1 (single precision, strided x)
   * Single-row step for one column: s2 += x * col.  AO1 is post-incremented
   * by its load; XO is stepped by INC_X bytes.
   */
  .macro KERNEL_S1X1
          vldr            s1, [XO]                // x
          vldmia.f32      AO1!, {s8}              // col
          vmla.f32        s2, s1, s8
          add             XO, XO, INC_X
  .endm
  /*
   * SAVE_S1 (single precision, strided y)
   * Folds the running sum into one y element: y[0] += s0 * s2 (s0 = scale
   * factor alpha), then advances YO by INC_Y bytes.
   */
  .macro SAVE_S1
          vldr            s4, [YO]
          vmla.f32        s4, s0, s2
          vstr            s4, [YO]
          add             YO, YO, INC_Y
  .endm
  355. #endif
  356. /**************************************************************************************
  357. * End of macro definitions
  358. **************************************************************************************/
  /*
   * Kernel body.  The entry symbol is emitted by the PROLOGUE macro, which
   * is defined outside this file.
   *
   * For each of the N columns of the matrix (M contiguous elements per
   * column, consecutive columns LDA elements apart) the code forms
   *       sum = A[0,j]*x[0] + A[1,j]*x[1] + ... + A[M-1,j]*x[M-1]
   *       y[j] += alpha * sum
   * where successive x and y elements are inc_x / inc_y elements apart and
   * alpha is held in d0 (DOUBLE) or s0.
   *
   * Columns are processed two at a time, then one leftover column if N is
   * odd.  Rows are processed four at a time, then M & 3 leftover rows.
   * The "F" labels are the path taken when inc_x == 1 and inc_y == 1; the
   * "S" labels handle every other non-zero increment.
   *
   * Nothing is computed when M <= 0, N <= 0, inc_x == 0 or inc_y == 0.
   * r0 is always 0 on return.
   */
  PROLOGUE

          .align 5

          /* ---- frame setup ------------------------------------------------ */
          push            {r4-r9, fp}
          add             fp, sp, #28             // 7 words pushed: fp = sp on entry
          sub             sp, sp, #STACKSIZE      // reserve stack

          sub             r12, fp, #192           // FP register save area
  #if defined(DOUBLE)
          vstm            r12, {d8-d15}           // store floating point registers
  #else
          vstm            r12, {s8-s15}           // store floating point registers
  #endif

          movs            r12, #0
          str             r12, FP_ZERO            // 8 zero bytes: the 0.0 read by INIT_*
          str             r12, FP_ZERO_1

          /* ---- argument checks and setup ---------------------------------- */
          cmp             M, #0
          ble             gemvt_kernel_L999

          cmp             OLD_N, #0
          ble             gemvt_kernel_L999

  #if !defined(__ARM_PCS_VFP)
          /* Soft-float build: fetch alpha and the matrix pointer ourselves.
           * alpha must be read before r3 is overwritten with the pointer. */
  #if defined(DOUBLE)
          vldr            d0, OLD_ALPHA
  #else
          vmov            s0, OLD_ALPHA
  #endif
          ldr             OLD_A, OLD_A_SOFTFP
  #endif

          str             OLD_A, A                // spill: r3 is reused as scratch below
          str             OLD_N, N                // spill: r1 is reused as AO1

          ldr             INC_X, OLD_INC_X
          ldr             INC_Y, OLD_INC_Y

          cmp             INC_X, #0
          beq             gemvt_kernel_L999

          cmp             INC_Y, #0
          beq             gemvt_kernel_L999

          ldr             LDA, OLD_LDA
  #if defined(DOUBLE)
          lsl             LDA, LDA, #3            // LDA * SIZE
  #else
          lsl             LDA, LDA, #2            // LDA * SIZE
  #endif

          /* Anything other than inc_x == inc_y == 1 takes the strided path. */
          cmp             INC_X, #1
          bne             gemvt_kernel_S2_BEGIN

          cmp             INC_Y, #1
          bne             gemvt_kernel_S2_BEGIN

          /* ================= unit-stride path ============================== */
  gemvt_kernel_F2_BEGIN:
          ldr             YO, Y
          ldr             J, N
          asrs            J, J, #1                // J = N / 2
          ble             gemvt_kernel_F1_BEGIN   // no column pair to process

          /* Outer loop: one pair of columns per iteration. */
  gemvt_kernel_F2X4:
          ldr             AO1, A                  // first column of the pair
          add             AO2, AO1, LDA           // second column of the pair
          add             r3, AO2, LDA
          str             r3, A                   // A slot now points past this pair
          ldr             XO, X                   // restart x for every pair

          INIT_F2

          asrs            I, M, #2                // I = M / 4
          ble             gemvt_kernel_F2X1       // fewer than four rows

  gemvt_kernel_F2X4_10:
          KERNEL_F2X4
          subs            I, I, #1
          bne             gemvt_kernel_F2X4_10

  gemvt_kernel_F2X1:
          ands            I, M, #3                // leftover rows
          ble             gemvt_kernel_F2_END

  gemvt_kernel_F2X1_10:
          KERNEL_F2X1
          subs            I, I, #1
          bne             gemvt_kernel_F2X1_10

  gemvt_kernel_F2_END:
          SAVE_F2
          subs            J, J, #1
          bne             gemvt_kernel_F2X4

          /* Last column when N is odd. */
  gemvt_kernel_F1_BEGIN:
          ldr             J, N
          ands            J, J, #1
          ble             gemvt_kernel_L999       // N even: finished

  gemvt_kernel_F1X4:
          ldr             AO1, A
          ldr             XO, X

          INIT_F1

          asrs            I, M, #2                // I = M / 4
          ble             gemvt_kernel_F1X1

  gemvt_kernel_F1X4_10:
          KERNEL_F1X4
          subs            I, I, #1
          bne             gemvt_kernel_F1X4_10

  gemvt_kernel_F1X1:
          ands            I, M, #3                // leftover rows
          ble             gemvt_kernel_F1_END

  gemvt_kernel_F1X1_10:
          KERNEL_F1X1
          subs            I, I, #1
          bne             gemvt_kernel_F1X1_10

  gemvt_kernel_F1_END:
          SAVE_F1
          b               gemvt_kernel_L999

  /*************************************************************************************************************/

          /* ================= strided path ================================== */
  gemvt_kernel_S2_BEGIN:
          /* Convert both increments from elements to bytes. */
  #if defined(DOUBLE)
          lsl             INC_X, INC_X, #3        // INC_X * SIZE
          lsl             INC_Y, INC_Y, #3        // INC_Y * SIZE
  #else
          lsl             INC_X, INC_X, #2        // INC_X * SIZE
          lsl             INC_Y, INC_Y, #2        // INC_Y * SIZE
  #endif

          ldr             YO, Y
          ldr             J, N
          asrs            J, J, #1                // J = N / 2
          ble             gemvt_kernel_S1_BEGIN   // no column pair to process

          /* Outer loop: one pair of columns per iteration. */
  gemvt_kernel_S2X4:
          ldr             AO1, A                  // first column of the pair
          add             AO2, AO1, LDA           // second column of the pair
          add             r3, AO2, LDA
          str             r3, A                   // A slot now points past this pair
          ldr             XO, X                   // restart x for every pair

          INIT_S2

          asrs            I, M, #2                // I = M / 4
          ble             gemvt_kernel_S2X1       // fewer than four rows

  gemvt_kernel_S2X4_10:
          KERNEL_S2X4
          subs            I, I, #1
          bne             gemvt_kernel_S2X4_10

  gemvt_kernel_S2X1:
          ands            I, M, #3                // leftover rows
          ble             gemvt_kernel_S2_END

  gemvt_kernel_S2X1_10:
          KERNEL_S2X1
          subs            I, I, #1
          bne             gemvt_kernel_S2X1_10

  gemvt_kernel_S2_END:
          SAVE_S2
          subs            J, J, #1
          bne             gemvt_kernel_S2X4

          /* Last column when N is odd. */
  gemvt_kernel_S1_BEGIN:
          ldr             J, N
          ands            J, J, #1
          ble             gemvt_kernel_L999       // N even: finished

  gemvt_kernel_S1X4:
          ldr             AO1, A
          ldr             XO, X

          INIT_S1

          asrs            I, M, #2                // I = M / 4
          ble             gemvt_kernel_S1X1

  gemvt_kernel_S1X4_10:
          KERNEL_S1X4
          subs            I, I, #1
          bne             gemvt_kernel_S1X4_10

  gemvt_kernel_S1X1:
          ands            I, M, #3                // leftover rows
          ble             gemvt_kernel_S1_END

  gemvt_kernel_S1X1_10:
          KERNEL_S1X1
          subs            I, I, #1
          bne             gemvt_kernel_S1X1_10

  gemvt_kernel_S1_END:
          SAVE_S1
          /* falls through into the common exit */

  /*************************************************************************************************************/

          /* ---- common exit: undo the prologue ----------------------------- */
  gemvt_kernel_L999:
          sub             r3, fp, #192            // FP register save area
  #if defined(DOUBLE)
          vldm            r3, {d8-d15}            // restore floating point registers
  #else
          vldm            r3, {s8-s15}            // restore floating point registers
  #endif

          mov             r0, #0                  // set return value
          sub             sp, fp, #28             // back to just below the pushed registers
          pop             {r4-r9, fp}
          bx              lr

  EPILOGUE