You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemv_n_vfp.S 12 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  /*
   * Build switches, argument locations, register roles and stack slots.
   *
   * The prologue pushes seven registers (28 bytes) and then sets fp = sp + 28,
   * so fp holds the value sp had on entry and every [fp, #k] below addresses
   * an argument the caller placed on the stack.
   */
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes subtracted from sp by the prologue for locals and the FP save area. */
  36. #define STACKSIZE 256
  /*
   * Argument locations when __ARM_PCS_VFP is NOT defined: alpha does not
   * arrive in a VFP register and the matrix pointer is read from the stack.
   *   single: alpha in r3,      matrix pointer at [fp, #0]
   *   double: alpha at [fp, #0], matrix pointer at [fp, #8]
   */
  37. #if !defined(__ARM_PCS_VFP)
  38. #if !defined(DOUBLE)
  39. #define OLD_ALPHA r3
  40. #define OLD_A_SOFTFP [fp, #0 ]
  41. #define OLD_LDA [fp, #4 ]
  42. #define X [fp, #8 ]
  43. #define OLD_INC_X [fp, #12 ]
  44. #define Y [fp, #16 ]
  45. #define OLD_INC_Y [fp, #20 ]
  46. #else
  47. #define OLD_ALPHA [fp, #0 ]
  48. #define OLD_A_SOFTFP [fp, #8 ]
  49. #define OLD_LDA [fp, #12]
  50. #define X [fp, #16]
  51. #define OLD_INC_X [fp, #20]
  52. #define Y [fp, #24]
  53. #define OLD_INC_Y [fp, #28]
  54. #endif
  /*
   * Argument locations when __ARM_PCS_VFP is defined: the stacked arguments
   * start with LDA and the matrix pointer is taken from r3 (OLD_A below).
   */
  55. #else
  56. #define OLD_LDA [fp, #0 ]
  57. #define X [fp, #4 ]
  58. #define OLD_INC_X [fp, #8 ]
  59. #define Y [fp, #12 ]
  60. #define OLD_INC_Y [fp, #16 ]
  61. #endif
  /*
   * Register roles. OLD_M and AO1 share r0: the row count is spilled to the
   * M stack slot before r0 is reused as a pointer into the matrix.
   * r3 (OLD_A) is stored to the A slot early and later reused as scratch.
   */
  62. #define OLD_A r3
  63. #define OLD_M r0
  64. #define AO1 r0
  65. #define N r1
  66. #define J r2
  67. #define AO2 r4
  68. #define XO r5
  69. #define YO r6
  70. #define LDA r7
  71. #define INC_X r8
  72. #define INC_Y r9
  73. #define I r12
  /*
   * Stack slots below fp. FP_ZERO and FP_ZERO_0 name the same word and
   * FP_ZERO_1 is the word after it; the prologue stores 0 to both words, giving
   * eight zero bytes that the INIT_* macros load as a floating-point zero.
   * M holds the saved row count; A holds the base address of the current
   * row block and is advanced after each block.
   */
  74. #define FP_ZERO [fp, #-228]
  75. #define FP_ZERO_0 [fp, #-228]
  76. #define FP_ZERO_1 [fp, #-224]
  77. #define M [fp, #-252 ]
  78. #define A [fp, #-256 ]
  /* Byte offsets added to XO / YO / AO2 in the pld prefetch hints. */
  79. #define X_PRE 64
  80. #define Y_PRE 0
  81. #define A_PRE 0
  82. /**************************************************************************************
  83. * Macro definitions
  84. **************************************************************************************/
  85. #if defined(DOUBLE)
  /*
   * INIT_F8 (double): prepare one 8-row pass of the unit-stride path.
   * Issues two prefetch hints for the destination at YO and sets the eight
   * accumulators d8-d15 to the value stored in the FP_ZERO stack slot
   * (written as zero by the function prologue).
   * Writes: d8-d15.
   */
  86. .macro INIT_F8
  87. pld [ YO , #Y_PRE ]
  88. pld [ YO , #Y_PRE+32 ] // second hint, 32 bytes past the first
  89. fldd d8 , FP_ZERO // d8 = 0.0 from the zeroed stack slot
  90. vmov.f64 d9 , d8 // replicate into the remaining accumulators
  91. vmov.f64 d10 , d8
  92. vmov.f64 d11 , d8
  93. vmov.f64 d12 , d8
  94. vmov.f64 d13 , d8
  95. vmov.f64 d14 , d8
  96. vmov.f64 d15 , d8
  97. .endm
  /*
   * KERNEL_F8X8 (double): eight consecutive KERNEL_F8X1 steps, i.e. eight
   * elements of x consumed against eight rows. A prefetch hint for x at
   * XO + X_PRE is issued before each group of four steps.
   */
  98. .macro KERNEL_F8X8
  99. pld [ XO , #X_PRE ]
  100. KERNEL_F8X1
  101. KERNEL_F8X1
  102. KERNEL_F8X1
  103. KERNEL_F8X1
  104. pld [ XO , #X_PRE ] // XO has advanced by four elements since the first hint
  105. KERNEL_F8X1
  106. KERNEL_F8X1
  107. KERNEL_F8X1
  108. KERNEL_F8X1
  109. .endm
  /*
   * KERNEL_F8X1 (double): one step of the unit-stride 8-row kernel.
   * Loads one element of x (XO is post-incremented) and eight contiguous
   * matrix elements starting at AO1, then accumulates
   *     d8..d15 += x * a[0..7]
   * Finally AO1 and AO2 are advanced by LDA (a byte stride by the time the
   * macros run). AO2 is only used as a prefetch address.
   * The integer adds and pld hints are interleaved with the vmla chain;
   * keep the order as is.
   * SIZE is defined outside this file (presumably the element size in bytes).
   * Clobbers: r3, d2, d4-d7.
   */
  110. .macro KERNEL_F8X1
  111. pld [ AO2 , #A_PRE ]
  112. vldmia.f64 XO! , { d2 } // d2 = *XO, XO advances by one double
  113. vldmia.f64 AO1 , { d4 - d7 } // first four matrix elements
  114. vmla.f64 d8 , d2 , d4
  115. pld [ AO2 , #4*SIZE ]
  116. vmla.f64 d9 , d2 , d5
  117. add r3, AO1, #4*SIZE // r3 = address of the second group of four
  118. vmla.f64 d10 , d2 , d6
  119. vmla.f64 d11 , d2 , d7
  120. vldmia.f64 r3 , { d4 - d7 } // second four matrix elements reuse d4-d7
  121. vmla.f64 d12 , d2 , d4
  122. vmla.f64 d13 , d2 , d5
  123. add AO1, AO1, LDA // step to the next x element's matrix data
  124. vmla.f64 d14 , d2 , d6
  125. add AO2, AO2, LDA
  126. vmla.f64 d15 , d2 , d7
  127. .endm
  /*
   * SAVE_F8 (double): fold the eight accumulators into contiguous y.
   * For k = 0..7:  y[k] += d0 * d(8+k), done as two groups of four.
   * d0 is the scale factor (the function loads alpha into it on the
   * soft-float path; otherwise it is expected to hold alpha already).
   * YO is advanced by eight doubles in total through the two writeback stores.
   * Clobbers: d4-d7.
   */
  128. .macro SAVE_F8
  129. vldmia.f64 YO, { d4 - d7 } // load y[0..3] without moving YO
  130. vmla.f64 d4 , d0, d8
  131. vmla.f64 d5 , d0, d9
  132. vmla.f64 d6 , d0, d10
  133. vmla.f64 d7 , d0, d11
  134. vstmia.f64 YO!, { d4 - d7 } // store and advance YO past y[0..3]
  135. vldmia.f64 YO, { d4 - d7 } // load y[4..7]
  136. vmla.f64 d4 , d0, d12
  137. vmla.f64 d5 , d0, d13
  138. vmla.f64 d6 , d0, d14
  139. vmla.f64 d7 , d0, d15
  140. vstmia.f64 YO!, { d4 - d7 }
  141. .endm
  /*
   * INIT_F1 (double): clear the single accumulator d12 used by the
   * unit-stride one-row tail, by loading the zeroed FP_ZERO stack slot.
   */
  142. .macro INIT_F1
  143. fldd d12 , FP_ZERO
  144. .endm
  /*
   * KERNEL_F1X1 (double): one step of the unit-stride one-row tail.
   * d12 += (*XO) * (*AO1); XO is post-incremented by one double and AO1 is
   * advanced by LDA bytes.
   * Clobbers: d2, d8.
   */
  145. .macro KERNEL_F1X1
  146. vldmia.f64 XO! , { d2 }
  147. vldmia.f64 AO1 , { d8 }
  148. vmla.f64 d12 , d2 , d8
  149. add AO1, AO1, LDA
  150. .endm
  /*
   * SAVE_F1 (double): *YO += d0 * d12, then advance YO by one double
   * through the writeback store. d0 is the scale factor (alpha).
   * Clobbers: d4.
   */
  151. .macro SAVE_F1
  152. vldmia.f64 YO, { d4 }
  153. vmla.f64 d4, d0, d12
  154. vstmia.f64 YO!, { d4 }
  155. .endm
  156. /*********************************************************************************************/
  /*
   * INIT_S4 (double): clear the four accumulators d12-d15 used by the
   * strided 4-row kernel, starting from the zeroed FP_ZERO stack slot.
   */
  157. .macro INIT_S4
  158. fldd d12 , FP_ZERO
  159. vmov.f64 d13 , d12
  160. vmov.f64 d14 , d12
  161. vmov.f64 d15 , d12
  162. .endm
  /*
   * KERNEL_S4X4 (double): four consecutive KERNEL_S4X1 steps, i.e. four
   * elements of x consumed against four rows.
   */
  163. .macro KERNEL_S4X4
  164. KERNEL_S4X1
  165. KERNEL_S4X1
  166. KERNEL_S4X1
  167. KERNEL_S4X1
  168. .endm
  /*
   * KERNEL_S4X1 (double): one step of the strided 4-row kernel.
   * Loads one element of x from XO (no writeback) and four contiguous matrix
   * elements from AO1, accumulates d12..d15 += x * a[0..3], then advances
   * AO1 and AO2 by LDA and XO by INC_X (both byte strides on this path).
   * AO2 is only used as a prefetch address.
   * Clobbers: d2, d8-d11.
   */
  169. .macro KERNEL_S4X1
  170. pld [ AO2 , #A_PRE ]
  171. vldmia.f64 XO , { d2 }
  172. vldmia.f64 AO1 , { d8 - d11 }
  173. vmla.f64 d12 , d2 , d8
  174. add AO1, AO1, LDA
  175. vmla.f64 d13 , d2 , d9
  176. add AO2, AO2, LDA
  177. vmla.f64 d14 , d2 , d10
  178. vmla.f64 d15 , d2 , d11
  179. add XO, XO , INC_X // strided x: step by INC_X bytes instead of writeback
  180. .endm
  /*
   * SAVE_S4 (double): fold the four accumulators into strided y.
   * For k = 0..3:  *YO += d0 * d(12+k);  YO += INC_Y (byte stride).
   * Elements are handled one at a time because y is not contiguous here.
   * d0 is the scale factor (alpha).
   * Clobbers: d4, d5.
   */
  181. .macro SAVE_S4
  182. vldmia.f64 YO, { d4 }
  183. vmla.f64 d4 , d0, d12
  184. vstmia.f64 YO, { d4 }
  185. add YO, YO, INC_Y
  186. vldmia.f64 YO, { d5 }
  187. vmla.f64 d5 , d0, d13
  188. vstmia.f64 YO, { d5 }
  189. add YO, YO, INC_Y
  190. vldmia.f64 YO, { d4 }
  191. vmla.f64 d4 , d0, d14
  192. vstmia.f64 YO, { d4 }
  193. add YO, YO, INC_Y
  194. vldmia.f64 YO, { d5 }
  195. vmla.f64 d5 , d0, d15
  196. vstmia.f64 YO, { d5 }
  197. add YO, YO, INC_Y
  198. .endm
  /*
   * INIT_S1 (double): clear the single accumulator d12 used by the strided
   * one-row tail, by loading the zeroed FP_ZERO stack slot.
   */
  199. .macro INIT_S1
  200. fldd d12 , FP_ZERO
  201. .endm
  /*
   * KERNEL_S1X1 (double): one step of the strided one-row tail.
   * d12 += (*XO) * (*AO1); then AO1 += LDA and XO += INC_X (byte strides).
   * Clobbers: d2, d8.
   */
  202. .macro KERNEL_S1X1
  203. vldmia.f64 XO , { d2 }
  204. vldmia.f64 AO1 , { d8 }
  205. vmla.f64 d12 , d2 , d8
  206. add AO1, AO1, LDA
  207. add XO, XO , INC_X
  208. .endm
  /*
   * SAVE_S1 (double): *YO += d0 * d12, then YO += INC_Y (byte stride).
   * d0 is the scale factor (alpha).
   * Clobbers: d4.
   */
  209. .macro SAVE_S1
  210. vldmia.f64 YO, { d4 }
  211. vmla.f64 d4, d0, d12
  212. vstmia.f64 YO , { d4 }
  213. add YO, YO, INC_Y
  214. .endm
  215. #else /************************* SINGLE PRECISION *****************************************/
  /*
   * INIT_F8 (single): prepare one 8-row pass of the unit-stride path.
   * Issues one prefetch hint for the destination at YO and sets the eight
   * accumulators s8-s15 to the value stored in the FP_ZERO stack slot
   * (written as zero by the function prologue).
   * Writes: s8-s15.
   */
  216. .macro INIT_F8
  217. pld [ YO , #Y_PRE ]
  218. flds s8 , FP_ZERO // s8 = 0.0f from the zeroed stack slot
  219. vmov.f32 s9 , s8 // replicate into the remaining accumulators
  220. vmov.f32 s10 , s8
  221. vmov.f32 s11 , s8
  222. vmov.f32 s12 , s8
  223. vmov.f32 s13 , s8
  224. vmov.f32 s14 , s8
  225. vmov.f32 s15 , s8
  226. .endm
  /*
   * KERNEL_F8X8 (single): one prefetch hint for x at XO + X_PRE followed by
   * eight consecutive KERNEL_F8X1 steps (eight elements of x against eight
   * rows).
   */
  227. .macro KERNEL_F8X8
  228. pld [ XO , #X_PRE ]
  229. KERNEL_F8X1
  230. KERNEL_F8X1
  231. KERNEL_F8X1
  232. KERNEL_F8X1
  233. KERNEL_F8X1
  234. KERNEL_F8X1
  235. KERNEL_F8X1
  236. KERNEL_F8X1
  237. .endm
  /*
   * KERNEL_F8X1 (single): one step of the unit-stride 8-row kernel.
   * Loads one element of x (XO is post-incremented) and eight contiguous
   * matrix elements starting at AO1, then accumulates
   *     s8..s15 += x * a[0..7]
   * Finally AO1 and AO2 are advanced by LDA (a byte stride by the time the
   * macros run). AO2 is only used as a prefetch address.
   * SIZE is defined outside this file (presumably the element size in bytes).
   * Clobbers: r3, s2, s4-s7.
   */
  238. .macro KERNEL_F8X1
  239. pld [ AO2, #A_PRE ]
  240. vldmia.f32 XO! , { s2 } // s2 = *XO, XO advances by one float
  241. vldmia.f32 AO1 , { s4 - s7 } // first four matrix elements
  242. vmla.f32 s8 , s2 , s4
  243. vmla.f32 s9 , s2 , s5
  244. vmla.f32 s10 , s2 , s6
  245. vmla.f32 s11 , s2 , s7
  246. add r3, AO1, #4*SIZE // r3 = address of the second group of four
  247. vldmia.f32 r3 , { s4 - s7 } // second four matrix elements reuse s4-s7
  248. vmla.f32 s12 , s2 , s4
  249. vmla.f32 s13 , s2 , s5
  250. vmla.f32 s14 , s2 , s6
  251. vmla.f32 s15 , s2 , s7
  252. add AO1, AO1, LDA // step to the next x element's matrix data
  253. add AO2, AO2, LDA
  254. .endm
  /*
   * SAVE_F8 (single): fold the eight accumulators into contiguous y.
   * For k = 0..7:  y[k] += s0 * s(8+k), done as two groups of four.
   * s0 is the scale factor (the function moves alpha into it on the
   * soft-float path; otherwise it is expected to hold alpha already).
   * YO is advanced by eight floats in total through the two writeback stores.
   * Clobbers: s4-s7.
   */
  255. .macro SAVE_F8
  256. vldmia.f32 YO, { s4 - s7 } // load y[0..3] without moving YO
  257. vmla.f32 s4 , s0, s8
  258. vmla.f32 s5 , s0, s9
  259. vmla.f32 s6 , s0, s10
  260. vmla.f32 s7 , s0, s11
  261. vstmia.f32 YO!, { s4 - s7 } // store and advance YO past y[0..3]
  262. vldmia.f32 YO, { s4 - s7 } // load y[4..7]
  263. vmla.f32 s4 , s0, s12
  264. vmla.f32 s5 , s0, s13
  265. vmla.f32 s6 , s0, s14
  266. vmla.f32 s7 , s0, s15
  267. vstmia.f32 YO!, { s4 - s7 }
  268. .endm
  /*
   * INIT_F1 (single): clear the single accumulator s12 used by the
   * unit-stride one-row tail, by loading the zeroed FP_ZERO stack slot.
   */
  269. .macro INIT_F1
  270. flds s12 , FP_ZERO
  271. .endm
  /*
   * KERNEL_F1X1 (single): one step of the unit-stride one-row tail.
   * s12 += (*XO) * (*AO1); XO is post-incremented by one float and AO1 is
   * advanced by LDA bytes.
   * Clobbers: s2, s8.
   */
  272. .macro KERNEL_F1X1
  273. vldmia.f32 XO! , { s2 }
  274. vldmia.f32 AO1 , { s8 }
  275. vmla.f32 s12 , s2 , s8
  276. add AO1, AO1, LDA
  277. .endm
  /*
   * SAVE_F1 (single): *YO += s0 * s12, then advance YO by one float
   * through the writeback store. s0 is the scale factor (alpha).
   * Clobbers: s4.
   */
  278. .macro SAVE_F1
  279. vldmia.f32 YO, { s4 }
  280. vmla.f32 s4, s0, s12
  281. vstmia.f32 YO!, { s4 }
  282. .endm
  283. /*********************************************************************************************/
  /*
   * INIT_S4 (single): clear the four accumulators s12-s15 used by the
   * strided 4-row kernel, starting from the zeroed FP_ZERO stack slot.
   */
  284. .macro INIT_S4
  285. flds s12 , FP_ZERO
  286. vmov.f32 s13 , s12
  287. vmov.f32 s14 , s12
  288. vmov.f32 s15 , s12
  289. .endm
  /*
   * KERNEL_S4X4 (single): four consecutive KERNEL_S4X1 steps, with a
   * prefetch hint at AO2 + A_PRE before each pair. The single-precision
   * KERNEL_S4X1 issues no prefetch of its own.
   */
  290. .macro KERNEL_S4X4
  291. pld [ AO2 , #A_PRE ]
  292. KERNEL_S4X1
  293. KERNEL_S4X1
  294. pld [ AO2 , #A_PRE ] // AO2 has advanced by 2*LDA since the first hint
  295. KERNEL_S4X1
  296. KERNEL_S4X1
  297. .endm
  /*
   * KERNEL_S4X1 (single): one step of the strided 4-row kernel.
   * Loads one element of x from XO (no writeback) and four contiguous matrix
   * elements from AO1, accumulates s12..s15 += x * a[0..3], then advances
   * AO1 and AO2 by LDA and XO by INC_X (both byte strides on this path).
   * Clobbers: s2, s8-s11.
   */
  298. .macro KERNEL_S4X1
  299. vldmia.f32 XO , { s2 }
  300. vldmia.f32 AO1 , { s8 - s11 }
  301. vmla.f32 s12 , s2 , s8
  302. vmla.f32 s13 , s2 , s9
  303. vmla.f32 s14 , s2 , s10
  304. vmla.f32 s15 , s2 , s11
  305. add AO1, AO1, LDA
  306. add AO2, AO2, LDA
  307. add XO, XO , INC_X // strided x: step by INC_X bytes instead of writeback
  308. .endm
  /*
   * SAVE_S4 (single): fold the four accumulators into strided y.
   * For k = 0..3:  *YO += s0 * s(12+k);  YO += INC_Y (byte stride).
   * Elements are handled one at a time because y is not contiguous here.
   * s0 is the scale factor (alpha).
   * Clobbers: s4, s5.
   */
  309. .macro SAVE_S4
  310. vldmia.f32 YO, { s4 }
  311. vmla.f32 s4 , s0, s12
  312. vstmia.f32 YO, { s4 }
  313. add YO, YO, INC_Y
  314. vldmia.f32 YO, { s5 }
  315. vmla.f32 s5 , s0, s13
  316. vstmia.f32 YO, { s5 }
  317. add YO, YO, INC_Y
  318. vldmia.f32 YO, { s4 }
  319. vmla.f32 s4 , s0, s14
  320. vstmia.f32 YO, { s4 }
  321. add YO, YO, INC_Y
  322. vldmia.f32 YO, { s5 }
  323. vmla.f32 s5 , s0, s15
  324. vstmia.f32 YO, { s5 }
  325. add YO, YO, INC_Y
  326. .endm
  /*
   * INIT_S1 (single): clear the single accumulator s12 used by the strided
   * one-row tail, by loading the zeroed FP_ZERO stack slot.
   */
  327. .macro INIT_S1
  328. flds s12 , FP_ZERO
  329. .endm
  /*
   * KERNEL_S1X1 (single): one step of the strided one-row tail.
   * s12 += (*XO) * (*AO1); then AO1 += LDA and XO += INC_X (byte strides).
   * Clobbers: s2, s8.
   */
  330. .macro KERNEL_S1X1
  331. vldmia.f32 XO , { s2 }
  332. vldmia.f32 AO1 , { s8 }
  333. vmla.f32 s12 , s2 , s8
  334. add AO1, AO1, LDA
  335. add XO, XO , INC_X
  336. .endm
  /*
   * SAVE_S1 (single): *YO += s0 * s12, then YO += INC_Y (byte stride).
   * s0 is the scale factor (alpha).
   * Clobbers: s4.
   */
  337. .macro SAVE_S1
  338. vldmia.f32 YO, { s4 }
  339. vmla.f32 s4, s0, s12
  340. vstmia.f32 YO , { s4 }
  341. add YO, YO, INC_Y
  342. .endm
  343. #endif
  344. /**************************************************************************************
  345. * End of macro definitions
  346. **************************************************************************************/
  /*
   * GEMV "N" kernel body: y += alpha * A * x.
   * PROLOGUE / EPILOGUE are defined outside this file and supply the symbol.
   *
   * A is read as M contiguous rows per column step: each x element is paired
   * with consecutive elements at AO1, and AO1 then moves on by LDA elements.
   *
   * In:   r0 = M (rows), r1 = N (x elements consumed per row),
   *       matrix pointer in r3 when __ARM_PCS_VFP is defined, else on the stack,
   *       alpha in s0/d0 when __ARM_PCS_VFP is defined (presumably, it is never
   *       loaded on that path), else in r3 (single) or on the stack (double),
   *       LDA, X, INC_X, Y, INC_Y on the stack (offsets in the defines above).
   * Out:  r0 = 0. y is updated in place.
   * Returns without touching y when M <= 0, N <= 0, INC_X == 0 or INC_Y == 0.
   *
   * Two paths: INC_X == 1 and INC_Y == 1 takes the "F" path (8 rows per pass,
   * then single rows); anything else takes the "S" path (4 rows per pass, then
   * single rows) with INC_X / INC_Y converted to byte strides.
   *
   * Loop-entry tests after ASRS / ANDS use beq rather than ble: those
   * instructions update N and Z but leave V unchanged, so a signed condition
   * would depend on a V flag left over from an earlier compare. On the S path
   * that compare is "cmp INC_X/INC_Y, #1", which sets V for an increment of
   * INT_MIN and would make every block be skipped. M and N are known to be
   * positive at these points, so "result == 0" is the complete test.
   */
  347. PROLOGUE
  348. .align 5
  349. push {r4 - r9 , fp} // 7 registers = 28 bytes
  350. add fp, sp, #28 // fp = sp at entry, so [fp, #k] addresses stacked arguments
  351. sub sp, sp, #STACKSIZE // reserve stack
  352. sub r12, fp, #192 // r12 = base of the FP register save area
  /*
   * NOTE(review): the single-precision macros only use s0-s15, which are not
   * callee-saved under AAPCS (s16-s31 are), so the s8-s15 save looks
   * redundant rather than harmful. Confirm before removing.
   */
  353. #if defined(DOUBLE)
  354. vstm r12, { d8 - d15 } // store floating point registers
  355. #else
  356. vstm r12, { s8 - s15 } // store floating point registers
  357. #endif
  358. movs r12, #0
  359. str r12, FP_ZERO // eight zero bytes: the 0.0 constant loaded by INIT_*
  360. str r12, FP_ZERO_1
  361. cmp OLD_M, #0
  362. ble gemvn_kernel_L999 // nothing to do for M <= 0
  363. cmp N, #0
  364. ble gemvn_kernel_L999 // nothing to do for N <= 0
  365. #if !defined(__ARM_PCS_VFP)
  366. #if !defined(DOUBLE)
  367. vmov s0, OLD_ALPHA // alpha arrived in a core register
  368. #else
  369. vldr d0, OLD_ALPHA // alpha arrived on the stack
  370. #endif
  371. ldr OLD_A, OLD_A_SOFTFP // matrix pointer arrived on the stack
  372. #endif
  373. str OLD_A, A // spill: r3 is reused as scratch below
  374. str OLD_M, M // spill: r0 is reused as AO1 below
  375. ldr INC_X , OLD_INC_X
  376. ldr INC_Y , OLD_INC_Y
  377. cmp INC_X, #0
  378. beq gemvn_kernel_L999 // zero stride: return without computing
  379. cmp INC_Y, #0
  380. beq gemvn_kernel_L999
  381. ldr LDA, OLD_LDA
  382. #if defined(DOUBLE)
  383. lsl LDA, LDA, #3 // LDA * SIZE
  384. #else
  385. lsl LDA, LDA, #2 // LDA * SIZE
  386. #endif
  387. cmp INC_X, #1
  388. bne gemvn_kernel_S4_BEGIN // any non-unit stride takes the strided path
  389. cmp INC_Y, #1
  390. bne gemvn_kernel_S4_BEGIN
  /* Unit-stride path. Labels say F4, but each pass handles 8 rows. */
  391. gemvn_kernel_F4_BEGIN:
  392. ldr YO , Y
  393. ldr I, M
  394. asrs I, I, #3 // I = M / 8
  395. beq gemvn_kernel_F1_BEGIN // fewer than 8 rows: tail only
  396. gemvn_kernel_F4X4:
  397. ldr AO1, A
  398. add AO2, AO1, LDA
  399. add r3 , AO1, #8*SIZE
  400. str r3 , A // next pass starts 8 rows further down
  401. add AO2, AO2, LDA
  402. add AO2, AO2, LDA // AO2 = AO1 + 3*LDA, used only for prefetch
  403. ldr XO , X
  404. INIT_F8
  405. asrs J, N, #3 // J = N / 8
  406. beq gemvn_kernel_F4X1 // fewer than 8 x elements: remainder loop only
  407. gemvn_kernel_F4X4_10:
  408. KERNEL_F8X8
  409. subs J, J, #1
  410. bne gemvn_kernel_F4X4_10
  411. gemvn_kernel_F4X1:
  412. ands J, N , #7 // J = N % 8
  413. beq gemvn_kernel_F4_END
  414. gemvn_kernel_F4X1_10:
  415. KERNEL_F8X1
  416. subs J, J, #1
  417. bne gemvn_kernel_F4X1_10
  418. gemvn_kernel_F4_END:
  419. SAVE_F8
  420. subs I , I , #1
  421. bne gemvn_kernel_F4X4
  422. gemvn_kernel_F1_BEGIN:
  423. ldr I, M
  424. ands I, I , #7 // I = M % 8 leftover rows
  425. beq gemvn_kernel_L999
  426. gemvn_kernel_F1X1:
  427. ldr AO1, A
  428. add r3, AO1, #SIZE
  429. str r3, A // next pass starts one row further down
  430. ldr XO , X
  431. INIT_F1
  432. mov J, N // N >= 1 here, so the loop below runs at least once
  433. gemvn_kernel_F1X1_10:
  434. KERNEL_F1X1
  435. subs J, J, #1
  436. bne gemvn_kernel_F1X1_10
  437. gemvn_kernel_F1_END:
  438. SAVE_F1
  439. subs I , I , #1
  440. bne gemvn_kernel_F1X1
  441. b gemvn_kernel_L999
  442. /*************************************************************************************************************/
  /* Strided path: 4 rows per pass, x and y stepped by byte strides. */
  443. gemvn_kernel_S4_BEGIN:
  444. #if defined(DOUBLE)
  445. lsl INC_X, INC_X, #3 // INC_X * SIZE
  446. lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
  447. #else
  448. lsl INC_X, INC_X, #2 // INC_X * SIZE
  449. lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
  450. #endif
  451. ldr YO , Y
  452. ldr I, M
  453. asrs I, I, #2 // I = M / 4
  454. beq gemvn_kernel_S1_BEGIN // fewer than 4 rows: tail only
  455. gemvn_kernel_S4X4:
  456. ldr AO1, A
  457. add AO2, AO1, LDA // AO2 = AO1 + LDA, used only for prefetch
  458. add r3 , AO1, #4*SIZE
  459. str r3 , A // next pass starts 4 rows further down
  460. ldr XO , X
  461. INIT_S4
  462. asrs J, N, #2 // J = N / 4
  463. beq gemvn_kernel_S4X1 // fewer than 4 x elements: remainder loop only
  464. gemvn_kernel_S4X4_10:
  465. KERNEL_S4X4
  466. subs J, J, #1
  467. bne gemvn_kernel_S4X4_10
  468. gemvn_kernel_S4X1:
  469. ands J, N , #3 // J = N % 4
  470. beq gemvn_kernel_S4_END
  471. gemvn_kernel_S4X1_10:
  472. KERNEL_S4X1
  473. subs J, J, #1
  474. bne gemvn_kernel_S4X1_10
  475. gemvn_kernel_S4_END:
  476. SAVE_S4
  477. subs I , I , #1
  478. bne gemvn_kernel_S4X4
  479. gemvn_kernel_S1_BEGIN:
  480. ldr I, M
  481. ands I, I , #3 // I = M % 4 leftover rows
  482. beq gemvn_kernel_L999
  483. gemvn_kernel_S1X1:
  484. ldr AO1, A
  485. add r3, AO1, #SIZE
  486. str r3, A // next pass starts one row further down
  487. ldr XO , X
  488. INIT_S1
  489. mov J, N // N >= 1 here, so the loop below runs at least once
  490. gemvn_kernel_S1X1_10:
  491. KERNEL_S1X1
  492. subs J, J, #1
  493. bne gemvn_kernel_S1X1_10
  494. gemvn_kernel_S1_END:
  495. SAVE_S1
  496. subs I , I , #1
  497. bne gemvn_kernel_S1X1
  498. /*************************************************************************************************************/
  /* Common exit: restore FP registers, unwind the frame, return 0. */
  499. gemvn_kernel_L999:
  500. sub r3, fp, #192 // same save-area address the prologue used
  501. #if defined(DOUBLE)
  502. vldm r3, { d8 - d15 } // restore floating point registers
  503. #else
  504. vldm r3, { s8 - s15 } // restore floating point registers
  505. #endif
  506. mov r0, #0 // set return value
  507. sub sp, fp, #28 // sp back to where the pushed registers sit
  508. pop {r4 -r9 ,fp}
  509. bx lr
  510. EPILOGUE