gemv_n_vfpv3.S (OpenBLAS ARM VFPv3 GEMV-N kernel, 13 kB)
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/19 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

/* Stack arguments: fp is set to the caller's sp, so the arguments that
   did not fit in r0-r3 sit at [fp, #0] upward. */
#define OLD_LDA    [fp, #0 ]       // leading dimension of A (in elements)
#define X          [fp, #4 ]       // pointer to vector x
#define OLD_INC_X  [fp, #8 ]       // stride of x (in elements)
#define Y          [fp, #12 ]      // pointer to vector y
#define OLD_INC_Y  [fp, #16 ]      // stride of y (in elements)

/* Core register roles. */
#define OLD_A   r3                 // incoming A pointer (argument register)
#define OLD_M   r0                 // incoming row count m (r0 is reused below)
#define AO1     r0                 // current A column pointer
#define N       r1                 // column count n
#define J       r2                 // inner (column) loop counter
#define AO2     r4                 // prefetch pointer, kept at AO1 + LDA
#define XO      r5                 // running x pointer
#define YO      r6                 // running y pointer
#define LDA     r7                 // leading dimension, scaled to bytes
#define INC_X   r8                 // x stride (scaled to bytes on the strided path)
#define INC_Y   r9                 // y stride (scaled to bytes on the strided path)
#define I       r12                // outer (row-block) loop counter

/* Spill slots inside the reserved STACKSIZE area below fp. */
#define M       [fp, #-252 ]       // saved m (r0 is reused as AO1)
#define A       [fp, #-256 ]       // running A pointer for the current row block

/* Prefetch distances in bytes. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0
  59. /**************************************************************************************
  60. * Macro definitions
  61. **************************************************************************************/
#if defined(DOUBLE)

/**************************************************************************************
* Double-precision kernels.
* F-variants assume inc_x == inc_y == 1 (contiguous); S-variants step X/Y by
* INC_X / INC_Y bytes.  VFP register roles throughout this section:
*   d0        alpha, read by the SAVE macros (presumably arrives there via the
*             hard-float ABI -- confirm against the C prototype)
*   d4        current x element
*   d8 - d15  an 8-row slice of the current A column (callee-saved regs,
*             spilled/restored in the prologue/epilogue)
*   d16- d23  y elements being updated
*   d24- d31  the eight per-row accumulators
**************************************************************************************/

// Zero the eight accumulators and prefetch the y block we will update.
.macro INIT_F8

	pld	[ YO , #Y_PRE ]
	pld	[ YO , #Y_PRE+32 ]

	vsub.f64	d24 , d24 , d24		// d24 = 0 via the x - x idiom
	vmov.f64	d25 , d24
	vmov.f64	d26 , d24
	vmov.f64	d27 , d24
	vmov.f64	d28 , d24
	vmov.f64	d29 , d24
	vmov.f64	d30 , d24
	vmov.f64	d31 , d24

.endm

// Consume eight x elements (eight columns) for the current 8-row block.
.macro KERNEL_F8X8

	pld	[ XO , #X_PRE ]			// prefetch upcoming x data
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1

	pld	[ XO , #X_PRE ]			// XO has advanced; prefetch further ahead
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1

.endm

// One column step: acc[0..7] += A[rows, j] * x[j]; advance to column j+1.
// Prefetches via AO2 (one column ahead) are interleaved with the multiplies.
.macro KERNEL_F8X1

	fldmiad	XO! , { d4 }			// d4 = x[j], post-increment XO
	fldmiad	AO1 , { d8 - d15 }		// eight consecutive rows of column j

	vmla.f64	d24 , d4 , d8
	pld	[ AO2 , #A_PRE ]
	vmla.f64	d25 , d4 , d9
	pld	[ AO2 , #A_PRE+32 ]
	vmla.f64	d26 , d4 , d10
	vmla.f64	d27 , d4 , d11
	vmla.f64	d28 , d4 , d12
	vmla.f64	d29 , d4 , d13
	add	AO1, AO1, LDA			// next column of A
	vmla.f64	d30 , d4 , d14
	add	AO2, AO2, LDA			// keep prefetch pointer one column ahead
	vmla.f64	d31 , d4 , d15

.endm

// y[0..7] += alpha * acc[0..7]; YO advances past the stored block.
.macro SAVE_F8

	fldmiad	YO, { d16 - d23 }

	vmla.f64	d16, d0, d24
	vmla.f64	d17, d0, d25
	vmla.f64	d18, d0, d26
	vmla.f64	d19, d0, d27
	vmla.f64	d20, d0, d28
	vmla.f64	d21, d0, d29
	vmla.f64	d22, d0, d30
	vmla.f64	d23, d0, d31

	fstmiad	YO!, { d16 - d23 }

.endm

// Single-row variants: one accumulator (d24), used for the m % 8 tail rows.
.macro INIT_F1

	vsub.f64	d24 , d24 , d24		// clear the single accumulator

.endm

.macro KERNEL_F1X1				// acc += A[row, j] * x[j]; j++

	fldmiad	XO! , { d4 }
	fldmiad	AO1 , { d8 }
	vmla.f64	d24 , d4 , d8
	add	AO1, AO1, LDA

.endm

.macro SAVE_F1					// y[row] += alpha * acc

	fldmiad	YO, { d16 }
	vmla.f64	d16, d0, d24
	fstmiad	YO!, { d16 }

.endm

/*********************************************************************************************/
/* Strided (S) variants: identical math, but XO/YO advance by INC_X/INC_Y bytes. */

.macro INIT_S8

	vsub.f64	d24 , d24 , d24		// zero the eight accumulators
	vmov.f64	d25 , d24
	vmov.f64	d26 , d24
	vmov.f64	d27 , d24
	vmov.f64	d28 , d24
	vmov.f64	d29 , d24
	vmov.f64	d30 , d24
	vmov.f64	d31 , d24

.endm

.macro KERNEL_S8X8				// eight column steps

	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1

.endm

// One strided column step: acc[0..7] += A[rows, j] * x[j]; XO += INC_X.
.macro KERNEL_S8X1

	pld	[ AO2 , #A_PRE ]		// prefetch next column
	pld	[ AO2 , #A_PRE+32 ]

	fldmiad	XO , { d4 }			// d4 = x[j] (no post-increment: strided)
	fldmiad	AO1 , { d8 - d15 }

	vmla.f64	d24 , d4 , d8
	vmla.f64	d25 , d4 , d9
	vmla.f64	d26 , d4 , d10
	vmla.f64	d27 , d4 , d11
	vmla.f64	d28 , d4 , d12
	vmla.f64	d29 , d4 , d13
	vmla.f64	d30 , d4 , d14
	vmla.f64	d31 , d4 , d15

	add	AO1, AO1, LDA
	add	AO2, AO2, LDA
	add	XO, XO, INC_X			// strided advance through x

.endm

// Strided store: y[i] += alpha * acc[i], one element per INC_Y step.
.macro SAVE_S8

	fldmiad	YO, { d16 }
	vmla.f64	d16, d0, d24
	fstmiad	YO, { d16 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d17 }
	vmla.f64	d17, d0, d25
	fstmiad	YO, { d17 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d18 }
	vmla.f64	d18, d0, d26
	fstmiad	YO, { d18 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d19 }
	vmla.f64	d19, d0, d27
	fstmiad	YO, { d19 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d20 }
	vmla.f64	d20, d0, d28
	fstmiad	YO, { d20 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d21 }
	vmla.f64	d21, d0, d29
	fstmiad	YO, { d21 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d22 }
	vmla.f64	d22, d0, d30
	fstmiad	YO, { d22 }
	add	YO, YO, INC_Y

	fldmiad	YO, { d23 }
	vmla.f64	d23, d0, d31
	fstmiad	YO, { d23 }
	add	YO, YO, INC_Y

.endm

.macro INIT_S1

	vsub.f64	d24 , d24 , d24		// clear the single accumulator

.endm

.macro KERNEL_S1X1				// acc += A[row, j] * x[j]; strided x step

	fldmiad	XO , { d4 }
	fldmiad	AO1 , { d8 }
	vmla.f64	d24 , d4 , d8
	add	AO1, AO1, LDA
	add	XO, XO, INC_X

.endm

.macro SAVE_S1					// y[row] += alpha * acc; strided y step

	fldmiad	YO, { d16 }
	vmla.f64	d16, d0, d24
	fstmiad	YO, { d16 }
	add	YO, YO, INC_Y

.endm
#else /************************* SINGLE PRECISION *****************************************/

/* Exact mirror of the double-precision macro set above, using the s-registers:
   s0 = alpha, s4 = x element, s8-s15 = A column slice, s16-s23 = y elements,
   s24-s31 = the eight accumulators. */

// Zero the eight accumulators and prefetch the y block.
.macro INIT_F8

	pld	[ YO , #Y_PRE ]

	vsub.f32	s24 , s24 , s24		// s24 = 0 via the x - x idiom
	vmov.f32	s25 , s24
	vmov.f32	s26 , s24
	vmov.f32	s27 , s24
	vmov.f32	s28 , s24
	vmov.f32	s29 , s24
	vmov.f32	s30 , s24
	vmov.f32	s31 , s24

.endm

// Consume eight x elements (eight columns) for the current 8-row block.
.macro KERNEL_F8X8

	pld	[ XO , #X_PRE ]
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1
	KERNEL_F8X1

.endm

// One column step: acc[0..7] += A[rows, j] * x[j]; advance to column j+1.
.macro KERNEL_F8X1

	pld	[ AO2 , #A_PRE ]		// prefetch next column

	fldmias	XO! , { s4 }			// s4 = x[j], post-increment XO
	fldmias	AO1 , { s8 - s15 }		// eight consecutive rows of column j

	vmla.f32	s24 , s4 , s8
	vmla.f32	s25 , s4 , s9
	vmla.f32	s26 , s4 , s10
	vmla.f32	s27 , s4 , s11
	vmla.f32	s28 , s4 , s12
	vmla.f32	s29 , s4 , s13
	vmla.f32	s30 , s4 , s14
	vmla.f32	s31 , s4 , s15

	add	AO1, AO1, LDA			// next column of A
	add	AO2, AO2, LDA

.endm

// y[0..7] += alpha * acc[0..7]; YO advances past the stored block.
.macro SAVE_F8

	fldmias	YO, { s16 - s23 }

	vmla.f32	s16, s0, s24
	vmla.f32	s17, s0, s25
	vmla.f32	s18, s0, s26
	vmla.f32	s19, s0, s27
	vmla.f32	s20, s0, s28
	vmla.f32	s21, s0, s29
	vmla.f32	s22, s0, s30
	vmla.f32	s23, s0, s31

	fstmias	YO!, { s16 - s23 }

.endm

// Single-row variants for the m % 8 tail rows.
.macro INIT_F1

	vsub.f32	s24 , s24 , s24		// clear the single accumulator

.endm

.macro KERNEL_F1X1				// acc += A[row, j] * x[j]; j++

	fldmias	XO! , { s4 }
	fldmias	AO1 , { s8 }
	vmla.f32	s24 , s4 , s8
	add	AO1, AO1, LDA

.endm

.macro SAVE_F1					// y[row] += alpha * acc

	fldmias	YO, { s16 }
	vmla.f32	s16, s0, s24
	fstmias	YO!, { s16 }

.endm

/*********************************************************************************************/
/* Strided (S) variants: XO/YO advance by INC_X/INC_Y bytes. */

.macro INIT_S8

	vsub.f32	s24 , s24 , s24		// zero the eight accumulators
	vmov.f32	s25 , s24
	vmov.f32	s26 , s24
	vmov.f32	s27 , s24
	vmov.f32	s28 , s24
	vmov.f32	s29 , s24
	vmov.f32	s30 , s24
	vmov.f32	s31 , s24

.endm

.macro KERNEL_S8X8				// eight column steps

	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1
	KERNEL_S8X1

.endm

// One strided column step: acc[0..7] += A[rows, j] * x[j]; XO += INC_X.
.macro KERNEL_S8X1

	pld	[ AO2 , #A_PRE ]		// prefetch next column

	fldmias	XO , { s4 }			// no post-increment: strided x
	fldmias	AO1 , { s8 - s15 }

	vmla.f32	s24 , s4 , s8
	vmla.f32	s25 , s4 , s9
	vmla.f32	s26 , s4 , s10
	vmla.f32	s27 , s4 , s11
	vmla.f32	s28 , s4 , s12
	vmla.f32	s29 , s4 , s13
	vmla.f32	s30 , s4 , s14
	vmla.f32	s31 , s4 , s15

	add	AO1, AO1, LDA
	add	AO2, AO2, LDA
	add	XO, XO, INC_X

.endm

// Strided store: y[i] += alpha * acc[i], one element per INC_Y step.
.macro SAVE_S8

	fldmias	YO, { s16 }
	vmla.f32	s16, s0, s24
	fstmias	YO, { s16 }
	add	YO, YO, INC_Y

	fldmias	YO, { s17 }
	vmla.f32	s17, s0, s25
	fstmias	YO, { s17 }
	add	YO, YO, INC_Y

	fldmias	YO, { s18 }
	vmla.f32	s18, s0, s26
	fstmias	YO, { s18 }
	add	YO, YO, INC_Y

	fldmias	YO, { s19 }
	vmla.f32	s19, s0, s27
	fstmias	YO, { s19 }
	add	YO, YO, INC_Y

	fldmias	YO, { s20 }
	vmla.f32	s20, s0, s28
	fstmias	YO, { s20 }
	add	YO, YO, INC_Y

	fldmias	YO, { s21 }
	vmla.f32	s21, s0, s29
	fstmias	YO, { s21 }
	add	YO, YO, INC_Y

	fldmias	YO, { s22 }
	vmla.f32	s22, s0, s30
	fstmias	YO, { s22 }
	add	YO, YO, INC_Y

	fldmias	YO, { s23 }
	vmla.f32	s23, s0, s31
	fstmias	YO, { s23 }
	add	YO, YO, INC_Y

.endm

.macro INIT_S1

	vsub.f32	s24 , s24 , s24		// clear the single accumulator

.endm

.macro KERNEL_S1X1				// acc += A[row, j] * x[j]; strided x step

	fldmias	XO , { s4 }
	fldmias	AO1 , { s8 }
	vmla.f32	s24 , s4 , s8
	add	AO1, AO1, LDA
	add	XO, XO, INC_X

.endm

.macro SAVE_S1					// y[row] += alpha * acc; strided y step

	fldmias	YO, { s16 }
	vmla.f32	s16, s0, s24
	fstmias	YO, { s16 }
	add	YO, YO, INC_Y

.endm

#endif
  369. /**************************************************************************************
  370. * End of macro definitions
  371. **************************************************************************************/
/**************************************************************************************
* gemv_n entry point:  y += alpha * A * x  (no-transpose GEMV).
* r0 = m, r1 = n, r3 = A; lda, x, inc_x, y, inc_y arrive on the stack
* (see the OLD_* defines).  Alpha is read from d0/s0 by the SAVE macros,
* presumably placed there by the hard-float ABI -- confirm against the
* C prototype.  Returns 0 in r0.
*
* Structure: outer loop over row blocks of 8 (then a 1-row tail loop),
* inner loop over all n columns (unrolled by 8, then a column tail).
* The F path handles inc_x == inc_y == 1; the S path handles other strides.
* NOTE(review): zero increments bail out below, but negative increments get
* no start-pointer adjustment here -- verify that callers never pass them.
**************************************************************************************/

	PROLOGUE

	.align 5
	push	{r4 - r9 , fp}			// save callee-saved core regs (7 words)
	add	fp, sp, #28			// fp = caller's sp; stack args at [fp, #0..]
	sub	sp, sp, #STACKSIZE		// reserve stack

	sub	r12, fp, #192			// spill area for callee-saved VFP regs
#if defined(DOUBLE)
	vstm	r12, { d8 - d15 }		// store floating point registers
#else
	vstm	r12, { s8 - s31 }		// store floating point registers
#endif

	cmp	OLD_M, #0			// m <= 0: nothing to do
	ble	gemvn_kernel_L999
	cmp	N, #0				// n <= 0: nothing to do
	ble	gemvn_kernel_L999

	str	OLD_A, A			// spill A and m: r0/r3 are reused below
	str	OLD_M, M

	ldr	INC_X , OLD_INC_X
	ldr	INC_Y , OLD_INC_Y

	cmp	INC_X, #0			// zero stride: bail out (returns 0)
	beq	gemvn_kernel_L999
	cmp	INC_Y, #0
	beq	gemvn_kernel_L999

	ldr	LDA, OLD_LDA
#if defined(DOUBLE)
	lsl	LDA, LDA, #3			// LDA * SIZE
#else
	lsl	LDA, LDA, #2			// LDA * SIZE
#endif

	cmp	INC_X, #1			// any non-unit stride -> strided S path
	bne	gemvn_kernel_S8_BEGIN
	cmp	INC_Y, #1
	bne	gemvn_kernel_S8_BEGIN

/* ---------------- unit-stride (F) path ---------------- */
gemvn_kernel_F8_BEGIN:

	ldr	YO , Y				// y is walked once, straight through

	ldr	I, M
	asrs	I, I, #3			// I = M / 8
	ble	gemvn_kernel_F1_BEGIN

gemvn_kernel_F8X8:				// one 8-row block

	ldr	AO1, A				// start of this row block, column 0
	add	AO2, AO1, LDA			// prefetch pointer, one column ahead
	add	r3 , AO1, #8*SIZE		// advance saved A to the next 8 rows
	str	r3 , A

	ldr	XO , X				// x restarts for every row block

	INIT_F8

	asrs	J, N, #3			// J = N / 8
	ble	gemvn_kernel_F8X1

gemvn_kernel_F8X8_10:				// 8 columns per iteration

	KERNEL_F8X8

	subs	J, J, #1
	bne	gemvn_kernel_F8X8_10

gemvn_kernel_F8X1:

	ands	J, N , #7			// leftover columns
	ble	gemvn_kernel_F8_END

gemvn_kernel_F8X1_10:

	KERNEL_F8X1

	subs	J, J, #1
	bne	gemvn_kernel_F8X1_10

gemvn_kernel_F8_END:

	SAVE_F8					// y block += alpha * accumulators

	subs	I , I , #1
	bne	gemvn_kernel_F8X8

gemvn_kernel_F1_BEGIN:				// tail: remaining m % 8 rows, one at a time

	ldr	I, M
	ands	I, I , #7
	ble	gemvn_kernel_L999

gemvn_kernel_F1X1:

	ldr	AO1, A				// current row, column 0
	add	r3, AO1, #SIZE			// advance saved A by one row
	str	r3, A

	ldr	XO , X

	INIT_F1

	mov	J, N				// all n columns, no unrolling

gemvn_kernel_F1X1_10:

	KERNEL_F1X1

	subs	J, J, #1
	bne	gemvn_kernel_F1X1_10

gemvn_kernel_F1_END:

	SAVE_F1

	subs	I , I , #1
	bne	gemvn_kernel_F1X1

	b	gemvn_kernel_L999

/*************************************************************************************************************/
/* ---------------- strided (S) path: same structure as above ---------------- */
gemvn_kernel_S8_BEGIN:

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		// INC_X * SIZE (bytes)
	lsl	INC_Y, INC_Y, #3		// INC_Y * SIZE (bytes)
#else
	lsl	INC_X, INC_X, #2		// INC_X * SIZE (bytes)
	lsl	INC_Y, INC_Y, #2		// INC_Y * SIZE (bytes)
#endif

	ldr	YO , Y

	ldr	I, M
	asrs	I, I, #3			// I = M / 8
	ble	gemvn_kernel_S1_BEGIN

gemvn_kernel_S8X8:				// one 8-row block

	ldr	AO1, A
	add	AO2, AO1, LDA
	add	r3 , AO1, #8*SIZE		// advance saved A to the next 8 rows
	str	r3 , A

	ldr	XO , X

	INIT_S8

	asrs	J, N, #3			// J = N / 8
	ble	gemvn_kernel_S8X1

gemvn_kernel_S8X8_10:

	KERNEL_S8X8

	subs	J, J, #1
	bne	gemvn_kernel_S8X8_10

gemvn_kernel_S8X1:

	ands	J, N , #7			// leftover columns
	ble	gemvn_kernel_S8_END

gemvn_kernel_S8X1_10:

	KERNEL_S8X1

	subs	J, J, #1
	bne	gemvn_kernel_S8X1_10

gemvn_kernel_S8_END:

	SAVE_S8					// strided y block += alpha * accumulators

	subs	I , I , #1
	bne	gemvn_kernel_S8X8

gemvn_kernel_S1_BEGIN:				// tail: remaining m % 8 rows

	ldr	I, M
	ands	I, I , #7
	ble	gemvn_kernel_L999

gemvn_kernel_S1X1:

	ldr	AO1, A
	add	r3, AO1, #SIZE			// advance saved A by one row
	str	r3, A

	ldr	XO , X

	INIT_S1

	mov	J, N

gemvn_kernel_S1X1_10:

	KERNEL_S1X1

	subs	J, J, #1
	bne	gemvn_kernel_S1X1_10

gemvn_kernel_S1_END:

	SAVE_S1

	subs	I , I , #1
	bne	gemvn_kernel_S1X1

	/* falls through to the common exit */

/*************************************************************************************************************/

gemvn_kernel_L999:

	sub	r3, fp, #192			// same spill area as the prologue
#if defined(DOUBLE)
	vldm	r3, { d8 - d15 }		// restore floating point registers
#else
	vldm	r3, { s8 - s31 }		// restore floating point registers
#endif

	mov	r0, #0				// set return value

	sub	sp, fp, #28			// undo the STACKSIZE reservation
	pop	{r4 -r9 ,fp}
	bx	lr

	EPILOGUE