You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_4x2_vfp.S 13 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817
  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. **************************************************************************************/
  34. #define ASSEMBLER
  35. #include "common.h"
  /* Bytes of scratch frame reserved below the pushed core registers. */
  #define STACKSIZE       256

  /* Arguments as they arrive in registers (hard-float view). */
  #define OLD_M           r0
  #define OLD_N           r1
  #define OLD_K           r2
  #define OLD_A           r3
  #define OLD_ALPHA       s0

  /******************************************************
  * Frame layout, addressed relative to fp.
  * [fp, #-128] - [fp, #-64] is reserved for the store
  * and restore of floating point registers.
  *******************************************************/
  #define LDC             [fp, #-252]
  #define M               [fp, #-256]
  #define N               [fp, #-260]
  #define K               [fp, #-264]
  #define A               [fp, #-268]

  /* Two zeroed words; the INIT macros load accumulators from FP_ZERO. */
  #define FP_ZERO         [fp, #-240]
  #define FP_ZERO_0       [fp, #-240]
  #define FP_ZERO_1       [fp, #-236]

  #define ALPHA           [fp, #-280]

  /* Stack-passed arguments. Without the VFP calling convention alpha  */
  /* arrives in r3, which moves A and everything after it one slot up. */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHA_SOFTFP        r3
  #define OLD_A_SOFTFP            [fp, #4]
  #define B                       [fp, #8]
  #define C                       [fp, #12]
  #define OLD_LDC                 [fp, #16]
  #else
  #define B                       [fp, #4]
  #define C                       [fp, #8]
  #define OLD_LDC                 [fp, #12]
  #endif

  /* Working registers of the kernel. */
  #define I               r0      /* row-tile counter / copy of M       */
  #define J               r1      /* column counter                     */
  #define L               r2      /* inner (K) loop counter             */

  #define AO              r5      /* running pointer into A             */
  #define BO              r6      /* running pointer into B             */

  #define CO1             r8      /* first column of the current C tile */
  #define CO2             r9      /* second column (CO1 + LDC)          */

  #define K1              r7      /* K kept in a register               */
  #define BC              r12     /* start of the current B panel       */

  /* Prefetch distances in bytes. */
  #define A_PRE           96
  #define B_PRE           96
  #define C_PRE           64
  79. /**************************************************************************************
  80. * Macro definitions
  81. **************************************************************************************/
  .macro INIT4x2

          // Clear the eight accumulators of a 4x2 tile: s8 is loaded from
          // the zeroed FP_ZERO stack slot and copied into s9-s15.
          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8
          vmov.f32        s10, s8
          vmov.f32        s11, s8
          vmov.f32        s12, s8
          vmov.f32        s13, s8
          vmov.f32        s14, s8
          vmov.f32        s15, s8

  .endm
  .macro KERNEL4x2_SUB

          // One k-step of the 4x2 tile.
          // s0-s3 <- four floats at AO, s4-s5 <- two floats at BO;
          // both pointers advance past what was read.
          vldmia.f32      AO!, { s0 - s3 }
          vldmia.f32      BO!, { s4 - s5 }

          // First column: s8..s11 += a[0..3] * b[0]
          vmla.f32        s8 , s0, s4
          vmla.f32        s9 , s1, s4
          vmla.f32        s10, s2, s4
          vmla.f32        s11, s3, s4

          // Second column: s12..s15 += a[0..3] * b[1]
          vmla.f32        s12, s0, s5
          vmla.f32        s13, s1, s5
          vmla.f32        s14, s2, s5
          vmla.f32        s15, s3, s5

  .endm
  .macro SAVE4x2

          // Write back a 4x2 tile: C += alpha * accumulators.
          // Overwrites r3, CO2, s0 and s4-s7; CO1 ends 16 bytes further on.
          ldr             r3 , LDC
          add             CO2, CO1, r3            // CO2 = second column

          vldr            s0 , ALPHA

          // Column at CO1 <- s8..s11
          vldr            s4 , [CO1]
          vldr            s5 , [CO1, #4]
          vldr            s6 , [CO1, #8]
          vldr            s7 , [CO1, #12]

          vmla.f32        s4 , s0, s8
          vmla.f32        s5 , s0, s9
          vmla.f32        s6 , s0, s10
          vmla.f32        s7 , s0, s11

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4]
          vstr            s6 , [CO1, #8]
          vstr            s7 , [CO1, #12]

          // Column at CO2 <- s12..s15
          vldr            s4 , [CO2]
          vldr            s5 , [CO2, #4]
          vldr            s6 , [CO2, #8]
          vldr            s7 , [CO2, #12]

          vmla.f32        s4 , s0, s12
          vmla.f32        s5 , s0, s13
          vmla.f32        s6 , s0, s14
          vmla.f32        s7 , s0, s15

          vstr            s4 , [CO2]
          vstr            s5 , [CO2, #4]
          vstr            s6 , [CO2, #8]
          vstr            s7 , [CO2, #12]

          add             CO1, CO1, #16           // next 4 rows

  .endm
  134. /******************************************************************************/
  .macro INIT2x2

          // Clear the four accumulators of a 2x2 tile (s8, s9, s12, s13)
          // starting from the zeroed FP_ZERO stack slot.
          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8
          vmov.f32        s12, s8
          vmov.f32        s13, s8

  .endm
  .macro KERNEL2x2_SUB

          // One k-step of the 2x2 tile: two floats of B, two floats of A.
          vldr            s4 , [BO]
          vldr            s5 , [BO, #4]
          add             BO , BO, #8

          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          add             AO , AO, #8

          // s8,s9 += a[0..1] * b[0]
          vmla.f32        s8 , s0, s4
          vmla.f32        s9 , s1, s4

          // s12,s13 += a[0..1] * b[1]
          vmla.f32        s12, s0, s5
          vmla.f32        s13, s1, s5

  .endm
  .macro SAVE2x2

          // Write back a 2x2 tile: C += alpha * accumulators.
          // Overwrites r3, CO2, s0, s4 and s5; CO1 ends 8 bytes further on.
          ldr             r3 , LDC
          add             CO2, CO1, r3            // CO2 = second column

          vldr            s0 , ALPHA

          // Column at CO1 <- s8, s9
          vldr            s4 , [CO1]
          vldr            s5 , [CO1, #4]

          vmla.f32        s4 , s0, s8
          vmla.f32        s5 , s0, s9

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4]

          // Column at CO2 <- s12, s13
          vldr            s4 , [CO2]
          vldr            s5 , [CO2, #4]

          vmla.f32        s4 , s0, s12
          vmla.f32        s5 , s0, s13

          vstr            s4 , [CO2]
          vstr            s5 , [CO2, #4]

          add             CO1, CO1, #8            // next 2 rows

  .endm
  171. /******************************************************************************/
  .macro INIT1x2

          // Clear the two accumulators of a 1x2 tile (s8 and s12).
          vldr            s8 , FP_ZERO
          vmov.f32        s12, s8

  .endm
  .macro KERNEL1x2_SUB

          // One k-step of the 1x2 tile: two floats of B, one float of A.
          vldr            s4 , [BO]
          vldr            s5 , [BO, #4]
          add             BO , BO, #8

          vldr            s0 , [AO]
          add             AO , AO, #4

          vmla.f32        s8 , s0, s4             // s8  += a * b[0]
          vmla.f32        s12, s0, s5             // s12 += a * b[1]

  .endm
  .macro SAVE1x2

          // Write back a 1x2 tile: C += alpha * accumulators.
          // Overwrites r3, CO2, s0 and s4; CO1 ends 4 bytes further on.
          ldr             r3 , LDC
          add             CO2, CO1, r3            // CO2 = second column

          vldr            s0 , ALPHA

          vldr            s4 , [CO1]
          vmla.f32        s4 , s0, s8
          vstr            s4 , [CO1]

          vldr            s4 , [CO2]
          vmla.f32        s4 , s0, s12
          vstr            s4 , [CO2]

          add             CO1, CO1, #4            // next row

  .endm
  197. /******************************************************************************/
  .macro INIT4x1

          // Clear the four accumulators of a 4x1 tile (s8-s11).
          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8
          vmov.f32        s10, s8
          vmov.f32        s11, s8

  .endm
  .macro KERNEL4x1_SUB

          // One k-step of the 4x1 tile: one float of B, four floats of A.
          vldr            s4 , [BO]
          add             BO , BO, #4

          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          vldr            s2 , [AO, #8]
          vldr            s3 , [AO, #12]
          add             AO , AO, #16

          // s8..s11 += a[0..3] * b
          vmla.f32        s8 , s0, s4
          vmla.f32        s9 , s1, s4
          vmla.f32        s10, s2, s4
          vmla.f32        s11, s3, s4

  .endm
  .macro SAVE4x1

          // Write back a 4x1 tile: C += alpha * accumulators.
          // Overwrites s0 and s4-s7; CO1 ends 16 bytes further on.
          vldr            s0 , ALPHA

          vldr            s4 , [CO1]
          vldr            s5 , [CO1, #4]
          vldr            s6 , [CO1, #8]
          vldr            s7 , [CO1, #12]

          vmla.f32        s4 , s0, s8
          vmla.f32        s5 , s0, s9
          vmla.f32        s6 , s0, s10
          vmla.f32        s7 , s0, s11

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4]
          vstr            s6 , [CO1, #8]
          vstr            s7 , [CO1, #12]

          add             CO1, CO1, #16           // next 4 rows

  .endm
  233. /******************************************************************************/
  .macro INIT2x1

          // Clear the two accumulators of a 2x1 tile (s8 and s9).
          vldr            s8 , FP_ZERO
          vmov.f32        s9 , s8

  .endm
  .macro KERNEL2x1_SUB

          // One k-step of the 2x1 tile: one float of B, two floats of A.
          vldr            s4 , [BO]
          add             BO , BO, #4

          vldr            s0 , [AO]
          vldr            s1 , [AO, #4]
          add             AO , AO, #8

          vmla.f32        s8 , s0, s4             // s8 += a[0] * b
          vmla.f32        s9 , s1, s4             // s9 += a[1] * b

  .endm
  .macro SAVE2x1

          // Write back a 2x1 tile: C += alpha * accumulators.
          // Overwrites s0, s4 and s5; CO1 ends 8 bytes further on.
          vldr            s0 , ALPHA

          vldr            s4 , [CO1]
          vldr            s5 , [CO1, #4]

          vmla.f32        s4 , s0, s8
          vmla.f32        s5 , s0, s9

          vstr            s4 , [CO1]
          vstr            s5 , [CO1, #4]

          add             CO1, CO1, #8            // next 2 rows

  .endm
  257. /******************************************************************************/
  .macro INIT1x1

          // Clear the single accumulator s8 from the zeroed FP_ZERO slot.
          vldr            s8 , FP_ZERO

  .endm
  .macro KERNEL1x1_SUB

          // One k-step of the 1x1 tile: s8 += a * b, then step both pointers.
          vldr            s4 , [BO]
          add             BO , BO, #4

          vldr            s0 , [AO]
          add             AO , AO, #4

          vmla.f32        s8 , s0, s4

  .endm
  .macro SAVE1x1

          // Write back one element: C += alpha * s8.
          // Overwrites s0 and s4; CO1 ends 4 bytes further on.
          vldr            s0 , ALPHA

          vldr            s4 , [CO1]
          vmla.f32        s4 , s0, s8
          vstr            s4 , [CO1]

          add             CO1, CO1, #4            // next row

  .endm
  275. /**************************************************************************************
  276. * End of macro definitions
  277. **************************************************************************************/
  PROLOGUE

          .align 5

          /******************************************************************
          * Entry and frame setup.
          *
          * In:   r0 = M, r1 = N, r2 = K, alpha in s0 and A in r3
          *       (without __ARM_PCS_VFP: alpha in r3, A at [fp, #4]);
          *       B, C and ldc are read from the caller stack.
          * Does: saves r4-r9/fp and s8-s15, spills M/N/K/A/alpha to the
          *       frame, zeroes the FP_ZERO words, converts ldc from
          *       elements to bytes, and loads K1, BC and J.
          * Out:  falls through to the two-column loop when N / 2 > 0,
          *       otherwise branches to sgemm_kernel_L1_BEGIN.
          ******************************************************************/

          push    {r4 - r9, fp}
          add     fp, sp, #24                     // fp -> saved fp slot; stack args start at [fp, #4]
          sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
          vmov    OLD_ALPHA, OLD_ALPHA_SOFTFP     // alpha arrives in a core register
          ldr     OLD_A, OLD_A_SOFTFP             // A arrives on the stack
  #endif

          str     OLD_M, M
          str     OLD_N, N
          str     OLD_K, K
          str     OLD_A, A
          vstr    OLD_ALPHA, ALPHA

          sub     r3, fp, #128
          vstm    r3, { s8 - s15}                 // store floating point registers

          movs    r4, #0                          // zero words used by the INIT macros
          str     r4, FP_ZERO
          str     r4, FP_ZERO_1

          ldr     r3, OLD_LDC
          lsl     r3, r3, #2                      // ldc = ldc * 4
          str     r3, LDC

          ldr     K1, K
          ldr     BC, B

          ldr     J, N
          asr     J, J, #1                        // J = N / 2

          // The LE condition reads the V flag, which shift instructions do
          // not write. Compare explicitly so the branch does not depend on
          // flag state left behind by the caller.
          cmp     J, #0
          ble     sgemm_kernel_L1_BEGIN
  /*********************************************************************************************
  * Two columns of C per pass (J counts the remaining column pairs).
  *
  * For every pair: 4-row tiles first, then one 2-row tile and one 1-row
  * tile for the remainder of M. Each tile walks K in blocks of eight
  * k-steps followed by the K % 8 tail.
  *
  * Branches that follow tst/ands use EQ and branches that follow a shift
  * use an explicit cmp: the LE condition also reads the V flag, which
  * none of those instructions update.
  *********************************************************************************************/

  sgemm_kernel_L2_BEGIN:

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          lsl     r4 , r4 , #1                    // LDC * 2
          add     r3 , r4, CO1
          str     r3 , C                          // C slot moves on by two columns

          ldr     AO, A                           // AO = A

  sgemm_kernel_L2_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     sgemm_kernel_L2_M2_BEGIN

  sgemm_kernel_L2_M4_20:

          INIT4x2

          mov     BO, BC                          // restart the B panel for this tile
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L2_M4_40
          .align 5

  sgemm_kernel_L2_M4_22:

          pld     [ AO, #A_PRE ]
          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO, #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          pld     [ AO, #A_PRE ]
          pld     [ BO, #B_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB
          pld     [ AO, #A_PRE ]
          KERNEL4x2_SUB
          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M4_22

  sgemm_kernel_L2_M4_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L2_M4_100

  sgemm_kernel_L2_M4_42:

          KERNEL4x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M4_42

  sgemm_kernel_L2_M4_100:

          SAVE4x2

  sgemm_kernel_L2_M4_END:

          subs    I, I, #1
          bgt     sgemm_kernel_L2_M4_20

  sgemm_kernel_L2_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left after the 4-row tiles?
          beq     sgemm_kernel_L2_END

          tst     I, #2                           // is there a 2-row tile?
          beq     sgemm_kernel_L2_M1_BEGIN

  sgemm_kernel_L2_M2_20:

          INIT2x2

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L2_M2_40

  sgemm_kernel_L2_M2_22:

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB
          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M2_22

  sgemm_kernel_L2_M2_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L2_M2_100

  sgemm_kernel_L2_M2_42:

          KERNEL2x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M2_42

  sgemm_kernel_L2_M2_100:

          SAVE2x2

  sgemm_kernel_L2_M2_END:

  sgemm_kernel_L2_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     sgemm_kernel_L2_END

  sgemm_kernel_L2_M1_20:

          INIT1x2

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L2_M1_40

  sgemm_kernel_L2_M1_22:

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB
          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M1_22

  sgemm_kernel_L2_M1_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L2_M1_100

  sgemm_kernel_L2_M1_42:

          KERNEL1x2_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L2_M1_42

  sgemm_kernel_L2_M1_100:

          SAVE1x2

  sgemm_kernel_L2_END:

          mov     r3, BC
          mov     r4, K1
          lsl     r4, r4, #3                      // k * 2 * 4
          add     r3, r3, r4                      // B = B + K * 2 * 4
          mov     BC, r3

          subs    J , #1                          // j--
          bgt     sgemm_kernel_L2_BEGIN
  /*********************************************************************************************
  * Last single column of C, handled only when N is odd.
  *
  * Same tiling over M as the two-column loop: 4-row tiles, then one
  * 2-row tile and one 1-row tile, each walking K in blocks of eight
  * k-steps plus the K % 8 tail.
  *
  * Branches that follow tst/ands use EQ and branches that follow a shift
  * use an explicit cmp: the LE condition also reads the V flag, which
  * none of those instructions update.
  *********************************************************************************************/

  sgemm_kernel_L1_BEGIN:

          ldr     J , N
          tst     J , #1                          // odd N leaves one column
          beq     sgemm_kernel_L999

          ldr     CO1, C                          // CO1 = C
          ldr     r4 , LDC
          add     r3 , r4, CO1
          str     r3 , C                          // C slot moves on by one column

          ldr     AO, A                           // AO = A

  sgemm_kernel_L1_M4_BEGIN:

          ldr     I, M
          asr     I, I, #2                        // I = M / 4
          cmp     I, #0
          ble     sgemm_kernel_L1_M2_BEGIN

  sgemm_kernel_L1_M4_20:

          INIT4x1

          mov     BO, BC                          // restart the B panel for this tile
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L1_M4_40
          .align 5

  sgemm_kernel_L1_M4_22:

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB
          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M4_22

  sgemm_kernel_L1_M4_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L1_M4_100

  sgemm_kernel_L1_M4_42:

          KERNEL4x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M4_42

  sgemm_kernel_L1_M4_100:

          SAVE4x1

  sgemm_kernel_L1_M4_END:

          subs    I, I, #1
          bgt     sgemm_kernel_L1_M4_20

  sgemm_kernel_L1_M2_BEGIN:

          ldr     I, M
          tst     I , #3                          // any rows left after the 4-row tiles?
          beq     sgemm_kernel_L1_END

          tst     I, #2                           // is there a 2-row tile?
          beq     sgemm_kernel_L1_M1_BEGIN

  sgemm_kernel_L1_M2_20:

          INIT2x1

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L1_M2_40

  sgemm_kernel_L1_M2_22:

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB
          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M2_22

  sgemm_kernel_L1_M2_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L1_M2_100

  sgemm_kernel_L1_M2_42:

          KERNEL2x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M2_42

  sgemm_kernel_L1_M2_100:

          SAVE2x1

  sgemm_kernel_L1_M2_END:

  sgemm_kernel_L1_M1_BEGIN:

          tst     I, #1                           // is there a final single row?
          beq     sgemm_kernel_L1_END

  sgemm_kernel_L1_M1_20:

          INIT1x1

          mov     BO, BC
          asr     L , K1, #3                      // L = K / 8
          cmp     L , #0
          ble     sgemm_kernel_L1_M1_40

  sgemm_kernel_L1_M1_22:

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB
          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M1_22

  sgemm_kernel_L1_M1_40:

          ands    L , K1, #7                      // L = K % 8
          beq     sgemm_kernel_L1_M1_100

  sgemm_kernel_L1_M1_42:

          KERNEL1x1_SUB

          subs    L, L, #1
          bgt     sgemm_kernel_L1_M1_42

  sgemm_kernel_L1_M1_100:

          SAVE1x1

  sgemm_kernel_L1_END:
  sgemm_kernel_L999:

          // Common exit: return 0, put back s8-s15 from the save area at
          // fp - 128, unwind the frame and restore r4-r9/fp.
          movs    r0, #0                          // set return value

          sub     r3, fp, #128
          vldmia  r3, { s8 - s15 }                // restore floating point registers

          sub     sp, fp, #24                     // sp -> pushed r4
          pop     {r4 - r9, fp}
          bx      lr

  EPILOGUE