You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_4x4_vfpv3.S 22 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. *
  34. * 2013/11/02 Saar
  35. * UNROLL_N 4
  36. * UNROLL_M 4
  37. * DGEMM_P 128
  38. * DGEMM_Q 240
  39. * DGEMM_R 12288
  40. * A_PRE 128
  41. * B_PRE 128
  42. * C_PRE 32
  43. *
  44. * Performance on Odroid U2:
  45. *
  46. * 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
  47. * 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
  48. * 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
  49. * 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
  50. **************************************************************************************/
  51. #define ASSEMBLER
  52. #include "common.h"
  53. #define STACKSIZE 256 // bytes subtracted from sp in the prologue for the slots below
  54. #define OLD_M r0 // incoming value stored to the M slot in the prologue
  55. #define OLD_N r1 // incoming value stored to the N slot in the prologue
  56. #define OLD_K r2 // incoming value stored to the K slot in the prologue
  57. #define OLD_A r3 // incoming value stored to the A slot in the prologue
  58. #define OLD_ALPHA s0 // incoming value stored to the ALPHA slot in the prologue
  59. /******************************************************
  60. * [fp, #-128] - [fp, #-33] is reserved
  61. * for store and restore of floating point
  62. * registers (s8 - s31, 24 words = 96 bytes)
  63. *******************************************************/
  64. #define LDC [fp, #-252 ] // holds OLD_LDC shifted left by 2 (byte stride)
  65. #define M [fp, #-256 ]
  66. #define N [fp, #-260 ]
  67. #define K [fp, #-264 ]
  68. #define A [fp, #-268 ]
  69. #define ALPHA [fp, #-280] // lowest reserved word: fp - 24 - STACKSIZE
  70. #define B [fp, #4 ] // positive fp offsets lie above the pushed registers
  71. #define C [fp, #8 ] // (presumably caller-passed stack arguments; confirm against the ABI)
  72. #define OLD_LDC [fp, #12 ]
  73. #define I r0 // row-tile loop counter; reuses the OLD_M register
  74. #define J r1 // column-panel loop counter; reuses the OLD_N register
  75. #define L r2 // inner k-loop counter; reuses the OLD_K register
  76. #define AO r5 // running pointer into A, advanced by the KERNEL macros
  77. #define BO r6 // running pointer into B, advanced by the KERNEL macros
  78. #define CO1 r8 // pointer to the first output column of the current tile
  79. #define CO2 r9 // pointer to further output columns, set up inside the SAVE macros
  80. #define K1 r7 // K kept in a register for the whole function
  81. #define BC r12 // start of the current B panel; BO is reset from it for every tile
  82. #define A_PRE 128 // pld offset used on AO
  83. #define B_PRE 128 // pld offset used on BO
  84. #define C_PRE 32 // pld offset used on the output pointers in SAVE4x4
  85. /**************************************************************************************
  86. * Macro definitions
  87. **************************************************************************************/
  /*
   * INIT4x4 - clear the sixteen accumulators s16 - s31 of a 4x4 tile.
   *
   * A plain "vsub.f32 s16, s16, s16" is not a reliable way to produce zero:
   * when s16 still holds NaN or +/-Inf (for example a left-over accumulator
   * of the previous tile) the difference is NaN and every accumulator copied
   * from it is poisoned. Loading a finite constant first makes the
   * subtraction independent of the previous register contents.
   *
   * Clobbers: s16 - s31 only. No core register and no flags are touched.
   */
  .macro INIT4x4
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
        vmov.f32    s18, s16
        vmov.f32    s19, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
        vmov.f32    s22, s16
        vmov.f32    s23, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
        vmov.f32    s26, s16
        vmov.f32    s27, s16
        vmov.f32    s28, s16
        vmov.f32    s29, s16
        vmov.f32    s30, s16
        vmov.f32    s31, s16
  .endm
  /*
   * KERNEL4x4_I - first step of the software-pipelined 4x4 loop.
   *
   * Loads four values from AO into s0 - s3 and four from BO into s8 - s11,
   * writes their sixteen products into s16 - s31 with plain multiplies (so
   * the accumulators need no prior clearing), and loads the next four values
   * of each operand into s4 - s7 and s12 - s15.
   * AO and BO are each advanced by 32 bytes.
   */
  .macro KERNEL4x4_I
        pld         [ AO, #A_PRE ]
        vldmia      AO!, { s0 - s1 }
        pld         [ BO, #B_PRE ]
        vldmia      BO!, { s8 - s9 }

        vmul.f32    s16, s0, s8
        vldmia      AO!, { s2 - s3 }
        vmul.f32    s17, s1, s8
        vmul.f32    s18, s2, s8
        vldmia      BO!, { s10 - s11 }
        vmul.f32    s19, s3, s8

        vmul.f32    s20, s0, s9
        vldmia      AO!, { s4 - s5 }
        vmul.f32    s21, s1, s9
        vmul.f32    s22, s2, s9
        vldmia      AO!, { s6 - s7 }
        vmul.f32    s23, s3, s9

        vmul.f32    s24, s0, s10
        vldmia      BO!, { s12 - s13 }
        vmul.f32    s25, s1, s10
        vmul.f32    s26, s2, s10
        vldmia      BO!, { s14 - s15 }
        vmul.f32    s27, s3, s10

        vmul.f32    s28, s0, s11
        vmul.f32    s29, s1, s11
        vmul.f32    s30, s2, s11
        vmul.f32    s31, s3, s11
  .endm
  /*
   * KERNEL4x4_M2 - pipelined 4x4 step working on the second register bank.
   *
   * Accumulates the sixteen products of s4 - s7 with s12 - s15 into
   * s16 - s31 while reloading the first bank (s0 - s3 from AO, s8 - s11
   * from BO). AO and BO are each advanced by 16 bytes.
   */
  .macro KERNEL4x4_M2
        pld         [ AO, #A_PRE ]
        vmla.f32    s16, s4, s12
        vmla.f32    s17, s5, s12
        vldmia      AO!, { s0 - s3 }
        vmla.f32    s18, s6, s12
        pld         [ BO, #B_PRE ]
        vmla.f32    s19, s7, s12

        vmla.f32    s20, s4, s13
        vldmia      BO!, { s8 - s11 }
        vmla.f32    s21, s5, s13
        vmla.f32    s22, s6, s13
        vmla.f32    s23, s7, s13

        vmla.f32    s24, s4, s14
        vmla.f32    s25, s5, s14
        vmla.f32    s26, s6, s14
        vmla.f32    s27, s7, s14

        vmla.f32    s28, s4, s15
        vmla.f32    s29, s5, s15
        vmla.f32    s30, s6, s15
        vmla.f32    s31, s7, s15
  .endm
  /*
   * KERNEL4x4_M1 - pipelined 4x4 step working on the first register bank.
   *
   * Accumulates the sixteen products of s0 - s3 with s8 - s11 into
   * s16 - s31 while reloading the second bank (s4 - s7 from AO,
   * s12 - s15 from BO). AO and BO are each advanced by 16 bytes.
   */
  .macro KERNEL4x4_M1
        vmla.f32    s16, s0, s8
        vldmia      AO!, { s4 - s7 }
        vmla.f32    s17, s1, s8
        vmla.f32    s18, s2, s8
        vldmia      BO!, { s12 - s15 }
        vmla.f32    s19, s3, s8

        vmla.f32    s20, s0, s9
        vmla.f32    s21, s1, s9
        vmla.f32    s22, s2, s9
        vmla.f32    s23, s3, s9

        vmla.f32    s24, s0, s10
        vmla.f32    s25, s1, s10
        vmla.f32    s26, s2, s10
        vmla.f32    s27, s3, s10

        vmla.f32    s28, s0, s11
        vmla.f32    s29, s1, s11
        vmla.f32    s30, s2, s11
        vmla.f32    s31, s3, s11
  .endm
  /*
   * KERNEL4x4_E - drain step of the pipelined 4x4 loop.
   *
   * Accumulates the sixteen products of the already loaded second bank
   * (s4 - s7 with s12 - s15) into s16 - s31. Nothing is loaded and
   * neither AO nor BO is changed.
   */
  .macro KERNEL4x4_E
        vmla.f32    s16, s4, s12
        vmla.f32    s17, s5, s12
        vmla.f32    s18, s6, s12
        vmla.f32    s19, s7, s12

        vmla.f32    s20, s4, s13
        vmla.f32    s21, s5, s13
        vmla.f32    s22, s6, s13
        vmla.f32    s23, s7, s13

        vmla.f32    s24, s4, s14
        vmla.f32    s25, s5, s14
        vmla.f32    s26, s6, s14
        vmla.f32    s27, s7, s14

        vmla.f32    s28, s4, s15
        vmla.f32    s29, s5, s15
        vmla.f32    s30, s6, s15
        vmla.f32    s31, s7, s15
  .endm
  /*
   * KERNEL4x4_SUB - one non-pipelined 4x4 step.
   *
   * Loads s0 - s3 from AO and s8 - s11 from BO, accumulates their sixteen
   * products into s16 - s31, then advances AO and BO by 16 bytes each.
   */
  .macro KERNEL4x4_SUB
        vldr        s8,  [ BO ]
        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]

        vmla.f32    s16, s0, s8
        vldr        s2,  [ AO, #8 ]
        vmla.f32    s17, s1, s8
        vldr        s3,  [ AO, #12 ]
        vmla.f32    s18, s2, s8
        vldr        s9,  [ BO, #4 ]
        vmla.f32    s19, s3, s8

        vldr        s10, [ BO, #8 ]
        vmla.f32    s20, s0, s9
        vldr        s11, [ BO, #12 ]
        vmla.f32    s21, s1, s9
        vmla.f32    s22, s2, s9
        vmla.f32    s23, s3, s9

        vmla.f32    s24, s0, s10
        vmla.f32    s25, s1, s10
        vmla.f32    s26, s2, s10
        vmla.f32    s27, s3, s10

        vmla.f32    s28, s0, s11
        vmla.f32    s29, s1, s11
        add         AO, AO, #16
        vmla.f32    s30, s2, s11
        add         BO, BO, #16
        vmla.f32    s31, s3, s11
  .endm
  /*
   * SAVE4x4 - add alpha times the 4x4 accumulator tile to the output.
   *
   * Four output columns are addressed as CO1, CO1 + LDC, CO1 + 2*LDC and
   * CO1 + 3*LDC (LDC is the byte stride read from the stack slot). Each
   * column receives four values: s16 - s19, s20 - s23, s24 - s27 and
   * s28 - s31 respectively, scaled by ALPHA and added to what is in memory.
   * Loads, multiply-accumulates and stores of neighbouring columns are
   * interleaved. CO1 is advanced by 16 bytes on exit.
   *
   * Clobbers: r3, r4, CO2, s0, s8 - s15.
   */
  .macro SAVE4x4
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1
        vldr        s0, ALPHA
        add         r4, CO2, r3             // column 2

        vldmia      CO1, { s8 - s11 }

        vmla.f32    s8,  s0, s16
        vldr        s12, [ CO2 ]
        vmla.f32    s9,  s0, s17
        vldr        s13, [ CO2, #4 ]
        vmla.f32    s10, s0, s18
        vldr        s14, [ CO2, #8 ]
        vmla.f32    s11, s0, s19
        vldr        s15, [ CO2, #12 ]

        vmla.f32    s12, s0, s20
        vstr        s8,  [ CO1 ]
        vmla.f32    s13, s0, s21
        vstr        s9,  [ CO1, #4 ]
        vmla.f32    s14, s0, s22
        vstr        s10, [ CO1, #8 ]
        vmla.f32    s15, s0, s23
        vstr        s11, [ CO1, #12 ]

        pld         [ CO1, #C_PRE ]

        vldmia      r4, { s8 - s11 }

        vmla.f32    s8,  s0, s24
        vstr        s12, [ CO2 ]
        vmla.f32    s9,  s0, s25
        vstr        s13, [ CO2, #4 ]
        vmla.f32    s10, s0, s26
        vstr        s14, [ CO2, #8 ]
        vmla.f32    s11, s0, s27
        vstr        s15, [ CO2, #12 ]

        pld         [ CO2, #C_PRE ]

        add         CO2, r4, r3             // column 3

        vldmia      CO2, { s12 - s15 }

        vstr        s8,  [ r4 ]
        vmla.f32    s12, s0, s28
        vstr        s9,  [ r4, #4 ]
        vmla.f32    s13, s0, s29
        vstr        s10, [ r4, #8 ]
        vmla.f32    s14, s0, s30
        vstr        s11, [ r4, #12 ]
        vmla.f32    s15, s0, s31

        pld         [ r4, #C_PRE ]

        vstmia      CO2, { s12 - s15 }

        pld         [ CO2, #C_PRE ]

        add         CO1, CO1, #16
  .endm
  274. /******************************************************************************/
  /*
   * INIT2x4 - clear the eight accumulators of a 2x4 tile
   * (s16, s17, s20, s21, s24, s25, s28, s29).
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: the eight accumulators only; no core register, no flags.
   */
  .macro INIT2x4
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
        vmov.f32    s24, s16
        vmov.f32    s25, s16
        vmov.f32    s28, s16
        vmov.f32    s29, s16
  .endm
  /*
   * KERNEL2x4_SUB - one 2x4 step.
   *
   * Loads s8 - s11 from BO and s0 - s1 from AO, accumulates the eight
   * products into s16/s17, s20/s21, s24/s25 and s28/s29, then advances
   * AO by 8 bytes and BO by 16 bytes.
   */
  .macro KERNEL2x4_SUB
        vldr        s8,  [ BO ]
        vldr        s9,  [ BO, #4 ]
        vldr        s10, [ BO, #8 ]
        vldr        s11, [ BO, #12 ]

        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]

        vmla.f32    s16, s0, s8
        vmla.f32    s17, s1, s8

        vmla.f32    s20, s0, s9
        vmla.f32    s21, s1, s9

        vmla.f32    s24, s0, s10
        vmla.f32    s25, s1, s10

        vmla.f32    s28, s0, s11
        vmla.f32    s29, s1, s11

        add         AO, AO, #8
        add         BO, BO, #16
  .endm
  /*
   * SAVE2x4 - add alpha times the 2x4 accumulator tile to the output.
   *
   * Columns are CO1, CO1 + LDC, CO1 + 2*LDC and CO1 + 3*LDC; each gets two
   * values (s16/s17, s20/s21, s24/s25, s28/s29) scaled by ALPHA and added
   * to memory. CO1 is advanced by 8 bytes on exit.
   *
   * Clobbers: r3, r4, CO2, s0, s8, s9, s12, s13.
   */
  .macro SAVE2x4
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1
        add         r4, CO2, r3             // column 2

        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vldr        s9,  [ CO1, #4 ]
        vmla.f32    s8,  s0, s16
        vmla.f32    s9,  s0, s17
        vstr        s8,  [ CO1 ]
        vstr        s9,  [ CO1, #4 ]

        vldr        s12, [ CO2 ]
        vldr        s13, [ CO2, #4 ]
        vmla.f32    s12, s0, s20
        vmla.f32    s13, s0, s21
        vstr        s12, [ CO2 ]
        vstr        s13, [ CO2, #4 ]

        vldr        s8,  [ r4 ]
        vldr        s9,  [ r4, #4 ]
        vmla.f32    s8,  s0, s24
        vmla.f32    s9,  s0, s25
        vstr        s8,  [ r4 ]
        vstr        s9,  [ r4, #4 ]

        add         CO2, r4, r3             // column 3

        vldr        s12, [ CO2 ]
        vldr        s13, [ CO2, #4 ]
        vmla.f32    s12, s0, s28
        vmla.f32    s13, s0, s29
        vstr        s12, [ CO2 ]
        vstr        s13, [ CO2, #4 ]

        add         CO1, CO1, #8
  .endm
  335. /******************************************************************************/
  /*
   * INIT1x4 - clear the four accumulators of a 1x4 tile (s16, s20, s24, s28).
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: the four accumulators only; no core register, no flags.
   */
  .macro INIT1x4
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s20, s16
        vmov.f32    s24, s16
        vmov.f32    s28, s16
  .endm
  /*
   * KERNEL1x4_SUB - one 1x4 step.
   *
   * Loads s8 - s11 from BO and s0 from AO, accumulates the four products
   * into s16, s20, s24 and s28, then advances AO by 4 bytes and BO by
   * 16 bytes.
   */
  .macro KERNEL1x4_SUB
        vldr        s8,  [ BO ]
        vldr        s9,  [ BO, #4 ]
        vldr        s10, [ BO, #8 ]
        vldr        s11, [ BO, #12 ]

        vldr        s0,  [ AO ]

        vmla.f32    s16, s0, s8
        vmla.f32    s20, s0, s9
        vmla.f32    s24, s0, s10
        vmla.f32    s28, s0, s11

        add         AO, AO, #4
        add         BO, BO, #16
  .endm
  /*
   * SAVE1x4 - add alpha times the 1x4 accumulator tile to the output.
   *
   * One value per column (s16, s20, s24, s28) is scaled by ALPHA and added
   * to memory at CO1, CO1 + LDC, CO1 + 2*LDC and CO1 + 3*LDC.
   * CO1 is advanced by 4 bytes on exit.
   *
   * Clobbers: r3, r4, CO2, s0, s8, s12.
   */
  .macro SAVE1x4
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1
        add         r4, CO2, r3             // column 2

        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vmla.f32    s8,  s0, s16
        vstr        s8,  [ CO1 ]

        vldr        s12, [ CO2 ]
        vmla.f32    s12, s0, s20
        vstr        s12, [ CO2 ]

        vldr        s8,  [ r4 ]
        vmla.f32    s8,  s0, s24
        vstr        s8,  [ r4 ]

        add         CO2, r4, r3             // column 3

        vldr        s12, [ CO2 ]
        vmla.f32    s12, s0, s28
        vstr        s12, [ CO2 ]

        add         CO1, CO1, #4
  .endm
  375. /******************************************************************************/
  376. /******************************************************************************/
  /*
   * INIT4x2 - clear the eight accumulators s16 - s23 of a 4x2 tile.
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: s16 - s23 only; no core register, no flags.
   */
  .macro INIT4x2
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
        vmov.f32    s18, s16
        vmov.f32    s19, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
        vmov.f32    s22, s16
        vmov.f32    s23, s16
  .endm
  /*
   * KERNEL4x2_SUB - one 4x2 step.
   *
   * Loads s8 - s9 from BO and s0 - s3 from AO, accumulates the eight
   * products into s16 - s23, then advances AO by 16 bytes and BO by
   * 8 bytes.
   */
  .macro KERNEL4x2_SUB
        vldr        s8,  [ BO ]
        vldr        s9,  [ BO, #4 ]

        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]
        vldr        s2,  [ AO, #8 ]
        vldr        s3,  [ AO, #12 ]

        vmla.f32    s16, s0, s8
        vmla.f32    s17, s1, s8
        vmla.f32    s18, s2, s8
        vmla.f32    s19, s3, s8

        vmla.f32    s20, s0, s9
        vmla.f32    s21, s1, s9
        vmla.f32    s22, s2, s9
        vmla.f32    s23, s3, s9

        add         AO, AO, #16
        add         BO, BO, #8
  .endm
  /*
   * SAVE4x2 - add alpha times the 4x2 accumulator tile to the output.
   *
   * s16 - s19 go to the column at CO1 and s20 - s23 to the column at
   * CO1 + LDC, each scaled by ALPHA and added to memory.
   * CO1 is advanced by 16 bytes on exit.
   *
   * Clobbers: r3, CO2, s0, s8 - s15.
   */
  .macro SAVE4x2
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1

        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vldr        s9,  [ CO1, #4 ]
        vldr        s10, [ CO1, #8 ]
        vldr        s11, [ CO1, #12 ]

        vmla.f32    s8,  s0, s16
        vmla.f32    s9,  s0, s17
        vmla.f32    s10, s0, s18
        vmla.f32    s11, s0, s19

        vstr        s8,  [ CO1 ]
        vstr        s9,  [ CO1, #4 ]
        vstr        s10, [ CO1, #8 ]
        vstr        s11, [ CO1, #12 ]

        vldr        s12, [ CO2 ]
        vldr        s13, [ CO2, #4 ]
        vldr        s14, [ CO2, #8 ]
        vldr        s15, [ CO2, #12 ]

        vmla.f32    s12, s0, s20
        vmla.f32    s13, s0, s21
        vmla.f32    s14, s0, s22
        vmla.f32    s15, s0, s23

        vstr        s12, [ CO2 ]
        vstr        s13, [ CO2, #4 ]
        vstr        s14, [ CO2, #8 ]
        vstr        s15, [ CO2, #12 ]

        add         CO1, CO1, #16
  .endm
  435. /******************************************************************************/
  /*
   * INIT2x2 - clear the four accumulators of a 2x2 tile (s16, s17, s20, s21).
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: the four accumulators only; no core register, no flags.
   */
  .macro INIT2x2
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
        vmov.f32    s20, s16
        vmov.f32    s21, s16
  .endm
  /*
   * KERNEL2x2_SUB - one 2x2 step.
   *
   * Loads s8 - s9 from BO and s0 - s1 from AO, accumulates the four
   * products into s16, s17, s20 and s21, then advances AO and BO by
   * 8 bytes each.
   */
  .macro KERNEL2x2_SUB
        vldr        s8,  [ BO ]
        vldr        s9,  [ BO, #4 ]

        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]

        vmla.f32    s16, s0, s8
        vmla.f32    s17, s1, s8

        vmla.f32    s20, s0, s9
        vmla.f32    s21, s1, s9

        add         AO, AO, #8
        add         BO, BO, #8
  .endm
  /*
   * SAVE2x2 - add alpha times the 2x2 accumulator tile to the output.
   *
   * s16/s17 go to the column at CO1 and s20/s21 to the column at
   * CO1 + LDC, each scaled by ALPHA and added to memory.
   * CO1 is advanced by 8 bytes on exit.
   *
   * Clobbers: r3, CO2, s0, s8, s9, s12, s13.
   */
  .macro SAVE2x2
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1

        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vldr        s9,  [ CO1, #4 ]
        vmla.f32    s8,  s0, s16
        vmla.f32    s9,  s0, s17
        vstr        s8,  [ CO1 ]
        vstr        s9,  [ CO1, #4 ]

        vldr        s12, [ CO2 ]
        vldr        s13, [ CO2, #4 ]
        vmla.f32    s12, s0, s20
        vmla.f32    s13, s0, s21
        vstr        s12, [ CO2 ]
        vstr        s13, [ CO2, #4 ]

        add         CO1, CO1, #8
  .endm
  472. /******************************************************************************/
  /*
   * INIT1x2 - clear the two accumulators of a 1x2 tile (s16, s20).
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: s16 and s20 only; no core register, no flags.
   */
  .macro INIT1x2
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s20, s16
  .endm
  /*
   * KERNEL1x2_SUB - one 1x2 step.
   *
   * Loads s8 - s9 from BO and s0 from AO, accumulates the two products
   * into s16 and s20, then advances AO by 4 bytes and BO by 8 bytes.
   */
  .macro KERNEL1x2_SUB
        vldr        s8,  [ BO ]
        vldr        s9,  [ BO, #4 ]

        vldr        s0,  [ AO ]

        vmla.f32    s16, s0, s8
        vmla.f32    s20, s0, s9

        add         AO, AO, #4
        add         BO, BO, #8
  .endm
  /*
   * SAVE1x2 - add alpha times the 1x2 accumulator tile to the output.
   *
   * s16 goes to CO1 and s20 to CO1 + LDC, each scaled by ALPHA and added
   * to memory. CO1 is advanced by 4 bytes on exit.
   *
   * Clobbers: r3, CO2, s0, s8, s12.
   */
  .macro SAVE1x2
        ldr         r3, LDC
        add         CO2, CO1, r3            // column 1

        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vmla.f32    s8,  s0, s16
        vstr        s8,  [ CO1 ]

        vldr        s12, [ CO2 ]
        vmla.f32    s12, s0, s20
        vstr        s12, [ CO2 ]

        add         CO1, CO1, #4
  .endm
  498. /******************************************************************************/
  499. /******************************************************************************/
  /*
   * INIT4x1 - clear the four accumulators s16 - s19 of a 4x1 tile.
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: s16 - s19 only; no core register, no flags.
   */
  .macro INIT4x1
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
        vmov.f32    s18, s16
        vmov.f32    s19, s16
  .endm
  /*
   * KERNEL4x1_SUB - one 4x1 step.
   *
   * Loads s8 from BO and s0 - s3 from AO, accumulates the four products
   * into s16 - s19, then advances AO by 16 bytes and BO by 4 bytes.
   */
  .macro KERNEL4x1_SUB
        vldr        s8,  [ BO ]

        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]
        vldr        s2,  [ AO, #8 ]
        vldr        s3,  [ AO, #12 ]

        vmla.f32    s16, s0, s8
        vmla.f32    s17, s1, s8
        vmla.f32    s18, s2, s8
        vmla.f32    s19, s3, s8

        add         AO, AO, #16
        add         BO, BO, #4
  .endm
  /*
   * SAVE4x1 - add alpha times the 4x1 accumulator tile to the output.
   *
   * s16 - s19 are scaled by ALPHA and added to the four values at CO1.
   * CO1 is advanced by 16 bytes on exit.
   *
   * Clobbers: s0, s8 - s11.
   */
  .macro SAVE4x1
        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vldr        s9,  [ CO1, #4 ]
        vldr        s10, [ CO1, #8 ]
        vldr        s11, [ CO1, #12 ]

        vmla.f32    s8,  s0, s16
        vmla.f32    s9,  s0, s17
        vmla.f32    s10, s0, s18
        vmla.f32    s11, s0, s19

        vstr        s8,  [ CO1 ]
        vstr        s9,  [ CO1, #4 ]
        vstr        s10, [ CO1, #8 ]
        vstr        s11, [ CO1, #12 ]

        add         CO1, CO1, #16
  .endm
  535. /******************************************************************************/
  /*
   * INIT2x1 - clear the two accumulators of a 2x1 tile (s16, s17).
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: s16 and s17 only; no core register, no flags.
   */
  .macro INIT2x1
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
        vmov.f32    s17, s16
  .endm
  /*
   * KERNEL2x1_SUB - one 2x1 step.
   *
   * Loads s8 from BO and s0 - s1 from AO, accumulates the two products
   * into s16 and s17, then advances AO by 8 bytes and BO by 4 bytes.
   */
  .macro KERNEL2x1_SUB
        vldr        s8,  [ BO ]

        vldr        s0,  [ AO ]
        vldr        s1,  [ AO, #4 ]

        vmla.f32    s16, s0, s8
        vmla.f32    s17, s1, s8

        add         AO, AO, #8
        add         BO, BO, #4
  .endm
  /*
   * SAVE2x1 - add alpha times the 2x1 accumulator tile to the output.
   *
   * s16 and s17 are scaled by ALPHA and added to the two values at CO1.
   * CO1 is advanced by 8 bytes on exit.
   *
   * Clobbers: s0, s8, s9.
   */
  .macro SAVE2x1
        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vldr        s9,  [ CO1, #4 ]

        vmla.f32    s8,  s0, s16
        vmla.f32    s9,  s0, s17

        vstr        s8,  [ CO1 ]
        vstr        s9,  [ CO1, #4 ]

        add         CO1, CO1, #8
  .endm
  559. /******************************************************************************/
  /*
   * INIT1x1 - clear the single accumulator s16 of a 1x1 tile.
   *
   * "vsub.f32 s16, s16, s16" alone yields NaN instead of zero when s16
   * still holds NaN or +/-Inf, so a finite constant is loaded first.
   *
   * Clobbers: s16 only; no core register, no flags.
   */
  .macro INIT1x1
        vmov.f32    s16, #1.0               // finite seed (VFPv3 immediate)
        vsub.f32    s16, s16, s16           // 1.0 - 1.0 = 0.0
  .endm
  /*
   * KERNEL1x1_SUB - one 1x1 step.
   *
   * Loads s8 from BO and s0 from AO, accumulates their product into s16,
   * then advances AO and BO by 4 bytes each.
   */
  .macro KERNEL1x1_SUB
        vldr        s8,  [ BO ]
        vldr        s0,  [ AO ]

        vmla.f32    s16, s0, s8

        add         AO, AO, #4
        add         BO, BO, #4
  .endm
  /*
   * SAVE1x1 - add alpha times the 1x1 accumulator to the output.
   *
   * s16 is scaled by ALPHA and added to the value at CO1.
   * CO1 is advanced by 4 bytes on exit.
   *
   * Clobbers: s0, s8.
   */
  .macro SAVE1x1
        vldr        s0, ALPHA

        vldr        s8,  [ CO1 ]
        vmla.f32    s8,  s0, s16
        vstr        s8,  [ CO1 ]

        add         CO1, CO1, #4
  .endm
  577. /**************************************************************************************
  578. * End of macro definitions
  579. **************************************************************************************/
  /*
   * Kernel entry point (function label supplied by PROLOGUE from common.h).
   *
   * In:   r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha;
   *       B, C and ldc are read from [fp, #4], [fp, #8] and [fp, #12].
   * Out:  r0 = 0.
   * Saves and restores r4 - r9, fp and s8 - s31; BC (r12) is used without
   * being saved.
   *
   * Structure: the outer loop walks N in panels of 4 columns, then one
   * panel of 2 and one of 1 for the remainder. Inside a panel M is walked
   * in tiles of 4 rows, then 2, then 1. For every tile the inner loop runs
   * over K, then the matching SAVE macro adds alpha times the tile to C.
   * Only the 4x4 tile uses the software-pipelined KERNEL4x4_I/M1/M2/E
   * sequence (two k steps per loop pass); all other tiles run their _SUB
   * kernel eight times per pass plus a remainder loop.
   *
   * Flag discipline: ASRS, TST and ANDS do not write the V flag, so a
   * following "ble" (Z set or N != V) would depend on whatever V was left
   * by an earlier instruction - at function entry by the caller. Mask tests
   * therefore branch with "beq" (the masked value is never negative) and
   * signed "count <= 0" tests use an explicit "cmp #0", which defines V.
   */
  PROLOGUE

        .align 5

        push    {r4 - r9, fp}
        add     fp, sp, #24
        sub     sp, sp, #STACKSIZE              // reserve stack

        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA, ALPHA

        sub     r3, fp, #128
        vstm    r3, { s8 - s31 }                // store floating point registers

        ldr     r3, OLD_LDC
        lsl     r3, r3, #2                      // ldc = ldc * 4 (bytes)
        str     r3, LDC

        ldr     K1, K
        ldr     BC, B

        ldr     J, N
        asr     J, J, #2                        // J = N / 4
        cmp     J, #0
        ble     sgemm_kernel_L2_BEGIN

  /*********************************************************************************************/
  /* panels of 4 columns                                                                       */
  /*********************************************************************************************/

  sgemm_kernel_L4_BEGIN:

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #2                    // LDC * 4
        add     r3 , r4, CO1
        str     r3 , C                          // C now points at the next panel

        ldr     AO, A                           // AO = A
        pld     [AO , #A_PRE-64]
        pld     [AO , #A_PRE-32]

  sgemm_kernel_L4_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L4_M2_BEGIN

  sgemm_kernel_L4_M4_20:

        mov     BO, BC
        asr     L , K1, #1                      // L = K / 2
        cmp     L , #2
        blt     sgemm_kernel_L4_M4_32

        KERNEL4x4_I
        KERNEL4x4_M2

        subs    L, L, #2
        ble     sgemm_kernel_L4_M4_22a
        .align 5

  sgemm_kernel_L4_M4_22:

        KERNEL4x4_M1
        KERNEL4x4_M2

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M4_22

  sgemm_kernel_L4_M4_22a:

        KERNEL4x4_M1
        KERNEL4x4_E

        b       sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_32:

        tst     L, #1                           // here K / 2 < 2: one pair of k steps or none
        beq     sgemm_kernel_L4_M4_40

        KERNEL4x4_I
        KERNEL4x4_E

        b       sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_40:

        INIT4x4

  sgemm_kernel_L4_M4_44:

        ands    L , K1, #1                      // L = K % 2
        beq     sgemm_kernel_L4_M4_100

  sgemm_kernel_L4_M4_46:

        KERNEL4x4_SUB

        subs    L, L, #1
        bne     sgemm_kernel_L4_M4_46

  sgemm_kernel_L4_M4_100:

        SAVE4x4

  sgemm_kernel_L4_M4_END:

        subs    I, I, #1
        bne     sgemm_kernel_L4_M4_20

  sgemm_kernel_L4_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left over after the tiles of 4?
        beq     sgemm_kernel_L4_END

        tst     I, #2                           // is there a tile of 2 rows?
        beq     sgemm_kernel_L4_M1_BEGIN

  sgemm_kernel_L4_M2_20:

        INIT2x4

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L4_M2_40

  sgemm_kernel_L4_M2_22:

        .rept 8
        KERNEL2x4_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M2_22

  sgemm_kernel_L4_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L4_M2_100

  sgemm_kernel_L4_M2_42:

        KERNEL2x4_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M2_42

  sgemm_kernel_L4_M2_100:

        SAVE2x4

  sgemm_kernel_L4_M2_END:

  sgemm_kernel_L4_M1_BEGIN:

        tst     I, #1                           // is there a single row left?
        beq     sgemm_kernel_L4_END

  sgemm_kernel_L4_M1_20:

        INIT1x4

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L4_M1_40

  sgemm_kernel_L4_M1_22:

        .rept 8
        KERNEL1x4_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M1_22

  sgemm_kernel_L4_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L4_M1_100

  sgemm_kernel_L4_M1_42:

        KERNEL1x4_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M1_42

  sgemm_kernel_L4_M1_100:

        SAVE1x4

  sgemm_kernel_L4_END:

        mov     r3, BC
        mov     r4, K1
        lsl     r4, r4, #4                      // k * 4 * 4
        add     r3, r3, r4                      // B = B + K * 4 * 4
        mov     BC, r3

        subs    J , #1                          // j--
        bgt     sgemm_kernel_L4_BEGIN

  /*********************************************************************************************/
  /* panel of 2 columns                                                                        */
  /*********************************************************************************************/

  sgemm_kernel_L2_BEGIN:

        ldr     J , N
        tst     J , #3                          // any columns left over after the panels of 4?
        beq     sgemm_kernel_L999

        tst     J , #2                          // is there a panel of 2 columns?
        beq     sgemm_kernel_L1_BEGIN

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // C now points at the next panel

        ldr     AO, A                           // AO = A

  sgemm_kernel_L2_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L2_M2_BEGIN

  sgemm_kernel_L2_M4_20:

        INIT4x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M4_40
        .align 5

  sgemm_kernel_L2_M4_22:

        .rept 8
        KERNEL4x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M4_22

  sgemm_kernel_L2_M4_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M4_100

  sgemm_kernel_L2_M4_42:

        KERNEL4x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M4_42

  sgemm_kernel_L2_M4_100:

        SAVE4x2

  sgemm_kernel_L2_M4_END:

        subs    I, I, #1
        bgt     sgemm_kernel_L2_M4_20

  sgemm_kernel_L2_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left over after the tiles of 4?
        beq     sgemm_kernel_L2_END

        tst     I, #2                           // is there a tile of 2 rows?
        beq     sgemm_kernel_L2_M1_BEGIN

  sgemm_kernel_L2_M2_20:

        INIT2x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M2_40

  sgemm_kernel_L2_M2_22:

        .rept 8
        KERNEL2x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M2_22

  sgemm_kernel_L2_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M2_100

  sgemm_kernel_L2_M2_42:

        KERNEL2x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M2_42

  sgemm_kernel_L2_M2_100:

        SAVE2x2

  sgemm_kernel_L2_M2_END:

  sgemm_kernel_L2_M1_BEGIN:

        tst     I, #1                           // is there a single row left?
        beq     sgemm_kernel_L2_END

  sgemm_kernel_L2_M1_20:

        INIT1x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M1_40

  sgemm_kernel_L2_M1_22:

        .rept 8
        KERNEL1x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M1_22

  sgemm_kernel_L2_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M1_100

  sgemm_kernel_L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M1_42

  sgemm_kernel_L2_M1_100:

        SAVE1x2

  sgemm_kernel_L2_END:

        mov     r3, BC
        mov     r4, K1
        lsl     r4, r4, #3                      // k * 2 * 4
        add     r3, r3, r4                      // B = B + K * 2 * 4
        mov     BC, r3

  /*********************************************************************************************/
  /* panel of 1 column                                                                         */
  /*********************************************************************************************/

  sgemm_kernel_L1_BEGIN:

        ldr     J , N
        tst     J , #1                          // is there a single column left?
        beq     sgemm_kernel_L999

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // C now points past this column

        ldr     AO, A                           // AO = A

  sgemm_kernel_L1_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L1_M2_BEGIN

  sgemm_kernel_L1_M4_20:

        INIT4x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M4_40
        .align 5

  sgemm_kernel_L1_M4_22:

        .rept 8
        KERNEL4x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M4_22

  sgemm_kernel_L1_M4_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M4_100

  sgemm_kernel_L1_M4_42:

        KERNEL4x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M4_42

  sgemm_kernel_L1_M4_100:

        SAVE4x1

  sgemm_kernel_L1_M4_END:

        subs    I, I, #1
        bgt     sgemm_kernel_L1_M4_20

  sgemm_kernel_L1_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left over after the tiles of 4?
        beq     sgemm_kernel_L1_END

        tst     I, #2                           // is there a tile of 2 rows?
        beq     sgemm_kernel_L1_M1_BEGIN

  sgemm_kernel_L1_M2_20:

        INIT2x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M2_40

  sgemm_kernel_L1_M2_22:

        .rept 8
        KERNEL2x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M2_22

  sgemm_kernel_L1_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M2_100

  sgemm_kernel_L1_M2_42:

        KERNEL2x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M2_42

  sgemm_kernel_L1_M2_100:

        SAVE2x1

  sgemm_kernel_L1_M2_END:

  sgemm_kernel_L1_M1_BEGIN:

        tst     I, #1                           // is there a single row left?
        beq     sgemm_kernel_L1_END

  sgemm_kernel_L1_M1_20:

        INIT1x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M1_40

  sgemm_kernel_L1_M1_22:

        .rept 8
        KERNEL1x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M1_22

  sgemm_kernel_L1_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M1_100

  sgemm_kernel_L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M1_42

  sgemm_kernel_L1_M1_100:

        SAVE1x1

  sgemm_kernel_L1_END:

  sgemm_kernel_L999:

        sub     r3, fp, #128
        vldm    r3, { s8 - s31 }                // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE