You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_4x4_vfpv3.S 22 kB


  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2013/11/23 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. *
  34. * 2013/11/02 Saar
  35. * UNROLL_N 4
  36. * UNROLL_M 4
  37. * DGEMM_P 128
  38. * DGEMM_Q 240
  39. * DGEMM_R 12288
  40. * A_PRE 128
  41. * B_PRE 128
  42. * C_PRE 32
  43. *
  44. * Performance on Odroid U2:
  45. *
  46. * 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
  47. * 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
  48. * 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
  49. * 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
  50. **************************************************************************************/
  51. #define ASSEMBLER
  52. #include "common.h"
  /* Bytes reserved below the pushed registers for locals and the VFP save area. */
  #define STACKSIZE 256

  /* Arguments as they arrive in core/VFP registers (hard-float view). */
  #define OLD_M r0
  #define OLD_N r1
  #define OLD_K r2
  #define OLD_A r3
  #define OLD_ALPHA s0

  /******************************************************
  * Frame layout, relative to fp:
  *   [fp, #-128] .. [fp, #-33]   save area written by the prologue with
  *                               s8 - s31 (24 registers, 96 bytes)
  *   [fp, #-240], [fp, #-236]    words cleared to zero by the prologue
  *   [fp, #-252] .. [fp, #-268]  LDC, M, N, K, A
  *   [fp, #-280]                 ALPHA
  *******************************************************/
  #define LDC [fp, #-252 ]
  #define M [fp, #-256 ]
  #define N [fp, #-260 ]
  #define K [fp, #-264 ]
  #define A [fp, #-268 ]
  #define FP_ZERO [fp, #-240]
  #define FP_ZERO_0 [fp, #-240]
  #define FP_ZERO_1 [fp, #-236]
  #define ALPHA [fp, #-280]

  /*
   * Stack-passed arguments.  Without the hard-float PCS, alpha occupies r3
   * and the A pointer becomes the first stack argument, which shifts the
   * remaining stack arguments by one word.
   */
  #if !defined(__ARM_PCS_VFP)
  #define OLD_ALPHA_SOFTFP r3
  #define OLD_A_SOFTFP [fp, #4 ]
  #define B [fp, #8 ]
  #define C [fp, #12 ]
  #define OLD_LDC [fp, #16 ]
  #else
  #define B [fp, #4 ]
  #define C [fp, #8 ]
  #define OLD_LDC [fp, #12 ]
  #endif

  /* Working registers of the kernel body. */
  #define I r0     // row-block counter / copy of M for the remainder tests
  #define J r1     // column-block counter / copy of N for the remainder tests
  #define L r2     // inner (k) loop counter
  #define AO r5    // read pointer into A
  #define BO r6    // read pointer into B
  #define CO1 r8   // pointer to the first C column of the current tile
  #define CO2 r9   // pointer to a further C column of the current tile
  #define K1 r7    // K held in a register
  #define BC r12   // start of the B panel used by the current column block

  /* Prefetch distances in bytes. */
  #define A_PRE 128
  #define B_PRE 128
  #define C_PRE 32
  96. /**************************************************************************************
  97. * Macro definitions
  98. **************************************************************************************/
  /* INIT4x4: load the word at FP_ZERO into s16 and copy it into s17 - s31,
   * so that all sixteen accumulators of the 4x4 tile start from that value. */
  .macro INIT4x4
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s22, s16
        vmov.f32        s23, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s26, s16
        vmov.f32        s27, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16
        vmov.f32        s30, s16
        vmov.f32        s31, s16
  .endm
  /* KERNEL4x4_I: first step of the software-pipelined 4x4 loop.
   * Reads four A values (s0 - s3) and four B values (s8 - s11), writes the
   * sixteen products into s16 - s31 (overwriting, so no INIT is needed), and
   * already fetches the next four A values (s4 - s7) and B values (s12 - s15).
   * AO and BO each advance by 32 bytes.  Loads stay interleaved with the
   * multiplies in the original order. */
  .macro KERNEL4x4_I
        pld             [ AO , #A_PRE ]
        vldmia.f32      AO!, { s0 - s1 }
        pld             [ BO , #B_PRE ]
        vldmia.f32      BO!, { s8 - s9 }

        vmul.f32        s16, s0, s8
        vldmia.f32      AO!, { s2 - s3 }
        vmul.f32        s17, s1, s8
        vmul.f32        s18, s2, s8
        vldmia.f32      BO!, { s10 - s11 }
        vmul.f32        s19, s3, s8

        vmul.f32        s20, s0, s9
        vldmia.f32      AO!, { s4 - s5 }
        vmul.f32        s21, s1, s9
        vmul.f32        s22, s2, s9
        vldmia.f32      AO!, { s6 - s7 }
        vmul.f32        s23, s3, s9

        vmul.f32        s24, s0, s10
        vldmia.f32      BO!, { s12 - s13 }
        vmul.f32        s25, s1, s10
        vmul.f32        s26, s2, s10
        vldmia.f32      BO!, { s14 - s15 }
        vmul.f32        s27, s3, s10

        vmul.f32        s28, s0, s11
        vmul.f32        s29, s1, s11
        vmul.f32        s30, s2, s11
        vmul.f32        s31, s3, s11
  .endm
  /* KERNEL4x4_M2: pipelined 4x4 step working on the second register set.
   * Accumulates the products of s4 - s7 (A) and s12 - s15 (B) into s16 - s31
   * while loading the next A values into s0 - s3 and the next B values into
   * s8 - s11.  AO and BO each advance by 16 bytes. */
  .macro KERNEL4x4_M2
        pld             [ AO , #A_PRE ]
        vmla.f32        s16, s4, s12
        vmla.f32        s17, s5, s12
        vldmia.f32      AO!, { s0 - s3 }
        vmla.f32        s18, s6, s12
        pld             [ BO , #B_PRE ]
        vmla.f32        s19, s7, s12

        vmla.f32        s20, s4, s13
        vldmia.f32      BO!, { s8 - s11 }
        vmla.f32        s21, s5, s13
        vmla.f32        s22, s6, s13
        vmla.f32        s23, s7, s13

        vmla.f32        s24, s4, s14
        vmla.f32        s25, s5, s14
        vmla.f32        s26, s6, s14
        vmla.f32        s27, s7, s14

        vmla.f32        s28, s4, s15
        vmla.f32        s29, s5, s15
        vmla.f32        s30, s6, s15
        vmla.f32        s31, s7, s15
  .endm
  /* KERNEL4x4_M1: pipelined 4x4 step working on the first register set.
   * Accumulates the products of s0 - s3 (A) and s8 - s11 (B) into s16 - s31
   * while loading the next A values into s4 - s7 and the next B values into
   * s12 - s15.  AO and BO each advance by 16 bytes. */
  .macro KERNEL4x4_M1
        vmla.f32        s16, s0, s8
        vldmia.f32      AO!, { s4 - s7 }
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vldmia.f32      BO!, { s12 - s15 }
        vmla.f32        s19, s3, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vmla.f32        s23, s3, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10
        vmla.f32        s26, s2, s10
        vmla.f32        s27, s3, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11
        vmla.f32        s30, s2, s11
        vmla.f32        s31, s3, s11
  .endm
  /* KERNEL4x4_E: last step of the pipelined 4x4 loop.
   * Accumulates the products of the already loaded s4 - s7 (A) and
   * s12 - s15 (B) into s16 - s31 and performs no further loads, so AO and BO
   * are left untouched. */
  .macro KERNEL4x4_E
        vmla.f32        s16, s4, s12
        vmla.f32        s17, s5, s12
        vmla.f32        s18, s6, s12
        vmla.f32        s19, s7, s12

        vmla.f32        s20, s4, s13
        vmla.f32        s21, s5, s13
        vmla.f32        s22, s6, s13
        vmla.f32        s23, s7, s13

        vmla.f32        s24, s4, s14
        vmla.f32        s25, s5, s14
        vmla.f32        s26, s6, s14
        vmla.f32        s27, s7, s14

        vmla.f32        s28, s4, s15
        vmla.f32        s29, s5, s15
        vmla.f32        s30, s6, s15
        vmla.f32        s31, s7, s15
  .endm
  /* KERNEL4x4_SUB: one stand-alone (non-pipelined) 4x4 step.
   * Loads four A values into s0 - s3 and four B values into s8 - s11,
   * accumulates the sixteen products into s16 - s31 and advances AO and BO by
   * 16 bytes each.  Loads and pointer updates stay interleaved with the
   * multiply-accumulates in the original order. */
  .macro KERNEL4x4_SUB
        vldr            s8 , [ BO ]
        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vldr            s2 , [ AO, #8 ]
        vmla.f32        s17, s1, s8
        vldr            s3 , [ AO, #12 ]
        vmla.f32        s18, s2, s8
        vldr            s9 , [ BO, #4 ]
        vmla.f32        s19, s3, s8

        vldr            s10, [ BO, #8 ]
        vmla.f32        s20, s0, s9
        vldr            s11, [ BO, #12 ]
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vmla.f32        s23, s3, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10
        vmla.f32        s26, s2, s10
        vmla.f32        s27, s3, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11
        add             AO , AO, #16
        vmla.f32        s30, s2, s11
        add             BO , BO, #16
        vmla.f32        s31, s3, s11
  .endm
  /* SAVE4x4: update a 4x4 tile of C with C = C + ALPHA * accumulator.
   * The four columns are addressed as CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC
   * (LDC is a byte stride); s16 - s19, s20 - s23, s24 - s27 and s28 - s31 hold
   * the accumulators of those columns.  Clobbers r3, r4, CO2, s0 and
   * s8 - s15, and advances CO1 by 16 bytes.  The order of loads and stores
   * to C is kept exactly as in the original sequence. */
  .macro SAVE4x4
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column
        vldr            s0 , ALPHA
        add             r4 , CO2, r3            // third column

        vldmia.f32      CO1, { s8 - s11 }

        vmla.f32        s8 , s0 , s16
        vldr            s12, [CO2]
        vmla.f32        s9 , s0 , s17
        vldr            s13, [CO2, #4 ]
        vmla.f32        s10, s0 , s18
        vldr            s14, [CO2, #8 ]
        vmla.f32        s11, s0 , s19
        vldr            s15, [CO2, #12 ]

        vmla.f32        s12, s0 , s20
        vstr            s8 , [CO1]
        vmla.f32        s13, s0 , s21
        vstr            s9 , [CO1, #4 ]
        vmla.f32        s14, s0 , s22
        vstr            s10, [CO1, #8 ]
        vmla.f32        s15, s0 , s23
        vstr            s11, [CO1, #12 ]

        pld             [ CO1 , #C_PRE ]

        vldmia.f32      r4 , { s8 - s11 }

        vmla.f32        s8 , s0 , s24
        vstr            s12, [CO2]
        vmla.f32        s9 , s0 , s25
        vstr            s13, [CO2, #4 ]
        vmla.f32        s10, s0 , s26
        vstr            s14, [CO2, #8 ]
        vmla.f32        s11, s0 , s27
        vstr            s15, [CO2, #12 ]

        pld             [ CO2 , #C_PRE ]

        add             CO2, r4 , r3            // fourth column

        vldmia.f32      CO2, { s12 - s15 }

        vstr            s8 , [r4 ]
        vmla.f32        s12, s0 , s28
        vstr            s9 , [r4 , #4 ]
        vmla.f32        s13, s0 , s29
        vstr            s10, [r4 , #8 ]
        vmla.f32        s14, s0 , s30
        vstr            s11, [r4 , #12 ]
        vmla.f32        s15, s0 , s31

        pld             [ r4 , #C_PRE ]

        vstmia.f32      CO2, { s12 - s15 }

        pld             [ CO2 , #C_PRE ]

        add             CO1, CO1, #16
  .endm
  285. /******************************************************************************/
  /* INIT2x4: load the word at FP_ZERO into s16 and copy it into the other
   * seven accumulators of the 2x4 tile (s17, s20, s21, s24, s25, s28, s29). */
  .macro INIT2x4
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s24, s16
        vmov.f32        s25, s16
        vmov.f32        s28, s16
        vmov.f32        s29, s16
  .endm
  /* KERNEL2x4_SUB: one 2x4 step.
   * Loads two A values (s0, s1) and four B values (s8 - s11), accumulates the
   * eight products into s16/s17, s20/s21, s24/s25 and s28/s29, then advances
   * AO by 8 bytes and BO by 16 bytes. */
  .macro KERNEL2x4_SUB
        vldr            s8 , [ BO ]
        vldr            s9 , [ BO, #4 ]
        vldr            s10, [ BO, #8 ]
        vldr            s11, [ BO, #12 ]

        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9

        vmla.f32        s24, s0, s10
        vmla.f32        s25, s1, s10

        vmla.f32        s28, s0, s11
        vmla.f32        s29, s1, s11

        add             AO , AO, #8
        add             BO , BO, #16
  .endm
  /* SAVE2x4: update a 2x4 tile of C with C = C + ALPHA * accumulator.
   * Columns are CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC (LDC is a byte stride);
   * the accumulators are s16/s17, s20/s21, s24/s25 and s28/s29.
   * Clobbers r3, r4, CO2, s0, s8, s9, s12, s13 and advances CO1 by 8 bytes. */
  .macro SAVE2x4
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column
        add             r4 , CO2, r3            // third column

        vldr            s0 , ALPHA

        // column 0
        vldr            s8 , [CO1]
        vldr            s9 , [CO1, #4 ]
        vmla.f32        s8 , s0 , s16
        vmla.f32        s9 , s0 , s17
        vstr            s8 , [CO1]
        vstr            s9 , [CO1, #4 ]

        // column 1
        vldr            s12, [CO2]
        vldr            s13, [CO2, #4 ]
        vmla.f32        s12, s0 , s20
        vmla.f32        s13, s0 , s21
        vstr            s12, [CO2]
        vstr            s13, [CO2, #4 ]

        // column 2
        vldr            s8 , [r4 ]
        vldr            s9 , [r4 , #4 ]
        vmla.f32        s8 , s0 , s24
        vmla.f32        s9 , s0 , s25
        vstr            s8 , [r4 ]
        vstr            s9 , [r4 , #4 ]

        // column 3
        add             CO2, r4 , r3
        vldr            s12, [CO2]
        vldr            s13, [CO2, #4 ]
        vmla.f32        s12, s0 , s28
        vmla.f32        s13, s0 , s29
        vstr            s12, [CO2]
        vstr            s13, [CO2, #4 ]

        add             CO1, CO1, #8
  .endm
  346. /******************************************************************************/
  /* INIT1x4: load the word at FP_ZERO into s16 and copy it into s20, s24 and
   * s28, the four accumulators of the 1x4 tile. */
  .macro INIT1x4
        vldr            s16, FP_ZERO
        vmov.f32        s20, s16
        vmov.f32        s24, s16
        vmov.f32        s28, s16
  .endm
  /* KERNEL1x4_SUB: one 1x4 step.
   * Loads one A value (s0) and four B values (s8 - s11), accumulates the four
   * products into s16, s20, s24 and s28, then advances AO by 4 bytes and BO
   * by 16 bytes. */
  .macro KERNEL1x4_SUB
        vldr            s0 , [ AO ]
        add             AO , AO, #4

        vldr            s8 , [ BO ]
        vldr            s9 , [ BO, #4 ]
        vldr            s10, [ BO, #8 ]
        vldr            s11, [ BO, #12 ]
        add             BO , BO, #16

        vmla.f32        s16, s0, s8
        vmla.f32        s20, s0, s9
        vmla.f32        s24, s0, s10
        vmla.f32        s28, s0, s11
  .endm
  /* SAVE1x4: update a 1x4 tile of C with C = C + ALPHA * accumulator.
   * Columns are CO1, CO1+LDC, CO1+2*LDC and CO1+3*LDC (LDC is a byte stride);
   * the accumulators are s16, s20, s24 and s28.
   * Clobbers r3, r4, CO2, s0, s8, s12 and advances CO1 by 4 bytes. */
  .macro SAVE1x4
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column
        add             r4 , CO2, r3            // third column

        vldr            s0 , ALPHA

        // column 0
        vldr            s8 , [CO1]
        vmla.f32        s8 , s0 , s16
        vstr            s8 , [CO1]

        // column 1
        vldr            s12, [CO2]
        vmla.f32        s12, s0 , s20
        vstr            s12, [CO2]

        // column 2
        vldr            s8 , [r4 ]
        vmla.f32        s8 , s0 , s24
        vstr            s8 , [r4 ]

        // column 3
        add             CO2, r4 , r3
        vldr            s12, [CO2]
        vmla.f32        s12, s0 , s28
        vstr            s12, [CO2]

        add             CO1, CO1, #4
  .endm
  386. /******************************************************************************/
  387. /******************************************************************************/
  /* INIT4x2: load the word at FP_ZERO into s16 and copy it into s17 - s23,
   * the eight accumulators of the 4x2 tile. */
  .macro INIT4x2
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
        vmov.f32        s22, s16
        vmov.f32        s23, s16
  .endm
  /* KERNEL4x2_SUB: one 4x2 step.
   * Loads four A values (s0 - s3) and two B values (s8, s9), accumulates the
   * eight products into s16 - s19 and s20 - s23, then advances AO by 16 bytes
   * and BO by 8 bytes. */
  .macro KERNEL4x2_SUB
        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]
        vldr            s2 , [ AO, #8 ]
        vldr            s3 , [ AO, #12 ]
        add             AO , AO, #16

        vldr            s8 , [ BO ]
        vldr            s9 , [ BO, #4 ]
        add             BO , BO, #8

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vmla.f32        s19, s3, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9
        vmla.f32        s22, s2, s9
        vmla.f32        s23, s3, s9
  .endm
  /* SAVE4x2: update a 4x2 tile of C with C = C + ALPHA * accumulator.
   * Columns are CO1 and CO1+LDC (LDC is a byte stride); the accumulators are
   * s16 - s19 and s20 - s23.
   * Clobbers r3, CO2, s0, s8 - s15 and advances CO1 by 16 bytes. */
  .macro SAVE4x2
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column

        vldr            s0 , ALPHA

        // column 0
        vldr            s8 , [CO1]
        vldr            s9 , [CO1, #4 ]
        vldr            s10, [CO1, #8 ]
        vldr            s11, [CO1, #12 ]

        vmla.f32        s8 , s0 , s16
        vmla.f32        s9 , s0 , s17
        vmla.f32        s10, s0 , s18
        vmla.f32        s11, s0 , s19

        vstr            s8 , [CO1]
        vstr            s9 , [CO1, #4 ]
        vstr            s10, [CO1, #8 ]
        vstr            s11, [CO1, #12 ]

        // column 1
        vldr            s12, [CO2]
        vldr            s13, [CO2, #4 ]
        vldr            s14, [CO2, #8 ]
        vldr            s15, [CO2, #12 ]

        vmla.f32        s12, s0 , s20
        vmla.f32        s13, s0 , s21
        vmla.f32        s14, s0 , s22
        vmla.f32        s15, s0 , s23

        vstr            s12, [CO2]
        vstr            s13, [CO2, #4 ]
        vstr            s14, [CO2, #8 ]
        vstr            s15, [CO2, #12 ]

        add             CO1, CO1, #16
  .endm
  446. /******************************************************************************/
  /* INIT2x2: load the word at FP_ZERO into s16 and copy it into s17, s20 and
   * s21, the four accumulators of the 2x2 tile. */
  .macro INIT2x2
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s20, s16
        vmov.f32        s21, s16
  .endm
  /* KERNEL2x2_SUB: one 2x2 step.
   * Loads two A values (s0, s1) and two B values (s8, s9), accumulates the
   * four products into s16, s17, s20 and s21, then advances AO and BO by
   * 8 bytes each. */
  .macro KERNEL2x2_SUB
        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]
        add             AO , AO, #8

        vldr            s8 , [ BO ]
        vldr            s9 , [ BO, #4 ]
        add             BO , BO, #8

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8

        vmla.f32        s20, s0, s9
        vmla.f32        s21, s1, s9
  .endm
  /* SAVE2x2: update a 2x2 tile of C with C = C + ALPHA * accumulator.
   * Columns are CO1 and CO1+LDC (LDC is a byte stride); the accumulators are
   * s16/s17 and s20/s21.
   * Clobbers r3, CO2, s0, s8, s9, s12, s13 and advances CO1 by 8 bytes. */
  .macro SAVE2x2
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column

        vldr            s0 , ALPHA

        // column 0
        vldr            s8 , [CO1]
        vldr            s9 , [CO1, #4 ]
        vmla.f32        s8 , s0 , s16
        vmla.f32        s9 , s0 , s17
        vstr            s8 , [CO1]
        vstr            s9 , [CO1, #4 ]

        // column 1
        vldr            s12, [CO2]
        vldr            s13, [CO2, #4 ]
        vmla.f32        s12, s0 , s20
        vmla.f32        s13, s0 , s21
        vstr            s12, [CO2]
        vstr            s13, [CO2, #4 ]

        add             CO1, CO1, #8
  .endm
  483. /******************************************************************************/
  /* INIT1x2: load the word at FP_ZERO into s16 and copy it into s20, the two
   * accumulators of the 1x2 tile. */
  .macro INIT1x2
        vldr            s16, FP_ZERO
        vmov.f32        s20, s16
  .endm
  /* KERNEL1x2_SUB: one 1x2 step.
   * Loads one A value (s0) and two B values (s8, s9), accumulates the two
   * products into s16 and s20, then advances AO by 4 bytes and BO by 8 bytes. */
  .macro KERNEL1x2_SUB
        vldr            s0 , [ AO ]
        add             AO , AO, #4

        vldr            s8 , [ BO ]
        vldr            s9 , [ BO, #4 ]
        add             BO , BO, #8

        vmla.f32        s16, s0, s8
        vmla.f32        s20, s0, s9
  .endm
  /* SAVE1x2: update a 1x2 tile of C with C = C + ALPHA * accumulator.
   * Columns are CO1 and CO1+LDC (LDC is a byte stride); the accumulators are
   * s16 and s20.  Clobbers r3, CO2, s0, s8, s12 and advances CO1 by 4 bytes. */
  .macro SAVE1x2
        ldr             r3 , LDC
        add             CO2, CO1, r3            // second column

        vldr            s0 , ALPHA

        // column 0
        vldr            s8 , [CO1]
        vmla.f32        s8 , s0 , s16
        vstr            s8 , [CO1]

        // column 1
        vldr            s12, [CO2]
        vmla.f32        s12, s0 , s20
        vstr            s12, [CO2]

        add             CO1, CO1, #4
  .endm
  509. /******************************************************************************/
  510. /******************************************************************************/
  /* INIT4x1: load the word at FP_ZERO into s16 and copy it into s17 - s19,
   * the four accumulators of the 4x1 tile. */
  .macro INIT4x1
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
        vmov.f32        s18, s16
        vmov.f32        s19, s16
  .endm
  /* KERNEL4x1_SUB: one 4x1 step.
   * Loads four A values (s0 - s3) and one B value (s8), accumulates the four
   * products into s16 - s19, then advances AO by 16 bytes and BO by 4 bytes. */
  .macro KERNEL4x1_SUB
        vldr            s8 , [ BO ]
        add             BO , BO, #4

        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]
        vldr            s2 , [ AO, #8 ]
        vldr            s3 , [ AO, #12 ]
        add             AO , AO, #16

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8
        vmla.f32        s18, s2, s8
        vmla.f32        s19, s3, s8
  .endm
  /* SAVE4x1: update four consecutive C elements at CO1 with
   * C = C + ALPHA * accumulator, using s16 - s19 as accumulators.
   * Clobbers s0 and s8 - s11 and advances CO1 by 16 bytes. */
  .macro SAVE4x1
        vldr            s0 , ALPHA

        vldr            s8 , [CO1]
        vldr            s9 , [CO1, #4 ]
        vldr            s10, [CO1, #8 ]
        vldr            s11, [CO1, #12 ]

        vmla.f32        s8 , s0 , s16
        vmla.f32        s9 , s0 , s17
        vmla.f32        s10, s0 , s18
        vmla.f32        s11, s0 , s19

        vstr            s8 , [CO1]
        vstr            s9 , [CO1, #4 ]
        vstr            s10, [CO1, #8 ]
        vstr            s11, [CO1, #12 ]

        add             CO1, CO1, #16
  .endm
  546. /******************************************************************************/
  /* INIT2x1: load the word at FP_ZERO into s16 and copy it into s17, the two
   * accumulators of the 2x1 tile. */
  .macro INIT2x1
        vldr            s16, FP_ZERO
        vmov.f32        s17, s16
  .endm
  /* KERNEL2x1_SUB: one 2x1 step.
   * Loads two A values (s0, s1) and one B value (s8), accumulates the two
   * products into s16 and s17, then advances AO by 8 bytes and BO by 4 bytes. */
  .macro KERNEL2x1_SUB
        vldr            s8 , [ BO ]
        add             BO , BO, #4

        vldr            s0 , [ AO ]
        vldr            s1 , [ AO, #4 ]
        add             AO , AO, #8

        vmla.f32        s16, s0, s8
        vmla.f32        s17, s1, s8
  .endm
  /* SAVE2x1: update two consecutive C elements at CO1 with
   * C = C + ALPHA * accumulator, using s16 and s17 as accumulators.
   * Clobbers s0, s8, s9 and advances CO1 by 8 bytes. */
  .macro SAVE2x1
        vldr            s0 , ALPHA

        vldr            s8 , [CO1]
        vldr            s9 , [CO1, #4 ]

        vmla.f32        s8 , s0 , s16
        vmla.f32        s9 , s0 , s17

        vstr            s8 , [CO1]
        vstr            s9 , [CO1, #4 ]

        add             CO1, CO1, #8
  .endm
  570. /******************************************************************************/
  /* INIT1x1: load the word at FP_ZERO into s16, the single accumulator of
   * the 1x1 tile. */
  .macro INIT1x1
        vldr            s16, FP_ZERO
  .endm
  /* KERNEL1x1_SUB: one 1x1 step.
   * Loads one A value (s0) and one B value (s8), accumulates their product
   * into s16, then advances AO and BO by 4 bytes each. */
  .macro KERNEL1x1_SUB
        vldr            s0 , [ AO ]
        add             AO , AO, #4

        vldr            s8 , [ BO ]
        add             BO , BO, #4

        vmla.f32        s16, s0, s8
  .endm
  /* SAVE1x1: update the C element at CO1 with C = C + ALPHA * s16.
   * Clobbers s0 and s8 and advances CO1 by 4 bytes. */
  .macro SAVE1x1
        vldr            s0 , ALPHA

        vldr            s8 , [CO1]
        vmla.f32        s8 , s0 , s16
        vstr            s8 , [CO1]

        add             CO1, CO1, #4
  .endm
  588. /**************************************************************************************
  589. * End of macro definitions
  590. **************************************************************************************/
  /**************************************************************************************
  * Kernel body.  The entry symbol is emitted by PROLOGUE and closed by EPILOGUE;
  * both come from common.h and are not visible in this file.
  *
  * Arguments as read by this code:
  *   with __ARM_PCS_VFP : r0 = M, r1 = N, r2 = K, s0 = alpha, r3 = A,
  *                        stack = B, C, ldc
  *   otherwise          : r0 = M, r1 = N, r2 = K, r3 = alpha (raw bits),
  *                        stack = A, B, C, ldc
  * ldc is multiplied by 4 to obtain the byte stride kept in the LDC slot.
  *
  * Structure: columns of C are handled in blocks of 4, then 2, then 1; inside
  * each column block rows are handled in blocks of 4, then 2, then 1.  A is
  * read sequentially through AO within a column block, B is re-read from BC
  * for every row block, and BC advances by K * (block width) * 4 bytes after
  * the 4-wide and 2-wide column blocks.  Every tile is written back as
  * C = C + alpha * accumulated products.
  *
  * Returns 0 in r0.  s8 - s31 are saved on entry and restored on exit.
  *
  * Flag discipline: flag-setting shifts and logical instructions (ASRS, TST,
  * ANDS) update N, Z and C but leave V unchanged, and the procedure call
  * standard does not define the flags on entry.  Signed branches (BLE, BGT,
  * BLT) are therefore only taken after CMP or SUBS, and "result is zero" tests
  * after TST / ANDS use BEQ.
  **************************************************************************************/
  PROLOGUE

        .align 5

        push    {r4 - r9, fp}
        add     fp, sp, #24
        sub     sp, sp, #STACKSIZE              // reserve stack

  #if !defined(__ARM_PCS_VFP)
        vmov    OLD_ALPHA, OLD_ALPHA_SOFTFP     // alpha arrives in r3
        ldr     OLD_A, OLD_A_SOFTFP             // A arrives on the stack
  #endif

        str     OLD_M, M
        str     OLD_N, N
        str     OLD_K, K
        str     OLD_A, A
        vstr    OLD_ALPHA, ALPHA

        sub     r3, fp, #128
        vstm    r3, { s8 - s31}                 // store floating point registers

        movs    r4, #0
        str     r4, FP_ZERO
        str     r4, FP_ZERO_1

        ldr     r3, OLD_LDC
        lsl     r3, r3, #2                      // ldc = ldc * 4
        str     r3, LDC

        ldr     K1, K
        ldr     BC, B

        ldr     J, N
        asr     J, J, #2                        // J = N / 4
        cmp     J, #0                           // CMP defines V for the signed test
        ble     sgemm_kernel_L2_BEGIN

  /******************************** 4 columns of C ********************************************/

  sgemm_kernel_L4_BEGIN:

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #2                    // LDC * 4
        add     r3 , r4, CO1
        str     r3 , C                          // store C for the next column block

        ldr     AO, A                           // AO = A
        pld     [AO , #A_PRE-64]
        pld     [AO , #A_PRE-32]

  sgemm_kernel_L4_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L4_M2_BEGIN

  sgemm_kernel_L4_M4_20:

        mov     BO, BC
        asrs    L , K1, #1                      // L = K / 2 (pipelined pairs of k steps)
        cmp     L , #2
        blt     sgemm_kernel_L4_M4_32

        KERNEL4x4_I
        KERNEL4x4_M2

        subs    L, L, #2
        ble     sgemm_kernel_L4_M4_22a
        .align 5

  sgemm_kernel_L4_M4_22:

        KERNEL4x4_M1
        KERNEL4x4_M2

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M4_22

  sgemm_kernel_L4_M4_22a:

        KERNEL4x4_M1
        KERNEL4x4_E

        b       sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_32:

        tst     L, #1                           // exactly one pair of k steps?
        beq     sgemm_kernel_L4_M4_40

        KERNEL4x4_I
        KERNEL4x4_E

        b       sgemm_kernel_L4_M4_44

  sgemm_kernel_L4_M4_40:

        INIT4x4                                 // no pair executed: clear accumulators

  sgemm_kernel_L4_M4_44:

        ands    L , K1, #1                      // L = K % 2 (left-over k step)
        beq     sgemm_kernel_L4_M4_100

  sgemm_kernel_L4_M4_46:

        KERNEL4x4_SUB

        subs    L, L, #1
        bne     sgemm_kernel_L4_M4_46

  sgemm_kernel_L4_M4_100:

        SAVE4x4

  sgemm_kernel_L4_M4_END:

        subs    I, I, #1
        bne     sgemm_kernel_L4_M4_20

  sgemm_kernel_L4_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left after the 4-row blocks?
        beq     sgemm_kernel_L4_END

        tst     I, #2                           // is there a 2-row block?
        beq     sgemm_kernel_L4_M1_BEGIN

  sgemm_kernel_L4_M2_20:

        INIT2x4

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L4_M2_40

  sgemm_kernel_L4_M2_22:

        .rept 8
        KERNEL2x4_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M2_22

  sgemm_kernel_L4_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L4_M2_100

  sgemm_kernel_L4_M2_42:

        KERNEL2x4_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M2_42

  sgemm_kernel_L4_M2_100:

        SAVE2x4

  sgemm_kernel_L4_M2_END:

  sgemm_kernel_L4_M1_BEGIN:

        tst     I, #1                           // is there a single remaining row?
        beq     sgemm_kernel_L4_END

  sgemm_kernel_L4_M1_20:

        INIT1x4

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L4_M1_40

  sgemm_kernel_L4_M1_22:

        .rept 8
        KERNEL1x4_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M1_22

  sgemm_kernel_L4_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L4_M1_100

  sgemm_kernel_L4_M1_42:

        KERNEL1x4_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L4_M1_42

  sgemm_kernel_L4_M1_100:

        SAVE1x4

  sgemm_kernel_L4_END:

        mov     r3, BC
        mov     r4, K1
        lsl     r4, r4, #4                      // k * 4 * 4
        add     r3, r3, r4                      // B = B + K * 4 * 4
        mov     BC, r3

        subs    J , J, #1                       // j--
        bgt     sgemm_kernel_L4_BEGIN

  /******************************** 2 columns of C ********************************************/

  sgemm_kernel_L2_BEGIN:

        ldr     J , N
        tst     J , #3                          // any columns left after the 4-column blocks?
        beq     sgemm_kernel_L999

        tst     J , #2                          // is there a 2-column block?
        beq     sgemm_kernel_L1_BEGIN

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        lsl     r4 , r4 , #1                    // LDC * 2
        add     r3 , r4, CO1
        str     r3 , C                          // store C for the next column block

        ldr     AO, A                           // AO = A
        //pld   [AO , #A_PRE-96]
        //pld   [AO , #A_PRE-64]
        //pld   [AO , #A_PRE-32]

  sgemm_kernel_L2_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L2_M2_BEGIN

  sgemm_kernel_L2_M4_20:

        INIT4x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M4_40
        .align 5

  sgemm_kernel_L2_M4_22:

        .rept 8
        KERNEL4x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M4_22

  sgemm_kernel_L2_M4_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M4_100

  sgemm_kernel_L2_M4_42:

        KERNEL4x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M4_42

  sgemm_kernel_L2_M4_100:

        SAVE4x2

  sgemm_kernel_L2_M4_END:

        subs    I, I, #1
        bgt     sgemm_kernel_L2_M4_20

  sgemm_kernel_L2_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left after the 4-row blocks?
        beq     sgemm_kernel_L2_END

        tst     I, #2                           // is there a 2-row block?
        beq     sgemm_kernel_L2_M1_BEGIN

  sgemm_kernel_L2_M2_20:

        INIT2x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M2_40

  sgemm_kernel_L2_M2_22:

        .rept 8
        KERNEL2x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M2_22

  sgemm_kernel_L2_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M2_100

  sgemm_kernel_L2_M2_42:

        KERNEL2x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M2_42

  sgemm_kernel_L2_M2_100:

        SAVE2x2

  sgemm_kernel_L2_M2_END:

  sgemm_kernel_L2_M1_BEGIN:

        tst     I, #1                           // is there a single remaining row?
        beq     sgemm_kernel_L2_END

  sgemm_kernel_L2_M1_20:

        INIT1x2

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L2_M1_40

  sgemm_kernel_L2_M1_22:

        .rept 8
        KERNEL1x2_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M1_22

  sgemm_kernel_L2_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L2_M1_100

  sgemm_kernel_L2_M1_42:

        KERNEL1x2_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L2_M1_42

  sgemm_kernel_L2_M1_100:

        SAVE1x2

  sgemm_kernel_L2_END:

        mov     r3, BC
        mov     r4, K1
        lsl     r4, r4, #3                      // k * 2 * 4
        add     r3, r3, r4                      // B = B + K * 2 * 4
        mov     BC, r3

  /******************************** 1 column of C *********************************************/

  sgemm_kernel_L1_BEGIN:

        ldr     J , N
        tst     J , #1                          // is there a single remaining column?
        beq     sgemm_kernel_L999

        ldr     CO1, C                          // CO1 = C
        ldr     r4 , LDC
        add     r3 , r4, CO1
        str     r3 , C                          // store C

        ldr     AO, A                           // AO = A
        //pld   [AO , #A_PRE-96]
        //pld   [AO , #A_PRE-64]
        //pld   [AO , #A_PRE-32]

  sgemm_kernel_L1_M4_BEGIN:

        ldr     I, M
        asr     I, I, #2                        // I = M / 4
        cmp     I, #0
        ble     sgemm_kernel_L1_M2_BEGIN

  sgemm_kernel_L1_M4_20:

        INIT4x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M4_40
        .align 5

  sgemm_kernel_L1_M4_22:

        .rept 8
        KERNEL4x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M4_22

  sgemm_kernel_L1_M4_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M4_100

  sgemm_kernel_L1_M4_42:

        KERNEL4x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M4_42

  sgemm_kernel_L1_M4_100:

        SAVE4x1

  sgemm_kernel_L1_M4_END:

        subs    I, I, #1
        bgt     sgemm_kernel_L1_M4_20

  sgemm_kernel_L1_M2_BEGIN:

        ldr     I, M
        tst     I , #3                          // any rows left after the 4-row blocks?
        beq     sgemm_kernel_L1_END

        tst     I, #2                           // is there a 2-row block?
        beq     sgemm_kernel_L1_M1_BEGIN

  sgemm_kernel_L1_M2_20:

        INIT2x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M2_40

  sgemm_kernel_L1_M2_22:

        .rept 8
        KERNEL2x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M2_22

  sgemm_kernel_L1_M2_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M2_100

  sgemm_kernel_L1_M2_42:

        KERNEL2x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M2_42

  sgemm_kernel_L1_M2_100:

        SAVE2x1

  sgemm_kernel_L1_M2_END:

  sgemm_kernel_L1_M1_BEGIN:

        tst     I, #1                           // is there a single remaining row?
        beq     sgemm_kernel_L1_END

  sgemm_kernel_L1_M1_20:

        INIT1x1

        mov     BO, BC
        asr     L , K1, #3                      // L = K / 8
        cmp     L , #0
        ble     sgemm_kernel_L1_M1_40

  sgemm_kernel_L1_M1_22:

        .rept 8
        KERNEL1x1_SUB
        .endr

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M1_22

  sgemm_kernel_L1_M1_40:

        ands    L , K1, #7                      // L = K % 8
        beq     sgemm_kernel_L1_M1_100

  sgemm_kernel_L1_M1_42:

        KERNEL1x1_SUB

        subs    L, L, #1
        bgt     sgemm_kernel_L1_M1_42

  sgemm_kernel_L1_M1_100:

        SAVE1x1

  sgemm_kernel_L1_END:

  sgemm_kernel_L999:

        sub     r3, fp, #128
        vldm    r3, { s8 - s31}                 // restore floating point registers

        movs    r0, #0                          // set return value
        sub     sp, fp, #24
        pop     {r4 - r9, fp}
        bx      lr

  EPILOGUE