You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strmm_kernel_4x4.S 27 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6 /* converted from elements to bytes in the prologue */
  38. #define offset x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pA x15
  47. #define temp x16
  48. #define tempOffset x17
  49. #define tempK x19 /* callee-saved and already preserved by the prologue; x18 is the platform register and must not be used as a scratch register on every ABI */
  50. #define alpha0 s10
  51. #define alphaV0 v10.s[0]
  52. #define alpha1 s11
  53. #define alphaV1 v11.s[0]
  54. #define alpha2 s14
  55. #define alphaV2 v14.s[0]
  56. #define alpha3 s15
  57. #define alphaV3 v15.s[0]
  58. // 00 origM
  59. // 01 origN
  60. // 02 origK
  61. // 03 origPA
  62. // 04 origPB
  63. // 05 pC
  64. // 06 origLDC -> LDC
  65. // 07 offset
  66. // 08 counterL
  67. // 09 counterI
  68. // 10 counterJ
  69. // 11 pB
  70. // 12 pCRow0
  71. // 13 pCRow1
  72. // 14 pCRow2
  73. // 15 pA
  74. // 16 temp
  75. // 17 tempOffset
  76. // 18 platform register, not used as a working register
  77. // 19 must save tempK
  78. // 20 must save
  79. // 21 must save
  80. // 22 must save
  81. // 23 must save
  82. // 24 must save
  83. // 25 must save
  84. // 26 must save
  85. // 27 must save
  86. // 28 must save
  87. // 29 frame
  88. // 30 link
  89. // 31 sp
  90. //v00 ALPHA -> pA00, pA01
  91. //v01 pA02, pA03
  92. //v02
  93. //v03
  94. //v04 pA10, pA11
  95. //v05 pA12, pA13
  96. //v06
  97. //v07
  98. //v08 must save pB00, pB01
  99. //v09 must save pB02, pB03
  100. //v10 must save ALPHA0
  101. //v11 must save ALPHA1
  102. //v12 must save pB10, pB11
  103. //v13 must save pB12, pB13
  104. //v14 must save ALPHA2
  105. //v15 must save ALPHA3
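  106. //v16 C00, C01 (saved by the prologue, although AAPCS64 only requires v8-v15 to be preserved)
  107. //v17 C02, C03 (saved by the prologue, although AAPCS64 only requires v8-v15 to be preserved)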
  108. //v18
  109. //v19
  110. //v20 C10, C11
  111. //v21 C12, C13
  112. //v22
  113. //v23
  114. //v24 C20, C21
  115. //v25 C22, C23
  116. //v26
  117. //v27
  118. //v28 C30, C31
  119. //v29 C32, C33
  120. //v30
  121. //v31
  122. /*******************************************************************************
  123. * Macro definitions
  124. *******************************************************************************/
  125. .macro INIT4x4 // clear the eight accumulators of the 4x4 tile
  126. fmov s16, wzr // every accumulator is zeroed straight from wzr
  127. fmov s17, wzr
  128. fmov s20, wzr
  129. fmov s21, wzr
  130. fmov s24, wzr
  131. fmov s25, wzr
  132. fmov s28, wzr
  133. fmov s29, wzr
  134. .endm
  135. .macro KERNEL4x4_I // first k-step of the pipelined 4x4 loop: fmul overwrites the accumulators, then the next step is preloaded
  136. ld1 {v8.2s, v9.2s}, [pB] // v8, v9 = four B values of this k-step
  137. add pB, pB, #16
  138. ld1 {v0.2s, v1.2s}, [pA] // v0, v1 = four A values of this k-step
  139. add pA, pA, #16
  140. fmul v16.2s, v0.2s, v8.s[0] // v16/v17 = A * B[0]
  141. fmul v29.2s, v1.2s, v9.s[1] // v28/v29 = A * B[3]
  142. fmul v20.2s, v0.2s, v8.s[1] // v20/v21 = A * B[1]
  143. fmul v25.2s, v1.2s, v9.s[0] // v24/v25 = A * B[2]
  144. fmul v24.2s, v0.2s, v9.s[0]
  145. fmul v21.2s, v1.2s, v8.s[1]
  146. fmul v28.2s, v0.2s, v9.s[1]
  147. fmul v17.2s, v1.2s, v8.s[0]
  148. ld1 {v12.2s, v13.2s}, [pB] // preload B of the next k-step into the second register set
  149. add pB, pB, #16
  150. ld1 {v4.2s, v5.2s}, [pA] // preload A of the next k-step into the second register set
  151. add pA, pA, #16
  152. .endm
  153. .macro KERNEL4x4_M1 // pipelined k-step: accumulate from v0/v1 and v8/v9 while loading v4/v5 and v12/v13
  154. fmla v16.2s, v0.2s, v8.s[0]
  155. fmla v29.2s, v1.2s, v9.s[1]
  156. ld1 {v12.2s, v13.2s}, [pB] // For next round
  157. add pB, pB, #16
  158. fmla v20.2s, v0.2s, v8.s[1]
  159. fmla v25.2s, v1.2s, v9.s[0]
  160. ld1 {v4.2s, v5.2s}, [pA] // For next round
  161. add pA, pA, #16
  162. fmla v24.2s, v0.2s, v9.s[0]
  163. fmla v21.2s, v1.2s, v8.s[1]
  164. prfm PLDL1KEEP, [pB, #512] // prefetch B 512 bytes ahead
  165. fmla v28.2s, v0.2s, v9.s[1]
  166. fmla v17.2s, v1.2s, v8.s[0]
  167. .endm
  168. .macro KERNEL4x4_M2 // pipelined k-step: accumulate from v4/v5 and v12/v13 while loading v0/v1 and v8/v9
  169. fmla v16.2s, v4.2s, v12.s[0]
  170. fmla v29.2s, v5.2s, v13.s[1]
  171. ld1 {v8.2s, v9.2s}, [pB] // For next round
  172. add pB, pB, #16
  173. fmla v20.2s, v4.2s, v12.s[1]
  174. fmla v25.2s, v5.2s, v13.s[0]
  175. ld1 {v0.2s, v1.2s}, [pA] // For next round
  176. add pA, pA, #16
  177. fmla v24.2s, v4.2s, v13.s[0]
  178. fmla v21.2s, v5.2s, v12.s[1]
  179. prfm PLDL1KEEP, [pA, #512] // prefetch A 512 bytes ahead
  180. fmla v28.2s, v4.2s, v13.s[1]
  181. fmla v17.2s, v5.2s, v12.s[0]
  182. .endm
  183. .macro KERNEL4x4_E // final pipelined k-step: consume the preloaded v4/v5 and v12/v13, load nothing
  184. fmla v16.2s, v4.2s, v12.s[0]
  185. fmla v29.2s, v5.2s, v13.s[1]
  186. fmla v20.2s, v4.2s, v12.s[1]
  187. fmla v25.2s, v5.2s, v13.s[0]
  188. fmla v24.2s, v4.2s, v13.s[0]
  189. fmla v21.2s, v5.2s, v12.s[1]
  190. fmla v28.2s, v4.2s, v13.s[1]
  191. fmla v17.2s, v5.2s, v12.s[0]
  192. .endm
  193. .macro KERNEL4x4_SUB // stand-alone k-step: load four B and four A values, then accumulate into all eight registers
  194. ld1 {v8.2s, v9.2s}, [pB]
  195. add pB, pB, #16
  196. ld1 {v0.2s, v1.2s}, [pA]
  197. add pA, pA, #16
  198. fmla v16.2s, v0.2s, v8.s[0]
  199. fmla v29.2s, v1.2s, v9.s[1]
  200. fmla v20.2s, v0.2s, v8.s[1]
  201. fmla v25.2s, v1.2s, v9.s[0]
  202. fmla v24.2s, v0.2s, v9.s[0]
  203. fmla v21.2s, v1.2s, v8.s[1]
  204. fmla v28.2s, v0.2s, v9.s[1]
  205. fmla v17.2s, v1.2s, v8.s[0]
  206. .endm
  207. .macro SAVE4x4 // scale the 4x4 tile by alpha and store it; C is overwritten, never read
  208. fmul v8.2s, v16.2s, alphaV0
  209. fmul v9.2s, v17.2s, alphaV1
  210. st1 {v8.2s, v9.2s}, [pCRow0] // four floats at pCRow0
  211. add pCRow1, pCRow0, LDC
  212. fmul v12.2s, v20.2s, alphaV2
  213. fmul v13.2s, v21.2s, alphaV3
  214. st1 {v12.2s, v13.2s}, [pCRow1] // four floats at pCRow0 + LDC
  215. add pCRow2, pCRow1, LDC
  216. fmul v8.2s, v24.2s, alphaV0
  217. fmul v9.2s, v25.2s, alphaV1
  218. st1 {v8.2s, v9.2s}, [pCRow2] // four floats at pCRow0 + 2 * LDC
  219. add pCRow1, pCRow2, LDC // pCRow1 is reused for the fourth group
  220. fmul v12.2s, v28.2s, alphaV2
  221. fmul v13.2s, v29.2s, alphaV3
  222. st1 {v12.2s, v13.2s}, [pCRow1] // four floats at pCRow0 + 3 * LDC
  223. add pCRow0, pCRow0, #16 // advance past the four floats just written
  224. .endm
  225. /******************************************************************************/
  226. .macro INIT2x4 // clear the four accumulators of the 2x4 tile
  227. fmov s16, wzr // every accumulator is zeroed straight from wzr
  228. fmov s20, wzr
  229. fmov s24, wzr
  230. fmov s28, wzr
  231. .endm
  232. .macro KERNEL2x4_SUB // one k-step of the 2x4 tile: four B values times two A values
  233. ld1 {v8.2s, v9.2s}, [pB]
  234. add pB, pB, #16
  235. ld1 {v0.2s}, [pA]
  236. add pA, pA, #8
  237. fmla v16.2s, v0.2s, v8.s[0] // v16 += A * B[0]
  238. fmla v20.2s, v0.2s, v8.s[1] // v20 += A * B[1]
  239. fmla v24.2s, v0.2s, v9.s[0] // v24 += A * B[2]
  240. fmla v28.2s, v0.2s, v9.s[1] // v28 += A * B[3]
  241. .endm
  242. .macro SAVE2x4 // scale the 2x4 tile by alpha and store two floats at each of four addresses LDC bytes apart
  243. fmul v8.2s, v16.2s, alphaV0
  244. st1 {v8.2s}, [pCRow0]
  245. add pCRow1, pCRow0, LDC
  246. fmul v12.2s, v20.2s, alphaV1
  247. st1 {v12.2s}, [pCRow1]
  248. add pCRow2, pCRow1, LDC
  249. fmul v8.2s, v24.2s, alphaV2
  250. st1 {v8.2s}, [pCRow2]
  251. add pCRow1, pCRow2, LDC // pCRow1 is reused for the fourth group
  252. fmul v12.2s, v28.2s, alphaV3
  253. st1 {v12.2s}, [pCRow1]
  254. add pCRow0, pCRow0, #8 // advance past the two floats just written
  255. .endm
  256. /******************************************************************************/
  257. .macro INIT1x4 // clear the two accumulators of the 1x4 tile
  258. fmov s16, wzr // both accumulators are zeroed straight from wzr
  259. fmov s20, wzr
  260. .endm
  261. .macro KERNEL1x4_SUB // one k-step of the 1x4 tile: one A value times four B values
  262. ldr s0, [pA]
  263. add pA, pA, #4
  264. ld1 {v8.2s, v9.2s}, [pB]
  265. add pB, pB, #16
  266. fmla v16.2s, v8.2s, v0.s[0] // v16 += B[0..1] * A
  267. fmla v20.2s, v9.2s, v0.s[0] // v20 += B[2..3] * A
  268. .endm
  269. .macro SAVE1x4 // scale the 1x4 tile by alpha and store one float at each of four addresses LDC bytes apart
  270. add pCRow1, pCRow0, LDC
  271. fmul v8.2s, v16.2s, alphaV0
  272. st1 {v8.s}[0], [pCRow0] // lane 0 goes to pCRow0
  273. st1 {v8.s}[1], [pCRow1] // lane 1 goes to pCRow0 + LDC
  274. add pCRow2, pCRow1, LDC
  275. add pCRow1, pCRow2, LDC // pCRow1 now addresses pCRow0 + 3 * LDC
  276. fmul v12.2s, v20.2s, alphaV1
  277. st1 {v12.s}[0], [pCRow2]
  278. st1 {v12.s}[1], [pCRow1]
  279. add pCRow0, pCRow0, #4 // advance past the float just written
  280. .endm
  281. /******************************************************************************/
  282. .macro INIT4x2 // clear the four accumulators of the 4x2 tile
  283. fmov s16, wzr // every accumulator is zeroed straight from wzr
  284. fmov s17, wzr
  285. fmov s20, wzr
  286. fmov s21, wzr
  287. .endm
  288. .macro KERNEL4x2_SUB // one k-step of the 4x2 tile: two B values times four A values
  289. ld1 {v8.2s}, [pB]
  290. add pB, pB, #8
  291. ld1 {v0.2s, v1.2s}, [pA]
  292. add pA, pA, #16
  293. fmla v16.2s, v0.2s, v8.s[0] // v16/v17 += A * B[0]
  294. fmla v17.2s, v1.2s, v8.s[0]
  295. fmla v20.2s, v0.2s, v8.s[1] // v20/v21 += A * B[1]
  296. fmla v21.2s, v1.2s, v8.s[1]
  297. .endm
  298. .macro SAVE4x2 // scale the 4x2 tile by alpha and store four floats at pCRow0 and at pCRow0 + LDC
  299. fmul v8.2s, v16.2s, alphaV0
  300. fmul v9.2s, v17.2s, alphaV1
  301. st1 {v8.2s, v9.2s}, [pCRow0]
  302. add pCRow1, pCRow0, LDC
  303. fmul v12.2s, v20.2s, alphaV2
  304. fmul v13.2s, v21.2s, alphaV3
  305. st1 {v12.2s, v13.2s}, [pCRow1]
  306. add pCRow0, pCRow0, #16 // advance past the four floats just written
  307. .endm
  308. /******************************************************************************/
  309. .macro INIT2x2 // clear the two accumulators of the 2x2 tile
  310. fmov s16, wzr // both accumulators are zeroed straight from wzr
  311. fmov s20, wzr
  312. .endm
  313. .macro KERNEL2x2_SUB // one k-step of the 2x2 tile: two B values times two A values
  314. ld1 {v8.2s}, [pB]
  315. add pB, pB, #8
  316. ld1 {v0.2s}, [pA]
  317. add pA, pA, #8
  318. fmla v16.2s, v0.2s, v8.s[0] // v16 += A * B[0]
  319. fmla v20.2s, v0.2s, v8.s[1] // v20 += A * B[1]
  320. .endm
  321. .macro SAVE2x2 // scale the 2x2 tile by alpha and store two floats at pCRow0 and at pCRow0 + LDC
  322. fmul v8.2s, v16.2s, alphaV0
  323. st1 {v8.2s}, [pCRow0]
  324. add pCRow1 , pCRow0, LDC
  325. fmul v12.2s, v20.2s, alphaV1
  326. st1 {v12.2s}, [pCRow1]
  327. add pCRow0, pCRow0, #8 // advance past the two floats just written
  328. .endm
  329. /******************************************************************************/
  330. .macro INIT1x2 // clear the single accumulator of the 1x2 tile
  331. fmov s16, wzr
  332. .endm
  333. .macro KERNEL1x2_SUB // one k-step of the 1x2 tile: one A value times two B values
  334. ld1 {v8.2s} , [pB]
  335. add pB , pB, #8
  336. ldr s0 , [pA]
  337. add pA, pA, #4
  338. fmla v16.2s, v8.2s, v0.s[0] // v16 += B[0..1] * A
  339. .endm
  340. .macro SAVE1x2 // scale the 1x2 tile by alpha and store one float at pCRow0 and one at pCRow0 + LDC
  341. add pCRow1 , pCRow0, LDC
  342. fmul v8.2s, v16.2s, alphaV0
  343. st1 {v8.s}[0], [pCRow0]
  344. st1 {v8.s}[1], [pCRow1]
  345. add pCRow0, pCRow0, #4 // advance past the float just written
  346. .endm
  347. /******************************************************************************/
  348. .macro INIT4x1 // clear the two accumulators of the 4x1 tile
  349. fmov s16, wzr // both accumulators are zeroed straight from wzr
  350. fmov s17, wzr
  351. .endm
  352. .macro KERNEL4x1_SUB // one k-step of the 4x1 tile: one B value times four A values
  353. ldr s8, [pB]
  354. add pB , pB, #4
  355. ld1 {v0.2s, v1.2s}, [pA]
  356. add pA , pA, #16
  357. fmla v16.2s, v0.2s, v8.s[0]
  358. fmla v17.2s, v1.2s, v8.s[0]
  359. .endm
  360. .macro SAVE4x1 // scale the 4x1 tile by alpha and store four floats at pCRow0
  361. fmul v8.2s, v16.2s, alphaV0
  362. fmul v9.2s, v17.2s, alphaV1
  363. st1 {v8.2s, v9.2s}, [pCRow0]
  364. add pCRow0, pCRow0, #16 // advance past the four floats just written
  365. .endm
  366. /******************************************************************************/
  367. .macro INIT2x1 // clear the single accumulator of the 2x1 tile
  368. fmov s16, wzr
  369. .endm
  370. .macro KERNEL2x1_SUB // one k-step of the 2x1 tile: one B value times two A values
  371. ldr s8, [pB]
  372. add pB , pB, #4
  373. ld1 {v0.2s}, [pA]
  374. add pA , pA, #8
  375. fmla v16.2s, v0.2s, v8.s[0]
  376. .endm
  377. .macro SAVE2x1 // scale the 2x1 tile by alpha and store two floats at pCRow0
  378. fmul v8.2s, v16.2s, alphaV0
  379. st1 {v8.2s}, [pCRow0]
  380. add pCRow0, pCRow0, #8 // advance past the two floats just written
  381. .endm
  382. /******************************************************************************/
  383. .macro INIT1x1 // clear the scalar accumulator of the 1x1 tile
  384. fmov s16, wzr
  385. .endm
  386. .macro KERNEL1x1_SUB // one k-step of the 1x1 tile: scalar multiply-accumulate
  387. ldr s8, [pB]
  388. add pB , pB, #4
  389. ldr s0, [pA]
  390. add pA , pA, #4
  391. fmadd s16, s0, s8, s16 // s16 = s0 * s8 + s16
  392. .endm
  393. .macro SAVE1x1 // scale the scalar result by alpha and store one float at pCRow0
  394. fmul s8, s16, alpha0
  395. str s8, [pCRow0]
  396. add pCRow0, pCRow0, #4 // advance past the float just written
  397. .endm
  398. /*******************************************************************************
  399. * End of macro definitions
  400. *******************************************************************************/
  401. PROLOGUE
  402. .Lstrmm_kernel_begin: // function entry: save registers, broadcast alpha, set up the loop over N
  403. .align 5
  404. add sp, sp, #-(11 * 16) // reserve a 176-byte register save area
  405. stp d8, d9, [sp, #(0 * 16)]
  406. stp d10, d11, [sp, #(1 * 16)]
  407. stp d12, d13, [sp, #(2 * 16)]
  408. stp d14, d15, [sp, #(3 * 16)]
  409. stp d16, d17, [sp, #(4 * 16)]
  410. stp x18, x19, [sp, #(5 * 16)]
  411. stp x20, x21, [sp, #(6 * 16)]
  412. stp x22, x23, [sp, #(7 * 16)]
  413. stp x24, x25, [sp, #(8 * 16)]
  414. stp x26, x27, [sp, #(9 * 16)]
  415. str x28, [sp, #(10 * 16)]
  416. fmov alpha0, s0 // copy alpha out of s0 before v0 is reused for A data
  417. fmov alpha1, s0
  418. fmov alpha2, s0
  419. fmov alpha3, s0
  420. lsl LDC, LDC, #2 // ldc = ldc * 4
  421. #if !defined(LEFT)
  422. neg tempOffset, offset
  423. #endif
  424. mov pB, origPB
  425. mov counterJ, origN
  426. asr counterJ, counterJ, #2 // J = J / 4
  427. cmp counterJ, #0
  428. ble .Lstrmm_kernel_L2_BEGIN // fewer than four in N: go straight to the remainder handling
  429. /******************************************************************************/
  430. .Lstrmm_kernel_L4_BEGIN: // top of the loop over groups of four in N
  431. mov pCRow0, pC // pCRow0 = C
  432. add pC, pC, LDC, lsl #2 // pC += 4 * LDC bytes for the next group
  433. #if defined(LEFT)
  434. mov tempOffset, offset
  435. #endif
  436. mov pA, origPA // pA = start of A array
  437. .Lstrmm_kernel_L4_M4_BEGIN: // 4x4 tiles: one pass per group of four in M
  438. mov counterI, origM
  439. asr counterI, counterI, #2 // counterI = counterI / 4
  440. cmp counterI, #0
  441. ble .Lstrmm_kernel_L4_M2_BEGIN
  442. .Lstrmm_kernel_L4_M4_20:
  443. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  444. mov pB, origPB
  445. #else
  446. mov pB, origPB
  447. lsl temp, tempOffset, #4 // temp = tempOffset * 16 bytes (16 bytes per k-step for both A and B here)
  448. add pB, pB, temp // skip the first tempOffset k-steps of B
  449. add pA, pA, temp // and of A
  450. #endif
  451. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  452. sub tempK, origK, tempOffset
  453. #elif defined(LEFT)
  454. add tempK, tempOffset, #4
  455. #else
  456. add tempK, tempOffset, #4
  457. #endif
  458. asr counterL , tempK, #1 // L = tempK / 2 (the pipelined kernels handle k-steps in pairs)
  459. cmp counterL , #2 // is there at least 4 to do?
  460. blt .Lstrmm_kernel_L4_M4_32
  461. KERNEL4x4_I // do one in the K
  462. KERNEL4x4_M2 // do another in the K
  463. subs counterL, counterL, #2 // two pairs are covered by I/M2 above and M1/E below
  464. ble .Lstrmm_kernel_L4_M4_22a
  465. .align 5
  466. .Lstrmm_kernel_L4_M4_22:
  467. KERNEL4x4_M1
  468. KERNEL4x4_M2
  469. subs counterL, counterL, #1
  470. bgt .Lstrmm_kernel_L4_M4_22
  471. .Lstrmm_kernel_L4_M4_22a:
  472. KERNEL4x4_M1
  473. KERNEL4x4_E
  474. b .Lstrmm_kernel_L4_M4_44
  475. .Lstrmm_kernel_L4_M4_32:
  476. tst counterL, #1 // counterL is 0 or 1 here
  477. ble .Lstrmm_kernel_L4_M4_40 // taken when counterL is 0
  478. KERNEL4x4_I
  479. KERNEL4x4_E
  480. b .Lstrmm_kernel_L4_M4_44
  481. .Lstrmm_kernel_L4_M4_40:
  482. INIT4x4 // no pair was processed, so the accumulators must be zeroed explicitly
  483. .Lstrmm_kernel_L4_M4_44:
  484. ands counterL , tempK, #1 // is there an odd k-step left over?
  485. ble .Lstrmm_kernel_L4_M4_100
  486. .Lstrmm_kernel_L4_M4_46:
  487. KERNEL4x4_SUB
  488. .Lstrmm_kernel_L4_M4_100:
  489. SAVE4x4
  490. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  491. sub tempK, origK, tempOffset
  492. #if defined(LEFT)
  493. sub tempK, tempK, #4
  494. #else
  495. sub tempK, tempK, #4
  496. #endif
  497. lsl temp, tempK, #4 // temp = (origK - tempOffset - 4) * 16 bytes
  498. add pA, pA, temp // step over the k-steps that were not consumed
  499. add pB, pB, temp
  500. #endif
  501. #if defined(LEFT)
  502. add tempOffset, tempOffset, #4
  503. #endif
  504. .Lstrmm_kernel_L4_M4_END:
  505. subs counterI, counterI, #1
  506. bne .Lstrmm_kernel_L4_M4_20
  507. .Lstrmm_kernel_L4_M2_BEGIN: // remainder of M within the current group of four in N
  508. mov counterI, origM
  509. tst counterI , #3 // any of the low two bits of M set?
  510. ble .Lstrmm_kernel_L4_END // no: M is a multiple of 4, nothing left
  511. tst counterI, #2 // is bit 1 of M set, i.e. is a 2x4 tile needed?
  512. ble .Lstrmm_kernel_L4_M1_BEGIN
  513. .Lstrmm_kernel_L4_M2_20:
  514. INIT2x4
  515. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  516. mov pB, origPB
  517. #else
  518. mov pB, origPB
  519. lsl temp, tempOffset, #3 // A advances 8 bytes per k-step for this tile
  520. add pA, pA, temp
  521. lsl temp, tempOffset, #4 // B advances 16 bytes per k-step for this tile
  522. add pB, pB, temp
  523. #endif
  524. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  525. sub tempK, origK, tempOffset
  526. #elif defined(LEFT)
  527. add tempK, tempOffset, #2
  528. #else
  529. add tempK, tempOffset, #4
  530. #endif
  531. asr counterL , tempK, #3 // counterL = tempK / 8
  532. cmp counterL , #0
  533. ble .Lstrmm_kernel_L4_M2_40
  534. .Lstrmm_kernel_L4_M2_22: // unrolled loop: eight k-steps per iteration
  535. KERNEL2x4_SUB
  536. KERNEL2x4_SUB
  537. KERNEL2x4_SUB
  538. KERNEL2x4_SUB
  539. KERNEL2x4_SUB
  540. KERNEL2x4_SUB
  541. KERNEL2x4_SUB
  542. KERNEL2x4_SUB
  543. subs counterL, counterL, #1
  544. bgt .Lstrmm_kernel_L4_M2_22
  545. .Lstrmm_kernel_L4_M2_40:
  546. ands counterL , tempK, #7 // counterL = tempK % 8
  547. ble .Lstrmm_kernel_L4_M2_100
  548. .Lstrmm_kernel_L4_M2_42: // tail loop: one k-step per iteration
  549. KERNEL2x4_SUB
  550. subs counterL, counterL, #1
  551. bgt .Lstrmm_kernel_L4_M2_42
  552. .Lstrmm_kernel_L4_M2_100:
  553. SAVE2x4
  554. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  555. sub tempK, origK, tempOffset
  556. #if defined(LEFT)
  557. sub tempK, tempK, #2
  558. #else
  559. sub tempK, tempK, #4
  560. #endif
  561. lsl temp, tempK, #3
  562. add pA, pA, temp // step A over the k-steps that were not consumed
  563. lsl temp, tempK, #4
  564. add pB, pB, temp // step B over the k-steps that were not consumed
  565. #endif
  566. #if defined(LEFT)
  567. add tempOffset, tempOffset, #2
  568. #endif
  569. .Lstrmm_kernel_L4_M2_END:
  570. .Lstrmm_kernel_L4_M1_BEGIN: // optional 1x4 tile for an odd M
  571. tst counterI, #1 // is bit 0 of M set?
  572. ble .Lstrmm_kernel_L4_END
  573. .Lstrmm_kernel_L4_M1_20:
  574. INIT1x4
  575. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  576. mov pB, origPB
  577. #else
  578. mov pB, origPB
  579. lsl temp, tempOffset, #4 // B advances 16 bytes per k-step for this tile
  580. add pB, pB, temp
  581. lsl temp, tempOffset, #2 // A advances 4 bytes per k-step for this tile
  582. add pA, pA, temp
  583. #endif
  584. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  585. sub tempK, origK, tempOffset
  586. #elif defined(LEFT)
  587. add tempK, tempOffset, #1
  588. #else
  589. add tempK, tempOffset, #4
  590. #endif
  591. asr counterL , tempK, #3 // counterL = tempK / 8
  592. cmp counterL , #0
  593. ble .Lstrmm_kernel_L4_M1_40
  594. .Lstrmm_kernel_L4_M1_22: // unrolled loop: eight k-steps per iteration
  595. KERNEL1x4_SUB
  596. KERNEL1x4_SUB
  597. KERNEL1x4_SUB
  598. KERNEL1x4_SUB
  599. KERNEL1x4_SUB
  600. KERNEL1x4_SUB
  601. KERNEL1x4_SUB
  602. KERNEL1x4_SUB
  603. subs counterL, counterL, #1
  604. bgt .Lstrmm_kernel_L4_M1_22
  605. .Lstrmm_kernel_L4_M1_40:
  606. ands counterL , tempK, #7 // counterL = tempK % 8
  607. ble .Lstrmm_kernel_L4_M1_100
  608. .Lstrmm_kernel_L4_M1_42: // tail loop: one k-step per iteration
  609. KERNEL1x4_SUB
  610. subs counterL, counterL, #1
  611. bgt .Lstrmm_kernel_L4_M1_42
  612. .Lstrmm_kernel_L4_M1_100:
  613. SAVE1x4
  614. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  615. sub tempK, origK, tempOffset
  616. #if defined(LEFT)
  617. sub tempK, tempK, #1
  618. #else
  619. sub tempK, tempK, #4
  620. #endif
  621. lsl temp, tempK, #2
  622. add pA, pA, temp // step A over the k-steps that were not consumed
  623. lsl temp, tempK, #4
  624. add pB, pB, temp // step B over the k-steps that were not consumed
  625. #endif
  626. #if defined(LEFT)
  627. add tempOffset, tempOffset, #1
  628. #endif
  629. .Lstrmm_kernel_L4_END: // finished one group of four in N
  630. add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
  631. #if !defined(LEFT)
  632. add tempOffset, tempOffset, #4
  633. #endif
  634. subs counterJ, counterJ , #1 // j--
  635. bgt .Lstrmm_kernel_L4_BEGIN
  636. /******************************************************************************/
  637. .Lstrmm_kernel_L2_BEGIN: // remainder of N after the groups of four (N % 4)
  638. mov counterJ , origN
  639. tst counterJ , #3 // any of the low two bits of N set?
  640. ble .Lstrmm_kernel_L999 // no: N is a multiple of 4, all done
  641. tst counterJ , #2 // is bit 1 of N set, i.e. is a group of two needed?
  642. ble .Lstrmm_kernel_L1_BEGIN
  643. mov pCRow0, pC // pCRow0 = pC
  644. add pC,pC,LDC, lsl #1 // pC += 2 * LDC bytes
  645. #if defined(LEFT)
  646. mov tempOffset, offset
  647. #endif
  648. mov pA, origPA // pA = A
  649. .Lstrmm_kernel_L2_M4_BEGIN: // 4x2 tiles: one pass per group of four in M
  650. mov counterI, origM
  651. asr counterI, counterI, #2 // counterI = counterI / 4
  652. cmp counterI,#0
  653. ble .Lstrmm_kernel_L2_M2_BEGIN
  654. .Lstrmm_kernel_L2_M4_20:
  655. INIT4x2
  656. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  657. mov pB, origPB
  658. #else
  659. mov pB, origPB
  660. lsl temp, tempOffset, #3 // B advances 8 bytes per k-step for this tile
  661. add pB, pB, temp
  662. lsl temp, tempOffset, #4 // A advances 16 bytes per k-step for this tile
  663. add pA, pA, temp
  664. #endif
  665. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  666. sub tempK, origK, tempOffset
  667. #elif defined(LEFT)
  668. add tempK, tempOffset, #4
  669. #else
  670. add tempK, tempOffset, #2
  671. #endif
  672. asr counterL , tempK, #3 // counterL = tempK / 8
  673. cmp counterL,#0
  674. ble .Lstrmm_kernel_L2_M4_40
  675. .align 5
  676. .Lstrmm_kernel_L2_M4_22: // unrolled loop: eight k-steps per iteration
  677. KERNEL4x2_SUB
  678. KERNEL4x2_SUB
  679. KERNEL4x2_SUB
  680. KERNEL4x2_SUB
  681. KERNEL4x2_SUB
  682. KERNEL4x2_SUB
  683. KERNEL4x2_SUB
  684. KERNEL4x2_SUB
  685. subs counterL, counterL, #1
  686. bgt .Lstrmm_kernel_L2_M4_22
  687. .Lstrmm_kernel_L2_M4_40:
  688. ands counterL , tempK, #7 // counterL = tempK % 8
  689. ble .Lstrmm_kernel_L2_M4_100
  690. .Lstrmm_kernel_L2_M4_42: // tail loop: one k-step per iteration
  691. KERNEL4x2_SUB
  692. subs counterL, counterL, #1
  693. bgt .Lstrmm_kernel_L2_M4_42
  694. .Lstrmm_kernel_L2_M4_100:
  695. SAVE4x2
  696. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  697. sub tempK, origK, tempOffset
  698. #if defined(LEFT)
  699. sub tempK, tempK, #4
  700. #else
  701. sub tempK, tempK, #2
  702. #endif
  703. lsl temp, tempK, #4
  704. add pA, pA, temp // step A over the k-steps that were not consumed
  705. lsl temp, tempK, #3
  706. add pB, pB, temp // step B over the k-steps that were not consumed
  707. #endif
  708. #if defined(LEFT)
  709. add tempOffset, tempOffset, #4
  710. #endif
  711. .Lstrmm_kernel_L2_M4_END:
  712. subs counterI, counterI, #1
  713. bgt .Lstrmm_kernel_L2_M4_20
  714. .Lstrmm_kernel_L2_M2_BEGIN: // remainder of M within the group of two in N
  715. mov counterI, origM
  716. tst counterI , #3 // any of the low two bits of M set?
  717. ble .Lstrmm_kernel_L2_END
  718. tst counterI, #2 // is bit 1 of M set, i.e. is a 2x2 tile needed?
  719. ble .Lstrmm_kernel_L2_M1_BEGIN
  720. .Lstrmm_kernel_L2_M2_20:
  721. INIT2x2
  722. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  723. mov pB, origPB
  724. #else
  725. mov pB, origPB
  726. lsl temp, tempOffset, #3 // both A and B advance 8 bytes per k-step for this tile
  727. add pB, pB, temp
  728. lsl temp, tempOffset, #3
  729. add pA, pA, temp
  730. #endif
  731. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  732. sub tempK, origK, tempOffset
  733. #elif defined(LEFT)
  734. add tempK, tempOffset, #2
  735. #else
  736. add tempK, tempOffset, #2
  737. #endif
  738. asr counterL , tempK, #3 // counterL = tempK / 8
  739. cmp counterL,#0
  740. ble .Lstrmm_kernel_L2_M2_40
  741. .Lstrmm_kernel_L2_M2_22: // unrolled loop: eight k-steps per iteration
  742. KERNEL2x2_SUB
  743. KERNEL2x2_SUB
  744. KERNEL2x2_SUB
  745. KERNEL2x2_SUB
  746. KERNEL2x2_SUB
  747. KERNEL2x2_SUB
  748. KERNEL2x2_SUB
  749. KERNEL2x2_SUB
  750. subs counterL, counterL, #1
  751. bgt .Lstrmm_kernel_L2_M2_22
  752. .Lstrmm_kernel_L2_M2_40:
  753. ands counterL , tempK, #7 // counterL = tempK % 8
  754. ble .Lstrmm_kernel_L2_M2_100
  755. .Lstrmm_kernel_L2_M2_42: // tail loop: one k-step per iteration
  756. KERNEL2x2_SUB
  757. subs counterL, counterL, #1
  758. bgt .Lstrmm_kernel_L2_M2_42
  759. .Lstrmm_kernel_L2_M2_100:
  760. SAVE2x2
  761. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  762. sub tempK, origK, tempOffset
  763. #if defined(LEFT)
  764. sub tempK, tempK, #2
  765. #else
  766. sub tempK, tempK, #2
  767. #endif
  768. lsl temp, tempK, #3
  769. add pA, pA, temp // step A over the k-steps that were not consumed
  770. lsl temp, tempK, #3
  771. add pB, pB, temp // step B over the k-steps that were not consumed
  772. #endif
  773. #if defined(LEFT)
  774. add tempOffset, tempOffset, #2
  775. #endif
  776. .Lstrmm_kernel_L2_M2_END:
  777. .Lstrmm_kernel_L2_M1_BEGIN: // optional 1x2 tile for an odd M
  778. tst counterI, #1 // is bit 0 of M set?
  779. ble .Lstrmm_kernel_L2_END
  780. .Lstrmm_kernel_L2_M1_20:
  781. INIT1x2
  782. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  783. mov pB, origPB
  784. #else
  785. mov pB, origPB
  786. lsl temp, tempOffset, #3 // B advances 8 bytes per k-step for this tile
  787. add pB, pB, temp
  788. lsl temp, tempOffset, #2 // A advances 4 bytes per k-step for this tile
  789. add pA, pA, temp
  790. #endif
  791. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  792. sub tempK, origK, tempOffset
  793. #elif defined(LEFT)
  794. add tempK, tempOffset, #1
  795. #else
  796. add tempK, tempOffset, #2
  797. #endif
  798. asr counterL , tempK, #3 // counterL = tempK / 8
  799. cmp counterL, #0
  800. ble .Lstrmm_kernel_L2_M1_40
  801. .Lstrmm_kernel_L2_M1_22: // unrolled loop: eight k-steps per iteration
  802. KERNEL1x2_SUB
  803. KERNEL1x2_SUB
  804. KERNEL1x2_SUB
  805. KERNEL1x2_SUB
  806. KERNEL1x2_SUB
  807. KERNEL1x2_SUB
  808. KERNEL1x2_SUB
  809. KERNEL1x2_SUB
  810. subs counterL, counterL, #1
  811. bgt .Lstrmm_kernel_L2_M1_22
  812. .Lstrmm_kernel_L2_M1_40:
  813. ands counterL , tempK, #7 // counterL = tempK % 8
  814. ble .Lstrmm_kernel_L2_M1_100
  815. .Lstrmm_kernel_L2_M1_42: // tail loop: one k-step per iteration
  816. KERNEL1x2_SUB
  817. subs counterL, counterL, #1
  818. bgt .Lstrmm_kernel_L2_M1_42
  819. .Lstrmm_kernel_L2_M1_100:
  820. SAVE1x2
  821. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  822. sub tempK, origK, tempOffset
  823. #if defined(LEFT)
  824. sub tempK, tempK, #1
  825. #else
  826. sub tempK, tempK, #2
  827. #endif
  828. lsl temp, tempK, #2
  829. add pA, pA, temp // step A over the k-steps that were not consumed
  830. lsl temp, tempK, #3
  831. add pB, pB, temp // step B over the k-steps that were not consumed
  832. #endif
  833. #if defined(LEFT)
  834. add tempOffset, tempOffset, #1
  835. #endif
  836. .Lstrmm_kernel_L2_END: // finished the group of two in N
  837. #if !defined(LEFT)
  838. add tempOffset, tempOffset, #2
  839. #endif
  840. add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
  841. /******************************************************************************/
  842. .Lstrmm_kernel_L1_BEGIN: // optional final single step in N for an odd N
  843. mov counterJ , origN
  844. tst counterJ , #1 // is bit 0 of N set?
  845. ble .Lstrmm_kernel_L999 // done
  846. mov pCRow0, pC // pCRow0 = C
  847. add pC , pC , LDC // Update pC to point to next
  848. #if defined(LEFT)
  849. mov tempOffset, offset
  850. #endif
  851. mov pA, origPA // pA = A
  852. .Lstrmm_kernel_L1_M4_BEGIN: // 4x1 tiles: one pass per group of four in M
  853. mov counterI, origM
  854. asr counterI, counterI, #2 // counterI = counterI / 4
  855. cmp counterI, #0
  856. ble .Lstrmm_kernel_L1_M2_BEGIN
  857. .Lstrmm_kernel_L1_M4_20:
  858. INIT4x1
  859. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  860. mov pB, origPB
  861. #else
  862. mov pB, origPB
  863. lsl temp, tempOffset, #2 // B advances 4 bytes per k-step for this tile
  864. add pB, pB, temp
  865. lsl temp, tempOffset, #4 // A advances 16 bytes per k-step for this tile
  866. add pA, pA, temp
  867. #endif
  868. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  869. sub tempK, origK, tempOffset
  870. #elif defined(LEFT)
  871. add tempK, tempOffset, #4
  872. #else
  873. add tempK, tempOffset, #1
  874. #endif
  875. asr counterL , tempK, #3 // counterL = tempK / 8
  876. cmp counterL , #0
  877. ble .Lstrmm_kernel_L1_M4_40
  878. .align 5
  879. .Lstrmm_kernel_L1_M4_22: // unrolled loop: eight k-steps per iteration
  880. KERNEL4x1_SUB
  881. KERNEL4x1_SUB
  882. KERNEL4x1_SUB
  883. KERNEL4x1_SUB
  884. KERNEL4x1_SUB
  885. KERNEL4x1_SUB
  886. KERNEL4x1_SUB
  887. KERNEL4x1_SUB
  888. subs counterL, counterL, #1
  889. bgt .Lstrmm_kernel_L1_M4_22
  890. .Lstrmm_kernel_L1_M4_40:
  891. ands counterL , tempK, #7 // counterL = tempK % 8
  892. ble .Lstrmm_kernel_L1_M4_100
  893. .Lstrmm_kernel_L1_M4_42: // tail loop: one k-step per iteration
  894. KERNEL4x1_SUB
  895. subs counterL, counterL, #1
  896. bgt .Lstrmm_kernel_L1_M4_42
  897. .Lstrmm_kernel_L1_M4_100:
  898. SAVE4x1
  899. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  900. sub tempK, origK, tempOffset
  901. #if defined(LEFT)
  902. sub tempK, tempK, #4
  903. #else
  904. sub tempK, tempK, #1
  905. #endif
  906. lsl temp, tempK, #4
  907. add pA, pA, temp // step A over the k-steps that were not consumed
  908. lsl temp, tempK, #2
  909. add pB, pB, temp // step B over the k-steps that were not consumed
  910. #endif
  911. #if defined(LEFT)
  912. add tempOffset, tempOffset, #4
  913. #endif
  914. .Lstrmm_kernel_L1_M4_END:
  915. subs counterI, counterI, #1
  916. bgt .Lstrmm_kernel_L1_M4_20
  917. .Lstrmm_kernel_L1_M2_BEGIN: // remainder of M within the final single step in N
  918. mov counterI, origM
  919. tst counterI , #3 // any of the low two bits of M set?
  920. ble .Lstrmm_kernel_L1_END
  921. tst counterI, #2 // is bit 1 of M set, i.e. is a 2x1 tile needed?
  922. ble .Lstrmm_kernel_L1_M1_BEGIN
  923. .Lstrmm_kernel_L1_M2_20:
  924. INIT2x1
  925. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  926. mov pB, origPB
  927. #else
  928. mov pB, origPB
  929. lsl temp, tempOffset, #2 // B advances 4 bytes per k-step for this tile
  930. add pB, pB, temp
  931. lsl temp, tempOffset, #3 // A advances 8 bytes per k-step for this tile
  932. add pA, pA, temp
  933. #endif
  934. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  935. sub tempK, origK, tempOffset
  936. #elif defined(LEFT)
  937. add tempK, tempOffset, #2
  938. #else
  939. add tempK, tempOffset, #1
  940. #endif
  941. asr counterL , tempK, #3 // counterL = tempK / 8
  942. cmp counterL , #0
  943. ble .Lstrmm_kernel_L1_M2_40
  944. .Lstrmm_kernel_L1_M2_22: // unrolled loop: eight k-steps per iteration
  945. KERNEL2x1_SUB
  946. KERNEL2x1_SUB
  947. KERNEL2x1_SUB
  948. KERNEL2x1_SUB
  949. KERNEL2x1_SUB
  950. KERNEL2x1_SUB
  951. KERNEL2x1_SUB
  952. KERNEL2x1_SUB
  953. subs counterL, counterL, #1
  954. bgt .Lstrmm_kernel_L1_M2_22
  955. .Lstrmm_kernel_L1_M2_40:
  956. ands counterL , tempK, #7 // counterL = tempK % 8
  957. ble .Lstrmm_kernel_L1_M2_100
  958. .Lstrmm_kernel_L1_M2_42: // tail loop: one k-step per iteration
  959. KERNEL2x1_SUB
  960. subs counterL, counterL, #1
  961. bgt .Lstrmm_kernel_L1_M2_42
  962. .Lstrmm_kernel_L1_M2_100:
  963. SAVE2x1
  964. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  965. sub tempK, origK, tempOffset
  966. #if defined(LEFT)
  967. sub tempK, tempK, #2
  968. #else
  969. sub tempK, tempK, #1
  970. #endif
  971. lsl temp, tempK, #3
  972. add pA, pA, temp // step A over the k-steps that were not consumed
  973. lsl temp, tempK, #2
  974. add pB, pB, temp // step B over the k-steps that were not consumed
  975. #endif
  976. #if defined(LEFT)
  977. add tempOffset, tempOffset, #2
  978. #endif
  979. .Lstrmm_kernel_L1_M2_END:
  980. .Lstrmm_kernel_L1_M1_BEGIN: // optional 1x1 tile for an odd M
  981. tst counterI, #1 // is bit 0 of M set?
  982. ble .Lstrmm_kernel_L1_END
  983. .Lstrmm_kernel_L1_M1_20:
  984. INIT1x1
  985. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  986. mov pB, origPB
  987. #else
  988. mov pB, origPB
  989. lsl temp, tempOffset, #2 // both A and B advance 4 bytes per k-step for this tile
  990. add pB, pB, temp
  991. lsl temp, tempOffset, #2
  992. add pA, pA, temp
  993. #endif
  994. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  995. sub tempK, origK, tempOffset
  996. #elif defined(LEFT)
  997. add tempK, tempOffset, #1
  998. #else
  999. add tempK, tempOffset, #1
  1000. #endif
  1001. asr counterL , tempK, #3 // counterL = tempK / 8
  1002. cmp counterL , #0
  1003. ble .Lstrmm_kernel_L1_M1_40
  1004. .Lstrmm_kernel_L1_M1_22: // unrolled loop: eight k-steps per iteration
  1005. KERNEL1x1_SUB
  1006. KERNEL1x1_SUB
  1007. KERNEL1x1_SUB
  1008. KERNEL1x1_SUB
  1009. KERNEL1x1_SUB
  1010. KERNEL1x1_SUB
  1011. KERNEL1x1_SUB
  1012. KERNEL1x1_SUB
  1013. subs counterL, counterL, #1
  1014. bgt .Lstrmm_kernel_L1_M1_22
  1015. .Lstrmm_kernel_L1_M1_40:
  1016. ands counterL , tempK, #7 // counterL = tempK % 8
  1017. ble .Lstrmm_kernel_L1_M1_100
  1018. .Lstrmm_kernel_L1_M1_42: // tail loop: one k-step per iteration
  1019. KERNEL1x1_SUB
  1020. subs counterL, counterL, #1
  1021. bgt .Lstrmm_kernel_L1_M1_42
  1022. .Lstrmm_kernel_L1_M1_100:
  1023. SAVE1x1
  1024. #if 0 /* pointer and offset updates are compiled out: no tile follows this one */
  1025. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1026. sub tempK, origK, tempOffset
  1027. #if defined(LEFT)
  1028. sub tempK, tempK, #1
  1029. #else
  1030. sub tempK, tempK, #1
  1031. #endif
  1032. lsl temp, tempK, #2
  1033. add pA, pA, temp
  1034. lsl temp, tempK, #2
  1035. add pB, pB, temp
  1036. #endif
  1037. #if defined(LEFT)
  1038. add tempOffset, tempOffset, #1
  1039. #endif
  1040. #endif
  1041. .Lstrmm_kernel_L1_END: // end of the final single step in N; the offset update below is compiled out
  1042. #if 0
  1043. #if !defined(LEFT)
  1044. add tempOffset, tempOffset, #1
  1045. #endif
  1046. #endif
  1047. .Lstrmm_kernel_L999: // common exit: return 0 and restore everything the prologue saved
  1048. mov x0, #0 // set return value
  1049. ldp d8, d9, [sp, #(0 * 16)]
  1050. ldp d10, d11, [sp, #(1 * 16)]
  1051. ldp d12, d13, [sp, #(2 * 16)]
  1052. ldp d14, d15, [sp, #(3 * 16)]
  1053. ldp d16, d17, [sp, #(4 * 16)]
  1054. ldp x18, x19, [sp, #(5 * 16)]
  1055. ldp x20, x21, [sp, #(6 * 16)]
  1056. ldp x22, x23, [sp, #(7 * 16)]
  1057. ldp x24, x25, [sp, #(8 * 16)]
  1058. ldp x26, x27, [sp, #(9 * 16)]
  1059. ldr x28, [sp, #(10 * 16)]
  1060. add sp, sp, #(11*16) // release the 176-byte save area
  1061. ret
  1062. EPILOGUE