You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_4x4.S 34 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* x0 x1 x2 d0 d1 x3 x4 x5 x6 */
  30. /* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha_r, FLOAT alpha_i, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc) */
  // Symbolic names for the registers and constants used throughout this file.
  // x0-x6 hold the incoming integer/pointer arguments.
  31. #define origM x0
  32. #define origN x1
  33. #define origK x2
  34. #define origPA x3
  35. #define origPB x4
  36. #define pC x5
  37. #define LDC x6
  // Scratch register, loop counters and working pointers.
  38. #define temp x7
  39. #define counterL x8
  40. #define counterI x9
  41. #define counterJ x10
  42. #define pB x11
  43. #define pCRow0 x12
  44. #define pCRow1 x13
  45. #define pCRow2 x14
  46. #define pCRow3 x15
  47. #define pA x16
  // alpha (real, imaginary) is parked in general-purpose registers; the
  // function entry code copies it there from d0/d1 with fmov.
  48. #define alphaR x19
  49. #define alphaI x20
  // The SAVE macros move alpha back into v10/v11 and use lane 0 as the
  // by-element multiplier.
  50. #define alpha0_R d10
  51. #define alphaV0_R v10.d[0]
  52. #define alpha0_I d11
  53. #define alphaV0_I v11.d[0]
  // Byte offsets added to pA / pB / pCRowN in the prfm prefetch instructions.
  54. #define A_PRE_SIZE 2560
  55. #define B_PRE_SIZE 448
  56. #define C_PRE_SIZE 128
  // Select the multiply-accumulate opcode for each of the four partial
  // products of a complex multiply. In the kernel macros:
  //   OP_rr: acc_re (+/-)= a_re * b_re     OP_ii: acc_re (+/-)= a_im * b_im
  //   OP_ri: acc_im (+/-)= a_re * b_im     OP_ir: acc_im (+/-)= a_im * b_re
  // fmla adds the product, fmls subtracts it.
  // Group 1: re = ar*br - ai*bi, im = ar*bi + ai*br   (plain a * b)
  57. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  58. #define OP_rr fmla
  59. #define OP_ii fmls
  60. #define OP_ri fmla
  61. #define OP_ir fmla
  // Group 2: re = ar*br + ai*bi, im = -ar*bi + ai*br  (a * conj(b))
  62. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  63. #define OP_rr fmla
  64. #define OP_ii fmla
  65. #define OP_ri fmls
  66. #define OP_ir fmla
  // Group 3: re = ar*br + ai*bi, im = ar*bi - ai*br   (conj(a) * b)
  67. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  68. #define OP_rr fmla
  69. #define OP_ii fmla
  70. #define OP_ri fmla
  71. #define OP_ir fmls
  // Group 4: re = ar*br - ai*bi, im = -ar*bi - ai*br  (conj(a) * conj(b))
  72. #elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
  73. #define OP_rr fmla
  74. #define OP_ii fmls
  75. #define OP_ri fmls
  76. #define OP_ir fmls
  77. #endif
  78. // 00 origM
  79. // 01 origN
  80. // 02 origK
  81. // 03 origPA
  82. // 04 origPB
  83. // 05 pC
  84. // 06 origLDC -> LDC
  85. // 07 offset -> temp
  86. // 08 counterL
  87. // 09 counterI
  88. // 10 counterJ
  89. // 11 pB
  90. // 12 pCRow0
  91. // 13 pCRow1
  92. // 14 pCRow2
  93. // 15 pCRow3
  94. // 16 pA
  95. // 17
  96. // 18 must save
  97. // 19 must save alphaR
  98. // 20 must save alphaI
  99. // 21 must save
  100. // 22 must save
  101. // 23 must save
  102. // 24 must save
  103. // 25 must save
  104. // 26 must save
  105. // 27 must save
  106. // 28 must save
  107. // 29 frame
  108. // 30 link
  109. // 31 sp
  110. //v00 ALPHA_R -> pA00_R, pA01_R
  111. //v01 ALPHA_I -> pA00_I, pA01_I
  112. //v02 pA02_R, pA03_R
  113. //v03 pA02_I, pA03_I
  114. //v04 pA10_R, pA11_R
  115. //v05 pA10_I, pA11_I
  116. //v06 pA12_R, pA13_R
  117. //v07 pA12_I, pA13_I
  118. //v08 must save pB00_R, pB01_R
  119. //v09 must save pB00_I, pB01_I
  120. //v10 must save pB02_R, pB03_R OR ALPHA0_R
  121. //v11 must save pB02_I, pB03_I OR ALPHA0_I
  122. //v12 must save pB10_R, pB11_R
  123. //v13 must save pB10_I, pB11_I
  124. //v14 must save pB12_R, pB13_R OR ALPHA1_R
  125. //v15 must save pB12_I, pB13_I OR ALPHA1_I
  126. //v16 must save pC00_R, pC01_R
  127. //v17 must save pC00_I, pC01_I
  128. //v18 pC02_R, pC03_R
  129. //v19 pC02_I, pC03_I
  130. //v20 pC10_R, pC11_R
  131. //v21 pC10_I, pC11_I
  132. //v22 pC12_R, pC13_R
  133. //v23 pC12_I, pC13_I
  134. //v24 pC20_R, pC21_R
  135. //v25 pC20_I, pC21_I
  136. //v26 pC22_R, pC23_R
  137. //v27 pC22_I, pC23_I
  138. //v28 pC30_R, pC31_R
  139. //v29 pC30_I, pC31_I
  140. //v30 pC32_R, pC33_R
  141. //v31 pC32_I, pC33_I
  142. /*******************************************************************************
  143. * Macro definitions
  144. *******************************************************************************/
  // INIT4x4: clear the sixteen accumulator registers v16-v31 used by the
  // 4x4 kernel macros. Each register is zeroed directly (all 128 bits), so
  // no instruction depends on the result of another.
  145. .macro INIT4x4
  146. movi v16.2d, #0
  147. movi v17.2d, #0
  148. movi v18.2d, #0
  149. movi v19.2d, #0
  150. movi v20.2d, #0
  151. movi v21.2d, #0
  152. movi v22.2d, #0
  153. movi v23.2d, #0
  154. movi v24.2d, #0
  155. movi v25.2d, #0
  156. movi v26.2d, #0
  157. movi v27.2d, #0
  158. movi v28.2d, #0
  159. movi v29.2d, #0
  160. movi v30.2d, #0
  161. movi v31.2d, #0
  162. .endm
  // KERNEL4x4_I: opening step of the pipelined 4x4 inner loop.
  // Loads the current A operands into v0-v3 and B operands into v8-v11
  // (ld2 de-interleaves into real / imaginary registers), and *initialises*
  // every accumulator v16-v31 with its first product, so INIT4x4 is not
  // needed before it. It also pre-loads the operands of the following step
  // into v4-v7 (A) and v12-v15 (B), which KERNEL4x4_M2 / KERNEL4x4_E consume.
  // pA and pB each advance by 128 bytes in total.
  //
  // The first product into an imaginary accumulator is the OP_ri term. In the
  // variant groups where OP_ri is fmls that product must be negated, so the
  // accumulator is cleared with eor and then fmls is applied; otherwise a
  // plain fmul is used.
  163. .macro KERNEL4x4_I
  164. ld2 {v8.2d, v9.2d}, [pB]
  165. add pB, pB, #32
  166. ld2 {v0.2d, v1.2d}, [pA]
  167. add pA, pA, #32
  168. fmul v16.2d, v0.2d, v8.d[0]
  169. OP_ii v16.2d, v1.2d, v9.d[0]
  170. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  171. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  172. eor v17.16b, v17.16b, v17.16b
  173. fmls v17.2d, v0.2d, v9.d[0]
  174. #else
  175. fmul v17.2d, v0.2d, v9.d[0]
  176. #endif
  177. OP_ir v17.2d, v1.2d, v8.d[0]
  178. ld2 {v2.2d, v3.2d}, [pA]
  179. add pA, pA, #32
  180. fmul v20.2d, v0.2d, v8.d[1]
  181. OP_ii v20.2d, v1.2d, v9.d[1]
  182. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  183. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  184. eor v21.16b, v21.16b, v21.16b
  185. fmls v21.2d, v0.2d, v9.d[1]
  186. #else
  187. fmul v21.2d, v0.2d, v9.d[1]
  188. #endif
  189. OP_ir v21.2d, v1.2d, v8.d[1]
  190. ld2 {v10.2d, v11.2d}, [pB]
  191. add pB, pB, #32
  192. fmul v22.2d, v2.2d, v8.d[1]
  193. OP_ii v22.2d, v3.2d, v9.d[1]
  194. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  195. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  196. eor v23.16b, v23.16b, v23.16b
  197. fmls v23.2d, v2.2d, v9.d[1]
  198. #else
  199. fmul v23.2d, v2.2d, v9.d[1]
  200. #endif
  201. OP_ir v23.2d, v3.2d, v8.d[1]
  // Begin loading the B operands of the next step (v12-v15).
  202. ld2 {v12.2d, v13.2d}, [pB]
  203. add pB, pB, #32
  204. fmul v18.2d, v2.2d, v8.d[0]
  205. OP_ii v18.2d, v3.2d, v9.d[0]
  206. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  207. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  208. eor v19.16b, v19.16b, v19.16b
  209. fmls v19.2d, v2.2d, v9.d[0]
  210. #else
  211. fmul v19.2d, v2.2d, v9.d[0]
  212. #endif
  213. OP_ir v19.2d, v3.2d, v8.d[0]
  // Begin loading the A operands of the next step (v4-v7).
  214. ld2 {v4.2d, v5.2d} , [pA]
  215. add pA, pA, #32
  216. fmul v24.2d, v0.2d, v10.d[0]
  217. OP_ii v24.2d, v1.2d, v11.d[0]
  218. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  219. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  220. eor v25.16b, v25.16b, v25.16b
  221. fmls v25.2d, v0.2d, v11.d[0]
  222. #else
  223. fmul v25.2d, v0.2d, v11.d[0]
  224. #endif
  225. OP_ir v25.2d, v1.2d, v10.d[0]
  226. ld2 {v6.2d, v7.2d} , [pA]
  227. add pA, pA, #32
  228. fmul v26.2d, v2.2d, v10.d[0]
  229. OP_ii v26.2d, v3.2d, v11.d[0]
  230. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  231. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  232. eor v27.16b, v27.16b, v27.16b
  233. fmls v27.2d, v2.2d, v11.d[0]
  234. #else
  235. fmul v27.2d, v2.2d, v11.d[0]
  236. #endif
  237. OP_ir v27.2d, v3.2d, v10.d[0]
  238. ld2 {v14.2d, v15.2d}, [pB]
  239. add pB, pB, #32
  240. fmul v28.2d, v0.2d, v10.d[1]
  241. OP_ii v28.2d, v1.2d, v11.d[1]
  242. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  243. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  244. eor v29.16b, v29.16b, v29.16b
  245. fmls v29.2d, v0.2d, v11.d[1]
  246. #else
  247. fmul v29.2d, v0.2d, v11.d[1]
  248. #endif
  249. OP_ir v29.2d, v1.2d, v10.d[1]
  // Prefetch further along the A stream.
  250. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  251. fmul v30.2d, v2.2d, v10.d[1]
  252. OP_ii v30.2d, v3.2d, v11.d[1]
  253. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
  254. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  255. eor v31.16b, v31.16b, v31.16b
  256. fmls v31.2d, v2.2d, v11.d[1]
  257. #else
  258. fmul v31.2d, v2.2d, v11.d[1]
  259. #endif
  260. OP_ir v31.2d, v3.2d, v10.d[1]
  261. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  262. .endm
  // KERNEL4x4_M1: pipelined middle step, first phase.
  // Accumulates into v16-v31 using the operands already held in v0-v3 (A)
  // and v8-v11 (B), while loading the operands of the next step into
  // v4-v7 (A) and v12-v15 (B). Advances pA and pB by 64 bytes each and
  // prefetches ahead in the A stream. Intended to be followed by
  // KERNEL4x4_M2 or KERNEL4x4_E, which consume v4-v7 / v12-v15.
  263. .macro KERNEL4x4_M1
  264. OP_rr v16.2d, v0.2d, v8.d[0]
  265. OP_ii v16.2d, v1.2d, v9.d[0]
  266. OP_ri v17.2d, v0.2d, v9.d[0]
  267. OP_ir v17.2d, v1.2d, v8.d[0]
  268. ld2 {v12.2d, v13.2d}, [pB]
  269. add pB, pB, #32
  270. OP_rr v18.2d, v2.2d, v8.d[0]
  271. OP_ii v18.2d, v3.2d, v9.d[0]
  272. OP_ri v19.2d, v2.2d, v9.d[0]
  273. OP_ir v19.2d, v3.2d, v8.d[0]
  274. ld2 {v4.2d, v5.2d} , [pA]
  275. add pA, pA, #32
  276. OP_rr v20.2d, v0.2d, v8.d[1]
  277. OP_ii v20.2d, v1.2d, v9.d[1]
  278. OP_ri v21.2d, v0.2d, v9.d[1]
  279. OP_ir v21.2d, v1.2d, v8.d[1]
  280. ld2 {v6.2d, v7.2d} , [pA]
  281. add pA, pA, #32
  282. OP_rr v22.2d, v2.2d, v8.d[1]
  283. OP_ii v22.2d, v3.2d, v9.d[1]
  284. OP_ri v23.2d, v2.2d, v9.d[1]
  285. OP_ir v23.2d, v3.2d, v8.d[1]
  286. ld2 {v14.2d, v15.2d}, [pB]
  287. add pB, pB, #32
  288. OP_rr v24.2d, v0.2d, v10.d[0]
  289. OP_ii v24.2d, v1.2d, v11.d[0]
  290. OP_ri v25.2d, v0.2d, v11.d[0]
  291. OP_ir v25.2d, v1.2d, v10.d[0]
  292. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  293. OP_rr v26.2d, v2.2d, v10.d[0]
  294. OP_ii v26.2d, v3.2d, v11.d[0]
  295. OP_ri v27.2d, v2.2d, v11.d[0]
  296. OP_ir v27.2d, v3.2d, v10.d[0]
  297. prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
  298. OP_rr v28.2d, v0.2d, v10.d[1]
  299. OP_ii v28.2d, v1.2d, v11.d[1]
  300. OP_ri v29.2d, v0.2d, v11.d[1]
  301. OP_ir v29.2d, v1.2d, v10.d[1]
  302. OP_rr v30.2d, v2.2d, v10.d[1]
  303. OP_ii v30.2d, v3.2d, v11.d[1]
  304. OP_ri v31.2d, v2.2d, v11.d[1]
  305. OP_ir v31.2d, v3.2d, v10.d[1]
  306. .endm
  // KERNEL4x4_M2: pipelined middle step, second phase.
  // Accumulates into v16-v31 using the operands held in v4-v7 (A) and
  // v12-v15 (B), while loading the operands of the next step into
  // v0-v3 (A) and v8-v11 (B). Advances pA and pB by 64 bytes each and
  // prefetches ahead in the B stream. Intended to be followed by
  // KERNEL4x4_M1, which consumes v0-v3 / v8-v11.
  307. .macro KERNEL4x4_M2
  308. OP_rr v16.2d, v4.2d, v12.d[0]
  309. OP_ii v16.2d, v5.2d, v13.d[0]
  310. OP_ri v17.2d, v4.2d, v13.d[0]
  311. OP_ir v17.2d, v5.2d, v12.d[0]
  312. ld2 {v8.2d, v9.2d}, [pB]
  313. add pB, pB, #32
  314. OP_rr v18.2d, v6.2d, v12.d[0]
  315. OP_ii v18.2d, v7.2d, v13.d[0]
  316. OP_ri v19.2d, v6.2d, v13.d[0]
  317. OP_ir v19.2d, v7.2d, v12.d[0]
  318. ld2 {v0.2d, v1.2d}, [pA]
  319. add pA, pA, #32
  320. OP_rr v20.2d, v4.2d, v12.d[1]
  321. OP_ii v20.2d, v5.2d, v13.d[1]
  322. OP_ri v21.2d, v4.2d, v13.d[1]
  323. OP_ir v21.2d, v5.2d, v12.d[1]
  324. ld2 {v2.2d, v3.2d}, [pA]
  325. add pA, pA, #32
  326. OP_rr v22.2d, v6.2d, v12.d[1]
  327. OP_ii v22.2d, v7.2d, v13.d[1]
  328. OP_ri v23.2d, v6.2d, v13.d[1]
  329. OP_ir v23.2d, v7.2d, v12.d[1]
  330. ld2 {v10.2d, v11.2d}, [pB]
  331. add pB, pB, #32
  332. OP_rr v24.2d, v4.2d, v14.d[0]
  333. OP_ii v24.2d, v5.2d, v15.d[0]
  334. OP_ri v25.2d, v4.2d, v15.d[0]
  335. OP_ir v25.2d, v5.2d, v14.d[0]
  336. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  337. OP_rr v26.2d, v6.2d, v14.d[0]
  338. OP_ii v26.2d, v7.2d, v15.d[0]
  339. OP_ri v27.2d, v6.2d, v15.d[0]
  340. OP_ir v27.2d, v7.2d, v14.d[0]
  341. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  342. OP_rr v28.2d, v4.2d, v14.d[1]
  343. OP_ii v28.2d, v5.2d, v15.d[1]
  344. OP_ri v29.2d, v4.2d, v15.d[1]
  345. OP_ir v29.2d, v5.2d, v14.d[1]
  346. OP_rr v30.2d, v6.2d, v14.d[1]
  347. OP_ii v30.2d, v7.2d, v15.d[1]
  348. OP_ri v31.2d, v6.2d, v15.d[1]
  349. OP_ir v31.2d, v7.2d, v14.d[1]
  350. .endm
  // KERNEL4x4_E: closing step of the pipelined 4x4 inner loop.
  // Same arithmetic as KERNEL4x4_M2 (operands v4-v7 and v12-v15 accumulate
  // into v16-v31) but performs no further loads and leaves pA / pB
  // unchanged; it only issues prefetches on the B stream.
  351. .macro KERNEL4x4_E
  352. OP_rr v16.2d, v4.2d, v12.d[0]
  353. OP_ii v16.2d, v5.2d, v13.d[0]
  354. OP_ri v17.2d, v4.2d, v13.d[0]
  355. OP_ir v17.2d, v5.2d, v12.d[0]
  356. OP_rr v18.2d, v6.2d, v12.d[0]
  357. OP_ii v18.2d, v7.2d, v13.d[0]
  358. OP_ri v19.2d, v6.2d, v13.d[0]
  359. OP_ir v19.2d, v7.2d, v12.d[0]
  360. OP_rr v20.2d, v4.2d, v12.d[1]
  361. OP_ii v20.2d, v5.2d, v13.d[1]
  362. OP_ri v21.2d, v4.2d, v13.d[1]
  363. OP_ir v21.2d, v5.2d, v12.d[1]
  364. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  365. OP_rr v22.2d, v6.2d, v12.d[1]
  366. OP_ii v22.2d, v7.2d, v13.d[1]
  367. OP_ri v23.2d, v6.2d, v13.d[1]
  368. OP_ir v23.2d, v7.2d, v12.d[1]
  369. OP_rr v24.2d, v4.2d, v14.d[0]
  370. OP_ii v24.2d, v5.2d, v15.d[0]
  371. OP_ri v25.2d, v4.2d, v15.d[0]
  372. OP_ir v25.2d, v5.2d, v14.d[0]
  373. prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
  374. OP_rr v26.2d, v6.2d, v14.d[0]
  375. OP_ii v26.2d, v7.2d, v15.d[0]
  376. OP_ri v27.2d, v6.2d, v15.d[0]
  377. OP_ir v27.2d, v7.2d, v14.d[0]
  378. OP_rr v28.2d, v4.2d, v14.d[1]
  379. OP_ii v28.2d, v5.2d, v15.d[1]
  380. OP_ri v29.2d, v4.2d, v15.d[1]
  381. OP_ir v29.2d, v5.2d, v14.d[1]
  382. OP_rr v30.2d, v6.2d, v14.d[1]
  383. OP_ii v30.2d, v7.2d, v15.d[1]
  384. OP_ri v31.2d, v6.2d, v15.d[1]
  385. OP_ir v31.2d, v7.2d, v14.d[1]
  386. .endm
  // KERNEL4x4_SUB: one self-contained (non-pipelined) 4x4 step.
  // Loads its own operands into v0-v3 (A) and v8-v11 (B), accumulates all
  // sixteen partial results into v16-v31, and advances pA and pB by
  // 64 bytes each. The accumulators must already hold valid values
  // (zeroed by INIT4x4 or produced by the pipelined macros).
  387. .macro KERNEL4x4_SUB
  388. ld2 {v8.2d, v9.2d}, [pB]
  389. add pB, pB, #32
  390. ld2 {v0.2d, v1.2d}, [pA]
  391. add pA, pA, #32
  392. OP_rr v16.2d, v0.2d, v8.d[0]
  393. OP_ii v16.2d, v1.2d, v9.d[0]
  394. OP_ri v17.2d, v0.2d, v9.d[0]
  395. OP_ir v17.2d, v1.2d, v8.d[0]
  396. ld2 {v2.2d, v3.2d}, [pA]
  397. add pA, pA, #32
  398. OP_rr v20.2d, v0.2d, v8.d[1]
  399. OP_ii v20.2d, v1.2d, v9.d[1]
  400. OP_ri v21.2d, v0.2d, v9.d[1]
  401. OP_ir v21.2d, v1.2d, v8.d[1]
  402. ld2 {v10.2d, v11.2d}, [pB]
  403. add pB, pB, #32
  404. OP_rr v18.2d, v2.2d, v8.d[0]
  405. OP_ii v18.2d, v3.2d, v9.d[0]
  406. OP_ri v19.2d, v2.2d, v9.d[0]
  407. OP_ir v19.2d, v3.2d, v8.d[0]
  408. prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
  409. OP_rr v22.2d, v2.2d, v8.d[1]
  410. OP_ii v22.2d, v3.2d, v9.d[1]
  411. OP_ri v23.2d, v2.2d, v9.d[1]
  412. OP_ir v23.2d, v3.2d, v8.d[1]
  413. prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
  414. OP_rr v24.2d, v0.2d, v10.d[0]
  415. OP_ii v24.2d, v1.2d, v11.d[0]
  416. OP_ri v25.2d, v0.2d, v11.d[0]
  417. OP_ir v25.2d, v1.2d, v10.d[0]
  418. OP_rr v26.2d, v2.2d, v10.d[0]
  419. OP_ii v26.2d, v3.2d, v11.d[0]
  420. OP_ri v27.2d, v2.2d, v11.d[0]
  421. OP_ir v27.2d, v3.2d, v10.d[0]
  422. OP_rr v28.2d, v0.2d, v10.d[1]
  423. OP_ii v28.2d, v1.2d, v11.d[1]
  424. OP_ri v29.2d, v0.2d, v11.d[1]
  425. OP_ir v29.2d, v1.2d, v10.d[1]
  426. OP_rr v30.2d, v2.2d, v10.d[1]
  427. OP_ii v30.2d, v3.2d, v11.d[1]
  428. OP_ri v31.2d, v2.2d, v11.d[1]
  429. OP_ir v31.2d, v3.2d, v10.d[1]
  430. .endm
  // SAVE4x4: C += alpha * accumulators for a 4x4 tile.
  // For every (re, im) accumulator pair the update is
  //   c_re += acc_re*alpha_re - acc_im*alpha_im
  //   c_im += acc_re*alpha_im + acc_im*alpha_re
  // Accumulators v16-v19 go to pCRow0, v20-v23 to pCRow1, v24-v27 to pCRow2
  // and v28-v31 to pCRow3; each pCRowN pointer ends up advanced by 64 bytes.
  // Clobbers v0-v7 and v10/v11 (alpha is reloaded from alphaR / alphaI).
  431. .macro SAVE4x4
  432. fmov alpha0_R, alphaR
  433. fmov alpha0_I, alphaI
  434. prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
  435. ld2 {v0.2d, v1.2d}, [pCRow0]
  436. fmla v0.2d, v16.2d, alphaV0_R
  437. fmls v0.2d, v17.2d, alphaV0_I
  438. fmla v1.2d, v16.2d, alphaV0_I
  439. fmla v1.2d, v17.2d, alphaV0_R
  440. st2 {v0.2d, v1.2d}, [pCRow0]
  441. add pCRow0, pCRow0, #32
  442. ld2 {v2.2d, v3.2d}, [pCRow0]
  443. fmla v2.2d, v18.2d, alphaV0_R
  444. fmls v2.2d, v19.2d, alphaV0_I
  445. fmla v3.2d, v18.2d, alphaV0_I
  446. fmla v3.2d, v19.2d, alphaV0_R
  447. st2 {v2.2d, v3.2d}, [pCRow0]
  448. add pCRow0, pCRow0, #32
  449. prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
  450. ld2 {v4.2d, v5.2d}, [pCRow1]
  451. fmla v4.2d, v20.2d, alphaV0_R
  452. fmls v4.2d, v21.2d, alphaV0_I
  453. fmla v5.2d, v20.2d, alphaV0_I
  454. fmla v5.2d, v21.2d, alphaV0_R
  455. st2 {v4.2d, v5.2d}, [pCRow1]
  456. add pCRow1, pCRow1, #32
  457. ld2 {v6.2d, v7.2d}, [pCRow1]
  458. fmla v6.2d, v22.2d, alphaV0_R
  459. fmls v6.2d, v23.2d, alphaV0_I
  460. fmla v7.2d, v22.2d, alphaV0_I
  461. fmla v7.2d, v23.2d, alphaV0_R
  462. st2 {v6.2d, v7.2d}, [pCRow1]
  463. add pCRow1, pCRow1, #32
  464. prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
  465. ld2 {v0.2d, v1.2d}, [pCRow2]
  466. fmla v0.2d, v24.2d, alphaV0_R
  467. fmls v0.2d, v25.2d, alphaV0_I
  468. fmla v1.2d, v24.2d, alphaV0_I
  469. fmla v1.2d, v25.2d, alphaV0_R
  470. st2 {v0.2d, v1.2d}, [pCRow2]
  471. add pCRow2, pCRow2, #32
  472. ld2 {v2.2d, v3.2d}, [pCRow2]
  473. fmla v2.2d, v26.2d, alphaV0_R
  474. fmls v2.2d, v27.2d, alphaV0_I
  475. fmla v3.2d, v26.2d, alphaV0_I
  476. fmla v3.2d, v27.2d, alphaV0_R
  477. st2 {v2.2d, v3.2d}, [pCRow2]
  478. add pCRow2, pCRow2, #32
  479. prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
  480. ld2 {v4.2d, v5.2d}, [pCRow3]
  481. fmla v4.2d, v28.2d, alphaV0_R
  482. fmls v4.2d, v29.2d, alphaV0_I
  483. fmla v5.2d, v28.2d, alphaV0_I
  484. fmla v5.2d, v29.2d, alphaV0_R
  485. st2 {v4.2d, v5.2d}, [pCRow3]
  486. add pCRow3, pCRow3, #32
  487. ld2 {v6.2d, v7.2d}, [pCRow3]
  488. fmla v6.2d, v30.2d, alphaV0_R
  489. fmls v6.2d, v31.2d, alphaV0_I
  490. fmla v7.2d, v30.2d, alphaV0_I
  491. fmla v7.2d, v31.2d, alphaV0_R
  492. st2 {v6.2d, v7.2d}, [pCRow3]
  493. add pCRow3, pCRow3, #32
  494. .endm
  495. /******************************************************************************/
  // INIT2x4: clear the eight accumulators (v16/v17, v20/v21, v24/v25,
  // v28/v29) that KERNEL2x4_SUB adds into. Each one is zeroed directly.
  496. .macro INIT2x4
  497. movi v16.2d, #0
  498. movi v17.2d, #0
  499. movi v20.2d, #0
  500. movi v21.2d, #0
  501. movi v24.2d, #0
  502. movi v25.2d, #0
  503. movi v28.2d, #0
  504. movi v29.2d, #0
  505. .endm
  // KERNEL2x4_SUB: one step of the 2x4 tile.
  // Loads four B elements (re in v8/v10, im in v9/v11) and two A elements
  // (re in v0, im in v1), then accumulates into v16/v17, v20/v21, v24/v25
  // and v28/v29, one (re, im) pair per B element.
  // Advances pB by 64 bytes and pA by 32 bytes.
  506. .macro KERNEL2x4_SUB
  507. ld2 {v8.2d, v9.2d}, [pB]
  508. add pB, pB, #32
  509. ld2 {v10.2d, v11.2d}, [pB]
  510. add pB, pB, #32
  511. ld2 {v0.2d, v1.2d}, [pA]
  512. add pA, pA, #32
  513. OP_rr v16.2d, v0.2d, v8.d[0]
  514. OP_ii v16.2d, v1.2d, v9.d[0]
  515. OP_ri v17.2d, v0.2d, v9.d[0]
  516. OP_ir v17.2d, v1.2d, v8.d[0]
  517. OP_rr v20.2d, v0.2d, v8.d[1]
  518. OP_ii v20.2d, v1.2d, v9.d[1]
  519. OP_ri v21.2d, v0.2d, v9.d[1]
  520. OP_ir v21.2d, v1.2d, v8.d[1]
  521. OP_rr v24.2d, v0.2d, v10.d[0]
  522. OP_ii v24.2d, v1.2d, v11.d[0]
  523. OP_ri v25.2d, v0.2d, v11.d[0]
  524. OP_ir v25.2d, v1.2d, v10.d[0]
  525. OP_rr v28.2d, v0.2d, v10.d[1]
  526. OP_ii v28.2d, v1.2d, v11.d[1]
  527. OP_ri v29.2d, v0.2d, v11.d[1]
  528. OP_ir v29.2d, v1.2d, v10.d[1]
  529. .endm
  // SAVE2x4: C += alpha * accumulators for a 2x4 tile.
  // pCRow1 is used as a walking pointer: it starts at pCRow0 and steps by
  // LDC bytes between the four 2-element groups (v16/v17, v20/v21, v24/v25,
  // v28/v29). pCRow0 is advanced by 32 bytes at the end.
  // Clobbers v0, v1, v4, v5, v10, v11 and pCRow1.
  530. .macro SAVE2x4
  531. fmov alpha0_R, alphaR
  532. fmov alpha0_I, alphaI
  533. mov pCRow1, pCRow0
  534. ld2 {v0.2d, v1.2d}, [pCRow1]
  535. fmla v0.2d, v16.2d, alphaV0_R
  536. fmls v0.2d, v17.2d, alphaV0_I
  537. fmla v1.2d, v16.2d, alphaV0_I
  538. fmla v1.2d, v17.2d, alphaV0_R
  539. st2 {v0.2d, v1.2d}, [pCRow1]
  540. add pCRow1, pCRow1, LDC
  541. ld2 {v4.2d, v5.2d}, [pCRow1]
  542. fmla v4.2d, v20.2d, alphaV0_R
  543. fmls v4.2d, v21.2d, alphaV0_I
  544. fmla v5.2d, v20.2d, alphaV0_I
  545. fmla v5.2d, v21.2d, alphaV0_R
  546. st2 {v4.2d, v5.2d}, [pCRow1]
  547. add pCRow1, pCRow1, LDC
  548. ld2 {v0.2d, v1.2d}, [pCRow1]
  549. fmla v0.2d, v24.2d, alphaV0_R
  550. fmls v0.2d, v25.2d, alphaV0_I
  551. fmla v1.2d, v24.2d, alphaV0_I
  552. fmla v1.2d, v25.2d, alphaV0_R
  553. st2 {v0.2d, v1.2d}, [pCRow1]
  554. add pCRow1, pCRow1, LDC
  555. ld2 {v4.2d, v5.2d}, [pCRow1]
  556. fmla v4.2d, v28.2d, alphaV0_R
  557. fmls v4.2d, v29.2d, alphaV0_I
  558. fmla v5.2d, v28.2d, alphaV0_I
  559. fmla v5.2d, v29.2d, alphaV0_R
  560. st2 {v4.2d, v5.2d}, [pCRow1]
  561. add pCRow0, pCRow0, #32
  562. .endm
  563. /******************************************************************************/
  // INIT1x4: clear the eight accumulators (d16/d17, d20/d21, d24/d25,
  // d28/d29) that KERNEL1x4_SUB adds into. Each one is zeroed directly.
  564. .macro INIT1x4
  565. movi v16.2d, #0
  566. movi v17.2d, #0
  567. movi v20.2d, #0
  568. movi v21.2d, #0
  569. movi v24.2d, #0
  570. movi v25.2d, #0
  571. movi v28.2d, #0
  572. movi v29.2d, #0
  573. .endm
  // KERNEL1x4_SUB: one step of the 1x4 tile.
  // Loads four B elements (re in v8/v10, im in v9/v11) and a single A
  // element into lane 0 of v0 (re) / v1 (im), then accumulates with scalar
  // operations into d16/d17, d20/d21, d24/d25 and d28/d29.
  // Advances pB by 64 bytes and pA by 16 bytes.
  574. .macro KERNEL1x4_SUB
  575. ld2 {v8.2d, v9.2d}, [pB]
  576. add pB, pB, #32
  577. ld2 {v10.2d, v11.2d}, [pB]
  578. add pB, pB, #32
  579. ld2 {v0.d, v1.d}[0], [pA]
  580. add pA, pA, #16
  581. OP_rr d16, d0, v8.d[0]
  582. OP_ii d16, d1, v9.d[0]
  583. OP_ri d17, d0, v9.d[0]
  584. OP_ir d17, d1, v8.d[0]
  585. OP_rr d20, d0, v8.d[1]
  586. OP_ii d20, d1, v9.d[1]
  587. OP_ri d21, d0, v9.d[1]
  588. OP_ir d21, d1, v8.d[1]
  589. OP_rr d24, d0, v10.d[0]
  590. OP_ii d24, d1, v11.d[0]
  591. OP_ri d25, d0, v11.d[0]
  592. OP_ir d25, d1, v10.d[0]
  593. OP_rr d28, d0, v10.d[1]
  594. OP_ii d28, d1, v11.d[1]
  595. OP_ri d29, d0, v11.d[1]
  596. OP_ir d29, d1, v10.d[1]
  597. .endm
  // SAVE1x4: C += alpha * accumulators for a 1x4 tile.
  // pCRow1 starts at pCRow0 and steps by LDC bytes between the four single
  // elements (d16/d17, d20/d21, d24/d25, d28/d29); only lane 0 is loaded and
  // stored. pCRow0 is advanced by 16 bytes at the end.
  // Clobbers v0, v1, v4, v5, v10, v11 and pCRow1.
  598. .macro SAVE1x4
  599. fmov alpha0_R, alphaR
  600. fmov alpha0_I, alphaI
  601. mov pCRow1, pCRow0
  602. ld2 {v0.d, v1.d}[0], [pCRow1]
  603. fmla d0, d16, alphaV0_R
  604. fmls d0, d17, alphaV0_I
  605. fmla d1, d16, alphaV0_I
  606. fmla d1, d17, alphaV0_R
  607. st2 {v0.d, v1.d}[0], [pCRow1]
  608. add pCRow1, pCRow1, LDC
  609. ld2 {v4.d, v5.d}[0], [pCRow1]
  610. fmla d4, d20, alphaV0_R
  611. fmls d4, d21, alphaV0_I
  612. fmla d5, d20, alphaV0_I
  613. fmla d5, d21, alphaV0_R
  614. st2 {v4.d, v5.d}[0], [pCRow1]
  615. add pCRow1, pCRow1, LDC
  616. ld2 {v0.d, v1.d}[0], [pCRow1]
  617. fmla d0, d24, alphaV0_R
  618. fmls d0, d25, alphaV0_I
  619. fmla d1, d24, alphaV0_I
  620. fmla d1, d25, alphaV0_R
  621. st2 {v0.d, v1.d}[0], [pCRow1]
  622. add pCRow1, pCRow1, LDC
  623. ld2 {v4.d, v5.d}[0], [pCRow1]
  624. fmla d4, d28, alphaV0_R
  625. fmls d4, d29, alphaV0_I
  626. fmla d5, d28, alphaV0_I
  627. fmla d5, d29, alphaV0_R
  628. st2 {v4.d, v5.d}[0], [pCRow1]
  629. add pCRow0, pCRow0, #16
  630. .endm
  631. /******************************************************************************/
  // INIT4x2: clear the eight accumulators v16-v23 that KERNEL4x2_SUB adds
  // into. Each one is zeroed directly.
  632. .macro INIT4x2
  633. movi v16.2d, #0
  634. movi v17.2d, #0
  635. movi v18.2d, #0
  636. movi v19.2d, #0
  637. movi v20.2d, #0
  638. movi v21.2d, #0
  639. movi v22.2d, #0
  640. movi v23.2d, #0
  641. .endm
  // KERNEL4x2_SUB: one step of the 4x2 tile.
  // Loads two B elements (re in v8, im in v9) and four A elements (re in
  // v0/v2, im in v1/v3), then accumulates into v16-v19 (first B element)
  // and v20-v23 (second B element).
  // Advances pB by 32 bytes and pA by 64 bytes.
  642. .macro KERNEL4x2_SUB
  643. ld2 {v8.2d, v9.2d}, [pB]
  644. add pB, pB, #32
  645. ld2 {v0.2d, v1.2d}, [pA]
  646. add pA, pA, #32
  647. ld2 {v2.2d, v3.2d}, [pA]
  648. add pA, pA, #32
  649. OP_rr v16.2d, v0.2d, v8.d[0]
  650. OP_ii v16.2d, v1.2d, v9.d[0]
  651. OP_ri v17.2d, v0.2d, v9.d[0]
  652. OP_ir v17.2d, v1.2d, v8.d[0]
  653. OP_rr v18.2d, v2.2d, v8.d[0]
  654. OP_ii v18.2d, v3.2d, v9.d[0]
  655. OP_ri v19.2d, v2.2d, v9.d[0]
  656. OP_ir v19.2d, v3.2d, v8.d[0]
  657. OP_rr v20.2d, v0.2d, v8.d[1]
  658. OP_ii v20.2d, v1.2d, v9.d[1]
  659. OP_ri v21.2d, v0.2d, v9.d[1]
  660. OP_ir v21.2d, v1.2d, v8.d[1]
  661. OP_rr v22.2d, v2.2d, v8.d[1]
  662. OP_ii v22.2d, v3.2d, v9.d[1]
  663. OP_ri v23.2d, v2.2d, v9.d[1]
  664. OP_ir v23.2d, v3.2d, v8.d[1]
  665. .endm
  // SAVE4x2: C += alpha * accumulators for a 4x2 tile.
  // pCRow1 starts at pCRow0 and steps once by LDC bytes; pCRow2 addresses
  // the second half (+32 bytes) of each 4-element group. v16-v19 update the
  // first group and v20-v23 the second. pCRow0 is advanced by 64 bytes.
  // Clobbers v0-v7, v10, v11, pCRow1 and pCRow2.
  666. .macro SAVE4x2
  667. fmov alpha0_R, alphaR
  668. fmov alpha0_I, alphaI
  669. mov pCRow1, pCRow0
  670. ld2 {v0.2d, v1.2d}, [pCRow1]
  671. fmla v0.2d, v16.2d, alphaV0_R
  672. fmls v0.2d, v17.2d, alphaV0_I
  673. fmla v1.2d, v16.2d, alphaV0_I
  674. fmla v1.2d, v17.2d, alphaV0_R
  675. st2 {v0.2d, v1.2d}, [pCRow1]
  676. add pCRow2, pCRow1, #32
  677. ld2 {v2.2d, v3.2d}, [pCRow2]
  678. fmla v2.2d, v18.2d, alphaV0_R
  679. fmls v2.2d, v19.2d, alphaV0_I
  680. fmla v3.2d, v18.2d, alphaV0_I
  681. fmla v3.2d, v19.2d, alphaV0_R
  682. st2 {v2.2d, v3.2d}, [pCRow2]
  683. add pCRow1, pCRow1, LDC
  684. ld2 {v4.2d, v5.2d}, [pCRow1]
  685. fmla v4.2d, v20.2d, alphaV0_R
  686. fmls v4.2d, v21.2d, alphaV0_I
  687. fmla v5.2d, v20.2d, alphaV0_I
  688. fmla v5.2d, v21.2d, alphaV0_R
  689. st2 {v4.2d, v5.2d}, [pCRow1]
  690. add pCRow2, pCRow1, #32
  691. ld2 {v6.2d, v7.2d}, [pCRow2]
  692. fmla v6.2d, v22.2d, alphaV0_R
  693. fmls v6.2d, v23.2d, alphaV0_I
  694. fmla v7.2d, v22.2d, alphaV0_I
  695. fmla v7.2d, v23.2d, alphaV0_R
  696. st2 {v6.2d, v7.2d}, [pCRow2]
  697. add pCRow0, pCRow0, #64
  698. .endm
  699. /******************************************************************************/
  // INIT2x2: clear the four accumulators (v16/v17, v20/v21) that
  // KERNEL2x2_SUB adds into. Each one is zeroed directly.
  700. .macro INIT2x2
  701. movi v16.2d, #0
  702. movi v17.2d, #0
  703. movi v20.2d, #0
  704. movi v21.2d, #0
  705. .endm
  // KERNEL2x2_SUB: one step of the 2x2 tile.
  // Loads two B elements (re in v8, im in v9) and two A elements (re in v0,
  // im in v1), then accumulates into v16/v17 (first B element) and v20/v21
  // (second B element). Advances pB and pA by 32 bytes each.
  706. .macro KERNEL2x2_SUB
  707. ld2 {v8.2d, v9.2d}, [pB]
  708. add pB, pB, #32
  709. ld2 {v0.2d, v1.2d}, [pA]
  710. add pA, pA, #32
  711. OP_rr v16.2d, v0.2d, v8.d[0]
  712. OP_ii v16.2d, v1.2d, v9.d[0]
  713. OP_ri v17.2d, v0.2d, v9.d[0]
  714. OP_ir v17.2d, v1.2d, v8.d[0]
  715. OP_rr v20.2d, v0.2d, v8.d[1]
  716. OP_ii v20.2d, v1.2d, v9.d[1]
  717. OP_ri v21.2d, v0.2d, v9.d[1]
  718. OP_ir v21.2d, v1.2d, v8.d[1]
  719. .endm
  // SAVE2x2: C += alpha * accumulators for a 2x2 tile.
  // pCRow1 starts at pCRow0 and steps once by LDC bytes; v16/v17 update the
  // first 2-element group and v20/v21 the second. pCRow0 is advanced by
  // 32 bytes. Clobbers v0, v1, v4, v5, v10, v11 and pCRow1.
  720. .macro SAVE2x2
  721. fmov alpha0_R, alphaR
  722. fmov alpha0_I, alphaI
  723. mov pCRow1, pCRow0
  724. ld2 {v0.2d, v1.2d}, [pCRow1]
  725. fmla v0.2d, v16.2d, alphaV0_R
  726. fmls v0.2d, v17.2d, alphaV0_I
  727. fmla v1.2d, v16.2d, alphaV0_I
  728. fmla v1.2d, v17.2d, alphaV0_R
  729. st2 {v0.2d, v1.2d}, [pCRow1]
  730. add pCRow1, pCRow1, LDC
  731. ld2 {v4.2d, v5.2d}, [pCRow1]
  732. fmla v4.2d, v20.2d, alphaV0_R
  733. fmls v4.2d, v21.2d, alphaV0_I
  734. fmla v5.2d, v20.2d, alphaV0_I
  735. fmla v5.2d, v21.2d, alphaV0_R
  736. st2 {v4.2d, v5.2d}, [pCRow1]
  737. add pCRow0, pCRow0, #32
  738. .endm
  739. /******************************************************************************/
  // INIT1x2: clear the four accumulators (d16/d17, d20/d21) that
  // KERNEL1x2_SUB adds into, using vector-immediate zeroing.
  740. .macro INIT1x2
  741. movi v16.2d, #0
  742. movi v17.2d, #0
  743. movi v20.2d, #0
  744. movi v21.2d, #0
  745. .endm
  // KERNEL1x2_SUB: one step of the 1x2 tile.
  // Loads two B elements (re in v8, im in v9) and a single A element into
  // lane 0 of v0 (re) / v1 (im), then accumulates with scalar operations
  // into d16/d17 and d20/d21.
  // Advances pB by 32 bytes and pA by 16 bytes.
  746. .macro KERNEL1x2_SUB
  747. ld2 {v8.2d, v9.2d}, [pB]
  748. add pB, pB, #32
  749. ld2 {v0.d, v1.d}[0], [pA]
  750. add pA, pA, #16
  751. OP_rr d16, d0, v8.d[0]
  752. OP_ii d16, d1, v9.d[0]
  753. OP_ri d17, d0, v9.d[0]
  754. OP_ir d17, d1, v8.d[0]
  755. OP_rr d20, d0, v8.d[1]
  756. OP_ii d20, d1, v9.d[1]
  757. OP_ri d21, d0, v9.d[1]
  758. OP_ir d21, d1, v8.d[1]
  759. .endm
  // SAVE1x2: C += alpha * accumulators for a 1x2 tile.
  // pCRow1 starts at pCRow0 and steps once by LDC bytes; d16/d17 update the
  // first element and d20/d21 the second (lane 0 only). pCRow0 is advanced
  // by 16 bytes. Clobbers v0, v1, v4, v5, v10, v11 and pCRow1.
  760. .macro SAVE1x2
  761. fmov alpha0_R, alphaR
  762. fmov alpha0_I, alphaI
  763. mov pCRow1, pCRow0
  764. ld2 {v0.d, v1.d}[0], [pCRow1]
  765. fmla d0, d16, alphaV0_R
  766. fmls d0, d17, alphaV0_I
  767. fmla d1, d16, alphaV0_I
  768. fmla d1, d17, alphaV0_R
  769. st2 {v0.d, v1.d}[0], [pCRow1]
  770. add pCRow1, pCRow1, LDC
  771. ld2 {v4.d, v5.d}[0], [pCRow1]
  772. fmla d4, d20, alphaV0_R
  773. fmls d4, d21, alphaV0_I
  774. fmla d5, d20, alphaV0_I
  775. fmla d5, d21, alphaV0_R
  776. st2 {v4.d, v5.d}[0], [pCRow1]
  777. add pCRow0, pCRow0, #16
  778. .endm
  779. /******************************************************************************/
  // INIT4x1: clear the four accumulators v16-v19 that KERNEL4x1_SUB adds
  // into. Each one is zeroed directly.
  780. .macro INIT4x1
  781. movi v16.2d, #0
  782. movi v17.2d, #0
  783. movi v18.2d, #0
  784. movi v19.2d, #0
  785. .endm
  // KERNEL4x1_SUB: one step of the 4x1 tile.
  // Loads a single B element into lane 0 of v8 (re) / v9 (im) and four A
  // elements (re in v0/v2, im in v1/v3), then accumulates into v16-v19.
  // Advances pB by 16 bytes and pA by 64 bytes.
  786. .macro KERNEL4x1_SUB
  787. ld2 {v8.d, v9.d}[0], [pB]
  788. add pB, pB, #16
  789. ld2 {v0.2d, v1.2d}, [pA]
  790. add pA, pA, #32
  791. ld2 {v2.2d, v3.2d}, [pA]
  792. add pA, pA, #32
  793. OP_rr v16.2d, v0.2d, v8.d[0]
  794. OP_ii v16.2d, v1.2d, v9.d[0]
  795. OP_ri v17.2d, v0.2d, v9.d[0]
  796. OP_ir v17.2d, v1.2d, v8.d[0]
  797. OP_rr v18.2d, v2.2d, v8.d[0]
  798. OP_ii v18.2d, v3.2d, v9.d[0]
  799. OP_ri v19.2d, v2.2d, v9.d[0]
  800. OP_ir v19.2d, v3.2d, v8.d[0]
  801. .endm
  // SAVE4x1: C += alpha * accumulators for a 4x1 tile.
  // pCRow1 is a copy of pCRow0 and pCRow2 addresses the second half
  // (+32 bytes); v16/v17 update the first two elements and v18/v19 the next
  // two. pCRow0 is advanced by 64 bytes.
  // Clobbers v0-v3, v10, v11, pCRow1 and pCRow2.
  802. .macro SAVE4x1
  803. fmov alpha0_R, alphaR
  804. fmov alpha0_I, alphaI
  805. mov pCRow1, pCRow0
  806. ld2 {v0.2d, v1.2d}, [pCRow1]
  807. fmla v0.2d, v16.2d, alphaV0_R
  808. fmls v0.2d, v17.2d, alphaV0_I
  809. fmla v1.2d, v16.2d, alphaV0_I
  810. fmla v1.2d, v17.2d, alphaV0_R
  811. st2 {v0.2d, v1.2d}, [pCRow1]
  812. add pCRow2, pCRow1, #32
  813. ld2 {v2.2d, v3.2d}, [pCRow2]
  814. fmla v2.2d, v18.2d, alphaV0_R
  815. fmls v2.2d, v19.2d, alphaV0_I
  816. fmla v3.2d, v18.2d, alphaV0_I
  817. fmla v3.2d, v19.2d, alphaV0_R
  818. st2 {v2.2d, v3.2d}, [pCRow2]
  819. add pCRow0, pCRow0, #64
  820. .endm
  821. /******************************************************************************/
  // INIT2x1: clear the accumulator pair v16/v17 that KERNEL2x1_SUB adds
  // into, using vector-immediate zeroing.
  822. .macro INIT2x1
  823. movi v16.2d, #0
  824. movi v17.2d, #0
  825. .endm
  // KERNEL2x1_SUB: one step of the 2x1 tile.
  // Loads a single B element into lane 0 of v8 (re) / v9 (im) and two A
  // elements (re in v0, im in v1), then accumulates into v16/v17.
  // Advances pB by 16 bytes and pA by 32 bytes.
  826. .macro KERNEL2x1_SUB
  827. ld2 {v8.d, v9.d}[0], [pB]
  828. add pB, pB, #16
  829. ld2 {v0.2d, v1.2d}, [pA]
  830. add pA, pA, #32
  831. OP_rr v16.2d, v0.2d, v8.d[0]
  832. OP_ii v16.2d, v1.2d, v9.d[0]
  833. OP_ri v17.2d, v0.2d, v9.d[0]
  834. OP_ir v17.2d, v1.2d, v8.d[0]
  835. .endm
  // SAVE2x1: C += alpha * accumulators for a 2x1 tile.
  // Updates the two elements at pCRow0 (through the copy in pCRow1) from
  // v16/v17 and advances pCRow0 by 32 bytes.
  // Clobbers v0, v1, v10, v11 and pCRow1.
  836. .macro SAVE2x1
  837. fmov alpha0_R, alphaR
  838. fmov alpha0_I, alphaI
  839. mov pCRow1, pCRow0
  840. ld2 {v0.2d, v1.2d}, [pCRow1]
  841. fmla v0.2d, v16.2d, alphaV0_R
  842. fmls v0.2d, v17.2d, alphaV0_I
  843. fmla v1.2d, v16.2d, alphaV0_I
  844. fmla v1.2d, v17.2d, alphaV0_R
  845. st2 {v0.2d, v1.2d}, [pCRow1]
  846. add pCRow0, pCRow0, #32
  847. .endm
  848. /******************************************************************************/
  // INIT1x1: clear the accumulator pair d16/d17 that KERNEL1x1_SUB adds
  // into, using vector-immediate zeroing.
  849. .macro INIT1x1
  850. movi v16.2d, #0
  851. movi v17.2d, #0
  852. .endm
  // KERNEL1x1_SUB: one step of the 1x1 tile.
  // Loads a single B element into lane 0 of v8 (re) / v9 (im) and a single
  // A element into lane 0 of v0 (re) / v1 (im), then accumulates with scalar
  // operations into d16/d17. Advances pB and pA by 16 bytes each.
  853. .macro KERNEL1x1_SUB
  854. ld2 {v8.d, v9.d}[0], [pB]
  855. add pB, pB, #16
  856. ld2 {v0.d, v1.d}[0], [pA]
  857. add pA, pA, #16
  858. OP_rr d16, d0, v8.d[0]
  859. OP_ii d16, d1, v9.d[0]
  860. OP_ri d17, d0, v9.d[0]
  861. OP_ir d17, d1, v8.d[0]
  862. .endm
  // SAVE1x1: C += alpha * accumulator for a single element.
  // Updates the element at pCRow0 (through the copy in pCRow1) from d16/d17
  // using lane 0 only, and advances pCRow0 by 16 bytes.
  // Clobbers v0, v1, v10, v11 and pCRow1.
  863. .macro SAVE1x1
  864. fmov alpha0_R, alphaR
  865. fmov alpha0_I, alphaI
  866. mov pCRow1, pCRow0
  867. ld2 {v0.d, v1.d}[0], [pCRow1]
  868. fmla d0, d16, alphaV0_R
  869. fmls d0, d17, alphaV0_I
  870. fmla d1, d16, alphaV0_I
  871. fmla d1, d17, alphaV0_R
  872. st2 {v0.d, v1.d}[0], [pCRow1]
  873. add pCRow0, pCRow0, #16
  874. .endm
  875. /*******************************************************************************
  876. * End of macro definitions
  877. *******************************************************************************/
  878. PROLOGUE
  879. .align 5
  880. add sp, sp, #-(11 * 16)
  881. stp d8, d9, [sp, #(0 * 16)]
  882. stp d10, d11, [sp, #(1 * 16)]
  883. stp d12, d13, [sp, #(2 * 16)]
  884. stp d14, d15, [sp, #(3 * 16)]
  885. stp d16, d17, [sp, #(4 * 16)]
  886. stp x18, x19, [sp, #(5 * 16)]
  887. stp x20, x21, [sp, #(6 * 16)]
  888. stp x22, x23, [sp, #(7 * 16)]
  889. stp x24, x25, [sp, #(8 * 16)]
  890. stp x26, x27, [sp, #(9 * 16)]
  891. str x28, [sp, #(10 * 16)]
  892. prfm PLDL1KEEP, [origPB]
  893. prfm PLDL1KEEP, [origPA]
  894. fmov alphaR, d0
  895. fmov alphaI, d1
  896. lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
  897. mov pB, origPB
  898. mov counterJ, origN
  899. asr counterJ, counterJ, #2 // J = J / 4
  900. cmp counterJ, #0
  901. ble .Lzgemm_kernel_L2_BEGIN
  902. .Lzgemm_kernel_L4_BEGIN:
  903. mov pCRow0, pC
  904. add pCRow1, pCRow0, LDC
  905. add pCRow2, pCRow1, LDC
  906. add pCRow3, pCRow2, LDC
  907. add pC, pCRow3, LDC
  908. mov pA, origPA // pA = start of A array
  909. .Lzgemm_kernel_L4_M4_BEGIN:
  910. mov counterI, origM
  911. asr counterI, counterI, #2 // counterI = counterI / 4
  912. cmp counterI, #0
  913. ble .Lzgemm_kernel_L4_M2_BEGIN // M / 4 <= 0: skip the 4x4 passes
  914. .align 5
  915. .Lzgemm_kernel_L4_M4_20: // one 4x4 pass; repeated counterI times
  916. mov pB, origPB // restart B for this pass
  917. asr counterL , origK, #3 // counterL = K / 8
  918. cmp counterL , #2
  919. blt .Lzgemm_kernel_L4_M4_32 // K / 8 < 2: take the short path below
  920. KERNEL4x4_I
  921. KERNEL4x4_M2
  922. KERNEL4x4_M1
  923. KERNEL4x4_M2
  924. KERNEL4x4_M1
  925. KERNEL4x4_M2
  926. KERNEL4x4_M1
  927. KERNEL4x4_M2
  928. subs counterL, counterL, #2 // subtract 2: the 8 kernel steps above plus the 8 at _22a
  929. ble .Lzgemm_kernel_L4_M4_22a // K / 8 was exactly 2: no middle iterations
  930. .align 5
  931. .Lzgemm_kernel_L4_M4_22: // middle loop: 8 kernel steps per iteration
  932. KERNEL4x4_M1
  933. KERNEL4x4_M2
  934. KERNEL4x4_M1
  935. KERNEL4x4_M2
  936. KERNEL4x4_M1
  937. KERNEL4x4_M2
  938. KERNEL4x4_M1
  939. KERNEL4x4_M2
  940. subs counterL, counterL, #1
  941. bgt .Lzgemm_kernel_L4_M4_22
  942. .align 5
  943. .Lzgemm_kernel_L4_M4_22a: // last 8 kernel steps of the long path, closed by KERNEL4x4_E
  944. KERNEL4x4_M1
  945. KERNEL4x4_M2
  946. KERNEL4x4_M1
  947. KERNEL4x4_M2
  948. KERNEL4x4_M1
  949. KERNEL4x4_M2
  950. KERNEL4x4_M1
  951. KERNEL4x4_E
  952. b .Lzgemm_kernel_L4_M4_44 // go handle the K % 8 remainder
  953. .align 5
  954. .Lzgemm_kernel_L4_M4_32: // reached with counterL = K / 8 < 2
  955. tst counterL, #1 // flags only: Z is set when bit 0 of counterL is clear
  956. ble .Lzgemm_kernel_L4_M4_40 // acts as beq here: tst clears V and the masked result has N = 0
  957. KERNEL4x4_I
  958. KERNEL4x4_M2
  959. KERNEL4x4_M1
  960. KERNEL4x4_M2
  961. KERNEL4x4_M1
  962. KERNEL4x4_M2
  963. KERNEL4x4_M1
  964. KERNEL4x4_E
  965. b .Lzgemm_kernel_L4_M4_44
  966. .Lzgemm_kernel_L4_M4_40: // no group of 8 kernel steps was run on this pass
  967. INIT4x4
  968. .Lzgemm_kernel_L4_M4_44:
  969. ands counterL , origK, #7 // counterL = K % 8
  970. ble .Lzgemm_kernel_L4_M4_100 // remainder is zero (ble acts as beq after ands with this mask)
  971. .align 5
  972. .Lzgemm_kernel_L4_M4_46: // remainder loop: one KERNEL4x4_SUB per leftover step
  973. KERNEL4x4_SUB
  974. subs counterL, counterL, #1
  975. bne .Lzgemm_kernel_L4_M4_46
  976. .Lzgemm_kernel_L4_M4_100:
  977. prfm PLDL1KEEP, [pA] // prefetch hints issued before SAVE4x4
  978. prfm PLDL1KEEP, [pA, #64]
  979. prfm PLDL1KEEP, [origPB]
  980. SAVE4x4
  981. .Lzgemm_kernel_L4_M4_END:
  982. subs counterI, counterI, #1
  983. bne .Lzgemm_kernel_L4_M4_20 // counterI > 0 on entry, so this runs M / 4 passes
  984. .Lzgemm_kernel_L4_M2_BEGIN:
  985. mov counterI, origM
  986. tst counterI , #3 // flags only: Z is set when the low two bits of M are clear
  987. ble .Lzgemm_kernel_L4_END // acts as beq after tst: M is a multiple of 4, nothing left
  988. tst counterI, #2 // flags only (counterI is unchanged): is bit 1 of M set?
  989. ble .Lzgemm_kernel_L4_M1_BEGIN // bit 1 clear: skip the 2x4 block
  990. .Lzgemm_kernel_L4_M2_20:
  991. INIT2x4
  992. mov pB, origPB
  993. asr counterL , origK, #3 // counterL = K / 8
  994. cmp counterL , #0
  995. ble .Lzgemm_kernel_L4_M2_40
  996. .Lzgemm_kernel_L4_M2_22: // 8 KERNEL2x4_SUB steps per iteration
  997. KERNEL2x4_SUB
  998. KERNEL2x4_SUB
  999. KERNEL2x4_SUB
  1000. KERNEL2x4_SUB
  1001. KERNEL2x4_SUB
  1002. KERNEL2x4_SUB
  1003. KERNEL2x4_SUB
  1004. KERNEL2x4_SUB
  1005. subs counterL, counterL, #1
  1006. bgt .Lzgemm_kernel_L4_M2_22
  1007. .Lzgemm_kernel_L4_M2_40:
  1008. ands counterL , origK, #7 // counterL = K % 8
  1009. ble .Lzgemm_kernel_L4_M2_100 // remainder is zero
  1010. .Lzgemm_kernel_L4_M2_42: // remainder loop: one step per iteration
  1011. KERNEL2x4_SUB
  1012. subs counterL, counterL, #1
  1013. bgt .Lzgemm_kernel_L4_M2_42
  1014. .Lzgemm_kernel_L4_M2_100:
  1015. SAVE2x4
  1016. .Lzgemm_kernel_L4_M2_END:
  1017. .Lzgemm_kernel_L4_M1_BEGIN:
  1018. tst counterI, #1 // flags only (counterI is unchanged): is bit 0 set?
  1019. ble .Lzgemm_kernel_L4_END // acts as beq after tst: bit 0 clear, skip the 1x4 block
  1020. .Lzgemm_kernel_L4_M1_20:
  1021. INIT1x4
  1022. mov pB, origPB
  1023. asr counterL , origK, #3 // counterL = K / 8
  1024. cmp counterL , #0
  1025. ble .Lzgemm_kernel_L4_M1_40
  1026. .Lzgemm_kernel_L4_M1_22: // 8 KERNEL1x4_SUB steps per iteration
  1027. KERNEL1x4_SUB
  1028. KERNEL1x4_SUB
  1029. KERNEL1x4_SUB
  1030. KERNEL1x4_SUB
  1031. KERNEL1x4_SUB
  1032. KERNEL1x4_SUB
  1033. KERNEL1x4_SUB
  1034. KERNEL1x4_SUB
  1035. subs counterL, counterL, #1
  1036. bgt .Lzgemm_kernel_L4_M1_22
  1037. .Lzgemm_kernel_L4_M1_40:
  1038. ands counterL , origK, #7 // counterL = K % 8
  1039. ble .Lzgemm_kernel_L4_M1_100 // remainder is zero
  1040. .Lzgemm_kernel_L4_M1_42: // remainder loop: one step per iteration
  1041. KERNEL1x4_SUB
  1042. subs counterL, counterL, #1
  1043. bgt .Lzgemm_kernel_L4_M1_42
  1044. .Lzgemm_kernel_L4_M1_100:
  1045. SAVE1x4
  1046. .Lzgemm_kernel_L4_END:
  1047. lsl temp, origK, #6 // temp = K * 64
  1048. add origPB, origPB, temp // B = B + K * 4 * 8 * 2
  1049. subs counterJ, counterJ , #1 // j--
  1050. bgt .Lzgemm_kernel_L4_BEGIN // more groups of 4 left in N
  1051. /******************************************************************************/
  1052. .Lzgemm_kernel_L2_BEGIN: // fewer than 4 left in N direction
  1053. mov counterJ , origN
  1054. tst counterJ , #3 // flags only: Z is set when the low two bits of N are clear
  1055. ble .Lzgemm_kernel_L999 // acts as beq after tst: N is a multiple of 4, done
  1056. tst counterJ , #2 // flags only: is bit 1 of N set?
  1057. ble .Lzgemm_kernel_L1_BEGIN // bit 1 clear: skip the 2-wide pass
  1058. mov pCRow0, pC // pCRow0 = pC
  1059. add pC,pC,LDC, lsl #1 // pC = pC + 2 * LDC
  1060. mov pA, origPA // pA = A
  1061. .Lzgemm_kernel_L2_M4_BEGIN:
  1062. mov counterI, origM
  1063. asr counterI, counterI, #2 // counterI = counterI / 4
  1064. cmp counterI,#0
  1065. ble .Lzgemm_kernel_L2_M2_BEGIN // M / 4 <= 0: skip the 4x2 passes
  1066. .Lzgemm_kernel_L2_M4_20: // one 4x2 pass; repeated counterI times
  1067. INIT4x2
  1068. mov pB, origPB
  1069. asr counterL , origK, #3 // counterL = K / 8
  1070. cmp counterL,#0
  1071. ble .Lzgemm_kernel_L2_M4_40
  1072. .align 5
  1073. .Lzgemm_kernel_L2_M4_22: // 8 KERNEL4x2_SUB steps per iteration
  1074. KERNEL4x2_SUB
  1075. KERNEL4x2_SUB
  1076. KERNEL4x2_SUB
  1077. KERNEL4x2_SUB
  1078. KERNEL4x2_SUB
  1079. KERNEL4x2_SUB
  1080. KERNEL4x2_SUB
  1081. KERNEL4x2_SUB
  1082. subs counterL, counterL, #1
  1083. bgt .Lzgemm_kernel_L2_M4_22
  1084. .Lzgemm_kernel_L2_M4_40:
  1085. ands counterL , origK, #7 // counterL = K % 8
  1086. ble .Lzgemm_kernel_L2_M4_100 // remainder is zero
  1087. .Lzgemm_kernel_L2_M4_42: // remainder loop: one step per iteration
  1088. KERNEL4x2_SUB
  1089. subs counterL, counterL, #1
  1090. bgt .Lzgemm_kernel_L2_M4_42
  1091. .Lzgemm_kernel_L2_M4_100:
  1092. SAVE4x2
  1093. .Lzgemm_kernel_L2_M4_END:
  1094. subs counterI, counterI, #1
  1095. bgt .Lzgemm_kernel_L2_M4_20
  1096. .Lzgemm_kernel_L2_M2_BEGIN:
  1097. mov counterI, origM
  1098. tst counterI , #3 // flags only: Z is set when the low two bits of M are clear
  1099. ble .Lzgemm_kernel_L2_END // acts as beq after tst: M is a multiple of 4, nothing left
  1100. tst counterI, #2 // flags only (counterI is unchanged): is bit 1 of M set?
  1101. ble .Lzgemm_kernel_L2_M1_BEGIN // bit 1 clear: skip the 2x2 block
  1102. .Lzgemm_kernel_L2_M2_20:
  1103. INIT2x2
  1104. mov pB, origPB
  1105. asr counterL , origK, #3 // counterL = K / 8
  1106. cmp counterL,#0
  1107. ble .Lzgemm_kernel_L2_M2_40
  1108. .Lzgemm_kernel_L2_M2_22: // 8 KERNEL2x2_SUB steps per iteration
  1109. KERNEL2x2_SUB
  1110. KERNEL2x2_SUB
  1111. KERNEL2x2_SUB
  1112. KERNEL2x2_SUB
  1113. KERNEL2x2_SUB
  1114. KERNEL2x2_SUB
  1115. KERNEL2x2_SUB
  1116. KERNEL2x2_SUB
  1117. subs counterL, counterL, #1
  1118. bgt .Lzgemm_kernel_L2_M2_22
  1119. .Lzgemm_kernel_L2_M2_40:
  1120. ands counterL , origK, #7 // counterL = K % 8
  1121. ble .Lzgemm_kernel_L2_M2_100 // remainder is zero
  1122. .Lzgemm_kernel_L2_M2_42: // remainder loop: one step per iteration
  1123. KERNEL2x2_SUB
  1124. subs counterL, counterL, #1
  1125. bgt .Lzgemm_kernel_L2_M2_42
  1126. .Lzgemm_kernel_L2_M2_100:
  1127. SAVE2x2
  1128. .Lzgemm_kernel_L2_M2_END:
  1129. .Lzgemm_kernel_L2_M1_BEGIN:
  1130. tst counterI, #1 // flags only (counterI is unchanged): is bit 0 set?
  1131. ble .Lzgemm_kernel_L2_END // acts as beq after tst: bit 0 clear, skip the 1x2 block
  1132. .Lzgemm_kernel_L2_M1_20:
  1133. INIT1x2
  1134. mov pB, origPB
  1135. asr counterL , origK, #3 // counterL = K / 8
  1136. cmp counterL, #0
  1137. ble .Lzgemm_kernel_L2_M1_40
  1138. .Lzgemm_kernel_L2_M1_22: // 8 KERNEL1x2_SUB steps per iteration
  1139. KERNEL1x2_SUB
  1140. KERNEL1x2_SUB
  1141. KERNEL1x2_SUB
  1142. KERNEL1x2_SUB
  1143. KERNEL1x2_SUB
  1144. KERNEL1x2_SUB
  1145. KERNEL1x2_SUB
  1146. KERNEL1x2_SUB
  1147. subs counterL, counterL, #1
  1148. bgt .Lzgemm_kernel_L2_M1_22
  1149. .Lzgemm_kernel_L2_M1_40:
  1150. ands counterL , origK, #7 // counterL = K % 8
  1151. ble .Lzgemm_kernel_L2_M1_100 // remainder is zero
  1152. .Lzgemm_kernel_L2_M1_42: // remainder loop: one step per iteration
  1153. KERNEL1x2_SUB
  1154. subs counterL, counterL, #1
  1155. bgt .Lzgemm_kernel_L2_M1_42
  1156. .Lzgemm_kernel_L2_M1_100:
  1157. SAVE1x2
  1158. .Lzgemm_kernel_L2_END:
  1159. lsl temp, origK, #5 // temp = K * 32
  1160. add origPB, origPB, temp // B = B + K * 2 * 8 * 2
  1161. /******************************************************************************/
  1162. .Lzgemm_kernel_L1_BEGIN:
  1163. mov counterJ , origN
  1164. tst counterJ , #1 // flags only: is bit 0 of N set?
  1165. ble .Lzgemm_kernel_L999 // done (acts as beq after tst: bit 0 of N is clear)
  1166. mov pCRow0, pC // pCRow0 = C
  1167. add pC , pC , LDC // Update pC to point to next
  1168. mov pA, origPA // pA = A
  1169. .Lzgemm_kernel_L1_M4_BEGIN:
  1170. mov counterI, origM
  1171. asr counterI, counterI, #2 // counterI = counterI / 4
  1172. cmp counterI, #0
  1173. ble .Lzgemm_kernel_L1_M2_BEGIN // M / 4 <= 0: skip the 4x1 passes
  1174. .Lzgemm_kernel_L1_M4_20: // one 4x1 pass; repeated counterI times
  1175. INIT4x1
  1176. mov pB, origPB
  1177. asr counterL , origK, #3 // counterL = K / 8
  1178. cmp counterL , #0
  1179. ble .Lzgemm_kernel_L1_M4_40
  1180. .align 5
  1181. .Lzgemm_kernel_L1_M4_22: // 8 KERNEL4x1_SUB steps per iteration
  1182. KERNEL4x1_SUB
  1183. KERNEL4x1_SUB
  1184. KERNEL4x1_SUB
  1185. KERNEL4x1_SUB
  1186. KERNEL4x1_SUB
  1187. KERNEL4x1_SUB
  1188. KERNEL4x1_SUB
  1189. KERNEL4x1_SUB
  1190. subs counterL, counterL, #1
  1191. bgt .Lzgemm_kernel_L1_M4_22
  1192. .Lzgemm_kernel_L1_M4_40:
  1193. ands counterL , origK, #7 // counterL = K % 8
  1194. ble .Lzgemm_kernel_L1_M4_100 // remainder is zero
  1195. .Lzgemm_kernel_L1_M4_42: // remainder loop: one step per iteration
  1196. KERNEL4x1_SUB
  1197. subs counterL, counterL, #1
  1198. bgt .Lzgemm_kernel_L1_M4_42
  1199. .Lzgemm_kernel_L1_M4_100:
  1200. SAVE4x1
  1201. .Lzgemm_kernel_L1_M4_END:
  1202. subs counterI, counterI, #1
  1203. bgt .Lzgemm_kernel_L1_M4_20
  1204. .Lzgemm_kernel_L1_M2_BEGIN:
  1205. mov counterI, origM
  1206. tst counterI , #3 // flags only: Z is set when the low two bits of M are clear
  1207. ble .Lzgemm_kernel_L1_END // acts as beq after tst: M is a multiple of 4, nothing left
  1208. tst counterI, #2 // flags only (counterI is unchanged): is bit 1 of M set?
  1209. ble .Lzgemm_kernel_L1_M1_BEGIN // bit 1 clear: skip the 2x1 block
  1210. .Lzgemm_kernel_L1_M2_20:
  1211. INIT2x1
  1212. mov pB, origPB
  1213. asr counterL , origK, #3 // counterL = K / 8
  1214. cmp counterL , #0
  1215. ble .Lzgemm_kernel_L1_M2_40
  1216. .Lzgemm_kernel_L1_M2_22: // 8 KERNEL2x1_SUB steps per iteration
  1217. KERNEL2x1_SUB
  1218. KERNEL2x1_SUB
  1219. KERNEL2x1_SUB
  1220. KERNEL2x1_SUB
  1221. KERNEL2x1_SUB
  1222. KERNEL2x1_SUB
  1223. KERNEL2x1_SUB
  1224. KERNEL2x1_SUB
  1225. subs counterL, counterL, #1
  1226. bgt .Lzgemm_kernel_L1_M2_22
  1227. .Lzgemm_kernel_L1_M2_40:
  1228. ands counterL , origK, #7 // counterL = K % 8
  1229. ble .Lzgemm_kernel_L1_M2_100 // remainder is zero
  1230. .Lzgemm_kernel_L1_M2_42: // remainder loop: one step per iteration
  1231. KERNEL2x1_SUB
  1232. subs counterL, counterL, #1
  1233. bgt .Lzgemm_kernel_L1_M2_42
  1234. .Lzgemm_kernel_L1_M2_100:
  1235. SAVE2x1
  1236. .Lzgemm_kernel_L1_M2_END:
  1237. .Lzgemm_kernel_L1_M1_BEGIN:
  1238. tst counterI, #1 // flags only (counterI is unchanged): is bit 0 set?
  1239. ble .Lzgemm_kernel_L1_END // acts as beq after tst: bit 0 clear, skip the 1x1 block
  1240. .Lzgemm_kernel_L1_M1_20:
  1241. INIT1x1
  1242. mov pB, origPB
  1243. asr counterL , origK, #3 // counterL = K / 8
  1244. cmp counterL , #0
  1245. ble .Lzgemm_kernel_L1_M1_40
  1246. .Lzgemm_kernel_L1_M1_22: // 8 KERNEL1x1_SUB steps per iteration
  1247. KERNEL1x1_SUB
  1248. KERNEL1x1_SUB
  1249. KERNEL1x1_SUB
  1250. KERNEL1x1_SUB
  1251. KERNEL1x1_SUB
  1252. KERNEL1x1_SUB
  1253. KERNEL1x1_SUB
  1254. KERNEL1x1_SUB
  1255. subs counterL, counterL, #1
  1256. bgt .Lzgemm_kernel_L1_M1_22
  1257. .Lzgemm_kernel_L1_M1_40:
  1258. ands counterL , origK, #7 // counterL = K % 8
  1259. ble .Lzgemm_kernel_L1_M1_100 // remainder is zero
  1260. .Lzgemm_kernel_L1_M1_42: // remainder loop: one step per iteration
  1261. KERNEL1x1_SUB
  1262. subs counterL, counterL, #1
  1263. bgt .Lzgemm_kernel_L1_M1_42
  1264. .Lzgemm_kernel_L1_M1_100:
  1265. SAVE1x1
  1266. .Lzgemm_kernel_L1_END:
  1267. .Lzgemm_kernel_L999: // common exit: restore registers from the stack frame and return 0
  1268. mov x0, #0 // set return value
  1269. ldp d8, d9, [sp, #(0 * 16)] // slots 0-4: floating-point registers d8-d17
  1270. ldp d10, d11, [sp, #(1 * 16)]
  1271. ldp d12, d13, [sp, #(2 * 16)]
  1272. ldp d14, d15, [sp, #(3 * 16)]
  1273. ldp d16, d17, [sp, #(4 * 16)] // d16/d17 are not callee-saved under AAPCS64; restoring them is harmless
  1274. ldp x18, x19, [sp, #(5 * 16)] // NOTE(review): x18 is platform-reserved on some ABIs (Apple, Windows) - confirm supported targets
  1275. ldp x20, x21, [sp, #(6 * 16)] // slots 5-10: general registers x18-x28
  1276. ldp x22, x23, [sp, #(7 * 16)]
  1277. ldp x24, x25, [sp, #(8 * 16)]
  1278. ldp x26, x27, [sp, #(9 * 16)]
  1279. ldr x28, [sp, #(10 * 16)]
  1280. add sp, sp, #(11*16) // release the 11 * 16-byte frame
  1281. ret
  1282. EPILOGUE