You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_4x8.S 31 kB


  1. /*******************************************************************************
  2. Copyright (c) 2015, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* x0 x1 x2 d0 x3 x4 x5 x6 */
  30. /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
  31. #define origM x0 /* argument bm */
  32. #define origN x1 /* argument bn */
  33. #define origK x2 /* argument bk */
  34. #define origPA x3 /* argument ba: start of the A array */
  35. #define origPB x4 /* argument bb: start of B; advanced after each block along N */
  36. #define pC x5 /* argument C; advanced past each block along N */
  37. #define LDC x6 /* argument ldc; scaled to bytes (ldc * 8) in the prologue */
  38. #define temp x7 /* scratch */
  39. #define counterL x8 /* K-loop counter */
  40. #define counterI x9 /* M-loop counter */
  41. #define counterJ x10 /* N-loop counter */
  42. #define pB x11 /* running pointer into B, reloaded from origPB per tile */
  43. #define pCRow0 x12 /* pointer to line 0 of the current C tile */
  44. #define pCRow1 x13 /* C line pointer recomputed inside the SAVE macros */
  45. #define pCRow2 x14 /* C line pointer recomputed inside the SAVE macros */
  46. #define pA x15 /* running pointer into A */
  47. #define alpha0 d2 /* copy of alpha (scalar view) */
  48. #define alphaV0 v2.d[0] /* same register as alpha0, by-element view */
  49. #define alpha1 d3 /* copy of alpha (scalar view) */
  50. #define alphaV1 v3.d[0] /* same register as alpha1, by-element view */
  51. #define alpha2 d6 /* copy of alpha (scalar view) */
  52. #define alphaV2 v6.d[0] /* same register as alpha2, by-element view */
  53. #define alpha3 d7 /* copy of alpha (scalar view) */
  54. #define alphaV3 v7.d[0] /* same register as alpha3, by-element view */
  55. // 00 origM
  56. // 01 origN
  57. // 02 origK
  58. // 03 origPA
  59. // 04 origPB
  60. // 05 pC
  61. // 06 origLDC -> LDC
  62. // 07 temp
  63. // 08 counterL
  64. // 09 counterI
  65. // 10 counterJ
  66. // 11 pB
  67. // 12 pCRow0
  68. // 13 pCRow1
  69. // 14 pCRow2
  70. // 15 pA
  71. // 16
  72. // 17
  73. // 18 platform register (not callee-saved under AAPCS64; the prologue still spills it)
  74. // 19 must save
  75. // 20 must save
  76. // 21 must save
  77. // 22 must save
  78. // 23 must save
  79. // 24 must save
  80. // 25 must save
  81. // 26 must save
  82. // 27 must save
  83. // 28 must save
  84. // 29 frame
  85. // 30 link
  86. // 31 sp
  87. //v00 ALPHA -> pA00, pA01
  88. //v01 pA02, pA03
  89. //v02 ALPHA0
  90. //v03 ALPHA1
  91. //v04 pA10, pA11
  92. //v05 pA12, pA13
  93. //v06 ALPHA2
  94. //v07 ALPHA3
  95. //v08 must save pB0_0, pB0_1
  96. //v09 must save pB0_2, pB0_3
  97. //v10 must save pB0_4, pB0_5
  98. //v11 must save pB0_6, pB0_7
  99. //v12 must save pB1_0, pB1_1
  100. //v13 must save pB1_2, pB1_3
  101. //v14 must save pB1_4, pB1_5
  102. //v15 must save pB1_6, pB1_7
  103. //v16 C00, C01 (v16-v31 are caller-saved; Cji = line j, element i of the 4x8 tile)
  104. //v17 C02, C03
  105. //v18 C10, C11
  106. //v19 C12, C13
  107. //v20 C20, C21
  108. //v21 C22, C23
  109. //v22 C30, C31
  110. //v23 C32, C33
  111. //v24 C40, C41
  112. //v25 C42, C43
  113. //v26 C50, C51
  114. //v27 C52, C53
  115. //v28 C60, C61
  116. //v29 C62, C63
  117. //v30 C70, C71
  118. //v31 C72, C73
  119. /*******************************************************************************
  120. * Macro definitions
  121. *******************************************************************************/
  /*
   * INIT4x8: clear the sixteen 4x8 accumulators v16-v31.
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT4x8
      movi    v16.2d, #0
      movi    v17.2d, #0
      movi    v18.2d, #0
      movi    v19.2d, #0
      movi    v20.2d, #0
      movi    v21.2d, #0
      movi    v22.2d, #0
      movi    v23.2d, #0
      movi    v24.2d, #0
      movi    v25.2d, #0
      movi    v26.2d, #0
      movi    v27.2d, #0
      movi    v28.2d, #0
      movi    v29.2d, #0
      movi    v30.2d, #0
      movi    v31.2d, #0
  .endm
  /*
   * KERNEL4x8_I: opening step of the pipelined 4x8 K loop.
   * Loads 4 A values (v0-v1) and 8 B values (v8-v11) and writes their products
   * into v16-v31 with fmul, so the accumulators need no prior clearing.
   * Then preloads the next step's operands into v4-v5 (A) and v12-v15 (B).
   * Net effect on the pointers: pA += 64 bytes, pB += 128 bytes.
   */
  .macro KERNEL4x8_I
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v10.2d, v11.2d}, [pB], #32

      fmul    v16.2d, v0.2d, v8.d[0]
      fmul    v17.2d, v1.2d, v8.d[0]
      fmul    v18.2d, v0.2d, v8.d[1]
      fmul    v19.2d, v1.2d, v8.d[1]
      fmul    v20.2d, v0.2d, v9.d[0]
      fmul    v21.2d, v1.2d, v9.d[0]
      fmul    v22.2d, v0.2d, v9.d[1]
      fmul    v23.2d, v1.2d, v9.d[1]
      fmul    v24.2d, v0.2d, v10.d[0]
      fmul    v25.2d, v1.2d, v10.d[0]
      fmul    v26.2d, v0.2d, v10.d[1]
      fmul    v27.2d, v1.2d, v10.d[1]
      fmul    v28.2d, v0.2d, v11.d[0]
      fmul    v29.2d, v1.2d, v11.d[0]
      fmul    v30.2d, v0.2d, v11.d[1]
      fmul    v31.2d, v1.2d, v11.d[1]

      // Operands consumed by the following KERNEL4x8_M2 / KERNEL4x8_E.
      ld1     {v12.2d, v13.2d}, [pB], #32
      ld1     {v4.2d, v5.2d}, [pA], #32
      ld1     {v14.2d, v15.2d}, [pB], #32
  .endm
  /*
   * KERNEL4x8_M1: pipelined 4x8 step on the "even" operand set.
   * Accumulates A (v0-v1) times B (v8-v11) into v16-v31, then loads the
   * operands of the next step into v4-v5 and v12-v15 and prefetches A.
   * pA += 32 bytes, pB += 64 bytes.
   */
  .macro KERNEL4x8_M1
      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v0.2d, v8.d[1]
      fmla    v19.2d, v1.2d, v8.d[1]
      fmla    v20.2d, v0.2d, v9.d[0]
      fmla    v21.2d, v1.2d, v9.d[0]
      fmla    v22.2d, v0.2d, v9.d[1]
      fmla    v23.2d, v1.2d, v9.d[1]
      fmla    v24.2d, v0.2d, v10.d[0]
      fmla    v25.2d, v1.2d, v10.d[0]
      fmla    v26.2d, v0.2d, v10.d[1]
      fmla    v27.2d, v1.2d, v10.d[1]
      fmla    v28.2d, v0.2d, v11.d[0]
      fmla    v29.2d, v1.2d, v11.d[0]
      fmla    v30.2d, v0.2d, v11.d[1]
      fmla    v31.2d, v1.2d, v11.d[1]

      // Next step's operands.
      ld1     {v12.2d, v13.2d}, [pB], #32
      ld1     {v4.2d, v5.2d}, [pA], #32
      ld1     {v14.2d, v15.2d}, [pB], #32

      prfm    PLDL1KEEP, [pA, #512]
  .endm
  /*
   * KERNEL4x8_M2: pipelined 4x8 step on the "odd" operand set.
   * Accumulates A (v4-v5) times B (v12-v15) into v16-v31, then loads the
   * operands of the next step into v0-v1 and v8-v11 and prefetches B.
   * pA += 32 bytes, pB += 64 bytes.
   */
  .macro KERNEL4x8_M2
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v17.2d, v5.2d, v12.d[0]
      fmla    v18.2d, v4.2d, v12.d[1]
      fmla    v19.2d, v5.2d, v12.d[1]
      fmla    v20.2d, v4.2d, v13.d[0]
      fmla    v21.2d, v5.2d, v13.d[0]
      fmla    v22.2d, v4.2d, v13.d[1]
      fmla    v23.2d, v5.2d, v13.d[1]
      fmla    v24.2d, v4.2d, v14.d[0]
      fmla    v25.2d, v5.2d, v14.d[0]
      fmla    v26.2d, v4.2d, v14.d[1]
      fmla    v27.2d, v5.2d, v14.d[1]
      fmla    v28.2d, v4.2d, v15.d[0]
      fmla    v29.2d, v5.2d, v15.d[0]
      fmla    v30.2d, v4.2d, v15.d[1]
      fmla    v31.2d, v5.2d, v15.d[1]

      // Next step's operands.
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v10.2d, v11.2d}, [pB], #32

      prfm    PLDL1KEEP, [pB, #512]
  .endm
  /*
   * KERNEL4x8_E: closing step of the pipelined 4x8 K loop.
   * Accumulates the already-loaded operands A (v4-v5) and B (v12-v15) into
   * v16-v31. Performs no loads and leaves pA/pB untouched.
   */
  .macro KERNEL4x8_E
      // B elements 0-1 (v12)
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v17.2d, v5.2d, v12.d[0]
      fmla    v18.2d, v4.2d, v12.d[1]
      fmla    v19.2d, v5.2d, v12.d[1]
      // B elements 2-3 (v13)
      fmla    v20.2d, v4.2d, v13.d[0]
      fmla    v21.2d, v5.2d, v13.d[0]
      fmla    v22.2d, v4.2d, v13.d[1]
      fmla    v23.2d, v5.2d, v13.d[1]
      // B elements 4-5 (v14)
      fmla    v24.2d, v4.2d, v14.d[0]
      fmla    v25.2d, v5.2d, v14.d[0]
      fmla    v26.2d, v4.2d, v14.d[1]
      fmla    v27.2d, v5.2d, v14.d[1]
      // B elements 6-7 (v15)
      fmla    v28.2d, v4.2d, v15.d[0]
      fmla    v29.2d, v5.2d, v15.d[0]
      fmla    v30.2d, v4.2d, v15.d[1]
      fmla    v31.2d, v5.2d, v15.d[1]
  .endm
  /*
   * KERNEL4x8_SUB: one self-contained (non-pipelined) 4x8 K step.
   * Loads 4 A values (v0-v1) and 8 B values (v8-v11) and accumulates their
   * products into v16-v31 in the same step.
   * pA += 32 bytes, pB += 64 bytes.
   */
  .macro KERNEL4x8_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32
      ld1     {v10.2d, v11.2d}, [pB], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v18.2d, v0.2d, v8.d[1]
      fmla    v19.2d, v1.2d, v8.d[1]
      fmla    v20.2d, v0.2d, v9.d[0]
      fmla    v21.2d, v1.2d, v9.d[0]
      fmla    v22.2d, v0.2d, v9.d[1]
      fmla    v23.2d, v1.2d, v9.d[1]
      fmla    v24.2d, v0.2d, v10.d[0]
      fmla    v25.2d, v1.2d, v10.d[0]
      fmla    v26.2d, v0.2d, v10.d[1]
      fmla    v27.2d, v1.2d, v10.d[1]
      fmla    v28.2d, v0.2d, v11.d[0]
      fmla    v29.2d, v1.2d, v11.d[0]
      fmla    v30.2d, v0.2d, v11.d[1]
      fmla    v31.2d, v1.2d, v11.d[1]
  .endm
  /*
   * SAVE4x8: C += alpha * accumulators for a 4x8 tile.
   * "Line j" below is the 4 doubles at pCRow0 + j*LDC, j = 0..7.
   * pCRow1/pCRow2 alternate as the pointer to the next line; v8-v15 hold the
   * loaded C values. alphaV0-alphaV3 are separate registers that the prologue
   * in this file fills with the same alpha. Ends with pCRow0 += 32 bytes.
   */
  262. .macro SAVE4x8
  263. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  264. ld1 {v8.2d, v9.2d}, [pCRow0] // line 0 <- v16, v17
  265. fmla v8.2d, v16.2d, alphaV0
  266. fmla v9.2d, v17.2d, alphaV1
  267. st1 {v8.2d, v9.2d}, [pCRow0]
  268. add pCRow2, pCRow1, LDC // pCRow2 -> line 2
  269. ld1 {v10.2d, v11.2d}, [pCRow1] // line 1 <- v18, v19
  270. fmla v10.2d, v18.2d, alphaV2
  271. fmla v11.2d, v19.2d, alphaV3
  272. st1 {v10.2d, v11.2d}, [pCRow1]
  273. add pCRow1, pCRow2, LDC // pCRow1 -> line 3
  274. ld1 {v12.2d, v13.2d}, [pCRow2] // line 2 <- v20, v21
  275. fmla v12.2d, v20.2d, alphaV0
  276. fmla v13.2d, v21.2d, alphaV1
  277. st1 {v12.2d, v13.2d}, [pCRow2]
  278. add pCRow2, pCRow1, LDC // pCRow2 -> line 4
  279. ld1 {v14.2d, v15.2d}, [pCRow1] // line 3 <- v22, v23
  280. fmla v14.2d, v22.2d, alphaV2
  281. fmla v15.2d, v23.2d, alphaV3
  282. st1 {v14.2d, v15.2d}, [pCRow1]
  283. add pCRow1, pCRow2, LDC // pCRow1 -> line 5
  284. ld1 {v8.2d, v9.2d}, [pCRow2] // line 4 <- v24, v25
  285. fmla v8.2d, v24.2d, alphaV0
  286. fmla v9.2d, v25.2d, alphaV1
  287. st1 {v8.2d, v9.2d}, [pCRow2]
  288. add pCRow2, pCRow1, LDC // pCRow2 -> line 6
  289. ld1 {v10.2d, v11.2d}, [pCRow1] // line 5 <- v26, v27
  290. fmla v10.2d, v26.2d, alphaV2
  291. fmla v11.2d, v27.2d, alphaV3
  292. st1 {v10.2d, v11.2d}, [pCRow1]
  293. add pCRow1, pCRow2, LDC // pCRow1 -> line 7
  294. ld1 {v12.2d, v13.2d}, [pCRow2] // line 6 <- v28, v29
  295. fmla v12.2d, v28.2d, alphaV0
  296. fmla v13.2d, v29.2d, alphaV1
  297. st1 {v12.2d, v13.2d}, [pCRow2]
  298. ld1 {v14.2d, v15.2d}, [pCRow1] // line 7 <- v30, v31
  299. fmla v14.2d, v30.2d, alphaV2
  300. fmla v15.2d, v31.2d, alphaV3
  301. st1 {v14.2d, v15.2d}, [pCRow1]
  302. add pCRow0, pCRow0, #32 // step to the next tile: 4 doubles further
  303. .endm
  304. /******************************************************************************/
  /*
   * INIT2x8: clear the eight 2x8 accumulators (even registers v16..v30).
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT2x8
      movi    v16.2d, #0
      movi    v18.2d, #0
      movi    v20.2d, #0
      movi    v22.2d, #0
      movi    v24.2d, #0
      movi    v26.2d, #0
      movi    v28.2d, #0
      movi    v30.2d, #0
  .endm
  /*
   * KERNEL2x8_SUB: one K step of the 2x8 tile.
   * Loads 2 A values (v0) and 8 B values (v8-v11) and accumulates
   * A * B[j] into v16, v18, ..., v30 (one register per B element j).
   * pA += 16 bytes, pB += 64 bytes.
   */
  .macro KERNEL2x8_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d}, [pA], #16
      ld1     {v10.2d, v11.2d}, [pB], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v18.2d, v0.2d, v8.d[1]
      fmla    v20.2d, v0.2d, v9.d[0]
      fmla    v22.2d, v0.2d, v9.d[1]
      fmla    v24.2d, v0.2d, v10.d[0]
      fmla    v26.2d, v0.2d, v10.d[1]
      fmla    v28.2d, v0.2d, v11.d[0]
      fmla    v30.2d, v0.2d, v11.d[1]
  .endm
  /*
   * SAVE2x8: C += alpha * accumulators for a 2x8 tile.
   * "Line j" is the 2 doubles at pCRow0 + j*LDC, j = 0..7; line j is updated
   * from accumulator v(16 + 2*j). pCRow1/pCRow2 alternate as the pointer to
   * the next line and v8/v10/v12/v14 hold the loaded C values.
   * alphaV0 and alphaV2 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 16 bytes.
   *
   * The pointer past line 7 is deliberately not computed: no line 8 exists and
   * every SAVE macro rebuilds pCRow1/pCRow2 from pCRow0 before using them.
   */
  .macro SAVE2x8
      add     pCRow1, pCRow0, LDC         // pCRow1 -> line 1
      ld1     {v8.2d}, [pCRow0]           // line 0
      fmla    v8.2d, v16.2d, alphaV0
      st1     {v8.2d}, [pCRow0]

      add     pCRow2, pCRow1, LDC         // pCRow2 -> line 2
      ld1     {v10.2d}, [pCRow1]          // line 1
      fmla    v10.2d, v18.2d, alphaV2
      st1     {v10.2d}, [pCRow1]

      add     pCRow1, pCRow2, LDC         // pCRow1 -> line 3
      ld1     {v12.2d}, [pCRow2]          // line 2
      fmla    v12.2d, v20.2d, alphaV0
      st1     {v12.2d}, [pCRow2]

      add     pCRow2, pCRow1, LDC         // pCRow2 -> line 4
      ld1     {v14.2d}, [pCRow1]          // line 3
      fmla    v14.2d, v22.2d, alphaV2
      st1     {v14.2d}, [pCRow1]

      add     pCRow1, pCRow2, LDC         // pCRow1 -> line 5
      ld1     {v8.2d}, [pCRow2]           // line 4
      fmla    v8.2d, v24.2d, alphaV0
      st1     {v8.2d}, [pCRow2]

      add     pCRow2, pCRow1, LDC         // pCRow2 -> line 6
      ld1     {v10.2d}, [pCRow1]          // line 5
      fmla    v10.2d, v26.2d, alphaV2
      st1     {v10.2d}, [pCRow1]

      add     pCRow1, pCRow2, LDC         // pCRow1 -> line 7
      ld1     {v12.2d}, [pCRow2]          // line 6
      fmla    v12.2d, v28.2d, alphaV0
      st1     {v12.2d}, [pCRow2]

      ld1     {v14.2d}, [pCRow1]          // line 7
      fmla    v14.2d, v30.2d, alphaV2
      st1     {v14.2d}, [pCRow1]

      add     pCRow0, pCRow0, #16         // next tile: 2 doubles further
  .endm
  366. /******************************************************************************/
  /*
   * INIT1x8: clear the four 1x8 accumulators v16, v20, v24, v28.
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT1x8
      movi    v16.2d, #0
      movi    v20.2d, #0
      movi    v24.2d, #0
      movi    v28.2d, #0
  .endm
  /*
   * KERNEL1x8_SUB: one K step of the 1x8 tile.
   * Loads 1 A value (d0) and 8 B values (v8-v11); each accumulator
   * (v16, v20, v24, v28) receives B pair * A, i.e. two results per register.
   * pA += 8 bytes, pB += 64 bytes.
   */
  .macro KERNEL1x8_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ldr     d0, [pA], #8
      ld1     {v10.2d, v11.2d}, [pB], #32

      fmla    v16.2d, v8.2d, v0.d[0]
      fmla    v20.2d, v9.2d, v0.d[0]
      fmla    v24.2d, v10.2d, v0.d[0]
      fmla    v28.2d, v11.2d, v0.d[0]
  .endm
  /*
   * SAVE1x8: C += alpha * accumulators for a 1x8 tile.
   * "Line j" is the single double at pCRow0 + j*LDC, j = 0..7. Each accumulator
   * holds two results, so two lines are gathered into one vector (lane 0 from
   * pCRow2 or pCRow0, lane 1 from pCRow1), updated, and scattered back.
   * alphaV0-alphaV3 are separate registers that the prologue in this file fills
   * with the same alpha. Ends with pCRow0 += 8 bytes.
   */
  385. .macro SAVE1x8
  386. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  387. ld1 {v8.d}[0], [pCRow0] // gather lines 0 and 1
  388. ld1 {v8.d}[1], [pCRow1]
  389. fmla v8.2d, v16.2d, alphaV0
  390. st1 {v8.d}[0], [pCRow0] // scatter lines 0 and 1
  391. st1 {v8.d}[1], [pCRow1]
  392. add pCRow2, pCRow1, LDC // pCRow2 -> line 2
  393. add pCRow1, pCRow2, LDC // pCRow1 -> line 3
  394. ld1 {v10.d}[0], [pCRow2] // gather lines 2 and 3
  395. ld1 {v10.d}[1], [pCRow1]
  396. fmla v10.2d, v20.2d, alphaV1
  397. st1 {v10.d}[0], [pCRow2] // scatter lines 2 and 3
  398. st1 {v10.d}[1], [pCRow1]
  399. add pCRow2, pCRow1, LDC // pCRow2 -> line 4
  400. add pCRow1, pCRow2, LDC // pCRow1 -> line 5
  401. ld1 {v12.d}[0], [pCRow2] // gather lines 4 and 5
  402. ld1 {v12.d}[1], [pCRow1]
  403. fmla v12.2d, v24.2d, alphaV2
  404. st1 {v12.d}[0], [pCRow2] // scatter lines 4 and 5
  405. st1 {v12.d}[1], [pCRow1]
  406. add pCRow2, pCRow1, LDC // pCRow2 -> line 6
  407. add pCRow1, pCRow2, LDC // pCRow1 -> line 7
  408. ld1 {v14.d}[0], [pCRow2] // gather lines 6 and 7
  409. ld1 {v14.d}[1], [pCRow1]
  410. fmla v14.2d, v28.2d, alphaV3
  411. st1 {v14.d}[0], [pCRow2] // scatter lines 6 and 7
  412. st1 {v14.d}[1], [pCRow1]
  413. add pCRow0, pCRow0, #8 // next tile: 1 double further
  414. .endm
  415. /******************************************************************************/
  /*
   * INIT4x4: clear the eight 4x4 accumulators
   * (v16, v17, v20, v21, v24, v25, v28, v29).
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT4x4
      movi    v16.2d, #0
      movi    v17.2d, #0
      movi    v20.2d, #0
      movi    v21.2d, #0
      movi    v24.2d, #0
      movi    v25.2d, #0
      movi    v28.2d, #0
      movi    v29.2d, #0
  .endm
  /*
   * KERNEL4x4_I: opening step of the pipelined 4x4 K loop.
   * Loads 4 A values (v0-v1) and 4 B values (v8-v9), writes their products
   * into the eight accumulators with fmul (no prior clearing needed), then
   * preloads the next step's operands into v4-v5 (A) and v12-v13 (B).
   * Net effect on the pointers: pA += 64 bytes, pB += 64 bytes.
   */
  .macro KERNEL4x4_I
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmul    v16.2d, v0.2d, v8.d[0]
      fmul    v29.2d, v1.2d, v9.d[1]
      fmul    v20.2d, v0.2d, v8.d[1]
      fmul    v25.2d, v1.2d, v9.d[0]
      fmul    v24.2d, v0.2d, v9.d[0]
      fmul    v21.2d, v1.2d, v8.d[1]
      fmul    v28.2d, v0.2d, v9.d[1]
      fmul    v17.2d, v1.2d, v8.d[0]

      // Operands consumed by the following KERNEL4x4_M2 / KERNEL4x4_E.
      ld1     {v12.2d, v13.2d}, [pB], #32
      ld1     {v4.2d, v5.2d}, [pA], #32
  .endm
  /*
   * KERNEL4x4_M1: pipelined 4x4 step on the "even" operand set.
   * Accumulates A (v0-v1) times B (v8-v9) while the loads of the next step's
   * operands (v12-v13, v4-v5) and an A prefetch are interleaved between the
   * multiply-accumulates. pA += 32 bytes, pB += 32 bytes.
   */
  .macro KERNEL4x4_M1
      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v29.2d, v1.2d, v9.d[1]

      ld1     {v12.2d, v13.2d}, [pB], #32     // next step's B

      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v25.2d, v1.2d, v9.d[0]

      ld1     {v4.2d, v5.2d}, [pA], #32       // next step's A

      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v21.2d, v1.2d, v8.d[1]

      prfm    PLDL1KEEP, [pA, #512]

      fmla    v28.2d, v0.2d, v9.d[1]
      fmla    v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * KERNEL4x4_M2: pipelined 4x4 step on the "odd" operand set.
   * Accumulates A (v4-v5) times B (v12-v13) while the loads of the next step's
   * operands (v8-v9, v0-v1) and a B prefetch are interleaved between the
   * multiply-accumulates. pA += 32 bytes, pB += 32 bytes.
   */
  .macro KERNEL4x4_M2
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v29.2d, v5.2d, v13.d[1]

      ld1     {v8.2d, v9.2d}, [pB], #32       // next step's B

      fmla    v20.2d, v4.2d, v12.d[1]
      fmla    v25.2d, v5.2d, v13.d[0]

      ld1     {v0.2d, v1.2d}, [pA], #32       // next step's A

      fmla    v24.2d, v4.2d, v13.d[0]
      fmla    v21.2d, v5.2d, v12.d[1]

      prfm    PLDL1KEEP, [pB, #512]

      fmla    v28.2d, v4.2d, v13.d[1]
      fmla    v17.2d, v5.2d, v12.d[0]
  .endm
  /*
   * KERNEL4x4_E: closing step of the pipelined 4x4 K loop.
   * Accumulates the already-loaded operands A (v4-v5) and B (v12-v13).
   * Performs no loads and leaves pA/pB untouched.
   */
  .macro KERNEL4x4_E
      fmla    v16.2d, v4.2d, v12.d[0]
      fmla    v29.2d, v5.2d, v13.d[1]

      fmla    v20.2d, v4.2d, v12.d[1]
      fmla    v25.2d, v5.2d, v13.d[0]

      fmla    v24.2d, v4.2d, v13.d[0]
      fmla    v21.2d, v5.2d, v12.d[1]

      fmla    v28.2d, v4.2d, v13.d[1]
      fmla    v17.2d, v5.2d, v12.d[0]
  .endm
  /*
   * KERNEL4x4_SUB: one self-contained (non-pipelined) 4x4 K step.
   * Loads 4 A values (v0-v1) and 4 B values (v8-v9) and accumulates their
   * products in the same step. pA += 32 bytes, pB += 32 bytes.
   */
  .macro KERNEL4x4_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v29.2d, v1.2d, v9.d[1]
      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v25.2d, v1.2d, v9.d[0]
      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v21.2d, v1.2d, v8.d[1]
      fmla    v28.2d, v0.2d, v9.d[1]
      fmla    v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * SAVE4x4: C += alpha * accumulators for a 4x4 tile.
   * "Line j" is the 4 doubles at pCRow0 + j*LDC, j = 0..3.
   * alphaV0-alphaV3 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 32 bytes.
   */
  498. .macro SAVE4x4
  499. ld1 {v8.2d, v9.2d}, [pCRow0] // line 0 <- v16, v17
  500. fmla v8.2d, v16.2d, alphaV0
  501. fmla v9.2d, v17.2d, alphaV1
  502. st1 {v8.2d, v9.2d}, [pCRow0]
  503. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  504. ld1 {v12.2d, v13.2d}, [pCRow1] // line 1 <- v20, v21
  505. fmla v12.2d, v20.2d, alphaV2
  506. fmla v13.2d, v21.2d, alphaV3
  507. st1 {v12.2d, v13.2d}, [pCRow1]
  508. add pCRow2, pCRow1, LDC // pCRow2 -> line 2
  509. ld1 {v8.2d, v9.2d}, [pCRow2] // line 2 <- v24, v25
  510. fmla v8.2d, v24.2d, alphaV0
  511. fmla v9.2d, v25.2d, alphaV1
  512. st1 {v8.2d, v9.2d}, [pCRow2]
  513. add pCRow1, pCRow2, LDC // pCRow1 -> line 3
  514. ld1 {v12.2d, v13.2d}, [pCRow1] // line 3 <- v28, v29
  515. fmla v12.2d, v28.2d, alphaV2
  516. fmla v13.2d, v29.2d, alphaV3
  517. st1 {v12.2d, v13.2d}, [pCRow1]
  518. add pCRow0, pCRow0, #32 // next tile: 4 doubles further
  519. .endm
  520. /******************************************************************************/
  /*
   * INIT2x4: clear the four 2x4 accumulators v16, v20, v24, v28.
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT2x4
      movi    v16.2d, #0
      movi    v20.2d, #0
      movi    v24.2d, #0
      movi    v28.2d, #0
  .endm
  /*
   * KERNEL2x4_SUB: one K step of the 2x4 tile.
   * Loads 2 A values (v0) and 4 B values (v8-v9) and accumulates
   * A * B[j] into v16, v20, v24, v28 (one register per B element j).
   * pA += 16 bytes, pB += 32 bytes.
   */
  .macro KERNEL2x4_SUB
      ld1     {v8.2d, v9.2d}, [pB], #32
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v24.2d, v0.2d, v9.d[0]
      fmla    v28.2d, v0.2d, v9.d[1]
  .endm
  /*
   * SAVE2x4: C += alpha * accumulators for a 2x4 tile.
   * "Line j" is the 2 doubles at pCRow0 + j*LDC, j = 0..3.
   * alphaV0-alphaV3 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 16 bytes.
   */
  537. .macro SAVE2x4
  538. ld1 {v8.2d}, [pCRow0] // line 0 <- v16
  539. fmla v8.2d, v16.2d, alphaV0
  540. st1 {v8.2d}, [pCRow0]
  541. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  542. ld1 {v12.2d}, [pCRow1] // line 1 <- v20
  543. fmla v12.2d, v20.2d, alphaV1
  544. st1 {v12.2d}, [pCRow1]
  545. add pCRow2, pCRow1, LDC // pCRow2 -> line 2
  546. ld1 {v8.2d}, [pCRow2] // line 2 <- v24
  547. fmla v8.2d, v24.2d, alphaV2
  548. st1 {v8.2d}, [pCRow2]
  549. add pCRow1, pCRow2, LDC // pCRow1 -> line 3
  550. ld1 {v12.2d}, [pCRow1] // line 3 <- v28
  551. fmla v12.2d, v28.2d, alphaV3
  552. st1 {v12.2d}, [pCRow1]
  553. add pCRow0, pCRow0, #16 // next tile: 2 doubles further
  554. .endm
  555. /******************************************************************************/
  /*
   * INIT1x4: clear the two 1x4 accumulators v16 and v20.
   * Both registers are zeroed across their full 128 bits.
   */
  .macro INIT1x4
      movi    v16.2d, #0
      movi    v20.2d, #0
  .endm
  /*
   * KERNEL1x4_SUB: one K step of the 1x4 tile.
   * Loads 1 A value (d0) and 4 B values (v8-v9); v16 and v20 each receive
   * a B pair * A, i.e. two results per register.
   * pA += 8 bytes, pB += 32 bytes.
   */
  .macro KERNEL1x4_SUB
      ldr     d0, [pA], #8
      ld1     {v8.2d, v9.2d}, [pB], #32

      fmla    v16.2d, v8.2d, v0.d[0]
      fmla    v20.2d, v9.2d, v0.d[0]
  .endm
  /*
   * SAVE1x4: C += alpha * accumulators for a 1x4 tile.
   * "Line j" is the single double at pCRow0 + j*LDC, j = 0..3. Two lines are
   * gathered into one vector, updated, and scattered back.
   * alphaV0/alphaV1 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 8 bytes.
   */
  568. .macro SAVE1x4
  569. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  570. ld1 {v8.d}[0], [pCRow0] // gather lines 0 and 1
  571. ld1 {v8.d}[1], [pCRow1]
  572. fmla v8.2d, v16.2d, alphaV0
  573. st1 {v8.d}[0], [pCRow0] // scatter lines 0 and 1
  574. st1 {v8.d}[1], [pCRow1]
  575. add pCRow2, pCRow1, LDC // pCRow2 -> line 2
  576. add pCRow1, pCRow2, LDC // pCRow1 -> line 3
  577. ld1 {v12.d}[0], [pCRow2] // gather lines 2 and 3
  578. ld1 {v12.d}[1], [pCRow1]
  579. fmla v12.2d, v20.2d, alphaV1
  580. st1 {v12.d}[0], [pCRow2] // scatter lines 2 and 3
  581. st1 {v12.d}[1], [pCRow1]
  582. add pCRow0, pCRow0, #8 // next tile: 1 double further
  583. .endm
  584. /******************************************************************************/
  /*
   * INIT4x2: clear the four 4x2 accumulators v16, v17, v20, v21.
   * Every register is zeroed across its full 128 bits.
   */
  .macro INIT4x2
      movi    v16.2d, #0
      movi    v17.2d, #0
      movi    v20.2d, #0
      movi    v21.2d, #0
  .endm
  /*
   * KERNEL4x2_SUB: one K step of the 4x2 tile.
   * Loads 2 B values (v8) and 4 A values (v0-v1); v16/v17 accumulate
   * A * B[0] and v20/v21 accumulate A * B[1].
   * pA += 32 bytes, pB += 16 bytes.
   */
  .macro KERNEL4x2_SUB
      ld1     {v8.2d}, [pB], #16
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
      fmla    v20.2d, v0.2d, v8.d[1]
      fmla    v21.2d, v1.2d, v8.d[1]
  .endm
  /*
   * SAVE4x2: C += alpha * accumulators for a 4x2 tile.
   * Line 0 is the 4 doubles at pCRow0, line 1 the 4 doubles at pCRow0 + LDC.
   * alphaV0-alphaV3 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 32 bytes.
   */
  601. .macro SAVE4x2
  602. ld1 {v8.2d, v9.2d}, [pCRow0] // line 0 <- v16, v17
  603. fmla v8.2d, v16.2d, alphaV0
  604. fmla v9.2d, v17.2d, alphaV1
  605. st1 {v8.2d, v9.2d}, [pCRow0]
  606. add pCRow1, pCRow0, LDC // pCRow1 -> line 1
  607. ld1 {v12.2d, v13.2d}, [pCRow1] // line 1 <- v20, v21
  608. fmla v12.2d, v20.2d, alphaV2
  609. fmla v13.2d, v21.2d, alphaV3
  610. st1 {v12.2d, v13.2d}, [pCRow1]
  611. add pCRow0, pCRow0, #32 // next tile: 4 doubles further
  612. .endm
  613. /******************************************************************************/
  /*
   * INIT2x2: clear the two 2x2 accumulators v16 and v20.
   * Both registers are zeroed across their full 128 bits.
   */
  .macro INIT2x2
      movi    v16.2d, #0
      movi    v20.2d, #0
  .endm
  /*
   * KERNEL2x2_SUB: one K step of the 2x2 tile.
   * Loads 2 B values (v8) and 2 A values (v0); v16 accumulates A * B[0]
   * and v20 accumulates A * B[1]. pA += 16 bytes, pB += 16 bytes.
   */
  .macro KERNEL2x2_SUB
      ld1     {v8.2d}, [pB], #16
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v20.2d, v0.2d, v8.d[1]
  .endm
  /*
   * SAVE2x2: C += alpha * accumulators for a 2x2 tile.
   * Line 0 is the 2 doubles at pCRow0, line 1 the 2 doubles at pCRow0 + LDC.
   * alphaV0/alphaV1 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 16 bytes.
   */
  626. .macro SAVE2x2
  627. ld1 {v8.2d}, [pCRow0] // line 0 <- v16
  628. fmla v8.2d, v16.2d, alphaV0
  629. st1 {v8.2d}, [pCRow0]
  630. add pCRow1 , pCRow0, LDC // pCRow1 -> line 1
  631. ld1 {v12.2d}, [pCRow1] // line 1 <- v20
  632. fmla v12.2d, v20.2d, alphaV1
  633. st1 {v12.2d}, [pCRow1]
  634. add pCRow0, pCRow0, #16 // next tile: 2 doubles further
  635. .endm
  636. /******************************************************************************/
  /*
   * INIT1x2: clear the single 1x2 accumulator v16 (all 128 bits).
   */
  .macro INIT1x2
      movi    v16.2d, #0
  .endm
  /*
   * KERNEL1x2_SUB: one K step of the 1x2 tile.
   * Loads 2 B values (v8) and 1 A value (d0); v16 accumulates B pair * A.
   * pA += 8 bytes, pB += 16 bytes.
   */
  .macro KERNEL1x2_SUB
      ld1     {v8.2d}, [pB], #16
      ldr     d0, [pA], #8

      fmla    v16.2d, v8.2d, v0.d[0]
  .endm
  /*
   * SAVE1x2: C += alpha * v16 for a 1x2 tile.
   * The double at pCRow0 and the double at pCRow0 + LDC are gathered into one
   * vector, updated, and scattered back. Ends with pCRow0 += 8 bytes.
   */
  647. .macro SAVE1x2
  648. add pCRow1 , pCRow0, LDC // pCRow1 -> line 1
  649. ld1 {v8.d}[0], [pCRow0] // gather lines 0 and 1
  650. ld1 {v8.d}[1], [pCRow1]
  651. fmla v8.2d, v16.2d, alphaV0
  652. st1 {v8.d}[0], [pCRow0] // scatter lines 0 and 1
  653. st1 {v8.d}[1], [pCRow1]
  654. add pCRow0, pCRow0, #8 // next tile: 1 double further
  655. .endm
  656. /******************************************************************************/
  /*
   * INIT4x1: clear the two 4x1 accumulators v16 and v17.
   * Both registers are zeroed across their full 128 bits.
   */
  .macro INIT4x1
      movi    v16.2d, #0
      movi    v17.2d, #0
  .endm
  /*
   * KERNEL4x1_SUB: one K step of the 4x1 tile.
   * Loads 1 B value (d8) and 4 A values (v0-v1); v16/v17 accumulate A * B.
   * pA += 32 bytes, pB += 8 bytes.
   */
  .macro KERNEL4x1_SUB
      ldr     d8, [pB], #8
      ld1     {v0.2d, v1.2d}, [pA], #32

      fmla    v16.2d, v0.2d, v8.d[0]
      fmla    v17.2d, v1.2d, v8.d[0]
  .endm
  /*
   * SAVE4x1: C += alpha * (v16, v17) for the 4 doubles at pCRow0.
   * alphaV0/alphaV1 are separate registers that the prologue in this file
   * fills with the same alpha. Ends with pCRow0 += 32 bytes.
   */
  669. .macro SAVE4x1
  670. ld1 {v8.2d, v9.2d}, [pCRow0] // load 4 doubles of C
  671. fmla v8.2d, v16.2d, alphaV0
  672. fmla v9.2d, v17.2d, alphaV1
  673. st1 {v8.2d, v9.2d}, [pCRow0] // store the updated 4 doubles
  674. add pCRow0, pCRow0, #32 // next tile: 4 doubles further
  675. .endm
  676. /******************************************************************************/
  /*
   * INIT2x1: clear the single 2x1 accumulator v16 (all 128 bits).
   */
  .macro INIT2x1
      movi    v16.2d, #0
  .endm
  /*
   * KERNEL2x1_SUB: one K step of the 2x1 tile.
   * Loads 1 B value (d8) and 2 A values (v0); v16 accumulates A * B.
   * pA += 16 bytes, pB += 8 bytes.
   */
  .macro KERNEL2x1_SUB
      ldr     d8, [pB], #8
      ld1     {v0.2d}, [pA], #16

      fmla    v16.2d, v0.2d, v8.d[0]
  .endm
  /*
   * SAVE2x1: C += alpha * v16 for the 2 doubles at pCRow0.
   * Ends with pCRow0 += 16 bytes.
   */
  687. .macro SAVE2x1
  688. ld1 {v8.2d}, [pCRow0] // load 2 doubles of C
  689. fmla v8.2d, v16.2d, alphaV0
  690. st1 {v8.2d}, [pCRow0] // store the updated 2 doubles
  691. add pCRow0, pCRow0, #16 // next tile: 2 doubles further
  692. .endm
  693. /******************************************************************************/
  /*
   * INIT1x1: clear the single 1x1 accumulator v16 (all 128 bits).
   */
  .macro INIT1x1
      movi    v16.2d, #0
  .endm
  /*
   * KERNEL1x1_SUB: one K step of the 1x1 tile.
   * Loads 1 B value (d8) and 1 A value (d0) and performs the scalar
   * multiply-accumulate d16 += d0 * d8. pA += 8 bytes, pB += 8 bytes.
   */
  .macro KERNEL1x1_SUB
      ldr     d8, [pB], #8
      ldr     d0, [pA], #8

      fmadd   d16, d0, d8, d16
  .endm
  /*
   * SAVE1x1: scalar update of the double at pCRow0: C += d16 * alpha0.
   * Ends with pCRow0 += 8 bytes.
   */
  704. .macro SAVE1x1
  705. ldr d8, [pCRow0] // load the C element
  706. fmadd d8, d16, alpha0, d8 // d8 = d16 * alpha0 + d8
  707. str d8, [pCRow0] // store the updated element
  708. add pCRow0, pCRow0, #8 // next tile: 1 double further
  709. .endm
  710. /*******************************************************************************
  711. * End of macro definitions
  712. *******************************************************************************/
  713. PROLOGUE
  714. .align 5
  715. add sp, sp, #-(11 * 16)
  716. stp d8, d9, [sp, #(0 * 16)]
  717. stp d10, d11, [sp, #(1 * 16)]
  718. stp d12, d13, [sp, #(2 * 16)]
  719. stp d14, d15, [sp, #(3 * 16)]
  720. stp d16, d17, [sp, #(4 * 16)]
  721. stp x18, x19, [sp, #(5 * 16)]
  722. stp x20, x21, [sp, #(6 * 16)]
  723. stp x22, x23, [sp, #(7 * 16)]
  724. stp x24, x25, [sp, #(8 * 16)]
  725. stp x26, x27, [sp, #(9 * 16)]
  726. str x28, [sp, #(10 * 16)]
  727. fmov alpha0, d0
  728. fmov alpha1, d0
  729. fmov alpha2, d0
  730. fmov alpha3, d0
  731. lsl LDC, LDC, #3 // ldc = ldc * 8
  732. mov pB, origPB
  733. mov counterJ, origN
  734. asr counterJ, counterJ, #3 // J = J / 8
  735. cmp counterJ, #0
  736. ble .Ldgemm_kernel_L4_BEGIN
  737. /******************************************************************************/
  738. .Ldgemm_kernel_L8_BEGIN:
  739. mov pCRow0, pC // pCRow0 = C
  740. add pC, pC, LDC, lsl #3
  741. mov pA, origPA // pA = start of A array
  742. .Ldgemm_kernel_L8_M4_BEGIN:
  743. mov counterI, origM
  744. asr counterI, counterI, #2 // counterI = counterI / 4
  745. cmp counterI, #0
  746. ble .Ldgemm_kernel_L8_M2_BEGIN
  747. .Ldgemm_kernel_L8_M4_20:
  748. mov pB, origPB
  749. asr counterL , origK, #1 // L = K / 2
  750. cmp counterL , #2 // is there at least 4 to do?
  751. blt .Ldgemm_kernel_L8_M4_32
  752. KERNEL4x8_I // do one in the K
  753. KERNEL4x8_M2 // do another in the K
  754. subs counterL, counterL, #2
  755. ble .Ldgemm_kernel_L8_M4_22a
  756. .align 5
  757. .Ldgemm_kernel_L8_M4_22:
  758. KERNEL4x8_M1
  759. KERNEL4x8_M2
  760. subs counterL, counterL, #1
  761. bgt .Ldgemm_kernel_L8_M4_22
  762. .Ldgemm_kernel_L8_M4_22a:
  763. KERNEL4x8_M1
  764. KERNEL4x8_E
  765. b .Ldgemm_kernel_L8_M4_44
  766. .Ldgemm_kernel_L8_M4_32:
  767. tst counterL, #1
  768. ble .Ldgemm_kernel_L8_M4_40
  769. KERNEL4x8_I
  770. KERNEL4x8_E
  771. b .Ldgemm_kernel_L8_M4_44
  772. .Ldgemm_kernel_L8_M4_40:
  773. INIT4x8
  774. .Ldgemm_kernel_L8_M4_44:
  775. ands counterL , origK, #1
  776. ble .Ldgemm_kernel_L8_M4_100
  777. .Ldgemm_kernel_L8_M4_46:
  778. KERNEL4x8_SUB
  779. .Ldgemm_kernel_L8_M4_100:
  780. SAVE4x8
  781. .Ldgemm_kernel_L8_M4_END:
  782. subs counterI, counterI, #1
  783. bne .Ldgemm_kernel_L8_M4_20
  784. .Ldgemm_kernel_L8_M2_BEGIN:
  785. mov counterI, origM
  786. tst counterI , #3
  787. ble .Ldgemm_kernel_L8_END
  788. tst counterI, #2 // counterI = counterI / 2
  789. ble .Ldgemm_kernel_L8_M1_BEGIN
  790. .Ldgemm_kernel_L8_M2_20:
  791. INIT2x8
  792. mov pB, origPB
  793. asr counterL , origK, #3 // counterL = counterL / 8
  794. cmp counterL , #0
  795. ble .Ldgemm_kernel_L8_M2_40
  796. .Ldgemm_kernel_L8_M2_22:
  797. KERNEL2x8_SUB
  798. KERNEL2x8_SUB
  799. KERNEL2x8_SUB
  800. KERNEL2x8_SUB
  801. KERNEL2x8_SUB
  802. KERNEL2x8_SUB
  803. KERNEL2x8_SUB
  804. KERNEL2x8_SUB
  805. subs counterL, counterL, #1
  806. bgt .Ldgemm_kernel_L8_M2_22
  807. .Ldgemm_kernel_L8_M2_40:
  808. ands counterL , origK, #7 // counterL = counterL % 8
  809. ble .Ldgemm_kernel_L8_M2_100
  810. .Ldgemm_kernel_L8_M2_42:
  811. KERNEL2x8_SUB
  812. subs counterL, counterL, #1
  813. bgt .Ldgemm_kernel_L8_M2_42
  814. .Ldgemm_kernel_L8_M2_100:
  815. SAVE2x8
  816. .Ldgemm_kernel_L8_M2_END:
  817. .Ldgemm_kernel_L8_M1_BEGIN:
  818. tst counterI, #1 // counterI = counterI % 2
  819. ble .Ldgemm_kernel_L8_END
  820. .Ldgemm_kernel_L8_M1_20:
  821. INIT1x8
  822. mov pB, origPB
  823. asr counterL , origK, #3 // counterL = counterL / 8
  824. cmp counterL , #0
  825. ble .Ldgemm_kernel_L8_M1_40
  826. .Ldgemm_kernel_L8_M1_22:
  827. KERNEL1x8_SUB
  828. KERNEL1x8_SUB
  829. KERNEL1x8_SUB
  830. KERNEL1x8_SUB
  831. KERNEL1x8_SUB
  832. KERNEL1x8_SUB
  833. KERNEL1x8_SUB
  834. KERNEL1x8_SUB
  835. subs counterL, counterL, #1
  836. bgt .Ldgemm_kernel_L8_M1_22
  837. .Ldgemm_kernel_L8_M1_40:
  838. ands counterL , origK, #7 // counterL = counterL % 8
  839. ble .Ldgemm_kernel_L8_M1_100
  840. .Ldgemm_kernel_L8_M1_42:
  841. KERNEL1x8_SUB
  842. subs counterL, counterL, #1
  843. bgt .Ldgemm_kernel_L8_M1_42
  844. .Ldgemm_kernel_L8_M1_100:
  845. SAVE1x8
  846. .Ldgemm_kernel_L8_END:
  847. lsl temp, origK, #6
  848. add origPB, origPB, temp // B = B + K * 8 * 8
  849. subs counterJ, counterJ , #1 // j--
  850. bgt .Ldgemm_kernel_L8_BEGIN
  851. /******************************************************************************/
  852. .Ldgemm_kernel_L4_BEGIN:
  853. mov counterJ , origN
  854. tst counterJ , #7
  855. ble .Ldgemm_kernel_L999
  856. tst counterJ , #4
  857. ble .Ldgemm_kernel_L2_BEGIN
  858. mov pCRow0, pC // pCRow0 = C
  859. add pC, pC, LDC, lsl #2
  860. mov pA, origPA // pA = start of A array
  861. .Ldgemm_kernel_L4_M4_BEGIN:
  862. mov counterI, origM
  863. asr counterI, counterI, #2 // counterI = counterI / 4
  864. cmp counterI, #0
  865. ble .Ldgemm_kernel_L4_M2_BEGIN
  866. .Ldgemm_kernel_L4_M4_20:
  867. mov pB, origPB
  868. asr counterL , origK, #1 // L = K / 2
  869. cmp counterL , #2 // is there at least 4 to do?
  870. blt .Ldgemm_kernel_L4_M4_32
  871. KERNEL4x4_I // do one in the K
  872. KERNEL4x4_M2 // do another in the K
  873. subs counterL, counterL, #2
  874. ble .Ldgemm_kernel_L4_M4_22a
  875. .align 5
  876. .Ldgemm_kernel_L4_M4_22:
  877. KERNEL4x4_M1
  878. KERNEL4x4_M2
  879. subs counterL, counterL, #1
  880. bgt .Ldgemm_kernel_L4_M4_22
  881. .Ldgemm_kernel_L4_M4_22a:
  882. KERNEL4x4_M1
  883. KERNEL4x4_E
  884. b .Ldgemm_kernel_L4_M4_44
  885. .Ldgemm_kernel_L4_M4_32:
  886. tst counterL, #1
  887. ble .Ldgemm_kernel_L4_M4_40
  888. KERNEL4x4_I
  889. KERNEL4x4_E
  890. b .Ldgemm_kernel_L4_M4_44
  891. .Ldgemm_kernel_L4_M4_40:
  892. INIT4x4
  893. .Ldgemm_kernel_L4_M4_44:
  894. ands counterL , origK, #1
  895. ble .Ldgemm_kernel_L4_M4_100
  896. .Ldgemm_kernel_L4_M4_46:
  897. KERNEL4x4_SUB
  898. .Ldgemm_kernel_L4_M4_100:
  899. SAVE4x4
  900. .Ldgemm_kernel_L4_M4_END:
  901. subs counterI, counterI, #1
  902. bne .Ldgemm_kernel_L4_M4_20
  903. .Ldgemm_kernel_L4_M2_BEGIN:
  904. mov counterI, origM
  905. tst counterI , #3
  906. ble .Ldgemm_kernel_L4_END
  907. tst counterI, #2 // counterI = counterI / 2
  908. ble .Ldgemm_kernel_L4_M1_BEGIN
  909. .Ldgemm_kernel_L4_M2_20:
  910. INIT2x4
  911. mov pB, origPB
  912. asr counterL , origK, #3 // counterL = counterL / 8
  913. cmp counterL , #0
  914. ble .Ldgemm_kernel_L4_M2_40
  915. .Ldgemm_kernel_L4_M2_22:
  916. KERNEL2x4_SUB
  917. KERNEL2x4_SUB
  918. KERNEL2x4_SUB
  919. KERNEL2x4_SUB
  920. KERNEL2x4_SUB
  921. KERNEL2x4_SUB
  922. KERNEL2x4_SUB
  923. KERNEL2x4_SUB
  924. subs counterL, counterL, #1
  925. bgt .Ldgemm_kernel_L4_M2_22
  926. .Ldgemm_kernel_L4_M2_40:
  927. ands counterL , origK, #7 // counterL = counterL % 8
  928. ble .Ldgemm_kernel_L4_M2_100
  929. .Ldgemm_kernel_L4_M2_42:
  930. KERNEL2x4_SUB
  931. subs counterL, counterL, #1
  932. bgt .Ldgemm_kernel_L4_M2_42
  933. .Ldgemm_kernel_L4_M2_100:
  934. SAVE2x4
  935. .Ldgemm_kernel_L4_M2_END:
  936. .Ldgemm_kernel_L4_M1_BEGIN:
  937. tst counterI, #1 // counterI = counterI % 2
  938. ble .Ldgemm_kernel_L4_END
  939. .Ldgemm_kernel_L4_M1_20:
  940. INIT1x4
  941. mov pB, origPB
  942. asr counterL , origK, #3 // counterL = counterL / 8
  943. cmp counterL , #0
  944. ble .Ldgemm_kernel_L4_M1_40
  945. .Ldgemm_kernel_L4_M1_22:
  946. KERNEL1x4_SUB
  947. KERNEL1x4_SUB
  948. KERNEL1x4_SUB
  949. KERNEL1x4_SUB
  950. KERNEL1x4_SUB
  951. KERNEL1x4_SUB
  952. KERNEL1x4_SUB
  953. KERNEL1x4_SUB
  954. subs counterL, counterL, #1
  955. bgt .Ldgemm_kernel_L4_M1_22
  956. .Ldgemm_kernel_L4_M1_40:
  957. ands counterL , origK, #7 // counterL = counterL % 8
  958. ble .Ldgemm_kernel_L4_M1_100
  959. .Ldgemm_kernel_L4_M1_42:
  960. KERNEL1x4_SUB
  961. subs counterL, counterL, #1
  962. bgt .Ldgemm_kernel_L4_M1_42
  963. .Ldgemm_kernel_L4_M1_100:
  964. SAVE1x4
  965. .Ldgemm_kernel_L4_END:
  966. lsl temp, origK, #5
  967. add origPB, origPB, temp // B = B + K * 4 * 8
  968. /******************************************************************************/
  969. .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
  970. mov counterJ , origN
  971. tst counterJ , #3
  972. ble .Ldgemm_kernel_L999 // error, N was less than 4?
  973. tst counterJ , #2
  974. ble .Ldgemm_kernel_L1_BEGIN
  975. mov pCRow0, pC // pCRow0 = pC
  976. add pC,pC,LDC, lsl #1
  977. mov pA, origPA // pA = A
  // 2-wide N stage, M in groups of 4: runs origM / 4 passes of INIT4x2, a K loop of
  // KERNEL4x2_SUB, then SAVE4x2. The three macros are defined outside this section;
  // presumably they clear, accumulate and write back one 4x2 tile - confirm there.
  978. .Ldgemm_kernel_L2_M4_BEGIN:
  979. mov counterI, origM
  980. asr counterI, counterI, #2 // counterI = counterI / 4
  981. cmp counterI,#0
  982. ble .Ldgemm_kernel_L2_M2_BEGIN // origM / 4 <= 0: no group of 4, try the M remainder
  983. .Ldgemm_kernel_L2_M4_20:
  984. INIT4x2
  985. mov pB, origPB // every pass restarts B at origPB
  986. asr counterL , origK, #3 // counterL = origK / 8
  987. cmp counterL,#0
  988. ble .Ldgemm_kernel_L2_M4_40 // origK / 8 <= 0: only the K remainder loop runs
  989. .align 5
  990. .Ldgemm_kernel_L2_M4_22: // main K loop: 8 KERNEL4x2_SUB steps per pass
  991. KERNEL4x2_SUB
  992. KERNEL4x2_SUB
  993. KERNEL4x2_SUB
  994. KERNEL4x2_SUB
  995. KERNEL4x2_SUB
  996. KERNEL4x2_SUB
  997. KERNEL4x2_SUB
  998. KERNEL4x2_SUB
  999. subs counterL, counterL, #1
  1000. bgt .Ldgemm_kernel_L2_M4_22
  1001. .Ldgemm_kernel_L2_M4_40:
  1002. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1003. ble .Ldgemm_kernel_L2_M4_100 // nothing left over
  1004. .Ldgemm_kernel_L2_M4_42: // K remainder loop: one step per pass
  1005. KERNEL4x2_SUB
  1006. subs counterL, counterL, #1
  1007. bgt .Ldgemm_kernel_L2_M4_42
  1008. .Ldgemm_kernel_L2_M4_100:
  1009. SAVE4x2
  1010. .Ldgemm_kernel_L2_M4_END:
  1011. subs counterI, counterI, #1 // one group of 4 in M finished
  1012. bgt .Ldgemm_kernel_L2_M4_20
  // 2-wide N stage, M remainder of 2: runs at most once, when bit 1 of origM is set.
  // INIT2x2 / KERNEL2x2_SUB / SAVE2x2 are macros defined outside this section.
  1013. .Ldgemm_kernel_L2_M2_BEGIN:
  1014. mov counterI, origM
  1015. tst counterI , #3
  1016. ble .Ldgemm_kernel_L2_END // (M & 3) == 0: no remainder in M
  1017. tst counterI, #2 // test bit 1 of M; counterI itself is not changed
  1018. ble .Ldgemm_kernel_L2_M1_BEGIN // bit clear: skip the 2-wide M step
  1019. .Ldgemm_kernel_L2_M2_20:
  1020. INIT2x2
  1021. mov pB, origPB // restart B at origPB
  1022. asr counterL , origK, #3 // counterL = origK / 8
  1023. cmp counterL,#0
  1024. ble .Ldgemm_kernel_L2_M2_40 // origK / 8 <= 0: only the K remainder loop runs
  1025. .Ldgemm_kernel_L2_M2_22: // main K loop: 8 KERNEL2x2_SUB steps per pass
  1026. KERNEL2x2_SUB
  1027. KERNEL2x2_SUB
  1028. KERNEL2x2_SUB
  1029. KERNEL2x2_SUB
  1030. KERNEL2x2_SUB
  1031. KERNEL2x2_SUB
  1032. KERNEL2x2_SUB
  1033. KERNEL2x2_SUB
  1034. subs counterL, counterL, #1
  1035. bgt .Ldgemm_kernel_L2_M2_22
  1036. .Ldgemm_kernel_L2_M2_40:
  1037. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1038. ble .Ldgemm_kernel_L2_M2_100 // nothing left over
  1039. .Ldgemm_kernel_L2_M2_42: // K remainder loop: one step per pass
  1040. KERNEL2x2_SUB
  1041. subs counterL, counterL, #1
  1042. bgt .Ldgemm_kernel_L2_M2_42
  1043. .Ldgemm_kernel_L2_M2_100:
  1044. SAVE2x2
  1045. .Ldgemm_kernel_L2_M2_END:
  // 2-wide N stage, M remainder of 1: runs at most once, when bit 0 of counterI is set.
  // counterI is expected to still hold origM here (assumes the 2x2 macros leave it
  // untouched - confirm). INIT1x2 / KERNEL1x2_SUB / SAVE1x2 are defined outside this section.
  1046. .Ldgemm_kernel_L2_M1_BEGIN:
  1047. tst counterI, #1 // test bit 0; counterI itself is not changed
  1048. ble .Ldgemm_kernel_L2_END // bit clear: no single leftover in M
  1049. .Ldgemm_kernel_L2_M1_20:
  1050. INIT1x2
  1051. mov pB, origPB // restart B at origPB
  1052. asr counterL , origK, #3 // counterL = origK / 8
  1053. cmp counterL, #0
  1054. ble .Ldgemm_kernel_L2_M1_40 // origK / 8 <= 0: only the K remainder loop runs
  1055. .Ldgemm_kernel_L2_M1_22: // main K loop: 8 KERNEL1x2_SUB steps per pass
  1056. KERNEL1x2_SUB
  1057. KERNEL1x2_SUB
  1058. KERNEL1x2_SUB
  1059. KERNEL1x2_SUB
  1060. KERNEL1x2_SUB
  1061. KERNEL1x2_SUB
  1062. KERNEL1x2_SUB
  1063. KERNEL1x2_SUB
  1064. subs counterL, counterL, #1
  1065. bgt .Ldgemm_kernel_L2_M1_22
  1066. .Ldgemm_kernel_L2_M1_40:
  1067. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1068. ble .Ldgemm_kernel_L2_M1_100 // nothing left over
  1069. .Ldgemm_kernel_L2_M1_42: // K remainder loop: one step per pass
  1070. KERNEL1x2_SUB
  1071. subs counterL, counterL, #1
  1072. bgt .Ldgemm_kernel_L2_M1_42
  1073. .Ldgemm_kernel_L2_M1_100:
  1074. SAVE1x2
  1075. .Ldgemm_kernel_L2_END: // 2-wide stage finished: step origPB past what it consumed
  1076. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
  1077. /******************************************************************************/
  // Last N stage: runs only when bit 0 of N is set. tst clears V and the masked
  // value is never negative, so "ble" is taken exactly when that bit is zero.
  1078. .Ldgemm_kernel_L1_BEGIN:
  1079. mov counterJ , origN
  1080. tst counterJ , #1
  1081. ble .Ldgemm_kernel_L999 // bit 0 of N clear: done
  1082. mov pCRow0, pC // pCRow0 = C
  1083. add pC , pC , LDC // pC = pC + LDC
  1084. mov pA, origPA // pA = A
  // 1-wide N stage, M in groups of 4: runs origM / 4 passes of INIT4x1, a K loop of
  // KERNEL4x1_SUB, then SAVE4x1. The three macros are defined outside this section;
  // presumably they clear, accumulate and write back one 4x1 tile - confirm there.
  1085. .Ldgemm_kernel_L1_M4_BEGIN:
  1086. mov counterI, origM
  1087. asr counterI, counterI, #2 // counterI = counterI / 4
  1088. cmp counterI, #0
  1089. ble .Ldgemm_kernel_L1_M2_BEGIN // origM / 4 <= 0: no group of 4, try the M remainder
  1090. .Ldgemm_kernel_L1_M4_20:
  1091. INIT4x1
  1092. mov pB, origPB // every pass restarts B at origPB
  1093. asr counterL , origK, #3 // counterL = origK / 8
  1094. cmp counterL , #0
  1095. ble .Ldgemm_kernel_L1_M4_40 // origK / 8 <= 0: only the K remainder loop runs
  1096. .align 5
  1097. .Ldgemm_kernel_L1_M4_22: // main K loop: 8 KERNEL4x1_SUB steps per pass
  1098. KERNEL4x1_SUB
  1099. KERNEL4x1_SUB
  1100. KERNEL4x1_SUB
  1101. KERNEL4x1_SUB
  1102. KERNEL4x1_SUB
  1103. KERNEL4x1_SUB
  1104. KERNEL4x1_SUB
  1105. KERNEL4x1_SUB
  1106. subs counterL, counterL, #1
  1107. bgt .Ldgemm_kernel_L1_M4_22
  1108. .Ldgemm_kernel_L1_M4_40:
  1109. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1110. ble .Ldgemm_kernel_L1_M4_100 // nothing left over
  1111. .Ldgemm_kernel_L1_M4_42: // K remainder loop: one step per pass
  1112. KERNEL4x1_SUB
  1113. subs counterL, counterL, #1
  1114. bgt .Ldgemm_kernel_L1_M4_42
  1115. .Ldgemm_kernel_L1_M4_100:
  1116. SAVE4x1
  1117. .Ldgemm_kernel_L1_M4_END:
  1118. subs counterI, counterI, #1 // one group of 4 in M finished
  1119. bgt .Ldgemm_kernel_L1_M4_20
  // 1-wide N stage, M remainder of 2: runs at most once, when bit 1 of origM is set.
  // INIT2x1 / KERNEL2x1_SUB / SAVE2x1 are macros defined outside this section.
  1120. .Ldgemm_kernel_L1_M2_BEGIN:
  1121. mov counterI, origM
  1122. tst counterI , #3
  1123. ble .Ldgemm_kernel_L1_END // (M & 3) == 0: no remainder in M
  1124. tst counterI, #2 // test bit 1 of M; counterI itself is not changed
  1125. ble .Ldgemm_kernel_L1_M1_BEGIN // bit clear: skip the 2-wide M step
  1126. .Ldgemm_kernel_L1_M2_20:
  1127. INIT2x1
  1128. mov pB, origPB // restart B at origPB
  1129. asr counterL , origK, #3 // counterL = origK / 8
  1130. cmp counterL , #0
  1131. ble .Ldgemm_kernel_L1_M2_40 // origK / 8 <= 0: only the K remainder loop runs
  1132. .Ldgemm_kernel_L1_M2_22: // main K loop: 8 KERNEL2x1_SUB steps per pass
  1133. KERNEL2x1_SUB
  1134. KERNEL2x1_SUB
  1135. KERNEL2x1_SUB
  1136. KERNEL2x1_SUB
  1137. KERNEL2x1_SUB
  1138. KERNEL2x1_SUB
  1139. KERNEL2x1_SUB
  1140. KERNEL2x1_SUB
  1141. subs counterL, counterL, #1
  1142. bgt .Ldgemm_kernel_L1_M2_22
  1143. .Ldgemm_kernel_L1_M2_40:
  1144. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1145. ble .Ldgemm_kernel_L1_M2_100 // nothing left over
  1146. .Ldgemm_kernel_L1_M2_42: // K remainder loop: one step per pass
  1147. KERNEL2x1_SUB
  1148. subs counterL, counterL, #1
  1149. bgt .Ldgemm_kernel_L1_M2_42
  1150. .Ldgemm_kernel_L1_M2_100:
  1151. SAVE2x1
  1152. .Ldgemm_kernel_L1_M2_END:
  // 1-wide N stage, M remainder of 1: runs at most once, when bit 0 of counterI is set.
  // counterI is expected to still hold origM here (assumes the 2x1 macros leave it
  // untouched - confirm). INIT1x1 / KERNEL1x1_SUB / SAVE1x1 are defined outside this section.
  1153. .Ldgemm_kernel_L1_M1_BEGIN:
  1154. tst counterI, #1 // test bit 0; counterI itself is not changed
  1155. ble .Ldgemm_kernel_L1_END // bit clear: no single leftover in M
  1156. .Ldgemm_kernel_L1_M1_20:
  1157. INIT1x1
  1158. mov pB, origPB // restart B at origPB
  1159. asr counterL , origK, #3 // counterL = origK / 8
  1160. cmp counterL , #0
  1161. ble .Ldgemm_kernel_L1_M1_40 // origK / 8 <= 0: only the K remainder loop runs
  1162. .Ldgemm_kernel_L1_M1_22: // main K loop: 8 KERNEL1x1_SUB steps per pass
  1163. KERNEL1x1_SUB
  1164. KERNEL1x1_SUB
  1165. KERNEL1x1_SUB
  1166. KERNEL1x1_SUB
  1167. KERNEL1x1_SUB
  1168. KERNEL1x1_SUB
  1169. KERNEL1x1_SUB
  1170. KERNEL1x1_SUB
  1171. subs counterL, counterL, #1
  1172. bgt .Ldgemm_kernel_L1_M1_22
  1173. .Ldgemm_kernel_L1_M1_40:
  1174. ands counterL , origK, #7 // counterL = origK & 7 (K steps left over from the 8x loop)
  1175. ble .Ldgemm_kernel_L1_M1_100 // nothing left over
  1176. .Ldgemm_kernel_L1_M1_42: // K remainder loop: one step per pass
  1177. KERNEL1x1_SUB
  1178. subs counterL, counterL, #1
  1179. bgt .Ldgemm_kernel_L1_M1_42
  1180. .Ldgemm_kernel_L1_M1_100:
  1181. SAVE1x1
  1182. .Ldgemm_kernel_L1_END: // 1-wide stage finished; falls through to the exit path
  // Common exit: return 0 in x0, reload d8-d17 and x18-x28 from the 11 * 16 byte
  // area at sp, release that area and return.
  // NOTE(review): the matching stores are outside this section - confirm the offsets
  // agree with the prologue. x18 is a platform-reserved register on some AArch64
  // ABIs, and d16/d17 are not callee-saved under AAPCS64; check whether saving and
  // restoring them is intended.
  1183. .Ldgemm_kernel_L999:
  1184. mov x0, #0 // set return value
  1185. ldp d8, d9, [sp, #(0 * 16)]
  1186. ldp d10, d11, [sp, #(1 * 16)]
  1187. ldp d12, d13, [sp, #(2 * 16)]
  1188. ldp d14, d15, [sp, #(3 * 16)]
  1189. ldp d16, d17, [sp, #(4 * 16)]
  1190. ldp x18, x19, [sp, #(5 * 16)]
  1191. ldp x20, x21, [sp, #(6 * 16)]
  1192. ldp x22, x23, [sp, #(7 * 16)]
  1193. ldp x24, x25, [sp, #(8 * 16)]
  1194. ldp x26, x27, [sp, #(9 * 16)]
  1195. ldr x28, [sp, #(10 * 16)] // single register: the last slot is only half used
  1196. add sp, sp, #(11*16) // pop the 176-byte save area
  1197. ret
  1198. EPILOGUE