You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_logic_power10.S 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735
  1. /***************************************************************************
  2. Copyright (c) 2013-2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define MY_ALIGN .align 3
  28. b ZGEMM_L2 /* jump over the mini subroutines defined below to the ZGEMM_L2 main loop */
  29. /* MINI SUBROUTINES: entered with bl from the main loop below and left with blr */
  30. /* 2x8 MAIN 128x+2 LOOP */
  31. ZGEMM_L2x8_LMAIN_SUB:
  32. /* Entry: T8 holds the number of passes through the unrolled body; it is copied into CTR. */
  33. mtctr T8
  34. MY_ALIGN
  35. ZGEMM_L2x8_LOOP:
  36. /* One pass = 64 KERNEL2x8_2 invocations (first argument 0..63) interleaved with dcbt touches on AO/BO. */
  37. dcbt AO, PRE
  38. dcbt BO, PRE
  39. KERNEL2x8_2 0, 0
  40. ZGEMM_L2x8_K128:
  41. /* Secondary entry point: ZGEMM_L2x8_SUB0 and CMP2x8_128K bl here with CTR = 1, skipping invocation 0 and the PRE touches. */
  42. KERNEL2x8_2 1, 0
  43. dcbt AO, T2
  44. KERNEL2x8_2 2, 0
  45. KERNEL2x8_2 3, 0
  46. dcbt AO, T3
  47. dcbt BO, T2
  48. KERNEL2x8_2 4, 0
  49. KERNEL2x8_2 5, 0
  50. dcbt AO, T4
  51. KERNEL2x8_2 6, 0
  52. KERNEL2x8_2 7, 0
  53. dcbt AO, T5
  54. dcbt BO, T3
  55. KERNEL2x8_2 8, 0
  56. KERNEL2x8_2 9, 0
  57. KERNEL2x8_2 10, 0
  58. KERNEL2x8_2 11, 0
  59. dcbt BO, T4
  60. KERNEL2x8_2 12, 0
  61. KERNEL2x8_2 13, 0
  62. KERNEL2x8_2 14, 0
  63. KERNEL2x8_2 15, 0
  64. KERNEL2x8_2 16, 0
  65. KERNEL2x8_2 17, 0
  66. KERNEL2x8_2 18, 0
  67. KERNEL2x8_2 19, 0
  68. KERNEL2x8_2 20, 0
  69. KERNEL2x8_2 21, 0
  70. KERNEL2x8_2 22, 0
  71. KERNEL2x8_2 23, 0
  72. KERNEL2x8_2 24, 0
  73. KERNEL2x8_2 25, 0
  74. KERNEL2x8_2 26, 0
  75. KERNEL2x8_2 27, 0
  76. KERNEL2x8_2 28, 0
  77. KERNEL2x8_2 29, 0
  78. KERNEL2x8_2 30, 0
  79. KERNEL2x8_2 31, 0
  80. KERNEL2x8_2 32, 0
  81. KERNEL2x8_2 33, 0
  82. KERNEL2x8_2 34, 0
  83. KERNEL2x8_2 35, 0
  84. KERNEL2x8_2 36, 0
  85. KERNEL2x8_2 37, 0
  86. KERNEL2x8_2 38, 0
  87. KERNEL2x8_2 39, 0
  88. KERNEL2x8_2 40, 0
  89. KERNEL2x8_2 41, 0
  90. KERNEL2x8_2 42, 0
  91. KERNEL2x8_2 43, 0
  92. KERNEL2x8_2 44, 0
  93. KERNEL2x8_2 45, 0
  94. KERNEL2x8_2 46, 0
  95. KERNEL2x8_2 47, 0
  96. KERNEL2x8_2 48, 0
  97. KERNEL2x8_2 49, 0
  98. KERNEL2x8_2 50, 0
  99. KERNEL2x8_2 51, 0
  100. KERNEL2x8_2 52, 0
  101. KERNEL2x8_2 53, 0
  102. KERNEL2x8_2 54, 0
  103. KERNEL2x8_2 55, 0
  104. KERNEL2x8_2 56, 0
  105. KERNEL2x8_2 57, 0
  106. KERNEL2x8_2 58, 0
  107. KERNEL2x8_2 59, 0
  108. KERNEL2x8_2 60, 0
  109. KERNEL2x8_2 61, 0
  110. KERNEL2x8_2 62, 0
  111. KERNEL2x8_2 63, 1
  112. bdz ZGEMM_L2x8_LOOP_END /* decrement CTR; leave the loop once it reaches zero */
  113. b ZGEMM_L2x8_LOOP /* otherwise start the next pass (bdz + b here, where the smaller loops use a single bdnz) */
  114. MY_ALIGN
  115. ZGEMM_L2x8_LOOP_END:
  116. /* Tail: one more KERNEL2x8_2 with arguments 0, 1, then return to the caller. */
  117. KERNEL2x8_2 0, 1
  118. blr
  119. MY_ALIGN
  120. ZGEMM_2x4_LMAIN_SUB:
  121. /* 2x4 main-loop subroutine. Entry: T8 = pass count, copied into CTR; returns with blr. */
  122. mtctr T8
  123. MY_ALIGN
  124. ZGEMM_L2x4_LOOP:
  125. /* One pass = 16 KERNEL2x4_2 invocations (first argument 0..15). */
  126. KERNEL2x4_2 0, 0
  127. ZGEMM_L2x4_K32:
  128. /* Secondary entry point: ZGEMM_L2x4_SUB0 and CMP2x4_32K bl here with CTR = 1, skipping invocation 0. */
  129. KERNEL2x4_2 1, 0
  130. KERNEL2x4_2 2, 0
  131. KERNEL2x4_2 3, 0
  132. KERNEL2x4_2 4, 0
  133. KERNEL2x4_2 5, 0
  134. KERNEL2x4_2 6, 0
  135. KERNEL2x4_2 7, 0
  136. KERNEL2x4_2 8, 0
  137. KERNEL2x4_2 9, 0
  138. KERNEL2x4_2 10, 0
  139. KERNEL2x4_2 11, 0
  140. KERNEL2x4_2 12, 0
  141. KERNEL2x4_2 13, 0
  142. KERNEL2x4_2 14, 0
  143. KERNEL2x4_2 15, 1
  144. bdnz ZGEMM_L2x4_LOOP
  145. MY_ALIGN
  146. ZGEMM_L2x4_LOOP_END:
  147. /* Tail: one more KERNEL2x4_2 with arguments 0, 1, then return to the caller. */
  148. KERNEL2x4_2 0, 1
  149. blr
  150. MY_ALIGN
  151. ZGEMM_2x2_LMAIN_SUB:
  152. /* 2x2 main-loop subroutine. Entry: T8 = pass count, copied into CTR; returns with blr. */
  153. mtctr T8
  154. MY_ALIGN
  155. ZGEMM_L2x2_LOOP:
  156. /* One pass = 16 KERNEL2x2_2 invocations (first argument 0..15). */
  157. KERNEL2x2_2 0, 0
  158. ZGEMM_L2x2_K32:
  159. /* Secondary entry point: ZGEMM_L2x2_SUB0 and CMP2x2_32K bl here with CTR = 1, skipping invocation 0. */
  160. KERNEL2x2_2 1, 0
  161. KERNEL2x2_2 2, 0
  162. KERNEL2x2_2 3, 0
  163. KERNEL2x2_2 4, 0
  164. KERNEL2x2_2 5, 0
  165. KERNEL2x2_2 6, 0
  166. KERNEL2x2_2 7, 0
  167. KERNEL2x2_2 8, 0
  168. KERNEL2x2_2 9, 0
  169. KERNEL2x2_2 10, 0
  170. KERNEL2x2_2 11, 0
  171. KERNEL2x2_2 12, 0
  172. KERNEL2x2_2 13, 0
  173. KERNEL2x2_2 14, 0
  174. KERNEL2x2_2 15, 1
  175. bdnz ZGEMM_L2x2_LOOP
  176. MY_ALIGN
  177. ZGEMM_L2x2_LOOP_END:
  178. /* Tail: one more KERNEL2x2_2 with arguments 0, 1, then return to the caller. */
  179. KERNEL2x2_2 0, 1
  180. blr
  181. MY_ALIGN
  182. ZGEMM_2x1_LMAIN_SUB:
  183. /* 2x1 main-loop subroutine. Entry: T8 = pass count, copied into CTR; LOAD2x1_2 runs once before the loop. */
  184. mtctr T8
  185. LOAD2x1_2
  186. MY_ALIGN
  187. ZGEMM_L2x1_LOOP:
  188. /* One pass = 16 KERNEL2x1_L2 invocations (third argument 0..15, fourth argument 1 only on the last). */
  189. KERNEL2x1_L2 32, 64, 0, 0
  190. ZGEMM_L2x1_K32:
  191. /* Secondary entry point: ZGEMM_L2x1_SUB0 and CMP2x1_32K bl here with CTR = 1 after doing their own loads. */
  192. KERNEL2x1_L2 32, 64, 1, 0
  193. KERNEL2x1_L2 32, 64, 2, 0
  194. KERNEL2x1_L2 32, 64, 3, 0
  195. KERNEL2x1_L2 32, 64, 4, 0
  196. KERNEL2x1_L2 32, 64, 5, 0
  197. KERNEL2x1_L2 32, 64, 6, 0
  198. KERNEL2x1_L2 32, 64, 7, 0
  199. KERNEL2x1_L2 32, 64, 8, 0
  200. KERNEL2x1_L2 32, 64, 9, 0
  201. KERNEL2x1_L2 32, 64, 10, 0
  202. KERNEL2x1_L2 32, 64, 11, 0
  203. KERNEL2x1_L2 32, 64, 12, 0
  204. KERNEL2x1_L2 32, 64, 13, 0
  205. KERNEL2x1_L2 32, 64, 14, 0
  206. KERNEL2x1_L2 32, 64, 15, 1
  207. bdnz ZGEMM_L2x1_LOOP
  208. MY_ALIGN
  209. ZGEMM_L2x1_LOOP_END:
  210. /* Tail: END2x1_2 (no further KERNEL invocation here, unlike the 2x8/2x4/2x2 subroutines), then return. */
  211. END2x1_2
  212. blr
  213. MY_ALIGN
  214. /* MAIN LOOP BEGINS */
  215. MY_ALIGN
  216. ZGEMM_L2:
  217. /* Outer loop: J = N >> 1 iterations of ZGEMM_L2_BEGIN; skipped entirely when that count is not positive. */
  218. #if defined(TRMMKERNEL) && !defined(LEFT)
  219. neg TEMP_REG, OFFSET
  220. #endif
  221. srawi. J, N, 1
  222. bgt ZGEMM_L2_BEGIN
  223. b ZGEMM_L2_END
  224. ZGEMM_L2_BEGIN:
  225. /* Per-iteration setup: CO = C, T2 = C + LDC, AO = A, C += 2*LDC; I = M >> 3 is the trip count of the ZGEMM_L2x8_BEGIN loop. */
  226. mr CO, C
  227. slwi T1, LDC, 1
  228. add T2,C,LDC
  229. mr AO, A
  230. add C, C, T1
  231. #if defined(TRMMKERNEL) && defined(LEFT)
  232. mr TEMP_REG, OFFSET /*off = offset;*/
  233. #endif
  234. srawi. I, M, 3
  235. bgt ZGEMM_L2_BEGIN_CONTINUE
  236. b ZGEMM_L2x8_END
  237. ZGEMM_L2_BEGIN_CONTINUE:
  238. dcbt CO,r0 /*just prefetch*/
  239. dcbt T2,r0
  240. ZGEMM_L2x8_BEGIN:
  241. /* Start of one 2x8 tile: set up BO/AO and derive the loop counts from K (T6 when TRMMKERNEL is defined). */
  242. #if defined(TRMMKERNEL)
  243. REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2
  244. #else
  245. mr BO, B
  246. dcbt B, r0
  247. #endif
  248. dcbt AO, r0
  249. #if defined(TRMMKERNEL)
  250. REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2
  251. mr T1, T6
  252. #else
  253. mr T1, K
  254. #endif
  255. /* TEMPS FOR PREFETCH: T2/T3 are dcbt offsets used inside the unrolled 2x8 kernels */
  256. li T2, 1024
  257. li T3, 1024+512
  258. addi T1,T1, -2
  259. /* TEMPS FOR PREFETCH: T4/T5, same purpose */
  260. li T4, 2048
  261. li T5, 2048+512
  262. srawi. T8, T1, 7 /* T8 <- T1 / 128 (shift right by 7, not a modulo); T1 = K-2, or T6-2 for TRMM */
  263. KERNEL2x8_PRELOAD
  264. KERNEL2x8_ZERO_AND_PRIME_MMA
  265. ble ZGEMM_L2x8_SUB0 /* NOTE(review): presumably tests CR0 from the srawi. above - confirm the two macros leave CR0 untouched */
  266. bl ZGEMM_L2x8_LMAIN_SUB
  267. andi. L, T1, 127 /* L <- T1 mod 128: what is left for ZGEMM_L2x8_SUB2 */
  268. bgt ZGEMM_L2x8_BEGIN_CONTINUE
  269. b ZGEMM_L2x8_SAVE
  270. ZGEMM_L2x8_BEGIN_CONTINUE:
  271. b ZGEMM_L2x8_SUB2
  272. ZGEMM_L2x8_SUB0:
  273. /* Reached when T8 <= 0. L = K & 255 (T6 for TRMM); K == 129 and K == 128 get dedicated paths, anything else goes to ZGEMM_L2x8_SUB2. */
  274. #if defined(TRMMKERNEL)
  275. andi. L, T6, 255
  276. cmpwi T6, 129
  277. #else
  278. andi. L, K, 255
  279. cmpwi K, 129
  280. #endif
  281. li T8, 1
  282. bne CMP2x8_128K
  283. LOAD_END_2x8 128, 32
  284. KERNEL2x8_PRELOAD
  285. addi BO, BO, -64
  286. addi AO,AO, -256
  287. mtctr T8
  288. bl ZGEMM_L2x8_K128
  289. b ZGEMM_L2x8_SAVE
  290. CMP2x8_128K:
  291. /* K == 128 (T6 for TRMM): step BO back by 64 and AO back by 256, then run the K128 entry once with CTR = 1. */
  292. #if defined(TRMMKERNEL)
  293. cmpwi T6, 128
  294. #else
  295. cmpwi K, 128
  296. #endif
  297. bne ZGEMM_L2x8_SUB2
  298. MY_ALIGN
  299. mtctr T8
  300. addi BO, BO, -64
  301. addi AO,AO, -256
  302. bl ZGEMM_L2x8_K128
  303. b ZGEMM_L2x8_SAVE
  304. MY_ALIGN
  305. ZGEMM_L2x8_SUB2:
  306. /* Remainder chain: each stage tests one bit of L (64, 32, 16, 8, 4, 2, 1) and falls through to the next. Bit 64 set -> 32 KERNEL2x8_2 invocations. */
  307. andi. T1,L, 64
  308. ble ZGEMM_L2x8_SUB2_32
  309. dcbt AO, PRE
  310. dcbt BO, PRE
  311. KERNEL2x8_2 0, 0
  312. KERNEL2x8_2 1, 0
  313. dcbt AO, T2
  314. KERNEL2x8_2 2, 0
  315. KERNEL2x8_2 3, 0
  316. dcbt AO, T3
  317. dcbt BO, T2
  318. KERNEL2x8_2 4, 0
  319. KERNEL2x8_2 5, 0
  320. dcbt AO, T4
  321. KERNEL2x8_2 6, 0
  322. KERNEL2x8_2 7, 0
  323. dcbt AO, T5
  324. dcbt BO, T3
  325. KERNEL2x8_2 8, 0
  326. KERNEL2x8_2 9, 0
  327. KERNEL2x8_2 10, 0
  328. KERNEL2x8_2 11, 0
  329. dcbt BO, T4
  330. KERNEL2x8_2 12, 0
  331. KERNEL2x8_2 13, 0
  332. KERNEL2x8_2 14, 0
  333. KERNEL2x8_2 15, 0
  334. KERNEL2x8_2 16, 0
  335. KERNEL2x8_2 17, 0
  336. KERNEL2x8_2 18, 0
  337. KERNEL2x8_2 19, 0
  338. KERNEL2x8_2 20, 0
  339. KERNEL2x8_2 21, 0
  340. KERNEL2x8_2 22, 0
  341. KERNEL2x8_2 23, 0
  342. KERNEL2x8_2 24, 0
  343. KERNEL2x8_2 25, 0
  344. KERNEL2x8_2 26, 0
  345. KERNEL2x8_2 27, 0
  346. KERNEL2x8_2 28, 0
  347. KERNEL2x8_2 29, 0
  348. KERNEL2x8_2 30, 0
  349. KERNEL2x8_2 31, 1
  350. MY_ALIGN
  351. ZGEMM_L2x8_SUB2_32:
  352. /* Bit 32 of L set -> 16 KERNEL2x8_2 invocations. */
  353. andi. T1,L, 32
  354. ble ZGEMM_L2x8_SUB2_16
  355. dcbt AO, PRE
  356. dcbt BO, PRE
  357. KERNEL2x8_2 0, 0
  358. KERNEL2x8_2 1, 0
  359. dcbt AO, T2
  360. KERNEL2x8_2 2, 0
  361. KERNEL2x8_2 3, 0
  362. dcbt AO, T3
  363. dcbt BO, T2
  364. KERNEL2x8_2 4, 0
  365. KERNEL2x8_2 5, 0
  366. dcbt AO, T4
  367. KERNEL2x8_2 6, 0
  368. KERNEL2x8_2 7, 0
  369. dcbt AO, T5
  370. dcbt BO, T3
  371. KERNEL2x8_2 8, 0
  372. KERNEL2x8_2 9, 0
  373. KERNEL2x8_2 10, 0
  374. KERNEL2x8_2 11, 0
  375. dcbt BO, T4
  376. KERNEL2x8_2 12, 0
  377. KERNEL2x8_2 13, 0
  378. KERNEL2x8_2 14, 0
  379. KERNEL2x8_2 15, 1
  380. MY_ALIGN
  381. ZGEMM_L2x8_SUB2_16:
  382. /* Bit 16 of L set -> 8 KERNEL2x8_2 invocations. */
  383. andi. T1,L, 16
  384. ble ZGEMM_L2x8_SUB2_8
  385. dcbt AO, PRE
  386. dcbt BO, PRE
  387. KERNEL2x8_2 0, 0
  388. KERNEL2x8_2 1, 0
  389. dcbt AO, T2
  390. KERNEL2x8_2 2, 0
  391. KERNEL2x8_2 3, 0
  392. dcbt AO, T3
  393. dcbt BO, T2
  394. KERNEL2x8_2 4, 0
  395. KERNEL2x8_2 5, 0
  396. dcbt AO, T4
  397. KERNEL2x8_2 6, 0
  398. KERNEL2x8_2 7, 1
  399. MY_ALIGN
  400. ZGEMM_L2x8_SUB2_8:
  401. /* Bit 8 of L set -> 4 KERNEL2x8_2 invocations (no dcbt from here on). */
  402. andi. T1,L, 8
  403. ble ZGEMM_L2x8_SUB2_4
  404. KERNEL2x8_2 0, 0
  405. KERNEL2x8_2 1, 0
  406. KERNEL2x8_2 2, 0
  407. KERNEL2x8_2 3, 1
  408. MY_ALIGN
  409. ZGEMM_L2x8_SUB2_4:
  410. /* Bit 4 of L set -> 2 KERNEL2x8_2 invocations. */
  411. andi. T1,L, 4
  412. ble ZGEMM_L2x8_SUB2_2
  413. KERNEL2x8_2 0, 0
  414. KERNEL2x8_2 1, 1
  415. MY_ALIGN
  416. ZGEMM_L2x8_SUB2_2:
  417. /* Bit 2 of L set -> 1 KERNEL2x8_2 invocation. */
  418. andi. T1,L, 2
  419. ble ZGEMM_L2x8_SUB2_1
  420. KERNEL2x8_2 0, 1
  421. MY_ALIGN
  422. ZGEMM_L2x8_SUB2_1:
  423. /* Bit 1 of L set -> LOAD_END_2x8 instead of a KERNEL2x8_2 invocation; otherwise go straight to the save. */
  424. andi. T1,L, 1
  425. ble ZGEMM_L2x8_SAVE
  426. LOAD_END_2x8 128, 32
  427. ZGEMM_L2x8_SAVE:
  428. /* Finish one 2x8 tile: I--, KERNEL2x8_UNPRIME_MMA + SAVE2x8, then loop to ZGEMM_L2x8_BEGIN while I > 0. */
  429. addic. I, I, -1
  430. KERNEL2x8_UNPRIME_MMA
  431. SAVE2x8
  432. #if defined(TRMMKERNEL)
  433. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2
  434. #endif
  435. ble ZGEMM_L2x8_SAVE_CONTINUE /* NOTE(review): presumably tests CR0 from the addic. above - confirm the macros in between leave CR0 untouched */
  436. b ZGEMM_L2x8_BEGIN
  437. ZGEMM_L2x8_SAVE_CONTINUE:
  438. andi. T2, M, 7
  439. ble ZGEMM_L2x1_END
  440. andi. T1, M, 4
  441. ble ZGEMM_L2x4_END
  442. b ZGEMM_L2x4_BEGIN
  443. MY_ALIGN
  444. ZGEMM_L2x8_END:
  445. /* Reached directly from ZGEMM_L2_BEGIN when M >> 3 is not positive; falls through into ZGEMM_L2x4_BEGIN. */
  446. ZGEMM_L2x4_BEGIN:
  447. /* 2x4 tile: skipped when M & 7 == 0 (nothing left, go to ZGEMM_L2x1_END) or when bit 4 of M is clear. */
  448. andi. T2, M, 7
  449. ble ZGEMM_L2x1_END
  450. andi. T1, M, 4
  451. ble ZGEMM_L2x4_END
  452. #if defined(TRMMKERNEL)
  453. REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2
  454. #else
  455. mr BO, B
  456. #endif
  457. #if defined(TRMMKERNEL)
  458. REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2
  459. mr T1, T6
  460. addi T1,T1, -2
  461. srawi. T8, T1, 5 /* T8 <- (T6-2) / 32 (shift right by 5, not a modulo) */
  462. #else
  463. mr T1, K
  464. addi T1,T1, -2
  465. srawi. T8, T1, 5 /* T8 <- (K-2) / 32 (shift right by 5, not a modulo) */
  466. #endif
  467. KERNEL2x4_PRELOAD
  468. KERNEL2x4_ZERO_AND_PRIME_MMA
  469. ble ZGEMM_L2x4_SUB0 /* NOTE(review): presumably tests CR0 from the srawi. above - confirm the two macros leave CR0 untouched */
  470. bl ZGEMM_2x4_LMAIN_SUB
  471. andi. L, T1, 31
  472. ble ZGEMM_L2x4_SAVE
  473. b ZGEMM_L2x4_SUB2
  474. ZGEMM_L2x4_SUB0:
  475. /* Reached when T8 <= 0. L = K & 63 (T6 for TRMM); K == 33 and K == 32 get dedicated paths, anything else goes to ZGEMM_L2x4_SUB2. */
  476. #if defined(TRMMKERNEL)
  477. andi. L, T6, 63
  478. cmpwi T6, 33
  479. #else
  480. andi. L, K, 63
  481. cmpwi K, 33
  482. #endif
  483. li T8, 1
  484. bne CMP2x4_32K
  485. LOAD_END_2x4 64, 32
  486. KERNEL2x4_PRELOAD
  487. addi BO, BO, -64
  488. addi AO,AO, -128
  489. mtctr T8
  490. bl ZGEMM_L2x4_K32
  491. b ZGEMM_L2x4_SAVE
  492. CMP2x4_32K:
  493. /* K == 32 (T6 for TRMM): step BO back by 64 and AO back by 128, then run the K32 entry once with CTR = 1. */
  494. #if defined(TRMMKERNEL)
  495. cmpwi T6, 32
  496. #else
  497. cmpwi K, 32
  498. #endif
  499. bne ZGEMM_L2x4_SUB2
  500. MY_ALIGN
  501. mtctr T8
  502. addi BO, BO, -64
  503. addi AO,AO, -128
  504. bl ZGEMM_L2x4_K32
  505. b ZGEMM_L2x4_SAVE
  506. MY_ALIGN
  507. MY_ALIGN
  508. ZGEMM_L2x4_SUB2:
  509. /* Remainder chain on bits 16, 8, 4, 2, 1 of L. Bit 16 set -> 8 KERNEL2x4_2 invocations. */
  510. andi. T1,L, 16
  511. ble ZGEMM_L2x4_SUB2_8
  512. KERNEL2x4_2 0, 0
  513. KERNEL2x4_2 1, 0
  514. KERNEL2x4_2 2, 0
  515. KERNEL2x4_2 3, 0
  516. KERNEL2x4_2 4, 0
  517. KERNEL2x4_2 5, 0
  518. KERNEL2x4_2 6, 0
  519. KERNEL2x4_2 7, 1
  520. MY_ALIGN
  521. ZGEMM_L2x4_SUB2_8:
  522. /* Bit 8 of L set -> 4 KERNEL2x4_2 invocations. */
  523. andi. T1,L, 8
  524. ble ZGEMM_L2x4_SUB2_4
  525. KERNEL2x4_2 0, 0
  526. KERNEL2x4_2 1, 0
  527. KERNEL2x4_2 2, 0
  528. KERNEL2x4_2 3, 1
  529. MY_ALIGN
  530. ZGEMM_L2x4_SUB2_4:
  531. /* Bit 4 of L set -> 2 KERNEL2x4_2 invocations. */
  532. andi. T1,L, 4
  533. ble ZGEMM_L2x4_SUB2_2
  534. KERNEL2x4_2 0, 0
  535. KERNEL2x4_2 1, 1
  536. MY_ALIGN
  537. ZGEMM_L2x4_SUB2_2:
  538. /* Bit 2 of L set -> 1 KERNEL2x4_2 invocation. */
  539. andi. T1,L, 2
  540. ble ZGEMM_L2x4_SUB2_1
  541. KERNEL2x4_2 0, 1
  542. MY_ALIGN
  543. ZGEMM_L2x4_SUB2_1:
  544. /* Bit 1 of L set -> LOAD_END_2x4; otherwise go straight to the save. */
  545. andi. T1,L, 1
  546. ble ZGEMM_L2x4_SAVE
  547. LOAD_END_2x4 64, 32
  548. ZGEMM_L2x4_SAVE:
  549. /* Finish the 2x4 tile: KERNEL2x4_UNPRIME_MMA + SAVE2x4; no loop counter here, so this runs at most once per J iteration. */
  550. KERNEL2x4_UNPRIME_MMA
  551. SAVE2x4
  552. #if defined(TRMMKERNEL)
  553. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2
  554. #endif
  555. ZGEMM_L2x4_END:
  556. /*----------------------------------------*/
  557. ZGEMM_L2x2_BEGIN:
  558. /* 2x2 tile: runs only when bit 2 of M is set. */
  559. andi. T1, M, 2
  560. ble ZGEMM_L2x2_END
  561. #if defined(TRMMKERNEL)
  562. REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2
  563. #else
  564. mr BO, B
  565. #endif
  566. #if defined(TRMMKERNEL)
  567. REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2
  568. mr T1, T6
  569. addi T1,T1, -2
  570. srawi. T8, T1, 5 /* T8 <- (T6-2) / 32 (shift right by 5, not a modulo) */
  571. #else
  572. mr T1, K
  573. addi T1,T1, -2
  574. srawi. T8, T1, 5 /* T8 <- (K-2) / 32 (shift right by 5, not a modulo) */
  575. #endif
  576. KERNEL2x2_PRELOAD
  577. KERNEL2x2_ZERO_AND_PRIME_MMA
  578. ble ZGEMM_L2x2_SUB0 /* NOTE(review): presumably tests CR0 from the srawi. above - confirm the two macros leave CR0 untouched */
  579. bl ZGEMM_2x2_LMAIN_SUB
  580. andi. L, T1, 31
  581. ble ZGEMM_L2x2_SAVE
  582. b ZGEMM_L2x2_SUB2
  583. ZGEMM_L2x2_SUB0:
  584. /* Reached when T8 <= 0. L = K & 63 (T6 for TRMM); K == 33 and K == 32 get dedicated paths, anything else goes to ZGEMM_L2x2_SUB2. */
  585. #if defined(TRMMKERNEL)
  586. andi. L, T6, 63
  587. cmpwi T6, 33
  588. #else
  589. andi. L, K, 63
  590. cmpwi K, 33
  591. #endif
  592. li T8, 1
  593. bne CMP2x2_32K
  594. LOAD_END_2x2 32, 32
  595. KERNEL2x2_PRELOAD
  596. addi BO, BO, -64
  597. addi AO,AO, -64
  598. mtctr T8
  599. bl ZGEMM_L2x2_K32
  600. b ZGEMM_L2x2_SAVE
  601. CMP2x2_32K:
  602. /* K == 32 (T6 for TRMM): step BO and AO back by 64 each, then run the K32 entry once with CTR = 1. */
  603. #if defined(TRMMKERNEL)
  604. cmpwi T6, 32
  605. #else
  606. cmpwi K, 32
  607. #endif
  608. bne ZGEMM_L2x2_SUB2
  609. MY_ALIGN
  610. mtctr T8
  611. addi BO, BO, -64
  612. addi AO,AO, -64
  613. bl ZGEMM_L2x2_K32
  614. b ZGEMM_L2x2_SAVE
  615. MY_ALIGN
  616. MY_ALIGN
  617. ZGEMM_L2x2_SUB2:
  618. /* Remainder chain on bits 16, 8, 4, 2, 1 of L. Bit 16 set -> 8 KERNEL2x2_2 invocations. */
  619. andi. T1,L, 16
  620. ble ZGEMM_L2x2_SUB2_8
  621. KERNEL2x2_2 0, 0
  622. KERNEL2x2_2 1, 0
  623. KERNEL2x2_2 2, 0
  624. KERNEL2x2_2 3, 0
  625. KERNEL2x2_2 4, 0
  626. KERNEL2x2_2 5, 0
  627. KERNEL2x2_2 6, 0
  628. KERNEL2x2_2 7, 1
  629. MY_ALIGN
  630. ZGEMM_L2x2_SUB2_8:
  631. /* Bit 8 of L set -> 4 KERNEL2x2_2 invocations. */
  632. andi. T1,L, 8
  633. ble ZGEMM_L2x2_SUB2_4
  634. KERNEL2x2_2 0, 0
  635. KERNEL2x2_2 1, 0
  636. KERNEL2x2_2 2, 0
  637. KERNEL2x2_2 3, 1
  638. MY_ALIGN
  639. ZGEMM_L2x2_SUB2_4:
  640. /* Bit 4 of L set -> 2 KERNEL2x2_2 invocations. */
  641. andi. T1,L, 4
  642. ble ZGEMM_L2x2_SUB2_2
  643. KERNEL2x2_2 0, 0
  644. KERNEL2x2_2 1, 1
  645. MY_ALIGN
  646. ZGEMM_L2x2_SUB2_2:
  647. /* Bit 2 of L set -> 1 KERNEL2x2_2 invocation. */
  648. andi. T1,L, 2
  649. ble ZGEMM_L2x2_SUB2_1
  650. KERNEL2x2_2 0, 1
  651. MY_ALIGN
  652. ZGEMM_L2x2_SUB2_1:
  653. /* Bit 1 of L set -> LOAD_END_2x2; otherwise go straight to the save. */
  654. andi. T1,L, 1
  655. ble ZGEMM_L2x2_SAVE
  656. LOAD_END_2x2 32, 32
  657. ZGEMM_L2x2_SAVE:
  658. /* Finish the 2x2 tile: KERNEL2x2_UNPRIME_MMA + SAVE2x2. */
  659. KERNEL2x2_UNPRIME_MMA
  660. SAVE2x2
  661. #if defined(TRMMKERNEL)
  662. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2
  663. #endif
  664. ZGEMM_L2x2_END:
  665. /*----------------------------------------*/
  666. ZGEMM_L2x1_BEGIN:
  667. /* 2x1 tile: runs only when bit 1 of M is set. Uses ZERO2x1 / LOAD2x1_2 / KERNEL2x1_* instead of the PRELOAD / PRIME_MMA macros of the wider tiles. */
  668. andi. T1, M, 1
  669. ble ZGEMM_L2x1_END
  670. #if defined(TRMMKERNEL)
  671. REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2
  672. #else
  673. mr BO, B
  674. #endif
  675. #if defined(TRMMKERNEL)
  676. REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2
  677. mr T1, T6
  678. addi T1,T1, -2
  679. srawi. T8, T1, 5 /* T8 <- (T6-2) / 32 (shift right by 5, not a modulo) */
  680. #else
  681. mr T1, K
  682. addi T1,T1, -2
  683. srawi. T8, T1, 5 /* T8 <- (K-2) / 32 (shift right by 5, not a modulo) */
  684. #endif
  685. ZERO2x1
  686. ble ZGEMM_L2x1_SUB0 /* NOTE(review): presumably tests CR0 from the srawi. above - confirm ZERO2x1 leaves CR0 untouched */
  687. bl ZGEMM_2x1_LMAIN_SUB
  688. andi. L, T1, 31
  689. ble ZGEMM_L2x1_SAVE
  690. b ZGEMM_L2x1_SUB2
  691. ZGEMM_L2x1_SUB0:
  692. /* Reached when T8 <= 0. L = K & 63 (T6 for TRMM); K == 33 and K == 32 get dedicated paths, anything else goes to ZGEMM_L2x1_SUB2. */
  693. #if defined(TRMMKERNEL)
  694. andi. L, T6, 63
  695. cmpwi T6, 33
  696. #else
  697. andi. L, K, 63
  698. cmpwi K, 33
  699. #endif
  700. li T8, 1
  701. bne CMP2x1_32K
  702. addi BO, BO, -32
  703. addi AO,AO, -16
  704. LOAD2x1O 16, 32
  705. END2x1_WITHOUT_ADD
  706. LOAD2x1_2O 32, 64
  707. mtctr T8
  708. bl ZGEMM_L2x1_K32
  709. b ZGEMM_L2x1_SAVE
  710. CMP2x1_32K:
  711. /* K == 32 (T6 for TRMM): step BO back by 64 and AO back by 32, reload with LOAD2x1_2O, then run the K32 entry once with CTR = 1. */
  712. #if defined(TRMMKERNEL)
  713. cmpwi T6, 32
  714. #else
  715. cmpwi K, 32
  716. #endif
  717. bne ZGEMM_L2x1_SUB2
  718. MY_ALIGN
  719. mtctr T8
  720. addi BO, BO, -64
  721. addi AO,AO, -32
  722. LOAD2x1_2O 32, 64
  723. bl ZGEMM_L2x1_K32
  724. b ZGEMM_L2x1_SAVE
  725. MY_ALIGN
  726. MY_ALIGN
  727. ZGEMM_L2x1_SUB2:
  728. /* Remainder chain on bits 16, 8, 4, 2, 1 of L. Each stage starts with LOAD2x1_2 and ends with KERNEL2x1_E2 instead of KERNEL2x1_L2. Bit 16 -> 8 invocations. */
  729. andi. T1,L, 16
  730. ble ZGEMM_L2x1_SUB2_8
  731. LOAD2x1_2
  732. KERNEL2x1_L2 32, 64, 0, 0
  733. KERNEL2x1_L2 32, 64, 1, 0
  734. KERNEL2x1_L2 32, 64, 2, 0
  735. KERNEL2x1_L2 32, 64, 3, 0
  736. KERNEL2x1_L2 32, 64, 4, 0
  737. KERNEL2x1_L2 32, 64, 5, 0
  738. KERNEL2x1_L2 32, 64, 6, 0
  739. KERNEL2x1_E2 32, 64, 7, 1
  740. MY_ALIGN
  741. ZGEMM_L2x1_SUB2_8:
  742. /* Bit 8 of L set -> 4 invocations. */
  743. andi. T1,L, 8
  744. ble ZGEMM_L2x1_SUB2_4
  745. LOAD2x1_2
  746. KERNEL2x1_L2 32, 64, 0, 0
  747. KERNEL2x1_L2 32, 64, 1, 0
  748. KERNEL2x1_L2 32, 64, 2, 0
  749. KERNEL2x1_E2 32, 64, 3, 1
  750. MY_ALIGN
  751. ZGEMM_L2x1_SUB2_4:
  752. /* Bit 4 of L set -> 2 invocations. */
  753. andi. T1,L, 4
  754. ble ZGEMM_L2x1_SUB2_2
  755. LOAD2x1_2
  756. KERNEL2x1_L2 32, 64, 0, 0
  757. KERNEL2x1_E2 32, 64, 1, 1
  758. MY_ALIGN
  759. ZGEMM_L2x1_SUB2_2:
  760. /* Bit 2 of L set -> 1 invocation. */
  761. andi. T1,L, 2
  762. ble ZGEMM_L2x1_SUB2_1
  763. LOAD2x1_2
  764. KERNEL2x1_E2 32, 64, 0, 1
  765. MY_ALIGN
  766. ZGEMM_L2x1_SUB2_1:
  767. /* Bit 1 of L set -> a single KERNEL2x1; otherwise go straight to the save. */
  768. andi. T1,L, 1
  769. ble ZGEMM_L2x1_SAVE
  770. KERNEL2x1
  771. ZGEMM_L2x1_SAVE:
  772. /* Finish the 2x1 tile with SAVE2x1 (no UNPRIME_MMA step on this path). */
  773. SAVE2x1
  774. #if defined(TRMMKERNEL)
  775. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2
  776. #endif
  777. ZGEMM_L2x1_END:
  778. /* End of one J iteration: B += K << 5, J--, and repeat ZGEMM_L2_BEGIN while J > 0 (CR0 comes from the addic.). */
  779. slwi T1, K, 5
  780. addic. J, J, -1
  781. add B, B, T1
  782. #if defined(TRMMKERNEL) && !defined(LEFT)
  783. addi TEMP_REG, TEMP_REG, 2
  784. #endif
  785. ble ZGEMM_L2_END
  786. b ZGEMM_L2_BEGIN
  787. ZGEMM_L2_END:
  788. b ZGEMM_L1 /* jump over the 1xN mini subroutines that follow */
  789. /* MINI SUBROUTINES */
  790. /* 1x8 MAIN 128x+2 LOOP */
  791. ZGEMM_L1x8_LMAIN_SUB:
  792. /* Entry: T8 holds the number of passes through the unrolled body; it is copied into CTR. Returns with blr. */
  793. mtctr T8
  794. MY_ALIGN
  795. ZGEMM_L1x8_LOOP:
  796. /* One pass = 64 KERNEL1x8_2 invocations (first argument 0..63) interleaved with dcbt touches on AO/BO. */
  797. dcbt AO, PRE
  798. dcbt BO, PRE
  799. KERNEL1x8_2 0, 0
  800. ZGEMM_L1x8_K128:
  801. /* Secondary entry label placed after invocation 0, mirroring ZGEMM_L2x8_K128; its callers are outside this view. */
  802. KERNEL1x8_2 1, 0
  803. dcbt AO, T2
  804. KERNEL1x8_2 2, 0
  805. KERNEL1x8_2 3, 0
  806. dcbt AO, T3
  807. dcbt BO, T2
  808. KERNEL1x8_2 4, 0
  809. KERNEL1x8_2 5, 0
  810. dcbt AO, T4
  811. KERNEL1x8_2 6, 0
  812. KERNEL1x8_2 7, 0
  813. dcbt AO, T5
  814. dcbt BO, T3
  815. KERNEL1x8_2 8, 0
  816. KERNEL1x8_2 9, 0
  817. KERNEL1x8_2 10, 0
  818. KERNEL1x8_2 11, 0
  819. dcbt BO, T4
  820. KERNEL1x8_2 12, 0
  821. KERNEL1x8_2 13, 0
  822. KERNEL1x8_2 14, 0
  823. KERNEL1x8_2 15, 0
  824. KERNEL1x8_2 16, 0
  825. KERNEL1x8_2 17, 0
  826. KERNEL1x8_2 18, 0
  827. KERNEL1x8_2 19, 0
  828. KERNEL1x8_2 20, 0
  829. KERNEL1x8_2 21, 0
  830. KERNEL1x8_2 22, 0
  831. KERNEL1x8_2 23, 0
  832. KERNEL1x8_2 24, 0
  833. KERNEL1x8_2 25, 0
  834. KERNEL1x8_2 26, 0
  835. KERNEL1x8_2 27, 0
  836. KERNEL1x8_2 28, 0
  837. KERNEL1x8_2 29, 0
  838. KERNEL1x8_2 30, 0
  839. KERNEL1x8_2 31, 0
  840. KERNEL1x8_2 32, 0
  841. KERNEL1x8_2 33, 0
  842. KERNEL1x8_2 34, 0
  843. KERNEL1x8_2 35, 0
  844. KERNEL1x8_2 36, 0
  845. KERNEL1x8_2 37, 0
  846. KERNEL1x8_2 38, 0
  847. KERNEL1x8_2 39, 0
  848. KERNEL1x8_2 40, 0
  849. KERNEL1x8_2 41, 0
  850. KERNEL1x8_2 42, 0
  851. KERNEL1x8_2 43, 0
  852. KERNEL1x8_2 44, 0
  853. KERNEL1x8_2 45, 0
  854. KERNEL1x8_2 46, 0
  855. KERNEL1x8_2 47, 0
  856. KERNEL1x8_2 48, 0
  857. KERNEL1x8_2 49, 0
  858. KERNEL1x8_2 50, 0
  859. KERNEL1x8_2 51, 0
  860. KERNEL1x8_2 52, 0
  861. KERNEL1x8_2 53, 0
  862. KERNEL1x8_2 54, 0
  863. KERNEL1x8_2 55, 0
  864. KERNEL1x8_2 56, 0
  865. KERNEL1x8_2 57, 0
  866. KERNEL1x8_2 58, 0
  867. KERNEL1x8_2 59, 0
  868. KERNEL1x8_2 60, 0
  869. KERNEL1x8_2 61, 0
  870. KERNEL1x8_2 62, 0
  871. KERNEL1x8_2 63, 1
  872. bdnz ZGEMM_L1x8_LOOP
  873. MY_ALIGN
  874. ZGEMM_L1x8_LOOP_END:
  875. /* Tail: one more KERNEL1x8_2 with arguments 0, 1, then return to the caller. */
  876. KERNEL1x8_2 0, 1
  877. blr
  878. MY_ALIGN
  879. ZGEMM_1x4_LMAIN_SUB:
  880. /* 1x4 main-loop subroutine. Entry: T8 = pass count, copied into CTR; returns with blr. */
  881. mtctr T8
  882. MY_ALIGN
  883. ZGEMM_L1x4_LOOP:
  884. /* One pass = 16 KERNEL1x4_2 invocations (first argument 0..15). */
  885. KERNEL1x4_2 0, 0
  886. ZGEMM_L1x4_K32:
  887. /* Secondary entry label placed after invocation 0; its callers are outside this view. */
  888. KERNEL1x4_2 1, 0
  889. KERNEL1x4_2 2, 0
  890. KERNEL1x4_2 3, 0
  891. KERNEL1x4_2 4, 0
  892. KERNEL1x4_2 5, 0
  893. KERNEL1x4_2 6, 0
  894. KERNEL1x4_2 7, 0
  895. KERNEL1x4_2 8, 0
  896. KERNEL1x4_2 9, 0
  897. KERNEL1x4_2 10, 0
  898. KERNEL1x4_2 11, 0
  899. KERNEL1x4_2 12, 0
  900. KERNEL1x4_2 13, 0
  901. KERNEL1x4_2 14, 0
  902. KERNEL1x4_2 15, 1
  903. bdnz ZGEMM_L1x4_LOOP
  904. MY_ALIGN
  905. ZGEMM_L1x4_LOOP_END:
  906. /* Tail: one more KERNEL1x4_2 with arguments 0, 1, then return to the caller. */
  907. KERNEL1x4_2 0, 1
  908. blr
  909. MY_ALIGN
  910. ZGEMM_1x2_LMAIN_SUB:
  911. /* 1x2 main-loop subroutine. Entry: T8 = pass count, copied into CTR; returns with blr. */
  912. mtctr T8
  913. MY_ALIGN
  914. ZGEMM_L1x2_LOOP:
  915. /* One pass = 16 KERNEL1x2_2 invocations (first argument 0..15). */
  916. KERNEL1x2_2 0, 0
  917. ZGEMM_L1x2_K32:
  918. /* Secondary entry label placed after invocation 0; its callers are outside this view. */
  919. KERNEL1x2_2 1, 0
  920. KERNEL1x2_2 2, 0
  921. KERNEL1x2_2 3, 0
  922. KERNEL1x2_2 4, 0
  923. KERNEL1x2_2 5, 0
  924. KERNEL1x2_2 6, 0
  925. KERNEL1x2_2 7, 0
  926. KERNEL1x2_2 8, 0
  927. KERNEL1x2_2 9, 0
  928. KERNEL1x2_2 10, 0
  929. KERNEL1x2_2 11, 0
  930. KERNEL1x2_2 12, 0
  931. KERNEL1x2_2 13, 0
  932. KERNEL1x2_2 14, 0
  933. KERNEL1x2_2 15, 1
  934. bdnz ZGEMM_L1x2_LOOP
  935. MY_ALIGN
  936. ZGEMM_L1x2_LOOP_END:
  937. /* Tail: one more KERNEL1x2_2 with arguments 0, 1, then return to the caller. */
  938. KERNEL1x2_2 0, 1
  939. blr
  940. MY_ALIGN
  941. ZGEMM_1x1_LMAIN_SUB:
  942. /* 1x1 main-loop subroutine. Entry: T8 = pass count, copied into CTR; LOAD1x1_2 runs once before the loop. */
  943. mtctr T8
  944. LOAD1x1_2
  945. MY_ALIGN
  946. ZGEMM_L1x1_LOOP:
  947. /* One pass = 16 KERNEL1x1_L2 invocations (third argument 0..15, fourth argument 1 only on the last). */
  948. KERNEL1x1_L2 32, 32, 0, 0
  949. ZGEMM_L1x1_K32:
  950. /* Secondary entry label placed after invocation 0; its callers are outside this view. */
  951. KERNEL1x1_L2 32, 32, 1, 0
  952. KERNEL1x1_L2 32, 32, 2, 0
  953. KERNEL1x1_L2 32, 32, 3, 0
  954. KERNEL1x1_L2 32, 32, 4, 0
  955. KERNEL1x1_L2 32, 32, 5, 0
  956. KERNEL1x1_L2 32, 32, 6, 0
  957. KERNEL1x1_L2 32, 32, 7, 0
  958. KERNEL1x1_L2 32, 32, 8, 0
  959. KERNEL1x1_L2 32, 32, 9, 0
  960. KERNEL1x1_L2 32, 32, 10, 0
  961. KERNEL1x1_L2 32, 32, 11, 0
  962. KERNEL1x1_L2 32, 32, 12, 0
  963. KERNEL1x1_L2 32, 32, 13, 0
  964. KERNEL1x1_L2 32, 32, 14, 0
  965. KERNEL1x1_L2 32, 32, 15, 1
  966. bdnz ZGEMM_L1x1_LOOP
  967. MY_ALIGN
  968. ZGEMM_L1x1_LOOP_END:
  969. /* Tail: END1x1_2, then return to the caller. */
  970. END1x1_2
  971. blr
  972. MY_ALIGN
  973. /*----------------------N1 BEGINS---------*/
  974. ZGEMM_L1:
  975. /* N1 section: runs only when bit 0 of N is set, otherwise falls straight through to ZGEMM_L1_END. */
  976. andi. T1, N, 1 /* T1 = N & 1; the record form sets CR0 */
  977. ble ZGEMM_L1_END /* result is never negative, so this branches exactly when N is even */
  978. ZGEMM_L1_BEGIN:
  979. /* Set up the working pointers for this section, then split M into groups of 8. */
  980. mr CO, C /* CO = working copy of C */
  981. add T2,C,LDC /* T2 = C + LDC, used only as a prefetch address below */
  982. mr AO, A /* AO = working copy of A */
  983. add C, C, T1 /* C += T1, and T1 is 1 on this path. NOTE(review): this advances C by 1, not by a stride; C is not read again in the visible part of this section -- confirm it is harmless */
  984. #if defined(TRMMKERNEL) && defined(LEFT)
  985. mr TEMP_REG, OFFSET /*off = offset;*/
  986. #endif
  987. srawi. I, M, 3 /* I = M >> 3 = number of 8-wide tiles; sets CR0 */
  988. ble ZGEMM_L1x8_END /* no full 8-wide tile: skip the 1x8 loop */
  989. dcbt CO,r0 /*just prefetch*/
  990. dcbt T2,r0
  991. ZGEMM_L1x8_BEGIN:
  992. /* Per-tile setup for one 1x8 tile, re-entered from ZGEMM_L1x8_SAVE while I > 0. Picks one of three paths from the inner length (K, or T6 under TRMMKERNEL): counted main loop, exact 128/129 special cases, or the bit-ladder at ZGEMM_L1x8_SUB2. */
  993. #if defined(TRMMKERNEL)
  994. REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1
  995. #else
  996. mr BO, B
  997. dcbt B, r0
  998. #endif
  999. dcbt AO, r0
  1000. #if defined(TRMMKERNEL)
  1001. REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1
  1002. mr T1, T6
  1003. /* TEMPS FOR PREFETCH */
  1004. li T2, 1024
  1005. li T3, 1024+512
  1006. addi T1,T1, -2
  1007. /* TEMPS FOR PREFETCH */
  1008. li T4, 2048
  1009. li T5, 2048+512
  1010. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7: an arithmetic shift, i.e. a divide by 128, not a modulo; CR0 is consumed by the ble after the macro below */
  1011. #else
  1012. mr T1, K
  1013. /* TEMPS FOR PREFETCH */
  1014. li T2, 1024
  1015. li T3, 1024+512
  1016. addi T1,T1, -2
  1017. /* TEMPS FOR PREFETCH */
  1018. li T4, 2048
  1019. li T5, 2048+512
  1020. srawi. T8, T1, 7 /* T8 = (K-2) >> 7: an arithmetic shift, i.e. a divide by 128, not a modulo; CR0 is consumed by the ble after the macro below */
  1021. #endif
  1022. KERNEL1x8_ZERO_AND_PRIME_MMA /* sits between srawi. and ble, so it is assumed to leave CR0 untouched -- macro defined elsewhere, TODO confirm */
  1023. ble ZGEMM_L1x8_SUB0 /* T8 <= 0: too short for the counted main loop */
  1024. bl ZGEMM_L1x8_LMAIN_SUB /* counted main loop; reads its pass count from T8 */
  1025. andi. L, T1, 127 /* L = (length - 2) & 127 = steps left after the main loop */
  1026. ble ZGEMM_L1x8_SAVE /* nothing left */
  1027. b ZGEMM_L1x8_SUB2
  1028. ZGEMM_L1x8_SUB0:
  1029. /* Short-length path, reached when (length - 2) >> 7 <= 0. Lengths 129 and 128 are special-cased through the main-loop body; everything else goes to the bit-ladder. */
  1030. #if defined(TRMMKERNEL)
  1031. andi. L, T6, 255
  1032. cmpwi T6, 129
  1033. #else
  1034. andi. L, K, 255
  1035. cmpwi K, 129
  1036. #endif
  1037. li T8, 1 /* pass count 1 for both special cases below; li does not alter the compare result */
  1038. bne CMP1x8_128K /* length != 129 */
  1039. LOAD_END_1x8 -128, -16
  1040. mtctr T8
  1041. bl ZGEMM_L1x8_K128 /* enter the main-loop body at its secondary entry label (defined outside this section) */
  1042. b ZGEMM_L1x8_SAVE
  1043. CMP1x8_128K:
  1044. /* Length exactly 128: rewind BO by 32 and AO by 256 bytes, then run the main-loop body once from its K128 entry. */
  1045. #if defined(TRMMKERNEL)
  1046. cmpwi T6, 128
  1047. #else
  1048. cmpwi K, 128
  1049. #endif
  1050. bne ZGEMM_L1x8_SUB2 /* any other short length: bit-ladder on L */
  1051. MY_ALIGN
  1052. mtctr T8
  1053. addi BO, BO, -32
  1054. addi AO,AO, -256
  1055. bl ZGEMM_L1x8_K128
  1056. b ZGEMM_L1x8_SAVE
  1057. MY_ALIGN
  1058. ZGEMM_L1x8_SUB2:
  1059. /* Remainder ladder for the 1x8 tile: tests bits 64,32,16,8,4,2,1 of L in turn. A set bit of value b runs b/2 KERNEL1x8_2 invocations (so each invocation presumably covers two steps -- macro defined elsewhere); bit 1 runs LOAD_END_1x8. T1 is scratch here. */
  1060. andi. T1,L, 64
  1061. ble ZGEMM_L1x8_SUB2_32 /* bit 64 clear */
  1062. dcbt AO, PRE
  1063. dcbt BO, PRE
  1064. KERNEL1x8_2 0, 0
  1065. KERNEL1x8_2 1, 0
  1066. dcbt AO, T2 /* prefetch hints at the offsets loaded into T2..T5 during tile setup */
  1067. KERNEL1x8_2 2, 0
  1068. KERNEL1x8_2 3, 0
  1069. dcbt AO, T3
  1070. dcbt BO, T2
  1071. KERNEL1x8_2 4, 0
  1072. KERNEL1x8_2 5, 0
  1073. dcbt AO, T4
  1074. KERNEL1x8_2 6, 0
  1075. KERNEL1x8_2 7, 0
  1076. dcbt AO, T5
  1077. dcbt BO, T3
  1078. KERNEL1x8_2 8, 0
  1079. KERNEL1x8_2 9, 0
  1080. KERNEL1x8_2 10, 0
  1081. KERNEL1x8_2 11, 0
  1082. dcbt BO, T4
  1083. KERNEL1x8_2 12, 0
  1084. KERNEL1x8_2 13, 0
  1085. KERNEL1x8_2 14, 0
  1086. KERNEL1x8_2 15, 0
  1087. KERNEL1x8_2 16, 0
  1088. KERNEL1x8_2 17, 0
  1089. KERNEL1x8_2 18, 0
  1090. KERNEL1x8_2 19, 0
  1091. KERNEL1x8_2 20, 0
  1092. KERNEL1x8_2 21, 0
  1093. KERNEL1x8_2 22, 0
  1094. KERNEL1x8_2 23, 0
  1095. KERNEL1x8_2 24, 0
  1096. KERNEL1x8_2 25, 0
  1097. KERNEL1x8_2 26, 0
  1098. KERNEL1x8_2 27, 0
  1099. KERNEL1x8_2 28, 0
  1100. KERNEL1x8_2 29, 0
  1101. KERNEL1x8_2 30, 0
  1102. KERNEL1x8_2 31, 1 /* 32nd and last invocation of the bit-64 group */
  1103. MY_ALIGN
  1104. ZGEMM_L1x8_SUB2_32:
  1105. /* Bit 32 of L: 16 invocations, same prefetch pattern as above. */
  1106. andi. T1,L, 32
  1107. ble ZGEMM_L1x8_SUB2_16
  1108. dcbt AO, PRE
  1109. dcbt BO, PRE
  1110. KERNEL1x8_2 0, 0
  1111. KERNEL1x8_2 1, 0
  1112. dcbt AO, T2
  1113. KERNEL1x8_2 2, 0
  1114. KERNEL1x8_2 3, 0
  1115. dcbt AO, T3
  1116. dcbt BO, T2
  1117. KERNEL1x8_2 4, 0
  1118. KERNEL1x8_2 5, 0
  1119. dcbt AO, T4
  1120. KERNEL1x8_2 6, 0
  1121. KERNEL1x8_2 7, 0
  1122. dcbt AO, T5
  1123. dcbt BO, T3
  1124. KERNEL1x8_2 8, 0
  1125. KERNEL1x8_2 9, 0
  1126. KERNEL1x8_2 10, 0
  1127. KERNEL1x8_2 11, 0
  1128. dcbt BO, T4
  1129. KERNEL1x8_2 12, 0
  1130. KERNEL1x8_2 13, 0
  1131. KERNEL1x8_2 14, 0
  1132. KERNEL1x8_2 15, 1
  1133. MY_ALIGN
  1134. ZGEMM_L1x8_SUB2_16:
  1135. /* Bit 16 of L: 8 invocations. */
  1136. andi. T1,L, 16
  1137. ble ZGEMM_L1x8_SUB2_8
  1138. dcbt AO, PRE
  1139. dcbt BO, PRE
  1140. KERNEL1x8_2 0, 0
  1141. KERNEL1x8_2 1, 0
  1142. dcbt AO, T2
  1143. KERNEL1x8_2 2, 0
  1144. KERNEL1x8_2 3, 0
  1145. dcbt AO, T3
  1146. dcbt BO, T2
  1147. KERNEL1x8_2 4, 0
  1148. KERNEL1x8_2 5, 0
  1149. dcbt AO, T4
  1150. KERNEL1x8_2 6, 0
  1151. KERNEL1x8_2 7, 1
  1152. MY_ALIGN
  1153. ZGEMM_L1x8_SUB2_8:
  1154. /* Bit 8 of L: 4 invocations, no prefetch hints from here down. */
  1155. andi. T1,L, 8
  1156. ble ZGEMM_L1x8_SUB2_4
  1157. KERNEL1x8_2 0, 0
  1158. KERNEL1x8_2 1, 0
  1159. KERNEL1x8_2 2, 0
  1160. KERNEL1x8_2 3, 1
  1161. MY_ALIGN
  1162. ZGEMM_L1x8_SUB2_4:
  1163. /* Bit 4 of L: 2 invocations. */
  1164. andi. T1,L, 4
  1165. ble ZGEMM_L1x8_SUB2_2
  1166. KERNEL1x8_2 0, 0
  1167. KERNEL1x8_2 1, 1
  1168. MY_ALIGN
  1169. ZGEMM_L1x8_SUB2_2:
  1170. /* Bit 2 of L: 1 invocation. */
  1171. andi. T1,L, 2
  1172. ble ZGEMM_L1x8_SUB2_1
  1173. KERNEL1x8_2 0, 1
  1174. MY_ALIGN
  1175. ZGEMM_L1x8_SUB2_1:
  1176. /* Bit 1 of L: single trailing step via LOAD_END_1x8, then fall through into ZGEMM_L1x8_SAVE. */
  1177. andi. T1,L, 1
  1178. ble ZGEMM_L1x8_SAVE
  1179. LOAD_END_1x8 128, 16
  1180. ZGEMM_L1x8_SAVE:
  1181. /* Finish one 1x8 tile (unprime + save), loop back while tiles remain, then dispatch on the low three bits of M. */
  1182. addic. I, I, -1 /* I = I - 1; CR0 from this is consumed by the bgt below */
  1183. KERNEL1x8_UNPRIME_MMA
  1184. SAVE1x8
  1185. #if defined(TRMMKERNEL)
  1186. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1
  1187. #endif
  1188. bgt ZGEMM_L1x8_BEGIN /* more 8-wide tiles left; assumes the macros above leave CR0 untouched -- defined elsewhere, TODO confirm */
  1189. andi. T2, M, 7
  1190. ble ZGEMM_L1x1_END /* M is a multiple of 8: no narrower tiles to do */
  1191. andi. T1, M, 4
  1192. ble ZGEMM_L1x4_END /* no 4-wide tile: continue at the 1x2 test */
  1193. b ZGEMM_L1x4_BEGIN /* ZGEMM_L1x4_BEGIN repeats the same two tests before doing the tile */
  1194. MY_ALIGN
  1195. ZGEMM_L1x8_END:
  1196. /* Target used when M >> 3 is zero; falls straight into the 1x4 tile. */
  1197. ZGEMM_L1x4_BEGIN:
  1198. /* 1x4 tile, done at most once (bit 2 of M). Same three-way split on the inner length as the 1x8 tile, but with a 32-step main-loop pass. */
  1199. andi. T2, M, 7
  1200. ble ZGEMM_L1x1_END /* M & 7 == 0: no narrower tiles at all */
  1201. andi. T1, M, 4
  1202. ble ZGEMM_L1x4_END /* bit 2 of M clear: skip to the 1x2 test */
  1203. #if defined(TRMMKERNEL)
  1204. REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1
  1205. #else
  1206. mr BO, B
  1207. #endif
  1208. #if defined(TRMMKERNEL)
  1209. REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1
  1210. mr T1, T6
  1211. addi T1,T1, -2
  1212. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1213. #else
  1214. mr T1, K
  1215. addi T1,T1, -2
  1216. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1217. #endif
  1218. KERNEL1x4_ZERO_AND_PRIME_MMA /* sits between srawi. and ble, so it is assumed to leave CR0 untouched -- macro defined elsewhere, TODO confirm */
  1219. ble ZGEMM_L1x4_SUB0 /* T8 <= 0: too short for the counted main loop */
  1220. bl ZGEMM_1x4_LMAIN_SUB /* counted main loop; reads its pass count from T8 */
  1221. andi. L, T1, 31 /* L = (length - 2) & 31 = steps left after the main loop */
  1222. ble ZGEMM_L1x4_SAVE
  1223. b ZGEMM_L1x4_SUB2
  1224. ZGEMM_L1x4_SUB0:
  1225. /* Short-length path, reached when (length - 2) >> 5 <= 0. Lengths 33 and 32 are special-cased through the main-loop body; everything else goes to the bit-ladder. */
  1226. #if defined(TRMMKERNEL)
  1227. andi. L, T6, 63
  1228. cmpwi T6, 33
  1229. #else
  1230. andi. L, K, 63
  1231. cmpwi K, 33
  1232. #endif
  1233. li T8, 1 /* pass count 1 for both special cases below; li does not alter the compare result */
  1234. bne CMP1x4_32K /* length != 33 */
  1235. LOAD_END_1x4 -64, -16
  1236. mtctr T8
  1237. bl ZGEMM_L1x4_K32 /* enter the main-loop body at its secondary entry label */
  1238. b ZGEMM_L1x4_SAVE
  1239. CMP1x4_32K:
  1240. /* Length exactly 32: rewind BO by 32 and AO by 128 bytes, then run the main-loop body once from its K32 entry. */
  1241. #if defined(TRMMKERNEL)
  1242. cmpwi T6, 32
  1243. #else
  1244. cmpwi K, 32
  1245. #endif
  1246. bne ZGEMM_L1x4_SUB2 /* any other short length: bit-ladder on L */
  1247. MY_ALIGN
  1248. mtctr T8
  1249. addi BO, BO, -32
  1250. addi AO,AO, -128
  1251. bl ZGEMM_L1x4_K32
  1252. b ZGEMM_L1x4_SAVE
  1253. MY_ALIGN
  1254. MY_ALIGN
  1255. ZGEMM_L1x4_SUB2:
  1256. /* Remainder ladder for the 1x4 tile: tests bits 16,8,4,2,1 of L in turn. A set bit of value b runs b/2 KERNEL1x4_2 invocations; bit 1 runs LOAD_END_1x4. T1 is scratch here. */
  1257. andi. T1,L, 16
  1258. ble ZGEMM_L1x4_SUB2_8 /* bit 16 clear */
  1259. KERNEL1x4_2 0, 0
  1260. KERNEL1x4_2 1, 0
  1261. KERNEL1x4_2 2, 0
  1262. KERNEL1x4_2 3, 0
  1263. KERNEL1x4_2 4, 0
  1264. KERNEL1x4_2 5, 0
  1265. KERNEL1x4_2 6, 0
  1266. KERNEL1x4_2 7, 1
  1267. MY_ALIGN
  1268. ZGEMM_L1x4_SUB2_8:
  1269. /* Bit 8 of L: 4 invocations. */
  1270. andi. T1,L, 8
  1271. ble ZGEMM_L1x4_SUB2_4
  1272. KERNEL1x4_2 0, 0
  1273. KERNEL1x4_2 1, 0
  1274. KERNEL1x4_2 2, 0
  1275. KERNEL1x4_2 3, 1
  1276. MY_ALIGN
  1277. ZGEMM_L1x4_SUB2_4:
  1278. /* Bit 4 of L: 2 invocations. */
  1279. andi. T1,L, 4
  1280. ble ZGEMM_L1x4_SUB2_2
  1281. KERNEL1x4_2 0, 0
  1282. KERNEL1x4_2 1, 1
  1283. MY_ALIGN
  1284. ZGEMM_L1x4_SUB2_2:
  1285. /* Bit 2 of L: 1 invocation. */
  1286. andi. T1,L, 2
  1287. ble ZGEMM_L1x4_SUB2_1
  1288. KERNEL1x4_2 0, 1
  1289. MY_ALIGN
  1290. ZGEMM_L1x4_SUB2_1:
  1291. /* Bit 1 of L: single trailing step via LOAD_END_1x4. */
  1292. andi. T1,L, 1
  1293. ble ZGEMM_L1x4_SAVE
  1294. LOAD_END_1x4 64,16
  1295. ZGEMM_L1x4_SAVE:
  1296. /* Finish the 1x4 tile: unprime, save, and (TRMM only) refresh the bookkeeping. No loop back -- execution falls through to the 1x2 test. */
  1297. KERNEL1x4_UNPRIME_MMA
  1298. SAVE1x4
  1299. #if defined(TRMMKERNEL)
  1300. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1
  1301. #endif
  1302. ZGEMM_L1x4_END:
  1303. /* Target used when bit 2 of M is clear; falls straight into the 1x2 tile. */
  1304. ZGEMM_L1x2_BEGIN:
  1305. /* 1x2 tile, done at most once (bit 1 of M). Same three-way split on the inner length as the 1x4 tile. */
  1306. andi. T1, M, 2
  1307. ble ZGEMM_L1x2_END /* bit 1 of M clear: skip to the 1x1 test */
  1308. #if defined(TRMMKERNEL)
  1309. REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1
  1310. #else
  1311. mr BO, B
  1312. #endif
  1313. #if defined(TRMMKERNEL)
  1314. REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1
  1315. mr T1, T6
  1316. addi T1,T1, -2
  1317. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1318. #else
  1319. mr T1, K
  1320. addi T1,T1, -2
  1321. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1322. #endif
  1323. KERNEL1x2_ZERO_AND_PRIME_MMA /* sits between srawi. and ble, so it is assumed to leave CR0 untouched -- macro defined elsewhere, TODO confirm */
  1324. ble ZGEMM_L1x2_SUB0 /* T8 <= 0: too short for the counted main loop */
  1325. bl ZGEMM_1x2_LMAIN_SUB /* counted main loop; reads its pass count from T8 */
  1326. andi. L, T1, 31 /* L = (length - 2) & 31 = steps left after the main loop */
  1327. ble ZGEMM_L1x2_SAVE
  1328. b ZGEMM_L1x2_SUB2
  1329. ZGEMM_L1x2_SUB0:
  1330. /* Short-length path, reached when (length - 2) >> 5 <= 0. Lengths 33 and 32 are special-cased through the main-loop body; everything else goes to the bit-ladder. */
  1331. #if defined(TRMMKERNEL)
  1332. andi. L, T6, 63
  1333. cmpwi T6, 33
  1334. #else
  1335. andi. L, K, 63
  1336. cmpwi K, 33
  1337. #endif
  1338. li T8, 1 /* pass count 1 for both special cases below; li does not alter the compare result */
  1339. bne CMP1x2_32K /* length != 33 */
  1340. LOAD_END_1x2 -32, -16
  1341. mtctr T8
  1342. bl ZGEMM_L1x2_K32 /* enter the main-loop body at its secondary entry label */
  1343. b ZGEMM_L1x2_SAVE
  1344. CMP1x2_32K:
  1345. /* Length exactly 32: rewind BO by 32 and AO by 64 bytes, then run the main-loop body once from its K32 entry. */
  1346. #if defined(TRMMKERNEL)
  1347. cmpwi T6, 32
  1348. #else
  1349. cmpwi K, 32
  1350. #endif
  1351. bne ZGEMM_L1x2_SUB2 /* any other short length: bit-ladder on L */
  1352. MY_ALIGN
  1353. mtctr T8
  1354. addi BO, BO, -32
  1355. addi AO,AO, -64
  1356. bl ZGEMM_L1x2_K32
  1357. b ZGEMM_L1x2_SAVE
  1358. MY_ALIGN
  1359. MY_ALIGN
  1360. ZGEMM_L1x2_SUB2:
  1361. /* Remainder ladder for the 1x2 tile: tests bits 16,8,4,2,1 of L in turn. A set bit of value b runs b/2 KERNEL1x2_2 invocations; bit 1 runs LOAD_END_1x2. T1 is scratch here. */
  1362. andi. T1,L, 16
  1363. ble ZGEMM_L1x2_SUB2_8 /* bit 16 clear */
  1364. KERNEL1x2_2 0, 0
  1365. KERNEL1x2_2 1, 0
  1366. KERNEL1x2_2 2, 0
  1367. KERNEL1x2_2 3, 0
  1368. KERNEL1x2_2 4, 0
  1369. KERNEL1x2_2 5, 0
  1370. KERNEL1x2_2 6, 0
  1371. KERNEL1x2_2 7, 1
  1372. MY_ALIGN
  1373. ZGEMM_L1x2_SUB2_8:
  1374. /* Bit 8 of L: 4 invocations. */
  1375. andi. T1,L, 8
  1376. ble ZGEMM_L1x2_SUB2_4
  1377. KERNEL1x2_2 0, 0
  1378. KERNEL1x2_2 1, 0
  1379. KERNEL1x2_2 2, 0
  1380. KERNEL1x2_2 3, 1
  1381. MY_ALIGN
  1382. ZGEMM_L1x2_SUB2_4:
  1383. /* Bit 4 of L: 2 invocations. */
  1384. andi. T1,L, 4
  1385. ble ZGEMM_L1x2_SUB2_2
  1386. KERNEL1x2_2 0, 0
  1387. KERNEL1x2_2 1, 1
  1388. MY_ALIGN
  1389. ZGEMM_L1x2_SUB2_2:
  1390. /* Bit 2 of L: 1 invocation. */
  1391. andi. T1,L, 2
  1392. ble ZGEMM_L1x2_SUB2_1
  1393. KERNEL1x2_2 0, 1
  1394. MY_ALIGN
  1395. ZGEMM_L1x2_SUB2_1:
  1396. /* Bit 1 of L: single trailing step via LOAD_END_1x2. */
  1397. andi. T1,L, 1
  1398. ble ZGEMM_L1x2_SAVE
  1399. LOAD_END_1x2 32,16
  1400. ZGEMM_L1x2_SAVE:
  1401. /* Finish the 1x2 tile: unprime, save, and (TRMM only) refresh the bookkeeping. Execution falls through to the 1x1 test. */
  1402. KERNEL1x2_UNPRIME_MMA
  1403. SAVE1x2
  1404. #if defined(TRMMKERNEL)
  1405. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1
  1406. #endif
  1407. ZGEMM_L1x2_END:
  1408. /* Target used when bit 1 of M is clear; falls straight into the 1x1 tile. */
  1409. ZGEMM_L1x1_BEGIN:
  1410. /* 1x1 tile, done at most once (bit 0 of M). Same three-way split on the inner length as the wider tiles, but uses ZERO1x1 and explicit LOAD1x1* macros instead of the *_PRIME_MMA / LOAD_END_* macros. */
  1411. andi. T1, M, 1
  1412. ble ZGEMM_L1x1_END /* bit 0 of M clear: nothing left in this section */
  1413. #if defined(TRMMKERNEL)
  1414. REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1
  1415. #else
  1416. mr BO, B
  1417. #endif
  1418. #if defined(TRMMKERNEL)
  1419. REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1
  1420. mr T1, T6
  1421. addi T1,T1, -2
  1422. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1423. #else
  1424. mr T1, K
  1425. addi T1,T1, -2
  1426. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: an arithmetic shift, i.e. a divide by 32, not a modulo; CR0 is consumed by the ble after the macro below */
  1427. #endif
  1428. ZERO1x1 /* sits between srawi. and ble, so it is assumed to leave CR0 untouched -- macro defined elsewhere, TODO confirm */
  1429. ble ZGEMM_L1x1_SUB0 /* T8 <= 0: too short for the counted main loop */
  1430. bl ZGEMM_1x1_LMAIN_SUB /* counted main loop; reads its pass count from T8 */
  1431. andi. L, T1, 31 /* L = (length - 2) & 31 = steps left after the main loop */
  1432. ble ZGEMM_L1x1_SAVE
  1433. b ZGEMM_L1x1_SUB2
  1434. ZGEMM_L1x1_SUB0:
  1435. /* Short-length path, reached when (length - 2) >> 5 <= 0. Lengths 33 and 32 are special-cased through the main-loop body; everything else goes to the bit-ladder. */
  1436. #if defined(TRMMKERNEL)
  1437. andi. L, T6, 63
  1438. cmpwi T6, 33
  1439. #else
  1440. andi. L, K, 63
  1441. cmpwi K, 33
  1442. #endif
  1443. li T8, 1 /* pass count 1 for both special cases below; li does not alter the compare result */
  1444. bne CMP1x1_32K /* length != 33 */
  1445. addi BO, BO, -16
  1446. addi AO,AO, -16
  1447. LOAD1x1O 16, 16
  1448. END1x1_WITHOUT_ADD
  1449. LOAD1x1_2O 32, 32 /* load issued here because the K32 entry skips the subroutine's own LOAD1x1_2 */
  1450. mtctr T8
  1451. bl ZGEMM_L1x1_K32 /* enter the main-loop body at its secondary entry label */
  1452. b ZGEMM_L1x1_SAVE
  1453. CMP1x1_32K:
  1454. /* Length exactly 32: rewind BO and AO by 32 bytes each, reload, then run the main-loop body once from its K32 entry. */
  1455. #if defined(TRMMKERNEL)
  1456. cmpwi T6, 32
  1457. #else
  1458. cmpwi K, 32
  1459. #endif
  1460. bne ZGEMM_L1x1_SUB2 /* any other short length: bit-ladder on L */
  1461. MY_ALIGN
  1462. mtctr T8
  1463. addi BO, BO, -32
  1464. addi AO,AO, -32
  1465. LOAD1x1_2O 32, 32
  1466. bl ZGEMM_L1x1_K32
  1467. b ZGEMM_L1x1_SAVE
  1468. MY_ALIGN
  1469. ZGEMM_L1x1_SUB2:
  1470. /* Remainder ladder for the 1x1 tile: tests bits 16,8,4,2,1 of L in turn. Each group for bits 16..2 starts with LOAD1x1_2, runs KERNEL1x1_L2 invocations and closes with a KERNEL1x1_E2 (b/2 invocations in total for bit value b); bit 1 runs KERNEL1x1. T1 is scratch here. */
  1471. andi. T1, L, 16
  1472. ble ZGEMM_L1x1_SUB2_8 /* bit 16 clear */
  1473. LOAD1x1_2
  1474. KERNEL1x1_L2 32, 32, 0, 0
  1475. KERNEL1x1_L2 32, 32, 1, 0
  1476. KERNEL1x1_L2 32, 32, 2, 0
  1477. KERNEL1x1_L2 32, 32, 3, 0
  1478. KERNEL1x1_L2 32, 32, 4, 0
  1479. KERNEL1x1_L2 32, 32, 5, 0
  1480. KERNEL1x1_L2 32, 32, 6, 0
  1481. KERNEL1x1_E2 32, 32, 7, 1 /* group ends with the _E2 variant rather than _L2 */
  1482. MY_ALIGN
  1483. ZGEMM_L1x1_SUB2_8:
  1484. /* Bit 8 of L: 4 invocations. */
  1485. andi. T1, L, 8
  1486. ble ZGEMM_L1x1_SUB2_4
  1487. LOAD1x1_2
  1488. KERNEL1x1_L2 32, 32, 0, 0
  1489. KERNEL1x1_L2 32, 32, 1, 0
  1490. KERNEL1x1_L2 32, 32, 2, 0
  1491. KERNEL1x1_E2 32, 32, 3, 1
  1492. MY_ALIGN
  1493. ZGEMM_L1x1_SUB2_4:
  1494. /* Bit 4 of L: 2 invocations. */
  1495. andi. T1,L, 4
  1496. ble ZGEMM_L1x1_SUB2_2
  1497. LOAD1x1_2
  1498. KERNEL1x1_L2 32, 32, 0, 0
  1499. KERNEL1x1_E2 32, 32, 1, 1
  1500. MY_ALIGN
  1501. ZGEMM_L1x1_SUB2_2:
  1502. /* Bit 2 of L: 1 invocation. */
  1503. andi. T1,L, 2
  1504. ble ZGEMM_L1x1_SUB2_1
  1505. LOAD1x1_2
  1506. KERNEL1x1_E2 32, 32, 0, 1
  1507. MY_ALIGN
  1508. ZGEMM_L1x1_SUB2_1:
  1509. /* Bit 1 of L: single trailing step via KERNEL1x1. */
  1510. andi. T1,L, 1
  1511. ble ZGEMM_L1x1_SAVE
  1512. KERNEL1x1
  1513. ZGEMM_L1x1_SAVE:
  1514. /* Finish the 1x1 tile: save, and (TRMM only) refresh the bookkeeping. */
  1515. SAVE1x1
  1516. #if defined(TRMMKERNEL)
  1517. REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1
  1518. #endif
  1519. ZGEMM_L1x1_END:
  1520. /* Common exit of all M-tiles in this section; for TRMM without LEFT, TEMP_REG is bumped by 1 here. */
  1521. #if defined(TRMMKERNEL) && !defined(LEFT)
  1522. addi TEMP_REG, TEMP_REG, 1
  1523. #endif
  1524. ZGEMM_L1_END:
  1525. /*----------------------------------------*/