You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_logic_power9.S 43 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891
  1. /***************************************************************************
  2. Copyright (c) 2013-2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /* MY_ALIGN is the alignment directive placed in front of many labels below.
     NOTE(review): on PowerPC gas `.align 3` should mean 2^3 = 8 bytes --
     confirm for the assembler in use. */
  27. #define MY_ALIGN .align 3
  /* Jump over the mini subroutines that follow and start at ZGEMM_L2. */
  28. b ZGEMM_L2
  29. /* MINI SUBROUTINES */
  30. /* 2x8 MAIN 128x+2 LOOP */
  31. ZGEMM_L2x8_LMAIN_SUB:
  32. /*----------------------------------------*/
  /* 2x8 main loop; reached with `bl` from the 2x8 tile code and left with blr.
     In: T8 = number of loop passes (moved into CTR); AO, BO = base registers
     for the dcbt hints; PRE, T2..T5 = dcbt index offsets.
     Each pass issues 64 KERNEL2x8_L2 invocations (index 0..63).
     NOTE(review): the "128x+2" header suggests every _L2 invocation covers two
     K steps and LOAD2x8_2/END2x8_2 cover the extra two -- the macros are
     defined elsewhere, confirm there. */
  33. mtctr T8
  34. LOAD2x8_2
  35. MY_ALIGN
  36. ZGEMM_L2x8_LOOP:
  37. /*----------------------------------------*/
  38. dcbt AO, PRE
  39. dcbt BO, PRE
  40. KERNEL2x8_L2 256,64,0,0
  41. ZGEMM_L2x8_K128:
  42. /*----------------------------------------*/
  /* Secondary entry point: ZGEMM_L2x8_SUB0 and CMP2x8_128K `bl` here after
     doing their own loads, which skips the index-0 invocation above. */
  43. KERNEL2x8_L2 256,64,1,0
  44. dcbt AO, T2
  45. KERNEL2x8_L2 256,64,2,0
  46. KERNEL2x8_L2 256,64,3,0
  47. dcbt AO, T3
  48. dcbt BO, T2
  49. KERNEL2x8_L2 256,64,4,0
  50. KERNEL2x8_L2 256,64,5,0
  51. dcbt AO, T4
  52. KERNEL2x8_L2 256,64,6,0
  53. KERNEL2x8_L2 256,64,7,0
  54. dcbt AO, T5
  55. dcbt BO, T3
  56. KERNEL2x8_L2 256,64,8,0
  57. KERNEL2x8_L2 256,64,9,0
  58. KERNEL2x8_L2 256,64,10,0
  59. KERNEL2x8_L2 256,64,11,0
  60. dcbt BO, T4
  61. KERNEL2x8_L2 256,64,12,0
  62. KERNEL2x8_L2 256,64,13,0
  63. KERNEL2x8_L2 256,64,14,0
  64. KERNEL2x8_L2 256,64,15,0
  65. KERNEL2x8_L2 256,64,16,0
  66. KERNEL2x8_L2 256,64,17,0
  67. KERNEL2x8_L2 256,64,18,0
  68. KERNEL2x8_L2 256,64,19,0
  69. KERNEL2x8_L2 256,64,20,0
  70. KERNEL2x8_L2 256,64,21,0
  71. KERNEL2x8_L2 256,64,22,0
  72. KERNEL2x8_L2 256,64,23,0
  73. KERNEL2x8_L2 256,64,24,0
  74. KERNEL2x8_L2 256,64,25,0
  75. KERNEL2x8_L2 256,64,26,0
  76. KERNEL2x8_L2 256,64,27,0
  77. KERNEL2x8_L2 256,64,28,0
  78. KERNEL2x8_L2 256,64,29,0
  79. KERNEL2x8_L2 256,64,30,0
  80. KERNEL2x8_L2 256,64,31,0
  81. KERNEL2x8_L2 256,64,32,0
  82. KERNEL2x8_L2 256,64,33,0
  83. KERNEL2x8_L2 256,64,34,0
  84. KERNEL2x8_L2 256,64,35,0
  85. KERNEL2x8_L2 256,64,36,0
  86. KERNEL2x8_L2 256,64,37,0
  87. KERNEL2x8_L2 256,64,38,0
  88. KERNEL2x8_L2 256,64,39,0
  89. KERNEL2x8_L2 256,64,40,0
  90. KERNEL2x8_L2 256,64,41,0
  91. KERNEL2x8_L2 256,64,42,0
  92. KERNEL2x8_L2 256,64,43,0
  93. KERNEL2x8_L2 256,64,44,0
  94. KERNEL2x8_L2 256,64,45,0
  95. KERNEL2x8_L2 256,64,46,0
  96. KERNEL2x8_L2 256,64,47,0
  97. KERNEL2x8_L2 256,64,48,0
  98. KERNEL2x8_L2 256,64,49,0
  99. KERNEL2x8_L2 256,64,50,0
  100. KERNEL2x8_L2 256,64,51,0
  101. KERNEL2x8_L2 256,64,52,0
  102. KERNEL2x8_L2 256,64,53,0
  103. KERNEL2x8_L2 256,64,54,0
  104. KERNEL2x8_L2 256,64,55,0
  105. KERNEL2x8_L2 256,64,56,0
  106. KERNEL2x8_L2 256,64,57,0
  107. KERNEL2x8_L2 256,64,58,0
  108. KERNEL2x8_L2 256,64,59,0
  109. KERNEL2x8_L2 256,64,60,0
  110. KERNEL2x8_L2 256,64,61,0
  111. KERNEL2x8_L2 256,64,62,0
  /* Only the final invocation of a pass passes 1 as its last argument.
     NOTE(review): presumably this makes the macro advance AO/BO -- confirm. */
  112. KERNEL2x8_L2 256,64,63,1
  /* bdnz: decrement CTR and repeat the pass while CTR is non-zero. */
  113. bdnz ZGEMM_L2x8_LOOP
  114. MY_ALIGN
  115. ZGEMM_L2x8_LOOP_END:
  116. /*----------------------------------------*/
  117. END2x8_2
  118. blr
  119. MY_ALIGN
  120. ZGEMM_2x8_L64_SUB:
  121. /*----------------------------------------*/
  /* Straight-line 2x8 tail block: LOAD2x8_2, 31 KERNEL2x8_L2 invocations
     (index 0..30) and a closing KERNEL2x8_E2 (index 31). Called with `bl` from
     ZGEMM_L2x8_SUB2 when bit 64 of L is set; returns with blr.
     Uses the same dcbt offsets (PRE, T2..T5) as the main loop. */
  122. LOAD2x8_2
  123. dcbt AO, PRE
  124. dcbt BO, PRE
  125. KERNEL2x8_L2 256,64,0,0
  126. KERNEL2x8_L2 256,64,1,0
  127. dcbt AO, T2
  128. KERNEL2x8_L2 256,64,2,0
  129. KERNEL2x8_L2 256,64,3,0
  130. dcbt AO, T3
  131. dcbt BO, T2
  132. KERNEL2x8_L2 256,64,4,0
  133. KERNEL2x8_L2 256,64,5,0
  134. dcbt AO, T4
  135. KERNEL2x8_L2 256,64,6,0
  136. KERNEL2x8_L2 256,64,7,0
  137. dcbt AO, T5
  138. dcbt BO, T3
  139. KERNEL2x8_L2 256,64,8,0
  140. KERNEL2x8_L2 256,64,9,0
  141. KERNEL2x8_L2 256,64,10,0
  142. KERNEL2x8_L2 256,64,11,0
  143. dcbt BO, T4
  144. KERNEL2x8_L2 256,64,12,0
  145. KERNEL2x8_L2 256,64,13,0
  146. KERNEL2x8_L2 256,64,14,0
  147. KERNEL2x8_L2 256,64,15,0
  148. KERNEL2x8_L2 256,64,16,0
  149. KERNEL2x8_L2 256,64,17,0
  150. KERNEL2x8_L2 256,64,18,0
  151. KERNEL2x8_L2 256,64,19,0
  152. KERNEL2x8_L2 256,64,20,0
  153. KERNEL2x8_L2 256,64,21,0
  154. KERNEL2x8_L2 256,64,22,0
  155. KERNEL2x8_L2 256,64,23,0
  156. KERNEL2x8_L2 256,64,24,0
  157. KERNEL2x8_L2 256,64,25,0
  158. KERNEL2x8_L2 256,64,26,0
  159. KERNEL2x8_L2 256,64,27,0
  160. KERNEL2x8_L2 256,64,28,0
  161. KERNEL2x8_L2 256,64,29,0
  162. KERNEL2x8_L2 256,64,30,0
  /* NOTE(review): the _E2 variant closes the block; presumably it finishes the
     pipeline without loading further data -- confirm in the macro file. */
  163. KERNEL2x8_E2 256,64,31,1
  164. blr
  165. MY_ALIGN
  166. ZGEMM_2x8_L32_SUB:
  167. /*----------------------------------------*/
  /* Straight-line 2x8 tail block: LOAD2x8_2, 15 KERNEL2x8_L2 invocations
     (index 0..14) and a closing KERNEL2x8_E2 (index 15). Called with `bl` from
     ZGEMM_L2x8_SUB2_32 when bit 32 of L is set; returns with blr. */
  168. LOAD2x8_2
  169. dcbt AO, PRE
  170. dcbt BO, PRE
  171. KERNEL2x8_L2 256,64,0,0
  172. KERNEL2x8_L2 256,64,1,0
  173. dcbt AO, T2
  174. KERNEL2x8_L2 256,64,2,0
  175. KERNEL2x8_L2 256,64,3,0
  176. dcbt AO, T3
  177. dcbt BO, T2
  178. KERNEL2x8_L2 256,64,4,0
  179. KERNEL2x8_L2 256,64,5,0
  180. dcbt AO, T4
  181. KERNEL2x8_L2 256,64,6,0
  182. KERNEL2x8_L2 256,64,7,0
  183. dcbt AO, T5
  184. dcbt BO, T3
  185. KERNEL2x8_L2 256,64,8,0
  186. KERNEL2x8_L2 256,64,9,0
  187. KERNEL2x8_L2 256,64,10,0
  188. KERNEL2x8_L2 256,64,11,0
  189. dcbt BO, T4
  190. KERNEL2x8_L2 256,64,12,0
  191. KERNEL2x8_L2 256,64,13,0
  192. KERNEL2x8_L2 256,64,14,0
  193. KERNEL2x8_E2 256,64,15,1
  194. blr
  195. MY_ALIGN
  196. ZGEMM_2x8_L16_SUB:
  197. /*----------------------------------------*/
  /* Straight-line 2x8 tail block: LOAD2x8_2, 7 KERNEL2x8_L2 invocations
     (index 0..6) and a closing KERNEL2x8_E2 (index 7). Called with `bl` from
     ZGEMM_L2x8_SUB2_16 when bit 16 of L is set; returns with blr. */
  198. LOAD2x8_2
  199. dcbt AO, PRE
  200. dcbt BO, PRE
  201. KERNEL2x8_L2 256,64,0,0
  202. KERNEL2x8_L2 256,64,1,0
  203. dcbt AO, T2
  204. KERNEL2x8_L2 256,64,2,0
  205. KERNEL2x8_L2 256,64,3,0
  206. dcbt AO, T3
  207. dcbt BO, T2
  208. KERNEL2x8_L2 256,64,4,0
  209. KERNEL2x8_L2 256,64,5,0
  210. dcbt AO, T4
  211. KERNEL2x8_L2 256,64,6,0
  212. KERNEL2x8_E2 256,64,7,1
  213. blr
  214. MY_ALIGN
  215. ZGEMM_2x4_LMAIN_SUB:
  216. /*----------------------------------------*/
  /* 2x4 main loop; reached with `bl` from the 2x4 tile code, left with blr.
     In: T8 = number of loop passes (moved into CTR).
     Each pass issues 16 KERNEL2x4_L2 invocations (index 0..15).
     NOTE(review): the caller's (K-2) >> 5 pass count implies 32 K steps per
     pass plus 2 for LOAD2x4_2/END2x4_2 -- confirm against the macro file. */
  217. mtctr T8
  218. LOAD2x4_2
  219. MY_ALIGN
  220. ZGEMM_L2x4_LOOP:
  221. /*----------------------------------------*/
  222. KERNEL2x4_L2 128,64,0,0
  223. ZGEMM_L2x4_K32:
  224. /*----------------------------------------*/
  /* Secondary entry point used by ZGEMM_L2x4_SUB0 / CMP2x4_32K; entering here
     skips the index-0 invocation above. */
  225. KERNEL2x4_L2 128,64,1,0
  226. KERNEL2x4_L2 128,64,2,0
  227. KERNEL2x4_L2 128,64,3,0
  228. KERNEL2x4_L2 128,64,4,0
  229. KERNEL2x4_L2 128,64,5,0
  230. KERNEL2x4_L2 128,64,6,0
  231. KERNEL2x4_L2 128,64,7,0
  232. KERNEL2x4_L2 128,64,8,0
  233. KERNEL2x4_L2 128,64,9,0
  234. KERNEL2x4_L2 128,64,10,0
  235. KERNEL2x4_L2 128,64,11,0
  236. KERNEL2x4_L2 128,64,12,0
  237. KERNEL2x4_L2 128,64,13,0
  238. KERNEL2x4_L2 128,64,14,0
  239. KERNEL2x4_L2 128,64,15,1
  /* bdnz: decrement CTR and repeat the pass while CTR is non-zero. */
  240. bdnz ZGEMM_L2x4_LOOP
  241. MY_ALIGN
  242. ZGEMM_L2x4_LOOP_END:
  243. /*----------------------------------------*/
  244. END2x4_2
  245. blr
  246. MY_ALIGN
  247. ZGEMM_2x4_L16_SUB:
  248. /*----------------------------------------*/
  /* Straight-line 2x4 tail block: LOAD2x4_2, 7 KERNEL2x4_L2 invocations
     (index 0..6) and a closing KERNEL2x4_E2 (index 7). Called with `bl` from
     ZGEMM_L2x4_SUB2 when bit 16 of L is set; returns with blr. */
  249. LOAD2x4_2
  250. KERNEL2x4_L2 128,64,0,0
  251. KERNEL2x4_L2 128,64,1,0
  252. KERNEL2x4_L2 128,64,2,0
  253. KERNEL2x4_L2 128,64,3,0
  254. KERNEL2x4_L2 128,64,4,0
  255. KERNEL2x4_L2 128,64,5,0
  256. KERNEL2x4_L2 128,64,6,0
  257. KERNEL2x4_E2 128,64,7,1
  258. blr
  259. MY_ALIGN
  260. ZGEMM_2x4_L8_SUB:
  261. /*----------------------------------------*/
  /* Straight-line 2x4 tail block: LOAD2x4_2, 3 KERNEL2x4_L2 invocations
     (index 0..2) and a closing KERNEL2x4_E2 (index 3). Called with `bl` from
     ZGEMM_L2x4_SUB2_8 when bit 8 of L is set; returns with blr. */
  262. LOAD2x4_2
  263. KERNEL2x4_L2 128,64,0,0
  264. KERNEL2x4_L2 128,64,1,0
  265. KERNEL2x4_L2 128,64,2,0
  266. KERNEL2x4_E2 128,64,3,1
  267. blr
  268. ZGEMM_2x2_LMAIN_SUB:
  269. /*----------------------------------------*/
  /* 2x2 main loop; reached with `bl` from the 2x2 tile code, left with blr.
     In: T8 = number of loop passes (moved into CTR).
     Each pass issues 16 KERNEL2x2_L2 invocations (index 0..15).
     NOTE(review): unlike the 2x4 variant there is no MY_ALIGN in front of this
     label -- harmless, but inconsistent. */
  270. mtctr T8
  271. LOAD2x2_2
  272. MY_ALIGN
  273. ZGEMM_L2x2_LOOP:
  274. /*----------------------------------------*/
  275. KERNEL2x2_L2 64,64,0,0
  276. ZGEMM_L2x2_K32:
  277. /*----------------------------------------*/
  /* Secondary entry point used by ZGEMM_L2x2_SUB0 / CMP2x2_32K; entering here
     skips the index-0 invocation above. */
  278. KERNEL2x2_L2 64,64,1,0
  279. KERNEL2x2_L2 64,64,2,0
  280. KERNEL2x2_L2 64,64,3,0
  281. KERNEL2x2_L2 64,64,4,0
  282. KERNEL2x2_L2 64,64,5,0
  283. KERNEL2x2_L2 64,64,6,0
  284. KERNEL2x2_L2 64,64,7,0
  285. KERNEL2x2_L2 64,64,8,0
  286. KERNEL2x2_L2 64,64,9,0
  287. KERNEL2x2_L2 64,64,10,0
  288. KERNEL2x2_L2 64,64,11,0
  289. KERNEL2x2_L2 64,64,12,0
  290. KERNEL2x2_L2 64,64,13,0
  291. KERNEL2x2_L2 64,64,14,0
  292. KERNEL2x2_L2 64,64,15,1
  /* bdnz: decrement CTR and repeat the pass while CTR is non-zero. */
  293. bdnz ZGEMM_L2x2_LOOP
  294. MY_ALIGN
  295. ZGEMM_L2x2_LOOP_END:
  296. /*----------------------------------------*/
  297. END2x2_2
  298. blr
  299. MY_ALIGN
  300. ZGEMM_2x2_L16_SUB:
  301. /*----------------------------------------*/
  /* Straight-line 2x2 tail block: LOAD2x2_2, 7 KERNEL2x2_L2 invocations
     (index 0..6) and a closing KERNEL2x2_E2 (index 7). Called with `bl` from
     ZGEMM_L2x2_SUB2 when bit 16 of L is set; returns with blr. */
  302. LOAD2x2_2
  303. KERNEL2x2_L2 64,64,0,0
  304. KERNEL2x2_L2 64,64,1,0
  305. KERNEL2x2_L2 64,64,2,0
  306. KERNEL2x2_L2 64,64,3,0
  307. KERNEL2x2_L2 64,64,4,0
  308. KERNEL2x2_L2 64,64,5,0
  309. KERNEL2x2_L2 64,64,6,0
  310. KERNEL2x2_E2 64,64,7,1
  311. blr
  312. MY_ALIGN
  313. ZGEMM_2x2_L8_SUB:
  314. /*----------------------------------------*/
  /* Straight-line 2x2 tail block: LOAD2x2_2, 3 KERNEL2x2_L2 invocations
     (index 0..2) and a closing KERNEL2x2_E2 (index 3). Called with `bl` from
     ZGEMM_L2x2_SUB2_8 when bit 8 of L is set; returns with blr. */
  315. LOAD2x2_2
  316. KERNEL2x2_L2 64,64,0,0
  317. KERNEL2x2_L2 64,64,1,0
  318. KERNEL2x2_L2 64,64,2,0
  319. KERNEL2x2_E2 64,64,3,1
  320. blr
  321. ZGEMM_2x1_LMAIN_SUB:
  322. /*----------------------------------------*/
  /* 2x1 main loop; reached with `bl` from the 2x1 tile code, left with blr.
     In: T8 = number of loop passes (moved into CTR).
     Each pass issues 16 KERNEL2x1_L2 invocations (index 0..15).
     NOTE(review): unlike the 2x4 variant there is no MY_ALIGN in front of this
     label -- harmless, but inconsistent. */
  323. mtctr T8
  324. LOAD2x1_2
  325. MY_ALIGN
  326. ZGEMM_L2x1_LOOP:
  327. /*----------------------------------------*/
  328. KERNEL2x1_L2 32,64,0,0
  329. ZGEMM_L2x1_K32:
  330. /*----------------------------------------*/
  /* Secondary entry point used by ZGEMM_L2x1_SUB0 / CMP2x1_32K; entering here
     skips the index-0 invocation above. */
  331. KERNEL2x1_L2 32,64,1,0
  332. KERNEL2x1_L2 32,64,2,0
  333. KERNEL2x1_L2 32,64,3,0
  334. KERNEL2x1_L2 32,64,4,0
  335. KERNEL2x1_L2 32,64,5,0
  336. KERNEL2x1_L2 32,64,6,0
  337. KERNEL2x1_L2 32,64,7,0
  338. KERNEL2x1_L2 32,64,8,0
  339. KERNEL2x1_L2 32,64,9,0
  340. KERNEL2x1_L2 32,64,10,0
  341. KERNEL2x1_L2 32,64,11,0
  342. KERNEL2x1_L2 32,64,12,0
  343. KERNEL2x1_L2 32,64,13,0
  344. KERNEL2x1_L2 32,64,14,0
  345. KERNEL2x1_L2 32,64,15,1
  /* bdnz: decrement CTR and repeat the pass while CTR is non-zero. */
  346. bdnz ZGEMM_L2x1_LOOP
  347. MY_ALIGN
  348. ZGEMM_L2x1_LOOP_END:
  349. /*----------------------------------------*/
  350. END2x1_2
  351. blr
  352. MY_ALIGN
  353. ZGEMM_2x1_L16_SUB:
  354. /*----------------------------------------*/
  /* Straight-line 2x1 tail block: LOAD2x1_2, 7 KERNEL2x1_L2 invocations
     (index 0..6) and a closing KERNEL2x1_E2 (index 7). Called with `bl` from
     ZGEMM_L2x1_SUB2 when bit 16 of L is set; returns with blr. */
  355. LOAD2x1_2
  356. KERNEL2x1_L2 32,64,0,0
  357. KERNEL2x1_L2 32,64,1,0
  358. KERNEL2x1_L2 32,64,2,0
  359. KERNEL2x1_L2 32,64,3,0
  360. KERNEL2x1_L2 32,64,4,0
  361. KERNEL2x1_L2 32,64,5,0
  362. KERNEL2x1_L2 32,64,6,0
  363. KERNEL2x1_E2 32,64,7,1
  364. blr
  365. MY_ALIGN
  366. ZGEMM_2x1_L8_SUB:
  367. /*----------------------------------------*/
  /* Straight-line 2x1 tail block: LOAD2x1_2, 3 KERNEL2x1_L2 invocations
     (index 0..2) and a closing KERNEL2x1_E2 (index 3). Called with `bl` from
     ZGEMM_L2x1_SUB2_8 when bit 8 of L is set; returns with blr. */
  368. LOAD2x1_2
  369. KERNEL2x1_L2 32,64,0,0
  370. KERNEL2x1_L2 32,64,1,0
  371. KERNEL2x1_L2 32,64,2,0
  372. KERNEL2x1_E2 32,64,3,1
  373. blr
  374. /* MAIN LOOP BEGINS */
  375. MY_ALIGN
  376. ZGEMM_L2:
  377. /*----------------------------------------*/
  /* Entry of the N-by-2 outer loop. J = N >> 1 iterations; when that is zero
     or negative the whole section is skipped (ble ZGEMM_L2_END).
     NOTE(review): presumably one iteration per pair of columns of C. */
  378. #if defined(TRMMKERNEL) && !defined(LEFT)
  379. neg TEMP_REG, OFFSET
  380. #endif
  381. srawi. J, N, 1
  382. ble ZGEMM_L2_END
  383. ZGEMM_L2_BEGIN:
  384. /*----------------------------------------*/
  /* Per-iteration setup: CO = C, AO = A, then C is advanced by 2*LDC for the
     next iteration. T2 = C + LDC is only used by the dcbt below. */
  385. mr CO, C
  386. slwi T1, LDC , 1
  387. add T2,C,LDC
  388. mr AO, A
  389. add C, C, T1
  390. #if defined(TRMMKERNEL) && defined(LEFT)
  391. mr TEMP_REG, OFFSET /*off = offset;*/
  392. #endif
  /* I = M >> 3 = number of 2x8 tiles; none -> skip to ZGEMM_L2x8_END. */
  393. srawi. I, M, 3
  394. ble ZGEMM_L2x8_END
  395. dcbt CO,r0 /*just prefetch*/
  396. dcbt T2,r0
  397. ZGEMM_L2x8_BEGIN:
  398. /*----------------------------------------*/
  /* One 2x8 tile per iteration. I (loaded with M >> 3 before this label) counts
     the remaining tiles and is decremented at ZGEMM_L2x8_SAVE. */
  399. #if defined(TRMMKERNEL)
  400. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
  401. #else
  402. mr BO, B
  403. dcbt B, r0
  404. #endif
  405. dcbt AO, r0
  406. #if defined(TRMMKERNEL)
  /* NOTE(review): REFRESH_TEMP_BK is defined elsewhere; presumably it leaves
     the effective K for this tile in T6 -- confirm. */
  407. REFRESH_TEMP_BK T6,K,TEMP_REG,8,2
  408. mr T1, T6
  409. /* TEMPS FOR PREFETCH */
  410. li T2, 1024
  411. li T3, 1024+512
  412. addi T1,T1, -2
  413. /* TEMPS FOR PREFETCH */
  414. li T4, 2048
  415. li T5, 2048+512
  416. srawi. T8, T1, 7 /* T8 = (T6-2) >> 7: number of full 128-step main-loop passes */
  417. #else
  418. mr T1, K
  419. /* TEMPS FOR PREFETCH */
  420. li T2, 1024
  421. li T3, 1024+512
  422. addi T1,T1, -2
  423. /* TEMPS FOR PREFETCH */
  424. li T4, 2048
  425. li T5, 2048+512
  426. srawi. T8, T1, 7 /* T8 = (K-2) >> 7: number of full 128-step main-loop passes */
  427. #endif
  /* The ble below consumes CR0 from the srawi. above, so this relies on
     ZERO2x8 leaving CR0 untouched. */
  428. ZERO2x8
  429. ble ZGEMM_L2x8_SUB0
  430. bl ZGEMM_L2x8_LMAIN_SUB
  /* L = (K-2) & 127 = steps left after the main loop; zero -> store the tile. */
  431. andi. L, T1, 127
  432. ble ZGEMM_L2x8_SAVE
  433. b ZGEMM_L2x8_SUB2
  434. ZGEMM_L2x8_SUB0:
  435. /*----------------------------------------*/
  /* Reached when (K-2) >> 7 <= 0. L = K & 255; the cmpwi then overwrites CR0
     with the K == 129 test used by the bne (li does not touch CR0). */
  436. #if defined(TRMMKERNEL)
  437. andi. L, T6, 255
  438. cmpwi T6,129
  439. #else
  440. andi. L, K, 255
  441. cmpwi K,129
  442. #endif
  443. li T8,1
  444. bne CMP2x8_128K
  /* K == 129: LOAD2x8O + END2x8_WITHOUT_ADD, then LOAD2x8_2O and one pass of
     the main loop body entered at ZGEMM_L2x8_K128 with CTR = 1.
     NOTE(review): presumably the first two macros handle a single K step and
     the negative AO/BO bias is cancelled by the explicit offsets passed to the
     *O macros -- confirm in the macro file. */
  445. addi BO,BO,-32
  446. addi AO,AO,-128
  447. LOAD2x8O 128,32
  448. END2x8_WITHOUT_ADD
  449. LOAD2x8_2O 256, 64
  450. mtctr T8
  451. bl ZGEMM_L2x8_K128
  452. b ZGEMM_L2x8_SAVE
  453. CMP2x8_128K:
  454. /*----------------------------------------*/
  /* K == 128: preload with LOAD2x8_2O and run one pass of the main loop body
     from ZGEMM_L2x8_K128 with CTR = 1; any other K goes to the bitwise tail. */
  455. #if defined(TRMMKERNEL)
  456. cmpwi T6,128
  457. #else
  458. cmpwi K,128
  459. #endif
  460. bne ZGEMM_L2x8_SUB2
  461. MY_ALIGN
  462. mtctr T8
  463. addi BO,BO,-64
  464. addi AO,AO,-256
  465. LOAD2x8_2O 256,64
  466. bl ZGEMM_L2x8_K128
  467. b ZGEMM_L2x8_SAVE
  468. MY_ALIGN
  469. ZGEMM_L2x8_SUB2:
  470. /*----------------------------------------*/
  /* Tail: handle the remaining L steps one bit at a time (64, 32, 16 through
     subroutines; 8, 4, 2, 1 inline). andi. never yields a negative result, so
     each ble is taken exactly when the tested bit is clear. */
  471. andi. T1,L, 64
  472. ble ZGEMM_L2x8_SUB2_32
  473. bl ZGEMM_2x8_L64_SUB
  474. MY_ALIGN
  475. ZGEMM_L2x8_SUB2_32:
  476. /*----------------------------------------*/
  477. andi. T1,L, 32
  478. ble ZGEMM_L2x8_SUB2_16
  479. bl ZGEMM_2x8_L32_SUB
  480. MY_ALIGN
  481. ZGEMM_L2x8_SUB2_16:
  482. /*----------------------------------------*/
  483. andi. T1,L, 16
  484. ble ZGEMM_L2x8_SUB2_8
  485. bl ZGEMM_2x8_L16_SUB
  486. MY_ALIGN
  487. ZGEMM_L2x8_SUB2_8:
  488. /*----------------------------------------*/
  489. andi. T1,L, 8
  490. ble ZGEMM_L2x8_SUB2_4
  491. LOAD2x8_2
  492. KERNEL2x8_L2 256,64, 0,0
  493. KERNEL2x8_L2 256,64, 1,0
  494. KERNEL2x8_L2 256,64, 2,0
  495. KERNEL2x8_E2 256,64, 3,1
  496. MY_ALIGN
  497. ZGEMM_L2x8_SUB2_4:
  498. /*----------------------------------------*/
  499. andi. T1,L, 4
  500. ble ZGEMM_L2x8_SUB2_2
  501. LOAD2x8_2
  502. KERNEL2x8_L2 256,64, 0,0
  503. KERNEL2x8_E2 256,64, 1,1
  504. MY_ALIGN
  505. ZGEMM_L2x8_SUB2_2:
  506. /*----------------------------------------*/
  507. andi. T1,L, 2
  508. ble ZGEMM_L2x8_SUB2_1
  509. LOAD2x8_2
  510. KERNEL2x8_E2 256,64, 0,1
  511. MY_ALIGN
  512. ZGEMM_L2x8_SUB2_1:
  513. /*----------------------------------------*/
  514. andi. T1,L, 1
  515. ble ZGEMM_L2x8_SAVE
  516. KERNEL2x8
  517. ZGEMM_L2x8_SAVE:
  518. /*----------------------------------------*/
  /* addic. sets CR0 for the bgt further down; this relies on SAVE2x8 and
     REFRESH_AFTER_SAVE not modifying CR0. */
  519. addic. I, I, -1
  520. SAVE2x8
  521. #if defined(TRMMKERNEL)
  522. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2
  523. #endif
  524. bgt ZGEMM_L2x8_BEGIN
  /* All 2x8 tiles done: M & 7 == 0 -> nothing left for this column pair;
     M & 4 == 0 -> skip the 2x4 tile. ZGEMM_L2x4_BEGIN repeats both tests. */
  525. andi. T2, M, 7
  526. ble ZGEMM_L2x1_END
  527. andi. T1, M, 4
  528. ble ZGEMM_L2x4_END
  529. b ZGEMM_L2x4_BEGIN
  530. MY_ALIGN
  531. ZGEMM_L2x8_END:
  532. /*----------------------------------------*/
  533. ZGEMM_L2x4_BEGIN:
  534. /*----------------------------------------*/
  /* Optional 2x4 tile: skipped entirely when M & 7 == 0 (jump to
     ZGEMM_L2x1_END) or when bit 4 of M is clear (jump to ZGEMM_L2x4_END). */
  535. andi. T2, M, 7
  536. ble ZGEMM_L2x1_END
  537. andi. T1, M, 4
  538. ble ZGEMM_L2x4_END
  539. #if defined(TRMMKERNEL)
  540. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
  541. #else
  542. mr BO, B
  543. #endif
  544. #if defined(TRMMKERNEL)
  /* NOTE(review): REFRESH_TEMP_BK is defined elsewhere; presumably it leaves
     the effective K for this tile in T6 -- confirm. */
  545. REFRESH_TEMP_BK T6,K,TEMP_REG,4,2
  546. mr T1, T6
  547. addi T1,T1, -2
  548. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full 32-step main-loop passes */
  549. #else
  550. mr T1, K
  551. addi T1,T1, -2
  552. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full 32-step main-loop passes */
  553. #endif
  /* The ble below consumes CR0 from the srawi. above, so this relies on
     ZERO2x4 leaving CR0 untouched. */
  554. ZERO2x4
  555. ble ZGEMM_L2x4_SUB0
  556. bl ZGEMM_2x4_LMAIN_SUB
  /* L = (K-2) & 31 = steps left after the main loop; zero -> store the tile. */
  557. andi. L, T1, 31
  558. ble ZGEMM_L2x4_SAVE
  559. b ZGEMM_L2x4_SUB2
  560. ZGEMM_L2x4_SUB0:
  561. /*----------------------------------------*/
  /* Reached when (K-2) >> 5 <= 0. L = K & 63; the cmpwi then overwrites CR0
     with the K == 33 test used by the bne (li does not touch CR0). */
  562. #if defined(TRMMKERNEL)
  563. andi. L, T6, 63
  564. cmpwi T6,33
  565. #else
  566. andi. L, K, 63
  567. cmpwi K,33
  568. #endif
  569. li T8,1
  570. bne CMP2x4_32K
  /* K == 33: LOAD2x4O + END2x4_WITHOUT_ADD, then LOAD2x4_2O and one pass of
     the main loop body entered at ZGEMM_L2x4_K32 with CTR = 1.
     NOTE(review): presumably the first two macros handle a single K step and
     the negative AO/BO bias is cancelled by the explicit offsets -- confirm. */
  571. addi BO,BO,-32
  572. addi AO,AO,-64
  573. LOAD2x4O 64,32
  574. END2x4_WITHOUT_ADD
  575. LOAD2x4_2O 128, 64
  576. mtctr T8
  577. bl ZGEMM_L2x4_K32
  578. b ZGEMM_L2x4_SAVE
  579. CMP2x4_32K:
  580. /*----------------------------------------*/
  /* K == 32: preload with LOAD2x4_2O and run one pass of the main loop body
     from ZGEMM_L2x4_K32 with CTR = 1; any other K goes to the bitwise tail. */
  581. #if defined(TRMMKERNEL)
  582. cmpwi T6,32
  583. #else
  584. cmpwi K,32
  585. #endif
  586. bne ZGEMM_L2x4_SUB2
  587. MY_ALIGN
  588. mtctr T8
  589. addi BO,BO,-64
  590. addi AO,AO,-128
  591. LOAD2x4_2O 128,64
  592. bl ZGEMM_L2x4_K32
  593. b ZGEMM_L2x4_SAVE
  594. MY_ALIGN
  595. MY_ALIGN
  596. ZGEMM_L2x4_SUB2:
  597. /*----------------------------------------*/
  /* Tail: handle the remaining L steps one bit at a time (16 and 8 through
     subroutines; 4, 2, 1 inline). andi. never yields a negative result, so
     each ble is taken exactly when the tested bit is clear. */
  598. andi. T1,L, 16
  599. ble ZGEMM_L2x4_SUB2_8
  600. bl ZGEMM_2x4_L16_SUB
  601. MY_ALIGN
  602. ZGEMM_L2x4_SUB2_8:
  603. /*----------------------------------------*/
  604. andi. T1,L, 8
  605. ble ZGEMM_L2x4_SUB2_4
  606. bl ZGEMM_2x4_L8_SUB
  607. MY_ALIGN
  608. ZGEMM_L2x4_SUB2_4:
  609. /*----------------------------------------*/
  610. andi. T1,L, 4
  611. ble ZGEMM_L2x4_SUB2_2
  612. LOAD2x4_2
  613. KERNEL2x4_L2 128,64, 0,0
  614. KERNEL2x4_E2 128,64, 1,1
  615. MY_ALIGN
  616. ZGEMM_L2x4_SUB2_2:
  617. /*----------------------------------------*/
  618. andi. T1,L, 2
  619. ble ZGEMM_L2x4_SUB2_1
  620. LOAD2x4_2
  621. KERNEL2x4_E2 128,64, 0,1
  622. MY_ALIGN
  623. ZGEMM_L2x4_SUB2_1:
  624. /*----------------------------------------*/
  625. andi. T1,L, 1
  626. ble ZGEMM_L2x4_SAVE
  627. KERNEL2x4
  628. ZGEMM_L2x4_SAVE:
  629. /*----------------------------------------*/
  /* At most one 2x4 tile per column pair, so there is no loop-back here. */
  630. SAVE2x4
  631. #if defined(TRMMKERNEL)
  632. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2
  633. #endif
  634. ZGEMM_L2x4_END:
  635. /*----------------------------------------*/
  636. ZGEMM_L2x2_BEGIN:
  637. /*----------------------------------------*/
  /* Optional 2x2 tile: skipped when bit 2 of M is clear. */
  638. andi. T1, M, 2
  639. ble ZGEMM_L2x2_END
  640. #if defined(TRMMKERNEL)
  641. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
  642. #else
  643. mr BO, B
  644. #endif
  645. #if defined(TRMMKERNEL)
  /* NOTE(review): REFRESH_TEMP_BK is defined elsewhere; presumably it leaves
     the effective K for this tile in T6 -- confirm. */
  646. REFRESH_TEMP_BK T6,K,TEMP_REG,2,2
  647. mr T1, T6
  648. addi T1,T1, -2
  649. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full 32-step main-loop passes */
  650. #else
  651. mr T1, K
  652. addi T1,T1, -2
  653. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full 32-step main-loop passes */
  654. #endif
  /* The ble below consumes CR0 from the srawi. above, so this relies on
     ZERO2x2 leaving CR0 untouched. */
  655. ZERO2x2
  656. ble ZGEMM_L2x2_SUB0
  657. bl ZGEMM_2x2_LMAIN_SUB
  /* L = (K-2) & 31 = steps left after the main loop; zero -> store the tile. */
  658. andi. L, T1, 31
  659. ble ZGEMM_L2x2_SAVE
  660. b ZGEMM_L2x2_SUB2
  661. ZGEMM_L2x2_SUB0:
  662. /*----------------------------------------*/
  /* Reached when (K-2) >> 5 <= 0. L = K & 63; the cmpwi then overwrites CR0
     with the K == 33 test used by the bne (li does not touch CR0). */
  663. #if defined(TRMMKERNEL)
  664. andi. L, T6, 63
  665. cmpwi T6,33
  666. #else
  667. andi. L, K, 63
  668. cmpwi K,33
  669. #endif
  670. li T8,1
  671. bne CMP2x2_32K
  /* K == 33: LOAD2x2O + END2x2_WITHOUT_ADD, then LOAD2x2_2O and one pass of
     the main loop body entered at ZGEMM_L2x2_K32 with CTR = 1.
     NOTE(review): presumably the first two macros handle a single K step and
     the negative AO/BO bias is cancelled by the explicit offsets -- confirm. */
  672. addi BO,BO,-32
  673. addi AO,AO,-32
  674. LOAD2x2O 32,32
  675. END2x2_WITHOUT_ADD
  676. LOAD2x2_2O 64, 64
  677. mtctr T8
  678. bl ZGEMM_L2x2_K32
  679. b ZGEMM_L2x2_SAVE
  680. CMP2x2_32K:
  681. /*----------------------------------------*/
  /* K == 32: preload with LOAD2x2_2O and run one pass of the main loop body
     from ZGEMM_L2x2_K32 with CTR = 1; any other K goes to the bitwise tail. */
  682. #if defined(TRMMKERNEL)
  683. cmpwi T6,32
  684. #else
  685. cmpwi K,32
  686. #endif
  687. bne ZGEMM_L2x2_SUB2
  688. MY_ALIGN
  689. mtctr T8
  690. addi BO,BO,-64
  691. addi AO,AO,-64
  692. LOAD2x2_2O 64,64
  693. bl ZGEMM_L2x2_K32
  694. b ZGEMM_L2x2_SAVE
  695. MY_ALIGN
  696. MY_ALIGN
  697. ZGEMM_L2x2_SUB2:
  698. /*----------------------------------------*/
  /* Tail: handle the remaining L steps one bit at a time (16 and 8 through
     subroutines; 4, 2, 1 inline). andi. never yields a negative result, so
     each ble is taken exactly when the tested bit is clear. */
  699. andi. T1,L, 16
  700. ble ZGEMM_L2x2_SUB2_8
  701. bl ZGEMM_2x2_L16_SUB
  702. MY_ALIGN
  703. ZGEMM_L2x2_SUB2_8:
  704. /*----------------------------------------*/
  705. andi. T1,L, 8
  706. ble ZGEMM_L2x2_SUB2_4
  707. bl ZGEMM_2x2_L8_SUB
  708. MY_ALIGN
  709. ZGEMM_L2x2_SUB2_4:
  710. /*----------------------------------------*/
  711. andi. T1,L, 4
  712. ble ZGEMM_L2x2_SUB2_2
  713. LOAD2x2_2
  714. KERNEL2x2_L2 64,64, 0,0
  715. KERNEL2x2_E2 64,64, 1,1
  716. MY_ALIGN
  717. ZGEMM_L2x2_SUB2_2:
  718. /*----------------------------------------*/
  719. andi. T1,L, 2
  720. ble ZGEMM_L2x2_SUB2_1
  721. LOAD2x2_2
  722. KERNEL2x2_E2 64,64, 0,1
  723. MY_ALIGN
  724. ZGEMM_L2x2_SUB2_1:
  725. /*----------------------------------------*/
  726. andi. T1,L, 1
  727. ble ZGEMM_L2x2_SAVE
  728. KERNEL2x2
  729. ZGEMM_L2x2_SAVE:
  730. /*----------------------------------------*/
  /* At most one 2x2 tile per column pair, so there is no loop-back here. */
  731. SAVE2x2
  732. #if defined(TRMMKERNEL)
  733. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2
  734. #endif
  735. ZGEMM_L2x2_END:
  736. /*----------------------------------------*/
  737. ZGEMM_L2x1_BEGIN:
  738. /*----------------------------------------*/
  /* Optional 2x1 tile: skipped when bit 1 of M is clear. */
  739. andi. T1, M, 1
  740. ble ZGEMM_L2x1_END
  741. #if defined(TRMMKERNEL)
  742. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
  743. #else
  744. mr BO, B
  745. #endif
  746. #if defined(TRMMKERNEL)
  /* NOTE(review): REFRESH_TEMP_BK is defined elsewhere; presumably it leaves
     the effective K for this tile in T6 -- confirm. */
  747. REFRESH_TEMP_BK T6,K,TEMP_REG,1,2
  748. mr T1, T6
  749. addi T1,T1, -2
  750. srawi. T8, T1, 5 /* T8 = (T6-2) >> 5: number of full 32-step main-loop passes */
  751. #else
  752. mr T1, K
  753. addi T1,T1, -2
  754. srawi. T8, T1, 5 /* T8 = (K-2) >> 5: number of full 32-step main-loop passes */
  755. #endif
  /* The ble below consumes CR0 from the srawi. above, so this relies on
     ZERO2x1 leaving CR0 untouched. */
  756. ZERO2x1
  757. ble ZGEMM_L2x1_SUB0
  758. bl ZGEMM_2x1_LMAIN_SUB
  /* L = (K-2) & 31 = steps left after the main loop; zero -> store the tile. */
  759. andi. L, T1, 31
  760. ble ZGEMM_L2x1_SAVE
  761. b ZGEMM_L2x1_SUB2
  762. ZGEMM_L2x1_SUB0:
  763. /*----------------------------------------*/
  /* Reached when (K-2) >> 5 <= 0. L = K & 63; the cmpwi then overwrites CR0
     with the K == 33 test used by the bne (li does not touch CR0). */
  764. #if defined(TRMMKERNEL)
  765. andi. L, T6, 63
  766. cmpwi T6,33
  767. #else
  768. andi. L, K, 63
  769. cmpwi K,33
  770. #endif
  771. li T8,1
  772. bne CMP2x1_32K
  /* K == 33: LOAD2x1O + END2x1_WITHOUT_ADD, then LOAD2x1_2O and one pass of
     the main loop body entered at ZGEMM_L2x1_K32 with CTR = 1.
     NOTE(review): presumably the first two macros handle a single K step and
     the negative AO/BO bias is cancelled by the explicit offsets -- confirm. */
  773. addi BO,BO,-32
  774. addi AO,AO,-16
  775. LOAD2x1O 16,32
  776. END2x1_WITHOUT_ADD
  777. LOAD2x1_2O 32, 64
  778. mtctr T8
  779. bl ZGEMM_L2x1_K32
  780. b ZGEMM_L2x1_SAVE
  781. CMP2x1_32K:
  782. /*----------------------------------------*/
  /* K == 32: preload with LOAD2x1_2O and run one pass of the main loop body
     from ZGEMM_L2x1_K32 with CTR = 1; any other K goes to the bitwise tail. */
  783. #if defined(TRMMKERNEL)
  784. cmpwi T6,32
  785. #else
  786. cmpwi K,32
  787. #endif
  788. bne ZGEMM_L2x1_SUB2
  789. MY_ALIGN
  790. mtctr T8
  791. addi BO,BO,-64
  792. addi AO,AO,-32
  793. LOAD2x1_2O 32,64
  794. bl ZGEMM_L2x1_K32
  795. b ZGEMM_L2x1_SAVE
  796. MY_ALIGN
  797. MY_ALIGN
  798. ZGEMM_L2x1_SUB2:
  799. /*----------------------------------------*/
  /* Tail: handle the remaining L steps one bit at a time (16 and 8 through
     subroutines; 4, 2, 1 inline). andi. never yields a negative result, so
     each ble is taken exactly when the tested bit is clear. */
  800. andi. T1,L, 16
  801. ble ZGEMM_L2x1_SUB2_8
  802. bl ZGEMM_2x1_L16_SUB
  803. MY_ALIGN
  804. ZGEMM_L2x1_SUB2_8:
  805. /*----------------------------------------*/
  806. andi. T1,L, 8
  807. ble ZGEMM_L2x1_SUB2_4
  808. bl ZGEMM_2x1_L8_SUB
  809. MY_ALIGN
  810. ZGEMM_L2x1_SUB2_4:
  811. /*----------------------------------------*/
  812. andi. T1,L, 4
  813. ble ZGEMM_L2x1_SUB2_2
  814. LOAD2x1_2
  815. KERNEL2x1_L2 32,64, 0,0
  816. KERNEL2x1_E2 32,64, 1,1
  817. MY_ALIGN
  818. ZGEMM_L2x1_SUB2_2:
  819. /*----------------------------------------*/
  820. andi. T1,L, 2
  821. ble ZGEMM_L2x1_SUB2_1
  822. LOAD2x1_2
  823. KERNEL2x1_E2 32,64, 0,1
  824. MY_ALIGN
  825. ZGEMM_L2x1_SUB2_1:
  826. /*----------------------------------------*/
  827. andi. T1,L, 1
  828. ble ZGEMM_L2x1_SAVE
  829. KERNEL2x1
  830. ZGEMM_L2x1_SAVE:
  831. /*----------------------------------------*/
  /* At most one 2x1 tile per column pair, so there is no loop-back here. */
  832. SAVE2x1
  833. #if defined(TRMMKERNEL)
  834. REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2
  835. #endif
  836. ZGEMM_L2x1_END:
  837. /*----------------------------------------*/
  /* End of one outer iteration: B += K * 32 and J -= 1. The bgt consumes CR0
     from addic.; the add/addi in between do not modify CR0.
     NOTE(review): K * 32 presumably equals K rows of two 16-byte complex
     doubles -- confirm against the packing layout. */
  838. slwi T1, K, 5
  839. addic. J, J, -1
  840. add B, B, T1
  841. #if defined(TRMMKERNEL) && !defined(LEFT)
  842. addi TEMP_REG, TEMP_REG, 2
  843. #endif
  844. bgt ZGEMM_L2_BEGIN
  845. ZGEMM_L2_END:
  /* Continue with the N-by-1 section (ZGEMM_L1 is outside this excerpt). */
  846. b ZGEMM_L1
  847. /* MINI SUBROUTINES */
  848. /* 1x8 MAIN 128x+2 LOOP */
  849. ZGEMM_L1x8_LMAIN_SUB:
  850. /*----------------------------------------*/
  /* 1x8 main loop; ends with blr, so it is meant to be reached with `bl`
     (the caller is outside this excerpt).
     In: T8 = number of loop passes (moved into CTR); AO, BO = base registers
     for the dcbt hints; PRE, T2..T5 = dcbt index offsets.
     Each pass issues 64 KERNEL1x8_L2 invocations (index 0..63).
     NOTE(review): the "128x+2" header suggests every _L2 invocation covers two
     K steps and LOAD1x8_2/END1x8_2 cover the extra two -- the macros are
     defined elsewhere, confirm there. */
  851. mtctr T8
  852. LOAD1x8_2
  853. MY_ALIGN
  854. ZGEMM_L1x8_LOOP:
  855. /*----------------------------------------*/
  856. dcbt AO, PRE
  857. dcbt BO, PRE
  858. KERNEL1x8_L2 256,32,0,0
  859. ZGEMM_L1x8_K128:
  860. /*----------------------------------------*/
  /* Secondary entry label placed after the index-0 invocation, mirroring
     ZGEMM_L2x8_K128 in the 2x8 loop (its users are outside this excerpt). */
  861. KERNEL1x8_L2 256,32,1,0
  862. dcbt AO, T2
  863. KERNEL1x8_L2 256,32,2,0
  864. KERNEL1x8_L2 256,32,3,0
  865. dcbt AO, T3
  866. dcbt BO, T2
  867. KERNEL1x8_L2 256,32,4,0
  868. KERNEL1x8_L2 256,32,5,0
  869. dcbt AO, T4
  870. KERNEL1x8_L2 256,32,6,0
  871. KERNEL1x8_L2 256,32,7,0
  872. dcbt AO, T5
  873. dcbt BO, T3
  874. KERNEL1x8_L2 256,32,8,0
  875. KERNEL1x8_L2 256,32,9,0
  876. KERNEL1x8_L2 256,32,10,0
  877. KERNEL1x8_L2 256,32,11,0
  878. dcbt BO, T4
  879. KERNEL1x8_L2 256,32,12,0
  880. KERNEL1x8_L2 256,32,13,0
  881. KERNEL1x8_L2 256,32,14,0
  882. KERNEL1x8_L2 256,32,15,0
  883. KERNEL1x8_L2 256,32,16,0
  884. KERNEL1x8_L2 256,32,17,0
  885. KERNEL1x8_L2 256,32,18,0
  886. KERNEL1x8_L2 256,32,19,0
  887. KERNEL1x8_L2 256,32,20,0
  888. KERNEL1x8_L2 256,32,21,0
  889. KERNEL1x8_L2 256,32,22,0
  890. KERNEL1x8_L2 256,32,23,0
  891. KERNEL1x8_L2 256,32,24,0
  892. KERNEL1x8_L2 256,32,25,0
  893. KERNEL1x8_L2 256,32,26,0
  894. KERNEL1x8_L2 256,32,27,0
  895. KERNEL1x8_L2 256,32,28,0
  896. KERNEL1x8_L2 256,32,29,0
  897. KERNEL1x8_L2 256,32,30,0
  898. KERNEL1x8_L2 256,32,31,0
  899. KERNEL1x8_L2 256,32,32,0
  900. KERNEL1x8_L2 256,32,33,0
  901. KERNEL1x8_L2 256,32,34,0
  902. KERNEL1x8_L2 256,32,35,0
  903. KERNEL1x8_L2 256,32,36,0
  904. KERNEL1x8_L2 256,32,37,0
  905. KERNEL1x8_L2 256,32,38,0
  906. KERNEL1x8_L2 256,32,39,0
  907. KERNEL1x8_L2 256,32,40,0
  908. KERNEL1x8_L2 256,32,41,0
  909. KERNEL1x8_L2 256,32,42,0
  910. KERNEL1x8_L2 256,32,43,0
  911. KERNEL1x8_L2 256,32,44,0
  912. KERNEL1x8_L2 256,32,45,0
  913. KERNEL1x8_L2 256,32,46,0
  914. KERNEL1x8_L2 256,32,47,0
  915. KERNEL1x8_L2 256,32,48,0
  916. KERNEL1x8_L2 256,32,49,0
  917. KERNEL1x8_L2 256,32,50,0
  918. KERNEL1x8_L2 256,32,51,0
  919. KERNEL1x8_L2 256,32,52,0
  920. KERNEL1x8_L2 256,32,53,0
  921. KERNEL1x8_L2 256,32,54,0
  922. KERNEL1x8_L2 256,32,55,0
  923. KERNEL1x8_L2 256,32,56,0
  924. KERNEL1x8_L2 256,32,57,0
  925. KERNEL1x8_L2 256,32,58,0
  926. KERNEL1x8_L2 256,32,59,0
  927. KERNEL1x8_L2 256,32,60,0
  928. KERNEL1x8_L2 256,32,61,0
  929. KERNEL1x8_L2 256,32,62,0
  /* Only the final invocation of a pass passes 1 as its last argument.
     NOTE(review): presumably this makes the macro advance AO/BO -- confirm. */
  930. KERNEL1x8_L2 256,32,63,1
  /* bdnz: decrement CTR and repeat the pass while CTR is non-zero. */
  931. bdnz ZGEMM_L1x8_LOOP
  932. MY_ALIGN
  933. ZGEMM_L1x8_LOOP_END:
  934. /*----------------------------------------*/
  935. END1x8_2
  936. blr
  /*
   * ZGEMM_1x8_L64_SUB -- straight-line remainder step for the 1x8 tile.
   *
   * Called (bl/blr) by the tile driver when the value-64 bit of L is set.
   * Issues LOAD1x8_2, then KERNEL1x8_L2 for Index 0..30 and a closing
   * KERNEL1x8_E2 with Index 31, IsLast=1.  Prefetch hints use the caller's
   * T2..T5 offsets, interleaved exactly as in the main loop.
   */
        MY_ALIGN
  ZGEMM_1x8_L64_SUB:
        LOAD1x8_2
        dcbt    AO, PRE
        dcbt    BO, PRE
        .irp    idx, 0,1
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T2
        .irp    idx, 2,3
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T3
        dcbt    BO, T2
        .irp    idx, 4,5
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T4
        .irp    idx, 6,7
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T5
        dcbt    BO, T3
        .irp    idx, 8,9,10,11
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    BO, T4
        .irp    idx, 12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        KERNEL1x8_E2 256,32,31,1
        blr
  /*
   * ZGEMM_1x8_L32_SUB -- straight-line remainder step for the 1x8 tile.
   *
   * Called (bl/blr) by the tile driver when the value-32 bit of L is set.
   * Issues LOAD1x8_2, KERNEL1x8_L2 for Index 0..14, then KERNEL1x8_E2 with
   * Index 15, IsLast=1.  T2..T5 are the caller's prefetch offsets.
   */
        MY_ALIGN
  ZGEMM_1x8_L32_SUB:
        LOAD1x8_2
        dcbt    AO, PRE
        dcbt    BO, PRE
        .irp    idx, 0,1
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T2
        .irp    idx, 2,3
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T3
        dcbt    BO, T2
        .irp    idx, 4,5
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T4
        .irp    idx, 6,7
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T5
        dcbt    BO, T3
        .irp    idx, 8,9,10,11
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    BO, T4
        .irp    idx, 12,13,14
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        KERNEL1x8_E2 256,32,15,1
        blr
  /*
   * ZGEMM_1x8_L16_SUB -- straight-line remainder step for the 1x8 tile.
   *
   * Called (bl/blr) by the tile driver when the value-16 bit of L is set.
   * Issues LOAD1x8_2, KERNEL1x8_L2 for Index 0..6, then KERNEL1x8_E2 with
   * Index 7, IsLast=1.  Only T2..T4 are used for prefetch here.
   */
        MY_ALIGN
  ZGEMM_1x8_L16_SUB:
        LOAD1x8_2
        dcbt    AO, PRE
        dcbt    BO, PRE
        .irp    idx, 0,1
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T2
        .irp    idx, 2,3
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T3
        dcbt    BO, T2
        .irp    idx, 4,5
        KERNEL1x8_L2 256,32,\idx,0
        .endr
        dcbt    AO, T4
        KERNEL1x8_L2 256,32,6,0
        KERNEL1x8_E2 256,32,7,1
        blr
  /*
   * ZGEMM_1x4_LMAIN_SUB -- unrolled main K loop for the 1x4 tile.
   *
   * Entered with `bl`, leaves with `blr`.  T8 holds the pass count and is
   * moved into CTR.  One pass issues KERNEL1x4_L2 with Index 0..15; the
   * last invocation passes IsLast=1.
   *
   * ZGEMM_L1x4_K32 is a second entry point right after the first kernel
   * invocation, used by the tile driver after its own preload.
   */
        MY_ALIGN
  ZGEMM_1x4_LMAIN_SUB:
        mtctr   T8
        LOAD1x4_2
        MY_ALIGN
  ZGEMM_L1x4_LOOP:
        KERNEL1x4_L2 128,32,0,0
  ZGEMM_L1x4_K32:
        .irp    idx, 1,2,3,4,5,6,7,8,9,10,11,12,13,14
        KERNEL1x4_L2 128,32,\idx,0
        .endr
        KERNEL1x4_L2 128,32,15,1
        bdnz    ZGEMM_L1x4_LOOP
        MY_ALIGN
  ZGEMM_L1x4_LOOP_END:
        END1x4_2
        blr
  /*
   * ZGEMM_1x4_L16_SUB -- remainder step for the 1x4 tile, called (bl/blr)
   * when the value-16 bit of L is set.  LOAD1x4_2, KERNEL1x4_L2 for
   * Index 0..6, then KERNEL1x4_E2 with Index 7, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x4_L16_SUB:
        LOAD1x4_2
        .irp    idx, 0,1,2,3,4,5,6
        KERNEL1x4_L2 128,32,\idx,0
        .endr
        KERNEL1x4_E2 128,32,7,1
        blr
  /*
   * ZGEMM_1x4_L8_SUB -- remainder step for the 1x4 tile, called (bl/blr)
   * when the value-8 bit of L is set.  LOAD1x4_2, KERNEL1x4_L2 for
   * Index 0..2, then KERNEL1x4_E2 with Index 3, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x4_L8_SUB:
        LOAD1x4_2
        .irp    idx, 0,1,2
        KERNEL1x4_L2 128,32,\idx,0
        .endr
        KERNEL1x4_E2 128,32,3,1
        blr
  /*
   * ZGEMM_1x2_LMAIN_SUB -- unrolled main K loop for the 1x2 tile.
   *
   * Entered with `bl`, leaves with `blr`.  T8 holds the pass count and is
   * moved into CTR.  One pass issues KERNEL1x2_L2 with Index 0..15; the
   * last invocation passes IsLast=1.
   *
   * ZGEMM_L1x2_K32 is a second entry point right after the first kernel
   * invocation, used by the tile driver after its own preload.
   */
  ZGEMM_1x2_LMAIN_SUB:
        mtctr   T8
        LOAD1x2_2
        MY_ALIGN
  ZGEMM_L1x2_LOOP:
        KERNEL1x2_L2 64,32,0,0
  ZGEMM_L1x2_K32:
        .irp    idx, 1,2,3,4,5,6,7,8,9,10,11,12,13,14
        KERNEL1x2_L2 64,32,\idx,0
        .endr
        KERNEL1x2_L2 64,32,15,1
        bdnz    ZGEMM_L1x2_LOOP
        MY_ALIGN
  ZGEMM_L1x2_LOOP_END:
        END1x2_2
        blr
  /*
   * ZGEMM_1x2_L16_SUB -- remainder step for the 1x2 tile, called (bl/blr)
   * when the value-16 bit of L is set.  LOAD1x2_2, KERNEL1x2_L2 for
   * Index 0..6, then KERNEL1x2_E2 with Index 7, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x2_L16_SUB:
        LOAD1x2_2
        .irp    idx, 0,1,2,3,4,5,6
        KERNEL1x2_L2 64,32,\idx,0
        .endr
        KERNEL1x2_E2 64,32,7,1
        blr
  /*
   * ZGEMM_1x2_L8_SUB -- remainder step for the 1x2 tile, called (bl/blr)
   * when the value-8 bit of L is set.  LOAD1x2_2, KERNEL1x2_L2 for
   * Index 0..2, then KERNEL1x2_E2 with Index 3, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x2_L8_SUB:
        LOAD1x2_2
        .irp    idx, 0,1,2
        KERNEL1x2_L2 64,32,\idx,0
        .endr
        KERNEL1x2_E2 64,32,3,1
        blr
  /*
   * ZGEMM_1x1_LMAIN_SUB -- unrolled main K loop for the 1x1 tile.
   *
   * Entered with `bl`, leaves with `blr`.  T8 holds the pass count and is
   * moved into CTR.  One pass issues KERNEL1x1_L2 with Index 0..15; the
   * last invocation passes IsLast=1.
   *
   * ZGEMM_L1x1_K32 is a second entry point right after the first kernel
   * invocation, used by the tile driver after its own preload.
   */
  ZGEMM_1x1_LMAIN_SUB:
        mtctr   T8
        LOAD1x1_2
        MY_ALIGN
  ZGEMM_L1x1_LOOP:
        KERNEL1x1_L2 32,32,0,0
  ZGEMM_L1x1_K32:
        .irp    idx, 1,2,3,4,5,6,7,8,9,10,11,12,13,14
        KERNEL1x1_L2 32,32,\idx,0
        .endr
        KERNEL1x1_L2 32,32,15,1
        bdnz    ZGEMM_L1x1_LOOP
        MY_ALIGN
  ZGEMM_L1x1_LOOP_END:
        END1x1_2
        blr
  /*
   * ZGEMM_1x1_L16_SUB -- remainder step for the 1x1 tile, called (bl/blr)
   * when the value-16 bit of L is set.  LOAD1x1_2, KERNEL1x1_L2 for
   * Index 0..6, then KERNEL1x1_E2 with Index 7, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x1_L16_SUB:
        LOAD1x1_2
        .irp    idx, 0,1,2,3,4,5,6
        KERNEL1x1_L2 32,32,\idx,0
        .endr
        KERNEL1x1_E2 32,32,7,1
        blr
  /*
   * ZGEMM_1x1_L8_SUB -- remainder step for the 1x1 tile, called (bl/blr)
   * when the value-8 bit of L is set.  LOAD1x1_2, KERNEL1x1_L2 for
   * Index 0..2, then KERNEL1x1_E2 with Index 3, IsLast=1.
   */
        MY_ALIGN
  ZGEMM_1x1_L8_SUB:
        LOAD1x1_2
        .irp    idx, 0,1,2
        KERNEL1x1_L2 32,32,\idx,0
        .endr
        KERNEL1x1_E2 32,32,3,1
        blr
  /*----------------------N1 BEGINS---------*/
  /*
   * ZGEMM_L1 -- entry of the single-column (N & 1) pass.
   *
   * Skips to ZGEMM_L1_END when N is even.  Otherwise sets up:
   *   CO = C          output pointer for this column
   *   T2 = C + LDC    address one column stride further on (prefetch only)
   *   AO = A          start of the A panel
   *   I  = M >> 3     number of 8-row tiles; zero -> go to the narrower tiles
   */
  ZGEMM_L1:
        andi.   T1, N, 1                /* CR0 <- (N & 1); result is 0 or 1 */
        ble     ZGEMM_L1_END            /* zero: no odd column to process   */
  ZGEMM_L1_BEGIN:
        mr      CO, C
        add     T2, C, LDC
        mr      AO, A
        /*
         * Advance C by one column stride.  The original added T1 here, which
         * still holds the (N & 1) result, i.e. the constant 1, so C moved by
         * a single byte.  LDC is the stride already used for T2 above.  C is
         * not read again in the visible N=1 code, so this was latent.
         */
        add     C, C, LDC
#if defined(TRMMKERNEL) && defined(LEFT)
        mr      TEMP_REG, OFFSET        /* off = offset; */
#endif
        srawi.  I, M, 3
        ble     ZGEMM_L1x8_END
        dcbt    CO, r0                  /* just prefetch */
        dcbt    T2, r0
  /*
   * 1x8 tile driver for the N=1 pass: runs once per 8-row tile (I counts
   * the tiles) and falls through / branches to the narrower tiles afterwards.
   *
   * ZL1X8_KCNT is a block-local alias for the register that holds the K
   * count of this tile: T6 (filled by REFRESH_TEMP_BK) for TRMM, K otherwise.
   */
#if defined(TRMMKERNEL)
#define ZL1X8_KCNT T6
#else
#define ZL1X8_KCNT K
#endif
  ZGEMM_L1x8_BEGIN:
#if defined(TRMMKERNEL)
        REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
#else
        mr      BO, B
        dcbt    B, r0
#endif
        dcbt    AO, r0
#if defined(TRMMKERNEL)
        REFRESH_TEMP_BK T6,K,TEMP_REG,8,1
#endif
        mr      T1, ZL1X8_KCNT
        li      T2, 1024                /* prefetch offsets used by the 1x8 subroutines */
        li      T3, 1024+512
        addi    T1, T1, -2
        li      T4, 2048
        li      T5, 2048+512
        srawi.  T8, T1, 7               /* T8 = (kcount - 2) >> 7: main-loop passes */
        ZERO1x8
        ble     ZGEMM_L1x8_SUB0         /* no full pass: small-K handling */
        bl      ZGEMM_L1x8_LMAIN_SUB
        andi.   L, T1, 127              /* L = (kcount - 2) & 127: leftover */
        ble     ZGEMM_L1x8_SAVE
        b       ZGEMM_L1x8_SUB2

  /* Small K: reached only when (kcount - 2) >> 7 is not positive. */
  ZGEMM_L1x8_SUB0:
        andi.   L, ZL1X8_KCNT, 255
        cmpwi   ZL1X8_KCNT,129
        li      T8,1
        bne     CMP1x8_128K
        /* kcount == 129: one single-step preload, then one pass entered at K128 */
        addi    BO,BO,-16
        addi    AO,AO,-128
        LOAD1x8O 128,16
        END1x8_WITHOUT_ADD
        LOAD1x8_2O 256, 32
        mtctr   T8
        bl      ZGEMM_L1x8_K128
        b       ZGEMM_L1x8_SAVE

  CMP1x8_128K:
        cmpwi   ZL1X8_KCNT,128
        bne     ZGEMM_L1x8_SUB2
        MY_ALIGN
        /* kcount == 128: offset preload, then one pass entered at K128 */
        mtctr   T8
        addi    BO,BO,-32
        addi    AO,AO,-256
        LOAD1x8_2O 256,32
        bl      ZGEMM_L1x8_K128
        b       ZGEMM_L1x8_SAVE
        MY_ALIGN

  /* Leftover handling: one step per set bit of L, from value 64 down to 1. */
  ZGEMM_L1x8_SUB2:
        andi.   T1,L, 64
        ble     ZGEMM_L1x8_SUB2_32
        bl      ZGEMM_1x8_L64_SUB
        MY_ALIGN
  ZGEMM_L1x8_SUB2_32:
        andi.   T1,L, 32
        ble     ZGEMM_L1x8_SUB2_16
        bl      ZGEMM_1x8_L32_SUB
        MY_ALIGN
  ZGEMM_L1x8_SUB2_16:
        andi.   T1,L, 16
        ble     ZGEMM_L1x8_SUB2_8
        bl      ZGEMM_1x8_L16_SUB
        MY_ALIGN
  ZGEMM_L1x8_SUB2_8:
        andi.   T1,L, 8
        ble     ZGEMM_L1x8_SUB2_4
        LOAD1x8_2
        KERNEL1x8_L2 256,32, 0,0
        KERNEL1x8_L2 256,32, 1,0
        KERNEL1x8_L2 256,32, 2,0
        KERNEL1x8_E2 256,32, 3,1
        MY_ALIGN
  ZGEMM_L1x8_SUB2_4:
        andi.   T1,L, 4
        ble     ZGEMM_L1x8_SUB2_2
        LOAD1x8_2
        KERNEL1x8_L2 256,32, 0,0
        KERNEL1x8_E2 256,32, 1,1
        MY_ALIGN
  ZGEMM_L1x8_SUB2_2:
        andi.   T1,L, 2
        ble     ZGEMM_L1x8_SUB2_1
        LOAD1x8_2
        KERNEL1x8_E2 256,32, 0,1
        MY_ALIGN
  ZGEMM_L1x8_SUB2_1:
        andi.   T1,L, 1
        ble     ZGEMM_L1x8_SAVE
        KERNEL1x8

  ZGEMM_L1x8_SAVE:
        addic.  I, I, -1                /* CR0 set here is consumed by the bgt below */
        SAVE1x8
#if defined(TRMMKERNEL)
        REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1
#endif
        bgt     ZGEMM_L1x8_BEGIN        /* more 8-row tiles to do */
        andi.   T2, M, 7
        ble     ZGEMM_L1x1_END          /* M is a multiple of 8: column finished */
        andi.   T1, M, 4
        ble     ZGEMM_L1x4_END          /* no 4-row tile: continue at the 1x2 test */
        b       ZGEMM_L1x4_BEGIN
        MY_ALIGN
  ZGEMM_L1x8_END:
#undef ZL1X8_KCNT
  /*
   * 1x4 tile driver for the N=1 pass: handles the 4-row remainder of M.
   *
   * ZL1X4_KCNT is a block-local alias for the register that holds the K
   * count of this tile: T6 (filled by REFRESH_TEMP_BK) for TRMM, K otherwise.
   */
#if defined(TRMMKERNEL)
#define ZL1X4_KCNT T6
#else
#define ZL1X4_KCNT K
#endif
  ZGEMM_L1x4_BEGIN:
        andi.   T2, M, 7
        ble     ZGEMM_L1x1_END          /* no rows left below a multiple of 8 */
        andi.   T1, M, 4
        ble     ZGEMM_L1x4_END          /* no 4-row tile */
#if defined(TRMMKERNEL)
        REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
        REFRESH_TEMP_BK T6,K,TEMP_REG,4,1
#else
        mr      BO, B
#endif
        mr      T1, ZL1X4_KCNT
        addi    T1,T1, -2
        srawi.  T8, T1, 5               /* T8 = (kcount - 2) >> 5: main-loop passes */
        ZERO1x4
        ble     ZGEMM_L1x4_SUB0         /* no full pass: small-K handling */
        bl      ZGEMM_1x4_LMAIN_SUB
        andi.   L, T1, 31               /* L = (kcount - 2) & 31: leftover */
        ble     ZGEMM_L1x4_SAVE
        b       ZGEMM_L1x4_SUB2

  /* Small K: reached only when (kcount - 2) >> 5 is not positive. */
  ZGEMM_L1x4_SUB0:
        andi.   L, ZL1X4_KCNT, 63
        cmpwi   ZL1X4_KCNT,33
        li      T8,1
        bne     CMP1x4_32K
        /* kcount == 33: one single-step preload, then one pass entered at K32 */
        addi    BO,BO,-16
        addi    AO,AO,-64
        LOAD1x4O 64,16
        END1x4_WITHOUT_ADD
        LOAD1x4_2O 128, 32
        mtctr   T8
        bl      ZGEMM_L1x4_K32
        b       ZGEMM_L1x4_SAVE

  CMP1x4_32K:
        cmpwi   ZL1X4_KCNT,32
        bne     ZGEMM_L1x4_SUB2
        MY_ALIGN
        /* kcount == 32: offset preload, then one pass entered at K32 */
        mtctr   T8
        addi    BO,BO,-32
        addi    AO,AO,-128
        LOAD1x4_2O 128,32
        bl      ZGEMM_L1x4_K32
        b       ZGEMM_L1x4_SAVE
        MY_ALIGN
        MY_ALIGN

  /* Leftover handling: one step per set bit of L, from value 16 down to 1. */
  ZGEMM_L1x4_SUB2:
        andi.   T1,L, 16
        ble     ZGEMM_L1x4_SUB2_8
        bl      ZGEMM_1x4_L16_SUB
        MY_ALIGN
  ZGEMM_L1x4_SUB2_8:
        andi.   T1,L, 8
        ble     ZGEMM_L1x4_SUB2_4
        bl      ZGEMM_1x4_L8_SUB
        MY_ALIGN
  ZGEMM_L1x4_SUB2_4:
        andi.   T1,L, 4
        ble     ZGEMM_L1x4_SUB2_2
        LOAD1x4_2
        KERNEL1x4_L2 128,32, 0,0
        KERNEL1x4_E2 128,32, 1,1
        MY_ALIGN
  ZGEMM_L1x4_SUB2_2:
        andi.   T1,L, 2
        ble     ZGEMM_L1x4_SUB2_1
        LOAD1x4_2
        KERNEL1x4_E2 128,32, 0,1
        MY_ALIGN
  ZGEMM_L1x4_SUB2_1:
        andi.   T1,L, 1
        ble     ZGEMM_L1x4_SAVE
        KERNEL1x4

  ZGEMM_L1x4_SAVE:
        SAVE1x4
#if defined(TRMMKERNEL)
        REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1
#endif
  ZGEMM_L1x4_END:
#undef ZL1X4_KCNT
  /*
   * 1x2 tile driver for the N=1 pass: handles the 2-row remainder of M.
   *
   * ZL1X2_KCNT is a block-local alias for the register that holds the K
   * count of this tile: T6 (filled by REFRESH_TEMP_BK) for TRMM, K otherwise.
   */
#if defined(TRMMKERNEL)
#define ZL1X2_KCNT T6
#else
#define ZL1X2_KCNT K
#endif
  ZGEMM_L1x2_BEGIN:
        andi.   T1, M, 2
        ble     ZGEMM_L1x2_END          /* no 2-row tile */
#if defined(TRMMKERNEL)
        REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
        REFRESH_TEMP_BK T6,K,TEMP_REG,2,1
#else
        mr      BO, B
#endif
        mr      T1, ZL1X2_KCNT
        addi    T1,T1, -2
        srawi.  T8, T1, 5               /* T8 = (kcount - 2) >> 5: main-loop passes */
        ZERO1x2
        ble     ZGEMM_L1x2_SUB0         /* no full pass: small-K handling */
        bl      ZGEMM_1x2_LMAIN_SUB
        andi.   L, T1, 31               /* L = (kcount - 2) & 31: leftover */
        ble     ZGEMM_L1x2_SAVE
        b       ZGEMM_L1x2_SUB2

  /* Small K: reached only when (kcount - 2) >> 5 is not positive. */
  ZGEMM_L1x2_SUB0:
        andi.   L, ZL1X2_KCNT, 63
        cmpwi   ZL1X2_KCNT,33
        li      T8,1
        bne     CMP1x2_32K
        /* kcount == 33: one single-step preload, then one pass entered at K32 */
        addi    BO,BO,-16
        addi    AO,AO,-32
        LOAD1x2O 32,16
        END1x2_WITHOUT_ADD
        LOAD1x2_2O 64, 32
        mtctr   T8
        bl      ZGEMM_L1x2_K32
        b       ZGEMM_L1x2_SAVE

  CMP1x2_32K:
        cmpwi   ZL1X2_KCNT,32
        bne     ZGEMM_L1x2_SUB2
        MY_ALIGN
        /* kcount == 32: offset preload, then one pass entered at K32 */
        mtctr   T8
        addi    BO,BO,-32
        addi    AO,AO,-64
        LOAD1x2_2O 64,32
        bl      ZGEMM_L1x2_K32
        b       ZGEMM_L1x2_SAVE
        MY_ALIGN
        MY_ALIGN

  /* Leftover handling: one step per set bit of L, from value 16 down to 1. */
  ZGEMM_L1x2_SUB2:
        andi.   T1,L, 16
        ble     ZGEMM_L1x2_SUB2_8
        bl      ZGEMM_1x2_L16_SUB
        MY_ALIGN
  ZGEMM_L1x2_SUB2_8:
        andi.   T1,L, 8
        ble     ZGEMM_L1x2_SUB2_4
        bl      ZGEMM_1x2_L8_SUB
        MY_ALIGN
  ZGEMM_L1x2_SUB2_4:
        andi.   T1,L, 4
        ble     ZGEMM_L1x2_SUB2_2
        LOAD1x2_2
        KERNEL1x2_L2 64,32, 0,0
        KERNEL1x2_E2 64,32, 1,1
        MY_ALIGN
  ZGEMM_L1x2_SUB2_2:
        andi.   T1,L, 2
        ble     ZGEMM_L1x2_SUB2_1
        LOAD1x2_2
        KERNEL1x2_E2 64,32, 0,1
        MY_ALIGN
  ZGEMM_L1x2_SUB2_1:
        andi.   T1,L, 1
        ble     ZGEMM_L1x2_SAVE
        KERNEL1x2

  ZGEMM_L1x2_SAVE:
        SAVE1x2
#if defined(TRMMKERNEL)
        REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1
#endif
  ZGEMM_L1x2_END:
#undef ZL1X2_KCNT
  /*
   * 1x1 tile driver for the N=1 pass: handles the last odd row of M, then
   * closes the N=1 pass (ZGEMM_L1x1_END / ZGEMM_L1_END).
   *
   * ZL1X1_KCNT is a block-local alias for the register that holds the K
   * count of this tile: T6 (filled by REFRESH_TEMP_BK) for TRMM, K otherwise.
   */
#if defined(TRMMKERNEL)
#define ZL1X1_KCNT T6
#else
#define ZL1X1_KCNT K
#endif
  ZGEMM_L1x1_BEGIN:
        andi.   T1, M, 1
        ble     ZGEMM_L1x1_END          /* M is even: nothing left in this column */
#if defined(TRMMKERNEL)
        REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
        REFRESH_TEMP_BK T6,K,TEMP_REG,1,1
#else
        mr      BO, B
#endif
        mr      T1, ZL1X1_KCNT
        addi    T1,T1, -2
        srawi.  T8, T1, 5               /* T8 = (kcount - 2) >> 5: main-loop passes */
        ZERO1x1
        ble     ZGEMM_L1x1_SUB0         /* no full pass: small-K handling */
        bl      ZGEMM_1x1_LMAIN_SUB
        andi.   L, T1, 31               /* L = (kcount - 2) & 31: leftover */
        ble     ZGEMM_L1x1_SAVE
        b       ZGEMM_L1x1_SUB2

  /* Small K: reached only when (kcount - 2) >> 5 is not positive. */
  ZGEMM_L1x1_SUB0:
        andi.   L, ZL1X1_KCNT, 63
        cmpwi   ZL1X1_KCNT,33
        li      T8,1
        bne     CMP1x1_32K
        /* kcount == 33: one single-step preload, then one pass entered at K32 */
        addi    BO,BO,-16
        addi    AO,AO,-16
        LOAD1x1O 16,16
        END1x1_WITHOUT_ADD
        LOAD1x1_2O 32, 32
        mtctr   T8
        bl      ZGEMM_L1x1_K32
        b       ZGEMM_L1x1_SAVE

  CMP1x1_32K:
        cmpwi   ZL1X1_KCNT,32
        bne     ZGEMM_L1x1_SUB2
        MY_ALIGN
        /* kcount == 32: offset preload, then one pass entered at K32 */
        mtctr   T8
        addi    BO,BO,-32
        addi    AO,AO,-32
        LOAD1x1_2O 32,32
        bl      ZGEMM_L1x1_K32
        b       ZGEMM_L1x1_SAVE
        MY_ALIGN
        MY_ALIGN

  /* Leftover handling: one step per set bit of L, from value 16 down to 1. */
  ZGEMM_L1x1_SUB2:
        andi.   T1,L, 16
        ble     ZGEMM_L1x1_SUB2_8
        bl      ZGEMM_1x1_L16_SUB
        MY_ALIGN
  ZGEMM_L1x1_SUB2_8:
        andi.   T1,L, 8
        ble     ZGEMM_L1x1_SUB2_4
        bl      ZGEMM_1x1_L8_SUB
        MY_ALIGN
  ZGEMM_L1x1_SUB2_4:
        andi.   T1,L, 4
        ble     ZGEMM_L1x1_SUB2_2
        LOAD1x1_2
        KERNEL1x1_L2 32,32, 0,0
        KERNEL1x1_E2 32,32, 1,1
        MY_ALIGN
  ZGEMM_L1x1_SUB2_2:
        andi.   T1,L, 2
        ble     ZGEMM_L1x1_SUB2_1
        LOAD1x1_2
        KERNEL1x1_E2 32,32, 0,1
        MY_ALIGN
  ZGEMM_L1x1_SUB2_1:
        andi.   T1,L, 1
        ble     ZGEMM_L1x1_SAVE
        KERNEL1x1

  ZGEMM_L1x1_SAVE:
        SAVE1x1
#if defined(TRMMKERNEL)
        REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1
#endif
  ZGEMM_L1x1_END:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addi    TEMP_REG, TEMP_REG, 1   /* one column consumed */
#endif
  ZGEMM_L1_END:
#undef ZL1X1_KCNT