You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_logic_power9.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981
  1. /***************************************************************************
  2. Copyright (c) 2013-2019 The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /* Alignment directive that this file places in front of selected labels. */
  /* GNU as on PowerPC reads the operand as a power of two, so this requests 8-byte alignment. */
  27. #define MY_ALIGN .align 3
  /* TRMM build without LEFT: TEMP_REG starts as the negated OFFSET. */
  28. #if defined(TRMMKERNEL) && !defined(LEFT)
  29. neg TEMP_REG, OFFSET
  30. #endif
  /* J = N >> 2 (record form updates cr0). With J <= 0 the whole L4 section is skipped. */
  31. srawi. J, N, 2
  32. ble LDGEMM_L4_END
  /* Top of the J loop. The branch back to this label is the bgt that follows the L4x1 code. */
  33. LDGEMM_L4_BEGIN:
  /* Byte offsets for the two dcbt touches of A issued below. */
  34. li T1, 128
  35. li T2, 256
  /* Working pointers for this pass: AO from A, CO from C. */
  36. mr AO, A
  37. mr CO, C
  /* C is moved forward by LDC << 2, so the next J pass picks up the new value. */
  /* NOTE(review): slwi is a 32-bit shift, confirm LDC << 2 always fits in 32 bits. */
  38. slwi T3, LDC , 2
  39. add C, C, T3
  /* Cache touch hints at A + 128 and A + 256. */
  40. dcbt A, T1
  41. dcbt A, T2
  /* TRMM build with LEFT: TEMP_REG is reloaded from OFFSET on every J pass. */
  42. #if defined(TRMMKERNEL) && defined(LEFT)
  43. mr TEMP_REG, OFFSET /*off = offset;*/
  44. #endif
  /* I = M >> 4 is the trip count of the 4x16 section. With I <= 0 that section is skipped. */
  45. srawi. I, M, 4
  46. ble LDGEMM_L4x16_END
  /* 4x16 section. BEGIN..SAVE runs once per count of I, which is set to M >> 4 before this label. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are not defined in this part of the file, presumably macros from a companion file. */
  47. MY_ALIGN
  48. LDGEMM_L4x16_BEGIN:
  /* L temporarily holds the mask -128. ANDing with it clears the low 7 bits of an address. */
  49. li L, -128
  50. SAVE4x16_REGS
  /* Non-TRMM builds restart BO from B on every pass. TRMM builds call REFRESH_POINTERS with AO and BO instead. */
  51. #if defined(TRMMKERNEL)
  52. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
  53. #else
  54. mr BO, B
  55. #endif
  /* T1..T4 = CO, C2, C3, C4 rounded down to a multiple of 128. */
  /* C2, C3 and C4 are not assigned in the visible code, presumably SAVE4x16_REGS sets them. TODO confirm. */
  56. and T1, CO, L
  57. and T2, C2, L
  58. and T3, C3, L
  59. and T4, C4, L
  /* Cache touch hints. The effective address is Tn + r0, r0 is presumably zero here. TODO confirm. */
  60. dcbt T1, r0
  61. dcbt T2, r0
  62. dcbt T3, r0
  63. dcbt T4, r0
  /* Same hints for the following 128 bytes of each address. */
  64. addi T1, T1, 128
  65. addi T2, T2, 128
  66. addi T3, T3, 128
  67. addi T4, T4, 128
  68. dcbt T1, r0
  69. dcbt T2, r0
  70. dcbt T3, r0
  71. dcbt T4, r0
  /* L = K >> 5 for GEMM, T3 >> 5 for TRMM with T3 passed through REFRESH_TEMP_BK. */
  /* With L <= 0 only the short path at SUB0 runs. */
  72. #if defined(TRMMKERNEL)
  73. REFRESH_TEMP_BK T3,K,TEMP_REG,16,4
  74. srawi. L, T3, 5
  75. #else
  76. srawi. L, K, 5
  77. #endif
  78. ble LDGEMM_L4x16_SUB0
  79. MY_ALIGN
  80. LDGEMM_L4x16_LOOP_START:
  /* NOTE(review): T2 = 512 is not read by any instruction visible in this section, it may be consumed inside the macros. */
  81. li T2, 512
  82. LOAD4x16_1
  83. ##OffsetA=128 OffsetB=32
  /* 2176 - 2048 = 128, which matches the OffsetA noted above. */
  /* Presumably the kernel macros add their first argument to AO. TODO confirm. */
  84. addi AO,AO,2176
  85. # addi BO,BO,32
  /* CTR = L - 1 passes of LOOP. The last count is handled by LOOP_END, so L == 1 bypasses LOOP entirely. */
  86. addic. L, L, -1
  87. ble LDGEMM_L4x16_LOOP_END
  88. mtctr L
  89. MY_ALIGN
  90. LDGEMM_L4x16_LOOP:
  91. #dcbt AO, PRE
  /* Sixteen kernel macro invocations per pass. The third argument counts 0..15 and the fourth is 1 only on the last one. */
  /* That matches the K >> 5 trip count if each invocation covers two K steps. TODO confirm in the macro definitions. */
  92. KERNEL4x16_I1_L2_2 -2048,32, 0,0
  93. KERNEL4x16_I1_L2_2 -2048,32, 1,0
  94. KERNEL4x16_I1_L2_2 -2048,32, 2,0
  95. KERNEL4x16_I1_L2_2 -2048,32, 3,0
  96. KERNEL4x16_I1_L2_2 -2048,32, 4,0
  97. KERNEL4x16_I1_L2_2 -2048,32, 5,0
  98. KERNEL4x16_I1_L2_2 -2048,32, 6,0
  99. KERNEL4x16_I1_L2_2 -2048,32, 7,0
  100. KERNEL4x16_I1_L2_2 -2048,32, 8,0
  101. KERNEL4x16_I1_L2_2 -2048,32, 9,0
  102. KERNEL4x16_I1_L2_2 -2048,32, 10,0
  103. KERNEL4x16_I1_L2_2 -2048,32, 11,0
  104. KERNEL4x16_I1_L2_2 -2048,32, 12,0
  105. KERNEL4x16_I1_L2_2 -2048,32, 13,0
  106. KERNEL4x16_I1_L2_2 -2048,32, 14,0
  107. KERNEL4x16_I1_L2_2 -2048,32, 15,1
  /* bdnz decrements CTR and branches while it is nonzero. */
  108. bdnz LDGEMM_L4x16_LOOP
  /* The alignment directive appears twice in a row here. */
  109. MY_ALIGN
  110. MY_ALIGN
  111. LDGEMM_L4x16_LOOP_END:
  /* Final pass: same arguments as LOOP, but the sixteenth invocation uses the _L2_3 variant. */
  112. KERNEL4x16_I1_L2_2 -2048,32, 0,0
  113. KERNEL4x16_I1_L2_2 -2048,32, 1,0
  114. KERNEL4x16_I1_L2_2 -2048,32, 2,0
  115. KERNEL4x16_I1_L2_2 -2048,32, 3,0
  116. KERNEL4x16_I1_L2_2 -2048,32, 4,0
  117. KERNEL4x16_I1_L2_2 -2048,32, 5,0
  118. KERNEL4x16_I1_L2_2 -2048,32, 6,0
  119. KERNEL4x16_I1_L2_2 -2048,32, 7,0
  120. KERNEL4x16_I1_L2_2 -2048,32, 8,0
  121. KERNEL4x16_I1_L2_2 -2048,32, 9,0
  122. KERNEL4x16_I1_L2_2 -2048,32, 10,0
  123. KERNEL4x16_I1_L2_2 -2048,32, 11,0
  124. KERNEL4x16_I1_L2_2 -2048,32, 12,0
  125. KERNEL4x16_I1_L2_2 -2048,32, 13,0
  126. KERNEL4x16_I1_L2_2 -2048,32, 14,0
  127. KERNEL4x16_I1_L2_3 -2048,32, 15,1
  128. b LDGEMM_L4x16_SUB1
  129. MY_ALIGN
  130. LDGEMM_L4x16_SUB0:
  /* Short path, reached when K >> 5 (T3 >> 5 for TRMM) is <= 0. L = K & 31 or T3 & 31. */
  131. #if defined(TRMMKERNEL)
  132. andi. L, T3, 31
  133. #else
  134. andi. L, K, 31
  135. #endif
  /* KERNEL4x16 1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  136. KERNEL4x16 1
  137. addic. L, L, -1
  138. ble LDGEMM_L4x16_SAVE
  139. b LDGEMM_L4x16_SUB2
  140. MY_ALIGN
  141. LDGEMM_L4x16_SUB1:
  /* Remainder after the unrolled loop: L = K & 31 or T3 & 31. */
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  142. #if defined(TRMMKERNEL)
  143. andi. L, T3, 31
  144. #else
  145. andi. L, K, 31
  146. #endif
  147. ble LDGEMM_L4x16_SAVE
  148. MY_ALIGN
  149. LDGEMM_L4x16_SUB2:
  /* The remainder in L is consumed bit by bit: 16, 8, 4, 2, 1. A clear bit skips to the next test. */
  /* Bit 16: LOAD4x16_0 plus eight kernel macro invocations. */
  150. andi. T1,L, 16
  151. ble LDGEMM_L4x16_SUB2_8
  152. LOAD4x16_0
  153. KERNEL4x16_I1_L2_2 128,32, 0,0
  154. KERNEL4x16_I1_L2_2 128,32, 1,0
  155. KERNEL4x16_I1_L2_2 128,32, 2,0
  156. KERNEL4x16_I1_L2_2 128,32, 3,0
  157. KERNEL4x16_I1_L2_2 128,32, 4,0
  158. KERNEL4x16_I1_L2_2 128,32, 5,0
  159. KERNEL4x16_I1_L2_2 128,32, 6,0
  160. KERNEL4x16_I1_L2_3 128,32, 7,1
  161. MY_ALIGN
  162. LDGEMM_L4x16_SUB2_8:
  /* Bit 8: LOAD4x16_0 plus four kernel macro invocations. */
  163. andi. T1,L, 8
  164. ble LDGEMM_L4x16_SUB2_4
  165. LOAD4x16_0
  166. KERNEL4x16_I1_L2_2 128,32, 0,0
  167. KERNEL4x16_I1_L2_2 128,32, 1,0
  168. KERNEL4x16_I1_L2_2 128,32, 2,0
  169. KERNEL4x16_I1_L2_3 128,32, 3,1
  170. MY_ALIGN
  171. LDGEMM_L4x16_SUB2_4:
  /* Bit 4: LOAD4x16_0 plus two kernel macro invocations. */
  172. andi. T1,L, 4
  173. ble LDGEMM_L4x16_SUB2_2
  174. LOAD4x16_0
  175. KERNEL4x16_I1_L2_2 128,32, 0,0
  176. KERNEL4x16_I1_L2_3 128,32, 1,1
  177. MY_ALIGN
  178. LDGEMM_L4x16_SUB2_2:
  /* Bit 2: LOAD4x16_0 plus one kernel macro invocation. */
  179. andi. T1,L, 2
  180. ble LDGEMM_L4x16_SUB2_1
  181. LOAD4x16_0
  182. KERNEL4x16_I1_L2_3 128,32, 0,1
  183. MY_ALIGN
  184. LDGEMM_L4x16_SUB2_1:
  /* Bit 1: a single KERNEL4x16 0. */
  185. andi. T1,L, 1
  186. ble LDGEMM_L4x16_SAVE
  187. KERNEL4x16 0
  188. # addic. L, L, -1
  189. # bgt LDGEMM_L4x16_SUB2
  190. MY_ALIGN
  191. LDGEMM_L4x16_SAVE:
  /* SAVE4x16 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  192. SAVE4x16
  193. #if defined(TRMMKERNEL)
  194. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4
  195. #endif
  /* Next pass while I stays above zero. The + suffix is a static hint that the branch is usually taken. */
  196. addic. I, I, -1
  197. bgt+ LDGEMM_L4x16_BEGIN
  198. LDGEMM_L4x16_END:
  /* 4x8 section. Runs at most once per J pass, only when bit 3 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  199. LDGEMM_L4x8_BEGIN:
  /* M & 15 == 0 means nothing is left below 16, so the 8, 4, 2 and 1 sections are all skipped. */
  /* andi. never yields a negative value, so each ble here acts as a branch on zero. */
  200. andi. T2, M, 15
  201. ble LDGEMM_L4x1_END
  202. andi. T1, M, 8
  203. ble LDGEMM_L4x8_END
  /* L = K >> 4 for GEMM, T3 >> 4 for TRMM. With L <= 0 only the short path at SUB0 runs. */
  204. #if defined(TRMMKERNEL)
  205. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
  206. REFRESH_TEMP_BK T3,K,TEMP_REG,8,4
  207. srawi. L, T3, 4
  208. #else
  209. mr BO, B
  210. srawi. L, K, 4
  211. #endif
  212. ble LDGEMM_L4x8_SUB0
  213. LDGEMM_L4x8_LOOP_START:
  214. LOAD4x8_1
  215. ##OffsetA=64 OffsetB=32
  /* CTR = L - 1 passes of LOOP. The last count is handled by LOOP_END, so L == 1 bypasses LOOP entirely. */
  216. addic. L, L, -1
  217. ble LDGEMM_L4x8_LOOP_END
  218. mtctr L
  219. MY_ALIGN
  220. LDGEMM_L4x8_LOOP:
  /* Eight kernel macro invocations per pass. The third argument counts 0..7 and the fourth is 1 only on the last one. */
  221. KERNEL4x8_I1_L2_2 64,32, 0,0
  222. KERNEL4x8_I1_L2_2 64,32, 1,0
  223. KERNEL4x8_I1_L2_2 64,32, 2,0
  224. KERNEL4x8_I1_L2_2 64,32, 3,0
  225. KERNEL4x8_I1_L2_2 64,32, 4,0
  226. KERNEL4x8_I1_L2_2 64,32, 5,0
  227. KERNEL4x8_I1_L2_2 64,32, 6,0
  228. KERNEL4x8_I1_L2_2 64,32, 7,1
  /* bdnz decrements CTR and branches while it is nonzero. */
  229. bdnz LDGEMM_L4x8_LOOP
  230. MY_ALIGN
  231. LDGEMM_L4x8_LOOP_END:
  /* Final pass: same arguments as LOOP, but the eighth invocation uses the _L2_3 variant. */
  232. KERNEL4x8_I1_L2_2 64,32, 0,0
  233. KERNEL4x8_I1_L2_2 64,32, 1,0
  234. KERNEL4x8_I1_L2_2 64,32, 2,0
  235. KERNEL4x8_I1_L2_2 64,32, 3,0
  236. KERNEL4x8_I1_L2_2 64,32, 4,0
  237. KERNEL4x8_I1_L2_2 64,32, 5,0
  238. KERNEL4x8_I1_L2_2 64,32, 6,0
  239. KERNEL4x8_I1_L2_3 64,32, 7,1
  240. b LDGEMM_L4x8_SUB1
  241. MY_ALIGN
  242. LDGEMM_L4x8_SUB0:
  /* Short path, reached when K >> 4 (T3 >> 4 for TRMM) is <= 0. L = K & 15 or T3 & 15. */
  243. #if defined(TRMMKERNEL)
  244. andi. L, T3, 15
  245. #else
  246. andi. L, K, 15
  247. #endif
  /* KERNEL4x8 1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  248. KERNEL4x8 1
  249. addic. L, L, -1
  250. ble LDGEMM_L4x8_SAVE
  251. b LDGEMM_L4x8_SUB2
  252. MY_ALIGN
  253. LDGEMM_L4x8_SUB1:
  /* Remainder after the unrolled loop: L = K & 15 or T3 & 15. Zero goes straight to SAVE. */
  254. #if defined(TRMMKERNEL)
  255. andi. L, T3, 15
  256. #else
  257. andi. L, K, 15
  258. #endif
  259. ble LDGEMM_L4x8_SAVE
  260. MY_ALIGN
  261. LDGEMM_L4x8_SUB2:
  /* The remainder in L is consumed bit by bit: 8, 4, 2, 1. A clear bit skips to the next test. */
  /* Bit 8: LOAD4x8_0 plus four kernel macro invocations. */
  262. andi. T1,L, 8
  263. ble LDGEMM_L4x8_SUB2_4
  264. LOAD4x8_0
  265. KERNEL4x8_I1_L2_2 64,32, 0,0
  266. KERNEL4x8_I1_L2_2 64,32, 1,0
  267. KERNEL4x8_I1_L2_2 64,32, 2,0
  268. KERNEL4x8_I1_L2_3 64,32, 3,1
  269. MY_ALIGN
  270. LDGEMM_L4x8_SUB2_4:
  /* Bit 4: LOAD4x8_0 plus two kernel macro invocations. */
  271. andi. T1,L, 4
  272. ble LDGEMM_L4x8_SUB2_2
  273. LOAD4x8_0
  274. KERNEL4x8_I1_L2_2 64,32, 0,0
  275. KERNEL4x8_I1_L2_3 64,32, 1,1
  276. MY_ALIGN
  277. LDGEMM_L4x8_SUB2_2:
  /* Bit 2: LOAD4x8_0 plus one kernel macro invocation. */
  278. andi. T1,L, 2
  279. ble LDGEMM_L4x8_SUB2_1
  280. LOAD4x8_0
  281. KERNEL4x8_I1_L2_3 64,32, 0,1
  282. MY_ALIGN
  283. LDGEMM_L4x8_SUB2_1:
  /* Bit 1: a single KERNEL4x8 0. */
  284. andi. T1,L, 1
  285. ble LDGEMM_L4x8_SAVE
  286. KERNEL4x8 0
  287. MY_ALIGN
  288. LDGEMM_L4x8_SAVE:
  /* SAVE4x8 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  289. SAVE4x8
  290. #if defined(TRMMKERNEL)
  291. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4
  292. #endif
  293. LDGEMM_L4x8_END:
  /* 4x4 section. Runs at most once per J pass, only when bit 2 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  294. LDGEMM_L4x4_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  295. andi. T1, M, 4
  296. ble LDGEMM_L4x4_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  297. #if defined(TRMMKERNEL)
  298. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
  299. REFRESH_TEMP_BK T3,K,TEMP_REG,4,4
  300. srawi. L, T3, 3
  301. #else
  302. mr BO, B
  303. srawi. L, K, 3
  304. #endif
  305. ble LDGEMM_L4x4_SUB0
  306. cmpwi cr0, L, 1
  307. ble LDGEMM_L4x4_SUB4
  308. LDGEMM_L4x4_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  309. #dcbt AO, PRE
  310. LOAD4x4_1
  311. KERNEL4x4_I1
  312. KERNEL4x4_2
  313. KERNEL4x4_1
  314. #dcbt AO, PRE
  315. KERNEL4x4_2
  316. KERNEL4x4_1
  317. KERNEL4x4_2
  318. KERNEL4x4_1
  319. #dcbt AO, PRE
  320. KERNEL4x4_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  321. addic. L, L, -2
  322. ble LDGEMM_L4x4_LOOP_END
  323. MY_ALIGN
  324. LDGEMM_L4x4_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  325. KERNEL4x4_1
  326. KERNEL4x4_2
  327. KERNEL4x4_1
  328. #dcbt AO, PRE
  329. KERNEL4x4_2
  330. KERNEL4x4_1
  331. KERNEL4x4_2
  332. KERNEL4x4_1
  333. #dcbt AO, PRE
  334. KERNEL4x4_2
  335. addic. L, L, -1
  336. bgt LDGEMM_L4x4_LOOP
  337. LDGEMM_L4x4_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  338. KERNEL4x4_1
  339. KERNEL4x4_2
  340. KERNEL4x4_1
  341. KERNEL4x4_2
  342. KERNEL4x4_1
  343. KERNEL4x4_2
  344. KERNEL4x4_1
  345. KERNEL4x4_E2
  346. b LDGEMM_L4x4_SUB1
  347. LDGEMM_L4x4_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  348. KERNEL4x4_SUBI1
  349. KERNEL4x4_SUB1
  350. KERNEL4x4_SUB1
  351. KERNEL4x4_SUB1
  352. KERNEL4x4_SUB1
  353. KERNEL4x4_SUB1
  354. KERNEL4x4_SUB1
  355. KERNEL4x4_SUB1
  356. b LDGEMM_L4x4_SUB1
  357. LDGEMM_L4x4_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  358. #if defined(TRMMKERNEL)
  359. andi. L, T3, 7
  360. #else
  361. andi. L, K, 7
  362. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  363. KERNEL4x4_SUBI1
  364. addic. L, L, -1
  365. ble LDGEMM_L4x4_SAVE
  366. b LDGEMM_L4x4_SUB2
  367. LDGEMM_L4x4_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  368. #if defined(TRMMKERNEL)
  369. andi. L, T3, 7
  370. #else
  371. andi. L, K, 7
  372. #endif
  373. ble LDGEMM_L4x4_SAVE
  374. LDGEMM_L4x4_SUB2:
  /* One SUB1 invocation per remaining count. */
  375. KERNEL4x4_SUB1
  376. addic. L, L, -1
  377. bgt LDGEMM_L4x4_SUB2
  378. LDGEMM_L4x4_SAVE:
  /* SAVE4x4 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  379. SAVE4x4
  380. #if defined(TRMMKERNEL)
  381. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4
  382. #endif
  383. LDGEMM_L4x4_END:
  /* 4x2 section. Runs at most once per J pass, only when bit 1 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  384. LDGEMM_L4x2_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  385. andi. T1, M, 2
  386. ble LDGEMM_L4x2_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  387. #if defined(TRMMKERNEL)
  388. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
  389. REFRESH_TEMP_BK T3,K,TEMP_REG,2,4
  390. srawi. L, T3, 3
  391. #else
  392. mr BO, B
  393. srawi. L, K, 3
  394. #endif
  395. ble LDGEMM_L4x2_SUB0
  396. cmpwi cr0, L, 1
  397. ble LDGEMM_L4x2_SUB4
  398. LDGEMM_L4x2_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  399. LOAD4x2_1
  400. KERNEL4x2_I1
  401. KERNEL4x2_2
  402. KERNEL4x2_1
  403. KERNEL4x2_2
  404. KERNEL4x2_1
  405. KERNEL4x2_2
  406. KERNEL4x2_1
  407. KERNEL4x2_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  408. addic. L, L, -2
  409. ble LDGEMM_L4x2_LOOP_END
  410. MY_ALIGN
  411. LDGEMM_L4x2_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  412. KERNEL4x2_1
  413. KERNEL4x2_2
  414. KERNEL4x2_1
  415. KERNEL4x2_2
  416. KERNEL4x2_1
  417. KERNEL4x2_2
  418. KERNEL4x2_1
  419. KERNEL4x2_2
  420. addic. L, L, -1
  421. bgt LDGEMM_L4x2_LOOP
  422. LDGEMM_L4x2_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  423. KERNEL4x2_1
  424. KERNEL4x2_2
  425. KERNEL4x2_1
  426. KERNEL4x2_2
  427. KERNEL4x2_1
  428. KERNEL4x2_2
  429. KERNEL4x2_1
  430. KERNEL4x2_E2
  431. b LDGEMM_L4x2_SUB1
  432. LDGEMM_L4x2_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  433. KERNEL4x2_SUBI1
  434. KERNEL4x2_SUB1
  435. KERNEL4x2_SUB1
  436. KERNEL4x2_SUB1
  437. KERNEL4x2_SUB1
  438. KERNEL4x2_SUB1
  439. KERNEL4x2_SUB1
  440. KERNEL4x2_SUB1
  441. b LDGEMM_L4x2_SUB1
  442. LDGEMM_L4x2_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  443. #if defined(TRMMKERNEL)
  444. andi. L, T3, 7
  445. #else
  446. andi. L, K, 7
  447. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  448. KERNEL4x2_SUBI1
  449. addic. L, L, -1
  450. ble LDGEMM_L4x2_SAVE
  451. b LDGEMM_L4x2_SUB2
  452. LDGEMM_L4x2_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  453. #if defined(TRMMKERNEL)
  454. andi. L, T3, 7
  455. #else
  456. andi. L, K, 7
  457. #endif
  458. ble LDGEMM_L4x2_SAVE
  459. LDGEMM_L4x2_SUB2:
  /* One SUB1 invocation per remaining count. */
  460. KERNEL4x2_SUB1
  461. addic. L, L, -1
  462. bgt LDGEMM_L4x2_SUB2
  463. LDGEMM_L4x2_SAVE:
  /* SAVE4x2 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  464. SAVE4x2
  465. #if defined(TRMMKERNEL)
  466. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4
  467. #endif
  468. LDGEMM_L4x2_END:
  /* 4x1 section. Runs at most once per J pass, only when bit 0 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  469. LDGEMM_L4x1_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  470. andi. T1, M, 1
  471. ble LDGEMM_L4x1_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  472. #if defined(TRMMKERNEL)
  473. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
  474. REFRESH_TEMP_BK T3,K,TEMP_REG,1,4
  475. srawi. L, T3, 3
  476. #else
  477. mr BO, B
  478. srawi. L, K, 3
  479. #endif
  480. ble LDGEMM_L4x1_SUB0
  481. cmpwi cr0, L, 1
  482. ble LDGEMM_L4x1_SUB4
  483. LDGEMM_L4x1_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  484. LOAD4x1_1
  485. KERNEL4x1_I1
  486. KERNEL4x1_2
  487. KERNEL4x1_1
  488. KERNEL4x1_2
  489. KERNEL4x1_1
  490. KERNEL4x1_2
  491. KERNEL4x1_1
  492. KERNEL4x1_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  493. addic. L, L, -2
  494. ble LDGEMM_L4x1_LOOP_END
  495. MY_ALIGN
  496. LDGEMM_L4x1_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  497. KERNEL4x1_1
  498. KERNEL4x1_2
  499. KERNEL4x1_1
  500. KERNEL4x1_2
  501. KERNEL4x1_1
  502. KERNEL4x1_2
  503. KERNEL4x1_1
  504. KERNEL4x1_2
  505. addic. L, L, -1
  506. bgt LDGEMM_L4x1_LOOP
  507. LDGEMM_L4x1_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  508. KERNEL4x1_1
  509. KERNEL4x1_2
  510. KERNEL4x1_1
  511. KERNEL4x1_2
  512. KERNEL4x1_1
  513. KERNEL4x1_2
  514. KERNEL4x1_1
  515. KERNEL4x1_E2
  516. b LDGEMM_L4x1_SUB1
  517. LDGEMM_L4x1_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  518. KERNEL4x1_SUBI1
  519. KERNEL4x1_SUB1
  520. KERNEL4x1_SUB1
  521. KERNEL4x1_SUB1
  522. KERNEL4x1_SUB1
  523. KERNEL4x1_SUB1
  524. KERNEL4x1_SUB1
  525. KERNEL4x1_SUB1
  526. b LDGEMM_L4x1_SUB1
  527. LDGEMM_L4x1_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  528. #if defined(TRMMKERNEL)
  529. andi. L, T3, 7
  530. #else
  531. andi. L, K, 7
  532. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  533. KERNEL4x1_SUBI1
  534. addic. L, L, -1
  535. ble LDGEMM_L4x1_SAVE
  536. b LDGEMM_L4x1_SUB2
  537. LDGEMM_L4x1_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  538. #if defined(TRMMKERNEL)
  539. andi. L, T3, 7
  540. #else
  541. andi. L, K, 7
  542. #endif
  543. ble LDGEMM_L4x1_SAVE
  544. LDGEMM_L4x1_SUB2:
  /* One SUB1 invocation per remaining count. */
  545. KERNEL4x1_SUB1
  546. addic. L, L, -1
  547. bgt LDGEMM_L4x1_SUB2
  548. LDGEMM_L4x1_SAVE:
  /* SAVE4x1 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  549. SAVE4x1
  550. #if defined(TRMMKERNEL)
  551. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4
  552. #endif
  553. LDGEMM_L4x1_END:
  /* Tail of the J loop: B moves forward by K << 5 for the next pass. */
  /* NOTE(review): slwi is a 32-bit shift, confirm K << 5 always fits in 32 bits. */
  554. slwi T1, K, 5
  555. add B, B, T1
  /* TRMM build without LEFT: TEMP_REG grows by 4 per J pass. */
  556. #if defined(TRMMKERNEL) && !defined(LEFT)
  557. addi TEMP_REG, TEMP_REG, 4
  558. #endif
  /* Repeat from LDGEMM_L4_BEGIN while J stays above zero. */
  559. addic. J, J, -1
  560. bgt LDGEMM_L4_BEGIN
  /* N & 3 == 0 leaves nothing for the L2 and L1 sections, so control goes to .L999, a label defined outside this part of the file. */
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  561. andi. T2, N, 3
  562. ble .L999
  /* Also the target used when N >> 2 was <= 0. In that case the N & 3 test above is not executed. */
  563. LDGEMM_L4_END:
  564. b LDGEMM_L2_BEGIN
  /* Trampoline to .L999. No branch to this label appears in the visible code. */
  565. .L999_H1:
  566. b .L999
  /* Entry of the L2 section, which runs at most once and only when bit 1 of N is set. */
  567. LDGEMM_L2_BEGIN:
  /* TRMM build with LEFT: TEMP_REG is reloaded from OFFSET. */
  568. #if defined(TRMMKERNEL) && defined(LEFT)
  569. mr TEMP_REG, OFFSET /*off = offset;*/
  570. #endif
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  571. andi. T1, N, 2
  572. ble LDGEMM_L2_END
  /* Working pointers: CO from C, AO from A. */
  573. mr CO, C
  574. mr AO, A
  /* C moves forward by LDC << 1 for whatever section follows. */
  /* NOTE(review): slwi is a 32-bit shift, confirm LDC << 1 always fits in 32 bits. */
  575. slwi T1, LDC , 1
  576. add C, C, T1
  /* I = M >> 4 is the trip count of the 2x16 section. With I <= 0 that section is skipped. */
  577. srawi. I, M, 4
  578. ble LDGEMM_L2x16_END
  /* 2x16 section. BEGIN..SAVE runs once per count of I, which is set to M >> 4 before this label. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  579. LDGEMM_L2x16_BEGIN:
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  580. #if defined(TRMMKERNEL)
  581. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
  582. REFRESH_TEMP_BK T3,K,TEMP_REG,16,2
  583. srawi. L, T3, 3
  584. #else
  585. mr BO, B
  586. srawi. L, K, 3
  587. #endif
  588. ble LDGEMM_L2x16_SUB0
  589. cmpwi cr0, L, 1
  590. ble LDGEMM_L2x16_SUB4
  591. LDGEMM_L2x16_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  592. #dcbt AO, PRE
  593. LOAD2x16_1
  594. #dcbt AO, PRE
  595. KERNEL2x16_I1
  596. #dcbt AO, PRE
  597. KERNEL2x16_2
  598. #dcbt AO, PRE
  599. KERNEL2x16_1
  600. #dcbt AO, PRE
  601. KERNEL2x16_2
  602. #dcbt AO, PRE
  603. KERNEL2x16_1
  604. #dcbt AO, PRE
  605. KERNEL2x16_2
  606. #dcbt AO, PRE
  607. KERNEL2x16_1
  608. #dcbt AO, PRE
  609. KERNEL2x16_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  610. addic. L, L, -2
  611. ble LDGEMM_L2x16_LOOP_END
  612. MY_ALIGN
  613. LDGEMM_L2x16_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  614. #dcbt AO, PRE
  615. KERNEL2x16_1
  616. #dcbt AO, PRE
  617. KERNEL2x16_2
  618. #dcbt AO, PRE
  619. KERNEL2x16_1
  620. #dcbt AO, PRE
  621. KERNEL2x16_2
  622. #dcbt AO, PRE
  623. KERNEL2x16_1
  624. #dcbt AO, PRE
  625. KERNEL2x16_2
  626. #dcbt AO, PRE
  627. KERNEL2x16_1
  628. #dcbt AO, PRE
  629. KERNEL2x16_2
  630. addic. L, L, -1
  631. bgt LDGEMM_L2x16_LOOP
  632. LDGEMM_L2x16_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  633. #dcbt AO, PRE
  634. KERNEL2x16_1
  635. #dcbt AO, PRE
  636. KERNEL2x16_2
  637. #dcbt AO, PRE
  638. KERNEL2x16_1
  639. #dcbt AO, PRE
  640. KERNEL2x16_2
  641. #dcbt AO, PRE
  642. KERNEL2x16_1
  643. #dcbt AO, PRE
  644. KERNEL2x16_2
  645. #dcbt AO, PRE
  646. KERNEL2x16_1
  647. KERNEL2x16_E2
  648. b LDGEMM_L2x16_SUB1
  649. LDGEMM_L2x16_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  650. #dcbt AO, PRE
  651. KERNEL2x16_SUBI1
  652. #dcbt AO, PRE
  653. KERNEL2x16_SUB1
  654. #dcbt AO, PRE
  655. KERNEL2x16_SUB1
  656. #dcbt AO, PRE
  657. KERNEL2x16_SUB1
  658. KERNEL2x16_SUB1
  659. KERNEL2x16_SUB1
  660. KERNEL2x16_SUB1
  661. KERNEL2x16_SUB1
  662. b LDGEMM_L2x16_SUB1
  663. LDGEMM_L2x16_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  664. #if defined(TRMMKERNEL)
  665. andi. L, T3, 7
  666. #else
  667. andi. L, K, 7
  668. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  669. KERNEL2x16_SUBI1
  670. addic. L, L, -1
  671. ble LDGEMM_L2x16_SAVE
  672. b LDGEMM_L2x16_SUB2
  673. LDGEMM_L2x16_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  674. #if defined(TRMMKERNEL)
  675. andi. L, T3, 7
  676. #else
  677. andi. L, K, 7
  678. #endif
  679. ble LDGEMM_L2x16_SAVE
  680. LDGEMM_L2x16_SUB2:
  /* One SUB1 invocation per remaining count. */
  681. KERNEL2x16_SUB1
  682. addic. L, L, -1
  683. bgt LDGEMM_L2x16_SUB2
  684. LDGEMM_L2x16_SAVE:
  /* SAVE2x16 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  685. SAVE2x16
  686. #if defined(TRMMKERNEL)
  687. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2
  688. #endif
  /* Next pass while I stays above zero. */
  689. addic. I, I, -1
  690. bgt LDGEMM_L2x16_BEGIN
  691. LDGEMM_L2x16_END:
  /* 2x8 section. Runs at most once, only when bit 3 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  692. LDGEMM_L2x8_BEGIN:
  /* M & 15 == 0 means nothing is left below 16, so the 8, 4, 2 and 1 sections are all skipped. */
  /* andi. never yields a negative value, so each ble here acts as a branch on zero. */
  693. andi. T2, M, 15
  694. ble LDGEMM_L2x1_END
  695. andi. T1, M, 8
  696. ble LDGEMM_L2x8_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  697. #if defined(TRMMKERNEL)
  698. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
  699. REFRESH_TEMP_BK T3,K,TEMP_REG,8,2
  700. srawi. L, T3, 3
  701. #else
  702. mr BO, B
  703. srawi. L, K, 3
  704. #endif
  705. ble LDGEMM_L2x8_SUB0
  706. cmpwi cr0, L, 1
  707. ble LDGEMM_L2x8_SUB4
  708. LDGEMM_L2x8_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  709. #dcbt AO, PRE
  710. LOAD2x8_1
  711. KERNEL2x8_I1
  712. #dcbt AO, PRE
  713. KERNEL2x8_2
  714. KERNEL2x8_1
  715. #dcbt AO, PRE
  716. KERNEL2x8_2
  717. KERNEL2x8_1
  718. #dcbt AO, PRE
  719. KERNEL2x8_2
  720. KERNEL2x8_1
  721. #dcbt AO, PRE
  722. KERNEL2x8_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  723. addic. L, L, -2
  724. ble LDGEMM_L2x8_LOOP_END
  725. MY_ALIGN
  726. LDGEMM_L2x8_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  727. KERNEL2x8_1
  728. #dcbt AO, PRE
  729. KERNEL2x8_2
  730. KERNEL2x8_1
  731. #dcbt AO, PRE
  732. KERNEL2x8_2
  733. KERNEL2x8_1
  734. #dcbt AO, PRE
  735. KERNEL2x8_2
  736. KERNEL2x8_1
  737. #dcbt AO, PRE
  738. KERNEL2x8_2
  739. addic. L, L, -1
  740. bgt LDGEMM_L2x8_LOOP
  741. LDGEMM_L2x8_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  742. KERNEL2x8_1
  743. KERNEL2x8_2
  744. KERNEL2x8_1
  745. KERNEL2x8_2
  746. KERNEL2x8_1
  747. KERNEL2x8_2
  748. KERNEL2x8_1
  749. KERNEL2x8_E2
  750. b LDGEMM_L2x8_SUB1
  751. LDGEMM_L2x8_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  752. KERNEL2x8_SUBI1
  753. KERNEL2x8_SUB1
  754. KERNEL2x8_SUB1
  755. KERNEL2x8_SUB1
  756. KERNEL2x8_SUB1
  757. KERNEL2x8_SUB1
  758. KERNEL2x8_SUB1
  759. KERNEL2x8_SUB1
  760. b LDGEMM_L2x8_SUB1
  761. LDGEMM_L2x8_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  762. #if defined(TRMMKERNEL)
  763. andi. L, T3, 7
  764. #else
  765. andi. L, K, 7
  766. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  767. KERNEL2x8_SUBI1
  768. addic. L, L, -1
  769. ble LDGEMM_L2x8_SAVE
  770. b LDGEMM_L2x8_SUB2
  771. LDGEMM_L2x8_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  772. #if defined(TRMMKERNEL)
  773. andi. L, T3, 7
  774. #else
  775. andi. L, K, 7
  776. #endif
  777. ble LDGEMM_L2x8_SAVE
  778. LDGEMM_L2x8_SUB2:
  /* One SUB1 invocation per remaining count. */
  779. KERNEL2x8_SUB1
  780. addic. L, L, -1
  781. bgt LDGEMM_L2x8_SUB2
  782. LDGEMM_L2x8_SAVE:
  /* SAVE2x8 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  783. SAVE2x8
  784. #if defined(TRMMKERNEL)
  785. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2
  786. #endif
  787. LDGEMM_L2x8_END:
  /* 2x4 section. Runs at most once, only when bit 2 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  788. LDGEMM_L2x4_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  789. andi. T1, M, 4
  790. ble LDGEMM_L2x4_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  791. #if defined(TRMMKERNEL)
  792. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
  793. REFRESH_TEMP_BK T3,K,TEMP_REG,4,2
  794. srawi. L, T3, 3
  795. #else
  796. mr BO, B
  797. srawi. L, K, 3
  798. #endif
  799. ble LDGEMM_L2x4_SUB0
  800. cmpwi cr0, L, 1
  801. ble LDGEMM_L2x4_SUB4
  802. LDGEMM_L2x4_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  803. LOAD2x4_1
  804. KERNEL2x4_I1
  805. KERNEL2x4_2
  806. KERNEL2x4_1
  807. KERNEL2x4_2
  808. KERNEL2x4_1
  809. KERNEL2x4_2
  810. KERNEL2x4_1
  811. KERNEL2x4_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  812. addic. L, L, -2
  813. ble LDGEMM_L2x4_LOOP_END
  814. MY_ALIGN
  815. LDGEMM_L2x4_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  816. KERNEL2x4_1
  817. KERNEL2x4_2
  818. KERNEL2x4_1
  819. KERNEL2x4_2
  820. KERNEL2x4_1
  821. KERNEL2x4_2
  822. KERNEL2x4_1
  823. KERNEL2x4_2
  824. addic. L, L, -1
  825. bgt LDGEMM_L2x4_LOOP
  826. LDGEMM_L2x4_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  827. KERNEL2x4_1
  828. KERNEL2x4_2
  829. KERNEL2x4_1
  830. KERNEL2x4_2
  831. KERNEL2x4_1
  832. KERNEL2x4_2
  833. KERNEL2x4_1
  834. KERNEL2x4_E2
  835. b LDGEMM_L2x4_SUB1
  836. LDGEMM_L2x4_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  837. KERNEL2x4_SUBI1
  838. KERNEL2x4_SUB1
  839. KERNEL2x4_SUB1
  840. KERNEL2x4_SUB1
  841. KERNEL2x4_SUB1
  842. KERNEL2x4_SUB1
  843. KERNEL2x4_SUB1
  844. KERNEL2x4_SUB1
  845. b LDGEMM_L2x4_SUB1
  846. LDGEMM_L2x4_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  847. #if defined(TRMMKERNEL)
  848. andi. L, T3, 7
  849. #else
  850. andi. L, K, 7
  851. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  852. KERNEL2x4_SUBI1
  853. addic. L, L, -1
  854. ble LDGEMM_L2x4_SAVE
  855. b LDGEMM_L2x4_SUB2
  856. LDGEMM_L2x4_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  857. #if defined(TRMMKERNEL)
  858. andi. L, T3, 7
  859. #else
  860. andi. L, K, 7
  861. #endif
  862. ble LDGEMM_L2x4_SAVE
  863. LDGEMM_L2x4_SUB2:
  /* One SUB1 invocation per remaining count. */
  864. KERNEL2x4_SUB1
  865. addic. L, L, -1
  866. bgt LDGEMM_L2x4_SUB2
  867. LDGEMM_L2x4_SAVE:
  /* SAVE2x4 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  868. SAVE2x4
  869. #if defined(TRMMKERNEL)
  870. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2
  871. #endif
  872. LDGEMM_L2x4_END:
  /* 2x2 section. Runs at most once, only when bit 1 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  873. LDGEMM_L2x2_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  874. andi. T1, M, 2
  875. ble LDGEMM_L2x2_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  876. #if defined(TRMMKERNEL)
  877. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
  878. REFRESH_TEMP_BK T3,K,TEMP_REG,2,2
  879. srawi. L, T3, 3
  880. #else
  881. mr BO, B
  882. srawi. L, K, 3
  883. #endif
  884. ble LDGEMM_L2x2_SUB0
  885. cmpwi cr0, L, 1
  886. ble LDGEMM_L2x2_SUB4
  887. LDGEMM_L2x2_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  888. LOAD2x2_1
  889. KERNEL2x2_I1
  890. KERNEL2x2_2
  891. KERNEL2x2_1
  892. KERNEL2x2_2
  893. KERNEL2x2_1
  894. KERNEL2x2_2
  895. KERNEL2x2_1
  896. KERNEL2x2_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  897. addic. L, L, -2
  898. ble LDGEMM_L2x2_LOOP_END
  899. MY_ALIGN
  900. LDGEMM_L2x2_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  901. KERNEL2x2_1
  902. KERNEL2x2_2
  903. KERNEL2x2_1
  904. KERNEL2x2_2
  905. KERNEL2x2_1
  906. KERNEL2x2_2
  907. KERNEL2x2_1
  908. KERNEL2x2_2
  909. addic. L, L, -1
  910. bgt LDGEMM_L2x2_LOOP
  911. LDGEMM_L2x2_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  912. KERNEL2x2_1
  913. KERNEL2x2_2
  914. KERNEL2x2_1
  915. KERNEL2x2_2
  916. KERNEL2x2_1
  917. KERNEL2x2_2
  918. KERNEL2x2_1
  919. KERNEL2x2_E2
  920. b LDGEMM_L2x2_SUB1
  921. LDGEMM_L2x2_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  922. KERNEL2x2_SUBI1
  923. KERNEL2x2_SUB1
  924. KERNEL2x2_SUB1
  925. KERNEL2x2_SUB1
  926. KERNEL2x2_SUB1
  927. KERNEL2x2_SUB1
  928. KERNEL2x2_SUB1
  929. KERNEL2x2_SUB1
  930. b LDGEMM_L2x2_SUB1
  931. LDGEMM_L2x2_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  932. #if defined(TRMMKERNEL)
  933. andi. L, T3, 7
  934. #else
  935. andi. L, K, 7
  936. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  937. KERNEL2x2_SUBI1
  938. addic. L, L, -1
  939. ble LDGEMM_L2x2_SAVE
  940. b LDGEMM_L2x2_SUB2
  941. LDGEMM_L2x2_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  942. #if defined(TRMMKERNEL)
  943. andi. L, T3, 7
  944. #else
  945. andi. L, K, 7
  946. #endif
  947. ble LDGEMM_L2x2_SAVE
  948. LDGEMM_L2x2_SUB2:
  /* One SUB1 invocation per remaining count. */
  949. KERNEL2x2_SUB1
  950. addic. L, L, -1
  951. bgt LDGEMM_L2x2_SUB2
  952. LDGEMM_L2x2_SAVE:
  /* SAVE2x2 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  953. SAVE2x2
  954. #if defined(TRMMKERNEL)
  955. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2
  956. #endif
  957. LDGEMM_L2x2_END:
  /* 2x1 section. Runs at most once, only when bit 0 of M is set. */
  /* The KERNEL, LOAD, SAVE and REFRESH names are presumably macros defined outside this part of the file. */
  958. LDGEMM_L2x1_BEGIN:
  /* andi. never yields a negative value, so the ble acts as a branch on zero. */
  959. andi. T1, M, 1
  960. ble LDGEMM_L2x1_END
  /* L = K >> 3 for GEMM, T3 >> 3 for TRMM. */
  /* L <= 0 goes to SUB0, L == 1 goes to SUB4, larger values use the unrolled loop. */
  961. #if defined(TRMMKERNEL)
  962. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
  963. REFRESH_TEMP_BK T3,K,TEMP_REG,1,2
  964. srawi. L, T3, 3
  965. #else
  966. mr BO, B
  967. srawi. L, K, 3
  968. #endif
  969. ble LDGEMM_L2x1_SUB0
  970. cmpwi cr0, L, 1
  971. ble LDGEMM_L2x1_SUB4
  972. LDGEMM_L2x1_LOOP_START:
  /* Opening group: LOAD plus eight kernel macro invocations, the first being the _I1 variant. */
  973. LOAD2x1_1
  974. KERNEL2x1_I1
  975. KERNEL2x1_2
  976. KERNEL2x1_1
  977. KERNEL2x1_2
  978. KERNEL2x1_1
  979. KERNEL2x1_2
  980. KERNEL2x1_1
  981. KERNEL2x1_2
  /* L drops by 2: this opening group and the closing group at LOOP_END each run exactly once on this path. */
  982. addic. L, L, -2
  983. ble LDGEMM_L2x1_LOOP_END
  984. MY_ALIGN
  985. LDGEMM_L2x1_LOOP:
  /* Steady state: eight kernel macro invocations per pass, repeated while L stays above zero. */
  986. KERNEL2x1_1
  987. KERNEL2x1_2
  988. KERNEL2x1_1
  989. KERNEL2x1_2
  990. KERNEL2x1_1
  991. KERNEL2x1_2
  992. KERNEL2x1_1
  993. KERNEL2x1_2
  994. addic. L, L, -1
  995. bgt LDGEMM_L2x1_LOOP
  996. LDGEMM_L2x1_LOOP_END:
  /* Closing group: same alternation, ending with the _E2 variant. */
  997. KERNEL2x1_1
  998. KERNEL2x1_2
  999. KERNEL2x1_1
  1000. KERNEL2x1_2
  1001. KERNEL2x1_1
  1002. KERNEL2x1_2
  1003. KERNEL2x1_1
  1004. KERNEL2x1_E2
  1005. b LDGEMM_L2x1_SUB1
  1006. LDGEMM_L2x1_SUB4:
  /* L == 1 path: SUBI1 followed by seven SUB1 invocations, eight in total. */
  1007. KERNEL2x1_SUBI1
  1008. KERNEL2x1_SUB1
  1009. KERNEL2x1_SUB1
  1010. KERNEL2x1_SUB1
  1011. KERNEL2x1_SUB1
  1012. KERNEL2x1_SUB1
  1013. KERNEL2x1_SUB1
  1014. KERNEL2x1_SUB1
  1015. b LDGEMM_L2x1_SUB1
  1016. LDGEMM_L2x1_SUB0:
  /* L <= 0 path. Remainder L = K & 7 or T3 & 7. */
  1017. #if defined(TRMMKERNEL)
  1018. andi. L, T3, 7
  1019. #else
  1020. andi. L, K, 7
  1021. #endif
  /* SUBI1 runs before L is tested. */
  /* NOTE(review): this assumes the remainder is at least 1, confirm a zero count never reaches this path. */
  1022. KERNEL2x1_SUBI1
  1023. addic. L, L, -1
  1024. ble LDGEMM_L2x1_SAVE
  1025. b LDGEMM_L2x1_SUB2
  1026. LDGEMM_L2x1_SUB1:
  /* Remainder after the unrolled paths: L = K & 7 or T3 & 7. Zero goes straight to SAVE. */
  1027. #if defined(TRMMKERNEL)
  1028. andi. L, T3, 7
  1029. #else
  1030. andi. L, K, 7
  1031. #endif
  1032. ble LDGEMM_L2x1_SAVE
  1033. LDGEMM_L2x1_SUB2:
  /* One SUB1 invocation per remaining count. */
  1034. KERNEL2x1_SUB1
  1035. addic. L, L, -1
  1036. bgt LDGEMM_L2x1_SUB2
  1037. LDGEMM_L2x1_SAVE:
  /* SAVE2x1 presumably writes the finished tile out. TRMM builds then run REFRESH_AFTER_SAVE. */
  1038. SAVE2x1
  1039. #if defined(TRMMKERNEL)
  1040. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2
  1041. #endif
  1042. LDGEMM_L2x1_END:
  /* End of the L2 section: B += K << 4 (K * 16). */
  /* NOTE(review): presumably 2 columns x 8-byte elements per k step - confirm. */
  1043. slwi T1, K, 4
  1044. add B, B, T1
  /* TRMM build without LEFT: TEMP_REG is increased by 2. */
  1045. #if defined(TRMMKERNEL) && !defined(LEFT)
  1046. addi TEMP_REG, TEMP_REG, 2
  1047. #endif
  1048. LDGEMM_L2_END:
  /* L1 section: runs only when bit 0 of N is set. */
  1049. LDGEMM_L1_BEGIN:
  1050. #if defined(TRMMKERNEL) && defined(LEFT)
  1051. mr TEMP_REG, OFFSET /*off = offset;*/
  1052. #endif
  1053. andi. T1, N, 1
  /* andi. cannot yield a negative value, so ble here means "branch if zero". */
  1054. ble LDGEMM_L1_END
  /* Reset the working pointers CO and AO from C and A. */
  1055. mr CO, C
  1056. mr AO, A
  /* I = M >> 4 is the trip count of the 1x16 tile loop; skip that loop when I <= 0. */
  1057. srawi. I, M, 4
  1058. ble LDGEMM_L1x16_END
  /* Tile labelled 1x16: this body repeats while the counter I stays above zero */
  /* (I is decremented at the bottom of the block). */
  /* The inner-product count is K (or T3 when TRMMKERNEL is defined); it is */
  /* consumed in groups of 8 macro invocations plus a remainder of (count & 7). */
  /* NOTE(review): the "#dcbt AO, PRE" lines look like disabled prefetch hints - confirm that the toolchain treats a leading '#' here as a comment. */
  1059. LDGEMM_L1x16_BEGIN:
  1060. #if defined(TRMMKERNEL)
  1061. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
  1062. REFRESH_TEMP_BK T3,K,TEMP_REG,16,1
  1063. srawi. L, T3, 3
  1064. #else
  1065. mr BO, B
  1066. srawi. L, K, 3
  1067. #endif
  /* L = count >> 3. L <= 0: remainder-only path. L == 1: single group via SUB4. */
  1068. ble LDGEMM_L1x16_SUB0
  1069. cmpwi cr0, L, 1
  1070. ble LDGEMM_L1x16_SUB4
  /* L >= 2: first group of 8 (load + init kernel), then L is reduced by 2 */
  /* because this group and the LOOP_END group are outside the loop. */
  1071. LDGEMM_L1x16_LOOP_START:
  1072. #dcbt AO, PRE
  1073. LOAD1x16_1
  1074. #dcbt AO, PRE
  1075. KERNEL1x16_I1
  1076. #dcbt AO, PRE
  1077. KERNEL1x16_2
  1078. #dcbt AO, PRE
  1079. KERNEL1x16_1
  1080. #dcbt AO, PRE
  1081. KERNEL1x16_2
  1082. #dcbt AO, PRE
  1083. KERNEL1x16_1
  1084. #dcbt AO, PRE
  1085. KERNEL1x16_2
  1086. #dcbt AO, PRE
  1087. KERNEL1x16_1
  1088. #dcbt AO, PRE
  1089. KERNEL1x16_2
  1090. addic. L, L, -2
  1091. ble LDGEMM_L1x16_LOOP_END
  1092. MY_ALIGN
  /* Steady-state loop: one group of 8 per pass, repeated while L > 0. */
  1093. LDGEMM_L1x16_LOOP:
  1094. #dcbt AO, PRE
  1095. KERNEL1x16_1
  1096. #dcbt AO, PRE
  1097. KERNEL1x16_2
  1098. #dcbt AO, PRE
  1099. KERNEL1x16_1
  1100. #dcbt AO, PRE
  1101. KERNEL1x16_2
  1102. #dcbt AO, PRE
  1103. KERNEL1x16_1
  1104. #dcbt AO, PRE
  1105. KERNEL1x16_2
  1106. #dcbt AO, PRE
  1107. KERNEL1x16_1
  1108. #dcbt AO, PRE
  1109. KERNEL1x16_2
  1110. addic. L, L, -1
  1111. bgt LDGEMM_L1x16_LOOP
  /* Final group of 8; it ends with the _E2 variant instead of _2. */
  1112. LDGEMM_L1x16_LOOP_END:
  1113. #dcbt AO, PRE
  1114. KERNEL1x16_1
  1115. #dcbt AO, PRE
  1116. KERNEL1x16_2
  1117. #dcbt AO, PRE
  1118. KERNEL1x16_1
  1119. #dcbt AO, PRE
  1120. KERNEL1x16_2
  1121. #dcbt AO, PRE
  1122. KERNEL1x16_1
  1123. #dcbt AO, PRE
  1124. KERNEL1x16_2
  1125. #dcbt AO, PRE
  1126. KERNEL1x16_1
  1127. KERNEL1x16_E2
  1128. b LDGEMM_L1x16_SUB1
  /* Exactly one group (L == 1): SUBI1 followed by seven SUB1 invocations. */
  1129. LDGEMM_L1x16_SUB4:
  1130. #dcbt AO, PRE
  1131. KERNEL1x16_SUBI1
  1132. #dcbt AO, PRE
  1133. KERNEL1x16_SUB1
  1134. #dcbt AO, PRE
  1135. KERNEL1x16_SUB1
  1136. #dcbt AO, PRE
  1137. KERNEL1x16_SUB1
  1138. KERNEL1x16_SUB1
  1139. KERNEL1x16_SUB1
  1140. KERNEL1x16_SUB1
  1141. KERNEL1x16_SUB1
  1142. b LDGEMM_L1x16_SUB1
  /* No full group: SUBI1 runs first, then (count & 7) - 1 SUB1 passes via SUB2. */
  /* NOTE(review): SUBI1 executes before the remainder is tested, so it runs once even if the count is 0 - presumably callers never pass 0; confirm. */
  1143. LDGEMM_L1x16_SUB0:
  1144. #if defined(TRMMKERNEL)
  1145. andi. L, T3, 7
  1146. #else
  1147. andi. L, K, 7
  1148. #endif
  1149. KERNEL1x16_SUBI1
  1150. addic. L, L, -1
  1151. ble LDGEMM_L1x16_SAVE
  1152. b LDGEMM_L1x16_SUB2
  /* Remainder after the grouped paths: L = count & 7; skip to SAVE when it is zero. */
  1153. LDGEMM_L1x16_SUB1:
  1154. #if defined(TRMMKERNEL)
  1155. andi. L, T3, 7
  1156. #else
  1157. andi. L, K, 7
  1158. #endif
  1159. ble LDGEMM_L1x16_SAVE
  /* One SUB1 invocation per remaining count. */
  1160. LDGEMM_L1x16_SUB2:
  1161. KERNEL1x16_SUB1
  1162. addic. L, L, -1
  1163. bgt LDGEMM_L1x16_SUB2
  /* Write-back via SAVE1x16; the TRMM build additionally invokes REFRESH_AFTER_SAVE. */
  1164. LDGEMM_L1x16_SAVE:
  1165. SAVE1x16
  1166. #if defined(TRMMKERNEL)
  1167. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1
  1168. #endif
  /* Next 1x16 tile: decrement I and loop back while it is still positive. */
  1169. addic. I, I, -1
  1170. bgt LDGEMM_L1x16_BEGIN
  1171. LDGEMM_L1x16_END:
  /* Tile labelled 1x8: runs only when bit 3 of M is set. */
  /* The inner-product count is K (or T3 when TRMMKERNEL is defined); it is */
  /* consumed in groups of 8 macro invocations plus a remainder of (count & 7). */
  /* NOTE(review): the "#dcbt AO, PRE" lines look like disabled prefetch hints - confirm that the toolchain treats a leading '#' here as a comment. */
  1172. LDGEMM_L1x8_BEGIN:
  /* If the low four bits of M are all clear, none of the 8/4/2/1 tiles apply: */
  /* jump straight past the 1x1 tile. */
  1173. andi. T2, M, 15
  1174. ble LDGEMM_L1x1_END
  1175. andi. T1, M, 8
  /* andi. cannot yield a negative value, so ble here means "branch if zero". */
  1176. ble LDGEMM_L1x8_END
  1177. #if defined(TRMMKERNEL)
  1178. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
  1179. REFRESH_TEMP_BK T3,K,TEMP_REG,8,1
  1180. srawi. L, T3, 3
  1181. #else
  1182. mr BO, B
  1183. srawi. L, K, 3
  1184. #endif
  /* L = count >> 3. L <= 0: remainder-only path. L == 1: single group via SUB4. */
  1185. ble LDGEMM_L1x8_SUB0
  1186. cmpwi cr0, L, 1
  1187. ble LDGEMM_L1x8_SUB4
  /* L >= 2: first group of 8 (load + init kernel), then L is reduced by 2 */
  /* because this group and the LOOP_END group are outside the loop. */
  1188. LDGEMM_L1x8_LOOP_START:
  1189. #dcbt AO, PRE
  1190. LOAD1x8_1
  1191. KERNEL1x8_I1
  1192. #dcbt AO, PRE
  1193. KERNEL1x8_2
  1194. KERNEL1x8_1
  1195. #dcbt AO, PRE
  1196. KERNEL1x8_2
  1197. KERNEL1x8_1
  1198. #dcbt AO, PRE
  1199. KERNEL1x8_2
  1200. KERNEL1x8_1
  1201. #dcbt AO, PRE
  1202. KERNEL1x8_2
  1203. addic. L, L, -2
  1204. ble LDGEMM_L1x8_LOOP_END
  1205. MY_ALIGN
  /* Steady-state loop: one group of 8 per pass, repeated while L > 0. */
  1206. LDGEMM_L1x8_LOOP:
  1207. KERNEL1x8_1
  1208. #dcbt AO, PRE
  1209. KERNEL1x8_2
  1210. KERNEL1x8_1
  1211. #dcbt AO, PRE
  1212. KERNEL1x8_2
  1213. KERNEL1x8_1
  1214. #dcbt AO, PRE
  1215. KERNEL1x8_2
  1216. KERNEL1x8_1
  1217. #dcbt AO, PRE
  1218. KERNEL1x8_2
  1219. addic. L, L, -1
  1220. bgt LDGEMM_L1x8_LOOP
  /* Final group of 8; it ends with the _E2 variant instead of _2. */
  1221. LDGEMM_L1x8_LOOP_END:
  1222. KERNEL1x8_1
  1223. KERNEL1x8_2
  1224. KERNEL1x8_1
  1225. KERNEL1x8_2
  1226. KERNEL1x8_1
  1227. KERNEL1x8_2
  1228. KERNEL1x8_1
  1229. KERNEL1x8_E2
  1230. b LDGEMM_L1x8_SUB1
  /* Exactly one group (L == 1): SUBI1 followed by seven SUB1 invocations. */
  1231. LDGEMM_L1x8_SUB4:
  1232. KERNEL1x8_SUBI1
  1233. KERNEL1x8_SUB1
  1234. KERNEL1x8_SUB1
  1235. KERNEL1x8_SUB1
  1236. KERNEL1x8_SUB1
  1237. KERNEL1x8_SUB1
  1238. KERNEL1x8_SUB1
  1239. KERNEL1x8_SUB1
  1240. b LDGEMM_L1x8_SUB1
  /* No full group: SUBI1 runs first, then (count & 7) - 1 SUB1 passes via SUB2. */
  /* NOTE(review): SUBI1 executes before the remainder is tested, so it runs once even if the count is 0 - presumably callers never pass 0; confirm. */
  1241. LDGEMM_L1x8_SUB0:
  1242. #if defined(TRMMKERNEL)
  1243. andi. L, T3, 7
  1244. #else
  1245. andi. L, K, 7
  1246. #endif
  1247. KERNEL1x8_SUBI1
  1248. addic. L, L, -1
  1249. ble LDGEMM_L1x8_SAVE
  1250. b LDGEMM_L1x8_SUB2
  /* Remainder after the grouped paths: L = count & 7; skip to SAVE when it is zero. */
  1251. LDGEMM_L1x8_SUB1:
  1252. #if defined(TRMMKERNEL)
  1253. andi. L, T3, 7
  1254. #else
  1255. andi. L, K, 7
  1256. #endif
  1257. ble LDGEMM_L1x8_SAVE
  /* One SUB1 invocation per remaining count. */
  1258. LDGEMM_L1x8_SUB2:
  1259. KERNEL1x8_SUB1
  1260. addic. L, L, -1
  1261. bgt LDGEMM_L1x8_SUB2
  /* Write-back via SAVE1x8; the TRMM build additionally invokes REFRESH_AFTER_SAVE. */
  1262. LDGEMM_L1x8_SAVE:
  1263. SAVE1x8
  1264. #if defined(TRMMKERNEL)
  1265. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1
  1266. #endif
  1267. LDGEMM_L1x8_END:
  /* Tile labelled 1x4: runs only when bit 2 of M is set. */
  /* The inner-product count is K (or T3 when TRMMKERNEL is defined); it is */
  /* consumed in groups of 8 macro invocations plus a remainder of (count & 7). */
  1268. LDGEMM_L1x4_BEGIN:
  1269. andi. T1, M, 4
  /* andi. cannot yield a negative value, so ble here means "branch if zero". */
  1270. ble LDGEMM_L1x4_END
  1271. #if defined(TRMMKERNEL)
  1272. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
  1273. REFRESH_TEMP_BK T3,K,TEMP_REG,4,1
  1274. srawi. L, T3, 3
  1275. #else
  1276. mr BO, B
  1277. srawi. L, K, 3
  1278. #endif
  /* L = count >> 3. L <= 0: remainder-only path. L == 1: single group via SUB4. */
  1279. ble LDGEMM_L1x4_SUB0
  1280. cmpwi cr0, L, 1
  1281. ble LDGEMM_L1x4_SUB4
  /* L >= 2: first group of 8 (load + init kernel), then L is reduced by 2 */
  /* because this group and the LOOP_END group are outside the loop. */
  1282. LDGEMM_L1x4_LOOP_START:
  1283. LOAD1x4_1
  1284. KERNEL1x4_I1
  1285. KERNEL1x4_2
  1286. KERNEL1x4_1
  1287. KERNEL1x4_2
  1288. KERNEL1x4_1
  1289. KERNEL1x4_2
  1290. KERNEL1x4_1
  1291. KERNEL1x4_2
  1292. addic. L, L, -2
  1293. ble LDGEMM_L1x4_LOOP_END
  1294. MY_ALIGN
  /* Steady-state loop: one group of 8 per pass, repeated while L > 0. */
  1295. LDGEMM_L1x4_LOOP:
  1296. KERNEL1x4_1
  1297. KERNEL1x4_2
  1298. KERNEL1x4_1
  1299. KERNEL1x4_2
  1300. KERNEL1x4_1
  1301. KERNEL1x4_2
  1302. KERNEL1x4_1
  1303. KERNEL1x4_2
  1304. addic. L, L, -1
  1305. bgt LDGEMM_L1x4_LOOP
  /* Final group of 8; it ends with the _E2 variant instead of _2. */
  1306. LDGEMM_L1x4_LOOP_END:
  1307. KERNEL1x4_1
  1308. KERNEL1x4_2
  1309. KERNEL1x4_1
  1310. KERNEL1x4_2
  1311. KERNEL1x4_1
  1312. KERNEL1x4_2
  1313. KERNEL1x4_1
  1314. KERNEL1x4_E2
  1315. b LDGEMM_L1x4_SUB1
  /* Exactly one group (L == 1): SUBI1 followed by seven SUB1 invocations. */
  1316. LDGEMM_L1x4_SUB4:
  1317. KERNEL1x4_SUBI1
  1318. KERNEL1x4_SUB1
  1319. KERNEL1x4_SUB1
  1320. KERNEL1x4_SUB1
  1321. KERNEL1x4_SUB1
  1322. KERNEL1x4_SUB1
  1323. KERNEL1x4_SUB1
  1324. KERNEL1x4_SUB1
  1325. b LDGEMM_L1x4_SUB1
  /* No full group: SUBI1 runs first, then (count & 7) - 1 SUB1 passes via SUB2. */
  /* NOTE(review): SUBI1 executes before the remainder is tested, so it runs once even if the count is 0 - presumably callers never pass 0; confirm. */
  1326. LDGEMM_L1x4_SUB0:
  1327. #if defined(TRMMKERNEL)
  1328. andi. L, T3, 7
  1329. #else
  1330. andi. L, K, 7
  1331. #endif
  1332. KERNEL1x4_SUBI1
  1333. addic. L, L, -1
  1334. ble LDGEMM_L1x4_SAVE
  1335. b LDGEMM_L1x4_SUB2
  /* Remainder after the grouped paths: L = count & 7; skip to SAVE when it is zero. */
  1336. LDGEMM_L1x4_SUB1:
  1337. #if defined(TRMMKERNEL)
  1338. andi. L, T3, 7
  1339. #else
  1340. andi. L, K, 7
  1341. #endif
  1342. ble LDGEMM_L1x4_SAVE
  /* One SUB1 invocation per remaining count. */
  1343. LDGEMM_L1x4_SUB2:
  1344. KERNEL1x4_SUB1
  1345. addic. L, L, -1
  1346. bgt LDGEMM_L1x4_SUB2
  /* Write-back via SAVE1x4; the TRMM build additionally invokes REFRESH_AFTER_SAVE. */
  1347. LDGEMM_L1x4_SAVE:
  1348. SAVE1x4
  1349. #if defined(TRMMKERNEL)
  1350. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1
  1351. #endif
  1352. LDGEMM_L1x4_END:
  /* Tile labelled 1x2: runs only when bit 1 of M is set. */
  /* The inner-product count is K (or T3 when TRMMKERNEL is defined); it is */
  /* consumed in groups of 8 macro invocations plus a remainder of (count & 7). */
  1353. LDGEMM_L1x2_BEGIN:
  1354. andi. T1, M, 2
  /* andi. cannot yield a negative value, so ble here means "branch if zero". */
  1355. ble LDGEMM_L1x2_END
  1356. #if defined(TRMMKERNEL)
  1357. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
  1358. REFRESH_TEMP_BK T3,K,TEMP_REG,2,1
  1359. srawi. L, T3, 3
  1360. #else
  1361. mr BO, B
  1362. srawi. L, K, 3
  1363. #endif
  /* L = count >> 3. L <= 0: remainder-only path. L == 1: single group via SUB4. */
  1364. ble LDGEMM_L1x2_SUB0
  1365. cmpwi cr0, L, 1
  1366. ble LDGEMM_L1x2_SUB4
  /* L >= 2: first group of 8 (load + init kernel), then L is reduced by 2 */
  /* because this group and the LOOP_END group are outside the loop. */
  1367. LDGEMM_L1x2_LOOP_START:
  1368. LOAD1x2_1
  1369. KERNEL1x2_I1
  1370. KERNEL1x2_2
  1371. KERNEL1x2_1
  1372. KERNEL1x2_2
  1373. KERNEL1x2_1
  1374. KERNEL1x2_2
  1375. KERNEL1x2_1
  1376. KERNEL1x2_2
  1377. addic. L, L, -2
  1378. ble LDGEMM_L1x2_LOOP_END
  1379. MY_ALIGN
  /* Steady-state loop: one group of 8 per pass, repeated while L > 0. */
  1380. LDGEMM_L1x2_LOOP:
  1381. KERNEL1x2_1
  1382. KERNEL1x2_2
  1383. KERNEL1x2_1
  1384. KERNEL1x2_2
  1385. KERNEL1x2_1
  1386. KERNEL1x2_2
  1387. KERNEL1x2_1
  1388. KERNEL1x2_2
  1389. addic. L, L, -1
  1390. bgt LDGEMM_L1x2_LOOP
  /* Final group of 8; it ends with the _E2 variant instead of _2. */
  1391. LDGEMM_L1x2_LOOP_END:
  1392. KERNEL1x2_1
  1393. KERNEL1x2_2
  1394. KERNEL1x2_1
  1395. KERNEL1x2_2
  1396. KERNEL1x2_1
  1397. KERNEL1x2_2
  1398. KERNEL1x2_1
  1399. KERNEL1x2_E2
  1400. b LDGEMM_L1x2_SUB1
  /* Exactly one group (L == 1): SUBI1 followed by seven SUB1 invocations. */
  1401. LDGEMM_L1x2_SUB4:
  1402. KERNEL1x2_SUBI1
  1403. KERNEL1x2_SUB1
  1404. KERNEL1x2_SUB1
  1405. KERNEL1x2_SUB1
  1406. KERNEL1x2_SUB1
  1407. KERNEL1x2_SUB1
  1408. KERNEL1x2_SUB1
  1409. KERNEL1x2_SUB1
  1410. b LDGEMM_L1x2_SUB1
  /* No full group: SUBI1 runs first, then (count & 7) - 1 SUB1 passes via SUB2. */
  /* NOTE(review): SUBI1 executes before the remainder is tested, so it runs once even if the count is 0 - presumably callers never pass 0; confirm. */
  1411. LDGEMM_L1x2_SUB0:
  1412. #if defined(TRMMKERNEL)
  1413. andi. L, T3, 7
  1414. #else
  1415. andi. L, K, 7
  1416. #endif
  1417. KERNEL1x2_SUBI1
  1418. addic. L, L, -1
  1419. ble LDGEMM_L1x2_SAVE
  1420. b LDGEMM_L1x2_SUB2
  /* Remainder after the grouped paths: L = count & 7; skip to SAVE when it is zero. */
  1421. LDGEMM_L1x2_SUB1:
  1422. #if defined(TRMMKERNEL)
  1423. andi. L, T3, 7
  1424. #else
  1425. andi. L, K, 7
  1426. #endif
  1427. ble LDGEMM_L1x2_SAVE
  /* One SUB1 invocation per remaining count. */
  1428. LDGEMM_L1x2_SUB2:
  1429. KERNEL1x2_SUB1
  1430. addic. L, L, -1
  1431. bgt LDGEMM_L1x2_SUB2
  /* Write-back via SAVE1x2; the TRMM build additionally invokes REFRESH_AFTER_SAVE. */
  1432. LDGEMM_L1x2_SAVE:
  1433. SAVE1x2
  1434. #if defined(TRMMKERNEL)
  1435. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1
  1436. #endif
  1437. LDGEMM_L1x2_END:
  /* Tile labelled 1x1: runs only when bit 0 of M is set. */
  /* The inner-product count is K (or T3 when TRMMKERNEL is defined); it is */
  /* consumed in groups of 8 macro invocations plus a remainder of (count & 7). */
  1438. LDGEMM_L1x1_BEGIN:
  1439. andi. T1, M, 1
  /* andi. cannot yield a negative value, so ble here means "branch if zero". */
  1440. ble LDGEMM_L1x1_END
  1441. #if defined(TRMMKERNEL)
  1442. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
  1443. REFRESH_TEMP_BK T3,K,TEMP_REG,1,1
  1444. srawi. L, T3, 3
  1445. #else
  1446. mr BO, B
  1447. srawi. L, K, 3
  1448. #endif
  /* L = count >> 3. L <= 0: remainder-only path. L == 1: single group via SUB4. */
  1449. ble LDGEMM_L1x1_SUB0
  1450. cmpwi cr0, L, 1
  1451. ble LDGEMM_L1x1_SUB4
  /* L >= 2: first group of 8 (load + init kernel), then L is reduced by 2 */
  /* because this group and the LOOP_END group are outside the loop. */
  1452. LDGEMM_L1x1_LOOP_START:
  1453. LOAD1x1_1
  1454. KERNEL1x1_I1
  1455. KERNEL1x1_2
  1456. KERNEL1x1_1
  1457. KERNEL1x1_2
  1458. KERNEL1x1_1
  1459. KERNEL1x1_2
  1460. KERNEL1x1_1
  1461. KERNEL1x1_2
  1462. addic. L, L, -2
  1463. ble LDGEMM_L1x1_LOOP_END
  1464. MY_ALIGN
  /* Steady-state loop: one group of 8 per pass, repeated while L > 0. */
  1465. LDGEMM_L1x1_LOOP:
  1466. KERNEL1x1_1
  1467. KERNEL1x1_2
  1468. KERNEL1x1_1
  1469. KERNEL1x1_2
  1470. KERNEL1x1_1
  1471. KERNEL1x1_2
  1472. KERNEL1x1_1
  1473. KERNEL1x1_2
  1474. addic. L, L, -1
  1475. bgt LDGEMM_L1x1_LOOP
  /* Final group of 8; it ends with the _E2 variant instead of _2. */
  1476. LDGEMM_L1x1_LOOP_END:
  1477. KERNEL1x1_1
  1478. KERNEL1x1_2
  1479. KERNEL1x1_1
  1480. KERNEL1x1_2
  1481. KERNEL1x1_1
  1482. KERNEL1x1_2
  1483. KERNEL1x1_1
  1484. KERNEL1x1_E2
  1485. b LDGEMM_L1x1_SUB1
  /* Exactly one group (L == 1): SUBI1 followed by seven SUB1 invocations. */
  1486. LDGEMM_L1x1_SUB4:
  1487. KERNEL1x1_SUBI1
  1488. KERNEL1x1_SUB1
  1489. KERNEL1x1_SUB1
  1490. KERNEL1x1_SUB1
  1491. KERNEL1x1_SUB1
  1492. KERNEL1x1_SUB1
  1493. KERNEL1x1_SUB1
  1494. KERNEL1x1_SUB1
  1495. b LDGEMM_L1x1_SUB1
  /* No full group: SUBI1 runs first, then (count & 7) - 1 SUB1 passes via SUB2. */
  /* NOTE(review): SUBI1 executes before the remainder is tested, so it runs once even if the count is 0 - presumably callers never pass 0; confirm. */
  1496. LDGEMM_L1x1_SUB0:
  1497. #if defined(TRMMKERNEL)
  1498. andi. L, T3, 7
  1499. #else
  1500. andi. L, K, 7
  1501. #endif
  1502. KERNEL1x1_SUBI1
  1503. addic. L, L, -1
  1504. ble LDGEMM_L1x1_SAVE
  1505. b LDGEMM_L1x1_SUB2
  /* Remainder after the grouped paths: L = count & 7; skip to SAVE when it is zero. */
  1506. LDGEMM_L1x1_SUB1:
  1507. #if defined(TRMMKERNEL)
  1508. andi. L, T3, 7
  1509. #else
  1510. andi. L, K, 7
  1511. #endif
  1512. ble LDGEMM_L1x1_SAVE
  /* One SUB1 invocation per remaining count. */
  1513. LDGEMM_L1x1_SUB2:
  1514. KERNEL1x1_SUB1
  1515. addic. L, L, -1
  1516. bgt LDGEMM_L1x1_SUB2
  /* Write-back via SAVE1x1; the TRMM build additionally invokes REFRESH_AFTER_SAVE. */
  1517. LDGEMM_L1x1_SAVE:
  1518. SAVE1x1
  1519. #if defined(TRMMKERNEL)
  1520. REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1
  1521. #endif
  1522. LDGEMM_L1x1_END:
  /* End of the L1 section. TRMM build without LEFT: TEMP_REG is increased by 1. */
  1523. #if defined(TRMMKERNEL) && !defined(LEFT)
  1524. addi TEMP_REG, TEMP_REG, 1
  1525. #endif
  1526. LDGEMM_L1_END: