You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

sgemm_logic_power9.S 46 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192
  /* Alignment directive placed before labels throughout this file.  On
     PowerPC GNU as, `.align 3` requests 2^3 = 8-byte alignment. */
  1. #define MY_ALIGN .align 3
  /* Skip over the out-of-line subroutines that follow; execution continues at label L8. */
  2. b L8
  /*
   * LSGEMM_L8x16_LMAIN_SUB: out-of-line main k-loop of the 8x16 tile.
   * Entered with `bl` and left with `blr`.  The caller must have loaded CTR
   * (`mtctr`) with the trip count before the call.
   * LSGEMM_L8x16_K128 is a second entry point inside the unrolled body; its
   * callers issue their own LOAD8x16_2O and set CTR themselves.
   * LOAD8x16_2, KERNEL8x16_* and END8x16_2 are macros defined outside this
   * chunk; what each one computes cannot be verified from here.
   */
  3. MY_ALIGN
  4. LSGEMM_L8x16_LMAIN_SUB:
  5. LOAD8x16_2 /* initial load before the loop; presumably primes two k-steps (name-based) - TODO confirm */
  6. MY_ALIGN
  7. LSGEMM_L8x16_LOOP:
  8. KERNEL8x16_L2 128,64,0,0
  9. LSGEMM_L8x16_K128: /* secondary entry: skips the first KERNEL8x16_L2 above */
  10. KERNEL8x16_L2 128,64,1,0
  11. KERNEL8x16_I1_L4_2 128,64, 1,0
  12. KERNEL8x16_I1_L4_2 128,64, 2,0
  13. KERNEL8x16_I1_L4_2 128,64, 3,0
  14. KERNEL8x16_I1_L4_2 128,64, 4,0
  15. KERNEL8x16_I1_L4_2 128,64, 5,0
  16. KERNEL8x16_I1_L4_2 128,64, 6,0
  17. KERNEL8x16_I1_L4_2 128,64, 7,0
  18. KERNEL8x16_I1_L4_2 128,64, 8,0
  19. KERNEL8x16_I1_L4_2 128,64, 9,0
  20. KERNEL8x16_I1_L4_2 128,64, 10,0
  21. KERNEL8x16_I1_L4_2 128,64, 11,0
  22. KERNEL8x16_I1_L4_2 128,64, 12,0
  23. KERNEL8x16_I1_L4_2 128,64, 13,0
  24. KERNEL8x16_I1_L4_2 128,64, 14,0
  25. KERNEL8x16_I1_L4_2 128,64, 15,0
  26. KERNEL8x16_I1_L4_2 128,64, 16,0
  27. KERNEL8x16_I1_L4_2 128,64, 17,0
  28. KERNEL8x16_I1_L4_2 128,64, 18,0
  29. KERNEL8x16_I1_L4_2 128,64, 19,0
  30. KERNEL8x16_I1_L4_2 128,64, 20,0
  31. KERNEL8x16_I1_L4_2 128,64, 21,0
  32. KERNEL8x16_I1_L4_2 128,64, 22,0
  33. KERNEL8x16_I1_L4_2 128,64, 23,0
  34. KERNEL8x16_I1_L4_2 128,64, 24,0
  35. KERNEL8x16_I1_L4_2 128,64, 25,0
  36. KERNEL8x16_I1_L4_2 128,64, 26,0
  37. KERNEL8x16_I1_L4_2 128,64, 27,0
  38. KERNEL8x16_I1_L4_2 128,64, 28,0
  39. KERNEL8x16_I1_L4_2 128,64, 29,0
  40. KERNEL8x16_I1_L4_2 128,64, 30,0
  41. KERNEL8x16_I1_L4_2 128,64, 31,1 /* last argument is 1 only on this final invocation; presumably makes the macro advance AO/BO - TODO confirm */
  42. bdnz LSGEMM_L8x16_LOOP /* decrement CTR, loop again while CTR != 0 */
  43. MY_ALIGN
  44. LSGEMM_L8x16_LOOP_END:
  45. END8x16_2
  46. blr /* return to the `bl` call site */
  /*
   * LSGEMM_L8x16_L64_SUB: straight-line (no loop) helper for the 8x16 tile,
   * called with `bl` from LSGEMM_L8x16_SUB2 when bit 64 of the remaining
   * k-count L is set.  16 four-step macro invocations, i.e. presumably 64
   * k-steps (inferred from the L4 macro names) - TODO confirm.
   */
  47. MY_ALIGN
  48. LSGEMM_L8x16_L64_SUB:
  49. LOAD8x16_2
  50. KERNEL8x16_I1_L4_2 128,64, 0,0
  51. KERNEL8x16_I1_L4_2 128,64, 1,0
  52. KERNEL8x16_I1_L4_2 128,64, 2,0
  53. KERNEL8x16_I1_L4_2 128,64,3,0
  54. KERNEL8x16_I1_L4_2 128,64,4,0
  55. KERNEL8x16_I1_L4_2 128,64,5,0
  56. KERNEL8x16_I1_L4_2 128,64,6,0
  57. KERNEL8x16_I1_L4_2 128,64,7,0
  58. KERNEL8x16_I1_L4_2 128,64,8,0
  59. KERNEL8x16_I1_L4_2 128,64,9,0
  60. KERNEL8x16_I1_L4_2 128,64,10,0
  61. KERNEL8x16_I1_L4_2 128,64,11,0
  62. KERNEL8x16_I1_L4_2 128,64,12,0
  63. KERNEL8x16_I1_L4_2 128,64,13,0
  64. KERNEL8x16_I1_L4_2 128,64,14,0
  65. KERNEL8x16_I1_L4_3 128,64,15,1 /* closing `_3` variant with final flag 1 */
  66. blr /* return to the `bl` call site */
  /*
   * LSGEMM_L8x16_L32_SUB: straight-line helper for the 8x16 tile, called with
   * `bl` from LSGEMM_L8x16_SUB2_32 when bit 32 of L is set.  8 four-step macro
   * invocations, i.e. presumably 32 k-steps - TODO confirm against the macros.
   */
  67. LSGEMM_L8x16_L32_SUB:
  68. LOAD8x16_2
  69. KERNEL8x16_I1_L4_2 128,64,0,0
  70. KERNEL8x16_I1_L4_2 128,64,1,0
  71. KERNEL8x16_I1_L4_2 128,64,2,0
  72. KERNEL8x16_I1_L4_2 128,64,3,0
  73. KERNEL8x16_I1_L4_2 128,64,4,0
  74. KERNEL8x16_I1_L4_2 128,64,5,0
  75. KERNEL8x16_I1_L4_2 128,64,6,0
  76. KERNEL8x16_I1_L4_3 128,64,7,1 /* closing `_3` variant with final flag 1 */
  77. blr /* return to the `bl` call site */
  /*
   * LSGEMM_L8x16_L16_SUB: straight-line helper for the 8x16 tile, called with
   * `bl` from LSGEMM_L8x16_SUB2_16 when bit 16 of L is set.  4 four-step macro
   * invocations, i.e. presumably 16 k-steps - TODO confirm against the macros.
   */
  78. LSGEMM_L8x16_L16_SUB:
  79. LOAD8x16_2
  80. KERNEL8x16_I1_L4_2 128,64,0,0
  81. KERNEL8x16_I1_L4_2 128,64,1,0
  82. KERNEL8x16_I1_L4_2 128,64,2,0
  83. KERNEL8x16_I1_L4_3 128,64,3,1 /* closing `_3` variant with final flag 1 */
  84. blr /* return to the `bl` call site */
  /*
   * L8 / LSGEMM_L8_BEGIN: start of the outer loop over N taken 8 at a time
   * (J = N >> 3 trips; the loop-back branch is at LSGEMM_L8x1_END).
   * Each trip resets AO/CO from A/C, advances C by LDC << 3, and then walks M
   * in tiles of 16, 8, 4, 2 and 1.
   * A, B, C, M, N, K, LDC, OFFSET, AO, BO, CO, I, J, L, T*, TEMP_REG are
   * register aliases defined outside this chunk.
   */
  85. L8:
  86. #if defined(TRMMKERNEL) && !defined(LEFT)
  87. neg TEMP_REG, OFFSET /* TEMP_REG = -OFFSET */
  88. #endif
  89. srawi. J, N, 3 /* J = N >> 3, sets CR0 */
  90. ble LSGEMM_L8_END /* no full group of 8: skip the whole panel loop */
  91. LSGEMM_L8_BEGIN:
  92. li T1, 128 /* byte offsets used by the two dcbt hints below */
  93. li T2, 256
  94. mr AO, A /* AO = A */
  95. mr CO, C /* CO = C */
  96. slwi T3, LDC , 3 /* T3 = LDC << 3 */
  97. add C, C, T3 /* advance C for the next trip; presumably LDC is already a byte stride - TODO confirm */
  98. dcbt A, T1 /* data-cache-block-touch (prefetch hint) at A + 128 */
  99. dcbt A, T2 /* ... and at A + 256 */
  100. #if defined(TRMMKERNEL) && defined(LEFT)
  101. mr TEMP_REG, OFFSET /*off = offset;*/
  102. #endif
  103. srawi. I, M, 4 /* I = M >> 4: number of 16-wide tiles */
  104. ble LSGEMM_L8x16_END /* none: go straight to the narrower tiles */
  /*
   * LSGEMM_L8x16_BEGIN: per-tile setup and dispatch for the 8x16 tile.
   * k below means T11 (TRMMKERNEL) or K (otherwise).
   *   T12 = k - 2, L = T12 >> 7.
   *   L > 0 : run LSGEMM_L8x16_LMAIN_SUB with CTR = L, then handle the
   *           remainder T12 & 127 in LSGEMM_L8x16_SUB2.
   *   L <= 0: go to LSGEMM_L8x16_SUB0 (short-k path).
   * NOTE(review): the `ble` consumes CR0 written by `srawi.`; ZERO8x16 sits in
   * between, so that macro must not modify CR0 (mtctr does not) - confirm in
   * the macro file.
   */
  105. MY_ALIGN
  106. LSGEMM_L8x16_BEGIN:
  107. #if defined(TRMMKERNEL)
  108. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8
  109. #else
  110. mr BO, B /* BO = B */
  111. #endif
  112. #if defined(TRMMKERNEL)
  113. REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
  114. mr T12, T11
  115. addi T12,T12, -2
  116. srawi. L, T12, 7 /* L = (T11 - 2) >> 7: main-loop trip count; sets CR0 */
  117. #else
  118. mr T12, K
  119. addi T12,T12, -2
  120. srawi. L, T12, 7 /* L = (K - 2) >> 7: main-loop trip count; sets CR0 */
  121. #endif
  122. ZERO8x16
  123. mtctr L /* CTR = L for the bdnz loop in LSGEMM_L8x16_LMAIN_SUB */
  124. ble LSGEMM_L8x16_SUB0 /* L <= 0: too few k-steps for the main loop */
  125. bl LSGEMM_L8x16_LMAIN_SUB
  126. andi. L, T12, 127 /* remainder of (k - 2) after the main loop */
  127. ble LSGEMM_L8x16_SAVE /* andi. result is never negative, so this is "branch if zero" */
  128. b LSGEMM_L8x16_SUB2
  /*
   * LSGEMM_L8x16_SUB0 / CMP8x16_128K: short-k handling for the 8x16 tile.
   * k below means T11 (TRMMKERNEL) or K (otherwise).
   * Reached from LSGEMM_L8x16_BEGIN when ((k - 2) >> 7) <= 0.
   * LSGEMM_L8x16_SUB2 only tests bits 64..1 of L, so the two values that have
   * bit 128 set are peeled off here and run through the LSGEMM_L8x16_K128
   * entry with CTR = 1:
   *   k == 129 : one extra step first (LOAD8x16 + END8x16_WITHOUT_ADD,
   *              presumably a single k-step - TODO confirm in the macro file),
   *              then the K128 body.
   *   k == 128 : the K128 body only.
   * Anything else falls through to LSGEMM_L8x16_SUB2 with L = k & 255.
   *
   * Fix: the TRMMKERNEL branch compared T11 against 128 instead of 129.  That
   * sent T11 == 128 down the k == 129 path, made the `cmpwi T11,128` test in
   * CMP8x16_128K impossible to satisfy, and let T11 == 129 fall into SUB2,
   * where bit 128 of L is never examined.  It now matches the non-TRMM branch.
   */
  129. MY_ALIGN
  130. LSGEMM_L8x16_SUB0:
  131. #if defined(TRMMKERNEL)
  132. andi. L, T11, 255
  133. cmpwi T11,129
  134. #else
  135. andi. L, K, 255
  136. cmpwi K,129
  137. #endif
  138. li T10,1 /* T10 = 1: CTR value for a single pass through the K128 body */
  139. bne CMP8x16_128K /* uses the cmpwi result (cmpwi overwrote CR0 from andi.) */
  140. addi BO,BO,-32 /* bias BO/AO by the same amounts passed as offsets to LOAD8x16 below */
  141. addi AO,AO,-64
  142. LOAD8x16 64,32
  143. END8x16_WITHOUT_ADD
  144. LOAD8x16_2O AO,BO, 128, 64
  145. mtctr T10
  146. bl LSGEMM_L8x16_K128
  147. b LSGEMM_L8x16_SAVE
  148. CMP8x16_128K:
  149. /*----------------------------------------*/
  150. #if defined(TRMMKERNEL)
  151. cmpwi T11,128
  152. #else
  153. cmpwi K,128
  154. #endif
  155. bne LSGEMM_L8x16_SUB2 /* neither 129 nor 128: generic bit-by-bit remainder */
  156. MY_ALIGN
  157. mtctr T10 /* CTR = 1 */
  158. addi BO,BO,-64
  159. addi AO,AO,-128
  160. LOAD8x16_2O AO,BO, 128,64
  161. bl LSGEMM_L8x16_K128
  162. b LSGEMM_L8x16_SAVE
  /*
   * LSGEMM_L8x16_SUB2 .. LSGEMM_L8x16_SAVE: remainder handling for the 8x16
   * tile.  Tests bits 64, 32, 16, 8, 4, 2, 1 of L in turn and runs the
   * matching straight-line kernel sequence for every bit that is set, then
   * stores the tile (SAVE8x16) and loops back while I > 0.
   * `andi.` can never produce a negative result, so every `ble` that follows
   * one acts as "branch if the tested bit is clear".
   */
  163. MY_ALIGN
  164. LSGEMM_L8x16_SUB2:
  165. andi. T10,L,64
  166. ble LSGEMM_L8x16_SUB2_32
  167. bl LSGEMM_L8x16_L64_SUB /* out-of-line helper, returns here via blr */
  168. MY_ALIGN
  169. LSGEMM_L8x16_SUB2_32:
  170. andi. T10,L, 32
  171. ble LSGEMM_L8x16_SUB2_16
  172. bl LSGEMM_L8x16_L32_SUB
  173. MY_ALIGN
  174. LSGEMM_L8x16_SUB2_16:
  175. andi. T10,L, 16
  176. ble LSGEMM_L8x16_SUB2_8
  177. bl LSGEMM_L8x16_L16_SUB
  178. MY_ALIGN
  179. LSGEMM_L8x16_SUB2_8:
  180. andi. T10,L, 8
  181. ble LSGEMM_L8x16_SUB2_4
  182. LOAD8x16_2
  183. KERNEL8x16_I1_L4_2 128,64, 0,0
  184. KERNEL8x16_I1_L4_3 128,64, 1,1
  185. MY_ALIGN
  186. LSGEMM_L8x16_SUB2_4:
  187. andi. T10,L, 4
  188. ble LSGEMM_L8x16_SUB2_2
  189. LOAD8x16_2
  190. KERNEL8x16_I1_L4_3 128,64, 0,1
  191. MY_ALIGN
  192. LSGEMM_L8x16_SUB2_2:
  193. andi. T10,L, 2
  194. ble LSGEMM_L8x16_SUB2_1
  195. LOAD8x16_2
  196. KERNEL8x16_E2 128,64, 0,1
  197. MY_ALIGN
  198. LSGEMM_L8x16_SUB2_1:
  199. andi. T10,L, 1
  200. ble LSGEMM_L8x16_SAVE
  201. KERNEL8x16 0
  202. MY_ALIGN
  203. LSGEMM_L8x16_SAVE:
  204. SAVE8x16
  205. #if defined(TRMMKERNEL)
  206. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8
  207. #endif
  208. addic. I, I, -1 /* I = I - 1, sets CR0 */
  209. bgt+ LSGEMM_L8x16_BEGIN /* next 16-wide tile; `+` is a predict-taken hint */
  /*
   * LSGEMM_L8x16_END / LSGEMM_L8x8_BEGIN .. LSGEMM_L8x8_SAVE: the 8x8 tile.
   * Skips every remaining tile of this panel when (M & 15) == 0, and skips
   * only this tile when bit 8 of M is clear.
   * k below means T11 (TRMMKERNEL) or K (otherwise).
   *   T12 = k - 1, L = T12 >> 4.
   *   L > 0 : LOAD8x8_0, L trips of the unrolled loop, END8x8, then the
   *           remainder T12 & 15 (LSGEMM_L8x8_SUB1 -> SUB2).
   *   L <= 0: LSGEMM_L8x8_SUB0 sets L = k & 31 and goes to SUB2.
   * NOTE(review): the `ble` after ZERO8x8 consumes CR0 from `srawi.`, so
   * ZERO8x8 must leave CR0 untouched - confirm in the macro file.
   */
  210. MY_ALIGN
  211. LSGEMM_L8x16_END:
  212. LSGEMM_L8x8_BEGIN:
  213. andi. T2, M, 15
  214. ble LSGEMM_L8x1_END /* M is a multiple of 16: nothing left in this panel */
  215. andi. T1, M, 8
  216. ble LSGEMM_L8x8_END
  217. #if defined(TRMMKERNEL)
  218. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8
  219. #else
  220. mr BO, B
  221. #endif
  222. #if defined(TRMMKERNEL)
  223. REFRESH_TEMP_BK T11,K,TEMP_REG,8,8
  224. mr T12, T11
  225. addi T12,T12, -1
  226. srawi. L, T12, 4 /* L = (T11 - 1) >> 4: main-loop trip count; sets CR0 */
  227. #else
  228. mr T12, K
  229. addi T12,T12, -1
  230. srawi. L, T12, 4 /* L = (K - 1) >> 4: main-loop trip count; sets CR0 */
  231. #endif
  232. ZERO8x8
  233. ble LSGEMM_L8x8_SUB0
  234. MY_ALIGN
  235. LSGEMM_L8x8_LOOP_START:
  236. LOAD8x8_0 /*we already zeroed */
  237. mtctr L
  238. MY_ALIGN
  239. LSGEMM_L8x8_LOOP:
  240. KERNEL8x8_I1_L4_2 32,32, 0,0
  241. KERNEL8x8_I1_L4_2 32,32, 1,0
  242. KERNEL8x8_I1_L4_2 32,32, 2,0
  243. KERNEL8x8_I1_L4_2 32,32, 3,1
  244. bdnz LSGEMM_L8x8_LOOP /* decrement CTR, loop while CTR != 0 */
  245. MY_ALIGN
  246. LSGEMM_L8x8_LOOP_END:
  247. END8x8 0, AO, BO, 32, 32
  248. b LSGEMM_L8x8_SUB1
  249. MY_ALIGN
  250. LSGEMM_L8x8_SUB0:
  251. #if defined(TRMMKERNEL)
  252. andi. L, T11, 31
  253. #else
  254. andi. L, K, 31
  255. #endif
  256. b LSGEMM_L8x8_SUB2
  257. MY_ALIGN
  258. LSGEMM_L8x8_SUB1:
  /* both preprocessor branches below are identical */
  259. #if defined(TRMMKERNEL)
  260. andi. L, T12, 15
  261. #else
  262. andi. L, T12, 15
  263. #endif
  264. ble LSGEMM_L8x8_SAVE /* remainder is zero */
  265. MY_ALIGN
  266. LSGEMM_L8x8_SUB2:
  267. srawi. T1,L, 3 /* T1 = L >> 3: trips of the two-macro loop below */
  268. ble LSGEMM_L8x8_SUB2_4
  269. mtctr T1
  270. MY_ALIGN
  271. LSGEMM_L8x8_SUB2_LOOP:
  272. LOAD8x8_0
  273. KERNEL8x8_I1_L4_2 32,32, 0,0
  274. KERNEL8x8_I1_L4_3 32,32, 1,1
  275. bdnz LSGEMM_L8x8_SUB2_LOOP
  276. MY_ALIGN
  277. LSGEMM_L8x8_SUB2_4:
  278. andi. T1,L, 4
  279. ble LSGEMM_L8x8_SUB2_2
  280. LOAD8x8_0
  281. KERNEL8x8_I1_L4_3 32,32, 0,1
  282. MY_ALIGN
  283. LSGEMM_L8x8_SUB2_2:
  284. andi. T1,L, 2
  285. ble LSGEMM_L8x8_SUB2_1
  286. LOAD8x8_0
  287. KERNEL8x8_I1_L2_3 32,32, 0,1
  288. MY_ALIGN
  289. LSGEMM_L8x8_SUB2_1:
  290. andi. T1,L, 1
  291. ble LSGEMM_L8x8_SAVE
  292. KERNEL8x8 0
  293. MY_ALIGN
  294. LSGEMM_L8x8_SAVE:
  295. SAVE8x8
  296. #if defined(TRMMKERNEL)
  297. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8
  298. #endif
  /*
   * LSGEMM_L8x8_END / LSGEMM_L8x4_BEGIN .. LSGEMM_L8x4_SAVE: the 8x4 tile.
   * Same control structure as the 8x8 tile: skipped when (M & 15) == 0 or when
   * bit 4 of M is clear; T12 = k - 1, L = T12 >> 4 main-loop trips, then a
   * bitwise remainder (8s loop, 4, 2, 1).  k is T11 (TRMMKERNEL) or K.
   * NOTE(review): ZERO8x4 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  299. MY_ALIGN
  300. LSGEMM_L8x8_END:
  301. LSGEMM_L8x4_BEGIN:
  302. andi. T2, M, 15
  303. ble LSGEMM_L8x1_END
  304. andi. T1, M, 4
  305. ble LSGEMM_L8x4_END
  306. #if defined(TRMMKERNEL)
  307. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8
  308. #else
  309. mr BO, B
  310. #endif
  311. #if defined(TRMMKERNEL)
  312. REFRESH_TEMP_BK T11,K,TEMP_REG,4,8
  313. mr T12, T11
  314. addi T12,T12, -1
  315. srawi. L, T12, 4 /* L = (T11 - 1) >> 4: main-loop trip count; sets CR0 */
  316. #else
  317. mr T12, K
  318. addi T12,T12, -1
  319. srawi. L, T12, 4 /* L = (K - 1) >> 4: main-loop trip count; sets CR0 */
  320. #endif
  321. ZERO8x4
  322. ble LSGEMM_L8x4_SUB0
  323. MY_ALIGN
  324. LSGEMM_L8x4_LOOP_START:
  325. LOAD8x4_0 /*we already zeroed */
  326. mtctr L
  327. MY_ALIGN
  328. LSGEMM_L8x4_LOOP:
  329. KERNEL8x4_I1_L4_2 16,32, 0,0
  330. KERNEL8x4_I1_L4_2 16,32, 1,0
  331. KERNEL8x4_I1_L4_2 16,32, 2,0
  332. KERNEL8x4_I1_L4_2 16,32, 3,1
  333. bdnz LSGEMM_L8x4_LOOP /* decrement CTR, loop while CTR != 0 */
  334. MY_ALIGN
  335. LSGEMM_L8x4_LOOP_END:
  336. END8x4 0, AO, BO, 16, 32
  337. b LSGEMM_L8x4_SUB1
  338. MY_ALIGN
  339. LSGEMM_L8x4_SUB0:
  340. #if defined(TRMMKERNEL)
  341. andi. L, T11, 31
  342. #else
  343. andi. L, K, 31
  344. #endif
  345. b LSGEMM_L8x4_SUB2
  346. MY_ALIGN
  347. LSGEMM_L8x4_SUB1:
  /* both preprocessor branches below are identical */
  348. #if defined(TRMMKERNEL)
  349. andi. L, T12, 15
  350. #else
  351. andi. L, T12, 15
  352. #endif
  353. ble LSGEMM_L8x4_SAVE /* remainder is zero */
  354. MY_ALIGN
  355. LSGEMM_L8x4_SUB2:
  356. srawi. T1,L, 3 /* T1 = L >> 3: trips of the two-macro loop below */
  357. ble LSGEMM_L8x4_SUB2_4
  358. mtctr T1
  359. MY_ALIGN
  360. LSGEMM_L8x4_SUB2_LOOP:
  361. LOAD8x4_0
  362. KERNEL8x4_I1_L4_2 16,32, 0,0
  363. KERNEL8x4_I1_L4_3 16,32, 1,1
  364. bdnz LSGEMM_L8x4_SUB2_LOOP
  365. MY_ALIGN
  366. LSGEMM_L8x4_SUB2_4:
  367. andi. T1,L, 4
  368. ble LSGEMM_L8x4_SUB2_2
  369. LOAD8x4_0
  370. KERNEL8x4_I1_L4_3 16,32, 0,1
  371. MY_ALIGN
  372. LSGEMM_L8x4_SUB2_2:
  373. andi. T1,L, 2
  374. ble LSGEMM_L8x4_SUB2_1
  375. LOAD8x4_0
  376. KERNEL8x4_I1_L2_3 16,32, 0,1
  377. MY_ALIGN
  378. LSGEMM_L8x4_SUB2_1:
  379. andi. T1,L, 1
  380. ble LSGEMM_L8x4_SAVE
  381. KERNEL8x4 0
  382. MY_ALIGN
  383. LSGEMM_L8x4_SAVE:
  384. SAVE8x4
  385. #if defined(TRMMKERNEL)
  386. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8
  387. #endif
  /*
   * LSGEMM_L8x4_END / LSGEMM_L8x2_BEGIN .. LSGEMM_L8x2_SAVE: the 8x2 tile,
   * run when bit 2 of M is set.  k is T11 (TRMMKERNEL) or K.
   *   L = k >> 3 trips of a loop of four KERNEL8x2_2 invocations, then the
   *   remainder k & 7 is handled bit by bit (4, 2, 1).
   * No LOAD/END macros are used for this tile.
   * NOTE(review): ZERO8x2 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  388. MY_ALIGN
  389. LSGEMM_L8x4_END:
  390. LSGEMM_L8x2_BEGIN:
  391. andi. T1, M, 2
  392. ble LSGEMM_L8x2_END
  393. #if defined(TRMMKERNEL)
  394. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8
  395. #else
  396. mr BO, B
  397. #endif
  398. #if defined(TRMMKERNEL)
  399. REFRESH_TEMP_BK T11,K,TEMP_REG,2,8
  400. srawi. L, T11, 3 /* L = T11 >> 3: main-loop trip count; sets CR0 */
  401. #else
  402. srawi. L, K, 3 /* L = K >> 3: main-loop trip count; sets CR0 */
  403. #endif
  404. ZERO8x2
  405. ble LSGEMM_L8x2_SUB0
  406. MY_ALIGN
  407. LSGEMM_L8x2_LOOP_START:
  408. mtctr L
  409. MY_ALIGN
  410. LSGEMM_L8x2_LOOP:
  411. KERNEL8x2_2 0,0, 0,0
  412. KERNEL8x2_2 0,0, 1,0
  413. KERNEL8x2_2 0,0, 2,0
  414. KERNEL8x2_2 0,0, 3,1
  415. bdnz LSGEMM_L8x2_LOOP /* decrement CTR, loop while CTR != 0 */
  416. MY_ALIGN
  417. LSGEMM_L8x2_LOOP_END:
  418. LSGEMM_L8x2_SUB0:
  419. #if defined(TRMMKERNEL)
  420. andi. L, T11, 7
  421. #else
  422. andi. L, K, 7
  423. #endif
  424. ble LSGEMM_L8x2_SAVE /* remainder is zero */
  425. MY_ALIGN
  426. LSGEMM_L8x2_SUB2:
  427. andi. T1,L, 4
  428. ble LSGEMM_L8x2_SUB2_2
  429. KERNEL8x2_2 0,0, 0,0
  430. KERNEL8x2_2 0,0, 1,1
  431. MY_ALIGN
  432. LSGEMM_L8x2_SUB2_2:
  433. andi. T1,L, 2
  434. ble LSGEMM_L8x2_SUB2_1
  435. KERNEL8x2_2 0,0, 0,1
  436. MY_ALIGN
  437. LSGEMM_L8x2_SUB2_1:
  438. andi. T1,L, 1
  439. ble LSGEMM_L8x2_SAVE
  440. KERNEL8x2
  441. MY_ALIGN
  442. LSGEMM_L8x2_SAVE:
  443. SAVE8x2
  444. #if defined(TRMMKERNEL)
  445. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8
  446. #endif
  /*
   * LSGEMM_L8x2_END / LSGEMM_L8x1_BEGIN .. LSGEMM_L8x1_SAVE: the 8x1 tile,
   * run when bit 1 of M is set.  k is T11 (TRMMKERNEL) or K.
   *   L = k >> 3 trips of a loop of two KERNEL8x1_4 invocations, then the
   *   remainder k & 7 is handled bit by bit (KERNEL8x1_4, _2, plain).
   * NOTE(review): ZERO8x1 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  447. MY_ALIGN
  448. LSGEMM_L8x2_END:
  449. LSGEMM_L8x1_BEGIN:
  450. andi. T1, M, 1
  451. ble LSGEMM_L8x1_END
  452. #if defined(TRMMKERNEL)
  453. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8
  454. #else
  455. mr BO, B
  456. #endif
  457. #if defined(TRMMKERNEL)
  458. REFRESH_TEMP_BK T11,K,TEMP_REG,1,8
  459. srawi. L, T11, 3 /* L = T11 >> 3: main-loop trip count; sets CR0 */
  460. #else
  461. srawi. L, K, 3 /* L = K >> 3: main-loop trip count; sets CR0 */
  462. #endif
  463. ZERO8x1
  464. ble LSGEMM_L8x1_SUB0
  465. MY_ALIGN
  466. LSGEMM_L8x1_LOOP_START:
  467. mtctr L
  468. MY_ALIGN
  469. LSGEMM_L8x1_LOOP:
  470. KERNEL8x1_4 0,0, 0,0
  471. KERNEL8x1_4 0,0, 1,1
  472. bdnz LSGEMM_L8x1_LOOP /* decrement CTR, loop while CTR != 0 */
  473. MY_ALIGN
  474. LSGEMM_L8x1_LOOP_END:
  475. LSGEMM_L8x1_SUB0:
  476. #if defined(TRMMKERNEL)
  477. andi. L, T11, 7
  478. #else
  479. andi. L, K, 7
  480. #endif
  481. ble LSGEMM_L8x1_SAVE /* remainder is zero */
  482. MY_ALIGN
  483. LSGEMM_L8x1_SUB2:
  484. andi. T1,L, 4
  485. ble LSGEMM_L8x1_SUB2_2
  486. KERNEL8x1_4 0,0, 0,1
  487. MY_ALIGN
  488. LSGEMM_L8x1_SUB2_2:
  489. andi. T1,L, 2
  490. ble LSGEMM_L8x1_SUB2_1
  491. KERNEL8x1_2
  492. MY_ALIGN
  493. LSGEMM_L8x1_SUB2_1:
  494. andi. T1,L, 1
  495. ble LSGEMM_L8x1_SAVE
  496. KERNEL8x1
  497. MY_ALIGN
  498. LSGEMM_L8x1_SAVE:
  499. SAVE8x1
  500. #if defined(TRMMKERNEL)
  501. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8
  502. #endif
  /*
   * LSGEMM_L8x1_END: bottom of the LSGEMM_L8_BEGIN loop.  Advances B by
   * K << 5 bytes (presumably 8 columns x 4-byte floats x K - TODO confirm the
   * packing), bumps TEMP_REG by 8 for the non-LEFT TRMM case, and repeats
   * while J > 0.
   */
  503. MY_ALIGN
  504. LSGEMM_L8x1_END:
  505. slwi T1, K, 5 /* T1 = K << 5 */
  506. add B, B, T1 /* B += T1 */
  507. #if defined(TRMMKERNEL) && !defined(LEFT)
  508. addi TEMP_REG, TEMP_REG, 8
  509. #endif
  510. addic. J, J, -1 /* J = J - 1, sets CR0 */
  511. bgt LSGEMM_L8_BEGIN
  /* LSGEMM_L8_END: the 4-wide section of N runs only when bit 4 of N is set. */
  512. LSGEMM_L8_END:
  513. /* b LSGEMM_L4_BEGIN*/
  514. andi. T1, N, 4
  515. ble LSGEMM_L4_END /* andi. result is never negative, so this is "branch if bit clear" */
  /*
   * LSGEMM_L4_BEGIN: setup for the 4-wide section of N (executed at most
   * once; no loop-back branch targets this label in the visible code).
   * Resets AO/CO from A/C, advances C by LDC << 2, then walks M in tiles of
   * 16, 8, 4, 2 and 1.
   */
  516. LSGEMM_L4_BEGIN:
  517. mr AO, A /* AO = A */
  518. mr CO, C /* CO = C */
  519. slwi T3, LDC , 2 /* T3 = LDC << 2 */
  520. add C, C, T3 /* C += T3 */
  521. #if defined(TRMMKERNEL) && defined(LEFT)
  522. mr TEMP_REG, OFFSET /*off = offset;*/
  523. #endif
  524. srawi. I, M, 4 /* I = M >> 4: number of 16-wide tiles */
  525. ble LSGEMM_L4x16_END
  /*
   * LSGEMM_L4x16_BEGIN .. LSGEMM_L4x16_SAVE: the 4x16 tile, repeated I times.
   * k is T11 (TRMMKERNEL) or K.
   *   T12 = k - 1, L = T12 >> 6.
   *   L > 0 : LOAD4x16_0, bias AO by 2112 (= 2048 + 64) and BO by 16, run L
   *           trips of the unrolled loop whose kernels address A with a -2048
   *           offset, END4x16, then the remainder T12 & 63 (SUB1 -> SUB2).
   *   L <= 0: LSGEMM_L4x16_SUB0 sets L = k & 127 and goes to SUB2.
   * SUB2 handles the remainder as a loop over L >> 5 followed by bit tests
   * for 16, 8, 4, 2, 1.
   * NOTE(review): ZERO4x16 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  526. MY_ALIGN
  527. LSGEMM_L4x16_BEGIN:
  528. #if defined(TRMMKERNEL)
  529. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4
  530. #else
  531. mr BO, B
  532. #endif
  533. #if defined(TRMMKERNEL)
  534. REFRESH_TEMP_BK T11,K,TEMP_REG,16,4
  535. mr T12, T11
  536. addi T12,T12, -1
  537. srawi. L, T12, 6 /* L = (T11 - 1) >> 6: main-loop trip count; sets CR0 */
  538. #else
  539. mr T12, K
  540. addi T12,T12, -1
  541. srawi. L, T12, 6 /* L = (K - 1) >> 6: main-loop trip count; sets CR0 */
  542. #endif
  543. ZERO4x16
  544. ble LSGEMM_L4x16_SUB0
  545. MY_ALIGN
  546. LSGEMM_L4x16_LOOP_START:
  547. LOAD4x16_0 /*we already zeroed */
  548. ##OffsetA=64 OffsetB=16
  549. addi AO,AO,2112 /* 2112 = 2048 + 64; the loop kernels use offset -2048 for A */
  550. addi BO,BO,16
  551. mtctr L
  552. MY_ALIGN
  553. LSGEMM_L4x16_LOOP:
  554. KERNEL4x16_I1_L4_2 -2048,0, 0,0
  555. KERNEL4x16_I1_L4_2 -2048,0, 1,0
  556. KERNEL4x16_I1_L4_2 -2048,0, 2,0
  557. KERNEL4x16_I1_L4_2 -2048,0, 3,0
  558. KERNEL4x16_I1_L4_2 -2048,0, 4,0
  559. KERNEL4x16_I1_L4_2 -2048,0, 5,0
  560. KERNEL4x16_I1_L4_2 -2048,0, 6,0
  561. KERNEL4x16_I1_L4_2 -2048,0, 7,0
  562. KERNEL4x16_I1_L4_2 -2048,0, 8,0
  563. KERNEL4x16_I1_L4_2 -2048,0, 9,0
  564. KERNEL4x16_I1_L4_2 -2048,0, 10,0
  565. KERNEL4x16_I1_L4_2 -2048,0, 11,0
  566. KERNEL4x16_I1_L4_2 -2048,0, 12,0
  567. KERNEL4x16_I1_L4_2 -2048,0, 13,0
  568. KERNEL4x16_I1_L4_2 -2048,0, 14,0
  569. KERNEL4x16_I1_L4_2 -2048,0, 15,1
  570. bdnz LSGEMM_L4x16_LOOP /* decrement CTR, loop while CTR != 0 */
  571. MY_ALIGN
  572. LSGEMM_L4x16_LOOP_END:
  573. END4x16 0, AO, BO, -2048, 0
  574. b LSGEMM_L4x16_SUB1
  575. MY_ALIGN
  576. LSGEMM_L4x16_SUB0:
  577. #if defined(TRMMKERNEL)
  578. andi. L, T11, 127
  579. #else
  580. andi. L, K, 127
  581. #endif
  582. b LSGEMM_L4x16_SUB2
  583. MY_ALIGN
  584. LSGEMM_L4x16_SUB1:
  /* both preprocessor branches below are identical */
  585. #if defined(TRMMKERNEL)
  586. andi. L, T12, 63
  587. #else
  588. andi. L, T12, 63
  589. #endif
  590. ble LSGEMM_L4x16_SAVE /* remainder is zero */
  591. MY_ALIGN
  592. LSGEMM_L4x16_SUB2:
  593. srawi. T10,L, 5 /* T10 = L >> 5: trips of the eight-macro loop below */
  594. ble LSGEMM_L4x16_SUB2_16
  595. mtctr T10
  596. MY_ALIGN
  597. LSGEMM_L4x16_SUB2_LOOP:
  598. LOAD4x16_0
  599. KERNEL4x16_I1_L4_2 64,16, 0,0
  600. KERNEL4x16_I1_L4_2 64,16, 1,0
  601. KERNEL4x16_I1_L4_2 64,16, 2,0
  602. KERNEL4x16_I1_L4_2 64,16, 3,0
  603. KERNEL4x16_I1_L4_2 64,16, 4,0
  604. KERNEL4x16_I1_L4_2 64,16, 5,0
  605. KERNEL4x16_I1_L4_2 64,16, 6,0
  606. KERNEL4x16_I1_L4_3 64,16, 7,1
  607. bdnz LSGEMM_L4x16_SUB2_LOOP
  608. MY_ALIGN
  609. LSGEMM_L4x16_SUB2_16:
  610. andi. T10,L, 16
  611. ble LSGEMM_L4x16_SUB2_8
  612. LOAD4x16_0
  613. KERNEL4x16_I1_L4_2 64,16, 0,0
  614. KERNEL4x16_I1_L4_2 64,16, 1,0
  615. KERNEL4x16_I1_L4_2 64,16, 2,0
  616. KERNEL4x16_I1_L4_3 64,16, 3,1
  617. MY_ALIGN
  618. LSGEMM_L4x16_SUB2_8:
  619. andi. T10,L, 8
  620. ble LSGEMM_L4x16_SUB2_4
  621. LOAD4x16_0
  622. KERNEL4x16_I1_L4_2 64,16, 0,0
  623. KERNEL4x16_I1_L4_3 64,16, 1,1
  624. MY_ALIGN
  625. LSGEMM_L4x16_SUB2_4:
  626. andi. T10,L, 4
  627. ble LSGEMM_L4x16_SUB2_2
  628. LOAD4x16_0
  629. KERNEL4x16_I1_L4_3 64,16, 0,1
  630. MY_ALIGN
  631. LSGEMM_L4x16_SUB2_2:
  632. andi. T10,L, 2
  633. ble LSGEMM_L4x16_SUB2_1
  634. LOAD4x16_0
  635. KERNEL4x16_I1_L2_3 64,16, 0,1
  636. MY_ALIGN
  637. LSGEMM_L4x16_SUB2_1:
  638. andi. T10,L, 1
  639. ble LSGEMM_L4x16_SAVE
  640. KERNEL4x16 0
  641. # addic. L, L, -1
  642. # bgt LSGEMM_L4x16_SUB2
  643. MY_ALIGN
  644. LSGEMM_L4x16_SAVE:
  645. SAVE4x16
  646. #if defined(TRMMKERNEL)
  647. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4
  648. #endif
  649. addic. I, I, -1 /* I = I - 1, sets CR0 */
  650. bgt+ LSGEMM_L4x16_BEGIN /* next 16-wide tile; `+` is a predict-taken hint */
  /*
   * LSGEMM_L4x16_END / LSGEMM_L4x8_BEGIN .. LSGEMM_L4x8_SAVE: the 4x8 tile.
   * Skips every remaining tile of this section when (M & 15) == 0, and skips
   * only this tile when bit 8 of M is clear.  k is T11 (TRMMKERNEL) or K.
   *   T12 = k - 1, L = T12 >> 4 main-loop trips; remainder is T12 & 15 after
   *   the main loop, or k & 31 when the main loop was skipped (SUB0).
   * NOTE(review): ZERO4x8 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  651. MY_ALIGN
  652. LSGEMM_L4x16_END:
  653. LSGEMM_L4x8_BEGIN:
  654. andi. T2, M, 15
  655. ble LSGEMM_L4x1_END /* M is a multiple of 16: nothing left in this section */
  656. andi. T1, M, 8
  657. ble LSGEMM_L4x8_END
  658. #if defined(TRMMKERNEL)
  659. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4
  660. #else
  661. mr BO, B
  662. #endif
  663. #if defined(TRMMKERNEL)
  664. REFRESH_TEMP_BK T11,K,TEMP_REG,8,4
  665. mr T12, T11
  666. addi T12,T12, -1
  667. srawi. L, T12, 4 /* L = (T11 - 1) >> 4: main-loop trip count; sets CR0 */
  668. #else
  669. mr T12, K
  670. addi T12,T12, -1
  671. srawi. L, T12, 4 /* L = (K - 1) >> 4: main-loop trip count; sets CR0 */
  672. #endif
  673. ZERO4x8
  674. ble LSGEMM_L4x8_SUB0
  675. MY_ALIGN
  676. LSGEMM_L4x8_LOOP_START:
  677. LOAD4x8_0 /*we already zeroed */
  678. mtctr L
  679. MY_ALIGN
  680. LSGEMM_L4x8_LOOP:
  681. KERNEL4x8_I1_L4_2 32,16, 0,0
  682. KERNEL4x8_I1_L4_2 32,16, 1,0
  683. KERNEL4x8_I1_L4_2 32,16, 2,0
  684. KERNEL4x8_I1_L4_2 32,16, 3,1
  685. bdnz LSGEMM_L4x8_LOOP /* decrement CTR, loop while CTR != 0 */
  686. MY_ALIGN
  687. LSGEMM_L4x8_LOOP_END:
  688. END4x8 0, AO, BO, 32, 16
  689. b LSGEMM_L4x8_SUB1
  690. MY_ALIGN
  691. LSGEMM_L4x8_SUB0:
  692. #if defined(TRMMKERNEL)
  693. andi. L, T11, 31
  694. #else
  695. andi. L, K, 31
  696. #endif
  697. b LSGEMM_L4x8_SUB2
  698. MY_ALIGN
  699. LSGEMM_L4x8_SUB1:
  /* both preprocessor branches below are identical */
  700. #if defined(TRMMKERNEL)
  701. andi. L, T12, 15
  702. #else
  703. andi. L, T12, 15
  704. #endif
  705. ble LSGEMM_L4x8_SAVE /* remainder is zero */
  706. MY_ALIGN
  707. LSGEMM_L4x8_SUB2:
  708. srawi. T1,L, 3 /* T1 = L >> 3: trips of the two-macro loop below */
  709. ble LSGEMM_L4x8_SUB2_4
  710. mtctr T1
  711. MY_ALIGN
  712. LSGEMM_L4x8_SUB2_LOOP:
  713. LOAD4x8_0
  714. KERNEL4x8_I1_L4_2 32,16, 0,0
  715. KERNEL4x8_I1_L4_3 32,16, 1,1
  716. bdnz LSGEMM_L4x8_SUB2_LOOP
  717. MY_ALIGN
  718. LSGEMM_L4x8_SUB2_4:
  719. andi. T1,L, 4
  720. ble LSGEMM_L4x8_SUB2_2
  721. LOAD4x8_0
  722. KERNEL4x8_I1_L4_3 32,16, 0,1
  723. MY_ALIGN
  724. LSGEMM_L4x8_SUB2_2:
  725. andi. T1,L, 2
  726. ble LSGEMM_L4x8_SUB2_1
  727. LOAD4x8_0
  728. KERNEL4x8_I1_L2_3 32,16, 0,1
  729. MY_ALIGN
  730. LSGEMM_L4x8_SUB2_1:
  731. andi. T1,L, 1
  732. ble LSGEMM_L4x8_SAVE
  733. KERNEL4x8 0
  734. MY_ALIGN
  735. LSGEMM_L4x8_SAVE:
  736. SAVE4x8
  737. #if defined(TRMMKERNEL)
  738. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4
  739. #endif
  /*
   * LSGEMM_L4x8_END / LSGEMM_L4x4_BEGIN .. LSGEMM_L4x4_SAVE: the 4x4 tile.
   * Skipped when (M & 15) == 0 or when bit 4 of M is clear.  k is T11
   * (TRMMKERNEL) or K.
   *   T12 = k - 1, L = T12 >> 4 main-loop trips; remainder is T12 & 15 after
   *   the main loop, or k & 31 when the main loop was skipped (SUB0).
   * NOTE(review): ZERO4x4 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  740. MY_ALIGN
  741. LSGEMM_L4x8_END:
  742. LSGEMM_L4x4_BEGIN:
  743. andi. T2, M, 15
  744. ble LSGEMM_L4x1_END
  745. andi. T1, M, 4
  746. ble LSGEMM_L4x4_END
  747. #if defined(TRMMKERNEL)
  748. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4
  749. #else
  750. mr BO, B
  751. #endif
  752. #if defined(TRMMKERNEL)
  753. REFRESH_TEMP_BK T11,K,TEMP_REG,4,4
  754. mr T12, T11
  755. addi T12,T12, -1
  756. srawi. L, T12, 4 /* L = (T11 - 1) >> 4: main-loop trip count; sets CR0 */
  757. #else
  758. mr T12, K
  759. addi T12,T12, -1
  760. srawi. L, T12, 4 /* L = (K - 1) >> 4: main-loop trip count; sets CR0 */
  761. #endif
  762. ZERO4x4
  763. ble LSGEMM_L4x4_SUB0
  764. MY_ALIGN
  765. LSGEMM_L4x4_LOOP_START:
  766. LOAD4x4_0 /*we already zeroed */
  767. mtctr L
  768. MY_ALIGN
  769. LSGEMM_L4x4_LOOP:
  770. KERNEL4x4_I1_L4_2 16,16, 0,0
  771. KERNEL4x4_I1_L4_2 16,16, 1,0
  772. KERNEL4x4_I1_L4_2 16,16, 2,0
  773. KERNEL4x4_I1_L4_2 16,16, 3,1
  774. bdnz LSGEMM_L4x4_LOOP /* decrement CTR, loop while CTR != 0 */
  775. MY_ALIGN
  776. LSGEMM_L4x4_LOOP_END:
  777. END4x4 0, AO, BO, 16, 16
  778. b LSGEMM_L4x4_SUB1
  779. MY_ALIGN
  780. LSGEMM_L4x4_SUB0:
  781. #if defined(TRMMKERNEL)
  782. andi. L, T11, 31
  783. #else
  784. andi. L, K, 31
  785. #endif
  786. b LSGEMM_L4x4_SUB2
  787. MY_ALIGN
  788. LSGEMM_L4x4_SUB1:
  /* both preprocessor branches below are identical */
  789. #if defined(TRMMKERNEL)
  790. andi. L, T12, 15
  791. #else
  792. andi. L, T12, 15
  793. #endif
  794. ble LSGEMM_L4x4_SAVE /* remainder is zero */
  795. MY_ALIGN
  796. LSGEMM_L4x4_SUB2:
  797. srawi. T1,L, 3 /* T1 = L >> 3: trips of the two-macro loop below */
  798. ble LSGEMM_L4x4_SUB2_4
  799. mtctr T1
  800. MY_ALIGN
  801. LSGEMM_L4x4_SUB2_LOOP:
  802. LOAD4x4_0
  803. KERNEL4x4_I1_L4_2 16,16, 0,0
  804. KERNEL4x4_I1_L4_3 16,16, 1,1
  805. bdnz LSGEMM_L4x4_SUB2_LOOP
  806. MY_ALIGN
  807. LSGEMM_L4x4_SUB2_4:
  808. andi. T1,L, 4
  809. ble LSGEMM_L4x4_SUB2_2
  810. LOAD4x4_0
  811. KERNEL4x4_I1_L4_3 16,16, 0,1
  812. MY_ALIGN
  813. LSGEMM_L4x4_SUB2_2:
  814. andi. T1,L, 2
  815. ble LSGEMM_L4x4_SUB2_1
  816. LOAD4x4_0
  817. KERNEL4x4_I1_L2_3 16,16, 0,1
  818. MY_ALIGN
  819. LSGEMM_L4x4_SUB2_1:
  820. andi. T1,L, 1
  821. ble LSGEMM_L4x4_SAVE
  822. KERNEL4x4 0
  823. MY_ALIGN
  824. LSGEMM_L4x4_SAVE:
  825. SAVE4x4
  826. #if defined(TRMMKERNEL)
  827. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4
  828. #endif
  /*
   * LSGEMM_L4x4_END / LSGEMM_L4x2_BEGIN .. LSGEMM_L4x2_SAVE: the 4x2 tile,
   * run when bit 2 of M is set.  k is T11 (TRMMKERNEL) or K.
   *   L = k >> 3 trips of a loop of four KERNEL4x2_2 invocations, then the
   *   remainder k & 7 is handled bit by bit (4, 2, 1).
   * NOTE(review): ZERO4x2 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  829. MY_ALIGN
  830. LSGEMM_L4x4_END:
  831. LSGEMM_L4x2_BEGIN:
  832. andi. T1, M, 2
  833. ble LSGEMM_L4x2_END
  834. #if defined(TRMMKERNEL)
  835. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4
  836. #else
  837. mr BO, B
  838. #endif
  839. #if defined(TRMMKERNEL)
  840. REFRESH_TEMP_BK T11,K,TEMP_REG,2,4
  841. srawi. L, T11, 3 /* L = T11 >> 3: main-loop trip count; sets CR0 */
  842. #else
  843. srawi. L, K, 3 /* L = K >> 3: main-loop trip count; sets CR0 */
  844. #endif
  845. ZERO4x2
  846. ble LSGEMM_L4x2_SUB0
  847. MY_ALIGN
  848. LSGEMM_L4x2_LOOP_START:
  849. mtctr L
  850. MY_ALIGN
  851. LSGEMM_L4x2_LOOP:
  852. KERNEL4x2_2 0,0, 0,0
  853. KERNEL4x2_2 0,0, 1,0
  854. KERNEL4x2_2 0,0, 2,0
  855. KERNEL4x2_2 0,0, 3,1
  856. bdnz LSGEMM_L4x2_LOOP /* decrement CTR, loop while CTR != 0 */
  857. MY_ALIGN
  858. LSGEMM_L4x2_LOOP_END:
  859. LSGEMM_L4x2_SUB0:
  860. #if defined(TRMMKERNEL)
  861. andi. L, T11, 7
  862. #else
  863. andi. L, K, 7
  864. #endif
  865. ble LSGEMM_L4x2_SAVE /* remainder is zero */
  866. MY_ALIGN
  867. LSGEMM_L4x2_SUB2:
  868. andi. T1,L, 4
  869. ble LSGEMM_L4x2_SUB2_2
  870. KERNEL4x2_2 0,0, 0,0
  871. KERNEL4x2_2 0,0, 1,1
  872. MY_ALIGN
  873. LSGEMM_L4x2_SUB2_2:
  874. andi. T1,L, 2
  875. ble LSGEMM_L4x2_SUB2_1
  876. KERNEL4x2_2 0,0, 0,1
  877. MY_ALIGN
  878. LSGEMM_L4x2_SUB2_1:
  879. andi. T1,L, 1
  880. ble LSGEMM_L4x2_SAVE
  881. KERNEL4x2
  882. MY_ALIGN
  883. LSGEMM_L4x2_SAVE:
  884. SAVE4x2
  885. #if defined(TRMMKERNEL)
  886. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4
  887. #endif
  /*
   * LSGEMM_L4x2_END / LSGEMM_L4x1_BEGIN .. LSGEMM_L4x1_SAVE: the 4x1 tile,
   * run when bit 1 of M is set.  k is T11 (TRMMKERNEL) or K.
   *   L = k >> 3 trips of a loop of two KERNEL4x1_4 invocations, then the
   *   remainder k & 7 is handled bit by bit (KERNEL4x1_4, _2, plain).
   * NOTE(review): ZERO4x1 sits between `srawi.` and the `ble` that consumes
   * its CR0 result - confirm the macro leaves CR0 untouched.
   */
  888. MY_ALIGN
  889. LSGEMM_L4x2_END:
  890. LSGEMM_L4x1_BEGIN:
  891. andi. T1, M, 1
  892. ble LSGEMM_L4x1_END
  893. #if defined(TRMMKERNEL)
  894. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4
  895. #else
  896. mr BO, B
  897. #endif
  898. #if defined(TRMMKERNEL)
  899. REFRESH_TEMP_BK T11,K,TEMP_REG,1,4
  900. srawi. L, T11, 3 /* L = T11 >> 3: main-loop trip count; sets CR0 */
  901. #else
  902. srawi. L, K, 3 /* L = K >> 3: main-loop trip count; sets CR0 */
  903. #endif
  904. ZERO4x1
  905. ble LSGEMM_L4x1_SUB0
  906. MY_ALIGN
  907. LSGEMM_L4x1_LOOP_START:
  908. mtctr L
  909. MY_ALIGN
  910. LSGEMM_L4x1_LOOP:
  911. KERNEL4x1_4 0,0, 0,0
  912. KERNEL4x1_4 0,0, 1,1
  913. bdnz LSGEMM_L4x1_LOOP /* decrement CTR, loop while CTR != 0 */
  914. MY_ALIGN
  915. LSGEMM_L4x1_LOOP_END:
  916. LSGEMM_L4x1_SUB0:
  917. #if defined(TRMMKERNEL)
  918. andi. L, T11, 7
  919. #else
  920. andi. L, K, 7
  921. #endif
  922. ble LSGEMM_L4x1_SAVE /* remainder is zero */
  923. MY_ALIGN
  924. LSGEMM_L4x1_SUB2:
  925. andi. T1,L, 4
  926. ble LSGEMM_L4x1_SUB2_2
  927. KERNEL4x1_4 0,0, 0,1
  928. MY_ALIGN
  929. LSGEMM_L4x1_SUB2_2:
  930. andi. T1,L, 2
  931. ble LSGEMM_L4x1_SUB2_1
  932. KERNEL4x1_2
  933. MY_ALIGN
  934. LSGEMM_L4x1_SUB2_1:
  935. andi. T1,L, 1
  936. ble LSGEMM_L4x1_SAVE
  937. KERNEL4x1
  938. MY_ALIGN
  939. LSGEMM_L4x1_SAVE:
  940. SAVE4x1
  941. #if defined(TRMMKERNEL)
  942. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4
  943. #endif
  944. MY_ALIGN
  /*
   * End of the N=4 panel: step B past the panel just consumed, bump the
   * TRMM offset when LEFT is not defined, and leave via .L999 if no
   * N & 3 columns remain.
   */
  945. LSGEMM_L4x1_END:
  /* B += K << 4, i.e. K * 16 bytes (presumably 4 columns x 4-byte single-precision elements). */
  946. slwi T1, K, 4
  947. add B, B, T1
  948. #if defined(TRMMKERNEL) && !defined(LEFT)
  949. addi TEMP_REG, TEMP_REG, 4
  950. #endif
  /* N & 3 == 0: no 2-column or 1-column panel is left. .L999 is defined outside this chunk (presumably the common exit - confirm). */
  951. andi. T2, N, 3
  952. ble .L999
  /*
   * Entry to the N=2 panel.  Skipped entirely when bit 1 of N is clear.
   * Otherwise reset the A cursor, remember the current C position in CO,
   * advance C by 2 * LDC for the next panel, and compute the number of
   * 16-row tiles in I.
   */
  953. LSGEMM_L4_END:
  954. andi. T1, N, 2
  955. ble LSGEMM_L2_END
  956. LSGEMM_L2_BEGIN:
  957. mr AO, A
  958. mr CO, C
  /* C += LDC << 1 (LDC is presumably already scaled to bytes by earlier code - confirm). */
  959. slwi T3, LDC , 1
  960. add C, C, T3
  961. #if defined(TRMMKERNEL) && defined(LEFT)
  962. mr TEMP_REG, OFFSET /* off = offset; */
  963. #endif
  /* I = M >> 4: number of full 16-row tiles; when it is not positive, go to the 8-row test. */
  964. srawi. I, M, 4
  965. ble LSGEMM_L2x16_END
  966. MY_ALIGN
  /*
   * 2x16 tile of the N=2 panel (labels LSGEMM_L2x16_*).  The code from here
   * to the bgt+ at the bottom repeats while I (set by "srawi. I, M, 4"
   * before this label) stays positive.
   * Per tile: set up BO (and T11 under TRMMKERNEL), run ZERO2x16, iterate
   * the unrolled CTR loop L = K >> 6 times (16 x KERNEL2x16_4 per pass),
   * peel the K & 63 remainder bit by bit (32, 16, 8, 4, 2, 1), then SAVE2x16.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  967. LSGEMM_L2x16_BEGIN:
  968. #if defined(TRMMKERNEL)
  969. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2
  970. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  971. mr BO, B
  972. #endif
  973. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  974. REFRESH_TEMP_BK T11,K,TEMP_REG,16,2
  975. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  976. #else
  977. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  978. #endif
  979. ZERO2x16
  /* This ble consumes CR0 from srawi. above; ZERO2x16 is expected to leave CR0 untouched (macro not visible here - confirm). */
  980. ble LSGEMM_L2x16_SUB0
  /*
   * AO is biased by +2048 for the duration of the loop and every loop kernel
   * receives -2048 as its first argument; the bias is removed after bdnz.
   * Presumably that argument is a displacement applied to AO-relative
   * accesses so the two cancel - confirm in the macro definition.
   */
  981. addi AO,AO,2048
  982. mtctr L
  983. MY_ALIGN
  984. LSGEMM_L2x16_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  985. KERNEL2x16_4 -2048,0, 0,0
  986. KERNEL2x16_4 -2048,0, 1,0
  987. KERNEL2x16_4 -2048,0, 2,0
  988. KERNEL2x16_4 -2048,0, 3,0
  989. KERNEL2x16_4 -2048,0, 4,0
  990. KERNEL2x16_4 -2048,0, 5,0
  991. KERNEL2x16_4 -2048,0, 6,0
  992. KERNEL2x16_4 -2048,0, 7,0
  993. KERNEL2x16_4 -2048,0, 8,0
  994. KERNEL2x16_4 -2048,0, 9,0
  995. KERNEL2x16_4 -2048,0, 10,0
  996. KERNEL2x16_4 -2048,0, 11,0
  997. KERNEL2x16_4 -2048,0, 12,0
  998. KERNEL2x16_4 -2048,0, 13,0
  999. KERNEL2x16_4 -2048,0, 14,0
  1000. KERNEL2x16_4 -2048,0, 15,1
  1001. bdnz LSGEMM_L2x16_LOOP
  1002. MY_ALIGN
  /* Undo the +2048 bias applied before the loop. */
  1003. addi AO,AO, -2048
  1004. MY_ALIGN
  1005. LSGEMM_L2x16_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1006. #if defined(TRMMKERNEL)
  1007. andi. L, T11, 63
  1008. #else
  1009. andi. L, K, 63
  1010. #endif
  1011. ble LSGEMM_L2x16_SAVE
  1012. MY_ALIGN
  1013. LSGEMM_L2x16_SUB2:
  /* Each test below isolates one remainder bit into T10; andi. is never negative, so ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL2x16_4. */
  1014. andi. T10,L, 32
  1015. ble LSGEMM_L2x16_SUB2_16
  1016. KERNEL2x16_4 0,0, 0,0
  1017. KERNEL2x16_4 0,0, 1,0
  1018. KERNEL2x16_4 0,0, 2,0
  1019. KERNEL2x16_4 0,0, 3,0
  1020. KERNEL2x16_4 0,0, 4,0
  1021. KERNEL2x16_4 0,0, 5,0
  1022. KERNEL2x16_4 0,0, 6,0
  1023. KERNEL2x16_4 0,0, 7,1
  1024. MY_ALIGN
  1025. LSGEMM_L2x16_SUB2_16:
  /* Bit 16: four KERNEL2x16_4. */
  1026. andi. T10,L, 16
  1027. ble LSGEMM_L2x16_SUB2_8
  1028. KERNEL2x16_4 0,0, 0,0
  1029. KERNEL2x16_4 0,0, 1,0
  1030. KERNEL2x16_4 0,0, 2,0
  1031. KERNEL2x16_4 0,0, 3,1
  1032. MY_ALIGN
  1033. LSGEMM_L2x16_SUB2_8:
  /* Bit 8: two KERNEL2x16_4. */
  1034. andi. T10,L, 8
  1035. ble LSGEMM_L2x16_SUB2_4
  1036. KERNEL2x16_4 0,0, 0,0
  1037. KERNEL2x16_4 0,0, 1,1
  1038. MY_ALIGN
  1039. LSGEMM_L2x16_SUB2_4:
  /* Bit 4: one KERNEL2x16_4. */
  1040. andi. T10,L, 4
  1041. ble LSGEMM_L2x16_SUB2_2
  1042. KERNEL2x16_4 0,0, 0,1
  1043. MY_ALIGN
  1044. LSGEMM_L2x16_SUB2_2:
  /* Bit 2: one KERNEL2x16_2. */
  1045. andi. T10,L, 2
  1046. ble LSGEMM_L2x16_SUB2_1
  1047. KERNEL2x16_2 0,0, 0,1
  1048. MY_ALIGN
  1049. LSGEMM_L2x16_SUB2_1:
  /* Bit 1: one KERNEL2x16. */
  1050. andi. T10,L, 1
  1051. ble LSGEMM_L2x16_SAVE
  1052. KERNEL2x16
  1053. MY_ALIGN
  1054. LSGEMM_L2x16_SAVE:
  1055. SAVE2x16
  1056. #if defined(TRMMKERNEL)
  1057. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2
  1058. #endif
  /* --I; loop back for the next 16-row tile while I > 0 (the "+" suffix hints the branch as taken). */
  1059. addic. I, I, -1
  1060. bgt+ LSGEMM_L2x16_BEGIN
  1061. MY_ALIGN
  /*
   * 2x8 tile of the N=2 panel (labels LSGEMM_L2x8_*): executed once when
   * bit 3 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO2x8, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL2x8_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE2x8.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1062. LSGEMM_L2x16_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1063. andi. I, M, 8
  1064. ble LSGEMM_L2x8_END
  1065. MY_ALIGN
  1066. LSGEMM_L2x8_BEGIN:
  1067. #if defined(TRMMKERNEL)
  1068. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2
  1069. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1070. mr BO, B
  1071. #endif
  1072. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1073. REFRESH_TEMP_BK T11,K,TEMP_REG,8,2
  1074. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1075. #else
  1076. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1077. #endif
  1078. ZERO2x8
  /* This ble consumes CR0 from srawi. above; ZERO2x8 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1079. ble LSGEMM_L2x8_SUB0
  /*
   * AO is biased by +2048 for the duration of the loop and every loop kernel
   * receives -2048 as its first argument; the bias is removed after bdnz.
   * Presumably that argument is a displacement applied to AO-relative
   * accesses so the two cancel - confirm in the macro definition.
   */
  1080. addi AO,AO,2048
  1081. mtctr L
  1082. MY_ALIGN
  1083. LSGEMM_L2x8_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1084. KERNEL2x8_4 -2048,0, 0,0
  1085. KERNEL2x8_4 -2048,0, 1,0
  1086. KERNEL2x8_4 -2048,0, 2,0
  1087. KERNEL2x8_4 -2048,0, 3,0
  1088. KERNEL2x8_4 -2048,0, 4,0
  1089. KERNEL2x8_4 -2048,0, 5,0
  1090. KERNEL2x8_4 -2048,0, 6,0
  1091. KERNEL2x8_4 -2048,0, 7,0
  1092. KERNEL2x8_4 -2048,0, 8,0
  1093. KERNEL2x8_4 -2048,0, 9,0
  1094. KERNEL2x8_4 -2048,0, 10,0
  1095. KERNEL2x8_4 -2048,0, 11,0
  1096. KERNEL2x8_4 -2048,0, 12,0
  1097. KERNEL2x8_4 -2048,0, 13,0
  1098. KERNEL2x8_4 -2048,0, 14,0
  1099. KERNEL2x8_4 -2048,0, 15,1
  1100. bdnz LSGEMM_L2x8_LOOP
  1101. MY_ALIGN
  /* Undo the +2048 bias applied before the loop. */
  1102. addi AO,AO, -2048
  1103. MY_ALIGN
  1104. LSGEMM_L2x8_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1105. #if defined(TRMMKERNEL)
  1106. andi. L, T11, 63
  1107. #else
  1108. andi. L, K, 63
  1109. #endif
  1110. ble LSGEMM_L2x8_SAVE
  1111. MY_ALIGN
  1112. LSGEMM_L2x8_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL2x8_4. */
  1113. andi. T10,L, 32
  1114. ble LSGEMM_L2x8_SUB2_16
  1115. KERNEL2x8_4 0,0, 0,0
  1116. KERNEL2x8_4 0,0, 1,0
  1117. KERNEL2x8_4 0,0, 2,0
  1118. KERNEL2x8_4 0,0, 3,0
  1119. KERNEL2x8_4 0,0, 4,0
  1120. KERNEL2x8_4 0,0, 5,0
  1121. KERNEL2x8_4 0,0, 6,0
  1122. KERNEL2x8_4 0,0, 7,1
  1123. MY_ALIGN
  1124. LSGEMM_L2x8_SUB2_16:
  /* Bit 16: four KERNEL2x8_4. */
  1125. andi. T10,L, 16
  1126. ble LSGEMM_L2x8_SUB2_8
  1127. KERNEL2x8_4 0,0, 0,0
  1128. KERNEL2x8_4 0,0, 1,0
  1129. KERNEL2x8_4 0,0, 2,0
  1130. KERNEL2x8_4 0,0, 3,1
  1131. MY_ALIGN
  1132. LSGEMM_L2x8_SUB2_8:
  /* Bit 8: two KERNEL2x8_4. */
  1133. andi. T10,L, 8
  1134. ble LSGEMM_L2x8_SUB2_4
  1135. KERNEL2x8_4 0,0, 0,0
  1136. KERNEL2x8_4 0,0, 1,1
  1137. MY_ALIGN
  1138. LSGEMM_L2x8_SUB2_4:
  /* Bit 4: one KERNEL2x8_4. */
  1139. andi. T10,L, 4
  1140. ble LSGEMM_L2x8_SUB2_2
  1141. KERNEL2x8_4 0,0, 0,1
  1142. MY_ALIGN
  1143. LSGEMM_L2x8_SUB2_2:
  /* Bit 2: one KERNEL2x8_2. */
  1144. andi. T10,L, 2
  1145. ble LSGEMM_L2x8_SUB2_1
  1146. KERNEL2x8_2 0,0, 0,1
  1147. MY_ALIGN
  1148. LSGEMM_L2x8_SUB2_1:
  /* Bit 1: one KERNEL2x8. */
  1149. andi. T10,L, 1
  1150. ble LSGEMM_L2x8_SAVE
  1151. KERNEL2x8
  1152. MY_ALIGN
  1153. LSGEMM_L2x8_SAVE:
  1154. SAVE2x8
  1155. #if defined(TRMMKERNEL)
  1156. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2
  1157. #endif
  1158. MY_ALIGN
  /*
   * 2x4 tile of the N=2 panel (labels LSGEMM_L2x4_*): executed once when
   * bit 2 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO2x4, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL2x4_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE2x4.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1159. LSGEMM_L2x8_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1160. andi. I, M, 4
  1161. ble LSGEMM_L2x4_END
  1162. MY_ALIGN
  1163. LSGEMM_L2x4_BEGIN:
  1164. #if defined(TRMMKERNEL)
  1165. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2
  1166. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1167. mr BO, B
  1168. #endif
  1169. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1170. REFRESH_TEMP_BK T11,K,TEMP_REG,4,2
  1171. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1172. #else
  1173. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1174. #endif
  1175. ZERO2x4
  /* This ble consumes CR0 from srawi. above; ZERO2x4 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1176. ble LSGEMM_L2x4_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero.  No AO bias is used for this tile size. */
  1177. mtctr L
  1178. MY_ALIGN
  1179. LSGEMM_L2x4_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1180. KERNEL2x4_4 0,0, 0,0
  1181. KERNEL2x4_4 0,0, 1,0
  1182. KERNEL2x4_4 0,0, 2,0
  1183. KERNEL2x4_4 0,0, 3,0
  1184. KERNEL2x4_4 0,0, 4,0
  1185. KERNEL2x4_4 0,0, 5,0
  1186. KERNEL2x4_4 0,0, 6,0
  1187. KERNEL2x4_4 0,0, 7,0
  1188. KERNEL2x4_4 0,0, 8,0
  1189. KERNEL2x4_4 0,0, 9,0
  1190. KERNEL2x4_4 0,0, 10,0
  1191. KERNEL2x4_4 0,0, 11,0
  1192. KERNEL2x4_4 0,0, 12,0
  1193. KERNEL2x4_4 0,0, 13,0
  1194. KERNEL2x4_4 0,0, 14,0
  1195. KERNEL2x4_4 0,0, 15,1
  1196. bdnz LSGEMM_L2x4_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1197. MY_ALIGN
  1198. MY_ALIGN
  1199. LSGEMM_L2x4_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1200. #if defined(TRMMKERNEL)
  1201. andi. L, T11, 63
  1202. #else
  1203. andi. L, K, 63
  1204. #endif
  1205. ble LSGEMM_L2x4_SAVE
  1206. MY_ALIGN
  1207. LSGEMM_L2x4_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL2x4_4. */
  1208. andi. T10,L, 32
  1209. ble LSGEMM_L2x4_SUB2_16
  1210. KERNEL2x4_4 0,0, 0,0
  1211. KERNEL2x4_4 0,0, 1,0
  1212. KERNEL2x4_4 0,0, 2,0
  1213. KERNEL2x4_4 0,0, 3,0
  1214. KERNEL2x4_4 0,0, 4,0
  1215. KERNEL2x4_4 0,0, 5,0
  1216. KERNEL2x4_4 0,0, 6,0
  1217. KERNEL2x4_4 0,0, 7,1
  1218. MY_ALIGN
  1219. LSGEMM_L2x4_SUB2_16:
  /* Bit 16: four KERNEL2x4_4. */
  1220. andi. T10,L, 16
  1221. ble LSGEMM_L2x4_SUB2_8
  1222. KERNEL2x4_4 0,0, 0,0
  1223. KERNEL2x4_4 0,0, 1,0
  1224. KERNEL2x4_4 0,0, 2,0
  1225. KERNEL2x4_4 0,0, 3,1
  1226. MY_ALIGN
  1227. LSGEMM_L2x4_SUB2_8:
  /* Bit 8: two KERNEL2x4_4. */
  1228. andi. T10,L, 8
  1229. ble LSGEMM_L2x4_SUB2_4
  1230. KERNEL2x4_4 0,0, 0,0
  1231. KERNEL2x4_4 0,0, 1,1
  1232. MY_ALIGN
  1233. LSGEMM_L2x4_SUB2_4:
  /* Bit 4: one KERNEL2x4_4. */
  1234. andi. T10,L, 4
  1235. ble LSGEMM_L2x4_SUB2_2
  1236. KERNEL2x4_4 0,0, 0,1
  1237. MY_ALIGN
  1238. LSGEMM_L2x4_SUB2_2:
  /* Bit 2: one KERNEL2x4_2. */
  1239. andi. T10,L, 2
  1240. ble LSGEMM_L2x4_SUB2_1
  1241. KERNEL2x4_2 0,0, 0,1
  1242. MY_ALIGN
  1243. LSGEMM_L2x4_SUB2_1:
  /* Bit 1: one KERNEL2x4. */
  1244. andi. T10,L, 1
  1245. ble LSGEMM_L2x4_SAVE
  1246. KERNEL2x4
  1247. MY_ALIGN
  1248. LSGEMM_L2x4_SAVE:
  1249. SAVE2x4
  1250. #if defined(TRMMKERNEL)
  1251. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2
  1252. #endif
  1253. MY_ALIGN
  /*
   * 2x2 tile of the N=2 panel (labels LSGEMM_L2x2_*): executed once when
   * bit 1 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO2x2, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL2x2_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE2x2.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1254. LSGEMM_L2x4_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1255. andi. I, M, 2
  1256. ble LSGEMM_L2x2_END
  1257. MY_ALIGN
  1258. LSGEMM_L2x2_BEGIN:
  1259. #if defined(TRMMKERNEL)
  1260. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2
  1261. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1262. mr BO, B
  1263. #endif
  1264. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1265. REFRESH_TEMP_BK T11,K,TEMP_REG,2,2
  1266. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1267. #else
  1268. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1269. #endif
  1270. ZERO2x2
  /* This ble consumes CR0 from srawi. above; ZERO2x2 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1271. ble LSGEMM_L2x2_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero.  No AO bias is used for this tile size. */
  1272. mtctr L
  1273. MY_ALIGN
  1274. LSGEMM_L2x2_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1275. KERNEL2x2_4 0,0, 0,0
  1276. KERNEL2x2_4 0,0, 1,0
  1277. KERNEL2x2_4 0,0, 2,0
  1278. KERNEL2x2_4 0,0, 3,0
  1279. KERNEL2x2_4 0,0, 4,0
  1280. KERNEL2x2_4 0,0, 5,0
  1281. KERNEL2x2_4 0,0, 6,0
  1282. KERNEL2x2_4 0,0, 7,0
  1283. KERNEL2x2_4 0,0, 8,0
  1284. KERNEL2x2_4 0,0, 9,0
  1285. KERNEL2x2_4 0,0, 10,0
  1286. KERNEL2x2_4 0,0, 11,0
  1287. KERNEL2x2_4 0,0, 12,0
  1288. KERNEL2x2_4 0,0, 13,0
  1289. KERNEL2x2_4 0,0, 14,0
  1290. KERNEL2x2_4 0,0, 15,1
  1291. bdnz LSGEMM_L2x2_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1292. MY_ALIGN
  1293. MY_ALIGN
  1294. LSGEMM_L2x2_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1295. #if defined(TRMMKERNEL)
  1296. andi. L, T11, 63
  1297. #else
  1298. andi. L, K, 63
  1299. #endif
  1300. ble LSGEMM_L2x2_SAVE
  1301. MY_ALIGN
  1302. LSGEMM_L2x2_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL2x2_4. */
  1303. andi. T10,L, 32
  1304. ble LSGEMM_L2x2_SUB2_16
  1305. KERNEL2x2_4 0,0, 0,0
  1306. KERNEL2x2_4 0,0, 1,0
  1307. KERNEL2x2_4 0,0, 2,0
  1308. KERNEL2x2_4 0,0, 3,0
  1309. KERNEL2x2_4 0,0, 4,0
  1310. KERNEL2x2_4 0,0, 5,0
  1311. KERNEL2x2_4 0,0, 6,0
  1312. KERNEL2x2_4 0,0, 7,1
  1313. MY_ALIGN
  1314. LSGEMM_L2x2_SUB2_16:
  /* Bit 16: four KERNEL2x2_4. */
  1315. andi. T10,L, 16
  1316. ble LSGEMM_L2x2_SUB2_8
  1317. KERNEL2x2_4 0,0, 0,0
  1318. KERNEL2x2_4 0,0, 1,0
  1319. KERNEL2x2_4 0,0, 2,0
  1320. KERNEL2x2_4 0,0, 3,1
  1321. MY_ALIGN
  1322. LSGEMM_L2x2_SUB2_8:
  /* Bit 8: two KERNEL2x2_4. */
  1323. andi. T10,L, 8
  1324. ble LSGEMM_L2x2_SUB2_4
  1325. KERNEL2x2_4 0,0, 0,0
  1326. KERNEL2x2_4 0,0, 1,1
  1327. MY_ALIGN
  1328. LSGEMM_L2x2_SUB2_4:
  /* Bit 4: one KERNEL2x2_4. */
  1329. andi. T10,L, 4
  1330. ble LSGEMM_L2x2_SUB2_2
  1331. KERNEL2x2_4 0,0, 0,1
  1332. MY_ALIGN
  1333. LSGEMM_L2x2_SUB2_2:
  /* Bit 2: one KERNEL2x2_2. */
  1334. andi. T10,L, 2
  1335. ble LSGEMM_L2x2_SUB2_1
  1336. KERNEL2x2_2 0,0, 0,1
  1337. MY_ALIGN
  1338. LSGEMM_L2x2_SUB2_1:
  /* Bit 1: one KERNEL2x2. */
  1339. andi. T10,L, 1
  1340. ble LSGEMM_L2x2_SAVE
  1341. KERNEL2x2
  1342. MY_ALIGN
  1343. LSGEMM_L2x2_SAVE:
  1344. SAVE2x2
  1345. #if defined(TRMMKERNEL)
  1346. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2
  1347. #endif
  1348. MY_ALIGN
  /*
   * 2x1 tile of the N=2 panel (labels LSGEMM_L2x1_*): executed once when
   * bit 0 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO2x1, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL2x1_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE2x1.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1349. LSGEMM_L2x2_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1350. andi. I, M, 1
  1351. ble LSGEMM_L2x1_END
  1352. MY_ALIGN
  1353. LSGEMM_L2x1_BEGIN:
  1354. #if defined(TRMMKERNEL)
  1355. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2
  1356. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1357. mr BO, B
  1358. #endif
  1359. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1360. REFRESH_TEMP_BK T11,K,TEMP_REG,1,2
  1361. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1362. #else
  1363. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1364. #endif
  1365. ZERO2x1
  /* This ble consumes CR0 from srawi. above; ZERO2x1 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1366. ble LSGEMM_L2x1_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero.  No AO bias is used for this tile size. */
  1367. mtctr L
  1368. MY_ALIGN
  1369. LSGEMM_L2x1_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1370. KERNEL2x1_4 0,0, 0,0
  1371. KERNEL2x1_4 0,0, 1,0
  1372. KERNEL2x1_4 0,0, 2,0
  1373. KERNEL2x1_4 0,0, 3,0
  1374. KERNEL2x1_4 0,0, 4,0
  1375. KERNEL2x1_4 0,0, 5,0
  1376. KERNEL2x1_4 0,0, 6,0
  1377. KERNEL2x1_4 0,0, 7,0
  1378. KERNEL2x1_4 0,0, 8,0
  1379. KERNEL2x1_4 0,0, 9,0
  1380. KERNEL2x1_4 0,0, 10,0
  1381. KERNEL2x1_4 0,0, 11,0
  1382. KERNEL2x1_4 0,0, 12,0
  1383. KERNEL2x1_4 0,0, 13,0
  1384. KERNEL2x1_4 0,0, 14,0
  1385. KERNEL2x1_4 0,0, 15,1
  1386. bdnz LSGEMM_L2x1_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1387. MY_ALIGN
  1388. MY_ALIGN
  1389. LSGEMM_L2x1_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1390. #if defined(TRMMKERNEL)
  1391. andi. L, T11, 63
  1392. #else
  1393. andi. L, K, 63
  1394. #endif
  1395. ble LSGEMM_L2x1_SAVE
  1396. MY_ALIGN
  1397. LSGEMM_L2x1_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL2x1_4. */
  1398. andi. T10,L, 32
  1399. ble LSGEMM_L2x1_SUB2_16
  1400. KERNEL2x1_4 0,0, 0,0
  1401. KERNEL2x1_4 0,0, 1,0
  1402. KERNEL2x1_4 0,0, 2,0
  1403. KERNEL2x1_4 0,0, 3,0
  1404. KERNEL2x1_4 0,0, 4,0
  1405. KERNEL2x1_4 0,0, 5,0
  1406. KERNEL2x1_4 0,0, 6,0
  1407. KERNEL2x1_4 0,0, 7,1
  1408. MY_ALIGN
  1409. LSGEMM_L2x1_SUB2_16:
  /* Bit 16: four KERNEL2x1_4. */
  1410. andi. T10,L, 16
  1411. ble LSGEMM_L2x1_SUB2_8
  1412. KERNEL2x1_4 0,0, 0,0
  1413. KERNEL2x1_4 0,0, 1,0
  1414. KERNEL2x1_4 0,0, 2,0
  1415. KERNEL2x1_4 0,0, 3,1
  1416. MY_ALIGN
  1417. LSGEMM_L2x1_SUB2_8:
  /* Bit 8: two KERNEL2x1_4. */
  1418. andi. T10,L, 8
  1419. ble LSGEMM_L2x1_SUB2_4
  1420. KERNEL2x1_4 0,0, 0,0
  1421. KERNEL2x1_4 0,0, 1,1
  1422. MY_ALIGN
  1423. LSGEMM_L2x1_SUB2_4:
  /* Bit 4: one KERNEL2x1_4. */
  1424. andi. T10,L, 4
  1425. ble LSGEMM_L2x1_SUB2_2
  1426. KERNEL2x1_4 0,0, 0,1
  1427. MY_ALIGN
  1428. LSGEMM_L2x1_SUB2_2:
  /* Bit 2: one KERNEL2x1_2. */
  1429. andi. T10,L, 2
  1430. ble LSGEMM_L2x1_SUB2_1
  1431. KERNEL2x1_2 0,0, 0,1
  1432. MY_ALIGN
  1433. LSGEMM_L2x1_SUB2_1:
  /* Bit 1: one KERNEL2x1. */
  1434. andi. T10,L, 1
  1435. ble LSGEMM_L2x1_SAVE
  1436. KERNEL2x1
  1437. MY_ALIGN
  1438. LSGEMM_L2x1_SAVE:
  1439. SAVE2x1
  1440. #if defined(TRMMKERNEL)
  1441. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2
  1442. #endif
  1443. MY_ALIGN
  /*
   * End of the N=2 panel: step B past the panel just consumed and bump the
   * TRMM offset when LEFT is not defined.
   */
  1444. LSGEMM_L2x1_END:
  /* B += K << 3, i.e. K * 8 bytes (presumably 2 columns x 4-byte single-precision elements). */
  1445. slwi T1, K, 3
  1446. add B, B, T1
  1447. #if defined(TRMMKERNEL) && !defined(LEFT)
  1448. addi TEMP_REG, TEMP_REG, 2
  1449. #endif
  /*
   * Entry to the N=1 panel.  Skipped entirely when bit 0 of N is clear.
   * Otherwise reset the A cursor, remember the current C position in CO,
   * advance C by LDC, and compute the number of 16-row tiles in I.
   */
  1450. LSGEMM_L2_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1451. andi. T1, N, 1
  1452. ble LSGEMM_END
  1453. LSGEMM_1_BEGIN:
  1454. mr AO, A
  1455. mr CO, C
  /* C += LDC (LDC is presumably already scaled to bytes by earlier code - confirm). */
  1456. add C, C, LDC
  1457. #if defined(TRMMKERNEL) && defined(LEFT)
  1458. mr TEMP_REG, OFFSET /* off = offset; */
  1459. #endif
  /* I = M >> 4: number of full 16-row tiles; when it is not positive, go to the 8-row test. */
  1460. srawi. I, M, 4
  1461. ble LSGEMM_1x16_END
  1462. MY_ALIGN
  /*
   * 1x16 tile of the N=1 panel (labels LSGEMM_1x16_*).  The code from here
   * to the bgt+ at the bottom repeats while I (set by "srawi. I, M, 4"
   * before this label) stays positive.
   * Per tile: set up BO (and T11 under TRMMKERNEL), run ZERO1x16, iterate
   * the unrolled CTR loop L = K >> 6 times (16 x KERNEL1x16_4 per pass),
   * peel the K & 63 remainder bit by bit (32, 16, 8, 4, 2, 1), then SAVE1x16.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1463. LSGEMM_1x16_BEGIN:
  1464. #if defined(TRMMKERNEL)
  1465. REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1
  1466. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1467. mr BO, B
  1468. #endif
  1469. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1470. REFRESH_TEMP_BK T11,K,TEMP_REG,16,1
  1471. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1472. #else
  1473. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1474. #endif
  1475. ZERO1x16
  /* This ble consumes CR0 from srawi. above; ZERO1x16 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1476. ble LSGEMM_1x16_SUB0
  /*
   * AO is biased by +2048 for the duration of the loop and every loop kernel
   * receives -2048 as its first argument; the bias is removed after bdnz.
   * Presumably that argument is a displacement applied to AO-relative
   * accesses so the two cancel - confirm in the macro definition.
   */
  1477. addi AO,AO,2048
  1478. mtctr L
  1479. MY_ALIGN
  1480. LSGEMM_1x16_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1481. KERNEL1x16_4 -2048,0, 0,0
  1482. KERNEL1x16_4 -2048,0, 1,0
  1483. KERNEL1x16_4 -2048,0, 2,0
  1484. KERNEL1x16_4 -2048,0, 3,0
  1485. KERNEL1x16_4 -2048,0, 4,0
  1486. KERNEL1x16_4 -2048,0, 5,0
  1487. KERNEL1x16_4 -2048,0, 6,0
  1488. KERNEL1x16_4 -2048,0, 7,0
  1489. KERNEL1x16_4 -2048,0, 8,0
  1490. KERNEL1x16_4 -2048,0, 9,0
  1491. KERNEL1x16_4 -2048,0, 10,0
  1492. KERNEL1x16_4 -2048,0, 11,0
  1493. KERNEL1x16_4 -2048,0, 12,0
  1494. KERNEL1x16_4 -2048,0, 13,0
  1495. KERNEL1x16_4 -2048,0, 14,0
  1496. KERNEL1x16_4 -2048,0, 15,1
  1497. bdnz LSGEMM_1x16_LOOP
  1498. MY_ALIGN
  /* Undo the +2048 bias applied before the loop. */
  1499. addi AO,AO, -2048
  1500. MY_ALIGN
  1501. LSGEMM_1x16_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1502. #if defined(TRMMKERNEL)
  1503. andi. L, T11, 63
  1504. #else
  1505. andi. L, K, 63
  1506. #endif
  1507. ble LSGEMM_1x16_SAVE
  1508. MY_ALIGN
  1509. LSGEMM_1x16_SUB2:
  /* Each test below isolates one remainder bit into T10; andi. is never negative, so ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL1x16_4. */
  1510. andi. T10,L, 32
  1511. ble LSGEMM_1x16_SUB2_16
  1512. KERNEL1x16_4 0,0, 0,0
  1513. KERNEL1x16_4 0,0, 1,0
  1514. KERNEL1x16_4 0,0, 2,0
  1515. KERNEL1x16_4 0,0, 3,0
  1516. KERNEL1x16_4 0,0, 4,0
  1517. KERNEL1x16_4 0,0, 5,0
  1518. KERNEL1x16_4 0,0, 6,0
  1519. KERNEL1x16_4 0,0, 7,1
  1520. MY_ALIGN
  1521. LSGEMM_1x16_SUB2_16:
  /* Bit 16: four KERNEL1x16_4. */
  1522. andi. T10,L, 16
  1523. ble LSGEMM_1x16_SUB2_8
  1524. KERNEL1x16_4 0,0, 0,0
  1525. KERNEL1x16_4 0,0, 1,0
  1526. KERNEL1x16_4 0,0, 2,0
  1527. KERNEL1x16_4 0,0, 3,1
  1528. MY_ALIGN
  1529. LSGEMM_1x16_SUB2_8:
  /* Bit 8: two KERNEL1x16_4. */
  1530. andi. T10,L, 8
  1531. ble LSGEMM_1x16_SUB2_4
  1532. KERNEL1x16_4 0,0, 0,0
  1533. KERNEL1x16_4 0,0, 1,1
  1534. MY_ALIGN
  1535. LSGEMM_1x16_SUB2_4:
  /* Bit 4: one KERNEL1x16_4. */
  1536. andi. T10,L, 4
  1537. ble LSGEMM_1x16_SUB2_2
  1538. KERNEL1x16_4 0,0, 0,1
  1539. MY_ALIGN
  1540. LSGEMM_1x16_SUB2_2:
  /* Bit 2: one KERNEL1x16_2. */
  1541. andi. T10,L, 2
  1542. ble LSGEMM_1x16_SUB2_1
  1543. KERNEL1x16_2 0,0, 0,1
  1544. MY_ALIGN
  1545. LSGEMM_1x16_SUB2_1:
  /* Bit 1: one KERNEL1x16. */
  1546. andi. T10,L, 1
  1547. ble LSGEMM_1x16_SAVE
  1548. KERNEL1x16
  1549. MY_ALIGN
  1550. LSGEMM_1x16_SAVE:
  1551. SAVE1x16
  1552. #if defined(TRMMKERNEL)
  1553. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1
  1554. #endif
  /* --I; loop back for the next 16-row tile while I > 0 (the "+" suffix hints the branch as taken). */
  1555. addic. I, I, -1
  1556. bgt+ LSGEMM_1x16_BEGIN
  1557. MY_ALIGN
  /*
   * 1x8 tile of the N=1 panel (labels LSGEMM_1x8_*): executed once when
   * bit 3 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO1x8, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL1x8_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE1x8.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1558. LSGEMM_1x16_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1559. andi. I, M, 8
  1560. ble LSGEMM_1x8_END
  1561. MY_ALIGN
  1562. LSGEMM_1x8_BEGIN:
  1563. #if defined(TRMMKERNEL)
  1564. REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1
  1565. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1566. mr BO, B
  1567. #endif
  1568. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1569. REFRESH_TEMP_BK T11,K,TEMP_REG,8,1
  1570. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1571. #else
  1572. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1573. #endif
  1574. ZERO1x8
  /* This ble consumes CR0 from srawi. above; ZERO1x8 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1575. ble LSGEMM_1x8_SUB0
  /*
   * AO is biased by +2048 for the duration of the loop and every loop kernel
   * receives -2048 as its first argument; the bias is removed after bdnz.
   * Presumably that argument is a displacement applied to AO-relative
   * accesses so the two cancel - confirm in the macro definition.
   */
  1576. addi AO,AO,2048
  1577. mtctr L
  1578. MY_ALIGN
  1579. LSGEMM_1x8_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1580. KERNEL1x8_4 -2048,0, 0,0
  1581. KERNEL1x8_4 -2048,0, 1,0
  1582. KERNEL1x8_4 -2048,0, 2,0
  1583. KERNEL1x8_4 -2048,0, 3,0
  1584. KERNEL1x8_4 -2048,0, 4,0
  1585. KERNEL1x8_4 -2048,0, 5,0
  1586. KERNEL1x8_4 -2048,0, 6,0
  1587. KERNEL1x8_4 -2048,0, 7,0
  1588. KERNEL1x8_4 -2048,0, 8,0
  1589. KERNEL1x8_4 -2048,0, 9,0
  1590. KERNEL1x8_4 -2048,0, 10,0
  1591. KERNEL1x8_4 -2048,0, 11,0
  1592. KERNEL1x8_4 -2048,0, 12,0
  1593. KERNEL1x8_4 -2048,0, 13,0
  1594. KERNEL1x8_4 -2048,0, 14,0
  1595. KERNEL1x8_4 -2048,0, 15,1
  1596. bdnz LSGEMM_1x8_LOOP
  1597. MY_ALIGN
  /* Undo the +2048 bias applied before the loop. */
  1598. addi AO,AO, -2048
  1599. MY_ALIGN
  1600. LSGEMM_1x8_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1601. #if defined(TRMMKERNEL)
  1602. andi. L, T11, 63
  1603. #else
  1604. andi. L, K, 63
  1605. #endif
  1606. ble LSGEMM_1x8_SAVE
  1607. MY_ALIGN
  1608. LSGEMM_1x8_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL1x8_4. */
  1609. andi. T10,L, 32
  1610. ble LSGEMM_1x8_SUB2_16
  1611. KERNEL1x8_4 0,0, 0,0
  1612. KERNEL1x8_4 0,0, 1,0
  1613. KERNEL1x8_4 0,0, 2,0
  1614. KERNEL1x8_4 0,0, 3,0
  1615. KERNEL1x8_4 0,0, 4,0
  1616. KERNEL1x8_4 0,0, 5,0
  1617. KERNEL1x8_4 0,0, 6,0
  1618. KERNEL1x8_4 0,0, 7,1
  1619. MY_ALIGN
  1620. LSGEMM_1x8_SUB2_16:
  /* Bit 16: four KERNEL1x8_4. */
  1621. andi. T10,L, 16
  1622. ble LSGEMM_1x8_SUB2_8
  1623. KERNEL1x8_4 0,0, 0,0
  1624. KERNEL1x8_4 0,0, 1,0
  1625. KERNEL1x8_4 0,0, 2,0
  1626. KERNEL1x8_4 0,0, 3,1
  1627. MY_ALIGN
  1628. LSGEMM_1x8_SUB2_8:
  /* Bit 8: two KERNEL1x8_4. */
  1629. andi. T10,L, 8
  1630. ble LSGEMM_1x8_SUB2_4
  1631. KERNEL1x8_4 0,0, 0,0
  1632. KERNEL1x8_4 0,0, 1,1
  1633. MY_ALIGN
  1634. LSGEMM_1x8_SUB2_4:
  /* Bit 4: one KERNEL1x8_4. */
  1635. andi. T10,L, 4
  1636. ble LSGEMM_1x8_SUB2_2
  1637. KERNEL1x8_4 0,0, 0,1
  1638. MY_ALIGN
  1639. LSGEMM_1x8_SUB2_2:
  /* Bit 2: one KERNEL1x8_2. */
  1640. andi. T10,L, 2
  1641. ble LSGEMM_1x8_SUB2_1
  1642. KERNEL1x8_2 0,0, 0,1
  1643. MY_ALIGN
  1644. LSGEMM_1x8_SUB2_1:
  /* Bit 1: one KERNEL1x8. */
  1645. andi. T10,L, 1
  1646. ble LSGEMM_1x8_SAVE
  1647. KERNEL1x8
  1648. MY_ALIGN
  1649. LSGEMM_1x8_SAVE:
  1650. SAVE1x8
  1651. #if defined(TRMMKERNEL)
  1652. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1
  1653. #endif
  1654. MY_ALIGN
  /*
   * 1x4 tile of the N=1 panel (labels LSGEMM_1x4_*): executed once when
   * bit 2 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO1x4, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL1x4_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE1x4.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1655. LSGEMM_1x8_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1656. andi. I, M, 4
  1657. ble LSGEMM_1x4_END
  1658. MY_ALIGN
  1659. LSGEMM_1x4_BEGIN:
  1660. #if defined(TRMMKERNEL)
  1661. REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1
  1662. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1663. mr BO, B
  1664. #endif
  1665. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1666. REFRESH_TEMP_BK T11,K,TEMP_REG,4,1
  1667. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1668. #else
  1669. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1670. #endif
  1671. ZERO1x4
  /* This ble consumes CR0 from srawi. above; ZERO1x4 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1672. ble LSGEMM_1x4_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero.  No AO bias is used for this tile size. */
  1673. mtctr L
  1674. MY_ALIGN
  1675. LSGEMM_1x4_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1676. KERNEL1x4_4 0,0, 0,0
  1677. KERNEL1x4_4 0,0, 1,0
  1678. KERNEL1x4_4 0,0, 2,0
  1679. KERNEL1x4_4 0,0, 3,0
  1680. KERNEL1x4_4 0,0, 4,0
  1681. KERNEL1x4_4 0,0, 5,0
  1682. KERNEL1x4_4 0,0, 6,0
  1683. KERNEL1x4_4 0,0, 7,0
  1684. KERNEL1x4_4 0,0, 8,0
  1685. KERNEL1x4_4 0,0, 9,0
  1686. KERNEL1x4_4 0,0, 10,0
  1687. KERNEL1x4_4 0,0, 11,0
  1688. KERNEL1x4_4 0,0, 12,0
  1689. KERNEL1x4_4 0,0, 13,0
  1690. KERNEL1x4_4 0,0, 14,0
  1691. KERNEL1x4_4 0,0, 15,1
  1692. bdnz LSGEMM_1x4_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1693. MY_ALIGN
  1694. MY_ALIGN
  1695. LSGEMM_1x4_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1696. #if defined(TRMMKERNEL)
  1697. andi. L, T11, 63
  1698. #else
  1699. andi. L, K, 63
  1700. #endif
  1701. ble LSGEMM_1x4_SAVE
  1702. MY_ALIGN
  1703. LSGEMM_1x4_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL1x4_4. */
  1704. andi. T10,L, 32
  1705. ble LSGEMM_1x4_SUB2_16
  1706. KERNEL1x4_4 0,0, 0,0
  1707. KERNEL1x4_4 0,0, 1,0
  1708. KERNEL1x4_4 0,0, 2,0
  1709. KERNEL1x4_4 0,0, 3,0
  1710. KERNEL1x4_4 0,0, 4,0
  1711. KERNEL1x4_4 0,0, 5,0
  1712. KERNEL1x4_4 0,0, 6,0
  1713. KERNEL1x4_4 0,0, 7,1
  1714. MY_ALIGN
  1715. LSGEMM_1x4_SUB2_16:
  /* Bit 16: four KERNEL1x4_4. */
  1716. andi. T10,L, 16
  1717. ble LSGEMM_1x4_SUB2_8
  1718. KERNEL1x4_4 0,0, 0,0
  1719. KERNEL1x4_4 0,0, 1,0
  1720. KERNEL1x4_4 0,0, 2,0
  1721. KERNEL1x4_4 0,0, 3,1
  1722. MY_ALIGN
  1723. LSGEMM_1x4_SUB2_8:
  /* Bit 8: two KERNEL1x4_4. */
  1724. andi. T10,L, 8
  1725. ble LSGEMM_1x4_SUB2_4
  1726. KERNEL1x4_4 0,0, 0,0
  1727. KERNEL1x4_4 0,0, 1,1
  1728. MY_ALIGN
  1729. LSGEMM_1x4_SUB2_4:
  /* Bit 4: one KERNEL1x4_4. */
  1730. andi. T10,L, 4
  1731. ble LSGEMM_1x4_SUB2_2
  1732. KERNEL1x4_4 0,0, 0,1
  1733. MY_ALIGN
  1734. LSGEMM_1x4_SUB2_2:
  /* Bit 2: one KERNEL1x4_2. */
  1735. andi. T10,L, 2
  1736. ble LSGEMM_1x4_SUB2_1
  1737. KERNEL1x4_2 0,0, 0,1
  1738. MY_ALIGN
  1739. LSGEMM_1x4_SUB2_1:
  /* Bit 1: one KERNEL1x4. */
  1740. andi. T10,L, 1
  1741. ble LSGEMM_1x4_SAVE
  1742. KERNEL1x4
  1743. MY_ALIGN
  1744. LSGEMM_1x4_SAVE:
  1745. SAVE1x4
  1746. #if defined(TRMMKERNEL)
  1747. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1
  1748. #endif
  1749. MY_ALIGN
  /*
   * 1x2 tile of the N=1 panel (labels LSGEMM_1x2_*): executed once when
   * bit 1 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO1x2, iterate the unrolled CTR loop L = K >> 6 times
   * (16 x KERNEL1x2_4 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1), then SAVE1x2.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1750. LSGEMM_1x4_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1751. andi. I, M, 2
  1752. ble LSGEMM_1x2_END
  1753. MY_ALIGN
  1754. LSGEMM_1x2_BEGIN:
  1755. #if defined(TRMMKERNEL)
  1756. REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1
  1757. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1758. mr BO, B
  1759. #endif
  1760. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1761. REFRESH_TEMP_BK T11,K,TEMP_REG,2,1
  1762. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1763. #else
  1764. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1765. #endif
  1766. ZERO1x2
  /* This ble consumes CR0 from srawi. above; ZERO1x2 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1767. ble LSGEMM_1x2_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero.  No AO bias is used for this tile size. */
  1768. mtctr L
  1769. MY_ALIGN
  1770. LSGEMM_1x2_LOOP:
  /* Third argument counts 0..15; the fourth is 1 only on the final invocation (presumably an "is last" flag). */
  1771. KERNEL1x2_4 0,0, 0,0
  1772. KERNEL1x2_4 0,0, 1,0
  1773. KERNEL1x2_4 0,0, 2,0
  1774. KERNEL1x2_4 0,0, 3,0
  1775. KERNEL1x2_4 0,0, 4,0
  1776. KERNEL1x2_4 0,0, 5,0
  1777. KERNEL1x2_4 0,0, 6,0
  1778. KERNEL1x2_4 0,0, 7,0
  1779. KERNEL1x2_4 0,0, 8,0
  1780. KERNEL1x2_4 0,0, 9,0
  1781. KERNEL1x2_4 0,0, 10,0
  1782. KERNEL1x2_4 0,0, 11,0
  1783. KERNEL1x2_4 0,0, 12,0
  1784. KERNEL1x2_4 0,0, 13,0
  1785. KERNEL1x2_4 0,0, 14,0
  1786. KERNEL1x2_4 0,0, 15,1
  1787. bdnz LSGEMM_1x2_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1788. MY_ALIGN
  1789. MY_ALIGN
  1790. LSGEMM_1x2_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1791. #if defined(TRMMKERNEL)
  1792. andi. L, T11, 63
  1793. #else
  1794. andi. L, K, 63
  1795. #endif
  1796. ble LSGEMM_1x2_SAVE
  1797. MY_ALIGN
  1798. LSGEMM_1x2_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: eight KERNEL1x2_4. */
  1799. andi. T10,L, 32
  1800. ble LSGEMM_1x2_SUB2_16
  1801. KERNEL1x2_4 0,0, 0,0
  1802. KERNEL1x2_4 0,0, 1,0
  1803. KERNEL1x2_4 0,0, 2,0
  1804. KERNEL1x2_4 0,0, 3,0
  1805. KERNEL1x2_4 0,0, 4,0
  1806. KERNEL1x2_4 0,0, 5,0
  1807. KERNEL1x2_4 0,0, 6,0
  1808. KERNEL1x2_4 0,0, 7,1
  1809. MY_ALIGN
  1810. LSGEMM_1x2_SUB2_16:
  /* Bit 16: four KERNEL1x2_4. */
  1811. andi. T10,L, 16
  1812. ble LSGEMM_1x2_SUB2_8
  1813. KERNEL1x2_4 0,0, 0,0
  1814. KERNEL1x2_4 0,0, 1,0
  1815. KERNEL1x2_4 0,0, 2,0
  1816. KERNEL1x2_4 0,0, 3,1
  1817. MY_ALIGN
  1818. LSGEMM_1x2_SUB2_8:
  /* Bit 8: two KERNEL1x2_4. */
  1819. andi. T10,L, 8
  1820. ble LSGEMM_1x2_SUB2_4
  1821. KERNEL1x2_4 0,0, 0,0
  1822. KERNEL1x2_4 0,0, 1,1
  1823. MY_ALIGN
  1824. LSGEMM_1x2_SUB2_4:
  /* Bit 4: one KERNEL1x2_4. */
  1825. andi. T10,L, 4
  1826. ble LSGEMM_1x2_SUB2_2
  1827. KERNEL1x2_4 0,0, 0,1
  1828. MY_ALIGN
  1829. LSGEMM_1x2_SUB2_2:
  /* Bit 2: one KERNEL1x2_2. */
  1830. andi. T10,L, 2
  1831. ble LSGEMM_1x2_SUB2_1
  1832. KERNEL1x2_2 0,0, 0,1
  1833. MY_ALIGN
  1834. LSGEMM_1x2_SUB2_1:
  /* Bit 1: one KERNEL1x2. */
  1835. andi. T10,L, 1
  1836. ble LSGEMM_1x2_SAVE
  1837. KERNEL1x2
  1838. MY_ALIGN
  1839. LSGEMM_1x2_SAVE:
  1840. SAVE1x2
  1841. #if defined(TRMMKERNEL)
  1842. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1
  1843. #endif
  1844. MY_ALIGN
  /*
   * 1x1 tile of the N=1 panel (labels LSGEMM_1x1_*): executed once when
   * bit 0 of M is set.  Flow: set up BO (and T11 under TRMMKERNEL), run
   * ZERO1x1, iterate the unrolled CTR loop L = K >> 6 times
   * (4 x KERNEL1x1_16 per pass), peel the K & 63 remainder bit by bit
   * (32, 16, 8, 4, 2, 1) using progressively smaller kernel macros, then
   * SAVE1x1.
   * The ZERO/KERNEL/SAVE/REFRESH macros are defined outside this chunk.
   */
  1845. LSGEMM_1x2_END:
  /* andi. never yields a negative result, so the ble below acts as "branch if zero". */
  1846. andi. I, M, 1
  1847. ble LSGEMM_1x1_END
  1848. MY_ALIGN
  1849. LSGEMM_1x1_BEGIN:
  1850. #if defined(TRMMKERNEL)
  1851. REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1
  1852. #else
  /* Plain GEMM: BO restarts at the beginning of the current B panel. */
  1853. mr BO, B
  1854. #endif
  1855. #if defined(TRMMKERNEL)
  /* Under TRMMKERNEL, T11 replaces K as the inner-loop length in this tile. */
  1856. REFRESH_TEMP_BK T11,K,TEMP_REG,1,1
  1857. srawi. L, T11, 6 /* L = T11 >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1858. #else
  1859. srawi. L, K, 6 /* L = K >> 6: count of full 64-step passes (a quotient, not a remainder); sets CR0 */
  1860. #endif
  1861. ZERO1x1
  /* This ble consumes CR0 from srawi. above; ZERO1x1 is expected to leave CR0 untouched (macro not visible here - confirm). */
  1862. ble LSGEMM_1x1_SUB0
  /* CTR = L; bdnz decrements CTR and loops while it is non-zero. */
  1863. mtctr L
  1864. MY_ALIGN
  1865. LSGEMM_1x1_LOOP:
  /* Four KERNEL1x1_16 per pass; the fourth argument is 1 only on the final invocation (presumably an "is last" flag). */
  1866. KERNEL1x1_16 0,0, 0,0
  1867. KERNEL1x1_16 0,0, 1,0
  1868. KERNEL1x1_16 0,0, 2,0
  1869. KERNEL1x1_16 0,0, 3,1
  1870. bdnz LSGEMM_1x1_LOOP
  /* NOTE(review): two consecutive MY_ALIGN; the second looks redundant. */
  1871. MY_ALIGN
  1872. MY_ALIGN
  1873. LSGEMM_1x1_SUB0:
  /* Remainder: L = (T11 or K) & 63; skip straight to the save when it is zero. */
  1874. #if defined(TRMMKERNEL)
  1875. andi. L, T11, 63
  1876. #else
  1877. andi. L, K, 63
  1878. #endif
  1879. ble LSGEMM_1x1_SAVE
  1880. MY_ALIGN
  1881. LSGEMM_1x1_SUB2:
  /* Each test below isolates one remainder bit into T10; ble skips the group when the bit is clear. */
  /* Bit 32: two KERNEL1x1_16. */
  1882. andi. T10,L, 32
  1883. ble LSGEMM_1x1_SUB2_16
  1884. KERNEL1x1_16 0,0, 0,0
  1885. KERNEL1x1_16 0,0, 1,1
  1886. MY_ALIGN
  1887. LSGEMM_1x1_SUB2_16:
  /* Bit 16: one KERNEL1x1_16. */
  1888. andi. T10,L, 16
  1889. ble LSGEMM_1x1_SUB2_8
  1890. KERNEL1x1_16 0,0, 0,1
  1891. MY_ALIGN
  1892. LSGEMM_1x1_SUB2_8:
  /* Bit 8: one KERNEL1x1_8. */
  1893. andi. T10,L, 8
  1894. ble LSGEMM_1x1_SUB2_4
  1895. KERNEL1x1_8 0,0, 0,1
  1896. MY_ALIGN
  1897. LSGEMM_1x1_SUB2_4:
  /* Bit 4: one KERNEL1x1_4. */
  1898. andi. T10,L, 4
  1899. ble LSGEMM_1x1_SUB2_2
  1900. KERNEL1x1_4 0,0, 0,1
  1901. MY_ALIGN
  1902. LSGEMM_1x1_SUB2_2:
  /* Bit 2: one KERNEL1x1_2. */
  1903. andi. T10,L, 2
  1904. ble LSGEMM_1x1_SUB2_1
  1905. KERNEL1x1_2 0,0, 0,1
  1906. MY_ALIGN
  1907. LSGEMM_1x1_SUB2_1:
  /* Bit 1: one KERNEL1x1. */
  1908. andi. T10,L, 1
  1909. ble LSGEMM_1x1_SAVE
  1910. KERNEL1x1
  1911. MY_ALIGN
  1912. LSGEMM_1x1_SAVE:
  1913. SAVE1x1
  1914. #if defined(TRMMKERNEL)
  1915. REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1
  1916. #endif
  1917. MY_ALIGN
  /*
   * End of the N=1 panel: step B past the panel just consumed and bump the
   * TRMM offset when LEFT is not defined.
   */
  1918. LSGEMM_1x1_END:
  /* B += K << 2, i.e. K * 4 bytes (presumably 1 column x 4-byte single-precision elements). */
  1919. slwi T1, K, 2
  1920. add B, B, T1
  1921. #if defined(TRMMKERNEL) && !defined(LEFT)
  1922. addi TEMP_REG, TEMP_REG, 1
  1923. #endif
  1924. LSGEMM_END: