You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_logic_16x8_power8.S 29 kB

9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/21 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  /*
   * Panel loop head: J = N >> 3 passes (label family SGEMM_L8_*); the whole
   * loop is bypassed when J <= 0.
   * Each pass starts by running the COPYB_* macros with BO = B and
   * BBO = BBUFFER. Those macros are defined outside this section; presumably
   * they copy/expand B into BBUFFER for the kernels below -- TODO confirm.
   */
  34. srawi. J, N, 3
  35. ble SGEMM_L8_END
  36. SGEMM_L8_BEGIN:
  37. mr BO, B
  38. mr BBO, BBUFFER
  /* T1 = K >> 2 = number of COPYB_4x8 invocations; go to the remainder if none. */
  39. srawi. T1, K, 2
  40. ble SGEMM_L8_COPYB1
  /*
   * Copy loop unrolled four times: T1 is decremented and tested after every
   * COPYB_4x8. BO is prefetched (dcbt) only at the loop head; BBO gets a
   * touch-for-store hint (dcbtst) before each copy.
   */
  41. SGEMM_L8_COPYB4:
  42. dcbt BO, PRE
  43. dcbtst BBO, PRE
  44. COPYB_4x8
  45. addic. T1, T1, -1
  46. ble SGEMM_L8_COPYB1
  47. dcbtst BBO, PRE
  48. COPYB_4x8
  49. addic. T1, T1, -1
  50. ble SGEMM_L8_COPYB1
  51. dcbtst BBO, PRE
  52. COPYB_4x8
  53. addic. T1, T1, -1
  54. ble SGEMM_L8_COPYB1
  55. dcbtst BBO, PRE
  56. COPYB_4x8
  57. addic. T1, T1, -1
  58. bgt SGEMM_L8_COPYB4
  /* Remainder: T1 = K & 3 invocations of COPYB_1x8 (none if zero). */
  59. SGEMM_L8_COPYB1:
  60. andi. T1, K, 3
  61. ble SGEMM_L8_COPYB_END
  62. SGEMM_L8_COPYB1_LOOP:
  63. COPYB_1x8
  64. addic. T1, T1, -1
  65. bgt SGEMM_L8_COPYB1_LOOP
  /*
   * Per-panel pointers: CO = C, AO = A, then C is advanced by LDC << 3 for
   * the next pass. I = M >> 4 is the 16-wide tile count; skip those tiles
   * when it is <= 0.
   */
  66. SGEMM_L8_COPYB_END:
  67. mr CO, C
  68. mr AO, A
  69. slwi T1, LDC , 3
  70. add C, C, T1
  71. srawi. I, M, 4
  72. ble SGEMM_L8x16_END
  /*
   * 8x16 tile loop: executed I times (I = M >> 4 is set just before this
   * label). BO restarts at BBUFFER for every tile. K is consumed in groups
   * of eight kernel-macro invocations; L = K >> 3 selects the path:
   *   L <= 0 : SGEMM_L8x16_SUB0  (only the K & 7 remainder)
   *   L == 1 : SGEMM_L8x16_SUB4  (eight SUBI1/SUB1 steps, then remainder)
   *   L >= 2 : LOOP_START (first 8), LOOP (L - 2 times), LOOP_END (last 8)
   * LOAD8x16_1, KERNEL8x16_* and SAVE8x16 are macros defined outside this
   * section; the _I1 / _E2 suffixes presumably mark the first and last step
   * of a pipelined sequence -- TODO confirm against the macro file.
   * dcbt lines are prefetch hints on AO / BO at offset PRE.
   */
  73. SGEMM_L8x16_BEGIN:
  74. mr BO, BBUFFER
  75. srawi. L, K, 3
  76. ble SGEMM_L8x16_SUB0
  77. cmpwi cr0, L, 1
  78. ble SGEMM_L8x16_SUB4
  79. SGEMM_L8x16_LOOP_START:
  80. dcbt AO, PRE
  81. dcbt BO, PRE
  82. LOAD8x16_1
  83. dcbt BO, PRE
  84. KERNEL8x16_I1
  85. dcbt AO, PRE
  86. dcbt BO, PRE
  87. KERNEL8x16_2
  88. dcbt BO, PRE
  89. KERNEL8x16_1
  90. dcbt AO, PRE
  91. dcbt BO, PRE
  92. KERNEL8x16_2
  93. dcbt BO, PRE
  94. KERNEL8x16_1
  95. dcbt AO, PRE
  96. dcbt BO, PRE
  97. KERNEL8x16_2
  98. dcbt BO, PRE
  99. KERNEL8x16_1
  100. dcbt AO, PRE
  101. dcbt BO, PRE
  102. KERNEL8x16_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  103. addic. L, L, -2
  104. ble SGEMM_L8x16_LOOP_END
  105. .align 5
  /* Steady state: eight kernel steps per iteration, one iteration per remaining L. */
  106. SGEMM_L8x16_LOOP:
  107. dcbt BO, PRE
  108. KERNEL8x16_1
  109. dcbt AO, PRE
  110. dcbt BO, PRE
  111. KERNEL8x16_2
  112. dcbt BO, PRE
  113. KERNEL8x16_1
  114. dcbt AO, PRE
  115. dcbt BO, PRE
  116. KERNEL8x16_2
  117. dcbt BO, PRE
  118. KERNEL8x16_1
  119. dcbt AO, PRE
  120. dcbt BO, PRE
  121. KERNEL8x16_2
  122. dcbt BO, PRE
  123. KERNEL8x16_1
  124. dcbt AO, PRE
  125. dcbt BO, PRE
  126. KERNEL8x16_2
  127. addic. L, L, -1
  128. bgt SGEMM_L8x16_LOOP
  /* Final group of eight; ends with KERNEL8x16_E2 and issues fewer prefetches. */
  129. SGEMM_L8x16_LOOP_END:
  130. dcbt BO, PRE
  131. KERNEL8x16_1
  132. dcbt AO, PRE
  133. dcbt BO, PRE
  134. KERNEL8x16_2
  135. dcbt BO, PRE
  136. KERNEL8x16_1
  137. dcbt AO, PRE
  138. KERNEL8x16_2
  139. KERNEL8x16_1
  140. dcbt AO, PRE
  141. KERNEL8x16_2
  142. KERNEL8x16_1
  143. KERNEL8x16_E2
  144. b SGEMM_L8x16_SUB1
  /* L == 1: exactly eight steps, KERNEL8x16_SUBI1 first and then seven KERNEL8x16_SUB1. */
  145. SGEMM_L8x16_SUB4:
  146. dcbt AO, PRE
  147. KERNEL8x16_SUBI1
  148. KERNEL8x16_SUB1
  149. dcbt AO, PRE
  150. KERNEL8x16_SUB1
  151. KERNEL8x16_SUB1
  152. KERNEL8x16_SUB1
  153. KERNEL8x16_SUB1
  154. KERNEL8x16_SUB1
  155. KERNEL8x16_SUB1
  156. b SGEMM_L8x16_SUB1
  /*
   * L <= 0: KERNEL8x16_SUBI1 is issued before the L = K & 7 count is tested,
   * so one step always runs on this path; the rest go through SUB2.
   */
  157. SGEMM_L8x16_SUB0:
  158. andi. L, K, 7
  159. KERNEL8x16_SUBI1
  160. addic. L, L, -1
  161. ble SGEMM_L8x16_SAVE
  162. b SGEMM_L8x16_SUB2
  /* Remainder after the groups of eight: L = K & 7 single steps (none if zero). */
  163. SGEMM_L8x16_SUB1:
  164. andi. L, K, 7
  165. ble SGEMM_L8x16_SAVE
  166. SGEMM_L8x16_SUB2:
  167. KERNEL8x16_SUB1
  168. addic. L, L, -1
  169. bgt SGEMM_L8x16_SUB2
  /* SAVE8x16 (macro, presumably writes the tile out through CO -- TODO confirm), then next tile. */
  170. SGEMM_L8x16_SAVE:
  171. SAVE8x16
  172. addic. I, I, -1
  173. bgt SGEMM_L8x16_BEGIN
  174. SGEMM_L8x16_END:
  /*
   * 8x8 tile, executed at most once per panel (no I loop).
   * If M & 15 == 0 every remaining tile size is skipped (branch to
   * SGEMM_L8x1_END); if M & 8 == 0 only this tile is skipped.
   * K handling mirrors the other tiles: L = K >> 3 groups of eight kernel
   * steps (SUB0 for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for
   * L >= 2), then K & 7 single steps. LOAD8x8_1, KERNEL8x8_* and SAVE8x8 are
   * macros defined outside this section.
   */
  175. SGEMM_L8x8_BEGIN:
  176. andi. T2, M, 15
  177. ble SGEMM_L8x1_END
  178. andi. T1, M, 8
  179. ble SGEMM_L8x8_END
  180. mr BO, BBUFFER
  181. srawi. L, K, 3
  182. ble SGEMM_L8x8_SUB0
  183. cmpwi cr0, L, 1
  184. ble SGEMM_L8x8_SUB4
  185. SGEMM_L8x8_LOOP_START:
  186. LOAD8x8_1
  187. KERNEL8x8_I1
  188. KERNEL8x8_2
  189. KERNEL8x8_1
  190. KERNEL8x8_2
  191. KERNEL8x8_1
  192. KERNEL8x8_2
  193. KERNEL8x8_1
  194. KERNEL8x8_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  195. addic. L, L, -2
  196. ble SGEMM_L8x8_LOOP_END
  197. .align 5
  198. SGEMM_L8x8_LOOP:
  199. KERNEL8x8_1
  200. KERNEL8x8_2
  201. KERNEL8x8_1
  202. KERNEL8x8_2
  203. KERNEL8x8_1
  204. KERNEL8x8_2
  205. KERNEL8x8_1
  206. KERNEL8x8_2
  207. addic. L, L, -1
  208. bgt SGEMM_L8x8_LOOP
  /* Final group of eight, closed by KERNEL8x8_E2. */
  209. SGEMM_L8x8_LOOP_END:
  210. KERNEL8x8_1
  211. KERNEL8x8_2
  212. KERNEL8x8_1
  213. KERNEL8x8_2
  214. KERNEL8x8_1
  215. KERNEL8x8_2
  216. KERNEL8x8_1
  217. KERNEL8x8_E2
  218. b SGEMM_L8x8_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  219. SGEMM_L8x8_SUB4:
  220. KERNEL8x8_SUBI1
  221. KERNEL8x8_SUB1
  222. KERNEL8x8_SUB1
  223. KERNEL8x8_SUB1
  224. KERNEL8x8_SUB1
  225. KERNEL8x8_SUB1
  226. KERNEL8x8_SUB1
  227. KERNEL8x8_SUB1
  228. b SGEMM_L8x8_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  229. SGEMM_L8x8_SUB0:
  230. andi. L, K, 7
  231. KERNEL8x8_SUBI1
  232. addic. L, L, -1
  233. ble SGEMM_L8x8_SAVE
  234. b SGEMM_L8x8_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  235. SGEMM_L8x8_SUB1:
  236. andi. L, K, 7
  237. ble SGEMM_L8x8_SAVE
  238. SGEMM_L8x8_SUB2:
  239. KERNEL8x8_SUB1
  240. addic. L, L, -1
  241. bgt SGEMM_L8x8_SUB2
  242. SGEMM_L8x8_SAVE:
  243. SAVE8x8
  244. SGEMM_L8x8_END:
  /*
   * 8x4 tile, executed at most once per panel: skipped when M & 4 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD8x4_1, KERNEL8x4_* and SAVE8x4 are macros
   * defined outside this section.
   */
  245. SGEMM_L8x4_BEGIN:
  246. andi. T1, M, 4
  247. ble SGEMM_L8x4_END
  248. mr BO, BBUFFER
  249. srawi. L, K, 3
  250. ble SGEMM_L8x4_SUB0
  251. cmpwi cr0, L, 1
  252. ble SGEMM_L8x4_SUB4
  253. SGEMM_L8x4_LOOP_START:
  254. LOAD8x4_1
  255. KERNEL8x4_I1
  256. KERNEL8x4_2
  257. KERNEL8x4_1
  258. KERNEL8x4_2
  259. KERNEL8x4_1
  260. KERNEL8x4_2
  261. KERNEL8x4_1
  262. KERNEL8x4_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  263. addic. L, L, -2
  264. ble SGEMM_L8x4_LOOP_END
  265. .align 5
  266. SGEMM_L8x4_LOOP:
  267. KERNEL8x4_1
  268. KERNEL8x4_2
  269. KERNEL8x4_1
  270. KERNEL8x4_2
  271. KERNEL8x4_1
  272. KERNEL8x4_2
  273. KERNEL8x4_1
  274. KERNEL8x4_2
  275. addic. L, L, -1
  276. bgt SGEMM_L8x4_LOOP
  /* Final group of eight, closed by KERNEL8x4_E2. */
  277. SGEMM_L8x4_LOOP_END:
  278. KERNEL8x4_1
  279. KERNEL8x4_2
  280. KERNEL8x4_1
  281. KERNEL8x4_2
  282. KERNEL8x4_1
  283. KERNEL8x4_2
  284. KERNEL8x4_1
  285. KERNEL8x4_E2
  286. b SGEMM_L8x4_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  287. SGEMM_L8x4_SUB4:
  288. KERNEL8x4_SUBI1
  289. KERNEL8x4_SUB1
  290. KERNEL8x4_SUB1
  291. KERNEL8x4_SUB1
  292. KERNEL8x4_SUB1
  293. KERNEL8x4_SUB1
  294. KERNEL8x4_SUB1
  295. KERNEL8x4_SUB1
  296. b SGEMM_L8x4_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  297. SGEMM_L8x4_SUB0:
  298. andi. L, K, 7
  299. KERNEL8x4_SUBI1
  300. addic. L, L, -1
  301. ble SGEMM_L8x4_SAVE
  302. b SGEMM_L8x4_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  303. SGEMM_L8x4_SUB1:
  304. andi. L, K, 7
  305. ble SGEMM_L8x4_SAVE
  306. SGEMM_L8x4_SUB2:
  307. KERNEL8x4_SUB1
  308. addic. L, L, -1
  309. bgt SGEMM_L8x4_SUB2
  310. SGEMM_L8x4_SAVE:
  311. SAVE8x4
  312. SGEMM_L8x4_END:
  /*
   * 8x2 tile, executed at most once per panel: skipped when M & 2 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD8x2_1, KERNEL8x2_* and SAVE8x2 are macros
   * defined outside this section.
   */
  313. SGEMM_L8x2_BEGIN:
  314. andi. T1, M, 2
  315. ble SGEMM_L8x2_END
  316. mr BO, BBUFFER
  317. srawi. L, K, 3
  318. ble SGEMM_L8x2_SUB0
  319. cmpwi cr0, L, 1
  320. ble SGEMM_L8x2_SUB4
  321. SGEMM_L8x2_LOOP_START:
  322. LOAD8x2_1
  323. KERNEL8x2_I1
  324. KERNEL8x2_2
  325. KERNEL8x2_1
  326. KERNEL8x2_2
  327. KERNEL8x2_1
  328. KERNEL8x2_2
  329. KERNEL8x2_1
  330. KERNEL8x2_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  331. addic. L, L, -2
  332. ble SGEMM_L8x2_LOOP_END
  333. .align 5
  334. SGEMM_L8x2_LOOP:
  335. KERNEL8x2_1
  336. KERNEL8x2_2
  337. KERNEL8x2_1
  338. KERNEL8x2_2
  339. KERNEL8x2_1
  340. KERNEL8x2_2
  341. KERNEL8x2_1
  342. KERNEL8x2_2
  343. addic. L, L, -1
  344. bgt SGEMM_L8x2_LOOP
  /* Final group of eight, closed by KERNEL8x2_E2. */
  345. SGEMM_L8x2_LOOP_END:
  346. KERNEL8x2_1
  347. KERNEL8x2_2
  348. KERNEL8x2_1
  349. KERNEL8x2_2
  350. KERNEL8x2_1
  351. KERNEL8x2_2
  352. KERNEL8x2_1
  353. KERNEL8x2_E2
  354. b SGEMM_L8x2_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  355. SGEMM_L8x2_SUB4:
  356. KERNEL8x2_SUBI1
  357. KERNEL8x2_SUB1
  358. KERNEL8x2_SUB1
  359. KERNEL8x2_SUB1
  360. KERNEL8x2_SUB1
  361. KERNEL8x2_SUB1
  362. KERNEL8x2_SUB1
  363. KERNEL8x2_SUB1
  364. b SGEMM_L8x2_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  365. SGEMM_L8x2_SUB0:
  366. andi. L, K, 7
  367. KERNEL8x2_SUBI1
  368. addic. L, L, -1
  369. ble SGEMM_L8x2_SAVE
  370. b SGEMM_L8x2_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  371. SGEMM_L8x2_SUB1:
  372. andi. L, K, 7
  373. ble SGEMM_L8x2_SAVE
  374. SGEMM_L8x2_SUB2:
  375. KERNEL8x2_SUB1
  376. addic. L, L, -1
  377. bgt SGEMM_L8x2_SUB2
  378. SGEMM_L8x2_SAVE:
  379. SAVE8x2
  380. SGEMM_L8x2_END:
  /*
   * 8x1 tile, executed at most once per panel: skipped when M & 1 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD8x1_1, KERNEL8x1_* and SAVE8x1 are macros
   * defined outside this section. SGEMM_L8x1_END is also the target used by
   * the 8x8 block when M & 15 == 0.
   */
  381. SGEMM_L8x1_BEGIN:
  382. andi. T1, M, 1
  383. ble SGEMM_L8x1_END
  384. mr BO, BBUFFER
  385. srawi. L, K, 3
  386. ble SGEMM_L8x1_SUB0
  387. cmpwi cr0, L, 1
  388. ble SGEMM_L8x1_SUB4
  389. SGEMM_L8x1_LOOP_START:
  390. LOAD8x1_1
  391. KERNEL8x1_I1
  392. KERNEL8x1_2
  393. KERNEL8x1_1
  394. KERNEL8x1_2
  395. KERNEL8x1_1
  396. KERNEL8x1_2
  397. KERNEL8x1_1
  398. KERNEL8x1_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  399. addic. L, L, -2
  400. ble SGEMM_L8x1_LOOP_END
  401. .align 5
  402. SGEMM_L8x1_LOOP:
  403. KERNEL8x1_1
  404. KERNEL8x1_2
  405. KERNEL8x1_1
  406. KERNEL8x1_2
  407. KERNEL8x1_1
  408. KERNEL8x1_2
  409. KERNEL8x1_1
  410. KERNEL8x1_2
  411. addic. L, L, -1
  412. bgt SGEMM_L8x1_LOOP
  /* Final group of eight, closed by KERNEL8x1_E2. */
  413. SGEMM_L8x1_LOOP_END:
  414. KERNEL8x1_1
  415. KERNEL8x1_2
  416. KERNEL8x1_1
  417. KERNEL8x1_2
  418. KERNEL8x1_1
  419. KERNEL8x1_2
  420. KERNEL8x1_1
  421. KERNEL8x1_E2
  422. b SGEMM_L8x1_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  423. SGEMM_L8x1_SUB4:
  424. KERNEL8x1_SUBI1
  425. KERNEL8x1_SUB1
  426. KERNEL8x1_SUB1
  427. KERNEL8x1_SUB1
  428. KERNEL8x1_SUB1
  429. KERNEL8x1_SUB1
  430. KERNEL8x1_SUB1
  431. KERNEL8x1_SUB1
  432. b SGEMM_L8x1_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  433. SGEMM_L8x1_SUB0:
  434. andi. L, K, 7
  435. KERNEL8x1_SUBI1
  436. addic. L, L, -1
  437. ble SGEMM_L8x1_SAVE
  438. b SGEMM_L8x1_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  439. SGEMM_L8x1_SUB1:
  440. andi. L, K, 7
  441. ble SGEMM_L8x1_SAVE
  442. SGEMM_L8x1_SUB2:
  443. KERNEL8x1_SUB1
  444. addic. L, L, -1
  445. bgt SGEMM_L8x1_SUB2
  446. SGEMM_L8x1_SAVE:
  447. SAVE8x1
  448. SGEMM_L8x1_END:
  /*
   * Panel loop tail: advance B by K << 5 and repeat while --J > 0.
   * NOTE(review): slwi is a 32-bit shift, so this assumes K << 5 fits in
   * 32 bits -- confirm for the supported K range.
   */
  449. slwi T1, K, 5
  450. add B, B, T1
  451. addic. J, J, -1
  452. bgt SGEMM_L8_BEGIN
  /* After the last panel: if N & 7 == 0 nothing is left, go to L999 (defined outside this section). */
  453. andi. T2, N, 7
  454. ble L999
  /*
   * SGEMM_L8_END is also reached directly when J <= 0 at loop entry; that
   * path goes to SGEMM_L4_BEGIN without the N & 7 test above.
   */
  455. SGEMM_L8_END:
  456. b SGEMM_L4_BEGIN
  /* Branch stub to L999; nothing in the visible code targets it. */
  457. L999_H1:
  458. b L999
  /*
   * Remainder panel selected by N & 4. BO = B, BBO = BBUFFER, T1 = K << 2.
   * SGEMM_L4_COPYB: each pass loads two 4-word vectors from BO (BO += 32),
   * replicates every word across a whole vector with xxspltw, and stores the
   * eight resulting vectors through BBO (BBO += 64 twice). o0/o16/o32/o48
   * are index operands defined outside this section, presumably byte offsets
   * 0/16/32/48 -- TODO confirm.
   * T1 drops by 8 per pass and the loop repeats while T1 >= 0, i.e.
   * (K << 2) / 8 + 1 passes for non-negative K.
   * NOTE(review): the copy runs before the N & 4 test below, so it is done
   * even when this panel is then skipped -- confirm that is intended.
   */
  459. SGEMM_L4_BEGIN:
  460. mr BO, B
  461. mr BBO, BBUFFER
  462. slwi T1, K, 2
  463. SGEMM_L4_COPYB:
  464. dcbtst BBO, PRE
  465. lxvw4x vs3, o0, BO
  466. lxvw4x vs11, o16, BO
  467. xxspltw vs4, vs3, 0
  468. xxspltw vs5, vs3, 1
  469. xxspltw vs6, vs3, 2
  470. xxspltw vs7, vs3, 3
  471. xxspltw vs12, vs11, 0
  472. xxspltw vs13, vs11, 1
  473. xxspltw vs14, vs11, 2
  474. xxspltw vs15, vs11, 3
  475. stxvw4x vs4, o0, BBO
  476. stxvw4x vs5, o16, BBO
  477. stxvw4x vs6, o32, BBO
  478. stxvw4x vs7, o48, BBO
  479. addi BO, BO, 32
  480. addi BBO, BBO, 64
  481. stxvw4x vs12, o0, BBO
  482. stxvw4x vs13, o16, BBO
  483. stxvw4x vs14, o32, BBO
  484. stxvw4x vs15, o48, BBO
  /* addic. sets CR0 for the bge below; the addi in between does not alter CR0. */
  485. addic. T1, T1, -8
  486. addi BBO, BBO, 64
  487. bge SGEMM_L4_COPYB
  /* Skip the whole panel when N & 4 == 0. */
  488. andi. T1, N, 4
  489. ble SGEMM_L4_END
  /* CO = C, AO = A, C += LDC << 2; I = M >> 4 is the 16-wide tile count. */
  490. mr CO, C
  491. mr AO, A
  492. slwi T1, LDC , 2
  493. add C, C, T1
  494. srawi. I, M, 4
  495. ble SGEMM_L4x16_END
  /*
   * 4x16 tile loop: executed I times (I = M >> 4 is set just before this
   * label). BO restarts at BBUFFER for every tile. L = K >> 3 selects the
   * path:
   *   L <= 0 : SGEMM_L4x16_SUB0  (only the K & 7 remainder)
   *   L == 1 : SGEMM_L4x16_SUB4  (eight SUBI1/SUB1 steps, then remainder)
   *   L >= 2 : LOOP_START (first 8), LOOP (L - 2 times), LOOP_END (last 8)
   * LOAD4x16_1, KERNEL4x16_* and SAVE4x16 are macros defined outside this
   * section. Only AO is prefetched (dcbt) in this tile.
   */
  496. SGEMM_L4x16_BEGIN:
  497. mr BO, BBUFFER
  498. srawi. L, K, 3
  499. ble SGEMM_L4x16_SUB0
  500. cmpwi cr0, L, 1
  501. ble SGEMM_L4x16_SUB4
  502. SGEMM_L4x16_LOOP_START:
  503. dcbt AO, PRE
  504. LOAD4x16_1
  505. KERNEL4x16_I1
  506. dcbt AO, PRE
  507. KERNEL4x16_2
  508. KERNEL4x16_1
  509. dcbt AO, PRE
  510. KERNEL4x16_2
  511. KERNEL4x16_1
  512. dcbt AO, PRE
  513. KERNEL4x16_2
  514. KERNEL4x16_1
  515. dcbt AO, PRE
  516. KERNEL4x16_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  517. addic. L, L, -2
  518. ble SGEMM_L4x16_LOOP_END
  519. .align 5
  520. SGEMM_L4x16_LOOP:
  521. KERNEL4x16_1
  522. dcbt AO, PRE
  523. KERNEL4x16_2
  524. KERNEL4x16_1
  525. dcbt AO, PRE
  526. KERNEL4x16_2
  527. KERNEL4x16_1
  528. dcbt AO, PRE
  529. KERNEL4x16_2
  530. KERNEL4x16_1
  531. dcbt AO, PRE
  532. KERNEL4x16_2
  533. addic. L, L, -1
  534. bgt SGEMM_L4x16_LOOP
  /* Final group of eight, closed by KERNEL4x16_E2. */
  535. SGEMM_L4x16_LOOP_END:
  536. KERNEL4x16_1
  537. dcbt AO, PRE
  538. KERNEL4x16_2
  539. KERNEL4x16_1
  540. dcbt AO, PRE
  541. KERNEL4x16_2
  542. KERNEL4x16_1
  543. dcbt AO, PRE
  544. KERNEL4x16_2
  545. KERNEL4x16_1
  546. KERNEL4x16_E2
  547. b SGEMM_L4x16_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  548. SGEMM_L4x16_SUB4:
  549. dcbt AO, PRE
  550. KERNEL4x16_SUBI1
  551. KERNEL4x16_SUB1
  552. dcbt AO, PRE
  553. KERNEL4x16_SUB1
  554. KERNEL4x16_SUB1
  555. KERNEL4x16_SUB1
  556. KERNEL4x16_SUB1
  557. KERNEL4x16_SUB1
  558. KERNEL4x16_SUB1
  559. b SGEMM_L4x16_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  560. SGEMM_L4x16_SUB0:
  561. andi. L, K, 7
  562. KERNEL4x16_SUBI1
  563. addic. L, L, -1
  564. ble SGEMM_L4x16_SAVE
  565. b SGEMM_L4x16_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  566. SGEMM_L4x16_SUB1:
  567. andi. L, K, 7
  568. ble SGEMM_L4x16_SAVE
  569. SGEMM_L4x16_SUB2:
  570. KERNEL4x16_SUB1
  571. addic. L, L, -1
  572. bgt SGEMM_L4x16_SUB2
  /* SAVE4x16 macro, then next tile while --I > 0. */
  573. SGEMM_L4x16_SAVE:
  574. SAVE4x16
  575. addic. I, I, -1
  576. bgt SGEMM_L4x16_BEGIN
  577. SGEMM_L4x16_END:
  /*
   * 4x8 tile, executed at most once per panel (no I loop).
   * If M & 15 == 0 every remaining tile size is skipped (branch to
   * SGEMM_L4x1_END); if M & 8 == 0 only this tile is skipped.
   * L = K >> 3 groups of eight kernel steps (SUB0 for L <= 0, SUB4 for
   * L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then K & 7 single steps.
   * LOAD4x8_1, KERNEL4x8_* and SAVE4x8 are macros defined outside this
   * section.
   */
  578. SGEMM_L4x8_BEGIN:
  579. andi. T2, M, 15
  580. ble SGEMM_L4x1_END
  581. andi. T1, M, 8
  582. ble SGEMM_L4x8_END
  583. mr BO, BBUFFER
  584. srawi. L, K, 3
  585. ble SGEMM_L4x8_SUB0
  586. cmpwi cr0, L, 1
  587. ble SGEMM_L4x8_SUB4
  588. SGEMM_L4x8_LOOP_START:
  589. LOAD4x8_1
  590. KERNEL4x8_I1
  591. KERNEL4x8_2
  592. KERNEL4x8_1
  593. KERNEL4x8_2
  594. KERNEL4x8_1
  595. KERNEL4x8_2
  596. KERNEL4x8_1
  597. KERNEL4x8_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  598. addic. L, L, -2
  599. ble SGEMM_L4x8_LOOP_END
  600. .align 5
  601. SGEMM_L4x8_LOOP:
  602. KERNEL4x8_1
  603. KERNEL4x8_2
  604. KERNEL4x8_1
  605. KERNEL4x8_2
  606. KERNEL4x8_1
  607. KERNEL4x8_2
  608. KERNEL4x8_1
  609. KERNEL4x8_2
  610. addic. L, L, -1
  611. bgt SGEMM_L4x8_LOOP
  /* Final group of eight, closed by KERNEL4x8_E2. */
  612. SGEMM_L4x8_LOOP_END:
  613. KERNEL4x8_1
  614. KERNEL4x8_2
  615. KERNEL4x8_1
  616. KERNEL4x8_2
  617. KERNEL4x8_1
  618. KERNEL4x8_2
  619. KERNEL4x8_1
  620. KERNEL4x8_E2
  621. b SGEMM_L4x8_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  622. SGEMM_L4x8_SUB4:
  623. KERNEL4x8_SUBI1
  624. KERNEL4x8_SUB1
  625. KERNEL4x8_SUB1
  626. KERNEL4x8_SUB1
  627. KERNEL4x8_SUB1
  628. KERNEL4x8_SUB1
  629. KERNEL4x8_SUB1
  630. KERNEL4x8_SUB1
  631. b SGEMM_L4x8_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  632. SGEMM_L4x8_SUB0:
  633. andi. L, K, 7
  634. KERNEL4x8_SUBI1
  635. addic. L, L, -1
  636. ble SGEMM_L4x8_SAVE
  637. b SGEMM_L4x8_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  638. SGEMM_L4x8_SUB1:
  639. andi. L, K, 7
  640. ble SGEMM_L4x8_SAVE
  641. SGEMM_L4x8_SUB2:
  642. KERNEL4x8_SUB1
  643. addic. L, L, -1
  644. bgt SGEMM_L4x8_SUB2
  645. SGEMM_L4x8_SAVE:
  646. SAVE4x8
  647. SGEMM_L4x8_END:
  /*
   * 4x4 tile, executed at most once per panel: skipped when M & 4 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD4x4_1, KERNEL4x4_* and SAVE4x4 are macros
   * defined outside this section.
   */
  648. SGEMM_L4x4_BEGIN:
  649. andi. T1, M, 4
  650. ble SGEMM_L4x4_END
  651. mr BO, BBUFFER
  652. srawi. L, K, 3
  653. ble SGEMM_L4x4_SUB0
  654. cmpwi cr0, L, 1
  655. ble SGEMM_L4x4_SUB4
  656. SGEMM_L4x4_LOOP_START:
  657. LOAD4x4_1
  658. KERNEL4x4_I1
  659. KERNEL4x4_2
  660. KERNEL4x4_1
  661. KERNEL4x4_2
  662. KERNEL4x4_1
  663. KERNEL4x4_2
  664. KERNEL4x4_1
  665. KERNEL4x4_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  666. addic. L, L, -2
  667. ble SGEMM_L4x4_LOOP_END
  668. .align 5
  669. SGEMM_L4x4_LOOP:
  670. KERNEL4x4_1
  671. KERNEL4x4_2
  672. KERNEL4x4_1
  673. KERNEL4x4_2
  674. KERNEL4x4_1
  675. KERNEL4x4_2
  676. KERNEL4x4_1
  677. KERNEL4x4_2
  678. addic. L, L, -1
  679. bgt SGEMM_L4x4_LOOP
  /* Final group of eight, closed by KERNEL4x4_E2. */
  680. SGEMM_L4x4_LOOP_END:
  681. KERNEL4x4_1
  682. KERNEL4x4_2
  683. KERNEL4x4_1
  684. KERNEL4x4_2
  685. KERNEL4x4_1
  686. KERNEL4x4_2
  687. KERNEL4x4_1
  688. KERNEL4x4_E2
  689. b SGEMM_L4x4_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  690. SGEMM_L4x4_SUB4:
  691. KERNEL4x4_SUBI1
  692. KERNEL4x4_SUB1
  693. KERNEL4x4_SUB1
  694. KERNEL4x4_SUB1
  695. KERNEL4x4_SUB1
  696. KERNEL4x4_SUB1
  697. KERNEL4x4_SUB1
  698. KERNEL4x4_SUB1
  699. b SGEMM_L4x4_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  700. SGEMM_L4x4_SUB0:
  701. andi. L, K, 7
  702. KERNEL4x4_SUBI1
  703. addic. L, L, -1
  704. ble SGEMM_L4x4_SAVE
  705. b SGEMM_L4x4_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  706. SGEMM_L4x4_SUB1:
  707. andi. L, K, 7
  708. ble SGEMM_L4x4_SAVE
  709. SGEMM_L4x4_SUB2:
  710. KERNEL4x4_SUB1
  711. addic. L, L, -1
  712. bgt SGEMM_L4x4_SUB2
  713. SGEMM_L4x4_SAVE:
  714. SAVE4x4
  715. SGEMM_L4x4_END:
  /*
   * 4x2 tile, executed at most once per panel: skipped when M & 2 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD4x2_1, KERNEL4x2_* and SAVE4x2 are macros
   * defined outside this section.
   */
  716. SGEMM_L4x2_BEGIN:
  717. andi. T1, M, 2
  718. ble SGEMM_L4x2_END
  719. mr BO, BBUFFER
  720. srawi. L, K, 3
  721. ble SGEMM_L4x2_SUB0
  722. cmpwi cr0, L, 1
  723. ble SGEMM_L4x2_SUB4
  724. SGEMM_L4x2_LOOP_START:
  725. LOAD4x2_1
  726. KERNEL4x2_I1
  727. KERNEL4x2_2
  728. KERNEL4x2_1
  729. KERNEL4x2_2
  730. KERNEL4x2_1
  731. KERNEL4x2_2
  732. KERNEL4x2_1
  733. KERNEL4x2_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  734. addic. L, L, -2
  735. ble SGEMM_L4x2_LOOP_END
  736. .align 5
  737. SGEMM_L4x2_LOOP:
  738. KERNEL4x2_1
  739. KERNEL4x2_2
  740. KERNEL4x2_1
  741. KERNEL4x2_2
  742. KERNEL4x2_1
  743. KERNEL4x2_2
  744. KERNEL4x2_1
  745. KERNEL4x2_2
  746. addic. L, L, -1
  747. bgt SGEMM_L4x2_LOOP
  /* Final group of eight, closed by KERNEL4x2_E2. */
  748. SGEMM_L4x2_LOOP_END:
  749. KERNEL4x2_1
  750. KERNEL4x2_2
  751. KERNEL4x2_1
  752. KERNEL4x2_2
  753. KERNEL4x2_1
  754. KERNEL4x2_2
  755. KERNEL4x2_1
  756. KERNEL4x2_E2
  757. b SGEMM_L4x2_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  758. SGEMM_L4x2_SUB4:
  759. KERNEL4x2_SUBI1
  760. KERNEL4x2_SUB1
  761. KERNEL4x2_SUB1
  762. KERNEL4x2_SUB1
  763. KERNEL4x2_SUB1
  764. KERNEL4x2_SUB1
  765. KERNEL4x2_SUB1
  766. KERNEL4x2_SUB1
  767. b SGEMM_L4x2_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  768. SGEMM_L4x2_SUB0:
  769. andi. L, K, 7
  770. KERNEL4x2_SUBI1
  771. addic. L, L, -1
  772. ble SGEMM_L4x2_SAVE
  773. b SGEMM_L4x2_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  774. SGEMM_L4x2_SUB1:
  775. andi. L, K, 7
  776. ble SGEMM_L4x2_SAVE
  777. SGEMM_L4x2_SUB2:
  778. KERNEL4x2_SUB1
  779. addic. L, L, -1
  780. bgt SGEMM_L4x2_SUB2
  781. SGEMM_L4x2_SAVE:
  782. SAVE4x2
  783. SGEMM_L4x2_END:
  /*
   * 4x1 tile, executed at most once per panel: skipped when M & 1 == 0.
   * BO restarts at BBUFFER. L = K >> 3 groups of eight kernel steps (SUB0
   * for L <= 0, SUB4 for L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then
   * K & 7 single steps. LOAD4x1_1, KERNEL4x1_* and SAVE4x1 are macros
   * defined outside this section. The panel ends by advancing B by K << 4.
   */
  784. SGEMM_L4x1_BEGIN:
  785. andi. T1, M, 1
  786. ble SGEMM_L4x1_END
  787. mr BO, BBUFFER
  788. srawi. L, K, 3
  789. ble SGEMM_L4x1_SUB0
  790. cmpwi cr0, L, 1
  791. ble SGEMM_L4x1_SUB4
  792. SGEMM_L4x1_LOOP_START:
  793. LOAD4x1_1
  794. KERNEL4x1_I1
  795. KERNEL4x1_2
  796. KERNEL4x1_1
  797. KERNEL4x1_2
  798. KERNEL4x1_1
  799. KERNEL4x1_2
  800. KERNEL4x1_1
  801. KERNEL4x1_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  802. addic. L, L, -2
  803. ble SGEMM_L4x1_LOOP_END
  804. .align 5
  805. SGEMM_L4x1_LOOP:
  806. KERNEL4x1_1
  807. KERNEL4x1_2
  808. KERNEL4x1_1
  809. KERNEL4x1_2
  810. KERNEL4x1_1
  811. KERNEL4x1_2
  812. KERNEL4x1_1
  813. KERNEL4x1_2
  814. addic. L, L, -1
  815. bgt SGEMM_L4x1_LOOP
  /* Final group of eight, closed by KERNEL4x1_E2. */
  816. SGEMM_L4x1_LOOP_END:
  817. KERNEL4x1_1
  818. KERNEL4x1_2
  819. KERNEL4x1_1
  820. KERNEL4x1_2
  821. KERNEL4x1_1
  822. KERNEL4x1_2
  823. KERNEL4x1_1
  824. KERNEL4x1_E2
  825. b SGEMM_L4x1_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  826. SGEMM_L4x1_SUB4:
  827. KERNEL4x1_SUBI1
  828. KERNEL4x1_SUB1
  829. KERNEL4x1_SUB1
  830. KERNEL4x1_SUB1
  831. KERNEL4x1_SUB1
  832. KERNEL4x1_SUB1
  833. KERNEL4x1_SUB1
  834. KERNEL4x1_SUB1
  835. b SGEMM_L4x1_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  836. SGEMM_L4x1_SUB0:
  837. andi. L, K, 7
  838. KERNEL4x1_SUBI1
  839. addic. L, L, -1
  840. ble SGEMM_L4x1_SAVE
  841. b SGEMM_L4x1_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  842. SGEMM_L4x1_SUB1:
  843. andi. L, K, 7
  844. ble SGEMM_L4x1_SAVE
  845. SGEMM_L4x1_SUB2:
  846. KERNEL4x1_SUB1
  847. addic. L, L, -1
  848. bgt SGEMM_L4x1_SUB2
  849. SGEMM_L4x1_SAVE:
  850. SAVE4x1
  /* End of the N & 4 panel: B += K << 4. SGEMM_L4_END is where a skipped panel lands, so it bypasses this advance. */
  851. SGEMM_L4x1_END:
  852. slwi T1, K, 4
  853. add B, B, T1
  854. SGEMM_L4_END:
  /*
   * Remainder panel selected by N & 2. BO = B, BBO = BBUFFER, T1 = K << 1.
   * SGEMM_L2_COPYB: each pass loads two 4-word vectors from BO (BO += 32),
   * replicates every word across a whole vector with xxspltw, and stores the
   * eight resulting vectors through BBO (BBO += 64 twice). o0/o16/o32/o48
   * are index operands defined outside this section, presumably byte offsets
   * 0/16/32/48 -- TODO confirm.
   * T1 drops by 8 per pass and the loop repeats while T1 >= 0, i.e.
   * (K << 1) / 8 + 1 passes for non-negative K.
   * NOTE(review): the copy runs before the N & 2 test below, so it is done
   * even when this panel is then skipped -- confirm that is intended.
   */
  855. SGEMM_L2_BEGIN:
  856. mr BO, B
  857. mr BBO, BBUFFER
  858. slwi T1, K, 1
  859. SGEMM_L2_COPYB:
  860. dcbtst BBO, PRE
  861. lxvw4x vs3, o0, BO
  862. lxvw4x vs11, o16, BO
  863. xxspltw vs4, vs3, 0
  864. xxspltw vs5, vs3, 1
  865. xxspltw vs6, vs3, 2
  866. xxspltw vs7, vs3, 3
  867. xxspltw vs12, vs11, 0
  868. xxspltw vs13, vs11, 1
  869. xxspltw vs14, vs11, 2
  870. xxspltw vs15, vs11, 3
  871. stxvw4x vs4, o0, BBO
  872. stxvw4x vs5, o16, BBO
  873. stxvw4x vs6, o32, BBO
  874. stxvw4x vs7, o48, BBO
  875. addi BO, BO, 32
  876. addi BBO, BBO, 64
  877. stxvw4x vs12, o0, BBO
  878. stxvw4x vs13, o16, BBO
  879. stxvw4x vs14, o32, BBO
  880. stxvw4x vs15, o48, BBO
  /* addic. sets CR0 for the bge below; the addi in between does not alter CR0. */
  881. addic. T1, T1, -8
  882. addi BBO, BBO, 64
  883. bge SGEMM_L2_COPYB
  /* Skip the whole panel when N & 2 == 0. */
  884. andi. T1, N, 2
  885. ble SGEMM_L2_END
  /* CO = C, AO = A, C += LDC << 1; I = M >> 4 is the 16-wide tile count. */
  886. mr CO, C
  887. mr AO, A
  888. slwi T1, LDC , 1
  889. add C, C, T1
  890. srawi. I, M, 4
  891. ble SGEMM_L2x16_END
  /*
   * 2x16 tile loop: executed I times (I = M >> 4 is set just before this
   * label). BO restarts at BBUFFER for every tile. L = K >> 3 selects the
   * path:
   *   L <= 0 : SGEMM_L2x16_SUB0  (only the K & 7 remainder)
   *   L == 1 : SGEMM_L2x16_SUB4  (eight SUBI1/SUB1 steps, then remainder)
   *   L >= 2 : LOOP_START (first 8), LOOP (L - 2 times), LOOP_END (last 8)
   * LOAD2x16_1, KERNEL2x16_* and SAVE2x16 are macros defined outside this
   * section. Only AO is prefetched (dcbt) in this tile.
   */
  892. SGEMM_L2x16_BEGIN:
  893. mr BO, BBUFFER
  894. srawi. L, K, 3
  895. ble SGEMM_L2x16_SUB0
  896. cmpwi cr0, L, 1
  897. ble SGEMM_L2x16_SUB4
  898. SGEMM_L2x16_LOOP_START:
  899. dcbt AO, PRE
  900. LOAD2x16_1
  901. KERNEL2x16_I1
  902. dcbt AO, PRE
  903. KERNEL2x16_2
  904. KERNEL2x16_1
  905. dcbt AO, PRE
  906. KERNEL2x16_2
  907. KERNEL2x16_1
  908. dcbt AO, PRE
  909. KERNEL2x16_2
  910. KERNEL2x16_1
  911. dcbt AO, PRE
  912. KERNEL2x16_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  913. addic. L, L, -2
  914. ble SGEMM_L2x16_LOOP_END
  915. .align 5
  916. SGEMM_L2x16_LOOP:
  917. KERNEL2x16_1
  918. dcbt AO, PRE
  919. KERNEL2x16_2
  920. KERNEL2x16_1
  921. dcbt AO, PRE
  922. KERNEL2x16_2
  923. KERNEL2x16_1
  924. dcbt AO, PRE
  925. KERNEL2x16_2
  926. KERNEL2x16_1
  927. dcbt AO, PRE
  928. KERNEL2x16_2
  929. addic. L, L, -1
  930. bgt SGEMM_L2x16_LOOP
  /* Final group of eight, closed by KERNEL2x16_E2. */
  931. SGEMM_L2x16_LOOP_END:
  932. KERNEL2x16_1
  933. dcbt AO, PRE
  934. KERNEL2x16_2
  935. KERNEL2x16_1
  936. dcbt AO, PRE
  937. KERNEL2x16_2
  938. KERNEL2x16_1
  939. dcbt AO, PRE
  940. KERNEL2x16_2
  941. KERNEL2x16_1
  942. KERNEL2x16_E2
  943. b SGEMM_L2x16_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  944. SGEMM_L2x16_SUB4:
  945. dcbt AO, PRE
  946. KERNEL2x16_SUBI1
  947. KERNEL2x16_SUB1
  948. dcbt AO, PRE
  949. KERNEL2x16_SUB1
  950. KERNEL2x16_SUB1
  951. KERNEL2x16_SUB1
  952. KERNEL2x16_SUB1
  953. KERNEL2x16_SUB1
  954. KERNEL2x16_SUB1
  955. b SGEMM_L2x16_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  956. SGEMM_L2x16_SUB0:
  957. andi. L, K, 7
  958. KERNEL2x16_SUBI1
  959. addic. L, L, -1
  960. ble SGEMM_L2x16_SAVE
  961. b SGEMM_L2x16_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  962. SGEMM_L2x16_SUB1:
  963. andi. L, K, 7
  964. ble SGEMM_L2x16_SAVE
  965. SGEMM_L2x16_SUB2:
  966. KERNEL2x16_SUB1
  967. addic. L, L, -1
  968. bgt SGEMM_L2x16_SUB2
  /* SAVE2x16 macro, then next tile while --I > 0. */
  969. SGEMM_L2x16_SAVE:
  970. SAVE2x16
  971. addic. I, I, -1
  972. bgt SGEMM_L2x16_BEGIN
  973. SGEMM_L2x16_END:
  /*
   * 2x8 tile, executed at most once per panel (no I loop).
   * If M & 15 == 0 every remaining tile size is skipped (branch to
   * SGEMM_L2x1_END, defined outside this section); if M & 8 == 0 only this
   * tile is skipped.
   * L = K >> 3 groups of eight kernel steps (SUB0 for L <= 0, SUB4 for
   * L == 1, LOOP_START/LOOP/LOOP_END for L >= 2), then K & 7 single steps.
   * LOAD2x8_1, KERNEL2x8_* and SAVE2x8 are macros defined outside this
   * section.
   */
  974. SGEMM_L2x8_BEGIN:
  975. andi. T2, M, 15
  976. ble SGEMM_L2x1_END
  977. andi. T1, M, 8
  978. ble SGEMM_L2x8_END
  979. mr BO, BBUFFER
  980. srawi. L, K, 3
  981. ble SGEMM_L2x8_SUB0
  982. cmpwi cr0, L, 1
  983. ble SGEMM_L2x8_SUB4
  984. SGEMM_L2x8_LOOP_START:
  985. LOAD2x8_1
  986. KERNEL2x8_I1
  987. KERNEL2x8_2
  988. KERNEL2x8_1
  989. KERNEL2x8_2
  990. KERNEL2x8_1
  991. KERNEL2x8_2
  992. KERNEL2x8_1
  993. KERNEL2x8_2
  /* -2 because this group and the LOOP_END group both sit outside the loop. */
  994. addic. L, L, -2
  995. ble SGEMM_L2x8_LOOP_END
  996. .align 5
  997. SGEMM_L2x8_LOOP:
  998. KERNEL2x8_1
  999. KERNEL2x8_2
  1000. KERNEL2x8_1
  1001. KERNEL2x8_2
  1002. KERNEL2x8_1
  1003. KERNEL2x8_2
  1004. KERNEL2x8_1
  1005. KERNEL2x8_2
  1006. addic. L, L, -1
  1007. bgt SGEMM_L2x8_LOOP
  /* Final group of eight, closed by KERNEL2x8_E2. */
  1008. SGEMM_L2x8_LOOP_END:
  1009. KERNEL2x8_1
  1010. KERNEL2x8_2
  1011. KERNEL2x8_1
  1012. KERNEL2x8_2
  1013. KERNEL2x8_1
  1014. KERNEL2x8_2
  1015. KERNEL2x8_1
  1016. KERNEL2x8_E2
  1017. b SGEMM_L2x8_SUB1
  /* L == 1: exactly eight steps via the SUBI1/SUB1 macro variants. */
  1018. SGEMM_L2x8_SUB4:
  1019. KERNEL2x8_SUBI1
  1020. KERNEL2x8_SUB1
  1021. KERNEL2x8_SUB1
  1022. KERNEL2x8_SUB1
  1023. KERNEL2x8_SUB1
  1024. KERNEL2x8_SUB1
  1025. KERNEL2x8_SUB1
  1026. KERNEL2x8_SUB1
  1027. b SGEMM_L2x8_SUB1
  /* L <= 0: SUBI1 runs before the K & 7 count is tested, so one step always executes here. */
  1028. SGEMM_L2x8_SUB0:
  1029. andi. L, K, 7
  1030. KERNEL2x8_SUBI1
  1031. addic. L, L, -1
  1032. ble SGEMM_L2x8_SAVE
  1033. b SGEMM_L2x8_SUB2
  /* Remainder: L = K & 7 single steps (none if zero). */
  1034. SGEMM_L2x8_SUB1:
  1035. andi. L, K, 7
  1036. ble SGEMM_L2x8_SAVE
  1037. SGEMM_L2x8_SUB2:
  1038. KERNEL2x8_SUB1
  1039. addic. L, L, -1
  1040. bgt SGEMM_L2x8_SUB2
  1041. SGEMM_L2x8_SAVE:
  1042. SAVE2x8
  1043. SGEMM_L2x8_END:
  /*
   * SGEMM_L2x4 section: executed only when bit 2 of M is set (M & 4).
   * Same control shape as the wider tiles: L = K >> 3 groups of eight
   * KERNEL2x4_* invocations, then a K & 7 remainder loop, then SAVE2x4.
   * The macros are defined outside this section; presumably each KERNEL
   * invocation is one k-step of a 2x4 tile update -- confirm.
   */
  1044. SGEMM_L2x4_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1045. andi. T1, M, 4
  1046. ble SGEMM_L2x4_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1047. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1048. srawi. L, K, 3
  1049. ble SGEMM_L2x4_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1050. cmpwi cr0, L, 1
  1051. ble SGEMM_L2x4_SUB4
  /* L >= 2: first group, opened by LOAD2x4_1 and KERNEL2x4_I1. */
  1052. SGEMM_L2x4_LOOP_START:
  1053. LOAD2x4_1
  1054. KERNEL2x4_I1
  1055. KERNEL2x4_2
  1056. KERNEL2x4_1
  1057. KERNEL2x4_2
  1058. KERNEL2x4_1
  1059. KERNEL2x4_2
  1060. KERNEL2x4_1
  1061. KERNEL2x4_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1062. addic. L, L, -2
  1063. ble SGEMM_L2x4_LOOP_END
  1064. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1065. SGEMM_L2x4_LOOP:
  1066. KERNEL2x4_1
  1067. KERNEL2x4_2
  1068. KERNEL2x4_1
  1069. KERNEL2x4_2
  1070. KERNEL2x4_1
  1071. KERNEL2x4_2
  1072. KERNEL2x4_1
  1073. KERNEL2x4_2
  1074. addic. L, L, -1
  1075. bgt SGEMM_L2x4_LOOP
  /* Last group: closes with KERNEL2x4_E2 instead of KERNEL2x4_2. */
  1076. SGEMM_L2x4_LOOP_END:
  1077. KERNEL2x4_1
  1078. KERNEL2x4_2
  1079. KERNEL2x4_1
  1080. KERNEL2x4_2
  1081. KERNEL2x4_1
  1082. KERNEL2x4_2
  1083. KERNEL2x4_1
  1084. KERNEL2x4_E2
  1085. b SGEMM_L2x4_SUB1
  /* Exactly one group: KERNEL2x4_SUBI1 followed by seven KERNEL2x4_SUB1. */
  1086. SGEMM_L2x4_SUB4:
  1087. KERNEL2x4_SUBI1
  1088. KERNEL2x4_SUB1
  1089. KERNEL2x4_SUB1
  1090. KERNEL2x4_SUB1
  1091. KERNEL2x4_SUB1
  1092. KERNEL2x4_SUB1
  1093. KERNEL2x4_SUB1
  1094. KERNEL2x4_SUB1
  1095. b SGEMM_L2x4_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1096. SGEMM_L2x4_SUB0:
  1097. andi. L, K, 7
  1098. KERNEL2x4_SUBI1
  1099. addic. L, L, -1
  1100. ble SGEMM_L2x4_SAVE
  1101. b SGEMM_L2x4_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1102. SGEMM_L2x4_SUB1:
  1103. andi. L, K, 7
  1104. ble SGEMM_L2x4_SAVE
  /* One KERNEL2x4_SUB1 per remaining count in L. */
  1105. SGEMM_L2x4_SUB2:
  1106. KERNEL2x4_SUB1
  1107. addic. L, L, -1
  1108. bgt SGEMM_L2x4_SUB2
  1109. SGEMM_L2x4_SAVE:
  1110. SAVE2x4
  1111. SGEMM_L2x4_END:
  /*
   * SGEMM_L2x2 section: executed only when bit 1 of M is set (M & 2).
   * L = K >> 3 groups of eight KERNEL2x2_* invocations, then a K & 7
   * remainder loop, then SAVE2x2.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 2x2
   * tile update -- confirm.
   */
  1112. SGEMM_L2x2_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1113. andi. T1, M, 2
  1114. ble SGEMM_L2x2_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1115. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1116. srawi. L, K, 3
  1117. ble SGEMM_L2x2_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1118. cmpwi cr0, L, 1
  1119. ble SGEMM_L2x2_SUB4
  /* L >= 2: first group, opened by LOAD2x2_1 and KERNEL2x2_I1. */
  1120. SGEMM_L2x2_LOOP_START:
  1121. LOAD2x2_1
  1122. KERNEL2x2_I1
  1123. KERNEL2x2_2
  1124. KERNEL2x2_1
  1125. KERNEL2x2_2
  1126. KERNEL2x2_1
  1127. KERNEL2x2_2
  1128. KERNEL2x2_1
  1129. KERNEL2x2_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1130. addic. L, L, -2
  1131. ble SGEMM_L2x2_LOOP_END
  1132. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1133. SGEMM_L2x2_LOOP:
  1134. KERNEL2x2_1
  1135. KERNEL2x2_2
  1136. KERNEL2x2_1
  1137. KERNEL2x2_2
  1138. KERNEL2x2_1
  1139. KERNEL2x2_2
  1140. KERNEL2x2_1
  1141. KERNEL2x2_2
  1142. addic. L, L, -1
  1143. bgt SGEMM_L2x2_LOOP
  /* Last group: closes with KERNEL2x2_E2 instead of KERNEL2x2_2. */
  1144. SGEMM_L2x2_LOOP_END:
  1145. KERNEL2x2_1
  1146. KERNEL2x2_2
  1147. KERNEL2x2_1
  1148. KERNEL2x2_2
  1149. KERNEL2x2_1
  1150. KERNEL2x2_2
  1151. KERNEL2x2_1
  1152. KERNEL2x2_E2
  1153. b SGEMM_L2x2_SUB1
  /* Exactly one group: KERNEL2x2_SUBI1 followed by seven KERNEL2x2_SUB1. */
  1154. SGEMM_L2x2_SUB4:
  1155. KERNEL2x2_SUBI1
  1156. KERNEL2x2_SUB1
  1157. KERNEL2x2_SUB1
  1158. KERNEL2x2_SUB1
  1159. KERNEL2x2_SUB1
  1160. KERNEL2x2_SUB1
  1161. KERNEL2x2_SUB1
  1162. KERNEL2x2_SUB1
  1163. b SGEMM_L2x2_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1164. SGEMM_L2x2_SUB0:
  1165. andi. L, K, 7
  1166. KERNEL2x2_SUBI1
  1167. addic. L, L, -1
  1168. ble SGEMM_L2x2_SAVE
  1169. b SGEMM_L2x2_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1170. SGEMM_L2x2_SUB1:
  1171. andi. L, K, 7
  1172. ble SGEMM_L2x2_SAVE
  /* One KERNEL2x2_SUB1 per remaining count in L. */
  1173. SGEMM_L2x2_SUB2:
  1174. KERNEL2x2_SUB1
  1175. addic. L, L, -1
  1176. bgt SGEMM_L2x2_SUB2
  1177. SGEMM_L2x2_SAVE:
  1178. SAVE2x2
  1179. SGEMM_L2x2_END:
  /*
   * SGEMM_L2x1 section: executed only when bit 0 of M is set (M & 1).
   * L = K >> 3 groups of eight KERNEL2x1_* invocations, then a K & 7
   * remainder loop, then SAVE2x1.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 2x1
   * tile update -- confirm.
   */
  1180. SGEMM_L2x1_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1181. andi. T1, M, 1
  1182. ble SGEMM_L2x1_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1183. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1184. srawi. L, K, 3
  1185. ble SGEMM_L2x1_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1186. cmpwi cr0, L, 1
  1187. ble SGEMM_L2x1_SUB4
  /* L >= 2: first group, opened by LOAD2x1_1 and KERNEL2x1_I1. */
  1188. SGEMM_L2x1_LOOP_START:
  1189. LOAD2x1_1
  1190. KERNEL2x1_I1
  1191. KERNEL2x1_2
  1192. KERNEL2x1_1
  1193. KERNEL2x1_2
  1194. KERNEL2x1_1
  1195. KERNEL2x1_2
  1196. KERNEL2x1_1
  1197. KERNEL2x1_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1198. addic. L, L, -2
  1199. ble SGEMM_L2x1_LOOP_END
  1200. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1201. SGEMM_L2x1_LOOP:
  1202. KERNEL2x1_1
  1203. KERNEL2x1_2
  1204. KERNEL2x1_1
  1205. KERNEL2x1_2
  1206. KERNEL2x1_1
  1207. KERNEL2x1_2
  1208. KERNEL2x1_1
  1209. KERNEL2x1_2
  1210. addic. L, L, -1
  1211. bgt SGEMM_L2x1_LOOP
  /* Last group: closes with KERNEL2x1_E2 instead of KERNEL2x1_2. */
  1212. SGEMM_L2x1_LOOP_END:
  1213. KERNEL2x1_1
  1214. KERNEL2x1_2
  1215. KERNEL2x1_1
  1216. KERNEL2x1_2
  1217. KERNEL2x1_1
  1218. KERNEL2x1_2
  1219. KERNEL2x1_1
  1220. KERNEL2x1_E2
  1221. b SGEMM_L2x1_SUB1
  /* Exactly one group: KERNEL2x1_SUBI1 followed by seven KERNEL2x1_SUB1. */
  1222. SGEMM_L2x1_SUB4:
  1223. KERNEL2x1_SUBI1
  1224. KERNEL2x1_SUB1
  1225. KERNEL2x1_SUB1
  1226. KERNEL2x1_SUB1
  1227. KERNEL2x1_SUB1
  1228. KERNEL2x1_SUB1
  1229. KERNEL2x1_SUB1
  1230. KERNEL2x1_SUB1
  1231. b SGEMM_L2x1_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1232. SGEMM_L2x1_SUB0:
  1233. andi. L, K, 7
  1234. KERNEL2x1_SUBI1
  1235. addic. L, L, -1
  1236. ble SGEMM_L2x1_SAVE
  1237. b SGEMM_L2x1_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1238. SGEMM_L2x1_SUB1:
  1239. andi. L, K, 7
  1240. ble SGEMM_L2x1_SAVE
  /* One KERNEL2x1_SUB1 per remaining count in L. */
  1241. SGEMM_L2x1_SUB2:
  1242. KERNEL2x1_SUB1
  1243. addic. L, L, -1
  1244. bgt SGEMM_L2x1_SUB2
  1245. SGEMM_L2x1_SAVE:
  1246. SAVE2x1
  1247. SGEMM_L2x1_END:
  /*
   * Advance B by T1 = K << 3, i.e. K * 8 bytes.  Presumably this is two
   * 4-byte values per k for the two-wide panel just processed -- confirm.
   */
  1248. slwi T1, K, 3
  1249. add B, B, T1
  1250. SGEMM_L2_END:
  /*
   * SGEMM_L1 entry.  First expands words read from B into BBUFFER, each
   * 32-bit word replicated across a whole 16-byte vector, then -- only if
   * N is odd -- sets up CO/AO and the 16-wide tile counter I.
   * NOTE(review): the copy loop runs before the N & 1 test below, so it
   * also executes when N is even and SGEMM_L1_END is taken immediately
   * afterwards; confirm whether that extra read of B is intended.
   */
  1251. SGEMM_L1_BEGIN:
  1252. mr BO, B
  1253. mr BBO, BBUFFER
  /* Shift by 0: T1 starts out as the K count. */
  1254. slwi T1, K, 0
  /*
   * Each pass reads 32 bytes (8 words) at BO and writes 128 bytes at BBO.
   * o0/o16/o32/o48 are presumably index registers holding those byte
   * offsets -- confirm where they are set up.
   * NOTE(review): T1 drops by 8 per pass and the loop repeats while the
   * result is >= 0, so it runs (K >> 3) + 1 times and copies up to 8 words
   * more than K; confirm both buffers are sized for that.
   */
  1255. SGEMM_L1_COPYB:
  /* Store-intent cache touch at BBO + PRE. */
  1256. dcbtst BBO, PRE
  1257. lxvw4x vs3, o0, BO
  1258. lxvw4x vs11, o16, BO
  /* Splat words 0..3 of vs3 into vs4..vs7 and of vs11 into vs12..vs15. */
  1259. xxspltw vs4, vs3, 0
  1260. xxspltw vs5, vs3, 1
  1261. xxspltw vs6, vs3, 2
  1262. xxspltw vs7, vs3, 3
  1263. xxspltw vs12, vs11, 0
  1264. xxspltw vs13, vs11, 1
  1265. xxspltw vs14, vs11, 2
  1266. xxspltw vs15, vs11, 3
  1267. stxvw4x vs4, o0, BBO
  1268. stxvw4x vs5, o16, BBO
  1269. stxvw4x vs6, o32, BBO
  1270. stxvw4x vs7, o48, BBO
  1271. addi BO, BO, 32
  /* Step BBO by 64 so the same four offsets address the second half. */
  1272. addi BBO, BBO, 64
  1273. stxvw4x vs12, o0, BBO
  1274. stxvw4x vs13, o16, BBO
  1275. stxvw4x vs14, o32, BBO
  1276. stxvw4x vs15, o48, BBO
  1277. addic. T1, T1, -8
  /* addi leaves CR0 untouched, so bge still tests the addic. result. */
  1278. addi BBO, BBO, 64
  1279. bge SGEMM_L1_COPYB
  /* Only an odd N has work here; andi. is never negative, so ble means zero. */
  1280. andi. T1, N, 1
  1281. ble SGEMM_L1_END
  1282. mr CO, C
  1283. mr AO, A
  /* I = M >> 4; with I <= 0 go straight to the narrower tile sections. */
  1284. srawi. I, M, 4
  1285. ble SGEMM_L1x16_END
  /*
   * SGEMM_L1x16 section: repeated I times, where I was set to M >> 4 by
   * the `srawi. I, M, 4` just before this label.  Per pass, K is split
   * into L = K >> 3 groups of eight KERNEL1x16_* invocations plus a K & 7
   * remainder loop, then SAVE1x16.  `dcbt AO, PRE` cache-touch hints at
   * AO + PRE are interleaved with the kernel macros.  The macros are
   * defined outside this section; presumably they advance AO/BO and
   * SAVE1x16 advances CO -- confirm.
   */
  1286. SGEMM_L1x16_BEGIN:
  /* Restart the B-side pointer at BBUFFER for every tile. */
  1287. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1288. srawi. L, K, 3
  1289. ble SGEMM_L1x16_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1290. cmpwi cr0, L, 1
  1291. ble SGEMM_L1x16_SUB4
  /* L >= 2: first group, opened by LOAD1x16_1 and KERNEL1x16_I1. */
  1292. SGEMM_L1x16_LOOP_START:
  1293. dcbt AO, PRE
  1294. LOAD1x16_1
  1295. KERNEL1x16_I1
  1296. dcbt AO, PRE
  1297. KERNEL1x16_2
  1298. KERNEL1x16_1
  1299. dcbt AO, PRE
  1300. KERNEL1x16_2
  1301. KERNEL1x16_1
  1302. dcbt AO, PRE
  1303. KERNEL1x16_2
  1304. KERNEL1x16_1
  1305. dcbt AO, PRE
  1306. KERNEL1x16_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1307. addic. L, L, -2
  1308. ble SGEMM_L1x16_LOOP_END
  1309. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1310. SGEMM_L1x16_LOOP:
  1311. KERNEL1x16_1
  1312. dcbt AO, PRE
  1313. KERNEL1x16_2
  1314. KERNEL1x16_1
  1315. dcbt AO, PRE
  1316. KERNEL1x16_2
  1317. KERNEL1x16_1
  1318. dcbt AO, PRE
  1319. KERNEL1x16_2
  1320. KERNEL1x16_1
  1321. dcbt AO, PRE
  1322. KERNEL1x16_2
  1323. addic. L, L, -1
  1324. bgt SGEMM_L1x16_LOOP
  /* Last group: closes with KERNEL1x16_E2; no cache touch before the final pair. */
  1325. SGEMM_L1x16_LOOP_END:
  1326. KERNEL1x16_1
  1327. dcbt AO, PRE
  1328. KERNEL1x16_2
  1329. KERNEL1x16_1
  1330. dcbt AO, PRE
  1331. KERNEL1x16_2
  1332. KERNEL1x16_1
  1333. dcbt AO, PRE
  1334. KERNEL1x16_2
  1335. KERNEL1x16_1
  1336. KERNEL1x16_E2
  1337. b SGEMM_L1x16_SUB1
  /* Exactly one group: KERNEL1x16_SUBI1 followed by seven KERNEL1x16_SUB1. */
  1338. SGEMM_L1x16_SUB4:
  1339. dcbt AO, PRE
  1340. KERNEL1x16_SUBI1
  1341. KERNEL1x16_SUB1
  1342. dcbt AO, PRE
  1343. KERNEL1x16_SUB1
  1344. KERNEL1x16_SUB1
  1345. KERNEL1x16_SUB1
  1346. KERNEL1x16_SUB1
  1347. KERNEL1x16_SUB1
  1348. KERNEL1x16_SUB1
  1349. b SGEMM_L1x16_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1350. SGEMM_L1x16_SUB0:
  1351. andi. L, K, 7
  1352. KERNEL1x16_SUBI1
  1353. addic. L, L, -1
  1354. ble SGEMM_L1x16_SAVE
  1355. b SGEMM_L1x16_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1356. SGEMM_L1x16_SUB1:
  1357. andi. L, K, 7
  1358. ble SGEMM_L1x16_SAVE
  /* One KERNEL1x16_SUB1 per remaining count in L. */
  1359. SGEMM_L1x16_SUB2:
  1360. KERNEL1x16_SUB1
  1361. addic. L, L, -1
  1362. bgt SGEMM_L1x16_SUB2
  1363. SGEMM_L1x16_SAVE:
  1364. SAVE1x16
  /* Next 16-wide tile while the counter I is still positive. */
  1365. addic. I, I, -1
  1366. bgt SGEMM_L1x16_BEGIN
  1367. SGEMM_L1x16_END:
  /*
   * SGEMM_L1x8 section: executed only when bit 3 of M is set (M & 8).
   * L = K >> 3 groups of eight KERNEL1x8_* invocations, then a K & 7
   * remainder loop, then SAVE1x8.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 1x8
   * tile update -- confirm.
   */
  1368. SGEMM_L1x8_BEGIN:
  /* M & 15 == 0: nothing narrower than 16 is left, skip all smaller tiles. */
  1369. andi. T2, M, 15
  1370. ble SGEMM_L1x1_END
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1371. andi. T1, M, 8
  1372. ble SGEMM_L1x8_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1373. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1374. srawi. L, K, 3
  1375. ble SGEMM_L1x8_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1376. cmpwi cr0, L, 1
  1377. ble SGEMM_L1x8_SUB4
  /* L >= 2: first group, opened by LOAD1x8_1 and KERNEL1x8_I1. */
  1378. SGEMM_L1x8_LOOP_START:
  1379. LOAD1x8_1
  1380. KERNEL1x8_I1
  1381. KERNEL1x8_2
  1382. KERNEL1x8_1
  1383. KERNEL1x8_2
  1384. KERNEL1x8_1
  1385. KERNEL1x8_2
  1386. KERNEL1x8_1
  1387. KERNEL1x8_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1388. addic. L, L, -2
  1389. ble SGEMM_L1x8_LOOP_END
  1390. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1391. SGEMM_L1x8_LOOP:
  1392. KERNEL1x8_1
  1393. KERNEL1x8_2
  1394. KERNEL1x8_1
  1395. KERNEL1x8_2
  1396. KERNEL1x8_1
  1397. KERNEL1x8_2
  1398. KERNEL1x8_1
  1399. KERNEL1x8_2
  1400. addic. L, L, -1
  1401. bgt SGEMM_L1x8_LOOP
  /* Last group: closes with KERNEL1x8_E2 instead of KERNEL1x8_2. */
  1402. SGEMM_L1x8_LOOP_END:
  1403. KERNEL1x8_1
  1404. KERNEL1x8_2
  1405. KERNEL1x8_1
  1406. KERNEL1x8_2
  1407. KERNEL1x8_1
  1408. KERNEL1x8_2
  1409. KERNEL1x8_1
  1410. KERNEL1x8_E2
  1411. b SGEMM_L1x8_SUB1
  /* Exactly one group: KERNEL1x8_SUBI1 followed by seven KERNEL1x8_SUB1. */
  1412. SGEMM_L1x8_SUB4:
  1413. KERNEL1x8_SUBI1
  1414. KERNEL1x8_SUB1
  1415. KERNEL1x8_SUB1
  1416. KERNEL1x8_SUB1
  1417. KERNEL1x8_SUB1
  1418. KERNEL1x8_SUB1
  1419. KERNEL1x8_SUB1
  1420. KERNEL1x8_SUB1
  1421. b SGEMM_L1x8_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1422. SGEMM_L1x8_SUB0:
  1423. andi. L, K, 7
  1424. KERNEL1x8_SUBI1
  1425. addic. L, L, -1
  1426. ble SGEMM_L1x8_SAVE
  1427. b SGEMM_L1x8_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1428. SGEMM_L1x8_SUB1:
  1429. andi. L, K, 7
  1430. ble SGEMM_L1x8_SAVE
  /* One KERNEL1x8_SUB1 per remaining count in L. */
  1431. SGEMM_L1x8_SUB2:
  1432. KERNEL1x8_SUB1
  1433. addic. L, L, -1
  1434. bgt SGEMM_L1x8_SUB2
  1435. SGEMM_L1x8_SAVE:
  1436. SAVE1x8
  1437. SGEMM_L1x8_END:
  /*
   * SGEMM_L1x4 section: executed only when bit 2 of M is set (M & 4).
   * L = K >> 3 groups of eight KERNEL1x4_* invocations, then a K & 7
   * remainder loop, then SAVE1x4.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 1x4
   * tile update -- confirm.
   */
  1438. SGEMM_L1x4_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1439. andi. T1, M, 4
  1440. ble SGEMM_L1x4_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1441. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1442. srawi. L, K, 3
  1443. ble SGEMM_L1x4_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1444. cmpwi cr0, L, 1
  1445. ble SGEMM_L1x4_SUB4
  /* L >= 2: first group, opened by LOAD1x4_1 and KERNEL1x4_I1. */
  1446. SGEMM_L1x4_LOOP_START:
  1447. LOAD1x4_1
  1448. KERNEL1x4_I1
  1449. KERNEL1x4_2
  1450. KERNEL1x4_1
  1451. KERNEL1x4_2
  1452. KERNEL1x4_1
  1453. KERNEL1x4_2
  1454. KERNEL1x4_1
  1455. KERNEL1x4_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1456. addic. L, L, -2
  1457. ble SGEMM_L1x4_LOOP_END
  1458. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1459. SGEMM_L1x4_LOOP:
  1460. KERNEL1x4_1
  1461. KERNEL1x4_2
  1462. KERNEL1x4_1
  1463. KERNEL1x4_2
  1464. KERNEL1x4_1
  1465. KERNEL1x4_2
  1466. KERNEL1x4_1
  1467. KERNEL1x4_2
  1468. addic. L, L, -1
  1469. bgt SGEMM_L1x4_LOOP
  /* Last group: closes with KERNEL1x4_E2 instead of KERNEL1x4_2. */
  1470. SGEMM_L1x4_LOOP_END:
  1471. KERNEL1x4_1
  1472. KERNEL1x4_2
  1473. KERNEL1x4_1
  1474. KERNEL1x4_2
  1475. KERNEL1x4_1
  1476. KERNEL1x4_2
  1477. KERNEL1x4_1
  1478. KERNEL1x4_E2
  1479. b SGEMM_L1x4_SUB1
  /* Exactly one group: KERNEL1x4_SUBI1 followed by seven KERNEL1x4_SUB1. */
  1480. SGEMM_L1x4_SUB4:
  1481. KERNEL1x4_SUBI1
  1482. KERNEL1x4_SUB1
  1483. KERNEL1x4_SUB1
  1484. KERNEL1x4_SUB1
  1485. KERNEL1x4_SUB1
  1486. KERNEL1x4_SUB1
  1487. KERNEL1x4_SUB1
  1488. KERNEL1x4_SUB1
  1489. b SGEMM_L1x4_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1490. SGEMM_L1x4_SUB0:
  1491. andi. L, K, 7
  1492. KERNEL1x4_SUBI1
  1493. addic. L, L, -1
  1494. ble SGEMM_L1x4_SAVE
  1495. b SGEMM_L1x4_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1496. SGEMM_L1x4_SUB1:
  1497. andi. L, K, 7
  1498. ble SGEMM_L1x4_SAVE
  /* One KERNEL1x4_SUB1 per remaining count in L. */
  1499. SGEMM_L1x4_SUB2:
  1500. KERNEL1x4_SUB1
  1501. addic. L, L, -1
  1502. bgt SGEMM_L1x4_SUB2
  1503. SGEMM_L1x4_SAVE:
  1504. SAVE1x4
  1505. SGEMM_L1x4_END:
  /*
   * SGEMM_L1x2 section: executed only when bit 1 of M is set (M & 2).
   * L = K >> 3 groups of eight KERNEL1x2_* invocations, then a K & 7
   * remainder loop, then SAVE1x2.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 1x2
   * tile update -- confirm.
   */
  1506. SGEMM_L1x2_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1507. andi. T1, M, 2
  1508. ble SGEMM_L1x2_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1509. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1510. srawi. L, K, 3
  1511. ble SGEMM_L1x2_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1512. cmpwi cr0, L, 1
  1513. ble SGEMM_L1x2_SUB4
  /* L >= 2: first group, opened by LOAD1x2_1 and KERNEL1x2_I1. */
  1514. SGEMM_L1x2_LOOP_START:
  1515. LOAD1x2_1
  1516. KERNEL1x2_I1
  1517. KERNEL1x2_2
  1518. KERNEL1x2_1
  1519. KERNEL1x2_2
  1520. KERNEL1x2_1
  1521. KERNEL1x2_2
  1522. KERNEL1x2_1
  1523. KERNEL1x2_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1524. addic. L, L, -2
  1525. ble SGEMM_L1x2_LOOP_END
  1526. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1527. SGEMM_L1x2_LOOP:
  1528. KERNEL1x2_1
  1529. KERNEL1x2_2
  1530. KERNEL1x2_1
  1531. KERNEL1x2_2
  1532. KERNEL1x2_1
  1533. KERNEL1x2_2
  1534. KERNEL1x2_1
  1535. KERNEL1x2_2
  1536. addic. L, L, -1
  1537. bgt SGEMM_L1x2_LOOP
  /* Last group: closes with KERNEL1x2_E2 instead of KERNEL1x2_2. */
  1538. SGEMM_L1x2_LOOP_END:
  1539. KERNEL1x2_1
  1540. KERNEL1x2_2
  1541. KERNEL1x2_1
  1542. KERNEL1x2_2
  1543. KERNEL1x2_1
  1544. KERNEL1x2_2
  1545. KERNEL1x2_1
  1546. KERNEL1x2_E2
  1547. b SGEMM_L1x2_SUB1
  /* Exactly one group: KERNEL1x2_SUBI1 followed by seven KERNEL1x2_SUB1. */
  1548. SGEMM_L1x2_SUB4:
  1549. KERNEL1x2_SUBI1
  1550. KERNEL1x2_SUB1
  1551. KERNEL1x2_SUB1
  1552. KERNEL1x2_SUB1
  1553. KERNEL1x2_SUB1
  1554. KERNEL1x2_SUB1
  1555. KERNEL1x2_SUB1
  1556. KERNEL1x2_SUB1
  1557. b SGEMM_L1x2_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1558. SGEMM_L1x2_SUB0:
  1559. andi. L, K, 7
  1560. KERNEL1x2_SUBI1
  1561. addic. L, L, -1
  1562. ble SGEMM_L1x2_SAVE
  1563. b SGEMM_L1x2_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1564. SGEMM_L1x2_SUB1:
  1565. andi. L, K, 7
  1566. ble SGEMM_L1x2_SAVE
  /* One KERNEL1x2_SUB1 per remaining count in L. */
  1567. SGEMM_L1x2_SUB2:
  1568. KERNEL1x2_SUB1
  1569. addic. L, L, -1
  1570. bgt SGEMM_L1x2_SUB2
  1571. SGEMM_L1x2_SAVE:
  1572. SAVE1x2
  1573. SGEMM_L1x2_END:
  /*
   * SGEMM_L1x1 section: executed only when bit 0 of M is set (M & 1).
   * L = K >> 3 groups of eight KERNEL1x1_* invocations, then a K & 7
   * remainder loop, then SAVE1x1.  The macros are defined outside this
   * section; presumably each KERNEL invocation is one k-step of a 1x1
   * tile update -- confirm.  SGEMM_L1_END directly follows this section.
   */
  1574. SGEMM_L1x1_BEGIN:
  /* andi. never produces a negative result, so ble is taken only on zero. */
  1575. andi. T1, M, 1
  1576. ble SGEMM_L1x1_END
  /* Restart the B-side pointer at BBUFFER for this tile. */
  1577. mr BO, BBUFFER
  /* L = K >> 3; L <= 0 goes to the remainder-only path. */
  1578. srawi. L, K, 3
  1579. ble SGEMM_L1x1_SUB0
  /* L == 1: the single group is handled in SUB4. */
  1580. cmpwi cr0, L, 1
  1581. ble SGEMM_L1x1_SUB4
  /* L >= 2: first group, opened by LOAD1x1_1 and KERNEL1x1_I1. */
  1582. SGEMM_L1x1_LOOP_START:
  1583. LOAD1x1_1
  1584. KERNEL1x1_I1
  1585. KERNEL1x1_2
  1586. KERNEL1x1_1
  1587. KERNEL1x1_2
  1588. KERNEL1x1_1
  1589. KERNEL1x1_2
  1590. KERNEL1x1_1
  1591. KERNEL1x1_2
  /* LOOP_START and LOOP_END each consume one group, hence the -2. */
  1592. addic. L, L, -2
  1593. ble SGEMM_L1x1_LOOP_END
  1594. .align 5
  /* Middle groups: eight invocations per pass, repeated while L > 0. */
  1595. SGEMM_L1x1_LOOP:
  1596. KERNEL1x1_1
  1597. KERNEL1x1_2
  1598. KERNEL1x1_1
  1599. KERNEL1x1_2
  1600. KERNEL1x1_1
  1601. KERNEL1x1_2
  1602. KERNEL1x1_1
  1603. KERNEL1x1_2
  1604. addic. L, L, -1
  1605. bgt SGEMM_L1x1_LOOP
  /* Last group: closes with KERNEL1x1_E2 instead of KERNEL1x1_2. */
  1606. SGEMM_L1x1_LOOP_END:
  1607. KERNEL1x1_1
  1608. KERNEL1x1_2
  1609. KERNEL1x1_1
  1610. KERNEL1x1_2
  1611. KERNEL1x1_1
  1612. KERNEL1x1_2
  1613. KERNEL1x1_1
  1614. KERNEL1x1_E2
  1615. b SGEMM_L1x1_SUB1
  /* Exactly one group: KERNEL1x1_SUBI1 followed by seven KERNEL1x1_SUB1. */
  1616. SGEMM_L1x1_SUB4:
  1617. KERNEL1x1_SUBI1
  1618. KERNEL1x1_SUB1
  1619. KERNEL1x1_SUB1
  1620. KERNEL1x1_SUB1
  1621. KERNEL1x1_SUB1
  1622. KERNEL1x1_SUB1
  1623. KERNEL1x1_SUB1
  1624. KERNEL1x1_SUB1
  1625. b SGEMM_L1x1_SUB1
  /*
   * Reached when K >> 3 <= 0: L = K & 7, SUBI1 takes the first step and
   * SUB2 the remaining L - 1.
   * NOTE(review): SUBI1 runs here even if K & 7 is 0; presumably callers
   * guarantee K > 0 -- confirm.
   */
  1626. SGEMM_L1x1_SUB0:
  1627. andi. L, K, 7
  1628. KERNEL1x1_SUBI1
  1629. addic. L, L, -1
  1630. ble SGEMM_L1x1_SAVE
  1631. b SGEMM_L1x1_SUB2
  /* Remainder after the full groups: L = K & 7, go to SAVE when it is 0. */
  1632. SGEMM_L1x1_SUB1:
  1633. andi. L, K, 7
  1634. ble SGEMM_L1x1_SAVE
  /* One KERNEL1x1_SUB1 per remaining count in L. */
  1635. SGEMM_L1x1_SUB2:
  1636. KERNEL1x1_SUB1
  1637. addic. L, L, -1
  1638. bgt SGEMM_L1x1_SUB2
  1639. SGEMM_L1x1_SAVE:
  1640. SAVE1x1
  1641. SGEMM_L1x1_END:
  1642. SGEMM_L1_END: