You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_2x8.S 51 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  45. #if defined(DOUBLE) && !defined(__64BIT__)
  46. #define A %i5
  47. #define B %i4
  48. #else
  49. #define A %i4
  50. #define B %i5
  51. #endif
  52. #define C %o4
  53. #define LDC %o5
  54. #define AO %l0
  55. #define BO %l1
  56. #define I %l2
  57. #define J %l3
  58. #define L %l4
  59. #define BB %o7
  60. #define C1 %o0
  61. #define C2 %o1
  62. #define C3 %o2
  63. #define C4 %o3
  64. #define C5 %l5
  65. #define C6 %l6
  66. #define C7 %l7
  67. #define C8 %i3
  68. #define OFFSET %g1
  69. #define KK %g2
  70. #define TEMP1 %g3
  71. #define TEMP2 %g4
  72. #ifdef DOUBLE
  73. #define c01 %f0
  74. #define c02 %f2
  75. #define c03 %f4
  76. #define c04 %f6
  77. #define c05 %f8
  78. #define c06 %f10
  79. #define c07 %f12
  80. #define c08 %f14
  81. #define c09 %f16
  82. #define c10 %f18
  83. #define c11 %f20
  84. #define c12 %f22
  85. #define c13 %f24
  86. #define c14 %f26
  87. #define c15 %f28
  88. #define c16 %f30
  89. #define a1 %f32
  90. #define a2 %f34
  91. #define a3 %f36
  92. #define a4 %f38
  93. #define a5 %f40
  94. #define b1 %f42
  95. #define b2 %f44
  96. #define b3 %f46
  97. #define b4 %f48
  98. #define b5 %f50
  99. #define b6 %f52
  100. #define b7 %f54
  101. #define b8 %f56
  102. #define b9 %f58
  103. #define ALPHA %f62
  104. #define cc01 0
  105. #define cc02 2
  106. #define cc03 4
  107. #define cc04 6
  108. #define cc05 8
  109. #define cc06 10
  110. #define cc07 12
  111. #define cc08 14
  112. #define cc09 16
  113. #define cc10 18
  114. #define cc11 20
  115. #define cc12 22
  116. #define cc13 24
  117. #define cc14 26
  118. #define cc15 28
  119. #define cc16 30
  120. #define aa1 1
  121. #define aa2 3
  122. #define aa3 5
  123. #define aa4 7
  124. #define aa5 9
  125. #define bb1 11
  126. #define bb2 13
  127. #define bb3 15
  128. #define bb4 17
  129. #define bb5 19
  130. #define bb6 21
  131. #define bb7 23
  132. #define bb8 25
  133. #define bb9 27
  134. #define alpha 31
  135. #else
  136. #define c01 %f0
  137. #define c02 %f1
  138. #define c03 %f2
  139. #define c04 %f3
  140. #define c05 %f4
  141. #define c06 %f5
  142. #define c07 %f6
  143. #define c08 %f7
  144. #define c09 %f8
  145. #define c10 %f9
  146. #define c11 %f10
  147. #define c12 %f11
  148. #define c13 %f12
  149. #define c14 %f13
  150. #define c15 %f14
  151. #define c16 %f15
  152. #define a1 %f16
  153. #define a2 %f17
  154. #define a3 %f18
  155. #define a4 %f19
  156. #define a5 %f20
  157. #define b1 %f21
  158. #define b2 %f22
  159. #define b3 %f23
  160. #define b4 %f24
  161. #define b5 %f25
  162. #define b6 %f26
  163. #define b7 %f27
  164. #define b8 %f28
  165. #define b9 %f29
  166. #define ALPHA %f31
  167. #define cc01 0
  168. #define cc02 1
  169. #define cc03 2
  170. #define cc04 3
  171. #define cc05 4
  172. #define cc06 5
  173. #define cc07 6
  174. #define cc08 7
  175. #define cc09 8
  176. #define cc10 9
  177. #define cc11 10
  178. #define cc12 11
  179. #define cc13 12
  180. #define cc14 13
  181. #define cc15 14
  182. #define cc16 15
  183. #define aa1 16
  184. #define aa2 17
  185. #define aa3 18
  186. #define aa4 19
  187. #define aa5 20
  188. #define bb1 21
  189. #define bb2 22
  190. #define bb3 23
  191. #define bb4 24
  192. #define bb5 25
  193. #define bb6 26
  194. #define bb7 27
  195. #define bb8 28
  196. #define bb9 29
  197. #define alpha 31
  198. #endif
  199. .register %g2, #scratch
  200. .register %g3, #scratch
  201. PROLOGUE
  202. SAVESP
  203. nop
  204. #ifndef __64BIT__
  205. #ifdef DOUBLE
  206. st %i3, [%sp + STACK_START + 16]
  207. st %i4, [%sp + STACK_START + 20]
  208. ld [%sp + STACK_START + 28], B
  209. ld [%sp + STACK_START + 32], C
  210. ld [%sp + STACK_START + 36], LDC
  211. #ifdef TRMMKERNEL
  212. ld [%sp + STACK_START + 40], OFFSET
  213. #endif
  214. #else
  215. st %i3, [%sp + STACK_START + 16]
  216. ld [%sp + STACK_START + 28], C
  217. ld [%sp + STACK_START + 32], LDC
  218. #ifdef TRMMKERNEL
  219. ld [%sp + STACK_START + 36], OFFSET
  220. #endif
  221. #endif
  222. LDF [%sp + STACK_START + 16], ALPHA
  223. #ifdef TRMMKERNEL
  224. st %g1, [%sp + STACK_START + 8]
  225. st %g2, [%sp + STACK_START + 12]
  226. st %g3, [%sp + STACK_START + 16]
  227. st %g4, [%sp + STACK_START + 20]
  228. #endif
  229. #else
  230. ldx [%sp+ STACK_START + 56], C
  231. ldx [%sp+ STACK_START + 64], LDC
  232. #ifdef TRMMKERNEL
  233. ldx [%sp+ STACK_START + 72], OFFSET
  234. #endif
  235. #ifdef DOUBLE
  236. FMOV %f6, ALPHA
  237. #else
  238. FMOV %f7, ALPHA
  239. #endif
  240. #ifdef TRMMKERNEL
  241. stx %g1, [%sp + STACK_START + 32]
  242. stx %g2, [%sp + STACK_START + 40]
  243. stx %g3, [%sp + STACK_START + 48]
  244. stx %g4, [%sp + STACK_START + 56]
  245. #endif
  246. #endif
  247. #if defined(TRMMKERNEL) && !defined(LEFT)
  248. neg OFFSET, KK
  249. #endif
  250. sra N, 3, J
  251. cmp J, 0
  252. ble,pn %icc, .LL30
  253. sll LDC, BASE_SHIFT, LDC
  254. .LL11:
  255. mov C, C1
  256. add C, LDC, C2
  257. add C2, LDC, C3
  258. add C3, LDC, C4
  259. add C4, LDC, C5
  260. add C5, LDC, C6
  261. add C6, LDC, C7
  262. add C7, LDC, C8
  263. add C8, LDC, C
  264. sll K, BASE_SHIFT + 3, BB
  265. #if defined(TRMMKERNEL) && defined(LEFT)
  266. mov OFFSET, KK
  267. #endif
  268. mov A, AO
  269. sra M, 1, I
  270. cmp I, 0
  271. ble,pn %icc, .LL20
  272. add B, BB, BB
  273. .align 4
  274. .LL12:
  275. prefetch [BB + 0 * SIZE], 1
  276. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  277. mov B, BO
  278. #else
  279. sll KK, BASE_SHIFT + 1, TEMP1
  280. sll KK, BASE_SHIFT + 3, TEMP2
  281. add AO, TEMP1, AO
  282. add B, TEMP2, BO
  283. #endif
  284. LDF [AO + 0 * SIZE], a1
  285. LDF [AO + 1 * SIZE], a2
  286. LDF [AO + 8 * SIZE], a5
  287. LDF [BO + 0 * SIZE], b1
  288. LDF [BO + 1 * SIZE], b2
  289. FCLR (cc01)
  290. LDF [BO + 2 * SIZE], b3
  291. FCLR (cc05)
  292. LDF [BO + 3 * SIZE], b4
  293. FCLR (cc09)
  294. LDF [BO + 4 * SIZE], b5
  295. FCLR (cc13)
  296. LDF [BO + 5 * SIZE], b6
  297. FCLR (cc02)
  298. LDF [BO + 6 * SIZE], b7
  299. FCLR (cc06)
  300. LDF [BO + 7 * SIZE], b8
  301. FCLR (cc10)
  302. LDF [BO + 8 * SIZE], b9
  303. FCLR (cc14)
  304. prefetch [C1 + 1 * SIZE], 3
  305. FCLR (cc03)
  306. prefetch [C2 + 2 * SIZE], 3
  307. FCLR (cc07)
  308. prefetch [C3 + 1 * SIZE], 3
  309. FCLR (cc11)
  310. prefetch [C4 + 2 * SIZE], 3
  311. FCLR (cc15)
  312. prefetch [C5 + 1 * SIZE], 3
  313. FCLR (cc04)
  314. prefetch [C6 + 2 * SIZE], 3
  315. FCLR (cc08)
  316. prefetch [C7 + 1 * SIZE], 3
  317. FCLR (cc12)
  318. prefetch [C8 + 2 * SIZE], 3
  319. FCLR (cc16)
  320. #ifndef TRMMKERNEL
  321. sra K, 3, L
  322. #else
  323. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  324. sub K, KK, L
  325. #elif defined(LEFT)
  326. add KK, 2, L
  327. #else
  328. add KK, 8, L
  329. #endif
  330. sra L, 3, L
  331. #endif
  332. cmp L, 0
  333. ble,pn %icc, .LL15
  334. add BB, 32 * SIZE, BB
  335. .align 4
  336. .LL13:
  337. FMADD (aa1, bb1, cc01, cc01)
  338. FMADD (aa2, bb1, cc02, cc02)
  339. FMADD (aa1, bb2, cc03, cc03)
  340. FMADD (aa2, bb2, cc04, cc04)
  341. FMADD (aa1, bb3, cc05, cc05)
  342. LDF [BO + 16 * SIZE], b1
  343. FMADD (aa2, bb3, cc06, cc06)
  344. LDF [BO + 9 * SIZE], b2
  345. FMADD (aa1, bb4, cc07, cc07)
  346. LDF [BO + 10 * SIZE], b3
  347. FMADD (aa2, bb4, cc08, cc08)
  348. LDF [BO + 11 * SIZE], b4
  349. FMADD (aa1, bb5, cc09, cc09)
  350. LDF [AO + 2 * SIZE], a3
  351. FMADD (aa2, bb5, cc10, cc10)
  352. LDF [AO + 3 * SIZE], a4
  353. FMADD (aa1, bb6, cc11, cc11)
  354. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  355. FMADD (aa2, bb6, cc12, cc12)
  356. nop
  357. FMADD (aa1, bb7, cc13, cc13)
  358. LDF [BO + 12 * SIZE], b5
  359. FMADD (aa2, bb7, cc14, cc14)
  360. LDF [BO + 13 * SIZE], b6
  361. FMADD (aa1, bb8, cc15, cc15)
  362. LDF [BO + 14 * SIZE], b7
  363. FMADD (aa2, bb8, cc16, cc16)
  364. LDF [BO + 15 * SIZE], b8
  365. FMADD (aa3, bb9, cc01, cc01)
  366. FMADD (aa4, bb9, cc02, cc02)
  367. FMADD (aa3, bb2, cc03, cc03)
  368. FMADD (aa4, bb2, cc04, cc04)
  369. FMADD (aa3, bb3, cc05, cc05)
  370. LDF [BO + 24 * SIZE], b9
  371. FMADD (aa4, bb3, cc06, cc06)
  372. LDF [BO + 17 * SIZE], b2
  373. FMADD (aa3, bb4, cc07, cc07)
  374. LDF [BO + 18 * SIZE], b3
  375. FMADD (aa4, bb4, cc08, cc08)
  376. LDF [BO + 19 * SIZE], b4
  377. FMADD (aa3, bb5, cc09, cc09)
  378. LDF [AO + 4 * SIZE], a1
  379. FMADD (aa4, bb5, cc10, cc10)
  380. LDF [AO + 5 * SIZE], a2
  381. FMADD (aa3, bb6, cc11, cc11)
  382. add L, -1, L
  383. FMADD (aa4, bb6, cc12, cc12)
  384. nop
  385. FMADD (aa3, bb7, cc13, cc13)
  386. LDF [BO + 20 * SIZE], b5
  387. FMADD (aa4, bb7, cc14, cc14)
  388. LDF [BO + 21 * SIZE], b6
  389. FMADD (aa3, bb8, cc15, cc15)
  390. LDF [BO + 22 * SIZE], b7
  391. FMADD (aa4, bb8, cc16, cc16)
  392. LDF [BO + 23 * SIZE], b8
  393. FMADD (aa1, bb1, cc01, cc01)
  394. FMADD (aa2, bb1, cc02, cc02)
  395. FMADD (aa1, bb2, cc03, cc03)
  396. FMADD (aa2, bb2, cc04, cc04)
  397. FMADD (aa1, bb3, cc05, cc05)
  398. LDF [BO + 32 * SIZE], b1
  399. FMADD (aa2, bb3, cc06, cc06)
  400. LDF [BO + 25 * SIZE], b2
  401. FMADD (aa1, bb4, cc07, cc07)
  402. LDF [BO + 26 * SIZE], b3
  403. FMADD (aa2, bb4, cc08, cc08)
  404. LDF [BO + 27 * SIZE], b4
  405. FMADD (aa1, bb5, cc09, cc09)
  406. LDF [AO + 6 * SIZE], a3
  407. FMADD (aa2, bb5, cc10, cc10)
  408. LDF [AO + 7 * SIZE], a4
  409. FMADD (aa1, bb6, cc11, cc11)
  410. nop
  411. FMADD (aa2, bb6, cc12, cc12)
  412. nop
  413. FMADD (aa1, bb7, cc13, cc13)
  414. LDF [BO + 28 * SIZE], b5
  415. FMADD (aa2, bb7, cc14, cc14)
  416. LDF [BO + 29 * SIZE], b6
  417. FMADD (aa1, bb8, cc15, cc15)
  418. LDF [BO + 30 * SIZE], b7
  419. FMADD (aa2, bb8, cc16, cc16)
  420. LDF [BO + 31 * SIZE], b8
  421. FMADD (aa3, bb9, cc01, cc01)
  422. FMADD (aa4, bb9, cc02, cc02)
  423. FMADD (aa3, bb2, cc03, cc03)
  424. FMADD (aa4, bb2, cc04, cc04)
  425. FMADD (aa3, bb3, cc05, cc05)
  426. LDF [BO + 40 * SIZE], b9
  427. FMADD (aa4, bb3, cc06, cc06)
  428. LDF [BO + 33 * SIZE], b2
  429. FMADD (aa3, bb4, cc07, cc07)
  430. LDF [BO + 34 * SIZE], b3
  431. FMADD (aa4, bb4, cc08, cc08)
  432. LDF [BO + 35 * SIZE], b4
  433. FMADD (aa3, bb5, cc09, cc09)
  434. LDF [AO + 16 * SIZE], a1 /****/
  435. FMADD (aa4, bb5, cc10, cc10)
  436. LDF [AO + 9 * SIZE], a2
  437. FMADD (aa3, bb6, cc11, cc11)
  438. nop
  439. FMADD (aa4, bb6, cc12, cc12)
  440. nop
  441. FMADD (aa3, bb7, cc13, cc13)
  442. LDF [BO + 36 * SIZE], b5
  443. FMADD (aa4, bb7, cc14, cc14)
  444. LDF [BO + 37 * SIZE], b6
  445. FMADD (aa3, bb8, cc15, cc15)
  446. LDF [BO + 38 * SIZE], b7
  447. FMADD (aa4, bb8, cc16, cc16)
  448. LDF [BO + 39 * SIZE], b8
  449. FMADD (aa5, bb1, cc01, cc01)
  450. FMADD (aa2, bb1, cc02, cc02)
  451. FMADD (aa5, bb2, cc03, cc03)
  452. FMADD (aa2, bb2, cc04, cc04)
  453. FMADD (aa5, bb3, cc05, cc05)
  454. LDF [BO + 48 * SIZE], b1
  455. FMADD (aa2, bb3, cc06, cc06)
  456. LDF [BO + 41 * SIZE], b2
  457. FMADD (aa5, bb4, cc07, cc07)
  458. LDF [BO + 42 * SIZE], b3
  459. FMADD (aa2, bb4, cc08, cc08)
  460. LDF [BO + 43 * SIZE], b4
  461. FMADD (aa5, bb5, cc09, cc09)
  462. LDF [AO + 10 * SIZE], a3
  463. FMADD (aa2, bb5, cc10, cc10)
  464. LDF [AO + 11 * SIZE], a4
  465. FMADD (aa5, bb6, cc11, cc11)
  466. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  467. FMADD (aa2, bb6, cc12, cc12)
  468. nop
  469. FMADD (aa5, bb7, cc13, cc13)
  470. LDF [BO + 44 * SIZE], b5
  471. FMADD (aa2, bb7, cc14, cc14)
  472. LDF [BO + 45 * SIZE], b6
  473. FMADD (aa5, bb8, cc15, cc15)
  474. LDF [BO + 46 * SIZE], b7
  475. FMADD (aa2, bb8, cc16, cc16)
  476. LDF [BO + 47 * SIZE], b8
  477. FMADD (aa3, bb9, cc01, cc01)
  478. FMADD (aa4, bb9, cc02, cc02)
  479. FMADD (aa3, bb2, cc03, cc03)
  480. FMADD (aa4, bb2, cc04, cc04)
  481. FMADD (aa3, bb3, cc05, cc05)
  482. LDF [BO + 56 * SIZE], b9
  483. FMADD (aa4, bb3, cc06, cc06)
  484. LDF [BO + 49 * SIZE], b2
  485. FMADD (aa3, bb4, cc07, cc07)
  486. LDF [BO + 50 * SIZE], b3
  487. FMADD (aa4, bb4, cc08, cc08)
  488. LDF [BO + 51 * SIZE], b4
  489. FMADD (aa3, bb5, cc09, cc09)
  490. LDF [AO + 12 * SIZE], a5
  491. FMADD (aa4, bb5, cc10, cc10)
  492. LDF [AO + 13 * SIZE], a2
  493. FMADD (aa3, bb6, cc11, cc11)
  494. cmp L, 0
  495. FMADD (aa4, bb6, cc12, cc12)
  496. nop
  497. FMADD (aa3, bb7, cc13, cc13)
  498. LDF [BO + 52 * SIZE], b5
  499. FMADD (aa4, bb7, cc14, cc14)
  500. LDF [BO + 53 * SIZE], b6
  501. FMADD (aa3, bb8, cc15, cc15)
  502. LDF [BO + 54 * SIZE], b7
  503. FMADD (aa4, bb8, cc16, cc16)
  504. LDF [BO + 55 * SIZE], b8
  505. FMADD (aa5, bb1, cc01, cc01)
  506. FMADD (aa2, bb1, cc02, cc02)
  507. FMADD (aa5, bb2, cc03, cc03)
  508. FMADD (aa2, bb2, cc04, cc04)
  509. FMADD (aa5, bb3, cc05, cc05)
  510. LDF [BO + 64 * SIZE], b1
  511. FMADD (aa2, bb3, cc06, cc06)
  512. LDF [BO + 57 * SIZE], b2
  513. FMADD (aa5, bb4, cc07, cc07)
  514. LDF [BO + 58 * SIZE], b3
  515. FMADD (aa2, bb4, cc08, cc08)
  516. LDF [BO + 59 * SIZE], b4
  517. FMADD (aa5, bb5, cc09, cc09)
  518. LDF [AO + 14 * SIZE], a3
  519. FMADD (aa2, bb5, cc10, cc10)
  520. LDF [AO + 15 * SIZE], a4
  521. FMADD (aa5, bb6, cc11, cc11)
  522. add BO, 64 * SIZE, BO
  523. FMADD (aa2, bb6, cc12, cc12)
  524. add AO, 16 * SIZE, AO
  525. FMADD (aa5, bb7, cc13, cc13)
  526. LDF [BO - 4 * SIZE], b5
  527. FMADD (aa2, bb7, cc14, cc14)
  528. LDF [BO - 3 * SIZE], b6
  529. FMADD (aa5, bb8, cc15, cc15)
  530. LDF [BO - 2 * SIZE], b7
  531. FMADD (aa2, bb8, cc16, cc16)
  532. LDF [BO - 1 * SIZE], b8
  533. FMADD (aa3, bb9, cc01, cc01)
  534. FMADD (aa4, bb9, cc02, cc02)
  535. FMADD (aa3, bb2, cc03, cc03)
  536. FMADD (aa4, bb2, cc04, cc04)
  537. FMADD (aa3, bb3, cc05, cc05)
  538. LDF [BO + 8 * SIZE], b9
  539. FMADD (aa4, bb3, cc06, cc06)
  540. LDF [BO + 1 * SIZE], b2
  541. FMADD (aa3, bb4, cc07, cc07)
  542. LDF [BO + 2 * SIZE], b3
  543. FMADD (aa4, bb4, cc08, cc08)
  544. LDF [BO + 3 * SIZE], b4
  545. FMADD (aa3, bb5, cc09, cc09)
  546. LDF [AO + 8 * SIZE], a5 /****/
  547. FMADD (aa4, bb5, cc10, cc10)
  548. LDF [AO + 1 * SIZE], a2
  549. FMADD (aa3, bb6, cc11, cc11)
  550. FMADD (aa4, bb6, cc12, cc12)
  551. FMADD (aa3, bb7, cc13, cc13)
  552. LDF [BO + 4 * SIZE], b5
  553. FMADD (aa4, bb7, cc14, cc14)
  554. LDF [BO + 5 * SIZE], b6
  555. FMADD (aa3, bb8, cc15, cc15)
  556. LDF [BO + 6 * SIZE], b7
  557. FMADD (aa4, bb8, cc16, cc16)
  558. ble,pn %icc, .LL15
  559. LDF [BO + 7 * SIZE], b8
  560. FMADD (aa1, bb1, cc01, cc01)
  561. FMADD (aa2, bb1, cc02, cc02)
  562. FMADD (aa1, bb2, cc03, cc03)
  563. FMADD (aa2, bb2, cc04, cc04)
  564. FMADD (aa1, bb3, cc05, cc05)
  565. LDF [BO + 16 * SIZE], b1
  566. FMADD (aa2, bb3, cc06, cc06)
  567. LDF [BO + 9 * SIZE], b2
  568. FMADD (aa1, bb4, cc07, cc07)
  569. LDF [BO + 10 * SIZE], b3
  570. FMADD (aa2, bb4, cc08, cc08)
  571. LDF [BO + 11 * SIZE], b4
  572. FMADD (aa1, bb5, cc09, cc09)
  573. LDF [AO + 2 * SIZE], a3
  574. FMADD (aa2, bb5, cc10, cc10)
  575. LDF [AO + 3 * SIZE], a4
  576. FMADD (aa1, bb6, cc11, cc11)
  577. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  578. FMADD (aa2, bb6, cc12, cc12)
  579. nop
  580. FMADD (aa1, bb7, cc13, cc13)
  581. LDF [BO + 12 * SIZE], b5
  582. FMADD (aa2, bb7, cc14, cc14)
  583. LDF [BO + 13 * SIZE], b6
  584. FMADD (aa1, bb8, cc15, cc15)
  585. LDF [BO + 14 * SIZE], b7
  586. FMADD (aa2, bb8, cc16, cc16)
  587. LDF [BO + 15 * SIZE], b8
  588. FMADD (aa3, bb9, cc01, cc01)
  589. FMADD (aa4, bb9, cc02, cc02)
  590. FMADD (aa3, bb2, cc03, cc03)
  591. FMADD (aa4, bb2, cc04, cc04)
  592. FMADD (aa3, bb3, cc05, cc05)
  593. LDF [BO + 24 * SIZE], b9
  594. FMADD (aa4, bb3, cc06, cc06)
  595. LDF [BO + 17 * SIZE], b2
  596. FMADD (aa3, bb4, cc07, cc07)
  597. LDF [BO + 18 * SIZE], b3
  598. FMADD (aa4, bb4, cc08, cc08)
  599. LDF [BO + 19 * SIZE], b4
  600. FMADD (aa3, bb5, cc09, cc09)
  601. LDF [AO + 4 * SIZE], a1
  602. FMADD (aa4, bb5, cc10, cc10)
  603. LDF [AO + 5 * SIZE], a2
  604. FMADD (aa3, bb6, cc11, cc11)
  605. add L, -1, L
  606. FMADD (aa4, bb6, cc12, cc12)
  607. nop
  608. FMADD (aa3, bb7, cc13, cc13)
  609. LDF [BO + 20 * SIZE], b5
  610. FMADD (aa4, bb7, cc14, cc14)
  611. LDF [BO + 21 * SIZE], b6
  612. FMADD (aa3, bb8, cc15, cc15)
  613. LDF [BO + 22 * SIZE], b7
  614. FMADD (aa4, bb8, cc16, cc16)
  615. LDF [BO + 23 * SIZE], b8
  616. FMADD (aa1, bb1, cc01, cc01)
  617. FMADD (aa2, bb1, cc02, cc02)
  618. FMADD (aa1, bb2, cc03, cc03)
  619. FMADD (aa2, bb2, cc04, cc04)
  620. FMADD (aa1, bb3, cc05, cc05)
  621. LDF [BO + 32 * SIZE], b1
  622. FMADD (aa2, bb3, cc06, cc06)
  623. LDF [BO + 25 * SIZE], b2
  624. FMADD (aa1, bb4, cc07, cc07)
  625. LDF [BO + 26 * SIZE], b3
  626. FMADD (aa2, bb4, cc08, cc08)
  627. LDF [BO + 27 * SIZE], b4
  628. FMADD (aa1, bb5, cc09, cc09)
  629. LDF [AO + 6 * SIZE], a3
  630. FMADD (aa2, bb5, cc10, cc10)
  631. LDF [AO + 7 * SIZE], a4
  632. FMADD (aa1, bb6, cc11, cc11)
  633. nop
  634. FMADD (aa2, bb6, cc12, cc12)
  635. nop
  636. FMADD (aa1, bb7, cc13, cc13)
  637. LDF [BO + 28 * SIZE], b5
  638. FMADD (aa2, bb7, cc14, cc14)
  639. LDF [BO + 29 * SIZE], b6
  640. FMADD (aa1, bb8, cc15, cc15)
  641. LDF [BO + 30 * SIZE], b7
  642. FMADD (aa2, bb8, cc16, cc16)
  643. LDF [BO + 31 * SIZE], b8
  644. FMADD (aa3, bb9, cc01, cc01)
  645. FMADD (aa4, bb9, cc02, cc02)
  646. FMADD (aa3, bb2, cc03, cc03)
  647. FMADD (aa4, bb2, cc04, cc04)
  648. FMADD (aa3, bb3, cc05, cc05)
  649. LDF [BO + 40 * SIZE], b9
  650. FMADD (aa4, bb3, cc06, cc06)
  651. LDF [BO + 33 * SIZE], b2
  652. FMADD (aa3, bb4, cc07, cc07)
  653. LDF [BO + 34 * SIZE], b3
  654. FMADD (aa4, bb4, cc08, cc08)
  655. LDF [BO + 35 * SIZE], b4
  656. FMADD (aa3, bb5, cc09, cc09)
  657. LDF [AO + 16 * SIZE], a1 /****/
  658. FMADD (aa4, bb5, cc10, cc10)
  659. LDF [AO + 9 * SIZE], a2
  660. FMADD (aa3, bb6, cc11, cc11)
  661. nop
  662. FMADD (aa4, bb6, cc12, cc12)
  663. nop
  664. FMADD (aa3, bb7, cc13, cc13)
  665. LDF [BO + 36 * SIZE], b5
  666. FMADD (aa4, bb7, cc14, cc14)
  667. LDF [BO + 37 * SIZE], b6
  668. FMADD (aa3, bb8, cc15, cc15)
  669. LDF [BO + 38 * SIZE], b7
  670. FMADD (aa4, bb8, cc16, cc16)
  671. LDF [BO + 39 * SIZE], b8
  672. FMADD (aa5, bb1, cc01, cc01)
  673. FMADD (aa2, bb1, cc02, cc02)
  674. FMADD (aa5, bb2, cc03, cc03)
  675. FMADD (aa2, bb2, cc04, cc04)
  676. FMADD (aa5, bb3, cc05, cc05)
  677. LDF [BO + 48 * SIZE], b1
  678. FMADD (aa2, bb3, cc06, cc06)
  679. LDF [BO + 41 * SIZE], b2
  680. FMADD (aa5, bb4, cc07, cc07)
  681. LDF [BO + 42 * SIZE], b3
  682. FMADD (aa2, bb4, cc08, cc08)
  683. LDF [BO + 43 * SIZE], b4
  684. FMADD (aa5, bb5, cc09, cc09)
  685. LDF [AO + 10 * SIZE], a3
  686. FMADD (aa2, bb5, cc10, cc10)
  687. LDF [AO + 11 * SIZE], a4
  688. FMADD (aa5, bb6, cc11, cc11)
  689. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  690. FMADD (aa2, bb6, cc12, cc12)
  691. nop
  692. FMADD (aa5, bb7, cc13, cc13)
  693. LDF [BO + 44 * SIZE], b5
  694. FMADD (aa2, bb7, cc14, cc14)
  695. LDF [BO + 45 * SIZE], b6
  696. FMADD (aa5, bb8, cc15, cc15)
  697. LDF [BO + 46 * SIZE], b7
  698. FMADD (aa2, bb8, cc16, cc16)
  699. LDF [BO + 47 * SIZE], b8
  700. FMADD (aa3, bb9, cc01, cc01)
  701. FMADD (aa4, bb9, cc02, cc02)
  702. FMADD (aa3, bb2, cc03, cc03)
  703. FMADD (aa4, bb2, cc04, cc04)
  704. FMADD (aa3, bb3, cc05, cc05)
  705. LDF [BO + 56 * SIZE], b9
  706. FMADD (aa4, bb3, cc06, cc06)
  707. LDF [BO + 49 * SIZE], b2
  708. FMADD (aa3, bb4, cc07, cc07)
  709. LDF [BO + 50 * SIZE], b3
  710. FMADD (aa4, bb4, cc08, cc08)
  711. LDF [BO + 51 * SIZE], b4
  712. FMADD (aa3, bb5, cc09, cc09)
  713. LDF [AO + 12 * SIZE], a5
  714. FMADD (aa4, bb5, cc10, cc10)
  715. LDF [AO + 13 * SIZE], a2
  716. FMADD (aa3, bb6, cc11, cc11)
  717. cmp L, 0
  718. FMADD (aa4, bb6, cc12, cc12)
  719. nop
  720. FMADD (aa3, bb7, cc13, cc13)
  721. LDF [BO + 52 * SIZE], b5
  722. FMADD (aa4, bb7, cc14, cc14)
  723. LDF [BO + 53 * SIZE], b6
  724. FMADD (aa3, bb8, cc15, cc15)
  725. LDF [BO + 54 * SIZE], b7
  726. FMADD (aa4, bb8, cc16, cc16)
  727. LDF [BO + 55 * SIZE], b8
  728. FMADD (aa5, bb1, cc01, cc01)
  729. FMADD (aa2, bb1, cc02, cc02)
  730. FMADD (aa5, bb2, cc03, cc03)
  731. FMADD (aa2, bb2, cc04, cc04)
  732. FMADD (aa5, bb3, cc05, cc05)
  733. LDF [BO + 64 * SIZE], b1
  734. FMADD (aa2, bb3, cc06, cc06)
  735. LDF [BO + 57 * SIZE], b2
  736. FMADD (aa5, bb4, cc07, cc07)
  737. LDF [BO + 58 * SIZE], b3
  738. FMADD (aa2, bb4, cc08, cc08)
  739. LDF [BO + 59 * SIZE], b4
  740. FMADD (aa5, bb5, cc09, cc09)
  741. LDF [AO + 14 * SIZE], a3
  742. FMADD (aa2, bb5, cc10, cc10)
  743. LDF [AO + 15 * SIZE], a4
  744. FMADD (aa5, bb6, cc11, cc11)
  745. add BO, 64 * SIZE, BO
  746. FMADD (aa2, bb6, cc12, cc12)
  747. add AO, 16 * SIZE, AO
  748. FMADD (aa5, bb7, cc13, cc13)
  749. LDF [BO - 4 * SIZE], b5
  750. FMADD (aa2, bb7, cc14, cc14)
  751. LDF [BO - 3 * SIZE], b6
  752. FMADD (aa5, bb8, cc15, cc15)
  753. LDF [BO - 2 * SIZE], b7
  754. FMADD (aa2, bb8, cc16, cc16)
  755. LDF [BO - 1 * SIZE], b8
  756. FMADD (aa3, bb9, cc01, cc01)
  757. FMADD (aa4, bb9, cc02, cc02)
  758. FMADD (aa3, bb2, cc03, cc03)
  759. FMADD (aa4, bb2, cc04, cc04)
  760. FMADD (aa3, bb3, cc05, cc05)
  761. LDF [BO + 8 * SIZE], b9
  762. FMADD (aa4, bb3, cc06, cc06)
  763. LDF [BO + 1 * SIZE], b2
  764. FMADD (aa3, bb4, cc07, cc07)
  765. LDF [BO + 2 * SIZE], b3
  766. FMADD (aa4, bb4, cc08, cc08)
  767. LDF [BO + 3 * SIZE], b4
  768. FMADD (aa3, bb5, cc09, cc09)
  769. LDF [AO + 8 * SIZE], a5 /****/
  770. FMADD (aa4, bb5, cc10, cc10)
  771. LDF [AO + 1 * SIZE], a2
  772. FMADD (aa3, bb6, cc11, cc11)
  773. FMADD (aa4, bb6, cc12, cc12)
  774. FMADD (aa3, bb7, cc13, cc13)
  775. LDF [BO + 4 * SIZE], b5
  776. FMADD (aa4, bb7, cc14, cc14)
  777. LDF [BO + 5 * SIZE], b6
  778. FMADD (aa3, bb8, cc15, cc15)
  779. LDF [BO + 6 * SIZE], b7
  780. FMADD (aa4, bb8, cc16, cc16)
  781. bg,pt %icc, .LL13
  782. LDF [BO + 7 * SIZE], b8
  783. .align 4
  784. .LL15:
  785. #ifndef TRMMKERNEL
  786. and K, 7, L
  787. #else
  788. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  789. sub K, KK, L
  790. #elif defined(LEFT)
  791. add KK, 2, L
  792. #else
  793. add KK, 8, L
  794. #endif
  795. and L, 7, L
  796. #endif
  797. cmp L, 0
  798. ble,a,pn %icc, .LL18
  799. nop
  800. .align 4
  801. .LL17:
  802. FMADD (aa1, bb1, cc01, cc01)
  803. add L, -1, L
  804. FMADD (aa2, bb1, cc02, cc02)
  805. nop
  806. FMADD (aa1, bb2, cc03, cc03)
  807. LDF [BO + 8 * SIZE], b1
  808. FMADD (aa2, bb2, cc04, cc04)
  809. LDF [BO + 9 * SIZE], b2
  810. FMADD (aa1, bb3, cc05, cc05)
  811. cmp L, 0
  812. FMADD (aa2, bb3, cc06, cc06)
  813. nop
  814. FMADD (aa1, bb4, cc07, cc07)
  815. LDF [BO + 10 * SIZE], b3
  816. FMADD (aa2, bb4, cc08, cc08)
  817. LDF [BO + 11 * SIZE], b4
  818. FMADD (aa1, bb5, cc09, cc09)
  819. nop
  820. FMADD (aa2, bb5, cc10, cc10)
  821. nop
  822. FMADD (aa1, bb6, cc11, cc11)
  823. LDF [BO + 12 * SIZE], b5
  824. FMADD (aa2, bb6, cc12, cc12)
  825. LDF [BO + 13 * SIZE], b6
  826. FMADD (aa1, bb7, cc13, cc13)
  827. add AO, 2 * SIZE, AO
  828. FMADD (aa2, bb7, cc14, cc14)
  829. add BO, 8 * SIZE, BO
  830. FMADD (aa1, bb8, cc15, cc15)
  831. LDF [AO + 0 * SIZE], a1
  832. FMADD (aa2, bb8, cc16, cc16)
  833. LDF [AO + 1 * SIZE], a2
  834. LDF [BO + 6 * SIZE], b7
  835. bg,pt %icc, .LL17
  836. LDF [BO + 7 * SIZE], b8
  837. nop
  838. .align 4
  839. .LL18:
  840. #ifndef TRMMKERNEL
  841. LDF [C1 + 0 * SIZE], a1
  842. LDF [C1 + 1 * SIZE], a2
  843. LDF [C2 + 0 * SIZE], a3
  844. LDF [C2 + 1 * SIZE], a4
  845. LDF [C3 + 0 * SIZE], b1
  846. LDF [C3 + 1 * SIZE], b2
  847. LDF [C4 + 0 * SIZE], b3
  848. LDF [C4 + 1 * SIZE], b4
  849. FMADD (alpha, cc01, aa1, cc01)
  850. LDF [C5 + 0 * SIZE], a1
  851. FMADD (alpha, cc02, aa2, cc02)
  852. LDF [C5 + 1 * SIZE], a2
  853. FMADD (alpha, cc03, aa3, cc03)
  854. LDF [C6 + 0 * SIZE], a3
  855. FMADD (alpha, cc04, aa4, cc04)
  856. LDF [C6 + 1 * SIZE], a4
  857. FMADD (alpha, cc05, bb1, cc05)
  858. LDF [C7 + 0 * SIZE], b1
  859. FMADD (alpha, cc06, bb2, cc06)
  860. LDF [C7 + 1 * SIZE], b2
  861. FMADD (alpha, cc07, bb3, cc07)
  862. LDF [C8 + 0 * SIZE], b3
  863. FMADD (alpha, cc08, bb4, cc08)
  864. LDF [C8 + 1 * SIZE], b4
  865. FMADD (alpha, cc09, aa1, cc09)
  866. STF c01, [C1 + 0 * SIZE]
  867. FMADD (alpha, cc10, aa2, cc10)
  868. STF c02, [C1 + 1 * SIZE]
  869. FMADD (alpha, cc11, aa3, cc11)
  870. STF c03, [C2 + 0 * SIZE]
  871. FMADD (alpha, cc12, aa4, cc12)
  872. STF c04, [C2 + 1 * SIZE]
  873. FMADD (alpha, cc13, bb1, cc13)
  874. STF c05, [C3 + 0 * SIZE]
  875. FMADD (alpha, cc14, bb2, cc14)
  876. STF c06, [C3 + 1 * SIZE]
  877. FMADD (alpha, cc15, bb3, cc15)
  878. STF c07, [C4 + 0 * SIZE]
  879. FMADD (alpha, cc16, bb4, cc16)
  880. STF c08, [C4 + 1 * SIZE]
  881. #else
  882. FMUL ALPHA, c01, c01
  883. FMUL ALPHA, c02, c02
  884. FMUL ALPHA, c03, c03
  885. FMUL ALPHA, c04, c04
  886. FMUL ALPHA, c05, c05
  887. FMUL ALPHA, c06, c06
  888. FMUL ALPHA, c07, c07
  889. FMUL ALPHA, c08, c08
  890. FMUL ALPHA, c09, c09
  891. STF c01, [C1 + 0 * SIZE]
  892. FMUL ALPHA, c10, c10
  893. STF c02, [C1 + 1 * SIZE]
  894. FMUL ALPHA, c11, c11
  895. STF c03, [C2 + 0 * SIZE]
  896. FMUL ALPHA, c12, c12
  897. STF c04, [C2 + 1 * SIZE]
  898. FMUL ALPHA, c13, c13
  899. STF c05, [C3 + 0 * SIZE]
  900. FMUL ALPHA, c14, c14
  901. STF c06, [C3 + 1 * SIZE]
  902. FMUL ALPHA, c15, c15
  903. STF c07, [C4 + 0 * SIZE]
  904. FMUL ALPHA, c16, c16
  905. STF c08, [C4 + 1 * SIZE]
  906. #endif
  907. STF c09, [C5 + 0 * SIZE]
  908. add C1, 2 * SIZE, C1
  909. STF c10, [C5 + 1 * SIZE]
  910. add C2, 2 * SIZE, C2
  911. STF c11, [C6 + 0 * SIZE]
  912. add C3, 2 * SIZE, C3
  913. STF c12, [C6 + 1 * SIZE]
  914. add C4, 2 * SIZE, C4
  915. STF c13, [C7 + 0 * SIZE]
  916. add C5, 2 * SIZE, C5
  917. STF c14, [C7 + 1 * SIZE]
  918. add C6, 2 * SIZE, C6
  919. STF c15, [C8 + 0 * SIZE]
  920. add C7, 2 * SIZE, C7
  921. STF c16, [C8 + 1 * SIZE]
  922. add C8, 2 * SIZE, C8
  923. #ifdef TRMMKERNEL
  924. #if ( defined(LEFT) && defined(TRANSA)) || \
  925. (!defined(LEFT) && !defined(TRANSA))
  926. sub K, KK, TEMP1
  927. #ifdef LEFT
  928. add TEMP1, -2, TEMP1
  929. #else
  930. add TEMP1, -8, TEMP1
  931. #endif
  932. sll TEMP1, BASE_SHIFT + 1, TEMP2
  933. sll TEMP1, BASE_SHIFT + 3, TEMP1
  934. add AO, TEMP2, AO
  935. add BO, TEMP1, BO
  936. #endif
  937. #ifdef LEFT
  938. add KK, 2, KK
  939. #endif
  940. #endif
  941. add I, -1, I
  942. cmp I, 0
  943. bg,pt %icc, .LL12
  944. nop
  945. .align 4
  946. .LL20:
  947. and M, 1, I
  948. cmp I, 0
  949. ble,pn %icc, .LL29
  950. nop
  951. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  952. mov B, BO
  953. #else
  954. sll KK, BASE_SHIFT + 0, TEMP1
  955. sll KK, BASE_SHIFT + 3, TEMP2
  956. add AO, TEMP1, AO
  957. add B, TEMP2, BO
  958. #endif
  959. LDF [AO + 0 * SIZE], a1
  960. LDF [AO + 1 * SIZE], a2
  961. LDF [AO + 2 * SIZE], a3
  962. LDF [AO + 3 * SIZE], a4
  963. LDF [BO + 0 * SIZE], b1
  964. FCLR (cc01)
  965. LDF [BO + 1 * SIZE], b2
  966. FCLR (cc03)
  967. LDF [BO + 2 * SIZE], b3
  968. FCLR (cc05)
  969. LDF [BO + 3 * SIZE], b4
  970. FCLR (cc07)
  971. LDF [BO + 4 * SIZE], b5
  972. FCLR (cc09)
  973. LDF [BO + 5 * SIZE], b6
  974. FCLR (cc11)
  975. LDF [BO + 6 * SIZE], b7
  976. FCLR (cc13)
  977. LDF [BO + 7 * SIZE], b8
  978. FCLR (cc15)
  979. #ifndef TRMMKERNEL
  980. sra K, 2, L
  981. #else
  982. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  983. sub K, KK, L
  984. #elif defined(LEFT)
  985. add KK, 1, L
  986. #else
  987. add KK, 8, L
  988. #endif
  989. sra L, 2, L
  990. #endif
  991. cmp L, 0
  992. ble,pn %icc, .LL25
  993. LDF [BO + 8 * SIZE], b9
  994. .align 4
  995. .LL23:
  996. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  997. add L, -1, L
  998. FMADD (aa1, bb1, cc01, cc01)
  999. LDF [BO + 16 * SIZE], b1
  1000. FMADD (aa1, bb2, cc03, cc03)
  1001. LDF [BO + 9 * SIZE], b2
  1002. FMADD (aa1, bb3, cc05, cc05)
  1003. LDF [BO + 10 * SIZE], b3
  1004. FMADD (aa1, bb4, cc07, cc07)
  1005. LDF [BO + 11 * SIZE], b4
  1006. FMADD (aa1, bb5, cc09, cc09)
  1007. LDF [BO + 12 * SIZE], b5
  1008. FMADD (aa1, bb6, cc11, cc11)
  1009. LDF [BO + 13 * SIZE], b6
  1010. FMADD (aa1, bb7, cc13, cc13)
  1011. LDF [BO + 14 * SIZE], b7
  1012. FMADD (aa1, bb8, cc15, cc15)
  1013. LDF [BO + 15 * SIZE], b8
  1014. FMADD (aa2, bb9, cc01, cc01)
  1015. LDF [BO + 24 * SIZE], b9
  1016. FMADD (aa2, bb2, cc03, cc03)
  1017. LDF [BO + 17 * SIZE], b2
  1018. FMADD (aa2, bb3, cc05, cc05)
  1019. LDF [BO + 18 * SIZE], b3
  1020. FMADD (aa2, bb4, cc07, cc07)
  1021. LDF [BO + 19 * SIZE], b4
  1022. FMADD (aa2, bb5, cc09, cc09)
  1023. LDF [BO + 20 * SIZE], b5
  1024. FMADD (aa2, bb6, cc11, cc11)
  1025. LDF [BO + 21 * SIZE], b6
  1026. FMADD (aa2, bb7, cc13, cc13)
  1027. LDF [BO + 22 * SIZE], b7
  1028. FMADD (aa2, bb8, cc15, cc15)
  1029. LDF [BO + 23 * SIZE], b8
  1030. LDF [AO + 4 * SIZE], a1
  1031. LDF [AO + 5 * SIZE], a2
  1032. FMADD (aa3, bb1, cc01, cc01)
  1033. LDF [BO + 32 * SIZE], b1
  1034. FMADD (aa3, bb2, cc03, cc03)
  1035. LDF [BO + 25 * SIZE], b2
  1036. FMADD (aa3, bb3, cc05, cc05)
  1037. LDF [BO + 26 * SIZE], b3
  1038. FMADD (aa3, bb4, cc07, cc07)
  1039. LDF [BO + 27 * SIZE], b4
  1040. FMADD (aa3, bb5, cc09, cc09)
  1041. LDF [BO + 28 * SIZE], b5
  1042. FMADD (aa3, bb6, cc11, cc11)
  1043. LDF [BO + 29 * SIZE], b6
  1044. FMADD (aa3, bb7, cc13, cc13)
  1045. LDF [BO + 30 * SIZE], b7
  1046. FMADD (aa3, bb8, cc15, cc15)
  1047. LDF [BO + 31 * SIZE], b8
  1048. FMADD (aa4, bb9, cc01, cc01)
  1049. LDF [BO + 40 * SIZE], b9
  1050. FMADD (aa4, bb2, cc03, cc03)
  1051. LDF [BO + 33 * SIZE], b2
  1052. FMADD (aa4, bb3, cc05, cc05)
  1053. LDF [BO + 34 * SIZE], b3
  1054. FMADD (aa4, bb4, cc07, cc07)
  1055. LDF [BO + 35 * SIZE], b4
  1056. FMADD (aa4, bb5, cc09, cc09)
  1057. LDF [BO + 36 * SIZE], b5
  1058. FMADD (aa4, bb6, cc11, cc11)
  1059. LDF [BO + 37 * SIZE], b6
  1060. FMADD (aa4, bb7, cc13, cc13)
  1061. LDF [BO + 38 * SIZE], b7
  1062. FMADD (aa4, bb8, cc15, cc15)
  1063. LDF [BO + 39 * SIZE], b8
  1064. LDF [AO + 6 * SIZE], a3
  1065. LDF [AO + 7 * SIZE], a4
  1066. add AO, 4 * SIZE, AO
  1067. cmp L, 0
  1068. bg,pt %icc, .LL23
  1069. add BO, 32 * SIZE, BO
  1070. .align 4
  1071. .LL25:
  1072. #ifndef TRMMKERNEL
  1073. and K, 3, L
  1074. #else
  1075. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1076. sub K, KK, L
  1077. #elif defined(LEFT)
  1078. add KK, 1, L
  1079. #else
  1080. add KK, 8, L
  1081. #endif
  1082. and L, 3, L
  1083. #endif
  1084. cmp L, 0
  1085. ble,a,pn %icc, .LL28
  1086. nop
  1087. .align 4
  1088. .LL27:
  1089. FMADD (aa1, bb1, cc01, cc01)
  1090. LDF [BO + 8 * SIZE], b1
  1091. FMADD (aa1, bb2, cc03, cc03)
  1092. LDF [BO + 9 * SIZE], b2
  1093. FMADD (aa1, bb3, cc05, cc05)
  1094. LDF [BO + 10 * SIZE], b3
  1095. FMADD (aa1, bb4, cc07, cc07)
  1096. LDF [BO + 11 * SIZE], b4
  1097. FMADD (aa1, bb5, cc09, cc09)
  1098. LDF [BO + 12 * SIZE], b5
  1099. FMADD (aa1, bb6, cc11, cc11)
  1100. LDF [BO + 13 * SIZE], b6
  1101. FMADD (aa1, bb7, cc13, cc13)
  1102. LDF [BO + 14 * SIZE], b7
  1103. FMADD (aa1, bb8, cc15, cc15)
  1104. LDF [BO + 15 * SIZE], b8
  1105. LDF [AO + 1 * SIZE], a1
  1106. add AO, 1 * SIZE, AO
  1107. add L, -1, L
  1108. cmp L, 0
  1109. bg,pt %icc, .LL27
  1110. add BO, 8 * SIZE, BO
  1111. .align 4
  1112. .LL28:
  1113. #ifndef TRMMKERNEL
  1114. LDF [C1 + 0 * SIZE], a1
  1115. LDF [C2 + 0 * SIZE], a2
  1116. LDF [C3 + 0 * SIZE], a3
  1117. LDF [C4 + 0 * SIZE], a4
  1118. FMADD (alpha, cc01, aa1, cc01)
  1119. LDF [C5 + 0 * SIZE], b1
  1120. FMADD (alpha, cc03, aa2, cc03)
  1121. LDF [C6 + 0 * SIZE], b2
  1122. FMADD (alpha, cc05, aa3, cc05)
  1123. LDF [C7 + 0 * SIZE], b3
  1124. FMADD (alpha, cc07, aa4, cc07)
  1125. LDF [C8 + 0 * SIZE], b4
  1126. FMADD (alpha, cc09, bb1, cc09)
  1127. STF c01, [C1 + 0 * SIZE]
  1128. FMADD (alpha, cc11, bb2, cc11)
  1129. STF c03, [C2 + 0 * SIZE]
  1130. FMADD (alpha, cc13, bb3, cc13)
  1131. STF c05, [C3 + 0 * SIZE]
  1132. FMADD (alpha, cc15, bb4, cc15)
  1133. STF c07, [C4 + 0 * SIZE]
  1134. #else
  1135. FMUL ALPHA, c01, c01
  1136. FMUL ALPHA, c03, c03
  1137. FMUL ALPHA, c05, c05
  1138. FMUL ALPHA, c07, c07
  1139. FMUL ALPHA, c09, c09
  1140. STF c01, [C1 + 0 * SIZE]
  1141. FMUL ALPHA, c11, c11
  1142. STF c03, [C2 + 0 * SIZE]
  1143. FMUL ALPHA, c13, c13
  1144. STF c05, [C3 + 0 * SIZE]
  1145. FMUL ALPHA, c15, c15
  1146. STF c07, [C4 + 0 * SIZE]
  1147. #endif
  1148. STF c09, [C5 + 0 * SIZE]
  1149. STF c11, [C6 + 0 * SIZE]
  1150. STF c13, [C7 + 0 * SIZE]
  1151. STF c15, [C8 + 0 * SIZE]
  1152. #ifdef TRMMKERNEL
  1153. #if ( defined(LEFT) && defined(TRANSA)) || \
  1154. (!defined(LEFT) && !defined(TRANSA))
  1155. sub K, KK, TEMP1
  1156. #ifdef LEFT
  1157. add TEMP1, -1, TEMP1
  1158. #else
  1159. add TEMP1, -8, TEMP1
  1160. #endif
  1161. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1162. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1163. add AO, TEMP2, AO
  1164. add BO, TEMP1, BO
  1165. #endif
  1166. #ifdef LEFT
  1167. add KK, 1, KK
  1168. #endif
  1169. #endif
  1170. .align 4
  1171. .LL29:
  1172. #if defined(TRMMKERNEL) && !defined(LEFT)
  1173. add KK, 8, KK
  1174. #endif
  1175. add J, -1, J
  1176. cmp J, 0
  1177. bg,pt %icc, .LL11
  1178. mov BO, B
  1179. .align 4
  1180. .LL30:
  1181. and N, 4, J
  1182. cmp J, 0
  1183. ble,pn %icc, .LL50
  1184. mov C, C1
  1185. add C, LDC, C2
  1186. add C2, LDC, C3
  1187. add C3, LDC, C4
  1188. add C4, LDC, C
  1189. #if defined(TRMMKERNEL) && defined(LEFT)
  1190. mov OFFSET, KK
  1191. #endif
  1192. sra M, 1, I
  1193. cmp I, 0
  1194. ble,pn %icc, .LL40
  1195. mov A, AO
  1196. .align 4
  1197. .LL32:
  1198. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1199. mov B, BO
  1200. #else
  1201. sll KK, BASE_SHIFT + 1, TEMP1
  1202. sll KK, BASE_SHIFT + 2, TEMP2
  1203. add AO, TEMP1, AO
  1204. add B, TEMP2, BO
  1205. #endif
  1206. LDF [AO + 0 * SIZE], a1
  1207. LDF [AO + 1 * SIZE], a2
  1208. LDF [BO + 0 * SIZE], b1
  1209. LDF [BO + 1 * SIZE], b2
  1210. LDF [BO + 2 * SIZE], b3
  1211. LDF [BO + 3 * SIZE], b4
  1212. LDF [BO + 4 * SIZE], b5
  1213. LDF [BO + 5 * SIZE], b6
  1214. FCLR (cc01)
  1215. LDF [BO + 6 * SIZE], b7
  1216. FCLR (cc02)
  1217. LDF [BO + 7 * SIZE], b8
  1218. FCLR (cc03)
  1219. LDF [BO + 8 * SIZE], b9
  1220. FCLR (cc04)
  1221. prefetch [C1 + 2 * SIZE], 3
  1222. FCLR (cc05)
  1223. prefetch [C2 + 2 * SIZE], 3
  1224. FCLR (cc06)
  1225. prefetch [C3 + 2 * SIZE], 3
  1226. FCLR (cc07)
  1227. prefetch [C4 + 2 * SIZE], 3
  1228. FCLR (cc08)
  1229. #ifndef TRMMKERNEL
  1230. sra K, 2, L
  1231. #else
  1232. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1233. sub K, KK, L
  1234. #elif defined(LEFT)
  1235. add KK, 2, L
  1236. #else
  1237. add KK, 4, L
  1238. #endif
  1239. sra L, 2, L
  1240. #endif
  1241. cmp L, 0
  1242. ble,pn %icc, .LL35
  1243. nop
  1244. .align 4
  1245. .LL33:
  1246. FMADD (aa1, bb1, cc01, cc01)
  1247. LDF [AO + 2 * SIZE], a3
  1248. FMADD (aa2, bb1, cc02, cc02)
  1249. LDF [AO + 3 * SIZE], a4
  1250. FMADD (aa1, bb2, cc03, cc03)
  1251. LDF [BO + 16 * SIZE], b1
  1252. FMADD (aa2, bb2, cc04, cc04)
  1253. LDF [BO + 9 * SIZE], b2
  1254. FMADD (aa1, bb3, cc05, cc05)
  1255. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1256. FMADD (aa2, bb3, cc06, cc06)
  1257. add L, -1, L
  1258. FMADD (aa1, bb4, cc07, cc07)
  1259. LDF [BO + 10 * SIZE], b3
  1260. FMADD (aa2, bb4, cc08, cc08)
  1261. LDF [BO + 11 * SIZE], b4
  1262. FMADD (aa3, bb5, cc01, cc01)
  1263. LDF [AO + 4 * SIZE], a1
  1264. FMADD (aa4, bb5, cc02, cc02)
  1265. LDF [AO + 5 * SIZE], a2
  1266. FMADD (aa3, bb6, cc03, cc03)
  1267. LDF [BO + 12 * SIZE], b5
  1268. FMADD (aa4, bb6, cc04, cc04)
  1269. LDF [BO + 13 * SIZE], b6
  1270. FMADD (aa3, bb7, cc05, cc05)
  1271. cmp L, 0
  1272. FMADD (aa4, bb7, cc06, cc06)
  1273. add AO, 8 * SIZE, AO
  1274. FMADD (aa3, bb8, cc07, cc07)
  1275. LDF [BO + 14 * SIZE], b7
  1276. FMADD (aa4, bb8, cc08, cc08)
  1277. LDF [BO + 15 * SIZE], b8
  1278. FMADD (aa1, bb9, cc01, cc01)
  1279. LDF [AO - 2 * SIZE], a3
  1280. FMADD (aa2, bb9, cc02, cc02)
  1281. LDF [AO - 1 * SIZE], a4
  1282. FMADD (aa1, bb2, cc03, cc03)
  1283. LDF [BO + 24 * SIZE], b9
  1284. FMADD (aa2, bb2, cc04, cc04)
  1285. LDF [BO + 17 * SIZE], b2
  1286. FMADD (aa1, bb3, cc05, cc05)
  1287. add BO, 16 * SIZE, BO
  1288. FMADD (aa2, bb3, cc06, cc06)
  1289. nop
  1290. FMADD (aa1, bb4, cc07, cc07)
  1291. LDF [BO + 2 * SIZE], b3
  1292. FMADD (aa2, bb4, cc08, cc08)
  1293. LDF [BO + 3 * SIZE], b4
  1294. FMADD (aa3, bb5, cc01, cc01)
  1295. LDF [AO + 0 * SIZE], a1
  1296. FMADD (aa4, bb5, cc02, cc02)
  1297. LDF [AO + 1 * SIZE], a2
  1298. FMADD (aa3, bb6, cc03, cc03)
  1299. LDF [BO + 4 * SIZE], b5
  1300. FMADD (aa4, bb6, cc04, cc04)
  1301. LDF [BO + 5 * SIZE], b6
  1302. FMADD (aa3, bb7, cc05, cc05)
  1303. nop
  1304. FMADD (aa4, bb7, cc06, cc06)
  1305. LDF [BO + 6 * SIZE], b7
  1306. FMADD (aa3, bb8, cc07, cc07)
  1307. FMADD (aa4, bb8, cc08, cc08)
  1308. bg,pt %icc, .LL33
  1309. LDF [BO + 7 * SIZE], b8
  1310. .align 4
  1311. .LL35:
  1312. #ifndef TRMMKERNEL
  1313. and K, 3, L
  1314. #else
  1315. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1316. sub K, KK, L
  1317. #elif defined(LEFT)
  1318. add KK, 2, L
  1319. #else
  1320. add KK, 4, L
  1321. #endif
  1322. and L, 3, L
  1323. #endif
  1324. cmp L, 0
  1325. ble,a,pn %icc, .LL38
  1326. nop
  1327. .align 4
  1328. .LL37:
  1329. FMADD (aa1, bb1, cc01, cc01)
  1330. add L, -1, L
  1331. FMADD (aa2, bb1, cc02, cc02)
  1332. LDF [BO + 4 * SIZE], b1
  1333. FMADD (aa1, bb2, cc03, cc03)
  1334. add AO, 2 * SIZE, AO
  1335. FMADD (aa2, bb2, cc04, cc04)
  1336. LDF [BO + 5 * SIZE], b2
  1337. FMADD (aa1, bb3, cc05, cc05)
  1338. cmp L, 0
  1339. FMADD (aa2, bb3, cc06, cc06)
  1340. LDF [BO + 6 * SIZE], b3
  1341. FMADD (aa1, bb4, cc07, cc07)
  1342. LDF [AO + 0 * SIZE], a1
  1343. FMADD (aa2, bb4, cc08, cc08)
  1344. LDF [AO + 1 * SIZE], a2
  1345. LDF [BO + 7 * SIZE], b4
  1346. bg,pt %icc, .LL37
  1347. add BO, 4 * SIZE, BO
  1348. .align 4
  1349. .LL38:
  1350. #ifndef TRMMKERNEL
  1351. LDF [C1 + 0 * SIZE], a1
  1352. LDF [C1 + 1 * SIZE], a2
  1353. LDF [C2 + 0 * SIZE], a3
  1354. LDF [C2 + 1 * SIZE], a4
  1355. FMADD (alpha, cc01, aa1, cc01)
  1356. LDF [C3 + 0 * SIZE], b1
  1357. FMADD (alpha, cc02, aa2, cc02)
  1358. LDF [C3 + 1 * SIZE], b2
  1359. FMADD (alpha, cc03, aa3, cc03)
  1360. LDF [C4 + 0 * SIZE], b3
  1361. FMADD (alpha, cc04, aa4, cc04)
  1362. LDF [C4 + 1 * SIZE], b4
  1363. FMADD (alpha, cc05, bb1, cc05)
  1364. STF c01, [C1 + 0 * SIZE]
  1365. FMADD (alpha, cc06, bb2, cc06)
  1366. STF c02, [C1 + 1 * SIZE]
  1367. FMADD (alpha, cc07, bb3, cc07)
  1368. STF c03, [C2 + 0 * SIZE]
  1369. FMADD (alpha, cc08, bb4, cc08)
  1370. STF c04, [C2 + 1 * SIZE]
  1371. #else
  1372. FMUL ALPHA, c01, c01
  1373. FMUL ALPHA, c02, c02
  1374. FMUL ALPHA, c03, c03
  1375. FMUL ALPHA, c04, c04
  1376. FMUL ALPHA, c05, c05
  1377. STF c01, [C1 + 0 * SIZE]
  1378. FMUL ALPHA, c06, c06
  1379. STF c02, [C1 + 1 * SIZE]
  1380. FMUL ALPHA, c07, c07
  1381. STF c03, [C2 + 0 * SIZE]
  1382. FMUL ALPHA, c08, c08
  1383. STF c04, [C2 + 1 * SIZE]
  1384. #endif
  1385. STF c05, [C3 + 0 * SIZE]
  1386. add C1, 2 * SIZE, C1
  1387. STF c06, [C3 + 1 * SIZE]
  1388. add C2, 2 * SIZE, C2
  1389. STF c07, [C4 + 0 * SIZE]
  1390. add C3, 2 * SIZE, C3
  1391. STF c08, [C4 + 1 * SIZE]
  1392. add C4, 2 * SIZE, C4
  1393. #ifdef TRMMKERNEL
  1394. #if ( defined(LEFT) && defined(TRANSA)) || \
  1395. (!defined(LEFT) && !defined(TRANSA))
  1396. sub K, KK, TEMP1
  1397. #ifdef LEFT
  1398. add TEMP1, -2, TEMP1
  1399. #else
  1400. add TEMP1, -4, TEMP1
  1401. #endif
  1402. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1403. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1404. add AO, TEMP2, AO
  1405. add BO, TEMP1, BO
  1406. #endif
  1407. #ifdef LEFT
  1408. add KK, 2, KK
  1409. #endif
  1410. #endif
  1411. add I, -1, I
  1412. cmp I, 0
  1413. bg,pt %icc, .LL32
  1414. nop
  1415. .LL40:
  1416. and M, 1, I
  1417. cmp I, 0
  1418. ble,pn %icc, .LL49
  1419. nop
  1420. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1421. mov B, BO
  1422. #else
  1423. sll KK, BASE_SHIFT + 0, TEMP1
  1424. sll KK, BASE_SHIFT + 2, TEMP2
  1425. add AO, TEMP1, AO
  1426. add B, TEMP2, BO
  1427. #endif
  1428. LDF [AO + 0 * SIZE], a1
  1429. LDF [AO + 1 * SIZE], a2
  1430. LDF [AO + 2 * SIZE], a3
  1431. LDF [AO + 3 * SIZE], a4
  1432. LDF [BO + 0 * SIZE], b1
  1433. LDF [BO + 1 * SIZE], b2
  1434. LDF [BO + 2 * SIZE], b3
  1435. LDF [BO + 3 * SIZE], b4
  1436. LDF [BO + 4 * SIZE], b5
  1437. LDF [BO + 5 * SIZE], b6
  1438. FCLR (cc01)
  1439. LDF [BO + 6 * SIZE], b7
  1440. FCLR (cc03)
  1441. LDF [BO + 7 * SIZE], b8
  1442. FCLR (cc05)
  1443. LDF [BO + 8 * SIZE], b9
  1444. FCLR (cc07)
  1445. #ifndef TRMMKERNEL
  1446. sra K, 2, L
  1447. #else
  1448. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1449. sub K, KK, L
  1450. #elif defined(LEFT)
  1451. add KK, 1, L
  1452. #else
  1453. add KK, 4, L
  1454. #endif
  1455. sra L, 2, L
  1456. #endif
  1457. cmp L, 0
  1458. ble,pn %icc, .LL45
  1459. nop
  1460. .LL43:
  1461. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1462. add L, -1, L
  1463. FMADD (aa1, bb1, cc01, cc01)
  1464. LDF [BO + 16 * SIZE], b1
  1465. FMADD (aa1, bb2, cc03, cc03)
  1466. LDF [BO + 9 * SIZE], b2
  1467. FMADD (aa1, bb3, cc05, cc05)
  1468. LDF [BO + 10 * SIZE], b3
  1469. FMADD (aa1, bb4, cc07, cc07)
  1470. LDF [BO + 11 * SIZE], b4
  1471. LDF [AO + 4 * SIZE], a1
  1472. cmp L, 0
  1473. FMADD (aa2, bb5, cc01, cc01)
  1474. LDF [BO + 12 * SIZE], b5
  1475. FMADD (aa2, bb6, cc03, cc03)
  1476. LDF [BO + 13 * SIZE], b6
  1477. FMADD (aa2, bb7, cc05, cc05)
  1478. LDF [BO + 14 * SIZE], b7
  1479. FMADD (aa2, bb8, cc07, cc07)
  1480. LDF [BO + 15 * SIZE], b8
  1481. LDF [AO + 5 * SIZE], a2
  1482. add AO, 4 * SIZE, AO
  1483. FMADD (aa3, bb9, cc01, cc01)
  1484. LDF [BO + 24 * SIZE], b9
  1485. FMADD (aa3, bb2, cc03, cc03)
  1486. LDF [BO + 17 * SIZE], b2
  1487. FMADD (aa3, bb3, cc05, cc05)
  1488. LDF [BO + 18 * SIZE], b3
  1489. FMADD (aa3, bb4, cc07, cc07)
  1490. LDF [BO + 19 * SIZE], b4
  1491. LDF [AO + 2 * SIZE], a3
  1492. add BO, 16 * SIZE, BO
  1493. FMADD (aa4, bb5, cc01, cc01)
  1494. LDF [BO + 4 * SIZE], b5
  1495. FMADD (aa4, bb6, cc03, cc03)
  1496. LDF [BO + 5 * SIZE], b6
  1497. FMADD (aa4, bb7, cc05, cc05)
  1498. LDF [BO + 6 * SIZE], b7
  1499. FMADD (aa4, bb8, cc07, cc07)
  1500. LDF [BO + 7 * SIZE], b8
  1501. bg,pt %icc, .LL43
  1502. LDF [AO + 3 * SIZE], a4
  1503. .align 4
  1504. .LL45:
  1505. #ifndef TRMMKERNEL
  1506. and K, 3, L
  1507. #else
  1508. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1509. sub K, KK, L
  1510. #elif defined(LEFT)
  1511. add KK, 1, L
  1512. #else
  1513. add KK, 4, L
  1514. #endif
  1515. and L, 3, L
  1516. #endif
  1517. cmp L, 0
  1518. ble,a,pn %icc, .LL48
  1519. nop
  1520. .align 4
  1521. .LL47:
  1522. FMADD (aa1, bb1, cc01, cc01)
  1523. LDF [BO + 4 * SIZE], b1
  1524. add L, -1, L
  1525. FMADD (aa1, bb2, cc03, cc03)
  1526. LDF [BO + 5 * SIZE], b2
  1527. add AO, 1 * SIZE, AO
  1528. FMADD (aa1, bb3, cc05, cc05)
  1529. LDF [BO + 6 * SIZE], b3
  1530. cmp L, 0
  1531. FMADD (aa1, bb4, cc07, cc07)
  1532. LDF [BO + 7 * SIZE], b4
  1533. add BO, 4 * SIZE, BO
  1534. bg,pt %icc, .LL47
  1535. LDF [AO + 0 * SIZE], a1
  1536. .align 4
  1537. .LL48:
  1538. #ifndef TRMMKERNEL
  1539. LDF [C1 + 0 * SIZE], a1
  1540. LDF [C2 + 0 * SIZE], a2
  1541. LDF [C3 + 0 * SIZE], a3
  1542. LDF [C4 + 0 * SIZE], a4
  1543. FMADD (alpha, cc01, aa1, cc01)
  1544. FMADD (alpha, cc03, aa2, cc03)
  1545. FMADD (alpha, cc05, aa3, cc05)
  1546. FMADD (alpha, cc07, aa4, cc07)
  1547. #else
  1548. FMUL ALPHA, c01, c01
  1549. FMUL ALPHA, c03, c03
  1550. FMUL ALPHA, c05, c05
  1551. FMUL ALPHA, c07, c07
  1552. #endif
  1553. STF c01, [C1 + 0 * SIZE]
  1554. STF c03, [C2 + 0 * SIZE]
  1555. STF c05, [C3 + 0 * SIZE]
  1556. STF c07, [C4 + 0 * SIZE]
  1557. #ifdef TRMMKERNEL
  1558. #if ( defined(LEFT) && defined(TRANSA)) || \
  1559. (!defined(LEFT) && !defined(TRANSA))
  1560. sub K, KK, TEMP1
  1561. #ifdef LEFT
  1562. add TEMP1, -1, TEMP1
  1563. #else
  1564. add TEMP1, -4, TEMP1
  1565. #endif
  1566. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1567. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1568. add AO, TEMP2, AO
  1569. add BO, TEMP1, BO
  1570. #endif
  1571. #ifdef LEFT
  1572. add KK, 1, KK
  1573. #endif
  1574. #endif
  1575. .align 4
  1576. .LL49:
  1577. #if defined(TRMMKERNEL) && !defined(LEFT)
  1578. add KK, 4, KK
  1579. #endif
  1580. mov BO, B
  1581. .align 4
  1582. .LL50:
  1583. and N, 2, J
  1584. cmp J, 0
  1585. ble,pn %icc, .LL70
  1586. mov C, C1
  1587. add C, LDC, C2
  1588. add C2, LDC, C
  1589. #if defined(TRMMKERNEL) && defined(LEFT)
  1590. mov OFFSET, KK
  1591. #endif
  1592. sra M, 1, I
  1593. cmp I, 0
  1594. ble,pn %icc, .LL60
  1595. mov A, AO
  1596. .align 4
  1597. .LL52:
  1598. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1599. mov B, BO
  1600. #else
  1601. sll KK, BASE_SHIFT + 1, TEMP1
  1602. sll KK, BASE_SHIFT + 1, TEMP2
  1603. add AO, TEMP1, AO
  1604. add B, TEMP2, BO
  1605. #endif
  1606. LDF [AO + 0 * SIZE], a1
  1607. LDF [AO + 1 * SIZE], a2
  1608. LDF [AO + 2 * SIZE], a3
  1609. LDF [AO + 3 * SIZE], a4
  1610. LDF [BO + 0 * SIZE], b1
  1611. LDF [BO + 1 * SIZE], b2
  1612. LDF [BO + 2 * SIZE], b3
  1613. FCLR (cc01)
  1614. LDF [BO + 3 * SIZE], b4
  1615. FCLR (cc02)
  1616. LDF [BO + 4 * SIZE], b5
  1617. FCLR (cc03)
  1618. LDF [BO + 5 * SIZE], b6
  1619. FCLR (cc04)
  1620. LDF [BO + 6 * SIZE], b7
  1621. FCLR (cc05)
  1622. LDF [BO + 7 * SIZE], b8
  1623. FCLR (cc06)
  1624. prefetch [C1 + 2 * SIZE], 3
  1625. FCLR (cc07)
  1626. prefetch [C2 + 2 * SIZE], 3
  1627. FCLR (cc08)
  1628. #ifndef TRMMKERNEL
  1629. sra K, 2, L
  1630. #else
  1631. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1632. sub K, KK, L
  1633. #elif defined(LEFT)
  1634. add KK, 2, L
  1635. #else
  1636. add KK, 2, L
  1637. #endif
  1638. sra L, 2, L
  1639. #endif
  1640. cmp L, 0
  1641. ble,pn %icc, .LL55
  1642. nop
  1643. .align 4
  1644. .LL53:
  1645. FMADD (aa1, bb1, cc01, cc01)
  1646. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1647. FMADD (aa2, bb1, cc02, cc02)
  1648. LDF [BO + 8 * SIZE], b1
  1649. FMADD (aa1, bb2, cc03, cc03)
  1650. LDF [AO + 4 * SIZE], a1
  1651. FMADD (aa2, bb2, cc04, cc04)
  1652. LDF [AO + 5 * SIZE], a2
  1653. FMADD (aa3, bb3, cc01, cc01)
  1654. LDF [BO + 9 * SIZE], b2
  1655. FMADD (aa4, bb3, cc02, cc02)
  1656. LDF [BO + 10 * SIZE], b3
  1657. FMADD (aa3, bb4, cc03, cc03)
  1658. LDF [AO + 6 * SIZE], a3
  1659. FMADD (aa4, bb4, cc04, cc04)
  1660. LDF [AO + 7 * SIZE], a4
  1661. FMADD (aa1, bb5, cc01, cc01)
  1662. LDF [BO + 11 * SIZE], b4
  1663. FMADD (aa2, bb5, cc02, cc02)
  1664. LDF [BO + 12 * SIZE], b5
  1665. FMADD (aa1, bb6, cc03, cc03)
  1666. LDF [AO + 8 * SIZE], a1
  1667. FMADD (aa2, bb6, cc04, cc04)
  1668. LDF [AO + 9 * SIZE], a2
  1669. FMADD (aa3, bb7, cc01, cc01)
  1670. LDF [BO + 13 * SIZE], b6
  1671. FMADD (aa4, bb7, cc02, cc02)
  1672. LDF [BO + 14 * SIZE], b7
  1673. FMADD (aa3, bb8, cc03, cc03)
  1674. LDF [AO + 10 * SIZE], a3
  1675. FMADD (aa4, bb8, cc04, cc04)
  1676. LDF [AO + 11 * SIZE], a4
  1677. add AO, 8 * SIZE, AO
  1678. add L, -1, L
  1679. add BO, 8 * SIZE, BO
  1680. cmp L, 0
  1681. bg,pt %icc, .LL53
  1682. LDF [BO + 7 * SIZE], b8
  1683. .align 4
  1684. .LL55:
  1685. #ifndef TRMMKERNEL
  1686. and K, 3, L
  1687. #else
  1688. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1689. sub K, KK, L
  1690. #elif defined(LEFT)
  1691. add KK, 2, L
  1692. #else
  1693. add KK, 2, L
  1694. #endif
  1695. and L, 3, L
  1696. #endif
  1697. cmp L, 0
  1698. ble,a,pn %icc, .LL58
  1699. nop
  1700. .align 4
  1701. .LL57:
  1702. FMADD (aa1, bb1, cc01, cc01)
  1703. add L, -1, L
  1704. FMADD (aa2, bb1, cc02, cc02)
  1705. LDF [BO + 2 * SIZE], b1
  1706. FMADD (aa1, bb2, cc03, cc03)
  1707. LDF [AO + 2 * SIZE], a1
  1708. FMADD (aa2, bb2, cc04, cc04)
  1709. LDF [AO + 3 * SIZE], a2
  1710. add AO, 2 * SIZE, AO
  1711. cmp L, 0
  1712. add BO, 2 * SIZE, BO
  1713. bg,pt %icc, .LL57
  1714. LDF [BO + 1 * SIZE], b2
  1715. .align 4
  1716. .LL58:
  1717. #ifndef TRMMKERNEL
  1718. LDF [C1 + 0 * SIZE], a1
  1719. LDF [C1 + 1 * SIZE], a2
  1720. LDF [C2 + 0 * SIZE], a3
  1721. LDF [C2 + 1 * SIZE], a4
  1722. FMADD (alpha, cc01, aa1, cc01)
  1723. FMADD (alpha, cc02, aa2, cc02)
  1724. FMADD (alpha, cc03, aa3, cc03)
  1725. FMADD (alpha, cc04, aa4, cc04)
  1726. #else
  1727. FMUL ALPHA, c01, c01
  1728. FMUL ALPHA, c02, c02
  1729. FMUL ALPHA, c03, c03
  1730. FMUL ALPHA, c04, c04
  1731. #endif
  1732. STF c01, [C1 + 0 * SIZE]
  1733. add I, -1, I
  1734. STF c02, [C1 + 1 * SIZE]
  1735. add C1, 2 * SIZE, C1
  1736. STF c03, [C2 + 0 * SIZE]
  1737. cmp I, 0
  1738. STF c04, [C2 + 1 * SIZE]
  1739. add C2, 2 * SIZE, C2
  1740. #ifdef TRMMKERNEL
  1741. #if ( defined(LEFT) && defined(TRANSA)) || \
  1742. (!defined(LEFT) && !defined(TRANSA))
  1743. sub K, KK, TEMP1
  1744. #ifdef LEFT
  1745. add TEMP1, -2, TEMP1
  1746. #else
  1747. add TEMP1, -2, TEMP1
  1748. #endif
  1749. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1750. sll TEMP1, BASE_SHIFT + 1, TEMP1
  1751. add AO, TEMP2, AO
  1752. add BO, TEMP1, BO
  1753. #endif
  1754. #ifdef LEFT
  1755. add KK, 2, KK
  1756. #endif
  1757. #endif
  1758. bg,pt %icc, .LL52
  1759. nop
  1760. .align 4
  1761. .LL60:
  1762. and M, 1, I
  1763. cmp I, 0
  1764. ble,pn %icc, .LL69
  1765. nop
  1766. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1767. mov B, BO
  1768. #else
  1769. sll KK, BASE_SHIFT + 0, TEMP1
  1770. sll KK, BASE_SHIFT + 1, TEMP2
  1771. add AO, TEMP1, AO
  1772. add B, TEMP2, BO
  1773. #endif
  1774. LDF [AO + 0 * SIZE], a1
  1775. LDF [AO + 1 * SIZE], a2
  1776. LDF [AO + 2 * SIZE], a3
  1777. LDF [AO + 3 * SIZE], a4
  1778. LDF [BO + 0 * SIZE], b1
  1779. LDF [BO + 1 * SIZE], b2
  1780. LDF [BO + 2 * SIZE], b3
  1781. LDF [BO + 3 * SIZE], b4
  1782. LDF [BO + 4 * SIZE], b5
  1783. LDF [BO + 5 * SIZE], b6
  1784. LDF [BO + 6 * SIZE], b7
  1785. FCLR (cc01)
  1786. LDF [BO + 7 * SIZE], b8
  1787. FCLR (cc03)
  1788. #ifndef TRMMKERNEL
  1789. sra K, 2, L
  1790. #else
  1791. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1792. sub K, KK, L
  1793. #elif defined(LEFT)
  1794. add KK, 1, L
  1795. #else
  1796. add KK, 2, L
  1797. #endif
  1798. sra L, 2, L
  1799. #endif
  1800. cmp L, 0
  1801. ble,pn %icc, .LL65
  1802. nop
  1803. .align 4
  1804. .LL63:
  1805. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1806. add L, -1, L
  1807. FMADD (aa1, bb1, cc01, cc01)
  1808. LDF [BO + 8 * SIZE], b1
  1809. FMADD (aa1, bb2, cc03, cc03)
  1810. LDF [BO + 9 * SIZE], b2
  1811. LDF [AO + 4 * SIZE], a1
  1812. cmp L, 0
  1813. FMADD (aa2, bb3, cc01, cc01)
  1814. LDF [BO + 10 * SIZE], b3
  1815. FMADD (aa2, bb4, cc03, cc03)
  1816. LDF [BO + 11 * SIZE], b4
  1817. LDF [AO + 5 * SIZE], a2
  1818. add AO, 4 * SIZE, AO
  1819. FMADD (aa3, bb5, cc01, cc01)
  1820. LDF [BO + 12 * SIZE], b5
  1821. FMADD (aa3, bb6, cc03, cc03)
  1822. LDF [BO + 13 * SIZE], b6
  1823. LDF [AO + 2 * SIZE], a3
  1824. add BO, 8 * SIZE, BO
  1825. FMADD (aa4, bb7, cc01, cc01)
  1826. LDF [BO + 6 * SIZE], b7
  1827. FMADD (aa4, bb8, cc03, cc03)
  1828. LDF [BO + 7 * SIZE], b8
  1829. bg,pt %icc, .LL63
  1830. LDF [AO + 3 * SIZE], a4
  1831. .align 4
  1832. .LL65:
  1833. #ifndef TRMMKERNEL
  1834. and K, 3, L
  1835. #else
  1836. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1837. sub K, KK, L
  1838. #elif defined(LEFT)
  1839. add KK, 1, L
  1840. #else
  1841. add KK, 2, L
  1842. #endif
  1843. and L, 3, L
  1844. #endif
  1845. cmp L, 0
  1846. ble,a,pn %icc, .LL68
  1847. nop
  1848. .align 4
  1849. .LL67:
  1850. FMADD (aa1, bb1, cc01, cc01)
  1851. LDF [BO + 2 * SIZE], b1
  1852. FMADD (aa1, bb2, cc03, cc03)
  1853. LDF [BO + 3 * SIZE], b2
  1854. LDF [AO + 1 * SIZE], a1
  1855. add L, -1, L
  1856. add AO, 1 * SIZE, AO
  1857. cmp L, 0
  1858. bg,pt %icc, .LL67
  1859. add BO, 2 * SIZE, BO
  1860. .align 4
  1861. .LL68:
  1862. #ifndef TRMMKERNEL
  1863. LDF [C1 + 0 * SIZE], a1
  1864. LDF [C2 + 0 * SIZE], a2
  1865. FMADD (alpha, cc01, aa1, cc01)
  1866. FMADD (alpha, cc03, aa2, cc03)
  1867. #else
  1868. FMUL ALPHA, c01, c01
  1869. FMUL ALPHA, c03, c03
  1870. #endif
  1871. STF c01, [C1 + 0 * SIZE]
  1872. STF c03, [C2 + 0 * SIZE]
  1873. #ifdef TRMMKERNEL
  1874. #if ( defined(LEFT) && defined(TRANSA)) || \
  1875. (!defined(LEFT) && !defined(TRANSA))
  1876. sub K, KK, TEMP1
  1877. #ifdef LEFT
  1878. add TEMP1, -1, TEMP1
  1879. #else
  1880. add TEMP1, -2, TEMP1
  1881. #endif
  1882. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1883. sll TEMP1, BASE_SHIFT + 1, TEMP1
  1884. add AO, TEMP2, AO
  1885. add BO, TEMP1, BO
  1886. #endif
  1887. #ifdef LEFT
  1888. add KK, 1, KK
  1889. #endif
  1890. #endif
  1891. .align 4
  1892. .LL69:
  1893. #if defined(TRMMKERNEL) && !defined(LEFT)
  1894. add KK, 2, KK
  1895. #endif
  1896. mov BO, B
  1897. .align 4
  1898. .LL70:
  1899. and N, 1, J
  1900. cmp J, 0
  1901. ble,pn %icc, .LL999
  1902. mov C, C1
  1903. #if defined(TRMMKERNEL) && defined(LEFT)
  1904. mov OFFSET, KK
  1905. #endif
  1906. sra M, 1, I
  1907. cmp I, 0
  1908. ble,pn %icc, .LL80
  1909. mov A, AO
  1910. .align 4
  1911. .LL72:
  1912. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  1913. mov B, BO
  1914. #else
  1915. sll KK, BASE_SHIFT + 1, TEMP1
  1916. sll KK, BASE_SHIFT + 0, TEMP2
  1917. add AO, TEMP1, AO
  1918. add B, TEMP2, BO
  1919. #endif
  1920. LDF [AO + 0 * SIZE], a1
  1921. LDF [AO + 1 * SIZE], a2
  1922. LDF [AO + 2 * SIZE], a3
  1923. LDF [AO + 3 * SIZE], a4
  1924. LDF [BO + 0 * SIZE], b1
  1925. LDF [BO + 1 * SIZE], b2
  1926. LDF [BO + 2 * SIZE], b3
  1927. FCLR (cc01)
  1928. LDF [BO + 3 * SIZE], b4
  1929. FCLR (cc02)
  1930. prefetch [C1 + 2 * SIZE], 3
  1931. #ifndef TRMMKERNEL
  1932. sra K, 2, L
  1933. #else
  1934. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1935. sub K, KK, L
  1936. #elif defined(LEFT)
  1937. add KK, 2, L
  1938. #else
  1939. add KK, 1, L
  1940. #endif
  1941. sra L, 2, L
  1942. #endif
  1943. cmp L, 0
  1944. ble,pn %icc, .LL75
  1945. nop
  1946. .LL73:
  1947. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1948. add L, -1, L
  1949. FMADD (aa1, bb1, cc01, cc01)
  1950. LDF [AO + 4 * SIZE], a1
  1951. FMADD (aa2, bb1, cc02, cc02)
  1952. LDF [AO + 5 * SIZE], a2
  1953. LDF [BO + 4 * SIZE], b1
  1954. cmp L, 0
  1955. FMADD (aa3, bb2, cc01, cc01)
  1956. LDF [AO + 6 * SIZE], a3
  1957. FMADD (aa4, bb2, cc02, cc02)
  1958. LDF [AO + 7 * SIZE], a4
  1959. LDF [BO + 5 * SIZE], b2
  1960. add BO, 4 * SIZE, BO
  1961. FMADD (aa1, bb3, cc01, cc01)
  1962. LDF [AO + 8 * SIZE], a1
  1963. FMADD (aa2, bb3, cc02, cc02)
  1964. LDF [AO + 9 * SIZE], a2
  1965. LDF [BO + 2 * SIZE], b3
  1966. add AO, 8 * SIZE, AO
  1967. FMADD (aa3, bb4, cc01, cc01)
  1968. LDF [AO + 2 * SIZE], a3
  1969. FMADD (aa4, bb4, cc02, cc02)
  1970. LDF [AO + 3 * SIZE], a4
  1971. bg,pt %icc, .LL73
  1972. LDF [BO + 3 * SIZE], b4
  1973. .align 4
  1974. .LL75:
  1975. #ifndef TRMMKERNEL
  1976. and K, 3, L
  1977. #else
  1978. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1979. sub K, KK, L
  1980. #elif defined(LEFT)
  1981. add KK, 2, L
  1982. #else
  1983. add KK, 1, L
  1984. #endif
  1985. and L, 3, L
  1986. #endif
  1987. cmp L, 0
  1988. ble,a,pn %icc, .LL78
  1989. nop
  1990. .align 4
  1991. .LL77:
  1992. FMADD (aa1, bb1, cc01, cc01)
  1993. LDF [AO + 2 * SIZE], a1
  1994. FMADD (aa2, bb1, cc02, cc02)
  1995. LDF [AO + 3 * SIZE], a2
  1996. LDF [BO + 1 * SIZE], b1
  1997. add L, -1, L
  1998. add AO, 2 * SIZE, AO
  1999. cmp L, 0
  2000. bg,pt %icc, .LL77
  2001. add BO, 1 * SIZE, BO
  2002. .align 4
  2003. .LL78:
  2004. #ifndef TRMMKERNEL
  2005. LDF [C1 + 0 * SIZE], a1
  2006. LDF [C1 + 1 * SIZE], a2
  2007. FMADD (alpha, cc01, aa1, cc01)
  2008. FMADD (alpha, cc02, aa2, cc02)
  2009. #else
  2010. FMUL ALPHA, c01, c01
  2011. FMUL ALPHA, c02, c02
  2012. #endif
  2013. STF c01, [C1 + 0 * SIZE]
  2014. add I, -1, I
  2015. STF c02, [C1 + 1 * SIZE]
  2016. cmp I, 0
  2017. #ifdef TRMMKERNEL
  2018. #if ( defined(LEFT) && defined(TRANSA)) || \
  2019. (!defined(LEFT) && !defined(TRANSA))
  2020. sub K, KK, TEMP1
  2021. #ifdef LEFT
  2022. add TEMP1, -2, TEMP1
  2023. #else
  2024. add TEMP1, -1, TEMP1
  2025. #endif
  2026. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2027. sll TEMP1, BASE_SHIFT + 0, TEMP1
  2028. add AO, TEMP2, AO
  2029. add BO, TEMP1, BO
  2030. #endif
  2031. #ifdef LEFT
  2032. add KK, 2, KK
  2033. #endif
  2034. #endif
  2035. bg,pt %icc, .LL72
  2036. add C1, 2 * SIZE, C1
  2037. .align 4
  2038. .LL80:
  2039. and M, 1, I
  2040. cmp I, 0
  2041. ble,pn %icc, .LL999
  2042. nop
  2043. #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))))
  2044. mov B, BO
  2045. #else
  2046. sll KK, BASE_SHIFT + 0, TEMP1
  2047. sll KK, BASE_SHIFT + 0, TEMP2
  2048. add AO, TEMP1, AO
  2049. add B, TEMP2, BO
  2050. #endif
  2051. LDF [AO + 0 * SIZE], a1
  2052. LDF [BO + 0 * SIZE], b1
  2053. LDF [AO + 1 * SIZE], a2
  2054. LDF [BO + 1 * SIZE], b2
  2055. LDF [AO + 2 * SIZE], a3
  2056. LDF [BO + 2 * SIZE], b3
  2057. LDF [AO + 3 * SIZE], a4
  2058. LDF [BO + 3 * SIZE], b4
  2059. #ifndef TRMMKERNEL
  2060. sra K, 2, L
  2061. #else
  2062. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2063. sub K, KK, L
  2064. #elif defined(LEFT)
  2065. add KK, 1, L
  2066. #else
  2067. add KK, 1, L
  2068. #endif
  2069. sra L, 2, L
  2070. #endif
  2071. cmp L, 0
  2072. ble,pn %icc, .LL85
  2073. FCLR (cc01)
  2074. .align 4
  2075. .LL83:
  2076. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2077. add L, -1, L
  2078. FMADD (aa1, bb1, cc01, cc01)
  2079. LDF [AO + 4 * SIZE], a1
  2080. LDF [BO + 4 * SIZE], b1
  2081. FMADD (aa2, bb2, cc01, cc01)
  2082. LDF [AO + 5 * SIZE], a2
  2083. LDF [BO + 5 * SIZE], b2
  2084. FMADD (aa3, bb3, cc01, cc01)
  2085. LDF [AO + 6 * SIZE], a3
  2086. LDF [BO + 6 * SIZE], b3
  2087. FMADD (aa4, bb4, cc01, cc01)
  2088. LDF [AO + 7 * SIZE], a4
  2089. LDF [BO + 7 * SIZE], b4
  2090. add AO, 4 * SIZE, AO
  2091. cmp L, 0
  2092. bg,pt %icc, .LL83
  2093. add BO, 4 * SIZE, BO
  2094. .align 4
  2095. .LL85:
  2096. #ifndef TRMMKERNEL
  2097. and K, 3, L
  2098. #else
  2099. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2100. sub K, KK, L
  2101. #elif defined(LEFT)
  2102. add KK, 1, L
  2103. #else
  2104. add KK, 1, L
  2105. #endif
  2106. and L, 3, L
  2107. #endif
  2108. cmp L, 0
  2109. ble,a,pn %icc, .LL88
  2110. nop
  2111. .align 4
  2112. .LL87:
  2113. FMADD (aa1, bb1, cc01, cc01)
  2114. LDF [AO + 1 * SIZE], a1
  2115. LDF [BO + 1 * SIZE], b1
  2116. add AO, 1 * SIZE, AO
  2117. add L, -1, L
  2118. cmp L, 0
  2119. bg,pt %icc, .LL87
  2120. add BO, 1 * SIZE, BO
  2121. .align 4
  2122. .LL88:
  2123. #ifndef TRMMKERNEL
  2124. LDF [C1 + 0 * SIZE], a1
  2125. FMADD (alpha, cc01, aa1, cc01)
  2126. #else
  2127. FMUL ALPHA, c01, c01
  2128. #endif
  2129. STF c01, [C1 + 0 * SIZE]
  2130. .align 4
  2131. .LL999:
  2132. #ifdef TRMMKERNEL
  2133. #ifndef __64BIT__
  2134. ld [%sp + STACK_START + 8], %g1
  2135. ld [%sp + STACK_START + 12], %g2
  2136. ld [%sp + STACK_START + 16], %g3
  2137. ld [%sp + STACK_START + 20], %g4
  2138. #else
  2139. ldx [%sp + STACK_START + 32], %g1
  2140. ldx [%sp + STACK_START + 40], %g2
  2141. ldx [%sp + STACK_START + 48], %g3
  2142. ldx [%sp + STACK_START + 56], %g4
  2143. #endif
  2144. #endif
  2145. return %i7 + 8
  2146. clr %o0
  2147. EPILOGUE