You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LT_2x8.S 73 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it selects */
  /* the assembler-side view of that header -- NOTE(review): confirm there.  */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Look-ahead distance in elements (scaled by SIZE at the use site) and   */
  /* the second operand of the `prefetch [AO + ...]` instructions used in   */
  /* the unrolled inner loop.                                               */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /* Integer-register aliases.  M, N and K sit in incoming registers;       */
  /* N >> 3 and M >> 1 seed the J and I counters further down.              */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /* In the 32-bit DOUBLE build A and B trade registers, and the prologue   */
  /* loads B from the stack frame.                                          */
  45. #if defined(DOUBLE) && !defined(__64BIT__)
  46. #define A %i5
  47. #define B %i4
  48. #else
  49. #define A %i4
  50. #define B %i5
  51. #endif
  /* C, LDC (and OFFSET below) are loaded from the stack frame by the       */
  /* prologue in every build variant.                                       */
  52. #define C %o4
  53. #define LDC %o5
  /* AO / BO: running pointers the inner loop loads through.                */
  54. #define AO %l0
  55. #define BO %l1
  /* I, J, L: loop counters (I from M >> 1, J from N >> 3, L for the        */
  /* unrolled inner loop).                                                  */
  56. #define I %l2
  57. #define J %l3
  58. #define L %l4
  /* C1..C8: eight addresses spaced LDC apart, derived from C at the top of */
  /* each J iteration.                                                      */
  59. #define C1 %o0
  60. #define C2 %o1
  61. #define C3 %o2
  62. #define C4 %o3
  63. #define C5 %l5
  64. #define C6 %l6
  65. #define C7 %l7
  66. #define C8 %i3
  /* OFFSET, KK, TEMP1 and TEMP2 occupy global registers %g1-%g4; the       */
  /* prologue stores those four registers into the stack frame.             */
  67. #define OFFSET %g1
  68. #define KK %g2
  69. #define TEMP1 %g3
  70. #define TEMP2 %g4
  /* AORIG receives a copy of A in the LN / RT variants.                    */
  71. #define AORIG %o7
  /* Floating-point register map.  c01-c16 are the sixteen accumulators,    */
  /* a1-a5 receive values loaded via AO and b1-b9 values loaded via BO.     */
  /* ccNN / aaN / bbN are numeric forms of the same registers, handed to    */
  /* the FCLR / FMADD macros (not defined in the visible part of this file).*/
  /* DOUBLE build: even-numbered registers %f0..%f58.  The numeric aliases  */
  /* for %f32 and above are odd (1, 3, ...) -- NOTE(review): this looks     */
  /* like the SPARC V9 double-register encoding; confirm against the        */
  /* definition of FMADD.                                                   */
  72. #ifdef DOUBLE
  73. #define c01 %f0
  74. #define c02 %f2
  75. #define c03 %f4
  76. #define c04 %f6
  77. #define c05 %f8
  78. #define c06 %f10
  79. #define c07 %f12
  80. #define c08 %f14
  81. #define c09 %f16
  82. #define c10 %f18
  83. #define c11 %f20
  84. #define c12 %f22
  85. #define c13 %f24
  86. #define c14 %f26
  87. #define c15 %f28
  88. #define c16 %f30
  89. #define a1 %f32
  90. #define a2 %f34
  91. #define a3 %f36
  92. #define a4 %f38
  93. #define a5 %f40
  94. #define b1 %f42
  95. #define b2 %f44
  96. #define b3 %f46
  97. #define b4 %f48
  98. #define b5 %f50
  99. #define b6 %f52
  100. #define b7 %f54
  101. #define b8 %f56
  102. #define b9 %f58
  103. #define cc01 0
  104. #define cc02 2
  105. #define cc03 4
  106. #define cc04 6
  107. #define cc05 8
  108. #define cc06 10
  109. #define cc07 12
  110. #define cc08 14
  111. #define cc09 16
  112. #define cc10 18
  113. #define cc11 20
  114. #define cc12 22
  115. #define cc13 24
  116. #define cc14 26
  117. #define cc15 28
  118. #define cc16 30
  119. #define aa1 1
  120. #define aa2 3
  121. #define aa3 5
  122. #define aa4 7
  123. #define aa5 9
  124. #define bb1 11
  125. #define bb2 13
  126. #define bb3 15
  127. #define bb4 17
  128. #define bb5 19
  129. #define bb6 21
  130. #define bb7 23
  131. #define bb8 25
  132. #define bb9 27
  /* Non-DOUBLE build: consecutive registers %f0..%f29; each numeric alias  */
  /* equals the register number.                                            */
  133. #else
  134. #define c01 %f0
  135. #define c02 %f1
  136. #define c03 %f2
  137. #define c04 %f3
  138. #define c05 %f4
  139. #define c06 %f5
  140. #define c07 %f6
  141. #define c08 %f7
  142. #define c09 %f8
  143. #define c10 %f9
  144. #define c11 %f10
  145. #define c12 %f11
  146. #define c13 %f12
  147. #define c14 %f13
  148. #define c15 %f14
  149. #define c16 %f15
  150. #define a1 %f16
  151. #define a2 %f17
  152. #define a3 %f18
  153. #define a4 %f19
  154. #define a5 %f20
  155. #define b1 %f21
  156. #define b2 %f22
  157. #define b3 %f23
  158. #define b4 %f24
  159. #define b5 %f25
  160. #define b6 %f26
  161. #define b7 %f27
  162. #define b8 %f28
  163. #define b9 %f29
  164. #define cc01 0
  165. #define cc02 1
  166. #define cc03 2
  167. #define cc04 3
  168. #define cc05 4
  169. #define cc06 5
  170. #define cc07 6
  171. #define cc08 7
  172. #define cc09 8
  173. #define cc10 9
  174. #define cc11 10
  175. #define cc12 11
  176. #define cc13 12
  177. #define cc14 13
  178. #define cc15 14
  179. #define cc16 15
  180. #define aa1 16
  181. #define aa2 17
  182. #define aa3 18
  183. #define aa4 19
  184. #define aa5 20
  185. #define bb1 21
  186. #define bb2 22
  187. #define bb3 23
  188. #define bb4 24
  189. #define bb5 25
  190. #define bb6 26
  191. #define bb7 27
  192. #define bb8 28
  193. #define bb9 29
  194. #endif
  /* Declare %g2 and %g3 (aliased above as KK and TEMP1) to the assembler   */
  /* as scratch registers.                                                  */
  195. .register %g2, #scratch
  196. .register %g3, #scratch
  197. PROLOGUE
  198. SAVESP
  199. nop
  200. #ifndef __64BIT__
  201. #ifdef DOUBLE
  202. ld [%sp + STACK_START + 28], B
  203. ld [%sp + STACK_START + 32], C
  204. ld [%sp + STACK_START + 36], LDC
  205. ld [%sp + STACK_START + 40], OFFSET
  206. #else
  207. ld [%sp + STACK_START + 28], C
  208. ld [%sp + STACK_START + 32], LDC
  209. ld [%sp + STACK_START + 36], OFFSET
  210. #endif
  211. st %g1, [%sp + STACK_START + 8]
  212. st %g2, [%sp + STACK_START + 12]
  213. st %g3, [%sp + STACK_START + 16]
  214. st %g4, [%sp + STACK_START + 20]
  215. #else
  216. ldx [%sp+ STACK_START + 56], C
  217. ldx [%sp+ STACK_START + 64], LDC
  218. ldx [%sp+ STACK_START + 72], OFFSET
  219. stx %g1, [%sp + STACK_START + 32]
  220. stx %g2, [%sp + STACK_START + 40]
  221. stx %g3, [%sp + STACK_START + 48]
  222. stx %g4, [%sp + STACK_START + 56]
  223. #endif
  224. #if defined(TRMMKERNEL) && !defined(LEFT)
  225. neg OFFSET, KK
  226. #endif
  227. sll LDC, BASE_SHIFT, LDC
  228. #ifdef LN
  229. smul M, K, TEMP1
  230. sll TEMP1, BASE_SHIFT, TEMP1
  231. add A, TEMP1, A
  232. sll M, BASE_SHIFT, TEMP1
  233. add C, TEMP1, C
  234. #endif
  235. #ifdef RN
  236. neg OFFSET, KK
  237. #endif
  238. #ifdef RT
  239. smul N, K, TEMP1
  240. sll TEMP1, BASE_SHIFT, TEMP1
  241. add B, TEMP1, B
  242. smul N, LDC, TEMP1
  243. add C, TEMP1, C
  244. sub N, OFFSET, KK
  245. #endif
  246. sra N, 3, J
  247. cmp J, 0
  248. ble,pn %icc, .LL30
  249. nop
  250. .align 4
  251. .LL11:
  252. #ifdef RT
  253. sll K, BASE_SHIFT + 3, TEMP1
  254. sub B, TEMP1, B
  255. #endif
  256. #ifndef RT
  257. mov C, C1
  258. add C, LDC, C2
  259. add C2, LDC, C3
  260. add C3, LDC, C4
  261. add C4, LDC, C5
  262. add C5, LDC, C6
  263. add C6, LDC, C7
  264. add C7, LDC, C8
  265. add C8, LDC, C
  266. #else
  267. sub C, LDC, C8
  268. sub C8, LDC, C7
  269. sub C7, LDC, C6
  270. sub C6, LDC, C5
  271. sub C5, LDC, C4
  272. sub C4, LDC, C3
  273. sub C3, LDC, C2
  274. sub C2, LDC, C1
  275. sub C2, LDC, C
  276. #endif
  277. #ifdef LN
  278. add M, OFFSET, KK
  279. #endif
  280. #ifdef LT
  281. mov OFFSET, KK
  282. #endif
  283. #if defined(LN) || defined(RT)
  284. mov A, AORIG
  285. #else
  286. mov A, AO
  287. #endif
  288. sra M, 1, I
  289. cmp I, 0
  290. ble,pn %icc, .LL20
  291. nop
  292. .align 4
  293. .LL12:
  294. #if defined(LT) || defined(RN)
  295. mov B, BO
  296. #else
  297. #ifdef LN
  298. sll K, BASE_SHIFT + 1, TEMP1
  299. sub AORIG, TEMP1, AORIG
  300. #endif
  301. sll KK, BASE_SHIFT + 1, TEMP1
  302. sll KK, BASE_SHIFT + 3, TEMP2
  303. add AORIG, TEMP1, AO
  304. add B, TEMP2, BO
  305. #endif
  306. LDF [AO + 0 * SIZE], a1
  307. LDF [AO + 1 * SIZE], a2
  308. LDF [AO + 8 * SIZE], a5
  309. LDF [BO + 0 * SIZE], b1
  310. LDF [BO + 1 * SIZE], b2
  311. FCLR (cc01)
  312. LDF [BO + 2 * SIZE], b3
  313. FCLR (cc05)
  314. LDF [BO + 3 * SIZE], b4
  315. FCLR (cc09)
  316. LDF [BO + 4 * SIZE], b5
  317. FCLR (cc13)
  318. LDF [BO + 5 * SIZE], b6
  319. FCLR (cc02)
  320. LDF [BO + 6 * SIZE], b7
  321. FCLR (cc06)
  322. LDF [BO + 7 * SIZE], b8
  323. FCLR (cc10)
  324. LDF [BO + 8 * SIZE], b9
  325. FCLR (cc14)
  326. prefetch [C1 + 1 * SIZE], 3
  327. FCLR (cc03)
  328. prefetch [C2 + 2 * SIZE], 3
  329. FCLR (cc07)
  330. prefetch [C3 + 1 * SIZE], 3
  331. FCLR (cc11)
  332. prefetch [C4 + 2 * SIZE], 3
  333. FCLR (cc15)
  334. prefetch [C5 + 1 * SIZE], 3
  335. FCLR (cc04)
  336. prefetch [C6 + 2 * SIZE], 3
  337. FCLR (cc08)
  338. prefetch [C7 + 1 * SIZE], 3
  339. FCLR (cc12)
  340. prefetch [C8 + 2 * SIZE], 3
  341. FCLR (cc16)
  342. #if defined(LT) || defined(RN)
  343. sra KK, 3, L
  344. #else
  345. sub K, KK, L
  346. sra L, 3, L
  347. #endif
  348. cmp L, 0
  349. ble,pn %icc, .LL15
  350. nop
  351. .align 4
  352. .LL13:
  353. FMADD (aa1, bb1, cc01, cc01)
  354. FMADD (aa2, bb1, cc02, cc02)
  355. FMADD (aa1, bb2, cc03, cc03)
  356. FMADD (aa2, bb2, cc04, cc04)
  357. FMADD (aa1, bb3, cc05, cc05)
  358. LDF [BO + 16 * SIZE], b1
  359. FMADD (aa2, bb3, cc06, cc06)
  360. LDF [BO + 9 * SIZE], b2
  361. FMADD (aa1, bb4, cc07, cc07)
  362. LDF [BO + 10 * SIZE], b3
  363. FMADD (aa2, bb4, cc08, cc08)
  364. LDF [BO + 11 * SIZE], b4
  365. FMADD (aa1, bb5, cc09, cc09)
  366. LDF [AO + 2 * SIZE], a3
  367. FMADD (aa2, bb5, cc10, cc10)
  368. LDF [AO + 3 * SIZE], a4
  369. FMADD (aa1, bb6, cc11, cc11)
  370. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  371. FMADD (aa2, bb6, cc12, cc12)
  372. nop
  373. FMADD (aa1, bb7, cc13, cc13)
  374. LDF [BO + 12 * SIZE], b5
  375. FMADD (aa2, bb7, cc14, cc14)
  376. LDF [BO + 13 * SIZE], b6
  377. FMADD (aa1, bb8, cc15, cc15)
  378. LDF [BO + 14 * SIZE], b7
  379. FMADD (aa2, bb8, cc16, cc16)
  380. LDF [BO + 15 * SIZE], b8
  381. FMADD (aa3, bb9, cc01, cc01)
  382. FMADD (aa4, bb9, cc02, cc02)
  383. FMADD (aa3, bb2, cc03, cc03)
  384. FMADD (aa4, bb2, cc04, cc04)
  385. FMADD (aa3, bb3, cc05, cc05)
  386. LDF [BO + 24 * SIZE], b9
  387. FMADD (aa4, bb3, cc06, cc06)
  388. LDF [BO + 17 * SIZE], b2
  389. FMADD (aa3, bb4, cc07, cc07)
  390. LDF [BO + 18 * SIZE], b3
  391. FMADD (aa4, bb4, cc08, cc08)
  392. LDF [BO + 19 * SIZE], b4
  393. FMADD (aa3, bb5, cc09, cc09)
  394. LDF [AO + 4 * SIZE], a1
  395. FMADD (aa4, bb5, cc10, cc10)
  396. LDF [AO + 5 * SIZE], a2
  397. FMADD (aa3, bb6, cc11, cc11)
  398. add L, -1, L
  399. FMADD (aa4, bb6, cc12, cc12)
  400. nop
  401. FMADD (aa3, bb7, cc13, cc13)
  402. LDF [BO + 20 * SIZE], b5
  403. FMADD (aa4, bb7, cc14, cc14)
  404. LDF [BO + 21 * SIZE], b6
  405. FMADD (aa3, bb8, cc15, cc15)
  406. LDF [BO + 22 * SIZE], b7
  407. FMADD (aa4, bb8, cc16, cc16)
  408. LDF [BO + 23 * SIZE], b8
  409. FMADD (aa1, bb1, cc01, cc01)
  410. FMADD (aa2, bb1, cc02, cc02)
  411. FMADD (aa1, bb2, cc03, cc03)
  412. FMADD (aa2, bb2, cc04, cc04)
  413. FMADD (aa1, bb3, cc05, cc05)
  414. LDF [BO + 32 * SIZE], b1
  415. FMADD (aa2, bb3, cc06, cc06)
  416. LDF [BO + 25 * SIZE], b2
  417. FMADD (aa1, bb4, cc07, cc07)
  418. LDF [BO + 26 * SIZE], b3
  419. FMADD (aa2, bb4, cc08, cc08)
  420. LDF [BO + 27 * SIZE], b4
  421. FMADD (aa1, bb5, cc09, cc09)
  422. LDF [AO + 6 * SIZE], a3
  423. FMADD (aa2, bb5, cc10, cc10)
  424. LDF [AO + 7 * SIZE], a4
  425. FMADD (aa1, bb6, cc11, cc11)
  426. nop
  427. FMADD (aa2, bb6, cc12, cc12)
  428. nop
  429. FMADD (aa1, bb7, cc13, cc13)
  430. LDF [BO + 28 * SIZE], b5
  431. FMADD (aa2, bb7, cc14, cc14)
  432. LDF [BO + 29 * SIZE], b6
  433. FMADD (aa1, bb8, cc15, cc15)
  434. LDF [BO + 30 * SIZE], b7
  435. FMADD (aa2, bb8, cc16, cc16)
  436. LDF [BO + 31 * SIZE], b8
  437. FMADD (aa3, bb9, cc01, cc01)
  438. FMADD (aa4, bb9, cc02, cc02)
  439. FMADD (aa3, bb2, cc03, cc03)
  440. FMADD (aa4, bb2, cc04, cc04)
  441. FMADD (aa3, bb3, cc05, cc05)
  442. LDF [BO + 40 * SIZE], b9
  443. FMADD (aa4, bb3, cc06, cc06)
  444. LDF [BO + 33 * SIZE], b2
  445. FMADD (aa3, bb4, cc07, cc07)
  446. LDF [BO + 34 * SIZE], b3
  447. FMADD (aa4, bb4, cc08, cc08)
  448. LDF [BO + 35 * SIZE], b4
  449. FMADD (aa3, bb5, cc09, cc09)
  450. LDF [AO + 16 * SIZE], a1 /****/
  451. FMADD (aa4, bb5, cc10, cc10)
  452. LDF [AO + 9 * SIZE], a2
  453. FMADD (aa3, bb6, cc11, cc11)
  454. nop
  455. FMADD (aa4, bb6, cc12, cc12)
  456. nop
  457. FMADD (aa3, bb7, cc13, cc13)
  458. LDF [BO + 36 * SIZE], b5
  459. FMADD (aa4, bb7, cc14, cc14)
  460. LDF [BO + 37 * SIZE], b6
  461. FMADD (aa3, bb8, cc15, cc15)
  462. LDF [BO + 38 * SIZE], b7
  463. FMADD (aa4, bb8, cc16, cc16)
  464. LDF [BO + 39 * SIZE], b8
  465. FMADD (aa5, bb1, cc01, cc01)
  466. FMADD (aa2, bb1, cc02, cc02)
  467. FMADD (aa5, bb2, cc03, cc03)
  468. FMADD (aa2, bb2, cc04, cc04)
  469. FMADD (aa5, bb3, cc05, cc05)
  470. LDF [BO + 48 * SIZE], b1
  471. FMADD (aa2, bb3, cc06, cc06)
  472. LDF [BO + 41 * SIZE], b2
  473. FMADD (aa5, bb4, cc07, cc07)
  474. LDF [BO + 42 * SIZE], b3
  475. FMADD (aa2, bb4, cc08, cc08)
  476. LDF [BO + 43 * SIZE], b4
  477. FMADD (aa5, bb5, cc09, cc09)
  478. LDF [AO + 10 * SIZE], a3
  479. FMADD (aa2, bb5, cc10, cc10)
  480. LDF [AO + 11 * SIZE], a4
  481. FMADD (aa5, bb6, cc11, cc11)
  482. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  483. FMADD (aa2, bb6, cc12, cc12)
  484. nop
  485. FMADD (aa5, bb7, cc13, cc13)
  486. LDF [BO + 44 * SIZE], b5
  487. FMADD (aa2, bb7, cc14, cc14)
  488. LDF [BO + 45 * SIZE], b6
  489. FMADD (aa5, bb8, cc15, cc15)
  490. LDF [BO + 46 * SIZE], b7
  491. FMADD (aa2, bb8, cc16, cc16)
  492. LDF [BO + 47 * SIZE], b8
  493. FMADD (aa3, bb9, cc01, cc01)
  494. FMADD (aa4, bb9, cc02, cc02)
  495. FMADD (aa3, bb2, cc03, cc03)
  496. FMADD (aa4, bb2, cc04, cc04)
  497. FMADD (aa3, bb3, cc05, cc05)
  498. LDF [BO + 56 * SIZE], b9
  499. FMADD (aa4, bb3, cc06, cc06)
  500. LDF [BO + 49 * SIZE], b2
  501. FMADD (aa3, bb4, cc07, cc07)
  502. LDF [BO + 50 * SIZE], b3
  503. FMADD (aa4, bb4, cc08, cc08)
  504. LDF [BO + 51 * SIZE], b4
  505. FMADD (aa3, bb5, cc09, cc09)
  506. LDF [AO + 12 * SIZE], a5
  507. FMADD (aa4, bb5, cc10, cc10)
  508. LDF [AO + 13 * SIZE], a2
  509. FMADD (aa3, bb6, cc11, cc11)
  510. cmp L, 0
  511. FMADD (aa4, bb6, cc12, cc12)
  512. nop
  513. FMADD (aa3, bb7, cc13, cc13)
  514. LDF [BO + 52 * SIZE], b5
  515. FMADD (aa4, bb7, cc14, cc14)
  516. LDF [BO + 53 * SIZE], b6
  517. FMADD (aa3, bb8, cc15, cc15)
  518. LDF [BO + 54 * SIZE], b7
  519. FMADD (aa4, bb8, cc16, cc16)
  520. LDF [BO + 55 * SIZE], b8
  521. FMADD (aa5, bb1, cc01, cc01)
  522. FMADD (aa2, bb1, cc02, cc02)
  523. FMADD (aa5, bb2, cc03, cc03)
  524. FMADD (aa2, bb2, cc04, cc04)
  525. FMADD (aa5, bb3, cc05, cc05)
  526. LDF [BO + 64 * SIZE], b1
  527. FMADD (aa2, bb3, cc06, cc06)
  528. LDF [BO + 57 * SIZE], b2
  529. FMADD (aa5, bb4, cc07, cc07)
  530. LDF [BO + 58 * SIZE], b3
  531. FMADD (aa2, bb4, cc08, cc08)
  532. LDF [BO + 59 * SIZE], b4
  533. FMADD (aa5, bb5, cc09, cc09)
  534. LDF [AO + 14 * SIZE], a3
  535. FMADD (aa2, bb5, cc10, cc10)
  536. LDF [AO + 15 * SIZE], a4
  537. FMADD (aa5, bb6, cc11, cc11)
  538. add BO, 64 * SIZE, BO
  539. FMADD (aa2, bb6, cc12, cc12)
  540. add AO, 16 * SIZE, AO
  541. FMADD (aa5, bb7, cc13, cc13)
  542. LDF [BO - 4 * SIZE], b5
  543. FMADD (aa2, bb7, cc14, cc14)
  544. LDF [BO - 3 * SIZE], b6
  545. FMADD (aa5, bb8, cc15, cc15)
  546. LDF [BO - 2 * SIZE], b7
  547. FMADD (aa2, bb8, cc16, cc16)
  548. LDF [BO - 1 * SIZE], b8
  549. FMADD (aa3, bb9, cc01, cc01)
  550. FMADD (aa4, bb9, cc02, cc02)
  551. FMADD (aa3, bb2, cc03, cc03)
  552. FMADD (aa4, bb2, cc04, cc04)
  553. FMADD (aa3, bb3, cc05, cc05)
  554. LDF [BO + 8 * SIZE], b9
  555. FMADD (aa4, bb3, cc06, cc06)
  556. LDF [BO + 1 * SIZE], b2
  557. FMADD (aa3, bb4, cc07, cc07)
  558. LDF [BO + 2 * SIZE], b3
  559. FMADD (aa4, bb4, cc08, cc08)
  560. LDF [BO + 3 * SIZE], b4
  561. FMADD (aa3, bb5, cc09, cc09)
  562. LDF [AO + 8 * SIZE], a5 /****/
  563. FMADD (aa4, bb5, cc10, cc10)
  564. LDF [AO + 1 * SIZE], a2
  565. FMADD (aa3, bb6, cc11, cc11)
  566. FMADD (aa4, bb6, cc12, cc12)
  567. FMADD (aa3, bb7, cc13, cc13)
  568. LDF [BO + 4 * SIZE], b5
  569. FMADD (aa4, bb7, cc14, cc14)
  570. LDF [BO + 5 * SIZE], b6
  571. FMADD (aa3, bb8, cc15, cc15)
  572. LDF [BO + 6 * SIZE], b7
  573. FMADD (aa4, bb8, cc16, cc16)
  574. ble,pn %icc, .LL15
  575. LDF [BO + 7 * SIZE], b8
  576. FMADD (aa1, bb1, cc01, cc01)
  577. FMADD (aa2, bb1, cc02, cc02)
  578. FMADD (aa1, bb2, cc03, cc03)
  579. FMADD (aa2, bb2, cc04, cc04)
  580. FMADD (aa1, bb3, cc05, cc05)
  581. LDF [BO + 16 * SIZE], b1
  582. FMADD (aa2, bb3, cc06, cc06)
  583. LDF [BO + 9 * SIZE], b2
  584. FMADD (aa1, bb4, cc07, cc07)
  585. LDF [BO + 10 * SIZE], b3
  586. FMADD (aa2, bb4, cc08, cc08)
  587. LDF [BO + 11 * SIZE], b4
  588. FMADD (aa1, bb5, cc09, cc09)
  589. LDF [AO + 2 * SIZE], a3
  590. FMADD (aa2, bb5, cc10, cc10)
  591. LDF [AO + 3 * SIZE], a4
  592. FMADD (aa1, bb6, cc11, cc11)
  593. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  594. FMADD (aa2, bb6, cc12, cc12)
  595. nop
  596. FMADD (aa1, bb7, cc13, cc13)
  597. LDF [BO + 12 * SIZE], b5
  598. FMADD (aa2, bb7, cc14, cc14)
  599. LDF [BO + 13 * SIZE], b6
  600. FMADD (aa1, bb8, cc15, cc15)
  601. LDF [BO + 14 * SIZE], b7
  602. FMADD (aa2, bb8, cc16, cc16)
  603. LDF [BO + 15 * SIZE], b8
  604. FMADD (aa3, bb9, cc01, cc01)
  605. FMADD (aa4, bb9, cc02, cc02)
  606. FMADD (aa3, bb2, cc03, cc03)
  607. FMADD (aa4, bb2, cc04, cc04)
  608. FMADD (aa3, bb3, cc05, cc05)
  609. LDF [BO + 24 * SIZE], b9
  610. FMADD (aa4, bb3, cc06, cc06)
  611. LDF [BO + 17 * SIZE], b2
  612. FMADD (aa3, bb4, cc07, cc07)
  613. LDF [BO + 18 * SIZE], b3
  614. FMADD (aa4, bb4, cc08, cc08)
  615. LDF [BO + 19 * SIZE], b4
  616. FMADD (aa3, bb5, cc09, cc09)
  617. LDF [AO + 4 * SIZE], a1
  618. FMADD (aa4, bb5, cc10, cc10)
  619. LDF [AO + 5 * SIZE], a2
  620. FMADD (aa3, bb6, cc11, cc11)
  621. add L, -1, L
  622. FMADD (aa4, bb6, cc12, cc12)
  623. nop
  624. FMADD (aa3, bb7, cc13, cc13)
  625. LDF [BO + 20 * SIZE], b5
  626. FMADD (aa4, bb7, cc14, cc14)
  627. LDF [BO + 21 * SIZE], b6
  628. FMADD (aa3, bb8, cc15, cc15)
  629. LDF [BO + 22 * SIZE], b7
  630. FMADD (aa4, bb8, cc16, cc16)
  631. LDF [BO + 23 * SIZE], b8
  632. FMADD (aa1, bb1, cc01, cc01)
  633. FMADD (aa2, bb1, cc02, cc02)
  634. FMADD (aa1, bb2, cc03, cc03)
  635. FMADD (aa2, bb2, cc04, cc04)
  636. FMADD (aa1, bb3, cc05, cc05)
  637. LDF [BO + 32 * SIZE], b1
  638. FMADD (aa2, bb3, cc06, cc06)
  639. LDF [BO + 25 * SIZE], b2
  640. FMADD (aa1, bb4, cc07, cc07)
  641. LDF [BO + 26 * SIZE], b3
  642. FMADD (aa2, bb4, cc08, cc08)
  643. LDF [BO + 27 * SIZE], b4
  644. FMADD (aa1, bb5, cc09, cc09)
  645. LDF [AO + 6 * SIZE], a3
  646. FMADD (aa2, bb5, cc10, cc10)
  647. LDF [AO + 7 * SIZE], a4
  648. FMADD (aa1, bb6, cc11, cc11)
  649. nop
  650. FMADD (aa2, bb6, cc12, cc12)
  651. nop
  652. FMADD (aa1, bb7, cc13, cc13)
  653. LDF [BO + 28 * SIZE], b5
  654. FMADD (aa2, bb7, cc14, cc14)
  655. LDF [BO + 29 * SIZE], b6
  656. FMADD (aa1, bb8, cc15, cc15)
  657. LDF [BO + 30 * SIZE], b7
  658. FMADD (aa2, bb8, cc16, cc16)
  659. LDF [BO + 31 * SIZE], b8
  660. FMADD (aa3, bb9, cc01, cc01)
  661. FMADD (aa4, bb9, cc02, cc02)
  662. FMADD (aa3, bb2, cc03, cc03)
  663. FMADD (aa4, bb2, cc04, cc04)
  664. FMADD (aa3, bb3, cc05, cc05)
  665. LDF [BO + 40 * SIZE], b9
  666. FMADD (aa4, bb3, cc06, cc06)
  667. LDF [BO + 33 * SIZE], b2
  668. FMADD (aa3, bb4, cc07, cc07)
  669. LDF [BO + 34 * SIZE], b3
  670. FMADD (aa4, bb4, cc08, cc08)
  671. LDF [BO + 35 * SIZE], b4
  672. FMADD (aa3, bb5, cc09, cc09)
  673. LDF [AO + 16 * SIZE], a1 /****/
  674. FMADD (aa4, bb5, cc10, cc10)
  675. LDF [AO + 9 * SIZE], a2
  676. FMADD (aa3, bb6, cc11, cc11)
  677. nop
  678. FMADD (aa4, bb6, cc12, cc12)
  679. nop
  680. FMADD (aa3, bb7, cc13, cc13)
  681. LDF [BO + 36 * SIZE], b5
  682. FMADD (aa4, bb7, cc14, cc14)
  683. LDF [BO + 37 * SIZE], b6
  684. FMADD (aa3, bb8, cc15, cc15)
  685. LDF [BO + 38 * SIZE], b7
  686. FMADD (aa4, bb8, cc16, cc16)
  687. LDF [BO + 39 * SIZE], b8
  688. FMADD (aa5, bb1, cc01, cc01)
  689. FMADD (aa2, bb1, cc02, cc02)
  690. FMADD (aa5, bb2, cc03, cc03)
  691. FMADD (aa2, bb2, cc04, cc04)
  692. FMADD (aa5, bb3, cc05, cc05)
  693. LDF [BO + 48 * SIZE], b1
  694. FMADD (aa2, bb3, cc06, cc06)
  695. LDF [BO + 41 * SIZE], b2
  696. FMADD (aa5, bb4, cc07, cc07)
  697. LDF [BO + 42 * SIZE], b3
  698. FMADD (aa2, bb4, cc08, cc08)
  699. LDF [BO + 43 * SIZE], b4
  700. FMADD (aa5, bb5, cc09, cc09)
  701. LDF [AO + 10 * SIZE], a3
  702. FMADD (aa2, bb5, cc10, cc10)
  703. LDF [AO + 11 * SIZE], a4
  704. FMADD (aa5, bb6, cc11, cc11)
  705. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  706. FMADD (aa2, bb6, cc12, cc12)
  707. nop
  708. FMADD (aa5, bb7, cc13, cc13)
  709. LDF [BO + 44 * SIZE], b5
  710. FMADD (aa2, bb7, cc14, cc14)
  711. LDF [BO + 45 * SIZE], b6
  712. FMADD (aa5, bb8, cc15, cc15)
  713. LDF [BO + 46 * SIZE], b7
  714. FMADD (aa2, bb8, cc16, cc16)
  715. LDF [BO + 47 * SIZE], b8
  716. FMADD (aa3, bb9, cc01, cc01)
  717. FMADD (aa4, bb9, cc02, cc02)
  718. FMADD (aa3, bb2, cc03, cc03)
  719. FMADD (aa4, bb2, cc04, cc04)
  720. FMADD (aa3, bb3, cc05, cc05)
  721. LDF [BO + 56 * SIZE], b9
  722. FMADD (aa4, bb3, cc06, cc06)
  723. LDF [BO + 49 * SIZE], b2
  724. FMADD (aa3, bb4, cc07, cc07)
  725. LDF [BO + 50 * SIZE], b3
  726. FMADD (aa4, bb4, cc08, cc08)
  727. LDF [BO + 51 * SIZE], b4
  728. FMADD (aa3, bb5, cc09, cc09)
  729. LDF [AO + 12 * SIZE], a5
  730. FMADD (aa4, bb5, cc10, cc10)
  731. LDF [AO + 13 * SIZE], a2
  732. FMADD (aa3, bb6, cc11, cc11)
  733. cmp L, 0
  734. FMADD (aa4, bb6, cc12, cc12)
  735. nop
  736. FMADD (aa3, bb7, cc13, cc13)
  737. LDF [BO + 52 * SIZE], b5
  738. FMADD (aa4, bb7, cc14, cc14)
  739. LDF [BO + 53 * SIZE], b6
  740. FMADD (aa3, bb8, cc15, cc15)
  741. LDF [BO + 54 * SIZE], b7
  742. FMADD (aa4, bb8, cc16, cc16)
  743. LDF [BO + 55 * SIZE], b8
  744. FMADD (aa5, bb1, cc01, cc01)
  745. FMADD (aa2, bb1, cc02, cc02)
  746. FMADD (aa5, bb2, cc03, cc03)
  747. FMADD (aa2, bb2, cc04, cc04)
  748. FMADD (aa5, bb3, cc05, cc05)
  749. LDF [BO + 64 * SIZE], b1
  750. FMADD (aa2, bb3, cc06, cc06)
  751. LDF [BO + 57 * SIZE], b2
  752. FMADD (aa5, bb4, cc07, cc07)
  753. LDF [BO + 58 * SIZE], b3
  754. FMADD (aa2, bb4, cc08, cc08)
  755. LDF [BO + 59 * SIZE], b4
  756. FMADD (aa5, bb5, cc09, cc09)
  757. LDF [AO + 14 * SIZE], a3
  758. FMADD (aa2, bb5, cc10, cc10)
  759. LDF [AO + 15 * SIZE], a4
  760. FMADD (aa5, bb6, cc11, cc11)
  761. add BO, 64 * SIZE, BO
  762. FMADD (aa2, bb6, cc12, cc12)
  763. add AO, 16 * SIZE, AO
  764. FMADD (aa5, bb7, cc13, cc13)
  765. LDF [BO - 4 * SIZE], b5
  766. FMADD (aa2, bb7, cc14, cc14)
  767. LDF [BO - 3 * SIZE], b6
  768. FMADD (aa5, bb8, cc15, cc15)
  769. LDF [BO - 2 * SIZE], b7
  770. FMADD (aa2, bb8, cc16, cc16)
  771. LDF [BO - 1 * SIZE], b8
  772. FMADD (aa3, bb9, cc01, cc01)
  773. FMADD (aa4, bb9, cc02, cc02)
  774. FMADD (aa3, bb2, cc03, cc03)
  775. FMADD (aa4, bb2, cc04, cc04)
  776. FMADD (aa3, bb3, cc05, cc05)
  777. LDF [BO + 8 * SIZE], b9
  778. FMADD (aa4, bb3, cc06, cc06)
  779. LDF [BO + 1 * SIZE], b2
  780. FMADD (aa3, bb4, cc07, cc07)
  781. LDF [BO + 2 * SIZE], b3
  782. FMADD (aa4, bb4, cc08, cc08)
  783. LDF [BO + 3 * SIZE], b4
  784. FMADD (aa3, bb5, cc09, cc09)
  785. LDF [AO + 8 * SIZE], a5 /****/
  786. FMADD (aa4, bb5, cc10, cc10)
  787. LDF [AO + 1 * SIZE], a2
  788. FMADD (aa3, bb6, cc11, cc11)
  789. FMADD (aa4, bb6, cc12, cc12)
  790. FMADD (aa3, bb7, cc13, cc13)
  791. LDF [BO + 4 * SIZE], b5
  792. FMADD (aa4, bb7, cc14, cc14)
  793. LDF [BO + 5 * SIZE], b6
  794. FMADD (aa3, bb8, cc15, cc15)
  795. LDF [BO + 6 * SIZE], b7
  796. FMADD (aa4, bb8, cc16, cc16)
  797. bg,pt %icc, .LL13
  798. LDF [BO + 7 * SIZE], b8
  799. .align 4
  /*
   * .LL15: work out how many k-steps are left after the unrolled loop above.
   * L = (KK & 7) for LT/RN builds, ((K - KK) & 7) otherwise.
   * If nothing is left (L <= 0) branch to .LL18; the delay slot holds only
   * a nop, so the annul bit has no visible effect here.
   */
  800. .LL15:
  801. #if defined(LT) || defined(RN)
  802. and KK, 7, L
  803. #else
  804. sub K, KK, L
  805. and L, 7, L
  806. #endif
  807. cmp L, 0
  808. ble,a,pn %icc, .LL18
  809. nop
  810. .align 4
  /*
   * .LL17: tail loop for the 2-row tile, one k-step per pass.
   * Each pass issues 16 FMADDs pairing the two A operands (aa1, aa2) with the
   * eight B operands (bb1..bb8) into cc01..cc16, then advances AO by 2
   * elements and BO by 8 elements and reloads the operands for the next step.
   * NOTE(review): FMADD and the aaN/bbN/ccN names are defined outside this
   * chunk; presumably FMADD (x, y, z, w) computes w = x * y + z and
   * aaN/bbN/ccN name the same registers as aN/bN/cNN -- confirm in the header.
   */
  811. .LL17:
  812. FMADD (aa1, bb1, cc01, cc01)
  813. add L, -1, L
  814. FMADD (aa2, bb1, cc02, cc02)
  815. nop
  816. FMADD (aa1, bb2, cc03, cc03)
  /* b1..b6 for the next step are fetched at BO + 8..13, before BO moves. */
  817. LDF [BO + 8 * SIZE], b1
  818. FMADD (aa2, bb2, cc04, cc04)
  819. LDF [BO + 9 * SIZE], b2
  820. FMADD (aa1, bb3, cc05, cc05)
  /* Sets %icc for the bg at the bottom; the plain add instructions that
     follow do not update the condition codes. */
  821. cmp L, 0
  822. FMADD (aa2, bb3, cc06, cc06)
  823. nop
  824. FMADD (aa1, bb4, cc07, cc07)
  825. LDF [BO + 10 * SIZE], b3
  826. FMADD (aa2, bb4, cc08, cc08)
  827. LDF [BO + 11 * SIZE], b4
  828. FMADD (aa1, bb5, cc09, cc09)
  829. nop
  830. FMADD (aa2, bb5, cc10, cc10)
  831. nop
  832. FMADD (aa1, bb6, cc11, cc11)
  833. LDF [BO + 12 * SIZE], b5
  834. FMADD (aa2, bb6, cc12, cc12)
  835. LDF [BO + 13 * SIZE], b6
  836. FMADD (aa1, bb7, cc13, cc13)
  837. add AO, 2 * SIZE, AO
  838. FMADD (aa2, bb7, cc14, cc14)
  839. add BO, 8 * SIZE, BO
  840. FMADD (aa1, bb8, cc15, cc15)
  /* AO and BO have already been advanced, so offsets are relative to the
     next step: a1/a2 at AO + 0..1, b7/b8 at BO + 6..7. */
  841. LDF [AO + 0 * SIZE], a1
  842. FMADD (aa2, bb8, cc16, cc16)
  843. LDF [AO + 1 * SIZE], a2
  844. LDF [BO + 6 * SIZE], b7
  845. bg,pt %icc, .LL17
  /* Branch delay slot: the b8 load executes whether or not the loop repeats. */
  846. LDF [BO + 7 * SIZE], b8
  847. nop
  848. .align 4
  /*
   * .LL18: accumulation for the 2x8 tile is finished.
   * LN/RT builds first reposition the operand pointers:
   *   AO = AORIG + (KK - n) * 2 elements,  BO = B + (KK - n) * 8 elements,
   * with n = 2 for LN and n = 8 for RT.
   * NOTE(review): this reading assumes BASE_SHIFT is log2 of the element size
   * in bytes; it is defined outside this chunk -- confirm.
   */
  849. .LL18:
  850. #if defined(LN) || defined(RT)
  851. #ifdef LN
  852. sub KK, 2, TEMP1
  853. #else
  854. sub KK, 8, TEMP1
  855. #endif
  856. sll TEMP1, BASE_SHIFT + 1, TEMP2
  857. sll TEMP1, BASE_SHIFT + 3, TEMP1
  858. add AORIG, TEMP2, AO
  859. add B, TEMP1, BO
  860. #endif
  /*
   * Fold in the packed right-hand side: every accumulator is replaced by
   * (loaded value - accumulator), assuming FSUB follows the SPARC
   * "rs1, rs2, rd" operand order (rd = rs1 - rs2) -- FSUB is a macro defined
   * outside this chunk.
   * LN/LT read from BO: BO[0..7] pair with c01,c03,..,c15 and BO[8..15] pair
   * with c02,c04,..,c16.
   */
  861. #if defined(LN) || defined(LT)
  862. LDF [BO + 0 * SIZE], a1
  863. LDF [BO + 1 * SIZE], a2
  864. LDF [BO + 2 * SIZE], a3
  865. LDF [BO + 3 * SIZE], a4
  866. LDF [BO + 4 * SIZE], b1
  867. LDF [BO + 5 * SIZE], b2
  868. LDF [BO + 6 * SIZE], b3
  869. LDF [BO + 7 * SIZE], b4
  870. FSUB a1, c01, c01
  871. FSUB a2, c03, c03
  872. FSUB a3, c05, c05
  873. FSUB a4, c07, c07
  874. FSUB b1, c09, c09
  875. FSUB b2, c11, c11
  876. FSUB b3, c13, c13
  877. FSUB b4, c15, c15
  878. LDF [BO + 8 * SIZE], a1
  879. LDF [BO + 9 * SIZE], a2
  880. LDF [BO + 10 * SIZE], a3
  881. LDF [BO + 11 * SIZE], a4
  882. LDF [BO + 12 * SIZE], b1
  883. LDF [BO + 13 * SIZE], b2
  884. LDF [BO + 14 * SIZE], b3
  885. LDF [BO + 15 * SIZE], b4
  886. FSUB a1, c02, c02
  887. FSUB a2, c04, c04
  888. FSUB a3, c06, c06
  889. FSUB a4, c08, c08
  890. FSUB b1, c10, c10
  891. FSUB b2, c12, c12
  892. FSUB b3, c14, c14
  893. FSUB b4, c16, c16
  894. #else
  /* RN/RT read from AO instead: AO[0..15] pair with c01..c16 in order. */
  895. LDF [AO + 0 * SIZE], a1
  896. LDF [AO + 1 * SIZE], a2
  897. LDF [AO + 2 * SIZE], a3
  898. LDF [AO + 3 * SIZE], a4
  899. LDF [AO + 4 * SIZE], b1
  900. LDF [AO + 5 * SIZE], b2
  901. LDF [AO + 6 * SIZE], b3
  902. LDF [AO + 7 * SIZE], b4
  903. FSUB a1, c01, c01
  904. FSUB a2, c02, c02
  905. FSUB a3, c03, c03
  906. FSUB a4, c04, c04
  907. FSUB b1, c05, c05
  908. FSUB b2, c06, c06
  909. FSUB b3, c07, c07
  910. FSUB b4, c08, c08
  911. LDF [AO + 8 * SIZE], a1
  912. LDF [AO + 9 * SIZE], a2
  913. LDF [AO + 10 * SIZE], a3
  914. LDF [AO + 11 * SIZE], a4
  915. LDF [AO + 12 * SIZE], b1
  916. LDF [AO + 13 * SIZE], b2
  917. LDF [AO + 14 * SIZE], b3
  918. LDF [AO + 15 * SIZE], b4
  919. FSUB a1, c09, c09
  920. FSUB a2, c10, c10
  921. FSUB a3, c11, c11
  922. FSUB a4, c12, c12
  923. FSUB b1, c13, c13
  924. FSUB b2, c14, c14
  925. FSUB b3, c15, c15
  926. FSUB b4, c16, c16
  927. #endif
  /*
   * LN solve for the 2-row tile, using AO[3], AO[2] and AO[0]:
   *   1. even accumulators (c02..c16) are scaled by AO[3];
   *   2. odd accumulators (c01..c15) are updated via FNMSUB with AO[2] and
   *      the matching even accumulator;
   *   3. odd accumulators are scaled by AO[0].
   * NOTE(review): the diagonal entries are multiplied in, not divided, so the
   * packed A presumably stores reciprocals of the diagonal -- confirm in the
   * packing routine.  FNMSUB (x, y, z, w) is presumably w = z - x * y; the
   * macro is defined outside this chunk.
   */
  928. #ifdef LN
  929. LDF [AO + 3 * SIZE], a1
  930. LDF [AO + 2 * SIZE], a2
  931. LDF [AO + 0 * SIZE], a3
  932. FMUL a1, c02, c02
  933. FMUL a1, c04, c04
  934. FMUL a1, c06, c06
  935. FMUL a1, c08, c08
  936. FMUL a1, c10, c10
  937. FMUL a1, c12, c12
  938. FMUL a1, c14, c14
  939. FMUL a1, c16, c16
  940. FNMSUB (aa2, cc02, cc01, cc01)
  941. FNMSUB (aa2, cc04, cc03, cc03)
  942. FNMSUB (aa2, cc06, cc05, cc05)
  943. FNMSUB (aa2, cc08, cc07, cc07)
  944. FNMSUB (aa2, cc10, cc09, cc09)
  945. FNMSUB (aa2, cc12, cc11, cc11)
  946. FNMSUB (aa2, cc14, cc13, cc13)
  947. FNMSUB (aa2, cc16, cc15, cc15)
  948. FMUL a3, c01, c01
  949. FMUL a3, c03, c03
  950. FMUL a3, c05, c05
  951. FMUL a3, c07, c07
  952. FMUL a3, c09, c09
  953. FMUL a3, c11, c11
  954. FMUL a3, c13, c13
  955. FMUL a3, c15, c15
  956. #endif
  /*
   * LT solve for the 2-row tile, using AO[0], AO[1] and AO[3]:
   *   1. odd accumulators (c01..c15) are scaled by AO[0];
   *   2. even accumulators (c02..c16) are updated via FNMSUB with AO[1] and
   *      the matching odd accumulator;
   *   3. even accumulators are scaled by AO[3].
   * NOTE(review): as the diagonal entries are multiplied in, the packed A
   * presumably stores reciprocals of the diagonal -- confirm.  FNMSUB
   * (x, y, z, w) is presumably w = z - x * y (macro defined elsewhere).
   */
  957. #ifdef LT
  958. LDF [AO + 0 * SIZE], a1
  959. LDF [AO + 1 * SIZE], a2
  960. LDF [AO + 3 * SIZE], a3
  961. FMUL a1, c01, c01
  962. FMUL a1, c03, c03
  963. FMUL a1, c05, c05
  964. FMUL a1, c07, c07
  965. FMUL a1, c09, c09
  966. FMUL a1, c11, c11
  967. FMUL a1, c13, c13
  968. FMUL a1, c15, c15
  969. FNMSUB (aa2, cc01, cc02, cc02)
  970. FNMSUB (aa2, cc03, cc04, cc04)
  971. FNMSUB (aa2, cc05, cc06, cc06)
  972. FNMSUB (aa2, cc07, cc08, cc08)
  973. FNMSUB (aa2, cc09, cc10, cc10)
  974. FNMSUB (aa2, cc11, cc12, cc12)
  975. FNMSUB (aa2, cc13, cc14, cc14)
  976. FNMSUB (aa2, cc15, cc16, cc16)
  977. FMUL a3, c02, c02
  978. FMUL a3, c04, c04
  979. FMUL a3, c06, c06
  980. FMUL a3, c08, c08
  981. FMUL a3, c10, c10
  982. FMUL a3, c12, c12
  983. FMUL a3, c14, c14
  984. FMUL a3, c16, c16
  985. #endif
  /*
   * RN solve for the 2-row tile: walks the eight accumulator pairs
   * (c01/c02, c03/c04, .., c15/c16) in ascending order.  For pair j (0..7):
   *   - the pair is scaled by BO[8*j + j];
   *   - BO[8*j + j+1 .. 8*j + 7] are used with FNMSUB to update every later
   *     pair from the freshly scaled one.
   * NOTE(review): scaling by the diagonal entry (rather than dividing)
   * suggests the packed B stores reciprocal diagonals -- confirm.  FNMSUB
   * (x, y, z, w) is presumably w = z - x * y (macro defined elsewhere).
   */
  986. #ifdef RN
  /* Pair 0 (c01/c02): BO[0] scales, BO[1..7] eliminate. */
  987. LDF [BO + 0 * SIZE], a1
  988. LDF [BO + 1 * SIZE], a2
  989. LDF [BO + 2 * SIZE], a3
  990. LDF [BO + 3 * SIZE], a4
  991. LDF [BO + 4 * SIZE], b1
  992. LDF [BO + 5 * SIZE], b2
  993. LDF [BO + 6 * SIZE], b3
  994. LDF [BO + 7 * SIZE], b4
  995. FMUL a1, c01, c01
  996. FMUL a1, c02, c02
  997. FNMSUB (aa2, cc01, cc03, cc03)
  998. FNMSUB (aa2, cc02, cc04, cc04)
  999. FNMSUB (aa3, cc01, cc05, cc05)
  1000. FNMSUB (aa3, cc02, cc06, cc06)
  1001. FNMSUB (aa4, cc01, cc07, cc07)
  1002. FNMSUB (aa4, cc02, cc08, cc08)
  1003. FNMSUB (bb1, cc01, cc09, cc09)
  1004. FNMSUB (bb1, cc02, cc10, cc10)
  1005. FNMSUB (bb2, cc01, cc11, cc11)
  1006. FNMSUB (bb2, cc02, cc12, cc12)
  1007. FNMSUB (bb3, cc01, cc13, cc13)
  1008. FNMSUB (bb3, cc02, cc14, cc14)
  1009. FNMSUB (bb4, cc01, cc15, cc15)
  1010. FNMSUB (bb4, cc02, cc16, cc16)
  /* Pair 1 (c03/c04): BO[9] scales, BO[10..15] eliminate. */
  1011. LDF [BO + 9 * SIZE], a1
  1012. LDF [BO + 10 * SIZE], a2
  1013. LDF [BO + 11 * SIZE], a3
  1014. LDF [BO + 12 * SIZE], a4
  1015. LDF [BO + 13 * SIZE], b1
  1016. LDF [BO + 14 * SIZE], b2
  1017. LDF [BO + 15 * SIZE], b3
  1018. FMUL a1, c03, c03
  1019. FMUL a1, c04, c04
  1020. FNMSUB (aa2, cc03, cc05, cc05)
  1021. FNMSUB (aa2, cc04, cc06, cc06)
  1022. FNMSUB (aa3, cc03, cc07, cc07)
  1023. FNMSUB (aa3, cc04, cc08, cc08)
  1024. FNMSUB (aa4, cc03, cc09, cc09)
  1025. FNMSUB (aa4, cc04, cc10, cc10)
  1026. FNMSUB (bb1, cc03, cc11, cc11)
  1027. FNMSUB (bb1, cc04, cc12, cc12)
  1028. FNMSUB (bb2, cc03, cc13, cc13)
  1029. FNMSUB (bb2, cc04, cc14, cc14)
  1030. FNMSUB (bb3, cc03, cc15, cc15)
  1031. FNMSUB (bb3, cc04, cc16, cc16)
  /* Pair 2 (c05/c06): BO[18] scales, BO[19..23] eliminate. */
  1032. LDF [BO + 18 * SIZE], a1
  1033. LDF [BO + 19 * SIZE], a2
  1034. LDF [BO + 20 * SIZE], a3
  1035. LDF [BO + 21 * SIZE], a4
  1036. LDF [BO + 22 * SIZE], b1
  1037. LDF [BO + 23 * SIZE], b2
  1038. FMUL a1, c05, c05
  1039. FMUL a1, c06, c06
  1040. FNMSUB (aa2, cc05, cc07, cc07)
  1041. FNMSUB (aa2, cc06, cc08, cc08)
  1042. FNMSUB (aa3, cc05, cc09, cc09)
  1043. FNMSUB (aa3, cc06, cc10, cc10)
  1044. FNMSUB (aa4, cc05, cc11, cc11)
  1045. FNMSUB (aa4, cc06, cc12, cc12)
  1046. FNMSUB (bb1, cc05, cc13, cc13)
  1047. FNMSUB (bb1, cc06, cc14, cc14)
  1048. FNMSUB (bb2, cc05, cc15, cc15)
  1049. FNMSUB (bb2, cc06, cc16, cc16)
  /* Pair 3 (c07/c08): BO[27] scales, BO[28..31] eliminate. */
  1050. LDF [BO + 27 * SIZE], a1
  1051. LDF [BO + 28 * SIZE], a2
  1052. LDF [BO + 29 * SIZE], a3
  1053. LDF [BO + 30 * SIZE], a4
  1054. LDF [BO + 31 * SIZE], b1
  1055. FMUL a1, c07, c07
  1056. FMUL a1, c08, c08
  1057. FNMSUB (aa2, cc07, cc09, cc09)
  1058. FNMSUB (aa2, cc08, cc10, cc10)
  1059. FNMSUB (aa3, cc07, cc11, cc11)
  1060. FNMSUB (aa3, cc08, cc12, cc12)
  1061. FNMSUB (aa4, cc07, cc13, cc13)
  1062. FNMSUB (aa4, cc08, cc14, cc14)
  1063. FNMSUB (bb1, cc07, cc15, cc15)
  1064. FNMSUB (bb1, cc08, cc16, cc16)
  /* Pair 4 (c09/c10): BO[36] scales, BO[37..39] eliminate. */
  1065. LDF [BO + 36 * SIZE], a1
  1066. LDF [BO + 37 * SIZE], a2
  1067. LDF [BO + 38 * SIZE], a3
  1068. LDF [BO + 39 * SIZE], a4
  1069. FMUL a1, c09, c09
  1070. FMUL a1, c10, c10
  1071. FNMSUB (aa2, cc09, cc11, cc11)
  1072. FNMSUB (aa2, cc10, cc12, cc12)
  1073. FNMSUB (aa3, cc09, cc13, cc13)
  1074. FNMSUB (aa3, cc10, cc14, cc14)
  1075. FNMSUB (aa4, cc09, cc15, cc15)
  1076. FNMSUB (aa4, cc10, cc16, cc16)
  /* Pair 5 (c11/c12): BO[45] scales, BO[46..47] eliminate. */
  1077. LDF [BO + 45 * SIZE], a1
  1078. LDF [BO + 46 * SIZE], a2
  1079. LDF [BO + 47 * SIZE], a3
  1080. FMUL a1, c11, c11
  1081. FMUL a1, c12, c12
  1082. FNMSUB (aa2, cc11, cc13, cc13)
  1083. FNMSUB (aa2, cc12, cc14, cc14)
  1084. FNMSUB (aa3, cc11, cc15, cc15)
  1085. FNMSUB (aa3, cc12, cc16, cc16)
  /* Pair 6 (c13/c14): BO[54] scales, BO[55] eliminates. */
  1086. LDF [BO + 54 * SIZE], a1
  1087. LDF [BO + 55 * SIZE], a2
  1088. FMUL a1, c13, c13
  1089. FMUL a1, c14, c14
  1090. FNMSUB (aa2, cc13, cc15, cc15)
  1091. FNMSUB (aa2, cc14, cc16, cc16)
  /* Pair 7 (c15/c16): only the scale by BO[63] remains. */
  1092. LDF [BO + 63 * SIZE], a1
  1093. FMUL a1, c15, c15
  1094. FMUL a1, c16, c16
  1095. #endif
  /*
   * RT solve for the 2-row tile: the mirror image of the RN case, walking the
   * accumulator pairs in descending order (c15/c16 first, c01/c02 last).
   * For pair j (7..0):
   *   - the pair is scaled by BO[8*j + j];
   *   - BO[8*j + j-1 .. 8*j + 0] are used with FNMSUB to update every earlier
   *     pair from the freshly scaled one.
   * NOTE(review): scaling by the diagonal entry (rather than dividing)
   * suggests the packed B stores reciprocal diagonals -- confirm.  FNMSUB
   * (x, y, z, w) is presumably w = z - x * y (macro defined elsewhere).
   */
  1096. #ifdef RT
  /* Pair 7 (c15/c16): BO[63] scales, BO[62..56] eliminate. */
  1097. LDF [BO + 63 * SIZE], a1
  1098. LDF [BO + 62 * SIZE], a2
  1099. LDF [BO + 61 * SIZE], a3
  1100. LDF [BO + 60 * SIZE], a4
  1101. LDF [BO + 59 * SIZE], b1
  1102. LDF [BO + 58 * SIZE], b2
  1103. LDF [BO + 57 * SIZE], b3
  1104. LDF [BO + 56 * SIZE], b4
  1105. FMUL a1, c16, c16
  1106. FMUL a1, c15, c15
  1107. FNMSUB (aa2, cc16, cc14, cc14)
  1108. FNMSUB (aa2, cc15, cc13, cc13)
  1109. FNMSUB (aa3, cc16, cc12, cc12)
  1110. FNMSUB (aa3, cc15, cc11, cc11)
  1111. FNMSUB (aa4, cc16, cc10, cc10)
  1112. FNMSUB (aa4, cc15, cc09, cc09)
  1113. FNMSUB (bb1, cc16, cc08, cc08)
  1114. FNMSUB (bb1, cc15, cc07, cc07)
  1115. FNMSUB (bb2, cc16, cc06, cc06)
  1116. FNMSUB (bb2, cc15, cc05, cc05)
  1117. FNMSUB (bb3, cc16, cc04, cc04)
  1118. FNMSUB (bb3, cc15, cc03, cc03)
  1119. FNMSUB (bb4, cc16, cc02, cc02)
  1120. FNMSUB (bb4, cc15, cc01, cc01)
  /* Pair 6 (c13/c14): BO[54] scales, BO[53..48] eliminate. */
  1121. LDF [BO + 54 * SIZE], a1
  1122. LDF [BO + 53 * SIZE], a2
  1123. LDF [BO + 52 * SIZE], a3
  1124. LDF [BO + 51 * SIZE], a4
  1125. LDF [BO + 50 * SIZE], b1
  1126. LDF [BO + 49 * SIZE], b2
  1127. LDF [BO + 48 * SIZE], b3
  1128. FMUL a1, c14, c14
  1129. FMUL a1, c13, c13
  1130. FNMSUB (aa2, cc14, cc12, cc12)
  1131. FNMSUB (aa2, cc13, cc11, cc11)
  1132. FNMSUB (aa3, cc14, cc10, cc10)
  1133. FNMSUB (aa3, cc13, cc09, cc09)
  1134. FNMSUB (aa4, cc14, cc08, cc08)
  1135. FNMSUB (aa4, cc13, cc07, cc07)
  1136. FNMSUB (bb1, cc14, cc06, cc06)
  1137. FNMSUB (bb1, cc13, cc05, cc05)
  1138. FNMSUB (bb2, cc14, cc04, cc04)
  1139. FNMSUB (bb2, cc13, cc03, cc03)
  1140. FNMSUB (bb3, cc14, cc02, cc02)
  1141. FNMSUB (bb3, cc13, cc01, cc01)
  /* Pair 5 (c11/c12): BO[45] scales, BO[44..40] eliminate. */
  1142. LDF [BO + 45 * SIZE], a1
  1143. LDF [BO + 44 * SIZE], a2
  1144. LDF [BO + 43 * SIZE], a3
  1145. LDF [BO + 42 * SIZE], a4
  1146. LDF [BO + 41 * SIZE], b1
  1147. LDF [BO + 40 * SIZE], b2
  1148. FMUL a1, c12, c12
  1149. FMUL a1, c11, c11
  1150. FNMSUB (aa2, cc12, cc10, cc10)
  1151. FNMSUB (aa2, cc11, cc09, cc09)
  1152. FNMSUB (aa3, cc12, cc08, cc08)
  1153. FNMSUB (aa3, cc11, cc07, cc07)
  1154. FNMSUB (aa4, cc12, cc06, cc06)
  1155. FNMSUB (aa4, cc11, cc05, cc05)
  1156. FNMSUB (bb1, cc12, cc04, cc04)
  1157. FNMSUB (bb1, cc11, cc03, cc03)
  1158. FNMSUB (bb2, cc12, cc02, cc02)
  1159. FNMSUB (bb2, cc11, cc01, cc01)
  /* Pair 4 (c09/c10): BO[36] scales, BO[35..32] eliminate. */
  1160. LDF [BO + 36 * SIZE], a1
  1161. LDF [BO + 35 * SIZE], a2
  1162. LDF [BO + 34 * SIZE], a3
  1163. LDF [BO + 33 * SIZE], a4
  1164. LDF [BO + 32 * SIZE], b1
  1165. FMUL a1, c10, c10
  1166. FMUL a1, c09, c09
  1167. FNMSUB (aa2, cc10, cc08, cc08)
  1168. FNMSUB (aa2, cc09, cc07, cc07)
  1169. FNMSUB (aa3, cc10, cc06, cc06)
  1170. FNMSUB (aa3, cc09, cc05, cc05)
  1171. FNMSUB (aa4, cc10, cc04, cc04)
  1172. FNMSUB (aa4, cc09, cc03, cc03)
  1173. FNMSUB (bb1, cc10, cc02, cc02)
  1174. FNMSUB (bb1, cc09, cc01, cc01)
  /* Pair 3 (c07/c08): BO[27] scales, BO[26..24] eliminate. */
  1175. LDF [BO + 27 * SIZE], a1
  1176. LDF [BO + 26 * SIZE], a2
  1177. LDF [BO + 25 * SIZE], a3
  1178. LDF [BO + 24 * SIZE], a4
  1179. FMUL a1, c08, c08
  1180. FMUL a1, c07, c07
  1181. FNMSUB (aa2, cc08, cc06, cc06)
  1182. FNMSUB (aa2, cc07, cc05, cc05)
  1183. FNMSUB (aa3, cc08, cc04, cc04)
  1184. FNMSUB (aa3, cc07, cc03, cc03)
  1185. FNMSUB (aa4, cc08, cc02, cc02)
  1186. FNMSUB (aa4, cc07, cc01, cc01)
  /* Pair 2 (c05/c06): BO[18] scales, BO[17..16] eliminate. */
  1187. LDF [BO + 18 * SIZE], a1
  1188. LDF [BO + 17 * SIZE], a2
  1189. LDF [BO + 16 * SIZE], a3
  1190. FMUL a1, c06, c06
  1191. FMUL a1, c05, c05
  1192. FNMSUB (aa2, cc06, cc04, cc04)
  1193. FNMSUB (aa2, cc05, cc03, cc03)
  1194. FNMSUB (aa3, cc06, cc02, cc02)
  1195. FNMSUB (aa3, cc05, cc01, cc01)
  /* Pair 1 (c03/c04): BO[9] scales, BO[8] eliminates. */
  1196. LDF [BO + 9 * SIZE], a1
  1197. LDF [BO + 8 * SIZE], a2
  1198. FMUL a1, c04, c04
  1199. FMUL a1, c03, c03
  1200. FNMSUB (aa2, cc04, cc02, cc02)
  1201. FNMSUB (aa2, cc03, cc01, cc01)
  /* Pair 0 (c01/c02): only the scale by BO[0] remains. */
  1202. LDF [BO + 0 * SIZE], a1
  1203. FMUL a1, c02, c02
  1204. FMUL a1, c01, c01
  1205. #endif
  /*
   * Write-back for the 2x8 tile, followed by the pointer bookkeeping for the
   * next tile.
   * LN builds step the eight C column pointers back by two elements before
   * storing; all other builds step them forward by two elements after
   * storing (see the #ifndef LN block below).
   */
  1206. #ifdef LN
  1207. add C1, -2 * SIZE, C1
  1208. add C2, -2 * SIZE, C2
  1209. add C3, -2 * SIZE, C3
  1210. add C4, -2 * SIZE, C4
  1211. add C5, -2 * SIZE, C5
  1212. add C6, -2 * SIZE, C6
  1213. add C7, -2 * SIZE, C7
  1214. add C8, -2 * SIZE, C8
  1215. #endif
  /*
   * Store the solved values back into the packed buffer:
   * LN/LT write BO[0..7] from c01,c03,..,c15 and BO[8..15] from c02,..,c16;
   * RN/RT write AO[0..15] from c01..c16 in order.
   */
  1216. #if defined(LN) || defined(LT)
  1217. STF c01, [BO + 0 * SIZE]
  1218. STF c03, [BO + 1 * SIZE]
  1219. STF c05, [BO + 2 * SIZE]
  1220. STF c07, [BO + 3 * SIZE]
  1221. STF c09, [BO + 4 * SIZE]
  1222. STF c11, [BO + 5 * SIZE]
  1223. STF c13, [BO + 6 * SIZE]
  1224. STF c15, [BO + 7 * SIZE]
  1225. STF c02, [BO + 8 * SIZE]
  1226. STF c04, [BO + 9 * SIZE]
  1227. STF c06, [BO + 10 * SIZE]
  1228. STF c08, [BO + 11 * SIZE]
  1229. STF c10, [BO + 12 * SIZE]
  1230. STF c12, [BO + 13 * SIZE]
  1231. STF c14, [BO + 14 * SIZE]
  1232. STF c16, [BO + 15 * SIZE]
  1233. #else
  1234. STF c01, [AO + 0 * SIZE]
  1235. STF c02, [AO + 1 * SIZE]
  1236. STF c03, [AO + 2 * SIZE]
  1237. STF c04, [AO + 3 * SIZE]
  1238. STF c05, [AO + 4 * SIZE]
  1239. STF c06, [AO + 5 * SIZE]
  1240. STF c07, [AO + 6 * SIZE]
  1241. STF c08, [AO + 7 * SIZE]
  1242. STF c09, [AO + 8 * SIZE]
  1243. STF c10, [AO + 9 * SIZE]
  1244. STF c11, [AO + 10 * SIZE]
  1245. STF c12, [AO + 11 * SIZE]
  1246. STF c13, [AO + 12 * SIZE]
  1247. STF c14, [AO + 13 * SIZE]
  1248. STF c15, [AO + 14 * SIZE]
  1249. STF c16, [AO + 15 * SIZE]
  1250. #endif
  /* Store the tile to the output: each of C1..C8 receives two consecutive
     elements (accumulator pairs c01/c02 .. c15/c16). */
  1251. STF c01, [C1 + 0 * SIZE]
  1252. STF c02, [C1 + 1 * SIZE]
  1253. STF c03, [C2 + 0 * SIZE]
  1254. STF c04, [C2 + 1 * SIZE]
  1255. STF c05, [C3 + 0 * SIZE]
  1256. STF c06, [C3 + 1 * SIZE]
  1257. STF c07, [C4 + 0 * SIZE]
  1258. STF c08, [C4 + 1 * SIZE]
  1259. STF c09, [C5 + 0 * SIZE]
  1260. STF c10, [C5 + 1 * SIZE]
  1261. STF c11, [C6 + 0 * SIZE]
  1262. STF c12, [C6 + 1 * SIZE]
  1263. STF c13, [C7 + 0 * SIZE]
  1264. STF c14, [C7 + 1 * SIZE]
  1265. STF c15, [C8 + 0 * SIZE]
  1266. STF c16, [C8 + 1 * SIZE]
  1267. #ifndef LN
  1268. add C1, 2 * SIZE, C1
  1269. add C2, 2 * SIZE, C2
  1270. add C3, 2 * SIZE, C3
  1271. add C4, 2 * SIZE, C4
  1272. add C5, 2 * SIZE, C5
  1273. add C6, 2 * SIZE, C6
  1274. add C7, 2 * SIZE, C7
  1275. add C8, 2 * SIZE, C8
  1276. #endif
  /* RT: move AORIG forward by K * 2 elements (assuming BASE_SHIFT is log2 of
     the element size -- defined outside this chunk). */
  1277. #ifdef RT
  1278. sll K, BASE_SHIFT + 1, TEMP1
  1279. add AORIG, TEMP1, AORIG
  1280. #endif
  /* LT/RN: skip the remaining (K - KK) k-steps of the packed panels:
     AO += (K - KK) * 2 elements, BO += (K - KK) * 8 elements. */
  1281. #if defined(LT) || defined(RN)
  1282. sub K, KK, TEMP1
  1283. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1284. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1285. add AO, TEMP2, AO
  1286. add BO, TEMP1, BO
  1287. #endif
  /* KK moves by the two rows just processed: +2 for LT, -2 for LN. */
  1288. #ifdef LT
  1289. add KK, 2, KK
  1290. #endif
  1291. #ifdef LN
  1292. sub KK, 2, KK
  1293. #endif
  /* One 2-row tile done: decrement I and loop back to .LL12 (outside this
     chunk) while I > 0. */
  1294. add I, -1, I
  1295. cmp I, 0
  1296. bg,pt %icc, .LL12
  1297. nop
  1298. .align 4
  /*
   * .LL20: handle the single leftover row when M is odd.
   * I = M & 1; if it is zero, branch to .LL29 (outside this chunk).
   */
  1299. .LL20:
  1300. and M, 1, I
  1301. cmp I, 0
  1302. ble,pn %icc, .LL29
  1303. nop
  /*
   * Operand pointer setup.
   * LT/RN: BO restarts at B (AO is left where the previous code put it).
   * Otherwise: LN first steps AORIG back by K elements; then
   *   AO = AORIG + KK elements,  BO = B + KK * 8 elements.
   * NOTE(review): element scaling assumes BASE_SHIFT is log2 of the element
   * size in bytes; it is defined outside this chunk.
   */
  1304. #if defined(LT) || defined(RN)
  1305. mov B, BO
  1306. #else
  1307. #ifdef LN
  1308. sll K, BASE_SHIFT + 0, TEMP1
  1309. sub AORIG, TEMP1, AORIG
  1310. #endif
  1311. sll KK, BASE_SHIFT + 0, TEMP1
  1312. sll KK, BASE_SHIFT + 3, TEMP2
  1313. add AORIG, TEMP1, AO
  1314. add B, TEMP2, BO
  1315. #endif
  /*
   * Preload a1..a4 from AO[0..3] and b1..b8 from BO[0..7], interleaved with
   * FCLR on the eight odd-numbered accumulators cc01..cc15 (the only ones
   * this single-row path uses).  FCLR is a macro defined outside this chunk;
   * presumably it zeroes its argument.
   */
  1316. LDF [AO + 0 * SIZE], a1
  1317. LDF [AO + 1 * SIZE], a2
  1318. LDF [AO + 2 * SIZE], a3
  1319. LDF [AO + 3 * SIZE], a4
  1320. LDF [BO + 0 * SIZE], b1
  1321. FCLR (cc01)
  1322. LDF [BO + 1 * SIZE], b2
  1323. FCLR (cc03)
  1324. LDF [BO + 2 * SIZE], b3
  1325. FCLR (cc05)
  1326. LDF [BO + 3 * SIZE], b4
  1327. FCLR (cc07)
  1328. LDF [BO + 4 * SIZE], b5
  1329. FCLR (cc09)
  1330. LDF [BO + 5 * SIZE], b6
  1331. FCLR (cc11)
  1332. LDF [BO + 6 * SIZE], b7
  1333. FCLR (cc13)
  1334. LDF [BO + 7 * SIZE], b8
  1335. FCLR (cc15)
  /* L = number of 4-step passes: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  1336. #if defined(LT) || defined(RN)
  1337. sra KK, 2, L
  1338. #else
  1339. sub K, KK, L
  1340. sra L, 2, L
  1341. #endif
  1342. cmp L, 0
  1343. ble,pn %icc, .LL25
  /* Branch delay slot (not annulled): b9 = BO[8] is loaded on both paths. */
  1344. LDF [BO + 8 * SIZE], b9
  1345. .align 4
  /*
   * .LL23: main loop for the single-row case, four k-steps per pass.
   * Step 1 uses aa1, step 2 aa2, step 3 aa3, step 4 aa4, each against eight B
   * operands accumulated into cc01,cc03,..,cc15.  The first B operand
   * alternates between bb1 (steps 1 and 3) and bb9 (steps 2 and 4), which
   * lets each be reloaded two steps ahead of its next use.
   * Per pass AO advances by 4 elements and BO by 32 elements.
   * NOTE(review): FMADD (x, y, z, w) is presumably w = x * y + z; the macro
   * and the aaN/bbN/ccN aliases are defined outside this chunk.
   */
  1346. .LL23:
  1347. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1348. add L, -1, L
  /* Step 1: aa1 with bb1..bb8; reloads b1 from BO[16] and b2..b8 from BO[9..15]. */
  1349. FMADD (aa1, bb1, cc01, cc01)
  1350. LDF [BO + 16 * SIZE], b1
  1351. FMADD (aa1, bb2, cc03, cc03)
  1352. LDF [BO + 9 * SIZE], b2
  1353. FMADD (aa1, bb3, cc05, cc05)
  1354. LDF [BO + 10 * SIZE], b3
  1355. FMADD (aa1, bb4, cc07, cc07)
  1356. LDF [BO + 11 * SIZE], b4
  1357. FMADD (aa1, bb5, cc09, cc09)
  1358. LDF [BO + 12 * SIZE], b5
  1359. FMADD (aa1, bb6, cc11, cc11)
  1360. LDF [BO + 13 * SIZE], b6
  1361. FMADD (aa1, bb7, cc13, cc13)
  1362. LDF [BO + 14 * SIZE], b7
  1363. FMADD (aa1, bb8, cc15, cc15)
  1364. LDF [BO + 15 * SIZE], b8
  /* Step 2: aa2 with bb9, bb2..bb8; reloads b9 from BO[24] and b2..b8 from BO[17..23]. */
  1365. FMADD (aa2, bb9, cc01, cc01)
  1366. LDF [BO + 24 * SIZE], b9
  1367. FMADD (aa2, bb2, cc03, cc03)
  1368. LDF [BO + 17 * SIZE], b2
  1369. FMADD (aa2, bb3, cc05, cc05)
  1370. LDF [BO + 18 * SIZE], b3
  1371. FMADD (aa2, bb4, cc07, cc07)
  1372. LDF [BO + 19 * SIZE], b4
  1373. FMADD (aa2, bb5, cc09, cc09)
  1374. LDF [BO + 20 * SIZE], b5
  1375. FMADD (aa2, bb6, cc11, cc11)
  1376. LDF [BO + 21 * SIZE], b6
  1377. FMADD (aa2, bb7, cc13, cc13)
  1378. LDF [BO + 22 * SIZE], b7
  1379. FMADD (aa2, bb8, cc15, cc15)
  1380. LDF [BO + 23 * SIZE], b8
  /* a1/a2 for the next pass come from AO[4..5] (AO has not moved yet). */
  1381. LDF [AO + 4 * SIZE], a1
  1382. LDF [AO + 5 * SIZE], a2
  /* Step 3: aa3 with bb1..bb8; reloads b1 from BO[32] and b2..b8 from BO[25..31]. */
  1383. FMADD (aa3, bb1, cc01, cc01)
  1384. LDF [BO + 32 * SIZE], b1
  1385. FMADD (aa3, bb2, cc03, cc03)
  1386. LDF [BO + 25 * SIZE], b2
  1387. FMADD (aa3, bb3, cc05, cc05)
  1388. LDF [BO + 26 * SIZE], b3
  1389. FMADD (aa3, bb4, cc07, cc07)
  1390. LDF [BO + 27 * SIZE], b4
  1391. FMADD (aa3, bb5, cc09, cc09)
  1392. LDF [BO + 28 * SIZE], b5
  1393. FMADD (aa3, bb6, cc11, cc11)
  1394. LDF [BO + 29 * SIZE], b6
  1395. FMADD (aa3, bb7, cc13, cc13)
  1396. LDF [BO + 30 * SIZE], b7
  1397. FMADD (aa3, bb8, cc15, cc15)
  1398. LDF [BO + 31 * SIZE], b8
  /* Step 4: aa4 with bb9, bb2..bb8; reloads b9 from BO[40] and b2..b8 from BO[33..39]. */
  1399. FMADD (aa4, bb9, cc01, cc01)
  1400. LDF [BO + 40 * SIZE], b9
  1401. FMADD (aa4, bb2, cc03, cc03)
  1402. LDF [BO + 33 * SIZE], b2
  1403. FMADD (aa4, bb3, cc05, cc05)
  1404. LDF [BO + 34 * SIZE], b3
  1405. FMADD (aa4, bb4, cc07, cc07)
  1406. LDF [BO + 35 * SIZE], b4
  1407. FMADD (aa4, bb5, cc09, cc09)
  1408. LDF [BO + 36 * SIZE], b5
  1409. FMADD (aa4, bb6, cc11, cc11)
  1410. LDF [BO + 37 * SIZE], b6
  1411. FMADD (aa4, bb7, cc13, cc13)
  1412. LDF [BO + 38 * SIZE], b7
  1413. FMADD (aa4, bb8, cc15, cc15)
  1414. LDF [BO + 39 * SIZE], b8
  1415. LDF [AO + 6 * SIZE], a3
  1416. LDF [AO + 7 * SIZE], a4
  1417. add AO, 4 * SIZE, AO
  1418. cmp L, 0
  1419. bg,pt %icc, .LL23
  /* Branch delay slot: BO is advanced by 32 elements on every pass,
     including the last one. */
  1420. add BO, 32 * SIZE, BO
  1421. .align 4
  /*
   * .LL25: k-steps left over after the 4-step loop for the single row.
   * L = (KK & 3) for LT/RN builds, ((K - KK) & 3) otherwise.
   * If L <= 0, branch to .LL28; the delay slot holds only a nop.
   */
  1422. .LL25:
  1423. #if defined(LT) || defined(RN)
  1424. and KK, 3, L
  1425. #else
  1426. sub K, KK, L
  1427. and L, 3, L
  1428. #endif
  1429. cmp L, 0
  1430. ble,a,pn %icc, .LL28
  1431. nop
  1432. .align 4
  /*
   * .LL27: tail loop for the single row, one k-step per pass.
   * aa1 is combined with bb1..bb8 into cc01,cc03,..,cc15; each B operand is
   * reloaded from BO[8..15] right after its use, and a1 is reloaded from
   * AO[1] before AO is advanced by one element.
   * NOTE(review): FMADD (x, y, z, w) is presumably w = x * y + z (macro
   * defined outside this chunk).
   */
  1433. .LL27:
  1434. FMADD (aa1, bb1, cc01, cc01)
  1435. LDF [BO + 8 * SIZE], b1
  1436. FMADD (aa1, bb2, cc03, cc03)
  1437. LDF [BO + 9 * SIZE], b2
  1438. FMADD (aa1, bb3, cc05, cc05)
  1439. LDF [BO + 10 * SIZE], b3
  1440. FMADD (aa1, bb4, cc07, cc07)
  1441. LDF [BO + 11 * SIZE], b4
  1442. FMADD (aa1, bb5, cc09, cc09)
  1443. LDF [BO + 12 * SIZE], b5
  1444. FMADD (aa1, bb6, cc11, cc11)
  1445. LDF [BO + 13 * SIZE], b6
  1446. FMADD (aa1, bb7, cc13, cc13)
  1447. LDF [BO + 14 * SIZE], b7
  1448. FMADD (aa1, bb8, cc15, cc15)
  1449. LDF [BO + 15 * SIZE], b8
  1450. LDF [AO + 1 * SIZE], a1
  1451. add AO, 1 * SIZE, AO
  1452. add L, -1, L
  1453. cmp L, 0
  1454. bg,pt %icc, .LL27
  /* Branch delay slot: BO advances by 8 elements on every pass. */
  1455. add BO, 8 * SIZE, BO
  1456. .align 4
  /*
   * .LL28: accumulation for the 1x8 tile is finished.
   * LN/RT builds first reposition the operand pointers:
   *   AO = AORIG + (KK - n) elements,  BO = B + (KK - n) * 8 elements,
   * with n = 1 for LN and n = 8 for RT.
   * NOTE(review): this reading assumes BASE_SHIFT is log2 of the element size
   * in bytes; it is defined outside this chunk -- confirm.
   */
  1457. .LL28:
  1458. #if defined(LN) || defined(RT)
  1459. #ifdef LN
  1460. sub KK, 1, TEMP1
  1461. #else
  1462. sub KK, 8, TEMP1
  1463. #endif
  1464. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1465. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1466. add AORIG, TEMP2, AO
  1467. add B, TEMP1, BO
  1468. #endif
  /*
   * Fold in the packed right-hand side: each of c01,c03,..,c15 becomes
   * (loaded value - accumulator), assuming FSUB follows the SPARC
   * "rs1, rs2, rd" operand order (FSUB is a macro defined elsewhere).
   * The eight values come from BO[0..7] for LN/LT and from AO[0..7] for
   * RN/RT; the accumulator pairing is the same in both branches.
   */
  1469. #if defined(LN) || defined(LT)
  1470. LDF [BO + 0 * SIZE], a1
  1471. LDF [BO + 1 * SIZE], a2
  1472. LDF [BO + 2 * SIZE], a3
  1473. LDF [BO + 3 * SIZE], a4
  1474. LDF [BO + 4 * SIZE], b1
  1475. LDF [BO + 5 * SIZE], b2
  1476. LDF [BO + 6 * SIZE], b3
  1477. LDF [BO + 7 * SIZE], b4
  1478. FSUB a1, c01, c01
  1479. FSUB a2, c03, c03
  1480. FSUB a3, c05, c05
  1481. FSUB a4, c07, c07
  1482. FSUB b1, c09, c09
  1483. FSUB b2, c11, c11
  1484. FSUB b3, c13, c13
  1485. FSUB b4, c15, c15
  1486. #else
  1487. LDF [AO + 0 * SIZE], a1
  1488. LDF [AO + 1 * SIZE], a2
  1489. LDF [AO + 2 * SIZE], a3
  1490. LDF [AO + 3 * SIZE], a4
  1491. LDF [AO + 4 * SIZE], b1
  1492. LDF [AO + 5 * SIZE], b2
  1493. LDF [AO + 6 * SIZE], b3
  1494. LDF [AO + 7 * SIZE], b4
  1495. FSUB a1, c01, c01
  1496. FSUB a2, c03, c03
  1497. FSUB a3, c05, c05
  1498. FSUB a4, c07, c07
  1499. FSUB b1, c09, c09
  1500. FSUB b2, c11, c11
  1501. FSUB b3, c13, c13
  1502. FSUB b4, c15, c15
  1503. #endif
  /*
   * LN/LT solve for the single row: all eight accumulators are scaled by
   * AO[0].
   * NOTE(review): multiplying rather than dividing suggests the packed A
   * stores the reciprocal of the diagonal entry -- confirm in the packing
   * routine.
   */
  1504. #if defined(LN) || defined(LT)
  1505. LDF [AO + 0 * SIZE], a1
  1506. FMUL a1, c01, c01
  1507. FMUL a1, c03, c03
  1508. FMUL a1, c05, c05
  1509. FMUL a1, c07, c07
  1510. FMUL a1, c09, c09
  1511. FMUL a1, c11, c11
  1512. FMUL a1, c13, c13
  1513. FMUL a1, c15, c15
  1514. #endif
  /*
   * RN solve for the single row: walks the eight accumulators
   * c01,c03,..,c15 in ascending order.  For accumulator j (0..7):
   *   - it is scaled by BO[8*j + j];
   *   - BO[8*j + j+1 .. 8*j + 7] are used with FNMSUB to update every later
   *     accumulator from the freshly scaled one.
   * NOTE(review): scaling by the diagonal entry (rather than dividing)
   * suggests the packed B stores reciprocal diagonals -- confirm.  FNMSUB
   * (x, y, z, w) is presumably w = z - x * y (macro defined elsewhere).
   */
  1515. #ifdef RN
  /* j = 0 (c01): BO[0] scales, BO[1..7] eliminate. */
  1516. LDF [BO + 0 * SIZE], a1
  1517. LDF [BO + 1 * SIZE], a2
  1518. LDF [BO + 2 * SIZE], a3
  1519. LDF [BO + 3 * SIZE], a4
  1520. LDF [BO + 4 * SIZE], b1
  1521. LDF [BO + 5 * SIZE], b2
  1522. LDF [BO + 6 * SIZE], b3
  1523. LDF [BO + 7 * SIZE], b4
  1524. FMUL a1, c01, c01
  1525. FNMSUB (aa2, cc01, cc03, cc03)
  1526. FNMSUB (aa3, cc01, cc05, cc05)
  1527. FNMSUB (aa4, cc01, cc07, cc07)
  1528. FNMSUB (bb1, cc01, cc09, cc09)
  1529. FNMSUB (bb2, cc01, cc11, cc11)
  1530. FNMSUB (bb3, cc01, cc13, cc13)
  1531. FNMSUB (bb4, cc01, cc15, cc15)
  /* j = 1 (c03): BO[9] scales, BO[10..15] eliminate. */
  1532. LDF [BO + 9 * SIZE], a1
  1533. LDF [BO + 10 * SIZE], a2
  1534. LDF [BO + 11 * SIZE], a3
  1535. LDF [BO + 12 * SIZE], a4
  1536. LDF [BO + 13 * SIZE], b1
  1537. LDF [BO + 14 * SIZE], b2
  1538. LDF [BO + 15 * SIZE], b3
  1539. FMUL a1, c03, c03
  1540. FNMSUB (aa2, cc03, cc05, cc05)
  1541. FNMSUB (aa3, cc03, cc07, cc07)
  1542. FNMSUB (aa4, cc03, cc09, cc09)
  1543. FNMSUB (bb1, cc03, cc11, cc11)
  1544. FNMSUB (bb2, cc03, cc13, cc13)
  1545. FNMSUB (bb3, cc03, cc15, cc15)
  /* j = 2 (c05): BO[18] scales, BO[19..23] eliminate. */
  1546. LDF [BO + 18 * SIZE], a1
  1547. LDF [BO + 19 * SIZE], a2
  1548. LDF [BO + 20 * SIZE], a3
  1549. LDF [BO + 21 * SIZE], a4
  1550. LDF [BO + 22 * SIZE], b1
  1551. LDF [BO + 23 * SIZE], b2
  1552. FMUL a1, c05, c05
  1553. FNMSUB (aa2, cc05, cc07, cc07)
  1554. FNMSUB (aa3, cc05, cc09, cc09)
  1555. FNMSUB (aa4, cc05, cc11, cc11)
  1556. FNMSUB (bb1, cc05, cc13, cc13)
  1557. FNMSUB (bb2, cc05, cc15, cc15)
  /* j = 3 (c07): BO[27] scales, BO[28..31] eliminate. */
  1558. LDF [BO + 27 * SIZE], a1
  1559. LDF [BO + 28 * SIZE], a2
  1560. LDF [BO + 29 * SIZE], a3
  1561. LDF [BO + 30 * SIZE], a4
  1562. LDF [BO + 31 * SIZE], b1
  1563. FMUL a1, c07, c07
  1564. FNMSUB (aa2, cc07, cc09, cc09)
  1565. FNMSUB (aa3, cc07, cc11, cc11)
  1566. FNMSUB (aa4, cc07, cc13, cc13)
  1567. FNMSUB (bb1, cc07, cc15, cc15)
  /* j = 4 (c09): BO[36] scales, BO[37..39] eliminate. */
  1568. LDF [BO + 36 * SIZE], a1
  1569. LDF [BO + 37 * SIZE], a2
  1570. LDF [BO + 38 * SIZE], a3
  1571. LDF [BO + 39 * SIZE], a4
  1572. FMUL a1, c09, c09
  1573. FNMSUB (aa2, cc09, cc11, cc11)
  1574. FNMSUB (aa3, cc09, cc13, cc13)
  1575. FNMSUB (aa4, cc09, cc15, cc15)
  /* j = 5 (c11): BO[45] scales, BO[46..47] eliminate. */
  1576. LDF [BO + 45 * SIZE], a1
  1577. LDF [BO + 46 * SIZE], a2
  1578. LDF [BO + 47 * SIZE], a3
  1579. FMUL a1, c11, c11
  1580. FNMSUB (aa2, cc11, cc13, cc13)
  1581. FNMSUB (aa3, cc11, cc15, cc15)
  /* j = 6 (c13): BO[54] scales, BO[55] eliminates. */
  1582. LDF [BO + 54 * SIZE], a1
  1583. LDF [BO + 55 * SIZE], a2
  1584. FMUL a1, c13, c13
  1585. FNMSUB (aa2, cc13, cc15, cc15)
  /* j = 7 (c15): only the scale by BO[63] remains. */
  1586. LDF [BO + 63 * SIZE], a1
  1587. FMUL a1, c15, c15
  1588. #endif
  1589. #ifdef RT
  1590. LDF [BO + 63 * SIZE], a1
  1591. LDF [BO + 62 * SIZE], a2
  1592. LDF [BO + 61 * SIZE], a3
  1593. LDF [BO + 60 * SIZE], a4
  1594. LDF [BO + 59 * SIZE], b1
  1595. LDF [BO + 58 * SIZE], b2
  1596. LDF [BO + 57 * SIZE], b3
  1597. LDF [BO + 56 * SIZE], b4
  1598. FMUL a1, c15, c15
  1599. FNMSUB (aa2, cc15, cc13, cc13)
  1600. FNMSUB (aa3, cc15, cc11, cc11)
  1601. FNMSUB (aa4, cc15, cc09, cc09)
  1602. FNMSUB (bb1, cc15, cc07, cc07)
  1603. FNMSUB (bb2, cc15, cc05, cc05)
  1604. FNMSUB (bb3, cc15, cc03, cc03)
  1605. FNMSUB (bb4, cc15, cc01, cc01)
  1606. LDF [BO + 54 * SIZE], a1
  1607. LDF [BO + 53 * SIZE], a2
  1608. LDF [BO + 52 * SIZE], a3
  1609. LDF [BO + 51 * SIZE], a4
  1610. LDF [BO + 50 * SIZE], b1
  1611. LDF [BO + 49 * SIZE], b2
  1612. LDF [BO + 48 * SIZE], b3
  1613. FMUL a1, c13, c13
  1614. FNMSUB (aa2, cc13, cc11, cc11)
  1615. FNMSUB (aa3, cc13, cc09, cc09)
  1616. FNMSUB (aa4, cc13, cc07, cc07)
  1617. FNMSUB (bb1, cc13, cc05, cc05)
  1618. FNMSUB (bb2, cc13, cc03, cc03)
  1619. FNMSUB (bb3, cc13, cc01, cc01)
  1620. LDF [BO + 45 * SIZE], a1
  1621. LDF [BO + 44 * SIZE], a2
  1622. LDF [BO + 43 * SIZE], a3
  1623. LDF [BO + 42 * SIZE], a4
  1624. LDF [BO + 41 * SIZE], b1
  1625. LDF [BO + 40 * SIZE], b2
  1626. FMUL a1, c11, c11
  1627. FNMSUB (aa2, cc11, cc09, cc09)
  1628. FNMSUB (aa3, cc11, cc07, cc07)
  1629. FNMSUB (aa4, cc11, cc05, cc05)
  1630. FNMSUB (bb1, cc11, cc03, cc03)
  1631. FNMSUB (bb2, cc11, cc01, cc01)
  1632. LDF [BO + 36 * SIZE], a1
  1633. LDF [BO + 35 * SIZE], a2
  1634. LDF [BO + 34 * SIZE], a3
  1635. LDF [BO + 33 * SIZE], a4
  1636. LDF [BO + 32 * SIZE], b1
  1637. FMUL a1, c09, c09
  1638. FNMSUB (aa2, cc09, cc07, cc07)
  1639. FNMSUB (aa3, cc09, cc05, cc05)
  1640. FNMSUB (aa4, cc09, cc03, cc03)
  1641. FNMSUB (bb1, cc09, cc01, cc01)
  1642. LDF [BO + 27 * SIZE], a1
  1643. LDF [BO + 26 * SIZE], a2
  1644. LDF [BO + 25 * SIZE], a3
  1645. LDF [BO + 24 * SIZE], a4
  1646. FMUL a1, c07, c07
  1647. FNMSUB (aa2, cc07, cc05, cc05)
  1648. FNMSUB (aa3, cc07, cc03, cc03)
  1649. FNMSUB (aa4, cc07, cc01, cc01)
  1650. LDF [BO + 18 * SIZE], a1
  1651. LDF [BO + 17 * SIZE], a2
  1652. LDF [BO + 16 * SIZE], a3
  1653. FMUL a1, c05, c05
  1654. FNMSUB (aa2, cc05, cc03, cc03)
  1655. FNMSUB (aa3, cc05, cc01, cc01)
  1656. LDF [BO + 9 * SIZE], a1
  1657. LDF [BO + 8 * SIZE], a2
  1658. FMUL a1, c03, c03
  1659. FNMSUB (aa2, cc03, cc01, cc01)
  1660. LDF [BO + 0 * SIZE], a1
  1661. FMUL a1, c01, c01
  1662. #endif
  1663. #ifdef LN
  1664. add C1, -1 * SIZE, C1
  1665. add C2, -1 * SIZE, C2
  1666. add C3, -1 * SIZE, C3
  1667. add C4, -1 * SIZE, C4
  1668. add C5, -1 * SIZE, C5
  1669. add C6, -1 * SIZE, C6
  1670. add C7, -1 * SIZE, C7
  1671. add C8, -1 * SIZE, C8
  1672. #endif
  1673. #if defined(LN) || defined(LT)
  1674. STF c01, [BO + 0 * SIZE]
  1675. STF c03, [BO + 1 * SIZE]
  1676. STF c05, [BO + 2 * SIZE]
  1677. STF c07, [BO + 3 * SIZE]
  1678. STF c09, [BO + 4 * SIZE]
  1679. STF c11, [BO + 5 * SIZE]
  1680. STF c13, [BO + 6 * SIZE]
  1681. STF c15, [BO + 7 * SIZE]
  1682. #else
  1683. STF c01, [AO + 0 * SIZE]
  1684. STF c03, [AO + 1 * SIZE]
  1685. STF c05, [AO + 2 * SIZE]
  1686. STF c07, [AO + 3 * SIZE]
  1687. STF c09, [AO + 4 * SIZE]
  1688. STF c11, [AO + 5 * SIZE]
  1689. STF c13, [AO + 6 * SIZE]
  1690. STF c15, [AO + 7 * SIZE]
  1691. #endif
  1692. STF c01, [C1 + 0 * SIZE]
  1693. STF c03, [C2 + 0 * SIZE]
  1694. STF c05, [C3 + 0 * SIZE]
  1695. STF c07, [C4 + 0 * SIZE]
  1696. STF c09, [C5 + 0 * SIZE]
  1697. STF c11, [C6 + 0 * SIZE]
  1698. STF c13, [C7 + 0 * SIZE]
  1699. STF c15, [C8 + 0 * SIZE]
  1700. #ifdef RT
  1701. sll K, BASE_SHIFT + 0, TEMP1
  1702. add AORIG, TEMP1, AORIG
  1703. #endif
  1704. #if defined(LT) || defined(RN)
  1705. sub K, KK, TEMP1
  1706. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1707. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1708. add AO, TEMP2, AO
  1709. add BO, TEMP1, BO
  1710. #endif
  1711. #ifdef LT
  1712. add KK, 1, KK
  1713. #endif
  1714. #ifdef LN
  1715. sub KK, 1, KK
  1716. #endif
  1717. .align 4
  1718. .LL29:
  /* Tail of one pass of the J loop; its head (.LL11) lies outside this chunk. */
  1719. #ifdef LN
  /* LN: B += K << (BASE_SHIFT + 3) bytes. */
  1720. sll K, BASE_SHIFT + 3, TEMP1
  1721. add B, TEMP1, B
  1722. #endif
  1723. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO ended up. */
  1724. mov BO, B
  1725. #endif
  1726. #ifdef RN
  1727. add KK, 8, KK
  1728. #endif
  1729. #ifdef RT
  1730. sub KK, 8, KK
  1731. #endif
  /* J -= 1; branch back to .LL11 while J > 0 (the nop fills the branch delay slot). */
  1732. add J, -1, J
  1733. cmp J, 0
  1734. bg,pt %icc, .LL11
  1735. nop
  1736. .align 4
  1737. .LL30:
  /* Four-column section: entered only when bit 2 of N is set, otherwise skip to .LL50. */
  1738. and N, 4, J
  1739. cmp J, 0
  1740. ble,pn %icc, .LL50
  1741. nop
  1742. #ifdef RT
  /* RT: B -= K << (BASE_SHIFT + 2) bytes. */
  1743. sll K, BASE_SHIFT + 2, TEMP1
  1744. sub B, TEMP1, B
  1745. #endif
  1746. #ifndef RT
  /* Not RT: C1..C4 = C, C+LDC, C+2*LDC, C+3*LDC; C then moves to C4 + LDC. */
  1747. mov C, C1
  1748. add C, LDC, C2
  1749. add C2, LDC, C3
  1750. add C3, LDC, C4
  1751. add C4, LDC, C
  1752. #else
  /* RT: C4..C1 step downward from C by LDC each; C ends at C2 - LDC (same address as C1). */
  1753. sub C, LDC, C4
  1754. sub C4, LDC, C3
  1755. sub C3, LDC, C2
  1756. sub C2, LDC, C1
  1757. sub C2, LDC, C
  1758. #endif
  1759. #ifdef LN
  /* LN: KK starts at M + OFFSET. */
  1760. add M, OFFSET, KK
  1761. #endif
  1762. #ifdef LT
  /* LT: KK starts at OFFSET. */
  1763. mov OFFSET, KK
  1764. #endif
  1765. #if defined(LN) || defined(RT)
  /* LN/RT keep the base of A in AORIG (AO is derived from it per tile); other variants walk AO directly. */
  1766. mov A, AORIG
  1767. #else
  1768. mov A, AO
  1769. #endif
  /* I = M >> 1 two-row tiles; if there are none, go to the single-row code at .LL40. */
  1770. sra M, 1, I
  1771. cmp I, 0
  1772. ble,pn %icc, .LL40
  1773. nop
  1774. .align 4
  1775. .LL32:
  /* Set-up for one 2x4 tile: choose AO/BO, preload operands, clear accumulators, compute trip count. */
  1776. #if defined(LT) || defined(RN)
  1777. mov B, BO
  1778. #else
  1779. #ifdef LN
  /* LN: AORIG steps back by K << (BASE_SHIFT + 1) bytes before it is used. */
  1780. sll K, BASE_SHIFT + 1, TEMP1
  1781. sub AORIG, TEMP1, AORIG
  1782. #endif
  /* LN/RT: AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << (BASE_SHIFT + 2)). */
  1783. sll KK, BASE_SHIFT + 1, TEMP1
  1784. sll KK, BASE_SHIFT + 2, TEMP2
  1785. add AORIG, TEMP1, AO
  1786. add B, TEMP2, BO
  1787. #endif
  /* Preload a1,a2 from AO[0..1] and b1..b9 from BO[0..8]; the loads are interleaved with */
  /* FCLR of cc01..cc08 (FCLR is a macro defined outside this chunk; presumably it zeroes the register). */
  1788. LDF [AO + 0 * SIZE], a1
  1789. LDF [AO + 1 * SIZE], a2
  1790. LDF [BO + 0 * SIZE], b1
  1791. LDF [BO + 1 * SIZE], b2
  1792. LDF [BO + 2 * SIZE], b3
  1793. LDF [BO + 3 * SIZE], b4
  1794. LDF [BO + 4 * SIZE], b5
  1795. LDF [BO + 5 * SIZE], b6
  1796. FCLR (cc01)
  1797. LDF [BO + 6 * SIZE], b7
  1798. FCLR (cc02)
  1799. LDF [BO + 7 * SIZE], b8
  1800. FCLR (cc03)
  1801. LDF [BO + 8 * SIZE], b9
  1802. FCLR (cc04)
  /* Prefetch the four C locations that are stored to after the solve. */
  1803. prefetch [C1 + 2 * SIZE], 3
  1804. FCLR (cc05)
  1805. prefetch [C2 + 2 * SIZE], 3
  1806. FCLR (cc06)
  1807. prefetch [C3 + 2 * SIZE], 3
  1808. FCLR (cc07)
  1809. prefetch [C4 + 2 * SIZE], 3
  1810. FCLR (cc08)
  /* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  1811. #if defined(LT) || defined(RN)
  1812. sra KK, 2, L
  1813. #else
  1814. sub K, KK, L
  1815. sra L, 2, L
  1816. #endif
  /* No full group: skip the unrolled loop. */
  1817. cmp L, 0
  1818. ble,pn %icc, .LL35
  1819. nop
  1820. .align 4
  1821. .LL33:
  /* Unrolled accumulation loop for the 2x4 tile: four k-steps per iteration. */
  /* Per iteration AO advances 8 * SIZE and BO advances 16 * SIZE. */
  /* FMADD is a macro defined outside this chunk; presumably FMADD (a, b, c, d) gives d = a * b + c. */
  /* Loads are interleaved with the FMADDs to refill registers for later steps, so statement order matters. */
  /* Step 1: a1,a2 against b1..b4. */
  1822. FMADD (aa1, bb1, cc01, cc01)
  1823. LDF [AO + 2 * SIZE], a3
  1824. FMADD (aa2, bb1, cc02, cc02)
  1825. LDF [AO + 3 * SIZE], a4
  1826. FMADD (aa1, bb2, cc03, cc03)
  1827. LDF [BO + 16 * SIZE], b1
  1828. FMADD (aa2, bb2, cc04, cc04)
  1829. LDF [BO + 9 * SIZE], b2
  1830. FMADD (aa1, bb3, cc05, cc05)
  1831. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1832. FMADD (aa2, bb3, cc06, cc06)
  1833. add L, -1, L
  1834. FMADD (aa1, bb4, cc07, cc07)
  1835. LDF [BO + 10 * SIZE], b3
  1836. FMADD (aa2, bb4, cc08, cc08)
  1837. LDF [BO + 11 * SIZE], b4
  /* Step 2: a3,a4 against b5..b8. */
  1838. FMADD (aa3, bb5, cc01, cc01)
  1839. LDF [AO + 4 * SIZE], a1
  1840. FMADD (aa4, bb5, cc02, cc02)
  1841. LDF [AO + 5 * SIZE], a2
  1842. FMADD (aa3, bb6, cc03, cc03)
  1843. LDF [BO + 12 * SIZE], b5
  1844. FMADD (aa4, bb6, cc04, cc04)
  1845. LDF [BO + 13 * SIZE], b6
  1846. FMADD (aa3, bb7, cc05, cc05)
  /* Sets the condition codes consumed by the bg at the bottom; nothing in between is a cc-setting form. */
  1847. cmp L, 0
  1848. FMADD (aa4, bb7, cc06, cc06)
  /* AO moves here, so the following AO loads use offsets relative to the new value. */
  1849. add AO, 8 * SIZE, AO
  1850. FMADD (aa3, bb8, cc07, cc07)
  1851. LDF [BO + 14 * SIZE], b7
  1852. FMADD (aa4, bb8, cc08, cc08)
  1853. LDF [BO + 15 * SIZE], b8
  /* Step 3: a1,a2 against b9,b2,b3,b4 (b9 takes the role b1 had in step 1). */
  1854. FMADD (aa1, bb9, cc01, cc01)
  1855. LDF [AO - 2 * SIZE], a3
  1856. FMADD (aa2, bb9, cc02, cc02)
  1857. LDF [AO - 1 * SIZE], a4
  1858. FMADD (aa1, bb2, cc03, cc03)
  1859. LDF [BO + 24 * SIZE], b9
  1860. FMADD (aa2, bb2, cc04, cc04)
  1861. LDF [BO + 17 * SIZE], b2
  1862. FMADD (aa1, bb3, cc05, cc05)
  /* BO moves here, so the following BO loads use offsets relative to the new value. */
  1863. add BO, 16 * SIZE, BO
  1864. FMADD (aa2, bb3, cc06, cc06)
  1865. nop
  1866. FMADD (aa1, bb4, cc07, cc07)
  1867. LDF [BO + 2 * SIZE], b3
  1868. FMADD (aa2, bb4, cc08, cc08)
  1869. LDF [BO + 3 * SIZE], b4
  /* Step 4: a3,a4 against b5..b8; registers are refilled for the next iteration. */
  1870. FMADD (aa3, bb5, cc01, cc01)
  1871. LDF [AO + 0 * SIZE], a1
  1872. FMADD (aa4, bb5, cc02, cc02)
  1873. LDF [AO + 1 * SIZE], a2
  1874. FMADD (aa3, bb6, cc03, cc03)
  1875. LDF [BO + 4 * SIZE], b5
  1876. FMADD (aa4, bb6, cc04, cc04)
  1877. LDF [BO + 5 * SIZE], b6
  1878. FMADD (aa3, bb7, cc05, cc05)
  1879. nop
  1880. FMADD (aa4, bb7, cc06, cc06)
  1881. LDF [BO + 6 * SIZE], b7
  1882. FMADD (aa3, bb8, cc07, cc07)
  1883. FMADD (aa4, bb8, cc08, cc08)
  /* Loop while L > 0. The b8 load sits in the delay slot of a non-annulled branch, */
  /* so it also executes on the final pass. */
  1884. bg,pt %icc, .LL33
  1885. LDF [BO + 7 * SIZE], b8
  1886. .align 4
  1887. .LL35:
  /* Remainder count for the 2x4 tile: L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT). */
  1888. #if defined(LT) || defined(RN)
  1889. and KK, 3, L
  1890. #else
  1891. sub K, KK, L
  1892. and L, 3, L
  1893. #endif
  /* Nothing left over: go straight to the solve at .LL38. */
  1894. cmp L, 0
  1895. ble,a,pn %icc, .LL38
  1896. nop
  1897. .align 4
  1898. .LL37:
  /* Remainder loop for the 2x4 tile: one k-step per iteration, a1,a2 against b1..b4. */
  /* Each operand is reloaded only after the last FMADD that reads it. */
  1899. FMADD (aa1, bb1, cc01, cc01)
  1900. add L, -1, L
  1901. FMADD (aa2, bb1, cc02, cc02)
  1902. LDF [BO + 4 * SIZE], b1
  1903. FMADD (aa1, bb2, cc03, cc03)
  /* AO advances by two elements here; the a1/a2 loads below use the new AO. */
  1904. add AO, 2 * SIZE, AO
  1905. FMADD (aa2, bb2, cc04, cc04)
  1906. LDF [BO + 5 * SIZE], b2
  1907. FMADD (aa1, bb3, cc05, cc05)
  1908. cmp L, 0
  1909. FMADD (aa2, bb3, cc06, cc06)
  1910. LDF [BO + 6 * SIZE], b3
  1911. FMADD (aa1, bb4, cc07, cc07)
  1912. LDF [AO + 0 * SIZE], a1
  1913. FMADD (aa2, bb4, cc08, cc08)
  1914. LDF [AO + 1 * SIZE], a2
  1915. LDF [BO + 7 * SIZE], b4
  /* Loop while L > 0; BO advances by four elements in the branch delay slot. */
  1916. bg,pt %icc, .LL37
  1917. add BO, 4 * SIZE, BO
  1918. .align 4
  1919. .LL38:
  /* Solve and write-back for the 2x4 tile, then advance to the next tile. */
  /* FSUB, FMUL and FNMSUB are macros defined outside this chunk. Presumably FSUB x, c, c gives */
  /* c = x - c and FNMSUB (a, b, c, d) gives d = c - a * b; confirm against the macro definitions. */
  1920. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using TEMP1 = KK - 2 (LN) or KK - 4 (RT). */
  1921. #ifdef LN
  1922. sub KK, 2, TEMP1
  1923. #else
  1924. sub KK, 4, TEMP1
  1925. #endif
  1926. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1927. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1928. add AORIG, TEMP2, AO
  1929. add B, TEMP1, BO
  1930. #endif
  1931. #if defined(LN) || defined(LT)
  /* LN/LT: the eight values come from BO, paired with c01,c03,c05,c07 and then c02,c04,c06,c08. */
  1932. LDF [BO + 0 * SIZE], a1
  1933. LDF [BO + 1 * SIZE], a2
  1934. LDF [BO + 2 * SIZE], a3
  1935. LDF [BO + 3 * SIZE], a4
  1936. LDF [BO + 4 * SIZE], b1
  1937. LDF [BO + 5 * SIZE], b2
  1938. LDF [BO + 6 * SIZE], b3
  1939. LDF [BO + 7 * SIZE], b4
  1940. FSUB a1, c01, c01
  1941. FSUB a2, c03, c03
  1942. FSUB a3, c05, c05
  1943. FSUB a4, c07, c07
  1944. FSUB b1, c02, c02
  1945. FSUB b2, c04, c04
  1946. FSUB b3, c06, c06
  1947. FSUB b4, c08, c08
  1948. #else
  /* RN/RT: the eight values come from AO, paired with c01..c08 in order. */
  1949. LDF [AO + 0 * SIZE], a1
  1950. LDF [AO + 1 * SIZE], a2
  1951. LDF [AO + 2 * SIZE], a3
  1952. LDF [AO + 3 * SIZE], a4
  1953. LDF [AO + 4 * SIZE], b1
  1954. LDF [AO + 5 * SIZE], b2
  1955. LDF [AO + 6 * SIZE], b3
  1956. LDF [AO + 7 * SIZE], b4
  1957. FSUB a1, c01, c01
  1958. FSUB a2, c02, c02
  1959. FSUB a3, c03, c03
  1960. FSUB a4, c04, c04
  1961. FSUB b1, c05, c05
  1962. FSUB b2, c06, c06
  1963. FSUB b3, c07, c07
  1964. FSUB b4, c08, c08
  1965. #endif
  1966. #ifdef LN
  /* LN: uses AO[3], AO[2], AO[0]. Even accumulators are scaled by AO[3], folded into the odd */
  /* ones via AO[2], then the odd ones are scaled by AO[0]. */
  /* NOTE(review): scaling by FMUL suggests the diagonal entries are stored pre-inverted - confirm. */
  1967. LDF [AO + 3 * SIZE], a1
  1968. LDF [AO + 2 * SIZE], a2
  1969. LDF [AO + 0 * SIZE], a3
  1970. FMUL a1, c02, c02
  1971. FMUL a1, c04, c04
  1972. FMUL a1, c06, c06
  1973. FMUL a1, c08, c08
  1974. FNMSUB (aa2, cc02, cc01, cc01)
  1975. FNMSUB (aa2, cc04, cc03, cc03)
  1976. FNMSUB (aa2, cc06, cc05, cc05)
  1977. FNMSUB (aa2, cc08, cc07, cc07)
  1978. FMUL a3, c01, c01
  1979. FMUL a3, c03, c03
  1980. FMUL a3, c05, c05
  1981. FMUL a3, c07, c07
  1982. #endif
  1983. #ifdef LT
  /* LT: mirror image of LN using AO[0], AO[1], AO[3]; odd accumulators first, then even. */
  1984. LDF [AO + 0 * SIZE], a1
  1985. LDF [AO + 1 * SIZE], a2
  1986. LDF [AO + 3 * SIZE], a3
  1987. FMUL a1, c01, c01
  1988. FMUL a1, c03, c03
  1989. FMUL a1, c05, c05
  1990. FMUL a1, c07, c07
  1991. FNMSUB (aa2, cc01, cc02, cc02)
  1992. FNMSUB (aa2, cc03, cc04, cc04)
  1993. FNMSUB (aa2, cc05, cc06, cc06)
  1994. FNMSUB (aa2, cc07, cc08, cc08)
  1995. FMUL a3, c02, c02
  1996. FMUL a3, c04, c04
  1997. FMUL a3, c06, c06
  1998. FMUL a3, c08, c08
  1999. #endif
  2000. #ifdef RN
  /* RN: forward sweep over the pairs (c01,c02) -> (c03,c04) -> (c05,c06) -> (c07,c08), */
  /* reading BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  2001. LDF [BO + 0 * SIZE], a1
  2002. LDF [BO + 1 * SIZE], a2
  2003. LDF [BO + 2 * SIZE], a3
  2004. LDF [BO + 3 * SIZE], a4
  2005. FMUL a1, c01, c01
  2006. FMUL a1, c02, c02
  2007. FNMSUB (aa2, cc01, cc03, cc03)
  2008. FNMSUB (aa2, cc02, cc04, cc04)
  2009. FNMSUB (aa3, cc01, cc05, cc05)
  2010. FNMSUB (aa3, cc02, cc06, cc06)
  2011. FNMSUB (aa4, cc01, cc07, cc07)
  2012. FNMSUB (aa4, cc02, cc08, cc08)
  2013. LDF [BO + 5 * SIZE], a1
  2014. LDF [BO + 6 * SIZE], a2
  2015. LDF [BO + 7 * SIZE], a3
  2016. FMUL a1, c03, c03
  2017. FMUL a1, c04, c04
  2018. FNMSUB (aa2, cc03, cc05, cc05)
  2019. FNMSUB (aa2, cc04, cc06, cc06)
  2020. FNMSUB (aa3, cc03, cc07, cc07)
  2021. FNMSUB (aa3, cc04, cc08, cc08)
  2022. LDF [BO + 10 * SIZE], a1
  2023. LDF [BO + 11 * SIZE], a2
  2024. FMUL a1, c05, c05
  2025. FMUL a1, c06, c06
  2026. FNMSUB (aa2, cc05, cc07, cc07)
  2027. FNMSUB (aa2, cc06, cc08, cc08)
  2028. LDF [BO + 15 * SIZE], a1
  2029. FMUL a1, c07, c07
  2030. FMUL a1, c08, c08
  2031. #endif
  2032. #ifdef RT
  /* RT: backward sweep (c07,c08) -> (c05,c06) -> (c03,c04) -> (c01,c02), */
  /* reading BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  2033. LDF [BO + 15 * SIZE], a1
  2034. LDF [BO + 14 * SIZE], a2
  2035. LDF [BO + 13 * SIZE], a3
  2036. LDF [BO + 12 * SIZE], a4
  2037. FMUL a1, c08, c08
  2038. FMUL a1, c07, c07
  2039. FNMSUB (aa2, cc08, cc06, cc06)
  2040. FNMSUB (aa2, cc07, cc05, cc05)
  2041. FNMSUB (aa3, cc08, cc04, cc04)
  2042. FNMSUB (aa3, cc07, cc03, cc03)
  2043. FNMSUB (aa4, cc08, cc02, cc02)
  2044. FNMSUB (aa4, cc07, cc01, cc01)
  2045. LDF [BO + 10 * SIZE], a1
  2046. LDF [BO + 9 * SIZE], a2
  2047. LDF [BO + 8 * SIZE], a3
  2048. FMUL a1, c06, c06
  2049. FMUL a1, c05, c05
  2050. FNMSUB (aa2, cc06, cc04, cc04)
  2051. FNMSUB (aa2, cc05, cc03, cc03)
  2052. FNMSUB (aa3, cc06, cc02, cc02)
  2053. FNMSUB (aa3, cc05, cc01, cc01)
  2054. LDF [BO + 5 * SIZE], a1
  2055. LDF [BO + 4 * SIZE], a2
  2056. FMUL a1, c04, c04
  2057. FMUL a1, c03, c03
  2058. FNMSUB (aa2, cc04, cc02, cc02)
  2059. FNMSUB (aa2, cc03, cc01, cc01)
  2060. LDF [BO + 0 * SIZE], a1
  2061. FMUL a1, c02, c02
  2062. FMUL a1, c01, c01
  2063. #endif
  2064. #ifdef LN
  /* LN: the C pointers move back two elements before the store (and are not advanced afterwards). */
  2065. add C1, -2 * SIZE, C1
  2066. add C2, -2 * SIZE, C2
  2067. add C3, -2 * SIZE, C3
  2068. add C4, -2 * SIZE, C4
  2069. #endif
  /* Write the results back to the buffer the right-hand side was read from */
  /* (BO for LN/LT, AO otherwise), using the same element order as the loads above. */
  2070. #if defined(LN) || defined(LT)
  2071. STF c01, [BO + 0 * SIZE]
  2072. STF c03, [BO + 1 * SIZE]
  2073. STF c05, [BO + 2 * SIZE]
  2074. STF c07, [BO + 3 * SIZE]
  2075. STF c02, [BO + 4 * SIZE]
  2076. STF c04, [BO + 5 * SIZE]
  2077. STF c06, [BO + 6 * SIZE]
  2078. STF c08, [BO + 7 * SIZE]
  2079. #else
  2080. STF c01, [AO + 0 * SIZE]
  2081. STF c02, [AO + 1 * SIZE]
  2082. STF c03, [AO + 2 * SIZE]
  2083. STF c04, [AO + 3 * SIZE]
  2084. STF c05, [AO + 4 * SIZE]
  2085. STF c06, [AO + 5 * SIZE]
  2086. STF c07, [AO + 6 * SIZE]
  2087. STF c08, [AO + 7 * SIZE]
  2088. #endif
  /* Store the tile to C: two consecutive elements at each of C1..C4. */
  2089. STF c01, [C1 + 0 * SIZE]
  2090. STF c02, [C1 + 1 * SIZE]
  2091. STF c03, [C2 + 0 * SIZE]
  2092. STF c04, [C2 + 1 * SIZE]
  2093. STF c05, [C3 + 0 * SIZE]
  2094. STF c06, [C3 + 1 * SIZE]
  2095. STF c07, [C4 + 0 * SIZE]
  2096. STF c08, [C4 + 1 * SIZE]
  2097. #ifndef LN
  2098. add C1, 2 * SIZE, C1
  2099. add C2, 2 * SIZE, C2
  2100. add C3, 2 * SIZE, C3
  2101. add C4, 2 * SIZE, C4
  2102. #endif
  2103. #ifdef RT
  /* RT: AORIG += K << (BASE_SHIFT + 1) bytes. */
  2104. sll K, BASE_SHIFT + 1, TEMP1
  2105. add AORIG, TEMP1, AORIG
  2106. #endif
  2107. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: AO += (K-KK) << (BASE_SHIFT+1), BO += (K-KK) << (BASE_SHIFT+2). */
  2108. sub K, KK, TEMP1
  2109. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2110. sll TEMP1, BASE_SHIFT + 2, TEMP1
  2111. add AO, TEMP2, AO
  2112. add BO, TEMP1, BO
  2113. #endif
  2114. #ifdef LT
  2115. add KK, 2, KK
  2116. #endif
  2117. #ifdef LN
  2118. sub KK, 2, KK
  2119. #endif
  /* I -= 1; process the next two-row tile while I > 0. */
  2120. add I, -1, I
  2121. cmp I, 0
  2122. bg,pt %icc, .LL32
  2123. nop
  2124. .LL40:
  /* Single-row (1x4) tile: taken only when M is odd, otherwise skip to .LL49. */
  2125. and M, 1, I
  2126. cmp I, 0
  2127. ble,pn %icc, .LL49
  2128. nop
  2129. #if defined(LT) || defined(RN)
  2130. mov B, BO
  2131. #else
  2132. #ifdef LN
  /* LN: AORIG steps back by K << BASE_SHIFT bytes before it is used. */
  2133. sll K, BASE_SHIFT + 0, TEMP1
  2134. sub AORIG, TEMP1, AORIG
  2135. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 2)). */
  2136. sll KK, BASE_SHIFT + 0, TEMP1
  2137. sll KK, BASE_SHIFT + 2, TEMP2
  2138. add AORIG, TEMP1, AO
  2139. add B, TEMP2, BO
  2140. #endif
  /* Preload a1..a4 from AO[0..3] and b1..b9 from BO[0..8]; clear the four accumulators cc01,cc03,cc05,cc07. */
  2141. LDF [AO + 0 * SIZE], a1
  2142. LDF [AO + 1 * SIZE], a2
  2143. LDF [AO + 2 * SIZE], a3
  2144. LDF [AO + 3 * SIZE], a4
  2145. LDF [BO + 0 * SIZE], b1
  2146. LDF [BO + 1 * SIZE], b2
  2147. LDF [BO + 2 * SIZE], b3
  2148. LDF [BO + 3 * SIZE], b4
  2149. LDF [BO + 4 * SIZE], b5
  2150. LDF [BO + 5 * SIZE], b6
  2151. FCLR (cc01)
  2152. LDF [BO + 6 * SIZE], b7
  2153. FCLR (cc03)
  2154. LDF [BO + 7 * SIZE], b8
  2155. FCLR (cc05)
  2156. LDF [BO + 8 * SIZE], b9
  2157. FCLR (cc07)
  /* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  2158. #if defined(LT) || defined(RN)
  2159. sra KK, 2, L
  2160. #else
  2161. sub K, KK, L
  2162. sra L, 2, L
  2163. #endif
  2164. cmp L, 0
  2165. ble,pn %icc, .LL45
  2166. nop
  2167. .LL43:
  /* Unrolled accumulation loop for the 1x4 tile: four k-steps per iteration, */
  /* using a1, a2, a3, a4 in turn. Per iteration AO advances 4 * SIZE and BO advances 16 * SIZE. */
  2168. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2169. add L, -1, L
  /* Step 1: a1 against b1..b4. */
  2170. FMADD (aa1, bb1, cc01, cc01)
  2171. LDF [BO + 16 * SIZE], b1
  2172. FMADD (aa1, bb2, cc03, cc03)
  2173. LDF [BO + 9 * SIZE], b2
  2174. FMADD (aa1, bb3, cc05, cc05)
  2175. LDF [BO + 10 * SIZE], b3
  2176. FMADD (aa1, bb4, cc07, cc07)
  2177. LDF [BO + 11 * SIZE], b4
  2178. LDF [AO + 4 * SIZE], a1
  /* Sets the condition codes consumed by the bg at the bottom of the loop. */
  2179. cmp L, 0
  /* Step 2: a2 against b5..b8. */
  2180. FMADD (aa2, bb5, cc01, cc01)
  2181. LDF [BO + 12 * SIZE], b5
  2182. FMADD (aa2, bb6, cc03, cc03)
  2183. LDF [BO + 13 * SIZE], b6
  2184. FMADD (aa2, bb7, cc05, cc05)
  2185. LDF [BO + 14 * SIZE], b7
  2186. FMADD (aa2, bb8, cc07, cc07)
  2187. LDF [BO + 15 * SIZE], b8
  2188. LDF [AO + 5 * SIZE], a2
  /* AO moves here; later AO offsets are relative to the new value. */
  2189. add AO, 4 * SIZE, AO
  /* Step 3: a3 against b9,b2,b3,b4. */
  2190. FMADD (aa3, bb9, cc01, cc01)
  2191. LDF [BO + 24 * SIZE], b9
  2192. FMADD (aa3, bb2, cc03, cc03)
  2193. LDF [BO + 17 * SIZE], b2
  2194. FMADD (aa3, bb3, cc05, cc05)
  2195. LDF [BO + 18 * SIZE], b3
  2196. FMADD (aa3, bb4, cc07, cc07)
  2197. LDF [BO + 19 * SIZE], b4
  2198. LDF [AO + 2 * SIZE], a3
  /* BO moves here; later BO offsets are relative to the new value. */
  2199. add BO, 16 * SIZE, BO
  /* Step 4: a4 against b5..b8. */
  2200. FMADD (aa4, bb5, cc01, cc01)
  2201. LDF [BO + 4 * SIZE], b5
  2202. FMADD (aa4, bb6, cc03, cc03)
  2203. LDF [BO + 5 * SIZE], b6
  2204. FMADD (aa4, bb7, cc05, cc05)
  2205. LDF [BO + 6 * SIZE], b7
  2206. FMADD (aa4, bb8, cc07, cc07)
  2207. LDF [BO + 7 * SIZE], b8
  /* Loop while L > 0; a4 is reloaded in the branch delay slot. */
  2208. bg,pt %icc, .LL43
  2209. LDF [AO + 3 * SIZE], a4
  2210. .align 4
  2211. .LL45:
  /* Remainder count for the 1x4 tile: L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT). */
  2212. #if defined(LT) || defined(RN)
  2213. and KK, 3, L
  2214. #else
  2215. sub K, KK, L
  2216. and L, 3, L
  2217. #endif
  /* Nothing left over: go straight to the solve at .LL48. */
  2218. cmp L, 0
  2219. ble,a,pn %icc, .LL48
  2220. nop
  2221. .align 4
  2222. .LL47:
  /* Remainder loop for the 1x4 tile: one k-step per iteration, a1 against b1..b4. */
  /* AO advances one element and BO four elements per iteration. */
  2223. FMADD (aa1, bb1, cc01, cc01)
  2224. LDF [BO + 4 * SIZE], b1
  2225. add L, -1, L
  2226. FMADD (aa1, bb2, cc03, cc03)
  2227. LDF [BO + 5 * SIZE], b2
  2228. add AO, 1 * SIZE, AO
  2229. FMADD (aa1, bb3, cc05, cc05)
  2230. LDF [BO + 6 * SIZE], b3
  2231. cmp L, 0
  2232. FMADD (aa1, bb4, cc07, cc07)
  2233. LDF [BO + 7 * SIZE], b4
  2234. add BO, 4 * SIZE, BO
  /* Loop while L > 0; the next a1 is loaded in the branch delay slot (AO was already advanced). */
  2235. bg,pt %icc, .LL47
  2236. LDF [AO + 0 * SIZE], a1
  2237. .align 4
  2238. .LL48:
  /* Solve and write-back for the 1x4 tile. */
  /* FSUB, FMUL and FNMSUB are macros defined outside this chunk. Presumably FSUB x, c, c gives */
  /* c = x - c and FNMSUB (a, b, c, d) gives d = c - a * b; confirm against the macro definitions. */
  2239. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using TEMP1 = KK - 1 (LN) or KK - 4 (RT). */
  2240. #ifdef LN
  2241. sub KK, 1, TEMP1
  2242. #else
  2243. sub KK, 4, TEMP1
  2244. #endif
  2245. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2246. sll TEMP1, BASE_SHIFT + 2, TEMP1
  2247. add AORIG, TEMP2, AO
  2248. add B, TEMP1, BO
  2249. #endif
  /* Four values are loaded from BO (LN/LT) or AO (RN/RT) and combined with c01,c03,c05,c07. */
  2250. #if defined(LN) || defined(LT)
  2251. LDF [BO + 0 * SIZE], a1
  2252. LDF [BO + 1 * SIZE], a2
  2253. LDF [BO + 2 * SIZE], a3
  2254. LDF [BO + 3 * SIZE], a4
  2255. FSUB a1, c01, c01
  2256. FSUB a2, c03, c03
  2257. FSUB a3, c05, c05
  2258. FSUB a4, c07, c07
  2259. #else
  2260. LDF [AO + 0 * SIZE], a1
  2261. LDF [AO + 1 * SIZE], a2
  2262. LDF [AO + 2 * SIZE], a3
  2263. LDF [AO + 3 * SIZE], a4
  2264. FSUB a1, c01, c01
  2265. FSUB a2, c03, c03
  2266. FSUB a3, c05, c05
  2267. FSUB a4, c07, c07
  2268. #endif
  2269. #if defined(LN) || defined(LT)
  /* LN/LT: a single factor AO[0] scales all four values. */
  /* NOTE(review): scaling by FMUL suggests the diagonal entry is stored pre-inverted - confirm. */
  2270. LDF [AO + 0 * SIZE], a1
  2271. FMUL a1, c01, c01
  2272. FMUL a1, c03, c03
  2273. FMUL a1, c05, c05
  2274. FMUL a1, c07, c07
  2275. #endif
  2276. #ifdef RN
  /* RN: forward sweep c01 -> c03 -> c05 -> c07, reading BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  2277. LDF [BO + 0 * SIZE], a1
  2278. LDF [BO + 1 * SIZE], a2
  2279. LDF [BO + 2 * SIZE], a3
  2280. LDF [BO + 3 * SIZE], a4
  2281. FMUL a1, c01, c01
  2282. FNMSUB (aa2, cc01, cc03, cc03)
  2283. FNMSUB (aa3, cc01, cc05, cc05)
  2284. FNMSUB (aa4, cc01, cc07, cc07)
  2285. LDF [BO + 5 * SIZE], a1
  2286. LDF [BO + 6 * SIZE], a2
  2287. LDF [BO + 7 * SIZE], a3
  2288. FMUL a1, c03, c03
  2289. FNMSUB (aa2, cc03, cc05, cc05)
  2290. FNMSUB (aa3, cc03, cc07, cc07)
  2291. LDF [BO + 10 * SIZE], a1
  2292. LDF [BO + 11 * SIZE], a2
  2293. FMUL a1, c05, c05
  2294. FNMSUB (aa2, cc05, cc07, cc07)
  2295. LDF [BO + 15 * SIZE], a1
  2296. FMUL a1, c07, c07
  2297. #endif
  2298. #ifdef RT
  /* RT: backward sweep c07 -> c05 -> c03 -> c01, reading BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  2299. LDF [BO + 15 * SIZE], a1
  2300. LDF [BO + 14 * SIZE], a2
  2301. LDF [BO + 13 * SIZE], a3
  2302. LDF [BO + 12 * SIZE], a4
  2303. FMUL a1, c07, c07
  2304. FNMSUB (aa2, cc07, cc05, cc05)
  2305. FNMSUB (aa3, cc07, cc03, cc03)
  2306. FNMSUB (aa4, cc07, cc01, cc01)
  2307. LDF [BO + 10 * SIZE], a1
  2308. LDF [BO + 9 * SIZE], a2
  2309. LDF [BO + 8 * SIZE], a3
  2310. FMUL a1, c05, c05
  2311. FNMSUB (aa2, cc05, cc03, cc03)
  2312. FNMSUB (aa3, cc05, cc01, cc01)
  2313. LDF [BO + 5 * SIZE], a1
  2314. LDF [BO + 4 * SIZE], a2
  2315. FMUL a1, c03, c03
  2316. FNMSUB (aa2, cc03, cc01, cc01)
  2317. LDF [BO + 0 * SIZE], a1
  2318. FMUL a1, c01, c01
  2319. #endif
  2320. #ifdef LN
  /* LN: the C pointers move back one element before the store. */
  2321. add C1, -1 * SIZE, C1
  2322. add C2, -1 * SIZE, C2
  2323. add C3, -1 * SIZE, C3
  2324. add C4, -1 * SIZE, C4
  2325. #endif
  /* Write the results back to the buffer the right-hand side was read from (BO for LN/LT, AO otherwise). */
  2326. #if defined(LN) || defined(LT)
  2327. STF c01, [BO + 0 * SIZE]
  2328. STF c03, [BO + 1 * SIZE]
  2329. STF c05, [BO + 2 * SIZE]
  2330. STF c07, [BO + 3 * SIZE]
  2331. #else
  2332. STF c01, [AO + 0 * SIZE]
  2333. STF c03, [AO + 1 * SIZE]
  2334. STF c05, [AO + 2 * SIZE]
  2335. STF c07, [AO + 3 * SIZE]
  2336. #endif
  /* Store one element at each of C1..C4. */
  2337. STF c01, [C1 + 0 * SIZE]
  2338. STF c03, [C2 + 0 * SIZE]
  2339. STF c05, [C3 + 0 * SIZE]
  2340. STF c07, [C4 + 0 * SIZE]
  2341. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT bytes. */
  2342. sll K, BASE_SHIFT + 0, TEMP1
  2343. add AORIG, TEMP1, AORIG
  2344. #endif
  2345. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps: AO += (K-KK) << BASE_SHIFT, BO += (K-KK) << (BASE_SHIFT+2). */
  2346. sub K, KK, TEMP1
  2347. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2348. sll TEMP1, BASE_SHIFT + 2, TEMP1
  2349. add AO, TEMP2, AO
  2350. add BO, TEMP1, BO
  2351. #endif
  2352. #ifdef LT
  2353. add KK, 1, KK
  2354. #endif
  2355. #ifdef LN
  2356. sub KK, 1, KK
  2357. #endif
  2358. .align 4
  2359. .LL49:
  /* End of the four-column section: update B and KK before falling through to .LL50. */
  2360. #ifdef LN
  /* LN: B += K << (BASE_SHIFT + 2) bytes. */
  2361. sll K, BASE_SHIFT + 2, TEMP1
  2362. add B, TEMP1, B
  2363. #endif
  2364. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from wherever BO ended up. */
  2365. mov BO, B
  2366. #endif
  2367. #ifdef RN
  2368. add KK, 4, KK
  2369. #endif
  2370. #ifdef RT
  2371. sub KK, 4, KK
  2372. #endif
  2373. .align 4
  2374. .LL50:
  /* Two-column section: entered only when bit 1 of N is set, otherwise skip to .LL70. */
  2375. and N, 2, J
  2376. cmp J, 0
  2377. ble,pn %icc, .LL70
  2378. nop
  2379. #ifdef RT
  /* RT: B -= K << (BASE_SHIFT + 1) bytes. */
  2380. sll K, BASE_SHIFT + 1, TEMP1
  2381. sub B, TEMP1, B
  2382. #endif
  2383. #ifndef RT
  /* Not RT: C1 = C, C2 = C + LDC; C then moves to C2 + LDC. */
  2384. mov C, C1
  2385. add C, LDC, C2
  2386. add C2, LDC, C
  2387. #else
  /* RT: C2 = C - LDC, C1 = C2 - LDC; C ends at C2 - LDC (same address as C1). */
  2388. sub C, LDC, C2
  2389. sub C2, LDC, C1
  2390. sub C2, LDC, C
  2391. #endif
  2392. #ifdef LN
  /* LN: KK starts at M + OFFSET. */
  2393. add M, OFFSET, KK
  2394. #endif
  2395. #ifdef LT
  /* LT: KK starts at OFFSET. */
  2396. mov OFFSET, KK
  2397. #endif
  2398. #if defined(LN) || defined(RT)
  /* LN/RT keep the base of A in AORIG; other variants walk AO directly. */
  2399. mov A, AORIG
  2400. #else
  2401. mov A, AO
  2402. #endif
  /* I = M >> 1 two-row tiles; if there are none, go to the single-row code at .LL60. */
  2403. sra M, 1, I
  2404. cmp I, 0
  2405. ble,pn %icc, .LL60
  2406. nop
  2407. .align 4
  2408. .LL52:
  /* Set-up for one 2x2 tile: choose AO/BO, preload operands, clear accumulators, compute trip count. */
  2409. #if defined(LT) || defined(RN)
  2410. mov B, BO
  2411. #else
  2412. #ifdef LN
  /* LN: AORIG steps back by K << (BASE_SHIFT + 1) bytes before it is used. */
  2413. sll K, BASE_SHIFT + 1, TEMP1
  2414. sub AORIG, TEMP1, AORIG
  2415. #endif
  /* LN/RT: both AO and BO are offset by KK << (BASE_SHIFT + 1) from AORIG and B respectively. */
  2416. sll KK, BASE_SHIFT + 1, TEMP1
  2417. sll KK, BASE_SHIFT + 1, TEMP2
  2418. add AORIG, TEMP1, AO
  2419. add B, TEMP2, BO
  2420. #endif
  /* Preload a1..a4 from AO[0..3] and b1..b8 from BO[0..7]. */
  2421. LDF [AO + 0 * SIZE], a1
  2422. LDF [AO + 1 * SIZE], a2
  2423. LDF [AO + 2 * SIZE], a3
  2424. LDF [AO + 3 * SIZE], a4
  2425. LDF [BO + 0 * SIZE], b1
  2426. LDF [BO + 1 * SIZE], b2
  2427. LDF [BO + 2 * SIZE], b3
  /* cc01..cc08 are all cleared here, although the loops at .LL53/.LL57 only accumulate into cc01..cc04. */
  2428. FCLR (cc01)
  2429. LDF [BO + 3 * SIZE], b4
  2430. FCLR (cc02)
  2431. LDF [BO + 4 * SIZE], b5
  2432. FCLR (cc03)
  2433. LDF [BO + 5 * SIZE], b6
  2434. FCLR (cc04)
  2435. LDF [BO + 6 * SIZE], b7
  2436. FCLR (cc05)
  2437. LDF [BO + 7 * SIZE], b8
  2438. FCLR (cc06)
  2439. prefetch [C1 + 2 * SIZE], 3
  2440. FCLR (cc07)
  2441. prefetch [C2 + 2 * SIZE], 3
  2442. FCLR (cc08)
  /* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  2443. #if defined(LT) || defined(RN)
  2444. sra KK, 2, L
  2445. #else
  2446. sub K, KK, L
  2447. sra L, 2, L
  2448. #endif
  2449. cmp L, 0
  2450. ble,pn %icc, .LL55
  2451. nop
  2452. .align 4
  2453. .LL53:
  /* Unrolled accumulation loop for the 2x2 tile: four k-steps per iteration into cc01..cc04. */
  /* Per iteration both AO and BO advance 8 * SIZE. */
  /* Step 1: a1,a2 against b1,b2. */
  2454. FMADD (aa1, bb1, cc01, cc01)
  2455. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2456. FMADD (aa2, bb1, cc02, cc02)
  2457. LDF [BO + 8 * SIZE], b1
  2458. FMADD (aa1, bb2, cc03, cc03)
  2459. LDF [AO + 4 * SIZE], a1
  2460. FMADD (aa2, bb2, cc04, cc04)
  2461. LDF [AO + 5 * SIZE], a2
  /* Step 2: a3,a4 against b3,b4. */
  2462. FMADD (aa3, bb3, cc01, cc01)
  2463. LDF [BO + 9 * SIZE], b2
  2464. FMADD (aa4, bb3, cc02, cc02)
  2465. LDF [BO + 10 * SIZE], b3
  2466. FMADD (aa3, bb4, cc03, cc03)
  2467. LDF [AO + 6 * SIZE], a3
  2468. FMADD (aa4, bb4, cc04, cc04)
  2469. LDF [AO + 7 * SIZE], a4
  /* Step 3: a1,a2 (reloaded from AO[4..5]) against b5,b6. */
  2470. FMADD (aa1, bb5, cc01, cc01)
  2471. LDF [BO + 11 * SIZE], b4
  2472. FMADD (aa2, bb5, cc02, cc02)
  2473. LDF [BO + 12 * SIZE], b5
  2474. FMADD (aa1, bb6, cc03, cc03)
  2475. LDF [AO + 8 * SIZE], a1
  2476. FMADD (aa2, bb6, cc04, cc04)
  2477. LDF [AO + 9 * SIZE], a2
  /* Step 4: a3,a4 (reloaded from AO[6..7]) against b7,b8. */
  2478. FMADD (aa3, bb7, cc01, cc01)
  2479. LDF [BO + 13 * SIZE], b6
  2480. FMADD (aa4, bb7, cc02, cc02)
  2481. LDF [BO + 14 * SIZE], b7
  2482. FMADD (aa3, bb8, cc03, cc03)
  2483. LDF [AO + 10 * SIZE], a3
  2484. FMADD (aa4, bb8, cc04, cc04)
  2485. LDF [AO + 11 * SIZE], a4
  2486. add AO, 8 * SIZE, AO
  2487. add L, -1, L
  2488. add BO, 8 * SIZE, BO
  2489. cmp L, 0
  /* Loop while L > 0; b8 is reloaded in the delay slot, relative to the already advanced BO. */
  2490. bg,pt %icc, .LL53
  2491. LDF [BO + 7 * SIZE], b8
  2492. .align 4
  2493. .LL55:
  /* Remainder count for the 2x2 tile: L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT). */
  2494. #if defined(LT) || defined(RN)
  2495. and KK, 3, L
  2496. #else
  2497. sub K, KK, L
  2498. and L, 3, L
  2499. #endif
  /* Nothing left over: go straight to the solve at .LL58. */
  2500. cmp L, 0
  2501. ble,a,pn %icc, .LL58
  2502. nop
  2503. .align 4
  2504. .LL57:
  /* Remainder loop for the 2x2 tile: one k-step per iteration, a1,a2 against b1,b2. */
  /* AO and BO each advance two elements per iteration. */
  2505. FMADD (aa1, bb1, cc01, cc01)
  2506. add L, -1, L
  2507. FMADD (aa2, bb1, cc02, cc02)
  2508. LDF [BO + 2 * SIZE], b1
  2509. FMADD (aa1, bb2, cc03, cc03)
  2510. LDF [AO + 2 * SIZE], a1
  2511. FMADD (aa2, bb2, cc04, cc04)
  2512. LDF [AO + 3 * SIZE], a2
  2513. add AO, 2 * SIZE, AO
  2514. cmp L, 0
  2515. add BO, 2 * SIZE, BO
  /* Loop while L > 0; b2 is reloaded in the delay slot, relative to the already advanced BO. */
  2516. bg,pt %icc, .LL57
  2517. LDF [BO + 1 * SIZE], b2
  2518. .align 4
  2519. .LL58:
  /* Solve and write-back for the 2x2 tile, then advance to the next tile. */
  /* FSUB, FMUL and FNMSUB are macros defined outside this chunk. Presumably FSUB x, c, c gives */
  /* c = x - c and FNMSUB (a, b, c, d) gives d = c - a * b; confirm against the macro definitions. */
  2520. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B; both branches use TEMP1 = KK - 2 */
  /* because the tile is two rows by two columns. */
  2521. #ifdef LN
  2522. sub KK, 2, TEMP1
  2523. #else
  2524. sub KK, 2, TEMP1
  2525. #endif
  2526. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2527. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2528. add AORIG, TEMP2, AO
  2529. add B, TEMP1, BO
  2530. #endif
  2531. #if defined(LN) || defined(LT)
  /* LN/LT: the four values come from BO, paired with c01,c03,c02,c04. */
  2532. LDF [BO + 0 * SIZE], a1
  2533. LDF [BO + 1 * SIZE], a2
  2534. LDF [BO + 2 * SIZE], a3
  2535. LDF [BO + 3 * SIZE], a4
  2536. FSUB a1, c01, c01
  2537. FSUB a2, c03, c03
  2538. FSUB a3, c02, c02
  2539. FSUB a4, c04, c04
  2540. #else
  /* RN/RT: the four values come from AO, paired with c01..c04 in order. */
  2541. LDF [AO + 0 * SIZE], a1
  2542. LDF [AO + 1 * SIZE], a2
  2543. LDF [AO + 2 * SIZE], a3
  2544. LDF [AO + 3 * SIZE], a4
  2545. FSUB a1, c01, c01
  2546. FSUB a2, c02, c02
  2547. FSUB a3, c03, c03
  2548. FSUB a4, c04, c04
  2549. #endif
  2550. #ifdef LN
  /* LN: scale c02,c04 by AO[3], fold into c01,c03 via AO[2], then scale c01,c03 by AO[0]. */
  /* NOTE(review): scaling by FMUL suggests the diagonal entries are stored pre-inverted - confirm. */
  2551. LDF [AO + 3 * SIZE], a1
  2552. LDF [AO + 2 * SIZE], a2
  2553. LDF [AO + 0 * SIZE], a3
  2554. FMUL a1, c02, c02
  2555. FMUL a1, c04, c04
  2556. FNMSUB (aa2, cc02, cc01, cc01)
  2557. FNMSUB (aa2, cc04, cc03, cc03)
  2558. FMUL a3, c01, c01
  2559. FMUL a3, c03, c03
  2560. #endif
  2561. #ifdef LT
  /* LT: scale c01,c03 by AO[0], fold into c02,c04 via AO[1], then scale c02,c04 by AO[3]. */
  2562. LDF [AO + 0 * SIZE], a1
  2563. LDF [AO + 1 * SIZE], a2
  2564. LDF [AO + 3 * SIZE], a3
  2565. FMUL a1, c01, c01
  2566. FMUL a1, c03, c03
  2567. FNMSUB (aa2, cc01, cc02, cc02)
  2568. FNMSUB (aa2, cc03, cc04, cc04)
  2569. FMUL a3, c02, c02
  2570. FMUL a3, c04, c04
  2571. #endif
  2572. #ifdef RN
  /* RN: forward sweep (c01,c02) -> (c03,c04), reading BO[0], BO[1], BO[3]. */
  2573. LDF [BO + 0 * SIZE], a1
  2574. LDF [BO + 1 * SIZE], a2
  2575. FMUL a1, c01, c01
  2576. FMUL a1, c02, c02
  2577. FNMSUB (aa2, cc01, cc03, cc03)
  2578. FNMSUB (aa2, cc02, cc04, cc04)
  2579. LDF [BO + 3 * SIZE], a1
  2580. FMUL a1, c03, c03
  2581. FMUL a1, c04, c04
  2582. #endif
  2583. #ifdef RT
  /* RT: backward sweep (c03,c04) -> (c01,c02), reading BO[3], BO[2], BO[0]. */
  2584. LDF [BO + 3 * SIZE], a1
  2585. LDF [BO + 2 * SIZE], a2
  2586. FMUL a1, c04, c04
  2587. FMUL a1, c03, c03
  2588. FNMSUB (aa2, cc04, cc02, cc02)
  2589. FNMSUB (aa2, cc03, cc01, cc01)
  2590. LDF [BO + 0 * SIZE], a1
  2591. FMUL a1, c02, c02
  2592. FMUL a1, c01, c01
  2593. #endif
  2594. #ifdef LN
  /* LN: the C pointers move back two elements before the store (and are not advanced afterwards). */
  2595. add C1, -2 * SIZE, C1
  2596. add C2, -2 * SIZE, C2
  2597. #endif
  /* Write the results back to the buffer the right-hand side was read from */
  /* (BO for LN/LT, AO otherwise), using the same element order as the loads above. */
  2598. #if defined(LN) || defined(LT)
  2599. STF c01, [BO + 0 * SIZE]
  2600. STF c03, [BO + 1 * SIZE]
  2601. STF c02, [BO + 2 * SIZE]
  2602. STF c04, [BO + 3 * SIZE]
  2603. #else
  2604. STF c01, [AO + 0 * SIZE]
  2605. STF c02, [AO + 1 * SIZE]
  2606. STF c03, [AO + 2 * SIZE]
  2607. STF c04, [AO + 3 * SIZE]
  2608. #endif
  /* Store the tile to C: two consecutive elements at each of C1 and C2. */
  2609. STF c01, [C1 + 0 * SIZE]
  2610. STF c02, [C1 + 1 * SIZE]
  2611. STF c03, [C2 + 0 * SIZE]
  2612. STF c04, [C2 + 1 * SIZE]
  2613. #ifndef LN
  2614. add C1, 2 * SIZE, C1
  2615. add C2, 2 * SIZE, C2
  2616. #endif
  2617. #ifdef RT
  /* RT: AORIG += K << (BASE_SHIFT + 1) bytes. */
  2618. sll K, BASE_SHIFT + 1, TEMP1
  2619. add AORIG, TEMP1, AORIG
  2620. #endif
  2621. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps; AO and BO both advance by (K-KK) << (BASE_SHIFT+1). */
  2622. sub K, KK, TEMP1
  2623. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2624. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2625. add AO, TEMP2, AO
  2626. add BO, TEMP1, BO
  2627. #endif
  2628. #ifdef LT
  2629. add KK, 2, KK
  2630. #endif
  2631. #ifdef LN
  2632. sub KK, 2, KK
  2633. #endif
  /* I -= 1; process the next two-row tile while I > 0. */
  2634. add I, -1, I
  2635. cmp I, 0
  2636. bg,pt %icc, .LL52
  2637. nop
  2638. .align 4
  2639. .LL60:
  /* Single-row (1x2) tile: taken only when M is odd, otherwise skip to .LL69. */
  2640. and M, 1, I
  2641. cmp I, 0
  2642. ble,pn %icc, .LL69
  2643. nop
  2644. #if defined(LT) || defined(RN)
  2645. mov B, BO
  2646. #else
  2647. #ifdef LN
  /* LN: AORIG steps back by K << BASE_SHIFT bytes before it is used. */
  2648. sll K, BASE_SHIFT + 0, TEMP1
  2649. sub AORIG, TEMP1, AORIG
  2650. #endif
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 1)). */
  2651. sll KK, BASE_SHIFT + 0, TEMP1
  2652. sll KK, BASE_SHIFT + 1, TEMP2
  2653. add AORIG, TEMP1, AO
  2654. add B, TEMP2, BO
  2655. #endif
  /* Preload a1..a4 from AO[0..3] and b1..b8 from BO[0..7]; clear the two accumulators cc01 and cc03. */
  2656. LDF [AO + 0 * SIZE], a1
  2657. LDF [AO + 1 * SIZE], a2
  2658. LDF [AO + 2 * SIZE], a3
  2659. LDF [AO + 3 * SIZE], a4
  2660. LDF [BO + 0 * SIZE], b1
  2661. LDF [BO + 1 * SIZE], b2
  2662. LDF [BO + 2 * SIZE], b3
  2663. LDF [BO + 3 * SIZE], b4
  2664. LDF [BO + 4 * SIZE], b5
  2665. LDF [BO + 5 * SIZE], b6
  2666. LDF [BO + 6 * SIZE], b7
  2667. FCLR (cc01)
  2668. LDF [BO + 7 * SIZE], b8
  2669. FCLR (cc03)
  /* L = number of 4-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  2670. #if defined(LT) || defined(RN)
  2671. sra KK, 2, L
  2672. #else
  2673. sub K, KK, L
  2674. sra L, 2, L
  2675. #endif
  2676. cmp L, 0
  2677. ble,pn %icc, .LL65
  2678. nop
  2679. .align 4
  2680. .LL63:
  /* Unrolled accumulation loop for the 1x2 tile: four k-steps per iteration, */
  /* using a1..a4 in turn against the pairs (b1,b2), (b3,b4), (b5,b6), (b7,b8). */
  /* Per iteration AO advances 4 * SIZE and BO advances 8 * SIZE. */
  2681. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2682. add L, -1, L
  2683. FMADD (aa1, bb1, cc01, cc01)
  2684. LDF [BO + 8 * SIZE], b1
  2685. FMADD (aa1, bb2, cc03, cc03)
  2686. LDF [BO + 9 * SIZE], b2
  2687. LDF [AO + 4 * SIZE], a1
  /* Sets the condition codes consumed by the bg at the bottom of the loop. */
  2688. cmp L, 0
  2689. FMADD (aa2, bb3, cc01, cc01)
  2690. LDF [BO + 10 * SIZE], b3
  2691. FMADD (aa2, bb4, cc03, cc03)
  2692. LDF [BO + 11 * SIZE], b4
  2693. LDF [AO + 5 * SIZE], a2
  /* AO moves here; later AO offsets are relative to the new value. */
  2694. add AO, 4 * SIZE, AO
  2695. FMADD (aa3, bb5, cc01, cc01)
  2696. LDF [BO + 12 * SIZE], b5
  2697. FMADD (aa3, bb6, cc03, cc03)
  2698. LDF [BO + 13 * SIZE], b6
  2699. LDF [AO + 2 * SIZE], a3
  /* BO moves here; later BO offsets are relative to the new value. */
  2700. add BO, 8 * SIZE, BO
  2701. FMADD (aa4, bb7, cc01, cc01)
  2702. LDF [BO + 6 * SIZE], b7
  2703. FMADD (aa4, bb8, cc03, cc03)
  2704. LDF [BO + 7 * SIZE], b8
  /* Loop while L > 0; a4 is reloaded in the branch delay slot. */
  2705. bg,pt %icc, .LL63
  2706. LDF [AO + 3 * SIZE], a4
  2707. .align 4
  2708. .LL65:
  /* Remainder count for the 1x2 tile: L = KK & 3 (LT/RN) or (K - KK) & 3 (LN/RT). */
  2709. #if defined(LT) || defined(RN)
  2710. and KK, 3, L
  2711. #else
  2712. sub K, KK, L
  2713. and L, 3, L
  2714. #endif
  /* Nothing left over: go straight to the solve at .LL68. */
  2715. cmp L, 0
  2716. ble,a,pn %icc, .LL68
  2717. nop
  2718. .align 4
  2719. .LL67:
  /* Remainder loop for the 1x2 tile: one k-step per iteration, a1 against b1,b2. */
  /* AO advances one element per iteration and BO two. */
  2720. FMADD (aa1, bb1, cc01, cc01)
  2721. LDF [BO + 2 * SIZE], b1
  2722. FMADD (aa1, bb2, cc03, cc03)
  2723. LDF [BO + 3 * SIZE], b2
  2724. LDF [AO + 1 * SIZE], a1
  2725. add L, -1, L
  2726. add AO, 1 * SIZE, AO
  2727. cmp L, 0
  /* Loop while L > 0; BO advances in the branch delay slot. */
  2728. bg,pt %icc, .LL67
  2729. add BO, 2 * SIZE, BO
  2730. .align 4
  2731. .LL68:
  2732. #if defined(LN) || defined(RT)
  2733. #ifdef LN
  2734. sub KK, 1, TEMP1
  2735. #else
  2736. sub KK, 2, TEMP1
  2737. #endif
  2738. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2739. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2740. add AORIG, TEMP2, AO
  2741. add B, TEMP1, BO
  2742. #endif
  2743. #if defined(LN) || defined(LT)
  2744. LDF [BO + 0 * SIZE], a1
  2745. LDF [BO + 1 * SIZE], a2
  2746. FSUB a1, c01, c01
  2747. FSUB a2, c03, c03
  2748. #else
  2749. LDF [AO + 0 * SIZE], a1
  2750. LDF [AO + 1 * SIZE], a2
  2751. FSUB a1, c01, c01
  2752. FSUB a2, c03, c03
  2753. #endif
  2754. #if defined(LN) || defined(LT)
  2755. LDF [AO + 0 * SIZE], a1
  2756. FMUL a1, c01, c01
  2757. FMUL a1, c03, c03
  2758. #endif
  2759. #ifdef RN
  2760. LDF [BO + 0 * SIZE], a1
  2761. LDF [BO + 1 * SIZE], a2
  2762. FMUL a1, c01, c01
  2763. FNMSUB (aa2, cc01, cc03, cc03)
  2764. LDF [BO + 3 * SIZE], a1
  2765. FMUL a1, c03, c03
  2766. #endif
  2767. #ifdef RT
  2768. LDF [BO + 3 * SIZE], a1
  2769. LDF [BO + 2 * SIZE], a2
  2770. FMUL a1, c03, c03
  2771. FNMSUB (aa2, cc03, cc01, cc01)
  2772. LDF [BO + 0 * SIZE], a1
  2773. FMUL a1, c01, c01
  2774. #endif
  2775. #ifdef LN
  2776. add C1, -1 * SIZE, C1
  2777. add C2, -1 * SIZE, C2
  2778. #endif
  2779. #if defined(LN) || defined(LT)
  2780. STF c01, [BO + 0 * SIZE]
  2781. STF c03, [BO + 1 * SIZE]
  2782. #else
  2783. STF c01, [AO + 0 * SIZE]
  2784. STF c03, [AO + 1 * SIZE]
  2785. #endif
  2786. STF c01, [C1 + 0 * SIZE]
  2787. STF c03, [C2 + 0 * SIZE]
  2788. #ifdef RT
  2789. sll K, BASE_SHIFT + 0, TEMP1
  2790. add AORIG, TEMP1, AORIG
  2791. #endif
  2792. #if defined(LT) || defined(RN)
  2793. sub K, KK, TEMP1
  2794. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2795. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2796. add AO, TEMP2, AO
  2797. add BO, TEMP1, BO
  2798. #endif
  2799. #ifdef LT
  2800. add KK, 1, KK
  2801. #endif
  2802. #ifdef LN
  2803. sub KK, 1, KK
  2804. #endif
  /* .LL69: B / KK bookkeeping performed before the N & 1 handling at .LL70. */
  /* LN: B += K << (BASE_SHIFT + 1).  LT/RN: B takes the current BO. */
  /* RN: KK += 2.  RT: KK -= 2. */
  2805. .align 4
  2806. .LL69:
  2807. #ifdef LN
  2808. sll K, BASE_SHIFT + 1, TEMP1
  2809. add B, TEMP1, B
  2810. #endif
  2811. #if defined(LT) || defined(RN)
  2812. mov BO, B
  2813. #endif
  2814. #ifdef RN
  2815. add KK, 2, KK
  2816. #endif
  2817. #ifdef RT
  2818. sub KK, 2, KK
  2819. #endif
  /* .LL70: entry to the single-column case. Skipped entirely (branch to .LL999) when */
  /* N & 1 is zero; otherwise sets up C1, KK and the A pointer, then computes I = M >> 1. */
  2820. .align 4
  2821. .LL70:
  2822. and N, 1, J
  2823. cmp J, 0
  2824. ble,pn %icc, .LL999
  2825. nop
  /* RT: B -= K << BASE_SHIFT. */
  2826. #ifdef RT
  2827. sll K, BASE_SHIFT, TEMP1
  2828. sub B, TEMP1, B
  2829. #endif
  /* Non-RT: C1 = C, then C += LDC.  RT: C1 = C - LDC and C -= LDC. */
  2830. #ifndef RT
  2831. mov C, C1
  2832. add C1, LDC, C
  2833. #else
  2834. sub C, LDC, C1
  2835. sub C, LDC, C
  2836. #endif
  /* Initial KK: M + OFFSET for LN, OFFSET for LT (left unchanged here for RN/RT). */
  2837. #ifdef LN
  2838. add M, OFFSET, KK
  2839. #endif
  2840. #ifdef LT
  2841. mov OFFSET, KK
  2842. #endif
  /* LN/RT keep the A base in AORIG (AO is derived per tile); LT/RN walk AO directly. */
  2843. #if defined(LN) || defined(RT)
  2844. mov A, AORIG
  2845. #else
  2846. mov A, AO
  2847. #endif
  /* I = M >> 1 counts the two-row tiles; none means jump to the M & 1 case at .LL80. */
  2848. sra M, 1, I
  2849. cmp I, 0
  2850. ble,pn %icc, .LL80
  2851. nop
  /* .LL72: per-tile setup for the 2x1 tile (accumulators cc01 and cc02); loop head for I. */
  /* NOTE(review): LDF and FCLR are macros defined outside this chunk; FCLR presumably */
  /* zeroes its register -- confirm. */
  2852. .align 4
  2853. .LL72:
  /* LT/RN: BO restarts at B.  Otherwise (LN first does AORIG -= K << (BASE_SHIFT + 1)): */
  /* AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << BASE_SHIFT). */
  2854. #if defined(LT) || defined(RN)
  2855. mov B, BO
  2856. #else
  2857. #ifdef LN
  2858. sll K, BASE_SHIFT + 1, TEMP1
  2859. sub AORIG, TEMP1, AORIG
  2860. #endif
  2861. sll KK, BASE_SHIFT + 1, TEMP1
  2862. sll KK, BASE_SHIFT + 0, TEMP2
  2863. add AORIG, TEMP1, AO
  2864. add B, TEMP2, BO
  2865. #endif
  /* Preload the operands consumed by the first pass of the unrolled loop at .LL73. */
  2866. LDF [AO + 0 * SIZE], a1
  2867. LDF [AO + 1 * SIZE], a2
  2868. LDF [AO + 2 * SIZE], a3
  2869. LDF [AO + 3 * SIZE], a4
  2870. LDF [BO + 0 * SIZE], b1
  2871. LDF [BO + 1 * SIZE], b2
  2872. LDF [BO + 2 * SIZE], b3
  2873. FCLR (cc01)
  2874. LDF [BO + 3 * SIZE], b4
  2875. FCLR (cc02)
  2876. prefetch [C1 + 2 * SIZE], 3
  /* L = number of four-step passes: KK >> 2 for LT/RN, otherwise (K - KK) >> 2. */
  2877. #if defined(LT) || defined(RN)
  2878. sra KK, 2, L
  2879. #else
  2880. sub K, KK, L
  2881. sra L, 2, L
  2882. #endif
  2883. cmp L, 0
  2884. ble,pn %icc, .LL75
  2885. nop
  /* .LL73: main K loop of the 2x1 tile, unrolled by four. Each pass issues eight FMADDs */
  /* (four into cc01, four into cc02), steps AO by 8 * SIZE and BO by 4 * SIZE, and reloads */
  /* a1-a4 / b1-b4 between the multiply-adds so they are ready for the next pass. */
  /* NOTE(review): presumably FMADD (x, y, z, w) performs w = x * y + z -- macro is defined */
  /* outside this chunk, confirm. */
  2886. .LL73:
  2887. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2888. add L, -1, L
  2889. FMADD (aa1, bb1, cc01, cc01)
  2890. LDF [AO + 4 * SIZE], a1
  2891. FMADD (aa2, bb1, cc02, cc02)
  2892. LDF [AO + 5 * SIZE], a2
  2893. LDF [BO + 4 * SIZE], b1
  /* Sets %icc for the bg at the bottom of the loop. */
  2894. cmp L, 0
  2895. FMADD (aa3, bb2, cc01, cc01)
  2896. LDF [AO + 6 * SIZE], a3
  2897. FMADD (aa4, bb2, cc02, cc02)
  2898. LDF [AO + 7 * SIZE], a4
  2899. LDF [BO + 5 * SIZE], b2
  /* BO is bumped mid-pass: the BO offsets below are relative to the new base. */
  2900. add BO, 4 * SIZE, BO
  /* a1/a2 were reloaded above, so these two FMADDs already use the third k step's A pair. */
  2901. FMADD (aa1, bb3, cc01, cc01)
  2902. LDF [AO + 8 * SIZE], a1
  2903. FMADD (aa2, bb3, cc02, cc02)
  2904. LDF [AO + 9 * SIZE], a2
  2905. LDF [BO + 2 * SIZE], b3
  /* AO is bumped mid-pass: a3/a4 below are loaded relative to the new base. */
  2906. add AO, 8 * SIZE, AO
  2907. FMADD (aa3, bb4, cc01, cc01)
  2908. LDF [AO + 2 * SIZE], a3
  2909. FMADD (aa4, bb4, cc02, cc02)
  2910. LDF [AO + 3 * SIZE], a4
  2911. bg,pt %icc, .LL73
  /* Branch delay slot (no annul bit): executed on both the taken and the exit path. */
  2912. LDF [BO + 3 * SIZE], b4
  /* .LL75/.LL77: K remainder for the 2x1 tile. */
  /* L = KK & 3 when LT or RN is defined, otherwise (K - KK) & 3. */
  /* Each pass of .LL77 handles one k step: two FMADDs, AO += 2 * SIZE, BO += 1 * SIZE. */
  2913. .align 4
  2914. .LL75:
  2915. #if defined(LT) || defined(RN)
  2916. and KK, 3, L
  2917. #else
  2918. sub K, KK, L
  2919. and L, 3, L
  2920. #endif
  2921. cmp L, 0
  /* No remainder: go straight to the solve step at .LL78 (delay slot holds only a nop). */
  2922. ble,a,pn %icc, .LL78
  2923. nop
  2924. .align 4
  2925. .LL77:
  2926. FMADD (aa1, bb1, cc01, cc01)
  2927. LDF [AO + 2 * SIZE], a1
  2928. FMADD (aa2, bb1, cc02, cc02)
  2929. LDF [AO + 3 * SIZE], a2
  2930. LDF [BO + 1 * SIZE], b1
  2931. add L, -1, L
  2932. add AO, 2 * SIZE, AO
  2933. cmp L, 0
  2934. bg,pt %icc, .LL77
  /* Branch delay slot: the BO advance is performed on every pass, including the last. */
  2935. add BO, 1 * SIZE, BO
  /* .LL78: solve step and write-back for the 2x1 tile (c01 -> C1[0], c02 -> C1[1]), */
  /* followed by pointer/KK bookkeeping and the back-edge of the I loop to .LL72. */
  /* NOTE(review): FSUB, FMUL, FNMSUB, LDF and STF are macros defined outside this chunk; */
  /* the arithmetic described below assumes the usual SPARC operand order -- confirm. */
  2936. .align 4
  2937. .LL78:
  /* LN/RT only: re-point AO and BO from AORIG / B using KK - 2 (LN) or KK - 1 (RT). */
  /* AO offset is TEMP1 << (BASE_SHIFT + 1), BO offset is TEMP1 << BASE_SHIFT. */
  2938. #if defined(LN) || defined(RT)
  2939. #ifdef LN
  2940. sub KK, 2, TEMP1
  2941. #else
  2942. sub KK, 1, TEMP1
  2943. #endif
  2944. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2945. sll TEMP1, BASE_SHIFT + 0, TEMP1
  2946. add AORIG, TEMP2, AO
  2947. add B, TEMP1, BO
  2948. #endif
  /* Load the two stored values (from BO for LN/LT, from AO for RN/RT) and combine them */
  /* with the accumulators; presumably c = stored - accumulated -- confirm FSUB operand order. */
  2949. #if defined(LN) || defined(LT)
  2950. LDF [BO + 0 * SIZE], a1
  2951. LDF [BO + 1 * SIZE], a2
  2952. FSUB a1, c01, c01
  2953. FSUB a2, c02, c02
  2954. #else
  2955. LDF [AO + 0 * SIZE], a1
  2956. LDF [AO + 1 * SIZE], a2
  2957. FSUB a1, c01, c01
  2958. FSUB a2, c02, c02
  2959. #endif
  /* LN: c02 is scaled by AO[3] first, c01 is then updated using AO[2] and c02 (FNMSUB) */
  /* and finally scaled by AO[0]. */
  /* NOTE(review): scaling is done with FMUL, so the diagonal entries are presumably */
  /* stored pre-inverted -- confirm against the packing routine. */
  2960. #ifdef LN
  2961. LDF [AO + 3 * SIZE], a1
  2962. LDF [AO + 2 * SIZE], a2
  2963. LDF [AO + 0 * SIZE], a3
  2964. FMUL a1, c02, c02
  2965. FNMSUB (aa2, cc02, cc01, cc01)
  2966. FMUL a3, c01, c01
  2967. #endif
  /* LT: the mirror image -- c01 scaled by AO[0], c02 updated using AO[1] and c01, */
  /* then scaled by AO[3]. */
  2968. #ifdef LT
  2969. LDF [AO + 0 * SIZE], a1
  2970. LDF [AO + 1 * SIZE], a2
  2971. LDF [AO + 3 * SIZE], a3
  2972. FMUL a1, c01, c01
  2973. FNMSUB (aa2, cc01, cc02, cc02)
  2974. FMUL a3, c02, c02
  2975. #endif
  /* RN/RT: both results are scaled by the single entry at BO[0]. */
  2976. #if defined(RN) || defined(RT)
  2977. LDF [BO + 0 * SIZE], a1
  2978. FMUL a1, c01, c01
  2979. FMUL a1, c02, c02
  2980. #endif
  /* LN: step C1 back by two elements before storing. */
  2981. #ifdef LN
  2982. add C1, -2 * SIZE, C1
  2983. #endif
  /* Write the results back into the buffer they were loaded from, and into C1. */
  2984. #if defined(LN) || defined(LT)
  2985. STF c01, [BO + 0 * SIZE]
  2986. STF c02, [BO + 1 * SIZE]
  2987. #else
  2988. STF c01, [AO + 0 * SIZE]
  2989. STF c02, [AO + 1 * SIZE]
  2990. #endif
  2991. STF c01, [C1 + 0 * SIZE]
  2992. STF c02, [C1 + 1 * SIZE]
  /* All variants except LN advance C1 past the two elements just written. */
  2993. #ifndef LN
  2994. add C1, 2 * SIZE, C1
  2995. #endif
  /* RT: AORIG += K << (BASE_SHIFT + 1). */
  2996. #ifdef RT
  2997. sll K, BASE_SHIFT + 1, TEMP1
  2998. add AORIG, TEMP1, AORIG
  2999. #endif
  /* LT/RN: advance AO by (K - KK) << (BASE_SHIFT + 1) and BO by (K - KK) << BASE_SHIFT. */
  3000. #if defined(LT) || defined(RN)
  3001. sub K, KK, TEMP1
  3002. sll TEMP1, BASE_SHIFT + 1, TEMP2
  3003. sll TEMP1, BASE_SHIFT + 0, TEMP1
  3004. add AO, TEMP2, AO
  3005. add BO, TEMP1, BO
  3006. #endif
  /* KK moves by two for this tile: up for LT, down for LN. */
  3007. #ifdef LT
  3008. add KK, 2, KK
  3009. #endif
  3010. #ifdef LN
  3011. sub KK, 2, KK
  3012. #endif
  /* Next two-row tile while I > 0. */
  3013. add I, -1, I
  3014. cmp I, 0
  3015. bg,pt %icc, .LL72
  3016. nop
  /* .LL80: setup for the 1x1 tile (single accumulator cc01). Skipped (branch to .LL89) */
  /* when M & 1 is zero. */
  /* NOTE(review): LDF and FCLR are macros defined outside this chunk; FCLR presumably */
  /* zeroes its register -- confirm. */
  3017. .align 4
  3018. .LL80:
  3019. and M, 1, I
  3020. cmp I, 0
  3021. ble,pn %icc, .LL89
  3022. nop
  /* LT/RN: BO restarts at B.  Otherwise (LN first does AORIG -= K << BASE_SHIFT): */
  /* AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT). */
  3023. #if defined(LT) || defined(RN)
  3024. mov B, BO
  3025. #else
  3026. #ifdef LN
  3027. sll K, BASE_SHIFT + 0, TEMP1
  3028. sub AORIG, TEMP1, AORIG
  3029. #endif
  3030. sll KK, BASE_SHIFT + 0, TEMP1
  3031. sll KK, BASE_SHIFT + 0, TEMP2
  3032. add AORIG, TEMP1, AO
  3033. add B, TEMP2, BO
  3034. #endif
  /* Preload the operands consumed by the first pass of the unrolled loop at .LL83. */
  3035. LDF [AO + 0 * SIZE], a1
  3036. LDF [BO + 0 * SIZE], b1
  3037. LDF [AO + 1 * SIZE], a2
  3038. LDF [BO + 1 * SIZE], b2
  3039. LDF [AO + 2 * SIZE], a3
  3040. LDF [BO + 2 * SIZE], b3
  3041. LDF [AO + 3 * SIZE], a4
  3042. LDF [BO + 3 * SIZE], b4
  /* L = number of four-step passes: KK >> 2 for LT/RN, otherwise (K - KK) >> 2. */
  3043. #if defined(LT) || defined(RN)
  3044. sra KK, 2, L
  3045. #else
  3046. sub K, KK, L
  3047. sra L, 2, L
  3048. #endif
  3049. cmp L, 0
  3050. ble,pn %icc, .LL85
  /* Branch delay slot (no annul bit): cc01 is cleared whether or not the branch is taken. */
  3051. FCLR (cc01)
  /* .LL83: main K loop of the 1x1 tile, unrolled by four. Each pass issues four FMADDs */
  /* into cc01, reloads a1-a4 / b1-b4 for the next pass, and steps AO and BO by 4 * SIZE. */
  /* NOTE(review): presumably FMADD (x, y, z, w) performs w = x * y + z -- macro is defined */
  /* outside this chunk, confirm. */
  3052. .align 4
  3053. .LL83:
  3054. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  3055. add L, -1, L
  3056. FMADD (aa1, bb1, cc01, cc01)
  3057. LDF [AO + 4 * SIZE], a1
  3058. LDF [BO + 4 * SIZE], b1
  3059. FMADD (aa2, bb2, cc01, cc01)
  3060. LDF [AO + 5 * SIZE], a2
  3061. LDF [BO + 5 * SIZE], b2
  3062. FMADD (aa3, bb3, cc01, cc01)
  3063. LDF [AO + 6 * SIZE], a3
  3064. LDF [BO + 6 * SIZE], b3
  3065. FMADD (aa4, bb4, cc01, cc01)
  3066. LDF [AO + 7 * SIZE], a4
  3067. LDF [BO + 7 * SIZE], b4
  3068. add AO, 4 * SIZE, AO
  3069. cmp L, 0
  3070. bg,pt %icc, .LL83
  /* Branch delay slot: the BO advance is performed on every pass, including the last. */
  3071. add BO, 4 * SIZE, BO
  /* .LL85/.LL87: K remainder for the 1x1 tile. */
  /* L = KK & 3 when LT or RN is defined, otherwise (K - KK) & 3. */
  /* Each pass of .LL87 handles one k step: one FMADD, AO += 1 * SIZE, BO += 1 * SIZE. */
  3072. .align 4
  3073. .LL85:
  3074. #if defined(LT) || defined(RN)
  3075. and KK, 3, L
  3076. #else
  3077. sub K, KK, L
  3078. and L, 3, L
  3079. #endif
  3080. cmp L, 0
  /* No remainder: go straight to the solve step at .LL88 (delay slot holds only a nop). */
  3081. ble,a,pn %icc, .LL88
  3082. nop
  3083. .align 4
  3084. .LL87:
  3085. FMADD (aa1, bb1, cc01, cc01)
  3086. LDF [AO + 1 * SIZE], a1
  3087. LDF [BO + 1 * SIZE], b1
  3088. add AO, 1 * SIZE, AO
  3089. add L, -1, L
  3090. cmp L, 0
  3091. bg,pt %icc, .LL87
  /* Branch delay slot: the BO advance is performed on every pass, including the last. */
  3092. add BO, 1 * SIZE, BO
  /* .LL88: solve step and write-back for the 1x1 tile (c01 -> C1[0]). */
  /* NOTE(review): FSUB, FMUL, LDF and STF are macros defined outside this chunk; the */
  /* arithmetic described below assumes the usual SPARC operand order -- confirm. */
  3093. .align 4
  3094. .LL88:
  /* LN/RT only: AO = AORIG + ((KK - 1) << BASE_SHIFT), BO = B + ((KK - 1) << BASE_SHIFT). */
  /* Both arms of the inner #ifdef LN are identical here. */
  3095. #if defined(LN) || defined(RT)
  3096. #ifdef LN
  3097. sub KK, 1, TEMP1
  3098. #else
  3099. sub KK, 1, TEMP1
  3100. #endif
  3101. sll TEMP1, BASE_SHIFT + 0, TEMP2
  3102. sll TEMP1, BASE_SHIFT + 0, TEMP1
  3103. add AORIG, TEMP2, AO
  3104. add B, TEMP1, BO
  3105. #endif
  /* Load the stored value (from BO for LN/LT, from AO for RN/RT) and combine it with */
  /* the accumulator; presumably c01 = stored - accumulated -- confirm FSUB operand order. */
  3106. #if defined(LN) || defined(LT)
  3107. LDF [BO + 0 * SIZE], a1
  3108. FSUB a1, c01, c01
  3109. #else
  3110. LDF [AO + 0 * SIZE], a1
  3111. FSUB a1, c01, c01
  3112. #endif
  /* Scale by the single triangular entry: AO[0] for LN/LT, BO[0] for RN/RT. */
  /* NOTE(review): multiplied rather than divided, so the entry is presumably stored */
  /* pre-inverted -- confirm against the packing routine. */
  3113. #if defined(LN) || defined(LT)
  3114. LDF [AO + 0 * SIZE], a1
  3115. FMUL a1, c01, c01
  3116. #endif
  3117. #if defined(RN) || defined(RT)
  3118. LDF [BO + 0 * SIZE], a1
  3119. FMUL a1, c01, c01
  3120. #endif
  /* LN: step C1 back by one element before storing. */
  3121. #ifdef LN
  3122. add C1, -1 * SIZE, C1
  3123. #endif
  /* Write the result back into the buffer it was loaded from, and into C1. */
  3124. #if defined(LN) || defined(LT)
  3125. STF c01, [BO + 0 * SIZE]
  3126. #else
  3127. STF c01, [AO + 0 * SIZE]
  3128. #endif
  3129. STF c01, [C1 + 0 * SIZE]
  /* RT: AORIG += K << BASE_SHIFT. */
  3130. #ifdef RT
  3131. sll K, BASE_SHIFT + 0, TEMP1
  3132. add AORIG, TEMP1, AORIG
  3133. #endif
  /* LT/RN: advance both AO and BO by (K - KK) << BASE_SHIFT. */
  3134. #if defined(LT) || defined(RN)
  3135. sub K, KK, TEMP1
  3136. sll TEMP1, BASE_SHIFT + 0, TEMP2
  3137. sll TEMP1, BASE_SHIFT + 0, TEMP1
  3138. add AO, TEMP2, AO
  3139. add BO, TEMP1, BO
  3140. #endif
  /* KK moves by one for this tile: up for LT, down for LN. */
  3141. #ifdef LT
  3142. add KK, 1, KK
  3143. #endif
  3144. #ifdef LN
  3145. sub KK, 1, KK
  3146. #endif
  /* .LL89: B / KK bookkeeping for the single-column case, falling through to .LL999. */
  /* LN: B += K << BASE_SHIFT.  LT/RN: B takes the current BO. */
  /* RN: KK += 1.  RT: KK -= 1. */
  3147. .align 4
  3148. .LL89:
  3149. #ifdef LN
  3150. sll K, BASE_SHIFT, TEMP1
  3151. add B, TEMP1, B
  3152. #endif
  3153. #if defined(LT) || defined(RN)
  3154. mov BO, B
  3155. #endif
  3156. #ifdef RN
  3157. add KK, 1, KK
  3158. #endif
  3159. #ifdef RT
  3160. sub KK, 1, KK
  3161. #endif
  /* .LL999: common exit path of the routine. */
  3162. .align 4
  3163. .LL999:
  /* TRMMKERNEL builds only: reload %g1-%g4 from the stack frame, using 32-bit ld at */
  /* STACK_START + 8..20 or 64-bit ldx at STACK_START + 32..56 depending on __64BIT__. */
  /* NOTE(review): the matching saves are outside this chunk -- confirm the offsets there. */
  3164. #ifdef TRMMKERNEL
  3165. #ifndef __64BIT__
  3166. ld [%sp + STACK_START + 8], %g1
  3167. ld [%sp + STACK_START + 12], %g2
  3168. ld [%sp + STACK_START + 16], %g3
  3169. ld [%sp + STACK_START + 20], %g4
  3170. #else
  3171. ldx [%sp + STACK_START + 32], %g1
  3172. ldx [%sp + STACK_START + 40], %g2
  3173. ldx [%sp + STACK_START + 48], %g3
  3174. ldx [%sp + STACK_START + 56], %g4
  3175. #endif
  3176. #endif
  /* Return to the caller; the clr in the delay slot zeroes %o0. With SPARC V9 RETURN the */
  /* delay instruction runs after the register window is restored, so this is presumably */
  /* the caller-visible return value 0 -- confirm. */
  3177. return %i7 + 8
  3178. clr %o0
  /* EPILOGUE is a macro defined outside this chunk. */
  3179. EPILOGUE