You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_RT_2x8.S 73 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register and operand-code aliases used by the kernel body that
     follows this preamble. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Software-prefetch tuning for the A stream.  The inner loops issue
     prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY,
     so APREFETCHSIZE is a look-ahead distance in elements and
     APREFETCH_CATEGORY is the function code given to prefetch. */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /* Size operands held in input registers %i0-%i2.  The code below
     derives the I and J loop counts from M and N, and uses K for the
     inner-loop length and for panel strides. */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /* A and B operand pointers.  In a 32-bit DOUBLE build A is %i5 and
     the prologue loads B from the stack; presumably this is because
     the preceding scalar argument takes two registers in that
     configuration (TODO: confirm against the caller).  In every other
     build A is %i4 and B is %i5. */
  45. #if defined(DOUBLE) && !defined(__64BIT__)
  46. #define A %i5
  47. #define B %i4
  48. #else
  49. #define A %i4
  50. #define B %i5
  51. #endif
  /* Output pointer and the stride between consecutive C column
     pointers.  Both are loaded from the stack in the prologue, and LDC
     is then shifted left by BASE_SHIFT. */
  52. #define C %o4
  53. #define LDC %o5
  /* AO and BO are the working pointers that walk the A and B panels
     inside the loops. */
  54. #define AO %l0
  55. #define BO %l1
  /* Loop state: I counts row blocks, J is set from N to select the
     column block, L counts inner-loop iterations. */
  56. #define I %l2
  57. #define J %l3
  58. #define L %l4
  /* Per-column output pointers.  The one- and two-column paths use only
     C1 and C2; C3-C8 presumably serve the wider column blocks (not
     confirmed from the code reviewed here). */
  59. #define C1 %o0
  60. #define C2 %o1
  61. #define C3 %o2
  62. #define C4 %o3
  63. #define C5 %l5
  64. #define C6 %l6
  65. #define C7 %l7
  66. #define C8 %i3
  /* OFFSET is loaded from the stack in the prologue and KK is the
     running position derived from it.  TEMP1 and TEMP2 are scratch
     registers for address arithmetic.  All four are global registers
     (%g1-%g4), which the prologue stores to the stack. */
  67. #define OFFSET %g1
  68. #define KK %g2
  69. #define TEMP1 %g3
  70. #define TEMP2 %g4
  /* Base of the current A panel in the LN and RT variants; AO is
     recomputed from it for each block. */
  71. #define AORIG %o7
  72. #ifdef DOUBLE
  /* Double precision: accumulators c01-c16 use the even registers
     %f0-%f30, A operands a1-a5 use %f32-%f40 and B operands b1-b9 use
     %f42-%f58. */
  73. #define c01 %f0
  74. #define c02 %f2
  75. #define c03 %f4
  76. #define c04 %f6
  77. #define c05 %f8
  78. #define c06 %f10
  79. #define c07 %f12
  80. #define c08 %f14
  81. #define c09 %f16
  82. #define c10 %f18
  83. #define c11 %f20
  84. #define c12 %f22
  85. #define c13 %f24
  86. #define c14 %f26
  87. #define c15 %f28
  88. #define c16 %f30
  89. #define a1 %f32
  90. #define a2 %f34
  91. #define a3 %f36
  92. #define a4 %f38
  93. #define a5 %f40
  94. #define b1 %f42
  95. #define b2 %f44
  96. #define b3 %f46
  97. #define b4 %f48
  98. #define b5 %f50
  99. #define b6 %f52
  100. #define b7 %f54
  101. #define b8 %f56
  102. #define b9 %f58
  /* Numeric register codes passed to the FCLR, FMADD and FNMSUB macros.
     Each ccNN equals the register number of cNN.  The odd aaN and bbN
     values appear to be the SPARC V9 5-bit encodings of %f32 and above,
     where bit 5 of the register number is stored in bit 0 (for example
     %f32 becomes 1 and %f42 becomes 11); verify against the macro
     definitions before changing any of them. */
  103. #define cc01 0
  104. #define cc02 2
  105. #define cc03 4
  106. #define cc04 6
  107. #define cc05 8
  108. #define cc06 10
  109. #define cc07 12
  110. #define cc08 14
  111. #define cc09 16
  112. #define cc10 18
  113. #define cc11 20
  114. #define cc12 22
  115. #define cc13 24
  116. #define cc14 26
  117. #define cc15 28
  118. #define cc16 30
  119. #define aa1 1
  120. #define aa2 3
  121. #define aa3 5
  122. #define aa4 7
  123. #define aa5 9
  124. #define bb1 11
  125. #define bb2 13
  126. #define bb3 15
  127. #define bb4 17
  128. #define bb5 19
  129. #define bb6 21
  130. #define bb7 23
  131. #define bb8 25
  132. #define bb9 27
  133. #else
  /* Single precision: consecutive registers %f0-%f29 are used in the
     same c / a / b order as the double-precision layout. */
  134. #define c01 %f0
  135. #define c02 %f1
  136. #define c03 %f2
  137. #define c04 %f3
  138. #define c05 %f4
  139. #define c06 %f5
  140. #define c07 %f6
  141. #define c08 %f7
  142. #define c09 %f8
  143. #define c10 %f9
  144. #define c11 %f10
  145. #define c12 %f11
  146. #define c13 %f12
  147. #define c14 %f13
  148. #define c15 %f14
  149. #define c16 %f15
  150. #define a1 %f16
  151. #define a2 %f17
  152. #define a3 %f18
  153. #define a4 %f19
  154. #define a5 %f20
  155. #define b1 %f21
  156. #define b2 %f22
  157. #define b3 %f23
  158. #define b4 %f24
  159. #define b5 %f25
  160. #define b6 %f26
  161. #define b7 %f27
  162. #define b8 %f28
  163. #define b9 %f29
  /* Numeric register codes for the FCLR, FMADD and FNMSUB macros.  In
     this layout every code equals the register number of the matching
     alias (cc01 is %f0, aa1 is %f16, bb1 is %f21, and so on). */
  164. #define cc01 0
  165. #define cc02 1
  166. #define cc03 2
  167. #define cc04 3
  168. #define cc05 4
  169. #define cc06 5
  170. #define cc07 6
  171. #define cc08 7
  172. #define cc09 8
  173. #define cc10 9
  174. #define cc11 10
  175. #define cc12 11
  176. #define cc13 12
  177. #define cc14 13
  178. #define cc15 14
  179. #define cc16 15
  180. #define aa1 16
  181. #define aa2 17
  182. #define aa3 18
  183. #define aa4 19
  184. #define aa5 20
  185. #define bb1 21
  186. #define bb2 22
  187. #define bb3 23
  188. #define bb4 24
  189. #define bb5 25
  190. #define bb6 26
  191. #define bb7 27
  192. #define bb8 28
  193. #define bb9 29
  194. #endif
  /* Declare %g2 and %g3 (aliased above as KK and TEMP1) to the
     assembler as scratch registers. */
  195. .register %g2, #scratch
  196. .register %g3, #scratch
  197. PROLOGUE
  198. SAVESP
  199. nop
  200. #ifndef __64BIT__
  201. #ifdef DOUBLE
  202. ld [%sp + STACK_START + 28], B
  203. ld [%sp + STACK_START + 32], C
  204. ld [%sp + STACK_START + 36], LDC
  205. ld [%sp + STACK_START + 40], OFFSET
  206. #else
  207. ld [%sp + STACK_START + 28], C
  208. ld [%sp + STACK_START + 32], LDC
  209. ld [%sp + STACK_START + 36], OFFSET
  210. #endif
  211. st %g1, [%sp + STACK_START + 8]
  212. st %g2, [%sp + STACK_START + 12]
  213. st %g3, [%sp + STACK_START + 16]
  214. st %g4, [%sp + STACK_START + 20]
  215. #else
  216. ldx [%sp+ STACK_START + 56], C
  217. ldx [%sp+ STACK_START + 64], LDC
  218. ldx [%sp+ STACK_START + 72], OFFSET
  219. stx %g1, [%sp + STACK_START + 32]
  220. stx %g2, [%sp + STACK_START + 40]
  221. stx %g3, [%sp + STACK_START + 48]
  222. stx %g4, [%sp + STACK_START + 56]
  223. #endif
  224. #if defined(TRMMKERNEL) && !defined(LEFT)
  225. neg OFFSET, KK
  226. #endif
  227. sll LDC, BASE_SHIFT, LDC
  228. #ifdef LN
  229. smul M, K, TEMP1
  230. sll TEMP1, BASE_SHIFT, TEMP1
  231. add A, TEMP1, A
  232. sll M, BASE_SHIFT, TEMP1
  233. add C, TEMP1, C
  234. #endif
  235. #ifdef RN
  236. neg OFFSET, KK
  237. #endif
  238. #ifdef RT
  239. smul N, K, TEMP1
  240. sll TEMP1, BASE_SHIFT, TEMP1
  241. add B, TEMP1, B
  242. smul N, LDC, TEMP1
  243. add C, TEMP1, C
  244. sub N, OFFSET, KK
  245. #endif
  246. and N, 1, J
  247. cmp J, 0
  248. ble,pn %icc, .LL50
  249. nop
  250. #ifdef RT
  251. sll K, BASE_SHIFT, TEMP1
  252. sub B, TEMP1, B
  253. #endif
  254. #ifndef RT
  255. mov C, C1
  256. add C1, LDC, C
  257. #else
  258. sub C, LDC, C1
  259. sub C, LDC, C
  260. #endif
  261. #ifdef LN
  262. add M, OFFSET, KK
  263. #endif
  264. #ifdef LT
  265. mov OFFSET, KK
  266. #endif
  267. #if defined(LN) || defined(RT)
  268. mov A, AORIG
  269. #else
  270. mov A, AO
  271. #endif
  272. sra M, 1, I
  273. cmp I, 0
  274. ble,pn %icc, .LL80
  275. nop
  276. .align 4
  277. .LL72:
  278. #if defined(LT) || defined(RN)
  279. mov B, BO
  280. #else
  281. #ifdef LN
  282. sll K, BASE_SHIFT + 1, TEMP1
  283. sub AORIG, TEMP1, AORIG
  284. #endif
  285. sll KK, BASE_SHIFT + 1, TEMP1
  286. sll KK, BASE_SHIFT + 0, TEMP2
  287. add AORIG, TEMP1, AO
  288. add B, TEMP2, BO
  289. #endif
  290. LDF [AO + 0 * SIZE], a1
  291. LDF [AO + 1 * SIZE], a2
  292. LDF [AO + 2 * SIZE], a3
  293. LDF [AO + 3 * SIZE], a4
  294. LDF [BO + 0 * SIZE], b1
  295. LDF [BO + 1 * SIZE], b2
  296. LDF [BO + 2 * SIZE], b3
  297. FCLR (cc01)
  298. LDF [BO + 3 * SIZE], b4
  299. FCLR (cc02)
  300. prefetch [C1 + 2 * SIZE], 3
  301. #if defined(LT) || defined(RN)
  302. sra KK, 2, L
  303. #else
  304. sub K, KK, L
  305. sra L, 2, L
  306. #endif
  307. cmp L, 0
  308. ble,pn %icc, .LL75
  309. nop
  310. .LL73:
  311. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  312. add L, -1, L
  313. FMADD (aa1, bb1, cc01, cc01)
  314. LDF [AO + 4 * SIZE], a1
  315. FMADD (aa2, bb1, cc02, cc02)
  316. LDF [AO + 5 * SIZE], a2
  317. LDF [BO + 4 * SIZE], b1
  318. cmp L, 0
  319. FMADD (aa3, bb2, cc01, cc01)
  320. LDF [AO + 6 * SIZE], a3
  321. FMADD (aa4, bb2, cc02, cc02)
  322. LDF [AO + 7 * SIZE], a4
  323. LDF [BO + 5 * SIZE], b2
  324. add BO, 4 * SIZE, BO
  325. FMADD (aa1, bb3, cc01, cc01)
  326. LDF [AO + 8 * SIZE], a1
  327. FMADD (aa2, bb3, cc02, cc02)
  328. LDF [AO + 9 * SIZE], a2
  329. LDF [BO + 2 * SIZE], b3
  330. add AO, 8 * SIZE, AO
  331. FMADD (aa3, bb4, cc01, cc01)
  332. LDF [AO + 2 * SIZE], a3
  333. FMADD (aa4, bb4, cc02, cc02)
  334. LDF [AO + 3 * SIZE], a4
  335. bg,pt %icc, .LL73
  336. LDF [BO + 3 * SIZE], b4
  337. .align 4
  338. .LL75:
  339. #if defined(LT) || defined(RN)
  340. and KK, 3, L
  341. #else
  342. sub K, KK, L
  343. and L, 3, L
  344. #endif
  345. cmp L, 0
  346. ble,a,pn %icc, .LL78
  347. nop
  348. .align 4
  349. .LL77:
  350. FMADD (aa1, bb1, cc01, cc01)
  351. LDF [AO + 2 * SIZE], a1
  352. FMADD (aa2, bb1, cc02, cc02)
  353. LDF [AO + 3 * SIZE], a2
  354. LDF [BO + 1 * SIZE], b1
  355. add L, -1, L
  356. add AO, 2 * SIZE, AO
  357. cmp L, 0
  358. bg,pt %icc, .LL77
  359. add BO, 1 * SIZE, BO
  360. .align 4
  361. .LL78:
  362. #if defined(LN) || defined(RT)
  363. #ifdef LN
  364. sub KK, 2, TEMP1
  365. #else
  366. sub KK, 1, TEMP1
  367. #endif
  368. sll TEMP1, BASE_SHIFT + 1, TEMP2
  369. sll TEMP1, BASE_SHIFT + 0, TEMP1
  370. add AORIG, TEMP2, AO
  371. add B, TEMP1, BO
  372. #endif
  373. #if defined(LN) || defined(LT)
  374. LDF [BO + 0 * SIZE], a1
  375. LDF [BO + 1 * SIZE], a2
  376. FSUB a1, c01, c01
  377. FSUB a2, c02, c02
  378. #else
  379. LDF [AO + 0 * SIZE], a1
  380. LDF [AO + 1 * SIZE], a2
  381. FSUB a1, c01, c01
  382. FSUB a2, c02, c02
  383. #endif
  384. #ifdef LN
  385. LDF [AO + 3 * SIZE], a1
  386. LDF [AO + 2 * SIZE], a2
  387. LDF [AO + 0 * SIZE], a3
  388. FMUL a1, c02, c02
  389. FNMSUB (aa2, cc02, cc01, cc01)
  390. FMUL a3, c01, c01
  391. #endif
  392. #ifdef LT
  393. LDF [AO + 0 * SIZE], a1
  394. LDF [AO + 1 * SIZE], a2
  395. LDF [AO + 3 * SIZE], a3
  396. FMUL a1, c01, c01
  397. FNMSUB (aa2, cc01, cc02, cc02)
  398. FMUL a3, c02, c02
  399. #endif
  400. #if defined(RN) || defined(RT)
  401. LDF [BO + 0 * SIZE], a1
  402. FMUL a1, c01, c01
  403. FMUL a1, c02, c02
  404. #endif
  405. #ifdef LN
  406. add C1, -2 * SIZE, C1
  407. #endif
  408. #if defined(LN) || defined(LT)
  409. STF c01, [BO + 0 * SIZE]
  410. STF c02, [BO + 1 * SIZE]
  411. #else
  412. STF c01, [AO + 0 * SIZE]
  413. STF c02, [AO + 1 * SIZE]
  414. #endif
  415. STF c01, [C1 + 0 * SIZE]
  416. STF c02, [C1 + 1 * SIZE]
  417. #ifndef LN
  418. add C1, 2 * SIZE, C1
  419. #endif
  420. #ifdef RT
  421. sll K, BASE_SHIFT + 1, TEMP1
  422. add AORIG, TEMP1, AORIG
  423. #endif
  424. #if defined(LT) || defined(RN)
  425. sub K, KK, TEMP1
  426. sll TEMP1, BASE_SHIFT + 1, TEMP2
  427. sll TEMP1, BASE_SHIFT + 0, TEMP1
  428. add AO, TEMP2, AO
  429. add BO, TEMP1, BO
  430. #endif
  431. #ifdef LT
  432. add KK, 2, KK
  433. #endif
  434. #ifdef LN
  435. sub KK, 2, KK
  436. #endif
  437. add I, -1, I
  438. cmp I, 0
  439. bg,pt %icc, .LL72
  440. nop
  441. .align 4
  442. .LL80:
  443. and M, 1, I
  444. cmp I, 0
  445. ble,pn %icc, .LL89
  446. nop
  447. #if defined(LT) || defined(RN)
  448. mov B, BO
  449. #else
  450. #ifdef LN
  451. sll K, BASE_SHIFT + 0, TEMP1
  452. sub AORIG, TEMP1, AORIG
  453. #endif
  454. sll KK, BASE_SHIFT + 0, TEMP1
  455. sll KK, BASE_SHIFT + 0, TEMP2
  456. add AORIG, TEMP1, AO
  457. add B, TEMP2, BO
  458. #endif
  459. LDF [AO + 0 * SIZE], a1
  460. LDF [BO + 0 * SIZE], b1
  461. LDF [AO + 1 * SIZE], a2
  462. LDF [BO + 1 * SIZE], b2
  463. LDF [AO + 2 * SIZE], a3
  464. LDF [BO + 2 * SIZE], b3
  465. LDF [AO + 3 * SIZE], a4
  466. LDF [BO + 3 * SIZE], b4
  467. #if defined(LT) || defined(RN)
  468. sra KK, 2, L
  469. #else
  470. sub K, KK, L
  471. sra L, 2, L
  472. #endif
  473. cmp L, 0
  474. ble,pn %icc, .LL85
  475. FCLR (cc01)
  476. .align 4
  477. .LL83:
  478. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  479. add L, -1, L
  480. FMADD (aa1, bb1, cc01, cc01)
  481. LDF [AO + 4 * SIZE], a1
  482. LDF [BO + 4 * SIZE], b1
  483. FMADD (aa2, bb2, cc01, cc01)
  484. LDF [AO + 5 * SIZE], a2
  485. LDF [BO + 5 * SIZE], b2
  486. FMADD (aa3, bb3, cc01, cc01)
  487. LDF [AO + 6 * SIZE], a3
  488. LDF [BO + 6 * SIZE], b3
  489. FMADD (aa4, bb4, cc01, cc01)
  490. LDF [AO + 7 * SIZE], a4
  491. LDF [BO + 7 * SIZE], b4
  492. add AO, 4 * SIZE, AO
  493. cmp L, 0
  494. bg,pt %icc, .LL83
  495. add BO, 4 * SIZE, BO
  496. .align 4
  497. .LL85:
  498. #if defined(LT) || defined(RN)
  499. and KK, 3, L
  500. #else
  501. sub K, KK, L
  502. and L, 3, L
  503. #endif
  504. cmp L, 0
  505. ble,a,pn %icc, .LL88
  506. nop
  507. .align 4
  508. .LL87:
  509. FMADD (aa1, bb1, cc01, cc01)
  510. LDF [AO + 1 * SIZE], a1
  511. LDF [BO + 1 * SIZE], b1
  512. add AO, 1 * SIZE, AO
  513. add L, -1, L
  514. cmp L, 0
  515. bg,pt %icc, .LL87
  516. add BO, 1 * SIZE, BO
  517. .align 4
  518. .LL88:
  519. #if defined(LN) || defined(RT)
  520. #ifdef LN
  521. sub KK, 1, TEMP1
  522. #else
  523. sub KK, 1, TEMP1
  524. #endif
  525. sll TEMP1, BASE_SHIFT + 0, TEMP2
  526. sll TEMP1, BASE_SHIFT + 0, TEMP1
  527. add AORIG, TEMP2, AO
  528. add B, TEMP1, BO
  529. #endif
  530. #if defined(LN) || defined(LT)
  531. LDF [BO + 0 * SIZE], a1
  532. FSUB a1, c01, c01
  533. #else
  534. LDF [AO + 0 * SIZE], a1
  535. FSUB a1, c01, c01
  536. #endif
  537. #if defined(LN) || defined(LT)
  538. LDF [AO + 0 * SIZE], a1
  539. FMUL a1, c01, c01
  540. #endif
  541. #if defined(RN) || defined(RT)
  542. LDF [BO + 0 * SIZE], a1
  543. FMUL a1, c01, c01
  544. #endif
  545. #ifdef LN
  546. add C1, -1 * SIZE, C1
  547. #endif
  548. #if defined(LN) || defined(LT)
  549. STF c01, [BO + 0 * SIZE]
  550. #else
  551. STF c01, [AO + 0 * SIZE]
  552. #endif
  553. STF c01, [C1 + 0 * SIZE]
  554. #ifdef RT
  555. sll K, BASE_SHIFT + 0, TEMP1
  556. add AORIG, TEMP1, AORIG
  557. #endif
  558. #if defined(LT) || defined(RN)
  559. sub K, KK, TEMP1
  560. sll TEMP1, BASE_SHIFT + 0, TEMP2
  561. sll TEMP1, BASE_SHIFT + 0, TEMP1
  562. add AO, TEMP2, AO
  563. add BO, TEMP1, BO
  564. #endif
  565. #ifdef LT
  566. add KK, 1, KK
  567. #endif
  568. #ifdef LN
  569. sub KK, 1, KK
  570. #endif
  571. .align 4
  572. .LL89:
  573. #ifdef LN
  574. sll K, BASE_SHIFT, TEMP1
  575. add B, TEMP1, B
  576. #endif
  577. #if defined(LT) || defined(RN)
  578. mov BO, B
  579. #endif
  580. #ifdef RN
  581. add KK, 1, KK
  582. #endif
  583. #ifdef RT
  584. sub KK, 1, KK
  585. #endif
  586. .align 4
  587. .LL50:
  588. and N, 2, J
  589. cmp J, 0
  590. ble,pn %icc, .LL30
  591. nop
  592. #ifdef RT
  593. sll K, BASE_SHIFT + 1, TEMP1
  594. sub B, TEMP1, B
  595. #endif
  596. #ifndef RT
  597. mov C, C1
  598. add C, LDC, C2
  599. add C2, LDC, C
  600. #else
  601. sub C, LDC, C2
  602. sub C2, LDC, C1
  603. sub C2, LDC, C
  604. #endif
  605. #ifdef LN
  606. add M, OFFSET, KK
  607. #endif
  608. #ifdef LT
  609. mov OFFSET, KK
  610. #endif
  611. #if defined(LN) || defined(RT)
  612. mov A, AORIG
  613. #else
  614. mov A, AO
  615. #endif
  616. sra M, 1, I
  617. cmp I, 0
  618. ble,pn %icc, .LL60
  619. nop
  620. .align 4
  621. .LL52:
  622. #if defined(LT) || defined(RN)
  623. mov B, BO
  624. #else
  625. #ifdef LN
  626. sll K, BASE_SHIFT + 1, TEMP1
  627. sub AORIG, TEMP1, AORIG
  628. #endif
  629. sll KK, BASE_SHIFT + 1, TEMP1
  630. sll KK, BASE_SHIFT + 1, TEMP2
  631. add AORIG, TEMP1, AO
  632. add B, TEMP2, BO
  633. #endif
  634. LDF [AO + 0 * SIZE], a1
  635. LDF [AO + 1 * SIZE], a2
  636. LDF [AO + 2 * SIZE], a3
  637. LDF [AO + 3 * SIZE], a4
  638. LDF [BO + 0 * SIZE], b1
  639. LDF [BO + 1 * SIZE], b2
  640. LDF [BO + 2 * SIZE], b3
  641. FCLR (cc01)
  642. LDF [BO + 3 * SIZE], b4
  643. FCLR (cc02)
  644. LDF [BO + 4 * SIZE], b5
  645. FCLR (cc03)
  646. LDF [BO + 5 * SIZE], b6
  647. FCLR (cc04)
  648. LDF [BO + 6 * SIZE], b7
  649. FCLR (cc05)
  650. LDF [BO + 7 * SIZE], b8
  651. FCLR (cc06)
  652. prefetch [C1 + 2 * SIZE], 3
  653. FCLR (cc07)
  654. prefetch [C2 + 2 * SIZE], 3
  655. FCLR (cc08)
  656. #if defined(LT) || defined(RN)
  657. sra KK, 2, L
  658. #else
  659. sub K, KK, L
  660. sra L, 2, L
  661. #endif
  662. cmp L, 0
  663. ble,pn %icc, .LL55
  664. nop
  665. .align 4
  /* 2x2 main loop, four k-steps per pass: each step pairs two A values with two B values, */
  /* so one pass consumes AO[0..7] and BO[0..7] and then advances both pointers by 8 * SIZE. */
  /* Loads for the next pass are interleaved with the FMADDs of the current one. */
  /* NOTE(review): FMADD (x, y, z, w) presumably computes w = x * y + z, and aaN/bbN/ccNN */
  /* presumably name the same registers as aN/bN/cNN - the macros are outside this chunk. */
  666. .LL53:
  /* Step 1: a1, a2 with b1, b2. */
  667. FMADD (aa1, bb1, cc01, cc01)
  668. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  669. FMADD (aa2, bb1, cc02, cc02)
  670. LDF [BO + 8 * SIZE], b1
  671. FMADD (aa1, bb2, cc03, cc03)
  672. LDF [AO + 4 * SIZE], a1
  673. FMADD (aa2, bb2, cc04, cc04)
  674. LDF [AO + 5 * SIZE], a2
  /* Step 2: a3, a4 with b3, b4. */
  675. FMADD (aa3, bb3, cc01, cc01)
  676. LDF [BO + 9 * SIZE], b2
  677. FMADD (aa4, bb3, cc02, cc02)
  678. LDF [BO + 10 * SIZE], b3
  679. FMADD (aa3, bb4, cc03, cc03)
  680. LDF [AO + 6 * SIZE], a3
  681. FMADD (aa4, bb4, cc04, cc04)
  682. LDF [AO + 7 * SIZE], a4
  /* Step 3: a1, a2 (reloaded from AO[4], AO[5]) with b5, b6. */
  683. FMADD (aa1, bb5, cc01, cc01)
  684. LDF [BO + 11 * SIZE], b4
  685. FMADD (aa2, bb5, cc02, cc02)
  686. LDF [BO + 12 * SIZE], b5
  687. FMADD (aa1, bb6, cc03, cc03)
  688. LDF [AO + 8 * SIZE], a1
  689. FMADD (aa2, bb6, cc04, cc04)
  690. LDF [AO + 9 * SIZE], a2
  /* Step 4: a3, a4 (reloaded from AO[6], AO[7]) with b7, b8. */
  691. FMADD (aa3, bb7, cc01, cc01)
  692. LDF [BO + 13 * SIZE], b6
  693. FMADD (aa4, bb7, cc02, cc02)
  694. LDF [BO + 14 * SIZE], b7
  695. FMADD (aa3, bb8, cc03, cc03)
  696. LDF [AO + 10 * SIZE], a3
  697. FMADD (aa4, bb8, cc04, cc04)
  698. LDF [AO + 11 * SIZE], a4
  699. add AO, 8 * SIZE, AO
  700. add L, -1, L
  701. add BO, 8 * SIZE, BO
  702. cmp L, 0
  703. bg,pt %icc, .LL53
  /* Branch delay slot: reloads b8 through the already advanced BO. */
  704. LDF [BO + 7 * SIZE], b8
  705. .align 4
  /* 2x2 remainder count: L = low two bits of KK (LT/RN) or of K - KK (LN/RT), */
  /* i.e. the steps left over after the 4-step unrolled loop. With none, skip to .LL58. */
  706. .LL55:
  707. #if defined(LT) || defined(RN)
  708. and KK, 3, L
  709. #else
  710. sub K, KK, L
  711. and L, 3, L
  712. #endif
  713. cmp L, 0
  714. ble,a,pn %icc, .LL58
  715. nop
  716. .align 4
  /* 2x2 remainder loop: one k-step per pass (a1, a2 with b1, b2), advancing AO and BO by 2 * SIZE. */
  /* Operands for the next pass are reloaded as soon as their last use has been issued. */
  717. .LL57:
  718. FMADD (aa1, bb1, cc01, cc01)
  719. add L, -1, L
  720. FMADD (aa2, bb1, cc02, cc02)
  721. LDF [BO + 2 * SIZE], b1
  722. FMADD (aa1, bb2, cc03, cc03)
  723. LDF [AO + 2 * SIZE], a1
  724. FMADD (aa2, bb2, cc04, cc04)
  725. LDF [AO + 3 * SIZE], a2
  726. add AO, 2 * SIZE, AO
  727. cmp L, 0
  728. add BO, 2 * SIZE, BO
  729. bg,pt %icc, .LL57
  /* Branch delay slot: reloads b2 through the already advanced BO. */
  730. LDF [BO + 1 * SIZE], b2
  731. .align 4
  /* 2x2 tile finish: subtract the accumulated products from the stored right-hand side, */
  /* apply the 2x2 triangular step for the selected variant, write the result to the packed */
  /* buffer and to C, advance the pointers and loop back to .LL52 while I is positive. */
  /* NOTE(review): FSUB x, y, z presumably gives z = x - y and FNMSUB (x, y, z, w) presumably */
  /* gives w = z - x * y; both macros are defined outside this chunk. */
  732. .LL58:
  733. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO at index KK - 2. Both preprocessor branches subtract 2, */
  /* presumably because this tile is 2 wide in both dimensions. */
  734. #ifdef LN
  735. sub KK, 2, TEMP1
  736. #else
  737. sub KK, 2, TEMP1
  738. #endif
  739. sll TEMP1, BASE_SHIFT + 1, TEMP2
  740. sll TEMP1, BASE_SHIFT + 1, TEMP1
  741. add AORIG, TEMP2, AO
  742. add B, TEMP1, BO
  743. #endif
  744. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO, stored in the order c01, c03, c02, c04. */
  745. LDF [BO + 0 * SIZE], a1
  746. LDF [BO + 1 * SIZE], a2
  747. LDF [BO + 2 * SIZE], a3
  748. LDF [BO + 3 * SIZE], a4
  749. FSUB a1, c01, c01
  750. FSUB a2, c03, c03
  751. FSUB a3, c02, c02
  752. FSUB a4, c04, c04
  753. #else
  /* RN/RT: right-hand side comes from AO, stored in the order c01, c02, c03, c04. */
  754. LDF [AO + 0 * SIZE], a1
  755. LDF [AO + 1 * SIZE], a2
  756. LDF [AO + 2 * SIZE], a3
  757. LDF [AO + 3 * SIZE], a4
  758. FSUB a1, c01, c01
  759. FSUB a2, c02, c02
  760. FSUB a3, c03, c03
  761. FSUB a4, c04, c04
  762. #endif
  /* Triangular step. The diagonal entries are applied with FMUL. */
  /* NOTE(review): this suggests the packed diagonal holds reciprocals - confirm against the packing code. */
  763. #ifdef LN
  /* LN: uses AO[3], AO[2], AO[0]; row c02/c04 is finished first, then c01/c03. */
  764. LDF [AO + 3 * SIZE], a1
  765. LDF [AO + 2 * SIZE], a2
  766. LDF [AO + 0 * SIZE], a3
  767. FMUL a1, c02, c02
  768. FMUL a1, c04, c04
  769. FNMSUB (aa2, cc02, cc01, cc01)
  770. FNMSUB (aa2, cc04, cc03, cc03)
  771. FMUL a3, c01, c01
  772. FMUL a3, c03, c03
  773. #endif
  774. #ifdef LT
  /* LT: uses AO[0], AO[1], AO[3]; row c01/c03 is finished first, then c02/c04. */
  775. LDF [AO + 0 * SIZE], a1
  776. LDF [AO + 1 * SIZE], a2
  777. LDF [AO + 3 * SIZE], a3
  778. FMUL a1, c01, c01
  779. FMUL a1, c03, c03
  780. FNMSUB (aa2, cc01, cc02, cc02)
  781. FNMSUB (aa2, cc03, cc04, cc04)
  782. FMUL a3, c02, c02
  783. FMUL a3, c04, c04
  784. #endif
  785. #ifdef RN
  /* RN: uses BO[0], BO[1], BO[3]; column c01/c02 is finished first, then c03/c04. */
  786. LDF [BO + 0 * SIZE], a1
  787. LDF [BO + 1 * SIZE], a2
  788. FMUL a1, c01, c01
  789. FMUL a1, c02, c02
  790. FNMSUB (aa2, cc01, cc03, cc03)
  791. FNMSUB (aa2, cc02, cc04, cc04)
  792. LDF [BO + 3 * SIZE], a1
  793. FMUL a1, c03, c03
  794. FMUL a1, c04, c04
  795. #endif
  796. #ifdef RT
  /* RT: uses BO[3], BO[2], BO[0]; column c03/c04 is finished first, then c01/c02. */
  797. LDF [BO + 3 * SIZE], a1
  798. LDF [BO + 2 * SIZE], a2
  799. FMUL a1, c04, c04
  800. FMUL a1, c03, c03
  801. FNMSUB (aa2, cc04, cc02, cc02)
  802. FNMSUB (aa2, cc03, cc01, cc01)
  803. LDF [BO + 0 * SIZE], a1
  804. FMUL a1, c02, c02
  805. FMUL a1, c01, c01
  806. #endif
  807. #ifdef LN
  /* LN: C1/C2 step back by two elements before the store (and are not advanced afterwards). */
  808. add C1, -2 * SIZE, C1
  809. add C2, -2 * SIZE, C2
  810. #endif
  /* Write the solved values back to the packed buffer in the same order they were read. */
  811. #if defined(LN) || defined(LT)
  812. STF c01, [BO + 0 * SIZE]
  813. STF c03, [BO + 1 * SIZE]
  814. STF c02, [BO + 2 * SIZE]
  815. STF c04, [BO + 3 * SIZE]
  816. #else
  817. STF c01, [AO + 0 * SIZE]
  818. STF c02, [AO + 1 * SIZE]
  819. STF c03, [AO + 2 * SIZE]
  820. STF c04, [AO + 3 * SIZE]
  821. #endif
  /* Write the tile to C: c01, c02 go to C1 and c03, c04 go to C2. */
  822. STF c01, [C1 + 0 * SIZE]
  823. STF c02, [C1 + 1 * SIZE]
  824. STF c03, [C2 + 0 * SIZE]
  825. STF c04, [C2 + 1 * SIZE]
  826. #ifndef LN
  827. add C1, 2 * SIZE, C1
  828. add C2, 2 * SIZE, C2
  829. #endif
  830. #ifdef RT
  /* RT: AORIG moves forward by K shifted left by (BASE_SHIFT + 1) for the next tile. */
  831. sll K, BASE_SHIFT + 1, TEMP1
  832. add AORIG, TEMP1, AORIG
  833. #endif
  834. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps in both AO and BO. */
  835. sub K, KK, TEMP1
  836. sll TEMP1, BASE_SHIFT + 1, TEMP2
  837. sll TEMP1, BASE_SHIFT + 1, TEMP1
  838. add AO, TEMP2, AO
  839. add BO, TEMP1, BO
  840. #endif
  841. #ifdef LT
  842. add KK, 2, KK
  843. #endif
  844. #ifdef LN
  845. sub KK, 2, KK
  846. #endif
  /* Next 2-row tile while I stays positive. */
  847. add I, -1, I
  848. cmp I, 0
  849. bg,pt %icc, .LL52
  850. nop
  851. .align 4
  /* 1x2 tile setup: taken only when the low bit of M is set, otherwise control goes to .LL69. */
  /* Positions AO/BO, preloads operands, clears cc01 and cc03 and computes the unrolled trip count. */
  852. .LL60:
  853. and M, 1, I
  854. cmp I, 0
  855. ble,pn %icc, .LL69
  856. nop
  857. #if defined(LT) || defined(RN)
  858. mov B, BO
  859. #else
  860. #ifdef LN
  /* LN: AORIG moves back by K shifted left by BASE_SHIFT (one row instead of two). */
  861. sll K, BASE_SHIFT + 0, TEMP1
  862. sub AORIG, TEMP1, AORIG
  863. #endif
  /* LN/RT: AO offset uses shift BASE_SHIFT, BO offset uses shift BASE_SHIFT + 1. */
  864. sll KK, BASE_SHIFT + 0, TEMP1
  865. sll KK, BASE_SHIFT + 1, TEMP2
  866. add AORIG, TEMP1, AO
  867. add B, TEMP2, BO
  868. #endif
  /* Preload a1..a4 from AO[0..3] (four consecutive k-steps) and b1..b8 from BO[0..7]. */
  869. LDF [AO + 0 * SIZE], a1
  870. LDF [AO + 1 * SIZE], a2
  871. LDF [AO + 2 * SIZE], a3
  872. LDF [AO + 3 * SIZE], a4
  873. LDF [BO + 0 * SIZE], b1
  874. LDF [BO + 1 * SIZE], b2
  875. LDF [BO + 2 * SIZE], b3
  876. LDF [BO + 3 * SIZE], b4
  877. LDF [BO + 4 * SIZE], b5
  878. LDF [BO + 5 * SIZE], b6
  879. LDF [BO + 6 * SIZE], b7
  /* FCLR presumably zeroes the accumulator (macro defined outside this chunk). */
  880. FCLR (cc01)
  881. LDF [BO + 7 * SIZE], b8
  882. FCLR (cc03)
  /* L = KK / 4 for LT/RN, (K - KK) / 4 otherwise; with no full group go to .LL65. */
  883. #if defined(LT) || defined(RN)
  884. sra KK, 2, L
  885. #else
  886. sub K, KK, L
  887. sra L, 2, L
  888. #endif
  889. cmp L, 0
  890. ble,pn %icc, .LL65
  891. nop
  892. .align 4
  /* 1x2 main loop, four k-steps per pass: a1..a4 are four consecutive A values, each paired */
  /* with two B values. One pass advances AO by 4 * SIZE and BO by 8 * SIZE. */
  /* NOTE(review): FMADD (x, y, z, w) presumably computes w = x * y + z (macro outside this chunk). */
  893. .LL63:
  894. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  895. add L, -1, L
  896. FMADD (aa1, bb1, cc01, cc01)
  897. LDF [BO + 8 * SIZE], b1
  898. FMADD (aa1, bb2, cc03, cc03)
  899. LDF [BO + 9 * SIZE], b2
  900. LDF [AO + 4 * SIZE], a1
  /* The condition codes set here are consumed by the bg at the bottom; nothing in between writes them. */
  901. cmp L, 0
  902. FMADD (aa2, bb3, cc01, cc01)
  903. LDF [BO + 10 * SIZE], b3
  904. FMADD (aa2, bb4, cc03, cc03)
  905. LDF [BO + 11 * SIZE], b4
  906. LDF [AO + 5 * SIZE], a2
  /* AO is advanced mid-pass, so the a3/a4 reloads below use offsets 2 and 3 from the new AO. */
  907. add AO, 4 * SIZE, AO
  908. FMADD (aa3, bb5, cc01, cc01)
  909. LDF [BO + 12 * SIZE], b5
  910. FMADD (aa3, bb6, cc03, cc03)
  911. LDF [BO + 13 * SIZE], b6
  912. LDF [AO + 2 * SIZE], a3
  /* BO is advanced here, so the b7/b8 reloads below use offsets 6 and 7 from the new BO. */
  913. add BO, 8 * SIZE, BO
  914. FMADD (aa4, bb7, cc01, cc01)
  915. LDF [BO + 6 * SIZE], b7
  916. FMADD (aa4, bb8, cc03, cc03)
  917. LDF [BO + 7 * SIZE], b8
  918. bg,pt %icc, .LL63
  /* Branch delay slot: reloads a4 through the already advanced AO. */
  919. LDF [AO + 3 * SIZE], a4
  920. .align 4
  /* 1x2 remainder count: L = low two bits of KK (LT/RN) or of K - KK (LN/RT). */
  /* With no leftover steps, skip to .LL68. */
  921. .LL65:
  922. #if defined(LT) || defined(RN)
  923. and KK, 3, L
  924. #else
  925. sub K, KK, L
  926. and L, 3, L
  927. #endif
  928. cmp L, 0
  929. ble,a,pn %icc, .LL68
  930. nop
  931. .align 4
  /* 1x2 remainder loop: one k-step per pass (a1 with b1, b2); AO advances by 1 * SIZE */
  /* and BO by 2 * SIZE. All reloads use the pointers before they are advanced. */
  932. .LL67:
  933. FMADD (aa1, bb1, cc01, cc01)
  934. LDF [BO + 2 * SIZE], b1
  935. FMADD (aa1, bb2, cc03, cc03)
  936. LDF [BO + 3 * SIZE], b2
  937. LDF [AO + 1 * SIZE], a1
  938. add L, -1, L
  939. add AO, 1 * SIZE, AO
  940. cmp L, 0
  941. bg,pt %icc, .LL67
  /* Branch delay slot: the BO advance runs on every pass, whether or not the branch is taken. */
  942. add BO, 2 * SIZE, BO
  943. .align 4
  /* 1x2 tile finish: subtract the accumulated products from the stored right-hand side, */
  /* apply the triangular step for the selected variant, store to the packed buffer and to C, */
  /* then update AORIG/AO/BO/KK. No loop back: this path handles a single row. */
  /* NOTE(review): FSUB x, y, z presumably gives z = x - y and FNMSUB (x, y, z, w) presumably */
  /* gives w = z - x * y; both macros are defined outside this chunk. */
  944. .LL68:
  945. #if defined(LN) || defined(RT)
  /* LN re-derives at index KK - 1, RT at KK - 2; AO uses shift BASE_SHIFT, BO uses BASE_SHIFT + 1. */
  946. #ifdef LN
  947. sub KK, 1, TEMP1
  948. #else
  949. sub KK, 2, TEMP1
  950. #endif
  951. sll TEMP1, BASE_SHIFT + 0, TEMP2
  952. sll TEMP1, BASE_SHIFT + 1, TEMP1
  953. add AORIG, TEMP2, AO
  954. add B, TEMP1, BO
  955. #endif
  /* Right-hand side: from BO for LN/LT, from AO for RN/RT; the two values map to c01 and c03. */
  956. #if defined(LN) || defined(LT)
  957. LDF [BO + 0 * SIZE], a1
  958. LDF [BO + 1 * SIZE], a2
  959. FSUB a1, c01, c01
  960. FSUB a2, c03, c03
  961. #else
  962. LDF [AO + 0 * SIZE], a1
  963. LDF [AO + 1 * SIZE], a2
  964. FSUB a1, c01, c01
  965. FSUB a2, c03, c03
  966. #endif
  967. #if defined(LN) || defined(LT)
  /* LN/LT: a single A entry (AO[0]) scales both results. */
  /* NOTE(review): multiplying rather than dividing suggests a pre-inverted diagonal - confirm. */
  968. LDF [AO + 0 * SIZE], a1
  969. FMUL a1, c01, c01
  970. FMUL a1, c03, c03
  971. #endif
  972. #ifdef RN
  /* RN: uses BO[0], BO[1], BO[3]; c01 is finished first, then c03. */
  973. LDF [BO + 0 * SIZE], a1
  974. LDF [BO + 1 * SIZE], a2
  975. FMUL a1, c01, c01
  976. FNMSUB (aa2, cc01, cc03, cc03)
  977. LDF [BO + 3 * SIZE], a1
  978. FMUL a1, c03, c03
  979. #endif
  980. #ifdef RT
  /* RT: uses BO[3], BO[2], BO[0]; c03 is finished first, then c01. */
  981. LDF [BO + 3 * SIZE], a1
  982. LDF [BO + 2 * SIZE], a2
  983. FMUL a1, c03, c03
  984. FNMSUB (aa2, cc03, cc01, cc01)
  985. LDF [BO + 0 * SIZE], a1
  986. FMUL a1, c01, c01
  987. #endif
  988. #ifdef LN
  /* LN: C1/C2 step back by one element before the store. */
  989. add C1, -1 * SIZE, C1
  990. add C2, -1 * SIZE, C2
  991. #endif
  992. #if defined(LN) || defined(LT)
  993. STF c01, [BO + 0 * SIZE]
  994. STF c03, [BO + 1 * SIZE]
  995. #else
  996. STF c01, [AO + 0 * SIZE]
  997. STF c03, [AO + 1 * SIZE]
  998. #endif
  /* One element per C pointer; C1/C2 are not advanced afterwards on this path. */
  999. STF c01, [C1 + 0 * SIZE]
  1000. STF c03, [C2 + 0 * SIZE]
  1001. #ifdef RT
  1002. sll K, BASE_SHIFT + 0, TEMP1
  1003. add AORIG, TEMP1, AORIG
  1004. #endif
  1005. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps (one A value and two B values per step). */
  1006. sub K, KK, TEMP1
  1007. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1008. sll TEMP1, BASE_SHIFT + 1, TEMP1
  1009. add AO, TEMP2, AO
  1010. add BO, TEMP1, BO
  1011. #endif
  1012. #ifdef LT
  1013. add KK, 1, KK
  1014. #endif
  1015. #ifdef LN
  1016. sub KK, 1, KK
  1017. #endif
  1018. .align 4
  /* End of the two-column section: update B and KK for whatever section runs next. */
  1019. .LL69:
  1020. #ifdef LN
  /* LN: B moves forward by K shifted left by (BASE_SHIFT + 1). */
  1021. sll K, BASE_SHIFT + 1, TEMP1
  1022. add B, TEMP1, B
  1023. #endif
  1024. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from where BO stopped. */
  1025. mov BO, B
  1026. #endif
  1027. #ifdef RN
  1028. add KK, 2, KK
  1029. #endif
  1030. #ifdef RT
  1031. sub KK, 2, KK
  1032. #endif
  1033. .align 4
  /* Four-column section: entered only when bit 2 of N is set; otherwise control goes to .LL10. */
  /* Sets up four C pointers (C1..C4) that are LDC apart, the KK counter and the A base, */
  /* then starts the loop over 2-row tiles (I = M shifted right by 1). */
  1034. .LL30:
  1035. and N, 4, J
  1036. cmp J, 0
  1037. ble,pn %icc, .LL10
  1038. nop
  1039. #ifdef RT
  /* RT: B is moved back by K shifted left by (BASE_SHIFT + 2). */
  /* NOTE(review): presumably the byte size of a K x 4 panel - confirm BASE_SHIFT is log2 of SIZE. */
  1040. sll K, BASE_SHIFT + 2, TEMP1
  1041. sub B, TEMP1, B
  1042. #endif
  1043. #ifndef RT
  /* Non-RT: C1..C4 = C, C + LDC, C + 2 * LDC, C + 3 * LDC; C advances by 4 * LDC. */
  1044. mov C, C1
  1045. add C, LDC, C2
  1046. add C2, LDC, C3
  1047. add C3, LDC, C4
  1048. add C4, LDC, C
  1049. #else
  /* RT: C4..C1 step downward from C by LDC each; C ends up equal to C1. */
  1050. sub C, LDC, C4
  1051. sub C4, LDC, C3
  1052. sub C3, LDC, C2
  1053. sub C2, LDC, C1
  1054. sub C2, LDC, C
  1055. #endif
  1056. #ifdef LN
  /* LN: KK starts at M + OFFSET. */
  1057. add M, OFFSET, KK
  1058. #endif
  1059. #ifdef LT
  /* LT: KK starts at OFFSET. */
  1060. mov OFFSET, KK
  1061. #endif
  1062. #if defined(LN) || defined(RT)
  /* LN/RT keep the A base in AORIG (AO is derived from it per tile); LT/RN start AO at A. */
  1063. mov A, AORIG
  1064. #else
  1065. mov A, AO
  1066. #endif
  /* I = number of 2-row tiles; when there are none, go to the odd-row path at .LL40. */
  1067. sra M, 1, I
  1068. cmp I, 0
  1069. ble,pn %icc, .LL40
  1070. nop
  1071. .align 4
  /* 2x4 tile setup: position AO/BO, preload operands, clear the eight accumulators */
  /* cc01..cc08 and compute the trip count L for the 4-step unrolled loop at .LL33. */
  1072. .LL32:
  1073. #if defined(LT) || defined(RN)
  /* LT/RN: BO restarts at B for every tile. */
  1074. mov B, BO
  1075. #else
  1076. #ifdef LN
  /* LN: AORIG moves back by K shifted left by (BASE_SHIFT + 1) before AO is derived. */
  1077. sll K, BASE_SHIFT + 1, TEMP1
  1078. sub AORIG, TEMP1, AORIG
  1079. #endif
  /* LN/RT: AO offset uses shift BASE_SHIFT + 1, BO offset uses shift BASE_SHIFT + 2. */
  1080. sll KK, BASE_SHIFT + 1, TEMP1
  1081. sll KK, BASE_SHIFT + 2, TEMP2
  1082. add AORIG, TEMP1, AO
  1083. add B, TEMP2, BO
  1084. #endif
  /* Preload a1, a2 from AO[0..1] and b1..b9 from BO[0..8]; a3, a4 are loaded inside the loop. */
  1085. LDF [AO + 0 * SIZE], a1
  1086. LDF [AO + 1 * SIZE], a2
  1087. LDF [BO + 0 * SIZE], b1
  1088. LDF [BO + 1 * SIZE], b2
  1089. LDF [BO + 2 * SIZE], b3
  1090. LDF [BO + 3 * SIZE], b4
  1091. LDF [BO + 4 * SIZE], b5
  1092. LDF [BO + 5 * SIZE], b6
  /* FCLR presumably zeroes the named accumulator (macro defined outside this chunk). */
  1093. FCLR (cc01)
  1094. LDF [BO + 6 * SIZE], b7
  1095. FCLR (cc02)
  1096. LDF [BO + 7 * SIZE], b8
  1097. FCLR (cc03)
  1098. LDF [BO + 8 * SIZE], b9
  1099. FCLR (cc04)
  1100. prefetch [C1 + 2 * SIZE], 3
  1101. FCLR (cc05)
  1102. prefetch [C2 + 2 * SIZE], 3
  1103. FCLR (cc06)
  1104. prefetch [C3 + 2 * SIZE], 3
  1105. FCLR (cc07)
  1106. prefetch [C4 + 2 * SIZE], 3
  1107. FCLR (cc08)
  /* L = KK / 4 for LT/RN, (K - KK) / 4 otherwise; with no full group go to .LL35. */
  1108. #if defined(LT) || defined(RN)
  1109. sra KK, 2, L
  1110. #else
  1111. sub K, KK, L
  1112. sra L, 2, L
  1113. #endif
  1114. cmp L, 0
  1115. ble,pn %icc, .LL35
  1116. nop
  1117. .align 4
  /* 2x4 main loop, four k-steps per pass: each step pairs two A values with four B values. */
  /* One pass advances AO by 8 * SIZE and BO by 16 * SIZE. */
  /* b1 and b9 take turns as the first B operand (steps 1 and 3), so each can be reloaded */
  /* a full two steps before it is needed again. */
  /* NOTE(review): FMADD (x, y, z, w) presumably computes w = x * y + z (macro outside this chunk). */
  1118. .LL33:
  /* Step 1: a1, a2 with b1..b4. */
  1119. FMADD (aa1, bb1, cc01, cc01)
  1120. LDF [AO + 2 * SIZE], a3
  1121. FMADD (aa2, bb1, cc02, cc02)
  1122. LDF [AO + 3 * SIZE], a4
  1123. FMADD (aa1, bb2, cc03, cc03)
  /* BO + 16 here is offset 0 after the BO advance below: b1 for the next pass. */
  1124. LDF [BO + 16 * SIZE], b1
  1125. FMADD (aa2, bb2, cc04, cc04)
  1126. LDF [BO + 9 * SIZE], b2
  1127. FMADD (aa1, bb3, cc05, cc05)
  1128. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1129. FMADD (aa2, bb3, cc06, cc06)
  1130. add L, -1, L
  1131. FMADD (aa1, bb4, cc07, cc07)
  1132. LDF [BO + 10 * SIZE], b3
  1133. FMADD (aa2, bb4, cc08, cc08)
  1134. LDF [BO + 11 * SIZE], b4
  /* Step 2: a3, a4 with b5..b8. */
  1135. FMADD (aa3, bb5, cc01, cc01)
  1136. LDF [AO + 4 * SIZE], a1
  1137. FMADD (aa4, bb5, cc02, cc02)
  1138. LDF [AO + 5 * SIZE], a2
  1139. FMADD (aa3, bb6, cc03, cc03)
  1140. LDF [BO + 12 * SIZE], b5
  1141. FMADD (aa4, bb6, cc04, cc04)
  1142. LDF [BO + 13 * SIZE], b6
  1143. FMADD (aa3, bb7, cc05, cc05)
  /* The condition codes set here are consumed by the bg at the bottom; nothing in between writes them. */
  1144. cmp L, 0
  1145. FMADD (aa4, bb7, cc06, cc06)
  /* AO is advanced mid-pass; later A loads use offsets relative to the new AO. */
  1146. add AO, 8 * SIZE, AO
  1147. FMADD (aa3, bb8, cc07, cc07)
  1148. LDF [BO + 14 * SIZE], b7
  1149. FMADD (aa4, bb8, cc08, cc08)
  1150. LDF [BO + 15 * SIZE], b8
  /* Step 3: a1, a2 with b9, b2, b3, b4. */
  1151. FMADD (aa1, bb9, cc01, cc01)
  1152. LDF [AO - 2 * SIZE], a3
  1153. FMADD (aa2, bb9, cc02, cc02)
  1154. LDF [AO - 1 * SIZE], a4
  1155. FMADD (aa1, bb2, cc03, cc03)
  1156. LDF [BO + 24 * SIZE], b9
  1157. FMADD (aa2, bb2, cc04, cc04)
  1158. LDF [BO + 17 * SIZE], b2
  1159. FMADD (aa1, bb3, cc05, cc05)
  /* BO is advanced here; the remaining B loads use offsets relative to the new BO. */
  1160. add BO, 16 * SIZE, BO
  1161. FMADD (aa2, bb3, cc06, cc06)
  1162. nop
  1163. FMADD (aa1, bb4, cc07, cc07)
  1164. LDF [BO + 2 * SIZE], b3
  1165. FMADD (aa2, bb4, cc08, cc08)
  1166. LDF [BO + 3 * SIZE], b4
  /* Step 4: a3, a4 with b5..b8. */
  1167. FMADD (aa3, bb5, cc01, cc01)
  1168. LDF [AO + 0 * SIZE], a1
  1169. FMADD (aa4, bb5, cc02, cc02)
  1170. LDF [AO + 1 * SIZE], a2
  1171. FMADD (aa3, bb6, cc03, cc03)
  1172. LDF [BO + 4 * SIZE], b5
  1173. FMADD (aa4, bb6, cc04, cc04)
  1174. LDF [BO + 5 * SIZE], b6
  1175. FMADD (aa3, bb7, cc05, cc05)
  1176. nop
  1177. FMADD (aa4, bb7, cc06, cc06)
  1178. LDF [BO + 6 * SIZE], b7
  1179. FMADD (aa3, bb8, cc07, cc07)
  1180. FMADD (aa4, bb8, cc08, cc08)
  1181. bg,pt %icc, .LL33
  /* Branch delay slot: reloads b8 through the already advanced BO. */
  1182. LDF [BO + 7 * SIZE], b8
  1183. .align 4
  /* 2x4 remainder count: L = low two bits of KK (LT/RN) or of K - KK (LN/RT). */
  /* With no leftover steps, skip to .LL38. */
  1184. .LL35:
  1185. #if defined(LT) || defined(RN)
  1186. and KK, 3, L
  1187. #else
  1188. sub K, KK, L
  1189. and L, 3, L
  1190. #endif
  1191. cmp L, 0
  1192. ble,a,pn %icc, .LL38
  1193. nop
  1194. .align 4
  /* 2x4 remainder loop: one k-step per pass (a1, a2 with b1..b4); AO advances by 2 * SIZE */
  /* and BO by 4 * SIZE. */
  1195. .LL37:
  1196. FMADD (aa1, bb1, cc01, cc01)
  1197. add L, -1, L
  1198. FMADD (aa2, bb1, cc02, cc02)
  1199. LDF [BO + 4 * SIZE], b1
  1200. FMADD (aa1, bb2, cc03, cc03)
  /* AO is advanced early; the a1/a2 reloads below use offsets 0 and 1 from the new AO. */
  1201. add AO, 2 * SIZE, AO
  1202. FMADD (aa2, bb2, cc04, cc04)
  1203. LDF [BO + 5 * SIZE], b2
  1204. FMADD (aa1, bb3, cc05, cc05)
  1205. cmp L, 0
  1206. FMADD (aa2, bb3, cc06, cc06)
  1207. LDF [BO + 6 * SIZE], b3
  1208. FMADD (aa1, bb4, cc07, cc07)
  1209. LDF [AO + 0 * SIZE], a1
  1210. FMADD (aa2, bb4, cc08, cc08)
  1211. LDF [AO + 1 * SIZE], a2
  1212. LDF [BO + 7 * SIZE], b4
  1213. bg,pt %icc, .LL37
  /* Branch delay slot: the BO advance runs on every pass; the B reloads above used the old BO. */
  1214. add BO, 4 * SIZE, BO
  1215. .align 4
  /* 2x4 tile finish: subtract the accumulated products from the stored right-hand side, */
  /* apply the triangular step for the selected variant (2x2 on the A side for LN/LT, */
  /* 4x4 on the B side for RN/RT), store to the packed buffer and to C, advance the */
  /* pointers and loop back to .LL32 while I is positive. */
  /* NOTE(review): FSUB x, y, z presumably gives z = x - y and FNMSUB (x, y, z, w) presumably */
  /* gives w = z - x * y; both macros are defined outside this chunk. */
  1216. .LL38:
  1217. #if defined(LN) || defined(RT)
  /* LN re-derives at index KK - 2, RT at KK - 4; AO uses shift BASE_SHIFT + 1, BO uses BASE_SHIFT + 2. */
  1218. #ifdef LN
  1219. sub KK, 2, TEMP1
  1220. #else
  1221. sub KK, 4, TEMP1
  1222. #endif
  1223. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1224. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1225. add AORIG, TEMP2, AO
  1226. add B, TEMP1, BO
  1227. #endif
  1228. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO in the order c01, c03, c05, c07, c02, c04, c06, c08. */
  1229. LDF [BO + 0 * SIZE], a1
  1230. LDF [BO + 1 * SIZE], a2
  1231. LDF [BO + 2 * SIZE], a3
  1232. LDF [BO + 3 * SIZE], a4
  1233. LDF [BO + 4 * SIZE], b1
  1234. LDF [BO + 5 * SIZE], b2
  1235. LDF [BO + 6 * SIZE], b3
  1236. LDF [BO + 7 * SIZE], b4
  1237. FSUB a1, c01, c01
  1238. FSUB a2, c03, c03
  1239. FSUB a3, c05, c05
  1240. FSUB a4, c07, c07
  1241. FSUB b1, c02, c02
  1242. FSUB b2, c04, c04
  1243. FSUB b3, c06, c06
  1244. FSUB b4, c08, c08
  1245. #else
  /* RN/RT: right-hand side comes from AO in the order c01..c08. */
  1246. LDF [AO + 0 * SIZE], a1
  1247. LDF [AO + 1 * SIZE], a2
  1248. LDF [AO + 2 * SIZE], a3
  1249. LDF [AO + 3 * SIZE], a4
  1250. LDF [AO + 4 * SIZE], b1
  1251. LDF [AO + 5 * SIZE], b2
  1252. LDF [AO + 6 * SIZE], b3
  1253. LDF [AO + 7 * SIZE], b4
  1254. FSUB a1, c01, c01
  1255. FSUB a2, c02, c02
  1256. FSUB a3, c03, c03
  1257. FSUB a4, c04, c04
  1258. FSUB b1, c05, c05
  1259. FSUB b2, c06, c06
  1260. FSUB b3, c07, c07
  1261. FSUB b4, c08, c08
  1262. #endif
  /* Triangular step. The diagonal entries are applied with FMUL. */
  /* NOTE(review): this suggests the packed diagonal holds reciprocals - confirm against the packing code. */
  1263. #ifdef LN
  /* LN: uses AO[3], AO[2], AO[0]; the even accumulators are finished first, then the odd ones. */
  1264. LDF [AO + 3 * SIZE], a1
  1265. LDF [AO + 2 * SIZE], a2
  1266. LDF [AO + 0 * SIZE], a3
  1267. FMUL a1, c02, c02
  1268. FMUL a1, c04, c04
  1269. FMUL a1, c06, c06
  1270. FMUL a1, c08, c08
  1271. FNMSUB (aa2, cc02, cc01, cc01)
  1272. FNMSUB (aa2, cc04, cc03, cc03)
  1273. FNMSUB (aa2, cc06, cc05, cc05)
  1274. FNMSUB (aa2, cc08, cc07, cc07)
  1275. FMUL a3, c01, c01
  1276. FMUL a3, c03, c03
  1277. FMUL a3, c05, c05
  1278. FMUL a3, c07, c07
  1279. #endif
  1280. #ifdef LT
  /* LT: uses AO[0], AO[1], AO[3]; the odd accumulators are finished first, then the even ones. */
  1281. LDF [AO + 0 * SIZE], a1
  1282. LDF [AO + 1 * SIZE], a2
  1283. LDF [AO + 3 * SIZE], a3
  1284. FMUL a1, c01, c01
  1285. FMUL a1, c03, c03
  1286. FMUL a1, c05, c05
  1287. FMUL a1, c07, c07
  1288. FNMSUB (aa2, cc01, cc02, cc02)
  1289. FNMSUB (aa2, cc03, cc04, cc04)
  1290. FNMSUB (aa2, cc05, cc06, cc06)
  1291. FNMSUB (aa2, cc07, cc08, cc08)
  1292. FMUL a3, c02, c02
  1293. FMUL a3, c04, c04
  1294. FMUL a3, c06, c06
  1295. FMUL a3, c08, c08
  1296. #endif
  1297. #ifdef RN
  /* RN: forward elimination over the four column pairs (c01/c02 first, c07/c08 last). */
  /* Reads BO offsets 0-3, 5-7, 10-11 and 15: the upper triangle if BO is viewed as a row-major 4x4 block. */
  1298. LDF [BO + 0 * SIZE], a1
  1299. LDF [BO + 1 * SIZE], a2
  1300. LDF [BO + 2 * SIZE], a3
  1301. LDF [BO + 3 * SIZE], a4
  1302. FMUL a1, c01, c01
  1303. FMUL a1, c02, c02
  1304. FNMSUB (aa2, cc01, cc03, cc03)
  1305. FNMSUB (aa2, cc02, cc04, cc04)
  1306. FNMSUB (aa3, cc01, cc05, cc05)
  1307. FNMSUB (aa3, cc02, cc06, cc06)
  1308. FNMSUB (aa4, cc01, cc07, cc07)
  1309. FNMSUB (aa4, cc02, cc08, cc08)
  1310. LDF [BO + 5 * SIZE], a1
  1311. LDF [BO + 6 * SIZE], a2
  1312. LDF [BO + 7 * SIZE], a3
  1313. FMUL a1, c03, c03
  1314. FMUL a1, c04, c04
  1315. FNMSUB (aa2, cc03, cc05, cc05)
  1316. FNMSUB (aa2, cc04, cc06, cc06)
  1317. FNMSUB (aa3, cc03, cc07, cc07)
  1318. FNMSUB (aa3, cc04, cc08, cc08)
  1319. LDF [BO + 10 * SIZE], a1
  1320. LDF [BO + 11 * SIZE], a2
  1321. FMUL a1, c05, c05
  1322. FMUL a1, c06, c06
  1323. FNMSUB (aa2, cc05, cc07, cc07)
  1324. FNMSUB (aa2, cc06, cc08, cc08)
  1325. LDF [BO + 15 * SIZE], a1
  1326. FMUL a1, c07, c07
  1327. FMUL a1, c08, c08
  1328. #endif
  1329. #ifdef RT
  /* RT: backward elimination over the four column pairs (c07/c08 first, c01/c02 last). */
  /* Reads BO offsets 15-12, 10-8, 5-4 and 0: the lower triangle if BO is viewed as a row-major 4x4 block. */
  1330. LDF [BO + 15 * SIZE], a1
  1331. LDF [BO + 14 * SIZE], a2
  1332. LDF [BO + 13 * SIZE], a3
  1333. LDF [BO + 12 * SIZE], a4
  1334. FMUL a1, c08, c08
  1335. FMUL a1, c07, c07
  1336. FNMSUB (aa2, cc08, cc06, cc06)
  1337. FNMSUB (aa2, cc07, cc05, cc05)
  1338. FNMSUB (aa3, cc08, cc04, cc04)
  1339. FNMSUB (aa3, cc07, cc03, cc03)
  1340. FNMSUB (aa4, cc08, cc02, cc02)
  1341. FNMSUB (aa4, cc07, cc01, cc01)
  1342. LDF [BO + 10 * SIZE], a1
  1343. LDF [BO + 9 * SIZE], a2
  1344. LDF [BO + 8 * SIZE], a3
  1345. FMUL a1, c06, c06
  1346. FMUL a1, c05, c05
  1347. FNMSUB (aa2, cc06, cc04, cc04)
  1348. FNMSUB (aa2, cc05, cc03, cc03)
  1349. FNMSUB (aa3, cc06, cc02, cc02)
  1350. FNMSUB (aa3, cc05, cc01, cc01)
  1351. LDF [BO + 5 * SIZE], a1
  1352. LDF [BO + 4 * SIZE], a2
  1353. FMUL a1, c04, c04
  1354. FMUL a1, c03, c03
  1355. FNMSUB (aa2, cc04, cc02, cc02)
  1356. FNMSUB (aa2, cc03, cc01, cc01)
  1357. LDF [BO + 0 * SIZE], a1
  1358. FMUL a1, c02, c02
  1359. FMUL a1, c01, c01
  1360. #endif
  1361. #ifdef LN
  /* LN: C1..C4 step back by two elements before the store (and are not advanced afterwards). */
  1362. add C1, -2 * SIZE, C1
  1363. add C2, -2 * SIZE, C2
  1364. add C3, -2 * SIZE, C3
  1365. add C4, -2 * SIZE, C4
  1366. #endif
  /* Write the solved values back to the packed buffer in the same order they were read. */
  1367. #if defined(LN) || defined(LT)
  1368. STF c01, [BO + 0 * SIZE]
  1369. STF c03, [BO + 1 * SIZE]
  1370. STF c05, [BO + 2 * SIZE]
  1371. STF c07, [BO + 3 * SIZE]
  1372. STF c02, [BO + 4 * SIZE]
  1373. STF c04, [BO + 5 * SIZE]
  1374. STF c06, [BO + 6 * SIZE]
  1375. STF c08, [BO + 7 * SIZE]
  1376. #else
  1377. STF c01, [AO + 0 * SIZE]
  1378. STF c02, [AO + 1 * SIZE]
  1379. STF c03, [AO + 2 * SIZE]
  1380. STF c04, [AO + 3 * SIZE]
  1381. STF c05, [AO + 4 * SIZE]
  1382. STF c06, [AO + 5 * SIZE]
  1383. STF c07, [AO + 6 * SIZE]
  1384. STF c08, [AO + 7 * SIZE]
  1385. #endif
  /* Write the tile to C: two consecutive elements per C pointer. */
  1386. STF c01, [C1 + 0 * SIZE]
  1387. STF c02, [C1 + 1 * SIZE]
  1388. STF c03, [C2 + 0 * SIZE]
  1389. STF c04, [C2 + 1 * SIZE]
  1390. STF c05, [C3 + 0 * SIZE]
  1391. STF c06, [C3 + 1 * SIZE]
  1392. STF c07, [C4 + 0 * SIZE]
  1393. STF c08, [C4 + 1 * SIZE]
  1394. #ifndef LN
  1395. add C1, 2 * SIZE, C1
  1396. add C2, 2 * SIZE, C2
  1397. add C3, 2 * SIZE, C3
  1398. add C4, 2 * SIZE, C4
  1399. #endif
  1400. #ifdef RT
  /* RT: AORIG moves forward by K shifted left by (BASE_SHIFT + 1) for the next tile. */
  1401. sll K, BASE_SHIFT + 1, TEMP1
  1402. add AORIG, TEMP1, AORIG
  1403. #endif
  1404. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps (two A values and four B values per step). */
  1405. sub K, KK, TEMP1
  1406. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1407. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1408. add AO, TEMP2, AO
  1409. add BO, TEMP1, BO
  1410. #endif
  1411. #ifdef LT
  1412. add KK, 2, KK
  1413. #endif
  1414. #ifdef LN
  1415. sub KK, 2, KK
  1416. #endif
  /* Next 2-row tile while I stays positive. */
  1417. add I, -1, I
  1418. cmp I, 0
  1419. bg,pt %icc, .LL32
  1420. nop
  /* 1x4 tile setup: taken only when the low bit of M is set, otherwise control goes to .LL49. */
  /* Positions AO/BO, preloads operands, clears cc01, cc03, cc05, cc07 and computes the unrolled trip count. */
  1421. .LL40:
  1422. and M, 1, I
  1423. cmp I, 0
  1424. ble,pn %icc, .LL49
  1425. nop
  1426. #if defined(LT) || defined(RN)
  1427. mov B, BO
  1428. #else
  1429. #ifdef LN
  /* LN: AORIG moves back by K shifted left by BASE_SHIFT (one row instead of two). */
  1430. sll K, BASE_SHIFT + 0, TEMP1
  1431. sub AORIG, TEMP1, AORIG
  1432. #endif
  /* LN/RT: AO offset uses shift BASE_SHIFT, BO offset uses shift BASE_SHIFT + 2. */
  1433. sll KK, BASE_SHIFT + 0, TEMP1
  1434. sll KK, BASE_SHIFT + 2, TEMP2
  1435. add AORIG, TEMP1, AO
  1436. add B, TEMP2, BO
  1437. #endif
  /* Preload a1..a4 from AO[0..3] (four consecutive k-steps) and b1..b9 from BO[0..8]. */
  1438. LDF [AO + 0 * SIZE], a1
  1439. LDF [AO + 1 * SIZE], a2
  1440. LDF [AO + 2 * SIZE], a3
  1441. LDF [AO + 3 * SIZE], a4
  1442. LDF [BO + 0 * SIZE], b1
  1443. LDF [BO + 1 * SIZE], b2
  1444. LDF [BO + 2 * SIZE], b3
  1445. LDF [BO + 3 * SIZE], b4
  1446. LDF [BO + 4 * SIZE], b5
  1447. LDF [BO + 5 * SIZE], b6
  /* FCLR presumably zeroes the accumulator (macro defined outside this chunk). */
  1448. FCLR (cc01)
  1449. LDF [BO + 6 * SIZE], b7
  1450. FCLR (cc03)
  1451. LDF [BO + 7 * SIZE], b8
  1452. FCLR (cc05)
  1453. LDF [BO + 8 * SIZE], b9
  1454. FCLR (cc07)
  /* L = KK / 4 for LT/RN, (K - KK) / 4 otherwise; with no full group go to .LL45. */
  1455. #if defined(LT) || defined(RN)
  1456. sra KK, 2, L
  1457. #else
  1458. sub K, KK, L
  1459. sra L, 2, L
  1460. #endif
  1461. cmp L, 0
  1462. ble,pn %icc, .LL45
  1463. nop
  /* 1x4 main loop, four k-steps per pass: a1..a4 are four consecutive A values, each paired */
  /* with four B values. One pass advances AO by 4 * SIZE and BO by 16 * SIZE. */
  /* b1 and b9 take turns as the first B operand (steps 1 and 3). */
  /* NOTE(review): FMADD (x, y, z, w) presumably computes w = x * y + z (macro outside this chunk). */
  1464. .LL43:
  1465. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1466. add L, -1, L
  /* Step 1: a1 with b1..b4. */
  1467. FMADD (aa1, bb1, cc01, cc01)
  1468. LDF [BO + 16 * SIZE], b1
  1469. FMADD (aa1, bb2, cc03, cc03)
  1470. LDF [BO + 9 * SIZE], b2
  1471. FMADD (aa1, bb3, cc05, cc05)
  1472. LDF [BO + 10 * SIZE], b3
  1473. FMADD (aa1, bb4, cc07, cc07)
  1474. LDF [BO + 11 * SIZE], b4
  1475. LDF [AO + 4 * SIZE], a1
  /* The condition codes set here are consumed by the bg at the bottom; nothing in between writes them. */
  1476. cmp L, 0
  /* Step 2: a2 with b5..b8. */
  1477. FMADD (aa2, bb5, cc01, cc01)
  1478. LDF [BO + 12 * SIZE], b5
  1479. FMADD (aa2, bb6, cc03, cc03)
  1480. LDF [BO + 13 * SIZE], b6
  1481. FMADD (aa2, bb7, cc05, cc05)
  1482. LDF [BO + 14 * SIZE], b7
  1483. FMADD (aa2, bb8, cc07, cc07)
  1484. LDF [BO + 15 * SIZE], b8
  1485. LDF [AO + 5 * SIZE], a2
  /* AO is advanced mid-pass, so the a3/a4 reloads below use offsets 2 and 3 from the new AO. */
  1486. add AO, 4 * SIZE, AO
  /* Step 3: a3 with b9, b2, b3, b4. */
  1487. FMADD (aa3, bb9, cc01, cc01)
  1488. LDF [BO + 24 * SIZE], b9
  1489. FMADD (aa3, bb2, cc03, cc03)
  1490. LDF [BO + 17 * SIZE], b2
  1491. FMADD (aa3, bb3, cc05, cc05)
  1492. LDF [BO + 18 * SIZE], b3
  1493. FMADD (aa3, bb4, cc07, cc07)
  1494. LDF [BO + 19 * SIZE], b4
  1495. LDF [AO + 2 * SIZE], a3
  /* BO is advanced here, so the b5..b8 reloads below use offsets 4..7 from the new BO. */
  1496. add BO, 16 * SIZE, BO
  /* Step 4: a4 with b5..b8. */
  1497. FMADD (aa4, bb5, cc01, cc01)
  1498. LDF [BO + 4 * SIZE], b5
  1499. FMADD (aa4, bb6, cc03, cc03)
  1500. LDF [BO + 5 * SIZE], b6
  1501. FMADD (aa4, bb7, cc05, cc05)
  1502. LDF [BO + 6 * SIZE], b7
  1503. FMADD (aa4, bb8, cc07, cc07)
  1504. LDF [BO + 7 * SIZE], b8
  1505. bg,pt %icc, .LL43
  /* Branch delay slot: reloads a4 through the already advanced AO. */
  1506. LDF [AO + 3 * SIZE], a4
  1507. .align 4
  /* 1x4 remainder count: L = low two bits of KK (LT/RN) or of K - KK (LN/RT). */
  /* With no leftover steps, skip to .LL48. */
  1508. .LL45:
  1509. #if defined(LT) || defined(RN)
  1510. and KK, 3, L
  1511. #else
  1512. sub K, KK, L
  1513. and L, 3, L
  1514. #endif
  1515. cmp L, 0
  1516. ble,a,pn %icc, .LL48
  1517. nop
  1518. .align 4
  /* 1x4 remainder loop: one k-step per pass (a1 with b1..b4); AO advances by 1 * SIZE */
  /* and BO by 4 * SIZE. The B reloads use BO before it is advanced. */
  1519. .LL47:
  1520. FMADD (aa1, bb1, cc01, cc01)
  1521. LDF [BO + 4 * SIZE], b1
  1522. add L, -1, L
  1523. FMADD (aa1, bb2, cc03, cc03)
  1524. LDF [BO + 5 * SIZE], b2
  1525. add AO, 1 * SIZE, AO
  1526. FMADD (aa1, bb3, cc05, cc05)
  1527. LDF [BO + 6 * SIZE], b3
  1528. cmp L, 0
  1529. FMADD (aa1, bb4, cc07, cc07)
  1530. LDF [BO + 7 * SIZE], b4
  1531. add BO, 4 * SIZE, BO
  1532. bg,pt %icc, .LL47
  /* Branch delay slot: reloads a1 through the already advanced AO. */
  1533. LDF [AO + 0 * SIZE], a1
  1534. .align 4
  /* 1x4 tile finish: subtract the accumulated products from the stored right-hand side, */
  /* apply the triangular step for the selected variant, store to the packed buffer and to C, */
  /* then update AORIG/AO/BO/KK. No loop back: this path handles a single row. */
  /* NOTE(review): FSUB x, y, z presumably gives z = x - y and FNMSUB (x, y, z, w) presumably */
  /* gives w = z - x * y; both macros are defined outside this chunk. */
  1535. .LL48:
  1536. #if defined(LN) || defined(RT)
  /* LN re-derives at index KK - 1, RT at KK - 4; AO uses shift BASE_SHIFT, BO uses BASE_SHIFT + 2. */
  1537. #ifdef LN
  1538. sub KK, 1, TEMP1
  1539. #else
  1540. sub KK, 4, TEMP1
  1541. #endif
  1542. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1543. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1544. add AORIG, TEMP2, AO
  1545. add B, TEMP1, BO
  1546. #endif
  /* Right-hand side: from BO for LN/LT, from AO for RN/RT; the four values map to c01, c03, c05, c07. */
  1547. #if defined(LN) || defined(LT)
  1548. LDF [BO + 0 * SIZE], a1
  1549. LDF [BO + 1 * SIZE], a2
  1550. LDF [BO + 2 * SIZE], a3
  1551. LDF [BO + 3 * SIZE], a4
  1552. FSUB a1, c01, c01
  1553. FSUB a2, c03, c03
  1554. FSUB a3, c05, c05
  1555. FSUB a4, c07, c07
  1556. #else
  1557. LDF [AO + 0 * SIZE], a1
  1558. LDF [AO + 1 * SIZE], a2
  1559. LDF [AO + 2 * SIZE], a3
  1560. LDF [AO + 3 * SIZE], a4
  1561. FSUB a1, c01, c01
  1562. FSUB a2, c03, c03
  1563. FSUB a3, c05, c05
  1564. FSUB a4, c07, c07
  1565. #endif
  1566. #if defined(LN) || defined(LT)
  /* LN/LT: a single A entry (AO[0]) scales all four results. */
  /* NOTE(review): multiplying rather than dividing suggests a pre-inverted diagonal - confirm. */
  1567. LDF [AO + 0 * SIZE], a1
  1568. FMUL a1, c01, c01
  1569. FMUL a1, c03, c03
  1570. FMUL a1, c05, c05
  1571. FMUL a1, c07, c07
  1572. #endif
  1573. #ifdef RN
  /* RN: forward elimination c01, c03, c05, c07 using BO offsets 0-3, 5-7, 10-11 and 15. */
  1574. LDF [BO + 0 * SIZE], a1
  1575. LDF [BO + 1 * SIZE], a2
  1576. LDF [BO + 2 * SIZE], a3
  1577. LDF [BO + 3 * SIZE], a4
  1578. FMUL a1, c01, c01
  1579. FNMSUB (aa2, cc01, cc03, cc03)
  1580. FNMSUB (aa3, cc01, cc05, cc05)
  1581. FNMSUB (aa4, cc01, cc07, cc07)
  1582. LDF [BO + 5 * SIZE], a1
  1583. LDF [BO + 6 * SIZE], a2
  1584. LDF [BO + 7 * SIZE], a3
  1585. FMUL a1, c03, c03
  1586. FNMSUB (aa2, cc03, cc05, cc05)
  1587. FNMSUB (aa3, cc03, cc07, cc07)
  1588. LDF [BO + 10 * SIZE], a1
  1589. LDF [BO + 11 * SIZE], a2
  1590. FMUL a1, c05, c05
  1591. FNMSUB (aa2, cc05, cc07, cc07)
  1592. LDF [BO + 15 * SIZE], a1
  1593. FMUL a1, c07, c07
  1594. #endif
  1595. #ifdef RT
  /* RT: backward elimination c07, c05, c03, c01 using BO offsets 15-12, 10-8, 5-4 and 0. */
  1596. LDF [BO + 15 * SIZE], a1
  1597. LDF [BO + 14 * SIZE], a2
  1598. LDF [BO + 13 * SIZE], a3
  1599. LDF [BO + 12 * SIZE], a4
  1600. FMUL a1, c07, c07
  1601. FNMSUB (aa2, cc07, cc05, cc05)
  1602. FNMSUB (aa3, cc07, cc03, cc03)
  1603. FNMSUB (aa4, cc07, cc01, cc01)
  1604. LDF [BO + 10 * SIZE], a1
  1605. LDF [BO + 9 * SIZE], a2
  1606. LDF [BO + 8 * SIZE], a3
  1607. FMUL a1, c05, c05
  1608. FNMSUB (aa2, cc05, cc03, cc03)
  1609. FNMSUB (aa3, cc05, cc01, cc01)
  1610. LDF [BO + 5 * SIZE], a1
  1611. LDF [BO + 4 * SIZE], a2
  1612. FMUL a1, c03, c03
  1613. FNMSUB (aa2, cc03, cc01, cc01)
  1614. LDF [BO + 0 * SIZE], a1
  1615. FMUL a1, c01, c01
  1616. #endif
  1617. #ifdef LN
  /* LN: C1..C4 step back by one element before the store. */
  1618. add C1, -1 * SIZE, C1
  1619. add C2, -1 * SIZE, C2
  1620. add C3, -1 * SIZE, C3
  1621. add C4, -1 * SIZE, C4
  1622. #endif
  1623. #if defined(LN) || defined(LT)
  1624. STF c01, [BO + 0 * SIZE]
  1625. STF c03, [BO + 1 * SIZE]
  1626. STF c05, [BO + 2 * SIZE]
  1627. STF c07, [BO + 3 * SIZE]
  1628. #else
  1629. STF c01, [AO + 0 * SIZE]
  1630. STF c03, [AO + 1 * SIZE]
  1631. STF c05, [AO + 2 * SIZE]
  1632. STF c07, [AO + 3 * SIZE]
  1633. #endif
  /* One element per C pointer; C1..C4 are not advanced afterwards on this path. */
  1634. STF c01, [C1 + 0 * SIZE]
  1635. STF c03, [C2 + 0 * SIZE]
  1636. STF c05, [C3 + 0 * SIZE]
  1637. STF c07, [C4 + 0 * SIZE]
  1638. #ifdef RT
  1639. sll K, BASE_SHIFT + 0, TEMP1
  1640. add AORIG, TEMP1, AORIG
  1641. #endif
  1642. #if defined(LT) || defined(RN)
  /* LT/RN: skip the remaining K - KK steps (one A value and four B values per step). */
  1643. sub K, KK, TEMP1
  1644. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1645. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1646. add AO, TEMP2, AO
  1647. add BO, TEMP1, BO
  1648. #endif
  1649. #ifdef LT
  1650. add KK, 1, KK
  1651. #endif
  1652. #ifdef LN
  1653. sub KK, 1, KK
  1654. #endif
  1655. .align 4
  /* End of the four-column section: update B and KK for whatever section runs next. */
  1656. .LL49:
  1657. #ifdef LN
  /* LN: B moves forward by K shifted left by (BASE_SHIFT + 2). */
  1658. sll K, BASE_SHIFT + 2, TEMP1
  1659. add B, TEMP1, B
  1660. #endif
  1661. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from where BO stopped. */
  1662. mov BO, B
  1663. #endif
  1664. #ifdef RN
  1665. add KK, 4, KK
  1666. #endif
  1667. #ifdef RT
  1668. sub KK, 4, KK
  1669. #endif
  1670. .align 4
  /* Eight-column section: J = N shifted right by 3; with no full group of eight, go to .LL999. */
  1671. .LL10:
  1672. sra N, 3, J
  1673. cmp J, 0
  1674. ble,pn %icc, .LL999
  1675. nop
  1676. .align 4
  /* Per-group setup: eight C pointers (C1..C8) that are LDC apart, the KK counter and the A base. */
  /* NOTE(review): presumably re-entered once per group of eight; the loop-back branch is outside this chunk. */
  1677. .LL11:
  1678. #ifdef RT
  /* RT: B is moved back by K shifted left by (BASE_SHIFT + 3). */
  1679. sll K, BASE_SHIFT + 3, TEMP1
  1680. sub B, TEMP1, B
  1681. #endif
  1682. #ifndef RT
  /* Non-RT: C1..C8 step upward from C by LDC each; C advances by 8 * LDC. */
  1683. mov C, C1
  1684. add C, LDC, C2
  1685. add C2, LDC, C3
  1686. add C3, LDC, C4
  1687. add C4, LDC, C5
  1688. add C5, LDC, C6
  1689. add C6, LDC, C7
  1690. add C7, LDC, C8
  1691. add C8, LDC, C
  1692. #else
  /* RT: C8..C1 step downward from C by LDC each; C ends up equal to C1. */
  1693. sub C, LDC, C8
  1694. sub C8, LDC, C7
  1695. sub C7, LDC, C6
  1696. sub C6, LDC, C5
  1697. sub C5, LDC, C4
  1698. sub C4, LDC, C3
  1699. sub C3, LDC, C2
  1700. sub C2, LDC, C1
  1701. sub C2, LDC, C
  1702. #endif
  1703. #ifdef LN
  /* LN: KK starts at M + OFFSET. */
  1704. add M, OFFSET, KK
  1705. #endif
  1706. #ifdef LT
  /* LT: KK starts at OFFSET. */
  1707. mov OFFSET, KK
  1708. #endif
  1709. #if defined(LN) || defined(RT)
  /* LN/RT keep the A base in AORIG (AO is derived from it per tile); LT/RN start AO at A. */
  1710. mov A, AORIG
  1711. #else
  1712. mov A, AO
  1713. #endif
  /* I = number of 2-row tiles; when there are none, go to .LL20. */
  1714. sra M, 1, I
  1715. cmp I, 0
  1716. ble,pn %icc, .LL20
  1717. nop
  1718. .align 4
  /* 2x8 tile setup: position AO/BO, preload operands, clear the sixteen accumulators */
  /* cc01..cc16, prefetch the eight C rows and compute the trip count L for the loop at .LL13. */
  1719. .LL12:
  1720. #if defined(LT) || defined(RN)
  /* LT/RN: BO restarts at B for every tile. */
  1721. mov B, BO
  1722. #else
  1723. #ifdef LN
  /* LN: AORIG moves back by K shifted left by (BASE_SHIFT + 1) before AO is derived. */
  1724. sll K, BASE_SHIFT + 1, TEMP1
  1725. sub AORIG, TEMP1, AORIG
  1726. #endif
  /* LN/RT: AO offset uses shift BASE_SHIFT + 1, BO offset uses shift BASE_SHIFT + 3. */
  1727. sll KK, BASE_SHIFT + 1, TEMP1
  1728. sll KK, BASE_SHIFT + 3, TEMP2
  1729. add AORIG, TEMP1, AO
  1730. add B, TEMP2, BO
  1731. #endif
  /* Preload a1, a2 from AO[0..1], a5 from AO[8], and b1..b9 from BO[0..8]. */
  1732. LDF [AO + 0 * SIZE], a1
  1733. LDF [AO + 1 * SIZE], a2
  1734. LDF [AO + 8 * SIZE], a5
  1735. LDF [BO + 0 * SIZE], b1
  1736. LDF [BO + 1 * SIZE], b2
  /* FCLR presumably zeroes the named accumulator (macro defined outside this chunk). */
  1737. FCLR (cc01)
  1738. LDF [BO + 2 * SIZE], b3
  1739. FCLR (cc05)
  1740. LDF [BO + 3 * SIZE], b4
  1741. FCLR (cc09)
  1742. LDF [BO + 4 * SIZE], b5
  1743. FCLR (cc13)
  1744. LDF [BO + 5 * SIZE], b6
  1745. FCLR (cc02)
  1746. LDF [BO + 6 * SIZE], b7
  1747. FCLR (cc06)
  1748. LDF [BO + 7 * SIZE], b8
  1749. FCLR (cc10)
  1750. LDF [BO + 8 * SIZE], b9
  1751. FCLR (cc14)
  /* The C prefetch offsets alternate between 1 * SIZE and 2 * SIZE across C1..C8. */
  1752. prefetch [C1 + 1 * SIZE], 3
  1753. FCLR (cc03)
  1754. prefetch [C2 + 2 * SIZE], 3
  1755. FCLR (cc07)
  1756. prefetch [C3 + 1 * SIZE], 3
  1757. FCLR (cc11)
  1758. prefetch [C4 + 2 * SIZE], 3
  1759. FCLR (cc15)
  1760. prefetch [C5 + 1 * SIZE], 3
  1761. FCLR (cc04)
  1762. prefetch [C6 + 2 * SIZE], 3
  1763. FCLR (cc08)
  1764. prefetch [C7 + 1 * SIZE], 3
  1765. FCLR (cc12)
  1766. prefetch [C8 + 2 * SIZE], 3
  1767. FCLR (cc16)
  /* L = KK / 8 for LT/RN, (K - KK) / 8 otherwise (shift by 3, unlike the shift by 2 in the narrower tiles). */
  1768. #if defined(LT) || defined(RN)
  1769. sra KK, 3, L
  1770. #else
  1771. sub K, KK, L
  1772. sra L, 3, L
  1773. #endif
  /* No full group: go straight to .LL15. */
  1774. cmp L, 0
  1775. ble,pn %icc, .LL15
  1776. nop
  1777. .align 4
  1778. .LL13:
  1779. FMADD (aa1, bb1, cc01, cc01)
  1780. FMADD (aa2, bb1, cc02, cc02)
  1781. FMADD (aa1, bb2, cc03, cc03)
  1782. FMADD (aa2, bb2, cc04, cc04)
  1783. FMADD (aa1, bb3, cc05, cc05)
  1784. LDF [BO + 16 * SIZE], b1
  1785. FMADD (aa2, bb3, cc06, cc06)
  1786. LDF [BO + 9 * SIZE], b2
  1787. FMADD (aa1, bb4, cc07, cc07)
  1788. LDF [BO + 10 * SIZE], b3
  1789. FMADD (aa2, bb4, cc08, cc08)
  1790. LDF [BO + 11 * SIZE], b4
  1791. FMADD (aa1, bb5, cc09, cc09)
  1792. LDF [AO + 2 * SIZE], a3
  1793. FMADD (aa2, bb5, cc10, cc10)
  1794. LDF [AO + 3 * SIZE], a4
  1795. FMADD (aa1, bb6, cc11, cc11)
  1796. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1797. FMADD (aa2, bb6, cc12, cc12)
  1798. nop
  1799. FMADD (aa1, bb7, cc13, cc13)
  1800. LDF [BO + 12 * SIZE], b5
  1801. FMADD (aa2, bb7, cc14, cc14)
  1802. LDF [BO + 13 * SIZE], b6
  1803. FMADD (aa1, bb8, cc15, cc15)
  1804. LDF [BO + 14 * SIZE], b7
  1805. FMADD (aa2, bb8, cc16, cc16)
  1806. LDF [BO + 15 * SIZE], b8
  1807. FMADD (aa3, bb9, cc01, cc01)
  1808. FMADD (aa4, bb9, cc02, cc02)
  1809. FMADD (aa3, bb2, cc03, cc03)
  1810. FMADD (aa4, bb2, cc04, cc04)
  1811. FMADD (aa3, bb3, cc05, cc05)
  1812. LDF [BO + 24 * SIZE], b9
  1813. FMADD (aa4, bb3, cc06, cc06)
  1814. LDF [BO + 17 * SIZE], b2
  1815. FMADD (aa3, bb4, cc07, cc07)
  1816. LDF [BO + 18 * SIZE], b3
  1817. FMADD (aa4, bb4, cc08, cc08)
  1818. LDF [BO + 19 * SIZE], b4
  1819. FMADD (aa3, bb5, cc09, cc09)
  1820. LDF [AO + 4 * SIZE], a1
  1821. FMADD (aa4, bb5, cc10, cc10)
  1822. LDF [AO + 5 * SIZE], a2
  1823. FMADD (aa3, bb6, cc11, cc11)
  1824. add L, -1, L
  1825. FMADD (aa4, bb6, cc12, cc12)
  1826. nop
  1827. FMADD (aa3, bb7, cc13, cc13)
  1828. LDF [BO + 20 * SIZE], b5
  1829. FMADD (aa4, bb7, cc14, cc14)
  1830. LDF [BO + 21 * SIZE], b6
  1831. FMADD (aa3, bb8, cc15, cc15)
  1832. LDF [BO + 22 * SIZE], b7
  1833. FMADD (aa4, bb8, cc16, cc16)
  1834. LDF [BO + 23 * SIZE], b8
  1835. FMADD (aa1, bb1, cc01, cc01)
  1836. FMADD (aa2, bb1, cc02, cc02)
  1837. FMADD (aa1, bb2, cc03, cc03)
  1838. FMADD (aa2, bb2, cc04, cc04)
  1839. FMADD (aa1, bb3, cc05, cc05)
  1840. LDF [BO + 32 * SIZE], b1
  1841. FMADD (aa2, bb3, cc06, cc06)
  1842. LDF [BO + 25 * SIZE], b2
  1843. FMADD (aa1, bb4, cc07, cc07)
  1844. LDF [BO + 26 * SIZE], b3
  1845. FMADD (aa2, bb4, cc08, cc08)
  1846. LDF [BO + 27 * SIZE], b4
  1847. FMADD (aa1, bb5, cc09, cc09)
  1848. LDF [AO + 6 * SIZE], a3
  1849. FMADD (aa2, bb5, cc10, cc10)
  1850. LDF [AO + 7 * SIZE], a4
  1851. FMADD (aa1, bb6, cc11, cc11)
  1852. nop
  1853. FMADD (aa2, bb6, cc12, cc12)
  1854. nop
  1855. FMADD (aa1, bb7, cc13, cc13)
  1856. LDF [BO + 28 * SIZE], b5
  1857. FMADD (aa2, bb7, cc14, cc14)
  1858. LDF [BO + 29 * SIZE], b6
  1859. FMADD (aa1, bb8, cc15, cc15)
  1860. LDF [BO + 30 * SIZE], b7
  1861. FMADD (aa2, bb8, cc16, cc16)
  1862. LDF [BO + 31 * SIZE], b8
  1863. FMADD (aa3, bb9, cc01, cc01)
  1864. FMADD (aa4, bb9, cc02, cc02)
  1865. FMADD (aa3, bb2, cc03, cc03)
  1866. FMADD (aa4, bb2, cc04, cc04)
  1867. FMADD (aa3, bb3, cc05, cc05)
  1868. LDF [BO + 40 * SIZE], b9
  1869. FMADD (aa4, bb3, cc06, cc06)
  1870. LDF [BO + 33 * SIZE], b2
  1871. FMADD (aa3, bb4, cc07, cc07)
  1872. LDF [BO + 34 * SIZE], b3
  1873. FMADD (aa4, bb4, cc08, cc08)
  1874. LDF [BO + 35 * SIZE], b4
  1875. FMADD (aa3, bb5, cc09, cc09)
  1876. LDF [AO + 16 * SIZE], a1 /****/
  1877. FMADD (aa4, bb5, cc10, cc10)
  1878. LDF [AO + 9 * SIZE], a2
  1879. FMADD (aa3, bb6, cc11, cc11)
  1880. nop
  1881. FMADD (aa4, bb6, cc12, cc12)
  1882. nop
  1883. FMADD (aa3, bb7, cc13, cc13)
  1884. LDF [BO + 36 * SIZE], b5
  1885. FMADD (aa4, bb7, cc14, cc14)
  1886. LDF [BO + 37 * SIZE], b6
  1887. FMADD (aa3, bb8, cc15, cc15)
  1888. LDF [BO + 38 * SIZE], b7
  1889. FMADD (aa4, bb8, cc16, cc16)
  1890. LDF [BO + 39 * SIZE], b8
  1891. FMADD (aa5, bb1, cc01, cc01)
  1892. FMADD (aa2, bb1, cc02, cc02)
  1893. FMADD (aa5, bb2, cc03, cc03)
  1894. FMADD (aa2, bb2, cc04, cc04)
  1895. FMADD (aa5, bb3, cc05, cc05)
  1896. LDF [BO + 48 * SIZE], b1
  1897. FMADD (aa2, bb3, cc06, cc06)
  1898. LDF [BO + 41 * SIZE], b2
  1899. FMADD (aa5, bb4, cc07, cc07)
  1900. LDF [BO + 42 * SIZE], b3
  1901. FMADD (aa2, bb4, cc08, cc08)
  1902. LDF [BO + 43 * SIZE], b4
  1903. FMADD (aa5, bb5, cc09, cc09)
  1904. LDF [AO + 10 * SIZE], a3
  1905. FMADD (aa2, bb5, cc10, cc10)
  1906. LDF [AO + 11 * SIZE], a4
  1907. FMADD (aa5, bb6, cc11, cc11)
  1908. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1909. FMADD (aa2, bb6, cc12, cc12)
  1910. nop
  1911. FMADD (aa5, bb7, cc13, cc13)
  1912. LDF [BO + 44 * SIZE], b5
  1913. FMADD (aa2, bb7, cc14, cc14)
  1914. LDF [BO + 45 * SIZE], b6
  1915. FMADD (aa5, bb8, cc15, cc15)
  1916. LDF [BO + 46 * SIZE], b7
  1917. FMADD (aa2, bb8, cc16, cc16)
  1918. LDF [BO + 47 * SIZE], b8
  1919. FMADD (aa3, bb9, cc01, cc01)
  1920. FMADD (aa4, bb9, cc02, cc02)
  1921. FMADD (aa3, bb2, cc03, cc03)
  1922. FMADD (aa4, bb2, cc04, cc04)
  1923. FMADD (aa3, bb3, cc05, cc05)
  1924. LDF [BO + 56 * SIZE], b9
  1925. FMADD (aa4, bb3, cc06, cc06)
  1926. LDF [BO + 49 * SIZE], b2
  1927. FMADD (aa3, bb4, cc07, cc07)
  1928. LDF [BO + 50 * SIZE], b3
  1929. FMADD (aa4, bb4, cc08, cc08)
  1930. LDF [BO + 51 * SIZE], b4
  1931. FMADD (aa3, bb5, cc09, cc09)
  1932. LDF [AO + 12 * SIZE], a5
  1933. FMADD (aa4, bb5, cc10, cc10)
  1934. LDF [AO + 13 * SIZE], a2
  1935. FMADD (aa3, bb6, cc11, cc11)
  1936. cmp L, 0
  1937. FMADD (aa4, bb6, cc12, cc12)
  1938. nop
  1939. FMADD (aa3, bb7, cc13, cc13)
  1940. LDF [BO + 52 * SIZE], b5
  1941. FMADD (aa4, bb7, cc14, cc14)
  1942. LDF [BO + 53 * SIZE], b6
  1943. FMADD (aa3, bb8, cc15, cc15)
  1944. LDF [BO + 54 * SIZE], b7
  1945. FMADD (aa4, bb8, cc16, cc16)
  1946. LDF [BO + 55 * SIZE], b8
  1947. FMADD (aa5, bb1, cc01, cc01)
  1948. FMADD (aa2, bb1, cc02, cc02)
  1949. FMADD (aa5, bb2, cc03, cc03)
  1950. FMADD (aa2, bb2, cc04, cc04)
  1951. FMADD (aa5, bb3, cc05, cc05)
  1952. LDF [BO + 64 * SIZE], b1
  1953. FMADD (aa2, bb3, cc06, cc06)
  1954. LDF [BO + 57 * SIZE], b2
  1955. FMADD (aa5, bb4, cc07, cc07)
  1956. LDF [BO + 58 * SIZE], b3
  1957. FMADD (aa2, bb4, cc08, cc08)
  1958. LDF [BO + 59 * SIZE], b4
  1959. FMADD (aa5, bb5, cc09, cc09)
  1960. LDF [AO + 14 * SIZE], a3
  1961. FMADD (aa2, bb5, cc10, cc10)
  1962. LDF [AO + 15 * SIZE], a4
  1963. FMADD (aa5, bb6, cc11, cc11)
  1964. add BO, 64 * SIZE, BO
  1965. FMADD (aa2, bb6, cc12, cc12)
  1966. add AO, 16 * SIZE, AO
  1967. FMADD (aa5, bb7, cc13, cc13)
  1968. LDF [BO - 4 * SIZE], b5
  1969. FMADD (aa2, bb7, cc14, cc14)
  1970. LDF [BO - 3 * SIZE], b6
  1971. FMADD (aa5, bb8, cc15, cc15)
  1972. LDF [BO - 2 * SIZE], b7
  1973. FMADD (aa2, bb8, cc16, cc16)
  1974. LDF [BO - 1 * SIZE], b8
  1975. FMADD (aa3, bb9, cc01, cc01)
  1976. FMADD (aa4, bb9, cc02, cc02)
  1977. FMADD (aa3, bb2, cc03, cc03)
  1978. FMADD (aa4, bb2, cc04, cc04)
  1979. FMADD (aa3, bb3, cc05, cc05)
  1980. LDF [BO + 8 * SIZE], b9
  1981. FMADD (aa4, bb3, cc06, cc06)
  1982. LDF [BO + 1 * SIZE], b2
  1983. FMADD (aa3, bb4, cc07, cc07)
  1984. LDF [BO + 2 * SIZE], b3
  1985. FMADD (aa4, bb4, cc08, cc08)
  1986. LDF [BO + 3 * SIZE], b4
  1987. FMADD (aa3, bb5, cc09, cc09)
  1988. LDF [AO + 8 * SIZE], a5 /****/
  1989. FMADD (aa4, bb5, cc10, cc10)
  1990. LDF [AO + 1 * SIZE], a2
  1991. FMADD (aa3, bb6, cc11, cc11)
  1992. FMADD (aa4, bb6, cc12, cc12)
  1993. FMADD (aa3, bb7, cc13, cc13)
  1994. LDF [BO + 4 * SIZE], b5
  1995. FMADD (aa4, bb7, cc14, cc14)
  1996. LDF [BO + 5 * SIZE], b6
  1997. FMADD (aa3, bb8, cc15, cc15)
  1998. LDF [BO + 6 * SIZE], b7
  1999. FMADD (aa4, bb8, cc16, cc16)
  2000. ble,pn %icc, .LL15
  2001. LDF [BO + 7 * SIZE], b8
  2002. FMADD (aa1, bb1, cc01, cc01)
  2003. FMADD (aa2, bb1, cc02, cc02)
  2004. FMADD (aa1, bb2, cc03, cc03)
  2005. FMADD (aa2, bb2, cc04, cc04)
  2006. FMADD (aa1, bb3, cc05, cc05)
  2007. LDF [BO + 16 * SIZE], b1
  2008. FMADD (aa2, bb3, cc06, cc06)
  2009. LDF [BO + 9 * SIZE], b2
  2010. FMADD (aa1, bb4, cc07, cc07)
  2011. LDF [BO + 10 * SIZE], b3
  2012. FMADD (aa2, bb4, cc08, cc08)
  2013. LDF [BO + 11 * SIZE], b4
  2014. FMADD (aa1, bb5, cc09, cc09)
  2015. LDF [AO + 2 * SIZE], a3
  2016. FMADD (aa2, bb5, cc10, cc10)
  2017. LDF [AO + 3 * SIZE], a4
  2018. FMADD (aa1, bb6, cc11, cc11)
  2019. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2020. FMADD (aa2, bb6, cc12, cc12)
  2021. nop
  2022. FMADD (aa1, bb7, cc13, cc13)
  2023. LDF [BO + 12 * SIZE], b5
  2024. FMADD (aa2, bb7, cc14, cc14)
  2025. LDF [BO + 13 * SIZE], b6
  2026. FMADD (aa1, bb8, cc15, cc15)
  2027. LDF [BO + 14 * SIZE], b7
  2028. FMADD (aa2, bb8, cc16, cc16)
  2029. LDF [BO + 15 * SIZE], b8
  2030. FMADD (aa3, bb9, cc01, cc01)
  2031. FMADD (aa4, bb9, cc02, cc02)
  2032. FMADD (aa3, bb2, cc03, cc03)
  2033. FMADD (aa4, bb2, cc04, cc04)
  2034. FMADD (aa3, bb3, cc05, cc05)
  2035. LDF [BO + 24 * SIZE], b9
  2036. FMADD (aa4, bb3, cc06, cc06)
  2037. LDF [BO + 17 * SIZE], b2
  2038. FMADD (aa3, bb4, cc07, cc07)
  2039. LDF [BO + 18 * SIZE], b3
  2040. FMADD (aa4, bb4, cc08, cc08)
  2041. LDF [BO + 19 * SIZE], b4
  2042. FMADD (aa3, bb5, cc09, cc09)
  2043. LDF [AO + 4 * SIZE], a1
  2044. FMADD (aa4, bb5, cc10, cc10)
  2045. LDF [AO + 5 * SIZE], a2
  2046. FMADD (aa3, bb6, cc11, cc11)
  2047. add L, -1, L
  2048. FMADD (aa4, bb6, cc12, cc12)
  2049. nop
  2050. FMADD (aa3, bb7, cc13, cc13)
  2051. LDF [BO + 20 * SIZE], b5
  2052. FMADD (aa4, bb7, cc14, cc14)
  2053. LDF [BO + 21 * SIZE], b6
  2054. FMADD (aa3, bb8, cc15, cc15)
  2055. LDF [BO + 22 * SIZE], b7
  2056. FMADD (aa4, bb8, cc16, cc16)
  2057. LDF [BO + 23 * SIZE], b8
  2058. FMADD (aa1, bb1, cc01, cc01)
  2059. FMADD (aa2, bb1, cc02, cc02)
  2060. FMADD (aa1, bb2, cc03, cc03)
  2061. FMADD (aa2, bb2, cc04, cc04)
  2062. FMADD (aa1, bb3, cc05, cc05)
  2063. LDF [BO + 32 * SIZE], b1
  2064. FMADD (aa2, bb3, cc06, cc06)
  2065. LDF [BO + 25 * SIZE], b2
  2066. FMADD (aa1, bb4, cc07, cc07)
  2067. LDF [BO + 26 * SIZE], b3
  2068. FMADD (aa2, bb4, cc08, cc08)
  2069. LDF [BO + 27 * SIZE], b4
  2070. FMADD (aa1, bb5, cc09, cc09)
  2071. LDF [AO + 6 * SIZE], a3
  2072. FMADD (aa2, bb5, cc10, cc10)
  2073. LDF [AO + 7 * SIZE], a4
  2074. FMADD (aa1, bb6, cc11, cc11)
  2075. nop
  2076. FMADD (aa2, bb6, cc12, cc12)
  2077. nop
  2078. FMADD (aa1, bb7, cc13, cc13)
  2079. LDF [BO + 28 * SIZE], b5
  2080. FMADD (aa2, bb7, cc14, cc14)
  2081. LDF [BO + 29 * SIZE], b6
  2082. FMADD (aa1, bb8, cc15, cc15)
  2083. LDF [BO + 30 * SIZE], b7
  2084. FMADD (aa2, bb8, cc16, cc16)
  2085. LDF [BO + 31 * SIZE], b8
  2086. FMADD (aa3, bb9, cc01, cc01)
  2087. FMADD (aa4, bb9, cc02, cc02)
  2088. FMADD (aa3, bb2, cc03, cc03)
  2089. FMADD (aa4, bb2, cc04, cc04)
  2090. FMADD (aa3, bb3, cc05, cc05)
  2091. LDF [BO + 40 * SIZE], b9
  2092. FMADD (aa4, bb3, cc06, cc06)
  2093. LDF [BO + 33 * SIZE], b2
  2094. FMADD (aa3, bb4, cc07, cc07)
  2095. LDF [BO + 34 * SIZE], b3
  2096. FMADD (aa4, bb4, cc08, cc08)
  2097. LDF [BO + 35 * SIZE], b4
  2098. FMADD (aa3, bb5, cc09, cc09)
  2099. LDF [AO + 16 * SIZE], a1 /****/
  2100. FMADD (aa4, bb5, cc10, cc10)
  2101. LDF [AO + 9 * SIZE], a2
  2102. FMADD (aa3, bb6, cc11, cc11)
  2103. nop
  2104. FMADD (aa4, bb6, cc12, cc12)
  2105. nop
  2106. FMADD (aa3, bb7, cc13, cc13)
  2107. LDF [BO + 36 * SIZE], b5
  2108. FMADD (aa4, bb7, cc14, cc14)
  2109. LDF [BO + 37 * SIZE], b6
  2110. FMADD (aa3, bb8, cc15, cc15)
  2111. LDF [BO + 38 * SIZE], b7
  2112. FMADD (aa4, bb8, cc16, cc16)
  2113. LDF [BO + 39 * SIZE], b8
  2114. FMADD (aa5, bb1, cc01, cc01)
  2115. FMADD (aa2, bb1, cc02, cc02)
  2116. FMADD (aa5, bb2, cc03, cc03)
  2117. FMADD (aa2, bb2, cc04, cc04)
  2118. FMADD (aa5, bb3, cc05, cc05)
  2119. LDF [BO + 48 * SIZE], b1
  2120. FMADD (aa2, bb3, cc06, cc06)
  2121. LDF [BO + 41 * SIZE], b2
  2122. FMADD (aa5, bb4, cc07, cc07)
  2123. LDF [BO + 42 * SIZE], b3
  2124. FMADD (aa2, bb4, cc08, cc08)
  2125. LDF [BO + 43 * SIZE], b4
  2126. FMADD (aa5, bb5, cc09, cc09)
  2127. LDF [AO + 10 * SIZE], a3
  2128. FMADD (aa2, bb5, cc10, cc10)
  2129. LDF [AO + 11 * SIZE], a4
  2130. FMADD (aa5, bb6, cc11, cc11)
  2131. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  2132. FMADD (aa2, bb6, cc12, cc12)
  2133. nop
  2134. FMADD (aa5, bb7, cc13, cc13)
  2135. LDF [BO + 44 * SIZE], b5
  2136. FMADD (aa2, bb7, cc14, cc14)
  2137. LDF [BO + 45 * SIZE], b6
  2138. FMADD (aa5, bb8, cc15, cc15)
  2139. LDF [BO + 46 * SIZE], b7
  2140. FMADD (aa2, bb8, cc16, cc16)
  2141. LDF [BO + 47 * SIZE], b8
  2142. FMADD (aa3, bb9, cc01, cc01)
  2143. FMADD (aa4, bb9, cc02, cc02)
  2144. FMADD (aa3, bb2, cc03, cc03)
  2145. FMADD (aa4, bb2, cc04, cc04)
  2146. FMADD (aa3, bb3, cc05, cc05)
  2147. LDF [BO + 56 * SIZE], b9
  2148. FMADD (aa4, bb3, cc06, cc06)
  2149. LDF [BO + 49 * SIZE], b2
  2150. FMADD (aa3, bb4, cc07, cc07)
  2151. LDF [BO + 50 * SIZE], b3
  2152. FMADD (aa4, bb4, cc08, cc08)
  2153. LDF [BO + 51 * SIZE], b4
  2154. FMADD (aa3, bb5, cc09, cc09)
  2155. LDF [AO + 12 * SIZE], a5
  2156. FMADD (aa4, bb5, cc10, cc10)
  2157. LDF [AO + 13 * SIZE], a2
  2158. FMADD (aa3, bb6, cc11, cc11)
  2159. cmp L, 0
  2160. FMADD (aa4, bb6, cc12, cc12)
  2161. nop
  2162. FMADD (aa3, bb7, cc13, cc13)
  2163. LDF [BO + 52 * SIZE], b5
  2164. FMADD (aa4, bb7, cc14, cc14)
  2165. LDF [BO + 53 * SIZE], b6
  2166. FMADD (aa3, bb8, cc15, cc15)
  2167. LDF [BO + 54 * SIZE], b7
  2168. FMADD (aa4, bb8, cc16, cc16)
  2169. LDF [BO + 55 * SIZE], b8
  2170. FMADD (aa5, bb1, cc01, cc01)
  2171. FMADD (aa2, bb1, cc02, cc02)
  2172. FMADD (aa5, bb2, cc03, cc03)
  2173. FMADD (aa2, bb2, cc04, cc04)
  2174. FMADD (aa5, bb3, cc05, cc05)
  2175. LDF [BO + 64 * SIZE], b1
  2176. FMADD (aa2, bb3, cc06, cc06)
  2177. LDF [BO + 57 * SIZE], b2
  2178. FMADD (aa5, bb4, cc07, cc07)
  2179. LDF [BO + 58 * SIZE], b3
  2180. FMADD (aa2, bb4, cc08, cc08)
  2181. LDF [BO + 59 * SIZE], b4
  2182. FMADD (aa5, bb5, cc09, cc09)
  2183. LDF [AO + 14 * SIZE], a3
  2184. FMADD (aa2, bb5, cc10, cc10)
  2185. LDF [AO + 15 * SIZE], a4
  2186. FMADD (aa5, bb6, cc11, cc11)
  2187. add BO, 64 * SIZE, BO
  2188. FMADD (aa2, bb6, cc12, cc12)
  2189. add AO, 16 * SIZE, AO
  2190. FMADD (aa5, bb7, cc13, cc13)
  2191. LDF [BO - 4 * SIZE], b5
  2192. FMADD (aa2, bb7, cc14, cc14)
  2193. LDF [BO - 3 * SIZE], b6
  2194. FMADD (aa5, bb8, cc15, cc15)
  2195. LDF [BO - 2 * SIZE], b7
  2196. FMADD (aa2, bb8, cc16, cc16)
  2197. LDF [BO - 1 * SIZE], b8
  2198. FMADD (aa3, bb9, cc01, cc01)
  2199. FMADD (aa4, bb9, cc02, cc02)
  2200. FMADD (aa3, bb2, cc03, cc03)
  2201. FMADD (aa4, bb2, cc04, cc04)
  2202. FMADD (aa3, bb3, cc05, cc05)
  2203. LDF [BO + 8 * SIZE], b9
  2204. FMADD (aa4, bb3, cc06, cc06)
  2205. LDF [BO + 1 * SIZE], b2
  2206. FMADD (aa3, bb4, cc07, cc07)
  2207. LDF [BO + 2 * SIZE], b3
  2208. FMADD (aa4, bb4, cc08, cc08)
  2209. LDF [BO + 3 * SIZE], b4
  2210. FMADD (aa3, bb5, cc09, cc09)
  2211. LDF [AO + 8 * SIZE], a5 /****/
  2212. FMADD (aa4, bb5, cc10, cc10)
  2213. LDF [AO + 1 * SIZE], a2
  2214. FMADD (aa3, bb6, cc11, cc11)
  2215. FMADD (aa4, bb6, cc12, cc12)
  2216. FMADD (aa3, bb7, cc13, cc13)
  2217. LDF [BO + 4 * SIZE], b5
  2218. FMADD (aa4, bb7, cc14, cc14)
  2219. LDF [BO + 5 * SIZE], b6
  2220. FMADD (aa3, bb8, cc15, cc15)
  2221. LDF [BO + 6 * SIZE], b7
  2222. FMADD (aa4, bb8, cc16, cc16)
  2223. bg,pt %icc, .LL13
  2224. LDF [BO + 7 * SIZE], b8
  /*
   * .LL15 -- compute the number of k-steps left over after the eight-step
   * copies of .LL13: L = KK & 7 for LT/RN, otherwise L = (K - KK) & 7.
   * When nothing is left, branch straight to the solve at .LL18.
   */
  2225. .align 4
  2226. .LL15:
  2227. #if defined(LT) || defined(RN)
  2228. and KK, 7, L
  2229. #else
  2230. sub K, KK, L
  2231. and L, 7, L
  2232. #endif
  2233. cmp L, 0
  /* Annulled branch; the delay slot holds only a nop. */
  2234. ble,a,pn %icc, .LL18
  2235. nop
  /*
   * .LL17 -- remainder loop for the 2x8 tile: one k-step per iteration.
   * Sixteen FMADDs combine a1/a2 with b1..b8 into cc01..cc16 while the
   * operands for the next k-step are loaded.  AO advances by 2 * SIZE and
   * BO by 8 * SIZE per iteration.
   */
  2236. .align 4
  2237. .LL17:
  2238. FMADD (aa1, bb1, cc01, cc01)
  2239. add L, -1, L
  2240. FMADD (aa2, bb1, cc02, cc02)
  2241. nop
  2242. FMADD (aa1, bb2, cc03, cc03)
  2243. LDF [BO + 8 * SIZE], b1
  2244. FMADD (aa2, bb2, cc04, cc04)
  2245. LDF [BO + 9 * SIZE], b2
  2246. FMADD (aa1, bb3, cc05, cc05)
  /* Sets %icc for the bg below; the later adds are non-cc forms. */
  2247. cmp L, 0
  2248. FMADD (aa2, bb3, cc06, cc06)
  2249. nop
  2250. FMADD (aa1, bb4, cc07, cc07)
  2251. LDF [BO + 10 * SIZE], b3
  2252. FMADD (aa2, bb4, cc08, cc08)
  2253. LDF [BO + 11 * SIZE], b4
  2254. FMADD (aa1, bb5, cc09, cc09)
  2255. nop
  2256. FMADD (aa2, bb5, cc10, cc10)
  2257. nop
  2258. FMADD (aa1, bb6, cc11, cc11)
  2259. LDF [BO + 12 * SIZE], b5
  2260. FMADD (aa2, bb6, cc12, cc12)
  2261. LDF [BO + 13 * SIZE], b6
  2262. FMADD (aa1, bb7, cc13, cc13)
  /* Pointers advance; the loads below use offsets from the new AO/BO. */
  2263. add AO, 2 * SIZE, AO
  2264. FMADD (aa2, bb7, cc14, cc14)
  2265. add BO, 8 * SIZE, BO
  2266. FMADD (aa1, bb8, cc15, cc15)
  2267. LDF [AO + 0 * SIZE], a1
  2268. FMADD (aa2, bb8, cc16, cc16)
  2269. LDF [AO + 1 * SIZE], a2
  2270. LDF [BO + 6 * SIZE], b7
  /* Repeat while L > 0; the b8 load sits in the branch delay slot. */
  2271. bg,pt %icc, .LL17
  2272. LDF [BO + 7 * SIZE], b8
  2273. nop
  /*
   * .LL18 -- triangular solve and write-back for the finished 2x8 tile.
   *
   * Steps, selected by the LN / LT / RN / RT preprocessor variants:
   *   1. (LN, RT) reposition AO and BO relative to AORIG and B.
   *   2. Load sixteen values from the packed buffer and combine them with
   *      the accumulators c01..c16 using FSUB.
   *   3. Apply the 2x2 (LN, LT) or 8x8 (RN, RT) triangular factor.
   *   4. Store the results back into the packed buffer and into C1..C8.
   *   5. Update C pointers, AO/BO/AORIG, KK and the row counter I, then
   *      loop back to .LL12 (outside this excerpt) while I > 0.
   *
   * NOTE(review): FSUB and FNMSUB are macros defined outside this excerpt.
   * Presumably FSUB x, y, z computes z = x - y (the SPARC fsub operand
   * order) and FNMSUB (x, y, z, w) computes w = z - x * y -- confirm
   * against the macro definitions.
   */
  2274. .align 4
  2275. .LL18:
  /*
   * TEMP1 = KK - 2 (LN) or KK - 8 (RT).  AO = AORIG + TEMP1 scaled by
   * 2^(BASE_SHIFT + 1), BO = B + TEMP1 scaled by 2^(BASE_SHIFT + 3);
   * that is 2 and 8 elements per unit if BASE_SHIFT is log2 of SIZE --
   * TODO confirm.
   */
  2276. #if defined(LN) || defined(RT)
  2277. #ifdef LN
  2278. sub KK, 2, TEMP1
  2279. #else
  2280. sub KK, 8, TEMP1
  2281. #endif
  2282. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2283. sll TEMP1, BASE_SHIFT + 3, TEMP1
  2284. add AORIG, TEMP2, AO
  2285. add B, TEMP1, BO
  2286. #endif
  /*
   * LN/LT: BO[0..7] pair with the odd accumulators c01, c03, ..., c15 and
   * BO[8..15] with the even accumulators c02, c04, ..., c16.
   */
  2287. #if defined(LN) || defined(LT)
  2288. LDF [BO + 0 * SIZE], a1
  2289. LDF [BO + 1 * SIZE], a2
  2290. LDF [BO + 2 * SIZE], a3
  2291. LDF [BO + 3 * SIZE], a4
  2292. LDF [BO + 4 * SIZE], b1
  2293. LDF [BO + 5 * SIZE], b2
  2294. LDF [BO + 6 * SIZE], b3
  2295. LDF [BO + 7 * SIZE], b4
  2296. FSUB a1, c01, c01
  2297. FSUB a2, c03, c03
  2298. FSUB a3, c05, c05
  2299. FSUB a4, c07, c07
  2300. FSUB b1, c09, c09
  2301. FSUB b2, c11, c11
  2302. FSUB b3, c13, c13
  2303. FSUB b4, c15, c15
  2304. LDF [BO + 8 * SIZE], a1
  2305. LDF [BO + 9 * SIZE], a2
  2306. LDF [BO + 10 * SIZE], a3
  2307. LDF [BO + 11 * SIZE], a4
  2308. LDF [BO + 12 * SIZE], b1
  2309. LDF [BO + 13 * SIZE], b2
  2310. LDF [BO + 14 * SIZE], b3
  2311. LDF [BO + 15 * SIZE], b4
  2312. FSUB a1, c02, c02
  2313. FSUB a2, c04, c04
  2314. FSUB a3, c06, c06
  2315. FSUB a4, c08, c08
  2316. FSUB b1, c10, c10
  2317. FSUB b2, c12, c12
  2318. FSUB b3, c14, c14
  2319. FSUB b4, c16, c16
  2320. #else
  /* RN/RT: AO[0..15] pair with c01..c16 in order. */
  2321. LDF [AO + 0 * SIZE], a1
  2322. LDF [AO + 1 * SIZE], a2
  2323. LDF [AO + 2 * SIZE], a3
  2324. LDF [AO + 3 * SIZE], a4
  2325. LDF [AO + 4 * SIZE], b1
  2326. LDF [AO + 5 * SIZE], b2
  2327. LDF [AO + 6 * SIZE], b3
  2328. LDF [AO + 7 * SIZE], b4
  2329. FSUB a1, c01, c01
  2330. FSUB a2, c02, c02
  2331. FSUB a3, c03, c03
  2332. FSUB a4, c04, c04
  2333. FSUB b1, c05, c05
  2334. FSUB b2, c06, c06
  2335. FSUB b3, c07, c07
  2336. FSUB b4, c08, c08
  2337. LDF [AO + 8 * SIZE], a1
  2338. LDF [AO + 9 * SIZE], a2
  2339. LDF [AO + 10 * SIZE], a3
  2340. LDF [AO + 11 * SIZE], a4
  2341. LDF [AO + 12 * SIZE], b1
  2342. LDF [AO + 13 * SIZE], b2
  2343. LDF [AO + 14 * SIZE], b3
  2344. LDF [AO + 15 * SIZE], b4
  2345. FSUB a1, c09, c09
  2346. FSUB a2, c10, c10
  2347. FSUB a3, c11, c11
  2348. FSUB a4, c12, c12
  2349. FSUB b1, c13, c13
  2350. FSUB b2, c14, c14
  2351. FSUB b3, c15, c15
  2352. FSUB b4, c16, c16
  2353. #endif
  /*
   * LN: 2x2 factor read from AO[3], AO[2], AO[0].  The even accumulators
   * are scaled by AO[3], folded into the odd ones with AO[2], and the odd
   * ones are then scaled by AO[0].
   * NOTE(review): the diagonal entries are multiplied, not divided, which
   * suggests they are stored pre-inverted by the packing code -- confirm.
   */
  2354. #ifdef LN
  2355. LDF [AO + 3 * SIZE], a1
  2356. LDF [AO + 2 * SIZE], a2
  2357. LDF [AO + 0 * SIZE], a3
  2358. FMUL a1, c02, c02
  2359. FMUL a1, c04, c04
  2360. FMUL a1, c06, c06
  2361. FMUL a1, c08, c08
  2362. FMUL a1, c10, c10
  2363. FMUL a1, c12, c12
  2364. FMUL a1, c14, c14
  2365. FMUL a1, c16, c16
  2366. FNMSUB (aa2, cc02, cc01, cc01)
  2367. FNMSUB (aa2, cc04, cc03, cc03)
  2368. FNMSUB (aa2, cc06, cc05, cc05)
  2369. FNMSUB (aa2, cc08, cc07, cc07)
  2370. FNMSUB (aa2, cc10, cc09, cc09)
  2371. FNMSUB (aa2, cc12, cc11, cc11)
  2372. FNMSUB (aa2, cc14, cc13, cc13)
  2373. FNMSUB (aa2, cc16, cc15, cc15)
  2374. FMUL a3, c01, c01
  2375. FMUL a3, c03, c03
  2376. FMUL a3, c05, c05
  2377. FMUL a3, c07, c07
  2378. FMUL a3, c09, c09
  2379. FMUL a3, c11, c11
  2380. FMUL a3, c13, c13
  2381. FMUL a3, c15, c15
  2382. #endif
  /*
   * LT: 2x2 factor read from AO[0], AO[1], AO[3]; same pattern as LN but
   * starting from the odd accumulators and finishing with the even ones.
   */
  2383. #ifdef LT
  2384. LDF [AO + 0 * SIZE], a1
  2385. LDF [AO + 1 * SIZE], a2
  2386. LDF [AO + 3 * SIZE], a3
  2387. FMUL a1, c01, c01
  2388. FMUL a1, c03, c03
  2389. FMUL a1, c05, c05
  2390. FMUL a1, c07, c07
  2391. FMUL a1, c09, c09
  2392. FMUL a1, c11, c11
  2393. FMUL a1, c13, c13
  2394. FMUL a1, c15, c15
  2395. FNMSUB (aa2, cc01, cc02, cc02)
  2396. FNMSUB (aa2, cc03, cc04, cc04)
  2397. FNMSUB (aa2, cc05, cc06, cc06)
  2398. FNMSUB (aa2, cc07, cc08, cc08)
  2399. FNMSUB (aa2, cc09, cc10, cc10)
  2400. FNMSUB (aa2, cc11, cc12, cc12)
  2401. FNMSUB (aa2, cc13, cc14, cc14)
  2402. FNMSUB (aa2, cc15, cc16, cc16)
  2403. FMUL a3, c02, c02
  2404. FMUL a3, c04, c04
  2405. FMUL a3, c06, c06
  2406. FMUL a3, c08, c08
  2407. FMUL a3, c10, c10
  2408. FMUL a3, c12, c12
  2409. FMUL a3, c14, c14
  2410. FMUL a3, c16, c16
  2411. #endif
  /*
   * RN: forward sweep over the eight accumulator pairs (c01/c02 first,
   * c15/c16 last) using an 8x8 factor in BO.  Stage i scales its pair by
   * BO[9 * i] and then updates every later pair with the entries that
   * follow it in the same row of eight, so each stage reads one value
   * fewer than the previous one.
   */
  2412. #ifdef RN
  2413. LDF [BO + 0 * SIZE], a1
  2414. LDF [BO + 1 * SIZE], a2
  2415. LDF [BO + 2 * SIZE], a3
  2416. LDF [BO + 3 * SIZE], a4
  2417. LDF [BO + 4 * SIZE], b1
  2418. LDF [BO + 5 * SIZE], b2
  2419. LDF [BO + 6 * SIZE], b3
  2420. LDF [BO + 7 * SIZE], b4
  2421. FMUL a1, c01, c01
  2422. FMUL a1, c02, c02
  2423. FNMSUB (aa2, cc01, cc03, cc03)
  2424. FNMSUB (aa2, cc02, cc04, cc04)
  2425. FNMSUB (aa3, cc01, cc05, cc05)
  2426. FNMSUB (aa3, cc02, cc06, cc06)
  2427. FNMSUB (aa4, cc01, cc07, cc07)
  2428. FNMSUB (aa4, cc02, cc08, cc08)
  2429. FNMSUB (bb1, cc01, cc09, cc09)
  2430. FNMSUB (bb1, cc02, cc10, cc10)
  2431. FNMSUB (bb2, cc01, cc11, cc11)
  2432. FNMSUB (bb2, cc02, cc12, cc12)
  2433. FNMSUB (bb3, cc01, cc13, cc13)
  2434. FNMSUB (bb3, cc02, cc14, cc14)
  2435. FNMSUB (bb4, cc01, cc15, cc15)
  2436. FNMSUB (bb4, cc02, cc16, cc16)
  2437. LDF [BO + 9 * SIZE], a1
  2438. LDF [BO + 10 * SIZE], a2
  2439. LDF [BO + 11 * SIZE], a3
  2440. LDF [BO + 12 * SIZE], a4
  2441. LDF [BO + 13 * SIZE], b1
  2442. LDF [BO + 14 * SIZE], b2
  2443. LDF [BO + 15 * SIZE], b3
  2444. FMUL a1, c03, c03
  2445. FMUL a1, c04, c04
  2446. FNMSUB (aa2, cc03, cc05, cc05)
  2447. FNMSUB (aa2, cc04, cc06, cc06)
  2448. FNMSUB (aa3, cc03, cc07, cc07)
  2449. FNMSUB (aa3, cc04, cc08, cc08)
  2450. FNMSUB (aa4, cc03, cc09, cc09)
  2451. FNMSUB (aa4, cc04, cc10, cc10)
  2452. FNMSUB (bb1, cc03, cc11, cc11)
  2453. FNMSUB (bb1, cc04, cc12, cc12)
  2454. FNMSUB (bb2, cc03, cc13, cc13)
  2455. FNMSUB (bb2, cc04, cc14, cc14)
  2456. FNMSUB (bb3, cc03, cc15, cc15)
  2457. FNMSUB (bb3, cc04, cc16, cc16)
  2458. LDF [BO + 18 * SIZE], a1
  2459. LDF [BO + 19 * SIZE], a2
  2460. LDF [BO + 20 * SIZE], a3
  2461. LDF [BO + 21 * SIZE], a4
  2462. LDF [BO + 22 * SIZE], b1
  2463. LDF [BO + 23 * SIZE], b2
  2464. FMUL a1, c05, c05
  2465. FMUL a1, c06, c06
  2466. FNMSUB (aa2, cc05, cc07, cc07)
  2467. FNMSUB (aa2, cc06, cc08, cc08)
  2468. FNMSUB (aa3, cc05, cc09, cc09)
  2469. FNMSUB (aa3, cc06, cc10, cc10)
  2470. FNMSUB (aa4, cc05, cc11, cc11)
  2471. FNMSUB (aa4, cc06, cc12, cc12)
  2472. FNMSUB (bb1, cc05, cc13, cc13)
  2473. FNMSUB (bb1, cc06, cc14, cc14)
  2474. FNMSUB (bb2, cc05, cc15, cc15)
  2475. FNMSUB (bb2, cc06, cc16, cc16)
  2476. LDF [BO + 27 * SIZE], a1
  2477. LDF [BO + 28 * SIZE], a2
  2478. LDF [BO + 29 * SIZE], a3
  2479. LDF [BO + 30 * SIZE], a4
  2480. LDF [BO + 31 * SIZE], b1
  2481. FMUL a1, c07, c07
  2482. FMUL a1, c08, c08
  2483. FNMSUB (aa2, cc07, cc09, cc09)
  2484. FNMSUB (aa2, cc08, cc10, cc10)
  2485. FNMSUB (aa3, cc07, cc11, cc11)
  2486. FNMSUB (aa3, cc08, cc12, cc12)
  2487. FNMSUB (aa4, cc07, cc13, cc13)
  2488. FNMSUB (aa4, cc08, cc14, cc14)
  2489. FNMSUB (bb1, cc07, cc15, cc15)
  2490. FNMSUB (bb1, cc08, cc16, cc16)
  2491. LDF [BO + 36 * SIZE], a1
  2492. LDF [BO + 37 * SIZE], a2
  2493. LDF [BO + 38 * SIZE], a3
  2494. LDF [BO + 39 * SIZE], a4
  2495. FMUL a1, c09, c09
  2496. FMUL a1, c10, c10
  2497. FNMSUB (aa2, cc09, cc11, cc11)
  2498. FNMSUB (aa2, cc10, cc12, cc12)
  2499. FNMSUB (aa3, cc09, cc13, cc13)
  2500. FNMSUB (aa3, cc10, cc14, cc14)
  2501. FNMSUB (aa4, cc09, cc15, cc15)
  2502. FNMSUB (aa4, cc10, cc16, cc16)
  2503. LDF [BO + 45 * SIZE], a1
  2504. LDF [BO + 46 * SIZE], a2
  2505. LDF [BO + 47 * SIZE], a3
  2506. FMUL a1, c11, c11
  2507. FMUL a1, c12, c12
  2508. FNMSUB (aa2, cc11, cc13, cc13)
  2509. FNMSUB (aa2, cc12, cc14, cc14)
  2510. FNMSUB (aa3, cc11, cc15, cc15)
  2511. FNMSUB (aa3, cc12, cc16, cc16)
  2512. LDF [BO + 54 * SIZE], a1
  2513. LDF [BO + 55 * SIZE], a2
  2514. FMUL a1, c13, c13
  2515. FMUL a1, c14, c14
  2516. FNMSUB (aa2, cc13, cc15, cc15)
  2517. FNMSUB (aa2, cc14, cc16, cc16)
  2518. LDF [BO + 63 * SIZE], a1
  2519. FMUL a1, c15, c15
  2520. FMUL a1, c16, c16
  2521. #endif
  /*
   * RT: backward sweep, the mirror image of RN.  It starts with the pair
   * c15/c16 scaled by BO[63] and works down to c01/c02 scaled by BO[0],
   * reading each row of the 8x8 factor from its diagonal entry towards
   * lower offsets.
   */
  2522. #ifdef RT
  2523. LDF [BO + 63 * SIZE], a1
  2524. LDF [BO + 62 * SIZE], a2
  2525. LDF [BO + 61 * SIZE], a3
  2526. LDF [BO + 60 * SIZE], a4
  2527. LDF [BO + 59 * SIZE], b1
  2528. LDF [BO + 58 * SIZE], b2
  2529. LDF [BO + 57 * SIZE], b3
  2530. LDF [BO + 56 * SIZE], b4
  2531. FMUL a1, c16, c16
  2532. FMUL a1, c15, c15
  2533. FNMSUB (aa2, cc16, cc14, cc14)
  2534. FNMSUB (aa2, cc15, cc13, cc13)
  2535. FNMSUB (aa3, cc16, cc12, cc12)
  2536. FNMSUB (aa3, cc15, cc11, cc11)
  2537. FNMSUB (aa4, cc16, cc10, cc10)
  2538. FNMSUB (aa4, cc15, cc09, cc09)
  2539. FNMSUB (bb1, cc16, cc08, cc08)
  2540. FNMSUB (bb1, cc15, cc07, cc07)
  2541. FNMSUB (bb2, cc16, cc06, cc06)
  2542. FNMSUB (bb2, cc15, cc05, cc05)
  2543. FNMSUB (bb3, cc16, cc04, cc04)
  2544. FNMSUB (bb3, cc15, cc03, cc03)
  2545. FNMSUB (bb4, cc16, cc02, cc02)
  2546. FNMSUB (bb4, cc15, cc01, cc01)
  2547. LDF [BO + 54 * SIZE], a1
  2548. LDF [BO + 53 * SIZE], a2
  2549. LDF [BO + 52 * SIZE], a3
  2550. LDF [BO + 51 * SIZE], a4
  2551. LDF [BO + 50 * SIZE], b1
  2552. LDF [BO + 49 * SIZE], b2
  2553. LDF [BO + 48 * SIZE], b3
  2554. FMUL a1, c14, c14
  2555. FMUL a1, c13, c13
  2556. FNMSUB (aa2, cc14, cc12, cc12)
  2557. FNMSUB (aa2, cc13, cc11, cc11)
  2558. FNMSUB (aa3, cc14, cc10, cc10)
  2559. FNMSUB (aa3, cc13, cc09, cc09)
  2560. FNMSUB (aa4, cc14, cc08, cc08)
  2561. FNMSUB (aa4, cc13, cc07, cc07)
  2562. FNMSUB (bb1, cc14, cc06, cc06)
  2563. FNMSUB (bb1, cc13, cc05, cc05)
  2564. FNMSUB (bb2, cc14, cc04, cc04)
  2565. FNMSUB (bb2, cc13, cc03, cc03)
  2566. FNMSUB (bb3, cc14, cc02, cc02)
  2567. FNMSUB (bb3, cc13, cc01, cc01)
  2568. LDF [BO + 45 * SIZE], a1
  2569. LDF [BO + 44 * SIZE], a2
  2570. LDF [BO + 43 * SIZE], a3
  2571. LDF [BO + 42 * SIZE], a4
  2572. LDF [BO + 41 * SIZE], b1
  2573. LDF [BO + 40 * SIZE], b2
  2574. FMUL a1, c12, c12
  2575. FMUL a1, c11, c11
  2576. FNMSUB (aa2, cc12, cc10, cc10)
  2577. FNMSUB (aa2, cc11, cc09, cc09)
  2578. FNMSUB (aa3, cc12, cc08, cc08)
  2579. FNMSUB (aa3, cc11, cc07, cc07)
  2580. FNMSUB (aa4, cc12, cc06, cc06)
  2581. FNMSUB (aa4, cc11, cc05, cc05)
  2582. FNMSUB (bb1, cc12, cc04, cc04)
  2583. FNMSUB (bb1, cc11, cc03, cc03)
  2584. FNMSUB (bb2, cc12, cc02, cc02)
  2585. FNMSUB (bb2, cc11, cc01, cc01)
  2586. LDF [BO + 36 * SIZE], a1
  2587. LDF [BO + 35 * SIZE], a2
  2588. LDF [BO + 34 * SIZE], a3
  2589. LDF [BO + 33 * SIZE], a4
  2590. LDF [BO + 32 * SIZE], b1
  2591. FMUL a1, c10, c10
  2592. FMUL a1, c09, c09
  2593. FNMSUB (aa2, cc10, cc08, cc08)
  2594. FNMSUB (aa2, cc09, cc07, cc07)
  2595. FNMSUB (aa3, cc10, cc06, cc06)
  2596. FNMSUB (aa3, cc09, cc05, cc05)
  2597. FNMSUB (aa4, cc10, cc04, cc04)
  2598. FNMSUB (aa4, cc09, cc03, cc03)
  2599. FNMSUB (bb1, cc10, cc02, cc02)
  2600. FNMSUB (bb1, cc09, cc01, cc01)
  2601. LDF [BO + 27 * SIZE], a1
  2602. LDF [BO + 26 * SIZE], a2
  2603. LDF [BO + 25 * SIZE], a3
  2604. LDF [BO + 24 * SIZE], a4
  2605. FMUL a1, c08, c08
  2606. FMUL a1, c07, c07
  2607. FNMSUB (aa2, cc08, cc06, cc06)
  2608. FNMSUB (aa2, cc07, cc05, cc05)
  2609. FNMSUB (aa3, cc08, cc04, cc04)
  2610. FNMSUB (aa3, cc07, cc03, cc03)
  2611. FNMSUB (aa4, cc08, cc02, cc02)
  2612. FNMSUB (aa4, cc07, cc01, cc01)
  2613. LDF [BO + 18 * SIZE], a1
  2614. LDF [BO + 17 * SIZE], a2
  2615. LDF [BO + 16 * SIZE], a3
  2616. FMUL a1, c06, c06
  2617. FMUL a1, c05, c05
  2618. FNMSUB (aa2, cc06, cc04, cc04)
  2619. FNMSUB (aa2, cc05, cc03, cc03)
  2620. FNMSUB (aa3, cc06, cc02, cc02)
  2621. FNMSUB (aa3, cc05, cc01, cc01)
  2622. LDF [BO + 9 * SIZE], a1
  2623. LDF [BO + 8 * SIZE], a2
  2624. FMUL a1, c04, c04
  2625. FMUL a1, c03, c03
  2626. FNMSUB (aa2, cc04, cc02, cc02)
  2627. FNMSUB (aa2, cc03, cc01, cc01)
  2628. LDF [BO + 0 * SIZE], a1
  2629. FMUL a1, c02, c02
  2630. FMUL a1, c01, c01
  2631. #endif
  /* LN steps the eight C pointers back by two elements before storing. */
  2632. #ifdef LN
  2633. add C1, -2 * SIZE, C1
  2634. add C2, -2 * SIZE, C2
  2635. add C3, -2 * SIZE, C3
  2636. add C4, -2 * SIZE, C4
  2637. add C5, -2 * SIZE, C5
  2638. add C6, -2 * SIZE, C6
  2639. add C7, -2 * SIZE, C7
  2640. add C8, -2 * SIZE, C8
  2641. #endif
  /*
   * Write the solved values back into the packed buffer, using the same
   * accumulator-to-offset mapping as the loads above (BO for LN/LT, AO
   * otherwise).
   */
  2642. #if defined(LN) || defined(LT)
  2643. STF c01, [BO + 0 * SIZE]
  2644. STF c03, [BO + 1 * SIZE]
  2645. STF c05, [BO + 2 * SIZE]
  2646. STF c07, [BO + 3 * SIZE]
  2647. STF c09, [BO + 4 * SIZE]
  2648. STF c11, [BO + 5 * SIZE]
  2649. STF c13, [BO + 6 * SIZE]
  2650. STF c15, [BO + 7 * SIZE]
  2651. STF c02, [BO + 8 * SIZE]
  2652. STF c04, [BO + 9 * SIZE]
  2653. STF c06, [BO + 10 * SIZE]
  2654. STF c08, [BO + 11 * SIZE]
  2655. STF c10, [BO + 12 * SIZE]
  2656. STF c12, [BO + 13 * SIZE]
  2657. STF c14, [BO + 14 * SIZE]
  2658. STF c16, [BO + 15 * SIZE]
  2659. #else
  2660. STF c01, [AO + 0 * SIZE]
  2661. STF c02, [AO + 1 * SIZE]
  2662. STF c03, [AO + 2 * SIZE]
  2663. STF c04, [AO + 3 * SIZE]
  2664. STF c05, [AO + 4 * SIZE]
  2665. STF c06, [AO + 5 * SIZE]
  2666. STF c07, [AO + 6 * SIZE]
  2667. STF c08, [AO + 7 * SIZE]
  2668. STF c09, [AO + 8 * SIZE]
  2669. STF c10, [AO + 9 * SIZE]
  2670. STF c11, [AO + 10 * SIZE]
  2671. STF c12, [AO + 11 * SIZE]
  2672. STF c13, [AO + 12 * SIZE]
  2673. STF c14, [AO + 13 * SIZE]
  2674. STF c15, [AO + 14 * SIZE]
  2675. STF c16, [AO + 15 * SIZE]
  2676. #endif
  /* Store the tile to C: one accumulator pair per pointer C1..C8. */
  2677. STF c01, [C1 + 0 * SIZE]
  2678. STF c02, [C1 + 1 * SIZE]
  2679. STF c03, [C2 + 0 * SIZE]
  2680. STF c04, [C2 + 1 * SIZE]
  2681. STF c05, [C3 + 0 * SIZE]
  2682. STF c06, [C3 + 1 * SIZE]
  2683. STF c07, [C4 + 0 * SIZE]
  2684. STF c08, [C4 + 1 * SIZE]
  2685. STF c09, [C5 + 0 * SIZE]
  2686. STF c10, [C5 + 1 * SIZE]
  2687. STF c11, [C6 + 0 * SIZE]
  2688. STF c12, [C6 + 1 * SIZE]
  2689. STF c13, [C7 + 0 * SIZE]
  2690. STF c14, [C7 + 1 * SIZE]
  2691. STF c15, [C8 + 0 * SIZE]
  2692. STF c16, [C8 + 1 * SIZE]
  /* All variants except LN advance the C pointers past the stored pair. */
  2693. #ifndef LN
  2694. add C1, 2 * SIZE, C1
  2695. add C2, 2 * SIZE, C2
  2696. add C3, 2 * SIZE, C3
  2697. add C4, 2 * SIZE, C4
  2698. add C5, 2 * SIZE, C5
  2699. add C6, 2 * SIZE, C6
  2700. add C7, 2 * SIZE, C7
  2701. add C8, 2 * SIZE, C8
  2702. #endif
  /* RT: AORIG += K << (BASE_SHIFT + 1). */
  2703. #ifdef RT
  2704. sll K, BASE_SHIFT + 1, TEMP1
  2705. add AORIG, TEMP1, AORIG
  2706. #endif
  /*
   * LT/RN: move AO and BO past the K - KK k-steps that were not consumed,
   * using the same 2x / 8x scaling as the accumulation loops.
   */
  2707. #if defined(LT) || defined(RN)
  2708. sub K, KK, TEMP1
  2709. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2710. sll TEMP1, BASE_SHIFT + 3, TEMP1
  2711. add AO, TEMP2, AO
  2712. add BO, TEMP1, BO
  2713. #endif
  /* KK moves by the tile height of 2: up for LT, down for LN. */
  2714. #ifdef LT
  2715. add KK, 2, KK
  2716. #endif
  2717. #ifdef LN
  2718. sub KK, 2, KK
  2719. #endif
  /* Next tile: decrement I and loop to .LL12 while I > 0. */
  2720. add I, -1, I
  2721. cmp I, 0
  2722. bg,pt %icc, .LL12
  2723. nop
  /*
   * .LL20 -- setup for the single leftover row.  Runs only when M & 1 is
   * non-zero; otherwise control goes to .LL29 (outside this excerpt).
   *
   * Positions AO and BO, preloads a1..a4 and b1..b9, clears the eight
   * accumulators cc01, cc03, ..., cc15, and sets L to the number of
   * four-step iterations for .LL23 (KK >> 2 for LT/RN, otherwise
   * (K - KK) >> 2).
   * NOTE(review): FCLR is a macro defined outside this excerpt; presumably
   * it zeroes the named register -- confirm.
   */
  2724. .align 4
  2725. .LL20:
  2726. and M, 1, I
  2727. cmp I, 0
  2728. ble,pn %icc, .LL29
  2729. nop
  2730. #if defined(LT) || defined(RN)
  2731. mov B, BO
  2732. #else
  /* LN only: step AORIG back by K << BASE_SHIFT before deriving AO. */
  2733. #ifdef LN
  2734. sll K, BASE_SHIFT + 0, TEMP1
  2735. sub AORIG, TEMP1, AORIG
  2736. #endif
  /* AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 3)). */
  2737. sll KK, BASE_SHIFT + 0, TEMP1
  2738. sll KK, BASE_SHIFT + 3, TEMP2
  2739. add AORIG, TEMP1, AO
  2740. add B, TEMP2, BO
  2741. #endif
  2742. LDF [AO + 0 * SIZE], a1
  2743. LDF [AO + 1 * SIZE], a2
  2744. LDF [AO + 2 * SIZE], a3
  2745. LDF [AO + 3 * SIZE], a4
  2746. LDF [BO + 0 * SIZE], b1
  2747. FCLR (cc01)
  2748. LDF [BO + 1 * SIZE], b2
  2749. FCLR (cc03)
  2750. LDF [BO + 2 * SIZE], b3
  2751. FCLR (cc05)
  2752. LDF [BO + 3 * SIZE], b4
  2753. FCLR (cc07)
  2754. LDF [BO + 4 * SIZE], b5
  2755. FCLR (cc09)
  2756. LDF [BO + 5 * SIZE], b6
  2757. FCLR (cc11)
  2758. LDF [BO + 6 * SIZE], b7
  2759. FCLR (cc13)
  2760. LDF [BO + 7 * SIZE], b8
  2761. FCLR (cc15)
  2762. #if defined(LT) || defined(RN)
  2763. sra KK, 2, L
  2764. #else
  2765. sub K, KK, L
  2766. sra L, 2, L
  2767. #endif
  2768. cmp L, 0
  /*
   * Skip the unrolled loop when L <= 0.  The b9 load sits in the delay
   * slot of a branch without the annul bit, so it executes either way.
   */
  2769. ble,pn %icc, .LL25
  2770. LDF [BO + 8 * SIZE], b9
  /*
   * .LL23 -- accumulation loop for the single leftover row: four k-steps
   * per iteration.  Each k-step multiplies one A operand (a1, a2, a3, a4
   * in turn) with eight B operands into cc01, cc03, ..., cc15.  Every B
   * register is reloaded for a later step right after it is used.  Per
   * iteration AO advances by 4 * SIZE and BO by 32 * SIZE.
   */
  2771. .align 4
  2772. .LL23:
  2773. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2774. add L, -1, L
  2775. FMADD (aa1, bb1, cc01, cc01)
  2776. LDF [BO + 16 * SIZE], b1
  2777. FMADD (aa1, bb2, cc03, cc03)
  2778. LDF [BO + 9 * SIZE], b2
  2779. FMADD (aa1, bb3, cc05, cc05)
  2780. LDF [BO + 10 * SIZE], b3
  2781. FMADD (aa1, bb4, cc07, cc07)
  2782. LDF [BO + 11 * SIZE], b4
  2783. FMADD (aa1, bb5, cc09, cc09)
  2784. LDF [BO + 12 * SIZE], b5
  2785. FMADD (aa1, bb6, cc11, cc11)
  2786. LDF [BO + 13 * SIZE], b6
  2787. FMADD (aa1, bb7, cc13, cc13)
  2788. LDF [BO + 14 * SIZE], b7
  2789. FMADD (aa1, bb8, cc15, cc15)
  2790. LDF [BO + 15 * SIZE], b8
  2791. FMADD (aa2, bb9, cc01, cc01)
  2792. LDF [BO + 24 * SIZE], b9
  2793. FMADD (aa2, bb2, cc03, cc03)
  2794. LDF [BO + 17 * SIZE], b2
  2795. FMADD (aa2, bb3, cc05, cc05)
  2796. LDF [BO + 18 * SIZE], b3
  2797. FMADD (aa2, bb4, cc07, cc07)
  2798. LDF [BO + 19 * SIZE], b4
  2799. FMADD (aa2, bb5, cc09, cc09)
  2800. LDF [BO + 20 * SIZE], b5
  2801. FMADD (aa2, bb6, cc11, cc11)
  2802. LDF [BO + 21 * SIZE], b6
  2803. FMADD (aa2, bb7, cc13, cc13)
  2804. LDF [BO + 22 * SIZE], b7
  2805. FMADD (aa2, bb8, cc15, cc15)
  2806. LDF [BO + 23 * SIZE], b8
  /* a1/a2 are free now; reload them for the next iteration. */
  2807. LDF [AO + 4 * SIZE], a1
  2808. LDF [AO + 5 * SIZE], a2
  2809. FMADD (aa3, bb1, cc01, cc01)
  2810. LDF [BO + 32 * SIZE], b1
  2811. FMADD (aa3, bb2, cc03, cc03)
  2812. LDF [BO + 25 * SIZE], b2
  2813. FMADD (aa3, bb3, cc05, cc05)
  2814. LDF [BO + 26 * SIZE], b3
  2815. FMADD (aa3, bb4, cc07, cc07)
  2816. LDF [BO + 27 * SIZE], b4
  2817. FMADD (aa3, bb5, cc09, cc09)
  2818. LDF [BO + 28 * SIZE], b5
  2819. FMADD (aa3, bb6, cc11, cc11)
  2820. LDF [BO + 29 * SIZE], b6
  2821. FMADD (aa3, bb7, cc13, cc13)
  2822. LDF [BO + 30 * SIZE], b7
  2823. FMADD (aa3, bb8, cc15, cc15)
  2824. LDF [BO + 31 * SIZE], b8
  2825. FMADD (aa4, bb9, cc01, cc01)
  2826. LDF [BO + 40 * SIZE], b9
  2827. FMADD (aa4, bb2, cc03, cc03)
  2828. LDF [BO + 33 * SIZE], b2
  2829. FMADD (aa4, bb3, cc05, cc05)
  2830. LDF [BO + 34 * SIZE], b3
  2831. FMADD (aa4, bb4, cc07, cc07)
  2832. LDF [BO + 35 * SIZE], b4
  2833. FMADD (aa4, bb5, cc09, cc09)
  2834. LDF [BO + 36 * SIZE], b5
  2835. FMADD (aa4, bb6, cc11, cc11)
  2836. LDF [BO + 37 * SIZE], b6
  2837. FMADD (aa4, bb7, cc13, cc13)
  2838. LDF [BO + 38 * SIZE], b7
  2839. FMADD (aa4, bb8, cc15, cc15)
  2840. LDF [BO + 39 * SIZE], b8
  2841. LDF [AO + 6 * SIZE], a3
  2842. LDF [AO + 7 * SIZE], a4
  2843. add AO, 4 * SIZE, AO
  2844. cmp L, 0
  /*
   * Repeat while L > 0.  The BO advance sits in the delay slot of a
   * branch without the annul bit, so it also runs on the final pass.
   */
  2845. bg,pt %icc, .LL23
  2846. add BO, 32 * SIZE, BO
  2847. .align 4
  /*
   * .LL25: compute the number of K steps left over after the 4x unrolled
   * loop: L = KK & 3 for LT/RN, L = (K - KK) & 3 otherwise.
   * If L <= 0 there is nothing left and control goes straight to .LL28.
   */
  2848. .LL25:
  2849. #if defined(LT) || defined(RN)
  2850. and KK, 3, L
  2851. #else
  2852. sub K, KK, L
  2853. and L, 3, L
  2854. #endif
  2855. cmp L, 0
  2856. ble,a,pn %icc, .LL28
  /* Delay slot of the annulled branch; a nop, so it has no effect either way. */
  2857. nop
  2858. .align 4
  /*
   * .LL27: remainder K loop, one step per pass.
   *
   * Accumulates aa1 * bb1..bb8 into cc01, cc03, ..., cc15, reloading
   * b1..b8 from BO + 8..15 and a1 from AO + 1 for the following pass.
   * Per pass AO advances by 1 * SIZE, BO by 8 * SIZE and L drops by one;
   * the loop repeats while L > 0.
   */
  2859. .LL27:
  2860. FMADD (aa1, bb1, cc01, cc01)
  2861. LDF [BO + 8 * SIZE], b1
  2862. FMADD (aa1, bb2, cc03, cc03)
  2863. LDF [BO + 9 * SIZE], b2
  2864. FMADD (aa1, bb3, cc05, cc05)
  2865. LDF [BO + 10 * SIZE], b3
  2866. FMADD (aa1, bb4, cc07, cc07)
  2867. LDF [BO + 11 * SIZE], b4
  2868. FMADD (aa1, bb5, cc09, cc09)
  2869. LDF [BO + 12 * SIZE], b5
  2870. FMADD (aa1, bb6, cc11, cc11)
  2871. LDF [BO + 13 * SIZE], b6
  2872. FMADD (aa1, bb7, cc13, cc13)
  2873. LDF [BO + 14 * SIZE], b7
  2874. FMADD (aa1, bb8, cc15, cc15)
  2875. LDF [BO + 15 * SIZE], b8
  2876. LDF [AO + 1 * SIZE], a1
  2877. add AO, 1 * SIZE, AO
  2878. add L, -1, L
  2879. cmp L, 0
  2880. bg,pt %icc, .LL27
  /* Branch delay slot (not annulled): executed on every pass, including the last. */
  2881. add BO, 8 * SIZE, BO
  2882. .align 4
  /*
   * .LL28: the accumulation is finished; reposition AO/BO and combine the
   * accumulators with the stored right-hand-side values.
   *
   * LN/RT only: TEMP1 = KK - 1 (LN) or KK - 8 (RT), then
   *   AO = AORIG + (TEMP1 << BASE_SHIFT)
   *   BO = B     + (TEMP1 << (BASE_SHIFT + 3))
   * i.e. AO is offset by TEMP1 elements and BO by 8 * TEMP1 elements.
   */
  2883. .LL28:
  2884. #if defined(LN) || defined(RT)
  2885. #ifdef LN
  2886. sub KK, 1, TEMP1
  2887. #else
  2888. sub KK, 8, TEMP1
  2889. #endif
  2890. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2891. sll TEMP1, BASE_SHIFT + 3, TEMP1
  2892. add AORIG, TEMP2, AO
  2893. add B, TEMP1, BO
  2894. #endif
  /*
   * Load eight values (from BO for LN/LT, from AO otherwise) and replace
   * each accumulator with "loaded value - accumulator".
   * NOTE(review): operand order assumes FSUB expands to the native SPARC
   * fsubs/fsubd (rs1 - rs2 -> rd) -- confirm against the macro definition.
   */
  2895. #if defined(LN) || defined(LT)
  2896. LDF [BO + 0 * SIZE], a1
  2897. LDF [BO + 1 * SIZE], a2
  2898. LDF [BO + 2 * SIZE], a3
  2899. LDF [BO + 3 * SIZE], a4
  2900. LDF [BO + 4 * SIZE], b1
  2901. LDF [BO + 5 * SIZE], b2
  2902. LDF [BO + 6 * SIZE], b3
  2903. LDF [BO + 7 * SIZE], b4
  2904. FSUB a1, c01, c01
  2905. FSUB a2, c03, c03
  2906. FSUB a3, c05, c05
  2907. FSUB a4, c07, c07
  2908. FSUB b1, c09, c09
  2909. FSUB b2, c11, c11
  2910. FSUB b3, c13, c13
  2911. FSUB b4, c15, c15
  2912. #else
  2913. LDF [AO + 0 * SIZE], a1
  2914. LDF [AO + 1 * SIZE], a2
  2915. LDF [AO + 2 * SIZE], a3
  2916. LDF [AO + 3 * SIZE], a4
  2917. LDF [AO + 4 * SIZE], b1
  2918. LDF [AO + 5 * SIZE], b2
  2919. LDF [AO + 6 * SIZE], b3
  2920. LDF [AO + 7 * SIZE], b4
  2921. FSUB a1, c01, c01
  2922. FSUB a2, c03, c03
  2923. FSUB a3, c05, c05
  2924. FSUB a4, c07, c07
  2925. FSUB b1, c09, c09
  2926. FSUB b2, c11, c11
  2927. FSUB b3, c13, c13
  2928. FSUB b4, c15, c15
  2929. #endif
  /*
   * LN/LT: scale all eight results by the single value at AO[0].
   * NOTE(review): presumably the pre-inverted diagonal element of the
   * triangular factor (a multiply is used, not a divide) -- confirm
   * against the packing routine.
   */
  2930. #if defined(LN) || defined(LT)
  2931. LDF [AO + 0 * SIZE], a1
  2932. FMUL a1, c01, c01
  2933. FMUL a1, c03, c03
  2934. FMUL a1, c05, c05
  2935. FMUL a1, c07, c07
  2936. FMUL a1, c09, c09
  2937. FMUL a1, c11, c11
  2938. FMUL a1, c13, c13
  2939. FMUL a1, c15, c15
  2940. #endif
  /*
   * RN: eight-step solve that walks from c01 up to c15 using the 8x8
   * block at BO (row stride 8 elements).
   *
   * Step i (i = 0..7) reads row i starting at its diagonal entry
   * BO[9 * i] through the end of the row, multiplies result i by the
   * diagonal entry, then applies FNMSUB with the remaining row entries to
   * every later result.
   *
   * NOTE(review): this has the shape of forward substitution with a
   * pre-inverted diagonal, and FNMSUB (a, b, c, d) is presumably
   * d = c - a * b -- confirm against the macro definition.
   */
  2941. #ifdef RN
  /* Step 0: row 0, entries 0..7. */
  2942. LDF [BO + 0 * SIZE], a1
  2943. LDF [BO + 1 * SIZE], a2
  2944. LDF [BO + 2 * SIZE], a3
  2945. LDF [BO + 3 * SIZE], a4
  2946. LDF [BO + 4 * SIZE], b1
  2947. LDF [BO + 5 * SIZE], b2
  2948. LDF [BO + 6 * SIZE], b3
  2949. LDF [BO + 7 * SIZE], b4
  2950. FMUL a1, c01, c01
  2951. FNMSUB (aa2, cc01, cc03, cc03)
  2952. FNMSUB (aa3, cc01, cc05, cc05)
  2953. FNMSUB (aa4, cc01, cc07, cc07)
  2954. FNMSUB (bb1, cc01, cc09, cc09)
  2955. FNMSUB (bb2, cc01, cc11, cc11)
  2956. FNMSUB (bb3, cc01, cc13, cc13)
  2957. FNMSUB (bb4, cc01, cc15, cc15)
  /* Step 1: row 1, entries 9..15. */
  2958. LDF [BO + 9 * SIZE], a1
  2959. LDF [BO + 10 * SIZE], a2
  2960. LDF [BO + 11 * SIZE], a3
  2961. LDF [BO + 12 * SIZE], a4
  2962. LDF [BO + 13 * SIZE], b1
  2963. LDF [BO + 14 * SIZE], b2
  2964. LDF [BO + 15 * SIZE], b3
  2965. FMUL a1, c03, c03
  2966. FNMSUB (aa2, cc03, cc05, cc05)
  2967. FNMSUB (aa3, cc03, cc07, cc07)
  2968. FNMSUB (aa4, cc03, cc09, cc09)
  2969. FNMSUB (bb1, cc03, cc11, cc11)
  2970. FNMSUB (bb2, cc03, cc13, cc13)
  2971. FNMSUB (bb3, cc03, cc15, cc15)
  /* Step 2: row 2, entries 18..23. */
  2972. LDF [BO + 18 * SIZE], a1
  2973. LDF [BO + 19 * SIZE], a2
  2974. LDF [BO + 20 * SIZE], a3
  2975. LDF [BO + 21 * SIZE], a4
  2976. LDF [BO + 22 * SIZE], b1
  2977. LDF [BO + 23 * SIZE], b2
  2978. FMUL a1, c05, c05
  2979. FNMSUB (aa2, cc05, cc07, cc07)
  2980. FNMSUB (aa3, cc05, cc09, cc09)
  2981. FNMSUB (aa4, cc05, cc11, cc11)
  2982. FNMSUB (bb1, cc05, cc13, cc13)
  2983. FNMSUB (bb2, cc05, cc15, cc15)
  /* Step 3: row 3, entries 27..31. */
  2984. LDF [BO + 27 * SIZE], a1
  2985. LDF [BO + 28 * SIZE], a2
  2986. LDF [BO + 29 * SIZE], a3
  2987. LDF [BO + 30 * SIZE], a4
  2988. LDF [BO + 31 * SIZE], b1
  2989. FMUL a1, c07, c07
  2990. FNMSUB (aa2, cc07, cc09, cc09)
  2991. FNMSUB (aa3, cc07, cc11, cc11)
  2992. FNMSUB (aa4, cc07, cc13, cc13)
  2993. FNMSUB (bb1, cc07, cc15, cc15)
  /* Step 4: row 4, entries 36..39. */
  2994. LDF [BO + 36 * SIZE], a1
  2995. LDF [BO + 37 * SIZE], a2
  2996. LDF [BO + 38 * SIZE], a3
  2997. LDF [BO + 39 * SIZE], a4
  2998. FMUL a1, c09, c09
  2999. FNMSUB (aa2, cc09, cc11, cc11)
  3000. FNMSUB (aa3, cc09, cc13, cc13)
  3001. FNMSUB (aa4, cc09, cc15, cc15)
  /* Step 5: row 5, entries 45..47. */
  3002. LDF [BO + 45 * SIZE], a1
  3003. LDF [BO + 46 * SIZE], a2
  3004. LDF [BO + 47 * SIZE], a3
  3005. FMUL a1, c11, c11
  3006. FNMSUB (aa2, cc11, cc13, cc13)
  3007. FNMSUB (aa3, cc11, cc15, cc15)
  /* Step 6: row 6, entries 54..55. */
  3008. LDF [BO + 54 * SIZE], a1
  3009. LDF [BO + 55 * SIZE], a2
  3010. FMUL a1, c13, c13
  3011. FNMSUB (aa2, cc13, cc15, cc15)
  /* Step 7: row 7, diagonal entry 63 only. */
  3012. LDF [BO + 63 * SIZE], a1
  3013. FMUL a1, c15, c15
  3014. #endif
  /*
   * RT: eight-step solve that walks from c15 down to c01 using the 8x8
   * block at BO (row stride 8 elements).
   *
   * Step for row i (i = 7..0) reads the diagonal entry BO[9 * i] and then
   * the entries before it in the same row, down to BO[8 * i]; result i is
   * multiplied by the diagonal entry and FNMSUB applies the other entries
   * to every earlier result.
   *
   * NOTE(review): this has the shape of backward substitution with a
   * pre-inverted diagonal, and FNMSUB (a, b, c, d) is presumably
   * d = c - a * b -- confirm against the macro definition.
   */
  3015. #ifdef RT
  /* Row 7: entries 63 down to 56. */
  3016. LDF [BO + 63 * SIZE], a1
  3017. LDF [BO + 62 * SIZE], a2
  3018. LDF [BO + 61 * SIZE], a3
  3019. LDF [BO + 60 * SIZE], a4
  3020. LDF [BO + 59 * SIZE], b1
  3021. LDF [BO + 58 * SIZE], b2
  3022. LDF [BO + 57 * SIZE], b3
  3023. LDF [BO + 56 * SIZE], b4
  3024. FMUL a1, c15, c15
  3025. FNMSUB (aa2, cc15, cc13, cc13)
  3026. FNMSUB (aa3, cc15, cc11, cc11)
  3027. FNMSUB (aa4, cc15, cc09, cc09)
  3028. FNMSUB (bb1, cc15, cc07, cc07)
  3029. FNMSUB (bb2, cc15, cc05, cc05)
  3030. FNMSUB (bb3, cc15, cc03, cc03)
  3031. FNMSUB (bb4, cc15, cc01, cc01)
  /* Row 6: entries 54 down to 48. */
  3032. LDF [BO + 54 * SIZE], a1
  3033. LDF [BO + 53 * SIZE], a2
  3034. LDF [BO + 52 * SIZE], a3
  3035. LDF [BO + 51 * SIZE], a4
  3036. LDF [BO + 50 * SIZE], b1
  3037. LDF [BO + 49 * SIZE], b2
  3038. LDF [BO + 48 * SIZE], b3
  3039. FMUL a1, c13, c13
  3040. FNMSUB (aa2, cc13, cc11, cc11)
  3041. FNMSUB (aa3, cc13, cc09, cc09)
  3042. FNMSUB (aa4, cc13, cc07, cc07)
  3043. FNMSUB (bb1, cc13, cc05, cc05)
  3044. FNMSUB (bb2, cc13, cc03, cc03)
  3045. FNMSUB (bb3, cc13, cc01, cc01)
  /* Row 5: entries 45 down to 40. */
  3046. LDF [BO + 45 * SIZE], a1
  3047. LDF [BO + 44 * SIZE], a2
  3048. LDF [BO + 43 * SIZE], a3
  3049. LDF [BO + 42 * SIZE], a4
  3050. LDF [BO + 41 * SIZE], b1
  3051. LDF [BO + 40 * SIZE], b2
  3052. FMUL a1, c11, c11
  3053. FNMSUB (aa2, cc11, cc09, cc09)
  3054. FNMSUB (aa3, cc11, cc07, cc07)
  3055. FNMSUB (aa4, cc11, cc05, cc05)
  3056. FNMSUB (bb1, cc11, cc03, cc03)
  3057. FNMSUB (bb2, cc11, cc01, cc01)
  /* Row 4: entries 36 down to 32. */
  3058. LDF [BO + 36 * SIZE], a1
  3059. LDF [BO + 35 * SIZE], a2
  3060. LDF [BO + 34 * SIZE], a3
  3061. LDF [BO + 33 * SIZE], a4
  3062. LDF [BO + 32 * SIZE], b1
  3063. FMUL a1, c09, c09
  3064. FNMSUB (aa2, cc09, cc07, cc07)
  3065. FNMSUB (aa3, cc09, cc05, cc05)
  3066. FNMSUB (aa4, cc09, cc03, cc03)
  3067. FNMSUB (bb1, cc09, cc01, cc01)
  /* Row 3: entries 27 down to 24. */
  3068. LDF [BO + 27 * SIZE], a1
  3069. LDF [BO + 26 * SIZE], a2
  3070. LDF [BO + 25 * SIZE], a3
  3071. LDF [BO + 24 * SIZE], a4
  3072. FMUL a1, c07, c07
  3073. FNMSUB (aa2, cc07, cc05, cc05)
  3074. FNMSUB (aa3, cc07, cc03, cc03)
  3075. FNMSUB (aa4, cc07, cc01, cc01)
  /* Row 2: entries 18 down to 16. */
  3076. LDF [BO + 18 * SIZE], a1
  3077. LDF [BO + 17 * SIZE], a2
  3078. LDF [BO + 16 * SIZE], a3
  3079. FMUL a1, c05, c05
  3080. FNMSUB (aa2, cc05, cc03, cc03)
  3081. FNMSUB (aa3, cc05, cc01, cc01)
  /* Row 1: entries 9 and 8. */
  3082. LDF [BO + 9 * SIZE], a1
  3083. LDF [BO + 8 * SIZE], a2
  3084. FMUL a1, c03, c03
  3085. FNMSUB (aa2, cc03, cc01, cc01)
  /* Row 0: diagonal entry 0 only. */
  3086. LDF [BO + 0 * SIZE], a1
  3087. FMUL a1, c01, c01
  3088. #endif
  /*
   * Write back the eight solved values and update the running pointers.
   *
   * LN: C1..C8 are first stepped back by one element, so the stores to C
   * below land one element before the previous position.
   */
  3089. #ifdef LN
  3090. add C1, -1 * SIZE, C1
  3091. add C2, -1 * SIZE, C2
  3092. add C3, -1 * SIZE, C3
  3093. add C4, -1 * SIZE, C4
  3094. add C5, -1 * SIZE, C5
  3095. add C6, -1 * SIZE, C6
  3096. add C7, -1 * SIZE, C7
  3097. add C8, -1 * SIZE, C8
  3098. #endif
  /*
   * Store the results back over the buffer they were loaded from before the
   * solve: BO for LN/LT, AO otherwise.
   */
  3099. #if defined(LN) || defined(LT)
  3100. STF c01, [BO + 0 * SIZE]
  3101. STF c03, [BO + 1 * SIZE]
  3102. STF c05, [BO + 2 * SIZE]
  3103. STF c07, [BO + 3 * SIZE]
  3104. STF c09, [BO + 4 * SIZE]
  3105. STF c11, [BO + 5 * SIZE]
  3106. STF c13, [BO + 6 * SIZE]
  3107. STF c15, [BO + 7 * SIZE]
  3108. #else
  3109. STF c01, [AO + 0 * SIZE]
  3110. STF c03, [AO + 1 * SIZE]
  3111. STF c05, [AO + 2 * SIZE]
  3112. STF c07, [AO + 3 * SIZE]
  3113. STF c09, [AO + 4 * SIZE]
  3114. STF c11, [AO + 5 * SIZE]
  3115. STF c13, [AO + 6 * SIZE]
  3116. STF c15, [AO + 7 * SIZE]
  3117. #endif
  /* One result per output pointer: element 0 of each of C1..C8. */
  3118. STF c01, [C1 + 0 * SIZE]
  3119. STF c03, [C2 + 0 * SIZE]
  3120. STF c05, [C3 + 0 * SIZE]
  3121. STF c07, [C4 + 0 * SIZE]
  3122. STF c09, [C5 + 0 * SIZE]
  3123. STF c11, [C6 + 0 * SIZE]
  3124. STF c13, [C7 + 0 * SIZE]
  3125. STF c15, [C8 + 0 * SIZE]
  /* RT: AORIG += K elements. */
  3126. #ifdef RT
  3127. sll K, BASE_SHIFT + 0, TEMP1
  3128. add AORIG, TEMP1, AORIG
  3129. #endif
  /* LT/RN: skip the remaining K - KK steps: AO += (K - KK), BO += 8 * (K - KK) elements. */
  3130. #if defined(LT) || defined(RN)
  3131. sub K, KK, TEMP1
  3132. sll TEMP1, BASE_SHIFT + 0, TEMP2
  3133. sll TEMP1, BASE_SHIFT + 3, TEMP1
  3134. add AO, TEMP2, AO
  3135. add BO, TEMP1, BO
  3136. #endif
  /* KK moves by one per block here: up for LT, down for LN. */
  3137. #ifdef LT
  3138. add KK, 1, KK
  3139. #endif
  3140. #ifdef LN
  3141. sub KK, 1, KK
  3142. #endif
  3143. .align 4
  /*
   * .LL29: end of one iteration of the J loop.
   *
   *   LN    : B += K << (BASE_SHIFT + 3), i.e. 8 * K elements
   *   LT/RN : B = BO (wherever the inner code left the B cursor)
   *   RN    : KK += 8
   *   RT    : KK -= 8
   *
   * J is then decremented and the loop continues at .LL11 (defined earlier
   * in the file) while J > 0.
   */
  3144. .LL29:
  3145. #ifdef LN
  3146. sll K, BASE_SHIFT + 3, TEMP1
  3147. add B, TEMP1, B
  3148. #endif
  3149. #if defined(LT) || defined(RN)
  3150. mov BO, B
  3151. #endif
  3152. #ifdef RN
  3153. add KK, 8, KK
  3154. #endif
  3155. #ifdef RT
  3156. sub KK, 8, KK
  3157. #endif
  3158. add J, -1, J
  3159. cmp J, 0
  3160. bg,pt %icc, .LL11
  /* Branch delay slot. */
  3161. nop
  3162. .align 4
  /*
   * .LL999: common exit.
   *
   * With TRMMKERNEL defined, %g1..%g4 are reloaded from the stack frame:
   * 32-bit loads at STACK_START + 8..20 when __64BIT__ is not defined,
   * 64-bit loads at STACK_START + 32..56 when it is.
   * NOTE(review): presumably these slots were filled by matching stores in
   * the prologue, which is outside this excerpt -- confirm.
   */
  3163. .LL999:
  3164. #ifdef TRMMKERNEL
  3165. #ifndef __64BIT__
  3166. ld [%sp + STACK_START + 8], %g1
  3167. ld [%sp + STACK_START + 12], %g2
  3168. ld [%sp + STACK_START + 16], %g3
  3169. ld [%sp + STACK_START + 20], %g4
  3170. #else
  3171. ldx [%sp + STACK_START + 32], %g1
  3172. ldx [%sp + STACK_START + 40], %g2
  3173. ldx [%sp + STACK_START + 48], %g3
  3174. ldx [%sp + STACK_START + 56], %g4
  3175. #endif
  3176. #endif
  /* Return to the caller; the delay-slot clr makes the return value in %o0 zero. */
  3177. return %i7 + 8
  3178. clr %o0
  3179. EPILOGUE