You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN_2x8.S 73 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897
  1. /*********************************************************************/
  2. /* Copyright 2005-2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch tuning: the inner loop issues                              */
  /*   prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY    */
  /* so APREFETCHSIZE is a look-ahead distance in units of SIZE and      */
  /* APREFETCH_CATEGORY is the second operand of the prefetch.           */
  40. #define APREFETCHSIZE 24
  41. #define APREFETCH_CATEGORY 0
  /* Dimension arguments, taken from the first three input registers.    */
  /* The kernel walks N in groups of 8 (J = N >> 3) and uses M & 1 to    */
  /* select a single leftover row.                                       */
  42. #define M %i0
  43. #define N %i1
  44. #define K %i2
  /* A and B swap registers in 32-bit double-precision builds. In that   */
  /* configuration the prologue loads B from the stack into %i4.         */
  /* NOTE(review): presumably because a preceding double argument takes  */
  /* two integer argument slots - confirm against the caller.            */
  45. #if defined(DOUBLE) && !defined(__64BIT__)
  46. #define A %i5
  47. #define B %i4
  48. #else
  49. #define A %i4
  50. #define B %i5
  51. #endif
  /* C and LDC are loaded from the stack by the prologue; LDC is then    */
  /* shifted left by BASE_SHIFT before use.                              */
  52. #define C %o4
  53. #define LDC %o5
  /* AO / BO: working pointers used as LDF base addresses for A and B.   */
  54. #define AO %l0
  55. #define BO %l1
  /* Loop counters: I = M & 1, J = N >> 3, L = inner-loop trip count.    */
  56. #define I %l2
  57. #define J %l3
  58. #define L %l4
  /* C1..C8: eight pointers spaced LDC apart, derived from C at the top  */
  /* of each J iteration (ascending, or descending when RT is defined).  */
  59. #define C1 %o0
  60. #define C2 %o1
  61. #define C3 %o2
  62. #define C4 %o3
  63. #define C5 %l5
  64. #define C6 %l6
  65. #define C7 %l7
  66. #define C8 %i3
  /* Global registers. The prologue stores %g1-%g4 into the stack frame  */
  /* after OFFSET has been loaded. KK is derived from OFFSET (negated,   */
  /* copied, or combined with M / N depending on LN/LT/RN/RT); TEMP1 and */
  /* TEMP2 hold address-arithmetic intermediates.                        */
  67. #define OFFSET %g1
  68. #define KK %g2
  69. #define TEMP1 %g3
  70. #define TEMP2 %g4
  /* AORIG: copy of A taken at the top of each J iteration when LN or RT */
  /* is defined; AO is then recomputed from it.                          */
  71. #define AORIG %o7
  /* Floating-point register map.                                        */
  /*   c01..c16 : result registers (cleared with FCLR, updated by FMADD, */
  /*              FNMSUB, FSUB and FMUL in the kernel)                   */
  /*   a1..a5, b1..b9 : operand registers filled by LDF                  */
  /* Each named register also has a purely numeric twin (ccNN, aaN, bbN) */
  /* which is what the FCLR / FMADD / FNMSUB macros take as arguments.   */
  /* The two tables must be kept in step with each other.                */
  72. #ifdef DOUBLE
  /* Double precision: every value uses an even-numbered register,       */
  /* %f0..%f30 for c01..c16 and %f32..%f58 for a1..b9.                   */
  73. #define c01 %f0
  74. #define c02 %f2
  75. #define c03 %f4
  76. #define c04 %f6
  77. #define c05 %f8
  78. #define c06 %f10
  79. #define c07 %f12
  80. #define c08 %f14
  81. #define c09 %f16
  82. #define c10 %f18
  83. #define c11 %f20
  84. #define c12 %f22
  85. #define c13 %f24
  86. #define c14 %f26
  87. #define c15 %f28
  88. #define c16 %f30
  89. #define a1 %f32
  90. #define a2 %f34
  91. #define a3 %f36
  92. #define a4 %f38
  93. #define a5 %f40
  94. #define b1 %f42
  95. #define b2 %f44
  96. #define b3 %f46
  97. #define b4 %f48
  98. #define b5 %f50
  99. #define b6 %f52
  100. #define b7 %f54
  101. #define b8 %f56
  102. #define b9 %f58
  /* Numeric twins for the double-precision map. cc01..cc16 equal the    */
  /* register number; aa1..bb9 are odd values 1..27 for %f32..%f58.      */
  /* NOTE(review): this looks like the 5-bit encoding of upper double    */
  /* registers (high bit of the register number folded into bit 0) -     */
  /* confirm against the FMADD / FCLR macro definitions, which are not   */
  /* in this file.                                                       */
  103. #define cc01 0
  104. #define cc02 2
  105. #define cc03 4
  106. #define cc04 6
  107. #define cc05 8
  108. #define cc06 10
  109. #define cc07 12
  110. #define cc08 14
  111. #define cc09 16
  112. #define cc10 18
  113. #define cc11 20
  114. #define cc12 22
  115. #define cc13 24
  116. #define cc14 26
  117. #define cc15 28
  118. #define cc16 30
  119. #define aa1 1
  120. #define aa2 3
  121. #define aa3 5
  122. #define aa4 7
  123. #define aa5 9
  124. #define bb1 11
  125. #define bb2 13
  126. #define bb3 15
  127. #define bb4 17
  128. #define bb5 19
  129. #define bb6 21
  130. #define bb7 23
  131. #define bb8 25
  132. #define bb9 27
  133. #else
  /* Single precision: consecutive registers, %f0..%f15 for c01..c16     */
  /* and %f16..%f29 for a1..b9.                                          */
  134. #define c01 %f0
  135. #define c02 %f1
  136. #define c03 %f2
  137. #define c04 %f3
  138. #define c05 %f4
  139. #define c06 %f5
  140. #define c07 %f6
  141. #define c08 %f7
  142. #define c09 %f8
  143. #define c10 %f9
  144. #define c11 %f10
  145. #define c12 %f11
  146. #define c13 %f12
  147. #define c14 %f13
  148. #define c15 %f14
  149. #define c16 %f15
  150. #define a1 %f16
  151. #define a2 %f17
  152. #define a3 %f18
  153. #define a4 %f19
  154. #define a5 %f20
  155. #define b1 %f21
  156. #define b2 %f22
  157. #define b3 %f23
  158. #define b4 %f24
  159. #define b5 %f25
  160. #define b6 %f26
  161. #define b7 %f27
  162. #define b8 %f28
  163. #define b9 %f29
  /* Numeric twins for the single-precision map: each value is simply    */
  /* the register number of the matching name above.                     */
  164. #define cc01 0
  165. #define cc02 1
  166. #define cc03 2
  167. #define cc04 3
  168. #define cc05 4
  169. #define cc06 5
  170. #define cc07 6
  171. #define cc08 7
  172. #define cc09 8
  173. #define cc10 9
  174. #define cc11 10
  175. #define cc12 11
  176. #define cc13 12
  177. #define cc14 13
  178. #define cc15 14
  179. #define cc16 15
  180. #define aa1 16
  181. #define aa2 17
  182. #define aa3 18
  183. #define aa4 19
  184. #define aa5 20
  185. #define bb1 21
  186. #define bb2 22
  187. #define bb3 23
  188. #define bb4 24
  189. #define bb5 25
  190. #define bb6 26
  191. #define bb7 27
  192. #define bb8 28
  193. #define bb9 29
  194. #endif
  /* %g2 and %g3 are used as KK and TEMP1; declare them as scratch       */
  /* registers to the assembler before the kernel body uses them.        */
  195. .register %g2, #scratch
  196. .register %g3, #scratch
  197. PROLOGUE
  198. SAVESP
  199. nop
  200. #ifndef __64BIT__
  201. #ifdef DOUBLE
  202. ld [%sp + STACK_START + 28], B
  203. ld [%sp + STACK_START + 32], C
  204. ld [%sp + STACK_START + 36], LDC
  205. ld [%sp + STACK_START + 40], OFFSET
  206. #else
  207. ld [%sp + STACK_START + 28], C
  208. ld [%sp + STACK_START + 32], LDC
  209. ld [%sp + STACK_START + 36], OFFSET
  210. #endif
  211. st %g1, [%sp + STACK_START + 8]
  212. st %g2, [%sp + STACK_START + 12]
  213. st %g3, [%sp + STACK_START + 16]
  214. st %g4, [%sp + STACK_START + 20]
  215. #else
  216. ldx [%sp+ STACK_START + 56], C
  217. ldx [%sp+ STACK_START + 64], LDC
  218. ldx [%sp+ STACK_START + 72], OFFSET
  219. stx %g1, [%sp + STACK_START + 32]
  220. stx %g2, [%sp + STACK_START + 40]
  221. stx %g3, [%sp + STACK_START + 48]
  222. stx %g4, [%sp + STACK_START + 56]
  223. #endif
  224. #if defined(TRMMKERNEL) && !defined(LEFT)
  225. neg OFFSET, KK
  226. #endif
  227. sll LDC, BASE_SHIFT, LDC
  228. #ifdef LN
  229. smul M, K, TEMP1
  230. sll TEMP1, BASE_SHIFT, TEMP1
  231. add A, TEMP1, A
  232. sll M, BASE_SHIFT, TEMP1
  233. add C, TEMP1, C
  234. #endif
  235. #ifdef RN
  236. neg OFFSET, KK
  237. #endif
  238. #ifdef RT
  239. smul N, K, TEMP1
  240. sll TEMP1, BASE_SHIFT, TEMP1
  241. add B, TEMP1, B
  242. smul N, LDC, TEMP1
  243. add C, TEMP1, C
  244. sub N, OFFSET, KK
  245. #endif
  246. sra N, 3, J
  247. cmp J, 0
  248. ble,pn %icc, .LL30
  249. nop
  250. .align 4
  251. .LL11:
  252. #ifdef RT
  253. sll K, BASE_SHIFT + 3, TEMP1
  254. sub B, TEMP1, B
  255. #endif
  256. #ifndef RT
  257. mov C, C1
  258. add C, LDC, C2
  259. add C2, LDC, C3
  260. add C3, LDC, C4
  261. add C4, LDC, C5
  262. add C5, LDC, C6
  263. add C6, LDC, C7
  264. add C7, LDC, C8
  265. add C8, LDC, C
  266. #else
  267. sub C, LDC, C8
  268. sub C8, LDC, C7
  269. sub C7, LDC, C6
  270. sub C6, LDC, C5
  271. sub C5, LDC, C4
  272. sub C4, LDC, C3
  273. sub C3, LDC, C2
  274. sub C2, LDC, C1
  275. sub C2, LDC, C
  276. #endif
  277. #ifdef LN
  278. add M, OFFSET, KK
  279. #endif
  280. #ifdef LT
  281. mov OFFSET, KK
  282. #endif
  283. #if defined(LN) || defined(RT)
  284. mov A, AORIG
  285. #else
  286. mov A, AO
  287. #endif
  288. and M, 1, I
  289. cmp I, 0
  290. ble,pn %icc, .LL20
  291. nop
  292. #if defined(LT) || defined(RN)
  293. mov B, BO
  294. #else
  295. #ifdef LN
  296. sll K, BASE_SHIFT + 0, TEMP1
  297. sub AORIG, TEMP1, AORIG
  298. #endif
  299. sll KK, BASE_SHIFT + 0, TEMP1
  300. sll KK, BASE_SHIFT + 3, TEMP2
  301. add AORIG, TEMP1, AO
  302. add B, TEMP2, BO
  303. #endif
  304. LDF [AO + 0 * SIZE], a1
  305. LDF [AO + 1 * SIZE], a2
  306. LDF [AO + 2 * SIZE], a3
  307. LDF [AO + 3 * SIZE], a4
  308. LDF [BO + 0 * SIZE], b1
  309. FCLR (cc01)
  310. LDF [BO + 1 * SIZE], b2
  311. FCLR (cc03)
  312. LDF [BO + 2 * SIZE], b3
  313. FCLR (cc05)
  314. LDF [BO + 3 * SIZE], b4
  315. FCLR (cc07)
  316. LDF [BO + 4 * SIZE], b5
  317. FCLR (cc09)
  318. LDF [BO + 5 * SIZE], b6
  319. FCLR (cc11)
  320. LDF [BO + 6 * SIZE], b7
  321. FCLR (cc13)
  322. LDF [BO + 7 * SIZE], b8
  323. FCLR (cc15)
  324. #if defined(LT) || defined(RN)
  325. sra KK, 2, L
  326. #else
  327. sub K, KK, L
  328. sra L, 2, L
  329. #endif
  330. cmp L, 0
  331. ble,pn %icc, .LL25
  332. LDF [BO + 8 * SIZE], b9
  333. .align 4
  334. .LL23:
  335. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  336. add L, -1, L
  337. FMADD (aa1, bb1, cc01, cc01)
  338. LDF [BO + 16 * SIZE], b1
  339. FMADD (aa1, bb2, cc03, cc03)
  340. LDF [BO + 9 * SIZE], b2
  341. FMADD (aa1, bb3, cc05, cc05)
  342. LDF [BO + 10 * SIZE], b3
  343. FMADD (aa1, bb4, cc07, cc07)
  344. LDF [BO + 11 * SIZE], b4
  345. FMADD (aa1, bb5, cc09, cc09)
  346. LDF [BO + 12 * SIZE], b5
  347. FMADD (aa1, bb6, cc11, cc11)
  348. LDF [BO + 13 * SIZE], b6
  349. FMADD (aa1, bb7, cc13, cc13)
  350. LDF [BO + 14 * SIZE], b7
  351. FMADD (aa1, bb8, cc15, cc15)
  352. LDF [BO + 15 * SIZE], b8
  353. FMADD (aa2, bb9, cc01, cc01)
  354. LDF [BO + 24 * SIZE], b9
  355. FMADD (aa2, bb2, cc03, cc03)
  356. LDF [BO + 17 * SIZE], b2
  357. FMADD (aa2, bb3, cc05, cc05)
  358. LDF [BO + 18 * SIZE], b3
  359. FMADD (aa2, bb4, cc07, cc07)
  360. LDF [BO + 19 * SIZE], b4
  361. FMADD (aa2, bb5, cc09, cc09)
  362. LDF [BO + 20 * SIZE], b5
  363. FMADD (aa2, bb6, cc11, cc11)
  364. LDF [BO + 21 * SIZE], b6
  365. FMADD (aa2, bb7, cc13, cc13)
  366. LDF [BO + 22 * SIZE], b7
  367. FMADD (aa2, bb8, cc15, cc15)
  368. LDF [BO + 23 * SIZE], b8
  369. LDF [AO + 4 * SIZE], a1
  370. LDF [AO + 5 * SIZE], a2
  371. FMADD (aa3, bb1, cc01, cc01)
  372. LDF [BO + 32 * SIZE], b1
  373. FMADD (aa3, bb2, cc03, cc03)
  374. LDF [BO + 25 * SIZE], b2
  375. FMADD (aa3, bb3, cc05, cc05)
  376. LDF [BO + 26 * SIZE], b3
  377. FMADD (aa3, bb4, cc07, cc07)
  378. LDF [BO + 27 * SIZE], b4
  379. FMADD (aa3, bb5, cc09, cc09)
  380. LDF [BO + 28 * SIZE], b5
  381. FMADD (aa3, bb6, cc11, cc11)
  382. LDF [BO + 29 * SIZE], b6
  383. FMADD (aa3, bb7, cc13, cc13)
  384. LDF [BO + 30 * SIZE], b7
  385. FMADD (aa3, bb8, cc15, cc15)
  386. LDF [BO + 31 * SIZE], b8
  387. FMADD (aa4, bb9, cc01, cc01)
  388. LDF [BO + 40 * SIZE], b9
  389. FMADD (aa4, bb2, cc03, cc03)
  390. LDF [BO + 33 * SIZE], b2
  391. FMADD (aa4, bb3, cc05, cc05)
  392. LDF [BO + 34 * SIZE], b3
  393. FMADD (aa4, bb4, cc07, cc07)
  394. LDF [BO + 35 * SIZE], b4
  395. FMADD (aa4, bb5, cc09, cc09)
  396. LDF [BO + 36 * SIZE], b5
  397. FMADD (aa4, bb6, cc11, cc11)
  398. LDF [BO + 37 * SIZE], b6
  399. FMADD (aa4, bb7, cc13, cc13)
  400. LDF [BO + 38 * SIZE], b7
  401. FMADD (aa4, bb8, cc15, cc15)
  402. LDF [BO + 39 * SIZE], b8
  403. LDF [AO + 6 * SIZE], a3
  404. LDF [AO + 7 * SIZE], a4
  405. add AO, 4 * SIZE, AO
  406. cmp L, 0
  407. bg,pt %icc, .LL23
  408. add BO, 32 * SIZE, BO
  409. .align 4
  410. .LL25:
  411. #if defined(LT) || defined(RN)
  412. and KK, 3, L
  413. #else
  414. sub K, KK, L
  415. and L, 3, L
  416. #endif
  417. cmp L, 0
  418. ble,a,pn %icc, .LL28
  419. nop
  420. .align 4
  421. .LL27:
  422. FMADD (aa1, bb1, cc01, cc01)
  423. LDF [BO + 8 * SIZE], b1
  424. FMADD (aa1, bb2, cc03, cc03)
  425. LDF [BO + 9 * SIZE], b2
  426. FMADD (aa1, bb3, cc05, cc05)
  427. LDF [BO + 10 * SIZE], b3
  428. FMADD (aa1, bb4, cc07, cc07)
  429. LDF [BO + 11 * SIZE], b4
  430. FMADD (aa1, bb5, cc09, cc09)
  431. LDF [BO + 12 * SIZE], b5
  432. FMADD (aa1, bb6, cc11, cc11)
  433. LDF [BO + 13 * SIZE], b6
  434. FMADD (aa1, bb7, cc13, cc13)
  435. LDF [BO + 14 * SIZE], b7
  436. FMADD (aa1, bb8, cc15, cc15)
  437. LDF [BO + 15 * SIZE], b8
  438. LDF [AO + 1 * SIZE], a1
  439. add AO, 1 * SIZE, AO
  440. add L, -1, L
  441. cmp L, 0
  442. bg,pt %icc, .LL27
  443. add BO, 8 * SIZE, BO
  444. .align 4
  445. .LL28:
  446. #if defined(LN) || defined(RT)
  447. #ifdef LN
  448. sub KK, 1, TEMP1
  449. #else
  450. sub KK, 8, TEMP1
  451. #endif
  452. sll TEMP1, BASE_SHIFT + 0, TEMP2
  453. sll TEMP1, BASE_SHIFT + 3, TEMP1
  454. add AORIG, TEMP2, AO
  455. add B, TEMP1, BO
  456. #endif
  457. #if defined(LN) || defined(LT)
  458. LDF [BO + 0 * SIZE], a1
  459. LDF [BO + 1 * SIZE], a2
  460. LDF [BO + 2 * SIZE], a3
  461. LDF [BO + 3 * SIZE], a4
  462. LDF [BO + 4 * SIZE], b1
  463. LDF [BO + 5 * SIZE], b2
  464. LDF [BO + 6 * SIZE], b3
  465. LDF [BO + 7 * SIZE], b4
  466. FSUB a1, c01, c01
  467. FSUB a2, c03, c03
  468. FSUB a3, c05, c05
  469. FSUB a4, c07, c07
  470. FSUB b1, c09, c09
  471. FSUB b2, c11, c11
  472. FSUB b3, c13, c13
  473. FSUB b4, c15, c15
  474. #else
  475. LDF [AO + 0 * SIZE], a1
  476. LDF [AO + 1 * SIZE], a2
  477. LDF [AO + 2 * SIZE], a3
  478. LDF [AO + 3 * SIZE], a4
  479. LDF [AO + 4 * SIZE], b1
  480. LDF [AO + 5 * SIZE], b2
  481. LDF [AO + 6 * SIZE], b3
  482. LDF [AO + 7 * SIZE], b4
  483. FSUB a1, c01, c01
  484. FSUB a2, c03, c03
  485. FSUB a3, c05, c05
  486. FSUB a4, c07, c07
  487. FSUB b1, c09, c09
  488. FSUB b2, c11, c11
  489. FSUB b3, c13, c13
  490. FSUB b4, c15, c15
  491. #endif
  492. #if defined(LN) || defined(LT)
  493. LDF [AO + 0 * SIZE], a1
  494. FMUL a1, c01, c01
  495. FMUL a1, c03, c03
  496. FMUL a1, c05, c05
  497. FMUL a1, c07, c07
  498. FMUL a1, c09, c09
  499. FMUL a1, c11, c11
  500. FMUL a1, c13, c13
  501. FMUL a1, c15, c15
  502. #endif
  503. #ifdef RN
  504. LDF [BO + 0 * SIZE], a1
  505. LDF [BO + 1 * SIZE], a2
  506. LDF [BO + 2 * SIZE], a3
  507. LDF [BO + 3 * SIZE], a4
  508. LDF [BO + 4 * SIZE], b1
  509. LDF [BO + 5 * SIZE], b2
  510. LDF [BO + 6 * SIZE], b3
  511. LDF [BO + 7 * SIZE], b4
  512. FMUL a1, c01, c01
  513. FNMSUB (aa2, cc01, cc03, cc03)
  514. FNMSUB (aa3, cc01, cc05, cc05)
  515. FNMSUB (aa4, cc01, cc07, cc07)
  516. FNMSUB (bb1, cc01, cc09, cc09)
  517. FNMSUB (bb2, cc01, cc11, cc11)
  518. FNMSUB (bb3, cc01, cc13, cc13)
  519. FNMSUB (bb4, cc01, cc15, cc15)
  520. LDF [BO + 9 * SIZE], a1
  521. LDF [BO + 10 * SIZE], a2
  522. LDF [BO + 11 * SIZE], a3
  523. LDF [BO + 12 * SIZE], a4
  524. LDF [BO + 13 * SIZE], b1
  525. LDF [BO + 14 * SIZE], b2
  526. LDF [BO + 15 * SIZE], b3
  527. FMUL a1, c03, c03
  528. FNMSUB (aa2, cc03, cc05, cc05)
  529. FNMSUB (aa3, cc03, cc07, cc07)
  530. FNMSUB (aa4, cc03, cc09, cc09)
  531. FNMSUB (bb1, cc03, cc11, cc11)
  532. FNMSUB (bb2, cc03, cc13, cc13)
  533. FNMSUB (bb3, cc03, cc15, cc15)
  534. LDF [BO + 18 * SIZE], a1
  535. LDF [BO + 19 * SIZE], a2
  536. LDF [BO + 20 * SIZE], a3
  537. LDF [BO + 21 * SIZE], a4
  538. LDF [BO + 22 * SIZE], b1
  539. LDF [BO + 23 * SIZE], b2
  540. FMUL a1, c05, c05
  541. FNMSUB (aa2, cc05, cc07, cc07)
  542. FNMSUB (aa3, cc05, cc09, cc09)
  543. FNMSUB (aa4, cc05, cc11, cc11)
  544. FNMSUB (bb1, cc05, cc13, cc13)
  545. FNMSUB (bb2, cc05, cc15, cc15)
  546. LDF [BO + 27 * SIZE], a1
  547. LDF [BO + 28 * SIZE], a2
  548. LDF [BO + 29 * SIZE], a3
  549. LDF [BO + 30 * SIZE], a4
  550. LDF [BO + 31 * SIZE], b1
  551. FMUL a1, c07, c07
  552. FNMSUB (aa2, cc07, cc09, cc09)
  553. FNMSUB (aa3, cc07, cc11, cc11)
  554. FNMSUB (aa4, cc07, cc13, cc13)
  555. FNMSUB (bb1, cc07, cc15, cc15)
  556. LDF [BO + 36 * SIZE], a1
  557. LDF [BO + 37 * SIZE], a2
  558. LDF [BO + 38 * SIZE], a3
  559. LDF [BO + 39 * SIZE], a4
  560. FMUL a1, c09, c09
  561. FNMSUB (aa2, cc09, cc11, cc11)
  562. FNMSUB (aa3, cc09, cc13, cc13)
  563. FNMSUB (aa4, cc09, cc15, cc15)
  564. LDF [BO + 45 * SIZE], a1
  565. LDF [BO + 46 * SIZE], a2
  566. LDF [BO + 47 * SIZE], a3
  567. FMUL a1, c11, c11
  568. FNMSUB (aa2, cc11, cc13, cc13)
  569. FNMSUB (aa3, cc11, cc15, cc15)
  570. LDF [BO + 54 * SIZE], a1
  571. LDF [BO + 55 * SIZE], a2
  572. FMUL a1, c13, c13
  573. FNMSUB (aa2, cc13, cc15, cc15)
  574. LDF [BO + 63 * SIZE], a1
  575. FMUL a1, c15, c15
  576. #endif
  577. #ifdef RT
  578. LDF [BO + 63 * SIZE], a1
  579. LDF [BO + 62 * SIZE], a2
  580. LDF [BO + 61 * SIZE], a3
  581. LDF [BO + 60 * SIZE], a4
  582. LDF [BO + 59 * SIZE], b1
  583. LDF [BO + 58 * SIZE], b2
  584. LDF [BO + 57 * SIZE], b3
  585. LDF [BO + 56 * SIZE], b4
  586. FMUL a1, c15, c15
  587. FNMSUB (aa2, cc15, cc13, cc13)
  588. FNMSUB (aa3, cc15, cc11, cc11)
  589. FNMSUB (aa4, cc15, cc09, cc09)
  590. FNMSUB (bb1, cc15, cc07, cc07)
  591. FNMSUB (bb2, cc15, cc05, cc05)
  592. FNMSUB (bb3, cc15, cc03, cc03)
  593. FNMSUB (bb4, cc15, cc01, cc01)
  594. LDF [BO + 54 * SIZE], a1
  595. LDF [BO + 53 * SIZE], a2
  596. LDF [BO + 52 * SIZE], a3
  597. LDF [BO + 51 * SIZE], a4
  598. LDF [BO + 50 * SIZE], b1
  599. LDF [BO + 49 * SIZE], b2
  600. LDF [BO + 48 * SIZE], b3
  601. FMUL a1, c13, c13
  602. FNMSUB (aa2, cc13, cc11, cc11)
  603. FNMSUB (aa3, cc13, cc09, cc09)
  604. FNMSUB (aa4, cc13, cc07, cc07)
  605. FNMSUB (bb1, cc13, cc05, cc05)
  606. FNMSUB (bb2, cc13, cc03, cc03)
  607. FNMSUB (bb3, cc13, cc01, cc01)
  608. LDF [BO + 45 * SIZE], a1
  609. LDF [BO + 44 * SIZE], a2
  610. LDF [BO + 43 * SIZE], a3
  611. LDF [BO + 42 * SIZE], a4
  612. LDF [BO + 41 * SIZE], b1
  613. LDF [BO + 40 * SIZE], b2
  614. FMUL a1, c11, c11
  615. FNMSUB (aa2, cc11, cc09, cc09)
  616. FNMSUB (aa3, cc11, cc07, cc07)
  617. FNMSUB (aa4, cc11, cc05, cc05)
  618. FNMSUB (bb1, cc11, cc03, cc03)
  619. FNMSUB (bb2, cc11, cc01, cc01)
  620. LDF [BO + 36 * SIZE], a1
  621. LDF [BO + 35 * SIZE], a2
  622. LDF [BO + 34 * SIZE], a3
  623. LDF [BO + 33 * SIZE], a4
  624. LDF [BO + 32 * SIZE], b1
  625. FMUL a1, c09, c09
  626. FNMSUB (aa2, cc09, cc07, cc07)
  627. FNMSUB (aa3, cc09, cc05, cc05)
  628. FNMSUB (aa4, cc09, cc03, cc03)
  629. FNMSUB (bb1, cc09, cc01, cc01)
  630. LDF [BO + 27 * SIZE], a1
  631. LDF [BO + 26 * SIZE], a2
  632. LDF [BO + 25 * SIZE], a3
  633. LDF [BO + 24 * SIZE], a4
  634. FMUL a1, c07, c07
  635. FNMSUB (aa2, cc07, cc05, cc05)
  636. FNMSUB (aa3, cc07, cc03, cc03)
  637. FNMSUB (aa4, cc07, cc01, cc01)
  638. LDF [BO + 18 * SIZE], a1
  639. LDF [BO + 17 * SIZE], a2
  640. LDF [BO + 16 * SIZE], a3
  641. FMUL a1, c05, c05
  642. FNMSUB (aa2, cc05, cc03, cc03)
  643. FNMSUB (aa3, cc05, cc01, cc01)
  644. LDF [BO + 9 * SIZE], a1
  645. LDF [BO + 8 * SIZE], a2
  646. FMUL a1, c03, c03
  647. FNMSUB (aa2, cc03, cc01, cc01)
  648. LDF [BO + 0 * SIZE], a1
  649. FMUL a1, c01, c01
  650. #endif
  651. #ifdef LN
  652. add C1, -1 * SIZE, C1
  653. add C2, -1 * SIZE, C2
  654. add C3, -1 * SIZE, C3
  655. add C4, -1 * SIZE, C4
  656. add C5, -1 * SIZE, C5
  657. add C6, -1 * SIZE, C6
  658. add C7, -1 * SIZE, C7
  659. add C8, -1 * SIZE, C8
  660. #endif
  661. #if defined(LN) || defined(LT)
  662. STF c01, [BO + 0 * SIZE]
  663. STF c03, [BO + 1 * SIZE]
  664. STF c05, [BO + 2 * SIZE]
  665. STF c07, [BO + 3 * SIZE]
  666. STF c09, [BO + 4 * SIZE]
  667. STF c11, [BO + 5 * SIZE]
  668. STF c13, [BO + 6 * SIZE]
  669. STF c15, [BO + 7 * SIZE]
  670. #else
  671. STF c01, [AO + 0 * SIZE]
  672. STF c03, [AO + 1 * SIZE]
  673. STF c05, [AO + 2 * SIZE]
  674. STF c07, [AO + 3 * SIZE]
  675. STF c09, [AO + 4 * SIZE]
  676. STF c11, [AO + 5 * SIZE]
  677. STF c13, [AO + 6 * SIZE]
  678. STF c15, [AO + 7 * SIZE]
  679. #endif
  680. STF c01, [C1 + 0 * SIZE]
  681. STF c03, [C2 + 0 * SIZE]
  682. STF c05, [C3 + 0 * SIZE]
  683. STF c07, [C4 + 0 * SIZE]
  684. STF c09, [C5 + 0 * SIZE]
  685. STF c11, [C6 + 0 * SIZE]
  686. STF c13, [C7 + 0 * SIZE]
  687. STF c15, [C8 + 0 * SIZE]
  688. #ifdef RT
  689. sll K, BASE_SHIFT + 0, TEMP1
  690. add AORIG, TEMP1, AORIG
  691. #endif
  692. #if defined(LT) || defined(RN)
  693. sub K, KK, TEMP1
  694. sll TEMP1, BASE_SHIFT + 0, TEMP2
  695. sll TEMP1, BASE_SHIFT + 3, TEMP1
  696. add AO, TEMP2, AO
  697. add BO, TEMP1, BO
  698. #endif
  699. #ifdef LT
  700. add KK, 1, KK
  701. #endif
  702. #ifdef LN
  703. sub KK, 1, KK
  704. #endif
  /* .LL20: entry to the part of the M loop handled by .LL12 below. */
  /* I = M >> 1 (arithmetic shift), presumably the number of 2-wide tiles along M. */
  /* When I is not positive, control leaves for .LL29 (label not visible in this chunk). */
  705. .align 4
  706. .LL20:
  707. sra M, 1, I
  708. cmp I, 0
  709. ble,pn %icc, .LL29
  /* Branch delay slot: nothing useful is scheduled here. */
  710. nop
  /* .LL12: per-tile setup for the 2 x 8 accumulation loop. */
  /* It positions BO (and AO for LN/RT), preloads the first A and B operands, */
  /* clears the sixteen accumulators cc01..cc16, prefetches the C columns and */
  /* computes the trip count L of the unrolled loop .LL13. */
  711. .align 4
  712. .LL12:
  /* LT / RN: BO starts at B. AO is not modified on this path. */
  713. #if defined(LT) || defined(RN)
  714. mov B, BO
  715. #else
  /* LN only: AORIG steps back by K << (BASE_SHIFT + 1) bytes before use. */
  716. #ifdef LN
  717. sll K, BASE_SHIFT + 1, TEMP1
  718. sub AORIG, TEMP1, AORIG
  719. #endif
  /* LN / RT: AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << (BASE_SHIFT + 3)). */
  /* The two shift amounts differ by 2 bits, matching 2 A values versus 8 B values per k step. */
  720. sll KK, BASE_SHIFT + 1, TEMP1
  721. sll KK, BASE_SHIFT + 3, TEMP2
  722. add AORIG, TEMP1, AO
  723. add B, TEMP2, BO
  724. #endif
  /* Preload A operands: a1, a2 for the first k step, and a5 from AO + 8 */
  /* which .LL13 consumes in its fifth k step. */
  725. LDF [AO + 0 * SIZE], a1
  726. LDF [AO + 1 * SIZE], a2
  727. LDF [AO + 8 * SIZE], a5
  /* Preload B operands b1..b9 from BO + 0..8, interleaved with FCLR of the accumulators. */
  /* NOTE(review): aN / aaN, bN / bbN and cNN / ccNN presumably name the same registers */
  /* in the forms taken by plain instructions and by the FMADD-style macros. Confirm */
  /* against the register definitions at the top of the file. */
  728. LDF [BO + 0 * SIZE], b1
  729. LDF [BO + 1 * SIZE], b2
  730. FCLR (cc01)
  731. LDF [BO + 2 * SIZE], b3
  732. FCLR (cc05)
  733. LDF [BO + 3 * SIZE], b4
  734. FCLR (cc09)
  735. LDF [BO + 4 * SIZE], b5
  736. FCLR (cc13)
  737. LDF [BO + 5 * SIZE], b6
  738. FCLR (cc02)
  739. LDF [BO + 6 * SIZE], b7
  740. FCLR (cc06)
  741. LDF [BO + 7 * SIZE], b8
  742. FCLR (cc10)
  /* b9 holds the first B value of the second k step (BO + 8). */
  743. LDF [BO + 8 * SIZE], b9
  744. FCLR (cc14)
  /* Prefetch the eight C columns C1..C8 with function code 3 while the */
  /* remaining accumulators are cleared. In SPARC V9 code 3 means prefetch for one write. */
  745. prefetch [C1 + 1 * SIZE], 3
  746. FCLR (cc03)
  747. prefetch [C2 + 2 * SIZE], 3
  748. FCLR (cc07)
  749. prefetch [C3 + 1 * SIZE], 3
  750. FCLR (cc11)
  751. prefetch [C4 + 2 * SIZE], 3
  752. FCLR (cc15)
  753. prefetch [C5 + 1 * SIZE], 3
  754. FCLR (cc04)
  755. prefetch [C6 + 2 * SIZE], 3
  756. FCLR (cc08)
  757. prefetch [C7 + 1 * SIZE], 3
  758. FCLR (cc12)
  759. prefetch [C8 + 2 * SIZE], 3
  760. FCLR (cc16)
  /* Trip count for .LL13 in groups of 8 k steps: */
  /* L = KK >> 3 for LT / RN, otherwise L = (K - KK) >> 3. */
  761. #if defined(LT) || defined(RN)
  762. sra KK, 3, L
  763. #else
  764. sub K, KK, L
  765. sra L, 3, L
  766. #endif
  /* No full group of 8: skip the unrolled loop and go to the remainder handling at .LL15. */
  767. cmp L, 0
  768. ble,pn %icc, .LL15
  769. nop
  /* .LL13: main software-pipelined accumulation loop for the 2 x 8 tile. */
  /* One k step is a run of 16 FMADDs: two A values times eight B values, */
  /* accumulated into cc01..cc16 (odd ccNN use the first A value, even ccNN the second). */
  /* NOTE(review): FMADD (a, b, c, d) presumably computes d = a * b + c. The macro */
  /* is defined outside this chunk, confirm there. */
  /* The body holds two identical copies of 8 k steps. Each copy decrements L once, */
  /* advances BO by 64 * SIZE and AO by 16 * SIZE, and tests L at its end, so L */
  /* counts groups of 8 k steps. */
  /* Operand rotation inside a copy: */
  /*   A pairs alternate a1/a2 and a3/a4. In steps 5 and 7 a5 replaces a1, because */
  /*   a1 is already reloaded in step 4 with the first A value of the next group. */
  /*   The first B value alternates between b1 (odd steps) and b9 (even steps). */
  /*   b2..b8 are reloaded every step for the following step. */
  /* Loads for upcoming steps are interleaved with the FMADDs of the current step. */
  770. .align 4
  771. .LL13:
  /* Copy 1, k step 1: A = a1/a2, B = b1..b8. Loads b1 for step 3, b2..b8 and a3/a4 for step 2. */
  772. FMADD (aa1, bb1, cc01, cc01)
  773. FMADD (aa2, bb1, cc02, cc02)
  774. FMADD (aa1, bb2, cc03, cc03)
  775. FMADD (aa2, bb2, cc04, cc04)
  776. FMADD (aa1, bb3, cc05, cc05)
  777. LDF [BO + 16 * SIZE], b1
  778. FMADD (aa2, bb3, cc06, cc06)
  779. LDF [BO + 9 * SIZE], b2
  780. FMADD (aa1, bb4, cc07, cc07)
  781. LDF [BO + 10 * SIZE], b3
  782. FMADD (aa2, bb4, cc08, cc08)
  783. LDF [BO + 11 * SIZE], b4
  784. FMADD (aa1, bb5, cc09, cc09)
  785. LDF [AO + 2 * SIZE], a3
  786. FMADD (aa2, bb5, cc10, cc10)
  787. LDF [AO + 3 * SIZE], a4
  788. FMADD (aa1, bb6, cc11, cc11)
  /* Prefetch of the A stream ahead of AO. Distance and category come from macros defined elsewhere. */
  789. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  790. FMADD (aa2, bb6, cc12, cc12)
  791. nop
  792. FMADD (aa1, bb7, cc13, cc13)
  793. LDF [BO + 12 * SIZE], b5
  794. FMADD (aa2, bb7, cc14, cc14)
  795. LDF [BO + 13 * SIZE], b6
  796. FMADD (aa1, bb8, cc15, cc15)
  797. LDF [BO + 14 * SIZE], b7
  798. FMADD (aa2, bb8, cc16, cc16)
  799. LDF [BO + 15 * SIZE], b8
  /* Copy 1, k step 2: A = a3/a4, B = b9, b2..b8. Loads b9 for step 4 and a1/a2 for step 3. */
  800. FMADD (aa3, bb9, cc01, cc01)
  801. FMADD (aa4, bb9, cc02, cc02)
  802. FMADD (aa3, bb2, cc03, cc03)
  803. FMADD (aa4, bb2, cc04, cc04)
  804. FMADD (aa3, bb3, cc05, cc05)
  805. LDF [BO + 24 * SIZE], b9
  806. FMADD (aa4, bb3, cc06, cc06)
  807. LDF [BO + 17 * SIZE], b2
  808. FMADD (aa3, bb4, cc07, cc07)
  809. LDF [BO + 18 * SIZE], b3
  810. FMADD (aa4, bb4, cc08, cc08)
  811. LDF [BO + 19 * SIZE], b4
  812. FMADD (aa3, bb5, cc09, cc09)
  813. LDF [AO + 4 * SIZE], a1
  814. FMADD (aa4, bb5, cc10, cc10)
  815. LDF [AO + 5 * SIZE], a2
  816. FMADD (aa3, bb6, cc11, cc11)
  /* One group of 8 k steps is accounted for here. The result is tested in step 6. */
  817. add L, -1, L
  818. FMADD (aa4, bb6, cc12, cc12)
  819. nop
  820. FMADD (aa3, bb7, cc13, cc13)
  821. LDF [BO + 20 * SIZE], b5
  822. FMADD (aa4, bb7, cc14, cc14)
  823. LDF [BO + 21 * SIZE], b6
  824. FMADD (aa3, bb8, cc15, cc15)
  825. LDF [BO + 22 * SIZE], b7
  826. FMADD (aa4, bb8, cc16, cc16)
  827. LDF [BO + 23 * SIZE], b8
  /* Copy 1, k step 3: A = a1/a2, B = b1..b8. Loads b1 for step 5 and a3/a4 for step 4. */
  828. FMADD (aa1, bb1, cc01, cc01)
  829. FMADD (aa2, bb1, cc02, cc02)
  830. FMADD (aa1, bb2, cc03, cc03)
  831. FMADD (aa2, bb2, cc04, cc04)
  832. FMADD (aa1, bb3, cc05, cc05)
  833. LDF [BO + 32 * SIZE], b1
  834. FMADD (aa2, bb3, cc06, cc06)
  835. LDF [BO + 25 * SIZE], b2
  836. FMADD (aa1, bb4, cc07, cc07)
  837. LDF [BO + 26 * SIZE], b3
  838. FMADD (aa2, bb4, cc08, cc08)
  839. LDF [BO + 27 * SIZE], b4
  840. FMADD (aa1, bb5, cc09, cc09)
  841. LDF [AO + 6 * SIZE], a3
  842. FMADD (aa2, bb5, cc10, cc10)
  843. LDF [AO + 7 * SIZE], a4
  844. FMADD (aa1, bb6, cc11, cc11)
  845. nop
  846. FMADD (aa2, bb6, cc12, cc12)
  847. nop
  848. FMADD (aa1, bb7, cc13, cc13)
  849. LDF [BO + 28 * SIZE], b5
  850. FMADD (aa2, bb7, cc14, cc14)
  851. LDF [BO + 29 * SIZE], b6
  852. FMADD (aa1, bb8, cc15, cc15)
  853. LDF [BO + 30 * SIZE], b7
  854. FMADD (aa2, bb8, cc16, cc16)
  855. LDF [BO + 31 * SIZE], b8
  /* Copy 1, k step 4: A = a3/a4, B = b9, b2..b8. */
  /* a1 is reloaded from AO + 16, the first A value of the NEXT group of 8 steps. */
  /* Step 5 therefore takes its first A value from a5 (AO + 8) instead. */
  856. FMADD (aa3, bb9, cc01, cc01)
  857. FMADD (aa4, bb9, cc02, cc02)
  858. FMADD (aa3, bb2, cc03, cc03)
  859. FMADD (aa4, bb2, cc04, cc04)
  860. FMADD (aa3, bb3, cc05, cc05)
  861. LDF [BO + 40 * SIZE], b9
  862. FMADD (aa4, bb3, cc06, cc06)
  863. LDF [BO + 33 * SIZE], b2
  864. FMADD (aa3, bb4, cc07, cc07)
  865. LDF [BO + 34 * SIZE], b3
  866. FMADD (aa4, bb4, cc08, cc08)
  867. LDF [BO + 35 * SIZE], b4
  868. FMADD (aa3, bb5, cc09, cc09)
  869. LDF [AO + 16 * SIZE], a1 /****/
  870. FMADD (aa4, bb5, cc10, cc10)
  871. LDF [AO + 9 * SIZE], a2
  872. FMADD (aa3, bb6, cc11, cc11)
  873. nop
  874. FMADD (aa4, bb6, cc12, cc12)
  875. nop
  876. FMADD (aa3, bb7, cc13, cc13)
  877. LDF [BO + 36 * SIZE], b5
  878. FMADD (aa4, bb7, cc14, cc14)
  879. LDF [BO + 37 * SIZE], b6
  880. FMADD (aa3, bb8, cc15, cc15)
  881. LDF [BO + 38 * SIZE], b7
  882. FMADD (aa4, bb8, cc16, cc16)
  883. LDF [BO + 39 * SIZE], b8
  /* Copy 1, k step 5: A = a5/a2 (AO + 8, AO + 9), B = b1..b8. Loads a3/a4 for step 6. */
  884. FMADD (aa5, bb1, cc01, cc01)
  885. FMADD (aa2, bb1, cc02, cc02)
  886. FMADD (aa5, bb2, cc03, cc03)
  887. FMADD (aa2, bb2, cc04, cc04)
  888. FMADD (aa5, bb3, cc05, cc05)
  889. LDF [BO + 48 * SIZE], b1
  890. FMADD (aa2, bb3, cc06, cc06)
  891. LDF [BO + 41 * SIZE], b2
  892. FMADD (aa5, bb4, cc07, cc07)
  893. LDF [BO + 42 * SIZE], b3
  894. FMADD (aa2, bb4, cc08, cc08)
  895. LDF [BO + 43 * SIZE], b4
  896. FMADD (aa5, bb5, cc09, cc09)
  897. LDF [AO + 10 * SIZE], a3
  898. FMADD (aa2, bb5, cc10, cc10)
  899. LDF [AO + 11 * SIZE], a4
  900. FMADD (aa5, bb6, cc11, cc11)
  /* Second A prefetch of this copy, 8 elements further than the one in step 1. */
  901. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  902. FMADD (aa2, bb6, cc12, cc12)
  903. nop
  904. FMADD (aa5, bb7, cc13, cc13)
  905. LDF [BO + 44 * SIZE], b5
  906. FMADD (aa2, bb7, cc14, cc14)
  907. LDF [BO + 45 * SIZE], b6
  908. FMADD (aa5, bb8, cc15, cc15)
  909. LDF [BO + 46 * SIZE], b7
  910. FMADD (aa2, bb8, cc16, cc16)
  911. LDF [BO + 47 * SIZE], b8
  /* Copy 1, k step 6: A = a3/a4, B = b9, b2..b8. Loads a5/a2 (AO + 12, AO + 13) for step 7. */
  912. FMADD (aa3, bb9, cc01, cc01)
  913. FMADD (aa4, bb9, cc02, cc02)
  914. FMADD (aa3, bb2, cc03, cc03)
  915. FMADD (aa4, bb2, cc04, cc04)
  916. FMADD (aa3, bb3, cc05, cc05)
  917. LDF [BO + 56 * SIZE], b9
  918. FMADD (aa4, bb3, cc06, cc06)
  919. LDF [BO + 49 * SIZE], b2
  920. FMADD (aa3, bb4, cc07, cc07)
  921. LDF [BO + 50 * SIZE], b3
  922. FMADD (aa4, bb4, cc08, cc08)
  923. LDF [BO + 51 * SIZE], b4
  924. FMADD (aa3, bb5, cc09, cc09)
  925. LDF [AO + 12 * SIZE], a5
  926. FMADD (aa4, bb5, cc10, cc10)
  927. LDF [AO + 13 * SIZE], a2
  928. FMADD (aa3, bb6, cc11, cc11)
  /* Sets the condition codes consumed by the ble at the end of step 8. */
  /* The plain add instructions in between do not modify them. */
  929. cmp L, 0
  930. FMADD (aa4, bb6, cc12, cc12)
  931. nop
  932. FMADD (aa3, bb7, cc13, cc13)
  933. LDF [BO + 52 * SIZE], b5
  934. FMADD (aa4, bb7, cc14, cc14)
  935. LDF [BO + 53 * SIZE], b6
  936. FMADD (aa3, bb8, cc15, cc15)
  937. LDF [BO + 54 * SIZE], b7
  938. FMADD (aa4, bb8, cc16, cc16)
  939. LDF [BO + 55 * SIZE], b8
  /* Copy 1, k step 7: A = a5/a2, B = b1..b8. Loads a3/a4 for step 8. */
  /* BO and AO advance here by one full group, so the b5..b8 loads that follow */
  /* use negative offsets from the already advanced BO. */
  940. FMADD (aa5, bb1, cc01, cc01)
  941. FMADD (aa2, bb1, cc02, cc02)
  942. FMADD (aa5, bb2, cc03, cc03)
  943. FMADD (aa2, bb2, cc04, cc04)
  944. FMADD (aa5, bb3, cc05, cc05)
  945. LDF [BO + 64 * SIZE], b1
  946. FMADD (aa2, bb3, cc06, cc06)
  947. LDF [BO + 57 * SIZE], b2
  948. FMADD (aa5, bb4, cc07, cc07)
  949. LDF [BO + 58 * SIZE], b3
  950. FMADD (aa2, bb4, cc08, cc08)
  951. LDF [BO + 59 * SIZE], b4
  952. FMADD (aa5, bb5, cc09, cc09)
  953. LDF [AO + 14 * SIZE], a3
  954. FMADD (aa2, bb5, cc10, cc10)
  955. LDF [AO + 15 * SIZE], a4
  956. FMADD (aa5, bb6, cc11, cc11)
  957. add BO, 64 * SIZE, BO
  958. FMADD (aa2, bb6, cc12, cc12)
  959. add AO, 16 * SIZE, AO
  960. FMADD (aa5, bb7, cc13, cc13)
  961. LDF [BO - 4 * SIZE], b5
  962. FMADD (aa2, bb7, cc14, cc14)
  963. LDF [BO - 3 * SIZE], b6
  964. FMADD (aa5, bb8, cc15, cc15)
  965. LDF [BO - 2 * SIZE], b7
  966. FMADD (aa2, bb8, cc16, cc16)
  967. LDF [BO - 1 * SIZE], b8
  /* Copy 1, k step 8: A = a3/a4, B = b9, b2..b8. */
  /* Its loads re-establish the preload state expected at .LL13 and .LL15 relative */
  /* to the advanced pointers: b9 = BO + 8, b2..b8 = BO + 1..7, a5 = AO + 8, a2 = AO + 1. */
  /* a1 and b1 were already loaded in steps 4 and 7. */
  968. FMADD (aa3, bb9, cc01, cc01)
  969. FMADD (aa4, bb9, cc02, cc02)
  970. FMADD (aa3, bb2, cc03, cc03)
  971. FMADD (aa4, bb2, cc04, cc04)
  972. FMADD (aa3, bb3, cc05, cc05)
  973. LDF [BO + 8 * SIZE], b9
  974. FMADD (aa4, bb3, cc06, cc06)
  975. LDF [BO + 1 * SIZE], b2
  976. FMADD (aa3, bb4, cc07, cc07)
  977. LDF [BO + 2 * SIZE], b3
  978. FMADD (aa4, bb4, cc08, cc08)
  979. LDF [BO + 3 * SIZE], b4
  980. FMADD (aa3, bb5, cc09, cc09)
  981. LDF [AO + 8 * SIZE], a5 /****/
  982. FMADD (aa4, bb5, cc10, cc10)
  983. LDF [AO + 1 * SIZE], a2
  984. FMADD (aa3, bb6, cc11, cc11)
  985. FMADD (aa4, bb6, cc12, cc12)
  986. FMADD (aa3, bb7, cc13, cc13)
  987. LDF [BO + 4 * SIZE], b5
  988. FMADD (aa4, bb7, cc14, cc14)
  989. LDF [BO + 5 * SIZE], b6
  990. FMADD (aa3, bb8, cc15, cc15)
  991. LDF [BO + 6 * SIZE], b7
  992. FMADD (aa4, bb8, cc16, cc16)
  /* Mid-body exit: leave for .LL15 when L reached zero after the first copy. */
  /* The b8 load sits in the delay slot of this non-annulled branch and runs on both paths. */
  993. ble,pn %icc, .LL15
  994. LDF [BO + 7 * SIZE], b8
  /* Copy 2, k step 1: same schedule as copy 1, step 1. */
  995. FMADD (aa1, bb1, cc01, cc01)
  996. FMADD (aa2, bb1, cc02, cc02)
  997. FMADD (aa1, bb2, cc03, cc03)
  998. FMADD (aa2, bb2, cc04, cc04)
  999. FMADD (aa1, bb3, cc05, cc05)
  1000. LDF [BO + 16 * SIZE], b1
  1001. FMADD (aa2, bb3, cc06, cc06)
  1002. LDF [BO + 9 * SIZE], b2
  1003. FMADD (aa1, bb4, cc07, cc07)
  1004. LDF [BO + 10 * SIZE], b3
  1005. FMADD (aa2, bb4, cc08, cc08)
  1006. LDF [BO + 11 * SIZE], b4
  1007. FMADD (aa1, bb5, cc09, cc09)
  1008. LDF [AO + 2 * SIZE], a3
  1009. FMADD (aa2, bb5, cc10, cc10)
  1010. LDF [AO + 3 * SIZE], a4
  1011. FMADD (aa1, bb6, cc11, cc11)
  1012. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1013. FMADD (aa2, bb6, cc12, cc12)
  1014. nop
  1015. FMADD (aa1, bb7, cc13, cc13)
  1016. LDF [BO + 12 * SIZE], b5
  1017. FMADD (aa2, bb7, cc14, cc14)
  1018. LDF [BO + 13 * SIZE], b6
  1019. FMADD (aa1, bb8, cc15, cc15)
  1020. LDF [BO + 14 * SIZE], b7
  1021. FMADD (aa2, bb8, cc16, cc16)
  1022. LDF [BO + 15 * SIZE], b8
  /* Copy 2, k step 2: A = a3/a4, B = b9, b2..b8. L is decremented for this group. */
  1023. FMADD (aa3, bb9, cc01, cc01)
  1024. FMADD (aa4, bb9, cc02, cc02)
  1025. FMADD (aa3, bb2, cc03, cc03)
  1026. FMADD (aa4, bb2, cc04, cc04)
  1027. FMADD (aa3, bb3, cc05, cc05)
  1028. LDF [BO + 24 * SIZE], b9
  1029. FMADD (aa4, bb3, cc06, cc06)
  1030. LDF [BO + 17 * SIZE], b2
  1031. FMADD (aa3, bb4, cc07, cc07)
  1032. LDF [BO + 18 * SIZE], b3
  1033. FMADD (aa4, bb4, cc08, cc08)
  1034. LDF [BO + 19 * SIZE], b4
  1035. FMADD (aa3, bb5, cc09, cc09)
  1036. LDF [AO + 4 * SIZE], a1
  1037. FMADD (aa4, bb5, cc10, cc10)
  1038. LDF [AO + 5 * SIZE], a2
  1039. FMADD (aa3, bb6, cc11, cc11)
  1040. add L, -1, L
  1041. FMADD (aa4, bb6, cc12, cc12)
  1042. nop
  1043. FMADD (aa3, bb7, cc13, cc13)
  1044. LDF [BO + 20 * SIZE], b5
  1045. FMADD (aa4, bb7, cc14, cc14)
  1046. LDF [BO + 21 * SIZE], b6
  1047. FMADD (aa3, bb8, cc15, cc15)
  1048. LDF [BO + 22 * SIZE], b7
  1049. FMADD (aa4, bb8, cc16, cc16)
  1050. LDF [BO + 23 * SIZE], b8
  /* Copy 2, k step 3: A = a1/a2, B = b1..b8. */
  1051. FMADD (aa1, bb1, cc01, cc01)
  1052. FMADD (aa2, bb1, cc02, cc02)
  1053. FMADD (aa1, bb2, cc03, cc03)
  1054. FMADD (aa2, bb2, cc04, cc04)
  1055. FMADD (aa1, bb3, cc05, cc05)
  1056. LDF [BO + 32 * SIZE], b1
  1057. FMADD (aa2, bb3, cc06, cc06)
  1058. LDF [BO + 25 * SIZE], b2
  1059. FMADD (aa1, bb4, cc07, cc07)
  1060. LDF [BO + 26 * SIZE], b3
  1061. FMADD (aa2, bb4, cc08, cc08)
  1062. LDF [BO + 27 * SIZE], b4
  1063. FMADD (aa1, bb5, cc09, cc09)
  1064. LDF [AO + 6 * SIZE], a3
  1065. FMADD (aa2, bb5, cc10, cc10)
  1066. LDF [AO + 7 * SIZE], a4
  1067. FMADD (aa1, bb6, cc11, cc11)
  1068. nop
  1069. FMADD (aa2, bb6, cc12, cc12)
  1070. nop
  1071. FMADD (aa1, bb7, cc13, cc13)
  1072. LDF [BO + 28 * SIZE], b5
  1073. FMADD (aa2, bb7, cc14, cc14)
  1074. LDF [BO + 29 * SIZE], b6
  1075. FMADD (aa1, bb8, cc15, cc15)
  1076. LDF [BO + 30 * SIZE], b7
  1077. FMADD (aa2, bb8, cc16, cc16)
  1078. LDF [BO + 31 * SIZE], b8
  /* Copy 2, k step 4: A = a3/a4, B = b9, b2..b8. a1 is reloaded from AO + 16 for the next group. */
  1079. FMADD (aa3, bb9, cc01, cc01)
  1080. FMADD (aa4, bb9, cc02, cc02)
  1081. FMADD (aa3, bb2, cc03, cc03)
  1082. FMADD (aa4, bb2, cc04, cc04)
  1083. FMADD (aa3, bb3, cc05, cc05)
  1084. LDF [BO + 40 * SIZE], b9
  1085. FMADD (aa4, bb3, cc06, cc06)
  1086. LDF [BO + 33 * SIZE], b2
  1087. FMADD (aa3, bb4, cc07, cc07)
  1088. LDF [BO + 34 * SIZE], b3
  1089. FMADD (aa4, bb4, cc08, cc08)
  1090. LDF [BO + 35 * SIZE], b4
  1091. FMADD (aa3, bb5, cc09, cc09)
  1092. LDF [AO + 16 * SIZE], a1 /****/
  1093. FMADD (aa4, bb5, cc10, cc10)
  1094. LDF [AO + 9 * SIZE], a2
  1095. FMADD (aa3, bb6, cc11, cc11)
  1096. nop
  1097. FMADD (aa4, bb6, cc12, cc12)
  1098. nop
  1099. FMADD (aa3, bb7, cc13, cc13)
  1100. LDF [BO + 36 * SIZE], b5
  1101. FMADD (aa4, bb7, cc14, cc14)
  1102. LDF [BO + 37 * SIZE], b6
  1103. FMADD (aa3, bb8, cc15, cc15)
  1104. LDF [BO + 38 * SIZE], b7
  1105. FMADD (aa4, bb8, cc16, cc16)
  1106. LDF [BO + 39 * SIZE], b8
  /* Copy 2, k step 5: A = a5/a2, B = b1..b8. */
  1107. FMADD (aa5, bb1, cc01, cc01)
  1108. FMADD (aa2, bb1, cc02, cc02)
  1109. FMADD (aa5, bb2, cc03, cc03)
  1110. FMADD (aa2, bb2, cc04, cc04)
  1111. FMADD (aa5, bb3, cc05, cc05)
  1112. LDF [BO + 48 * SIZE], b1
  1113. FMADD (aa2, bb3, cc06, cc06)
  1114. LDF [BO + 41 * SIZE], b2
  1115. FMADD (aa5, bb4, cc07, cc07)
  1116. LDF [BO + 42 * SIZE], b3
  1117. FMADD (aa2, bb4, cc08, cc08)
  1118. LDF [BO + 43 * SIZE], b4
  1119. FMADD (aa5, bb5, cc09, cc09)
  1120. LDF [AO + 10 * SIZE], a3
  1121. FMADD (aa2, bb5, cc10, cc10)
  1122. LDF [AO + 11 * SIZE], a4
  1123. FMADD (aa5, bb6, cc11, cc11)
  1124. prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
  1125. FMADD (aa2, bb6, cc12, cc12)
  1126. nop
  1127. FMADD (aa5, bb7, cc13, cc13)
  1128. LDF [BO + 44 * SIZE], b5
  1129. FMADD (aa2, bb7, cc14, cc14)
  1130. LDF [BO + 45 * SIZE], b6
  1131. FMADD (aa5, bb8, cc15, cc15)
  1132. LDF [BO + 46 * SIZE], b7
  1133. FMADD (aa2, bb8, cc16, cc16)
  1134. LDF [BO + 47 * SIZE], b8
  /* Copy 2, k step 6: A = a3/a4, B = b9, b2..b8. The cmp here feeds the loop-back branch. */
  1135. FMADD (aa3, bb9, cc01, cc01)
  1136. FMADD (aa4, bb9, cc02, cc02)
  1137. FMADD (aa3, bb2, cc03, cc03)
  1138. FMADD (aa4, bb2, cc04, cc04)
  1139. FMADD (aa3, bb3, cc05, cc05)
  1140. LDF [BO + 56 * SIZE], b9
  1141. FMADD (aa4, bb3, cc06, cc06)
  1142. LDF [BO + 49 * SIZE], b2
  1143. FMADD (aa3, bb4, cc07, cc07)
  1144. LDF [BO + 50 * SIZE], b3
  1145. FMADD (aa4, bb4, cc08, cc08)
  1146. LDF [BO + 51 * SIZE], b4
  1147. FMADD (aa3, bb5, cc09, cc09)
  1148. LDF [AO + 12 * SIZE], a5
  1149. FMADD (aa4, bb5, cc10, cc10)
  1150. LDF [AO + 13 * SIZE], a2
  1151. FMADD (aa3, bb6, cc11, cc11)
  1152. cmp L, 0
  1153. FMADD (aa4, bb6, cc12, cc12)
  1154. nop
  1155. FMADD (aa3, bb7, cc13, cc13)
  1156. LDF [BO + 52 * SIZE], b5
  1157. FMADD (aa4, bb7, cc14, cc14)
  1158. LDF [BO + 53 * SIZE], b6
  1159. FMADD (aa3, bb8, cc15, cc15)
  1160. LDF [BO + 54 * SIZE], b7
  1161. FMADD (aa4, bb8, cc16, cc16)
  1162. LDF [BO + 55 * SIZE], b8
  /* Copy 2, k step 7: A = a5/a2, B = b1..b8. BO += 64 * SIZE and AO += 16 * SIZE happen here. */
  1163. FMADD (aa5, bb1, cc01, cc01)
  1164. FMADD (aa2, bb1, cc02, cc02)
  1165. FMADD (aa5, bb2, cc03, cc03)
  1166. FMADD (aa2, bb2, cc04, cc04)
  1167. FMADD (aa5, bb3, cc05, cc05)
  1168. LDF [BO + 64 * SIZE], b1
  1169. FMADD (aa2, bb3, cc06, cc06)
  1170. LDF [BO + 57 * SIZE], b2
  1171. FMADD (aa5, bb4, cc07, cc07)
  1172. LDF [BO + 58 * SIZE], b3
  1173. FMADD (aa2, bb4, cc08, cc08)
  1174. LDF [BO + 59 * SIZE], b4
  1175. FMADD (aa5, bb5, cc09, cc09)
  1176. LDF [AO + 14 * SIZE], a3
  1177. FMADD (aa2, bb5, cc10, cc10)
  1178. LDF [AO + 15 * SIZE], a4
  1179. FMADD (aa5, bb6, cc11, cc11)
  1180. add BO, 64 * SIZE, BO
  1181. FMADD (aa2, bb6, cc12, cc12)
  1182. add AO, 16 * SIZE, AO
  1183. FMADD (aa5, bb7, cc13, cc13)
  1184. LDF [BO - 4 * SIZE], b5
  1185. FMADD (aa2, bb7, cc14, cc14)
  1186. LDF [BO - 3 * SIZE], b6
  1187. FMADD (aa5, bb8, cc15, cc15)
  1188. LDF [BO - 2 * SIZE], b7
  1189. FMADD (aa2, bb8, cc16, cc16)
  1190. LDF [BO - 1 * SIZE], b8
  /* Copy 2, k step 8: A = a3/a4, B = b9, b2..b8. Reloads the preload state as in copy 1, step 8. */
  1191. FMADD (aa3, bb9, cc01, cc01)
  1192. FMADD (aa4, bb9, cc02, cc02)
  1193. FMADD (aa3, bb2, cc03, cc03)
  1194. FMADD (aa4, bb2, cc04, cc04)
  1195. FMADD (aa3, bb3, cc05, cc05)
  1196. LDF [BO + 8 * SIZE], b9
  1197. FMADD (aa4, bb3, cc06, cc06)
  1198. LDF [BO + 1 * SIZE], b2
  1199. FMADD (aa3, bb4, cc07, cc07)
  1200. LDF [BO + 2 * SIZE], b3
  1201. FMADD (aa4, bb4, cc08, cc08)
  1202. LDF [BO + 3 * SIZE], b4
  1203. FMADD (aa3, bb5, cc09, cc09)
  1204. LDF [AO + 8 * SIZE], a5 /****/
  1205. FMADD (aa4, bb5, cc10, cc10)
  1206. LDF [AO + 1 * SIZE], a2
  1207. FMADD (aa3, bb6, cc11, cc11)
  1208. FMADD (aa4, bb6, cc12, cc12)
  1209. FMADD (aa3, bb7, cc13, cc13)
  1210. LDF [BO + 4 * SIZE], b5
  1211. FMADD (aa4, bb7, cc14, cc14)
  1212. LDF [BO + 5 * SIZE], b6
  1213. FMADD (aa3, bb8, cc15, cc15)
  1214. LDF [BO + 6 * SIZE], b7
  1215. FMADD (aa4, bb8, cc16, cc16)
  /* Loop back while groups remain (L greater than 0). The b8 load in the delay slot runs either way. */
  1216. bg,pt %icc, .LL13
  1217. LDF [BO + 7 * SIZE], b8
  /* .LL15: remainder count after the 8-step groups handled by .LL13. */
  /* L = KK & 7 for LT / RN, otherwise L = (K - KK) & 7, the k steps */
  /* left over from the shift by 3 used for the .LL13 trip count. */
  1218. .align 4
  1219. .LL15:
  1220. #if defined(LT) || defined(RN)
  1221. and KK, 7, L
  1222. #else
  1223. sub K, KK, L
  1224. and L, 7, L
  1225. #endif
  /* No leftover steps: go straight to .LL18, skipping the tail loop .LL17. */
  /* The annul bit means the delay slot instruction only executes when the branch is taken. */
  1226. cmp L, 0
  1227. ble,a,pn %icc, .LL18
  1228. nop
  /* .LL17: tail loop, one k step per iteration, executed L times (L set at .LL15). */
  /* Each iteration issues the 16 FMADDs of a 2 x 8 update from a1/a2 and b1..b8, */
  /* advances AO by 2 * SIZE and BO by 8 * SIZE, and reloads all ten operands */
  /* for the next iteration. */
  /* NOTE(review): FMADD (a, b, c, d) presumably computes d = a * b + c. Confirm against the macro definition. */
  1229. .align 4
  1230. .LL17:
  1231. FMADD (aa1, bb1, cc01, cc01)
  1232. add L, -1, L
  1233. FMADD (aa2, bb1, cc02, cc02)
  1234. nop
  /* b1 and b2 are reloaded from the next step (BO + 8, BO + 9) right after their last use. */
  1235. FMADD (aa1, bb2, cc03, cc03)
  1236. LDF [BO + 8 * SIZE], b1
  1237. FMADD (aa2, bb2, cc04, cc04)
  1238. LDF [BO + 9 * SIZE], b2
  1239. FMADD (aa1, bb3, cc05, cc05)
  /* Condition codes for the bg at the bottom. The plain adds below do not modify them. */
  1240. cmp L, 0
  1241. FMADD (aa2, bb3, cc06, cc06)
  1242. nop
  1243. FMADD (aa1, bb4, cc07, cc07)
  1244. LDF [BO + 10 * SIZE], b3
  1245. FMADD (aa2, bb4, cc08, cc08)
  1246. LDF [BO + 11 * SIZE], b4
  1247. FMADD (aa1, bb5, cc09, cc09)
  1248. nop
  1249. FMADD (aa2, bb5, cc10, cc10)
  1250. nop
  1251. FMADD (aa1, bb6, cc11, cc11)
  1252. LDF [BO + 12 * SIZE], b5
  1253. FMADD (aa2, bb6, cc12, cc12)
  1254. LDF [BO + 13 * SIZE], b6
  1255. FMADD (aa1, bb7, cc13, cc13)
  /* Pointers advance by one k step. Later loads use offsets relative to the new AO and BO. */
  1256. add AO, 2 * SIZE, AO
  1257. FMADD (aa2, bb7, cc14, cc14)
  1258. add BO, 8 * SIZE, BO
  1259. FMADD (aa1, bb8, cc15, cc15)
  1260. LDF [AO + 0 * SIZE], a1
  1261. FMADD (aa2, bb8, cc16, cc16)
  1262. LDF [AO + 1 * SIZE], a2
  1263. LDF [BO + 6 * SIZE], b7
  /* Repeat while L is greater than 0. The b8 load in the delay slot runs on both paths. */
  1264. bg,pt %icc, .LL17
  1265. LDF [BO + 7 * SIZE], b8
  1266. nop
  1267. .align 4
  1268. .LL18:
  1269. #if defined(LN) || defined(RT)
  1270. #ifdef LN
  1271. sub KK, 2, TEMP1
  1272. #else
  1273. sub KK, 8, TEMP1
  1274. #endif
  1275. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1276. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1277. add AORIG, TEMP2, AO
  1278. add B, TEMP1, BO
  1279. #endif
  1280. #if defined(LN) || defined(LT)
  1281. LDF [BO + 0 * SIZE], a1
  1282. LDF [BO + 1 * SIZE], a2
  1283. LDF [BO + 2 * SIZE], a3
  1284. LDF [BO + 3 * SIZE], a4
  1285. LDF [BO + 4 * SIZE], b1
  1286. LDF [BO + 5 * SIZE], b2
  1287. LDF [BO + 6 * SIZE], b3
  1288. LDF [BO + 7 * SIZE], b4
  1289. FSUB a1, c01, c01
  1290. FSUB a2, c03, c03
  1291. FSUB a3, c05, c05
  1292. FSUB a4, c07, c07
  1293. FSUB b1, c09, c09
  1294. FSUB b2, c11, c11
  1295. FSUB b3, c13, c13
  1296. FSUB b4, c15, c15
  1297. LDF [BO + 8 * SIZE], a1
  1298. LDF [BO + 9 * SIZE], a2
  1299. LDF [BO + 10 * SIZE], a3
  1300. LDF [BO + 11 * SIZE], a4
  1301. LDF [BO + 12 * SIZE], b1
  1302. LDF [BO + 13 * SIZE], b2
  1303. LDF [BO + 14 * SIZE], b3
  1304. LDF [BO + 15 * SIZE], b4
  1305. FSUB a1, c02, c02
  1306. FSUB a2, c04, c04
  1307. FSUB a3, c06, c06
  1308. FSUB a4, c08, c08
  1309. FSUB b1, c10, c10
  1310. FSUB b2, c12, c12
  1311. FSUB b3, c14, c14
  1312. FSUB b4, c16, c16
  1313. #else
  1314. LDF [AO + 0 * SIZE], a1
  1315. LDF [AO + 1 * SIZE], a2
  1316. LDF [AO + 2 * SIZE], a3
  1317. LDF [AO + 3 * SIZE], a4
  1318. LDF [AO + 4 * SIZE], b1
  1319. LDF [AO + 5 * SIZE], b2
  1320. LDF [AO + 6 * SIZE], b3
  1321. LDF [AO + 7 * SIZE], b4
  1322. FSUB a1, c01, c01
  1323. FSUB a2, c02, c02
  1324. FSUB a3, c03, c03
  1325. FSUB a4, c04, c04
  1326. FSUB b1, c05, c05
  1327. FSUB b2, c06, c06
  1328. FSUB b3, c07, c07
  1329. FSUB b4, c08, c08
  1330. LDF [AO + 8 * SIZE], a1
  1331. LDF [AO + 9 * SIZE], a2
  1332. LDF [AO + 10 * SIZE], a3
  1333. LDF [AO + 11 * SIZE], a4
  1334. LDF [AO + 12 * SIZE], b1
  1335. LDF [AO + 13 * SIZE], b2
  1336. LDF [AO + 14 * SIZE], b3
  1337. LDF [AO + 15 * SIZE], b4
  1338. FSUB a1, c09, c09
  1339. FSUB a2, c10, c10
  1340. FSUB a3, c11, c11
  1341. FSUB a4, c12, c12
  1342. FSUB b1, c13, c13
  1343. FSUB b2, c14, c14
  1344. FSUB b3, c15, c15
  1345. FSUB b4, c16, c16
  1346. #endif
  1347. #ifdef LN
  1348. LDF [AO + 3 * SIZE], a1
  1349. LDF [AO + 2 * SIZE], a2
  1350. LDF [AO + 0 * SIZE], a3
  1351. FMUL a1, c02, c02
  1352. FMUL a1, c04, c04
  1353. FMUL a1, c06, c06
  1354. FMUL a1, c08, c08
  1355. FMUL a1, c10, c10
  1356. FMUL a1, c12, c12
  1357. FMUL a1, c14, c14
  1358. FMUL a1, c16, c16
  1359. FNMSUB (aa2, cc02, cc01, cc01)
  1360. FNMSUB (aa2, cc04, cc03, cc03)
  1361. FNMSUB (aa2, cc06, cc05, cc05)
  1362. FNMSUB (aa2, cc08, cc07, cc07)
  1363. FNMSUB (aa2, cc10, cc09, cc09)
  1364. FNMSUB (aa2, cc12, cc11, cc11)
  1365. FNMSUB (aa2, cc14, cc13, cc13)
  1366. FNMSUB (aa2, cc16, cc15, cc15)
  1367. FMUL a3, c01, c01
  1368. FMUL a3, c03, c03
  1369. FMUL a3, c05, c05
  1370. FMUL a3, c07, c07
  1371. FMUL a3, c09, c09
  1372. FMUL a3, c11, c11
  1373. FMUL a3, c13, c13
  1374. FMUL a3, c15, c15
  1375. #endif
  1376. #ifdef LT
  1377. LDF [AO + 0 * SIZE], a1
  1378. LDF [AO + 1 * SIZE], a2
  1379. LDF [AO + 3 * SIZE], a3
  1380. FMUL a1, c01, c01
  1381. FMUL a1, c03, c03
  1382. FMUL a1, c05, c05
  1383. FMUL a1, c07, c07
  1384. FMUL a1, c09, c09
  1385. FMUL a1, c11, c11
  1386. FMUL a1, c13, c13
  1387. FMUL a1, c15, c15
  1388. FNMSUB (aa2, cc01, cc02, cc02)
  1389. FNMSUB (aa2, cc03, cc04, cc04)
  1390. FNMSUB (aa2, cc05, cc06, cc06)
  1391. FNMSUB (aa2, cc07, cc08, cc08)
  1392. FNMSUB (aa2, cc09, cc10, cc10)
  1393. FNMSUB (aa2, cc11, cc12, cc12)
  1394. FNMSUB (aa2, cc13, cc14, cc14)
  1395. FNMSUB (aa2, cc15, cc16, cc16)
  1396. FMUL a3, c02, c02
  1397. FMUL a3, c04, c04
  1398. FMUL a3, c06, c06
  1399. FMUL a3, c08, c08
  1400. FMUL a3, c10, c10
  1401. FMUL a3, c12, c12
  1402. FMUL a3, c14, c14
  1403. FMUL a3, c16, c16
  1404. #endif
  1405. #ifdef RN
  1406. LDF [BO + 0 * SIZE], a1
  1407. LDF [BO + 1 * SIZE], a2
  1408. LDF [BO + 2 * SIZE], a3
  1409. LDF [BO + 3 * SIZE], a4
  1410. LDF [BO + 4 * SIZE], b1
  1411. LDF [BO + 5 * SIZE], b2
  1412. LDF [BO + 6 * SIZE], b3
  1413. LDF [BO + 7 * SIZE], b4
  1414. FMUL a1, c01, c01
  1415. FMUL a1, c02, c02
  1416. FNMSUB (aa2, cc01, cc03, cc03)
  1417. FNMSUB (aa2, cc02, cc04, cc04)
  1418. FNMSUB (aa3, cc01, cc05, cc05)
  1419. FNMSUB (aa3, cc02, cc06, cc06)
  1420. FNMSUB (aa4, cc01, cc07, cc07)
  1421. FNMSUB (aa4, cc02, cc08, cc08)
  1422. FNMSUB (bb1, cc01, cc09, cc09)
  1423. FNMSUB (bb1, cc02, cc10, cc10)
  1424. FNMSUB (bb2, cc01, cc11, cc11)
  1425. FNMSUB (bb2, cc02, cc12, cc12)
  1426. FNMSUB (bb3, cc01, cc13, cc13)
  1427. FNMSUB (bb3, cc02, cc14, cc14)
  1428. FNMSUB (bb4, cc01, cc15, cc15)
  1429. FNMSUB (bb4, cc02, cc16, cc16)
  1430. LDF [BO + 9 * SIZE], a1
  1431. LDF [BO + 10 * SIZE], a2
  1432. LDF [BO + 11 * SIZE], a3
  1433. LDF [BO + 12 * SIZE], a4
  1434. LDF [BO + 13 * SIZE], b1
  1435. LDF [BO + 14 * SIZE], b2
  1436. LDF [BO + 15 * SIZE], b3
  1437. FMUL a1, c03, c03
  1438. FMUL a1, c04, c04
  1439. FNMSUB (aa2, cc03, cc05, cc05)
  1440. FNMSUB (aa2, cc04, cc06, cc06)
  1441. FNMSUB (aa3, cc03, cc07, cc07)
  1442. FNMSUB (aa3, cc04, cc08, cc08)
  1443. FNMSUB (aa4, cc03, cc09, cc09)
  1444. FNMSUB (aa4, cc04, cc10, cc10)
  1445. FNMSUB (bb1, cc03, cc11, cc11)
  1446. FNMSUB (bb1, cc04, cc12, cc12)
  1447. FNMSUB (bb2, cc03, cc13, cc13)
  1448. FNMSUB (bb2, cc04, cc14, cc14)
  1449. FNMSUB (bb3, cc03, cc15, cc15)
  1450. FNMSUB (bb3, cc04, cc16, cc16)
  1451. LDF [BO + 18 * SIZE], a1
  1452. LDF [BO + 19 * SIZE], a2
  1453. LDF [BO + 20 * SIZE], a3
  1454. LDF [BO + 21 * SIZE], a4
  1455. LDF [BO + 22 * SIZE], b1
  1456. LDF [BO + 23 * SIZE], b2
  1457. FMUL a1, c05, c05
  1458. FMUL a1, c06, c06
  1459. FNMSUB (aa2, cc05, cc07, cc07)
  1460. FNMSUB (aa2, cc06, cc08, cc08)
  1461. FNMSUB (aa3, cc05, cc09, cc09)
  1462. FNMSUB (aa3, cc06, cc10, cc10)
  1463. FNMSUB (aa4, cc05, cc11, cc11)
  1464. FNMSUB (aa4, cc06, cc12, cc12)
  1465. FNMSUB (bb1, cc05, cc13, cc13)
  1466. FNMSUB (bb1, cc06, cc14, cc14)
  1467. FNMSUB (bb2, cc05, cc15, cc15)
  1468. FNMSUB (bb2, cc06, cc16, cc16)
  1469. LDF [BO + 27 * SIZE], a1
  1470. LDF [BO + 28 * SIZE], a2
  1471. LDF [BO + 29 * SIZE], a3
  1472. LDF [BO + 30 * SIZE], a4
  1473. LDF [BO + 31 * SIZE], b1
  1474. FMUL a1, c07, c07
  1475. FMUL a1, c08, c08
  1476. FNMSUB (aa2, cc07, cc09, cc09)
  1477. FNMSUB (aa2, cc08, cc10, cc10)
  1478. FNMSUB (aa3, cc07, cc11, cc11)
  1479. FNMSUB (aa3, cc08, cc12, cc12)
  1480. FNMSUB (aa4, cc07, cc13, cc13)
  1481. FNMSUB (aa4, cc08, cc14, cc14)
  1482. FNMSUB (bb1, cc07, cc15, cc15)
  1483. FNMSUB (bb1, cc08, cc16, cc16)
  1484. LDF [BO + 36 * SIZE], a1
  1485. LDF [BO + 37 * SIZE], a2
  1486. LDF [BO + 38 * SIZE], a3
  1487. LDF [BO + 39 * SIZE], a4
  1488. FMUL a1, c09, c09
  1489. FMUL a1, c10, c10
  1490. FNMSUB (aa2, cc09, cc11, cc11)
  1491. FNMSUB (aa2, cc10, cc12, cc12)
  1492. FNMSUB (aa3, cc09, cc13, cc13)
  1493. FNMSUB (aa3, cc10, cc14, cc14)
  1494. FNMSUB (aa4, cc09, cc15, cc15)
  1495. FNMSUB (aa4, cc10, cc16, cc16)
  1496. LDF [BO + 45 * SIZE], a1
  1497. LDF [BO + 46 * SIZE], a2
  1498. LDF [BO + 47 * SIZE], a3
  1499. FMUL a1, c11, c11
  1500. FMUL a1, c12, c12
  1501. FNMSUB (aa2, cc11, cc13, cc13)
  1502. FNMSUB (aa2, cc12, cc14, cc14)
  1503. FNMSUB (aa3, cc11, cc15, cc15)
  1504. FNMSUB (aa3, cc12, cc16, cc16)
  1505. LDF [BO + 54 * SIZE], a1
  1506. LDF [BO + 55 * SIZE], a2
  1507. FMUL a1, c13, c13
  1508. FMUL a1, c14, c14
  1509. FNMSUB (aa2, cc13, cc15, cc15)
  1510. FNMSUB (aa2, cc14, cc16, cc16)
  1511. LDF [BO + 63 * SIZE], a1
  1512. FMUL a1, c15, c15
  1513. FMUL a1, c16, c16
  1514. #endif
  1515. #ifdef RT
  1516. LDF [BO + 63 * SIZE], a1
  1517. LDF [BO + 62 * SIZE], a2
  1518. LDF [BO + 61 * SIZE], a3
  1519. LDF [BO + 60 * SIZE], a4
  1520. LDF [BO + 59 * SIZE], b1
  1521. LDF [BO + 58 * SIZE], b2
  1522. LDF [BO + 57 * SIZE], b3
  1523. LDF [BO + 56 * SIZE], b4
  1524. FMUL a1, c16, c16
  1525. FMUL a1, c15, c15
  1526. FNMSUB (aa2, cc16, cc14, cc14)
  1527. FNMSUB (aa2, cc15, cc13, cc13)
  1528. FNMSUB (aa3, cc16, cc12, cc12)
  1529. FNMSUB (aa3, cc15, cc11, cc11)
  1530. FNMSUB (aa4, cc16, cc10, cc10)
  1531. FNMSUB (aa4, cc15, cc09, cc09)
  1532. FNMSUB (bb1, cc16, cc08, cc08)
  1533. FNMSUB (bb1, cc15, cc07, cc07)
  1534. FNMSUB (bb2, cc16, cc06, cc06)
  1535. FNMSUB (bb2, cc15, cc05, cc05)
  1536. FNMSUB (bb3, cc16, cc04, cc04)
  1537. FNMSUB (bb3, cc15, cc03, cc03)
  1538. FNMSUB (bb4, cc16, cc02, cc02)
  1539. FNMSUB (bb4, cc15, cc01, cc01)
  1540. LDF [BO + 54 * SIZE], a1
  1541. LDF [BO + 53 * SIZE], a2
  1542. LDF [BO + 52 * SIZE], a3
  1543. LDF [BO + 51 * SIZE], a4
  1544. LDF [BO + 50 * SIZE], b1
  1545. LDF [BO + 49 * SIZE], b2
  1546. LDF [BO + 48 * SIZE], b3
  1547. FMUL a1, c14, c14
  1548. FMUL a1, c13, c13
  1549. FNMSUB (aa2, cc14, cc12, cc12)
  1550. FNMSUB (aa2, cc13, cc11, cc11)
  1551. FNMSUB (aa3, cc14, cc10, cc10)
  1552. FNMSUB (aa3, cc13, cc09, cc09)
  1553. FNMSUB (aa4, cc14, cc08, cc08)
  1554. FNMSUB (aa4, cc13, cc07, cc07)
  1555. FNMSUB (bb1, cc14, cc06, cc06)
  1556. FNMSUB (bb1, cc13, cc05, cc05)
  1557. FNMSUB (bb2, cc14, cc04, cc04)
  1558. FNMSUB (bb2, cc13, cc03, cc03)
  1559. FNMSUB (bb3, cc14, cc02, cc02)
  1560. FNMSUB (bb3, cc13, cc01, cc01)
  1561. LDF [BO + 45 * SIZE], a1
  1562. LDF [BO + 44 * SIZE], a2
  1563. LDF [BO + 43 * SIZE], a3
  1564. LDF [BO + 42 * SIZE], a4
  1565. LDF [BO + 41 * SIZE], b1
  1566. LDF [BO + 40 * SIZE], b2
  1567. FMUL a1, c12, c12
  1568. FMUL a1, c11, c11
  1569. FNMSUB (aa2, cc12, cc10, cc10)
  1570. FNMSUB (aa2, cc11, cc09, cc09)
  1571. FNMSUB (aa3, cc12, cc08, cc08)
  1572. FNMSUB (aa3, cc11, cc07, cc07)
  1573. FNMSUB (aa4, cc12, cc06, cc06)
  1574. FNMSUB (aa4, cc11, cc05, cc05)
  1575. FNMSUB (bb1, cc12, cc04, cc04)
  1576. FNMSUB (bb1, cc11, cc03, cc03)
  1577. FNMSUB (bb2, cc12, cc02, cc02)
  1578. FNMSUB (bb2, cc11, cc01, cc01)
  1579. LDF [BO + 36 * SIZE], a1
  1580. LDF [BO + 35 * SIZE], a2
  1581. LDF [BO + 34 * SIZE], a3
  1582. LDF [BO + 33 * SIZE], a4
  1583. LDF [BO + 32 * SIZE], b1
  1584. FMUL a1, c10, c10
  1585. FMUL a1, c09, c09
  1586. FNMSUB (aa2, cc10, cc08, cc08)
  1587. FNMSUB (aa2, cc09, cc07, cc07)
  1588. FNMSUB (aa3, cc10, cc06, cc06)
  1589. FNMSUB (aa3, cc09, cc05, cc05)
  1590. FNMSUB (aa4, cc10, cc04, cc04)
  1591. FNMSUB (aa4, cc09, cc03, cc03)
  1592. FNMSUB (bb1, cc10, cc02, cc02)
  1593. FNMSUB (bb1, cc09, cc01, cc01)
  1594. LDF [BO + 27 * SIZE], a1
  1595. LDF [BO + 26 * SIZE], a2
  1596. LDF [BO + 25 * SIZE], a3
  1597. LDF [BO + 24 * SIZE], a4
  1598. FMUL a1, c08, c08
  1599. FMUL a1, c07, c07
  1600. FNMSUB (aa2, cc08, cc06, cc06)
  1601. FNMSUB (aa2, cc07, cc05, cc05)
  1602. FNMSUB (aa3, cc08, cc04, cc04)
  1603. FNMSUB (aa3, cc07, cc03, cc03)
  1604. FNMSUB (aa4, cc08, cc02, cc02)
  1605. FNMSUB (aa4, cc07, cc01, cc01)
  1606. LDF [BO + 18 * SIZE], a1
  1607. LDF [BO + 17 * SIZE], a2
  1608. LDF [BO + 16 * SIZE], a3
  1609. FMUL a1, c06, c06
  1610. FMUL a1, c05, c05
  1611. FNMSUB (aa2, cc06, cc04, cc04)
  1612. FNMSUB (aa2, cc05, cc03, cc03)
  1613. FNMSUB (aa3, cc06, cc02, cc02)
  1614. FNMSUB (aa3, cc05, cc01, cc01)
  1615. LDF [BO + 9 * SIZE], a1
  1616. LDF [BO + 8 * SIZE], a2
  1617. FMUL a1, c04, c04
  1618. FMUL a1, c03, c03
  1619. FNMSUB (aa2, cc04, cc02, cc02)
  1620. FNMSUB (aa2, cc03, cc01, cc01)
  1621. LDF [BO + 0 * SIZE], a1
  1622. FMUL a1, c02, c02
  1623. FMUL a1, c01, c01
  1624. #endif
  1625. #ifdef LN
  1626. add C1, -2 * SIZE, C1
  1627. add C2, -2 * SIZE, C2
  1628. add C3, -2 * SIZE, C3
  1629. add C4, -2 * SIZE, C4
  1630. add C5, -2 * SIZE, C5
  1631. add C6, -2 * SIZE, C6
  1632. add C7, -2 * SIZE, C7
  1633. add C8, -2 * SIZE, C8
  1634. #endif
  1635. #if defined(LN) || defined(LT)
  1636. STF c01, [BO + 0 * SIZE]
  1637. STF c03, [BO + 1 * SIZE]
  1638. STF c05, [BO + 2 * SIZE]
  1639. STF c07, [BO + 3 * SIZE]
  1640. STF c09, [BO + 4 * SIZE]
  1641. STF c11, [BO + 5 * SIZE]
  1642. STF c13, [BO + 6 * SIZE]
  1643. STF c15, [BO + 7 * SIZE]
  1644. STF c02, [BO + 8 * SIZE]
  1645. STF c04, [BO + 9 * SIZE]
  1646. STF c06, [BO + 10 * SIZE]
  1647. STF c08, [BO + 11 * SIZE]
  1648. STF c10, [BO + 12 * SIZE]
  1649. STF c12, [BO + 13 * SIZE]
  1650. STF c14, [BO + 14 * SIZE]
  1651. STF c16, [BO + 15 * SIZE]
  1652. #else
  1653. STF c01, [AO + 0 * SIZE]
  1654. STF c02, [AO + 1 * SIZE]
  1655. STF c03, [AO + 2 * SIZE]
  1656. STF c04, [AO + 3 * SIZE]
  1657. STF c05, [AO + 4 * SIZE]
  1658. STF c06, [AO + 5 * SIZE]
  1659. STF c07, [AO + 6 * SIZE]
  1660. STF c08, [AO + 7 * SIZE]
  1661. STF c09, [AO + 8 * SIZE]
  1662. STF c10, [AO + 9 * SIZE]
  1663. STF c11, [AO + 10 * SIZE]
  1664. STF c12, [AO + 11 * SIZE]
  1665. STF c13, [AO + 12 * SIZE]
  1666. STF c14, [AO + 13 * SIZE]
  1667. STF c15, [AO + 14 * SIZE]
  1668. STF c16, [AO + 15 * SIZE]
  1669. #endif
  1670. STF c01, [C1 + 0 * SIZE]
  1671. STF c02, [C1 + 1 * SIZE]
  1672. STF c03, [C2 + 0 * SIZE]
  1673. STF c04, [C2 + 1 * SIZE]
  1674. STF c05, [C3 + 0 * SIZE]
  1675. STF c06, [C3 + 1 * SIZE]
  1676. STF c07, [C4 + 0 * SIZE]
  1677. STF c08, [C4 + 1 * SIZE]
  1678. STF c09, [C5 + 0 * SIZE]
  1679. STF c10, [C5 + 1 * SIZE]
  1680. STF c11, [C6 + 0 * SIZE]
  1681. STF c12, [C6 + 1 * SIZE]
  1682. STF c13, [C7 + 0 * SIZE]
  1683. STF c14, [C7 + 1 * SIZE]
  1684. STF c15, [C8 + 0 * SIZE]
  1685. STF c16, [C8 + 1 * SIZE]
  1686. #ifndef LN
  1687. add C1, 2 * SIZE, C1
  1688. add C2, 2 * SIZE, C2
  1689. add C3, 2 * SIZE, C3
  1690. add C4, 2 * SIZE, C4
  1691. add C5, 2 * SIZE, C5
  1692. add C6, 2 * SIZE, C6
  1693. add C7, 2 * SIZE, C7
  1694. add C8, 2 * SIZE, C8
  1695. #endif
  1696. #ifdef RT
  1697. sll K, BASE_SHIFT + 1, TEMP1
  1698. add AORIG, TEMP1, AORIG
  1699. #endif
  1700. #if defined(LT) || defined(RN)
  1701. sub K, KK, TEMP1
  1702. sll TEMP1, BASE_SHIFT + 1, TEMP2
  1703. sll TEMP1, BASE_SHIFT + 3, TEMP1
  1704. add AO, TEMP2, AO
  1705. add BO, TEMP1, BO
  1706. #endif
  1707. #ifdef LT
  1708. add KK, 2, KK
  1709. #endif
  1710. #ifdef LN
  1711. sub KK, 2, KK
  1712. #endif
  1713. add I, -1, I
  1714. cmp I, 0
  1715. bg,pt %icc, .LL12
  1716. nop
  1717. .align 4
  /* .LL29: bookkeeping reached after the I loop that branches to .LL12.
     Updates B and KK for whichever of LN/LT/RN/RT is defined, then
     decrements J and branches back to .LL11 (outside this excerpt)
     while J > 0. */
  1718. .LL29:
  1719. #ifdef LN
  /* B += K << (BASE_SHIFT + 3) bytes.  BASE_SHIFT is defined elsewhere;
     presumably this is K * 8 elements -- confirm. */
  1720. sll K, BASE_SHIFT + 3, TEMP1
  1721. add B, TEMP1, B
  1722. #endif
  1723. #if defined(LT) || defined(RN)
  /* B takes over the position BO has reached. */
  1724. mov BO, B
  1725. #endif
  1726. #ifdef RN
  1727. add KK, 8, KK
  1728. #endif
  1729. #ifdef RT
  1730. sub KK, 8, KK
  1731. #endif
  /* J -= 1; loop while J > 0.  The nop fills the branch delay slot. */
  1732. add J, -1, J
  1733. cmp J, 0
  1734. bg,pt %icc, .LL11
  1735. nop
  1736. .align 4
  /* .LL30: handled only when (N & 4) != 0; otherwise control goes to .LL50.
     Sets up the four column pointers C1..C4, initialises KK and AORIG/AO,
     and -- when (M & 1) != 0 -- prepares the tile that accumulates into
     cc01/cc03/cc05/cc07 in the loop at .LL43. */
  1737. .LL30:
  1738. and N, 4, J
  1739. cmp J, 0
  1740. ble,pn %icc, .LL50
  1741. nop
  1742. #ifdef RT
  /* RT: B -= K << (BASE_SHIFT + 2) bytes. */
  1743. sll K, BASE_SHIFT + 2, TEMP1
  1744. sub B, TEMP1, B
  1745. #endif
  1746. #ifndef RT
  /* C1 = C, C2..C4 each one LDC further; C is advanced past C4. */
  1747. mov C, C1
  1748. add C, LDC, C2
  1749. add C2, LDC, C3
  1750. add C3, LDC, C4
  1751. add C4, LDC, C
  1752. #else
  /* RT: walk backwards; C4 = C - LDC ... C1 = C2 - LDC.  The last line
     leaves C = C2 - LDC, i.e. the same address as C1. */
  1753. sub C, LDC, C4
  1754. sub C4, LDC, C3
  1755. sub C3, LDC, C2
  1756. sub C2, LDC, C1
  1757. sub C2, LDC, C
  1758. #endif
  1759. #ifdef LN
  /* LN: KK = M + OFFSET. */
  1760. add M, OFFSET, KK
  1761. #endif
  1762. #ifdef LT
  /* LT: KK = OFFSET. */
  1763. mov OFFSET, KK
  1764. #endif
  1765. #if defined(LN) || defined(RT)
  1766. mov A, AORIG
  1767. #else
  1768. mov A, AO
  1769. #endif
  /* I = M & 1: skip the single-row tile and go to .LL40 when it is zero. */
  1770. and M, 1, I
  1771. cmp I, 0
  1772. ble,pn %icc, .LL40
  1773. nop
  1774. #if defined(LT) || defined(RN)
  1775. mov B, BO
  1776. #else
  1777. #ifdef LN
  /* LN: AORIG -= K << BASE_SHIFT bytes before deriving AO from it. */
  1778. sll K, BASE_SHIFT + 0, TEMP1
  1779. sub AORIG, TEMP1, AORIG
  1780. #endif
  /* AO = AORIG + (KK << BASE_SHIFT); BO = B + (KK << (BASE_SHIFT + 2)). */
  1781. sll KK, BASE_SHIFT + 0, TEMP1
  1782. sll KK, BASE_SHIFT + 2, TEMP2
  1783. add AORIG, TEMP1, AO
  1784. add B, TEMP2, BO
  1785. #endif
  /* Preload a1..a4 from AO[0..3] and b1..b9 from BO[0..8]; the FCLR macro
     invocations are interleaved with the loads (FCLR is defined elsewhere;
     presumably it zeroes the named accumulator). */
  1786. LDF [AO + 0 * SIZE], a1
  1787. LDF [AO + 1 * SIZE], a2
  1788. LDF [AO + 2 * SIZE], a3
  1789. LDF [AO + 3 * SIZE], a4
  1790. LDF [BO + 0 * SIZE], b1
  1791. LDF [BO + 1 * SIZE], b2
  1792. LDF [BO + 2 * SIZE], b3
  1793. LDF [BO + 3 * SIZE], b4
  1794. LDF [BO + 4 * SIZE], b5
  1795. LDF [BO + 5 * SIZE], b6
  1796. FCLR (cc01)
  1797. LDF [BO + 6 * SIZE], b7
  1798. FCLR (cc03)
  1799. LDF [BO + 7 * SIZE], b8
  1800. FCLR (cc05)
  1801. LDF [BO + 8 * SIZE], b9
  1802. FCLR (cc07)
  /* L = trip count of the 4-way unrolled loop: KK >> 2 for LT/RN,
     (K - KK) >> 2 otherwise.  Skip to .LL45 when L <= 0. */
  1803. #if defined(LT) || defined(RN)
  1804. sra KK, 2, L
  1805. #else
  1806. sub K, KK, L
  1807. sra L, 2, L
  1808. #endif
  1809. cmp L, 0
  1810. ble,pn %icc, .LL45
  1811. nop
  /* .LL43: unrolled accumulate loop for the single-row, four-column tile.
     Each pass issues 16 FMADD macro invocations (a1..a4, one per unrolled
     step, against four b values each), advances AO by 4 * SIZE and BO by
     16 * SIZE, and reloads the operands for the next pass in between.
     FMADD is defined elsewhere; presumably FMADD (a, b, c, d) is d = a*b + c. */
  1812. .LL43:
  1813. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  1814. add L, -1, L
  1815. FMADD (aa1, bb1, cc01, cc01)
  1816. LDF [BO + 16 * SIZE], b1
  1817. FMADD (aa1, bb2, cc03, cc03)
  1818. LDF [BO + 9 * SIZE], b2
  1819. FMADD (aa1, bb3, cc05, cc05)
  1820. LDF [BO + 10 * SIZE], b3
  1821. FMADD (aa1, bb4, cc07, cc07)
  1822. LDF [BO + 11 * SIZE], b4
  1823. LDF [AO + 4 * SIZE], a1
  /* The condition codes set here are consumed by the bg at the bottom;
     the intervening add instructions are the non-cc forms and leave
     %icc untouched. */
  1824. cmp L, 0
  1825. FMADD (aa2, bb5, cc01, cc01)
  1826. LDF [BO + 12 * SIZE], b5
  1827. FMADD (aa2, bb6, cc03, cc03)
  1828. LDF [BO + 13 * SIZE], b6
  1829. FMADD (aa2, bb7, cc05, cc05)
  1830. LDF [BO + 14 * SIZE], b7
  1831. FMADD (aa2, bb8, cc07, cc07)
  1832. LDF [BO + 15 * SIZE], b8
  1833. LDF [AO + 5 * SIZE], a2
  /* AO advances here, so AO offsets below are relative to the new AO. */
  1834. add AO, 4 * SIZE, AO
  1835. FMADD (aa3, bb9, cc01, cc01)
  1836. LDF [BO + 24 * SIZE], b9
  1837. FMADD (aa3, bb2, cc03, cc03)
  1838. LDF [BO + 17 * SIZE], b2
  1839. FMADD (aa3, bb3, cc05, cc05)
  1840. LDF [BO + 18 * SIZE], b3
  1841. FMADD (aa3, bb4, cc07, cc07)
  1842. LDF [BO + 19 * SIZE], b4
  1843. LDF [AO + 2 * SIZE], a3
  /* BO advances here, so BO offsets below are relative to the new BO. */
  1844. add BO, 16 * SIZE, BO
  1845. FMADD (aa4, bb5, cc01, cc01)
  1846. LDF [BO + 4 * SIZE], b5
  1847. FMADD (aa4, bb6, cc03, cc03)
  1848. LDF [BO + 5 * SIZE], b6
  1849. FMADD (aa4, bb7, cc05, cc05)
  1850. LDF [BO + 6 * SIZE], b7
  1851. FMADD (aa4, bb8, cc07, cc07)
  1852. LDF [BO + 7 * SIZE], b8
  /* Loop while L > 0; the a4 reload sits in the branch delay slot. */
  1853. bg,pt %icc, .LL43
  1854. LDF [AO + 3 * SIZE], a4
  1855. .align 4
  /* .LL45: remainder of the single-row, four-column accumulation.
     L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes
     straight to .LL48. */
  1856. .LL45:
  1857. #if defined(LT) || defined(RN)
  1858. and KK, 3, L
  1859. #else
  1860. sub K, KK, L
  1861. and L, 3, L
  1862. #endif
  1863. cmp L, 0
  /* ",a" sets the annul bit: the delay-slot nop is executed only when the
     branch is taken. */
  1864. ble,a,pn %icc, .LL48
  1865. nop
  1866. .align 4
  /* .LL47: one step per pass -- four FMADD invocations using a1 and b1..b4,
     then AO += 1 * SIZE and BO += 4 * SIZE. */
  1867. .LL47:
  1868. FMADD (aa1, bb1, cc01, cc01)
  1869. LDF [BO + 4 * SIZE], b1
  1870. add L, -1, L
  1871. FMADD (aa1, bb2, cc03, cc03)
  1872. LDF [BO + 5 * SIZE], b2
  1873. add AO, 1 * SIZE, AO
  1874. FMADD (aa1, bb3, cc05, cc05)
  1875. LDF [BO + 6 * SIZE], b3
  1876. cmp L, 0
  1877. FMADD (aa1, bb4, cc07, cc07)
  1878. LDF [BO + 7 * SIZE], b4
  1879. add BO, 4 * SIZE, BO
  /* Loop while L > 0; a1 is reloaded from the already advanced AO in the
     delay slot. */
  1880. bg,pt %icc, .LL47
  1881. LDF [AO + 0 * SIZE], a1
  1882. .align 4
  /* .LL48: solve and store step for the single-row, four-column tile.
     Re-points AO/BO (LN/RT only), combines the accumulators with the values
     read back from the packed buffer, applies the variant-specific
     triangular update, then writes c01/c03/c05/c07 both to the packed
     buffer and to C1..C4.
     FSUB/FMUL/FNMSUB are macros defined elsewhere; presumably
     "FSUB x, y, z" is z = x - y, "FMUL x, y, z" is z = x * y and
     "FNMSUB (a, b, c, d)" is d = c - a*b -- confirm against the macro
     definitions. */
  1883. .LL48:
  1884. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 1 (LN) or KK - 4 (RT);
     AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (BASE_SHIFT + 2)). */
  1885. #ifdef LN
  1886. sub KK, 1, TEMP1
  1887. #else
  1888. sub KK, 4, TEMP1
  1889. #endif
  1890. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1891. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1892. add AORIG, TEMP2, AO
  1893. add B, TEMP1, BO
  1894. #endif
  /* Read the four stored values (from BO for LN/LT, from AO for RN/RT) and
     combine each with the matching accumulator via FSUB. */
  1895. #if defined(LN) || defined(LT)
  1896. LDF [BO + 0 * SIZE], a1
  1897. LDF [BO + 1 * SIZE], a2
  1898. LDF [BO + 2 * SIZE], a3
  1899. LDF [BO + 3 * SIZE], a4
  1900. FSUB a1, c01, c01
  1901. FSUB a2, c03, c03
  1902. FSUB a3, c05, c05
  1903. FSUB a4, c07, c07
  1904. #else
  1905. LDF [AO + 0 * SIZE], a1
  1906. LDF [AO + 1 * SIZE], a2
  1907. LDF [AO + 2 * SIZE], a3
  1908. LDF [AO + 3 * SIZE], a4
  1909. FSUB a1, c01, c01
  1910. FSUB a2, c03, c03
  1911. FSUB a3, c05, c05
  1912. FSUB a4, c07, c07
  1913. #endif
  /* LN/LT: a single factor from AO[0] scales all four results.  It is
     applied with FMUL, so the packed entry is presumably a pre-inverted
     diagonal -- confirm with the packing routine. */
  1914. #if defined(LN) || defined(LT)
  1915. LDF [AO + 0 * SIZE], a1
  1916. FMUL a1, c01, c01
  1917. FMUL a1, c03, c03
  1918. FMUL a1, c05, c05
  1919. FMUL a1, c07, c07
  1920. #endif
  /* RN: process c01 -> c03 -> c05 -> c07, using BO[0..3], BO[5..7],
     BO[10..11], BO[15]: scale the current value, then fold it into the
     later ones with FNMSUB. */
  1921. #ifdef RN
  1922. LDF [BO + 0 * SIZE], a1
  1923. LDF [BO + 1 * SIZE], a2
  1924. LDF [BO + 2 * SIZE], a3
  1925. LDF [BO + 3 * SIZE], a4
  1926. FMUL a1, c01, c01
  1927. FNMSUB (aa2, cc01, cc03, cc03)
  1928. FNMSUB (aa3, cc01, cc05, cc05)
  1929. FNMSUB (aa4, cc01, cc07, cc07)
  1930. LDF [BO + 5 * SIZE], a1
  1931. LDF [BO + 6 * SIZE], a2
  1932. LDF [BO + 7 * SIZE], a3
  1933. FMUL a1, c03, c03
  1934. FNMSUB (aa2, cc03, cc05, cc05)
  1935. FNMSUB (aa3, cc03, cc07, cc07)
  1936. LDF [BO + 10 * SIZE], a1
  1937. LDF [BO + 11 * SIZE], a2
  1938. FMUL a1, c05, c05
  1939. FNMSUB (aa2, cc05, cc07, cc07)
  1940. LDF [BO + 15 * SIZE], a1
  1941. FMUL a1, c07, c07
  1942. #endif
  /* RT: the same scheme in the opposite order, c07 -> c05 -> c03 -> c01,
     using BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  1943. #ifdef RT
  1944. LDF [BO + 15 * SIZE], a1
  1945. LDF [BO + 14 * SIZE], a2
  1946. LDF [BO + 13 * SIZE], a3
  1947. LDF [BO + 12 * SIZE], a4
  1948. FMUL a1, c07, c07
  1949. FNMSUB (aa2, cc07, cc05, cc05)
  1950. FNMSUB (aa3, cc07, cc03, cc03)
  1951. FNMSUB (aa4, cc07, cc01, cc01)
  1952. LDF [BO + 10 * SIZE], a1
  1953. LDF [BO + 9 * SIZE], a2
  1954. LDF [BO + 8 * SIZE], a3
  1955. FMUL a1, c05, c05
  1956. FNMSUB (aa2, cc05, cc03, cc03)
  1957. FNMSUB (aa3, cc05, cc01, cc01)
  1958. LDF [BO + 5 * SIZE], a1
  1959. LDF [BO + 4 * SIZE], a2
  1960. FMUL a1, c03, c03
  1961. FNMSUB (aa2, cc03, cc01, cc01)
  1962. LDF [BO + 0 * SIZE], a1
  1963. FMUL a1, c01, c01
  1964. #endif
  1965. #ifdef LN
  /* LN: step C1..C4 back by one element before storing. */
  1966. add C1, -1 * SIZE, C1
  1967. add C2, -1 * SIZE, C2
  1968. add C3, -1 * SIZE, C3
  1969. add C4, -1 * SIZE, C4
  1970. #endif
  /* Write the results back to the packed buffer they were read from
     (BO for LN/LT, AO otherwise) ... */
  1971. #if defined(LN) || defined(LT)
  1972. STF c01, [BO + 0 * SIZE]
  1973. STF c03, [BO + 1 * SIZE]
  1974. STF c05, [BO + 2 * SIZE]
  1975. STF c07, [BO + 3 * SIZE]
  1976. #else
  1977. STF c01, [AO + 0 * SIZE]
  1978. STF c03, [AO + 1 * SIZE]
  1979. STF c05, [AO + 2 * SIZE]
  1980. STF c07, [AO + 3 * SIZE]
  1981. #endif
  /* ... and to the four C columns. */
  1982. STF c01, [C1 + 0 * SIZE]
  1983. STF c03, [C2 + 0 * SIZE]
  1984. STF c05, [C3 + 0 * SIZE]
  1985. STF c07, [C4 + 0 * SIZE]
  1986. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT bytes. */
  1987. sll K, BASE_SHIFT + 0, TEMP1
  1988. add AORIG, TEMP1, AORIG
  1989. #endif
  1990. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K - KK) << BASE_SHIFT and BO by
     (K - KK) << (BASE_SHIFT + 2) bytes. */
  1991. sub K, KK, TEMP1
  1992. sll TEMP1, BASE_SHIFT + 0, TEMP2
  1993. sll TEMP1, BASE_SHIFT + 2, TEMP1
  1994. add AO, TEMP2, AO
  1995. add BO, TEMP1, BO
  1996. #endif
  1997. #ifdef LT
  1998. add KK, 1, KK
  1999. #endif
  2000. #ifdef LN
  2001. sub KK, 1, KK
  2002. #endif
  2003. .align 4
  /* .LL40: start of the two-row tiles of the four-column panel.
     I = M >> 1 is the number of such tiles; with I <= 0 control goes to
     .LL49. */
  2004. .LL40:
  2005. sra M, 1, I
  2006. cmp I, 0
  2007. ble,pn %icc, .LL49
  2008. nop
  2009. .align 4
  /* .LL32: per-tile prologue (re-entered from the bottom of the I loop).
     Positions AO/BO, preloads a1..a2 and b1..b9, clears cc01..cc08 via the
     FCLR macro (defined elsewhere) and issues prefetches on C1..C4. */
  2010. .LL32:
  2011. #if defined(LT) || defined(RN)
  2012. mov B, BO
  2013. #else
  2014. #ifdef LN
  /* LN: AORIG -= K << (BASE_SHIFT + 1) bytes before deriving AO from it. */
  2015. sll K, BASE_SHIFT + 1, TEMP1
  2016. sub AORIG, TEMP1, AORIG
  2017. #endif
  /* AO = AORIG + (KK << (BASE_SHIFT + 1)); BO = B + (KK << (BASE_SHIFT + 2)). */
  2018. sll KK, BASE_SHIFT + 1, TEMP1
  2019. sll KK, BASE_SHIFT + 2, TEMP2
  2020. add AORIG, TEMP1, AO
  2021. add B, TEMP2, BO
  2022. #endif
  2023. LDF [AO + 0 * SIZE], a1
  2024. LDF [AO + 1 * SIZE], a2
  2025. LDF [BO + 0 * SIZE], b1
  2026. LDF [BO + 1 * SIZE], b2
  2027. LDF [BO + 2 * SIZE], b3
  2028. LDF [BO + 3 * SIZE], b4
  2029. LDF [BO + 4 * SIZE], b5
  2030. LDF [BO + 5 * SIZE], b6
  2031. FCLR (cc01)
  2032. LDF [BO + 6 * SIZE], b7
  2033. FCLR (cc02)
  2034. LDF [BO + 7 * SIZE], b8
  2035. FCLR (cc03)
  2036. LDF [BO + 8 * SIZE], b9
  2037. FCLR (cc04)
  2038. prefetch [C1 + 2 * SIZE], 3
  2039. FCLR (cc05)
  2040. prefetch [C2 + 2 * SIZE], 3
  2041. FCLR (cc06)
  2042. prefetch [C3 + 2 * SIZE], 3
  2043. FCLR (cc07)
  2044. prefetch [C4 + 2 * SIZE], 3
  2045. FCLR (cc08)
  /* L = trip count of the 4-way unrolled loop: KK >> 2 for LT/RN,
     (K - KK) >> 2 otherwise.  Skip to .LL35 when L <= 0. */
  2046. #if defined(LT) || defined(RN)
  2047. sra KK, 2, L
  2048. #else
  2049. sub K, KK, L
  2050. sra L, 2, L
  2051. #endif
  2052. cmp L, 0
  2053. ble,pn %icc, .LL35
  2054. nop
  2055. .align 4
  /* .LL33: unrolled accumulate loop for the two-row, four-column tile.
     Each pass issues 32 FMADD macro invocations into cc01..cc08, advances
     AO by 8 * SIZE and BO by 16 * SIZE, and interleaves the loads for the
     following steps.  a1/a2 and a3/a4 alternate as the A operands; b9
     stands in for b1 in the third step because b1 has already been
     reloaded for the next pass.
     FMADD is defined elsewhere; presumably FMADD (a, b, c, d) is d = a*b + c. */
  2056. .LL33:
  2057. FMADD (aa1, bb1, cc01, cc01)
  2058. LDF [AO + 2 * SIZE], a3
  2059. FMADD (aa2, bb1, cc02, cc02)
  2060. LDF [AO + 3 * SIZE], a4
  2061. FMADD (aa1, bb2, cc03, cc03)
  2062. LDF [BO + 16 * SIZE], b1
  2063. FMADD (aa2, bb2, cc04, cc04)
  2064. LDF [BO + 9 * SIZE], b2
  2065. FMADD (aa1, bb3, cc05, cc05)
  2066. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2067. FMADD (aa2, bb3, cc06, cc06)
  2068. add L, -1, L
  2069. FMADD (aa1, bb4, cc07, cc07)
  2070. LDF [BO + 10 * SIZE], b3
  2071. FMADD (aa2, bb4, cc08, cc08)
  2072. LDF [BO + 11 * SIZE], b4
  2073. FMADD (aa3, bb5, cc01, cc01)
  2074. LDF [AO + 4 * SIZE], a1
  2075. FMADD (aa4, bb5, cc02, cc02)
  2076. LDF [AO + 5 * SIZE], a2
  2077. FMADD (aa3, bb6, cc03, cc03)
  2078. LDF [BO + 12 * SIZE], b5
  2079. FMADD (aa4, bb6, cc04, cc04)
  2080. LDF [BO + 13 * SIZE], b6
  2081. FMADD (aa3, bb7, cc05, cc05)
  /* Condition codes for the bg at the bottom are set here; the later add
     instructions are the non-cc forms and leave %icc untouched. */
  2082. cmp L, 0
  2083. FMADD (aa4, bb7, cc06, cc06)
  /* AO advances here; the negative offsets below address the last two
     elements of the span just stepped over. */
  2084. add AO, 8 * SIZE, AO
  2085. FMADD (aa3, bb8, cc07, cc07)
  2086. LDF [BO + 14 * SIZE], b7
  2087. FMADD (aa4, bb8, cc08, cc08)
  2088. LDF [BO + 15 * SIZE], b8
  2089. FMADD (aa1, bb9, cc01, cc01)
  2090. LDF [AO - 2 * SIZE], a3
  2091. FMADD (aa2, bb9, cc02, cc02)
  2092. LDF [AO - 1 * SIZE], a4
  2093. FMADD (aa1, bb2, cc03, cc03)
  2094. LDF [BO + 24 * SIZE], b9
  2095. FMADD (aa2, bb2, cc04, cc04)
  2096. LDF [BO + 17 * SIZE], b2
  2097. FMADD (aa1, bb3, cc05, cc05)
  /* BO advances here, so BO offsets below are relative to the new BO. */
  2098. add BO, 16 * SIZE, BO
  2099. FMADD (aa2, bb3, cc06, cc06)
  2100. nop
  2101. FMADD (aa1, bb4, cc07, cc07)
  2102. LDF [BO + 2 * SIZE], b3
  2103. FMADD (aa2, bb4, cc08, cc08)
  2104. LDF [BO + 3 * SIZE], b4
  2105. FMADD (aa3, bb5, cc01, cc01)
  2106. LDF [AO + 0 * SIZE], a1
  2107. FMADD (aa4, bb5, cc02, cc02)
  2108. LDF [AO + 1 * SIZE], a2
  2109. FMADD (aa3, bb6, cc03, cc03)
  2110. LDF [BO + 4 * SIZE], b5
  2111. FMADD (aa4, bb6, cc04, cc04)
  2112. LDF [BO + 5 * SIZE], b6
  2113. FMADD (aa3, bb7, cc05, cc05)
  2114. nop
  2115. FMADD (aa4, bb7, cc06, cc06)
  2116. LDF [BO + 6 * SIZE], b7
  2117. FMADD (aa3, bb8, cc07, cc07)
  2118. FMADD (aa4, bb8, cc08, cc08)
  /* Loop while L > 0; the b8 reload sits in the branch delay slot. */
  2119. bg,pt %icc, .LL33
  2120. LDF [BO + 7 * SIZE], b8
  2121. .align 4
  /* .LL35: remainder of the two-row, four-column accumulation.
     L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes
     straight to .LL38. */
  2122. .LL35:
  2123. #if defined(LT) || defined(RN)
  2124. and KK, 3, L
  2125. #else
  2126. sub K, KK, L
  2127. and L, 3, L
  2128. #endif
  2129. cmp L, 0
  /* ",a" sets the annul bit: the delay-slot nop is executed only when the
     branch is taken. */
  2130. ble,a,pn %icc, .LL38
  2131. nop
  2132. .align 4
  /* .LL37: one step per pass -- eight FMADD invocations (a1/a2 against
     b1..b4), then AO += 2 * SIZE and BO += 4 * SIZE. */
  2133. .LL37:
  2134. FMADD (aa1, bb1, cc01, cc01)
  2135. add L, -1, L
  2136. FMADD (aa2, bb1, cc02, cc02)
  2137. LDF [BO + 4 * SIZE], b1
  2138. FMADD (aa1, bb2, cc03, cc03)
  2139. add AO, 2 * SIZE, AO
  2140. FMADD (aa2, bb2, cc04, cc04)
  2141. LDF [BO + 5 * SIZE], b2
  2142. FMADD (aa1, bb3, cc05, cc05)
  2143. cmp L, 0
  2144. FMADD (aa2, bb3, cc06, cc06)
  2145. LDF [BO + 6 * SIZE], b3
  2146. FMADD (aa1, bb4, cc07, cc07)
  /* a1/a2 are reloaded from the already advanced AO. */
  2147. LDF [AO + 0 * SIZE], a1
  2148. FMADD (aa2, bb4, cc08, cc08)
  2149. LDF [AO + 1 * SIZE], a2
  2150. LDF [BO + 7 * SIZE], b4
  /* Loop while L > 0; the BO advance sits in the branch delay slot. */
  2151. bg,pt %icc, .LL37
  2152. add BO, 4 * SIZE, BO
  2153. .align 4
  /* .LL38: solve and store step for the two-row, four-column tile, followed
     by the I-loop back edge to .LL32.
     Register layout visible from the stores to C below: c01/c02 go to C1,
     c03/c04 to C2, c05/c06 to C3, c07/c08 to C4 (odd = offset 0,
     even = offset 1).
     FSUB/FMUL/FNMSUB are macros defined elsewhere; presumably
     "FSUB x, y, z" is z = x - y, "FMUL x, y, z" is z = x * y and
     "FNMSUB (a, b, c, d)" is d = c - a*b -- confirm against the macro
     definitions. */
  2154. .LL38:
  2155. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 2 (LN) or KK - 4 (RT);
     AO = AORIG + (TEMP1 << (BASE_SHIFT + 1)),
     BO = B + (TEMP1 << (BASE_SHIFT + 2)). */
  2156. #ifdef LN
  2157. sub KK, 2, TEMP1
  2158. #else
  2159. sub KK, 4, TEMP1
  2160. #endif
  2161. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2162. sll TEMP1, BASE_SHIFT + 2, TEMP1
  2163. add AORIG, TEMP2, AO
  2164. add B, TEMP1, BO
  2165. #endif
  /* Read the eight stored values and combine them with the accumulators.
     LN/LT read from BO, where elements 0..3 pair with the odd registers
     and 4..7 with the even ones; RN/RT read from AO in c01..c08 order. */
  2166. #if defined(LN) || defined(LT)
  2167. LDF [BO + 0 * SIZE], a1
  2168. LDF [BO + 1 * SIZE], a2
  2169. LDF [BO + 2 * SIZE], a3
  2170. LDF [BO + 3 * SIZE], a4
  2171. LDF [BO + 4 * SIZE], b1
  2172. LDF [BO + 5 * SIZE], b2
  2173. LDF [BO + 6 * SIZE], b3
  2174. LDF [BO + 7 * SIZE], b4
  2175. FSUB a1, c01, c01
  2176. FSUB a2, c03, c03
  2177. FSUB a3, c05, c05
  2178. FSUB a4, c07, c07
  2179. FSUB b1, c02, c02
  2180. FSUB b2, c04, c04
  2181. FSUB b3, c06, c06
  2182. FSUB b4, c08, c08
  2183. #else
  2184. LDF [AO + 0 * SIZE], a1
  2185. LDF [AO + 1 * SIZE], a2
  2186. LDF [AO + 2 * SIZE], a3
  2187. LDF [AO + 3 * SIZE], a4
  2188. LDF [AO + 4 * SIZE], b1
  2189. LDF [AO + 5 * SIZE], b2
  2190. LDF [AO + 6 * SIZE], b3
  2191. LDF [AO + 7 * SIZE], b4
  2192. FSUB a1, c01, c01
  2193. FSUB a2, c02, c02
  2194. FSUB a3, c03, c03
  2195. FSUB a4, c04, c04
  2196. FSUB b1, c05, c05
  2197. FSUB b2, c06, c06
  2198. FSUB b3, c07, c07
  2199. FSUB b4, c08, c08
  2200. #endif
  /* LN: second row first.  Scale the even registers by AO[3], fold them
     into the odd registers with AO[2], then scale the odd registers by
     AO[0].  The factors are applied with FMUL, so the packed diagonal is
     presumably pre-inverted -- confirm with the packing routine. */
  2201. #ifdef LN
  2202. LDF [AO + 3 * SIZE], a1
  2203. LDF [AO + 2 * SIZE], a2
  2204. LDF [AO + 0 * SIZE], a3
  2205. FMUL a1, c02, c02
  2206. FMUL a1, c04, c04
  2207. FMUL a1, c06, c06
  2208. FMUL a1, c08, c08
  2209. FNMSUB (aa2, cc02, cc01, cc01)
  2210. FNMSUB (aa2, cc04, cc03, cc03)
  2211. FNMSUB (aa2, cc06, cc05, cc05)
  2212. FNMSUB (aa2, cc08, cc07, cc07)
  2213. FMUL a3, c01, c01
  2214. FMUL a3, c03, c03
  2215. FMUL a3, c05, c05
  2216. FMUL a3, c07, c07
  2217. #endif
  /* LT: first row first.  Scale the odd registers by AO[0], fold them into
     the even registers with AO[1], then scale the even registers by AO[3]. */
  2218. #ifdef LT
  2219. LDF [AO + 0 * SIZE], a1
  2220. LDF [AO + 1 * SIZE], a2
  2221. LDF [AO + 3 * SIZE], a3
  2222. FMUL a1, c01, c01
  2223. FMUL a1, c03, c03
  2224. FMUL a1, c05, c05
  2225. FMUL a1, c07, c07
  2226. FNMSUB (aa2, cc01, cc02, cc02)
  2227. FNMSUB (aa2, cc03, cc04, cc04)
  2228. FNMSUB (aa2, cc05, cc06, cc06)
  2229. FNMSUB (aa2, cc07, cc08, cc08)
  2230. FMUL a3, c02, c02
  2231. FMUL a3, c04, c04
  2232. FMUL a3, c06, c06
  2233. FMUL a3, c08, c08
  2234. #endif
  /* RN: process the register pairs (c01,c02) -> (c03,c04) -> (c05,c06) ->
     (c07,c08) using BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  2235. #ifdef RN
  2236. LDF [BO + 0 * SIZE], a1
  2237. LDF [BO + 1 * SIZE], a2
  2238. LDF [BO + 2 * SIZE], a3
  2239. LDF [BO + 3 * SIZE], a4
  2240. FMUL a1, c01, c01
  2241. FMUL a1, c02, c02
  2242. FNMSUB (aa2, cc01, cc03, cc03)
  2243. FNMSUB (aa2, cc02, cc04, cc04)
  2244. FNMSUB (aa3, cc01, cc05, cc05)
  2245. FNMSUB (aa3, cc02, cc06, cc06)
  2246. FNMSUB (aa4, cc01, cc07, cc07)
  2247. FNMSUB (aa4, cc02, cc08, cc08)
  2248. LDF [BO + 5 * SIZE], a1
  2249. LDF [BO + 6 * SIZE], a2
  2250. LDF [BO + 7 * SIZE], a3
  2251. FMUL a1, c03, c03
  2252. FMUL a1, c04, c04
  2253. FNMSUB (aa2, cc03, cc05, cc05)
  2254. FNMSUB (aa2, cc04, cc06, cc06)
  2255. FNMSUB (aa3, cc03, cc07, cc07)
  2256. FNMSUB (aa3, cc04, cc08, cc08)
  2257. LDF [BO + 10 * SIZE], a1
  2258. LDF [BO + 11 * SIZE], a2
  2259. FMUL a1, c05, c05
  2260. FMUL a1, c06, c06
  2261. FNMSUB (aa2, cc05, cc07, cc07)
  2262. FNMSUB (aa2, cc06, cc08, cc08)
  2263. LDF [BO + 15 * SIZE], a1
  2264. FMUL a1, c07, c07
  2265. FMUL a1, c08, c08
  2266. #endif
  /* RT: the same scheme in the opposite order, (c07,c08) -> ... ->
     (c01,c02), using BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  2267. #ifdef RT
  2268. LDF [BO + 15 * SIZE], a1
  2269. LDF [BO + 14 * SIZE], a2
  2270. LDF [BO + 13 * SIZE], a3
  2271. LDF [BO + 12 * SIZE], a4
  2272. FMUL a1, c08, c08
  2273. FMUL a1, c07, c07
  2274. FNMSUB (aa2, cc08, cc06, cc06)
  2275. FNMSUB (aa2, cc07, cc05, cc05)
  2276. FNMSUB (aa3, cc08, cc04, cc04)
  2277. FNMSUB (aa3, cc07, cc03, cc03)
  2278. FNMSUB (aa4, cc08, cc02, cc02)
  2279. FNMSUB (aa4, cc07, cc01, cc01)
  2280. LDF [BO + 10 * SIZE], a1
  2281. LDF [BO + 9 * SIZE], a2
  2282. LDF [BO + 8 * SIZE], a3
  2283. FMUL a1, c06, c06
  2284. FMUL a1, c05, c05
  2285. FNMSUB (aa2, cc06, cc04, cc04)
  2286. FNMSUB (aa2, cc05, cc03, cc03)
  2287. FNMSUB (aa3, cc06, cc02, cc02)
  2288. FNMSUB (aa3, cc05, cc01, cc01)
  2289. LDF [BO + 5 * SIZE], a1
  2290. LDF [BO + 4 * SIZE], a2
  2291. FMUL a1, c04, c04
  2292. FMUL a1, c03, c03
  2293. FNMSUB (aa2, cc04, cc02, cc02)
  2294. FNMSUB (aa2, cc03, cc01, cc01)
  2295. LDF [BO + 0 * SIZE], a1
  2296. FMUL a1, c02, c02
  2297. FMUL a1, c01, c01
  2298. #endif
  2299. #ifdef LN
  /* LN: step C1..C4 back by two elements before storing. */
  2300. add C1, -2 * SIZE, C1
  2301. add C2, -2 * SIZE, C2
  2302. add C3, -2 * SIZE, C3
  2303. add C4, -2 * SIZE, C4
  2304. #endif
  /* Write the results back to the packed buffer in the same layout they
     were read with above ... */
  2305. #if defined(LN) || defined(LT)
  2306. STF c01, [BO + 0 * SIZE]
  2307. STF c03, [BO + 1 * SIZE]
  2308. STF c05, [BO + 2 * SIZE]
  2309. STF c07, [BO + 3 * SIZE]
  2310. STF c02, [BO + 4 * SIZE]
  2311. STF c04, [BO + 5 * SIZE]
  2312. STF c06, [BO + 6 * SIZE]
  2313. STF c08, [BO + 7 * SIZE]
  2314. #else
  2315. STF c01, [AO + 0 * SIZE]
  2316. STF c02, [AO + 1 * SIZE]
  2317. STF c03, [AO + 2 * SIZE]
  2318. STF c04, [AO + 3 * SIZE]
  2319. STF c05, [AO + 4 * SIZE]
  2320. STF c06, [AO + 5 * SIZE]
  2321. STF c07, [AO + 6 * SIZE]
  2322. STF c08, [AO + 7 * SIZE]
  2323. #endif
  /* ... and to the four C columns, two elements each. */
  2324. STF c01, [C1 + 0 * SIZE]
  2325. STF c02, [C1 + 1 * SIZE]
  2326. STF c03, [C2 + 0 * SIZE]
  2327. STF c04, [C2 + 1 * SIZE]
  2328. STF c05, [C3 + 0 * SIZE]
  2329. STF c06, [C3 + 1 * SIZE]
  2330. STF c07, [C4 + 0 * SIZE]
  2331. STF c08, [C4 + 1 * SIZE]
  2332. #ifndef LN
  /* Non-LN: step C1..C4 forward by two elements after storing. */
  2333. add C1, 2 * SIZE, C1
  2334. add C2, 2 * SIZE, C2
  2335. add C3, 2 * SIZE, C3
  2336. add C4, 2 * SIZE, C4
  2337. #endif
  2338. #ifdef RT
  /* RT: AORIG += K << (BASE_SHIFT + 1) bytes. */
  2339. sll K, BASE_SHIFT + 1, TEMP1
  2340. add AORIG, TEMP1, AORIG
  2341. #endif
  2342. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K - KK) << (BASE_SHIFT + 1) and BO by
     (K - KK) << (BASE_SHIFT + 2) bytes. */
  2343. sub K, KK, TEMP1
  2344. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2345. sll TEMP1, BASE_SHIFT + 2, TEMP1
  2346. add AO, TEMP2, AO
  2347. add BO, TEMP1, BO
  2348. #endif
  2349. #ifdef LT
  2350. add KK, 2, KK
  2351. #endif
  2352. #ifdef LN
  2353. sub KK, 2, KK
  2354. #endif
  /* I -= 1; next two-row tile while I > 0 (nop in the delay slot). */
  2355. add I, -1, I
  2356. cmp I, 0
  2357. bg,pt %icc, .LL32
  2358. nop
  /* .LL49: bookkeeping at the end of the four-column panel.  Updates B and
     KK for whichever variant is defined, then falls through to .LL50. */
  2359. .LL49:
  2360. #ifdef LN
  /* LN: B += K << (BASE_SHIFT + 2) bytes. */
  2361. sll K, BASE_SHIFT + 2, TEMP1
  2362. add B, TEMP1, B
  2363. #endif
  2364. #if defined(LT) || defined(RN)
  /* B takes over the position BO has reached. */
  2365. mov BO, B
  2366. #endif
  2367. #ifdef RN
  2368. add KK, 4, KK
  2369. #endif
  2370. #ifdef RT
  2371. sub KK, 4, KK
  2372. #endif
  2373. .align 4
  /* .LL50: handled only when (N & 2) != 0; otherwise control goes to .LL70.
     Sets up the two column pointers C1/C2, initialises KK and AORIG/AO,
     and -- when (M & 1) != 0 -- prepares the tile that accumulates into
     cc01/cc03 in the loop at .LL63. */
  2374. .LL50:
  2375. and N, 2, J
  2376. cmp J, 0
  2377. ble,pn %icc, .LL70
  2378. nop
  2379. #ifdef RT
  /* RT: B -= K << (BASE_SHIFT + 1) bytes. */
  2380. sll K, BASE_SHIFT + 1, TEMP1
  2381. sub B, TEMP1, B
  2382. #endif
  2383. #ifndef RT
  /* C1 = C, C2 = C + LDC; C is advanced past C2. */
  2384. mov C, C1
  2385. add C, LDC, C2
  2386. add C2, LDC, C
  2387. #else
  /* RT: walk backwards; C2 = C - LDC, C1 = C2 - LDC, and C ends up at the
     same address as C1. */
  2388. sub C, LDC, C2
  2389. sub C2, LDC, C1
  2390. sub C2, LDC, C
  2391. #endif
  2392. #ifdef LN
  /* LN: KK = M + OFFSET. */
  2393. add M, OFFSET, KK
  2394. #endif
  2395. #ifdef LT
  /* LT: KK = OFFSET. */
  2396. mov OFFSET, KK
  2397. #endif
  2398. #if defined(LN) || defined(RT)
  2399. mov A, AORIG
  2400. #else
  2401. mov A, AO
  2402. #endif
  /* I = M & 1: skip the single-row tile and go to .LL60 when it is zero. */
  2403. and M, 1, I
  2404. cmp I, 0
  2405. ble,pn %icc, .LL60
  2406. nop
  2407. #if defined(LT) || defined(RN)
  2408. mov B, BO
  2409. #else
  2410. #ifdef LN
  /* LN: AORIG -= K << BASE_SHIFT bytes before deriving AO from it. */
  2411. sll K, BASE_SHIFT + 0, TEMP1
  2412. sub AORIG, TEMP1, AORIG
  2413. #endif
  /* AO = AORIG + (KK << BASE_SHIFT); BO = B + (KK << (BASE_SHIFT + 1)). */
  2414. sll KK, BASE_SHIFT + 0, TEMP1
  2415. sll KK, BASE_SHIFT + 1, TEMP2
  2416. add AORIG, TEMP1, AO
  2417. add B, TEMP2, BO
  2418. #endif
  /* Preload a1..a4 from AO[0..3] and b1..b8 from BO[0..7]; FCLR (defined
     elsewhere, presumably zeroing its argument) is applied to cc01/cc03. */
  2419. LDF [AO + 0 * SIZE], a1
  2420. LDF [AO + 1 * SIZE], a2
  2421. LDF [AO + 2 * SIZE], a3
  2422. LDF [AO + 3 * SIZE], a4
  2423. LDF [BO + 0 * SIZE], b1
  2424. LDF [BO + 1 * SIZE], b2
  2425. LDF [BO + 2 * SIZE], b3
  2426. LDF [BO + 3 * SIZE], b4
  2427. LDF [BO + 4 * SIZE], b5
  2428. LDF [BO + 5 * SIZE], b6
  2429. LDF [BO + 6 * SIZE], b7
  2430. FCLR (cc01)
  2431. LDF [BO + 7 * SIZE], b8
  2432. FCLR (cc03)
  /* L = trip count of the 4-way unrolled loop: KK >> 2 for LT/RN,
     (K - KK) >> 2 otherwise.  Skip to .LL65 when L <= 0. */
  2433. #if defined(LT) || defined(RN)
  2434. sra KK, 2, L
  2435. #else
  2436. sub K, KK, L
  2437. sra L, 2, L
  2438. #endif
  2439. cmp L, 0
  2440. ble,pn %icc, .LL65
  2441. nop
  2442. .align 4
  /* .LL63: unrolled accumulate loop for the single-row, two-column tile.
     Each pass issues eight FMADD macro invocations (a1..a4, one per
     unrolled step, against two b values each), advances AO by 4 * SIZE and
     BO by 8 * SIZE, and reloads the operands for the next pass in between.
     FMADD is defined elsewhere; presumably FMADD (a, b, c, d) is d = a*b + c. */
  2443. .LL63:
  2444. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2445. add L, -1, L
  2446. FMADD (aa1, bb1, cc01, cc01)
  2447. LDF [BO + 8 * SIZE], b1
  2448. FMADD (aa1, bb2, cc03, cc03)
  2449. LDF [BO + 9 * SIZE], b2
  2450. LDF [AO + 4 * SIZE], a1
  /* Condition codes for the bg at the bottom are set here; the later add
     instructions are the non-cc forms and leave %icc untouched. */
  2451. cmp L, 0
  2452. FMADD (aa2, bb3, cc01, cc01)
  2453. LDF [BO + 10 * SIZE], b3
  2454. FMADD (aa2, bb4, cc03, cc03)
  2455. LDF [BO + 11 * SIZE], b4
  2456. LDF [AO + 5 * SIZE], a2
  /* AO advances here, so AO offsets below are relative to the new AO. */
  2457. add AO, 4 * SIZE, AO
  2458. FMADD (aa3, bb5, cc01, cc01)
  2459. LDF [BO + 12 * SIZE], b5
  2460. FMADD (aa3, bb6, cc03, cc03)
  2461. LDF [BO + 13 * SIZE], b6
  2462. LDF [AO + 2 * SIZE], a3
  /* BO advances here, so BO offsets below are relative to the new BO. */
  2463. add BO, 8 * SIZE, BO
  2464. FMADD (aa4, bb7, cc01, cc01)
  2465. LDF [BO + 6 * SIZE], b7
  2466. FMADD (aa4, bb8, cc03, cc03)
  2467. LDF [BO + 7 * SIZE], b8
  /* Loop while L > 0; the a4 reload sits in the branch delay slot. */
  2468. bg,pt %icc, .LL63
  2469. LDF [AO + 3 * SIZE], a4
  2470. .align 4
  /* .LL65: remainder of the single-row, two-column accumulation.
     L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes
     straight to .LL68. */
  2471. .LL65:
  2472. #if defined(LT) || defined(RN)
  2473. and KK, 3, L
  2474. #else
  2475. sub K, KK, L
  2476. and L, 3, L
  2477. #endif
  2478. cmp L, 0
  /* ",a" sets the annul bit: the delay-slot nop is executed only when the
     branch is taken. */
  2479. ble,a,pn %icc, .LL68
  2480. nop
  2481. .align 4
  /* .LL67: one step per pass -- two FMADD invocations using a1 and b1/b2,
     then AO += 1 * SIZE and BO += 2 * SIZE. */
  2482. .LL67:
  2483. FMADD (aa1, bb1, cc01, cc01)
  2484. LDF [BO + 2 * SIZE], b1
  2485. FMADD (aa1, bb2, cc03, cc03)
  2486. LDF [BO + 3 * SIZE], b2
  /* a1 is reloaded before AO is advanced, hence offset 1. */
  2487. LDF [AO + 1 * SIZE], a1
  2488. add L, -1, L
  2489. add AO, 1 * SIZE, AO
  2490. cmp L, 0
  /* Loop while L > 0; the BO advance sits in the branch delay slot. */
  2491. bg,pt %icc, .LL67
  2492. add BO, 2 * SIZE, BO
  2493. .align 4
  /* .LL68: solve and store step for the single-row, two-column tile.
     Re-points AO/BO (LN/RT only), combines cc01/cc03 with the values read
     back from the packed buffer, applies the variant-specific update, then
     writes c01/c03 to the packed buffer and to C1/C2.
     FSUB/FMUL/FNMSUB are macros defined elsewhere; presumably
     "FSUB x, y, z" is z = x - y, "FMUL x, y, z" is z = x * y and
     "FNMSUB (a, b, c, d)" is d = c - a*b -- confirm against the macro
     definitions. */
  2494. .LL68:
  2495. #if defined(LN) || defined(RT)
  /* TEMP1 = KK - 1 (LN) or KK - 2 (RT);
     AO = AORIG + (TEMP1 << BASE_SHIFT), BO = B + (TEMP1 << (BASE_SHIFT + 1)). */
  2496. #ifdef LN
  2497. sub KK, 1, TEMP1
  2498. #else
  2499. sub KK, 2, TEMP1
  2500. #endif
  2501. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2502. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2503. add AORIG, TEMP2, AO
  2504. add B, TEMP1, BO
  2505. #endif
  /* Read the two stored values (from BO for LN/LT, from AO for RN/RT) and
     combine each with the matching accumulator via FSUB. */
  2506. #if defined(LN) || defined(LT)
  2507. LDF [BO + 0 * SIZE], a1
  2508. LDF [BO + 1 * SIZE], a2
  2509. FSUB a1, c01, c01
  2510. FSUB a2, c03, c03
  2511. #else
  2512. LDF [AO + 0 * SIZE], a1
  2513. LDF [AO + 1 * SIZE], a2
  2514. FSUB a1, c01, c01
  2515. FSUB a2, c03, c03
  2516. #endif
  /* LN/LT: a single factor from AO[0] scales both results.  It is applied
     with FMUL, so the packed entry is presumably a pre-inverted diagonal --
     confirm with the packing routine. */
  2517. #if defined(LN) || defined(LT)
  2518. LDF [AO + 0 * SIZE], a1
  2519. FMUL a1, c01, c01
  2520. FMUL a1, c03, c03
  2521. #endif
  /* RN: scale c01 by BO[0], fold it into c03 with BO[1], scale c03 by BO[3]. */
  2522. #ifdef RN
  2523. LDF [BO + 0 * SIZE], a1
  2524. LDF [BO + 1 * SIZE], a2
  2525. FMUL a1, c01, c01
  2526. FNMSUB (aa2, cc01, cc03, cc03)
  2527. LDF [BO + 3 * SIZE], a1
  2528. FMUL a1, c03, c03
  2529. #endif
  /* RT: scale c03 by BO[3], fold it into c01 with BO[2], scale c01 by BO[0]. */
  2530. #ifdef RT
  2531. LDF [BO + 3 * SIZE], a1
  2532. LDF [BO + 2 * SIZE], a2
  2533. FMUL a1, c03, c03
  2534. FNMSUB (aa2, cc03, cc01, cc01)
  2535. LDF [BO + 0 * SIZE], a1
  2536. FMUL a1, c01, c01
  2537. #endif
  2538. #ifdef LN
  /* LN: step C1/C2 back by one element before storing. */
  2539. add C1, -1 * SIZE, C1
  2540. add C2, -1 * SIZE, C2
  2541. #endif
  /* Write the results back to the packed buffer they were read from
     (BO for LN/LT, AO otherwise) and to the two C columns. */
  2542. #if defined(LN) || defined(LT)
  2543. STF c01, [BO + 0 * SIZE]
  2544. STF c03, [BO + 1 * SIZE]
  2545. #else
  2546. STF c01, [AO + 0 * SIZE]
  2547. STF c03, [AO + 1 * SIZE]
  2548. #endif
  2549. STF c01, [C1 + 0 * SIZE]
  2550. STF c03, [C2 + 0 * SIZE]
  2551. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT bytes. */
  2552. sll K, BASE_SHIFT + 0, TEMP1
  2553. add AORIG, TEMP1, AORIG
  2554. #endif
  2555. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO by (K - KK) << BASE_SHIFT and BO by
     (K - KK) << (BASE_SHIFT + 1) bytes. */
  2556. sub K, KK, TEMP1
  2557. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2558. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2559. add AO, TEMP2, AO
  2560. add BO, TEMP1, BO
  2561. #endif
  2562. #ifdef LT
  2563. add KK, 1, KK
  2564. #endif
  2565. #ifdef LN
  2566. sub KK, 1, KK
  2567. #endif
  2568. .align 4
  /* .LL60: start of the two-row tiles of the two-column panel.
     I = M >> 1 is the number of such tiles; with I <= 0 control goes to
     .LL69. */
  2569. .LL60:
  2570. sra M, 1, I
  2571. cmp I, 0
  2572. ble,pn %icc, .LL69
  2573. nop
  2574. .align 4
  /* .LL52: per-tile prologue (re-entered from the bottom of the I loop).
     Positions AO/BO, preloads a1..a4 and b1..b8, and issues prefetches on
     C1/C2.  FCLR (defined elsewhere) is applied to cc01..cc08 here, although
     the loops at .LL53/.LL57 only accumulate into cc01..cc04. */
  2575. .LL52:
  2576. #if defined(LT) || defined(RN)
  2577. mov B, BO
  2578. #else
  2579. #ifdef LN
  /* LN: AORIG -= K << (BASE_SHIFT + 1) bytes before deriving AO from it. */
  2580. sll K, BASE_SHIFT + 1, TEMP1
  2581. sub AORIG, TEMP1, AORIG
  2582. #endif
  /* AO and BO use the same byte offset, KK << (BASE_SHIFT + 1), because the
     tile is two wide in both A and B. */
  2583. sll KK, BASE_SHIFT + 1, TEMP1
  2584. sll KK, BASE_SHIFT + 1, TEMP2
  2585. add AORIG, TEMP1, AO
  2586. add B, TEMP2, BO
  2587. #endif
  2588. LDF [AO + 0 * SIZE], a1
  2589. LDF [AO + 1 * SIZE], a2
  2590. LDF [AO + 2 * SIZE], a3
  2591. LDF [AO + 3 * SIZE], a4
  2592. LDF [BO + 0 * SIZE], b1
  2593. LDF [BO + 1 * SIZE], b2
  2594. LDF [BO + 2 * SIZE], b3
  2595. FCLR (cc01)
  2596. LDF [BO + 3 * SIZE], b4
  2597. FCLR (cc02)
  2598. LDF [BO + 4 * SIZE], b5
  2599. FCLR (cc03)
  2600. LDF [BO + 5 * SIZE], b6
  2601. FCLR (cc04)
  2602. LDF [BO + 6 * SIZE], b7
  2603. FCLR (cc05)
  2604. LDF [BO + 7 * SIZE], b8
  2605. FCLR (cc06)
  2606. prefetch [C1 + 2 * SIZE], 3
  2607. FCLR (cc07)
  2608. prefetch [C2 + 2 * SIZE], 3
  2609. FCLR (cc08)
  /* L = trip count of the 4-way unrolled loop: KK >> 2 for LT/RN,
     (K - KK) >> 2 otherwise.  Skip to .LL55 when L <= 0. */
  2610. #if defined(LT) || defined(RN)
  2611. sra KK, 2, L
  2612. #else
  2613. sub K, KK, L
  2614. sra L, 2, L
  2615. #endif
  2616. cmp L, 0
  2617. ble,pn %icc, .LL55
  2618. nop
  2619. .align 4
  /* .LL53: unrolled accumulate loop for the two-row, two-column tile.
     Each pass issues 16 FMADD macro invocations into cc01..cc04, with
     a1/a2 and a3/a4 alternating as the A operands, and advances both AO
     and BO by 8 * SIZE at the bottom.  All loads inside the pass therefore
     use offsets relative to the not-yet-advanced pointers.
     FMADD is defined elsewhere; presumably FMADD (a, b, c, d) is d = a*b + c. */
  2620. .LL53:
  2621. FMADD (aa1, bb1, cc01, cc01)
  2622. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2623. FMADD (aa2, bb1, cc02, cc02)
  2624. LDF [BO + 8 * SIZE], b1
  2625. FMADD (aa1, bb2, cc03, cc03)
  2626. LDF [AO + 4 * SIZE], a1
  2627. FMADD (aa2, bb2, cc04, cc04)
  2628. LDF [AO + 5 * SIZE], a2
  2629. FMADD (aa3, bb3, cc01, cc01)
  2630. LDF [BO + 9 * SIZE], b2
  2631. FMADD (aa4, bb3, cc02, cc02)
  2632. LDF [BO + 10 * SIZE], b3
  2633. FMADD (aa3, bb4, cc03, cc03)
  2634. LDF [AO + 6 * SIZE], a3
  2635. FMADD (aa4, bb4, cc04, cc04)
  2636. LDF [AO + 7 * SIZE], a4
  2637. FMADD (aa1, bb5, cc01, cc01)
  2638. LDF [BO + 11 * SIZE], b4
  2639. FMADD (aa2, bb5, cc02, cc02)
  2640. LDF [BO + 12 * SIZE], b5
  2641. FMADD (aa1, bb6, cc03, cc03)
  2642. LDF [AO + 8 * SIZE], a1
  2643. FMADD (aa2, bb6, cc04, cc04)
  2644. LDF [AO + 9 * SIZE], a2
  2645. FMADD (aa3, bb7, cc01, cc01)
  2646. LDF [BO + 13 * SIZE], b6
  2647. FMADD (aa4, bb7, cc02, cc02)
  2648. LDF [BO + 14 * SIZE], b7
  2649. FMADD (aa3, bb8, cc03, cc03)
  2650. LDF [AO + 10 * SIZE], a3
  2651. FMADD (aa4, bb8, cc04, cc04)
  2652. LDF [AO + 11 * SIZE], a4
  2653. add AO, 8 * SIZE, AO
  2654. add L, -1, L
  2655. add BO, 8 * SIZE, BO
  2656. cmp L, 0
  /* Loop while L > 0; b8 is reloaded from the already advanced BO in the
     delay slot. */
  2657. bg,pt %icc, .LL53
  2658. LDF [BO + 7 * SIZE], b8
  2659. .align 4
  /* .LL55: remainder of the two-row, two-column accumulation.
     L = KK & 3 for LT/RN, (K - KK) & 3 otherwise; when L <= 0 control goes
     straight to .LL58. */
  2660. .LL55:
  2661. #if defined(LT) || defined(RN)
  2662. and KK, 3, L
  2663. #else
  2664. sub K, KK, L
  2665. and L, 3, L
  2666. #endif
  2667. cmp L, 0
  /* ",a" sets the annul bit: the delay-slot nop is executed only when the
     branch is taken. */
  2668. ble,a,pn %icc, .LL58
  2669. nop
  2670. .align 4
  /* .LL57: one step per pass -- four FMADD invocations (a1/a2 against
     b1/b2), then AO += 2 * SIZE and BO += 2 * SIZE. */
  2671. .LL57:
  2672. FMADD (aa1, bb1, cc01, cc01)
  2673. add L, -1, L
  2674. FMADD (aa2, bb1, cc02, cc02)
  2675. LDF [BO + 2 * SIZE], b1
  2676. FMADD (aa1, bb2, cc03, cc03)
  2677. LDF [AO + 2 * SIZE], a1
  2678. FMADD (aa2, bb2, cc04, cc04)
  2679. LDF [AO + 3 * SIZE], a2
  2680. add AO, 2 * SIZE, AO
  2681. cmp L, 0
  2682. add BO, 2 * SIZE, BO
  /* Loop while L > 0; b2 is reloaded from the already advanced BO in the
     delay slot. */
  2683. bg,pt %icc, .LL57
  2684. LDF [BO + 1 * SIZE], b2
  2685. .align 4
  /* .LL58: solve and store step for the two-row, two-column tile, followed
     by the I-loop back edge to .LL52.
     Register layout visible from the stores to C below: c01/c02 go to C1
     (offsets 0/1) and c03/c04 to C2 (offsets 0/1).
     FSUB/FMUL/FNMSUB are macros defined elsewhere; presumably
     "FSUB x, y, z" is z = x - y, "FMUL x, y, z" is z = x * y and
     "FNMSUB (a, b, c, d)" is d = c - a*b -- confirm against the macro
     definitions. */
  2686. .LL58:
  2687. #if defined(LN) || defined(RT)
  /* Both arms of this #ifdef are identical (KK - 2): the tile is 2 x 2, so
     LN and RT step back by the same amount. */
  2688. #ifdef LN
  2689. sub KK, 2, TEMP1
  2690. #else
  2691. sub KK, 2, TEMP1
  2692. #endif
  /* AO = AORIG + (TEMP1 << (BASE_SHIFT + 1)),
     BO = B + (TEMP1 << (BASE_SHIFT + 1)). */
  2693. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2694. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2695. add AORIG, TEMP2, AO
  2696. add B, TEMP1, BO
  2697. #endif
  /* Read the four stored values and combine them with the accumulators.
     LN/LT read from BO in the order c01, c03, c02, c04; RN/RT read from AO
     in the order c01, c02, c03, c04. */
  2698. #if defined(LN) || defined(LT)
  2699. LDF [BO + 0 * SIZE], a1
  2700. LDF [BO + 1 * SIZE], a2
  2701. LDF [BO + 2 * SIZE], a3
  2702. LDF [BO + 3 * SIZE], a4
  2703. FSUB a1, c01, c01
  2704. FSUB a2, c03, c03
  2705. FSUB a3, c02, c02
  2706. FSUB a4, c04, c04
  2707. #else
  2708. LDF [AO + 0 * SIZE], a1
  2709. LDF [AO + 1 * SIZE], a2
  2710. LDF [AO + 2 * SIZE], a3
  2711. LDF [AO + 3 * SIZE], a4
  2712. FSUB a1, c01, c01
  2713. FSUB a2, c02, c02
  2714. FSUB a3, c03, c03
  2715. FSUB a4, c04, c04
  2716. #endif
  /* LN: second row first.  Scale c02/c04 by AO[3], fold them into c01/c03
     with AO[2], then scale c01/c03 by AO[0].  The factors are applied with
     FMUL, so the packed diagonal is presumably pre-inverted -- confirm with
     the packing routine. */
  2717. #ifdef LN
  2718. LDF [AO + 3 * SIZE], a1
  2719. LDF [AO + 2 * SIZE], a2
  2720. LDF [AO + 0 * SIZE], a3
  2721. FMUL a1, c02, c02
  2722. FMUL a1, c04, c04
  2723. FNMSUB (aa2, cc02, cc01, cc01)
  2724. FNMSUB (aa2, cc04, cc03, cc03)
  2725. FMUL a3, c01, c01
  2726. FMUL a3, c03, c03
  2727. #endif
  /* LT: first row first.  Scale c01/c03 by AO[0], fold them into c02/c04
     with AO[1], then scale c02/c04 by AO[3]. */
  2728. #ifdef LT
  2729. LDF [AO + 0 * SIZE], a1
  2730. LDF [AO + 1 * SIZE], a2
  2731. LDF [AO + 3 * SIZE], a3
  2732. FMUL a1, c01, c01
  2733. FMUL a1, c03, c03
  2734. FNMSUB (aa2, cc01, cc02, cc02)
  2735. FNMSUB (aa2, cc03, cc04, cc04)
  2736. FMUL a3, c02, c02
  2737. FMUL a3, c04, c04
  2738. #endif
  /* RN: scale c01/c02 by BO[0], fold them into c03/c04 with BO[1], then
     scale c03/c04 by BO[3]. */
  2739. #ifdef RN
  2740. LDF [BO + 0 * SIZE], a1
  2741. LDF [BO + 1 * SIZE], a2
  2742. FMUL a1, c01, c01
  2743. FMUL a1, c02, c02
  2744. FNMSUB (aa2, cc01, cc03, cc03)
  2745. FNMSUB (aa2, cc02, cc04, cc04)
  2746. LDF [BO + 3 * SIZE], a1
  2747. FMUL a1, c03, c03
  2748. FMUL a1, c04, c04
  2749. #endif
  /* RT: scale c04/c03 by BO[3], fold them into c02/c01 with BO[2], then
     scale c02/c01 by BO[0]. */
  2750. #ifdef RT
  2751. LDF [BO + 3 * SIZE], a1
  2752. LDF [BO + 2 * SIZE], a2
  2753. FMUL a1, c04, c04
  2754. FMUL a1, c03, c03
  2755. FNMSUB (aa2, cc04, cc02, cc02)
  2756. FNMSUB (aa2, cc03, cc01, cc01)
  2757. LDF [BO + 0 * SIZE], a1
  2758. FMUL a1, c02, c02
  2759. FMUL a1, c01, c01
  2760. #endif
  2761. #ifdef LN
  /* LN: step C1/C2 back by two elements before storing. */
  2762. add C1, -2 * SIZE, C1
  2763. add C2, -2 * SIZE, C2
  2764. #endif
  /* Write the results back to the packed buffer in the same layout they
     were read with above ... */
  2765. #if defined(LN) || defined(LT)
  2766. STF c01, [BO + 0 * SIZE]
  2767. STF c03, [BO + 1 * SIZE]
  2768. STF c02, [BO + 2 * SIZE]
  2769. STF c04, [BO + 3 * SIZE]
  2770. #else
  2771. STF c01, [AO + 0 * SIZE]
  2772. STF c02, [AO + 1 * SIZE]
  2773. STF c03, [AO + 2 * SIZE]
  2774. STF c04, [AO + 3 * SIZE]
  2775. #endif
  /* ... and to the two C columns, two elements each. */
  2776. STF c01, [C1 + 0 * SIZE]
  2777. STF c02, [C1 + 1 * SIZE]
  2778. STF c03, [C2 + 0 * SIZE]
  2779. STF c04, [C2 + 1 * SIZE]
  2780. #ifndef LN
  /* Non-LN: step C1/C2 forward by two elements after storing. */
  2781. add C1, 2 * SIZE, C1
  2782. add C2, 2 * SIZE, C2
  2783. #endif
  2784. #ifdef RT
  /* RT: AORIG += K << (BASE_SHIFT + 1) bytes. */
  2785. sll K, BASE_SHIFT + 1, TEMP1
  2786. add AORIG, TEMP1, AORIG
  2787. #endif
  2788. #if defined(LT) || defined(RN)
  /* LT/RN: advance both AO and BO by (K - KK) << (BASE_SHIFT + 1) bytes. */
  2789. sub K, KK, TEMP1
  2790. sll TEMP1, BASE_SHIFT + 1, TEMP2
  2791. sll TEMP1, BASE_SHIFT + 1, TEMP1
  2792. add AO, TEMP2, AO
  2793. add BO, TEMP1, BO
  2794. #endif
  2795. #ifdef LT
  2796. add KK, 2, KK
  2797. #endif
  2798. #ifdef LN
  2799. sub KK, 2, KK
  2800. #endif
  /* I -= 1; next two-row tile while I > 0 (nop in the delay slot). */
  2801. add I, -1, I
  2802. cmp I, 0
  2803. bg,pt %icc, .LL52
  2804. nop
  /* .LL69: bookkeeping after all tiles of the current two-wide pass are done.
     Updates B and KK according to the variant being built. */
  2805. .align 4
  2806. .LL69:
  2807. #ifdef LN
  /* LN: advance B by K << (BASE_SHIFT + 1) bytes. */
  2808. sll K, BASE_SHIFT + 1, TEMP1
  2809. add B, TEMP1, B
  2810. #endif
  2811. #if defined(LT) || defined(RN)
  /* LT/RN: BO was already walked past this panel, so adopt it as the new B. */
  2812. mov BO, B
  2813. #endif
  2814. #ifdef RN
  2815. add KK, 2, KK
  2816. #endif
  2817. #ifdef RT
  2818. sub KK, 2, KK
  2819. #endif
  /* .LL70: final pass, taken only when N is odd (J = N & 1); otherwise jump
     to the exit at .LL999. This pass uses a single output pointer C1.
     It first sets up B, C, KK and A, then prepares the single-element tile
     that exists when M is odd (I = M & 1). */
  2820. .align 4
  2821. .LL70:
  2822. and N, 1, J
  2823. cmp J, 0
  2824. ble,pn %icc, .LL999
  2825. nop
  2826. #ifdef RT
  /* RT: step B back by K << BASE_SHIFT bytes. */
  2827. sll K, BASE_SHIFT, TEMP1
  2828. sub B, TEMP1, B
  2829. #endif
  2830. #ifndef RT
  /* Non-RT: C1 = C, then C moves forward by LDC. */
  2831. mov C, C1
  2832. add C1, LDC, C
  2833. #else
  /* RT: C1 = C - LDC, and C itself moves back by LDC. */
  2834. sub C, LDC, C1
  2835. sub C, LDC, C
  2836. #endif
  2837. #ifdef LN
  2838. add M, OFFSET, KK
  2839. #endif
  2840. #ifdef LT
  2841. mov OFFSET, KK
  2842. #endif
  2843. #if defined(LN) || defined(RT)
  2844. mov A, AORIG
  2845. #else
  2846. mov A, AO
  2847. #endif
  /* Single-element tile: present only when M is odd, otherwise go to .LL80. */
  2848. and M, 1, I
  2849. cmp I, 0
  2850. ble,pn %icc, .LL80
  2851. nop
  2852. #if defined(LT) || defined(RN)
  2853. mov B, BO
  2854. #else
  /* LN/RT: LN first steps AORIG back by K << BASE_SHIFT bytes; then AO and
     BO start KK << BASE_SHIFT bytes into AORIG and B. */
  2855. #ifdef LN
  2856. sll K, BASE_SHIFT + 0, TEMP1
  2857. sub AORIG, TEMP1, AORIG
  2858. #endif
  2859. sll KK, BASE_SHIFT + 0, TEMP1
  2860. sll KK, BASE_SHIFT + 0, TEMP2
  2861. add AORIG, TEMP1, AO
  2862. add B, TEMP2, BO
  2863. #endif
  /* Preload four values from AO and four from BO for the unrolled loop. */
  2864. LDF [AO + 0 * SIZE], a1
  2865. LDF [BO + 0 * SIZE], b1
  2866. LDF [AO + 1 * SIZE], a2
  2867. LDF [BO + 1 * SIZE], b2
  2868. LDF [AO + 2 * SIZE], a3
  2869. LDF [BO + 2 * SIZE], b3
  2870. LDF [AO + 3 * SIZE], a4
  2871. LDF [BO + 3 * SIZE], b4
  /* L = number of four-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  2872. #if defined(LT) || defined(RN)
  2873. sra KK, 2, L
  2874. #else
  2875. sub K, KK, L
  2876. sra L, 2, L
  2877. #endif
  /* Skip the unrolled loop when L <= 0. The branch is not annulled, so the
     FCLR in its delay slot runs on both paths (FCLR presumably zeroes the
     accumulator -- macro defined outside this chunk). */
  2878. cmp L, 0
  2879. ble,pn %icc, .LL85
  2880. FCLR (cc01)
  /* .LL83/.LL85/.LL87: accumulate the product sum for the single-element tile
     into cc01. NOTE(review): FMADD (x, y, z, d) is assumed to compute
     d = x * y + z -- the macro is defined outside this chunk. */
  2881. .align 4
  2882. .LL83:
  /* Main loop, four steps per iteration. Each operand is reloaded from four
     elements ahead right after it has been consumed. */
  2883. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  2884. add L, -1, L
  2885. FMADD (aa1, bb1, cc01, cc01)
  2886. LDF [AO + 4 * SIZE], a1
  2887. LDF [BO + 4 * SIZE], b1
  2888. FMADD (aa2, bb2, cc01, cc01)
  2889. LDF [AO + 5 * SIZE], a2
  2890. LDF [BO + 5 * SIZE], b2
  2891. FMADD (aa3, bb3, cc01, cc01)
  2892. LDF [AO + 6 * SIZE], a3
  2893. LDF [BO + 6 * SIZE], b3
  2894. FMADD (aa4, bb4, cc01, cc01)
  2895. LDF [AO + 7 * SIZE], a4
  2896. LDF [BO + 7 * SIZE], b4
  2897. add AO, 4 * SIZE, AO
  2898. cmp L, 0
  /* The BO advance sits in the delay slot of the loop branch. */
  2899. bg,pt %icc, .LL83
  2900. add BO, 4 * SIZE, BO
  2901. .align 4
  2902. .LL85:
  /* Remainder count: low two bits of KK (LT/RN) or of K - KK (LN/RT). */
  2903. #if defined(LT) || defined(RN)
  2904. and KK, 3, L
  2905. #else
  2906. sub K, KK, L
  2907. and L, 3, L
  2908. #endif
  2909. cmp L, 0
  2910. ble,a,pn %icc, .LL88
  2911. nop
  2912. .align 4
  2913. .LL87:
  /* Remainder loop, one step per iteration; a1/b1 already hold the current
     operands and are reloaded for the next step. */
  2914. FMADD (aa1, bb1, cc01, cc01)
  2915. LDF [AO + 1 * SIZE], a1
  2916. LDF [BO + 1 * SIZE], b1
  2917. add AO, 1 * SIZE, AO
  2918. add L, -1, L
  2919. cmp L, 0
  2920. bg,pt %icc, .LL87
  2921. add BO, 1 * SIZE, BO
  /* .LL88: solve and store the single-element tile held in c01.
     NOTE(review): FSUB x, y, d is assumed to give d = x - y; LDF/STF/FSUB/FMUL
     are macros defined outside this chunk -- confirm. */
  2922. .align 4
  2923. .LL88:
  2924. #if defined(LN) || defined(RT)
  /* LN/RT: point AO and BO at byte offset (KK - 1) << BASE_SHIFT from AORIG
     and B. Both arms of the inner #ifdef are identical. */
  2925. #ifdef LN
  2926. sub KK, 1, TEMP1
  2927. #else
  2928. sub KK, 1, TEMP1
  2929. #endif
  2930. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2931. sll TEMP1, BASE_SHIFT + 0, TEMP1
  2932. add AORIG, TEMP2, AO
  2933. add B, TEMP1, BO
  2934. #endif
  /* c01 = right-hand side - accumulated sum; the right-hand side is read
     from BO for LN/LT and from AO for RN/RT. */
  2935. #if defined(LN) || defined(LT)
  2936. LDF [BO + 0 * SIZE], a1
  2937. FSUB a1, c01, c01
  2938. #else
  2939. LDF [AO + 0 * SIZE], a1
  2940. FSUB a1, c01, c01
  2941. #endif
  /* Scale by the single coefficient, taken from AO (LN/LT) or BO (RN/RT).
     It is applied with FMUL, so it is presumably stored already inverted --
     confirm with the packing routine. */
  2942. #if defined(LN) || defined(LT)
  2943. LDF [AO + 0 * SIZE], a1
  2944. FMUL a1, c01, c01
  2945. #endif
  2946. #if defined(RN) || defined(RT)
  2947. LDF [BO + 0 * SIZE], a1
  2948. FMUL a1, c01, c01
  2949. #endif
  2950. #ifdef LN
  /* LN walks C backwards: step C1 back before storing. */
  2951. add C1, -1 * SIZE, C1
  2952. #endif
  /* Write the result back to the packed buffer, then to C. */
  2953. #if defined(LN) || defined(LT)
  2954. STF c01, [BO + 0 * SIZE]
  2955. #else
  2956. STF c01, [AO + 0 * SIZE]
  2957. #endif
  2958. STF c01, [C1 + 0 * SIZE]
  /* NOTE(review): the two-element tiles in this file (.LL58, .LL78) advance C1
     under #ifndef LN after storing; this tile does not. Harmless when the
     file is built with LN defined -- confirm the other variants are never
     built from this file, or that the omission is intended. */
  2959. #ifdef RT
  2960. sll K, BASE_SHIFT + 0, TEMP1
  2961. add AORIG, TEMP1, AORIG
  2962. #endif
  2963. #if defined(LT) || defined(RN)
  /* LT/RN: skip the (K - KK) steps that were not consumed, in both AO and BO. */
  2964. sub K, KK, TEMP1
  2965. sll TEMP1, BASE_SHIFT + 0, TEMP2
  2966. sll TEMP1, BASE_SHIFT + 0, TEMP1
  2967. add AO, TEMP2, AO
  2968. add BO, TEMP1, BO
  2969. #endif
  2970. #ifdef LT
  2971. add KK, 1, KK
  2972. #endif
  2973. #ifdef LN
  2974. sub KK, 1, KK
  2975. #endif
  /* .LL80: two-element tiles of the N & 1 pass. I = M >> 1 tiles are
     processed; with none to do, control goes straight to .LL89. */
  2976. .align 4
  2977. .LL80:
  2978. sra M, 1, I
  2979. cmp I, 0
  2980. ble,pn %icc, .LL89
  2981. nop
  2982. .align 4
  2983. .LL72:
  /* .LL72: per-tile setup -- position AO/BO, preload operands, clear the two
     accumulators and compute the unrolled trip count. */
  2984. #if defined(LT) || defined(RN)
  2985. mov B, BO
  2986. #else
  /* LN/RT: LN first steps AORIG back by K << (BASE_SHIFT + 1) bytes. AO then
     starts KK << (BASE_SHIFT + 1) bytes into AORIG (two values per step) and
     BO starts KK << BASE_SHIFT bytes into B (one value per step). */
  2987. #ifdef LN
  2988. sll K, BASE_SHIFT + 1, TEMP1
  2989. sub AORIG, TEMP1, AORIG
  2990. #endif
  2991. sll KK, BASE_SHIFT + 1, TEMP1
  2992. sll KK, BASE_SHIFT + 0, TEMP2
  2993. add AORIG, TEMP1, AO
  2994. add B, TEMP2, BO
  2995. #endif
  /* Preload four values from each of AO and BO; the FCLR macros (presumably
     zeroing cc01/cc02 -- defined outside this chunk) are interleaved with
     the loads. */
  2996. LDF [AO + 0 * SIZE], a1
  2997. LDF [AO + 1 * SIZE], a2
  2998. LDF [AO + 2 * SIZE], a3
  2999. LDF [AO + 3 * SIZE], a4
  3000. LDF [BO + 0 * SIZE], b1
  3001. LDF [BO + 1 * SIZE], b2
  3002. LDF [BO + 2 * SIZE], b3
  3003. FCLR (cc01)
  3004. LDF [BO + 3 * SIZE], b4
  3005. FCLR (cc02)
  /* Prefetch hint for the destination in C. */
  3006. prefetch [C1 + 2 * SIZE], 3
  /* L = number of four-step groups: KK >> 2 for LT/RN, (K - KK) >> 2 otherwise. */
  3007. #if defined(LT) || defined(RN)
  3008. sra KK, 2, L
  3009. #else
  3010. sub K, KK, L
  3011. sra L, 2, L
  3012. #endif
  3013. cmp L, 0
  3014. ble,pn %icc, .LL75
  3015. nop
  /* .LL73/.LL75/.LL77: accumulate the two-element tile. Each step consumes two
     values from AO and one from BO; cc01 takes the first AO value of a step,
     cc02 the second. NOTE(review): FMADD (x, y, z, d) is assumed to compute
     d = x * y + z -- the macro is defined outside this chunk. */
  3016. .LL73:
  /* Main loop, four steps per iteration. The compare for the loop branch is
     issued early (after the first step); no later instruction in the body
     touches the condition codes before the bg at the end. */
  3017. prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
  3018. add L, -1, L
  3019. FMADD (aa1, bb1, cc01, cc01)
  3020. LDF [AO + 4 * SIZE], a1
  3021. FMADD (aa2, bb1, cc02, cc02)
  3022. LDF [AO + 5 * SIZE], a2
  3023. LDF [BO + 4 * SIZE], b1
  3024. cmp L, 0
  3025. FMADD (aa3, bb2, cc01, cc01)
  3026. LDF [AO + 6 * SIZE], a3
  3027. FMADD (aa4, bb2, cc02, cc02)
  3028. LDF [AO + 7 * SIZE], a4
  3029. LDF [BO + 5 * SIZE], b2
  /* BO is bumped here, so the b3/b4 reloads below use offsets 2 and 3. */
  3030. add BO, 4 * SIZE, BO
  3031. FMADD (aa1, bb3, cc01, cc01)
  3032. LDF [AO + 8 * SIZE], a1
  3033. FMADD (aa2, bb3, cc02, cc02)
  3034. LDF [AO + 9 * SIZE], a2
  3035. LDF [BO + 2 * SIZE], b3
  /* AO is bumped here, so the a3/a4 reloads below use offsets 2 and 3. */
  3036. add AO, 8 * SIZE, AO
  3037. FMADD (aa3, bb4, cc01, cc01)
  3038. LDF [AO + 2 * SIZE], a3
  3039. FMADD (aa4, bb4, cc02, cc02)
  3040. LDF [AO + 3 * SIZE], a4
  /* The b4 reload sits in the delay slot of the loop branch. */
  3041. bg,pt %icc, .LL73
  3042. LDF [BO + 3 * SIZE], b4
  3043. .align 4
  3044. .LL75:
  /* Remainder count: low two bits of KK (LT/RN) or of K - KK (LN/RT). */
  3045. #if defined(LT) || defined(RN)
  3046. and KK, 3, L
  3047. #else
  3048. sub K, KK, L
  3049. and L, 3, L
  3050. #endif
  3051. cmp L, 0
  3052. ble,a,pn %icc, .LL78
  3053. nop
  3054. .align 4
  3055. .LL77:
  /* Remainder loop, one step per iteration. */
  3056. FMADD (aa1, bb1, cc01, cc01)
  3057. LDF [AO + 2 * SIZE], a1
  3058. FMADD (aa2, bb1, cc02, cc02)
  3059. LDF [AO + 3 * SIZE], a2
  3060. LDF [BO + 1 * SIZE], b1
  3061. add L, -1, L
  3062. add AO, 2 * SIZE, AO
  3063. cmp L, 0
  3064. bg,pt %icc, .LL77
  3065. add BO, 1 * SIZE, BO
  /* .LL78: solve and store the two-element tile held in c01/c02.
     NOTE(review): LDF/STF/FSUB/FMUL/FNMSUB are macros defined outside this
     chunk. The comments assume FSUB x, y, d gives d = x - y and
     FNMSUB (x, y, z, d) gives d = z - x * y -- confirm. */
  3066. .align 4
  3067. .LL78:
  3068. #if defined(LN) || defined(RT)
  /* LN rewinds by 2 steps and RT by 1. AO is then offset by
     TEMP1 << (BASE_SHIFT + 1) and BO by TEMP1 << BASE_SHIFT. */
  3069. #ifdef LN
  3070. sub KK, 2, TEMP1
  3071. #else
  3072. sub KK, 1, TEMP1
  3073. #endif
  3074. sll TEMP1, BASE_SHIFT + 1, TEMP2
  3075. sll TEMP1, BASE_SHIFT + 0, TEMP1
  3076. add AORIG, TEMP2, AO
  3077. add B, TEMP1, BO
  3078. #endif
  /* Each accumulator becomes (right-hand side - accumulator); the right-hand
     side is read from BO for LN/LT and from AO for RN/RT. */
  3079. #if defined(LN) || defined(LT)
  3080. LDF [BO + 0 * SIZE], a1
  3081. LDF [BO + 1 * SIZE], a2
  3082. FSUB a1, c01, c01
  3083. FSUB a2, c02, c02
  3084. #else
  3085. LDF [AO + 0 * SIZE], a1
  3086. LDF [AO + 1 * SIZE], a2
  3087. FSUB a1, c01, c01
  3088. FSUB a2, c02, c02
  3089. #endif
  3090. #ifdef LN
  /* LN: scale c02 by AO[3], eliminate it from c01 with AO[2], scale c01 by
     AO[0]. Diagonal terms are applied with FMUL, so they are presumably
     stored already inverted -- confirm with the packing routine. */
  3091. LDF [AO + 3 * SIZE], a1
  3092. LDF [AO + 2 * SIZE], a2
  3093. LDF [AO + 0 * SIZE], a3
  3094. FMUL a1, c02, c02
  3095. FNMSUB (aa2, cc02, cc01, cc01)
  3096. FMUL a3, c01, c01
  3097. #endif
  3098. #ifdef LT
  /* LT: scale c01 by AO[0], eliminate it from c02 with AO[1], scale c02 by
     AO[3]. */
  3099. LDF [AO + 0 * SIZE], a1
  3100. LDF [AO + 1 * SIZE], a2
  3101. LDF [AO + 3 * SIZE], a3
  3102. FMUL a1, c01, c01
  3103. FNMSUB (aa2, cc01, cc02, cc02)
  3104. FMUL a3, c02, c02
  3105. #endif
  3106. #if defined(RN) || defined(RT)
  /* RN/RT: only one coefficient, BO[0], scales both values. */
  3107. LDF [BO + 0 * SIZE], a1
  3108. FMUL a1, c01, c01
  3109. FMUL a1, c02, c02
  3110. #endif
  3111. #ifdef LN
  /* LN walks C backwards: step C1 back before storing. */
  3112. add C1, -2 * SIZE, C1
  3113. #endif
  /* Write the results back to the packed buffer, then to C. */
  3114. #if defined(LN) || defined(LT)
  3115. STF c01, [BO + 0 * SIZE]
  3116. STF c02, [BO + 1 * SIZE]
  3117. #else
  3118. STF c01, [AO + 0 * SIZE]
  3119. STF c02, [AO + 1 * SIZE]
  3120. #endif
  3121. STF c01, [C1 + 0 * SIZE]
  3122. STF c02, [C1 + 1 * SIZE]
  3123. #ifndef LN
  /* All variants except LN walk C forwards. */
  3124. add C1, 2 * SIZE, C1
  3125. #endif
  3126. #ifdef RT
  3127. sll K, BASE_SHIFT + 1, TEMP1
  3128. add AORIG, TEMP1, AORIG
  3129. #endif
  3130. #if defined(LT) || defined(RN)
  /* LT/RN: skip the (K - KK) unconsumed steps: two values per step in AO,
     one per step in BO. */
  3131. sub K, KK, TEMP1
  3132. sll TEMP1, BASE_SHIFT + 1, TEMP2
  3133. sll TEMP1, BASE_SHIFT + 0, TEMP1
  3134. add AO, TEMP2, AO
  3135. add BO, TEMP1, BO
  3136. #endif
  3137. #ifdef LT
  3138. add KK, 2, KK
  3139. #endif
  3140. #ifdef LN
  3141. sub KK, 2, KK
  3142. #endif
  /* Next tile: loop back to .LL72 while I > 0 (signed compare on %icc).
     The nop fills the branch delay slot. */
  3143. add I, -1, I
  3144. cmp I, 0
  3145. bg,pt %icc, .LL72
  3146. nop
  /* .LL89: bookkeeping after the N & 1 pass. Same shape as the update after
     the two-wide pass, but with a step of one instead of two. */
  3147. .align 4
  3148. .LL89:
  3149. #ifdef LN
  /* LN: advance B by K << BASE_SHIFT bytes. */
  3150. sll K, BASE_SHIFT, TEMP1
  3151. add B, TEMP1, B
  3152. #endif
  3153. #if defined(LT) || defined(RN)
  /* LT/RN: BO was already walked past this panel, so adopt it as the new B. */
  3154. mov BO, B
  3155. #endif
  3156. #ifdef RN
  3157. add KK, 1, KK
  3158. #endif
  3159. #ifdef RT
  3160. sub KK, 1, KK
  3161. #endif
  /* .LL999: common exit. When built with TRMMKERNEL, %g1-%g4 are reloaded
     from stack slots above STACK_START (presumably saved there by the
     prologue, which is outside this chunk). The routine then returns with
     %o0 cleared. */
  3162. .align 4
  3163. .LL999:
  3164. #ifdef TRMMKERNEL
  3165. #ifndef __64BIT__
  /* 32-bit build: four word-sized slots at offsets 8..20. */
  3166. ld [%sp + STACK_START + 8], %g1
  3167. ld [%sp + STACK_START + 12], %g2
  3168. ld [%sp + STACK_START + 16], %g3
  3169. ld [%sp + STACK_START + 20], %g4
  3170. #else
  /* 64-bit build: four extended-word slots at offsets 32..56. */
  3171. ldx [%sp + STACK_START + 32], %g1
  3172. ldx [%sp + STACK_START + 40], %g2
  3173. ldx [%sp + STACK_START + 48], %g3
  3174. ldx [%sp + STACK_START + 56], %g4
  3175. #endif
  3176. #endif
  /* clr %o0 sits in the delay slot of return, so it still executes. */
  3177. return %i7 + 8
  3178. clr %o0
  3179. EPILOGUE