You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_4x4_RT.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration and register aliases for this kernel. */
  /* ASSEMBLER is defined before common.h is included; presumably the */
  /* header keys off it to expose its assembler-side macros - confirm. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Refuse to build unless one of the three supported CPU variants */
  /* (EV4, EV5, EV6) has been selected. */
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  41. #error "Architecture is not specified."
  42. #endif
  /* Per-CPU tuning. PREFETCHSIZE is defined for EV6 and EV5 only; EV4 */
  /* leaves it undefined. UNOP expands to the unop instruction on EV6 */
  /* and to nothing on EV5 and EV4. */
  43. #ifdef EV6
  44. #define PREFETCHSIZE 56
  45. #define UNOP unop
  46. #endif
  47. #ifdef EV5
  48. #define PREFETCHSIZE 56
  49. #define UNOP
  50. #endif
  51. #ifdef EV4
  52. #define UNOP
  53. #endif
  /* Bytes reserved on the stack by the entry code. The same value is */
  /* the displacement used to reach the stack-passed arguments (C, LDC, */
  /* OFFSET) once the stack pointer has been lowered. */
  54. #define STACKSIZE 80
  /* Problem dimensions; the entry code returns early when any of them */
  /* is <= 0. */
  55. #define M $16
  56. #define N $17
  57. #define K $18
  /* Base pointers of the A and B operands. */
  58. #define A $20
  59. #define B $21
  /* C and LDC are loaded from the caller's stack area at entry; LDC is */
  /* then rescaled with SXADDQ before it is added to C. */
  60. #define C $22
  61. #define LDC $23
  /* Output pointers into C. C1 is the pointer the store sequences write */
  /* through; C2..C4 are presumably the further column pointers of the */
  /* wider tiles - confirm against the rest of the file. */
  62. #define C1 $19
  63. #define C2 $24
  64. #define C3 $25
  65. #define C4 $27
  /* Current read positions in A and B. */
  66. #define AO $at
  67. #define BO $5
  /* Loop counters: I counts row blocks derived from M, J is derived */
  /* from N, L counts inner-loop iterations. */
  68. #define I $6
  69. #define J $7
  70. #define L $8
  /* Floating-point operands loaded from AO (a*) and BO (b*). */
  71. #define a1 $f16
  72. #define a2 $f17
  73. #define a3 $f18
  74. #define a4 $f19
  75. #define b1 $f20
  76. #define b2 $f21
  77. #define b3 $f22
  78. #define b4 $f23
  /* Temporaries that hold the MUL results before they are folded into */
  /* the accumulators. */
  79. #define t1 $f24
  80. #define t2 $f25
  81. #define t3 $f26
  82. #define t4 $f27
  /* Extra operand registers. */
  /* NOTE(review): a6 and alpha both name $f30 - confirm they are never */
  /* live at the same time. */
  83. #define a5 $f28
  84. #define a6 $f30
  85. #define b5 $f29
  86. #define alpha $f30
  /* Accumulator / result registers. c03..c10 are $f2..$f9, which the */
  /* entry code spills to the stack before they are used. */
  87. #define c01 $f0
  88. #define c02 $f1
  89. #define c03 $f2
  90. #define c04 $f3
  91. #define c05 $f4
  92. #define c06 $f5
  93. #define c07 $f6
  94. #define c08 $f7
  95. #define c09 $f8
  96. #define c10 $f9
  97. #define c11 $f10
  98. #define c12 $f11
  99. #define c13 $f12
  100. #define c14 $f13
  101. #define c15 $f14
  102. #define c16 $f15
  /* Integer scratch registers. The entry-time argument checks use $0, */
  /* $1 and $2 by number, i.e. the same registers as TMP1, TMP2 and KK. */
  103. #define TMP1 $0
  104. #define TMP2 $1
  /* KK is initialised from OFFSET in a way that depends on the */
  /* LN/LT/RN/RT variant being built. AORIG keeps the base of A for the */
  /* LN and RT variants. OFFSET is loaded from the stack at entry. */
  105. #define KK $2
  106. #define AORIG $3
  107. #define OFFSET $4
  108. PROLOGUE
  109. PROFCODE
  110. .frame $sp, STACKSIZE, $26, 0
  111. lda $sp, -STACKSIZE($sp)
  112. ldq C, 0 + STACKSIZE($sp)
  113. ldq LDC, 8 + STACKSIZE($sp)
  114. ldq OFFSET, 16 + STACKSIZE($sp)
  115. SXADDQ LDC, 0, LDC
  116. stt $f2, 0($sp)
  117. stt $f3, 8($sp)
  118. stt $f4, 16($sp)
  119. stt $f5, 24($sp)
  120. stt $f6, 32($sp)
  121. stt $f7, 40($sp)
  122. stt $f8, 48($sp)
  123. stt $f9, 56($sp)
  124. cmple M, 0, $0
  125. cmple N, 0, $1
  126. cmple K, 0, $2
  127. or $0, $1, $0
  128. or $0, $2, $0
  129. bne $0, $L999
  130. #ifdef LN
  131. mulq M, K, TMP1
  132. SXADDQ TMP1, A, A
  133. SXADDQ M, C, C
  134. #endif
  135. #ifdef RN
  136. negq OFFSET, KK
  137. #endif
  138. #ifdef RT
  139. mulq N, K, TMP1
  140. SXADDQ TMP1, B, B
  141. mulq N, LDC, TMP1
  142. addq TMP1, C, C
  143. subq N, OFFSET, KK
  144. #endif
  145. and N, 1, J
  146. ble J, $L40
  147. #ifdef RT
  148. sll K, BASE_SHIFT, TMP1
  149. subq B, TMP1, B
  150. subq C, LDC, C
  151. #endif
  152. mov C, C1
  153. #ifndef RT
  154. addq C, LDC, C
  155. #endif
  156. #ifdef LN
  157. addq M, OFFSET, KK
  158. #endif
  159. #ifdef LT
  160. mov OFFSET, KK
  161. #endif
  162. #if defined(LN) || defined(RT)
  163. mov A, AORIG
  164. #else
  165. mov A, AO
  166. #endif
  167. sra M, 2, I
  168. ble I, $L100
  169. .align 4
  170. $L91:
  171. #if defined(LT) || defined(RN)
  172. LD a1, 0 * SIZE(AO)
  173. fclr t1
  174. LD a2, 1 * SIZE(AO)
  175. fclr t2
  176. LD a3, 2 * SIZE(AO)
  177. fclr t3
  178. LD a4, 3 * SIZE(AO)
  179. fclr t4
  180. LD b1, 0 * SIZE(B)
  181. fclr c01
  182. LD b2, 1 * SIZE(B)
  183. fclr c02
  184. LD b3, 2 * SIZE(B)
  185. fclr c03
  186. LD b4, 3 * SIZE(B)
  187. fclr c04
  188. sra KK, 2, L
  189. mov B, BO
  190. ble L, $L95
  191. #else
  192. #ifdef LN
  193. sll K, BASE_SHIFT + 2, TMP1
  194. subq AORIG, TMP1, AORIG
  195. #endif
  196. sll KK, BASE_SHIFT + 2, TMP1
  197. addq AORIG, TMP1, AO
  198. sll KK, BASE_SHIFT + 0, TMP1
  199. addq B, TMP1, BO
  200. subq K, KK, TMP1
  201. LD a1, 0 * SIZE(AO)
  202. fclr t1
  203. LD a2, 1 * SIZE(AO)
  204. fclr t2
  205. LD a3, 2 * SIZE(AO)
  206. fclr t3
  207. LD a4, 3 * SIZE(AO)
  208. fclr t4
  209. LD b1, 0 * SIZE(BO)
  210. fclr c01
  211. LD b2, 1 * SIZE(BO)
  212. fclr c02
  213. LD b3, 2 * SIZE(BO)
  214. fclr c03
  215. LD b4, 3 * SIZE(BO)
  216. fclr c04
  217. sra TMP1, 2, L
  218. unop
  219. ble L, $L95
  220. #endif
  221. .align 5
  222. $L92:
  223. ADD c01, t1, c01
  224. unop
  225. MUL a1, b1, t1
  226. LD a1, 4 * SIZE(AO)
  227. ADD c02, t2, c02
  228. lda L, -1(L)
  229. MUL a2, b1, t2
  230. LD a2, 5 * SIZE(AO)
  231. ADD c03, t3, c03
  232. unop
  233. MUL a3, b1, t3
  234. LD a3, 6 * SIZE(AO)
  235. ADD c04, t4, c04
  236. MUL a4, b1, t4
  237. LD a4, 7 * SIZE(AO)
  238. LD b1, 4 * SIZE(BO)
  239. ADD c01, t1, c01
  240. unop
  241. MUL a1, b2, t1
  242. LD a1, 8 * SIZE(AO)
  243. ADD c02, t2, c02
  244. unop
  245. MUL a2, b2, t2
  246. LD a2, 9 * SIZE(AO)
  247. ADD c03, t3, c03
  248. unop
  249. MUL a3, b2, t3
  250. LD a3, 10 * SIZE(AO)
  251. ADD c04, t4, c04
  252. MUL a4, b2, t4
  253. LD a4, 11 * SIZE(AO)
  254. LD b2, 5 * SIZE(BO)
  255. ADD c01, t1, c01
  256. unop
  257. MUL a1, b3, t1
  258. LD a1, 12 * SIZE(AO)
  259. ADD c02, t2, c02
  260. unop
  261. MUL a2, b3, t2
  262. LD a2, 13 * SIZE(AO)
  263. ADD c03, t3, c03
  264. unop
  265. MUL a3, b3, t3
  266. LD a3, 14 * SIZE(AO)
  267. ADD c04, t4, c04
  268. MUL a4, b3, t4
  269. LD a5, 15 * SIZE(AO)
  270. LD b3, 6 * SIZE(BO)
  271. ADD c01, t1, c01
  272. MUL a1, b4, t1
  273. LD a1, 16 * SIZE(AO)
  274. lda AO, 16 * SIZE(AO)
  275. ADD c02, t2, c02
  276. lda BO, 4 * SIZE(BO)
  277. MUL a2, b4, t2
  278. LD a2, 1 * SIZE(AO)
  279. ADD c03, t3, c03
  280. LD a4, 3 * SIZE(AO)
  281. MUL a3, b4, t3
  282. LD a3, 2 * SIZE(AO)
  283. ADD c04, t4, c04
  284. MUL a5, b4, t4
  285. LD b4, 3 * SIZE(BO)
  286. bgt L, $L92
  287. .align 4
  288. $L95:
  289. #if defined(LT) || defined(RN)
  290. and KK, 3, L
  291. #else
  292. and TMP1, 3, L
  293. #endif
  294. unop
  295. ble L, $L98
  296. .align 4
  297. $L96:
  298. ADD c01, t1, c01
  299. lda L, -1(L)
  300. MUL a1, b1, t1
  301. LD a1, 4 * SIZE(AO)
  302. ADD c02, t2, c02
  303. lda BO, 1 * SIZE(BO)
  304. MUL a2, b1, t2
  305. LD a2, 5 * SIZE(AO)
  306. ADD c03, t3, c03
  307. unop
  308. MUL a3, b1, t3
  309. LD a3, 6 * SIZE(AO)
  310. ADD c04, t4, c04
  311. MUL a4, b1, t4
  312. LD a4, 7 * SIZE(AO)
  313. LD b1, 0 * SIZE(BO)
  314. lda AO, 4 * SIZE(AO)
  315. bgt L, $L96
  316. .align 4
  317. $L98:
  318. ADD c01, t1, c01
  319. ADD c02, t2, c02
  320. ADD c03, t3, c03
  321. ADD c04, t4, c04
  322. #if defined(LN) || defined(RT)
  323. #ifdef LN
  324. subq KK, 4, TMP1
  325. #else
  326. subq KK, 1, TMP1
  327. #endif
  328. sll TMP1, BASE_SHIFT + 2, TMP2
  329. addq AORIG, TMP2, AO
  330. sll TMP1, BASE_SHIFT + 0, TMP2
  331. addq B, TMP2, BO
  332. #endif
  333. #if defined(LN) || defined(LT)
  334. LD a1, 0 * SIZE(BO)
  335. LD a2, 1 * SIZE(BO)
  336. LD a3, 2 * SIZE(BO)
  337. LD a4, 3 * SIZE(BO)
  338. SUB a1, c01, c01
  339. SUB a2, c02, c02
  340. SUB a3, c03, c03
  341. SUB a4, c04, c04
  342. #else
  343. LD a1, 0 * SIZE(AO)
  344. LD a2, 1 * SIZE(AO)
  345. LD a3, 2 * SIZE(AO)
  346. LD a4, 3 * SIZE(AO)
  347. SUB a1, c01, c01
  348. SUB a2, c02, c02
  349. SUB a3, c03, c03
  350. SUB a4, c04, c04
  351. #endif
  352. #ifdef LN
  353. LD a1, 15 * SIZE(AO)
  354. LD a2, 14 * SIZE(AO)
  355. LD a3, 13 * SIZE(AO)
  356. LD a4, 12 * SIZE(AO)
  357. MUL a1, c04, c04
  358. MUL a2, c04, t1
  359. SUB c03, t1, c03
  360. MUL a3, c04, t1
  361. SUB c02, t1, c02
  362. MUL a4, c04, t1
  363. SUB c01, t1, c01
  364. LD b1, 10 * SIZE(AO)
  365. LD b2, 9 * SIZE(AO)
  366. LD b3, 8 * SIZE(AO)
  367. MUL b1, c03, c03
  368. MUL b2, c03, t1
  369. SUB c02, t1, c02
  370. MUL b3, c03, t1
  371. SUB c01, t1, c01
  372. LD a1, 5 * SIZE(AO)
  373. LD a2, 4 * SIZE(AO)
  374. LD a3, 0 * SIZE(AO)
  375. MUL a1, c02, c02
  376. MUL a2, c02, t1
  377. SUB c01, t1, c01
  378. MUL a3, c01, c01
  379. #endif
  380. #ifdef LT
  381. LD a1, 0 * SIZE(AO)
  382. LD a2, 1 * SIZE(AO)
  383. LD a3, 2 * SIZE(AO)
  384. LD a4, 3 * SIZE(AO)
  385. MUL a1, c01, c01
  386. MUL a2, c01, t1
  387. SUB c02, t1, c02
  388. MUL a3, c01, t1
  389. SUB c03, t1, c03
  390. MUL a4, c01, t1
  391. SUB c04, t1, c04
  392. LD b1, 5 * SIZE(AO)
  393. LD b2, 6 * SIZE(AO)
  394. LD b3, 7 * SIZE(AO)
  395. MUL b1, c02, c02
  396. MUL b2, c02, t1
  397. SUB c03, t1, c03
  398. MUL b3, c02, t1
  399. SUB c04, t1, c04
  400. LD a1, 10 * SIZE(AO)
  401. LD a2, 11 * SIZE(AO)
  402. LD a3, 15 * SIZE(AO)
  403. MUL a1, c03, c03
  404. MUL a2, c03, t1
  405. SUB c04, t1, c04
  406. MUL a3, c04, c04
  407. #endif
  408. #if defined(RN) || defined(RT)
  409. LD a1, 0 * SIZE(BO)
  410. MUL a1, c01, c01
  411. MUL a1, c02, c02
  412. MUL a1, c03, c03
  413. MUL a1, c04, c04
  414. #endif
  415. #if defined(LN) || defined(LT)
  416. ST c01, 0 * SIZE(BO)
  417. ST c02, 1 * SIZE(BO)
  418. ST c03, 2 * SIZE(BO)
  419. ST c04, 3 * SIZE(BO)
  420. #else
  421. ST c01, 0 * SIZE(AO)
  422. ST c02, 1 * SIZE(AO)
  423. ST c03, 2 * SIZE(AO)
  424. ST c04, 3 * SIZE(AO)
  425. #endif
  426. #ifdef LN
  427. lda C1, -4 * SIZE(C1)
  428. #endif
  429. ST c01, 0 * SIZE(C1)
  430. ST c02, 1 * SIZE(C1)
  431. ST c03, 2 * SIZE(C1)
  432. ST c04, 3 * SIZE(C1)
  433. #ifndef LN
  434. lda C1, 4 * SIZE(C1)
  435. #endif
  436. fclr t1
  437. fclr t2
  438. fclr t3
  439. fclr t4
  440. #ifdef RT
  441. sll K, 2 + BASE_SHIFT, TMP1
  442. addq AORIG, TMP1, AORIG
  443. #endif
  444. #if defined(LT) || defined(RN)
  445. subq K, KK, TMP1
  446. sll TMP1, BASE_SHIFT + 2, TMP2
  447. addq AO, TMP2, AO
  448. sll TMP1, BASE_SHIFT + 0, TMP2
  449. addq BO, TMP2, BO
  450. #endif
  451. #ifdef LT
  452. addq KK, 4, KK
  453. #endif
  454. #ifdef LN
  455. subq KK, 4, KK
  456. #endif
  457. lda I, -1(I)
  458. bgt I, $L91
  459. .align 4
  460. $L100:
  461. and M, 2, I
  462. ble I, $L110
  463. #if defined(LT) || defined(RN)
  464. LD a1, 0 * SIZE(AO)
  465. fclr t1
  466. LD a2, 1 * SIZE(AO)
  467. fclr t2
  468. LD a3, 2 * SIZE(AO)
  469. fclr t3
  470. LD a4, 3 * SIZE(AO)
  471. fclr t4
  472. LD b1, 0 * SIZE(B)
  473. fclr c01
  474. LD b2, 1 * SIZE(B)
  475. fclr c02
  476. LD b3, 2 * SIZE(B)
  477. fclr c03
  478. LD b4, 3 * SIZE(B)
  479. fclr c04
  480. sra KK, 2, L
  481. mov B, BO
  482. ble L, $L105
  483. #else
  484. #ifdef LN
  485. sll K, BASE_SHIFT + 1, TMP1
  486. subq AORIG, TMP1, AORIG
  487. #endif
  488. sll KK, BASE_SHIFT + 1, TMP1
  489. addq AORIG, TMP1, AO
  490. sll KK, BASE_SHIFT + 0, TMP1
  491. addq B, TMP1, BO
  492. subq K, KK, TMP1
  493. LD a1, 0 * SIZE(AO)
  494. fclr t1
  495. LD a2, 1 * SIZE(AO)
  496. fclr t2
  497. LD a3, 2 * SIZE(AO)
  498. fclr t3
  499. LD a4, 3 * SIZE(AO)
  500. fclr t4
  501. LD b1, 0 * SIZE(BO)
  502. fclr c01
  503. LD b2, 1 * SIZE(BO)
  504. fclr c02
  505. LD b3, 2 * SIZE(BO)
  506. fclr c03
  507. LD b4, 3 * SIZE(BO)
  508. fclr c04
  509. sra TMP1, 2, L
  510. ble L, $L105
  511. #endif
  512. .align 5
  513. $L102:
  514. ADD c01, t1, c01
  515. lda L, -1(L)
  516. MUL a1, b1, t1
  517. LD a1, 4 * SIZE(AO)
  518. ADD c02, t2, c02
  519. MUL a2, b1, t2
  520. LD a2, 5 * SIZE(AO)
  521. LD b1, 4 * SIZE(BO)
  522. ADD c03, t3, c03
  523. lda BO, 4 * SIZE(BO)
  524. MUL a3, b2, t3
  525. LD a3, 6 * SIZE(AO)
  526. ADD c04, t4, c04
  527. MUL a4, b2, t4
  528. LD a5, 7 * SIZE(AO)
  529. LD b2, 1 * SIZE(BO)
  530. ADD c01, t1, c01
  531. MUL a1, b3, t1
  532. LD a1, 8 * SIZE(AO)
  533. lda AO, 8 * SIZE(AO)
  534. ADD c02, t2, c02
  535. MUL a2, b3, t2
  536. LD b3, 2 * SIZE(BO)
  537. LD a2, 1 * SIZE(AO)
  538. ADD c03, t3, c03
  539. LD a4, 3 * SIZE(AO)
  540. MUL a3, b4, t3
  541. LD a3, 2 * SIZE(AO)
  542. ADD c04, t4, c04
  543. MUL a5, b4, t4
  544. LD b4, 3 * SIZE(BO)
  545. bgt L, $L102
  546. .align 4
  547. $L105:
  548. #if defined(LT) || defined(RN)
  549. and KK, 3, L
  550. #else
  551. and TMP1, 3, L
  552. #endif
  553. ble L, $L108
  554. .align 4
  555. $L106:
  556. ADD c01, t1, c01
  557. lda L, -1(L)
  558. MUL a1, b1, t1
  559. LD a1, 2 * SIZE(AO)
  560. ADD c02, t2, c02
  561. MUL a2, b1, t2
  562. LD a2, 3 * SIZE(AO)
  563. LD b1, 1 * SIZE(BO)
  564. lda AO, 2 * SIZE(AO)
  565. unop
  566. lda BO, 1 * SIZE(BO)
  567. bgt L, $L106
  568. .align 4
  569. $L108:
  570. ADD c01, t1, c01
  571. ADD c02, t2, c02
  572. ADD c03, t3, c03
  573. ADD c04, t4, c04
  574. ADD c01, c03, c01
  575. ADD c02, c04, c02
  576. #if defined(LN) || defined(RT)
  577. #ifdef LN
  578. subq KK, 2, TMP1
  579. #else
  580. subq KK, 1, TMP1
  581. #endif
  582. sll TMP1, BASE_SHIFT + 1, TMP2
  583. addq AORIG, TMP2, AO
  584. sll TMP1, BASE_SHIFT + 0, TMP2
  585. addq B, TMP2, BO
  586. #endif
  587. #if defined(LN) || defined(LT)
  588. LD a1, 0 * SIZE(BO)
  589. LD a2, 1 * SIZE(BO)
  590. SUB a1, c01, c01
  591. SUB a2, c02, c02
  592. #else
  593. LD a1, 0 * SIZE(AO)
  594. LD a2, 1 * SIZE(AO)
  595. SUB a1, c01, c01
  596. SUB a2, c02, c02
  597. #endif
  598. #ifdef LN
  599. LD a1, 3 * SIZE(AO)
  600. LD a2, 2 * SIZE(AO)
  601. LD a3, 0 * SIZE(AO)
  602. MUL a1, c02, c02
  603. MUL a2, c02, t1
  604. SUB c01, t1, c01
  605. MUL a3, c01, c01
  606. #endif
  607. #ifdef LT
  608. LD a1, 0 * SIZE(AO)
  609. LD a2, 1 * SIZE(AO)
  610. LD a3, 3 * SIZE(AO)
  611. MUL a1, c01, c01
  612. MUL a2, c01, t1
  613. SUB c02, t1, c02
  614. MUL a3, c02, c02
  615. #endif
  616. #if defined(RN) || defined(RT)
  617. LD a1, 0 * SIZE(BO)
  618. MUL a1, c01, c01
  619. MUL a1, c02, c02
  620. #endif
  621. #if defined(LN) || defined(LT)
  622. ST c01, 0 * SIZE(BO)
  623. ST c02, 1 * SIZE(BO)
  624. #else
  625. ST c01, 0 * SIZE(AO)
  626. ST c02, 1 * SIZE(AO)
  627. #endif
  628. #ifdef LN
  629. lda C1, -2 * SIZE(C1)
  630. #endif
  631. ST c01, 0 * SIZE(C1)
  632. ST c02, 1 * SIZE(C1)
  633. #ifndef LN
  634. lda C1, 2 * SIZE(C1)
  635. #endif
  636. fclr t1
  637. fclr t2
  638. fclr t3
  639. fclr t4
  640. #ifdef RT
  641. sll K, 1 + BASE_SHIFT, TMP1
  642. addq AORIG, TMP1, AORIG
  643. #endif
  644. #if defined(LT) || defined(RN)
  645. subq K, KK, TMP1
  646. sll TMP1, BASE_SHIFT + 1, TMP2
  647. addq AO, TMP2, AO
  648. sll TMP1, BASE_SHIFT + 0, TMP2
  649. addq BO, TMP2, BO
  650. #endif
  651. #ifdef LT
  652. addq KK, 2, KK
  653. #endif
  654. #ifdef LN
  655. subq KK, 2, KK
  656. #endif
  657. .align 4
  /*
   * $L110 - set-up for the one-row remainder tile of this pass.
   * Runs only when M is odd; otherwise control goes straight to $L119.
   * Points AO/BO at the operands, preloads a1-a4 / b1-b4, zeroes the
   * accumulators c01-c04 and the product temporaries t1-t4, and leaves
   * the number of four-step groups in L ($L112 is skipped when L <= 0).
   */
  658. $L110:
  659. and M, 1, I
  660. ble I, $L119
  661. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, BO restarts at B, product length is KK. */
  662. LD a1, 0 * SIZE(AO)
  663. fclr t1
  664. LD a2, 1 * SIZE(AO)
  665. fclr t2
  666. LD a3, 2 * SIZE(AO)
  667. fclr t3
  668. LD a4, 3 * SIZE(AO)
  669. fclr t4
  670. LD b1, 0 * SIZE(B)
  671. fclr c01
  672. LD b2, 1 * SIZE(B)
  673. fclr c02
  674. LD b3, 2 * SIZE(B)
  675. fclr c03
  676. LD b4, 3 * SIZE(B)
  677. fclr c04
  /* L = KK / 4: number of trips through the four-step loop. */
  678. sra KK, 2, L
  679. mov B, BO
  680. unop
  681. ble L, $L115
  682. #else
  /* LN/RT: AO and BO both start (KK << BASE_SHIFT) bytes into AORIG and B;
     the product length is TMP1 = K - KK.  LN first steps AORIG back by
     (K << BASE_SHIFT) bytes. */
  683. #ifdef LN
  684. sll K, BASE_SHIFT + 0, TMP1
  685. subq AORIG, TMP1, AORIG
  686. #endif
  687. sll KK, BASE_SHIFT + 0, TMP1
  688. addq AORIG, TMP1, AO
  689. sll KK, BASE_SHIFT + 0, TMP1
  690. addq B, TMP1, BO
  691. subq K, KK, TMP1
  692. LD a1, 0 * SIZE(AO)
  693. fclr t1
  694. LD a2, 1 * SIZE(AO)
  695. fclr t2
  696. LD a3, 2 * SIZE(AO)
  697. fclr t3
  698. LD a4, 3 * SIZE(AO)
  699. fclr t4
  700. LD b1, 0 * SIZE(BO)
  701. fclr c01
  702. LD b2, 1 * SIZE(BO)
  703. fclr c02
  704. LD b3, 2 * SIZE(BO)
  705. fclr c03
  706. LD b4, 3 * SIZE(BO)
  707. fclr c04
  /* L = (K - KK) / 4: number of trips through the four-step loop. */
  708. sra TMP1, 2, L
  709. unop
  710. ble L, $L115
  711. #endif
  712. .align 4
  /*
   * $L112 - main loop of the one-row tile, four steps per trip.
   * Four independent chains are kept: cN += aN * bN for N = 1..4.  Each
   * product is written to tN and added into cN on the following trip, so
   * one product per chain is still pending in t1-t4 when the loop exits.
   * The next a/b values are loaded from offsets 4..7 before AO and BO are
   * advanced by 4 * SIZE.  L counts the remaining trips.
   */
  713. $L112:
  714. ADD c01, t1, c01
  715. MUL a1, b1, t1
  716. LD a1, 4 * SIZE(AO)
  717. LD b1, 4 * SIZE(BO)
  718. ADD c02, t2, c02
  719. MUL a2, b2, t2
  720. LD a2, 5 * SIZE(AO)
  721. LD b2, 5 * SIZE(BO)
  722. ADD c03, t3, c03
  723. MUL a3, b3, t3
  724. LD a3, 6 * SIZE(AO)
  725. LD b3, 6 * SIZE(BO)
  726. ADD c04, t4, c04
  727. MUL a4, b4, t4
  728. LD a4, 7 * SIZE(AO)
  729. LD b4, 7 * SIZE(BO)
  730. lda L, -1(L)
  731. lda AO, 4 * SIZE(AO)
  732. lda BO, 4 * SIZE(BO)
  733. bgt L, $L112
  734. .align 4
  /*
   * $L115 - remainder of the one-row tile.
   * L = (product length) & 3, where the length is KK for LT/RN and
   * TMP1 (set to K - KK in the tile set-up) otherwise.  Nothing left
   * means a direct jump to the drain code at $L118.
   */
  735. $L115:
  736. #if defined(LT) || defined(RN)
  737. and KK, 3, L
  738. #else
  739. and TMP1, 3, L
  740. #endif
  741. ble L, $L118
  742. .align 4
  /* $L116 - one step per trip, using only the c01/t1 chain; AO and BO
     advance by one element each. */
  743. $L116:
  744. ADD c01, t1, c01
  745. MUL a1, b1, t1
  746. LD a1, 1 * SIZE(AO)
  747. LD b1, 1 * SIZE(BO)
  748. lda L, -1(L)
  749. lda AO, 1 * SIZE(AO)
  750. lda BO, 1 * SIZE(BO)
  751. bgt L, $L116
  752. .align 4
  /*
   * $L118 - finish the one-row tile: drain, solve, store, advance.
   * First folds the pending products t1-t4 into c01-c04 and reduces the
   * four partial sums into c01.
   */
  753. $L118:
  754. ADD c01, t1, c01
  755. ADD c02, t2, c02
  756. ADD c03, t3, c03
  757. ADD c04, t4, c04
  758. ADD c01, c02, c01
  759. ADD c03, c04, c03
  760. ADD c01, c03, c01
  /* LN/RT: re-point AO and BO at element (KK - 1) of AORIG and B. */
  761. #if defined(LN) || defined(RT)
  762. subq KK, 1, TMP1
  763. sll TMP1, BASE_SHIFT + 0, TMP2
  764. addq AORIG, TMP2, AO
  765. addq B, TMP2, BO
  766. #endif
  /* c01 = (value loaded from BO or AO) - c01, assuming SUB, like ADD and
     MUL above, writes its last operand. */
  767. #if defined(LN) || defined(LT)
  768. LD a1, 0 * SIZE(BO)
  769. SUB a1, c01, c01
  770. #else
  771. LD a1, 0 * SIZE(AO)
  772. SUB a1, c01, c01
  773. #endif
  /* Scale by the single coefficient at AO[0] (LN/LT) or BO[0] (RN/RT).
     NOTE(review): a multiply rather than a divide suggests the stored
     coefficient is already inverted - confirm against the packing code. */
  774. #if defined(LN) || defined(LT)
  775. LD a1, 0 * SIZE(AO)
  776. MUL a1, c01, c01
  777. #endif
  778. #if defined(RN) || defined(RT)
  779. LD a1, 0 * SIZE(BO)
  780. MUL a1, c01, c01
  781. #endif
  /* Write the result back into the buffer the right-hand side came from. */
  782. #if defined(LN) || defined(LT)
  783. ST c01, 0 * SIZE(BO)
  784. #else
  785. ST c01, 0 * SIZE(AO)
  786. #endif
  /* Store to C1: LN steps C1 back one element before the store, every
     other variant steps it forward one element after the store. */
  787. #ifdef LN
  788. lda C1, -1 * SIZE(C1)
  789. #endif
  790. ST c01, 0 * SIZE(C1)
  791. #ifndef LN
  792. lda C1, 1 * SIZE(C1)
  793. #endif
  /* RT: advance AORIG.  SXADDQ is a macro not defined in this part of the
     file; presumably AORIG += K * SIZE - TODO confirm. */
  794. #ifdef RT
  795. SXADDQ K, AORIG, AORIG
  796. #endif
  /* LT/RN: skip the remaining (K - KK) elements of both operands. */
  797. #if defined(LT) || defined(RN)
  798. subq K, KK, TMP1
  799. sll TMP1, BASE_SHIFT + 0, TMP2
  800. addq AO, TMP2, AO
  801. addq BO, TMP2, BO
  802. #endif
  /* KK moves by the tile height (1): up for LT, down for LN. */
  803. #ifdef LT
  804. addq KK, 1, KK
  805. #endif
  806. #ifdef LN
  807. subq KK, 1, KK
  808. #endif
  809. .align 4
  /*
   * $L119 - end of the pass that contains the one-row tile above.
   * Updates B and KK before falling through to the (N & 2) test at $L40.
   */
  810. $L119:
  /* LN: advance B.  SXADDQ is a macro not defined in this part of the file;
     presumably B += K * SIZE - TODO confirm. */
  811. #ifdef LN
  812. SXADDQ K, B, B
  813. #endif
  /* LT/RN: BO already points past the data consumed in this pass. */
  814. #if defined(LT) || defined(RN)
  815. mov BO, B
  816. #endif
  /* KK moves by 1 here: up for RN, down for RT. */
  817. #ifdef RN
  818. addq KK, 1, KK
  819. #endif
  820. #ifdef RT
  821. subq KK, 1, KK
  822. #endif
  823. .align 4
  /*
   * $L40 - set-up for the two-column pass.
   * Taken only when bit 1 of N is set; otherwise control goes to $L80.
   * Establishes C1/C2 (two consecutive columns of C, LDC apart), the
   * starting KK for the LN/LT variants, the A base pointer, and the count
   * I of four-row tiles.
   */
  824. $L40:
  825. and N, 2, J
  826. ble J, $L80
  /* RT: step B back by (K << (1 + BASE_SHIFT)) bytes and C back by 2 * LDC
     before using them. */
  827. #ifdef RT
  828. sll K, 1 + BASE_SHIFT, TMP1
  829. subq B, TMP1, B
  830. addq LDC, LDC, TMP1
  831. subq C, TMP1, C
  832. #endif
  833. mov C, C1
  834. addq C, LDC, C2
  835. fclr t1
  /* Non-RT: leave C pointing 2 * LDC further on for the next pass. */
  836. #ifndef RT
  837. addq C2, LDC, C
  838. #endif
  839. fclr t2
  /* KK is (re)initialised here only for LN (M + OFFSET) and LT (OFFSET). */
  840. #ifdef LN
  841. addq M, OFFSET, KK
  842. #endif
  843. #ifdef LT
  844. mov OFFSET, KK
  845. #endif
  846. #if defined(LN) || defined(RT)
  847. mov A, AORIG
  848. #else
  849. mov A, AO
  850. #endif
  /* I = M / 4 four-row tiles; none means go straight to the two-row tile. */
  851. sra M, 2, I
  852. fclr t3
  853. fclr t4
  854. ble I, $L60
  855. .align 4
  /*
   * $L51 - set-up for one four-row by two-column tile (loop head for I).
   * Preloads four A values (a1-a4) and four B values (b1-b4, i.e. two
   * steps of two), zeroes c01-c08, then pre-advances AO by 4 * SIZE and
   * BO by 2 * SIZE.  L = length - 2.  A length <= 0 skips the whole
   * product ($L58); L <= 0 skips the unrolled loop ($L55).
   */
  856. $L51:
  857. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, B is read from its start, length is KK. */
  858. LD a1, 0 * SIZE(AO)
  859. fclr c03
  860. LD a2, 1 * SIZE(AO)
  861. fclr c07
  862. LD a3, 2 * SIZE(AO)
  863. fclr c04
  864. LD a4, 3 * SIZE(AO)
  865. fclr c08
  866. LD b1, 0 * SIZE(B)
  867. fclr c01
  868. LD b2, 1 * SIZE(B)
  869. fclr c05
  870. LD b3, 2 * SIZE(B)
  871. fclr c02
  872. LD b4, 3 * SIZE(B)
  873. fclr c06
  874. lda L, -2(KK)
  875. lda BO, 2 * SIZE(B)
  876. lda AO, 4 * SIZE(AO)
  877. ble KK, $L58
  878. ble L, $L55
  879. #else
  /* LN/RT: AO = AORIG + (KK << (BASE_SHIFT + 2)), BO = B + (KK << (BASE_SHIFT + 1));
     length is TMP1 = K - KK.  LN first steps AORIG back by
     (K << (BASE_SHIFT + 2)) bytes. */
  880. #ifdef LN
  881. sll K, BASE_SHIFT + 2, TMP1
  882. subq AORIG, TMP1, AORIG
  883. #endif
  884. sll KK, BASE_SHIFT + 2, TMP1
  885. addq AORIG, TMP1, AO
  886. sll KK, BASE_SHIFT + 1, TMP1
  887. addq B, TMP1, BO
  888. subq K, KK, TMP1
  889. LD a1, 0 * SIZE(AO)
  890. fclr c03
  891. LD a2, 1 * SIZE(AO)
  892. fclr c07
  893. LD a3, 2 * SIZE(AO)
  894. fclr c04
  895. LD a4, 3 * SIZE(AO)
  896. fclr c08
  897. LD b1, 0 * SIZE(BO)
  898. fclr c01
  899. LD b2, 1 * SIZE(BO)
  900. fclr c05
  901. LD b3, 2 * SIZE(BO)
  902. fclr c02
  903. LD b4, 3 * SIZE(BO)
  904. fclr c06
  905. lda L, -2(TMP1)
  906. lda BO, 2 * SIZE(BO)
  907. lda AO, 4 * SIZE(AO)
  908. ble TMP1, $L58
  909. ble L, $L55
  910. #endif
  911. .align 4
  /*
   * $L52 - main loop of the 4x2 tile, two steps per trip (L -= 2).
   * Products with b1 and b3 accumulate into c01-c04, products with b2 and
   * b4 into c05-c08.  Every MUL writes one of t1-t4 and the matching ADD
   * consumes it one group later, so the ADDs at the top of the loop fold in
   * products left pending by the previous trip (or by nothing on entry,
   * when t1-t4 are zero).  The second step uses a5 in place of a4 so that
   * a4 can be reloaded early.  AO advances by 8 * SIZE and BO by 4 * SIZE
   * per trip; negative offsets after the lda are relative to the new value.
   */
  912. $L52:
  913. ADD c05, t1, c05
  914. unop
  915. MUL a1, b1, t1
  916. unop
  917. ADD c06, t2, c06
  918. lda L, -2(L)
  919. MUL a2, b1, t2
  920. unop
  921. ADD c07, t3, c07
  922. unop
  923. MUL a3, b1, t3
  924. unop
  925. ADD c08, t4, c08
  926. unop
  927. MUL a4, b1, t4
  928. LD b1, 2 * SIZE(BO)
  929. ADD c01, t1, c01
  930. unop
  931. MUL a1, b2, t1
  932. LD a1, 0 * SIZE(AO)
  933. ADD c02, t2, c02
  934. lda BO, 4 * SIZE(BO)
  935. MUL a2, b2, t2
  936. LD a2, 1 * SIZE(AO)
  937. ADD c03, t3, c03
  938. unop
  939. MUL a3, b2, t3
  940. LD a3, 2 * SIZE(AO)
  941. ADD c04, t4, c04
  942. unop
  943. MUL a4, b2, t4
  944. LD a5, 3 * SIZE(AO)
  945. ADD c05, t1, c05
  946. unop
  947. MUL a1, b3, t1
  948. LD b2, -1 * SIZE(BO)
  949. ADD c06, t2, c06
  950. unop
  951. MUL a2, b3, t2
  952. unop
  953. ADD c07, t3, c07
  954. unop
  955. MUL a3, b3, t3
  956. lda AO, 8 * SIZE(AO)
  957. ADD c08, t4, c08
  958. unop
  959. MUL a5, b3, t4
  960. LD b3, 0 * SIZE(BO)
  961. ADD c01, t1, c01
  962. unop
  963. MUL a1, b4, t1
  964. LD a1, -4 * SIZE(AO)
  965. ADD c02, t2, c02
  966. unop
  967. MUL a2, b4, t2
  968. LD a2, -3 * SIZE(AO)
  969. ADD c03, t3, c03
  970. LD a4, -1 * SIZE(AO)
  971. MUL a3, b4, t3
  972. LD a3, -2 * SIZE(AO)
  973. ADD c04, t4, c04
  974. MUL a5, b4, t4
  975. LD b4, 1 * SIZE(BO)
  976. bgt L, $L52
  977. .align 4
  /*
   * $L55 - tail of the 4x2 product.
   * One or two steps remain.  If the length (KK, or TMP1 for LN/RT) is odd
   * only the final step is left and control jumps to $L57; otherwise the
   * code below performs one extra step, reloads a1-a4 / b1-b2 for the last
   * one, advances AO by 4 * SIZE and BO by 2 * SIZE, and falls into $L57.
   */
  978. $L55:
  979. ADD c05, t1, c05
  980. MUL a1, b1, t1
  981. #if defined(LT) || defined(RN)
  982. blbs KK, $L57
  983. #else
  984. blbs TMP1, $L57
  985. #endif
  986. .align 4
  987. ADD c06, t2, c06
  988. MUL a2, b1, t2
  989. ADD c07, t3, c07
  990. MUL a3, b1, t3
  991. ADD c08, t4, c08
  992. unop
  993. MUL a4, b1, t4
  994. LD b1, 0 * SIZE(BO)
  995. ADD c01, t1, c01
  996. unop
  997. MUL a1, b2, t1
  998. LD a1, 0 * SIZE(AO)
  999. ADD c02, t2, c02
  1000. unop
  1001. MUL a2, b2, t2
  1002. LD a2, 1 * SIZE(AO)
  1003. ADD c03, t3, c03
  1004. unop
  1005. MUL a3, b2, t3
  1006. LD a3, 2 * SIZE(AO)
  1007. ADD c04, t4, c04
  1008. MUL a4, b2, t4
  1009. LD a4, 3 * SIZE(AO)
  1010. lda AO, 4 * SIZE(AO)
  1011. ADD c05, t1, c05
  1012. LD b2, 1 * SIZE(BO)
  1013. MUL a1, b1, t1
  1014. lda BO, 2 * SIZE(BO)
  1015. .align 4
  /*
   * $L57 - final step of the 4x2 product and pipeline drain.
   * Finishes the remaining a1-a4 by b1/b2 products, advances AO by
   * 4 * SIZE and BO by 2 * SIZE, and adds the last pending t1-t4 into
   * c05-c08 so that c01-c08 hold the complete sums.
   */
  1016. $L57:
  1017. ADD c06, t2, c06
  1018. MUL a2, b1, t2
  1019. ADD c07, t3, c07
  1020. MUL a3, b1, t3
  1021. ADD c08, t4, c08
  1022. MUL a4, b1, t4
  1023. ADD c01, t1, c01
  1024. MUL a1, b2, t1
  1025. ADD c02, t2, c02
  1026. MUL a2, b2, t2
  1027. ADD c03, t3, c03
  1028. MUL a3, b2, t3
  1029. ADD c04, t4, c04
  1030. lda AO, 4 * SIZE(AO)
  1031. MUL a4, b2, t4
  1032. lda BO, 2 * SIZE(BO)
  1033. ADD c05, t1, c05
  1034. ADD c06, t2, c06
  1035. ADD c07, t3, c07
  1036. ADD c08, t4, c08
  1037. .align 4
  /*
   * $L58 - solve, store and bookkeeping for the 4x2 tile.
   * c01-c04 are the four rows of the first column and c05-c08 the four
   * rows of the second (see the stores to C1 and C2 below).
   */
  1038. $L58:
  /* Position AO/BO on the 4x2 block to be solved.  LN/RT recompute them
     from AORIG and B using KK minus the tile extent (4 for LN, 2 for RT);
     LT/RN simply undo the pre-advance applied during set-up. */
  1039. #if defined(LN) || defined(RT)
  1040. #ifdef LN
  1041. subq KK, 4, TMP1
  1042. #else
  1043. subq KK, 2, TMP1
  1044. #endif
  1045. sll TMP1, BASE_SHIFT + 2, TMP2
  1046. addq AORIG, TMP2, AO
  1047. sll TMP1, BASE_SHIFT + 1, TMP2
  1048. addq B, TMP2, BO
  1049. #else
  1050. lda AO, -4 * SIZE(AO)
  1051. lda BO, -2 * SIZE(BO)
  1052. #endif
  /* cXX = (stored value) - cXX, assuming SUB, like ADD and MUL, writes its
     last operand.  LN/LT read the eight values from BO with the two columns
     interleaved (c01,c05,c02,c06,...); RN/RT read them from AO column by
     column (c01..c04 then c05..c08). */
  1053. #if defined(LN) || defined(LT)
  1054. LD a1, 0 * SIZE(BO)
  1055. LD a2, 1 * SIZE(BO)
  1056. LD a3, 2 * SIZE(BO)
  1057. LD a4, 3 * SIZE(BO)
  1058. LD b1, 4 * SIZE(BO)
  1059. LD b2, 5 * SIZE(BO)
  1060. LD b3, 6 * SIZE(BO)
  1061. LD b4, 7 * SIZE(BO)
  1062. SUB a1, c01, c01
  1063. SUB a2, c05, c05
  1064. SUB a3, c02, c02
  1065. SUB a4, c06, c06
  1066. SUB b1, c03, c03
  1067. SUB b2, c07, c07
  1068. SUB b3, c04, c04
  1069. SUB b4, c08, c08
  1070. #else
  1071. LD a1, 0 * SIZE(AO)
  1072. LD a2, 1 * SIZE(AO)
  1073. LD a3, 2 * SIZE(AO)
  1074. LD a4, 3 * SIZE(AO)
  1075. LD b1, 4 * SIZE(AO)
  1076. LD b2, 5 * SIZE(AO)
  1077. LD b3, 6 * SIZE(AO)
  1078. LD b4, 7 * SIZE(AO)
  1079. SUB a1, c01, c01
  1080. SUB a2, c02, c02
  1081. SUB a3, c03, c03
  1082. SUB a4, c04, c04
  1083. SUB b1, c05, c05
  1084. SUB b2, c06, c06
  1085. SUB b3, c07, c07
  1086. SUB b4, c08, c08
  1087. #endif
  /* LN: substitution from row 4 up to row 1, applied to both columns, with
     the 16 coefficients at AO.  Offsets 15, 10, 5 and 0 scale a row; the
     others eliminate it from the rows above.
     NOTE(review): scaling by multiplication suggests the stored diagonal
     is already inverted - confirm against the packing code. */
  1088. #ifdef LN
  1089. LD a1, 15 * SIZE(AO)
  1090. LD a2, 14 * SIZE(AO)
  1091. LD a3, 13 * SIZE(AO)
  1092. LD a4, 12 * SIZE(AO)
  1093. MUL a1, c04, c04
  1094. MUL a1, c08, c08
  1095. MUL a2, c04, t1
  1096. MUL a2, c08, t2
  1097. SUB c03, t1, c03
  1098. SUB c07, t2, c07
  1099. MUL a3, c04, t1
  1100. MUL a3, c08, t2
  1101. SUB c02, t1, c02
  1102. SUB c06, t2, c06
  1103. MUL a4, c04, t1
  1104. MUL a4, c08, t2
  1105. SUB c01, t1, c01
  1106. SUB c05, t2, c05
  1107. LD b1, 10 * SIZE(AO)
  1108. LD b2, 9 * SIZE(AO)
  1109. LD b3, 8 * SIZE(AO)
  1110. MUL b1, c03, c03
  1111. MUL b1, c07, c07
  1112. MUL b2, c03, t1
  1113. MUL b2, c07, t2
  1114. SUB c02, t1, c02
  1115. SUB c06, t2, c06
  1116. MUL b3, c03, t1
  1117. MUL b3, c07, t2
  1118. SUB c01, t1, c01
  1119. SUB c05, t2, c05
  1120. LD a1, 5 * SIZE(AO)
  1121. LD a2, 4 * SIZE(AO)
  1122. LD a3, 0 * SIZE(AO)
  1123. MUL a1, c02, c02
  1124. MUL a1, c06, c06
  1125. MUL a2, c02, t1
  1126. MUL a2, c06, t2
  1127. SUB c01, t1, c01
  1128. SUB c05, t2, c05
  1129. MUL a3, c01, c01
  1130. MUL a3, c05, c05
  1131. #endif
  /* LT: the mirror image - substitution from row 1 down to row 4, using
     AO offsets 0..3, then 5..7, then 10, 11 and finally 15. */
  1132. #ifdef LT
  1133. LD a1, 0 * SIZE(AO)
  1134. LD a2, 1 * SIZE(AO)
  1135. LD a3, 2 * SIZE(AO)
  1136. LD a4, 3 * SIZE(AO)
  1137. MUL a1, c01, c01
  1138. MUL a1, c05, c05
  1139. MUL a2, c01, t1
  1140. MUL a2, c05, t2
  1141. SUB c02, t1, c02
  1142. SUB c06, t2, c06
  1143. MUL a3, c01, t1
  1144. MUL a3, c05, t2
  1145. SUB c03, t1, c03
  1146. SUB c07, t2, c07
  1147. MUL a4, c01, t1
  1148. MUL a4, c05, t2
  1149. SUB c04, t1, c04
  1150. SUB c08, t2, c08
  1151. LD b1, 5 * SIZE(AO)
  1152. LD b2, 6 * SIZE(AO)
  1153. LD b3, 7 * SIZE(AO)
  1154. MUL b1, c02, c02
  1155. MUL b1, c06, c06
  1156. MUL b2, c02, t1
  1157. MUL b2, c06, t2
  1158. SUB c03, t1, c03
  1159. SUB c07, t2, c07
  1160. MUL b3, c02, t1
  1161. MUL b3, c06, t2
  1162. SUB c04, t1, c04
  1163. SUB c08, t2, c08
  1164. LD a1, 10 * SIZE(AO)
  1165. LD a2, 11 * SIZE(AO)
  1166. LD a3, 15 * SIZE(AO)
  1167. MUL a1, c03, c03
  1168. MUL a1, c07, c07
  1169. MUL a2, c03, t1
  1170. MUL a2, c07, t2
  1171. SUB c04, t1, c04
  1172. SUB c08, t2, c08
  1173. MUL a3, c04, c04
  1174. MUL a3, c08, c08
  1175. #endif
  /* RN: 2x2 coefficients at BO (offsets 0, 1, 3).  Scale column 1, remove
     its contribution from column 2, then scale column 2. */
  1176. #ifdef RN
  1177. LD a1, 0 * SIZE(BO)
  1178. LD a2, 1 * SIZE(BO)
  1179. LD a3, 3 * SIZE(BO)
  1180. MUL a1, c01, c01
  1181. MUL a1, c02, c02
  1182. MUL a1, c03, c03
  1183. MUL a1, c04, c04
  1184. MUL a2, c01, t1
  1185. MUL a2, c02, t2
  1186. MUL a2, c03, t3
  1187. MUL a2, c04, t4
  1188. SUB c05, t1, c05
  1189. SUB c06, t2, c06
  1190. SUB c07, t3, c07
  1191. SUB c08, t4, c08
  1192. MUL a3, c05, c05
  1193. MUL a3, c06, c06
  1194. MUL a3, c07, c07
  1195. MUL a3, c08, c08
  1196. #endif
  /* RT: the reverse order - BO offsets 3, 2, 0.  Scale column 2, remove
     its contribution from column 1, then scale column 1. */
  1197. #ifdef RT
  1198. LD a1, 3 * SIZE(BO)
  1199. LD a2, 2 * SIZE(BO)
  1200. LD a3, 0 * SIZE(BO)
  1201. MUL a1, c05, c05
  1202. MUL a1, c06, c06
  1203. MUL a1, c07, c07
  1204. MUL a1, c08, c08
  1205. MUL a2, c05, t1
  1206. MUL a2, c06, t2
  1207. MUL a2, c07, t3
  1208. MUL a2, c08, t4
  1209. SUB c01, t1, c01
  1210. SUB c02, t2, c02
  1211. SUB c03, t3, c03
  1212. SUB c04, t4, c04
  1213. MUL a3, c01, c01
  1214. MUL a3, c02, c02
  1215. MUL a3, c03, c03
  1216. MUL a3, c04, c04
  1217. #endif
  /* Write the solved values back over the buffer they were loaded from,
     in the same element order as the loads above. */
  1218. #if defined(LN) || defined(LT)
  1219. ST c01, 0 * SIZE(BO)
  1220. ST c05, 1 * SIZE(BO)
  1221. ST c02, 2 * SIZE(BO)
  1222. ST c06, 3 * SIZE(BO)
  1223. ST c03, 4 * SIZE(BO)
  1224. ST c07, 5 * SIZE(BO)
  1225. ST c04, 6 * SIZE(BO)
  1226. ST c08, 7 * SIZE(BO)
  1227. #else
  1228. ST c01, 0 * SIZE(AO)
  1229. ST c02, 1 * SIZE(AO)
  1230. ST c03, 2 * SIZE(AO)
  1231. ST c04, 3 * SIZE(AO)
  1232. ST c05, 4 * SIZE(AO)
  1233. ST c06, 5 * SIZE(AO)
  1234. ST c07, 6 * SIZE(AO)
  1235. ST c08, 7 * SIZE(AO)
  1236. #endif
  /* Store to C: LN steps C1/C2 back four elements before the stores, every
     other variant steps them forward four elements afterwards. */
  1237. #ifdef LN
  1238. lda C1, -4 * SIZE(C1)
  1239. lda C2, -4 * SIZE(C2)
  1240. #endif
  1241. ST c01, 0 * SIZE(C1)
  1242. ST c02, 1 * SIZE(C1)
  1243. ST c03, 2 * SIZE(C1)
  1244. ST c04, 3 * SIZE(C1)
  1245. ST c05, 0 * SIZE(C2)
  1246. ST c06, 1 * SIZE(C2)
  1247. ST c07, 2 * SIZE(C2)
  1248. ST c08, 3 * SIZE(C2)
  1249. #ifndef LN
  1250. lda C1, 4 * SIZE(C1)
  1251. lda C2, 4 * SIZE(C2)
  1252. #endif
  /* t1-t4 must be zero on entry to the next tile, whose loops add them
     before the first multiply. */
  1253. fclr t1
  1254. fclr t2
  1255. fclr t3
  1256. fclr t4
  /* RT: advance AORIG by (K << (2 + BASE_SHIFT)) bytes. */
  1257. #ifdef RT
  1258. sll K, 2 + BASE_SHIFT, TMP1
  1259. addq AORIG, TMP1, AORIG
  1260. #endif
  /* LT/RN: skip the remaining (K - KK) steps of both operands. */
  1261. #if defined(LT) || defined(RN)
  1262. subq K, KK, TMP1
  1263. sll TMP1, BASE_SHIFT + 2, TMP2
  1264. addq AO, TMP2, AO
  1265. sll TMP1, BASE_SHIFT + 1, TMP2
  1266. addq BO, TMP2, BO
  1267. #endif
  /* KK moves by the tile height (4): up for LT, down for LN. */
  1268. #ifdef LT
  1269. addq KK, 4, KK
  1270. #endif
  1271. #ifdef LN
  1272. subq KK, 4, KK
  1273. #endif
  /* Next four-row tile, if any. */
  1274. lda I, -1(I)
  1275. bgt I, $L51
  1276. .align 4
  /*
   * $L60 - set-up for the two-row by two-column tile.
   * Runs only when bit 1 of M is set; otherwise control goes to $L70.
   * Preloads a1-a4 and b1-b4 (two steps of two values each), zeroes
   * c01, c02, c05, c06, and pre-advances AO and BO by 2 * SIZE each.
   * L = length - 2.  A length <= 0 skips the product ($L68); L <= 0 skips
   * the unrolled loop ($L65).
   */
  1277. $L60:
  1278. and M, 2, I
  1279. ble I, $L70
  1280. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, B is read from its start, length is KK. */
  1281. LD a1, 0 * SIZE(AO)
  1282. fclr c01
  1283. LD a2, 1 * SIZE(AO)
  1284. fclr c05
  1285. LD a3, 2 * SIZE(AO)
  1286. fclr c02
  1287. LD a4, 3 * SIZE(AO)
  1288. fclr c06
  1289. LD b1, 0 * SIZE(B)
  1290. lda L, -2(KK)
  1291. LD b2, 1 * SIZE(B)
  1292. lda AO, 2 * SIZE(AO)
  1293. LD b3, 2 * SIZE(B)
  1294. LD b4, 3 * SIZE(B)
  1295. lda BO, 2 * SIZE(B)
  1296. ble KK, $L68
  1297. ble L, $L65
  1298. #else
  /* LN/RT: AO and BO both start (KK << (BASE_SHIFT + 1)) bytes into AORIG
     and B; length is TMP1 = K - KK.  LN first steps AORIG back by
     (K << (BASE_SHIFT + 1)) bytes. */
  1299. #ifdef LN
  1300. sll K, BASE_SHIFT + 1, TMP1
  1301. subq AORIG, TMP1, AORIG
  1302. #endif
  1303. sll KK, BASE_SHIFT + 1, TMP1
  1304. addq AORIG, TMP1, AO
  1305. sll KK, BASE_SHIFT + 1, TMP1
  1306. addq B, TMP1, BO
  1307. subq K, KK, TMP1
  1308. LD a1, 0 * SIZE(AO)
  1309. fclr c01
  1310. LD a2, 1 * SIZE(AO)
  1311. fclr c05
  1312. LD a3, 2 * SIZE(AO)
  1313. fclr c02
  1314. LD a4, 3 * SIZE(AO)
  1315. fclr c06
  1316. LD b1, 0 * SIZE(BO)
  1317. lda L, -2(TMP1)
  1318. LD b2, 1 * SIZE(BO)
  1319. lda AO, 2 * SIZE(AO)
  1320. LD b3, 2 * SIZE(BO)
  1321. LD b4, 3 * SIZE(BO)
  1322. lda BO, 2 * SIZE(BO)
  1323. ble TMP1, $L68
  1324. ble L, $L65
  1325. #endif
  1326. .align 4
  /*
   * $L62 - main loop of the 2x2 tile, two steps per trip (L -= 2).
   * First step uses a1/a2 with b1/b2, second step uses a3/a4 with b3/b4.
   * Accumulators: c01 and c02 collect the b1/b3 products (rows 1 and 2),
   * c05 and c06 collect the b2/b4 products.  Each MUL result in t1-t4 is
   * added by the ADD one group later.  AO and BO each advance by 4 * SIZE
   * per trip; offsets after an lda are relative to the updated pointer.
   */
  1327. $L62:
  1328. ADD c01, t1, c01
  1329. unop
  1330. MUL a1, b1, t1
  1331. unop
  1332. ADD c02, t2, c02
  1333. lda AO, 4 * SIZE(AO)
  1334. MUL a2, b1, t2
  1335. LD b1, 2 * SIZE(BO)
  1336. ADD c05, t3, c05
  1337. lda L, -2(L)
  1338. MUL a1, b2, t3
  1339. LD a1, -2 * SIZE(AO)
  1340. ADD c06, t4, c06
  1341. unop
  1342. MUL a2, b2, t4
  1343. LD a2, -1 * SIZE(AO)
  1344. ADD c01, t1, c01
  1345. LD b2, 3 * SIZE(BO)
  1346. MUL a3, b3, t1
  1347. lda BO, 4 * SIZE(BO)
  1348. ADD c02, t2, c02
  1349. unop
  1350. MUL a4, b3, t2
  1351. LD b3, 0 * SIZE(BO)
  1352. ADD c05, t3, c05
  1353. unop
  1354. MUL a3, b4, t3
  1355. LD a3, 0 * SIZE(AO)
  1356. ADD c06, t4, c06
  1357. MUL a4, b4, t4
  1358. LD b4, 1 * SIZE(BO)
  1359. unop
  1360. LD a4, 1 * SIZE(AO)
  1361. unop
  1362. unop
  1363. bgt L, $L62
  1364. .align 4
  /*
   * $L65 - tail of the 2x2 product.
   * One or two steps remain.  An odd length (KK, or TMP1 for LN/RT) means
   * only the final step is left, handled at $L67.  Otherwise the code
   * below performs one extra step, reloads a1/a2 and b1/b2 for the last
   * one, advances AO and BO by 2 * SIZE each, and falls into $L67.
   */
  1365. $L65:
  1366. ADD c01, t1, c01
  1367. MUL a1, b1, t1
  1368. #if defined(LT) || defined(RN)
  1369. blbs KK, $L67
  1370. #else
  1371. blbs TMP1, $L67
  1372. #endif
  1373. .align 4
  1374. ADD c02, t2, c02
  1375. unop
  1376. MUL a2, b1, t2
  1377. LD b1, 0 * SIZE(BO)
  1378. ADD c05, t3, c05
  1379. lda BO, 2 * SIZE(BO)
  1380. MUL a1, b2, t3
  1381. LD a1, 0 * SIZE(AO)
  1382. ADD c06, t4, c06
  1383. unop
  1384. MUL a2, b2, t4
  1385. LD a2, 1 * SIZE(AO)
  1386. ADD c01, t1, c01
  1387. LD b2, -1 * SIZE(BO)
  1388. MUL a1, b1, t1
  1389. lda AO, 2 * SIZE(AO)
  1390. .align 4
  /*
   * $L67 - final step of the 2x2 product and pipeline drain.
   * Completes the last a1/a2 by b1/b2 products, advances AO and BO by
   * 2 * SIZE each, and folds the pending t1-t4 into c01, c02, c05, c06.
   */
  1391. $L67:
  1392. ADD c02, t2, c02
  1393. MUL a2, b1, t2
  1394. ADD c05, t3, c05
  1395. MUL a1, b2, t3
  1396. ADD c06, t4, c06
  1397. lda AO, 2 * SIZE(AO)
  1398. MUL a2, b2, t4
  1399. lda BO, 2 * SIZE(BO)
  1400. ADD c01, t1, c01
  1401. ADD c02, t2, c02
  1402. ADD c05, t3, c05
  1403. ADD c06, t4, c06
  1404. .align 4
  /*
   * $L68 - solve, store and bookkeeping for the 2x2 tile.
   * c01/c02 are the two rows of the first column, c05/c06 the two rows of
   * the second (see the stores to C1 and C2 below).
   */
  1405. $L68:
  /* Position AO/BO on the 2x2 block to be solved.  LN/RT recompute them
     from AORIG and B at step KK - 2 (both arms of the inner #ifdef are
     identical here); LT/RN undo the pre-advance from set-up. */
  1406. #if defined(LN) || defined(RT)
  1407. #ifdef LN
  1408. subq KK, 2, TMP1
  1409. #else
  1410. subq KK, 2, TMP1
  1411. #endif
  1412. sll TMP1, BASE_SHIFT + 1, TMP2
  1413. addq AORIG, TMP2, AO
  1414. sll TMP1, BASE_SHIFT + 1, TMP2
  1415. addq B, TMP2, BO
  1416. #else
  1417. lda AO, -2 * SIZE(AO)
  1418. lda BO, -2 * SIZE(BO)
  1419. #endif
  /* cXX = (stored value) - cXX, assuming SUB, like ADD and MUL, writes its
     last operand.  LN/LT read from BO with columns interleaved
     (c01,c05,c02,c06); RN/RT read from AO column by column. */
  1420. #if defined(LN) || defined(LT)
  1421. LD a1, 0 * SIZE(BO)
  1422. LD a2, 1 * SIZE(BO)
  1423. LD a3, 2 * SIZE(BO)
  1424. LD a4, 3 * SIZE(BO)
  1425. SUB a1, c01, c01
  1426. SUB a2, c05, c05
  1427. SUB a3, c02, c02
  1428. SUB a4, c06, c06
  1429. #else
  1430. LD a1, 0 * SIZE(AO)
  1431. LD a2, 1 * SIZE(AO)
  1432. LD a3, 2 * SIZE(AO)
  1433. LD a4, 3 * SIZE(AO)
  1434. SUB a1, c01, c01
  1435. SUB a2, c02, c02
  1436. SUB a3, c05, c05
  1437. SUB a4, c06, c06
  1438. #endif
  /* LN: coefficients at AO offsets 3, 2, 0 - scale row 2, eliminate it
     from row 1, scale row 1 (both columns).
     NOTE(review): scaling by multiplication suggests the stored diagonal
     is already inverted - confirm against the packing code. */
  1439. #ifdef LN
  1440. LD a1, 3 * SIZE(AO)
  1441. LD a2, 2 * SIZE(AO)
  1442. LD a3, 0 * SIZE(AO)
  1443. MUL a1, c02, c02
  1444. MUL a1, c06, c06
  1445. MUL a2, c02, t1
  1446. MUL a2, c06, t2
  1447. SUB c01, t1, c01
  1448. SUB c05, t2, c05
  1449. MUL a3, c01, c01
  1450. MUL a3, c05, c05
  1451. #endif
  /* LT: coefficients at AO offsets 0, 1, 3 - scale row 1, eliminate it
     from row 2, scale row 2. */
  1452. #ifdef LT
  1453. LD a1, 0 * SIZE(AO)
  1454. LD a2, 1 * SIZE(AO)
  1455. LD a3, 3 * SIZE(AO)
  1456. MUL a1, c01, c01
  1457. MUL a1, c05, c05
  1458. MUL a2, c01, t1
  1459. MUL a2, c05, t2
  1460. SUB c02, t1, c02
  1461. SUB c06, t2, c06
  1462. MUL a3, c02, c02
  1463. MUL a3, c06, c06
  1464. #endif
  /* RN: coefficients at BO offsets 0, 1, 3 - scale column 1, eliminate it
     from column 2, scale column 2. */
  1465. #ifdef RN
  1466. LD a1, 0 * SIZE(BO)
  1467. LD a2, 1 * SIZE(BO)
  1468. LD a3, 3 * SIZE(BO)
  1469. MUL a1, c01, c01
  1470. MUL a1, c02, c02
  1471. MUL a2, c01, t1
  1472. MUL a2, c02, t2
  1473. SUB c05, t1, c05
  1474. SUB c06, t2, c06
  1475. MUL a3, c05, c05
  1476. MUL a3, c06, c06
  1477. #endif
  /* RT: coefficients at BO offsets 3, 2, 0 - scale column 2, eliminate it
     from column 1, scale column 1. */
  1478. #ifdef RT
  1479. LD a1, 3 * SIZE(BO)
  1480. LD a2, 2 * SIZE(BO)
  1481. LD a3, 0 * SIZE(BO)
  1482. MUL a1, c05, c05
  1483. MUL a1, c06, c06
  1484. MUL a2, c05, t1
  1485. MUL a2, c06, t2
  1486. SUB c01, t1, c01
  1487. SUB c02, t2, c02
  1488. MUL a3, c01, c01
  1489. MUL a3, c02, c02
  1490. #endif
  /* Write the solved values back over the buffer they were loaded from. */
  1491. #if defined(LN) || defined(LT)
  1492. ST c01, 0 * SIZE(BO)
  1493. ST c05, 1 * SIZE(BO)
  1494. ST c02, 2 * SIZE(BO)
  1495. ST c06, 3 * SIZE(BO)
  1496. #else
  1497. ST c01, 0 * SIZE(AO)
  1498. ST c02, 1 * SIZE(AO)
  1499. ST c05, 2 * SIZE(AO)
  1500. ST c06, 3 * SIZE(AO)
  1501. #endif
  /* Store to C: LN steps C1/C2 back two elements first, every other
     variant steps them forward two elements afterwards. */
  1502. #ifdef LN
  1503. lda C1, -2 * SIZE(C1)
  1504. lda C2, -2 * SIZE(C2)
  1505. #endif
  1506. ST c01, 0 * SIZE(C1)
  1507. ST c02, 1 * SIZE(C1)
  1508. ST c05, 0 * SIZE(C2)
  1509. ST c06, 1 * SIZE(C2)
  1510. #ifndef LN
  1511. lda C1, 2 * SIZE(C1)
  1512. lda C2, 2 * SIZE(C2)
  1513. #endif
  /* t1-t4 must be zero on entry to the next tile. */
  1514. fclr t1
  1515. fclr t2
  1516. fclr t3
  1517. fclr t4
  /* RT: advance AORIG by (K << (1 + BASE_SHIFT)) bytes. */
  1518. #ifdef RT
  1519. sll K, 1 + BASE_SHIFT, TMP1
  1520. addq AORIG, TMP1, AORIG
  1521. #endif
  /* LT/RN: skip the remaining (K - KK) steps of both operands. */
  1522. #if defined(LT) || defined(RN)
  1523. subq K, KK, TMP1
  1524. sll TMP1, BASE_SHIFT + 1, TMP2
  1525. addq AO, TMP2, AO
  1526. sll TMP1, BASE_SHIFT + 1, TMP2
  1527. addq BO, TMP2, BO
  1528. #endif
  /* KK moves by the tile height (2): up for LT, down for LN. */
  1529. #ifdef LT
  1530. addq KK, 2, KK
  1531. #endif
  1532. #ifdef LN
  1533. subq KK, 2, KK
  1534. #endif
  1535. .align 4
  /*
   * $L70 - set-up for the one-row by two-column tile.
   * Runs only when M is odd; otherwise control goes to $L79.
   * Preloads a1/a2 (two steps of A) and b1-b4 (two steps of two B values),
   * zeroes c01, c02, c05, c06, and pre-advances AO by 1 * SIZE and BO by
   * 2 * SIZE.  L = length - 2.  A length <= 0 skips the product ($L78);
   * L <= 0 skips the unrolled loop ($L75).
   */
  1536. $L70:
  1537. and M, 1, I
  1538. ble I, $L79
  1539. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, B is read from its start, length is KK. */
  1540. LD a1, 0 * SIZE(AO)
  1541. fclr c01
  1542. LD a2, 1 * SIZE(AO)
  1543. fclr c05
  1544. LD b1, 0 * SIZE(B)
  1545. fclr c02
  1546. LD b2, 1 * SIZE(B)
  1547. fclr c06
  1548. lda L, -2(KK)
  1549. LD b3, 2 * SIZE(B)
  1550. lda AO, 1 * SIZE(AO)
  1551. LD b4, 3 * SIZE(B)
  1552. lda BO, 2 * SIZE(B)
  1553. ble KK, $L78
  1554. ble L, $L75
  1555. #else
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 1));
     length is TMP1 = K - KK.  LN first steps AORIG back by
     (K << BASE_SHIFT) bytes. */
  1556. #ifdef LN
  1557. sll K, BASE_SHIFT + 0, TMP1
  1558. subq AORIG, TMP1, AORIG
  1559. #endif
  1560. sll KK, BASE_SHIFT + 0, TMP1
  1561. addq AORIG, TMP1, AO
  1562. sll KK, BASE_SHIFT + 1, TMP1
  1563. addq B, TMP1, BO
  1564. subq K, KK, TMP1
  1565. LD a1, 0 * SIZE(AO)
  1566. fclr c01
  1567. LD a2, 1 * SIZE(AO)
  1568. fclr c05
  1569. LD b1, 0 * SIZE(BO)
  1570. fclr c02
  1571. LD b2, 1 * SIZE(BO)
  1572. fclr c06
  1573. lda L, -2(TMP1)
  1574. LD b3, 2 * SIZE(BO)
  1575. lda AO, 1 * SIZE(AO)
  1576. LD b4, 3 * SIZE(BO)
  1577. lda BO, 2 * SIZE(BO)
  1578. ble TMP1, $L78
  1579. ble L, $L75
  1580. #endif
  1581. .align 4
  /*
   * $L72 - main loop of the 1x2 tile, two steps per trip (L -= 2).
   * The first step (a1 with b1/b2) accumulates into c01/c05 and the second
   * (a2 with b3/b4) into the separate pair c02/c06; the pairs are merged
   * after the loop.  Each MUL result in t1-t4 is added on the next trip.
   * AO advances by 2 * SIZE and BO by 4 * SIZE per trip.
   */
  1582. $L72:
  1583. ADD c01, t1, c01
  1584. lda L, -2(L)
  1585. MUL a1, b1, t1
  1586. LD b1, 2 * SIZE(BO)
  1587. ADD c05, t2, c05
  1588. MUL a1, b2, t2
  1589. LD a1, 1 * SIZE(AO)
  1590. LD b2, 3 * SIZE(BO)
  1591. ADD c02, t3, c02
  1592. lda AO, 2 * SIZE(AO)
  1593. MUL a2, b3, t3
  1594. LD b3, 4 * SIZE(BO)
  1595. ADD c06, t4, c06
  1596. MUL a2, b4, t4
  1597. LD a2, 0 * SIZE(AO)
  1598. LD b4, 5 * SIZE(BO)
  1599. lda BO, 4 * SIZE(BO)
  1600. unop
  1601. unop
  1602. bgt L, $L72
  1603. .align 4
  /*
   * $L75 - tail of the 1x2 product.
   * One or two steps remain.  An odd length (KK, or TMP1 for LN/RT) means
   * only the final step is left, handled at $L77.  Otherwise the code
   * below performs one extra step on the c01/c05 chain, reloads a1, b1, b2
   * for the last one, advances AO by 1 * SIZE and BO by 2 * SIZE, and
   * falls into $L77.
   */
  1604. $L75:
  1605. ADD c01, t1, c01
  1606. MUL a1, b1, t1
  1607. #if defined(LT) || defined(RN)
  1608. blbs KK, $L77
  1609. #else
  1610. blbs TMP1, $L77
  1611. #endif
  1612. .align 4
  1613. ADD c05, t2, c05
  1614. MUL a1, b2, t2
  1615. LD a1, 0 * SIZE(AO)
  1616. LD b1, 0 * SIZE(BO)
  1617. ADD c01, t1, c01
  1618. LD b2, 1 * SIZE(BO)
  1619. lda AO, 1 * SIZE(AO)
  1620. MUL a1, b1, t1
  1621. lda BO, 2 * SIZE(BO)
  1622. .align 4
  /*
   * $L77 - final step of the 1x2 product and pipeline drain.
   * Folds the pending t3/t4 into c02/c06, merges the second accumulator
   * pair into the first (c01 += c02, c05 += c06), advances AO by 1 * SIZE
   * and BO by 2 * SIZE, and adds the last two products so that c01 and c05
   * hold the complete sums for the two columns.
   */
  1623. $L77:
  1624. ADD c05, t2, c05
  1625. MUL a1, b2, t2
  1626. ADD c02, t3, c02
  1627. ADD c06, t4, c06
  1628. ADD c01, c02, c01
  1629. lda AO, 1 * SIZE(AO)
  1630. ADD c05, c06, c05
  1631. lda BO, 2 * SIZE(BO)
  1632. ADD c01, t1, c01
  1633. ADD c05, t2, c05
  1634. .align 4
  /*
   * $L78 - solve, store and bookkeeping for the 1x2 tile.
   * c01 is the element for the first column (C1) and c05 the element for
   * the second column (C2).
   */
  1635. $L78:
  /* Position AO/BO on the block to be solved.  LN/RT recompute them from
     AORIG and B using KK minus the tile extent (1 for LN, 2 for RT);
     LT/RN undo the pre-advance from set-up. */
  1636. #if defined(LN) || defined(RT)
  1637. #ifdef LN
  1638. subq KK, 1, TMP1
  1639. #else
  1640. subq KK, 2, TMP1
  1641. #endif
  1642. sll TMP1, BASE_SHIFT + 0, TMP2
  1643. addq AORIG, TMP2, AO
  1644. sll TMP1, BASE_SHIFT + 1, TMP2
  1645. addq B, TMP2, BO
  1646. #else
  1647. lda AO, -1 * SIZE(AO)
  1648. lda BO, -2 * SIZE(BO)
  1649. #endif
  /* cXX = (stored value) - cXX, assuming SUB, like ADD and MUL, writes its
     last operand.  The two values come from BO (LN/LT) or AO (RN/RT). */
  1650. #if defined(LN) || defined(LT)
  1651. LD a1, 0 * SIZE(BO)
  1652. LD a2, 1 * SIZE(BO)
  1653. SUB a1, c01, c01
  1654. SUB a2, c05, c05
  1655. #else
  1656. LD a1, 0 * SIZE(AO)
  1657. LD a2, 1 * SIZE(AO)
  1658. SUB a1, c01, c01
  1659. SUB a2, c05, c05
  1660. #endif
  /* LN/LT: a single coefficient at AO[0] scales both columns.
     NOTE(review): scaling by multiplication suggests the stored diagonal
     is already inverted - confirm against the packing code. */
  1661. #if defined(LN) || defined(LT)
  1662. LD a1, 0 * SIZE(AO)
  1663. MUL a1, c01, c01
  1664. MUL a1, c05, c05
  1665. #endif
  /* RN: coefficients at BO offsets 0, 1, 3 - scale column 1, eliminate it
     from column 2, scale column 2. */
  1666. #ifdef RN
  1667. LD a1, 0 * SIZE(BO)
  1668. LD a2, 1 * SIZE(BO)
  1669. LD a3, 3 * SIZE(BO)
  1670. MUL a1, c01, c01
  1671. MUL a2, c01, t1
  1672. SUB c05, t1, c05
  1673. MUL a3, c05, c05
  1674. #endif
  /* RT: coefficients at BO offsets 3, 2, 0 - scale column 2, eliminate it
     from column 1, scale column 1. */
  1675. #ifdef RT
  1676. LD a1, 3 * SIZE(BO)
  1677. LD a2, 2 * SIZE(BO)
  1678. LD a3, 0 * SIZE(BO)
  1679. MUL a1, c05, c05
  1680. MUL a2, c05, t1
  1681. SUB c01, t1, c01
  1682. MUL a3, c01, c01
  1683. #endif
  /* Write the solved values back over the buffer they were loaded from. */
  1684. #if defined(LN) || defined(LT)
  1685. ST c01, 0 * SIZE(BO)
  1686. ST c05, 1 * SIZE(BO)
  1687. #else
  1688. ST c01, 0 * SIZE(AO)
  1689. ST c05, 1 * SIZE(AO)
  1690. #endif
  /* Store to C.  LN steps C1/C2 back one element first.
     NOTE(review): unlike the four- and two-row tiles, the non-LN variants
     do not step C1/C2 forward after the store.  No later read of C1/C2 is
     visible before they are reloaded from C, so this looks harmless -
     confirm. */
  1691. #ifdef LN
  1692. lda C1, -1 * SIZE(C1)
  1693. lda C2, -1 * SIZE(C2)
  1694. #endif
  1695. ST c01, 0 * SIZE(C1)
  1696. ST c05, 0 * SIZE(C2)
  1697. fclr t1
  1698. fclr t2
  1699. fclr t3
  1700. fclr t4
  /* RT: advance AORIG by (K << BASE_SHIFT) bytes. */
  1701. #ifdef RT
  1702. sll K, 0 + BASE_SHIFT, TMP1
  1703. addq AORIG, TMP1, AORIG
  1704. #endif
  /* LT/RN: skip the remaining (K - KK) steps of both operands. */
  1705. #if defined(LT) || defined(RN)
  1706. subq K, KK, TMP1
  1707. sll TMP1, BASE_SHIFT + 0, TMP2
  1708. addq AO, TMP2, AO
  1709. sll TMP1, BASE_SHIFT + 1, TMP2
  1710. addq BO, TMP2, BO
  1711. #endif
  /* KK moves by the tile height (1): up for LT, down for LN. */
  1712. #ifdef LT
  1713. addq KK, 1, KK
  1714. #endif
  1715. #ifdef LN
  1716. subq KK, 1, KK
  1717. #endif
  1718. .align 4
  /*
   * $L79 - end of the two-column pass.
   * Updates B and KK before falling through to the four-column test at $L80.
   */
  1719. $L79:
  /* LN: advance B by (K << (1 + BASE_SHIFT)) bytes. */
  1720. #ifdef LN
  1721. sll K, 1 + BASE_SHIFT, TMP1
  1722. addq B, TMP1, B
  1723. #endif
  /* LT/RN: BO already points past the data consumed in this pass. */
  1724. #if defined(LT) || defined(RN)
  1725. mov BO, B
  1726. #endif
  /* KK moves by the pass width (2): up for RN, down for RT. */
  1727. #ifdef RN
  1728. addq KK, 2, KK
  1729. #endif
  1730. #ifdef RT
  1731. subq KK, 2, KK
  1732. #endif
  1733. .align 4
  /*
   * $L80 - entry to the four-column passes.
   * J = N / 4 passes; with none to do, control goes to $L999 (outside this
   * part of the file).
   */
  1734. $L80:
  1735. sra N, 2, J
  1736. ble J, $L999
  1737. .align 4
  /*
   * $L01 - set-up for one four-column pass.
   * Establishes C1..C4 (four consecutive columns of C, LDC apart), the
   * starting KK for the LN/LT variants, the A base pointer, and the count
   * I of four-row tiles.
   */
  1738. $L01:
  /* RT: step B back by (K << (2 + BASE_SHIFT)) bytes and C back by 4 * LDC
     (s4addq computes 4 * LDC + 0) before using them. */
  1739. #ifdef RT
  1740. sll K, 2 + BASE_SHIFT, TMP1
  1741. subq B, TMP1, B
  1742. s4addq LDC, 0, TMP1
  1743. subq C, TMP1, C
  1744. #endif
  1745. mov C, C1
  1746. addq C, LDC, C2
  1747. addq C2, LDC, C3
  /* Non-RT: leave C pointing 4 * LDC further on for the next pass. */
  1748. #ifndef RT
  1749. s4addq LDC, C, C
  1750. #endif
  1751. fclr t1
  1752. addq C3, LDC, C4
  1753. fclr t2
  /* KK is (re)initialised here only for LN (M + OFFSET) and LT (OFFSET). */
  1754. #ifdef LN
  1755. addq M, OFFSET, KK
  1756. #endif
  1757. #ifdef LT
  1758. mov OFFSET, KK
  1759. #endif
  1760. #if defined(LN) || defined(RT)
  1761. mov A, AORIG
  1762. #else
  1763. mov A, AO
  1764. #endif
  /* I = M / 4 four-row tiles; none means go straight to $L20. */
  1765. sra M, 2, I
  1766. fclr t3
  1767. fclr t4
  1768. ble I, $L20
  1769. .align 4
  /*
   * $L11 - set-up for one four-row by four-column tile.
   * Preloads a1-a4 and b1-b4 (one step of each operand), zeroes all
   * sixteen accumulators c01-c16, and pre-advances AO and BO by 4 * SIZE.
   * The four "lds $f31, ..." instructions load from the C1..C4 columns
   * into $f31; the loaded value is never used.
   * NOTE(review): presumably these act as prefetches of the C tile - confirm
   * against the Alpha architecture manual.
   * L = length - 2.  A length <= 0 skips the product ($L18); L <= 0 skips
   * the unrolled loop ($L15).
   */
  1770. $L11:
  1771. #if defined(LT) || defined(RN)
  /* LT/RN: AO is used as it stands, B is read from its start, length is KK. */
  1772. LD a1, 0 * SIZE(AO)
  1773. fclr c11
  1774. LD a2, 1 * SIZE(AO)
  1775. fclr c12
  1776. LD a3, 2 * SIZE(AO)
  1777. fclr c16
  1778. LD a4, 3 * SIZE(AO)
  1779. fclr c15
  1780. LD b1, 0 * SIZE(B)
  1781. fclr c01
  1782. LD b2, 1 * SIZE(B)
  1783. fclr c02
  1784. LD b3, 2 * SIZE(B)
  1785. fclr c06
  1786. LD b4, 3 * SIZE(B)
  1787. fclr c05
  1788. lds $f31, 4 * SIZE(C1)
  1789. fclr c03
  1790. lda L, -2(KK)
  1791. fclr c04
  1792. lds $f31, 7 * SIZE(C2)
  1793. fclr c08
  1794. lda BO, 4 * SIZE(B)
  1795. fclr c13
  1796. lds $f31, 4 * SIZE(C3)
  1797. fclr c09
  1798. lda AO, 4 * SIZE(AO)
  1799. fclr c10
  1800. lds $f31, 7 * SIZE(C4)
  1801. fclr c14
  1802. fclr c07
  1803. ble KK, $L18
  1804. #else
  /* LN/RT: AO and BO both start (KK << (BASE_SHIFT + 2)) bytes into AORIG
     and B; length is TMP1 = K - KK.  LN first steps AORIG back by
     (K << (BASE_SHIFT + 2)) bytes. */
  1805. #ifdef LN
  1806. sll K, BASE_SHIFT + 2, TMP1
  1807. subq AORIG, TMP1, AORIG
  1808. #endif
  1809. sll KK, BASE_SHIFT + 2, TMP1
  1810. addq AORIG, TMP1, AO
  1811. addq B, TMP1, BO
  1812. subq K, KK, TMP1
  1813. LD a1, 0 * SIZE(AO)
  1814. fclr c11
  1815. LD a2, 1 * SIZE(AO)
  1816. fclr c12
  1817. LD a3, 2 * SIZE(AO)
  1818. fclr c16
  1819. LD a4, 3 * SIZE(AO)
  1820. fclr c15
  1821. LD b1, 0 * SIZE(BO)
  1822. fclr c01
  1823. LD b2, 1 * SIZE(BO)
  1824. fclr c02
  1825. LD b3, 2 * SIZE(BO)
  1826. fclr c06
  1827. LD b4, 3 * SIZE(BO)
  1828. fclr c05
  1829. lds $f31, 4 * SIZE(C1)
  1830. fclr c03
  1831. lda L, -2(TMP1)
  1832. fclr c04
  1833. lds $f31, 7 * SIZE(C2)
  1834. fclr c08
  1835. lda BO, 4 * SIZE(BO)
  1836. fclr c13
  1837. lds $f31, 4 * SIZE(C3)
  1838. fclr c09
  1839. lda AO, 4 * SIZE(AO)
  1840. fclr c10
  1841. lds $f31, 7 * SIZE(C4)
  1842. fclr c14
  1843. fclr c07
  1844. ble TMP1, $L18
  1845. #endif
  1846. ble L, $L15
  1847. .align 5
  /*
   * $L12 - main loop of the 4x4 tile, two steps per trip (L -= 2).
   * The numbered markers 1-8 split the body into eight groups of four
   * ADD/MUL pairs: groups 1-4 form the sixteen products of the first step
   * and groups 5-8 those of the second.  Every MUL writes one of t1-t4 and
   * the ADD that consumes it sits one group later, so the ADDs in group 1
   * fold in products left pending by the previous trip.
   * The second step takes its inputs from a5/a6/b5 (loaded during the
   * first step) together with freshly reloaded a2, a4, b2-b4, which frees
   * a1, a3 and b1 to be reloaded for the following trip.
   * AO and BO each advance by 8 * SIZE per trip; negative offsets after
   * the lda are relative to the updated pointers.
   * Unless EV4 is defined, group 1 also loads from PREFETCHSIZE elements
   * ahead in AO and BO into $31; the loaded value is never used.
   * NOTE(review): presumably prefetches - confirm.
   */
  1848. $L12:
  1849. /* 1 */
  1850. ADD c11, t1, c11
  1851. #ifndef EV4
  1852. ldq $31, PREFETCHSIZE * SIZE(AO)
  1853. #else
  1854. unop
  1855. #endif
  1856. MUL b1, a1, t1
  1857. #ifndef EV4
  1858. ldl $31, PREFETCHSIZE * SIZE(BO)
  1859. #else
  1860. unop
  1861. #endif
  1862. ADD c12, t2, c12
  1863. unop
  1864. MUL b1, a2, t2
  1865. unop
  1866. ADD c16, t3, c16
  1867. unop
  1868. MUL b2, a2, t3
  1869. LD a5, 0 * SIZE(AO)
  1870. ADD c15, t4, c15
  1871. unop
  1872. MUL b2, a1, t4
  1873. LD b5, 0 * SIZE(BO)
  1874. /* 2 */
  1875. ADD c01, t1, c01
  1876. UNOP
  1877. MUL b1, a3, t1
  1878. UNOP
  1879. ADD c02, t2, c02
  1880. UNOP
  1881. MUL b1, a4, t2
  1882. UNOP
  1883. ADD c06, t3, c06
  1884. unop
  1885. MUL b2, a4, t3
  1886. unop
  1887. ADD c05, t4, c05
  1888. unop
  1889. MUL b4, a1, t4
  1890. unop
  1891. /* 3 */
  1892. ADD c03, t1, c03
  1893. unop
  1894. MUL b3, a1, t1
  1895. unop
  1896. ADD c04, t2, c04
  1897. unop
  1898. MUL b3, a2, t2
  1899. unop
  1900. ADD c08, t3, c08
  1901. unop
  1902. MUL b4, a2, t3
  1903. LD a2, 1 * SIZE(AO)
  1904. ADD c13, t4, c13
  1905. unop
  1906. MUL b2, a3, t4
  1907. LD b2, 1 * SIZE(BO)
  1908. /* 4 */
  1909. ADD c09, t1, c09
  1910. unop
  1911. MUL b3, a3, t1
  1912. LD a6, 2 * SIZE(AO)
  1913. ADD c10, t2, c10
  1914. unop
  1915. MUL b3, a4, t2
  1916. LD b3, 2 * SIZE(BO)
  1917. ADD c14, t3, c14
  1918. unop
  1919. MUL b4, a4, t3
  1920. LD a4, 3 * SIZE(AO)
  1921. ADD c07, t4, c07
  1922. unop
  1923. MUL b4, a3, t4
  1924. LD b4, 3 * SIZE(BO)
  1925. /* 5 */
  1926. ADD c11, t1, c11
  1927. unop
  1928. MUL b5, a5, t1
  1929. LD a1, 4 * SIZE(AO)
  1930. ADD c12, t2, c12
  1931. lda L, -2(L)
  1932. MUL b5, a2, t2
  1933. LD b1, 4 * SIZE(BO)
  1934. ADD c16, t3, c16
  1935. unop
  1936. MUL b2, a2, t3
  1937. unop
  1938. ADD c15, t4, c15
  1939. unop
  1940. MUL b2, a5, t4
  1941. unop
  1942. /* 6 */
  1943. ADD c01, t1, c01
  1944. unop
  1945. MUL b5, a6, t1
  1946. unop
  1947. ADD c02, t2, c02
  1948. unop
  1949. MUL b5, a4, t2
  1950. unop
  1951. ADD c06, t3, c06
  1952. unop
  1953. MUL b2, a4, t3
  1954. unop
  1955. ADD c05, t4, c05
  1956. unop
  1957. MUL b4, a5, t4
  1958. unop
  1959. /* 7 */
  1960. ADD c03, t1, c03
  1961. lda AO, 8 * SIZE(AO)
  1962. MUL b3, a5, t1
  1963. unop
  1964. ADD c04, t2, c04
  1965. lda BO, 8 * SIZE(BO)
  1966. MUL b3, a2, t2
  1967. unop
  1968. ADD c08, t3, c08
  1969. unop
  1970. MUL b4, a2, t3
  1971. LD a2, -3 * SIZE(AO)
  1972. ADD c13, t4, c13
  1973. unop
  1974. MUL b2, a6, t4
  1975. LD b2, -3 * SIZE(BO)
  1976. /* 8 */
  1977. ADD c09, t1, c09
  1978. unop
  1979. MUL b3, a6, t1
  1980. LD a3, -2 * SIZE(AO)
  1981. ADD c10, t2, c10
  1982. unop
  1983. MUL b3, a4, t2
  1984. LD b3, -2 * SIZE(BO)
  1985. ADD c14, t3, c14
  1986. unop
  1987. MUL b4, a4, t3
  1988. LD a4, -1 * SIZE(AO)
  1989. ADD c07, t4, c07
  1990. MUL b4, a6, t4
  1991. LD b4, -1 * SIZE(BO)
  1992. bgt L, $L12
  1993. .align 4
  /*
   * $L15 - tail of the 4x4 product.
   * One or two steps remain.  An odd length (KK, or TMP1 for LN/RT) means
   * only the final step is left, handled at $L17.  Otherwise the code
   * below performs one extra step (the same sixteen products as one half
   * of the main loop), reloads a1-a4 / b1-b4 for the last step, advances
   * AO and BO by 4 * SIZE each, and falls into $L17.
   */
  1994. $L15:
  1995. ADD c11, t1, c11
  1996. MUL b1, a1, t1
  1997. #if defined(LT) || defined(RN)
  1998. blbs KK, $L17
  1999. #else
  2000. blbs TMP1, $L17
  2001. #endif
  2002. .align 4
  2003. ADD c12, t2, c12
  2004. MUL b1, a2, t2
  2005. ADD c16, t3, c16
  2006. MUL b2, a2, t3
  2007. ADD c15, t4, c15
  2008. MUL b2, a1, t4
  2009. ADD c01, t1, c01
  2010. MUL b1, a3, t1
  2011. ADD c02, t2, c02
  2012. unop
  2013. MUL b1, a4, t2
  2014. LD b1, 0 * SIZE(BO)
  2015. ADD c06, t3, c06
  2016. MUL b2, a4, t3
  2017. ADD c05, t4, c05
  2018. MUL b4, a1, t4
  2019. ADD c03, t1, c03
  2020. unop
  2021. MUL b3, a1, t1
  2022. LD a1, 0 * SIZE(AO)
  2023. ADD c04, t2, c04
  2024. unop
  2025. MUL b3, a2, t2
  2026. unop
  2027. ADD c08, t3, c08
  2028. unop
  2029. MUL b4, a2, t3
  2030. LD a2, 1 * SIZE(AO)
  2031. ADD c13, t4, c13
  2032. unop
  2033. MUL b2, a3, t4
  2034. LD b2, 1 * SIZE(BO)
  2035. ADD c09, t1, c09
  2036. unop
  2037. MUL b3, a3, t1
  2038. lda AO, 4 * SIZE(AO)
  2039. ADD c10, t2, c10
  2040. unop
  2041. MUL b3, a4, t2
  2042. LD b3, 2 * SIZE(BO)
  2043. ADD c14, t3, c14
  2044. unop
  2045. MUL b4, a4, t3
  2046. LD a4, -1 * SIZE(AO)
  2047. ADD c07, t4, c07
  2048. unop
  2049. MUL b4, a3, t4
  2050. LD a3, -2 * SIZE(AO)
  2051. ADD c11, t1, c11
  2052. LD b4, 3 * SIZE(BO)
  2053. MUL b1, a1, t1
  2054. lda BO, 4 * SIZE(BO)
  2055. .align 4
  /*
   * $L17 - final step of the 4x4 product and pipeline drain.
   * Completes the last sixteen a1-a4 by b1-b4 products with no further
   * loads, advances AO and BO by 4 * SIZE each, and adds the final pending
   * t1-t4 into c11, c12, c16, c15 so that c01-c16 hold the complete sums.
   */
  2056. $L17:
  2057. ADD c12, t2, c12
  2058. MUL b1, a2, t2
  2059. ADD c16, t3, c16
  2060. MUL b2, a2, t3
  2061. ADD c15, t4, c15
  2062. MUL b2, a1, t4
  2063. ADD c01, t1, c01
  2064. MUL b1, a3, t1
  2065. ADD c02, t2, c02
  2066. MUL b1, a4, t2
  2067. ADD c06, t3, c06
  2068. MUL b2, a4, t3
  2069. ADD c05, t4, c05
  2070. MUL b4, a1, t4
  2071. ADD c03, t1, c03
  2072. MUL b3, a1, t1
  2073. ADD c04, t2, c04
  2074. MUL b3, a2, t2
  2075. ADD c08, t3, c08
  2076. MUL b4, a2, t3
  2077. ADD c13, t4, c13
  2078. MUL b2, a3, t4
  2079. ADD c09, t1, c09
  2080. MUL b3, a3, t1
  2081. ADD c10, t2, c10
  2082. MUL b3, a4, t2
  2083. ADD c14, t3, c14
  2084. MUL b4, a4, t3
  2085. ADD c07, t4, c07
  2086. lda AO, 4 * SIZE(AO)
  2087. MUL b4, a3, t4
  2088. lda BO, 4 * SIZE(BO)
  2089. ADD c11, t1, c11
  2090. ADD c12, t2, c12
  2091. ADD c16, t3, c16
  2092. ADD c15, t4, c15
  2093. .align 4
  /*
   * $L18 -- triangular solve and write-back for the 4x4 tile.
   * Steps: (1) point AO/BO at the 4x4 blocks used by the solve,
   * (2) replace each accumulator cNN by (packed value - cNN),
   * (3) run the substitution for the selected variant (LN/LT/RN/RT),
   * (4) store the result to the packed buffer and to C1..C4,
   * (5) update pointers/KK and loop back to $L11 while I > 0.
   * Register layout as used by the stores below: c01..c04 go to C1,
   * c05..c08 to C2, c09..c12 to C3, c13..c16 to C4.
   * SUB/MUL/LD/ST are macros defined outside this excerpt; operands are
   * read in Alpha order (src, src, dst), so `SUB a, c, c` is c = a - c.
   */
  2094. $L18:
  /* LN/RT: AO = AORIG + (KK-4)*4 elements, BO = B + (KK-4)*4 elements
     (shift by BASE_SHIFT + 2).  Both arms of the inner #ifdef are the
     same here because the tile is 4x4.
     LT/RN: step AO and BO back by 4 * SIZE instead. */
  2095. #if defined(LN) || defined(RT)
  2096. #ifdef LN
  2097. subq KK, 4, TMP1
  2098. #else
  2099. subq KK, 4, TMP1
  2100. #endif
  2101. sll TMP1, BASE_SHIFT + 2, TMP2
  2102. addq AORIG, TMP2, AO
  2103. sll TMP1, BASE_SHIFT + 2, TMP2
  2104. addq B, TMP2, BO
  2105. #else
  2106. lda AO, -4 * SIZE(AO)
  2107. lda BO, -4 * SIZE(BO)
  2108. #endif
  /* LN/LT: the 16 right-hand-side values are read from BO, where
     consecutive elements pair with c01, c05, c09, c13, then c02, c06, ...
     Other variants read them from AO in c01, c02, ..., c16 order. */
  2109. #if defined(LN) || defined(LT)
  2110. LD a1, 0 * SIZE(BO)
  2111. LD a2, 1 * SIZE(BO)
  2112. LD a3, 2 * SIZE(BO)
  2113. LD a4, 3 * SIZE(BO)
  2114. LD b1, 4 * SIZE(BO)
  2115. LD b2, 5 * SIZE(BO)
  2116. LD b3, 6 * SIZE(BO)
  2117. LD b4, 7 * SIZE(BO)
  2118. SUB a1, c01, c01
  2119. SUB a2, c05, c05
  2120. SUB a3, c09, c09
  2121. SUB a4, c13, c13
  2122. SUB b1, c02, c02
  2123. SUB b2, c06, c06
  2124. SUB b3, c10, c10
  2125. SUB b4, c14, c14
  2126. LD a1, 8 * SIZE(BO)
  2127. LD a2, 9 * SIZE(BO)
  2128. LD a3, 10 * SIZE(BO)
  2129. LD a4, 11 * SIZE(BO)
  2130. LD b1, 12 * SIZE(BO)
  2131. LD b2, 13 * SIZE(BO)
  2132. LD b3, 14 * SIZE(BO)
  2133. LD b4, 15 * SIZE(BO)
  2134. SUB a1, c03, c03
  2135. SUB a2, c07, c07
  2136. SUB a3, c11, c11
  2137. SUB a4, c15, c15
  2138. SUB b1, c04, c04
  2139. SUB b2, c08, c08
  2140. SUB b3, c12, c12
  2141. SUB b4, c16, c16
  2142. #else
  2143. LD a1, 0 * SIZE(AO)
  2144. LD a2, 1 * SIZE(AO)
  2145. LD a3, 2 * SIZE(AO)
  2146. LD a4, 3 * SIZE(AO)
  2147. LD b1, 4 * SIZE(AO)
  2148. LD b2, 5 * SIZE(AO)
  2149. LD b3, 6 * SIZE(AO)
  2150. LD b4, 7 * SIZE(AO)
  2151. SUB a1, c01, c01
  2152. SUB a2, c02, c02
  2153. SUB a3, c03, c03
  2154. SUB a4, c04, c04
  2155. SUB b1, c05, c05
  2156. SUB b2, c06, c06
  2157. SUB b3, c07, c07
  2158. SUB b4, c08, c08
  2159. LD a1, 8 * SIZE(AO)
  2160. LD a2, 9 * SIZE(AO)
  2161. LD a3, 10 * SIZE(AO)
  2162. LD a4, 11 * SIZE(AO)
  2163. LD b1, 12 * SIZE(AO)
  2164. LD b2, 13 * SIZE(AO)
  2165. LD b3, 14 * SIZE(AO)
  2166. LD b4, 15 * SIZE(AO)
  2167. SUB a1, c09, c09
  2168. SUB a2, c10, c10
  2169. SUB a3, c11, c11
  2170. SUB a4, c12, c12
  2171. SUB b1, c13, c13
  2172. SUB b2, c14, c14
  2173. SUB b3, c15, c15
  2174. SUB b4, c16, c16
  2175. #endif
  /* LN: substitution from the last row upward using the 4x4 block at AO
     (elements 15,14,13,12 / 10,9,8 / 5,4 / 0).  Each row is scaled by its
     diagonal element with MUL -- presumably the diagonal is stored already
     inverted by the packing code; confirm against the packing routine. */
  2176. #ifdef LN
  2177. LD a1, 15 * SIZE(AO)
  2178. LD a2, 14 * SIZE(AO)
  2179. LD a3, 13 * SIZE(AO)
  2180. LD a4, 12 * SIZE(AO)
  2181. MUL a1, c04, c04
  2182. MUL a1, c08, c08
  2183. MUL a1, c12, c12
  2184. MUL a1, c16, c16
  2185. MUL a2, c04, t1
  2186. MUL a2, c08, t2
  2187. MUL a2, c12, t3
  2188. MUL a2, c16, t4
  2189. SUB c03, t1, c03
  2190. SUB c07, t2, c07
  2191. SUB c11, t3, c11
  2192. SUB c15, t4, c15
  2193. MUL a3, c04, t1
  2194. MUL a3, c08, t2
  2195. MUL a3, c12, t3
  2196. MUL a3, c16, t4
  2197. SUB c02, t1, c02
  2198. SUB c06, t2, c06
  2199. SUB c10, t3, c10
  2200. SUB c14, t4, c14
  2201. MUL a4, c04, t1
  2202. MUL a4, c08, t2
  2203. MUL a4, c12, t3
  2204. MUL a4, c16, t4
  2205. SUB c01, t1, c01
  2206. SUB c05, t2, c05
  2207. SUB c09, t3, c09
  2208. SUB c13, t4, c13
  2209. LD b1, 10 * SIZE(AO)
  2210. LD b2, 9 * SIZE(AO)
  2211. LD b3, 8 * SIZE(AO)
  2212. MUL b1, c03, c03
  2213. MUL b1, c07, c07
  2214. MUL b1, c11, c11
  2215. MUL b1, c15, c15
  2216. MUL b2, c03, t1
  2217. MUL b2, c07, t2
  2218. MUL b2, c11, t3
  2219. MUL b2, c15, t4
  2220. SUB c02, t1, c02
  2221. SUB c06, t2, c06
  2222. SUB c10, t3, c10
  2223. SUB c14, t4, c14
  2224. MUL b3, c03, t1
  2225. MUL b3, c07, t2
  2226. MUL b3, c11, t3
  2227. MUL b3, c15, t4
  2228. SUB c01, t1, c01
  2229. SUB c05, t2, c05
  2230. SUB c09, t3, c09
  2231. SUB c13, t4, c13
  2232. LD a1, 5 * SIZE(AO)
  2233. LD a2, 4 * SIZE(AO)
  2234. LD a3, 0 * SIZE(AO)
  2235. MUL a1, c02, c02
  2236. MUL a1, c06, c06
  2237. MUL a1, c10, c10
  2238. MUL a1, c14, c14
  2239. MUL a2, c02, t1
  2240. MUL a2, c06, t2
  2241. MUL a2, c10, t3
  2242. MUL a2, c14, t4
  2243. SUB c01, t1, c01
  2244. SUB c05, t2, c05
  2245. SUB c09, t3, c09
  2246. SUB c13, t4, c13
  2247. MUL a3, c01, c01
  2248. MUL a3, c05, c05
  2249. MUL a3, c09, c09
  2250. MUL a3, c13, c13
  2251. #endif
  /* LT: substitution from the first row downward using AO elements
     0,1,2,3 / 5,6,7 / 10,11 / 15. */
  2252. #ifdef LT
  2253. LD a1, 0 * SIZE(AO)
  2254. LD a2, 1 * SIZE(AO)
  2255. LD a3, 2 * SIZE(AO)
  2256. LD a4, 3 * SIZE(AO)
  2257. MUL a1, c01, c01
  2258. MUL a1, c05, c05
  2259. MUL a1, c09, c09
  2260. MUL a1, c13, c13
  2261. MUL a2, c01, t1
  2262. MUL a2, c05, t2
  2263. MUL a2, c09, t3
  2264. MUL a2, c13, t4
  2265. SUB c02, t1, c02
  2266. SUB c06, t2, c06
  2267. SUB c10, t3, c10
  2268. SUB c14, t4, c14
  2269. MUL a3, c01, t1
  2270. MUL a3, c05, t2
  2271. MUL a3, c09, t3
  2272. MUL a3, c13, t4
  2273. SUB c03, t1, c03
  2274. SUB c07, t2, c07
  2275. SUB c11, t3, c11
  2276. SUB c15, t4, c15
  2277. MUL a4, c01, t1
  2278. MUL a4, c05, t2
  2279. MUL a4, c09, t3
  2280. MUL a4, c13, t4
  2281. SUB c04, t1, c04
  2282. SUB c08, t2, c08
  2283. SUB c12, t3, c12
  2284. SUB c16, t4, c16
  2285. LD b1, 5 * SIZE(AO)
  2286. LD b2, 6 * SIZE(AO)
  2287. LD b3, 7 * SIZE(AO)
  2288. MUL b1, c02, c02
  2289. MUL b1, c06, c06
  2290. MUL b1, c10, c10
  2291. MUL b1, c14, c14
  2292. MUL b2, c02, t1
  2293. MUL b2, c06, t2
  2294. MUL b2, c10, t3
  2295. MUL b2, c14, t4
  2296. SUB c03, t1, c03
  2297. SUB c07, t2, c07
  2298. SUB c11, t3, c11
  2299. SUB c15, t4, c15
  2300. MUL b3, c02, t1
  2301. MUL b3, c06, t2
  2302. MUL b3, c10, t3
  2303. MUL b3, c14, t4
  2304. SUB c04, t1, c04
  2305. SUB c08, t2, c08
  2306. SUB c12, t3, c12
  2307. SUB c16, t4, c16
  2308. LD a1, 10 * SIZE(AO)
  2309. LD a2, 11 * SIZE(AO)
  2310. LD a3, 15 * SIZE(AO)
  2311. MUL a1, c03, c03
  2312. MUL a1, c07, c07
  2313. MUL a1, c11, c11
  2314. MUL a1, c15, c15
  2315. MUL a2, c03, t1
  2316. MUL a2, c07, t2
  2317. MUL a2, c11, t3
  2318. MUL a2, c15, t4
  2319. SUB c04, t1, c04
  2320. SUB c08, t2, c08
  2321. SUB c12, t3, c12
  2322. SUB c16, t4, c16
  2323. MUL a3, c04, c04
  2324. MUL a3, c08, c08
  2325. MUL a3, c12, c12
  2326. MUL a3, c16, c16
  2327. #endif
  /* RN: substitution from the first column (c01..c04) to the last
     (c13..c16) using BO elements 0,1,2,3 / 5,6,7 / 10,11 / 15. */
  2328. #ifdef RN
  2329. LD a1, 0 * SIZE(BO)
  2330. LD a2, 1 * SIZE(BO)
  2331. LD a3, 2 * SIZE(BO)
  2332. LD a4, 3 * SIZE(BO)
  2333. MUL a1, c01, c01
  2334. MUL a1, c02, c02
  2335. MUL a1, c03, c03
  2336. MUL a1, c04, c04
  2337. MUL a2, c01, t1
  2338. MUL a2, c02, t2
  2339. MUL a2, c03, t3
  2340. MUL a2, c04, t4
  2341. SUB c05, t1, c05
  2342. SUB c06, t2, c06
  2343. SUB c07, t3, c07
  2344. SUB c08, t4, c08
  2345. MUL a3, c01, t1
  2346. MUL a3, c02, t2
  2347. MUL a3, c03, t3
  2348. MUL a3, c04, t4
  2349. SUB c09, t1, c09
  2350. SUB c10, t2, c10
  2351. SUB c11, t3, c11
  2352. SUB c12, t4, c12
  2353. MUL a4, c01, t1
  2354. MUL a4, c02, t2
  2355. MUL a4, c03, t3
  2356. MUL a4, c04, t4
  2357. SUB c13, t1, c13
  2358. SUB c14, t2, c14
  2359. SUB c15, t3, c15
  2360. SUB c16, t4, c16
  2361. LD b1, 5 * SIZE(BO)
  2362. LD b2, 6 * SIZE(BO)
  2363. LD b3, 7 * SIZE(BO)
  2364. MUL b1, c05, c05
  2365. MUL b1, c06, c06
  2366. MUL b1, c07, c07
  2367. MUL b1, c08, c08
  2368. MUL b2, c05, t1
  2369. MUL b2, c06, t2
  2370. MUL b2, c07, t3
  2371. MUL b2, c08, t4
  2372. SUB c09, t1, c09
  2373. SUB c10, t2, c10
  2374. SUB c11, t3, c11
  2375. SUB c12, t4, c12
  2376. MUL b3, c05, t1
  2377. MUL b3, c06, t2
  2378. MUL b3, c07, t3
  2379. MUL b3, c08, t4
  2380. SUB c13, t1, c13
  2381. SUB c14, t2, c14
  2382. SUB c15, t3, c15
  2383. SUB c16, t4, c16
  2384. LD a1, 10 * SIZE(BO)
  2385. LD a2, 11 * SIZE(BO)
  2386. LD a3, 15 * SIZE(BO)
  2387. MUL a1, c09, c09
  2388. MUL a1, c10, c10
  2389. MUL a1, c11, c11
  2390. MUL a1, c12, c12
  2391. MUL a2, c09, t1
  2392. MUL a2, c10, t2
  2393. MUL a2, c11, t3
  2394. MUL a2, c12, t4
  2395. SUB c13, t1, c13
  2396. SUB c14, t2, c14
  2397. SUB c15, t3, c15
  2398. SUB c16, t4, c16
  2399. MUL a3, c13, c13
  2400. MUL a3, c14, c14
  2401. MUL a3, c15, c15
  2402. MUL a3, c16, c16
  2403. #endif
  /* RT: substitution from the last column (c13..c16) back to the first
     (c01..c04) using BO elements 15,14,13,12 / 10,9,8 / 5,4 / 0. */
  2404. #ifdef RT
  2405. LD a1, 15 * SIZE(BO)
  2406. LD a2, 14 * SIZE(BO)
  2407. LD a3, 13 * SIZE(BO)
  2408. LD a4, 12 * SIZE(BO)
  2409. MUL a1, c13, c13
  2410. MUL a1, c14, c14
  2411. MUL a1, c15, c15
  2412. MUL a1, c16, c16
  2413. MUL a2, c13, t1
  2414. MUL a2, c14, t2
  2415. MUL a2, c15, t3
  2416. MUL a2, c16, t4
  2417. SUB c09, t1, c09
  2418. SUB c10, t2, c10
  2419. SUB c11, t3, c11
  2420. SUB c12, t4, c12
  2421. MUL a3, c13, t1
  2422. MUL a3, c14, t2
  2423. MUL a3, c15, t3
  2424. MUL a3, c16, t4
  2425. SUB c05, t1, c05
  2426. SUB c06, t2, c06
  2427. SUB c07, t3, c07
  2428. SUB c08, t4, c08
  2429. MUL a4, c13, t1
  2430. MUL a4, c14, t2
  2431. MUL a4, c15, t3
  2432. MUL a4, c16, t4
  2433. SUB c01, t1, c01
  2434. SUB c02, t2, c02
  2435. SUB c03, t3, c03
  2436. SUB c04, t4, c04
  2437. LD b1, 10 * SIZE(BO)
  2438. LD b2, 9 * SIZE(BO)
  2439. LD b3, 8 * SIZE(BO)
  2440. MUL b1, c09, c09
  2441. MUL b1, c10, c10
  2442. MUL b1, c11, c11
  2443. MUL b1, c12, c12
  2444. MUL b2, c09, t1
  2445. MUL b2, c10, t2
  2446. MUL b2, c11, t3
  2447. MUL b2, c12, t4
  2448. SUB c05, t1, c05
  2449. SUB c06, t2, c06
  2450. SUB c07, t3, c07
  2451. SUB c08, t4, c08
  2452. MUL b3, c09, t1
  2453. MUL b3, c10, t2
  2454. MUL b3, c11, t3
  2455. MUL b3, c12, t4
  2456. SUB c01, t1, c01
  2457. SUB c02, t2, c02
  2458. SUB c03, t3, c03
  2459. SUB c04, t4, c04
  2460. LD a1, 5 * SIZE(BO)
  2461. LD a2, 4 * SIZE(BO)
  2462. LD a3, 0 * SIZE(BO)
  2463. MUL a1, c05, c05
  2464. MUL a1, c06, c06
  2465. MUL a1, c07, c07
  2466. MUL a1, c08, c08
  2467. MUL a2, c05, t1
  2468. MUL a2, c06, t2
  2469. MUL a2, c07, t3
  2470. MUL a2, c08, t4
  2471. SUB c01, t1, c01
  2472. SUB c02, t2, c02
  2473. SUB c03, t3, c03
  2474. SUB c04, t4, c04
  2475. MUL a3, c01, c01
  2476. MUL a3, c02, c02
  2477. MUL a3, c03, c03
  2478. MUL a3, c04, c04
  2479. #endif
  /* Write the solved tile back to the packed buffer it was read from
     (BO for LN/LT, AO otherwise), in the same element order as the loads. */
  2480. #if defined(LN) || defined(LT)
  2481. ST c01, 0 * SIZE(BO)
  2482. ST c05, 1 * SIZE(BO)
  2483. ST c09, 2 * SIZE(BO)
  2484. ST c13, 3 * SIZE(BO)
  2485. ST c02, 4 * SIZE(BO)
  2486. ST c06, 5 * SIZE(BO)
  2487. ST c10, 6 * SIZE(BO)
  2488. ST c14, 7 * SIZE(BO)
  2489. ST c03, 8 * SIZE(BO)
  2490. ST c07, 9 * SIZE(BO)
  2491. ST c11, 10 * SIZE(BO)
  2492. ST c15, 11 * SIZE(BO)
  2493. ST c04, 12 * SIZE(BO)
  2494. ST c08, 13 * SIZE(BO)
  2495. ST c12, 14 * SIZE(BO)
  2496. ST c16, 15 * SIZE(BO)
  2497. #else
  2498. ST c01, 0 * SIZE(AO)
  2499. ST c02, 1 * SIZE(AO)
  2500. ST c03, 2 * SIZE(AO)
  2501. ST c04, 3 * SIZE(AO)
  2502. ST c05, 4 * SIZE(AO)
  2503. ST c06, 5 * SIZE(AO)
  2504. ST c07, 6 * SIZE(AO)
  2505. ST c08, 7 * SIZE(AO)
  2506. ST c09, 8 * SIZE(AO)
  2507. ST c10, 9 * SIZE(AO)
  2508. ST c11, 10 * SIZE(AO)
  2509. ST c12, 11 * SIZE(AO)
  2510. ST c13, 12 * SIZE(AO)
  2511. ST c14, 13 * SIZE(AO)
  2512. ST c15, 14 * SIZE(AO)
  2513. ST c16, 15 * SIZE(AO)
  2514. #endif
  /* LN moves C1..C4 back by 4 elements before the store; every other
     variant stores first and then moves them forward by 4 elements. */
  2515. #ifdef LN
  2516. lda C1, -4 * SIZE(C1)
  2517. lda C2, -4 * SIZE(C2)
  2518. lda C3, -4 * SIZE(C3)
  2519. lda C4, -4 * SIZE(C4)
  2520. #endif
  2521. ST c01, 0 * SIZE(C1)
  2522. ST c02, 1 * SIZE(C1)
  2523. ST c03, 2 * SIZE(C1)
  2524. ST c04, 3 * SIZE(C1)
  2525. ST c05, 0 * SIZE(C2)
  2526. ST c06, 1 * SIZE(C2)
  2527. ST c07, 2 * SIZE(C2)
  2528. ST c08, 3 * SIZE(C2)
  2529. ST c09, 0 * SIZE(C3)
  2530. ST c10, 1 * SIZE(C3)
  2531. ST c11, 2 * SIZE(C3)
  2532. ST c12, 3 * SIZE(C3)
  2533. ST c13, 0 * SIZE(C4)
  2534. ST c14, 1 * SIZE(C4)
  2535. ST c15, 2 * SIZE(C4)
  2536. ST c16, 3 * SIZE(C4)
  2537. #ifndef LN
  2538. lda C1, 4 * SIZE(C1)
  2539. lda C2, 4 * SIZE(C2)
  2540. lda C3, 4 * SIZE(C3)
  2541. lda C4, 4 * SIZE(C4)
  2542. #endif
  /* Clear the product temporaries: the pipelined loops add t1..t4 into
     accumulators before the first real product has been issued. */
  2543. fclr t1
  2544. fclr t2
  2545. fclr t3
  2546. fclr t4
  /* RT: AORIG += K * 4 elements. */
  2547. #ifdef RT
  2548. sll K, 2 + BASE_SHIFT, TMP1
  2549. addq AORIG, TMP1, AORIG
  2550. #endif
  /* LT/RN: advance AO and BO by (K - KK) * 4 elements each. */
  2551. #if defined(LT) || defined(RN)
  2552. subq K, KK, TMP1
  2553. sll TMP1, BASE_SHIFT + 2, TMP1
  2554. addq AO, TMP1, AO
  2555. addq BO, TMP1, BO
  2556. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  2557. #ifdef LT
  2558. addq KK, 4, KK
  2559. #endif
  2560. #ifdef LN
  2561. subq KK, 4, KK
  2562. #endif
  /* I--; repeat from $L11 (outside this excerpt) while I > 0. */
  2563. lda I, -1(I)
  2564. bgt I, $L11
  2565. .align 4
  /*
   * $L20 -- setup for the 2-row tile (two rows by four columns).
   * Runs only when bit 1 of M is set; otherwise control goes to $L30.
   * Preloads a1..a4 and b1..b4, clears the eight accumulators used by this
   * tile (c01, c02, c05, c06, c09, c10, c13, c14), sets L to the k-count
   * minus 2, then enters the loop at $L22, the remainder code at $L25
   * (L <= 0), or the solve at $L28 (k-count <= 0).
   * t1..t4 are not cleared here; the visible code clears them at the end of
   * the preceding tile's solve.
   */
  2566. $L20:
  2567. and M, 2, I
  2568. ble I, $L30
  /* LT/RN: operands are read from the current AO and from B; the k-count
     is KK.  After the preload AO points 2 elements and BO 4 elements past
     the first k-step. */
  2569. #if defined(LT) || defined(RN)
  2570. LD a1, 0 * SIZE(AO)
  2571. fclr c09
  2572. LD a2, 1 * SIZE(AO)
  2573. fclr c13
  2574. LD a3, 2 * SIZE(AO)
  2575. fclr c10
  2576. LD a4, 3 * SIZE(AO)
  2577. fclr c14
  2578. LD b1, 0 * SIZE(B)
  2579. lda L, -2(KK)
  2580. LD b2, 1 * SIZE(B)
  2581. lda AO, 2 * SIZE(AO)
  2582. LD b3, 2 * SIZE(B)
  2583. fclr c01
  2584. LD b4, 3 * SIZE(B)
  2585. fclr c05
  2586. lda BO, 4 * SIZE(B)
  2587. fclr c02
  2588. fclr c06
  2589. ble KK, $L28
  2590. ble L, $L25
  2591. #else
  /* LN/RT: LN first moves AORIG back by K * 2 elements.  Then
     AO = AORIG + KK * 2 elements, BO = B + KK * 4 elements, and the
     k-count is TMP1 = K - KK. */
  2592. #ifdef LN
  2593. sll K, BASE_SHIFT + 1, TMP1
  2594. subq AORIG, TMP1, AORIG
  2595. #endif
  2596. sll KK, BASE_SHIFT + 1, TMP1
  2597. addq AORIG, TMP1, AO
  2598. sll KK, BASE_SHIFT + 2, TMP2
  2599. addq B, TMP2, BO
  2600. subq K, KK, TMP1
  2601. LD a1, 0 * SIZE(AO)
  2602. fclr c09
  2603. LD a2, 1 * SIZE(AO)
  2604. fclr c13
  2605. LD a3, 2 * SIZE(AO)
  2606. fclr c10
  2607. LD a4, 3 * SIZE(AO)
  2608. fclr c14
  2609. LD b1, 0 * SIZE(BO)
  2610. lda L, -2(TMP1)
  2611. LD b2, 1 * SIZE(BO)
  2612. lda AO, 2 * SIZE(AO)
  2613. LD b3, 2 * SIZE(BO)
  2614. fclr c01
  2615. LD b4, 3 * SIZE(BO)
  2616. fclr c05
  2617. lda BO, 4 * SIZE(BO)
  2618. fclr c02
  2619. fclr c06
  2620. ble TMP1, $L28
  2621. ble L, $L25
  2622. #endif
  2623. .align 4
  /*
   * $L22 -- main k-loop for the 2-row tile, two k-steps per iteration.
   * Per iteration: L -= 2, AO += 4 * SIZE, BO += 8 * SIZE, 16 products.
   * The first step multiplies a1/a2 by b1..b4; the second multiplies a3/a4
   * by b1, b2, b3 and b5.  b5 carries the fourth B value of the second step
   * so that b4 can be reloaded for the next iteration while b5 is in use.
   * Each ADD folds a product issued earlier (in t1..t4) into an accumulator;
   * `unop` lines are no-ops kept for instruction scheduling.
   */
  2624. $L22:
  2625. ADD c09, t1, c09
  2626. unop
  2627. MUL a1, b1, t1
  2628. unop
  2629. ADD c10, t2, c10
  2630. unop
  2631. MUL a2, b1, t2
  2632. LD b1, 0 * SIZE(BO)
  2633. ADD c13, t3, c13
  2634. unop
  2635. MUL a1, b2, t3
  /* BO is advanced here, so the B loads below use negative offsets. */
  2636. lda BO, 8 * SIZE(BO)
  2637. ADD c14, t4, c14
  2638. unop
  2639. MUL a2, b2, t4
  2640. LD b2, -7 * SIZE(BO)
  2641. ADD c01, t1, c01
  2642. unop
  2643. MUL a1, b3, t1
  2644. unop
  2645. ADD c02, t2, c02
  2646. unop
  2647. MUL a2, b3, t2
  2648. LD b3, -6 * SIZE(BO)
  2649. ADD c05, t3, c05
  2650. unop
  2651. MUL a1, b4, t3
  2652. LD a1, 2 * SIZE(AO)
  2653. ADD c06, t4, c06
  2654. MUL a2, b4, t4
  2655. LD b5, -5 * SIZE(BO)
  /* Second k-step of this iteration: a3/a4 with the values just loaded. */
  2656. ADD c09, t1, c09
  2657. unop
  2658. MUL a3, b1, t1
  2659. LD a2, 3 * SIZE(AO)
  2660. ADD c10, t2, c10
  2661. unop
  2662. MUL a4, b1, t2
  2663. LD b1, -4 * SIZE(BO)
  2664. ADD c13, t3, c13
  2665. unop
  2666. MUL a3, b2, t3
  2667. lda AO, 4 * SIZE(AO)
  2668. ADD c14, t4, c14
  2669. MUL a4, b2, t4
  2670. LD b2, -3 * SIZE(BO)
  2671. ADD c01, t1, c01
  2672. lda L, -2(L)
  2673. MUL a3, b3, t1
  2674. LD b4, -1 * SIZE(BO)
  2675. ADD c02, t2, c02
  2676. unop
  2677. MUL a4, b3, t2
  2678. LD b3, -2 * SIZE(BO)
  2679. ADD c05, t3, c05
  2680. unop
  2681. MUL a3, b5, t3
  2682. LD a3, 0 * SIZE(AO)
  2683. ADD c06, t4, c06
  2684. MUL a4, b5, t4
  2685. LD a4, 1 * SIZE(AO)
  2686. bgt L, $L22
  2687. .align 4
  /*
   * $L25 -- k-loop remainder entry for the 2-row tile.
   * Folds the pending t1 into c09 and issues the first product of the next
   * step.  If the low bit of the k-count (KK for LT/RN, TMP1 otherwise) is
   * set, it branches straight to the final step at $L27; otherwise it runs
   * one extra full step first.
   */
  2688. $L25:
  2689. ADD c09, t1, c09
  2690. MUL a1, b1, t1
  2691. #if defined(LT) || defined(RN)
  2692. blbs KK, $L27
  2693. #else
  2694. blbs TMP1, $L27
  2695. #endif
  /* Extra step: finishes the eight products of the current step, reloads
     a1/a2 and b1..b4, advances AO by 2 * SIZE and BO by 4 * SIZE, and
     issues the first product of the final step. */
  2696. ADD c10, t2, c10
  2697. unop
  2698. MUL a2, b1, t2
  2699. LD b1, 0 * SIZE(BO)
  2700. ADD c13, t3, c13
  2701. unop
  2702. MUL a1, b2, t3
  2703. unop
  2704. ADD c14, t4, c14
  2705. unop
  2706. MUL a2, b2, t4
  2707. LD b2, 1 * SIZE(BO)
  2708. ADD c01, t1, c01
  2709. unop
  2710. MUL a1, b3, t1
  /* AO is advanced here, so the a1/a2 reloads below use negative offsets. */
  2711. lda AO, 2 * SIZE(AO)
  2712. ADD c02, t2, c02
  2713. unop
  2714. MUL a2, b3, t2
  2715. LD b3, 2 * SIZE(BO)
  2716. ADD c05, t3, c05
  2717. unop
  2718. MUL a1, b4, t3
  2719. LD a1, -2 * SIZE(AO)
  2720. ADD c06, t4, c06
  2721. unop
  2722. MUL a2, b4, t4
  2723. LD a2, -1 * SIZE(AO)
  2724. ADD c09, t1, c09
  2725. LD b4, 3 * SIZE(BO)
  2726. MUL a1, b1, t1
  2727. lda BO, 4 * SIZE(BO)
  2728. .align 4
  /*
   * $L27 -- final k-step of the 2-row tile and pipeline drain.
   * Issues the remaining seven products of the last step (a1*b1 -> t1 was
   * issued before reaching this label), advances AO by 2 * SIZE and BO by
   * 4 * SIZE, then adds the last pending products in t1..t4 into c09, c10,
   * c13 and c14.
   */
  2729. $L27:
  2730. ADD c10, t2, c10
  2731. MUL a2, b1, t2
  2732. ADD c13, t3, c13
  2733. MUL a1, b2, t3
  2734. ADD c14, t4, c14
  2735. MUL a2, b2, t4
  2736. ADD c01, t1, c01
  2737. MUL a1, b3, t1
  2738. ADD c02, t2, c02
  2739. MUL a2, b3, t2
  2740. ADD c05, t3, c05
  2741. MUL a1, b4, t3
  2742. ADD c06, t4, c06
  2743. lda AO, 2 * SIZE(AO)
  2744. MUL a2, b4, t4
  2745. lda BO, 4 * SIZE(BO)
  /* Drain the four outstanding products. */
  2746. ADD c09, t1, c09
  2747. ADD c10, t2, c10
  2748. ADD c13, t3, c13
  2749. ADD c14, t4, c14
  2750. .align 4
  /*
   * $L28 -- triangular solve and write-back for the 2-row tile.
   * Accumulators in use: c01/c02 (stored to C1), c05/c06 (C2), c09/c10 (C3),
   * c13/c14 (C4).  Same sequence as the 4x4 case: reposition AO/BO, form
   * (packed value - accumulator), substitute for the selected variant,
   * store to the packed buffer and to C, then update pointers and KK.
   * SUB/MUL/LD/ST are macros defined outside this excerpt; operands are
   * read in Alpha order (src, src, dst), so `SUB a, c, c` is c = a - c.
   */
  2751. $L28:
  /* LN/RT: TMP1 = KK - 2 (LN) or KK - 4 (RT); AO = AORIG + TMP1 * 2
     elements, BO = B + TMP1 * 4 elements.
     LT/RN: step AO back by 2 * SIZE and BO back by 4 * SIZE. */
  2752. #if defined(LN) || defined(RT)
  2753. #ifdef LN
  2754. subq KK, 2, TMP1
  2755. #else
  2756. subq KK, 4, TMP1
  2757. #endif
  2758. sll TMP1, BASE_SHIFT + 1, TMP2
  2759. addq AORIG, TMP2, AO
  2760. sll TMP1, BASE_SHIFT + 2, TMP2
  2761. addq B, TMP2, BO
  2762. #else
  2763. lda AO, -2 * SIZE(AO)
  2764. lda BO, -4 * SIZE(BO)
  2765. #endif
  /* LN/LT read the eight right-hand-side values from BO (order c01, c05,
     c09, c13, c02, c06, c10, c14); other variants read them from AO
     (order c01, c02, c05, c06, c09, c10, c13, c14). */
  2766. #if defined(LN) || defined(LT)
  2767. LD a1, 0 * SIZE(BO)
  2768. LD a2, 1 * SIZE(BO)
  2769. LD a3, 2 * SIZE(BO)
  2770. LD a4, 3 * SIZE(BO)
  2771. LD b1, 4 * SIZE(BO)
  2772. LD b2, 5 * SIZE(BO)
  2773. LD b3, 6 * SIZE(BO)
  2774. LD b4, 7 * SIZE(BO)
  2775. SUB a1, c01, c01
  2776. SUB a2, c05, c05
  2777. SUB a3, c09, c09
  2778. SUB a4, c13, c13
  2779. SUB b1, c02, c02
  2780. SUB b2, c06, c06
  2781. SUB b3, c10, c10
  2782. SUB b4, c14, c14
  2783. #else
  2784. LD a1, 0 * SIZE(AO)
  2785. LD a2, 1 * SIZE(AO)
  2786. LD a3, 2 * SIZE(AO)
  2787. LD a4, 3 * SIZE(AO)
  2788. LD b1, 4 * SIZE(AO)
  2789. LD b2, 5 * SIZE(AO)
  2790. LD b3, 6 * SIZE(AO)
  2791. LD b4, 7 * SIZE(AO)
  2792. SUB a1, c01, c01
  2793. SUB a2, c02, c02
  2794. SUB a3, c05, c05
  2795. SUB a4, c06, c06
  2796. SUB b1, c09, c09
  2797. SUB b2, c10, c10
  2798. SUB b3, c13, c13
  2799. SUB b4, c14, c14
  2800. #endif
  /* LN: 2x2 substitution from the second row up, using AO elements 3, 2, 0.
     Diagonal elements are applied with MUL -- presumably they are stored
     already inverted by the packing code; confirm against that routine. */
  2801. #ifdef LN
  2802. LD a1, 3 * SIZE(AO)
  2803. LD a2, 2 * SIZE(AO)
  2804. LD a3, 0 * SIZE(AO)
  2805. MUL a1, c02, c02
  2806. MUL a1, c06, c06
  2807. MUL a1, c10, c10
  2808. MUL a1, c14, c14
  2809. MUL a2, c02, t1
  2810. MUL a2, c06, t2
  2811. MUL a2, c10, t3
  2812. MUL a2, c14, t4
  2813. SUB c01, t1, c01
  2814. SUB c05, t2, c05
  2815. SUB c09, t3, c09
  2816. SUB c13, t4, c13
  2817. MUL a3, c01, c01
  2818. MUL a3, c05, c05
  2819. MUL a3, c09, c09
  2820. MUL a3, c13, c13
  2821. #endif
  /* LT: 2x2 substitution from the first row down, using AO elements 0, 1, 3. */
  2822. #ifdef LT
  2823. LD a1, 0 * SIZE(AO)
  2824. LD a2, 1 * SIZE(AO)
  2825. LD a3, 3 * SIZE(AO)
  2826. MUL a1, c01, c01
  2827. MUL a1, c05, c05
  2828. MUL a1, c09, c09
  2829. MUL a1, c13, c13
  2830. MUL a2, c01, t1
  2831. MUL a2, c05, t2
  2832. MUL a2, c09, t3
  2833. MUL a2, c13, t4
  2834. SUB c02, t1, c02
  2835. SUB c06, t2, c06
  2836. SUB c10, t3, c10
  2837. SUB c14, t4, c14
  2838. MUL a3, c02, c02
  2839. MUL a3, c06, c06
  2840. MUL a3, c10, c10
  2841. MUL a3, c14, c14
  2842. #endif
  /* RN: substitution across the four columns, first to last, using BO
     elements 0,1,2,3 / 5,6,7 / 10,11 / 15; two rows per column. */
  2843. #ifdef RN
  2844. LD a1, 0 * SIZE(BO)
  2845. LD a2, 1 * SIZE(BO)
  2846. LD a3, 2 * SIZE(BO)
  2847. LD a4, 3 * SIZE(BO)
  2848. MUL a1, c01, c01
  2849. MUL a1, c02, c02
  2850. MUL a2, c01, t1
  2851. MUL a2, c02, t2
  2852. SUB c05, t1, c05
  2853. SUB c06, t2, c06
  2854. MUL a3, c01, t1
  2855. MUL a3, c02, t2
  2856. SUB c09, t1, c09
  2857. SUB c10, t2, c10
  2858. MUL a4, c01, t1
  2859. MUL a4, c02, t2
  2860. SUB c13, t1, c13
  2861. SUB c14, t2, c14
  2862. LD b1, 5 * SIZE(BO)
  2863. LD b2, 6 * SIZE(BO)
  2864. LD b3, 7 * SIZE(BO)
  2865. MUL b1, c05, c05
  2866. MUL b1, c06, c06
  2867. MUL b2, c05, t1
  2868. MUL b2, c06, t2
  2869. SUB c09, t1, c09
  2870. SUB c10, t2, c10
  2871. MUL b3, c05, t1
  2872. MUL b3, c06, t2
  2873. SUB c13, t1, c13
  2874. SUB c14, t2, c14
  2875. LD a1, 10 * SIZE(BO)
  2876. LD a2, 11 * SIZE(BO)
  2877. LD a3, 15 * SIZE(BO)
  2878. MUL a1, c09, c09
  2879. MUL a1, c10, c10
  2880. MUL a2, c09, t1
  2881. MUL a2, c10, t2
  2882. SUB c13, t1, c13
  2883. SUB c14, t2, c14
  2884. MUL a3, c13, c13
  2885. MUL a3, c14, c14
  2886. #endif
  /* RT: substitution across the four columns, last to first, using BO
     elements 15,14,13,12 / 10,9,8 / 5,4 / 0. */
  2887. #ifdef RT
  2888. LD a1, 15 * SIZE(BO)
  2889. LD a2, 14 * SIZE(BO)
  2890. LD a3, 13 * SIZE(BO)
  2891. LD a4, 12 * SIZE(BO)
  2892. MUL a1, c13, c13
  2893. MUL a1, c14, c14
  2894. MUL a2, c13, t1
  2895. MUL a2, c14, t2
  2896. SUB c09, t1, c09
  2897. SUB c10, t2, c10
  2898. MUL a3, c13, t1
  2899. MUL a3, c14, t2
  2900. SUB c05, t1, c05
  2901. SUB c06, t2, c06
  2902. MUL a4, c13, t1
  2903. MUL a4, c14, t2
  2904. SUB c01, t1, c01
  2905. SUB c02, t2, c02
  2906. LD b1, 10 * SIZE(BO)
  2907. LD b2, 9 * SIZE(BO)
  2908. LD b3, 8 * SIZE(BO)
  2909. MUL b1, c09, c09
  2910. MUL b1, c10, c10
  2911. MUL b2, c09, t1
  2912. MUL b2, c10, t2
  2913. SUB c05, t1, c05
  2914. SUB c06, t2, c06
  2915. MUL b3, c09, t1
  2916. MUL b3, c10, t2
  2917. SUB c01, t1, c01
  2918. SUB c02, t2, c02
  2919. LD a1, 5 * SIZE(BO)
  2920. LD a2, 4 * SIZE(BO)
  2921. LD a3, 0 * SIZE(BO)
  2922. MUL a1, c05, c05
  2923. MUL a1, c06, c06
  2924. MUL a2, c05, t1
  2925. MUL a2, c06, t2
  2926. SUB c01, t1, c01
  2927. SUB c02, t2, c02
  2928. MUL a3, c01, c01
  2929. MUL a3, c02, c02
  2930. #endif
  /* Write the solved values back to the packed buffer they were read from. */
  2931. #if defined(LN) || defined(LT)
  2932. ST c01, 0 * SIZE(BO)
  2933. ST c05, 1 * SIZE(BO)
  2934. ST c09, 2 * SIZE(BO)
  2935. ST c13, 3 * SIZE(BO)
  2936. ST c02, 4 * SIZE(BO)
  2937. ST c06, 5 * SIZE(BO)
  2938. ST c10, 6 * SIZE(BO)
  2939. ST c14, 7 * SIZE(BO)
  2940. #else
  2941. ST c01, 0 * SIZE(AO)
  2942. ST c02, 1 * SIZE(AO)
  2943. ST c05, 2 * SIZE(AO)
  2944. ST c06, 3 * SIZE(AO)
  2945. ST c09, 4 * SIZE(AO)
  2946. ST c10, 5 * SIZE(AO)
  2947. ST c13, 6 * SIZE(AO)
  2948. ST c14, 7 * SIZE(AO)
  2949. #endif
  /* LN moves C1..C4 back by 2 elements before the store; other variants
     store first and then move them forward by 2 elements. */
  2950. #ifdef LN
  2951. lda C1, -2 * SIZE(C1)
  2952. lda C2, -2 * SIZE(C2)
  2953. lda C3, -2 * SIZE(C3)
  2954. lda C4, -2 * SIZE(C4)
  2955. #endif
  2956. ST c01, 0 * SIZE(C1)
  2957. ST c02, 1 * SIZE(C1)
  2958. ST c05, 0 * SIZE(C2)
  2959. ST c06, 1 * SIZE(C2)
  2960. ST c09, 0 * SIZE(C3)
  2961. ST c10, 1 * SIZE(C3)
  2962. ST c13, 0 * SIZE(C4)
  2963. ST c14, 1 * SIZE(C4)
  2964. #ifndef LN
  2965. lda C1, 2 * SIZE(C1)
  2966. lda C2, 2 * SIZE(C2)
  2967. lda C3, 2 * SIZE(C3)
  2968. lda C4, 2 * SIZE(C4)
  2969. #endif
  /* Clear the product temporaries before the next tile's pipelined loop. */
  2970. fclr t1
  2971. fclr t2
  2972. fclr t3
  2973. fclr t4
  /* RT: AORIG += K * 2 elements. */
  2974. #ifdef RT
  2975. sll K, 1 + BASE_SHIFT, TMP1
  2976. addq AORIG, TMP1, AORIG
  2977. #endif
  /* LT/RN: AO += (K - KK) * 2 elements, BO += (K - KK) * 4 elements. */
  2978. #if defined(LT) || defined(RN)
  2979. subq K, KK, TMP1
  2980. sll TMP1, BASE_SHIFT + 1, TMP2
  2981. addq AO, TMP2, AO
  2982. sll TMP1, BASE_SHIFT + 2, TMP2
  2983. addq BO, TMP2, BO
  2984. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN. */
  2985. #ifdef LT
  2986. addq KK, 2, KK
  2987. #endif
  2988. #ifdef LN
  2989. subq KK, 2, KK
  2990. #endif
  2991. .align 4
  /*
   * $L30 -- setup for the 1-row tile (one row by four columns).
   * Runs only when bit 0 of M is set; otherwise control goes to $L39.
   * Preloads a1/a2 and b1..b4, clears c01, c05, c09 and c13, sets L to the
   * k-count minus 2, then enters the loop at $L32, the remainder code at
   * $L35 (L <= 0), or the solve at $L38 (k-count <= 0).
   */
  2992. $L30:
  2993. and M, 1, I
  2994. ble I, $L39
  /* LT/RN: operands come from the current AO and from B; k-count is KK. */
  2995. #if defined(LT) || defined(RN)
  2996. LD a1, 0 * SIZE(AO)
  2997. fclr c01
  2998. LD a2, 1 * SIZE(AO)
  2999. fclr c05
  3000. LD b1, 0 * SIZE(B)
  3001. lda L, -2(KK)
  3002. LD b2, 1 * SIZE(B)
  3003. lda AO, 1 * SIZE(AO)
  3004. LD b3, 2 * SIZE(B)
  3005. fclr c09
  3006. LD b4, 3 * SIZE(B)
  3007. fclr c13
  3008. lda BO, 4 * SIZE(B)
  3009. ble KK, $L38
  3010. ble L, $L35
  3011. #else
  /* LN/RT: LN first moves AORIG back by K elements.  Then
     AO = AORIG + KK elements, BO = B + KK * 4 elements, and the k-count is
     TMP1 = K - KK. */
  3012. #ifdef LN
  3013. sll K, BASE_SHIFT + 0, TMP1
  3014. subq AORIG, TMP1, AORIG
  3015. #endif
  3016. sll KK, BASE_SHIFT + 0, TMP1
  3017. addq AORIG, TMP1, AO
  3018. sll KK, BASE_SHIFT + 2, TMP2
  3019. addq B, TMP2, BO
  3020. subq K, KK, TMP1
  3021. LD a1, 0 * SIZE(AO)
  3022. fclr c01
  3023. LD a2, 1 * SIZE(AO)
  3024. fclr c05
  3025. LD b1, 0 * SIZE(BO)
  3026. lda L, -2(TMP1)
  3027. LD b2, 1 * SIZE(BO)
  3028. lda AO, 1 * SIZE(AO)
  3029. LD b3, 2 * SIZE(BO)
  3030. fclr c09
  3031. LD b4, 3 * SIZE(BO)
  3032. fclr c13
  3033. lda BO, 4 * SIZE(BO)
  3034. ble TMP1, $L38
  3035. ble L, $L35
  3036. #endif
  3037. .align 4
  /*
   * $L32 -- main k-loop for the 1-row tile, two k-steps per iteration.
   * Per iteration: L -= 2, AO += 2 * SIZE, BO += 8 * SIZE, 8 products.
   * The first step multiplies a1 by b1..b4; the second multiplies a2 by
   * b1, b2, b3 and b5 (b5 holds the fourth B value of the second step so
   * b4 can be reloaded for the next iteration).
   */
  3038. $L32:
  3039. ADD c01, t1, c01
  3040. lda L, -2(L)
  3041. MUL a1, b1, t1
  3042. LD b1, 0 * SIZE(BO)
  3043. ADD c05, t2, c05
  3044. lda AO, 2 * SIZE(AO)
  3045. MUL a1, b2, t2
  3046. LD b2, 1 * SIZE(BO)
  3047. ADD c09, t3, c09
  3048. LD b5, 3 * SIZE(BO)
  3049. MUL a1, b3, t3
  3050. LD b3, 2 * SIZE(BO)
  3051. ADD c13, t4, c13
  3052. MUL a1, b4, t4
  3053. LD a1, -1 * SIZE(AO)
  /* Second k-step of this iteration. */
  3054. ADD c01, t1, c01
  3055. MUL a2, b1, t1
  3056. LD b1, 4 * SIZE(BO)
  /* BO is advanced here, so the remaining B loads use negative offsets. */
  3057. lda BO, 8 * SIZE(BO)
  3058. ADD c05, t2, c05
  3059. MUL a2, b2, t2
  3060. LD b2, -3 * SIZE(BO)
  3061. ADD c09, t3, c09
  3062. LD b4, -1 * SIZE(BO)
  3063. MUL a2, b3, t3
  3064. LD b3, -2 * SIZE(BO)
  3065. ADD c13, t4, c13
  3066. MUL a2, b5, t4
  3067. LD a2, 0 * SIZE(AO)
  3068. bgt L, $L32
  3069. .align 4
  /*
   * $L35 -- k-loop remainder entry for the 1-row tile.
   * Folds the pending t1 into c01 and issues the first product of the next
   * step.  If the low bit of the k-count (KK for LT/RN, TMP1 otherwise) is
   * set, it branches straight to the final step at $L37; otherwise it runs
   * one extra full step first.
   */
  3070. $L35:
  3071. ADD c01, t1, c01
  3072. MUL a1, b1, t1
  3073. #if defined(LT) || defined(RN)
  3074. blbs KK, $L37
  3075. #else
  3076. blbs TMP1, $L37
  3077. #endif
  3078. .align 4
  /* Extra step: finishes the four products of the current step, reloads a1
     and b1..b4, advances AO by 1 * SIZE and BO by 4 * SIZE, and issues the
     first product of the final step. */
  3079. ADD c05, t2, c05
  3080. LD b1, 0 * SIZE(BO)
  3081. MUL a1, b2, t2
  3082. LD b2, 1 * SIZE(BO)
  3083. ADD c09, t3, c09
  3084. MUL a1, b3, t3
  3085. LD b3, 2 * SIZE(BO)
  3086. ADD c13, t4, c13
  3087. MUL a1, b4, t4
  3088. LD a1, 0 * SIZE(AO)
  3089. lda AO, 1 * SIZE(AO)
  3090. ADD c01, t1, c01
  3091. LD b4, 3 * SIZE(BO)
  3092. MUL a1, b1, t1
  3093. lda BO, 4 * SIZE(BO)
  3094. .align 4
  /*
   * $L37 -- final k-step of the 1-row tile and pipeline drain.
   * Issues the remaining three products (a1*b1 -> t1 was issued before
   * reaching this label), advances AO by 1 * SIZE and BO by 4 * SIZE, then
   * adds the pending t1..t4 into c01, c05, c09 and c13.
   */
  3095. $L37:
  3096. ADD c05, t2, c05
  3097. MUL a1, b2, t2
  3098. ADD c09, t3, c09
  3099. MUL a1, b3, t3
  3100. ADD c13, t4, c13
  3101. lda AO, 1 * SIZE(AO)
  3102. MUL a1, b4, t4
  3103. lda BO, 4 * SIZE(BO)
  /* Drain the four outstanding products. */
  3104. ADD c01, t1, c01
  3105. ADD c05, t2, c05
  3106. ADD c09, t3, c09
  3107. ADD c13, t4, c13
  /*
   * $L38 -- triangular solve and write-back for the 1-row tile.
   * Accumulators in use: c01 (stored to C1), c05 (C2), c09 (C3), c13 (C4).
   * Sequence: reposition AO/BO, form (packed value - accumulator),
   * substitute for the selected variant, store to the packed buffer and to
   * C, then update pointers and KK.
   * SUB/MUL/LD/ST are macros defined outside this excerpt; operands are
   * read in Alpha order (src, src, dst), so `SUB a, c, c` is c = a - c.
   */
  3108. $L38:
  /* LN/RT: TMP1 = KK - 1 (LN) or KK - 4 (RT); AO = AORIG + TMP1 elements,
     BO = B + TMP1 * 4 elements.
     LT/RN: step AO back by 1 * SIZE and BO back by 4 * SIZE. */
  3109. #if defined(LN) || defined(RT)
  3110. #ifdef LN
  3111. subq KK, 1, TMP1
  3112. #else
  3113. subq KK, 4, TMP1
  3114. #endif
  3115. sll TMP1, BASE_SHIFT + 0, TMP2
  3116. addq AORIG, TMP2, AO
  3117. sll TMP1, BASE_SHIFT + 2, TMP2
  3118. addq B, TMP2, BO
  3119. #else
  3120. lda AO, -1 * SIZE(AO)
  3121. lda BO, -4 * SIZE(BO)
  3122. #endif
  /* The four right-hand-side values come from BO (LN/LT) or AO (RN/RT);
     the element order is the same in both cases. */
  3123. #if defined(LN) || defined(LT)
  3124. LD a1, 0 * SIZE(BO)
  3125. LD a2, 1 * SIZE(BO)
  3126. LD a3, 2 * SIZE(BO)
  3127. LD a4, 3 * SIZE(BO)
  3128. SUB a1, c01, c01
  3129. SUB a2, c05, c05
  3130. SUB a3, c09, c09
  3131. SUB a4, c13, c13
  3132. #else
  3133. LD a1, 0 * SIZE(AO)
  3134. LD a2, 1 * SIZE(AO)
  3135. LD a3, 2 * SIZE(AO)
  3136. LD a4, 3 * SIZE(AO)
  3137. SUB a1, c01, c01
  3138. SUB a2, c05, c05
  3139. SUB a3, c09, c09
  3140. SUB a4, c13, c13
  3141. #endif
  /* LN/LT: the block of A is a single element; scale all four values by it.
     MUL is used -- presumably the element is stored already inverted by the
     packing code; confirm against that routine. */
  3142. #if defined(LN) || defined(LT)
  3143. LD a1, 0 * SIZE(AO)
  3144. MUL a1, c01, c01
  3145. MUL a1, c05, c05
  3146. MUL a1, c09, c09
  3147. MUL a1, c13, c13
  3148. #endif
  /* RN: substitution across the four columns, first to last, using BO
     elements 0,1,2,3 / 5,6,7 / 10,11 / 15. */
  3149. #ifdef RN
  3150. LD a1, 0 * SIZE(BO)
  3151. LD a2, 1 * SIZE(BO)
  3152. LD a3, 2 * SIZE(BO)
  3153. LD a4, 3 * SIZE(BO)
  3154. MUL a1, c01, c01
  3155. MUL a2, c01, t1
  3156. SUB c05, t1, c05
  3157. MUL a3, c01, t1
  3158. SUB c09, t1, c09
  3159. MUL a4, c01, t1
  3160. SUB c13, t1, c13
  3161. LD b1, 5 * SIZE(BO)
  3162. LD b2, 6 * SIZE(BO)
  3163. LD b3, 7 * SIZE(BO)
  3164. MUL b1, c05, c05
  3165. MUL b2, c05, t1
  3166. SUB c09, t1, c09
  3167. MUL b3, c05, t1
  3168. SUB c13, t1, c13
  3169. LD a1, 10 * SIZE(BO)
  3170. LD a2, 11 * SIZE(BO)
  3171. LD a3, 15 * SIZE(BO)
  3172. MUL a1, c09, c09
  3173. MUL a2, c09, t1
  3174. SUB c13, t1, c13
  3175. MUL a3, c13, c13
  3176. #endif
  /* RT: substitution across the four columns, last to first, using BO
     elements 15,14,13,12 / 10,9,8 / 5,4 / 0. */
  3177. #ifdef RT
  3178. LD a1, 15 * SIZE(BO)
  3179. LD a2, 14 * SIZE(BO)
  3180. LD a3, 13 * SIZE(BO)
  3181. LD a4, 12 * SIZE(BO)
  3182. MUL a1, c13, c13
  3183. MUL a2, c13, t1
  3184. SUB c09, t1, c09
  3185. MUL a3, c13, t1
  3186. SUB c05, t1, c05
  3187. MUL a4, c13, t1
  3188. SUB c01, t1, c01
  3189. LD b1, 10 * SIZE(BO)
  3190. LD b2, 9 * SIZE(BO)
  3191. LD b3, 8 * SIZE(BO)
  3192. MUL b1, c09, c09
  3193. MUL b2, c09, t1
  3194. SUB c05, t1, c05
  3195. MUL b3, c09, t1
  3196. SUB c01, t1, c01
  3197. LD a1, 5 * SIZE(BO)
  3198. LD a2, 4 * SIZE(BO)
  3199. LD a3, 0 * SIZE(BO)
  3200. MUL a1, c05, c05
  3201. MUL a2, c05, t1
  3202. SUB c01, t1, c01
  3203. MUL a3, c01, c01
  3204. #endif
  /* Write the solved values back to the packed buffer they were read from. */
  3205. #if defined(LN) || defined(LT)
  3206. ST c01, 0 * SIZE(BO)
  3207. ST c05, 1 * SIZE(BO)
  3208. ST c09, 2 * SIZE(BO)
  3209. ST c13, 3 * SIZE(BO)
  3210. #else
  3211. ST c01, 0 * SIZE(AO)
  3212. ST c05, 1 * SIZE(AO)
  3213. ST c09, 2 * SIZE(AO)
  3214. ST c13, 3 * SIZE(AO)
  3215. #endif
  /* LN moves C1..C4 back by one element before the store.  Unlike the
     wider tiles, this block does not advance C1..C4 after the store. */
  3216. #ifdef LN
  3217. lda C1, -1 * SIZE(C1)
  3218. lda C2, -1 * SIZE(C2)
  3219. lda C3, -1 * SIZE(C3)
  3220. lda C4, -1 * SIZE(C4)
  3221. #endif
  3222. ST c01, 0 * SIZE(C1)
  3223. ST c05, 0 * SIZE(C2)
  3224. ST c09, 0 * SIZE(C3)
  3225. ST c13, 0 * SIZE(C4)
  /* RT: AORIG += K elements. */
  3226. #ifdef RT
  3227. sll K, 0 + BASE_SHIFT, TMP1
  3228. addq AORIG, TMP1, AORIG
  3229. #endif
  /* LT/RN: AO += (K - KK) elements, BO += (K - KK) * 4 elements. */
  3230. #if defined(LT) || defined(RN)
  3231. subq K, KK, TMP1
  3232. sll TMP1, BASE_SHIFT + 0, TMP2
  3233. addq AO, TMP2, AO
  3234. sll TMP1, BASE_SHIFT + 2, TMP2
  3235. addq BO, TMP2, BO
  3236. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN. */
  3237. #ifdef LT
  3238. addq KK, 1, KK
  3239. #endif
  3240. #ifdef LN
  3241. subq KK, 1, KK
  3242. #endif
  3243. .align 4
  /*
   * $L39 -- end of one iteration of the J loop.
   * Updates B and KK for the selected variant, decrements J and branches
   * back to $L01 (outside this excerpt) while J > 0.
   */
  3244. $L39:
  /* LN: B += K * 4 elements. */
  3245. #ifdef LN
  3246. sll K, 2 + BASE_SHIFT, TMP1
  3247. addq B, TMP1, B
  3248. #endif
  /* LT/RN: B takes the value BO has reached. */
  3249. #if defined(LT) || defined(RN)
  3250. mov BO, B
  3251. #endif
  /* KK moves by 4: +4 for RN, -4 for RT. */
  3252. #ifdef RN
  3253. addq KK, 4, KK
  3254. #endif
  3255. #ifdef RT
  3256. subq KK, 4, KK
  3257. #endif
  3258. lda J, -1(J)
  3259. bgt J, $L01
  3260. .align 4
  /*
   * $L999 -- common exit.
   * Reloads floating-point registers $f2..$f9 from the eight 8-byte slots
   * at the bottom of the stack frame (presumably saved there by the
   * prologue, which is outside this excerpt), clears integer register $0
   * (the integer return-value register in the Alpha calling standard),
   * releases the STACKSIZE-byte frame and returns.
   */
  3261. $L999:
  3262. ldt $f2, 0($sp)
  3263. ldt $f3, 8($sp)
  3264. ldt $f4, 16($sp)
  3265. ldt $f5, 24($sp)
  3266. ldt $f6, 32($sp)
  3267. ldt $f7, 40($sp)
  3268. ldt $f8, 48($sp)
  3269. ldt $f9, 56($sp)
  3270. clr $0
  3271. lda $sp, STACKSIZE($sp)
  3272. ret
  /* EPILOGUE is a macro defined outside this excerpt. */
  3273. EPILOGUE