You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_4x4_RT.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration and register aliases used by the kernel body    */
  /* that follows. Everything here is resolved by the C preprocessor     */
  /* before the assembler sees the file.                                 */
  /* ASSEMBLER is defined before common.h is included; presumably it     */
  /* selects the assembly-language view of that header (confirm there).  */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "version.h"
  /* Refuse to build unless one of the EV4 / EV5 / EV6 targets is set.   */
  41. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  42. #error "Architecture is not specified."
  43. #endif
  /* Per-target tuning. UNOP expands to a real `unop` only on EV6; on    */
  /* EV5 and EV4 it expands to nothing. PREFETCHSIZE is defined for EV6  */
  /* and EV5 only, so any use of it must be guarded for EV4.             */
  44. #ifdef EV6
  45. #define PREFETCHSIZE 56
  46. #define UNOP unop
  47. #endif
  48. #ifdef EV5
  49. #define PREFETCHSIZE 56
  50. #define UNOP
  51. #endif
  52. #ifdef EV4
  53. #define UNOP
  54. #endif
  /* Size of the stack frame in bytes. The prologue lowers $sp by this   */
  /* amount, stores $f2-$f9 at offsets 0..56 of the frame, and reads the */
  /* stack-passed arguments at STACKSIZE + 0, + 8 and + 16.              */
  55. #define STACKSIZE 80
  /* Problem dimensions; the prologue branches to the exit label when    */
  /* any of them is <= 0.                                                */
  56. #define M $16
  57. #define N $17
  58. #define K $18
  /* Base pointers of the two input operands.                            */
  59. #define A $20
  60. #define B $21
  /* C and LDC are loaded from the caller's stack area in the prologue;  */
  /* LDC is then rescaled with SXADDQ before use.                        */
  61. #define C $22
  62. #define LDC $23
  /* Output pointers. C1 is the one the visible single-column code       */
  /* stores results through; C2-C4 are presumably the pointers for the   */
  /* other columns of a wider block (TODO confirm in the rest of file).  */
  63. #define C1 $19
  64. #define C2 $24
  65. #define C3 $25
  66. #define C4 $27
  /* Running pointers into the A and B data used by the inner loops.     */
  /* NOTE(review): AO lives in $at, conventionally the assembler's       */
  /* temporary register - confirm the build disables the assembler's own */
  /* use of it.                                                          */
  67. #define AO $at
  68. #define BO $5
  /* Loop counters: I counts row blocks of M, J is derived from N, and   */
  /* L counts iterations of the innermost multiply-accumulate loops.     */
  69. #define I $6
  70. #define J $7
  71. #define L $8
  /* Floating-point registers receiving values loaded through AO (a*)    */
  /* and BO (b*).                                                        */
  72. #define a1 $f16
  73. #define a2 $f17
  74. #define a3 $f18
  75. #define a4 $f19
  76. #define b1 $f20
  77. #define b2 $f21
  78. #define b3 $f22
  79. #define b4 $f23
  /* Temporaries holding MUL results before they are added into the      */
  /* c* accumulators.                                                    */
  80. #define t1 $f24
  81. #define t2 $f25
  82. #define t3 $f26
  83. #define t4 $f27
  /* Extra operand registers. a5 is used by the unrolled loops to hold   */
  /* a value that must outlive the reload of a4.                         */
  /* NOTE(review): a6 and alpha are both bound to $f30; confirm they are */
  /* never live at the same time.                                        */
  84. #define a5 $f28
  85. #define a6 $f30
  86. #define b5 $f29
  87. #define alpha $f30
  /* Accumulator / result registers, mapped onto $f0-$f15 in order. The  */
  /* prologue saves $f2-$f9 (c03-c10) to the stack frame before they are */
  /* overwritten.                                                        */
  88. #define c01 $f0
  89. #define c02 $f1
  90. #define c03 $f2
  91. #define c04 $f3
  92. #define c05 $f4
  93. #define c06 $f5
  94. #define c07 $f6
  95. #define c08 $f7
  96. #define c09 $f8
  97. #define c10 $f9
  98. #define c11 $f10
  99. #define c12 $f11
  100. #define c13 $f12
  101. #define c14 $f13
  102. #define c15 $f14
  103. #define c16 $f15
  /* Integer scratch and bookkeeping registers. TMP1/TMP2 hold byte      */
  /* offsets and remaining counts; KK is the running offset counter      */
  /* initialised from OFFSET; AORIG keeps the A base for the LN/RT       */
  /* variants; OFFSET is the third stack argument loaded in the          */
  /* prologue.                                                           */
  104. #define TMP1 $0
  105. #define TMP2 $1
  106. #define KK $2
  107. #define AORIG $3
  108. #define OFFSET $4
  109. PROLOGUE
  110. PROFCODE
  111. .frame $sp, STACKSIZE, $26, 0
  112. lda $sp, -STACKSIZE($sp)
  113. ldq C, 0 + STACKSIZE($sp)
  114. ldq LDC, 8 + STACKSIZE($sp)
  115. ldq OFFSET, 16 + STACKSIZE($sp)
  116. SXADDQ LDC, 0, LDC
  117. stt $f2, 0($sp)
  118. stt $f3, 8($sp)
  119. stt $f4, 16($sp)
  120. stt $f5, 24($sp)
  121. stt $f6, 32($sp)
  122. stt $f7, 40($sp)
  123. stt $f8, 48($sp)
  124. stt $f9, 56($sp)
  125. cmple M, 0, $0
  126. cmple N, 0, $1
  127. cmple K, 0, $2
  128. or $0, $1, $0
  129. or $0, $2, $0
  130. bne $0, $L999
  131. #ifdef LN
  132. mulq M, K, TMP1
  133. SXADDQ TMP1, A, A
  134. SXADDQ M, C, C
  135. #endif
  136. #ifdef RN
  137. negq OFFSET, KK
  138. #endif
  139. #ifdef RT
  140. mulq N, K, TMP1
  141. SXADDQ TMP1, B, B
  142. mulq N, LDC, TMP1
  143. addq TMP1, C, C
  144. subq N, OFFSET, KK
  145. #endif
  146. and N, 1, J
  147. ble J, $L40
  148. #ifdef RT
  149. sll K, BASE_SHIFT, TMP1
  150. subq B, TMP1, B
  151. subq C, LDC, C
  152. #endif
  153. mov C, C1
  154. #ifndef RT
  155. addq C, LDC, C
  156. #endif
  157. #ifdef LN
  158. addq M, OFFSET, KK
  159. #endif
  160. #ifdef LT
  161. mov OFFSET, KK
  162. #endif
  163. #if defined(LN) || defined(RT)
  164. mov A, AORIG
  165. #else
  166. mov A, AO
  167. #endif
  168. sra M, 2, I
  169. ble I, $L100
  170. .align 4
  171. $L91:
  172. #if defined(LT) || defined(RN)
  173. LD a1, 0 * SIZE(AO)
  174. fclr t1
  175. LD a2, 1 * SIZE(AO)
  176. fclr t2
  177. LD a3, 2 * SIZE(AO)
  178. fclr t3
  179. LD a4, 3 * SIZE(AO)
  180. fclr t4
  181. LD b1, 0 * SIZE(B)
  182. fclr c01
  183. LD b2, 1 * SIZE(B)
  184. fclr c02
  185. LD b3, 2 * SIZE(B)
  186. fclr c03
  187. LD b4, 3 * SIZE(B)
  188. fclr c04
  189. sra KK, 2, L
  190. mov B, BO
  191. ble L, $L95
  192. #else
  193. #ifdef LN
  194. sll K, BASE_SHIFT + 2, TMP1
  195. subq AORIG, TMP1, AORIG
  196. #endif
  197. sll KK, BASE_SHIFT + 2, TMP1
  198. addq AORIG, TMP1, AO
  199. sll KK, BASE_SHIFT + 0, TMP1
  200. addq B, TMP1, BO
  201. subq K, KK, TMP1
  202. LD a1, 0 * SIZE(AO)
  203. fclr t1
  204. LD a2, 1 * SIZE(AO)
  205. fclr t2
  206. LD a3, 2 * SIZE(AO)
  207. fclr t3
  208. LD a4, 3 * SIZE(AO)
  209. fclr t4
  210. LD b1, 0 * SIZE(BO)
  211. fclr c01
  212. LD b2, 1 * SIZE(BO)
  213. fclr c02
  214. LD b3, 2 * SIZE(BO)
  215. fclr c03
  216. LD b4, 3 * SIZE(BO)
  217. fclr c04
  218. sra TMP1, 2, L
  219. unop
  220. ble L, $L95
  221. #endif
  222. .align 5
  223. $L92:
  224. ADD c01, t1, c01
  225. unop
  226. MUL a1, b1, t1
  227. LD a1, 4 * SIZE(AO)
  228. ADD c02, t2, c02
  229. lda L, -1(L)
  230. MUL a2, b1, t2
  231. LD a2, 5 * SIZE(AO)
  232. ADD c03, t3, c03
  233. unop
  234. MUL a3, b1, t3
  235. LD a3, 6 * SIZE(AO)
  236. ADD c04, t4, c04
  237. MUL a4, b1, t4
  238. LD a4, 7 * SIZE(AO)
  239. LD b1, 4 * SIZE(BO)
  240. ADD c01, t1, c01
  241. unop
  242. MUL a1, b2, t1
  243. LD a1, 8 * SIZE(AO)
  244. ADD c02, t2, c02
  245. unop
  246. MUL a2, b2, t2
  247. LD a2, 9 * SIZE(AO)
  248. ADD c03, t3, c03
  249. unop
  250. MUL a3, b2, t3
  251. LD a3, 10 * SIZE(AO)
  252. ADD c04, t4, c04
  253. MUL a4, b2, t4
  254. LD a4, 11 * SIZE(AO)
  255. LD b2, 5 * SIZE(BO)
  256. ADD c01, t1, c01
  257. unop
  258. MUL a1, b3, t1
  259. LD a1, 12 * SIZE(AO)
  260. ADD c02, t2, c02
  261. unop
  262. MUL a2, b3, t2
  263. LD a2, 13 * SIZE(AO)
  264. ADD c03, t3, c03
  265. unop
  266. MUL a3, b3, t3
  267. LD a3, 14 * SIZE(AO)
  268. ADD c04, t4, c04
  269. MUL a4, b3, t4
  270. LD a5, 15 * SIZE(AO)
  271. LD b3, 6 * SIZE(BO)
  272. ADD c01, t1, c01
  273. MUL a1, b4, t1
  274. LD a1, 16 * SIZE(AO)
  275. lda AO, 16 * SIZE(AO)
  276. ADD c02, t2, c02
  277. lda BO, 4 * SIZE(BO)
  278. MUL a2, b4, t2
  279. LD a2, 1 * SIZE(AO)
  280. ADD c03, t3, c03
  281. LD a4, 3 * SIZE(AO)
  282. MUL a3, b4, t3
  283. LD a3, 2 * SIZE(AO)
  284. ADD c04, t4, c04
  285. MUL a5, b4, t4
  286. LD b4, 3 * SIZE(BO)
  287. bgt L, $L92
  288. .align 4
  289. $L95:
  290. #if defined(LT) || defined(RN)
  291. and KK, 3, L
  292. #else
  293. and TMP1, 3, L
  294. #endif
  295. unop
  296. ble L, $L98
  297. .align 4
  298. $L96:
  299. ADD c01, t1, c01
  300. lda L, -1(L)
  301. MUL a1, b1, t1
  302. LD a1, 4 * SIZE(AO)
  303. ADD c02, t2, c02
  304. lda BO, 1 * SIZE(BO)
  305. MUL a2, b1, t2
  306. LD a2, 5 * SIZE(AO)
  307. ADD c03, t3, c03
  308. unop
  309. MUL a3, b1, t3
  310. LD a3, 6 * SIZE(AO)
  311. ADD c04, t4, c04
  312. MUL a4, b1, t4
  313. LD a4, 7 * SIZE(AO)
  314. LD b1, 0 * SIZE(BO)
  315. lda AO, 4 * SIZE(AO)
  316. bgt L, $L96
  317. .align 4
  318. $L98:
  319. ADD c01, t1, c01
  320. ADD c02, t2, c02
  321. ADD c03, t3, c03
  322. ADD c04, t4, c04
  323. #if defined(LN) || defined(RT)
  324. #ifdef LN
  325. subq KK, 4, TMP1
  326. #else
  327. subq KK, 1, TMP1
  328. #endif
  329. sll TMP1, BASE_SHIFT + 2, TMP2
  330. addq AORIG, TMP2, AO
  331. sll TMP1, BASE_SHIFT + 0, TMP2
  332. addq B, TMP2, BO
  333. #endif
  334. #if defined(LN) || defined(LT)
  335. LD a1, 0 * SIZE(BO)
  336. LD a2, 1 * SIZE(BO)
  337. LD a3, 2 * SIZE(BO)
  338. LD a4, 3 * SIZE(BO)
  339. SUB a1, c01, c01
  340. SUB a2, c02, c02
  341. SUB a3, c03, c03
  342. SUB a4, c04, c04
  343. #else
  344. LD a1, 0 * SIZE(AO)
  345. LD a2, 1 * SIZE(AO)
  346. LD a3, 2 * SIZE(AO)
  347. LD a4, 3 * SIZE(AO)
  348. SUB a1, c01, c01
  349. SUB a2, c02, c02
  350. SUB a3, c03, c03
  351. SUB a4, c04, c04
  352. #endif
  353. #ifdef LN
  354. LD a1, 15 * SIZE(AO)
  355. LD a2, 14 * SIZE(AO)
  356. LD a3, 13 * SIZE(AO)
  357. LD a4, 12 * SIZE(AO)
  358. MUL a1, c04, c04
  359. MUL a2, c04, t1
  360. SUB c03, t1, c03
  361. MUL a3, c04, t1
  362. SUB c02, t1, c02
  363. MUL a4, c04, t1
  364. SUB c01, t1, c01
  365. LD b1, 10 * SIZE(AO)
  366. LD b2, 9 * SIZE(AO)
  367. LD b3, 8 * SIZE(AO)
  368. MUL b1, c03, c03
  369. MUL b2, c03, t1
  370. SUB c02, t1, c02
  371. MUL b3, c03, t1
  372. SUB c01, t1, c01
  373. LD a1, 5 * SIZE(AO)
  374. LD a2, 4 * SIZE(AO)
  375. LD a3, 0 * SIZE(AO)
  376. MUL a1, c02, c02
  377. MUL a2, c02, t1
  378. SUB c01, t1, c01
  379. MUL a3, c01, c01
  380. #endif
  381. #ifdef LT
  382. LD a1, 0 * SIZE(AO)
  383. LD a2, 1 * SIZE(AO)
  384. LD a3, 2 * SIZE(AO)
  385. LD a4, 3 * SIZE(AO)
  386. MUL a1, c01, c01
  387. MUL a2, c01, t1
  388. SUB c02, t1, c02
  389. MUL a3, c01, t1
  390. SUB c03, t1, c03
  391. MUL a4, c01, t1
  392. SUB c04, t1, c04
  393. LD b1, 5 * SIZE(AO)
  394. LD b2, 6 * SIZE(AO)
  395. LD b3, 7 * SIZE(AO)
  396. MUL b1, c02, c02
  397. MUL b2, c02, t1
  398. SUB c03, t1, c03
  399. MUL b3, c02, t1
  400. SUB c04, t1, c04
  401. LD a1, 10 * SIZE(AO)
  402. LD a2, 11 * SIZE(AO)
  403. LD a3, 15 * SIZE(AO)
  404. MUL a1, c03, c03
  405. MUL a2, c03, t1
  406. SUB c04, t1, c04
  407. MUL a3, c04, c04
  408. #endif
  409. #if defined(RN) || defined(RT)
  410. LD a1, 0 * SIZE(BO)
  411. MUL a1, c01, c01
  412. MUL a1, c02, c02
  413. MUL a1, c03, c03
  414. MUL a1, c04, c04
  415. #endif
  416. #if defined(LN) || defined(LT)
  417. ST c01, 0 * SIZE(BO)
  418. ST c02, 1 * SIZE(BO)
  419. ST c03, 2 * SIZE(BO)
  420. ST c04, 3 * SIZE(BO)
  421. #else
  422. ST c01, 0 * SIZE(AO)
  423. ST c02, 1 * SIZE(AO)
  424. ST c03, 2 * SIZE(AO)
  425. ST c04, 3 * SIZE(AO)
  426. #endif
  427. #ifdef LN
  428. lda C1, -4 * SIZE(C1)
  429. #endif
  430. ST c01, 0 * SIZE(C1)
  431. ST c02, 1 * SIZE(C1)
  432. ST c03, 2 * SIZE(C1)
  433. ST c04, 3 * SIZE(C1)
  434. #ifndef LN
  435. lda C1, 4 * SIZE(C1)
  436. #endif
  437. fclr t1
  438. fclr t2
  439. fclr t3
  440. fclr t4
  441. #ifdef RT
  442. sll K, 2 + BASE_SHIFT, TMP1
  443. addq AORIG, TMP1, AORIG
  444. #endif
  445. #if defined(LT) || defined(RN)
  446. subq K, KK, TMP1
  447. sll TMP1, BASE_SHIFT + 2, TMP2
  448. addq AO, TMP2, AO
  449. sll TMP1, BASE_SHIFT + 0, TMP2
  450. addq BO, TMP2, BO
  451. #endif
  452. #ifdef LT
  453. addq KK, 4, KK
  454. #endif
  455. #ifdef LN
  456. subq KK, 4, KK
  457. #endif
  458. lda I, -1(I)
  459. bgt I, $L91
  460. .align 4
  461. $L100:
  462. and M, 2, I
  463. ble I, $L110
  464. #if defined(LT) || defined(RN)
  465. LD a1, 0 * SIZE(AO)
  466. fclr t1
  467. LD a2, 1 * SIZE(AO)
  468. fclr t2
  469. LD a3, 2 * SIZE(AO)
  470. fclr t3
  471. LD a4, 3 * SIZE(AO)
  472. fclr t4
  473. LD b1, 0 * SIZE(B)
  474. fclr c01
  475. LD b2, 1 * SIZE(B)
  476. fclr c02
  477. LD b3, 2 * SIZE(B)
  478. fclr c03
  479. LD b4, 3 * SIZE(B)
  480. fclr c04
  481. sra KK, 2, L
  482. mov B, BO
  483. ble L, $L105
  484. #else
  485. #ifdef LN
  486. sll K, BASE_SHIFT + 1, TMP1
  487. subq AORIG, TMP1, AORIG
  488. #endif
  489. sll KK, BASE_SHIFT + 1, TMP1
  490. addq AORIG, TMP1, AO
  491. sll KK, BASE_SHIFT + 0, TMP1
  492. addq B, TMP1, BO
  493. subq K, KK, TMP1
  494. LD a1, 0 * SIZE(AO)
  495. fclr t1
  496. LD a2, 1 * SIZE(AO)
  497. fclr t2
  498. LD a3, 2 * SIZE(AO)
  499. fclr t3
  500. LD a4, 3 * SIZE(AO)
  501. fclr t4
  502. LD b1, 0 * SIZE(BO)
  503. fclr c01
  504. LD b2, 1 * SIZE(BO)
  505. fclr c02
  506. LD b3, 2 * SIZE(BO)
  507. fclr c03
  508. LD b4, 3 * SIZE(BO)
  509. fclr c04
  510. sra TMP1, 2, L
  511. ble L, $L105
  512. #endif
  513. .align 5
  514. $L102:
  515. ADD c01, t1, c01
  516. lda L, -1(L)
  517. MUL a1, b1, t1
  518. LD a1, 4 * SIZE(AO)
  519. ADD c02, t2, c02
  520. MUL a2, b1, t2
  521. LD a2, 5 * SIZE(AO)
  522. LD b1, 4 * SIZE(BO)
  523. ADD c03, t3, c03
  524. lda BO, 4 * SIZE(BO)
  525. MUL a3, b2, t3
  526. LD a3, 6 * SIZE(AO)
  527. ADD c04, t4, c04
  528. MUL a4, b2, t4
  529. LD a5, 7 * SIZE(AO)
  530. LD b2, 1 * SIZE(BO)
  531. ADD c01, t1, c01
  532. MUL a1, b3, t1
  533. LD a1, 8 * SIZE(AO)
  534. lda AO, 8 * SIZE(AO)
  535. ADD c02, t2, c02
  536. MUL a2, b3, t2
  537. LD b3, 2 * SIZE(BO)
  538. LD a2, 1 * SIZE(AO)
  539. ADD c03, t3, c03
  540. LD a4, 3 * SIZE(AO)
  541. MUL a3, b4, t3
  542. LD a3, 2 * SIZE(AO)
  543. ADD c04, t4, c04
  544. MUL a5, b4, t4
  545. LD b4, 3 * SIZE(BO)
  546. bgt L, $L102
  547. .align 4
  548. $L105:
  549. #if defined(LT) || defined(RN)
  550. and KK, 3, L
  551. #else
  552. and TMP1, 3, L
  553. #endif
  554. ble L, $L108
  555. .align 4
  556. $L106:
  557. ADD c01, t1, c01
  558. lda L, -1(L)
  559. MUL a1, b1, t1
  560. LD a1, 2 * SIZE(AO)
  561. ADD c02, t2, c02
  562. MUL a2, b1, t2
  563. LD a2, 3 * SIZE(AO)
  564. LD b1, 1 * SIZE(BO)
  565. lda AO, 2 * SIZE(AO)
  566. unop
  567. lda BO, 1 * SIZE(BO)
  568. bgt L, $L106
  569. .align 4
  570. $L108:
  571. ADD c01, t1, c01
  572. ADD c02, t2, c02
  573. ADD c03, t3, c03
  574. ADD c04, t4, c04
  575. ADD c01, c03, c01
  576. ADD c02, c04, c02
  577. #if defined(LN) || defined(RT)
  578. #ifdef LN
  579. subq KK, 2, TMP1
  580. #else
  581. subq KK, 1, TMP1
  582. #endif
  583. sll TMP1, BASE_SHIFT + 1, TMP2
  584. addq AORIG, TMP2, AO
  585. sll TMP1, BASE_SHIFT + 0, TMP2
  586. addq B, TMP2, BO
  587. #endif
  588. #if defined(LN) || defined(LT)
  589. LD a1, 0 * SIZE(BO)
  590. LD a2, 1 * SIZE(BO)
  591. SUB a1, c01, c01
  592. SUB a2, c02, c02
  593. #else
  594. LD a1, 0 * SIZE(AO)
  595. LD a2, 1 * SIZE(AO)
  596. SUB a1, c01, c01
  597. SUB a2, c02, c02
  598. #endif
  599. #ifdef LN
  600. LD a1, 3 * SIZE(AO)
  601. LD a2, 2 * SIZE(AO)
  602. LD a3, 0 * SIZE(AO)
  603. MUL a1, c02, c02
  604. MUL a2, c02, t1
  605. SUB c01, t1, c01
  606. MUL a3, c01, c01
  607. #endif
  608. #ifdef LT
  609. LD a1, 0 * SIZE(AO)
  610. LD a2, 1 * SIZE(AO)
  611. LD a3, 3 * SIZE(AO)
  612. MUL a1, c01, c01
  613. MUL a2, c01, t1
  614. SUB c02, t1, c02
  615. MUL a3, c02, c02
  616. #endif
  617. #if defined(RN) || defined(RT)
  618. LD a1, 0 * SIZE(BO)
  619. MUL a1, c01, c01
  620. MUL a1, c02, c02
  621. #endif
  622. #if defined(LN) || defined(LT)
  623. ST c01, 0 * SIZE(BO)
  624. ST c02, 1 * SIZE(BO)
  625. #else
  626. ST c01, 0 * SIZE(AO)
  627. ST c02, 1 * SIZE(AO)
  628. #endif
  629. #ifdef LN
  630. lda C1, -2 * SIZE(C1)
  631. #endif
  632. ST c01, 0 * SIZE(C1)
  633. ST c02, 1 * SIZE(C1)
  634. #ifndef LN
  635. lda C1, 2 * SIZE(C1)
  636. #endif
  637. fclr t1
  638. fclr t2
  639. fclr t3
  640. fclr t4
  641. #ifdef RT
  642. sll K, 1 + BASE_SHIFT, TMP1
  643. addq AORIG, TMP1, AORIG
  644. #endif
  645. #if defined(LT) || defined(RN)
  646. subq K, KK, TMP1
  647. sll TMP1, BASE_SHIFT + 1, TMP2
  648. addq AO, TMP2, AO
  649. sll TMP1, BASE_SHIFT + 0, TMP2
  650. addq BO, TMP2, BO
  651. #endif
  652. #ifdef LT
  653. addq KK, 2, KK
  654. #endif
  655. #ifdef LN
  656. subq KK, 2, KK
  657. #endif
  658. .align 4
  /*
   * $L110: set up the dot-product accumulation for the single leftover
   * row (bit 0 of M).  Branches to $L119 when that bit is clear.
   * Preloads four A values (a1-a4) and four B values (b1-b4), clears the
   * partial sums c01-c04 and the product temporaries t1-t4, and leaves in
   * L the number of 4-step iterations for $L112.
   * LD and SIZE are macros defined outside this chunk (presumably the
   * precision-specific load and the element size in bytes).
   */
  659. $L110:
  660. and M, 1, I
  661. ble I, $L119
  662. #if defined(LT) || defined(RN)
  /* LT/RN: accumulate over KK steps, reading B from its start (BO = B). */
  663. LD a1, 0 * SIZE(AO)
  664. fclr t1
  665. LD a2, 1 * SIZE(AO)
  666. fclr t2
  667. LD a3, 2 * SIZE(AO)
  668. fclr t3
  669. LD a4, 3 * SIZE(AO)
  670. fclr t4
  671. LD b1, 0 * SIZE(B)
  672. fclr c01
  673. LD b2, 1 * SIZE(B)
  674. fclr c02
  675. LD b3, 2 * SIZE(B)
  676. fclr c03
  677. LD b4, 3 * SIZE(B)
  678. fclr c04
  679. sra KK, 2, L
  680. mov B, BO
  681. unop
  682. ble L, $L115
  683. #else
  /*
   * LN/RT: LN first moves AORIG back by K elements' worth of bytes.  AO and
   * BO then start KK elements into AORIG and B, and the accumulation runs
   * over the remaining K - KK steps (kept in TMP1 for $L115).
   */
  684. #ifdef LN
  685. sll K, BASE_SHIFT + 0, TMP1
  686. subq AORIG, TMP1, AORIG
  687. #endif
  688. sll KK, BASE_SHIFT + 0, TMP1
  689. addq AORIG, TMP1, AO
  690. sll KK, BASE_SHIFT + 0, TMP1
  691. addq B, TMP1, BO
  692. subq K, KK, TMP1
  693. LD a1, 0 * SIZE(AO)
  694. fclr t1
  695. LD a2, 1 * SIZE(AO)
  696. fclr t2
  697. LD a3, 2 * SIZE(AO)
  698. fclr t3
  699. LD a4, 3 * SIZE(AO)
  700. fclr t4
  701. LD b1, 0 * SIZE(BO)
  702. fclr c01
  703. LD b2, 1 * SIZE(BO)
  704. fclr c02
  705. LD b3, 2 * SIZE(BO)
  706. fclr c03
  707. LD b4, 3 * SIZE(BO)
  708. fclr c04
  709. sra TMP1, 2, L
  710. unop
  711. ble L, $L115
  712. #endif
  713. .align 4
  /*
   * $L112: main accumulation loop for the single-row, single-column case,
   * four K-steps per iteration.  Each of c01-c04 accumulates its own
   * stream of a_i * b_i products.  The loop is software-pipelined: every
   * ADD folds in the product left in t1-t4 by the previous MUL on that
   * register, so one product per stream is still pending at loop exit.
   * Operands for the next iteration are loaded from offsets 4..7 before
   * AO and BO are advanced by four elements.  L counts iterations.
   * ADD/MUL are macros defined outside this chunk (presumably the
   * precision-specific add/multiply in src, src, dest operand order).
   */
  714. $L112:
  715. ADD c01, t1, c01
  716. MUL a1, b1, t1
  717. LD a1, 4 * SIZE(AO)
  718. LD b1, 4 * SIZE(BO)
  719. ADD c02, t2, c02
  720. MUL a2, b2, t2
  721. LD a2, 5 * SIZE(AO)
  722. LD b2, 5 * SIZE(BO)
  723. ADD c03, t3, c03
  724. MUL a3, b3, t3
  725. LD a3, 6 * SIZE(AO)
  726. LD b3, 6 * SIZE(BO)
  727. ADD c04, t4, c04
  728. MUL a4, b4, t4
  729. LD a4, 7 * SIZE(AO)
  730. LD b4, 7 * SIZE(BO)
  731. lda L, -1(L)
  732. lda AO, 4 * SIZE(AO)
  733. lda BO, 4 * SIZE(BO)
  734. bgt L, $L112
  735. .align 4
  /*
   * $L115/$L116: remainder of the single-row, single-column accumulation.
   * L becomes the low two bits of the step count (KK for LT/RN, TMP1
   * otherwise); if it is zero, control goes straight to $L118.  $L116 then
   * performs one K-step per iteration, using only the c01/t1 stream and
   * advancing AO and BO by one element each time.
   */
  736. $L115:
  737. #if defined(LT) || defined(RN)
  738. and KK, 3, L
  739. #else
  740. and TMP1, 3, L
  741. #endif
  742. ble L, $L118
  743. .align 4
  744. $L116:
  745. ADD c01, t1, c01
  746. MUL a1, b1, t1
  747. LD a1, 1 * SIZE(AO)
  748. LD b1, 1 * SIZE(BO)
  749. lda L, -1(L)
  750. lda AO, 1 * SIZE(AO)
  751. lda BO, 1 * SIZE(BO)
  752. bgt L, $L116
  753. .align 4
  /*
   * $L118: finish the single-row, single-column tile.
   *  1. Drain the pending products t1-t4 and reduce the four partial sums
   *     into c01.
   *  2. Form (stored value) - c01, then multiply by one coefficient.
   *  3. Write the result to the packed buffer and to C1, then update the
   *     A/B pointers and KK for the next tile.
   * SUB/ST/SXADDQ are macros defined outside this chunk.
   */
  754. $L118:
  755. ADD c01, t1, c01
  756. ADD c02, t2, c02
  757. ADD c03, t3, c03
  758. ADD c04, t4, c04
  759. ADD c01, c02, c01
  760. ADD c03, c04, c03
  761. ADD c01, c03, c01
  762. #if defined(LN) || defined(RT)
  /* LN/RT: point AO and BO at element KK - 1 of AORIG and B. */
  763. subq KK, 1, TMP1
  764. sll TMP1, BASE_SHIFT + 0, TMP2
  765. addq AORIG, TMP2, AO
  766. addq B, TMP2, BO
  767. #endif
  /* The value being solved for is read from BO (LN/LT) or AO (RN/RT). */
  768. #if defined(LN) || defined(LT)
  769. LD a1, 0 * SIZE(BO)
  770. SUB a1, c01, c01
  771. #else
  772. LD a1, 0 * SIZE(AO)
  773. SUB a1, c01, c01
  774. #endif
  /*
   * Scale by the coefficient at offset 0 of the other operand.
   * NOTE(review): the code multiplies rather than divides, so this is
   * presumably a pre-inverted diagonal entry -- confirm against the
   * packing routine.
   */
  775. #if defined(LN) || defined(LT)
  776. LD a1, 0 * SIZE(AO)
  777. MUL a1, c01, c01
  778. #endif
  779. #if defined(RN) || defined(RT)
  780. LD a1, 0 * SIZE(BO)
  781. MUL a1, c01, c01
  782. #endif
  /* Write the solved value back to the buffer it was read from. */
  783. #if defined(LN) || defined(LT)
  784. ST c01, 0 * SIZE(BO)
  785. #else
  786. ST c01, 0 * SIZE(AO)
  787. #endif
  /* LN steps C1 back one element before the store; others step it forward after. */
  788. #ifdef LN
  789. lda C1, -1 * SIZE(C1)
  790. #endif
  791. ST c01, 0 * SIZE(C1)
  792. #ifndef LN
  793. lda C1, 1 * SIZE(C1)
  794. #endif
  795. #ifdef RT
  796. SXADDQ K, AORIG, AORIG
  797. #endif
  /* LT/RN: skip the K - KK elements of A and B that were not consumed. */
  798. #if defined(LT) || defined(RN)
  799. subq K, KK, TMP1
  800. sll TMP1, BASE_SHIFT + 0, TMP2
  801. addq AO, TMP2, AO
  802. addq BO, TMP2, BO
  803. #endif
  804. #ifdef LT
  805. addq KK, 1, KK
  806. #endif
  807. #ifdef LN
  808. subq KK, 1, KK
  809. #endif
  810. .align 4
  /*
   * $L119: end of the one-column pass.  LN advances B with SXADDQ K
   * (macro defined outside this chunk); LT/RN adopt the running pointer BO
   * as the new B.  RN increments KK by one, RT decrements it by one.
   */
  811. $L119:
  812. #ifdef LN
  813. SXADDQ K, B, B
  814. #endif
  815. #if defined(LT) || defined(RN)
  816. mov BO, B
  817. #endif
  818. #ifdef RN
  819. addq KK, 1, KK
  820. #endif
  821. #ifdef RT
  822. subq KK, 1, KK
  823. #endif
  824. .align 4
  /*
   * $L40: two-column pass, taken when bit 1 of N is set; otherwise control
   * goes to $L80.  Sets the column pointers C1 = C and C2 = C + LDC,
   * initialises KK and the A pointer for the selected variant, clears
   * t1-t4, and leaves in I the number of 4-row tiles (M shifted right by
   * 2).  With no 4-row tiles, control goes to $L60.
   */
  825. $L40:
  826. and N, 2, J
  827. ble J, $L80
  828. #ifdef RT
  /* RT: move B back by 2 * K elements' worth of bytes and C back by 2 * LDC. */
  829. sll K, 1 + BASE_SHIFT, TMP1
  830. subq B, TMP1, B
  831. addq LDC, LDC, TMP1
  832. subq C, TMP1, C
  833. #endif
  834. mov C, C1
  835. addq C, LDC, C2
  836. fclr t1
  837. #ifndef RT
  /* Non-RT: advance C past the two columns handled here (C = C2 + LDC). */
  838. addq C2, LDC, C
  839. #endif
  840. fclr t2
  841. #ifdef LN
  842. addq M, OFFSET, KK
  843. #endif
  844. #ifdef LT
  845. mov OFFSET, KK
  846. #endif
  847. #if defined(LN) || defined(RT)
  848. mov A, AORIG
  849. #else
  850. mov A, AO
  851. #endif
  852. sra M, 2, I
  853. fclr t3
  854. fclr t4
  855. ble I, $L60
  856. .align 4
  /*
   * $L51: set up accumulation for a 4-row by 2-column tile.
   * Preloads a1-a4 (one K-step of four A values) and b1-b4 (two K-steps of
   * two B values), clears the eight accumulators c01-c08, then advances
   * AO by four and BO by two elements past the first K-step.
   * L = count - 2, where count is KK (LT/RN) or K - KK held in TMP1
   * (LN/RT).  count not above 0 skips accumulation entirely ($L58);
   * L not above 0 goes straight to the tail ($L55).
   */
  857. $L51:
  858. #if defined(LT) || defined(RN)
  859. LD a1, 0 * SIZE(AO)
  860. fclr c03
  861. LD a2, 1 * SIZE(AO)
  862. fclr c07
  863. LD a3, 2 * SIZE(AO)
  864. fclr c04
  865. LD a4, 3 * SIZE(AO)
  866. fclr c08
  867. LD b1, 0 * SIZE(B)
  868. fclr c01
  869. LD b2, 1 * SIZE(B)
  870. fclr c05
  871. LD b3, 2 * SIZE(B)
  872. fclr c02
  873. LD b4, 3 * SIZE(B)
  874. fclr c06
  875. lda L, -2(KK)
  876. lda BO, 2 * SIZE(B)
  877. lda AO, 4 * SIZE(AO)
  878. ble KK, $L58
  879. ble L, $L55
  880. #else
  /*
   * LN/RT: LN first moves AORIG back by 4 * K elements' worth of bytes.
   * AO starts 4 * KK elements into AORIG and BO 2 * KK elements into B.
   */
  881. #ifdef LN
  882. sll K, BASE_SHIFT + 2, TMP1
  883. subq AORIG, TMP1, AORIG
  884. #endif
  885. sll KK, BASE_SHIFT + 2, TMP1
  886. addq AORIG, TMP1, AO
  887. sll KK, BASE_SHIFT + 1, TMP1
  888. addq B, TMP1, BO
  889. subq K, KK, TMP1
  890. LD a1, 0 * SIZE(AO)
  891. fclr c03
  892. LD a2, 1 * SIZE(AO)
  893. fclr c07
  894. LD a3, 2 * SIZE(AO)
  895. fclr c04
  896. LD a4, 3 * SIZE(AO)
  897. fclr c08
  898. LD b1, 0 * SIZE(BO)
  899. fclr c01
  900. LD b2, 1 * SIZE(BO)
  901. fclr c05
  902. LD b3, 2 * SIZE(BO)
  903. fclr c02
  904. LD b4, 3 * SIZE(BO)
  905. fclr c06
  906. lda L, -2(TMP1)
  907. lda BO, 2 * SIZE(BO)
  908. lda AO, 4 * SIZE(AO)
  909. ble TMP1, $L58
  910. ble L, $L55
  911. #endif
  912. .align 4
  /*
   * $L52: main loop for the 4x2 tile, two K-steps per iteration (L drops
   * by 2 each time).
   * Software-pipelined: each ADD consumes the product that the previous
   * MUL on the same t register produced.  The net effect is that c01-c04
   * accumulate a1..a4 times the first B value of each K-step and c05-c08
   * accumulate a1..a4 times the second B value.
   * The second K-step keeps the fourth A value in a5, which lets a4 be
   * reloaded for the next iteration before its last use here.
   * unop lines are no-ops interleaved with the arithmetic (presumably for
   * issue-slot scheduling).
   */
  913. $L52:
  914. ADD c05, t1, c05
  915. unop
  916. MUL a1, b1, t1
  917. unop
  918. ADD c06, t2, c06
  919. lda L, -2(L)
  920. MUL a2, b1, t2
  921. unop
  922. ADD c07, t3, c07
  923. unop
  924. MUL a3, b1, t3
  925. unop
  926. ADD c08, t4, c08
  927. unop
  928. MUL a4, b1, t4
  929. LD b1, 2 * SIZE(BO)
  930. ADD c01, t1, c01
  931. unop
  932. MUL a1, b2, t1
  933. LD a1, 0 * SIZE(AO)
  934. ADD c02, t2, c02
  935. lda BO, 4 * SIZE(BO)
  936. MUL a2, b2, t2
  937. LD a2, 1 * SIZE(AO)
  938. ADD c03, t3, c03
  939. unop
  940. MUL a3, b2, t3
  941. LD a3, 2 * SIZE(AO)
  942. ADD c04, t4, c04
  943. unop
  944. MUL a4, b2, t4
  945. LD a5, 3 * SIZE(AO)
  /* Second K-step of the iteration: uses b3/b4 and the freshly loaded a1-a3, a5. */
  946. ADD c05, t1, c05
  947. unop
  948. MUL a1, b3, t1
  949. LD b2, -1 * SIZE(BO)
  950. ADD c06, t2, c06
  951. unop
  952. MUL a2, b3, t2
  953. unop
  954. ADD c07, t3, c07
  955. unop
  956. MUL a3, b3, t3
  957. lda AO, 8 * SIZE(AO)
  958. ADD c08, t4, c08
  959. unop
  960. MUL a5, b3, t4
  961. LD b3, 0 * SIZE(BO)
  962. ADD c01, t1, c01
  963. unop
  964. MUL a1, b4, t1
  965. LD a1, -4 * SIZE(AO)
  966. ADD c02, t2, c02
  967. unop
  968. MUL a2, b4, t2
  969. LD a2, -3 * SIZE(AO)
  970. ADD c03, t3, c03
  971. LD a4, -1 * SIZE(AO)
  972. MUL a3, b4, t3
  973. LD a3, -2 * SIZE(AO)
  974. ADD c04, t4, c04
  975. MUL a5, b4, t4
  976. LD b4, 1 * SIZE(BO)
  977. bgt L, $L52
  978. .align 4
  /*
   * $L55: tail of the 4x2 accumulation.  If the step count (KK for LT/RN,
   * TMP1 otherwise) is odd, only one K-step remains and control jumps to
   * $L57.  Otherwise one full K-step is performed here, reloading a1-a4
   * and b1/b2 and advancing AO by four and BO by two elements, before
   * falling through into $L57 for the last step.
   */
  979. $L55:
  980. ADD c05, t1, c05
  981. MUL a1, b1, t1
  982. #if defined(LT) || defined(RN)
  983. blbs KK, $L57
  984. #else
  985. blbs TMP1, $L57
  986. #endif
  987. .align 4
  988. ADD c06, t2, c06
  989. MUL a2, b1, t2
  990. ADD c07, t3, c07
  991. MUL a3, b1, t3
  992. ADD c08, t4, c08
  993. unop
  994. MUL a4, b1, t4
  995. LD b1, 0 * SIZE(BO)
  996. ADD c01, t1, c01
  997. unop
  998. MUL a1, b2, t1
  999. LD a1, 0 * SIZE(AO)
  1000. ADD c02, t2, c02
  1001. unop
  1002. MUL a2, b2, t2
  1003. LD a2, 1 * SIZE(AO)
  1004. ADD c03, t3, c03
  1005. unop
  1006. MUL a3, b2, t3
  1007. LD a3, 2 * SIZE(AO)
  1008. ADD c04, t4, c04
  1009. MUL a4, b2, t4
  1010. LD a4, 3 * SIZE(AO)
  1011. lda AO, 4 * SIZE(AO)
  1012. ADD c05, t1, c05
  1013. LD b2, 1 * SIZE(BO)
  1014. MUL a1, b1, t1
  1015. lda BO, 2 * SIZE(BO)
  1016. .align 4
  /*
   * $L57: last K-step of the 4x2 accumulation and pipeline drain.  Issues
   * the remaining products (no further loads), advances AO by four and BO
   * by two elements, and folds the last four products into c05-c08 so
   * that c01-c08 are complete on entry to $L58.
   */
  1017. $L57:
  1018. ADD c06, t2, c06
  1019. MUL a2, b1, t2
  1020. ADD c07, t3, c07
  1021. MUL a3, b1, t3
  1022. ADD c08, t4, c08
  1023. MUL a4, b1, t4
  1024. ADD c01, t1, c01
  1025. MUL a1, b2, t1
  1026. ADD c02, t2, c02
  1027. MUL a2, b2, t2
  1028. ADD c03, t3, c03
  1029. MUL a3, b2, t3
  1030. ADD c04, t4, c04
  1031. lda AO, 4 * SIZE(AO)
  1032. MUL a4, b2, t4
  1033. lda BO, 2 * SIZE(BO)
  1034. ADD c05, t1, c05
  1035. ADD c06, t2, c06
  1036. ADD c07, t3, c07
  1037. ADD c08, t4, c08
  1038. .align 4
  /*
   * $L58: solve and store the 4x2 tile held in c01-c08 (c01-c04 go to
   * column C1, c05-c08 to column C2).
   *  1. Position AO/BO on the data for this tile.
   *  2. Replace each accumulator by (stored value) - (accumulator).
   *  3. Apply the triangular solve for the variant selected by
   *     LN / LT / RN / RT.  Every step multiplies by a loaded coefficient.
   *     NOTE(review): the entries used for scaling are presumably
   *     pre-inverted diagonal elements -- confirm against the packing code.
   *  4. Store the results to the packed buffer and to C1/C2, update
   *     pointers and KK, and loop back to $L51 while I is positive.
   * SUB and ST are macros defined outside this chunk; SUB is used as
   * src1, src2, dest.
   */
  1039. $L58:
  1040. #if defined(LN) || defined(RT)
  /* LN/RT: index from AORIG/B using KK - 4 (LN) or KK - 2 (RT). */
  1041. #ifdef LN
  1042. subq KK, 4, TMP1
  1043. #else
  1044. subq KK, 2, TMP1
  1045. #endif
  1046. sll TMP1, BASE_SHIFT + 2, TMP2
  1047. addq AORIG, TMP2, AO
  1048. sll TMP1, BASE_SHIFT + 1, TMP2
  1049. addq B, TMP2, BO
  1050. #else
  /* LT/RN: step AO/BO back by one K-step (four and two elements). */
  1051. lda AO, -4 * SIZE(AO)
  1052. lda BO, -2 * SIZE(BO)
  1053. #endif
  1054. #if defined(LN) || defined(LT)
  /* LN/LT: values come from BO, interleaved as c01, c05, c02, c06, ... */
  1055. LD a1, 0 * SIZE(BO)
  1056. LD a2, 1 * SIZE(BO)
  1057. LD a3, 2 * SIZE(BO)
  1058. LD a4, 3 * SIZE(BO)
  1059. LD b1, 4 * SIZE(BO)
  1060. LD b2, 5 * SIZE(BO)
  1061. LD b3, 6 * SIZE(BO)
  1062. LD b4, 7 * SIZE(BO)
  1063. SUB a1, c01, c01
  1064. SUB a2, c05, c05
  1065. SUB a3, c02, c02
  1066. SUB a4, c06, c06
  1067. SUB b1, c03, c03
  1068. SUB b2, c07, c07
  1069. SUB b3, c04, c04
  1070. SUB b4, c08, c08
  1071. #else
  /* RN/RT: values come from AO in the order c01..c04 then c05..c08. */
  1072. LD a1, 0 * SIZE(AO)
  1073. LD a2, 1 * SIZE(AO)
  1074. LD a3, 2 * SIZE(AO)
  1075. LD a4, 3 * SIZE(AO)
  1076. LD b1, 4 * SIZE(AO)
  1077. LD b2, 5 * SIZE(AO)
  1078. LD b3, 6 * SIZE(AO)
  1079. LD b4, 7 * SIZE(AO)
  1080. SUB a1, c01, c01
  1081. SUB a2, c02, c02
  1082. SUB a3, c03, c03
  1083. SUB a4, c04, c04
  1084. SUB b1, c05, c05
  1085. SUB b2, c06, c06
  1086. SUB b3, c07, c07
  1087. SUB b4, c08, c08
  1088. #endif
  1089. #ifdef LN
  /*
   * LN: work from the last row upward.  Row 4 (c04, c08) is scaled by
   * AO[15] and eliminated from rows 3..1 with AO[14], AO[13], AO[12];
   * row 3 then uses AO[10], AO[9], AO[8]; row 2 uses AO[5], AO[4];
   * row 1 is scaled by AO[0].
   */
  1090. LD a1, 15 * SIZE(AO)
  1091. LD a2, 14 * SIZE(AO)
  1092. LD a3, 13 * SIZE(AO)
  1093. LD a4, 12 * SIZE(AO)
  1094. MUL a1, c04, c04
  1095. MUL a1, c08, c08
  1096. MUL a2, c04, t1
  1097. MUL a2, c08, t2
  1098. SUB c03, t1, c03
  1099. SUB c07, t2, c07
  1100. MUL a3, c04, t1
  1101. MUL a3, c08, t2
  1102. SUB c02, t1, c02
  1103. SUB c06, t2, c06
  1104. MUL a4, c04, t1
  1105. MUL a4, c08, t2
  1106. SUB c01, t1, c01
  1107. SUB c05, t2, c05
  1108. LD b1, 10 * SIZE(AO)
  1109. LD b2, 9 * SIZE(AO)
  1110. LD b3, 8 * SIZE(AO)
  1111. MUL b1, c03, c03
  1112. MUL b1, c07, c07
  1113. MUL b2, c03, t1
  1114. MUL b2, c07, t2
  1115. SUB c02, t1, c02
  1116. SUB c06, t2, c06
  1117. MUL b3, c03, t1
  1118. MUL b3, c07, t2
  1119. SUB c01, t1, c01
  1120. SUB c05, t2, c05
  1121. LD a1, 5 * SIZE(AO)
  1122. LD a2, 4 * SIZE(AO)
  1123. LD a3, 0 * SIZE(AO)
  1124. MUL a1, c02, c02
  1125. MUL a1, c06, c06
  1126. MUL a2, c02, t1
  1127. MUL a2, c06, t2
  1128. SUB c01, t1, c01
  1129. SUB c05, t2, c05
  1130. MUL a3, c01, c01
  1131. MUL a3, c05, c05
  1132. #endif
  1133. #ifdef LT
  /*
   * LT: work from the first row downward.  Row 1 (c01, c05) is scaled by
   * AO[0] and eliminated from rows 2..4 with AO[1], AO[2], AO[3];
   * row 2 then uses AO[5], AO[6], AO[7]; row 3 uses AO[10], AO[11];
   * row 4 is scaled by AO[15].
   */
  1134. LD a1, 0 * SIZE(AO)
  1135. LD a2, 1 * SIZE(AO)
  1136. LD a3, 2 * SIZE(AO)
  1137. LD a4, 3 * SIZE(AO)
  1138. MUL a1, c01, c01
  1139. MUL a1, c05, c05
  1140. MUL a2, c01, t1
  1141. MUL a2, c05, t2
  1142. SUB c02, t1, c02
  1143. SUB c06, t2, c06
  1144. MUL a3, c01, t1
  1145. MUL a3, c05, t2
  1146. SUB c03, t1, c03
  1147. SUB c07, t2, c07
  1148. MUL a4, c01, t1
  1149. MUL a4, c05, t2
  1150. SUB c04, t1, c04
  1151. SUB c08, t2, c08
  1152. LD b1, 5 * SIZE(AO)
  1153. LD b2, 6 * SIZE(AO)
  1154. LD b3, 7 * SIZE(AO)
  1155. MUL b1, c02, c02
  1156. MUL b1, c06, c06
  1157. MUL b2, c02, t1
  1158. MUL b2, c06, t2
  1159. SUB c03, t1, c03
  1160. SUB c07, t2, c07
  1161. MUL b3, c02, t1
  1162. MUL b3, c06, t2
  1163. SUB c04, t1, c04
  1164. SUB c08, t2, c08
  1165. LD a1, 10 * SIZE(AO)
  1166. LD a2, 11 * SIZE(AO)
  1167. LD a3, 15 * SIZE(AO)
  1168. MUL a1, c03, c03
  1169. MUL a1, c07, c07
  1170. MUL a2, c03, t1
  1171. MUL a2, c07, t2
  1172. SUB c04, t1, c04
  1173. SUB c08, t2, c08
  1174. MUL a3, c04, c04
  1175. MUL a3, c08, c08
  1176. #endif
  1177. #ifdef RN
  /*
   * RN: column 1 (c01-c04) is scaled by BO[0], its BO[1] multiple is
   * subtracted from column 2 (c05-c08), and column 2 is scaled by BO[3].
   */
  1178. LD a1, 0 * SIZE(BO)
  1179. LD a2, 1 * SIZE(BO)
  1180. LD a3, 3 * SIZE(BO)
  1181. MUL a1, c01, c01
  1182. MUL a1, c02, c02
  1183. MUL a1, c03, c03
  1184. MUL a1, c04, c04
  1185. MUL a2, c01, t1
  1186. MUL a2, c02, t2
  1187. MUL a2, c03, t3
  1188. MUL a2, c04, t4
  1189. SUB c05, t1, c05
  1190. SUB c06, t2, c06
  1191. SUB c07, t3, c07
  1192. SUB c08, t4, c08
  1193. MUL a3, c05, c05
  1194. MUL a3, c06, c06
  1195. MUL a3, c07, c07
  1196. MUL a3, c08, c08
  1197. #endif
  1198. #ifdef RT
  /*
   * RT: column 2 (c05-c08) is scaled by BO[3], its BO[2] multiple is
   * subtracted from column 1 (c01-c04), and column 1 is scaled by BO[0].
   */
  1199. LD a1, 3 * SIZE(BO)
  1200. LD a2, 2 * SIZE(BO)
  1201. LD a3, 0 * SIZE(BO)
  1202. MUL a1, c05, c05
  1203. MUL a1, c06, c06
  1204. MUL a1, c07, c07
  1205. MUL a1, c08, c08
  1206. MUL a2, c05, t1
  1207. MUL a2, c06, t2
  1208. MUL a2, c07, t3
  1209. MUL a2, c08, t4
  1210. SUB c01, t1, c01
  1211. SUB c02, t2, c02
  1212. SUB c03, t3, c03
  1213. SUB c04, t4, c04
  1214. MUL a3, c01, c01
  1215. MUL a3, c02, c02
  1216. MUL a3, c03, c03
  1217. MUL a3, c04, c04
  1218. #endif
  /* Write the solved tile back to the buffer it was read from, same layout. */
  1219. #if defined(LN) || defined(LT)
  1220. ST c01, 0 * SIZE(BO)
  1221. ST c05, 1 * SIZE(BO)
  1222. ST c02, 2 * SIZE(BO)
  1223. ST c06, 3 * SIZE(BO)
  1224. ST c03, 4 * SIZE(BO)
  1225. ST c07, 5 * SIZE(BO)
  1226. ST c04, 6 * SIZE(BO)
  1227. ST c08, 7 * SIZE(BO)
  1228. #else
  1229. ST c01, 0 * SIZE(AO)
  1230. ST c02, 1 * SIZE(AO)
  1231. ST c03, 2 * SIZE(AO)
  1232. ST c04, 3 * SIZE(AO)
  1233. ST c05, 4 * SIZE(AO)
  1234. ST c06, 5 * SIZE(AO)
  1235. ST c07, 6 * SIZE(AO)
  1236. ST c08, 7 * SIZE(AO)
  1237. #endif
  /* LN steps C1/C2 back four elements before the store; others step forward after. */
  1238. #ifdef LN
  1239. lda C1, -4 * SIZE(C1)
  1240. lda C2, -4 * SIZE(C2)
  1241. #endif
  1242. ST c01, 0 * SIZE(C1)
  1243. ST c02, 1 * SIZE(C1)
  1244. ST c03, 2 * SIZE(C1)
  1245. ST c04, 3 * SIZE(C1)
  1246. ST c05, 0 * SIZE(C2)
  1247. ST c06, 1 * SIZE(C2)
  1248. ST c07, 2 * SIZE(C2)
  1249. ST c08, 3 * SIZE(C2)
  1250. #ifndef LN
  1251. lda C1, 4 * SIZE(C1)
  1252. lda C2, 4 * SIZE(C2)
  1253. #endif
  /* Reset the product temporaries for the next tile's pipelined ADDs. */
  1254. fclr t1
  1255. fclr t2
  1256. fclr t3
  1257. fclr t4
  1258. #ifdef RT
  1259. sll K, 2 + BASE_SHIFT, TMP1
  1260. addq AORIG, TMP1, AORIG
  1261. #endif
  /* LT/RN: skip the K - KK steps of A (4 wide) and B (2 wide) not consumed. */
  1262. #if defined(LT) || defined(RN)
  1263. subq K, KK, TMP1
  1264. sll TMP1, BASE_SHIFT + 2, TMP2
  1265. addq AO, TMP2, AO
  1266. sll TMP1, BASE_SHIFT + 1, TMP2
  1267. addq BO, TMP2, BO
  1268. #endif
  1269. #ifdef LT
  1270. addq KK, 4, KK
  1271. #endif
  1272. #ifdef LN
  1273. subq KK, 4, KK
  1274. #endif
  /* Next 4-row tile of this two-column pass. */
  1275. lda I, -1(I)
  1276. bgt I, $L51
  1277. .align 4
  /*
   * $L60: set up accumulation for a 2-row by 2-column tile, taken when
   * bit 1 of M is set (otherwise control goes to $L70).
   * Preloads a1-a4 and b1-b4 (two K-steps of two A and two B values),
   * clears c01, c02, c05, c06, and advances AO and BO two elements past
   * the first K-step.  L = count - 2, where count is KK (LT/RN) or K - KK
   * in TMP1 (LN/RT); count not above 0 goes to $L68, L not above 0 goes
   * to $L65.
   */
  1278. $L60:
  1279. and M, 2, I
  1280. ble I, $L70
  1281. #if defined(LT) || defined(RN)
  1282. LD a1, 0 * SIZE(AO)
  1283. fclr c01
  1284. LD a2, 1 * SIZE(AO)
  1285. fclr c05
  1286. LD a3, 2 * SIZE(AO)
  1287. fclr c02
  1288. LD a4, 3 * SIZE(AO)
  1289. fclr c06
  1290. LD b1, 0 * SIZE(B)
  1291. lda L, -2(KK)
  1292. LD b2, 1 * SIZE(B)
  1293. lda AO, 2 * SIZE(AO)
  1294. LD b3, 2 * SIZE(B)
  1295. LD b4, 3 * SIZE(B)
  1296. lda BO, 2 * SIZE(B)
  1297. ble KK, $L68
  1298. ble L, $L65
  1299. #else
  /*
   * LN/RT: LN first moves AORIG back by 2 * K elements' worth of bytes.
   * AO and BO both start 2 * KK elements into AORIG and B.
   */
  1300. #ifdef LN
  1301. sll K, BASE_SHIFT + 1, TMP1
  1302. subq AORIG, TMP1, AORIG
  1303. #endif
  1304. sll KK, BASE_SHIFT + 1, TMP1
  1305. addq AORIG, TMP1, AO
  1306. sll KK, BASE_SHIFT + 1, TMP1
  1307. addq B, TMP1, BO
  1308. subq K, KK, TMP1
  1309. LD a1, 0 * SIZE(AO)
  1310. fclr c01
  1311. LD a2, 1 * SIZE(AO)
  1312. fclr c05
  1313. LD a3, 2 * SIZE(AO)
  1314. fclr c02
  1315. LD a4, 3 * SIZE(AO)
  1316. fclr c06
  1317. LD b1, 0 * SIZE(BO)
  1318. lda L, -2(TMP1)
  1319. LD b2, 1 * SIZE(BO)
  1320. lda AO, 2 * SIZE(AO)
  1321. LD b3, 2 * SIZE(BO)
  1322. LD b4, 3 * SIZE(BO)
  1323. lda BO, 2 * SIZE(BO)
  1324. ble TMP1, $L68
  1325. ble L, $L65
  1326. #endif
  1327. .align 4
  /*
   * $L62: main loop for the 2x2 tile, two K-steps per iteration (L drops
   * by 2, AO and BO advance by four elements).
   * Each temporary feeds one fixed accumulator: t1 to c01, t2 to c02,
   * t3 to c05, t4 to c06.  The first K-step multiplies a1/a2 by b1/b2 and
   * the second multiplies a3/a4 by b3/b4, so c01/c02 collect the products
   * with the first B value of each step and c05/c06 those with the second.
   * Each ADD folds in the product of the previous MUL on its t register.
   */
  1328. $L62:
  1329. ADD c01, t1, c01
  1330. unop
  1331. MUL a1, b1, t1
  1332. unop
  1333. ADD c02, t2, c02
  1334. lda AO, 4 * SIZE(AO)
  1335. MUL a2, b1, t2
  1336. LD b1, 2 * SIZE(BO)
  1337. ADD c05, t3, c05
  1338. lda L, -2(L)
  1339. MUL a1, b2, t3
  1340. LD a1, -2 * SIZE(AO)
  1341. ADD c06, t4, c06
  1342. unop
  1343. MUL a2, b2, t4
  1344. LD a2, -1 * SIZE(AO)
  1345. ADD c01, t1, c01
  1346. LD b2, 3 * SIZE(BO)
  1347. MUL a3, b3, t1
  1348. lda BO, 4 * SIZE(BO)
  1349. ADD c02, t2, c02
  1350. unop
  1351. MUL a4, b3, t2
  1352. LD b3, 0 * SIZE(BO)
  1353. ADD c05, t3, c05
  1354. unop
  1355. MUL a3, b4, t3
  1356. LD a3, 0 * SIZE(AO)
  1357. ADD c06, t4, c06
  1358. MUL a4, b4, t4
  1359. LD b4, 1 * SIZE(BO)
  1360. unop
  1361. LD a4, 1 * SIZE(AO)
  1362. unop
  1363. unop
  1364. bgt L, $L62
  1365. .align 4
  /*
   * $L65: tail of the 2x2 accumulation.  If the step count (KK for LT/RN,
   * TMP1 otherwise) is odd, one K-step remains and control jumps to $L67.
   * Otherwise one full K-step is performed here, reloading a1/a2 and
   * b1/b2 and advancing AO and BO by two elements, before falling through
   * into $L67.
   */
  1366. $L65:
  1367. ADD c01, t1, c01
  1368. MUL a1, b1, t1
  1369. #if defined(LT) || defined(RN)
  1370. blbs KK, $L67
  1371. #else
  1372. blbs TMP1, $L67
  1373. #endif
  1374. .align 4
  1375. ADD c02, t2, c02
  1376. unop
  1377. MUL a2, b1, t2
  1378. LD b1, 0 * SIZE(BO)
  1379. ADD c05, t3, c05
  1380. lda BO, 2 * SIZE(BO)
  1381. MUL a1, b2, t3
  1382. LD a1, 0 * SIZE(AO)
  1383. ADD c06, t4, c06
  1384. unop
  1385. MUL a2, b2, t4
  1386. LD a2, 1 * SIZE(AO)
  1387. ADD c01, t1, c01
  1388. LD b2, -1 * SIZE(BO)
  1389. MUL a1, b1, t1
  1390. lda AO, 2 * SIZE(AO)
  1391. .align 4
  /*
   * $L67: last K-step of the 2x2 accumulation and pipeline drain.  Issues
   * the remaining three products, advances AO and BO by two elements, and
   * folds t1-t4 into c01, c02, c05, c06 so the tile is complete on entry
   * to $L68.
   */
  1392. $L67:
  1393. ADD c02, t2, c02
  1394. MUL a2, b1, t2
  1395. ADD c05, t3, c05
  1396. MUL a1, b2, t3
  1397. ADD c06, t4, c06
  1398. lda AO, 2 * SIZE(AO)
  1399. MUL a2, b2, t4
  1400. lda BO, 2 * SIZE(BO)
  1401. ADD c01, t1, c01
  1402. ADD c02, t2, c02
  1403. ADD c05, t3, c05
  1404. ADD c06, t4, c06
  1405. .align 4
  /*
   * $L68: solve and store the 2x2 tile (c01, c02 go to column C1; c05, c06
   * go to column C2).  Same sequence as the wider tiles: position AO/BO,
   * form (stored value) - (accumulator), apply the 2x2 triangular solve
   * for the selected variant, write back to the packed buffer and to
   * C1/C2, then update pointers and KK.
   * NOTE(review): scaling is done by multiplication, so the scaling
   * entries are presumably pre-inverted diagonals -- confirm against the
   * packing code.
   */
  1406. $L68:
  1407. #if defined(LN) || defined(RT)
  /* Both arms of the inner conditional compute KK - 2 (tile is 2 by 2). */
  1408. #ifdef LN
  1409. subq KK, 2, TMP1
  1410. #else
  1411. subq KK, 2, TMP1
  1412. #endif
  1413. sll TMP1, BASE_SHIFT + 1, TMP2
  1414. addq AORIG, TMP2, AO
  1415. sll TMP1, BASE_SHIFT + 1, TMP2
  1416. addq B, TMP2, BO
  1417. #else
  /* LT/RN: step AO/BO back by one K-step (two elements each). */
  1418. lda AO, -2 * SIZE(AO)
  1419. lda BO, -2 * SIZE(BO)
  1420. #endif
  1421. #if defined(LN) || defined(LT)
  /* LN/LT: values come from BO in the order c01, c05, c02, c06. */
  1422. LD a1, 0 * SIZE(BO)
  1423. LD a2, 1 * SIZE(BO)
  1424. LD a3, 2 * SIZE(BO)
  1425. LD a4, 3 * SIZE(BO)
  1426. SUB a1, c01, c01
  1427. SUB a2, c05, c05
  1428. SUB a3, c02, c02
  1429. SUB a4, c06, c06
  1430. #else
  /* RN/RT: values come from AO in the order c01, c02, c05, c06. */
  1431. LD a1, 0 * SIZE(AO)
  1432. LD a2, 1 * SIZE(AO)
  1433. LD a3, 2 * SIZE(AO)
  1434. LD a4, 3 * SIZE(AO)
  1435. SUB a1, c01, c01
  1436. SUB a2, c02, c02
  1437. SUB a3, c05, c05
  1438. SUB a4, c06, c06
  1439. #endif
  1440. #ifdef LN
  /* LN: scale row 2 by AO[3], eliminate with AO[2], scale row 1 by AO[0]. */
  1441. LD a1, 3 * SIZE(AO)
  1442. LD a2, 2 * SIZE(AO)
  1443. LD a3, 0 * SIZE(AO)
  1444. MUL a1, c02, c02
  1445. MUL a1, c06, c06
  1446. MUL a2, c02, t1
  1447. MUL a2, c06, t2
  1448. SUB c01, t1, c01
  1449. SUB c05, t2, c05
  1450. MUL a3, c01, c01
  1451. MUL a3, c05, c05
  1452. #endif
  1453. #ifdef LT
  /* LT: scale row 1 by AO[0], eliminate with AO[1], scale row 2 by AO[3]. */
  1454. LD a1, 0 * SIZE(AO)
  1455. LD a2, 1 * SIZE(AO)
  1456. LD a3, 3 * SIZE(AO)
  1457. MUL a1, c01, c01
  1458. MUL a1, c05, c05
  1459. MUL a2, c01, t1
  1460. MUL a2, c05, t2
  1461. SUB c02, t1, c02
  1462. SUB c06, t2, c06
  1463. MUL a3, c02, c02
  1464. MUL a3, c06, c06
  1465. #endif
  1466. #ifdef RN
  /* RN: scale column 1 by BO[0], eliminate with BO[1], scale column 2 by BO[3]. */
  1467. LD a1, 0 * SIZE(BO)
  1468. LD a2, 1 * SIZE(BO)
  1469. LD a3, 3 * SIZE(BO)
  1470. MUL a1, c01, c01
  1471. MUL a1, c02, c02
  1472. MUL a2, c01, t1
  1473. MUL a2, c02, t2
  1474. SUB c05, t1, c05
  1475. SUB c06, t2, c06
  1476. MUL a3, c05, c05
  1477. MUL a3, c06, c06
  1478. #endif
  1479. #ifdef RT
  /* RT: scale column 2 by BO[3], eliminate with BO[2], scale column 1 by BO[0]. */
  1480. LD a1, 3 * SIZE(BO)
  1481. LD a2, 2 * SIZE(BO)
  1482. LD a3, 0 * SIZE(BO)
  1483. MUL a1, c05, c05
  1484. MUL a1, c06, c06
  1485. MUL a2, c05, t1
  1486. MUL a2, c06, t2
  1487. SUB c01, t1, c01
  1488. SUB c02, t2, c02
  1489. MUL a3, c01, c01
  1490. MUL a3, c02, c02
  1491. #endif
  /* Write the solved tile back to the buffer it was read from, same layout. */
  1492. #if defined(LN) || defined(LT)
  1493. ST c01, 0 * SIZE(BO)
  1494. ST c05, 1 * SIZE(BO)
  1495. ST c02, 2 * SIZE(BO)
  1496. ST c06, 3 * SIZE(BO)
  1497. #else
  1498. ST c01, 0 * SIZE(AO)
  1499. ST c02, 1 * SIZE(AO)
  1500. ST c05, 2 * SIZE(AO)
  1501. ST c06, 3 * SIZE(AO)
  1502. #endif
  /* LN steps C1/C2 back two elements before the store; others step forward after. */
  1503. #ifdef LN
  1504. lda C1, -2 * SIZE(C1)
  1505. lda C2, -2 * SIZE(C2)
  1506. #endif
  1507. ST c01, 0 * SIZE(C1)
  1508. ST c02, 1 * SIZE(C1)
  1509. ST c05, 0 * SIZE(C2)
  1510. ST c06, 1 * SIZE(C2)
  1511. #ifndef LN
  1512. lda C1, 2 * SIZE(C1)
  1513. lda C2, 2 * SIZE(C2)
  1514. #endif
  /* Reset the product temporaries for the next tile's pipelined ADDs. */
  1515. fclr t1
  1516. fclr t2
  1517. fclr t3
  1518. fclr t4
  1519. #ifdef RT
  1520. sll K, 1 + BASE_SHIFT, TMP1
  1521. addq AORIG, TMP1, AORIG
  1522. #endif
  /* LT/RN: skip the K - KK steps of A and B (2 wide each) not consumed. */
  1523. #if defined(LT) || defined(RN)
  1524. subq K, KK, TMP1
  1525. sll TMP1, BASE_SHIFT + 1, TMP2
  1526. addq AO, TMP2, AO
  1527. sll TMP1, BASE_SHIFT + 1, TMP2
  1528. addq BO, TMP2, BO
  1529. #endif
  1530. #ifdef LT
  1531. addq KK, 2, KK
  1532. #endif
  1533. #ifdef LN
  1534. subq KK, 2, KK
  1535. #endif
  1536. .align 4
  /*
   * $L70: set up accumulation for a 1-row by 2-column tile, taken when
   * bit 0 of M is set (otherwise control goes to $L79).
   * Preloads a1/a2 (two K-steps of one A value) and b1-b4 (two K-steps of
   * two B values), clears c01, c05, c02, c06, and advances AO by one and
   * BO by two elements past the first K-step.  L = count - 2, where count
   * is KK (LT/RN) or K - KK in TMP1 (LN/RT); count not above 0 goes to
   * $L78, L not above 0 goes to $L75.
   */
  1537. $L70:
  1538. and M, 1, I
  1539. ble I, $L79
  1540. #if defined(LT) || defined(RN)
  1541. LD a1, 0 * SIZE(AO)
  1542. fclr c01
  1543. LD a2, 1 * SIZE(AO)
  1544. fclr c05
  1545. LD b1, 0 * SIZE(B)
  1546. fclr c02
  1547. LD b2, 1 * SIZE(B)
  1548. fclr c06
  1549. lda L, -2(KK)
  1550. LD b3, 2 * SIZE(B)
  1551. lda AO, 1 * SIZE(AO)
  1552. LD b4, 3 * SIZE(B)
  1553. lda BO, 2 * SIZE(B)
  1554. ble KK, $L78
  1555. ble L, $L75
  1556. #else
  /*
   * LN/RT: LN first moves AORIG back by K elements' worth of bytes.
   * AO starts KK elements into AORIG and BO 2 * KK elements into B.
   */
  1557. #ifdef LN
  1558. sll K, BASE_SHIFT + 0, TMP1
  1559. subq AORIG, TMP1, AORIG
  1560. #endif
  1561. sll KK, BASE_SHIFT + 0, TMP1
  1562. addq AORIG, TMP1, AO
  1563. sll KK, BASE_SHIFT + 1, TMP1
  1564. addq B, TMP1, BO
  1565. subq K, KK, TMP1
  1566. LD a1, 0 * SIZE(AO)
  1567. fclr c01
  1568. LD a2, 1 * SIZE(AO)
  1569. fclr c05
  1570. LD b1, 0 * SIZE(BO)
  1571. fclr c02
  1572. LD b2, 1 * SIZE(BO)
  1573. fclr c06
  1574. lda L, -2(TMP1)
  1575. LD b3, 2 * SIZE(BO)
  1576. lda AO, 1 * SIZE(AO)
  1577. LD b4, 3 * SIZE(BO)
  1578. lda BO, 2 * SIZE(BO)
  1579. ble TMP1, $L78
  1580. ble L, $L75
  1581. #endif
  1582. .align 4
  /*
   * $L72: main loop for the 1x2 tile, two K-steps per iteration (L drops
   * by 2, AO advances two elements and BO four).
   * The first K-step (a1 with b1, b2) feeds c01 and c05; the second (a2
   * with b3, b4) feeds the separate partial sums c02 and c06.  The two
   * pairs are combined later at $L77.  Each ADD folds in the product of
   * the previous MUL on its t register.
   */
  1583. $L72:
  1584. ADD c01, t1, c01
  1585. lda L, -2(L)
  1586. MUL a1, b1, t1
  1587. LD b1, 2 * SIZE(BO)
  1588. ADD c05, t2, c05
  1589. MUL a1, b2, t2
  1590. LD a1, 1 * SIZE(AO)
  1591. LD b2, 3 * SIZE(BO)
  1592. ADD c02, t3, c02
  1593. lda AO, 2 * SIZE(AO)
  1594. MUL a2, b3, t3
  1595. LD b3, 4 * SIZE(BO)
  1596. ADD c06, t4, c06
  1597. MUL a2, b4, t4
  1598. LD a2, 0 * SIZE(AO)
  1599. LD b4, 5 * SIZE(BO)
  1600. lda BO, 4 * SIZE(BO)
  1601. unop
  1602. unop
  1603. bgt L, $L72
  1604. .align 4
  /*
   * $L75: tail of the 1x2 accumulation.  If the step count (KK for LT/RN,
   * TMP1 otherwise) is odd, one K-step remains and control jumps to $L77.
   * Otherwise one K-step is performed here, reloading a1, b1, b2 and
   * advancing AO by one and BO by two elements, before falling through
   * into $L77.
   */
  1605. $L75:
  1606. ADD c01, t1, c01
  1607. MUL a1, b1, t1
  1608. #if defined(LT) || defined(RN)
  1609. blbs KK, $L77
  1610. #else
  1611. blbs TMP1, $L77
  1612. #endif
  1613. .align 4
  1614. ADD c05, t2, c05
  1615. MUL a1, b2, t2
  1616. LD a1, 0 * SIZE(AO)
  1617. LD b1, 0 * SIZE(BO)
  1618. ADD c01, t1, c01
  1619. LD b2, 1 * SIZE(BO)
  1620. lda AO, 1 * SIZE(AO)
  1621. MUL a1, b1, t1
  1622. lda BO, 2 * SIZE(BO)
  1623. .align 4
  /*
   * $L77: last K-step of the 1x2 accumulation, drain and reduction.
   * Folds the pending t3/t4 into c02/c06, merges the second partial sums
   * into the first (c01 += c02, c05 += c06), advances AO by one and BO by
   * two elements, and finally adds the last products t1/t2 so that c01
   * and c05 hold the two column results.
   */
  1624. $L77:
  1625. ADD c05, t2, c05
  1626. MUL a1, b2, t2
  1627. ADD c02, t3, c02
  1628. ADD c06, t4, c06
  1629. ADD c01, c02, c01
  1630. lda AO, 1 * SIZE(AO)
  1631. ADD c05, c06, c05
  1632. lda BO, 2 * SIZE(BO)
  1633. ADD c01, t1, c01
  1634. ADD c05, t2, c05
  1635. .align 4
  /*
   * $L78: solve and store the 1x2 tile (c01 goes to C1, c05 goes to C2).
   * Positions AO/BO, forms (stored value) - (accumulator), applies the
   * solve for the selected variant, writes the results to the packed
   * buffer and to C1/C2, then updates pointers and KK.
   * NOTE(review): scaling is done by multiplication, so the scaling
   * entries are presumably pre-inverted diagonals -- confirm against the
   * packing code.
   */
  1636. $L78:
  1637. #if defined(LN) || defined(RT)
  /* LN/RT: index from AORIG/B using KK - 1 (LN) or KK - 2 (RT). */
  1638. #ifdef LN
  1639. subq KK, 1, TMP1
  1640. #else
  1641. subq KK, 2, TMP1
  1642. #endif
  1643. sll TMP1, BASE_SHIFT + 0, TMP2
  1644. addq AORIG, TMP2, AO
  1645. sll TMP1, BASE_SHIFT + 1, TMP2
  1646. addq B, TMP2, BO
  1647. #else
  /* LT/RN: step AO/BO back by one K-step (one and two elements). */
  1648. lda AO, -1 * SIZE(AO)
  1649. lda BO, -2 * SIZE(BO)
  1650. #endif
  1651. #if defined(LN) || defined(LT)
  1652. LD a1, 0 * SIZE(BO)
  1653. LD a2, 1 * SIZE(BO)
  1654. SUB a1, c01, c01
  1655. SUB a2, c05, c05
  1656. #else
  1657. LD a1, 0 * SIZE(AO)
  1658. LD a2, 1 * SIZE(AO)
  1659. SUB a1, c01, c01
  1660. SUB a2, c05, c05
  1661. #endif
  /* LN/LT: a single row, so both results are just scaled by AO[0]. */
  1662. #if defined(LN) || defined(LT)
  1663. LD a1, 0 * SIZE(AO)
  1664. MUL a1, c01, c01
  1665. MUL a1, c05, c05
  1666. #endif
  1667. #ifdef RN
  /* RN: scale c01 by BO[0], eliminate with BO[1], scale c05 by BO[3]. */
  1668. LD a1, 0 * SIZE(BO)
  1669. LD a2, 1 * SIZE(BO)
  1670. LD a3, 3 * SIZE(BO)
  1671. MUL a1, c01, c01
  1672. MUL a2, c01, t1
  1673. SUB c05, t1, c05
  1674. MUL a3, c05, c05
  1675. #endif
  1676. #ifdef RT
  /* RT: scale c05 by BO[3], eliminate with BO[2], scale c01 by BO[0]. */
  1677. LD a1, 3 * SIZE(BO)
  1678. LD a2, 2 * SIZE(BO)
  1679. LD a3, 0 * SIZE(BO)
  1680. MUL a1, c05, c05
  1681. MUL a2, c05, t1
  1682. SUB c01, t1, c01
  1683. MUL a3, c01, c01
  1684. #endif
  1685. #if defined(LN) || defined(LT)
  1686. ST c01, 0 * SIZE(BO)
  1687. ST c05, 1 * SIZE(BO)
  1688. #else
  1689. ST c01, 0 * SIZE(AO)
  1690. ST c05, 1 * SIZE(AO)
  1691. #endif
  /*
   * LN steps C1/C2 back one element before the store.  Unlike the wider
   * tiles, C1/C2 are not advanced after the store in the non-LN case.
   */
  1692. #ifdef LN
  1693. lda C1, -1 * SIZE(C1)
  1694. lda C2, -1 * SIZE(C2)
  1695. #endif
  1696. ST c01, 0 * SIZE(C1)
  1697. ST c05, 0 * SIZE(C2)
  1698. fclr t1
  1699. fclr t2
  1700. fclr t3
  1701. fclr t4
  1702. #ifdef RT
  1703. sll K, 0 + BASE_SHIFT, TMP1
  1704. addq AORIG, TMP1, AORIG
  1705. #endif
  /* LT/RN: skip the K - KK steps of A (1 wide) and B (2 wide) not consumed. */
  1706. #if defined(LT) || defined(RN)
  1707. subq K, KK, TMP1
  1708. sll TMP1, BASE_SHIFT + 0, TMP2
  1709. addq AO, TMP2, AO
  1710. sll TMP1, BASE_SHIFT + 1, TMP2
  1711. addq BO, TMP2, BO
  1712. #endif
  1713. #ifdef LT
  1714. addq KK, 1, KK
  1715. #endif
  1716. #ifdef LN
  1717. subq KK, 1, KK
  1718. #endif
  1719. .align 4
  /*
   * $L79: end of the two-column pass.  LN advances B by 2 * K elements'
   * worth of bytes; LT/RN adopt the running pointer BO as the new B.
   * RN increments KK by two, RT decrements it by two.
   */
  1720. $L79:
  1721. #ifdef LN
  1722. sll K, 1 + BASE_SHIFT, TMP1
  1723. addq B, TMP1, B
  1724. #endif
  1725. #if defined(LT) || defined(RN)
  1726. mov BO, B
  1727. #endif
  1728. #ifdef RN
  1729. addq KK, 2, KK
  1730. #endif
  1731. #ifdef RT
  1732. subq KK, 2, KK
  1733. #endif
  1734. .align 4
  /*
   * $L80: J = number of four-column groups (N shifted right by 2).  With
   * none to process, control goes to $L999 (defined outside this chunk).
   */
  1735. $L80:
  1736. sra N, 2, J
  1737. ble J, $L999
  1738. .align 4
  /*
   * $L01: per-group setup for the four-column pass.  Sets the column
   * pointers C1..C4 (each LDC past the previous one), initialises KK and
   * the A pointer for the selected variant, clears t1-t4, and leaves in I
   * the number of 4-row tiles (M shifted right by 2).  With no 4-row
   * tiles, control goes to $L20 (defined outside this chunk).
   */
  1739. $L01:
  1740. #ifdef RT
  /* RT: move B back by 4 * K elements' worth of bytes and C back by 4 * LDC. */
  1741. sll K, 2 + BASE_SHIFT, TMP1
  1742. subq B, TMP1, B
  1743. s4addq LDC, 0, TMP1
  1744. subq C, TMP1, C
  1745. #endif
  1746. mov C, C1
  1747. addq C, LDC, C2
  1748. addq C2, LDC, C3
  1749. #ifndef RT
  /* Non-RT: advance C past the four columns handled here (C += 4 * LDC). */
  1750. s4addq LDC, C, C
  1751. #endif
  1752. fclr t1
  1753. addq C3, LDC, C4
  1754. fclr t2
  1755. #ifdef LN
  1756. addq M, OFFSET, KK
  1757. #endif
  1758. #ifdef LT
  1759. mov OFFSET, KK
  1760. #endif
  1761. #if defined(LN) || defined(RT)
  1762. mov A, AORIG
  1763. #else
  1764. mov A, AO
  1765. #endif
  1766. sra M, 2, I
  1767. fclr t3
  1768. fclr t4
  1769. ble I, $L20
  1770. .align 4
  /*
   * $L11: set up accumulation for a 4-row by 4-column tile.
   * Preloads a1-a4 and b1-b4 (one K-step of four A and four B values),
   * clears the sixteen accumulators c01-c16, and advances AO and BO four
   * elements past the first K-step.  L = count - 2, where count is KK
   * (LT/RN) or K - KK in TMP1 (LN/RT); count not above 0 skips the
   * accumulation ($L18), L not above 0 goes straight to the tail ($L15).
   * The lds instructions target $f31, so the loaded data is discarded;
   * presumably they act as prefetch hints for the four C columns.
   */
  1771. $L11:
  1772. #if defined(LT) || defined(RN)
  1773. LD a1, 0 * SIZE(AO)
  1774. fclr c11
  1775. LD a2, 1 * SIZE(AO)
  1776. fclr c12
  1777. LD a3, 2 * SIZE(AO)
  1778. fclr c16
  1779. LD a4, 3 * SIZE(AO)
  1780. fclr c15
  1781. LD b1, 0 * SIZE(B)
  1782. fclr c01
  1783. LD b2, 1 * SIZE(B)
  1784. fclr c02
  1785. LD b3, 2 * SIZE(B)
  1786. fclr c06
  1787. LD b4, 3 * SIZE(B)
  1788. fclr c05
  1789. lds $f31, 4 * SIZE(C1)
  1790. fclr c03
  1791. lda L, -2(KK)
  1792. fclr c04
  1793. lds $f31, 7 * SIZE(C2)
  1794. fclr c08
  1795. lda BO, 4 * SIZE(B)
  1796. fclr c13
  1797. lds $f31, 4 * SIZE(C3)
  1798. fclr c09
  1799. lda AO, 4 * SIZE(AO)
  1800. fclr c10
  1801. lds $f31, 7 * SIZE(C4)
  1802. fclr c14
  1803. fclr c07
  1804. ble KK, $L18
  1805. #else
  /*
   * LN/RT: LN first moves AORIG back by 4 * K elements' worth of bytes.
   * AO and BO start at the same byte offset (4 * KK elements) into AORIG
   * and B, since both are four values wide here.
   */
  1806. #ifdef LN
  1807. sll K, BASE_SHIFT + 2, TMP1
  1808. subq AORIG, TMP1, AORIG
  1809. #endif
  1810. sll KK, BASE_SHIFT + 2, TMP1
  1811. addq AORIG, TMP1, AO
  1812. addq B, TMP1, BO
  1813. subq K, KK, TMP1
  1814. LD a1, 0 * SIZE(AO)
  1815. fclr c11
  1816. LD a2, 1 * SIZE(AO)
  1817. fclr c12
  1818. LD a3, 2 * SIZE(AO)
  1819. fclr c16
  1820. LD a4, 3 * SIZE(AO)
  1821. fclr c15
  1822. LD b1, 0 * SIZE(BO)
  1823. fclr c01
  1824. LD b2, 1 * SIZE(BO)
  1825. fclr c02
  1826. LD b3, 2 * SIZE(BO)
  1827. fclr c06
  1828. LD b4, 3 * SIZE(BO)
  1829. fclr c05
  1830. lds $f31, 4 * SIZE(C1)
  1831. fclr c03
  1832. lda L, -2(TMP1)
  1833. fclr c04
  1834. lds $f31, 7 * SIZE(C2)
  1835. fclr c08
  1836. lda BO, 4 * SIZE(BO)
  1837. fclr c13
  1838. lds $f31, 4 * SIZE(C3)
  1839. fclr c09
  1840. lda AO, 4 * SIZE(AO)
  1841. fclr c10
  1842. lds $f31, 7 * SIZE(C4)
  1843. fclr c14
  1844. fclr c07
  1845. ble TMP1, $L18
  1846. #endif
  1847. ble L, $L15
  1848. .align 5
  /*
   * $L12: main loop for the 4x4 tile, two K-steps per iteration (L drops
   * by 2, AO and BO advance by eight elements).  The numbered groups 1-4
   * form the first K-step and 5-8 the second.
   * Software-pipelined: each ADD consumes the product of the previous MUL
   * on the same t register, i.e. the one issued a group earlier (for
   * example, "ADD c01, t1" in group 2 adds the b1 * a1 product from
   * group 1).  The net effect is that the accumulator with index
   * 4 * (j - 1) + i collects a_i * b_j: c01-c04 pair with b1, c05-c08
   * with b2, c09-c12 with b3 and c13-c16 with b4.
   * The second K-step uses a5, b5 and a6 as stand-ins for a1, b1 and a3
   * so that the originals can be reloaded early for the next iteration.
   * The ldq/ldl into $31 discard their data; presumably they are prefetch
   * hints PREFETCHSIZE elements ahead (replaced by unop under EV4).
   */
  1849. $L12:
  1850. /* 1 */
  1851. ADD c11, t1, c11
  1852. #ifndef EV4
  1853. ldq $31, PREFETCHSIZE * SIZE(AO)
  1854. #else
  1855. unop
  1856. #endif
  1857. MUL b1, a1, t1
  1858. #ifndef EV4
  1859. ldl $31, PREFETCHSIZE * SIZE(BO)
  1860. #else
  1861. unop
  1862. #endif
  1863. ADD c12, t2, c12
  1864. unop
  1865. MUL b1, a2, t2
  1866. unop
  1867. ADD c16, t3, c16
  1868. unop
  1869. MUL b2, a2, t3
  1870. LD a5, 0 * SIZE(AO)
  1871. ADD c15, t4, c15
  1872. unop
  1873. MUL b2, a1, t4
  1874. LD b5, 0 * SIZE(BO)
  1875. /* 2 */
  1876. ADD c01, t1, c01
  1877. UNOP
  1878. MUL b1, a3, t1
  1879. UNOP
  1880. ADD c02, t2, c02
  1881. UNOP
  1882. MUL b1, a4, t2
  1883. UNOP
  1884. ADD c06, t3, c06
  1885. unop
  1886. MUL b2, a4, t3
  1887. unop
  1888. ADD c05, t4, c05
  1889. unop
  1890. MUL b4, a1, t4
  1891. unop
  1892. /* 3 */
  1893. ADD c03, t1, c03
  1894. unop
  1895. MUL b3, a1, t1
  1896. unop
  1897. ADD c04, t2, c04
  1898. unop
  1899. MUL b3, a2, t2
  1900. unop
  1901. ADD c08, t3, c08
  1902. unop
  1903. MUL b4, a2, t3
  1904. LD a2, 1 * SIZE(AO)
  1905. ADD c13, t4, c13
  1906. unop
  1907. MUL b2, a3, t4
  1908. LD b2, 1 * SIZE(BO)
  1909. /* 4 */
  1910. ADD c09, t1, c09
  1911. unop
  1912. MUL b3, a3, t1
  1913. LD a6, 2 * SIZE(AO)
  1914. ADD c10, t2, c10
  1915. unop
  1916. MUL b3, a4, t2
  1917. LD b3, 2 * SIZE(BO)
  1918. ADD c14, t3, c14
  1919. unop
  1920. MUL b4, a4, t3
  1921. LD a4, 3 * SIZE(AO)
  1922. ADD c07, t4, c07
  1923. unop
  1924. MUL b4, a3, t4
  1925. LD b4, 3 * SIZE(BO)
  1926. /* 5 */
  1927. ADD c11, t1, c11
  1928. unop
  1929. MUL b5, a5, t1
  1930. LD a1, 4 * SIZE(AO)
  1931. ADD c12, t2, c12
  1932. lda L, -2(L)
  1933. MUL b5, a2, t2
  1934. LD b1, 4 * SIZE(BO)
  1935. ADD c16, t3, c16
  1936. unop
  1937. MUL b2, a2, t3
  1938. unop
  1939. ADD c15, t4, c15
  1940. unop
  1941. MUL b2, a5, t4
  1942. unop
  1943. /* 6 */
  1944. ADD c01, t1, c01
  1945. unop
  1946. MUL b5, a6, t1
  1947. unop
  1948. ADD c02, t2, c02
  1949. unop
  1950. MUL b5, a4, t2
  1951. unop
  1952. ADD c06, t3, c06
  1953. unop
  1954. MUL b2, a4, t3
  1955. unop
  1956. ADD c05, t4, c05
  1957. unop
  1958. MUL b4, a5, t4
  1959. unop
  1960. /* 7 */
  1961. ADD c03, t1, c03
  1962. lda AO, 8 * SIZE(AO)
  1963. MUL b3, a5, t1
  1964. unop
  1965. ADD c04, t2, c04
  1966. lda BO, 8 * SIZE(BO)
  1967. MUL b3, a2, t2
  1968. unop
  1969. ADD c08, t3, c08
  1970. unop
  1971. MUL b4, a2, t3
  1972. LD a2, -3 * SIZE(AO)
  1973. ADD c13, t4, c13
  1974. unop
  1975. MUL b2, a6, t4
  1976. LD b2, -3 * SIZE(BO)
  1977. /* 8 */
  1978. ADD c09, t1, c09
  1979. unop
  1980. MUL b3, a6, t1
  1981. LD a3, -2 * SIZE(AO)
  1982. ADD c10, t2, c10
  1983. unop
  1984. MUL b3, a4, t2
  1985. LD b3, -2 * SIZE(BO)
  1986. ADD c14, t3, c14
  1987. unop
  1988. MUL b4, a4, t3
  1989. LD a4, -1 * SIZE(AO)
  1990. ADD c07, t4, c07
  1991. MUL b4, a6, t4
  1992. LD b4, -1 * SIZE(BO)
  1993. bgt L, $L12
  1994. .align 4
  /*
   * $L15: tail of the 4x4 accumulation.  If the step count (KK for LT/RN,
   * TMP1 otherwise) is odd, one K-step remains and control jumps to $L17.
   * Otherwise one full K-step is performed here, reloading a1-a4 and
   * b1-b4 and advancing AO and BO by four elements, before falling
   * through into $L17 for the last step.
   */
  1995. $L15:
  1996. ADD c11, t1, c11
  1997. MUL b1, a1, t1
  1998. #if defined(LT) || defined(RN)
  1999. blbs KK, $L17
  2000. #else
  2001. blbs TMP1, $L17
  2002. #endif
  2003. .align 4
  2004. ADD c12, t2, c12
  2005. MUL b1, a2, t2
  2006. ADD c16, t3, c16
  2007. MUL b2, a2, t3
  2008. ADD c15, t4, c15
  2009. MUL b2, a1, t4
  2010. ADD c01, t1, c01
  2011. MUL b1, a3, t1
  2012. ADD c02, t2, c02
  2013. unop
  2014. MUL b1, a4, t2
  2015. LD b1, 0 * SIZE(BO)
  2016. ADD c06, t3, c06
  2017. MUL b2, a4, t3
  2018. ADD c05, t4, c05
  2019. MUL b4, a1, t4
  2020. ADD c03, t1, c03
  2021. unop
  2022. MUL b3, a1, t1
  2023. LD a1, 0 * SIZE(AO)
  2024. ADD c04, t2, c04
  2025. unop
  2026. MUL b3, a2, t2
  2027. unop
  2028. ADD c08, t3, c08
  2029. unop
  2030. MUL b4, a2, t3
  2031. LD a2, 1 * SIZE(AO)
  2032. ADD c13, t4, c13
  2033. unop
  2034. MUL b2, a3, t4
  2035. LD b2, 1 * SIZE(BO)
  2036. ADD c09, t1, c09
  2037. unop
  2038. MUL b3, a3, t1
  2039. lda AO, 4 * SIZE(AO)
  2040. ADD c10, t2, c10
  2041. unop
  2042. MUL b3, a4, t2
  2043. LD b3, 2 * SIZE(BO)
  2044. ADD c14, t3, c14
  2045. unop
  2046. MUL b4, a4, t3
  2047. LD a4, -1 * SIZE(AO)
  2048. ADD c07, t4, c07
  2049. unop
  2050. MUL b4, a3, t4
  2051. LD a3, -2 * SIZE(AO)
  2052. ADD c11, t1, c11
  2053. LD b4, 3 * SIZE(BO)
  2054. MUL b1, a1, t1
  2055. lda BO, 4 * SIZE(BO)
  2056. .align 4
  /*
   * $L17: last K-step of the 4x4 accumulation and pipeline drain.  Issues
   * the remaining products with no further loads, advances AO and BO by
   * four elements, and folds the final four products into c11, c12, c16
   * and c15 so that c01-c16 are complete on entry to $L18.
   */
  2057. $L17:
  2058. ADD c12, t2, c12
  2059. MUL b1, a2, t2
  2060. ADD c16, t3, c16
  2061. MUL b2, a2, t3
  2062. ADD c15, t4, c15
  2063. MUL b2, a1, t4
  2064. ADD c01, t1, c01
  2065. MUL b1, a3, t1
  2066. ADD c02, t2, c02
  2067. MUL b1, a4, t2
  2068. ADD c06, t3, c06
  2069. MUL b2, a4, t3
  2070. ADD c05, t4, c05
  2071. MUL b4, a1, t4
  2072. ADD c03, t1, c03
  2073. MUL b3, a1, t1
  2074. ADD c04, t2, c04
  2075. MUL b3, a2, t2
  2076. ADD c08, t3, c08
  2077. MUL b4, a2, t3
  2078. ADD c13, t4, c13
  2079. MUL b2, a3, t4
  2080. ADD c09, t1, c09
  2081. MUL b3, a3, t1
  2082. ADD c10, t2, c10
  2083. MUL b3, a4, t2
  2084. ADD c14, t3, c14
  2085. MUL b4, a4, t3
  2086. ADD c07, t4, c07
  2087. lda AO, 4 * SIZE(AO)
  2088. MUL b4, a3, t4
  2089. lda BO, 4 * SIZE(BO)
  2090. ADD c11, t1, c11
  2091. ADD c12, t2, c12
  2092. ADD c16, t3, c16
  2093. ADD c15, t4, c15
  2094. .align 4
  /* $L18: position AO and BO on the 4x4 data used by the solve that follows.
     LN/RT: TMP1 = KK - 4 (both arms of the inner #ifdef are identical),
            AO = AORIG + (TMP1 << (BASE_SHIFT + 2)),
            BO = B     + (TMP1 << (BASE_SHIFT + 2)).
     LT/RN: step AO and BO back by 4*SIZE each. */
  2095. $L18:
  2096. #if defined(LN) || defined(RT)
  2097. #ifdef LN
  2098. subq KK, 4, TMP1
  2099. #else
  2100. subq KK, 4, TMP1
  2101. #endif
  2102. sll TMP1, BASE_SHIFT + 2, TMP2
  2103. addq AORIG, TMP2, AO
  2104. sll TMP1, BASE_SHIFT + 2, TMP2
  2105. addq B, TMP2, BO
  2106. #else
  2107. lda AO, -4 * SIZE(AO)
  2108. lda BO, -4 * SIZE(BO)
  2109. #endif
  /* Form the right-hand side of the 4x4 solve: every accumulator is replaced
     by (packed value - accumulator).  Alpha operand order is src1, src2,
     dest, so "SUB a1, c01, c01" gives c01 = a1 - c01, assuming SUB expands
     to a plain FP subtract (the macro is defined outside this chunk).
     LN/LT: 16 values are read from BO; consecutive elements map to
            c01,c05,c09,c13, c02,c06,c10,c14, c03,..., c04,...
     RN/RT: 16 values are read from AO in c01..c16 order. */
  2110. #if defined(LN) || defined(LT)
  2111. LD a1, 0 * SIZE(BO)
  2112. LD a2, 1 * SIZE(BO)
  2113. LD a3, 2 * SIZE(BO)
  2114. LD a4, 3 * SIZE(BO)
  2115. LD b1, 4 * SIZE(BO)
  2116. LD b2, 5 * SIZE(BO)
  2117. LD b3, 6 * SIZE(BO)
  2118. LD b4, 7 * SIZE(BO)
  2119. SUB a1, c01, c01
  2120. SUB a2, c05, c05
  2121. SUB a3, c09, c09
  2122. SUB a4, c13, c13
  2123. SUB b1, c02, c02
  2124. SUB b2, c06, c06
  2125. SUB b3, c10, c10
  2126. SUB b4, c14, c14
  2127. LD a1, 8 * SIZE(BO)
  2128. LD a2, 9 * SIZE(BO)
  2129. LD a3, 10 * SIZE(BO)
  2130. LD a4, 11 * SIZE(BO)
  2131. LD b1, 12 * SIZE(BO)
  2132. LD b2, 13 * SIZE(BO)
  2133. LD b3, 14 * SIZE(BO)
  2134. LD b4, 15 * SIZE(BO)
  2135. SUB a1, c03, c03
  2136. SUB a2, c07, c07
  2137. SUB a3, c11, c11
  2138. SUB a4, c15, c15
  2139. SUB b1, c04, c04
  2140. SUB b2, c08, c08
  2141. SUB b3, c12, c12
  2142. SUB b4, c16, c16
  2143. #else
  2144. LD a1, 0 * SIZE(AO)
  2145. LD a2, 1 * SIZE(AO)
  2146. LD a3, 2 * SIZE(AO)
  2147. LD a4, 3 * SIZE(AO)
  2148. LD b1, 4 * SIZE(AO)
  2149. LD b2, 5 * SIZE(AO)
  2150. LD b3, 6 * SIZE(AO)
  2151. LD b4, 7 * SIZE(AO)
  2152. SUB a1, c01, c01
  2153. SUB a2, c02, c02
  2154. SUB a3, c03, c03
  2155. SUB a4, c04, c04
  2156. SUB b1, c05, c05
  2157. SUB b2, c06, c06
  2158. SUB b3, c07, c07
  2159. SUB b4, c08, c08
  2160. LD a1, 8 * SIZE(AO)
  2161. LD a2, 9 * SIZE(AO)
  2162. LD a3, 10 * SIZE(AO)
  2163. LD a4, 11 * SIZE(AO)
  2164. LD b1, 12 * SIZE(AO)
  2165. LD b2, 13 * SIZE(AO)
  2166. LD b3, 14 * SIZE(AO)
  2167. LD b4, 15 * SIZE(AO)
  2168. SUB a1, c09, c09
  2169. SUB a2, c10, c10
  2170. SUB a3, c11, c11
  2171. SUB a4, c12, c12
  2172. SUB b1, c13, c13
  2173. SUB b2, c14, c14
  2174. SUB b3, c15, c15
  2175. SUB b4, c16, c16
  2176. #endif
  /* LN: solve with the 4x4 block at AO, walking from the last diagonal entry
     (offset 15) back to the first (offset 0).  Each stage scales one group
     of four accumulators by the diagonal entry, then subtracts multiples of
     that group from the groups still to be solved.  The scaling is a MUL,
     not a divide - presumably the packed diagonal already holds reciprocals;
     confirm against the packing routine. */
  2177. #ifdef LN
  /* Stage 1: group c04,c08,c12,c16; entries 15 (diagonal), 14, 13, 12. */
  2178. LD a1, 15 * SIZE(AO)
  2179. LD a2, 14 * SIZE(AO)
  2180. LD a3, 13 * SIZE(AO)
  2181. LD a4, 12 * SIZE(AO)
  2182. MUL a1, c04, c04
  2183. MUL a1, c08, c08
  2184. MUL a1, c12, c12
  2185. MUL a1, c16, c16
  2186. MUL a2, c04, t1
  2187. MUL a2, c08, t2
  2188. MUL a2, c12, t3
  2189. MUL a2, c16, t4
  2190. SUB c03, t1, c03
  2191. SUB c07, t2, c07
  2192. SUB c11, t3, c11
  2193. SUB c15, t4, c15
  2194. MUL a3, c04, t1
  2195. MUL a3, c08, t2
  2196. MUL a3, c12, t3
  2197. MUL a3, c16, t4
  2198. SUB c02, t1, c02
  2199. SUB c06, t2, c06
  2200. SUB c10, t3, c10
  2201. SUB c14, t4, c14
  2202. MUL a4, c04, t1
  2203. MUL a4, c08, t2
  2204. MUL a4, c12, t3
  2205. MUL a4, c16, t4
  2206. SUB c01, t1, c01
  2207. SUB c05, t2, c05
  2208. SUB c09, t3, c09
  2209. SUB c13, t4, c13
  /* Stage 2: group c03,c07,c11,c15; entries 10 (diagonal), 9, 8. */
  2210. LD b1, 10 * SIZE(AO)
  2211. LD b2, 9 * SIZE(AO)
  2212. LD b3, 8 * SIZE(AO)
  2213. MUL b1, c03, c03
  2214. MUL b1, c07, c07
  2215. MUL b1, c11, c11
  2216. MUL b1, c15, c15
  2217. MUL b2, c03, t1
  2218. MUL b2, c07, t2
  2219. MUL b2, c11, t3
  2220. MUL b2, c15, t4
  2221. SUB c02, t1, c02
  2222. SUB c06, t2, c06
  2223. SUB c10, t3, c10
  2224. SUB c14, t4, c14
  2225. MUL b3, c03, t1
  2226. MUL b3, c07, t2
  2227. MUL b3, c11, t3
  2228. MUL b3, c15, t4
  2229. SUB c01, t1, c01
  2230. SUB c05, t2, c05
  2231. SUB c09, t3, c09
  2232. SUB c13, t4, c13
  /* Stage 3: group c02,c06,c10,c14 with entries 5 (diagonal) and 4, then
     the last group c01,c05,c09,c13 is scaled by entry 0. */
  2233. LD a1, 5 * SIZE(AO)
  2234. LD a2, 4 * SIZE(AO)
  2235. LD a3, 0 * SIZE(AO)
  2236. MUL a1, c02, c02
  2237. MUL a1, c06, c06
  2238. MUL a1, c10, c10
  2239. MUL a1, c14, c14
  2240. MUL a2, c02, t1
  2241. MUL a2, c06, t2
  2242. MUL a2, c10, t3
  2243. MUL a2, c14, t4
  2244. SUB c01, t1, c01
  2245. SUB c05, t2, c05
  2246. SUB c09, t3, c09
  2247. SUB c13, t4, c13
  2248. MUL a3, c01, c01
  2249. MUL a3, c05, c05
  2250. MUL a3, c09, c09
  2251. MUL a3, c13, c13
  2252. #endif
  /* LT: solve with the 4x4 block at AO, walking forward from the first
     diagonal entry (offset 0) to the last (offset 15).  Each stage scales
     one group of four accumulators by the diagonal entry, then subtracts
     multiples of that group from the groups still to be solved.  The scaling
     is a MUL - presumably the packed diagonal holds reciprocals; confirm
     against the packing routine. */
  2253. #ifdef LT
  /* Stage 1: group c01,c05,c09,c13; entries 0 (diagonal), 1, 2, 3. */
  2254. LD a1, 0 * SIZE(AO)
  2255. LD a2, 1 * SIZE(AO)
  2256. LD a3, 2 * SIZE(AO)
  2257. LD a4, 3 * SIZE(AO)
  2258. MUL a1, c01, c01
  2259. MUL a1, c05, c05
  2260. MUL a1, c09, c09
  2261. MUL a1, c13, c13
  2262. MUL a2, c01, t1
  2263. MUL a2, c05, t2
  2264. MUL a2, c09, t3
  2265. MUL a2, c13, t4
  2266. SUB c02, t1, c02
  2267. SUB c06, t2, c06
  2268. SUB c10, t3, c10
  2269. SUB c14, t4, c14
  2270. MUL a3, c01, t1
  2271. MUL a3, c05, t2
  2272. MUL a3, c09, t3
  2273. MUL a3, c13, t4
  2274. SUB c03, t1, c03
  2275. SUB c07, t2, c07
  2276. SUB c11, t3, c11
  2277. SUB c15, t4, c15
  2278. MUL a4, c01, t1
  2279. MUL a4, c05, t2
  2280. MUL a4, c09, t3
  2281. MUL a4, c13, t4
  2282. SUB c04, t1, c04
  2283. SUB c08, t2, c08
  2284. SUB c12, t3, c12
  2285. SUB c16, t4, c16
  /* Stage 2: group c02,c06,c10,c14; entries 5 (diagonal), 6, 7. */
  2286. LD b1, 5 * SIZE(AO)
  2287. LD b2, 6 * SIZE(AO)
  2288. LD b3, 7 * SIZE(AO)
  2289. MUL b1, c02, c02
  2290. MUL b1, c06, c06
  2291. MUL b1, c10, c10
  2292. MUL b1, c14, c14
  2293. MUL b2, c02, t1
  2294. MUL b2, c06, t2
  2295. MUL b2, c10, t3
  2296. MUL b2, c14, t4
  2297. SUB c03, t1, c03
  2298. SUB c07, t2, c07
  2299. SUB c11, t3, c11
  2300. SUB c15, t4, c15
  2301. MUL b3, c02, t1
  2302. MUL b3, c06, t2
  2303. MUL b3, c10, t3
  2304. MUL b3, c14, t4
  2305. SUB c04, t1, c04
  2306. SUB c08, t2, c08
  2307. SUB c12, t3, c12
  2308. SUB c16, t4, c16
  /* Stage 3: group c03,c07,c11,c15 with entries 10 (diagonal) and 11, then
     the last group c04,c08,c12,c16 is scaled by entry 15. */
  2309. LD a1, 10 * SIZE(AO)
  2310. LD a2, 11 * SIZE(AO)
  2311. LD a3, 15 * SIZE(AO)
  2312. MUL a1, c03, c03
  2313. MUL a1, c07, c07
  2314. MUL a1, c11, c11
  2315. MUL a1, c15, c15
  2316. MUL a2, c03, t1
  2317. MUL a2, c07, t2
  2318. MUL a2, c11, t3
  2319. MUL a2, c15, t4
  2320. SUB c04, t1, c04
  2321. SUB c08, t2, c08
  2322. SUB c12, t3, c12
  2323. SUB c16, t4, c16
  2324. MUL a3, c04, c04
  2325. MUL a3, c08, c08
  2326. MUL a3, c12, c12
  2327. MUL a3, c16, c16
  2328. #endif
  /* RN: solve with the 4x4 block at BO, walking forward from the first
     diagonal entry (offset 0) to the last (offset 15).  Accumulators are
     grouped as c01..c04, c05..c08, c09..c12, c13..c16; each stage scales one
     group by the diagonal entry and subtracts its multiples from the later
     groups.  The scaling is a MUL - presumably the packed diagonal holds
     reciprocals; confirm against the packing routine. */
  2329. #ifdef RN
  /* Stage 1: group c01..c04; entries 0 (diagonal), 1, 2, 3. */
  2330. LD a1, 0 * SIZE(BO)
  2331. LD a2, 1 * SIZE(BO)
  2332. LD a3, 2 * SIZE(BO)
  2333. LD a4, 3 * SIZE(BO)
  2334. MUL a1, c01, c01
  2335. MUL a1, c02, c02
  2336. MUL a1, c03, c03
  2337. MUL a1, c04, c04
  2338. MUL a2, c01, t1
  2339. MUL a2, c02, t2
  2340. MUL a2, c03, t3
  2341. MUL a2, c04, t4
  2342. SUB c05, t1, c05
  2343. SUB c06, t2, c06
  2344. SUB c07, t3, c07
  2345. SUB c08, t4, c08
  2346. MUL a3, c01, t1
  2347. MUL a3, c02, t2
  2348. MUL a3, c03, t3
  2349. MUL a3, c04, t4
  2350. SUB c09, t1, c09
  2351. SUB c10, t2, c10
  2352. SUB c11, t3, c11
  2353. SUB c12, t4, c12
  2354. MUL a4, c01, t1
  2355. MUL a4, c02, t2
  2356. MUL a4, c03, t3
  2357. MUL a4, c04, t4
  2358. SUB c13, t1, c13
  2359. SUB c14, t2, c14
  2360. SUB c15, t3, c15
  2361. SUB c16, t4, c16
  /* Stage 2: group c05..c08; entries 5 (diagonal), 6, 7. */
  2362. LD b1, 5 * SIZE(BO)
  2363. LD b2, 6 * SIZE(BO)
  2364. LD b3, 7 * SIZE(BO)
  2365. MUL b1, c05, c05
  2366. MUL b1, c06, c06
  2367. MUL b1, c07, c07
  2368. MUL b1, c08, c08
  2369. MUL b2, c05, t1
  2370. MUL b2, c06, t2
  2371. MUL b2, c07, t3
  2372. MUL b2, c08, t4
  2373. SUB c09, t1, c09
  2374. SUB c10, t2, c10
  2375. SUB c11, t3, c11
  2376. SUB c12, t4, c12
  2377. MUL b3, c05, t1
  2378. MUL b3, c06, t2
  2379. MUL b3, c07, t3
  2380. MUL b3, c08, t4
  2381. SUB c13, t1, c13
  2382. SUB c14, t2, c14
  2383. SUB c15, t3, c15
  2384. SUB c16, t4, c16
  /* Stage 3: group c09..c12 with entries 10 (diagonal) and 11, then the last
     group c13..c16 is scaled by entry 15. */
  2385. LD a1, 10 * SIZE(BO)
  2386. LD a2, 11 * SIZE(BO)
  2387. LD a3, 15 * SIZE(BO)
  2388. MUL a1, c09, c09
  2389. MUL a1, c10, c10
  2390. MUL a1, c11, c11
  2391. MUL a1, c12, c12
  2392. MUL a2, c09, t1
  2393. MUL a2, c10, t2
  2394. MUL a2, c11, t3
  2395. MUL a2, c12, t4
  2396. SUB c13, t1, c13
  2397. SUB c14, t2, c14
  2398. SUB c15, t3, c15
  2399. SUB c16, t4, c16
  2400. MUL a3, c13, c13
  2401. MUL a3, c14, c14
  2402. MUL a3, c15, c15
  2403. MUL a3, c16, c16
  2404. #endif
  /* RT: solve with the 4x4 block at BO, walking from the last diagonal entry
     (offset 15) back to the first (offset 0).  Accumulators are grouped as
     c13..c16, c09..c12, c05..c08, c01..c04; each stage scales one group by
     the diagonal entry and subtracts its multiples from the groups still to
     be solved.  The scaling is a MUL - presumably the packed diagonal holds
     reciprocals; confirm against the packing routine. */
  2405. #ifdef RT
  /* Stage 1: group c13..c16; entries 15 (diagonal), 14, 13, 12. */
  2406. LD a1, 15 * SIZE(BO)
  2407. LD a2, 14 * SIZE(BO)
  2408. LD a3, 13 * SIZE(BO)
  2409. LD a4, 12 * SIZE(BO)
  2410. MUL a1, c13, c13
  2411. MUL a1, c14, c14
  2412. MUL a1, c15, c15
  2413. MUL a1, c16, c16
  2414. MUL a2, c13, t1
  2415. MUL a2, c14, t2
  2416. MUL a2, c15, t3
  2417. MUL a2, c16, t4
  2418. SUB c09, t1, c09
  2419. SUB c10, t2, c10
  2420. SUB c11, t3, c11
  2421. SUB c12, t4, c12
  2422. MUL a3, c13, t1
  2423. MUL a3, c14, t2
  2424. MUL a3, c15, t3
  2425. MUL a3, c16, t4
  2426. SUB c05, t1, c05
  2427. SUB c06, t2, c06
  2428. SUB c07, t3, c07
  2429. SUB c08, t4, c08
  2430. MUL a4, c13, t1
  2431. MUL a4, c14, t2
  2432. MUL a4, c15, t3
  2433. MUL a4, c16, t4
  2434. SUB c01, t1, c01
  2435. SUB c02, t2, c02
  2436. SUB c03, t3, c03
  2437. SUB c04, t4, c04
  /* Stage 2: group c09..c12; entries 10 (diagonal), 9, 8. */
  2438. LD b1, 10 * SIZE(BO)
  2439. LD b2, 9 * SIZE(BO)
  2440. LD b3, 8 * SIZE(BO)
  2441. MUL b1, c09, c09
  2442. MUL b1, c10, c10
  2443. MUL b1, c11, c11
  2444. MUL b1, c12, c12
  2445. MUL b2, c09, t1
  2446. MUL b2, c10, t2
  2447. MUL b2, c11, t3
  2448. MUL b2, c12, t4
  2449. SUB c05, t1, c05
  2450. SUB c06, t2, c06
  2451. SUB c07, t3, c07
  2452. SUB c08, t4, c08
  2453. MUL b3, c09, t1
  2454. MUL b3, c10, t2
  2455. MUL b3, c11, t3
  2456. MUL b3, c12, t4
  2457. SUB c01, t1, c01
  2458. SUB c02, t2, c02
  2459. SUB c03, t3, c03
  2460. SUB c04, t4, c04
  /* Stage 3: group c05..c08 with entries 5 (diagonal) and 4, then the last
     group c01..c04 is scaled by entry 0. */
  2461. LD a1, 5 * SIZE(BO)
  2462. LD a2, 4 * SIZE(BO)
  2463. LD a3, 0 * SIZE(BO)
  2464. MUL a1, c05, c05
  2465. MUL a1, c06, c06
  2466. MUL a1, c07, c07
  2467. MUL a1, c08, c08
  2468. MUL a2, c05, t1
  2469. MUL a2, c06, t2
  2470. MUL a2, c07, t3
  2471. MUL a2, c08, t4
  2472. SUB c01, t1, c01
  2473. SUB c02, t2, c02
  2474. SUB c03, t3, c03
  2475. SUB c04, t4, c04
  2476. MUL a3, c01, c01
  2477. MUL a3, c02, c02
  2478. MUL a3, c03, c03
  2479. MUL a3, c04, c04
  2480. #endif
  /* Write the 16 solved values back into a packed buffer.
     LN/LT: to BO, element order c01,c05,c09,c13, c02,c06,c10,c14,
            c03,c07,c11,c15, c04,c08,c12,c16.
     RN/RT: to AO, element order c01..c16. */
  2481. #if defined(LN) || defined(LT)
  2482. ST c01, 0 * SIZE(BO)
  2483. ST c05, 1 * SIZE(BO)
  2484. ST c09, 2 * SIZE(BO)
  2485. ST c13, 3 * SIZE(BO)
  2486. ST c02, 4 * SIZE(BO)
  2487. ST c06, 5 * SIZE(BO)
  2488. ST c10, 6 * SIZE(BO)
  2489. ST c14, 7 * SIZE(BO)
  2490. ST c03, 8 * SIZE(BO)
  2491. ST c07, 9 * SIZE(BO)
  2492. ST c11, 10 * SIZE(BO)
  2493. ST c15, 11 * SIZE(BO)
  2494. ST c04, 12 * SIZE(BO)
  2495. ST c08, 13 * SIZE(BO)
  2496. ST c12, 14 * SIZE(BO)
  2497. ST c16, 15 * SIZE(BO)
  2498. #else
  2499. ST c01, 0 * SIZE(AO)
  2500. ST c02, 1 * SIZE(AO)
  2501. ST c03, 2 * SIZE(AO)
  2502. ST c04, 3 * SIZE(AO)
  2503. ST c05, 4 * SIZE(AO)
  2504. ST c06, 5 * SIZE(AO)
  2505. ST c07, 6 * SIZE(AO)
  2506. ST c08, 7 * SIZE(AO)
  2507. ST c09, 8 * SIZE(AO)
  2508. ST c10, 9 * SIZE(AO)
  2509. ST c11, 10 * SIZE(AO)
  2510. ST c12, 11 * SIZE(AO)
  2511. ST c13, 12 * SIZE(AO)
  2512. ST c14, 13 * SIZE(AO)
  2513. ST c15, 14 * SIZE(AO)
  2514. ST c16, 15 * SIZE(AO)
  2515. #endif
  /* Store the 4x4 tile through the four pointers C1..C4: c01..c04 go to C1,
     c05..c08 to C2, c09..c12 to C3 and c13..c16 to C4.  LN moves the
     pointers back by 4*SIZE before the stores; every other variant moves
     them forward by 4*SIZE afterwards. */
  2516. #ifdef LN
  2517. lda C1, -4 * SIZE(C1)
  2518. lda C2, -4 * SIZE(C2)
  2519. lda C3, -4 * SIZE(C3)
  2520. lda C4, -4 * SIZE(C4)
  2521. #endif
  2522. ST c01, 0 * SIZE(C1)
  2523. ST c02, 1 * SIZE(C1)
  2524. ST c03, 2 * SIZE(C1)
  2525. ST c04, 3 * SIZE(C1)
  2526. ST c05, 0 * SIZE(C2)
  2527. ST c06, 1 * SIZE(C2)
  2528. ST c07, 2 * SIZE(C2)
  2529. ST c08, 3 * SIZE(C2)
  2530. ST c09, 0 * SIZE(C3)
  2531. ST c10, 1 * SIZE(C3)
  2532. ST c11, 2 * SIZE(C3)
  2533. ST c12, 3 * SIZE(C3)
  2534. ST c13, 0 * SIZE(C4)
  2535. ST c14, 1 * SIZE(C4)
  2536. ST c15, 2 * SIZE(C4)
  2537. ST c16, 3 * SIZE(C4)
  2538. #ifndef LN
  2539. lda C1, 4 * SIZE(C1)
  2540. lda C2, 4 * SIZE(C2)
  2541. lda C3, 4 * SIZE(C3)
  2542. lda C4, 4 * SIZE(C4)
  2543. #endif
  /* Finish this 4x4 tile and loop.  t1..t4 are zeroed because the
     accumulation code starts with ADDs that read them before any MUL has
     written them (visible in $L22/$L25; presumably the same at $L11, which
     is outside this chunk). */
  2544. fclr t1
  2545. fclr t2
  2546. fclr t3
  2547. fclr t4
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  2548. #ifdef RT
  2549. sll K, 2 + BASE_SHIFT, TMP1
  2550. addq AORIG, TMP1, AORIG
  2551. #endif
  /* LT/RN: AO and BO both advance by (K - KK) << (BASE_SHIFT + 2). */
  2552. #if defined(LT) || defined(RN)
  2553. subq K, KK, TMP1
  2554. sll TMP1, BASE_SHIFT + 2, TMP1
  2555. addq AO, TMP1, AO
  2556. addq BO, TMP1, BO
  2557. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  2558. #ifdef LT
  2559. addq KK, 4, KK
  2560. #endif
  2561. #ifdef LN
  2562. subq KK, 4, KK
  2563. #endif
  /* I -= 1; repeat from $L11 while I is still positive. */
  2564. lda I, -1(I)
  2565. bgt I, $L11
  2566. .align 4
  /* $L20: 2x4 remainder tile.  Skipped (branch to $L30) unless bit 1 of M is
     set.  The setup preloads a1..a4 and b1..b4, zeroes the eight
     accumulators c01,c02,c05,c06,c09,c10,c13,c14, and sets L to the
     iteration count minus 2. */
  2567. $L20:
  2568. and M, 2, I
  2569. ble I, $L30
  /* LT/RN: operands come from AO and B directly; the count is KK.
     AO is advanced by 2*SIZE and BO is set to B + 4*SIZE. */
  2570. #if defined(LT) || defined(RN)
  2571. LD a1, 0 * SIZE(AO)
  2572. fclr c09
  2573. LD a2, 1 * SIZE(AO)
  2574. fclr c13
  2575. LD a3, 2 * SIZE(AO)
  2576. fclr c10
  2577. LD a4, 3 * SIZE(AO)
  2578. fclr c14
  2579. LD b1, 0 * SIZE(B)
  2580. lda L, -2(KK)
  2581. LD b2, 1 * SIZE(B)
  2582. lda AO, 2 * SIZE(AO)
  2583. LD b3, 2 * SIZE(B)
  2584. fclr c01
  2585. LD b4, 3 * SIZE(B)
  2586. fclr c05
  2587. lda BO, 4 * SIZE(B)
  2588. fclr c02
  2589. fclr c06
  /* KK <= 0: nothing to accumulate, go straight to the solve at $L28.
     L <= 0: skip the unrolled loop and go to the tail at $L25. */
  2590. ble KK, $L28
  2591. ble L, $L25
  2592. #else
  /* LN/RT: LN first moves AORIG back by K << (BASE_SHIFT + 1).  Then
     AO = AORIG + (KK << (BASE_SHIFT + 1)), BO = B + (KK << (BASE_SHIFT + 2)),
     and the count is TMP1 = K - KK. */
  2593. #ifdef LN
  2594. sll K, BASE_SHIFT + 1, TMP1
  2595. subq AORIG, TMP1, AORIG
  2596. #endif
  2597. sll KK, BASE_SHIFT + 1, TMP1
  2598. addq AORIG, TMP1, AO
  2599. sll KK, BASE_SHIFT + 2, TMP2
  2600. addq B, TMP2, BO
  2601. subq K, KK, TMP1
  2602. LD a1, 0 * SIZE(AO)
  2603. fclr c09
  2604. LD a2, 1 * SIZE(AO)
  2605. fclr c13
  2606. LD a3, 2 * SIZE(AO)
  2607. fclr c10
  2608. LD a4, 3 * SIZE(AO)
  2609. fclr c14
  2610. LD b1, 0 * SIZE(BO)
  2611. lda L, -2(TMP1)
  2612. LD b2, 1 * SIZE(BO)
  2613. lda AO, 2 * SIZE(AO)
  2614. LD b3, 2 * SIZE(BO)
  2615. fclr c01
  2616. LD b4, 3 * SIZE(BO)
  2617. fclr c05
  2618. lda BO, 4 * SIZE(BO)
  2619. fclr c02
  2620. fclr c06
  /* TMP1 <= 0: go straight to the solve at $L28.
     L <= 0: skip the unrolled loop and go to the tail at $L25. */
  2621. ble TMP1, $L28
  2622. ble L, $L25
  2623. #endif
  2624. .align 4
  /* $L22: 2x4 accumulation loop, two steps per iteration (L -= 2).
     First half:  a1,a2 times b1..b4.
     Second half: a3,a4 times b1,b2,b3,b5 - b5 carries the fourth B value of
     the second step so that b4 can be reloaded for the next iteration.
     Per iteration BO advances by 8*SIZE and AO by 4*SIZE; loads issued after
     a pointer bump therefore use negative offsets. */
  2625. $L22:
  2626. ADD c09, t1, c09
  2627. unop
  2628. MUL a1, b1, t1
  2629. unop
  2630. ADD c10, t2, c10
  2631. unop
  2632. MUL a2, b1, t2
  2633. LD b1, 0 * SIZE(BO)
  2634. ADD c13, t3, c13
  2635. unop
  2636. MUL a1, b2, t3
  2637. lda BO, 8 * SIZE(BO)
  2638. ADD c14, t4, c14
  2639. unop
  2640. MUL a2, b2, t4
  2641. LD b2, -7 * SIZE(BO)
  2642. ADD c01, t1, c01
  2643. unop
  2644. MUL a1, b3, t1
  2645. unop
  2646. ADD c02, t2, c02
  2647. unop
  2648. MUL a2, b3, t2
  2649. LD b3, -6 * SIZE(BO)
  2650. ADD c05, t3, c05
  2651. unop
  2652. MUL a1, b4, t3
  2653. LD a1, 2 * SIZE(AO)
  2654. ADD c06, t4, c06
  2655. MUL a2, b4, t4
  2656. LD b5, -5 * SIZE(BO)
  /* Second step of the iteration starts here (a3, a4). */
  2657. ADD c09, t1, c09
  2658. unop
  2659. MUL a3, b1, t1
  2660. LD a2, 3 * SIZE(AO)
  2661. ADD c10, t2, c10
  2662. unop
  2663. MUL a4, b1, t2
  2664. LD b1, -4 * SIZE(BO)
  2665. ADD c13, t3, c13
  2666. unop
  2667. MUL a3, b2, t3
  2668. lda AO, 4 * SIZE(AO)
  2669. ADD c14, t4, c14
  2670. MUL a4, b2, t4
  2671. LD b2, -3 * SIZE(BO)
  2672. ADD c01, t1, c01
  2673. lda L, -2(L)
  2674. MUL a3, b3, t1
  2675. LD b4, -1 * SIZE(BO)
  2676. ADD c02, t2, c02
  2677. unop
  2678. MUL a4, b3, t2
  2679. LD b3, -2 * SIZE(BO)
  2680. ADD c05, t3, c05
  2681. unop
  2682. MUL a3, b5, t3
  2683. LD a3, 0 * SIZE(AO)
  2684. ADD c06, t4, c06
  2685. MUL a4, b5, t4
  2686. LD a4, 1 * SIZE(AO)
  2687. bgt L, $L22
  2688. .align 4
  /* $L25: tail of the 2x4 accumulation.  The first ADD folds the product
     pending in t1 into c09 and the MUL starts the next step. */
  2689. $L25:
  2690. ADD c09, t1, c09
  2691. MUL a1, b1, t1
  /* blbs branches when bit 0 of the count (KK for LT/RN, TMP1 otherwise) is
     set: that path runs only the $L27 step.  The fall-through path runs one
     complete step here first. */
  2692. #if defined(LT) || defined(RN)
  2693. blbs KK, $L27
  2694. #else
  2695. blbs TMP1, $L27
  2696. #endif
  /* One full 2x4 step using a1,a2 and b1..b4, reloading all six for the
     following step.  AO advances by 2*SIZE and BO by 4*SIZE. */
  2697. ADD c10, t2, c10
  2698. unop
  2699. MUL a2, b1, t2
  2700. LD b1, 0 * SIZE(BO)
  2701. ADD c13, t3, c13
  2702. unop
  2703. MUL a1, b2, t3
  2704. unop
  2705. ADD c14, t4, c14
  2706. unop
  2707. MUL a2, b2, t4
  2708. LD b2, 1 * SIZE(BO)
  2709. ADD c01, t1, c01
  2710. unop
  2711. MUL a1, b3, t1
  2712. lda AO, 2 * SIZE(AO)
  2713. ADD c02, t2, c02
  2714. unop
  2715. MUL a2, b3, t2
  2716. LD b3, 2 * SIZE(BO)
  2717. ADD c05, t3, c05
  2718. unop
  2719. MUL a1, b4, t3
  /* AO was already advanced above, hence the negative offsets. */
  2720. LD a1, -2 * SIZE(AO)
  2721. ADD c06, t4, c06
  2722. unop
  2723. MUL a2, b4, t4
  2724. LD a2, -1 * SIZE(AO)
  /* Same ADD/MUL pair as at the top of $L25, so $L27 sees the same pending
     state on either path. */
  2725. ADD c09, t1, c09
  2726. LD b4, 3 * SIZE(BO)
  2727. MUL a1, b1, t1
  2728. lda BO, 4 * SIZE(BO)
  2729. .align 4
  /* $L27: final 2x4 step.  No operands are reloaded.  AO advances by 2*SIZE
     and BO by 4*SIZE, and the last four ADDs drain t1..t4 into c09, c10,
     c13 and c14. */
  2730. $L27:
  2731. ADD c10, t2, c10
  2732. MUL a2, b1, t2
  2733. ADD c13, t3, c13
  2734. MUL a1, b2, t3
  2735. ADD c14, t4, c14
  2736. MUL a2, b2, t4
  2737. ADD c01, t1, c01
  2738. MUL a1, b3, t1
  2739. ADD c02, t2, c02
  2740. MUL a2, b3, t2
  2741. ADD c05, t3, c05
  2742. MUL a1, b4, t3
  2743. ADD c06, t4, c06
  2744. lda AO, 2 * SIZE(AO)
  2745. MUL a2, b4, t4
  2746. lda BO, 4 * SIZE(BO)
  2747. ADD c09, t1, c09
  2748. ADD c10, t2, c10
  2749. ADD c13, t3, c13
  2750. ADD c14, t4, c14
  2751. .align 4
  /* $L28: position AO and BO for the 2x4 solve.
     LN: TMP1 = KK - 2;  RT: TMP1 = KK - 4.  Then
         AO = AORIG + (TMP1 << (BASE_SHIFT + 1)),
         BO = B     + (TMP1 << (BASE_SHIFT + 2)).
     LT/RN: step AO back by 2*SIZE and BO back by 4*SIZE. */
  2752. $L28:
  2753. #if defined(LN) || defined(RT)
  2754. #ifdef LN
  2755. subq KK, 2, TMP1
  2756. #else
  2757. subq KK, 4, TMP1
  2758. #endif
  2759. sll TMP1, BASE_SHIFT + 1, TMP2
  2760. addq AORIG, TMP2, AO
  2761. sll TMP1, BASE_SHIFT + 2, TMP2
  2762. addq B, TMP2, BO
  2763. #else
  2764. lda AO, -2 * SIZE(AO)
  2765. lda BO, -4 * SIZE(BO)
  2766. #endif
  /* Form the right-hand side of the 2x4 solve: every accumulator becomes
     (packed value - accumulator).  "SUB a1, c01, c01" gives c01 = a1 - c01
     under Alpha src1, src2, dest ordering, assuming SUB expands to a plain
     FP subtract (the macro is defined outside this chunk).
     LN/LT: 8 values from BO, mapping to c01,c05,c09,c13, c02,c06,c10,c14.
     RN/RT: 8 values from AO, mapping to c01,c02, c05,c06, c09,c10, c13,c14. */
  2767. #if defined(LN) || defined(LT)
  2768. LD a1, 0 * SIZE(BO)
  2769. LD a2, 1 * SIZE(BO)
  2770. LD a3, 2 * SIZE(BO)
  2771. LD a4, 3 * SIZE(BO)
  2772. LD b1, 4 * SIZE(BO)
  2773. LD b2, 5 * SIZE(BO)
  2774. LD b3, 6 * SIZE(BO)
  2775. LD b4, 7 * SIZE(BO)
  2776. SUB a1, c01, c01
  2777. SUB a2, c05, c05
  2778. SUB a3, c09, c09
  2779. SUB a4, c13, c13
  2780. SUB b1, c02, c02
  2781. SUB b2, c06, c06
  2782. SUB b3, c10, c10
  2783. SUB b4, c14, c14
  2784. #else
  2785. LD a1, 0 * SIZE(AO)
  2786. LD a2, 1 * SIZE(AO)
  2787. LD a3, 2 * SIZE(AO)
  2788. LD a4, 3 * SIZE(AO)
  2789. LD b1, 4 * SIZE(AO)
  2790. LD b2, 5 * SIZE(AO)
  2791. LD b3, 6 * SIZE(AO)
  2792. LD b4, 7 * SIZE(AO)
  2793. SUB a1, c01, c01
  2794. SUB a2, c02, c02
  2795. SUB a3, c05, c05
  2796. SUB a4, c06, c06
  2797. SUB b1, c09, c09
  2798. SUB b2, c10, c10
  2799. SUB b3, c13, c13
  2800. SUB b4, c14, c14
  2801. #endif
  /* LN (2x4): solve with the 2x2 block at AO, last diagonal first.
     Entry 3 scales c02,c06,c10,c14; entry 2 times that group is subtracted
     from c01,c05,c09,c13; entry 0 then scales c01,c05,c09,c13.
     The scaling is a MUL - presumably the packed diagonal holds reciprocals;
     confirm against the packing routine. */
  2802. #ifdef LN
  2803. LD a1, 3 * SIZE(AO)
  2804. LD a2, 2 * SIZE(AO)
  2805. LD a3, 0 * SIZE(AO)
  2806. MUL a1, c02, c02
  2807. MUL a1, c06, c06
  2808. MUL a1, c10, c10
  2809. MUL a1, c14, c14
  2810. MUL a2, c02, t1
  2811. MUL a2, c06, t2
  2812. MUL a2, c10, t3
  2813. MUL a2, c14, t4
  2814. SUB c01, t1, c01
  2815. SUB c05, t2, c05
  2816. SUB c09, t3, c09
  2817. SUB c13, t4, c13
  2818. MUL a3, c01, c01
  2819. MUL a3, c05, c05
  2820. MUL a3, c09, c09
  2821. MUL a3, c13, c13
  2822. #endif
  /* LT (2x4): solve with the 2x2 block at AO, first diagonal first.
     Entry 0 scales c01,c05,c09,c13; entry 1 times that group is subtracted
     from c02,c06,c10,c14; entry 3 then scales c02,c06,c10,c14.
     The scaling is a MUL - presumably the packed diagonal holds reciprocals;
     confirm against the packing routine. */
  2823. #ifdef LT
  2824. LD a1, 0 * SIZE(AO)
  2825. LD a2, 1 * SIZE(AO)
  2826. LD a3, 3 * SIZE(AO)
  2827. MUL a1, c01, c01
  2828. MUL a1, c05, c05
  2829. MUL a1, c09, c09
  2830. MUL a1, c13, c13
  2831. MUL a2, c01, t1
  2832. MUL a2, c05, t2
  2833. MUL a2, c09, t3
  2834. MUL a2, c13, t4
  2835. SUB c02, t1, c02
  2836. SUB c06, t2, c06
  2837. SUB c10, t3, c10
  2838. SUB c14, t4, c14
  2839. MUL a3, c02, c02
  2840. MUL a3, c06, c06
  2841. MUL a3, c10, c10
  2842. MUL a3, c14, c14
  2843. #endif
  /* RN (2x4): solve with the 4x4 block at BO, first diagonal first.  The
     accumulators form four pairs: (c01,c02), (c05,c06), (c09,c10),
     (c13,c14).  Each stage scales one pair by the diagonal entry and
     subtracts its multiples from the later pairs.  The scaling is a MUL -
     presumably the packed diagonal holds reciprocals; confirm against the
     packing routine. */
  2844. #ifdef RN
  /* Stage 1: pair (c01,c02); entries 0 (diagonal), 1, 2, 3. */
  2845. LD a1, 0 * SIZE(BO)
  2846. LD a2, 1 * SIZE(BO)
  2847. LD a3, 2 * SIZE(BO)
  2848. LD a4, 3 * SIZE(BO)
  2849. MUL a1, c01, c01
  2850. MUL a1, c02, c02
  2851. MUL a2, c01, t1
  2852. MUL a2, c02, t2
  2853. SUB c05, t1, c05
  2854. SUB c06, t2, c06
  2855. MUL a3, c01, t1
  2856. MUL a3, c02, t2
  2857. SUB c09, t1, c09
  2858. SUB c10, t2, c10
  2859. MUL a4, c01, t1
  2860. MUL a4, c02, t2
  2861. SUB c13, t1, c13
  2862. SUB c14, t2, c14
  /* Stage 2: pair (c05,c06); entries 5 (diagonal), 6, 7. */
  2863. LD b1, 5 * SIZE(BO)
  2864. LD b2, 6 * SIZE(BO)
  2865. LD b3, 7 * SIZE(BO)
  2866. MUL b1, c05, c05
  2867. MUL b1, c06, c06
  2868. MUL b2, c05, t1
  2869. MUL b2, c06, t2
  2870. SUB c09, t1, c09
  2871. SUB c10, t2, c10
  2872. MUL b3, c05, t1
  2873. MUL b3, c06, t2
  2874. SUB c13, t1, c13
  2875. SUB c14, t2, c14
  /* Stage 3: pair (c09,c10) with entries 10 (diagonal) and 11, then pair
     (c13,c14) is scaled by entry 15. */
  2876. LD a1, 10 * SIZE(BO)
  2877. LD a2, 11 * SIZE(BO)
  2878. LD a3, 15 * SIZE(BO)
  2879. MUL a1, c09, c09
  2880. MUL a1, c10, c10
  2881. MUL a2, c09, t1
  2882. MUL a2, c10, t2
  2883. SUB c13, t1, c13
  2884. SUB c14, t2, c14
  2885. MUL a3, c13, c13
  2886. MUL a3, c14, c14
  2887. #endif
  /* RT (2x4): solve with the 4x4 block at BO, last diagonal first.  The
     accumulators form four pairs: (c13,c14), (c09,c10), (c05,c06),
     (c01,c02).  Each stage scales one pair by the diagonal entry and
     subtracts its multiples from the pairs still to be solved.  The scaling
     is a MUL - presumably the packed diagonal holds reciprocals; confirm
     against the packing routine. */
  2888. #ifdef RT
  /* Stage 1: pair (c13,c14); entries 15 (diagonal), 14, 13, 12. */
  2889. LD a1, 15 * SIZE(BO)
  2890. LD a2, 14 * SIZE(BO)
  2891. LD a3, 13 * SIZE(BO)
  2892. LD a4, 12 * SIZE(BO)
  2893. MUL a1, c13, c13
  2894. MUL a1, c14, c14
  2895. MUL a2, c13, t1
  2896. MUL a2, c14, t2
  2897. SUB c09, t1, c09
  2898. SUB c10, t2, c10
  2899. MUL a3, c13, t1
  2900. MUL a3, c14, t2
  2901. SUB c05, t1, c05
  2902. SUB c06, t2, c06
  2903. MUL a4, c13, t1
  2904. MUL a4, c14, t2
  2905. SUB c01, t1, c01
  2906. SUB c02, t2, c02
  /* Stage 2: pair (c09,c10); entries 10 (diagonal), 9, 8. */
  2907. LD b1, 10 * SIZE(BO)
  2908. LD b2, 9 * SIZE(BO)
  2909. LD b3, 8 * SIZE(BO)
  2910. MUL b1, c09, c09
  2911. MUL b1, c10, c10
  2912. MUL b2, c09, t1
  2913. MUL b2, c10, t2
  2914. SUB c05, t1, c05
  2915. SUB c06, t2, c06
  2916. MUL b3, c09, t1
  2917. MUL b3, c10, t2
  2918. SUB c01, t1, c01
  2919. SUB c02, t2, c02
  /* Stage 3: pair (c05,c06) with entries 5 (diagonal) and 4, then pair
     (c01,c02) is scaled by entry 0. */
  2920. LD a1, 5 * SIZE(BO)
  2921. LD a2, 4 * SIZE(BO)
  2922. LD a3, 0 * SIZE(BO)
  2923. MUL a1, c05, c05
  2924. MUL a1, c06, c06
  2925. MUL a2, c05, t1
  2926. MUL a2, c06, t2
  2927. SUB c01, t1, c01
  2928. SUB c02, t2, c02
  2929. MUL a3, c01, c01
  2930. MUL a3, c02, c02
  2931. #endif
  /* Write the 8 solved values of the 2x4 tile back into a packed buffer.
     LN/LT: to BO, element order c01,c05,c09,c13, c02,c06,c10,c14.
     RN/RT: to AO, element order c01,c02, c05,c06, c09,c10, c13,c14. */
  2932. #if defined(LN) || defined(LT)
  2933. ST c01, 0 * SIZE(BO)
  2934. ST c05, 1 * SIZE(BO)
  2935. ST c09, 2 * SIZE(BO)
  2936. ST c13, 3 * SIZE(BO)
  2937. ST c02, 4 * SIZE(BO)
  2938. ST c06, 5 * SIZE(BO)
  2939. ST c10, 6 * SIZE(BO)
  2940. ST c14, 7 * SIZE(BO)
  2941. #else
  2942. ST c01, 0 * SIZE(AO)
  2943. ST c02, 1 * SIZE(AO)
  2944. ST c05, 2 * SIZE(AO)
  2945. ST c06, 3 * SIZE(AO)
  2946. ST c09, 4 * SIZE(AO)
  2947. ST c10, 5 * SIZE(AO)
  2948. ST c13, 6 * SIZE(AO)
  2949. ST c14, 7 * SIZE(AO)
  2950. #endif
  /* Store the 2x4 tile through C1..C4: (c01,c02) to C1, (c05,c06) to C2,
     (c09,c10) to C3 and (c13,c14) to C4.  LN moves the pointers back by
     2*SIZE before the stores; every other variant moves them forward by
     2*SIZE afterwards. */
  2951. #ifdef LN
  2952. lda C1, -2 * SIZE(C1)
  2953. lda C2, -2 * SIZE(C2)
  2954. lda C3, -2 * SIZE(C3)
  2955. lda C4, -2 * SIZE(C4)
  2956. #endif
  2957. ST c01, 0 * SIZE(C1)
  2958. ST c02, 1 * SIZE(C1)
  2959. ST c05, 0 * SIZE(C2)
  2960. ST c06, 1 * SIZE(C2)
  2961. ST c09, 0 * SIZE(C3)
  2962. ST c10, 1 * SIZE(C3)
  2963. ST c13, 0 * SIZE(C4)
  2964. ST c14, 1 * SIZE(C4)
  2965. #ifndef LN
  2966. lda C1, 2 * SIZE(C1)
  2967. lda C2, 2 * SIZE(C2)
  2968. lda C3, 2 * SIZE(C3)
  2969. lda C4, 2 * SIZE(C4)
  2970. #endif
  /* Finish the 2x4 tile.  t1..t4 are zeroed because the accumulation code
     that follows ($L32/$L35) starts with ADDs that read them before any MUL
     has written them. */
  2971. fclr t1
  2972. fclr t2
  2973. fclr t3
  2974. fclr t4
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2975. #ifdef RT
  2976. sll K, 1 + BASE_SHIFT, TMP1
  2977. addq AORIG, TMP1, AORIG
  2978. #endif
  /* LT/RN: with TMP1 = K - KK, AO += TMP1 << (BASE_SHIFT + 1) and
     BO += TMP1 << (BASE_SHIFT + 2). */
  2979. #if defined(LT) || defined(RN)
  2980. subq K, KK, TMP1
  2981. sll TMP1, BASE_SHIFT + 1, TMP2
  2982. addq AO, TMP2, AO
  2983. sll TMP1, BASE_SHIFT + 2, TMP2
  2984. addq BO, TMP2, BO
  2985. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN. */
  2986. #ifdef LT
  2987. addq KK, 2, KK
  2988. #endif
  2989. #ifdef LN
  2990. subq KK, 2, KK
  2991. #endif
  2992. .align 4
  /* $L30: 1x4 remainder tile.  Skipped (branch to $L39) unless bit 0 of M is
     set.  The setup preloads a1,a2 and b1..b4, zeroes the four accumulators
     c01,c05,c09,c13, and sets L to the iteration count minus 2. */
  2993. $L30:
  2994. and M, 1, I
  2995. ble I, $L39
  /* LT/RN: operands come from AO and B directly; the count is KK.
     AO is advanced by 1*SIZE and BO is set to B + 4*SIZE. */
  2996. #if defined(LT) || defined(RN)
  2997. LD a1, 0 * SIZE(AO)
  2998. fclr c01
  2999. LD a2, 1 * SIZE(AO)
  3000. fclr c05
  3001. LD b1, 0 * SIZE(B)
  3002. lda L, -2(KK)
  3003. LD b2, 1 * SIZE(B)
  3004. lda AO, 1 * SIZE(AO)
  3005. LD b3, 2 * SIZE(B)
  3006. fclr c09
  3007. LD b4, 3 * SIZE(B)
  3008. fclr c13
  3009. lda BO, 4 * SIZE(B)
  /* KK <= 0: go straight to the solve at $L38.
     L <= 0: skip the unrolled loop and go to the tail at $L35. */
  3010. ble KK, $L38
  3011. ble L, $L35
  3012. #else
  /* LN/RT: LN first moves AORIG back by K << BASE_SHIFT.  Then
     AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (BASE_SHIFT + 2)),
     and the count is TMP1 = K - KK. */
  3013. #ifdef LN
  3014. sll K, BASE_SHIFT + 0, TMP1
  3015. subq AORIG, TMP1, AORIG
  3016. #endif
  3017. sll KK, BASE_SHIFT + 0, TMP1
  3018. addq AORIG, TMP1, AO
  3019. sll KK, BASE_SHIFT + 2, TMP2
  3020. addq B, TMP2, BO
  3021. subq K, KK, TMP1
  3022. LD a1, 0 * SIZE(AO)
  3023. fclr c01
  3024. LD a2, 1 * SIZE(AO)
  3025. fclr c05
  3026. LD b1, 0 * SIZE(BO)
  3027. lda L, -2(TMP1)
  3028. LD b2, 1 * SIZE(BO)
  3029. lda AO, 1 * SIZE(AO)
  3030. LD b3, 2 * SIZE(BO)
  3031. fclr c09
  3032. LD b4, 3 * SIZE(BO)
  3033. fclr c13
  3034. lda BO, 4 * SIZE(BO)
  /* TMP1 <= 0: go straight to the solve at $L38.
     L <= 0: skip the unrolled loop and go to the tail at $L35. */
  3035. ble TMP1, $L38
  3036. ble L, $L35
  3037. #endif
  3038. .align 4
  /* $L32: 1x4 accumulation loop, two steps per iteration (L -= 2).
     First half:  a1 times b1..b4.
     Second half: a2 times b1,b2,b3,b5 - b5 carries the fourth B value of the
     second step so that b4 can be reloaded for the next iteration.
     Per iteration AO advances by 2*SIZE and BO by 8*SIZE; loads issued after
     a pointer bump use negative offsets. */
  3039. $L32:
  3040. ADD c01, t1, c01
  3041. lda L, -2(L)
  3042. MUL a1, b1, t1
  3043. LD b1, 0 * SIZE(BO)
  3044. ADD c05, t2, c05
  3045. lda AO, 2 * SIZE(AO)
  3046. MUL a1, b2, t2
  3047. LD b2, 1 * SIZE(BO)
  3048. ADD c09, t3, c09
  3049. LD b5, 3 * SIZE(BO)
  3050. MUL a1, b3, t3
  3051. LD b3, 2 * SIZE(BO)
  3052. ADD c13, t4, c13
  3053. MUL a1, b4, t4
  3054. LD a1, -1 * SIZE(AO)
  /* Second step of the iteration starts here (a2). */
  3055. ADD c01, t1, c01
  3056. MUL a2, b1, t1
  3057. LD b1, 4 * SIZE(BO)
  3058. lda BO, 8 * SIZE(BO)
  3059. ADD c05, t2, c05
  3060. MUL a2, b2, t2
  3061. LD b2, -3 * SIZE(BO)
  3062. ADD c09, t3, c09
  3063. LD b4, -1 * SIZE(BO)
  3064. MUL a2, b3, t3
  3065. LD b3, -2 * SIZE(BO)
  3066. ADD c13, t4, c13
  3067. MUL a2, b5, t4
  3068. LD a2, 0 * SIZE(AO)
  3069. bgt L, $L32
  3070. .align 4
  /* $L35: tail of the 1x4 accumulation.  The first ADD folds the product
     pending in t1 into c01 and the MUL starts the next step. */
  3071. $L35:
  3072. ADD c01, t1, c01
  3073. MUL a1, b1, t1
  /* blbs branches when bit 0 of the count (KK for LT/RN, TMP1 otherwise) is
     set: that path runs only the $L37 step.  The fall-through path runs one
     complete step here first. */
  3074. #if defined(LT) || defined(RN)
  3075. blbs KK, $L37
  3076. #else
  3077. blbs TMP1, $L37
  3078. #endif
  3079. .align 4
  /* One full 1x4 step using a1 and b1..b4, reloading all five for the
     following step.  AO advances by 1*SIZE and BO by 4*SIZE. */
  3080. ADD c05, t2, c05
  3081. LD b1, 0 * SIZE(BO)
  3082. MUL a1, b2, t2
  3083. LD b2, 1 * SIZE(BO)
  3084. ADD c09, t3, c09
  3085. MUL a1, b3, t3
  3086. LD b3, 2 * SIZE(BO)
  3087. ADD c13, t4, c13
  3088. MUL a1, b4, t4
  3089. LD a1, 0 * SIZE(AO)
  3090. lda AO, 1 * SIZE(AO)
  /* Same ADD/MUL pair as at the top of $L35, so $L37 sees the same pending
     state on either path. */
  3091. ADD c01, t1, c01
  3092. LD b4, 3 * SIZE(BO)
  3093. MUL a1, b1, t1
  3094. lda BO, 4 * SIZE(BO)
  3095. .align 4
  /* $L37: final 1x4 step.  No operands are reloaded.  AO advances by 1*SIZE
     and BO by 4*SIZE, and the last four ADDs drain t1..t4 into c01, c05,
     c09 and c13. */
  3096. $L37:
  3097. ADD c05, t2, c05
  3098. MUL a1, b2, t2
  3099. ADD c09, t3, c09
  3100. MUL a1, b3, t3
  3101. ADD c13, t4, c13
  3102. lda AO, 1 * SIZE(AO)
  3103. MUL a1, b4, t4
  3104. lda BO, 4 * SIZE(BO)
  3105. ADD c01, t1, c01
  3106. ADD c05, t2, c05
  3107. ADD c09, t3, c09
  3108. ADD c13, t4, c13
  /* $L38: position AO and BO for the 1x4 solve.
     LN: TMP1 = KK - 1;  RT: TMP1 = KK - 4.  Then
         AO = AORIG + (TMP1 << BASE_SHIFT),
         BO = B     + (TMP1 << (BASE_SHIFT + 2)).
     LT/RN: step AO back by 1*SIZE and BO back by 4*SIZE. */
  3109. $L38:
  3110. #if defined(LN) || defined(RT)
  3111. #ifdef LN
  3112. subq KK, 1, TMP1
  3113. #else
  3114. subq KK, 4, TMP1
  3115. #endif
  3116. sll TMP1, BASE_SHIFT + 0, TMP2
  3117. addq AORIG, TMP2, AO
  3118. sll TMP1, BASE_SHIFT + 2, TMP2
  3119. addq B, TMP2, BO
  3120. #else
  3121. lda AO, -1 * SIZE(AO)
  3122. lda BO, -4 * SIZE(BO)
  3123. #endif
  /* Form the right-hand side of the 1x4 solve: c01,c05,c09,c13 each become
     (packed value - accumulator).  The four values are read from BO for
     LN/LT and from AO for RN/RT; the register mapping is the same in both
     arms.  "SUB a1, c01, c01" gives c01 = a1 - c01 under Alpha src1, src2,
     dest ordering, assuming SUB expands to a plain FP subtract (the macro is
     defined outside this chunk). */
  3124. #if defined(LN) || defined(LT)
  3125. LD a1, 0 * SIZE(BO)
  3126. LD a2, 1 * SIZE(BO)
  3127. LD a3, 2 * SIZE(BO)
  3128. LD a4, 3 * SIZE(BO)
  3129. SUB a1, c01, c01
  3130. SUB a2, c05, c05
  3131. SUB a3, c09, c09
  3132. SUB a4, c13, c13
  3133. #else
  3134. LD a1, 0 * SIZE(AO)
  3135. LD a2, 1 * SIZE(AO)
  3136. LD a3, 2 * SIZE(AO)
  3137. LD a4, 3 * SIZE(AO)
  3138. SUB a1, c01, c01
  3139. SUB a2, c05, c05
  3140. SUB a3, c09, c09
  3141. SUB a4, c13, c13
  3142. #endif
  /* LN/LT (1x4): the block at AO is a single entry, so the solve is just a
     scaling of c01,c05,c09,c13 by AO[0].  It is a MUL - presumably the
     packed diagonal holds a reciprocal; confirm against the packing
     routine. */
  3143. #if defined(LN) || defined(LT)
  3144. LD a1, 0 * SIZE(AO)
  3145. MUL a1, c01, c01
  3146. MUL a1, c05, c05
  3147. MUL a1, c09, c09
  3148. MUL a1, c13, c13
  3149. #endif
  /* RN (1x4): solve with the 4x4 block at BO, first diagonal first, on the
     four scalars c01, c05, c09, c13.  Each stage scales one value by the
     diagonal entry and subtracts its multiples from the later values.  The
     scaling is a MUL - presumably the packed diagonal holds reciprocals;
     confirm against the packing routine. */
  3150. #ifdef RN
  /* Stage 1: c01; entries 0 (diagonal), 1, 2, 3. */
  3151. LD a1, 0 * SIZE(BO)
  3152. LD a2, 1 * SIZE(BO)
  3153. LD a3, 2 * SIZE(BO)
  3154. LD a4, 3 * SIZE(BO)
  3155. MUL a1, c01, c01
  3156. MUL a2, c01, t1
  3157. SUB c05, t1, c05
  3158. MUL a3, c01, t1
  3159. SUB c09, t1, c09
  3160. MUL a4, c01, t1
  3161. SUB c13, t1, c13
  /* Stage 2: c05; entries 5 (diagonal), 6, 7. */
  3162. LD b1, 5 * SIZE(BO)
  3163. LD b2, 6 * SIZE(BO)
  3164. LD b3, 7 * SIZE(BO)
  3165. MUL b1, c05, c05
  3166. MUL b2, c05, t1
  3167. SUB c09, t1, c09
  3168. MUL b3, c05, t1
  3169. SUB c13, t1, c13
  /* Stage 3: c09 with entries 10 (diagonal) and 11, then c13 is scaled by
     entry 15. */
  3170. LD a1, 10 * SIZE(BO)
  3171. LD a2, 11 * SIZE(BO)
  3172. LD a3, 15 * SIZE(BO)
  3173. MUL a1, c09, c09
  3174. MUL a2, c09, t1
  3175. SUB c13, t1, c13
  3176. MUL a3, c13, c13
  3177. #endif
  /* RT (1x4): solve with the 4x4 block at BO, last diagonal first, on the
     four scalars c13, c09, c05, c01.  Each stage scales one value by the
     diagonal entry and subtracts its multiples from the values still to be
     solved.  The scaling is a MUL - presumably the packed diagonal holds
     reciprocals; confirm against the packing routine. */
  3178. #ifdef RT
  /* Stage 1: c13; entries 15 (diagonal), 14, 13, 12. */
  3179. LD a1, 15 * SIZE(BO)
  3180. LD a2, 14 * SIZE(BO)
  3181. LD a3, 13 * SIZE(BO)
  3182. LD a4, 12 * SIZE(BO)
  3183. MUL a1, c13, c13
  3184. MUL a2, c13, t1
  3185. SUB c09, t1, c09
  3186. MUL a3, c13, t1
  3187. SUB c05, t1, c05
  3188. MUL a4, c13, t1
  3189. SUB c01, t1, c01
  /* Stage 2: c09; entries 10 (diagonal), 9, 8. */
  3190. LD b1, 10 * SIZE(BO)
  3191. LD b2, 9 * SIZE(BO)
  3192. LD b3, 8 * SIZE(BO)
  3193. MUL b1, c09, c09
  3194. MUL b2, c09, t1
  3195. SUB c05, t1, c05
  3196. MUL b3, c09, t1
  3197. SUB c01, t1, c01
  /* Stage 3: c05 with entries 5 (diagonal) and 4, then c01 is scaled by
     entry 0. */
  3198. LD a1, 5 * SIZE(BO)
  3199. LD a2, 4 * SIZE(BO)
  3200. LD a3, 0 * SIZE(BO)
  3201. MUL a1, c05, c05
  3202. MUL a2, c05, t1
  3203. SUB c01, t1, c01
  3204. MUL a3, c01, c01
  3205. #endif
  /* Write the four solved values c01,c05,c09,c13 back into a packed buffer:
     BO for LN/LT, AO for RN/RT (same element order in both arms). */
  3206. #if defined(LN) || defined(LT)
  3207. ST c01, 0 * SIZE(BO)
  3208. ST c05, 1 * SIZE(BO)
  3209. ST c09, 2 * SIZE(BO)
  3210. ST c13, 3 * SIZE(BO)
  3211. #else
  3212. ST c01, 0 * SIZE(AO)
  3213. ST c05, 1 * SIZE(AO)
  3214. ST c09, 2 * SIZE(AO)
  3215. ST c13, 3 * SIZE(AO)
  3216. #endif
  /* Store the 1x4 tile: one value through each of C1..C4.  LN moves the
     pointers back by 1*SIZE first.  Unlike the 4x4 and 2x4 tiles, there is
     no post-increment of C1..C4 for the other variants here. */
  3217. #ifdef LN
  3218. lda C1, -1 * SIZE(C1)
  3219. lda C2, -1 * SIZE(C2)
  3220. lda C3, -1 * SIZE(C3)
  3221. lda C4, -1 * SIZE(C4)
  3222. #endif
  3223. ST c01, 0 * SIZE(C1)
  3224. ST c05, 0 * SIZE(C2)
  3225. ST c09, 0 * SIZE(C3)
  3226. ST c13, 0 * SIZE(C4)
  /* Finish the 1x4 tile.
     RT: AORIG += K << (0 + BASE_SHIFT). */
  3227. #ifdef RT
  3228. sll K, 0 + BASE_SHIFT, TMP1
  3229. addq AORIG, TMP1, AORIG
  3230. #endif
  /* LT/RN: with TMP1 = K - KK, AO += TMP1 << BASE_SHIFT and
     BO += TMP1 << (BASE_SHIFT + 2). */
  3231. #if defined(LT) || defined(RN)
  3232. subq K, KK, TMP1
  3233. sll TMP1, BASE_SHIFT + 0, TMP2
  3234. addq AO, TMP2, AO
  3235. sll TMP1, BASE_SHIFT + 2, TMP2
  3236. addq BO, TMP2, BO
  3237. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN. */
  3238. #ifdef LT
  3239. addq KK, 1, KK
  3240. #endif
  3241. #ifdef LN
  3242. subq KK, 1, KK
  3243. #endif
  3244. .align 4
  /* $L39: end of one J iteration.
     LN:    B += K << (2 + BASE_SHIFT).
     LT/RN: B = BO.
     RN:    KK += 4.   RT: KK -= 4.
     Then J -= 1 and repeat from $L01 while J is still positive. */
  3245. $L39:
  3246. #ifdef LN
  3247. sll K, 2 + BASE_SHIFT, TMP1
  3248. addq B, TMP1, B
  3249. #endif
  3250. #if defined(LT) || defined(RN)
  3251. mov BO, B
  3252. #endif
  3253. #ifdef RN
  3254. addq KK, 4, KK
  3255. #endif
  3256. #ifdef RT
  3257. subq KK, 4, KK
  3258. #endif
  3259. lda J, -1(J)
  3260. bgt J, $L01
  3261. .align 4
  /* $L999: common exit.  Reload $f2..$f9 from the eight quadword slots at
     offsets 0..56 from $sp (presumably saved by the prologue, which is
     outside this chunk), set the integer return register $0 to zero,
     release STACKSIZE bytes of stack and return. */
  3262. $L999:
  3263. ldt $f2, 0($sp)
  3264. ldt $f3, 8($sp)
  3265. ldt $f4, 16($sp)
  3266. ldt $f5, 24($sp)
  3267. ldt $f6, 32($sp)
  3268. ldt $f7, 40($sp)
  3269. ldt $f8, 48($sp)
  3270. ldt $f9, 56($sp)
  3271. clr $0
  3272. lda $sp, STACKSIZE($sp)
  3273. ret
  3274. EPILOGUE