You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT.S 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* General-purpose register aliases used throughout this kernel. */
  /* M seeds the I loop counter, N seeds the J loop counter, and K is */
  /* shifted by ZBASE_SHIFT to form byte offsets and combined with KK */
  /* to form inner-loop trip counts. */
  40. #define M $4
  41. #define N $5
  42. #define K $6
  /* Base addresses from which AO/AORIG, BO and CO1..CO4 are derived. */
  43. #define A $9
  44. #define B $10
  45. #define C $11
  /* LDC is loaded from 128 + 0($sp) after the 128-byte frame is pushed, */
  /* then shifted left by ZBASE_SHIFT; it is the step between CO1..CO4. */
  46. #define LDC $8
  /* Running pointers used as the base of the LD instructions in the */
  /* multiply-accumulate loops. */
  47. #define AO $12
  48. #define BO $13
  /* Loop counters: I counts down from M, J is set from N >> 2 (and from */
  /* N & 2 at .L20), L counts inner-loop iterations. */
  49. #define I $2
  50. #define J $3
  51. #define L $7
  /* Store pointers: CO1 = C, and each following one is LDC bytes further. */
  52. #define CO1 $14
  53. #define CO2 $15
  54. #define CO3 $16
  55. #define CO4 $17
  /* OFFSET is loaded from 128 + 8($sp). KK is initialised from OFFSET in */
  /* a way that depends on which of LN, LT, RN or RT is defined. TEMP is */
  /* scratch. AORIG holds a copy of A when LN or RT is defined. */
  /* Registers $16..$21 are the ones spilled with SDARG in the prologue. */
  56. #define OFFSET $18
  57. #define KK $19
  58. #define TEMP $20
  59. #define AORIG $21
  /* Floating-point register aliases. */
  /* a1..a4 receive values loaded through AO in the inner loops. $f26 and */
  /* $f27 (a3, a4) are among the registers spilled with sdc1 in the prologue. */
  60. #define a1 $f0
  61. #define a2 $f1
  62. #define a3 $f26
  63. #define a4 $f27
  /* b1..b8 receive values loaded through B or BO in the inner loops, and */
  /* are reused as load targets in the solve section after .L18 and .L28. */
  64. #define b1 $f2
  65. #define b2 $f3
  66. #define b3 $f4
  67. #define b4 $f5
  68. #define b5 $f6
  69. #define b6 $f7
  70. #define b7 $f8
  71. #define b8 $f9
  /* a5 is a second name for the register behind b8; the two cannot hold */
  /* independent values at the same time. */
  72. #define a5 b8
  /* Accumulators. They are zeroed with MTC $0, c11 followed by MOV copies, */
  /* updated by MADD1..MADD4, and folded pairwise with ADD at .L18 and .L28 */
  /* (for example c11 += c22 and c12 += c21). */
  /* $f24 and $f25 (c81, c82) are always spilled in the prologue; $f20..$f23 */
  /* (c61..c72) are spilled only when __64BIT__ is not defined. */
  73. #define c11 $f10
  74. #define c12 $f11
  75. #define c21 $f12
  76. #define c22 $f13
  77. #define c31 $f14
  78. #define c32 $f15
  79. #define c41 $f16
  80. #define c42 $f17
  81. #define c51 $f18
  82. #define c52 $f19
  83. #define c61 $f20
  84. #define c62 $f21
  85. #define c71 $f22
  86. #define c72 $f23
  87. #define c81 $f24
  88. #define c82 $f25
  /* Selects the fused multiply-add flavour behind MADD1..MADD8. */
  /* MADD1..MADD4 are used in the multiply-accumulate loops (.L12, .L16, */
  /* .L22, .L26); MADD5..MADD8 are used in the solve section that follows */
  /* .L18 and .L28. MADD, MSUB and NMSUB are not defined in this file; */
  /* presumably they come from common.h - confirm there. */
  /* Without CONJ only MADD4 differs from a plain MADD among MADD1..MADD4. */
  89. #ifndef CONJ
  90. #define MADD1 MADD
  91. #define MADD2 MADD
  92. #define MADD3 MADD
  93. #define MADD4 NMSUB
  94. #define MADD5 MSUB
  95. #define MADD6 MADD
  96. #define MADD7 NMSUB
  97. #define MADD8 MADD
  98. #else
  /* With CONJ the NMSUB moves to MADD2 when LN or LT is defined, and to */
  /* MADD3 otherwise. */
  99. #if defined(LN) || defined(LT)
  100. #define MADD1 MADD
  101. #define MADD2 NMSUB
  102. #define MADD3 MADD
  103. #define MADD4 MADD
  104. #else
  105. #define MADD1 MADD
  106. #define MADD2 MADD
  107. #define MADD3 NMSUB
  108. #define MADD4 MADD
  109. #endif
  /* With CONJ, MADD5..MADD8 are the same for every LN/LT/RN/RT choice; */
  /* each one differs from its non-CONJ counterpart above. */
  110. #define MADD5 MADD
  111. #define MADD6 MSUB
  112. #define MADD7 MADD
  113. #define MADD8 NMSUB
  114. #endif
  115. PROLOGUE
  116. daddiu $sp, $sp, -128
  117. SDARG $16, 0($sp)
  118. SDARG $17, 8($sp)
  119. SDARG $18, 16($sp)
  120. SDARG $19, 24($sp)
  121. SDARG $20, 32($sp)
  122. SDARG $21, 40($sp)
  123. sdc1 $f24, 48($sp)
  124. sdc1 $f25, 56($sp)
  125. sdc1 $f26, 64($sp)
  126. sdc1 $f27, 72($sp)
  127. #ifndef __64BIT__
  128. sdc1 $f20, 88($sp)
  129. sdc1 $f21, 96($sp)
  130. sdc1 $f22,104($sp)
  131. sdc1 $f23,112($sp)
  132. #endif
  133. LDARG LDC, 128 + 0($sp)
  134. LDARG OFFSET, 128 + 8($sp)
  135. dsll LDC, LDC, ZBASE_SHIFT
  136. #ifdef LN
  137. mult M, K
  138. mflo TEMP
  139. dsll TEMP, TEMP, ZBASE_SHIFT
  140. daddu A, A, TEMP
  141. dsll TEMP, M, ZBASE_SHIFT
  142. daddu C, C, TEMP
  143. #endif
  144. #ifdef RN
  145. neg KK, OFFSET
  146. #endif
  147. #ifdef RT
  148. mult N, K
  149. mflo TEMP
  150. dsll TEMP, TEMP, ZBASE_SHIFT
  151. daddu B, B, TEMP
  152. mult N, LDC
  153. mflo TEMP
  154. daddu C, C, TEMP
  155. dsubu KK, N, OFFSET
  156. #endif
  157. dsra J, N, 2
  158. blez J, .L20
  159. nop
  160. .L10:
  161. #ifdef RT
  162. dsll TEMP, K, 2 + ZBASE_SHIFT
  163. dsubu B, B, TEMP
  164. dsll TEMP, LDC, 2
  165. dsubu C, C, TEMP
  166. #endif
  167. move CO1, C
  168. MTC $0, c11
  169. daddu CO2, C, LDC
  170. daddu CO3, CO2, LDC
  171. daddiu J, J, -1
  172. daddu CO4, CO3, LDC
  173. MOV c21, c11
  174. MOV c31, c11
  175. MOV c41, c11
  176. MOV c51, c11
  177. move I, M
  178. #ifdef LN
  179. daddu KK, M, OFFSET
  180. #endif
  181. #ifdef LT
  182. move KK, OFFSET
  183. #endif
  184. #if defined(LN) || defined(RT)
  185. move AORIG, A
  186. #else
  187. move AO, A
  188. #endif
  189. #ifndef RT
  190. daddu C, CO4, LDC
  191. #endif
  192. blez I, .L19
  193. MOV c61, c11
  194. .align 3
  195. .L11:
  196. #if defined(LT) || defined(RN)
  197. LD a1, 0 * SIZE(AO)
  198. MOV c71, c11
  199. LD b1, 0 * SIZE(B)
  200. MOV c81, c11
  201. LD a3, 4 * SIZE(AO)
  202. MOV c12, c11
  203. LD b2, 1 * SIZE(B)
  204. MOV c22, c11
  205. dsra L, KK, 2
  206. MOV c32, c11
  207. LD b3, 2 * SIZE(B)
  208. MOV c42, c11
  209. LD b4, 3 * SIZE(B)
  210. MOV c52, c11
  211. LD b5, 4 * SIZE(B)
  212. MOV c62, c11
  213. LD b6, 8 * SIZE(B)
  214. MOV c72, c11
  215. LD b7, 12 * SIZE(B)
  216. MOV c82, c11
  217. blez L, .L15
  218. move BO, B
  219. #else
  220. #ifdef LN
  221. dsll TEMP, K, ZBASE_SHIFT
  222. dsubu AORIG, AORIG, TEMP
  223. #endif
  224. dsll L, KK, ZBASE_SHIFT
  225. dsll TEMP, KK, 2 + ZBASE_SHIFT
  226. daddu AO, AORIG, L
  227. daddu BO, B, TEMP
  228. dsubu TEMP, K, KK
  229. LD a1, 0 * SIZE(AO)
  230. MOV c71, c11
  231. LD b1, 0 * SIZE(BO)
  232. MOV c81, c11
  233. LD a3, 4 * SIZE(AO)
  234. MOV c12, c11
  235. LD b2, 1 * SIZE(BO)
  236. MOV c22, c11
  237. dsra L, TEMP, 2
  238. MOV c32, c11
  239. LD b3, 2 * SIZE(BO)
  240. MOV c42, c11
  241. LD b4, 3 * SIZE(BO)
  242. MOV c52, c11
  243. LD b5, 4 * SIZE(BO)
  244. MOV c62, c11
  245. LD b6, 8 * SIZE(BO)
  246. MOV c72, c11
  247. LD b7, 12 * SIZE(BO)
  248. MOV c82, c11
  249. blez L, .L15
  250. NOP
  251. #endif
  252. MADD1 c11, c11, a1, b1
  253. LD a2, 1 * SIZE(AO)
  254. MADD3 c21, c21, a1, b2
  255. daddiu L, L, -1
  256. MADD1 c31, c31, a1, b3
  257. NOP
  258. blez L, .L13
  259. MADD3 c41, c41, a1, b4
  260. .align 3
  261. .L12:
  262. MADD2 c12, c12, a2, b1
  263. LD b1, 16 * SIZE(BO)
  264. MADD4 c22, c22, a2, b2
  265. LD b2, 5 * SIZE(BO)
  266. MADD2 c32, c32, a2, b3
  267. LD b3, 6 * SIZE(BO)
  268. MADD4 c42, c42, a2, b4
  269. LD b4, 7 * SIZE(BO)
  270. MADD1 c51, c51, a1, b5
  271. NOP
  272. MADD3 c61, c61, a1, b2
  273. LD a4, 2 * SIZE(AO)
  274. MADD1 c71, c71, a1, b3
  275. NOP
  276. MADD3 c81, c81, a1, b4
  277. LD a1, 8 * SIZE(AO)
  278. MADD2 c52, c52, a2, b5
  279. LD b5, 20 * SIZE(BO)
  280. MADD4 c62, c62, a2, b2
  281. LD b2, 9 * SIZE(BO)
  282. MADD2 c72, c72, a2, b3
  283. LD b3, 10 * SIZE(BO)
  284. MADD4 c82, c82, a2, b4
  285. LD b4, 11 * SIZE(BO)
  286. MADD1 c11, c11, a4, b6
  287. LD a2, 3 * SIZE(AO)
  288. MADD3 c21, c21, a4, b2
  289. NOP
  290. MADD1 c31, c31, a4, b3
  291. NOP
  292. MADD3 c41, c41, a4, b4
  293. NOP
  294. MADD2 c12, c12, a2, b6
  295. LD b6, 24 * SIZE(BO)
  296. MADD4 c22, c22, a2, b2
  297. LD b2, 13 * SIZE(BO)
  298. MADD2 c32, c32, a2, b3
  299. LD b3, 14 * SIZE(BO)
  300. MADD4 c42, c42, a2, b4
  301. LD b4, 15 * SIZE(BO)
  302. MADD1 c51, c51, a4, b7
  303. NOP
  304. MADD3 c61, c61, a4, b2
  305. NOP
  306. MADD1 c71, c71, a4, b3
  307. NOP
  308. MADD3 c81, c81, a4, b4
  309. NOP
  310. MADD2 c52, c52, a2, b7
  311. LD b7, 28 * SIZE(BO)
  312. MADD4 c62, c62, a2, b2
  313. LD b2, 17 * SIZE(BO)
  314. MADD2 c72, c72, a2, b3
  315. LD b3, 18 * SIZE(BO)
  316. MADD4 c82, c82, a2, b4
  317. LD b4, 19 * SIZE(BO)
  318. MADD1 c11, c11, a3, b1
  319. LD a2, 5 * SIZE(AO)
  320. MADD3 c21, c21, a3, b2
  321. NOP
  322. MADD1 c31, c31, a3, b3
  323. NOP
  324. MADD3 c41, c41, a3, b4
  325. NOP
  326. MADD2 c12, c12, a2, b1
  327. LD b1, 32 * SIZE(BO)
  328. MADD4 c22, c22, a2, b2
  329. LD b2, 21 * SIZE(BO)
  330. MADD2 c32, c32, a2, b3
  331. LD b3, 22 * SIZE(BO)
  332. MADD4 c42, c42, a2, b4
  333. LD b4, 23 * SIZE(BO)
  334. MADD1 c51, c51, a3, b5
  335. NOP
  336. MADD3 c61, c61, a3, b2
  337. LD a4, 6 * SIZE(AO)
  338. MADD1 c71, c71, a3, b3
  339. NOP
  340. MADD3 c81, c81, a3, b4
  341. LD a3, 12 * SIZE(AO)
  342. MADD2 c52, c52, a2, b5
  343. LD b5, 36 * SIZE(BO)
  344. MADD4 c62, c62, a2, b2
  345. LD b2, 25 * SIZE(BO)
  346. MADD2 c72, c72, a2, b3
  347. LD b3, 26 * SIZE(BO)
  348. MADD4 c82, c82, a2, b4
  349. LD b4, 27 * SIZE(BO)
  350. MADD1 c11, c11, a4, b6
  351. LD a2, 7 * SIZE(AO)
  352. MADD3 c21, c21, a4, b2
  353. NOP
  354. MADD1 c31, c31, a4, b3
  355. NOP
  356. MADD3 c41, c41, a4, b4
  357. daddiu L, L, -1
  358. MADD2 c12, c12, a2, b6
  359. LD b6, 40 * SIZE(BO)
  360. MADD4 c22, c22, a2, b2
  361. LD b2, 29 * SIZE(BO)
  362. MADD2 c32, c32, a2, b3
  363. LD b3, 30 * SIZE(BO)
  364. MADD4 c42, c42, a2, b4
  365. LD b4, 31 * SIZE(BO)
  366. MADD1 c51, c51, a4, b7
  367. daddiu BO, BO, 32 * SIZE
  368. MADD3 c61, c61, a4, b2
  369. daddiu AO, AO, 8 * SIZE
  370. MADD1 c71, c71, a4, b3
  371. NOP
  372. MADD3 c81, c81, a4, b4
  373. NOP
  374. MADD2 c52, c52, a2, b7
  375. LD b7, 12 * SIZE(BO)
  376. MADD4 c62, c62, a2, b2
  377. LD b2, 1 * SIZE(BO)
  378. MADD2 c72, c72, a2, b3
  379. LD b3, 2 * SIZE(BO)
  380. MADD4 c82, c82, a2, b4
  381. LD b4, 3 * SIZE(BO)
  382. MADD1 c11, c11, a1, b1
  383. LD a2, 1 * SIZE(AO)
  384. MADD3 c21, c21, a1, b2
  385. NOP
  386. MADD1 c31, c31, a1, b3
  387. NOP
  388. bgtz L, .L12
  389. MADD3 c41, c41, a1, b4
  390. .align 3
  391. .L13:
  392. MADD2 c12, c12, a2, b1
  393. LD b1, 16 * SIZE(BO)
  394. MADD4 c22, c22, a2, b2
  395. LD b2, 5 * SIZE(BO)
  396. MADD2 c32, c32, a2, b3
  397. LD b3, 6 * SIZE(BO)
  398. MADD4 c42, c42, a2, b4
  399. LD b4, 7 * SIZE(BO)
  400. MADD1 c51, c51, a1, b5
  401. NOP
  402. MADD3 c61, c61, a1, b2
  403. LD a4, 2 * SIZE(AO)
  404. MADD1 c71, c71, a1, b3
  405. NOP
  406. MADD3 c81, c81, a1, b4
  407. LD a1, 8 * SIZE(AO)
  408. MADD2 c52, c52, a2, b5
  409. LD b5, 20 * SIZE(BO)
  410. MADD4 c62, c62, a2, b2
  411. LD b2, 9 * SIZE(BO)
  412. MADD2 c72, c72, a2, b3
  413. LD b3, 10 * SIZE(BO)
  414. MADD4 c82, c82, a2, b4
  415. LD b4, 11 * SIZE(BO)
  416. MADD1 c11, c11, a4, b6
  417. LD a2, 3 * SIZE(AO)
  418. MADD3 c21, c21, a4, b2
  419. NOP
  420. MADD1 c31, c31, a4, b3
  421. NOP
  422. MADD3 c41, c41, a4, b4
  423. NOP
  424. MADD2 c12, c12, a2, b6
  425. LD b6, 24 * SIZE(BO)
  426. MADD4 c22, c22, a2, b2
  427. LD b2, 13 * SIZE(BO)
  428. MADD2 c32, c32, a2, b3
  429. LD b3, 14 * SIZE(BO)
  430. MADD4 c42, c42, a2, b4
  431. LD b4, 15 * SIZE(BO)
  432. MADD1 c51, c51, a4, b7
  433. NOP
  434. MADD3 c61, c61, a4, b2
  435. NOP
  436. MADD1 c71, c71, a4, b3
  437. NOP
  438. MADD3 c81, c81, a4, b4
  439. NOP
  440. MADD2 c52, c52, a2, b7
  441. LD b7, 28 * SIZE(BO)
  442. MADD4 c62, c62, a2, b2
  443. LD b2, 17 * SIZE(BO)
  444. MADD2 c72, c72, a2, b3
  445. LD b3, 18 * SIZE(BO)
  446. MADD4 c82, c82, a2, b4
  447. LD b4, 19 * SIZE(BO)
  448. MADD1 c11, c11, a3, b1
  449. LD a2, 5 * SIZE(AO)
  450. MADD3 c21, c21, a3, b2
  451. NOP
  452. MADD1 c31, c31, a3, b3
  453. NOP
  454. MADD3 c41, c41, a3, b4
  455. NOP
  456. MADD2 c12, c12, a2, b1
  457. LD b1, 32 * SIZE(BO)
  458. MADD4 c22, c22, a2, b2
  459. LD b2, 21 * SIZE(BO)
  460. MADD2 c32, c32, a2, b3
  461. LD b3, 22 * SIZE(BO)
  462. MADD4 c42, c42, a2, b4
  463. LD b4, 23 * SIZE(BO)
  464. MADD1 c51, c51, a3, b5
  465. NOP
  466. MADD3 c61, c61, a3, b2
  467. LD a4, 6 * SIZE(AO)
  468. MADD1 c71, c71, a3, b3
  469. NOP
  470. MADD3 c81, c81, a3, b4
  471. LD a3, 12 * SIZE(AO)
  472. MADD2 c52, c52, a2, b5
  473. LD b5, 36 * SIZE(BO)
  474. MADD4 c62, c62, a2, b2
  475. LD b2, 25 * SIZE(BO)
  476. MADD2 c72, c72, a2, b3
  477. LD b3, 26 * SIZE(BO)
  478. MADD4 c82, c82, a2, b4
  479. LD b4, 27 * SIZE(BO)
  480. MADD1 c11, c11, a4, b6
  481. LD a2, 7 * SIZE(AO)
  482. MADD3 c21, c21, a4, b2
  483. NOP
  484. MADD1 c31, c31, a4, b3
  485. NOP
  486. MADD3 c41, c41, a4, b4
  487. NOP
  488. MADD2 c12, c12, a2, b6
  489. LD b6, 40 * SIZE(BO)
  490. MADD4 c22, c22, a2, b2
  491. LD b2, 29 * SIZE(BO)
  492. MADD2 c32, c32, a2, b3
  493. LD b3, 30 * SIZE(BO)
  494. MADD4 c42, c42, a2, b4
  495. LD b4, 31 * SIZE(BO)
  496. MADD1 c51, c51, a4, b7
  497. daddiu BO, BO, 32 * SIZE
  498. MADD3 c61, c61, a4, b2
  499. daddiu AO, AO, 8 * SIZE
  500. MADD1 c71, c71, a4, b3
  501. NOP
  502. MADD3 c81, c81, a4, b4
  503. NOP
  504. MADD2 c52, c52, a2, b7
  505. LD b7, 12 * SIZE(BO)
  506. MADD4 c62, c62, a2, b2
  507. LD b2, 1 * SIZE(BO)
  508. MADD2 c72, c72, a2, b3
  509. LD b3, 2 * SIZE(BO)
  510. MADD4 c82, c82, a2, b4
  511. LD b4, 3 * SIZE(BO)
  512. .align 3
  513. .L15:
  514. #if defined(LT) || defined(RN)
  515. andi L, KK, 3
  516. #else
  517. andi L, TEMP, 3
  518. #endif
  519. blez L, .L18
  520. NOP
  521. .align 3
  522. .L16:
  523. MADD1 c11, c11, a1, b1
  524. LD a2, 1 * SIZE(AO)
  525. MADD3 c21, c21, a1, b2
  526. NOP
  527. MADD1 c31, c31, a1, b3
  528. NOP
  529. MADD3 c41, c41, a1, b4
  530. NOP
  531. MADD2 c12, c12, a2, b1
  532. LD b1, 8 * SIZE(BO)
  533. MADD4 c22, c22, a2, b2
  534. LD b2, 5 * SIZE(BO)
  535. MADD2 c32, c32, a2, b3
  536. LD b3, 6 * SIZE(BO)
  537. MADD4 c42, c42, a2, b4
  538. LD b4, 7 * SIZE(BO)
  539. MADD1 c51, c51, a1, b5
  540. daddiu L, L, -1
  541. MADD3 c61, c61, a1, b2
  542. daddiu AO, AO, 2 * SIZE
  543. MADD1 c71, c71, a1, b3
  544. daddiu BO, BO, 8 * SIZE
  545. MADD3 c81, c81, a1, b4
  546. LD a1, 0 * SIZE(AO)
  547. MADD2 c52, c52, a2, b5
  548. LD b5, 4 * SIZE(BO)
  549. MADD4 c62, c62, a2, b2
  550. LD b2, 1 * SIZE(BO)
  551. MADD2 c72, c72, a2, b3
  552. LD b3, 2 * SIZE(BO)
  553. MADD4 c82, c82, a2, b4
  554. bgtz L, .L16
  555. LD b4, 3 * SIZE(BO)
  556. .L18:
  557. ADD c11, c11, c22
  558. ADD c12, c12, c21
  559. ADD c31, c31, c42
  560. ADD c32, c32, c41
  561. ADD c51, c51, c62
  562. ADD c52, c52, c61
  563. ADD c71, c71, c82
  564. ADD c72, c72, c81
  565. #if defined(LN) || defined(RT)
  566. #ifdef LN
  567. daddiu TEMP, KK, -1
  568. #else
  569. daddiu TEMP, KK, -4
  570. #endif
  571. dsll L, TEMP, ZBASE_SHIFT
  572. dsll TEMP, TEMP, 2 + ZBASE_SHIFT
  573. daddu AO, AORIG, L
  574. daddu BO, B, TEMP
  575. #endif
  576. #if defined(LN) || defined(LT)
  577. LD b1, 0 * SIZE(BO)
  578. LD b2, 1 * SIZE(BO)
  579. LD b3, 2 * SIZE(BO)
  580. LD b4, 3 * SIZE(BO)
  581. LD b5, 4 * SIZE(BO)
  582. LD b6, 5 * SIZE(BO)
  583. LD b7, 6 * SIZE(BO)
  584. LD b8, 7 * SIZE(BO)
  585. SUB c11, b1, c11
  586. SUB c12, b2, c12
  587. SUB c31, b3, c31
  588. SUB c32, b4, c32
  589. SUB c51, b5, c51
  590. SUB c52, b6, c52
  591. SUB c71, b7, c71
  592. SUB c72, b8, c72
  593. #else
  594. LD b1, 0 * SIZE(AO)
  595. LD b2, 1 * SIZE(AO)
  596. LD b3, 2 * SIZE(AO)
  597. LD b4, 3 * SIZE(AO)
  598. LD b5, 4 * SIZE(AO)
  599. LD b6, 5 * SIZE(AO)
  600. LD b7, 6 * SIZE(AO)
  601. LD b8, 7 * SIZE(AO)
  602. SUB c11, b1, c11
  603. SUB c12, b2, c12
  604. SUB c31, b3, c31
  605. SUB c32, b4, c32
  606. SUB c51, b5, c51
  607. SUB c52, b6, c52
  608. SUB c71, b7, c71
  609. SUB c72, b8, c72
  610. #endif
  611. #if defined(LN) || defined(LT)
  612. LD b1, 0 * SIZE(AO)
  613. LD b2, 1 * SIZE(AO)
  614. MUL a1, b2, c12
  615. MUL a2, b2, c11
  616. MUL a3, b2, c32
  617. MUL a4, b2, c31
  618. MADD5 c11, a1, b1, c11
  619. MADD6 c12, a2, b1, c12
  620. MADD5 c31, a3, b1, c31
  621. MADD6 c32, a4, b1, c32
  622. MUL a1, b2, c52
  623. MUL a2, b2, c51
  624. MUL a3, b2, c72
  625. MUL a4, b2, c71
  626. MADD5 c51, a1, b1, c51
  627. MADD6 c52, a2, b1, c52
  628. MADD5 c71, a3, b1, c71
  629. MADD6 c72, a4, b1, c72
  630. #endif
  631. #ifdef RN
  632. LD b1, 0 * SIZE(BO)
  633. LD b2, 1 * SIZE(BO)
  634. LD b3, 2 * SIZE(BO)
  635. LD b4, 3 * SIZE(BO)
  636. LD b5, 4 * SIZE(BO)
  637. LD b6, 5 * SIZE(BO)
  638. LD b7, 6 * SIZE(BO)
  639. LD b8, 7 * SIZE(BO)
  640. MUL a1, b2, c12
  641. MUL a2, b2, c11
  642. MADD5 c11, a1, b1, c11
  643. MADD6 c12, a2, b1, c12
  644. NMSUB c31, c31, b3, c11
  645. MADD7 c32, c32, b4, c11
  646. NMSUB c51, c51, b5, c11
  647. MADD7 c52, c52, b6, c11
  648. NMSUB c71, c71, b7, c11
  649. MADD7 c72, c72, b8, c11
  650. MADD8 c31, c31, b4, c12
  651. NMSUB c32, c32, b3, c12
  652. MADD8 c51, c51, b6, c12
  653. NMSUB c52, c52, b5, c12
  654. MADD8 c71, c71, b8, c12
  655. NMSUB c72, c72, b7, c12
  656. LD b3, 10 * SIZE(BO)
  657. LD b4, 11 * SIZE(BO)
  658. LD b5, 12 * SIZE(BO)
  659. LD b6, 13 * SIZE(BO)
  660. LD b7, 14 * SIZE(BO)
  661. LD b8, 15 * SIZE(BO)
  662. MUL a1, b4, c32
  663. MUL a2, b4, c31
  664. MADD5 c31, a1, b3, c31
  665. MADD6 c32, a2, b3, c32
  666. NMSUB c51, c51, b5, c31
  667. MADD7 c52, c52, b6, c31
  668. NMSUB c71, c71, b7, c31
  669. MADD7 c72, c72, b8, c31
  670. MADD8 c51, c51, b6, c32
  671. NMSUB c52, c52, b5, c32
  672. MADD8 c71, c71, b8, c32
  673. NMSUB c72, c72, b7, c32
  674. LD b5, 20 * SIZE(BO)
  675. LD b6, 21 * SIZE(BO)
  676. LD b7, 22 * SIZE(BO)
  677. LD b8, 23 * SIZE(BO)
  678. MUL a1, b6, c52
  679. MUL a2, b6, c51
  680. MADD5 c51, a1, b5, c51
  681. MADD6 c52, a2, b5, c52
  682. NMSUB c71, c71, b7, c51
  683. MADD7 c72, c72, b8, c51
  684. MADD8 c71, c71, b8, c52
  685. NMSUB c72, c72, b7, c52
  686. LD b7, 30 * SIZE(BO)
  687. LD b8, 31 * SIZE(BO)
  688. MUL a1, b8, c72
  689. MUL a2, b8, c71
  690. MADD5 c71, a1, b7, c71
  691. MADD6 c72, a2, b7, c72
  692. #endif
  693. #ifdef RT
  694. LD b1, 30 * SIZE(BO)
  695. LD b2, 31 * SIZE(BO)
  696. LD b3, 28 * SIZE(BO)
  697. LD b4, 29 * SIZE(BO)
  698. LD b5, 26 * SIZE(BO)
  699. LD b6, 27 * SIZE(BO)
  700. LD b7, 24 * SIZE(BO)
  701. LD b8, 25 * SIZE(BO)
  702. MUL a1, b2, c72
  703. MUL a2, b2, c71
  704. MADD5 c71, a1, b1, c71
  705. MADD6 c72, a2, b1, c72
  706. NMSUB c51, c51, b3, c71
  707. MADD7 c52, c52, b4, c71
  708. NMSUB c31, c31, b5, c71
  709. MADD7 c32, c32, b6, c71
  710. NMSUB c11, c11, b7, c71
  711. MADD7 c12, c12, b8, c71
  712. MADD8 c51, c51, b4, c72
  713. NMSUB c52, c52, b3, c72
  714. MADD8 c31, c31, b6, c72
  715. NMSUB c32, c32, b5, c72
  716. MADD8 c11, c11, b8, c72
  717. NMSUB c12, c12, b7, c72
  718. LD b3, 20 * SIZE(BO)
  719. LD b4, 21 * SIZE(BO)
  720. LD b5, 18 * SIZE(BO)
  721. LD b6, 19 * SIZE(BO)
  722. LD b7, 16 * SIZE(BO)
  723. LD b8, 17 * SIZE(BO)
  724. MUL a1, b4, c52
  725. MUL a2, b4, c51
  726. MADD5 c51, a1, b3, c51
  727. MADD6 c52, a2, b3, c52
  728. NMSUB c31, c31, b5, c51
  729. MADD7 c32, c32, b6, c51
  730. NMSUB c11, c11, b7, c51
  731. MADD7 c12, c12, b8, c51
  732. MADD8 c31, c31, b6, c52
  733. NMSUB c32, c32, b5, c52
  734. MADD8 c11, c11, b8, c52
  735. NMSUB c12, c12, b7, c52
  736. LD b5, 10 * SIZE(BO)
  737. LD b6, 11 * SIZE(BO)
  738. LD b7, 8 * SIZE(BO)
  739. LD b8, 9 * SIZE(BO)
  740. MUL a1, b6, c32
  741. MUL a2, b6, c31
  742. MADD5 c31, a1, b5, c31
  743. MADD6 c32, a2, b5, c32
  744. NMSUB c11, c11, b7, c31
  745. MADD7 c12, c12, b8, c31
  746. MADD8 c11, c11, b8, c32
  747. NMSUB c12, c12, b7, c32
  748. LD b7, 0 * SIZE(BO)
  749. LD b8, 1 * SIZE(BO)
  750. MUL a1, b8, c12
  751. MUL a2, b8, c11
  752. MADD5 c11, a1, b7, c11
  753. MADD6 c12, a2, b7, c12
  754. #endif
  755. #if defined(LN) || defined(LT)
  756. ST c11, 0 * SIZE(BO)
  757. ST c12, 1 * SIZE(BO)
  758. ST c31, 2 * SIZE(BO)
  759. ST c32, 3 * SIZE(BO)
  760. ST c51, 4 * SIZE(BO)
  761. ST c52, 5 * SIZE(BO)
  762. ST c71, 6 * SIZE(BO)
  763. ST c72, 7 * SIZE(BO)
  764. #else
  765. ST c11, 0 * SIZE(AO)
  766. ST c12, 1 * SIZE(AO)
  767. ST c31, 2 * SIZE(AO)
  768. ST c32, 3 * SIZE(AO)
  769. ST c51, 4 * SIZE(AO)
  770. ST c52, 5 * SIZE(AO)
  771. ST c71, 6 * SIZE(AO)
  772. ST c72, 7 * SIZE(AO)
  773. #endif
  774. #ifdef LN
  775. daddiu CO1,CO1, -2 * SIZE
  776. daddiu CO2,CO2, -2 * SIZE
  777. daddiu CO3,CO3, -2 * SIZE
  778. daddiu CO4,CO4, -2 * SIZE
  779. #endif
  780. ST c11, 0 * SIZE(CO1)
  781. ST c12, 1 * SIZE(CO1)
  782. ST c31, 0 * SIZE(CO2)
  783. ST c32, 1 * SIZE(CO2)
  784. ST c51, 0 * SIZE(CO3)
  785. ST c52, 1 * SIZE(CO3)
  786. ST c71, 0 * SIZE(CO4)
  787. ST c72, 1 * SIZE(CO4)
  788. #ifndef LN
  789. daddiu CO1,CO1, 2 * SIZE
  790. daddiu CO2,CO2, 2 * SIZE
  791. daddiu CO3,CO3, 2 * SIZE
  792. daddiu CO4,CO4, 2 * SIZE
  793. #endif
  794. #ifdef RT
  795. dsll TEMP, K, ZBASE_SHIFT
  796. daddu AORIG, AORIG, TEMP
  797. #endif
  798. #if defined(LT) || defined(RN)
  799. dsubu TEMP, K, KK
  800. dsll L, TEMP, ZBASE_SHIFT
  801. dsll TEMP, TEMP, 2 + ZBASE_SHIFT
  802. daddu AO, AO, L
  803. daddu BO, BO, TEMP
  804. #endif
  805. #ifdef LT
  806. daddiu KK, KK, 1
  807. #endif
  808. #ifdef LN
  809. daddiu KK, KK, -1
  810. #endif
  811. MTC $0, c11
  812. daddiu I, I, -1
  813. MOV c21, c11
  814. MOV c31, c11
  815. MOV c41, c11
  816. MOV c51, c11
  817. bgtz I, .L11
  818. MOV c61, c11
  819. .align 3
  820. .L19:
  821. #ifdef LN
  822. dsll TEMP, K, 2 + ZBASE_SHIFT
  823. daddu B, B, TEMP
  824. #endif
  825. #if defined(LT) || defined(RN)
  826. move B, BO
  827. #endif
  828. #ifdef RN
  829. daddiu KK, KK, 4
  830. #endif
  831. #ifdef RT
  832. daddiu KK, KK, -4
  833. #endif
  834. bgtz J, .L10
  835. NOP
  836. .align 3
  837. .L20:
  838. andi J, N, 2
  839. blez J, .L30
  840. NOP
  841. #ifdef RT
  842. dsll TEMP, K, 1 + ZBASE_SHIFT
  843. dsubu B, B, TEMP
  844. dsll TEMP, LDC, 1
  845. dsubu C, C, TEMP
  846. #endif
  847. MTC $0, c11
  848. move CO1, C
  849. daddu CO2, C, LDC
  850. #ifdef LN
  851. daddu KK, M, OFFSET
  852. #endif
  853. #ifdef LT
  854. move KK, OFFSET
  855. #endif
  856. #if defined(LN) || defined(RT)
  857. move AORIG, A
  858. #else
  859. move AO, A
  860. #endif
  861. #ifndef RT
  862. daddu C, CO2, LDC
  863. #endif
  864. move I, M
  865. blez I, .L29
  866. NOP
  867. .align 3
  868. .L21:
  869. #if defined(LT) || defined(RN)
  870. LD a1, 0 * SIZE(AO)
  871. MOV c21, c11
  872. LD b1, 0 * SIZE(B)
  873. MOV c31, c11
  874. LD a3, 4 * SIZE(AO)
  875. MOV c41, c11
  876. LD b2, 1 * SIZE(B)
  877. dsra L, KK, 2
  878. LD b3, 2 * SIZE(B)
  879. MOV c12, c11
  880. LD b4, 3 * SIZE(B)
  881. MOV c22, c11
  882. LD b5, 4 * SIZE(B)
  883. MOV c32, c11
  884. NOP
  885. MOV c42, c11
  886. blez L, .L25
  887. move BO, B
  888. #else
  889. #ifdef LN
  890. dsll TEMP, K, ZBASE_SHIFT
  891. dsubu AORIG, AORIG, TEMP
  892. #endif
  893. dsll L, KK, ZBASE_SHIFT
  894. dsll TEMP, KK, 1 + ZBASE_SHIFT
  895. daddu AO, AORIG, L
  896. daddu BO, B, TEMP
  897. dsubu TEMP, K, KK
  898. LD a1, 0 * SIZE(AO)
  899. MOV c21, c11
  900. LD b1, 0 * SIZE(BO)
  901. MOV c31, c11
  902. LD a3, 4 * SIZE(AO)
  903. MOV c41, c11
  904. LD b2, 1 * SIZE(BO)
  905. dsra L, TEMP, 2
  906. LD b3, 2 * SIZE(BO)
  907. MOV c12, c11
  908. LD b4, 3 * SIZE(BO)
  909. MOV c22, c11
  910. LD b5, 4 * SIZE(BO)
  911. MOV c32, c11
  912. blez L, .L25
  913. MOV c42, c11
  914. #endif
  915. .align 3
  916. .L22:
  917. MADD1 c11, c11, a1, b1
  918. LD a2, 1 * SIZE(AO)
  919. MADD3 c21, c21, a1, b2
  920. daddiu L, L, -1
  921. MADD1 c31, c31, a1, b3
  922. NOP
  923. MADD3 c41, c41, a1, b4
  924. LD a1, 2 * SIZE(AO)
  925. MADD2 c12, c12, a2, b1
  926. LD b1, 8 * SIZE(BO)
  927. MADD4 c22, c22, a2, b2
  928. LD b2, 5 * SIZE(BO)
  929. MADD2 c32, c32, a2, b3
  930. LD b3, 6 * SIZE(BO)
  931. MADD4 c42, c42, a2, b4
  932. LD b4, 7 * SIZE(BO)
  933. MADD1 c11, c11, a1, b5
  934. LD a2, 3 * SIZE(AO)
  935. MADD3 c21, c21, a1, b2
  936. NOP
  937. MADD1 c31, c31, a1, b3
  938. NOP
  939. MADD3 c41, c41, a1, b4
  940. LD a1, 8 * SIZE(AO)
  941. MADD2 c12, c12, a2, b5
  942. LD b5, 12 * SIZE(BO)
  943. MADD4 c22, c22, a2, b2
  944. LD b2, 9 * SIZE(BO)
  945. MADD2 c32, c32, a2, b3
  946. LD b3, 10 * SIZE(BO)
  947. MADD4 c42, c42, a2, b4
  948. LD b4, 11 * SIZE(BO)
  949. MADD1 c11, c11, a3, b1
  950. LD a2, 5 * SIZE(AO)
  951. MADD3 c21, c21, a3, b2
  952. NOP
  953. MADD1 c31, c31, a3, b3
  954. NOP
  955. MADD3 c41, c41, a3, b4
  956. LD a3, 6 * SIZE(AO)
  957. MADD2 c12, c12, a2, b1
  958. LD b1, 16 * SIZE(BO)
  959. MADD4 c22, c22, a2, b2
  960. LD b2, 13 * SIZE(BO)
  961. MADD2 c32, c32, a2, b3
  962. LD b3, 14 * SIZE(BO)
  963. MADD4 c42, c42, a2, b4
  964. LD b4, 15 * SIZE(BO)
  965. MADD1 c11, c11, a3, b5
  966. LD a2, 7 * SIZE(AO)
  967. MADD3 c21, c21, a3, b2
  968. daddiu AO, AO, 8 * SIZE
  969. MADD1 c31, c31, a3, b3
  970. NOP
  971. MADD3 c41, c41, a3, b4
  972. LD a3, 4 * SIZE(AO)
  973. MADD2 c12, c12, a2, b5
  974. LD b5, 20 * SIZE(BO)
  975. MADD4 c22, c22, a2, b2
  976. LD b2, 17 * SIZE(BO)
  977. MADD2 c32, c32, a2, b3
  978. LD b3, 18 * SIZE(BO)
  979. MADD4 c42, c42, a2, b4
  980. LD b4, 19 * SIZE(BO)
  981. bgtz L, .L22
  982. daddiu BO, BO, 16 * SIZE
  983. .align 3
  /*
   * .L25/.L26 -- leftover K iterations for the two-column tile.
   * L is the low two bits of KK (LT/RN) or of TEMP (LN/RT).  The .L22 loop
   * just above advances AO by 8*SIZE and BO by 16*SIZE per pass, four times
   * what one .L26 pass consumes (2*SIZE and 4*SIZE), so this loop covers the
   * 0..3 steps that loop leaves behind.
   * MADD1..MADD4 and LD are macros defined outside this chunk.
   */
  984. .L25:
  985. #if defined(LT) || defined(RN)
  986. andi L, KK, 3
  987. #else
  988. andi L, TEMP, 3
  989. #endif
  990. blez L, .L28 /* no leftover steps: skip the loop */
  991. NOP
  992. .align 3
  /* One step per pass: eight MADDs into c11..c42 using a1,a2 and b1..b4,
     with the operands for the next pass reloaded in between. */
  993. .L26:
  994. MADD1 c11, c11, a1, b1
  995. LD a2, 1 * SIZE(AO)
  996. MADD3 c21, c21, a1, b2
  997. daddiu L, L, -1 /* one fewer step remaining */
  998. MADD1 c31, c31, a1, b3
  999. daddiu BO, BO, 4 * SIZE /* BO moves first, so the b1..b4 reloads below use offsets 0..3 */
  1000. MADD3 c41, c41, a1, b4
  1001. LD a1, 2 * SIZE(AO) /* AO has not moved yet: offset 2 is where AO points after the +2*SIZE below */
  1002. MADD2 c12, c12, a2, b1
  1003. LD b1, 0 * SIZE(BO)
  1004. MADD4 c22, c22, a2, b2
  1005. LD b2, 1 * SIZE(BO)
  1006. MADD2 c32, c32, a2, b3
  1007. LD b3, 2 * SIZE(BO)
  1008. MADD4 c42, c42, a2, b4
  1009. LD b4, 3 * SIZE(BO)
  1010. bgtz L, .L26
  1011. daddiu AO, AO, 2 * SIZE /* placed after the branch -- presumably its delay slot (.set noreorder is not visible here) */
  /*
   * .L28 -- finish one tile of the two-column pass.
   * Steps, in order: fold the eight partial accumulators into four values
   * (c11,c12 and c31,c32); for LN/RT re-point AO/BO; subtract the folded
   * values from four values loaded from BO (LN/LT) or AO (RN/RT); apply the
   * variant-specific coefficient arithmetic; store the result back to the
   * buffer it was loaded from and to CO1/CO2; advance pointers and KK; loop
   * to .L21 while I > 0.
   * ADD/SUB/MUL/MADD5..MADD8/NMSUB/LD/ST/MTC are macros defined outside this
   * chunk; operand order is presumably "dst, src..." -- confirm against the
   * macro definitions.
   */
  1012. .L28:
  1013. ADD c11, c11, c22
  1014. ADD c12, c12, c21
  1015. ADD c31, c31, c42
  1016. ADD c32, c32, c41
  1017. #if defined(LN) || defined(RT)
  /* Inside LN||RT the #else arm is the RT case: back up by 1 for LN, 2 for RT. */
  1018. #ifdef LN
  1019. daddiu TEMP, KK, -1
  1020. #else
  1021. daddiu TEMP, KK, -2
  1022. #endif
  1023. dsll L, TEMP, ZBASE_SHIFT /* A-side offset */
  1024. dsll TEMP, TEMP, 1 + ZBASE_SHIFT /* B-side offset: twice the A-side one */
  1025. daddu AO, AORIG, L
  1026. daddu BO, B, TEMP
  1027. #endif
  /* Load the four stored values for this tile and subtract the accumulated
     products from them. */
  1028. #if defined(LN) || defined(LT)
  1029. LD b1, 0 * SIZE(BO)
  1030. LD b2, 1 * SIZE(BO)
  1031. LD b3, 2 * SIZE(BO)
  1032. LD b4, 3 * SIZE(BO)
  1033. SUB c11, b1, c11
  1034. SUB c12, b2, c12
  1035. SUB c31, b3, c31
  1036. SUB c32, b4, c32
  1037. #else
  1038. LD b1, 0 * SIZE(AO)
  1039. LD b2, 1 * SIZE(AO)
  1040. LD b3, 2 * SIZE(AO)
  1041. LD b4, 3 * SIZE(AO)
  1042. SUB c11, b1, c11
  1043. SUB c12, b2, c12
  1044. SUB c31, b3, c31
  1045. SUB c32, b4, c32
  1046. #endif
  /* LN/LT: both (c11,c12) and (c31,c32) are combined with the single
     coefficient pair (b1,b2) read from AO[0..1] -- presumably a complex
     multiply built from MUL + MADD5/MADD6. */
  1047. #if defined(LN) || defined(LT)
  1048. LD b1, 0 * SIZE(AO)
  1049. LD b2, 1 * SIZE(AO)
  1050. MUL a1, b2, c12
  1051. MUL a2, b2, c11
  1052. MUL a3, b2, c32
  1053. MUL a4, b2, c31
  1054. MADD5 c11, a1, b1, c11
  1055. MADD6 c12, a2, b1, c12
  1056. MADD5 c31, a3, b1, c31
  1057. MADD6 c32, a4, b1, c32
  1058. #endif
  /* RN: (c11,c12) is scaled by BO[0..1], then used with BO[2..3] to update
     (c31,c32), which is finally scaled by BO[6..7]. */
  1059. #ifdef RN
  1060. LD b1, 0 * SIZE(BO)
  1061. LD b2, 1 * SIZE(BO)
  1062. LD b3, 2 * SIZE(BO)
  1063. LD b4, 3 * SIZE(BO)
  1064. MUL a1, b2, c12
  1065. MUL a2, b2, c11
  1066. MADD5 c11, a1, b1, c11
  1067. MADD6 c12, a2, b1, c12
  1068. NMSUB c31, c31, b3, c11
  1069. MADD7 c32, c32, b4, c11
  1070. MADD8 c31, c31, b4, c12
  1071. NMSUB c32, c32, b3, c12
  1072. LD b3, 6 * SIZE(BO)
  1073. LD b4, 7 * SIZE(BO)
  1074. MUL a1, b4, c32
  1075. MUL a2, b4, c31
  1076. MADD5 c31, a1, b3, c31
  1077. MADD6 c32, a2, b3, c32
  1078. #endif
  /* RT: the same shape as RN but in the opposite order -- (c31,c32) is
     scaled by BO[6..7] first, then used with BO[4..5] to update (c11,c12),
     which is finally scaled by BO[0..1]. */
  1079. #ifdef RT
  1080. LD b5, 6 * SIZE(BO)
  1081. LD b6, 7 * SIZE(BO)
  1082. LD b7, 4 * SIZE(BO)
  1083. LD b8, 5 * SIZE(BO)
  1084. MUL a1, b6, c32
  1085. MUL a2, b6, c31
  1086. MADD5 c31, a1, b5, c31
  1087. MADD6 c32, a2, b5, c32
  1088. NMSUB c11, c11, b7, c31
  1089. MADD7 c12, c12, b8, c31
  1090. MADD8 c11, c11, b8, c32
  1091. NMSUB c12, c12, b7, c32
  1092. LD b7, 0 * SIZE(BO)
  1093. LD b8, 1 * SIZE(BO)
  1094. MUL a1, b8, c12
  1095. MUL a2, b8, c11
  1096. MADD5 c11, a1, b7, c11
  1097. MADD6 c12, a2, b7, c12
  1098. #endif
  /* Write the results back over the values loaded above (same buffer, same
     offsets) ... */
  1099. #if defined(LN) || defined(LT)
  1100. ST c11, 0 * SIZE(BO)
  1101. ST c12, 1 * SIZE(BO)
  1102. ST c31, 2 * SIZE(BO)
  1103. ST c32, 3 * SIZE(BO)
  1104. #else
  1105. ST c11, 0 * SIZE(AO)
  1106. ST c12, 1 * SIZE(AO)
  1107. ST c31, 2 * SIZE(AO)
  1108. ST c32, 3 * SIZE(AO)
  1109. #endif
  /* ... and to the two output columns.  LN steps CO1/CO2 back by 2*SIZE
     before the store; every other variant steps them forward afterwards. */
  1110. #ifdef LN
  1111. daddiu CO1,CO1, -2 * SIZE
  1112. daddiu CO2,CO2, -2 * SIZE
  1113. #endif
  1114. ST c11, 0 * SIZE(CO1)
  1115. ST c12, 1 * SIZE(CO1)
  1116. ST c31, 0 * SIZE(CO2)
  1117. ST c32, 1 * SIZE(CO2)
  1118. #ifndef LN
  1119. daddiu CO1,CO1, 2 * SIZE
  1120. daddiu CO2,CO2, 2 * SIZE
  1121. #endif
  1122. MTC $0, c11 /* $0 is the MIPS zero register; presumably clears c11 for the next tile */
  1123. #ifdef RT
  1124. dsll TEMP, K, ZBASE_SHIFT
  1125. daddu AORIG, AORIG, TEMP
  1126. #endif
  /* LT/RN: the K loops above ran KK steps; move AO and BO past the
     remaining K - KK steps (BO by twice the A-side amount). */
  1127. #if defined(LT) || defined(RN)
  1128. dsubu TEMP, K, KK
  1129. dsll L, TEMP, ZBASE_SHIFT
  1130. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  1131. daddu AO, AO, L
  1132. daddu BO, BO, TEMP
  1133. #endif
  1134. #ifdef LT
  1135. daddiu KK, KK, 1
  1136. #endif
  1137. #ifdef LN
  1138. daddiu KK, KK, -1
  1139. #endif
  1140. daddiu I, I, -1
  1141. bgtz I, .L21 /* more tiles to do in this column pair */
  1142. NOP
  1143. .align 3
  /*
   * .L29 -- bookkeeping once the I loop for a two-column pass has finished.
   *   LN    : B += K << (1 + ZBASE_SHIFT)
   *   LT/RN : B  = BO (continue from where the tile loop left BO)
   *   RN    : KK += 2
   *   RT    : KK -= 2
   */
  1144. .L29:
  1145. #ifdef LN
  1146. dsll TEMP, K, 1 + ZBASE_SHIFT
  1147. daddu B, B, TEMP
  1148. #endif
  1149. #if defined(LT) || defined(RN)
  1150. move B, BO
  1151. #endif
  1152. #ifdef RN
  1153. daddiu KK, KK, 2
  1154. #endif
  1155. #ifdef RT
  1156. daddiu KK, KK, -2
  1157. #endif
  1158. .align 3
  /*
   * .L30 -- handle the last column when N is odd.
   * J = N & 1; if it is zero there is nothing left and control goes to the
   * exit at .L999.  Otherwise the per-column state is initialised below and
   * the row loop at .L31 is entered (or skipped via .L39 when M <= 0).
   */
  1159. .L30:
  1160. andi J, N, 1
  1161. blez J, .L999
  1162. NOP
  1163. #ifdef RT
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC before use. */
  1164. dsll TEMP, K, ZBASE_SHIFT
  1165. dsubu B, B, TEMP
  1166. dsubu C, C, LDC
  1167. #endif
  1168. MTC $0, c11 /* presumably zeroes c11; .L31 copies it into the other accumulators */
  1169. move CO1, C
  1170. #ifdef LN
  1171. daddu KK, M, OFFSET
  1172. #endif
  1173. #ifdef LT
  1174. move KK, OFFSET
  1175. #endif
  1176. #if defined(LN) || defined(RT)
  1177. move AORIG, A
  1178. #else
  1179. move AO, A
  1180. #endif
  1181. #ifndef RT
  1182. daddu C, CO1, LDC /* non-RT: C moves forward by LDC (RT already moved it back above) */
  1183. #endif
  1184. move I, M /* I counts the tiles handled by the .L31 loop */
  1185. blez I, .L39
  1186. NOP
  1187. .align 3
  /*
   * .L31 -- per-tile setup for the single-column pass.
   * Copies c11 into the other seven accumulators, preloads a1/a2/a3 from
   * AO offsets 0/1/4 and b1/b2/b3 from the B side at the same offsets, and
   * sets L to the number of passes of the .L32 loop (count >> 2).
   * LT/RN count KK steps and read B directly; LN/RT first derive AO/BO from
   * AORIG/B and KK, and count K - KK steps (left in TEMP for .L35).
   * If L <= 0 the unrolled loop is skipped and control goes to .L35.
   */
  1188. .L31:
  1189. #if defined(LT) || defined(RN)
  1190. LD a1, 0 * SIZE(AO)
  1191. MOV c21, c11
  1192. LD b1, 0 * SIZE(B)
  1193. MOV c31, c11
  1194. LD a2, 1 * SIZE(AO)
  1195. MOV c41, c11
  1196. LD b2, 1 * SIZE(B)
  1197. MOV c12, c11
  1198. dsra L, KK, 2 /* L = KK >> 2 */
  1199. MOV c22, c11
  1200. LD a3, 4 * SIZE(AO)
  1201. MOV c32, c11
  1202. LD b3, 4 * SIZE(B)
  1203. NOP
  1204. MOV c42, c11
  1205. blez L, .L35
  1206. move BO, B /* placed after the branch -- presumably its delay slot, so BO = B on both paths */
  1207. #else
  1208. #ifdef LN
  1209. dsll TEMP, K, ZBASE_SHIFT
  1210. dsubu AORIG, AORIG, TEMP
  1211. #endif
  1212. dsll TEMP, KK, ZBASE_SHIFT
  1213. daddu AO, AORIG, TEMP
  1214. daddu BO, B, TEMP
  1215. dsubu TEMP, K, KK /* TEMP = K - KK; its low two bits are reused at .L35 */
  1216. LD a1, 0 * SIZE(AO)
  1217. MOV c21, c11
  1218. LD b1, 0 * SIZE(BO)
  1219. MOV c31, c11
  1220. LD a2, 1 * SIZE(AO)
  1221. MOV c41, c11
  1222. LD b2, 1 * SIZE(BO)
  1223. MOV c12, c11
  1224. dsra L, TEMP, 2 /* L = (K - KK) >> 2 */
  1225. MOV c22, c11
  1226. LD a3, 4 * SIZE(AO)
  1227. MOV c32, c11
  1228. LD b3, 4 * SIZE(BO)
  1229. blez L, .L35
  1230. MOV c42, c11 /* placed after the branch -- presumably its delay slot */
  1231. #endif
  1232. .align 3
  /*
   * .L32 -- main K loop for the single-column tile, four steps per pass.
   * Each pass issues sixteen MADDs (four MADD1/MADD3/MADD2/MADD4 groups)
   * into c11, c21, c12, c22 and moves AO and BO forward by 8*SIZE, four
   * times the 2*SIZE step of the .L36 remainder loop.
   * Operand loads for later groups are interleaved with the MADDs.  The
   * loads at offsets 8, 9 and 12 are issued before the pointers move, so
   * after the +8*SIZE they correspond to offsets 0, 1 and 4 -- the same
   * values .L31 preloads into a1/b1, a2/b2 and a3/b3 before entering.
   * MADD1..MADD4 and LD are macros defined outside this chunk.
   */
  1233. .L32:
  1234. MADD1 c11, c11, a1, b1
  1235. LD b4, 3 * SIZE(BO)
  1236. MADD3 c21, c21, a1, b2
  1237. LD a1, 2 * SIZE(AO)
  1238. MADD2 c12, c12, a2, b1
  1239. LD b1, 2 * SIZE(BO)
  1240. MADD4 c22, c22, a2, b2
  1241. LD a2, 3 * SIZE(AO)
  1242. MADD1 c11, c11, a1, b1
  1243. LD b2, 5 * SIZE(BO)
  1244. MADD3 c21, c21, a1, b4
  1245. LD a1, 8 * SIZE(AO) /* first value for the next pass */
  1246. MADD2 c12, c12, a2, b1
  1247. LD b1, 8 * SIZE(BO) /* first value for the next pass */
  1248. MADD4 c22, c22, a2, b4
  1249. LD a2, 5 * SIZE(AO)
  1250. MADD1 c11, c11, a3, b3
  1251. LD b4, 7 * SIZE(BO)
  1252. MADD3 c21, c21, a3, b2
  1253. LD a3, 6 * SIZE(AO)
  1254. MADD2 c12, c12, a2, b3
  1255. LD b3, 6 * SIZE(BO)
  1256. MADD4 c22, c22, a2, b2
  1257. LD a2, 7 * SIZE(AO)
  1258. MADD1 c11, c11, a3, b3
  1259. LD b2, 9 * SIZE(BO) /* offset 1 after BO moves */
  1260. MADD3 c21, c21, a3, b4
  1261. LD a3, 12 * SIZE(AO) /* offset 4 after AO moves */
  1262. MADD2 c12, c12, a2, b3
  1263. LD b3, 12 * SIZE(BO) /* offset 4 after BO moves */
  1264. MADD4 c22, c22, a2, b4
  1265. LD a2, 9 * SIZE(AO) /* offset 1 after AO moves */
  1266. daddiu AO, AO, 8 * SIZE
  1267. daddiu L, L, -1
  1268. bgtz L, .L32
  1269. daddiu BO, BO, 8 * SIZE /* placed after the branch -- presumably its delay slot */
  1270. .align 3
  /*
   * .L35/.L36 -- leftover K iterations for the single-column tile.
   * L is the low two bits of KK (LT/RN) or of TEMP (LN/RT, where .L31 set
   * TEMP = K - KK), i.e. the 0..3 steps not covered by the 4x loop at .L32.
   */
  1271. .L35:
  1272. #if defined(LT) || defined(RN)
  1273. andi L, KK, 3
  1274. #else
  1275. andi L, TEMP, 3
  1276. #endif
  1277. blez L, .L38 /* no leftover steps */
  1278. NOP
  1279. .align 3
  /* One step per pass: four MADDs with a1,a2 and b1,b2, reloading all four
     from offsets 2 and 3 before AO and BO move forward by 2*SIZE. */
  1280. .L36:
  1281. MADD1 c11, c11, a1, b1
  1282. daddiu L, L, -1
  1283. MADD3 c21, c21, a1, b2
  1284. LD a1, 2 * SIZE(AO)
  1285. MADD2 c12, c12, a2, b1
  1286. LD b1, 2 * SIZE(BO)
  1287. MADD4 c22, c22, a2, b2
  1288. LD a2, 3 * SIZE(AO)
  1289. LD b2, 3 * SIZE(BO)
  1290. daddiu BO, BO, 2 * SIZE
  1291. bgtz L, .L36
  1292. daddiu AO, AO, 2 * SIZE /* placed after the branch -- presumably its delay slot */
  /*
   * .L38 -- finish one tile of the single-column pass.
   * Folds the four accumulators into (c11,c12), subtracts that from the pair
   * loaded from BO (LN/LT) or AO (RN/RT), combines the result with one
   * coefficient pair, stores it back to the same buffer and to CO1, then
   * advances pointers and KK and loops to .L31 while I > 0.
   * ADD/SUB/MUL/MADD5/MADD6/LD/ST/MTC are macros defined outside this chunk.
   */
  1293. .L38:
  1294. ADD c11, c11, c22
  1295. ADD c12, c12, c21
  1296. #if defined(LN) || defined(RT)
  /* LN and RT both back up by one here; AO and BO get the same byte offset. */
  1297. daddiu TEMP, KK, -1
  1298. dsll TEMP, TEMP, ZBASE_SHIFT
  1299. daddu AO, AORIG, TEMP
  1300. daddu BO, B, TEMP
  1301. #endif
  1302. #if defined(LN) || defined(LT)
  1303. LD b1, 0 * SIZE(BO)
  1304. LD b2, 1 * SIZE(BO)
  1305. SUB c11, b1, c11
  1306. SUB c12, b2, c12
  1307. #else
  1308. LD b1, 0 * SIZE(AO)
  1309. LD b2, 1 * SIZE(AO)
  1310. SUB c11, b1, c11
  1311. SUB c12, b2, c12
  1312. #endif
  /* The coefficient pair comes from AO for LN/LT and from BO for RN/RT; the
     arithmetic is otherwise identical -- presumably a complex multiply of
     (c11,c12) by (b1,b2). */
  1313. #if defined(LN) || defined(LT)
  1314. LD b1, 0 * SIZE(AO)
  1315. LD b2, 1 * SIZE(AO)
  1316. MUL a1, b2, c12
  1317. MUL a2, b2, c11
  1318. MADD5 c11, a1, b1, c11
  1319. MADD6 c12, a2, b1, c12
  1320. #endif
  1321. #if defined(RN) || defined(RT)
  1322. LD b1, 0 * SIZE(BO)
  1323. LD b2, 1 * SIZE(BO)
  1324. MUL a1, b2, c12
  1325. MUL a2, b2, c11
  1326. MADD5 c11, a1, b1, c11
  1327. MADD6 c12, a2, b1, c12
  1328. #endif
  /* Store back over the pair loaded above, then to the output column. */
  1329. #if defined(LN) || defined(LT)
  1330. ST c11, 0 * SIZE(BO)
  1331. ST c12, 1 * SIZE(BO)
  1332. #else
  1333. ST c11, 0 * SIZE(AO)
  1334. ST c12, 1 * SIZE(AO)
  1335. #endif
  1336. #ifdef LN
  1337. daddiu CO1,CO1, -2 * SIZE /* LN steps CO1 back before the store */
  1338. #endif
  1339. ST c11, 0 * SIZE(CO1)
  1340. ST c12, 1 * SIZE(CO1)
  1341. #ifndef LN
  1342. daddiu CO1,CO1, 2 * SIZE /* other variants step CO1 forward after the store */
  1343. #endif
  1344. MTC $0, c11 /* presumably re-zeroes c11; .L31 copies it into the other accumulators */
  1345. #ifdef RT
  1346. dsll TEMP, K, ZBASE_SHIFT
  1347. daddu AORIG, AORIG, TEMP
  1348. #endif
  /* LT/RN: the K loops ran KK steps; move AO and BO past the remaining
     K - KK steps (same byte offset for both). */
  1349. #if defined(LT) || defined(RN)
  1350. dsubu TEMP, K, KK
  1351. dsll TEMP, TEMP, ZBASE_SHIFT
  1352. daddu AO, AO, TEMP
  1353. daddu BO, BO, TEMP
  1354. #endif
  1355. #ifdef LT
  1356. daddiu KK, KK, 1
  1357. #endif
  1358. #ifdef LN
  1359. daddiu KK, KK, -1
  1360. #endif
  1361. daddiu I, I, -1
  1362. bgtz I, .L31 /* more tiles to do in this column */
  1363. NOP
  1364. .align 3
  /*
   * .L39 -- bookkeeping once the I loop for the single-column pass is done.
   *   LN    : B += K << ZBASE_SHIFT
   *   LT/RN : B  = BO
   *   RN    : KK += 1
   *   RT    : KK -= 1
   * Control then falls through to the exit at .L999.
   */
  1365. .L39:
  1366. #ifdef LN
  1367. dsll TEMP, K, ZBASE_SHIFT
  1368. daddu B, B, TEMP
  1369. #endif
  1370. #if defined(LT) || defined(RN)
  1371. move B, BO
  1372. #endif
  1373. #ifdef RN
  1374. daddiu KK, KK, 1
  1375. #endif
  1376. #ifdef RT
  1377. daddiu KK, KK, -1
  1378. #endif
  1379. .align 3
  /*
   * .L999 -- common exit: reload saved registers from the stack and return.
   *   $16..$21   from  0..40($sp)   via LDARG (macro defined outside this chunk)
   *   $f24..$f27 from 48..72($sp)
   *   $f20..$f23 from 88..112($sp), only when __64BIT__ is not defined
   * Offset 80 is not read here.  The matching stores are in the prologue,
   * which is outside this chunk.
   */
  1380. .L999:
  1381. LDARG $16, 0($sp)
  1382. LDARG $17, 8($sp)
  1383. LDARG $18, 16($sp)
  1384. LDARG $19, 24($sp)
  1385. LDARG $20, 32($sp)
  1386. LDARG $21, 40($sp)
  1387. ldc1 $f24, 48($sp)
  1388. ldc1 $f25, 56($sp)
  1389. ldc1 $f26, 64($sp)
  1390. ldc1 $f27, 72($sp)
  1391. #ifndef __64BIT__
  1392. ldc1 $f20, 88($sp)
  1393. ldc1 $f21, 96($sp)
  1394. ldc1 $f22,104($sp)
  1395. ldc1 $f23,112($sp)
  1396. #endif
  1397. j $31 /* return through the address in $31 */
  1398. daddiu $sp, $sp, 128 /* adds 128 back to $sp; placed after the jump -- presumably its delay slot */
  1399. EPILOGUE