You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ztrsm_kernel_RT.S 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build configuration and symbolic register names for this kernel.
   * ASSEMBLER is defined before common.h is included; common.h is not
   * visible here, but it presumably uses the flag to expose only its
   * assembler-side macros (PROLOGUE, LD/ST, MADD, ...) -- TODO confirm.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Integer registers: size operands.
   * The code below runs I over M, takes its 1-, 2- and 4-column passes
   * from N, and derives the inner trip count L from K and KK. They are
   * read without being loaded first, so they are presumably register
   * arguments supplied by the caller -- TODO confirm against the caller.
   */
  40. #define M $4
  41. #define N $5
  42. #define K $6
  /*
   * Base addresses. AO/AORIG are set from A, BO from B, and CO1..CO4
   * from C. Like M/N/K they are read without a preceding load.
   */
  43. #define A $9
  44. #define B $10
  45. #define C $11
  /*
   * LDC is loaded from 128 + 0($sp) once the 128-byte frame is opened,
   * then shifted left by ZBASE_SHIFT; it is the step between CO1..CO4.
   */
  46. #define LDC $8
  /* Walking pointers used as load bases in the inner loops. */
  47. #define AO $12
  48. #define BO $13
  /*
   * Loop counters: I counts down from M, J holds the N-derived pass
   * count, L holds the inner-loop trip count.
   */
  49. #define I $2
  50. #define J $3
  51. #define L $7
  /* Store pointers into C; consecutive ones are LDC apart. */
  52. #define CO1 $14
  53. #define CO2 $15
  54. #define CO3 $16
  55. #define CO4 $17
  /*
   * OFFSET is loaded from 128 + 8($sp). KK is the running offset derived
   * from it (combined with M or N depending on LN/LT/RN/RT). TEMP is
   * scratch for address arithmetic. AORIG is the base from which AO is
   * recomputed in the LN/RT variants.
   * $16..$21 (CO3, CO4, OFFSET, KK, TEMP, AORIG) are stored to the stack
   * by the prologue before they are overwritten.
   */
  56. #define OFFSET $18
  57. #define KK $19
  58. #define TEMP $20
  59. #define AORIG $21
  /*
   * Floating-point registers.
   * a1..a4: operands loaded through AO in the inner loops; a1..a4 also
   * receive MUL results in the solve step. $f26/$f27 (a3/a4) are stored
   * to the stack by the prologue.
   */
  60. #define a1 $f0
  61. #define a2 $f1
  62. #define a3 $f26
  63. #define a4 $f27
  /*
   * b1..b8: operands loaded through B/BO in the inner loops and reused
   * for the loads of the solve step.
   */
  64. #define b1 $f2
  65. #define b2 $f3
  66. #define b3 $f4
  67. #define b4 $f5
  68. #define b5 $f6
  69. #define b6 $f7
  70. #define b7 $f8
  71. #define b8 $f9
  /* a5 shares the register of b8, so the two cannot be live together. */
  72. #define a5 b8
  /*
   * c11..c82: accumulators. The four products of an (a1,a2) x (b1,b2)
   * pair go to separate registers (e.g. c11, c21, c12, c22) and are
   * folded pairwise with ADD after the loop (c11 += c22, c12 += c21).
   * $f24/$f25 (c81/c82) are always stored by the prologue; $f20..$f23
   * (c61, c62, c71, c72) are stored only when __64BIT__ is not defined.
   */
  73. #define c11 $f10
  74. #define c12 $f11
  75. #define c21 $f12
  76. #define c22 $f13
  77. #define c31 $f14
  78. #define c32 $f15
  79. #define c41 $f16
  80. #define c42 $f17
  81. #define c51 $f18
  82. #define c52 $f19
  83. #define c61 $f20
  84. #define c62 $f21
  85. #define c71 $f22
  86. #define c72 $f23
  87. #define c81 $f24
  88. #define c82 $f25
  /*
   * MADD1..MADD8 choose, per build variant, which operation from
   * common.h (MADD, NMSUB or MSUB) is issued at each call site.
   * MADD1..MADD4 are used in the accumulation loops; MADD5..MADD8 are
   * used in the solve step that follows them.
   * NOTE(review): the exact operand and sign semantics of MADD, NMSUB
   * and MSUB are defined outside this file -- confirm them in common.h
   * before changing any mapping.
   *
   * First branch: CONJ not defined.
   */
  89. #ifndef CONJ
  90. #define MADD1 MADD
  91. #define MADD2 MADD
  92. #define MADD3 MADD
  93. #define MADD4 NMSUB
  94. #define MADD5 MSUB
  95. #define MADD6 MADD
  96. #define MADD7 NMSUB
  97. #define MADD8 MADD
  /*
   * CONJ defined: the MADD1..MADD4 mapping additionally depends on
   * whether LN or LT is defined (NMSUB moves between MADD2 and MADD3).
   */
  98. #else
  99. #if defined(LN) || defined(LT)
  100. #define MADD1 MADD
  101. #define MADD2 NMSUB
  102. #define MADD3 MADD
  103. #define MADD4 MADD
  104. #else
  105. #define MADD1 MADD
  106. #define MADD2 MADD
  107. #define MADD3 NMSUB
  108. #define MADD4 MADD
  109. #endif
  /* CONJ defined: solve-step mapping, the same for every LN/LT/RN/RT. */
  110. #define MADD5 MADD
  111. #define MADD6 MSUB
  112. #define MADD7 MADD
  113. #define MADD8 NMSUB
  114. #endif
  115. PROLOGUE
  116. daddiu $sp, $sp, -128
  117. SDARG $16, 0($sp)
  118. SDARG $17, 8($sp)
  119. SDARG $18, 16($sp)
  120. SDARG $19, 24($sp)
  121. SDARG $20, 32($sp)
  122. SDARG $21, 40($sp)
  123. sdc1 $f24, 48($sp)
  124. sdc1 $f25, 56($sp)
  125. sdc1 $f26, 64($sp)
  126. sdc1 $f27, 72($sp)
  127. #ifndef __64BIT__
  128. sdc1 $f20, 88($sp)
  129. sdc1 $f21, 96($sp)
  130. sdc1 $f22,104($sp)
  131. sdc1 $f23,112($sp)
  132. #endif
  133. LDARG LDC, 128 + 0($sp)
  134. LDARG OFFSET, 128 + 8($sp)
  135. dsll LDC, LDC, ZBASE_SHIFT
  136. #ifdef LN
  137. mult M, K
  138. mflo TEMP
  139. dsll TEMP, TEMP, ZBASE_SHIFT
  140. daddu A, A, TEMP
  141. dsll TEMP, M, ZBASE_SHIFT
  142. daddu C, C, TEMP
  143. #endif
  144. #ifdef RN
  145. neg KK, OFFSET
  146. #endif
  147. #ifdef RT
  148. mult N, K
  149. mflo TEMP
  150. dsll TEMP, TEMP, ZBASE_SHIFT
  151. daddu B, B, TEMP
  152. mult N, LDC
  153. mflo TEMP
  154. daddu C, C, TEMP
  155. dsubu KK, N, OFFSET
  156. #endif
  157. andi J, N, 1
  158. blez J, .L20
  159. NOP
  160. #ifdef RT
  161. dsll TEMP, K, ZBASE_SHIFT
  162. dsubu B, B, TEMP
  163. dsubu C, C, LDC
  164. #endif
  165. MTC $0, c11
  166. move CO1, C
  167. #ifdef LN
  168. daddu KK, M, OFFSET
  169. #endif
  170. #ifdef LT
  171. move KK, OFFSET
  172. #endif
  173. #if defined(LN) || defined(RT)
  174. move AORIG, A
  175. #else
  176. move AO, A
  177. #endif
  178. #ifndef RT
  179. daddu C, CO1, LDC
  180. #endif
  181. move I, M
  182. blez I, .L39
  183. NOP
  184. .align 3
  185. .L31:
  186. #if defined(LT) || defined(RN)
  187. LD a1, 0 * SIZE(AO)
  188. MOV c21, c11
  189. LD b1, 0 * SIZE(B)
  190. MOV c31, c11
  191. LD a2, 1 * SIZE(AO)
  192. MOV c41, c11
  193. LD b2, 1 * SIZE(B)
  194. MOV c12, c11
  195. dsra L, KK, 2
  196. MOV c22, c11
  197. LD a3, 4 * SIZE(AO)
  198. MOV c32, c11
  199. LD b3, 4 * SIZE(B)
  200. NOP
  201. MOV c42, c11
  202. blez L, .L35
  203. move BO, B
  204. #else
  205. #ifdef LN
  206. dsll TEMP, K, ZBASE_SHIFT
  207. dsubu AORIG, AORIG, TEMP
  208. #endif
  209. dsll TEMP, KK, ZBASE_SHIFT
  210. daddu AO, AORIG, TEMP
  211. daddu BO, B, TEMP
  212. dsubu TEMP, K, KK
  213. LD a1, 0 * SIZE(AO)
  214. MOV c21, c11
  215. LD b1, 0 * SIZE(BO)
  216. MOV c31, c11
  217. LD a2, 1 * SIZE(AO)
  218. MOV c41, c11
  219. LD b2, 1 * SIZE(BO)
  220. MOV c12, c11
  221. dsra L, TEMP, 2
  222. MOV c22, c11
  223. LD a3, 4 * SIZE(AO)
  224. MOV c32, c11
  225. LD b3, 4 * SIZE(BO)
  226. blez L, .L35
  227. MOV c42, c11
  228. #endif
  229. .align 3
  230. .L32:
  231. MADD1 c11, c11, a1, b1
  232. LD b4, 3 * SIZE(BO)
  233. MADD3 c21, c21, a1, b2
  234. LD a1, 2 * SIZE(AO)
  235. MADD2 c12, c12, a2, b1
  236. LD b1, 2 * SIZE(BO)
  237. MADD4 c22, c22, a2, b2
  238. LD a2, 3 * SIZE(AO)
  239. MADD1 c11, c11, a1, b1
  240. LD b2, 5 * SIZE(BO)
  241. MADD3 c21, c21, a1, b4
  242. LD a1, 8 * SIZE(AO)
  243. MADD2 c12, c12, a2, b1
  244. LD b1, 8 * SIZE(BO)
  245. MADD4 c22, c22, a2, b4
  246. LD a2, 5 * SIZE(AO)
  247. MADD1 c11, c11, a3, b3
  248. LD b4, 7 * SIZE(BO)
  249. MADD3 c21, c21, a3, b2
  250. LD a3, 6 * SIZE(AO)
  251. MADD2 c12, c12, a2, b3
  252. LD b3, 6 * SIZE(BO)
  253. MADD4 c22, c22, a2, b2
  254. LD a2, 7 * SIZE(AO)
  255. MADD1 c11, c11, a3, b3
  256. LD b2, 9 * SIZE(BO)
  257. MADD3 c21, c21, a3, b4
  258. LD a3, 12 * SIZE(AO)
  259. MADD2 c12, c12, a2, b3
  260. LD b3, 12 * SIZE(BO)
  261. MADD4 c22, c22, a2, b4
  262. LD a2, 9 * SIZE(AO)
  263. daddiu AO, AO, 8 * SIZE
  264. daddiu L, L, -1
  265. bgtz L, .L32
  266. daddiu BO, BO, 8 * SIZE
  267. .align 3
  268. .L35:
  269. #if defined(LT) || defined(RN)
  270. andi L, KK, 3
  271. #else
  272. andi L, TEMP, 3
  273. #endif
  274. blez L, .L38
  275. NOP
  276. .align 3
  277. .L36:
  278. MADD1 c11, c11, a1, b1
  279. daddiu L, L, -1
  280. MADD3 c21, c21, a1, b2
  281. LD a1, 2 * SIZE(AO)
  282. MADD2 c12, c12, a2, b1
  283. LD b1, 2 * SIZE(BO)
  284. MADD4 c22, c22, a2, b2
  285. LD a2, 3 * SIZE(AO)
  286. LD b2, 3 * SIZE(BO)
  287. daddiu BO, BO, 2 * SIZE
  288. bgtz L, .L36
  289. daddiu AO, AO, 2 * SIZE
  290. .L38:
  291. ADD c11, c11, c22
  292. ADD c12, c12, c21
  293. #if defined(LN) || defined(RT)
  294. daddiu TEMP, KK, -1
  295. dsll TEMP, TEMP, ZBASE_SHIFT
  296. daddu AO, AORIG, TEMP
  297. daddu BO, B, TEMP
  298. #endif
  299. #if defined(LN) || defined(LT)
  300. LD b1, 0 * SIZE(BO)
  301. LD b2, 1 * SIZE(BO)
  302. SUB c11, b1, c11
  303. SUB c12, b2, c12
  304. #else
  305. LD b1, 0 * SIZE(AO)
  306. LD b2, 1 * SIZE(AO)
  307. SUB c11, b1, c11
  308. SUB c12, b2, c12
  309. #endif
  310. #if defined(LN) || defined(LT)
  311. LD b1, 0 * SIZE(AO)
  312. LD b2, 1 * SIZE(AO)
  313. MUL a1, b2, c12
  314. MUL a2, b2, c11
  315. MADD5 c11, a1, b1, c11
  316. MADD6 c12, a2, b1, c12
  317. #endif
  318. #if defined(RN) || defined(RT)
  319. LD b1, 0 * SIZE(BO)
  320. LD b2, 1 * SIZE(BO)
  321. MUL a1, b2, c12
  322. MUL a2, b2, c11
  323. MADD5 c11, a1, b1, c11
  324. MADD6 c12, a2, b1, c12
  325. #endif
  326. #if defined(LN) || defined(LT)
  327. ST c11, 0 * SIZE(BO)
  328. ST c12, 1 * SIZE(BO)
  329. #else
  330. ST c11, 0 * SIZE(AO)
  331. ST c12, 1 * SIZE(AO)
  332. #endif
  333. #ifdef LN
  334. daddiu CO1,CO1, -2 * SIZE
  335. #endif
  336. ST c11, 0 * SIZE(CO1)
  337. ST c12, 1 * SIZE(CO1)
  338. #ifndef LN
  339. daddiu CO1,CO1, 2 * SIZE
  340. #endif
  341. MTC $0, c11
  342. #ifdef RT
  343. dsll TEMP, K, ZBASE_SHIFT
  344. daddu AORIG, AORIG, TEMP
  345. #endif
  346. #if defined(LT) || defined(RN)
  347. dsubu TEMP, K, KK
  348. dsll TEMP, TEMP, ZBASE_SHIFT
  349. daddu AO, AO, TEMP
  350. daddu BO, BO, TEMP
  351. #endif
  352. #ifdef LT
  353. daddiu KK, KK, 1
  354. #endif
  355. #ifdef LN
  356. daddiu KK, KK, -1
  357. #endif
  358. daddiu I, I, -1
  359. bgtz I, .L31
  360. NOP
  361. .align 3
  362. .L39:
  363. #ifdef LN
  364. dsll TEMP, K, ZBASE_SHIFT
  365. daddu B, B, TEMP
  366. #endif
  367. #if defined(LT) || defined(RN)
  368. move B, BO
  369. #endif
  370. #ifdef RN
  371. daddiu KK, KK, 1
  372. #endif
  373. #ifdef RT
  374. daddiu KK, KK, -1
  375. #endif
  376. .align 3
  377. .L20:
  378. andi J, N, 2
  379. blez J, .L30
  380. NOP
  381. #ifdef RT
  382. dsll TEMP, K, 1 + ZBASE_SHIFT
  383. dsubu B, B, TEMP
  384. dsll TEMP, LDC, 1
  385. dsubu C, C, TEMP
  386. #endif
  387. MTC $0, c11
  388. move CO1, C
  389. daddu CO2, C, LDC
  390. #ifdef LN
  391. daddu KK, M, OFFSET
  392. #endif
  393. #ifdef LT
  394. move KK, OFFSET
  395. #endif
  396. #if defined(LN) || defined(RT)
  397. move AORIG, A
  398. #else
  399. move AO, A
  400. #endif
  401. #ifndef RT
  402. daddu C, CO2, LDC
  403. #endif
  404. move I, M
  405. blez I, .L29
  406. NOP
  407. .align 3
  408. .L21:
  409. #if defined(LT) || defined(RN)
  410. LD a1, 0 * SIZE(AO)
  411. MOV c21, c11
  412. LD b1, 0 * SIZE(B)
  413. MOV c31, c11
  414. LD a3, 4 * SIZE(AO)
  415. MOV c41, c11
  416. LD b2, 1 * SIZE(B)
  417. dsra L, KK, 2
  418. LD b3, 2 * SIZE(B)
  419. MOV c12, c11
  420. LD b4, 3 * SIZE(B)
  421. MOV c22, c11
  422. LD b5, 4 * SIZE(B)
  423. MOV c32, c11
  424. NOP
  425. MOV c42, c11
  426. blez L, .L25
  427. move BO, B
  428. #else
  429. #ifdef LN
  430. dsll TEMP, K, ZBASE_SHIFT
  431. dsubu AORIG, AORIG, TEMP
  432. #endif
  433. dsll L, KK, ZBASE_SHIFT
  434. dsll TEMP, KK, 1 + ZBASE_SHIFT
  435. daddu AO, AORIG, L
  436. daddu BO, B, TEMP
  437. dsubu TEMP, K, KK
  438. LD a1, 0 * SIZE(AO)
  439. MOV c21, c11
  440. LD b1, 0 * SIZE(BO)
  441. MOV c31, c11
  442. LD a3, 4 * SIZE(AO)
  443. MOV c41, c11
  444. LD b2, 1 * SIZE(BO)
  445. dsra L, TEMP, 2
  446. LD b3, 2 * SIZE(BO)
  447. MOV c12, c11
  448. LD b4, 3 * SIZE(BO)
  449. MOV c22, c11
  450. LD b5, 4 * SIZE(BO)
  451. MOV c32, c11
  452. blez L, .L25
  453. MOV c42, c11
  454. #endif
  455. .align 3
  456. .L22:
  457. MADD1 c11, c11, a1, b1
  458. LD a2, 1 * SIZE(AO)
  459. MADD3 c21, c21, a1, b2
  460. daddiu L, L, -1
  461. MADD1 c31, c31, a1, b3
  462. NOP
  463. MADD3 c41, c41, a1, b4
  464. LD a1, 2 * SIZE(AO)
  465. MADD2 c12, c12, a2, b1
  466. LD b1, 8 * SIZE(BO)
  467. MADD4 c22, c22, a2, b2
  468. LD b2, 5 * SIZE(BO)
  469. MADD2 c32, c32, a2, b3
  470. LD b3, 6 * SIZE(BO)
  471. MADD4 c42, c42, a2, b4
  472. LD b4, 7 * SIZE(BO)
  473. MADD1 c11, c11, a1, b5
  474. LD a2, 3 * SIZE(AO)
  475. MADD3 c21, c21, a1, b2
  476. NOP
  477. MADD1 c31, c31, a1, b3
  478. NOP
  479. MADD3 c41, c41, a1, b4
  480. LD a1, 8 * SIZE(AO)
  481. MADD2 c12, c12, a2, b5
  482. LD b5, 12 * SIZE(BO)
  483. MADD4 c22, c22, a2, b2
  484. LD b2, 9 * SIZE(BO)
  485. MADD2 c32, c32, a2, b3
  486. LD b3, 10 * SIZE(BO)
  487. MADD4 c42, c42, a2, b4
  488. LD b4, 11 * SIZE(BO)
  489. MADD1 c11, c11, a3, b1
  490. LD a2, 5 * SIZE(AO)
  491. MADD3 c21, c21, a3, b2
  492. NOP
  493. MADD1 c31, c31, a3, b3
  494. NOP
  495. MADD3 c41, c41, a3, b4
  496. LD a3, 6 * SIZE(AO)
  497. MADD2 c12, c12, a2, b1
  498. LD b1, 16 * SIZE(BO)
  499. MADD4 c22, c22, a2, b2
  500. LD b2, 13 * SIZE(BO)
  501. MADD2 c32, c32, a2, b3
  502. LD b3, 14 * SIZE(BO)
  503. MADD4 c42, c42, a2, b4
  504. LD b4, 15 * SIZE(BO)
  505. MADD1 c11, c11, a3, b5
  506. LD a2, 7 * SIZE(AO)
  507. MADD3 c21, c21, a3, b2
  508. daddiu AO, AO, 8 * SIZE
  509. MADD1 c31, c31, a3, b3
  510. NOP
  511. MADD3 c41, c41, a3, b4
  512. LD a3, 4 * SIZE(AO)
  513. MADD2 c12, c12, a2, b5
  514. LD b5, 20 * SIZE(BO)
  515. MADD4 c22, c22, a2, b2
  516. LD b2, 17 * SIZE(BO)
  517. MADD2 c32, c32, a2, b3
  518. LD b3, 18 * SIZE(BO)
  519. MADD4 c42, c42, a2, b4
  520. LD b4, 19 * SIZE(BO)
  521. bgtz L, .L22
  522. daddiu BO, BO, 16 * SIZE
  523. .align 3
  524. .L25:
  525. #if defined(LT) || defined(RN)
  526. andi L, KK, 3
  527. #else
  528. andi L, TEMP, 3
  529. #endif
  530. blez L, .L28
  531. NOP
  532. .align 3
  533. .L26:
  534. MADD1 c11, c11, a1, b1
  535. LD a2, 1 * SIZE(AO)
  536. MADD3 c21, c21, a1, b2
  537. daddiu L, L, -1
  538. MADD1 c31, c31, a1, b3
  539. daddiu BO, BO, 4 * SIZE
  540. MADD3 c41, c41, a1, b4
  541. LD a1, 2 * SIZE(AO)
  542. MADD2 c12, c12, a2, b1
  543. LD b1, 0 * SIZE(BO)
  544. MADD4 c22, c22, a2, b2
  545. LD b2, 1 * SIZE(BO)
  546. MADD2 c32, c32, a2, b3
  547. LD b3, 2 * SIZE(BO)
  548. MADD4 c42, c42, a2, b4
  549. LD b4, 3 * SIZE(BO)
  550. bgtz L, .L26
  551. daddiu AO, AO, 2 * SIZE
  552. .L28:
  553. ADD c11, c11, c22
  554. ADD c12, c12, c21
  555. ADD c31, c31, c42
  556. ADD c32, c32, c41
  557. #if defined(LN) || defined(RT)
  558. #ifdef LN
  559. daddiu TEMP, KK, -1
  560. #else
  561. daddiu TEMP, KK, -2
  562. #endif
  563. dsll L, TEMP, ZBASE_SHIFT
  564. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  565. daddu AO, AORIG, L
  566. daddu BO, B, TEMP
  567. #endif
  568. #if defined(LN) || defined(LT)
  569. LD b1, 0 * SIZE(BO)
  570. LD b2, 1 * SIZE(BO)
  571. LD b3, 2 * SIZE(BO)
  572. LD b4, 3 * SIZE(BO)
  573. SUB c11, b1, c11
  574. SUB c12, b2, c12
  575. SUB c31, b3, c31
  576. SUB c32, b4, c32
  577. #else
  578. LD b1, 0 * SIZE(AO)
  579. LD b2, 1 * SIZE(AO)
  580. LD b3, 2 * SIZE(AO)
  581. LD b4, 3 * SIZE(AO)
  582. SUB c11, b1, c11
  583. SUB c12, b2, c12
  584. SUB c31, b3, c31
  585. SUB c32, b4, c32
  586. #endif
  587. #if defined(LN) || defined(LT)
  588. LD b1, 0 * SIZE(AO)
  589. LD b2, 1 * SIZE(AO)
  590. MUL a1, b2, c12
  591. MUL a2, b2, c11
  592. MUL a3, b2, c32
  593. MUL a4, b2, c31
  594. MADD5 c11, a1, b1, c11
  595. MADD6 c12, a2, b1, c12
  596. MADD5 c31, a3, b1, c31
  597. MADD6 c32, a4, b1, c32
  598. #endif
  599. #ifdef RN
  600. LD b1, 0 * SIZE(BO)
  601. LD b2, 1 * SIZE(BO)
  602. LD b3, 2 * SIZE(BO)
  603. LD b4, 3 * SIZE(BO)
  604. MUL a1, b2, c12
  605. MUL a2, b2, c11
  606. MADD5 c11, a1, b1, c11
  607. MADD6 c12, a2, b1, c12
  608. NMSUB c31, c31, b3, c11
  609. MADD7 c32, c32, b4, c11
  610. MADD8 c31, c31, b4, c12
  611. NMSUB c32, c32, b3, c12
  612. LD b3, 6 * SIZE(BO)
  613. LD b4, 7 * SIZE(BO)
  614. MUL a1, b4, c32
  615. MUL a2, b4, c31
  616. MADD5 c31, a1, b3, c31
  617. MADD6 c32, a2, b3, c32
  618. #endif
  619. #ifdef RT
  620. LD b5, 6 * SIZE(BO)
  621. LD b6, 7 * SIZE(BO)
  622. LD b7, 4 * SIZE(BO)
  623. LD b8, 5 * SIZE(BO)
  624. MUL a1, b6, c32
  625. MUL a2, b6, c31
  626. MADD5 c31, a1, b5, c31
  627. MADD6 c32, a2, b5, c32
  628. NMSUB c11, c11, b7, c31
  629. MADD7 c12, c12, b8, c31
  630. MADD8 c11, c11, b8, c32
  631. NMSUB c12, c12, b7, c32
  632. LD b7, 0 * SIZE(BO)
  633. LD b8, 1 * SIZE(BO)
  634. MUL a1, b8, c12
  635. MUL a2, b8, c11
  636. MADD5 c11, a1, b7, c11
  637. MADD6 c12, a2, b7, c12
  638. #endif
  639. #if defined(LN) || defined(LT)
  640. ST c11, 0 * SIZE(BO)
  641. ST c12, 1 * SIZE(BO)
  642. ST c31, 2 * SIZE(BO)
  643. ST c32, 3 * SIZE(BO)
  644. #else
  645. ST c11, 0 * SIZE(AO)
  646. ST c12, 1 * SIZE(AO)
  647. ST c31, 2 * SIZE(AO)
  648. ST c32, 3 * SIZE(AO)
  649. #endif
  650. #ifdef LN
  651. daddiu CO1,CO1, -2 * SIZE
  652. daddiu CO2,CO2, -2 * SIZE
  653. #endif
  654. ST c11, 0 * SIZE(CO1)
  655. ST c12, 1 * SIZE(CO1)
  656. ST c31, 0 * SIZE(CO2)
  657. ST c32, 1 * SIZE(CO2)
  658. #ifndef LN
  659. daddiu CO1,CO1, 2 * SIZE
  660. daddiu CO2,CO2, 2 * SIZE
  661. #endif
  662. MTC $0, c11
  663. #ifdef RT
  664. dsll TEMP, K, ZBASE_SHIFT
  665. daddu AORIG, AORIG, TEMP
  666. #endif
  667. #if defined(LT) || defined(RN)
  668. dsubu TEMP, K, KK
  669. dsll L, TEMP, ZBASE_SHIFT
  670. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  671. daddu AO, AO, L
  672. daddu BO, BO, TEMP
  673. #endif
  674. #ifdef LT
  675. daddiu KK, KK, 1
  676. #endif
  677. #ifdef LN
  678. daddiu KK, KK, -1
  679. #endif
  680. daddiu I, I, -1
  681. bgtz I, .L21
  682. NOP
  683. .align 3
  684. .L29:
  685. #ifdef LN
  686. dsll TEMP, K, 1 + ZBASE_SHIFT
  687. daddu B, B, TEMP
  688. #endif
  689. #if defined(LT) || defined(RN)
  690. move B, BO
  691. #endif
  692. #ifdef RN
  693. daddiu KK, KK, 2
  694. #endif
  695. #ifdef RT
  696. daddiu KK, KK, -2
  697. #endif
  698. .align 3
  699. .L30:
  700. dsra J, N, 2
  701. blez J, .L999
  702. nop
  703. .L10:
  704. #ifdef RT
  705. dsll TEMP, K, 2 + ZBASE_SHIFT
  706. dsubu B, B, TEMP
  707. dsll TEMP, LDC, 2
  708. dsubu C, C, TEMP
  709. #endif
  710. move CO1, C
  711. MTC $0, c11
  712. daddu CO2, C, LDC
  713. daddu CO3, CO2, LDC
  714. daddiu J, J, -1
  715. daddu CO4, CO3, LDC
  716. MOV c21, c11
  717. MOV c31, c11
  718. MOV c41, c11
  719. MOV c51, c11
  720. move I, M
  721. #ifdef LN
  722. daddu KK, M, OFFSET
  723. #endif
  724. #ifdef LT
  725. move KK, OFFSET
  726. #endif
  727. #if defined(LN) || defined(RT)
  728. move AORIG, A
  729. #else
  730. move AO, A
  731. #endif
  732. #ifndef RT
  733. daddu C, CO4, LDC
  734. #endif
  735. blez I, .L19
  736. MOV c61, c11
  737. .align 3
  738. .L11:
  739. #if defined(LT) || defined(RN)
  740. LD a1, 0 * SIZE(AO)
  741. MOV c71, c11
  742. LD b1, 0 * SIZE(B)
  743. MOV c81, c11
  744. LD a3, 4 * SIZE(AO)
  745. MOV c12, c11
  746. LD b2, 1 * SIZE(B)
  747. MOV c22, c11
  748. dsra L, KK, 2
  749. MOV c32, c11
  750. LD b3, 2 * SIZE(B)
  751. MOV c42, c11
  752. LD b4, 3 * SIZE(B)
  753. MOV c52, c11
  754. LD b5, 4 * SIZE(B)
  755. MOV c62, c11
  756. LD b6, 8 * SIZE(B)
  757. MOV c72, c11
  758. LD b7, 12 * SIZE(B)
  759. MOV c82, c11
  760. blez L, .L15
  761. move BO, B
  762. #else
  763. #ifdef LN
  764. dsll TEMP, K, ZBASE_SHIFT
  765. dsubu AORIG, AORIG, TEMP
  766. #endif
  767. dsll L, KK, ZBASE_SHIFT
  768. dsll TEMP, KK, 2 + ZBASE_SHIFT
  769. daddu AO, AORIG, L
  770. daddu BO, B, TEMP
  771. dsubu TEMP, K, KK
  772. LD a1, 0 * SIZE(AO)
  773. MOV c71, c11
  774. LD b1, 0 * SIZE(BO)
  775. MOV c81, c11
  776. LD a3, 4 * SIZE(AO)
  777. MOV c12, c11
  778. LD b2, 1 * SIZE(BO)
  779. MOV c22, c11
  780. dsra L, TEMP, 2
  781. MOV c32, c11
  782. LD b3, 2 * SIZE(BO)
  783. MOV c42, c11
  784. LD b4, 3 * SIZE(BO)
  785. MOV c52, c11
  786. LD b5, 4 * SIZE(BO)
  787. MOV c62, c11
  788. LD b6, 8 * SIZE(BO)
  789. MOV c72, c11
  790. LD b7, 12 * SIZE(BO)
  791. MOV c82, c11
  792. blez L, .L15
  793. NOP
  794. #endif
  795. MADD1 c11, c11, a1, b1
  796. LD a2, 1 * SIZE(AO)
  797. MADD3 c21, c21, a1, b2
  798. daddiu L, L, -1
  799. MADD1 c31, c31, a1, b3
  800. NOP
  801. blez L, .L13
  802. MADD3 c41, c41, a1, b4
  803. .align 3
  804. .L12:
  805. MADD2 c12, c12, a2, b1
  806. LD b1, 16 * SIZE(BO)
  807. MADD4 c22, c22, a2, b2
  808. LD b2, 5 * SIZE(BO)
  809. MADD2 c32, c32, a2, b3
  810. LD b3, 6 * SIZE(BO)
  811. MADD4 c42, c42, a2, b4
  812. LD b4, 7 * SIZE(BO)
  813. MADD1 c51, c51, a1, b5
  814. NOP
  815. MADD3 c61, c61, a1, b2
  816. LD a4, 2 * SIZE(AO)
  817. MADD1 c71, c71, a1, b3
  818. NOP
  819. MADD3 c81, c81, a1, b4
  820. LD a1, 8 * SIZE(AO)
  821. MADD2 c52, c52, a2, b5
  822. LD b5, 20 * SIZE(BO)
  823. MADD4 c62, c62, a2, b2
  824. LD b2, 9 * SIZE(BO)
  825. MADD2 c72, c72, a2, b3
  826. LD b3, 10 * SIZE(BO)
  827. MADD4 c82, c82, a2, b4
  828. LD b4, 11 * SIZE(BO)
  829. MADD1 c11, c11, a4, b6
  830. LD a2, 3 * SIZE(AO)
  831. MADD3 c21, c21, a4, b2
  832. NOP
  833. MADD1 c31, c31, a4, b3
  834. NOP
  835. MADD3 c41, c41, a4, b4
  836. NOP
  837. MADD2 c12, c12, a2, b6
  838. LD b6, 24 * SIZE(BO)
  839. MADD4 c22, c22, a2, b2
  840. LD b2, 13 * SIZE(BO)
  841. MADD2 c32, c32, a2, b3
  842. LD b3, 14 * SIZE(BO)
  843. MADD4 c42, c42, a2, b4
  844. LD b4, 15 * SIZE(BO)
  845. MADD1 c51, c51, a4, b7
  846. NOP
  847. MADD3 c61, c61, a4, b2
  848. NOP
  849. MADD1 c71, c71, a4, b3
  850. NOP
  851. MADD3 c81, c81, a4, b4
  852. NOP
  853. MADD2 c52, c52, a2, b7
  854. LD b7, 28 * SIZE(BO)
  855. MADD4 c62, c62, a2, b2
  856. LD b2, 17 * SIZE(BO)
  857. MADD2 c72, c72, a2, b3
  858. LD b3, 18 * SIZE(BO)
  859. MADD4 c82, c82, a2, b4
  860. LD b4, 19 * SIZE(BO)
  861. MADD1 c11, c11, a3, b1
  862. LD a2, 5 * SIZE(AO)
  863. MADD3 c21, c21, a3, b2
  864. NOP
  865. MADD1 c31, c31, a3, b3
  866. NOP
  867. MADD3 c41, c41, a3, b4
  868. NOP
  869. MADD2 c12, c12, a2, b1
  870. LD b1, 32 * SIZE(BO)
  871. MADD4 c22, c22, a2, b2
  872. LD b2, 21 * SIZE(BO)
  873. MADD2 c32, c32, a2, b3
  874. LD b3, 22 * SIZE(BO)
  875. MADD4 c42, c42, a2, b4
  876. LD b4, 23 * SIZE(BO)
  877. MADD1 c51, c51, a3, b5
  878. NOP
  879. MADD3 c61, c61, a3, b2
  880. LD a4, 6 * SIZE(AO)
  881. MADD1 c71, c71, a3, b3
  882. NOP
  883. MADD3 c81, c81, a3, b4
  884. LD a3, 12 * SIZE(AO)
  885. MADD2 c52, c52, a2, b5
  886. LD b5, 36 * SIZE(BO)
  887. MADD4 c62, c62, a2, b2
  888. LD b2, 25 * SIZE(BO)
  889. MADD2 c72, c72, a2, b3
  890. LD b3, 26 * SIZE(BO)
  891. MADD4 c82, c82, a2, b4
  892. LD b4, 27 * SIZE(BO)
  893. MADD1 c11, c11, a4, b6
  894. LD a2, 7 * SIZE(AO)
  895. MADD3 c21, c21, a4, b2
  896. NOP
  897. MADD1 c31, c31, a4, b3
  898. NOP
  899. MADD3 c41, c41, a4, b4
  900. daddiu L, L, -1
  901. MADD2 c12, c12, a2, b6
  902. LD b6, 40 * SIZE(BO)
  903. MADD4 c22, c22, a2, b2
  904. LD b2, 29 * SIZE(BO)
  905. MADD2 c32, c32, a2, b3
  906. LD b3, 30 * SIZE(BO)
  907. MADD4 c42, c42, a2, b4
  908. LD b4, 31 * SIZE(BO)
  909. MADD1 c51, c51, a4, b7
  910. daddiu BO, BO, 32 * SIZE
  911. MADD3 c61, c61, a4, b2
  912. daddiu AO, AO, 8 * SIZE
  913. MADD1 c71, c71, a4, b3
  914. NOP
  915. MADD3 c81, c81, a4, b4
  916. NOP
  917. MADD2 c52, c52, a2, b7
  918. LD b7, 12 * SIZE(BO)
  919. MADD4 c62, c62, a2, b2
  920. LD b2, 1 * SIZE(BO)
  921. MADD2 c72, c72, a2, b3
  922. LD b3, 2 * SIZE(BO)
  923. MADD4 c82, c82, a2, b4
  924. LD b4, 3 * SIZE(BO)
  925. MADD1 c11, c11, a1, b1
  926. LD a2, 1 * SIZE(AO)
  927. MADD3 c21, c21, a1, b2
  928. NOP
  929. MADD1 c31, c31, a1, b3
  930. NOP
  931. bgtz L, .L12
  932. MADD3 c41, c41, a1, b4
  933. .align 3
  934. .L13:
  935. MADD2 c12, c12, a2, b1
  936. LD b1, 16 * SIZE(BO)
  937. MADD4 c22, c22, a2, b2
  938. LD b2, 5 * SIZE(BO)
  939. MADD2 c32, c32, a2, b3
  940. LD b3, 6 * SIZE(BO)
  941. MADD4 c42, c42, a2, b4
  942. LD b4, 7 * SIZE(BO)
  943. MADD1 c51, c51, a1, b5
  944. NOP
  945. MADD3 c61, c61, a1, b2
  946. LD a4, 2 * SIZE(AO)
  947. MADD1 c71, c71, a1, b3
  948. NOP
  949. MADD3 c81, c81, a1, b4
  950. LD a1, 8 * SIZE(AO)
  951. MADD2 c52, c52, a2, b5
  952. LD b5, 20 * SIZE(BO)
  953. MADD4 c62, c62, a2, b2
  954. LD b2, 9 * SIZE(BO)
  955. MADD2 c72, c72, a2, b3
  956. LD b3, 10 * SIZE(BO)
  957. MADD4 c82, c82, a2, b4
  958. LD b4, 11 * SIZE(BO)
  959. MADD1 c11, c11, a4, b6
  960. LD a2, 3 * SIZE(AO)
  961. MADD3 c21, c21, a4, b2
  962. NOP
  963. MADD1 c31, c31, a4, b3
  964. NOP
  965. MADD3 c41, c41, a4, b4
  966. NOP
  967. MADD2 c12, c12, a2, b6
  968. LD b6, 24 * SIZE(BO)
  969. MADD4 c22, c22, a2, b2
  970. LD b2, 13 * SIZE(BO)
  971. MADD2 c32, c32, a2, b3
  972. LD b3, 14 * SIZE(BO)
  973. MADD4 c42, c42, a2, b4
  974. LD b4, 15 * SIZE(BO)
  975. MADD1 c51, c51, a4, b7
  976. NOP
  977. MADD3 c61, c61, a4, b2
  978. NOP
  979. MADD1 c71, c71, a4, b3
  980. NOP
  981. MADD3 c81, c81, a4, b4
  982. NOP
  983. MADD2 c52, c52, a2, b7
  984. LD b7, 28 * SIZE(BO)
  985. MADD4 c62, c62, a2, b2
  986. LD b2, 17 * SIZE(BO)
  987. MADD2 c72, c72, a2, b3
  988. LD b3, 18 * SIZE(BO)
  989. MADD4 c82, c82, a2, b4
  990. LD b4, 19 * SIZE(BO)
  991. MADD1 c11, c11, a3, b1
  992. LD a2, 5 * SIZE(AO)
  993. MADD3 c21, c21, a3, b2
  994. NOP
  995. MADD1 c31, c31, a3, b3
  996. NOP
  997. MADD3 c41, c41, a3, b4
  998. NOP
  999. MADD2 c12, c12, a2, b1
  1000. LD b1, 32 * SIZE(BO)
  1001. MADD4 c22, c22, a2, b2
  1002. LD b2, 21 * SIZE(BO)
  1003. MADD2 c32, c32, a2, b3
  1004. LD b3, 22 * SIZE(BO)
  1005. MADD4 c42, c42, a2, b4
  1006. LD b4, 23 * SIZE(BO)
  1007. MADD1 c51, c51, a3, b5
  1008. NOP
  1009. MADD3 c61, c61, a3, b2
  1010. LD a4, 6 * SIZE(AO)
  1011. MADD1 c71, c71, a3, b3
  1012. NOP
  1013. MADD3 c81, c81, a3, b4
  1014. LD a3, 12 * SIZE(AO)
  1015. MADD2 c52, c52, a2, b5
  1016. LD b5, 36 * SIZE(BO)
  1017. MADD4 c62, c62, a2, b2
  1018. LD b2, 25 * SIZE(BO)
  1019. MADD2 c72, c72, a2, b3
  1020. LD b3, 26 * SIZE(BO)
  1021. MADD4 c82, c82, a2, b4
  1022. LD b4, 27 * SIZE(BO)
  1023. MADD1 c11, c11, a4, b6
  1024. LD a2, 7 * SIZE(AO)
  1025. MADD3 c21, c21, a4, b2
  1026. NOP
  1027. MADD1 c31, c31, a4, b3
  1028. NOP
  1029. MADD3 c41, c41, a4, b4
  1030. NOP
  1031. MADD2 c12, c12, a2, b6
  1032. LD b6, 40 * SIZE(BO)
  1033. MADD4 c22, c22, a2, b2
  1034. LD b2, 29 * SIZE(BO)
  1035. MADD2 c32, c32, a2, b3
  1036. LD b3, 30 * SIZE(BO)
  1037. MADD4 c42, c42, a2, b4
  1038. LD b4, 31 * SIZE(BO)
  1039. MADD1 c51, c51, a4, b7
  1040. daddiu BO, BO, 32 * SIZE
  1041. MADD3 c61, c61, a4, b2
  1042. daddiu AO, AO, 8 * SIZE
  1043. MADD1 c71, c71, a4, b3
  1044. NOP
  1045. MADD3 c81, c81, a4, b4
  1046. NOP
  1047. MADD2 c52, c52, a2, b7
  1048. LD b7, 12 * SIZE(BO)
  1049. MADD4 c62, c62, a2, b2
  1050. LD b2, 1 * SIZE(BO)
  1051. MADD2 c72, c72, a2, b3
  1052. LD b3, 2 * SIZE(BO)
  1053. MADD4 c82, c82, a2, b4
  1054. LD b4, 3 * SIZE(BO)
  /* .L15/.L16: remainder of the k-loop. */
  /* NOTE(review): presumably these are the k-steps left over from the unrolled loop that precedes .L15 */
  /* (that loop advances AO by 8*SIZE and BO by 32*SIZE per pass, this one by 2*SIZE and 8*SIZE). */
  1055. .align 3
  1056. .L15:
  /* L = low two bits of KK (LT/RN builds) or of TEMP (all other builds). */
  /* TEMP is not written in the visible lines before this point; it is set outside this chunk. */
  1057. #if defined(LT) || defined(RN)
  1058. andi L, KK, 3
  1059. #else
  1060. andi L, TEMP, 3
  1061. #endif
  /* Nothing left to do: skip the loop and go straight to .L18. */
  1062. blez L, .L18
  1063. NOP
  1064. .align 3
  /* One k-step per iteration: a1/a2 (read from AO) are combined with b1..b5 (read from BO) */
  /* into the sixteen accumulators c11..c82. */
  /* MADD1..MADD4 are macros defined outside this chunk. */
  /* NOTE(review): presumably they pick the add/subtract sign per conjugation variant - confirm. */
  /* Loads for later use are interleaved with the arithmetic; b2..b4 are reloaded mid-iteration, */
  /* so the statement order must not be changed. */
  1065. .L16:
  1066. MADD1 c11, c11, a1, b1
  1067. LD a2, 1 * SIZE(AO)
  1068. MADD3 c21, c21, a1, b2
  1069. NOP
  1070. MADD1 c31, c31, a1, b3
  1071. NOP
  1072. MADD3 c41, c41, a1, b4
  1073. NOP
  1074. MADD2 c12, c12, a2, b1
  1075. LD b1, 8 * SIZE(BO)
  1076. MADD4 c22, c22, a2, b2
  1077. LD b2, 5 * SIZE(BO)
  1078. MADD2 c32, c32, a2, b3
  1079. LD b3, 6 * SIZE(BO)
  1080. MADD4 c42, c42, a2, b4
  1081. LD b4, 7 * SIZE(BO)
  1082. MADD1 c51, c51, a1, b5
  /* Loop counter and both stream pointers are advanced in the middle of the iteration; */
  /* every load after this point already addresses the next k-step. */
  1083. daddiu L, L, -1
  1084. MADD3 c61, c61, a1, b2
  1085. daddiu AO, AO, 2 * SIZE
  1086. MADD1 c71, c71, a1, b3
  1087. daddiu BO, BO, 8 * SIZE
  1088. MADD3 c81, c81, a1, b4
  1089. LD a1, 0 * SIZE(AO)
  1090. MADD2 c52, c52, a2, b5
  1091. LD b5, 4 * SIZE(BO)
  1092. MADD4 c62, c62, a2, b2
  1093. LD b2, 1 * SIZE(BO)
  1094. MADD2 c72, c72, a2, b3
  1095. LD b3, 2 * SIZE(BO)
  1096. MADD4 c82, c82, a2, b4
  /* Repeat while L > 0. The LD that follows the branch sits in its delay slot. */
  /* NOTE(review): this relies on .set noreorder, which is not visible in this chunk - confirm. */
  1097. bgtz L, .L16
  1098. LD b4, 3 * SIZE(BO)
  /* .L18: the k-loop is finished; fold the partial sums and form (right-hand side - accumulated product). */
  1099. .L18:
  /* Pairwise fold of the sixteen accumulators into eight: c11 += c22, c12 += c21, and so on. */
  /* ADD is a macro defined outside this chunk; presumably dst = src1 + src2. */
  1100. ADD c11, c11, c22
  1101. ADD c12, c12, c21
  1102. ADD c31, c31, c42
  1103. ADD c32, c32, c41
  1104. ADD c51, c51, c62
  1105. ADD c52, c52, c61
  1106. ADD c71, c71, c82
  1107. ADD c72, c72, c81
  /* LN/RT builds only: recompute AO and BO from their base pointers. */
  /* TEMP = KK - 1 when LN is defined, KK - 4 otherwise; */
  /* AO = AORIG + (TEMP << ZBASE_SHIFT), BO = B + (TEMP << (2 + ZBASE_SHIFT)). */
  1108. #if defined(LN) || defined(RT)
  1109. #ifdef LN
  1110. daddiu TEMP, KK, -1
  1111. #else
  1112. daddiu TEMP, KK, -4
  1113. #endif
  1114. dsll L, TEMP, ZBASE_SHIFT
  1115. dsll TEMP, TEMP, 2 + ZBASE_SHIFT
  1116. daddu AO, AORIG, L
  1117. daddu BO, B, TEMP
  1118. #endif
  /* Load eight values and subtract the accumulators from them. */
  /* LN/LT builds read the values from BO, all other builds read them from AO. */
  /* SUB is a macro defined outside this chunk; presumably cXY = bN - cXY. */
  1119. #if defined(LN) || defined(LT)
  1120. LD b1, 0 * SIZE(BO)
  1121. LD b2, 1 * SIZE(BO)
  1122. LD b3, 2 * SIZE(BO)
  1123. LD b4, 3 * SIZE(BO)
  1124. LD b5, 4 * SIZE(BO)
  1125. LD b6, 5 * SIZE(BO)
  1126. LD b7, 6 * SIZE(BO)
  1127. LD b8, 7 * SIZE(BO)
  1128. SUB c11, b1, c11
  1129. SUB c12, b2, c12
  1130. SUB c31, b3, c31
  1131. SUB c32, b4, c32
  1132. SUB c51, b5, c51
  1133. SUB c52, b6, c52
  1134. SUB c71, b7, c71
  1135. SUB c72, b8, c72
  1136. #else
  1137. LD b1, 0 * SIZE(AO)
  1138. LD b2, 1 * SIZE(AO)
  1139. LD b3, 2 * SIZE(AO)
  1140. LD b4, 3 * SIZE(AO)
  1141. LD b5, 4 * SIZE(AO)
  1142. LD b6, 5 * SIZE(AO)
  1143. LD b7, 6 * SIZE(AO)
  1144. LD b8, 7 * SIZE(AO)
  1145. SUB c11, b1, c11
  1146. SUB c12, b2, c12
  1147. SUB c31, b3, c31
  1148. SUB c32, b4, c32
  1149. SUB c51, b5, c51
  1150. SUB c52, b6, c52
  1151. SUB c71, b7, c71
  1152. SUB c72, b8, c72
  1153. #endif
  /* LN/LT builds: scale each of the four (cX1, cX2) pairs by the pair (b1, b2) read from AO[0], AO[1]. */
  /* The MUL followed by MADD5/MADD6 has the shape of a complex multiplication; */
  /* MUL, MADD5 and MADD6 are macros defined outside this chunk. */
  /* NOTE(review): presumably (b1, b2) is an already-inverted diagonal element - confirm against the packing code. */
  1154. #if defined(LN) || defined(LT)
  1155. LD b1, 0 * SIZE(AO)
  1156. LD b2, 1 * SIZE(AO)
  /* First two pairs: (c11, c12) and (c31, c32); a1..a4 hold the b2 cross products. */
  1157. MUL a1, b2, c12
  1158. MUL a2, b2, c11
  1159. MUL a3, b2, c32
  1160. MUL a4, b2, c31
  1161. MADD5 c11, a1, b1, c11
  1162. MADD6 c12, a2, b1, c12
  1163. MADD5 c31, a3, b1, c31
  1164. MADD6 c32, a4, b1, c32
  /* Last two pairs: (c51, c52) and (c71, c72), same sequence. */
  1165. MUL a1, b2, c52
  1166. MUL a2, b2, c51
  1167. MUL a3, b2, c72
  1168. MUL a4, b2, c71
  1169. MADD5 c51, a1, b1, c51
  1170. MADD6 c52, a2, b1, c52
  1171. MADD5 c71, a3, b1, c71
  1172. MADD6 c72, a4, b1, c72
  1173. #endif
  /* RN builds: four-stage substitution that walks the pairs in the order c1x, c3x, c5x, c7x. */
  /* Each stage scales one pair by two values from BO (offsets 0/1, 10/11, 20/21, 30/31) */
  /* and then updates every later pair with that result, using the BO values that follow the scaling pair. */
  /* NMSUB, MADD5..MADD8 and MUL are macros defined outside this chunk. */
  /* NOTE(review): the 0/10/20/30 offsets look like the diagonal of a 4x4 block stored 8 values per row - confirm. */
  1174. #ifdef RN
  1175. LD b1, 0 * SIZE(BO)
  1176. LD b2, 1 * SIZE(BO)
  1177. LD b3, 2 * SIZE(BO)
  1178. LD b4, 3 * SIZE(BO)
  1179. LD b5, 4 * SIZE(BO)
  1180. LD b6, 5 * SIZE(BO)
  1181. LD b7, 6 * SIZE(BO)
  1182. LD b8, 7 * SIZE(BO)
  /* Stage 1: scale (c11, c12) by (b1, b2), then update c3x, c5x and c7x with it. */
  1183. MUL a1, b2, c12
  1184. MUL a2, b2, c11
  1185. MADD5 c11, a1, b1, c11
  1186. MADD6 c12, a2, b1, c12
  1187. NMSUB c31, c31, b3, c11
  1188. MADD7 c32, c32, b4, c11
  1189. NMSUB c51, c51, b5, c11
  1190. MADD7 c52, c52, b6, c11
  1191. NMSUB c71, c71, b7, c11
  1192. MADD7 c72, c72, b8, c11
  1193. MADD8 c31, c31, b4, c12
  1194. NMSUB c32, c32, b3, c12
  1195. MADD8 c51, c51, b6, c12
  1196. NMSUB c52, c52, b5, c12
  1197. MADD8 c71, c71, b8, c12
  1198. NMSUB c72, c72, b7, c12
  /* Stage 2: scale (c31, c32) by BO[10], BO[11], then update c5x and c7x. */
  1199. LD b3, 10 * SIZE(BO)
  1200. LD b4, 11 * SIZE(BO)
  1201. LD b5, 12 * SIZE(BO)
  1202. LD b6, 13 * SIZE(BO)
  1203. LD b7, 14 * SIZE(BO)
  1204. LD b8, 15 * SIZE(BO)
  1205. MUL a1, b4, c32
  1206. MUL a2, b4, c31
  1207. MADD5 c31, a1, b3, c31
  1208. MADD6 c32, a2, b3, c32
  1209. NMSUB c51, c51, b5, c31
  1210. MADD7 c52, c52, b6, c31
  1211. NMSUB c71, c71, b7, c31
  1212. MADD7 c72, c72, b8, c31
  1213. MADD8 c51, c51, b6, c32
  1214. NMSUB c52, c52, b5, c32
  1215. MADD8 c71, c71, b8, c32
  1216. NMSUB c72, c72, b7, c32
  /* Stage 3: scale (c51, c52) by BO[20], BO[21], then update c7x. */
  1217. LD b5, 20 * SIZE(BO)
  1218. LD b6, 21 * SIZE(BO)
  1219. LD b7, 22 * SIZE(BO)
  1220. LD b8, 23 * SIZE(BO)
  1221. MUL a1, b6, c52
  1222. MUL a2, b6, c51
  1223. MADD5 c51, a1, b5, c51
  1224. MADD6 c52, a2, b5, c52
  1225. NMSUB c71, c71, b7, c51
  1226. MADD7 c72, c72, b8, c51
  1227. MADD8 c71, c71, b8, c52
  1228. NMSUB c72, c72, b7, c52
  /* Stage 4: scale (c71, c72) by BO[30], BO[31]; nothing is left to update. */
  1229. LD b7, 30 * SIZE(BO)
  1230. LD b8, 31 * SIZE(BO)
  1231. MUL a1, b8, c72
  1232. MUL a2, b8, c71
  1233. MADD5 c71, a1, b7, c71
  1234. MADD6 c72, a2, b7, c72
  1235. #endif
  /* RT builds: four-stage substitution that walks the pairs in the reverse order c7x, c5x, c3x, c1x. */
  /* Each stage scales one pair by two values from BO (offsets 30/31, 20/21, 10/11, 0/1) */
  /* and then updates every earlier pair with that result, using the BO values that precede the scaling pair. */
  /* NMSUB, MADD5..MADD8 and MUL are macros defined outside this chunk. */
  /* NOTE(review): the 30/20/10/0 offsets look like the diagonal of a 4x4 block stored 8 values per row - confirm. */
  1236. #ifdef RT
  1237. LD b1, 30 * SIZE(BO)
  1238. LD b2, 31 * SIZE(BO)
  1239. LD b3, 28 * SIZE(BO)
  1240. LD b4, 29 * SIZE(BO)
  1241. LD b5, 26 * SIZE(BO)
  1242. LD b6, 27 * SIZE(BO)
  1243. LD b7, 24 * SIZE(BO)
  1244. LD b8, 25 * SIZE(BO)
  /* Stage 1: scale (c71, c72) by (b1, b2), then update c5x, c3x and c1x with it. */
  1245. MUL a1, b2, c72
  1246. MUL a2, b2, c71
  1247. MADD5 c71, a1, b1, c71
  1248. MADD6 c72, a2, b1, c72
  1249. NMSUB c51, c51, b3, c71
  1250. MADD7 c52, c52, b4, c71
  1251. NMSUB c31, c31, b5, c71
  1252. MADD7 c32, c32, b6, c71
  1253. NMSUB c11, c11, b7, c71
  1254. MADD7 c12, c12, b8, c71
  1255. MADD8 c51, c51, b4, c72
  1256. NMSUB c52, c52, b3, c72
  1257. MADD8 c31, c31, b6, c72
  1258. NMSUB c32, c32, b5, c72
  1259. MADD8 c11, c11, b8, c72
  1260. NMSUB c12, c12, b7, c72
  /* Stage 2: scale (c51, c52) by BO[20], BO[21], then update c3x and c1x. */
  1261. LD b3, 20 * SIZE(BO)
  1262. LD b4, 21 * SIZE(BO)
  1263. LD b5, 18 * SIZE(BO)
  1264. LD b6, 19 * SIZE(BO)
  1265. LD b7, 16 * SIZE(BO)
  1266. LD b8, 17 * SIZE(BO)
  1267. MUL a1, b4, c52
  1268. MUL a2, b4, c51
  1269. MADD5 c51, a1, b3, c51
  1270. MADD6 c52, a2, b3, c52
  1271. NMSUB c31, c31, b5, c51
  1272. MADD7 c32, c32, b6, c51
  1273. NMSUB c11, c11, b7, c51
  1274. MADD7 c12, c12, b8, c51
  1275. MADD8 c31, c31, b6, c52
  1276. NMSUB c32, c32, b5, c52
  1277. MADD8 c11, c11, b8, c52
  1278. NMSUB c12, c12, b7, c52
  /* Stage 3: scale (c31, c32) by BO[10], BO[11], then update c1x. */
  1279. LD b5, 10 * SIZE(BO)
  1280. LD b6, 11 * SIZE(BO)
  1281. LD b7, 8 * SIZE(BO)
  1282. LD b8, 9 * SIZE(BO)
  1283. MUL a1, b6, c32
  1284. MUL a2, b6, c31
  1285. MADD5 c31, a1, b5, c31
  1286. MADD6 c32, a2, b5, c32
  1287. NMSUB c11, c11, b7, c31
  1288. MADD7 c12, c12, b8, c31
  1289. MADD8 c11, c11, b8, c32
  1290. NMSUB c12, c12, b7, c32
  /* Stage 4: scale (c11, c12) by BO[0], BO[1]; nothing is left to update. */
  1291. LD b7, 0 * SIZE(BO)
  1292. LD b8, 1 * SIZE(BO)
  1293. MUL a1, b8, c12
  1294. MUL a2, b8, c11
  1295. MADD5 c11, a1, b7, c11
  1296. MADD6 c12, a2, b7, c12
  1297. #endif
  /* Write the eight solved values back into the packed buffer they were loaded from: */
  /* BO for LN/LT builds, AO for all other builds. */
  1298. #if defined(LN) || defined(LT)
  1299. ST c11, 0 * SIZE(BO)
  1300. ST c12, 1 * SIZE(BO)
  1301. ST c31, 2 * SIZE(BO)
  1302. ST c32, 3 * SIZE(BO)
  1303. ST c51, 4 * SIZE(BO)
  1304. ST c52, 5 * SIZE(BO)
  1305. ST c71, 6 * SIZE(BO)
  1306. ST c72, 7 * SIZE(BO)
  1307. #else
  1308. ST c11, 0 * SIZE(AO)
  1309. ST c12, 1 * SIZE(AO)
  1310. ST c31, 2 * SIZE(AO)
  1311. ST c32, 3 * SIZE(AO)
  1312. ST c51, 4 * SIZE(AO)
  1313. ST c52, 5 * SIZE(AO)
  1314. ST c71, 6 * SIZE(AO)
  1315. ST c72, 7 * SIZE(AO)
  1316. #endif
  /* LN builds step the four output pointers back by 2*SIZE before the store. */
  1317. #ifdef LN
  1318. daddiu CO1,CO1, -2 * SIZE
  1319. daddiu CO2,CO2, -2 * SIZE
  1320. daddiu CO3,CO3, -2 * SIZE
  1321. daddiu CO4,CO4, -2 * SIZE
  1322. #endif
  /* Store one pair of values through each of CO1..CO4. */
  1323. ST c11, 0 * SIZE(CO1)
  1324. ST c12, 1 * SIZE(CO1)
  1325. ST c31, 0 * SIZE(CO2)
  1326. ST c32, 1 * SIZE(CO2)
  1327. ST c51, 0 * SIZE(CO3)
  1328. ST c52, 1 * SIZE(CO3)
  1329. ST c71, 0 * SIZE(CO4)
  1330. ST c72, 1 * SIZE(CO4)
  /* All other builds step the output pointers forward by 2*SIZE after the store. */
  1331. #ifndef LN
  1332. daddiu CO1,CO1, 2 * SIZE
  1333. daddiu CO2,CO2, 2 * SIZE
  1334. daddiu CO3,CO3, 2 * SIZE
  1335. daddiu CO4,CO4, 2 * SIZE
  1336. #endif
  /* RT builds: AORIG += K << ZBASE_SHIFT. */
  1337. #ifdef RT
  1338. dsll TEMP, K, ZBASE_SHIFT
  1339. daddu AORIG, AORIG, TEMP
  1340. #endif
  /* LT/RN builds: advance AO and BO by the K - KK steps that the k-loop did not consume; */
  /* AO += (K - KK) << ZBASE_SHIFT, BO += (K - KK) << (2 + ZBASE_SHIFT). */
  1341. #if defined(LT) || defined(RN)
  1342. dsubu TEMP, K, KK
  1343. dsll L, TEMP, ZBASE_SHIFT
  1344. dsll TEMP, TEMP, 2 + ZBASE_SHIFT
  1345. daddu AO, AO, L
  1346. daddu BO, BO, TEMP
  1347. #endif
  /* KK moves by one per pass: +1 for LT, -1 for LN. */
  1348. #ifdef LT
  1349. daddiu KK, KK, 1
  1350. #endif
  1351. #ifdef LN
  1352. daddiu KK, KK, -1
  1353. #endif
  /* Reset accumulators for the next pass: c11 is loaded from integer register $0, then copied to c21..c61. */
  /* MTC and MOV are macros defined outside this chunk; presumably MTC $0 yields floating-point zero. */
  /* NOTE(review): c71, c81 and the cX2 accumulators are not reset here; presumably .L11 (outside this chunk) does it - confirm. */
  1354. MTC $0, c11
  1355. daddiu I, I, -1
  1356. MOV c21, c11
  1357. MOV c31, c11
  1358. MOV c41, c11
  1359. MOV c51, c11
  /* Loop back to .L11 while I > 0; the MOV after the branch sits in its delay slot. */
  /* NOTE(review): this relies on .set noreorder, which is not visible in this chunk - confirm. */
  1360. bgtz I, .L11
  1361. MOV c61, c11
  /* .L19: bookkeeping after the I loop, before the next pass of the J loop. */
  1362. .align 3
  1363. .L19:
  /* LN builds: B += K << (2 + ZBASE_SHIFT). */
  1364. #ifdef LN
  1365. dsll TEMP, K, 2 + ZBASE_SHIFT
  1366. daddu B, B, TEMP
  1367. #endif
  /* LT/RN builds: B takes over the current value of BO. */
  1368. #if defined(LT) || defined(RN)
  1369. move B, BO
  1370. #endif
  /* KK moves by four per pass: +4 for RN, -4 for RT. */
  1371. #ifdef RN
  1372. daddiu KK, KK, 4
  1373. #endif
  1374. #ifdef RT
  1375. daddiu KK, KK, -4
  1376. #endif
  /* Loop back to .L10 while J > 0. J is not modified in the visible lines; */
  /* NOTE(review): presumably it is decremented at .L10, outside this chunk - confirm. */
  1377. bgtz J, .L10
  1378. NOP
  /* .L999: common exit. Reloads saved registers from the stack frame and returns to the caller. */
  1379. .align 3
  1380. .L999:
  /* Integer registers $16..$21 come back from offsets 0..40 of the frame. */
  /* LDARG is a macro defined outside this chunk. */
  1381. LDARG $16, 0($sp)
  1382. LDARG $17, 8($sp)
  1383. LDARG $18, 16($sp)
  1384. LDARG $19, 24($sp)
  1385. LDARG $20, 32($sp)
  1386. LDARG $21, 40($sp)
  /* Floating-point registers $f24..$f27 come back from offsets 48..72. */
  1387. ldc1 $f24, 48($sp)
  1388. ldc1 $f25, 56($sp)
  1389. ldc1 $f26, 64($sp)
  1390. ldc1 $f27, 72($sp)
  /* Builds without __64BIT__ also reload $f20..$f23 from offsets 88..112. */
  1391. #ifndef __64BIT__
  1392. ldc1 $f20, 88($sp)
  1393. ldc1 $f21, 96($sp)
  1394. ldc1 $f22,104($sp)
  1395. ldc1 $f23,112($sp)
  1396. #endif
  /* Return through $31; the 128-byte frame is released by the daddiu in the jump's delay slot. */
  /* NOTE(review): this relies on .set noreorder, which is not visible in this chunk - confirm. */
  1397. j $31
  1398. daddiu $sp, $sp, 128
  /* EPILOGUE is a macro defined outside this chunk. */
  1399. EPILOGUE