You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_RN_loongson3a.S 35 kB


  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h" /* only include in this file; the macros used below (PROLOGUE, SDARG, LDARG, LD, ST, MADD, NMSUB, ...) are not defined here, so presumably come from it */
  /* Integer register aliases. Roles are described as they are used in the code that follows. */
  4. #define M $4 /* row count: row tiles are processed as M / 4, then M & 2, then M & 1 */
  5. #define N $5 /* column count: column panels are processed as N / 4, then N & 2, then N & 1 */
  6. #define K $6 /* panel depth: K - KK is the part of a panel skipped after each tile is solved */
  7. #define A $8 /* base of packed A; AO is reset to it at the start of every column panel */
  8. #define B $9 /* start of the current packed B panel; moved forward to BO when a panel is finished */
  9. #define C $10 /* start of the current output columns; advanced by LDC per column handled */
  10. #define LDC $11 /* column stride of C; shifted left by BASE_SHIFT once at entry */
  11. #define AO $12 /* running pointer into packed A */
  12. #define BO $13 /* running pointer into packed B */
  13. #define I $2 /* row-tile loop counter */
  14. #define J $3 /* column-panel loop counter */
  15. #define L $7 /* inner (depth) loop counter; also reused as a scratch byte offset */
  16. #define CO1 $14 /* write pointer, column 1 of the current panel */
  17. #define CO2 $15 /* write pointer, column 2 (CO1 + LDC) */
  18. #define CO3 $16 /* write pointer, column 3 */
  19. #define CO4 $17 /* write pointer, column 4 */
  20. #define OFFSET $22 /* loaded from 144($sp) right after the frame is set up */
  21. #define KK $23 /* starts as -OFFSET; grows by the panel width after each panel; sets the depth-loop trip count */
  22. #define TEMP $24 /* scratch, holds K - KK and the derived byte offset for BO */
  23. #define AORIG $25 /* not referenced in the part of the file visible here */
  /* Floating-point aliases: a1..a8 are loaded from AO, b1..b8 from B/BO (b* are also reused as temporaries in the solve sections). */
  24. #define a1 $f0
  25. #define a2 $f1
  26. #define a3 $f26
  27. #define a4 $f27
  28. #define a5 $f28
  29. #define a6 $f29
  30. #define a7 $f30
  31. #define a8 $f31
  32. #define b1 $f2
  33. #define b2 $f3
  34. #define b3 $f4
  35. #define b4 $f5
  36. #define b5 $f6
  37. #define b6 $f7
  38. #define b7 $f8
  39. #define b8 $f9
  /* Accumulators tRC: R is the row inside the tile, C is the column (t21 is stored to 1 * SIZE(CO1), t12 to 0 * SIZE(CO2)). */
  40. #define t11 $f10
  41. #define t21 $f11
  42. #define t31 $f12
  43. #define t41 $f13
  44. #define t12 $f14
  45. #define t22 $f15
  46. #define t32 $f16
  47. #define t42 $f17
  48. #define t13 $f18
  49. #define t23 $f19
  50. #define t33 $f20
  51. #define t43 $f21
  52. #define t14 $f22
  53. #define t24 $f23
  54. #define t34 $f24
  55. #define t44 $f25
  # Function entry: reserve a 144-byte frame, spill registers, then set up the column-panel loop.
  # NOTE(review): a6, a7, a8 are $f29, $f30, $f31 and are loaded in the loops below, but only $f24-$f28 are stored here; check the ABI in use and the epilogue (not visible in this chunk).
  56. PROLOGUE
  57. daddiu $sp, $sp, -144 # allocate the 144-byte save area
  58. SDARG $16, 0($sp)
  59. SDARG $17, 8($sp)
  60. SDARG $18, 16($sp)
  61. SDARG $19, 24($sp)
  62. SDARG $20, 32($sp)
  63. SDARG $21, 40($sp)
  64. sdc1 $f24, 48($sp)
  65. sdc1 $f25, 56($sp)
  66. sdc1 $f26, 64($sp)
  67. sdc1 $f27, 72($sp)
  68. sdc1 $f28, 80($sp)
  69. SDARG $22, 88($sp)
  70. SDARG $23, 96($sp)
  71. SDARG $24, 104($sp)
  72. SDARG $25, 112($sp)
  73. #ifndef __64BIT__
  74. sdc1 $f20,112($sp) # NOTE(review): same offset as the $25 slot stored just above; when this branch is compiled in, that slot is overwritten - confirm intended layout
  75. sdc1 $f21,120($sp)
  76. sdc1 $f22,128($sp)
  77. sdc1 $f23,136($sp)
  78. #endif
  79. # RN compute from top to bottom left to right
  80. .align 3
  81. LDARG OFFSET, 144($sp) # get the last parameter: first slot above the 144-byte frame
  82. dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte: turn the column stride into a byte stride
  83. neg KK, OFFSET # KK = -OFFSET (original note: for RN OFFSET always 0)
  84. dsra J, N, 2 # J = NC/4: number of full 4-column panels
  85. blez J, .L30 # no full 4-column panel: go to the N & 2 tail
  86. NOP # branch delay slot
  87. .L10:
  88. daddiu J, J, -1
  89. move CO1, C
  90. daddu CO2, C, LDC
  91. daddu CO3, CO2, LDC
  92. daddu CO4, CO3, LDC
  93. move AO, A # A is the retangular matrix and B is the trigular matrix
  94. daddu C, CO4, LDC # Fixed pointer C
  95. dsra I, M, 2 # I=MC/4
  96. blez I, .L20
  97. NOP
  98. .align 3
  99. .L11:
  100. MTC $0, t11 # clear results registers
  101. MOV t21, t11
  102. MOV t31, t11
  103. MOV t41, t11
  104. MOV t12, t11
  105. MOV t22, t11
  106. MOV t32, t11
  107. MOV t42, t11
  108. MOV t13, t11
  109. MOV t23, t11
  110. MOV t33, t11
  111. MOV t43, t11
  112. MOV t14, t11
  113. MOV t24, t11
  114. MOV t34, t11
  115. MOV t44, t11
  116. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  117. LD a2, 1 * SIZE(AO) # get 4 a
  118. LD a3, 2 * SIZE(AO)
  119. LD a4, 3 * SIZE(AO)
  120. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  121. LD b2, 1 * SIZE(B) # get 4 b
  122. LD b3, 2 * SIZE(B)
  123. LD b4, 3 * SIZE(B)
  124. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  125. blez L, .L15
  126. move BO, B # reset B
  127. .L12:
  128. LD a5, 4 * SIZE(AO)
  129. LD a6, 5 * SIZE(AO)
  130. LD a7, 6 * SIZE(AO)
  131. LD a8, 7 * SIZE(AO)
  132. LD b5, 4 * SIZE(BO)
  133. LD b6, 5 * SIZE(BO)
  134. LD b7, 6 * SIZE(BO)
  135. LD b8, 7 * SIZE(BO)
  136. MADD t11, t11, a1, b1
  137. MADD t21, t21, a2, b1
  138. MADD t31, t31, a3, b1
  139. MADD t41, t41, a4, b1
  140. MADD t12, t12, a1, b2
  141. MADD t22, t22, a2, b2
  142. MADD t32, t32, a3, b2
  143. MADD t42, t42, a4, b2
  144. MADD t13, t13, a1, b3
  145. MADD t23, t23, a2, b3
  146. MADD t33, t33, a3, b3
  147. MADD t43, t43, a4, b3
  148. MADD t14, t14, a1, b4
  149. MADD t24, t24, a2, b4
  150. MADD t34, t34, a3, b4
  151. MADD t44, t44, a4, b4 # fisrt
  152. LD a1, 8 * SIZE(AO)
  153. LD a2, 9 * SIZE(AO)
  154. LD a3, 10 * SIZE(AO)
  155. LD a4, 11 * SIZE(AO)
  156. LD b1, 8 * SIZE(BO)
  157. LD b2, 9 * SIZE(BO)
  158. LD b3, 10 * SIZE(BO)
  159. LD b4, 11 * SIZE(BO)
  160. MADD t11, t11, a5, b5
  161. MADD t21, t21, a6, b5
  162. MADD t31, t31, a7, b5
  163. MADD t41, t41, a8, b5
  164. MADD t12, t12, a5, b6
  165. MADD t22, t22, a6, b6
  166. MADD t32, t32, a7, b6
  167. MADD t42, t42, a8, b6
  168. MADD t13, t13, a5, b7
  169. MADD t23, t23, a6, b7
  170. MADD t33, t33, a7, b7
  171. MADD t43, t43, a8, b7
  172. MADD t14, t14, a5, b8
  173. MADD t24, t24, a6, b8
  174. MADD t34, t34, a7, b8
  175. MADD t44, t44, a8, b8 # second
  176. LD a5, 12 * SIZE(AO)
  177. LD a6, 13 * SIZE(AO)
  178. LD a7, 14 * SIZE(AO)
  179. LD a8, 15 * SIZE(AO)
  180. LD b5, 12 * SIZE(BO)
  181. LD b6, 13 * SIZE(BO)
  182. LD b7, 14 * SIZE(BO)
  183. LD b8, 15 * SIZE(BO)
  184. MADD t11, t11, a1, b1
  185. MADD t21, t21, a2, b1
  186. MADD t31, t31, a3, b1
  187. MADD t41, t41, a4, b1
  188. MADD t12, t12, a1, b2
  189. MADD t22, t22, a2, b2
  190. MADD t32, t32, a3, b2
  191. MADD t42, t42, a4, b2
  192. MADD t13, t13, a1, b3
  193. MADD t23, t23, a2, b3
  194. MADD t33, t33, a3, b3
  195. MADD t43, t43, a4, b3
  196. MADD t14, t14, a1, b4
  197. MADD t24, t24, a2, b4
  198. MADD t34, t34, a3, b4
  199. MADD t44, t44, a4, b4 # third
  200. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  201. daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
  202. LD a1, 0 * SIZE(AO)
  203. LD a2, 1 * SIZE(AO)
  204. LD a3, 2 * SIZE(AO)
  205. LD a4, 3 * SIZE(AO)
  206. LD b1, 0 * SIZE(BO)
  207. LD b2, 1 * SIZE(BO)
  208. LD b3, 2 * SIZE(BO)
  209. LD b4, 3 * SIZE(BO)
  210. MADD t11, t11, a5, b5
  211. MADD t21, t21, a6, b5
  212. MADD t31, t31, a7, b5
  213. MADD t41, t41, a8, b5
  214. MADD t12, t12, a5, b6
  215. MADD t22, t22, a6, b6
  216. MADD t32, t32, a7, b6
  217. MADD t42, t42, a8, b6
  218. MADD t13, t13, a5, b7
  219. MADD t23, t23, a6, b7
  220. MADD t33, t33, a7, b7
  221. MADD t43, t43, a8, b7
  222. MADD t14, t14, a5, b8
  223. MADD t24, t24, a6, b8
  224. MADD t34, t34, a7, b8
  225. MADD t44, t44, a8, b8 # fouth
  226. daddiu L, L, -1
  227. bgtz L, .L12
  228. NOP
  229. .L15:
  230. andi L, KK, 3 # deal with kc remainder part
  231. blez L, .L18
  232. NOP
  233. .align 3
  234. .L16:
  235. MADD t11, t11, a1, b1
  236. MADD t21, t21, a2, b1
  237. MADD t31, t31, a3, b1
  238. MADD t41, t41, a4, b1
  239. MADD t12, t12, a1, b2
  240. MADD t22, t22, a2, b2
  241. MADD t32, t32, a3, b2
  242. MADD t42, t42, a4, b2
  243. MADD t13, t13, a1, b3
  244. MADD t23, t23, a2, b3
  245. MADD t33, t33, a3, b3
  246. MADD t43, t43, a4, b3
  247. MADD t14, t14, a1, b4
  248. MADD t24, t24, a2, b4
  249. MADD t34, t34, a3, b4
  250. MADD t44, t44, a4, b4
  251. daddiu AO, AO, 4 * SIZE # AO += 4mr
  252. daddiu BO, BO, 4 * SIZE # BP += 4nr
  253. LD a1, 0 * SIZE(AO)
  254. LD a2, 1 * SIZE(AO)
  255. LD a3, 2 * SIZE(AO)
  256. LD a4, 3 * SIZE(AO)
  257. LD b1, 0 * SIZE(BO)
  258. LD b2, 1 * SIZE(BO)
  259. LD b3, 2 * SIZE(BO)
  260. LD b4, 3 * SIZE(BO)
  261. daddiu L, L, -1
  262. bgtz L, .L16
  263. NOP
  264. .align 3
  265. .L18: # 4-row x 4-column tile: solve against the triangular part of the B panel
  # Step 1: reload the 16 packed A values at AO and subtract the accumulated products (t = packed value - t).
  # NOTE(review): operand meaning of SUB / MUL / NMSUB is taken from their use here; the macros themselves are defined outside this file - confirm.
  266. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  267. LD b2, 1 * SIZE(AO) # Fixed results
  268. LD b3, 2 * SIZE(AO)
  269. LD b4, 3 * SIZE(AO) # sa stored as col major: 4 rows of column 1 first
  270. SUB t11, b1, t11
  271. SUB t21, b2, t21
  272. SUB t31, b3, t31
  273. SUB t41, b4, t41
  274. LD b5, 4 * SIZE(AO) # column 2 of the tile
  275. LD b6, 5 * SIZE(AO)
  276. LD b7, 6 * SIZE(AO)
  277. LD b8, 7 * SIZE(AO)
  278. SUB t12, b5, t12
  279. SUB t22, b6, t22
  280. SUB t32, b7, t32
  281. SUB t42, b8, t42
  282. LD b1, 8 * SIZE(AO) # column 3 of the tile
  283. LD b2, 9 * SIZE(AO)
  284. LD b3, 10 * SIZE(AO)
  285. LD b4, 11 * SIZE(AO)
  286. SUB t13, b1, t13
  287. SUB t23, b2, t23
  288. SUB t33, b3, t33
  289. SUB t43, b4, t43
  290. LD b5, 12 * SIZE(AO) # column 4 of the tile
  291. LD b6, 13 * SIZE(AO)
  292. LD b7, 14 * SIZE(AO)
  293. LD b8, 15 * SIZE(AO)
  294. SUB t14, b5, t14
  295. SUB t24, b6, t24
  296. SUB t34, b7, t34
  297. SUB t44, b8, t44
  # Step 2: column-by-column substitution using B entries 0..3, 5..7, 10..11 and 15 of the 4x4 block at BO.
  # The diagonal entries (0, 5, 10, 15) are applied with MUL, so the packed B presumably holds reciprocals there - confirm against the packing routine.
  298. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  299. LD b2, 1 * SIZE(BO)
  300. LD b3, 2 * SIZE(BO)
  301. LD b4, 3 * SIZE(BO)
  302. MUL t11, b1, t11 # finish column 1
  303. MUL t21, b1, t21
  304. MUL t31, b1, t31
  305. MUL t41, b1, t41
  306. NMSUB t12, t12, b2, t11 # remove the column 1 contribution from column 2
  307. NMSUB t22, t22, b2, t21
  308. NMSUB t32, t32, b2, t31
  309. NMSUB t42, t42, b2, t41
  310. NMSUB t13, t13, b3, t11 # ... from column 3
  311. NMSUB t23, t23, b3, t21
  312. NMSUB t33, t33, b3, t31
  313. NMSUB t43, t43, b3, t41
  314. NMSUB t14, t14, b4, t11 # ... from column 4
  315. NMSUB t24, t24, b4, t21
  316. NMSUB t34, t34, b4, t31
  317. NMSUB t44, t44, b4, t41
  318. LD b5, 5 * SIZE(BO)
  319. LD b6, 6 * SIZE(BO)
  320. LD b7, 7 * SIZE(BO)
  321. MUL t12, b5, t12 # finish column 2
  322. MUL t22, b5, t22
  323. MUL t32, b5, t32
  324. MUL t42, b5, t42
  325. NMSUB t13, t13, b6, t12 # remove the column 2 contribution from column 3
  326. NMSUB t23, t23, b6, t22
  327. NMSUB t33, t33, b6, t32
  328. NMSUB t43, t43, b6, t42
  329. NMSUB t14, t14, b7, t12 # ... from column 4
  330. NMSUB t24, t24, b7, t22
  331. NMSUB t34, t34, b7, t32
  332. NMSUB t44, t44, b7, t42
  333. LD b8, 10 * SIZE(BO)
  334. LD b1, 11 * SIZE(BO)
  335. MUL t13, b8, t13 # finish column 3
  336. MUL t23, b8, t23
  337. MUL t33, b8, t33
  338. MUL t43, b8, t43
  339. NMSUB t14, t14, b1, t13 # remove the column 3 contribution from column 4
  340. NMSUB t24, t24, b1, t23
  341. NMSUB t34, t34, b1, t33
  342. NMSUB t44, t44, b1, t43
  343. LD b2, 15 * SIZE(BO)
  344. MUL t14, b2, t14 # finish column 4
  345. MUL t24, b2, t24
  346. MUL t34, b2, t34
  347. MUL t44, b2, t44
  # Step 3: store the solved tile twice - back into packed A at AO and out to the four C columns.
  348. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  349. ST t21, 1 * SIZE(AO)
  350. ST t31, 2 * SIZE(AO)
  351. ST t41, 3 * SIZE(AO)
  352. ST t12, 4 * SIZE(AO)
  353. ST t22, 5 * SIZE(AO)
  354. ST t32, 6 * SIZE(AO)
  355. ST t42, 7 * SIZE(AO)
  356. ST t13, 8 * SIZE(AO)
  357. ST t23, 9 * SIZE(AO)
  358. ST t33, 10 * SIZE(AO)
  359. ST t43, 11 * SIZE(AO)
  360. ST t14, 12 * SIZE(AO)
  361. ST t24, 13 * SIZE(AO)
  362. ST t34, 14 * SIZE(AO)
  363. ST t44, 15 * SIZE(AO)
  364. ST t11, 0 * SIZE(CO1) # write back results
  365. ST t21, 1 * SIZE(CO1)
  366. ST t31, 2 * SIZE(CO1)
  367. ST t41, 3 * SIZE(CO1)
  368. ST t12, 0 * SIZE(CO2)
  369. ST t22, 1 * SIZE(CO2)
  370. ST t32, 2 * SIZE(CO2)
  371. ST t42, 3 * SIZE(CO2)
  372. ST t13, 0 * SIZE(CO3)
  373. ST t23, 1 * SIZE(CO3)
  374. ST t33, 2 * SIZE(CO3)
  375. ST t43, 3 * SIZE(CO3)
  376. ST t14, 0 * SIZE(CO4)
  377. ST t24, 1 * SIZE(CO4)
  378. ST t34, 2 * SIZE(CO4)
  379. ST t44, 3 * SIZE(CO4)
  380. daddiu CO1, CO1, 4 * SIZE # step the C pointers down 4 rows
  381. daddiu CO2, CO2, 4 * SIZE
  382. daddiu CO3, CO3, 4 * SIZE
  383. daddiu CO4, CO4, 4 * SIZE
  # Step 4: skip the remaining K - KK depth steps of this tile in both packed buffers.
  384. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  385. dsll L, TEMP, 2 + BASE_SHIFT # byte offset for A: (K - KK) * 4 rows * element size
  386. dsll TEMP, TEMP, 2 + BASE_SHIFT # byte offset for B: (K - KK) * 4 columns * element size
  387. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  388. daddu BO, BO, TEMP # move BO to the end of this panel
  389. daddiu I, I, -1
  390. bgtz I, .L11 # next 4-row tile
  391. NOP # branch delay slot
  392. .align 3
  393. .L20:
  394. andi I, M, 2 # mr=2
  395. blez I, .L50
  396. nop
  397. MTC $0, t11 # clear results registers
  398. MOV t21, t11
  399. MOV t31, t11
  400. MOV t41, t11
  401. MOV t12, t11
  402. MOV t22, t11
  403. MOV t32, t11
  404. MOV t42, t11
  405. MOV t13, t11
  406. MOV t23, t11
  407. MOV t33, t11
  408. MOV t43, t11
  409. MOV t14, t11
  410. MOV t24, t11
  411. MOV t34, t11
  412. MOV t44, t11
  413. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  414. LD a2, 1 * SIZE(AO) # get 4 a
  415. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  416. LD b2, 1 * SIZE(B) # get 4 b
  417. LD b3, 2 * SIZE(B)
  418. LD b4, 3 * SIZE(B)
  419. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  420. blez L, .L25
  421. move BO, B # reset B
  422. .L22:
  423. LD a5, 2 * SIZE(AO)
  424. LD a6, 3 * SIZE(AO)
  425. LD b5, 4 * SIZE(BO)
  426. LD b6, 5 * SIZE(BO)
  427. LD b7, 6 * SIZE(BO)
  428. LD b8, 7 * SIZE(BO)
  429. MADD t11, t11, a1, b1
  430. MADD t21, t21, a2, b1
  431. MADD t12, t12, a1, b2
  432. MADD t22, t22, a2, b2
  433. MADD t13, t13, a1, b3
  434. MADD t23, t23, a2, b3
  435. MADD t14, t14, a1, b4
  436. MADD t24, t24, a2, b4
  437. LD a3, 4 * SIZE(AO)
  438. LD a4, 5 * SIZE(AO)
  439. LD b1, 8 * SIZE(BO)
  440. LD b2, 9 * SIZE(BO)
  441. LD b3, 10 * SIZE(BO)
  442. LD b4, 11 * SIZE(BO)
  443. MADD t11, t11, a5, b5
  444. MADD t21, t21, a6, b5
  445. MADD t12, t12, a5, b6
  446. MADD t22, t22, a6, b6
  447. MADD t13, t13, a5, b7
  448. MADD t23, t23, a6, b7
  449. MADD t14, t14, a5, b8
  450. MADD t24, t24, a6, b8
  451. LD a7, 6 * SIZE(AO)
  452. LD a8, 7 * SIZE(AO)
  453. LD b5, 12 * SIZE(BO)
  454. LD b6, 13 * SIZE(BO)
  455. LD b7, 14 * SIZE(BO)
  456. LD b8, 15 * SIZE(BO)
  457. MADD t11, t11, a3, b1
  458. MADD t21, t21, a4, b1
  459. MADD t12, t12, a3, b2
  460. MADD t22, t22, a4, b2
  461. MADD t13, t13, a3, b3
  462. MADD t23, t23, a4, b3
  463. MADD t14, t14, a3, b4
  464. MADD t24, t24, a4, b4
  465. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  466. daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
  467. LD a1, 0 * SIZE(AO)
  468. LD a2, 1 * SIZE(AO)
  469. LD b1, 0 * SIZE(BO)
  470. LD b2, 1 * SIZE(BO)
  471. LD b3, 2 * SIZE(BO)
  472. LD b4, 3 * SIZE(BO)
  473. MADD t11, t11, a7, b5
  474. MADD t21, t21, a8, b5
  475. MADD t12, t12, a7, b6
  476. MADD t22, t22, a8, b6
  477. MADD t13, t13, a7, b7
  478. MADD t23, t23, a8, b7
  479. MADD t14, t14, a7, b8
  480. MADD t24, t24, a8, b8
  481. daddiu L, L, -1
  482. bgtz L, .L22
  483. NOP
  484. .L25:
  485. andi L, KK, 3 # deal with kc remainder part
  486. blez L, .L28
  487. NOP
  488. .align 3
  489. .L26:
  490. MADD t11, t11, a1, b1
  491. MADD t21, t21, a2, b1
  492. MADD t12, t12, a1, b2
  493. MADD t22, t22, a2, b2
  494. MADD t13, t13, a1, b3
  495. MADD t23, t23, a2, b3
  496. MADD t14, t14, a1, b4
  497. MADD t24, t24, a2, b4
  498. daddiu AO, AO, 2 * SIZE # AO += 2mr
  499. daddiu BO, BO, 4 * SIZE # BP += 4nr
  500. LD a1, 0 * SIZE(AO)
  501. LD a2, 1 * SIZE(AO)
  502. LD b1, 0 * SIZE(BO)
  503. LD b2, 1 * SIZE(BO)
  504. LD b3, 2 * SIZE(BO)
  505. LD b4, 3 * SIZE(BO)
  506. daddiu L, L, -1
  507. bgtz L, .L26
  508. NOP
  509. .align 3
  510. .L28: # 2-row x 4-column tile: solve against the triangular part of the B panel
  # Reload the 8 packed A values at AO (2 rows per column) and subtract the accumulated products.
  # NOTE(review): operand meaning of SUB / MUL / NMSUB is taken from their use here; the macros are defined outside this file - confirm.
  511. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  512. LD b2, 1 * SIZE(AO) # Fixed results
  513. SUB t11, b1, t11
  514. SUB t21, b2, t21
  515. LD b5, 2 * SIZE(AO) # column 2 of the tile
  516. LD b6, 3 * SIZE(AO)
  517. SUB t12, b5, t12
  518. SUB t22, b6, t22
  519. LD b3, 4 * SIZE(AO) # column 3 of the tile
  520. LD b4, 5 * SIZE(AO)
  521. SUB t13, b3, t13
  522. SUB t23, b4, t23
  523. LD b7, 6 * SIZE(AO) # column 4 of the tile
  524. LD b8, 7 * SIZE(AO)
  525. SUB t14, b7, t14
  526. SUB t24, b8, t24
  # Column-by-column substitution with B entries 0..3, 5..7, 10..11 and 15 of the 4x4 block at BO.
  527. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  528. LD b2, 1 * SIZE(BO)
  529. LD b3, 2 * SIZE(BO)
  530. LD b4, 3 * SIZE(BO)
  531. MUL t11, b1, t11 # finish column 1
  532. MUL t21, b1, t21
  533. NMSUB t12, t12, b2, t11 # remove the column 1 contribution from columns 2..4
  534. NMSUB t22, t22, b2, t21
  535. NMSUB t13, t13, b3, t11
  536. NMSUB t23, t23, b3, t21
  537. NMSUB t14, t14, b4, t11
  538. NMSUB t24, t24, b4, t21
  539. LD b5, 5 * SIZE(BO)
  540. LD b6, 6 * SIZE(BO)
  541. LD b7, 7 * SIZE(BO)
  542. MUL t12, b5, t12 # finish column 2
  543. MUL t22, b5, t22
  544. NMSUB t13, t13, b6, t12 # remove the column 2 contribution from columns 3..4
  545. NMSUB t23, t23, b6, t22
  546. NMSUB t14, t14, b7, t12
  547. NMSUB t24, t24, b7, t22
  548. LD b8, 10 * SIZE(BO)
  549. LD b1, 11 * SIZE(BO)
  550. MUL t13, b8, t13 # finish column 3
  551. MUL t23, b8, t23
  552. NMSUB t14, t14, b1, t13 # remove the column 3 contribution from column 4
  553. NMSUB t24, t24, b1, t23
  554. LD b2, 15 * SIZE(BO)
  555. MUL t14, b2, t14 # finish column 4
  556. MUL t24, b2, t24
  557. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  558. ST t21, 1 * SIZE(AO)
  559. ST t12, 2 * SIZE(AO)
  560. ST t22, 3 * SIZE(AO)
  561. ST t13, 4 * SIZE(AO)
  562. ST t23, 5 * SIZE(AO)
  563. ST t14, 6 * SIZE(AO)
  564. ST t24, 7 * SIZE(AO)
  565. ST t11, 0 * SIZE(CO1) # write back results
  566. ST t21, 1 * SIZE(CO1)
  567. ST t12, 0 * SIZE(CO2)
  568. ST t22, 1 * SIZE(CO2)
  569. ST t13, 0 * SIZE(CO3)
  570. ST t23, 1 * SIZE(CO3)
  571. ST t14, 0 * SIZE(CO4)
  572. ST t24, 1 * SIZE(CO4)
  573. daddiu CO1, CO1, 2 * SIZE # step the C pointers down 2 rows
  574. daddiu CO2, CO2, 2 * SIZE # mr=2
  575. daddiu CO3, CO3, 2 * SIZE
  576. daddiu CO4, CO4, 2 * SIZE
  577. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  578. dsll L, TEMP, 1 + BASE_SHIFT # mr=2: byte offset for A is (K - KK) * 2 rows * element size
  579. dsll TEMP, TEMP, 2 + BASE_SHIFT # byte offset for B is (K - KK) * 4 columns * element size
  580. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  581. daddu BO, BO, TEMP # move BO to the end of this panel
  582. .align 3
  583. .L50:
  584. andi I, M, 1 # mr=1
  585. blez I, .L29
  586. nop
  587. MTC $0, t11 # clear results registers
  588. MOV t21, t11
  589. MOV t31, t11
  590. MOV t41, t11
  591. MOV t12, t11
  592. MOV t22, t11
  593. MOV t32, t11
  594. MOV t42, t11
  595. MOV t13, t11
  596. MOV t23, t11
  597. MOV t33, t11
  598. MOV t43, t11
  599. MOV t14, t11
  600. MOV t24, t11
  601. MOV t34, t11
  602. MOV t44, t11
  603. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  604. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  605. LD b2, 1 * SIZE(B) # get 4 b
  606. LD b3, 2 * SIZE(B)
  607. LD b4, 3 * SIZE(B)
  608. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  609. blez L, .L55
  610. move BO, B # reset B
  611. .L52:
  612. LD a5, 1 * SIZE(AO)
  613. LD b5, 4 * SIZE(BO)
  614. LD b6, 5 * SIZE(BO)
  615. LD b7, 6 * SIZE(BO)
  616. LD b8, 7 * SIZE(BO)
  617. MADD t11, t11, a1, b1
  618. MADD t12, t12, a1, b2
  619. MADD t13, t13, a1, b3
  620. MADD t14, t14, a1, b4
  621. LD a3, 2 * SIZE(AO)
  622. LD b1, 8 * SIZE(BO)
  623. LD b2, 9 * SIZE(BO)
  624. LD b3, 10 * SIZE(BO)
  625. LD b4, 11 * SIZE(BO)
  626. MADD t11, t11, a5, b5
  627. MADD t12, t12, a5, b6
  628. MADD t13, t13, a5, b7
  629. MADD t14, t14, a5, b8
  630. LD a7, 3 * SIZE(AO)
  631. LD b5, 12 * SIZE(BO)
  632. LD b6, 13 * SIZE(BO)
  633. LD b7, 14 * SIZE(BO)
  634. LD b8, 15 * SIZE(BO)
  635. MADD t11, t11, a3, b1
  636. MADD t12, t12, a3, b2
  637. MADD t13, t13, a3, b3
  638. MADD t14, t14, a3, b4
  639. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  640. daddiu BO, BO, 16 * SIZE # BP += 4nr*4kr
  641. LD a1, 0 * SIZE(AO)
  642. LD b1, 0 * SIZE(BO)
  643. LD b2, 1 * SIZE(BO)
  644. LD b3, 2 * SIZE(BO)
  645. LD b4, 3 * SIZE(BO)
  646. MADD t11, t11, a7, b5
  647. MADD t12, t12, a7, b6
  648. MADD t13, t13, a7, b7
  649. MADD t14, t14, a7, b8
  650. daddiu L, L, -1
  651. bgtz L, .L52
  652. NOP
  653. .L55:
  654. andi L, KK, 3 # deal with kc remainder part
  655. blez L, .L58
  656. NOP
  657. .align 3
  658. .L56:
  659. MADD t11, t11, a1, b1
  660. MADD t12, t12, a1, b2
  661. MADD t13, t13, a1, b3
  662. MADD t14, t14, a1, b4
  663. daddiu AO, AO, 1 * SIZE # AO += 1mr
  664. daddiu BO, BO, 4 * SIZE # BP += 4nr
  665. LD a1, 0 * SIZE(AO)
  666. LD b1, 0 * SIZE(BO)
  667. LD b2, 1 * SIZE(BO)
  668. LD b3, 2 * SIZE(BO)
  669. LD b4, 3 * SIZE(BO)
  670. daddiu L, L, -1
  671. bgtz L, .L56
  672. NOP
  673. .align 3
  674. .L58: # 1-row x 4-column tile: solve against the triangular part of the B panel
  # Reload the 4 packed A values at AO (one per column) and subtract the accumulated products.
  # NOTE(review): operand meaning of SUB / MUL / NMSUB is taken from their use here; the macros are defined outside this file - confirm.
  675. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  676. LD b5, 1 * SIZE(AO)
  677. LD b3, 2 * SIZE(AO)
  678. LD b7, 3 * SIZE(AO)
  679. SUB t11, b1, t11
  680. SUB t12, b5, t12
  681. SUB t13, b3, t13
  682. SUB t14, b7, t14
  683. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  684. LD b2, 1 * SIZE(BO)
  685. LD b3, 2 * SIZE(BO)
  686. LD b4, 3 * SIZE(BO)
  687. MUL t11, b1, t11 # finish column 1
  688. NMSUB t12, t12, b2, t11 # remove the column 1 contribution from columns 2..4
  689. NMSUB t13, t13, b3, t11
  690. NMSUB t14, t14, b4, t11
  691. LD b5, 5 * SIZE(BO)
  692. LD b6, 6 * SIZE(BO)
  693. LD b7, 7 * SIZE(BO)
  694. MUL t12, b5, t12 # finish column 2
  695. NMSUB t13, t13, b6, t12 # remove the column 2 contribution from columns 3..4
  696. NMSUB t14, t14, b7, t12
  697. LD b8, 10 * SIZE(BO)
  698. LD b1, 11 * SIZE(BO)
  699. MUL t13, b8, t13 # finish column 3
  700. NMSUB t14, t14, b1, t13 # remove the column 3 contribution from column 4
  701. LD b2, 15 * SIZE(BO)
  702. MUL t14, b2, t14 # finish column 4
  703. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  704. ST t12, 1 * SIZE(AO)
  705. ST t13, 2 * SIZE(AO)
  706. ST t14, 3 * SIZE(AO)
  707. ST t11, 0 * SIZE(CO1) # write back results
  708. ST t12, 0 * SIZE(CO2)
  709. ST t13, 0 * SIZE(CO3)
  710. ST t14, 0 * SIZE(CO4)
  711. daddiu CO1, CO1, 1 * SIZE # step the C pointers down 1 row
  712. daddiu CO2, CO2, 1 * SIZE # mr=1
  713. daddiu CO3, CO3, 1 * SIZE
  714. daddiu CO4, CO4, 1 * SIZE
  715. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  716. dsll L, TEMP, BASE_SHIFT # mr=1: byte offset for A is (K - KK) * 1 row * element size
  717. dsll TEMP, TEMP, 2 + BASE_SHIFT # byte offset for B is (K - KK) * 4 columns * element size
  718. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  719. daddu BO, BO, TEMP # move BO to the end of this panel
  720. .align 3
  721. .L29: # end of one 4-column panel
  722. move B, BO # change to next panel of Bj: BO was left at the end of the current panel
  723. daddiu KK, KK, 4 # rectangular data length increases by 4 (one panel width), so the depth loops of the next panel run 4 steps longer
  724. bgtz J, .L10 # J was already decremented at the top of .L10; loop while panels remain
  725. NOP # branch delay slot
  726. .align 3
  727. .L30:
  728. andi J, N, 2
  729. blez J, .L70
  730. nop
  731. move CO1, C
  732. daddu CO2, C, LDC
  733. move AO, A # A is the retangular matrix and B is the trigular matrix
  734. daddu C, CO2, LDC # Fixed pointer C
  735. dsra I, M, 2 # I=MC/4
  736. blez I, .L40
  737. NOP
  738. .align 3
  739. .L31:
  740. MTC $0, t11 # clear results registers
  741. MOV t21, t11
  742. MOV t31, t11
  743. MOV t41, t11
  744. MOV t12, t11
  745. MOV t22, t11
  746. MOV t32, t11
  747. MOV t42, t11
  748. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  749. LD a2, 1 * SIZE(AO) # get 4 a
  750. LD a3, 2 * SIZE(AO)
  751. LD a4, 3 * SIZE(AO)
  752. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  753. LD b2, 1 * SIZE(B) # get 4 b
  754. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  755. blez L, .L35
  756. move BO, B # reset B
  757. .L32:
  758. LD a5, 4 * SIZE(AO)
  759. LD a6, 5 * SIZE(AO)
  760. LD a7, 6 * SIZE(AO)
  761. LD a8, 7 * SIZE(AO)
  762. LD b5, 2 * SIZE(BO)
  763. LD b6, 3 * SIZE(BO)
  764. MADD t11, t11, a1, b1
  765. MADD t21, t21, a2, b1
  766. MADD t31, t31, a3, b1
  767. MADD t41, t41, a4, b1
  768. MADD t12, t12, a1, b2
  769. MADD t22, t22, a2, b2
  770. MADD t32, t32, a3, b2
  771. MADD t42, t42, a4, b2
  772. LD a1, 8 * SIZE(AO)
  773. LD a2, 9 * SIZE(AO)
  774. LD a3, 10 * SIZE(AO)
  775. LD a4, 11 * SIZE(AO)
  776. LD b3, 4 * SIZE(BO)
  777. LD b4, 5 * SIZE(BO)
  778. MADD t11, t11, a5, b5
  779. MADD t21, t21, a6, b5
  780. MADD t31, t31, a7, b5
  781. MADD t41, t41, a8, b5
  782. MADD t12, t12, a5, b6
  783. MADD t22, t22, a6, b6
  784. MADD t32, t32, a7, b6
  785. MADD t42, t42, a8, b6
  786. LD a5, 12 * SIZE(AO)
  787. LD a6, 13 * SIZE(AO)
  788. LD a7, 14 * SIZE(AO)
  789. LD a8, 15 * SIZE(AO)
  790. LD b7, 6 * SIZE(BO)
  791. LD b8, 7 * SIZE(BO)
  792. MADD t11, t11, a1, b3
  793. MADD t21, t21, a2, b3
  794. MADD t31, t31, a3, b3
  795. MADD t41, t41, a4, b3
  796. MADD t12, t12, a1, b4
  797. MADD t22, t22, a2, b4
  798. MADD t32, t32, a3, b4
  799. MADD t42, t42, a4, b4
  800. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  801. daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
  802. LD a1, 0 * SIZE(AO)
  803. LD a2, 1 * SIZE(AO)
  804. LD a3, 2 * SIZE(AO)
  805. LD a4, 3 * SIZE(AO)
  806. LD b1, 0 * SIZE(BO)
  807. LD b2, 1 * SIZE(BO)
  808. MADD t11, t11, a5, b7
  809. MADD t21, t21, a6, b7
  810. MADD t31, t31, a7, b7
  811. MADD t41, t41, a8, b7
  812. MADD t12, t12, a5, b8
  813. MADD t22, t22, a6, b8
  814. MADD t32, t32, a7, b8
  815. MADD t42, t42, a8, b8
  816. daddiu L, L, -1
  817. bgtz L, .L32
  818. NOP
  819. .L35:
  820. andi L, KK, 3 # deal with kc remainder part
  821. blez L, .L38
  822. NOP
  823. .align 3
  824. .L36:
  825. MADD t11, t11, a1, b1
  826. MADD t21, t21, a2, b1
  827. MADD t31, t31, a3, b1
  828. MADD t41, t41, a4, b1
  829. MADD t12, t12, a1, b2
  830. MADD t22, t22, a2, b2
  831. MADD t32, t32, a3, b2
  832. MADD t42, t42, a4, b2
  833. daddiu AO, AO, 4 * SIZE # AO += 4mr
  834. daddiu BO, BO, 2 * SIZE # BP += 2nr
  835. LD a1, 0 * SIZE(AO)
  836. LD a2, 1 * SIZE(AO)
  837. LD a3, 2 * SIZE(AO)
  838. LD a4, 3 * SIZE(AO)
  839. LD b1, 0 * SIZE(BO)
  840. LD b2, 1 * SIZE(BO)
  841. daddiu L, L, -1
  842. bgtz L, .L36
  843. NOP
  844. .align 3
  845. .L38: # .L38 always deal with the trigular data part
  846. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  847. LD b2, 1 * SIZE(AO) # Fixed results
  848. LD b3, 2 * SIZE(AO)
  849. LD b4, 3 * SIZE(AO) # sa stored as col major
  850. SUB t11, b1, t11
  851. SUB t21, b2, t21
  852. SUB t31, b3, t31
  853. SUB t41, b4, t41
  854. LD b5, 4 * SIZE(AO)
  855. LD b6, 5 * SIZE(AO)
  856. LD b7, 6 * SIZE(AO)
  857. LD b8, 7 * SIZE(AO)
  858. SUB t12, b5, t12
  859. SUB t22, b6, t22
  860. SUB t32, b7, t32
  861. SUB t42, b8, t42
  862. LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
  863. LD b2, 1 * SIZE(BO)
  864. MUL t11, b1, t11
  865. MUL t21, b1, t21
  866. MUL t31, b1, t31
  867. MUL t41, b1, t41
  868. NMSUB t12, t12, b2, t11
  869. NMSUB t22, t22, b2, t21
  870. NMSUB t32, t32, b2, t31
  871. NMSUB t42, t42, b2, t41
  872. LD b5, 3 * SIZE(BO)
  873. MUL t12, b5, t12
  874. MUL t22, b5, t22
  875. MUL t32, b5, t32
  876. MUL t42, b5, t42
  877. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  878. ST t21, 1 * SIZE(AO)
  879. ST t31, 2 * SIZE(AO)
  880. ST t41, 3 * SIZE(AO)
  881. ST t12, 4 * SIZE(AO)
  882. ST t22, 5 * SIZE(AO)
  883. ST t32, 6 * SIZE(AO)
  884. ST t42, 7 * SIZE(AO)
  885. ST t11, 0 * SIZE(CO1) # write back results
  886. ST t21, 1 * SIZE(CO1)
  887. ST t31, 2 * SIZE(CO1)
  888. ST t41, 3 * SIZE(CO1)
  889. ST t12, 0 * SIZE(CO2)
  890. ST t22, 1 * SIZE(CO2)
  891. ST t32, 2 * SIZE(CO2)
  892. ST t42, 3 * SIZE(CO2)
  893. daddiu CO1, CO1, 4 * SIZE # fixed address
  894. daddiu CO2, CO2, 4 * SIZE
  895. dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
  896. dsll L, TEMP, 2 + BASE_SHIFT
  897. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  898. daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
  899. daddu BO, BO, TEMP # move BO to the end of this panel
  900. daddiu I, I, -1
  901. bgtz I, .L31
  902. NOP
  903. .align 3
  904. .L40:
  905. andi I, M,2
  906. blez I,.L60
  907. nop
  908. MTC $0, t11 # clear results registers
  909. MOV t21, t11
  910. MOV t12, t11
  911. MOV t22, t11
  912. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  913. LD a2, 1 * SIZE(AO) # get 4 a
  914. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  915. LD b2, 1 * SIZE(B) # get 4 b
  916. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  917. blez L, .L45
  918. move BO, B # reset B
  919. .L42:
  920. LD a5, 2 * SIZE(AO)
  921. LD a6, 3 * SIZE(AO)
  922. LD b5, 2 * SIZE(BO)
  923. LD b6, 3 * SIZE(BO)
  924. MADD t11, t11, a1, b1
  925. MADD t21, t21, a2, b1
  926. MADD t12, t12, a1, b2
  927. MADD t22, t22, a2, b2
  928. LD a3, 4 * SIZE(AO)
  929. LD a4, 5 * SIZE(AO)
  930. LD b3, 4 * SIZE(BO)
  931. LD b4, 5 * SIZE(BO)
  932. MADD t11, t11, a5, b5
  933. MADD t21, t21, a6, b5
  934. MADD t12, t12, a5, b6
  935. MADD t22, t22, a6, b6
  936. LD a7, 6 * SIZE(AO)
  937. LD a8, 7 * SIZE(AO)
  938. LD b7, 6 * SIZE(BO)
  939. LD b8, 7 * SIZE(BO)
  940. MADD t11, t11, a3, b3
  941. MADD t21, t21, a4, b3
  942. MADD t12, t12, a3, b4
  943. MADD t22, t22, a4, b4
  944. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  945. daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
  946. LD a1, 0 * SIZE(AO)
  947. LD a2, 1 * SIZE(AO)
  948. LD b1, 0 * SIZE(BO)
  949. LD b2, 1 * SIZE(BO)
  950. MADD t11, t11, a7, b7
  951. MADD t21, t21, a8, b7
  952. MADD t12, t12, a7, b8
  953. MADD t22, t22, a8, b8
  954. daddiu L, L, -1
  955. bgtz L, .L42
  956. NOP
  957. .L45:
  958. andi L, KK, 3 # deal with kc remainder part
  959. blez L, .L48
  960. NOP
  961. .align 3
  962. .L46:
  963. MADD t11, t11, a1, b1
  964. MADD t21, t21, a2, b1
  965. MADD t12, t12, a1, b2
  966. MADD t22, t22, a2, b2
  967. daddiu AO, AO, 2 * SIZE # AO += 2mr
  968. daddiu BO, BO, 2 * SIZE # BP += 2nr
  969. LD a1, 0 * SIZE(AO)
  970. LD a2, 1 * SIZE(AO)
  971. LD b1, 0 * SIZE(BO)
  972. LD b2, 1 * SIZE(BO)
  973. daddiu L, L, -1
  974. bgtz L, .L46
  975. NOP
  976. .align 3
  977. .L48: # .L48 always deal with the trigular data part
  978. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  979. LD b2, 1 * SIZE(AO) # Fixed results
  980. SUB t11, b1, t11
  981. SUB t21, b2, t21
  982. LD b5, 2 * SIZE(AO)
  983. LD b6, 3 * SIZE(AO)
  984. SUB t12, b5, t12
  985. SUB t22, b6, t22
  986. LD b1, 0 * SIZE(BO) # BO point to the beginning of the trigular data part of Bj
  987. LD b2, 1 * SIZE(BO)
  988. MUL t11, b1, t11
  989. MUL t21, b1, t21
  990. NMSUB t12, t12, b2, t11
  991. NMSUB t22, t22, b2, t21
  992. LD b5, 3 * SIZE(BO)
  993. MUL t12, b5, t12
  994. MUL t22, b5, t22
  995. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  996. ST t21, 1 * SIZE(AO)
  997. ST t12, 2 * SIZE(AO)
  998. ST t22, 3 * SIZE(AO)
  999. ST t11, 0 * SIZE(CO1) # write back results
  1000. ST t21, 1 * SIZE(CO1)
  1001. ST t12, 0 * SIZE(CO2)
  1002. ST t22, 1 * SIZE(CO2)
  1003. daddiu CO1, CO1, 2 * SIZE # fixed address
  1004. daddiu CO2, CO2, 2 * SIZE
  1005. dsubu TEMP, K, KK # temp = kc - retangular data length of every panel
  1006. dsll L, TEMP, 1 + BASE_SHIFT
  1007. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  1008. daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
  1009. daddu BO, BO, TEMP # move BO to the end of this panel
  1010. .align 3
  1011. .L60:
  1012. andi I,M,1 # nr=2 mr=1
  1013. blez I,.L39
  1014. nop
  1015. MTC $0, t11 # clear results registers
  1016. MOV t12, t11
  1017. LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
  1018. LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
  1019. LD b2, 1 * SIZE(B) # get 4 b
  1020. dsra L, KK, 2 # L=KK/4, KK is the length of the retangular data part of Bj
  1021. blez L, .L65
  1022. move BO, B # reset B
  1023. .L62:
  1024. LD a5, 1 * SIZE(AO)
  1025. LD b5, 2 * SIZE(BO)
  1026. LD b6, 3 * SIZE(BO)
  1027. MADD t11, t11, a1, b1
  1028. MADD t12, t12, a1, b2
  1029. LD a3, 2 * SIZE(AO)
  1030. LD b3, 4 * SIZE(BO)
  1031. LD b4, 5 * SIZE(BO)
  1032. MADD t11, t11, a5, b5
  1033. MADD t12, t12, a5, b6
  1034. LD a7, 3 * SIZE(AO)
  1035. LD b7, 6 * SIZE(BO)
  1036. LD b8, 7 * SIZE(BO)
  1037. MADD t11, t11, a3, b3
  1038. MADD t12, t12, a3, b4
  1039. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  1040. daddiu BO, BO, 8 * SIZE # BP += 2nr*4kr
  1041. LD a1, 0 * SIZE(AO)
  1042. LD b1, 0 * SIZE(BO)
  1043. LD b2, 1 * SIZE(BO)
  1044. MADD t11, t11, a7, b7
  1045. MADD t12, t12, a7, b8
  1046. daddiu L, L, -1
  1047. bgtz L, .L62
  1048. NOP
  1049. .L65:
  1050. andi L, KK, 3 # deal with kc remainder part
  1051. blez L, .L68
  1052. NOP
  1053. .align 3
  1054. .L66:
  1055. MADD t11, t11, a1, b1
  1056. MADD t12, t12, a1, b2
  1057. daddiu AO, AO, 1 * SIZE # AO += mr
  1058. daddiu BO, BO, 2 * SIZE # BP += 2nr
  1059. LD a1, 0 * SIZE(AO)
  1060. LD b1, 0 * SIZE(BO)
  1061. LD b2, 1 * SIZE(BO)
  1062. daddiu L, L, -1
  1063. bgtz L, .L66
  1064. NOP
  1065. .align 3
  1066. .L68: # 1-row x 2-column tile: solve against the triangular part of the B panel
  # NOTE(review): operand meaning of SUB / MUL / NMSUB is taken from their use here; the macros are defined outside this file - confirm.
  1067. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  1068. LD b5, 1 * SIZE(AO) # Fixed results
  1069. SUB t11, b1, t11 # packed value minus accumulated product, column 1
  1070. SUB t12, b5, t12 # same for column 2
  1071. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  1072. LD b2, 1 * SIZE(BO)
  1073. MUL t11, b1, t11 # finish column 1
  1074. NMSUB t12, t12, b2, t11 # remove the column 1 contribution from column 2
  1075. LD b5, 3 * SIZE(BO) # second diagonal entry of the 2x2 block
  1076. MUL t12, b5, t12 # finish column 2
  1077. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  1078. ST t12, 1 * SIZE(AO)
  1079. ST t11, 0 * SIZE(CO1) # write back results
  1080. ST t12, 0 * SIZE(CO2)
  1081. daddiu CO1, CO1, 1 * SIZE # step the C pointers down 1 row
  1082. daddiu CO2, CO2, 1 * SIZE
  1083. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  1084. dsll L, TEMP, BASE_SHIFT # mr=1
  1085. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  1086. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  1087. daddu BO, BO, TEMP # move BO to the end of this panel
  1088. .align 3
  1089. .L39: # end of one two-column (nr=2) panel of B
  1090. move B, BO # change to next panel of Bj: B takes the position BO has reached
  1091. daddiu KK, KK, 2 # rectangular data length increases by 2 (the panel width)
  1092. .align 3
  1093. .L70: # handles the single leftover column when N is odd (nr=1)
  1094. andi J, N, 1 # nr=1
  1095. blez J, .L999 # N is even: no leftover column, go to the exit
  1096. NOP
  1097. move CO1, C # CO1 = output pointer for this column
  1098. move AO, A # AO restarts at A
  1099. daddu C, CO1, LDC # C advances by LDC
  1100. dsra I, M, 2 # I = M/4: number of 4x1 tiles
  1101. blez I, .L80 # fewer than four rows: skip the 4x1 tiles
  1102. NOP
  1103. .align 3
  1104. .L71: # 4x1 tile (mr=4, nr=1); executed I times, once per tile
  # t11..t41 accumulate the a*b products for the four entries of this tile.
  1105. MTC $0, t11 # clear results registers
  1106. MOV t21, t11
  1107. MOV t31, t11
  1108. MOV t41, t11
  1109. LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
  1110. LD a2, 1 * SIZE(AO) # get 4 a
  1111. LD a3, 2 * SIZE(AO)
  1112. LD a4, 3 * SIZE(AO)
  1113. LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
  1114. dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
  1115. blez L, .L75 # fewer than four k steps: only the remainder loop applies
  1116. move BO, B # reset BO to B; sits right after the branch, presumably in its delay slot so BO is set on both paths
  1117. .L72: # main k loop: four k steps per pass, L passes
  1118. LD a5, 4 * SIZE(AO) # operands of k step 1
  1119. LD a6, 5 * SIZE(AO)
  1120. LD a7, 6 * SIZE(AO)
  1121. LD a8, 7 * SIZE(AO)
  1122. LD b5, 1 * SIZE(BO)
  1123. MADD t11, t11, a1, b1 # k step 0
  1124. MADD t21, t21, a2, b1
  1125. MADD t31, t31, a3, b1
  1126. MADD t41, t41, a4, b1
  1127. LD a1, 8 * SIZE(AO) # operands of k step 2
  1128. LD a2, 9 * SIZE(AO)
  1129. LD a3, 10 * SIZE(AO)
  1130. LD a4, 11 * SIZE(AO)
  1131. LD b3, 2 * SIZE(BO)
  1132. MADD t11, t11, a5, b5 # k step 1
  1133. MADD t21, t21, a6, b5
  1134. MADD t31, t31, a7, b5
  1135. MADD t41, t41, a8, b5
  1136. LD a5, 12 * SIZE(AO) # operands of k step 3
  1137. LD a6, 13 * SIZE(AO)
  1138. LD a7, 14 * SIZE(AO)
  1139. LD a8, 15 * SIZE(AO)
  1140. LD b7, 3 * SIZE(BO)
  1141. MADD t11, t11, a1, b3 # k step 2
  1142. MADD t21, t21, a2, b3
  1143. MADD t31, t31, a3, b3
  1144. MADD t41, t41, a4, b3
  1145. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  1146. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1147. LD a1, 0 * SIZE(AO) # reload a1..a4 and b1 from the advanced pointers for the next k step
  1148. LD a2, 1 * SIZE(AO)
  1149. LD a3, 2 * SIZE(AO)
  1150. LD a4, 3 * SIZE(AO)
  1151. LD b1, 0 * SIZE(BO)
  1152. MADD t11, t11, a5, b7 # k step 3
  1153. MADD t21, t21, a6, b7
  1154. MADD t31, t31, a7, b7
  1155. MADD t41, t41, a8, b7
  1156. daddiu L, L, -1
  1157. bgtz L, .L72 # repeat while passes remain
  1158. NOP
  1159. .L75:
  1160. andi L, KK, 3 # L = KK mod 4: k steps left over by the unrolled loop
  1161. blez L, .L78 # nothing left over: go straight to the triangular part
  1162. NOP
  1163. .align 3
  1164. .L76: # remainder loop: one k step per pass, L passes
  1165. MADD t11, t11, a1, b1
  1166. MADD t21, t21, a2, b1
  1167. MADD t31, t31, a3, b1
  1168. MADD t41, t41, a4, b1
  1169. daddiu AO, AO, 4 * SIZE # AO += 4mr
  1170. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1171. LD a1, 0 * SIZE(AO) # reload operands for the next k step
  1172. LD a2, 1 * SIZE(AO)
  1173. LD a3, 2 * SIZE(AO)
  1174. LD a4, 3 * SIZE(AO)
  1175. LD b1, 0 * SIZE(BO)
  1176. daddiu L, L, -1
  1177. bgtz L, .L76
  1178. NOP
  1179. .align 3
  1180. .L78: # .L78 always deals with the triangular data part
  # Subtracts the accumulated products from the four values stored at AO and
  # scales each by the single value at BO[0].
  # NOTE(review): scaling is a multiply, so BO[0] presumably holds a reciprocal
  # diagonal entry - confirm against the packing routine.
  1181. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  1182. LD b2, 1 * SIZE(AO) # the four packed values this tile overwrites
  1183. LD b3, 2 * SIZE(AO)
  1184. LD b4, 3 * SIZE(AO) # sa stored as col major
  1185. SUB t11, b1, t11 # t = AO value - accumulated product
  1186. SUB t21, b2, t21
  1187. SUB t31, b3, t31
  1188. SUB t41, b4, t41
  1189. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  1190. MUL t11, b1, t11 # t *= BO[0]
  1191. MUL t21, b1, t21
  1192. MUL t31, b1, t31
  1193. MUL t41, b1, t41
  1194. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  1195. ST t21, 1 * SIZE(AO)
  1196. ST t31, 2 * SIZE(AO)
  1197. ST t41, 3 * SIZE(AO)
  1198. ST t11, 0 * SIZE(CO1) # write back results
  1199. ST t21, 1 * SIZE(CO1)
  1200. ST t31, 2 * SIZE(CO1)
  1201. ST t41, 3 * SIZE(CO1)
  1202. daddiu CO1, CO1, 4 * SIZE # advance the output pointer past this tile
  1203. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  1204. dsll L, TEMP, 2 + BASE_SHIFT # mr=4: AO byte offset for TEMP k steps
  1205. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  1206. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  1207. daddu BO, BO, TEMP # move BO to the end of this panel
  1208. daddiu I, I, -1
  1209. bgtz I, .L71 # next 4x1 tile
  1210. NOP
  1211. .align 3
  1212. .L80: # 2x1 tile (mr=2, nr=1); executed at most once, when bit 1 of M is set
  1213. andi I, M, 2 # mr=2
  1214. blez I, .L90 # bit clear: skip to the 1x1 tile
  1215. nop
  1216. MTC $0, t11 # clear results registers
  1217. MOV t21, t11
  1218. LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
  1219. LD a2, 1 * SIZE(AO) # get 2 a
  1220. LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
  1221. dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
  1222. blez L, .L85 # fewer than four k steps: only the remainder loop applies
  1223. move BO, B # reset BO to B; sits right after the branch, presumably in its delay slot so BO is set on both paths
  1224. .L82: # main k loop: four k steps per pass, L passes
  1225. LD a5, 2 * SIZE(AO) # operands of k step 1
  1226. LD a6, 3 * SIZE(AO)
  1227. LD b5, 1 * SIZE(BO)
  1228. MADD t11, t11, a1, b1 # k step 0
  1229. MADD t21, t21, a2, b1
  1230. LD a3, 4 * SIZE(AO) # operands of k step 2
  1231. LD a4, 5 * SIZE(AO)
  1232. LD b3, 2 * SIZE(BO)
  1233. MADD t11, t11, a5, b5 # k step 1
  1234. MADD t21, t21, a6, b5
  1235. LD a7, 6 * SIZE(AO) # operands of k step 3
  1236. LD a8, 7 * SIZE(AO)
  1237. LD b7, 3 * SIZE(BO)
  1238. MADD t11, t11, a3, b3 # k step 2
  1239. MADD t21, t21, a4, b3
  1240. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  1241. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1242. LD a1, 0 * SIZE(AO) # reload a1/a2/b1 from the advanced pointers for the next k step
  1243. LD a2, 1 * SIZE(AO)
  1244. LD b1, 0 * SIZE(BO)
  1245. MADD t11, t11, a7, b7 # k step 3
  1246. MADD t21, t21, a8, b7
  1247. daddiu L, L, -1
  1248. bgtz L, .L82 # repeat while passes remain
  1249. NOP
  1250. .L85:
  1251. andi L, KK, 3 # L = KK mod 4: k steps left over by the unrolled loop
  1252. blez L, .L88 # nothing left over: go straight to the triangular part
  1253. NOP
  1254. .align 3
  1255. .L86: # remainder loop: one k step per pass, L passes
  1256. MADD t11, t11, a1, b1
  1257. MADD t21, t21, a2, b1
  1258. daddiu AO, AO, 2 * SIZE # AO += 2mr
  1259. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1260. LD a1, 0 * SIZE(AO) # reload operands for the next k step
  1261. LD a2, 1 * SIZE(AO)
  1262. LD b1, 0 * SIZE(BO)
  1263. daddiu L, L, -1
  1264. bgtz L, .L86
  1265. NOP
  1266. .align 3
  1267. .L88: # .L88 always deals with the triangular data part
  # Subtracts the accumulated products from the two values stored at AO and
  # scales each by the single value at BO[0].
  # NOTE(review): scaling is a multiply, so BO[0] presumably holds a reciprocal
  # diagonal entry - confirm against the packing routine.
  1268. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  1269. LD b2, 1 * SIZE(AO) # the two packed values this tile overwrites
  1270. SUB t11, b1, t11 # t = AO value - accumulated product
  1271. SUB t21, b2, t21
  1272. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  1273. MUL t11, b1, t11 # t *= BO[0]
  1274. MUL t21, b1, t21
  1275. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  1276. ST t21, 1 * SIZE(AO)
  1277. ST t11, 0 * SIZE(CO1) # write back results
  1278. ST t21, 1 * SIZE(CO1)
  1279. daddiu CO1, CO1, 2 * SIZE # advance the output pointer past this tile
  1280. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  1281. dsll L, TEMP, 1 + BASE_SHIFT # mr=2: AO byte offset for TEMP k steps
  1282. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  1283. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  1284. daddu BO, BO, TEMP # move BO to the end of this panel
  1285. .align 3
  1286. .L90: # 1x1 tile (mr=1, nr=1); executed at most once, when M is odd
  1287. andi I, M, 1 # mr=1
  1288. blez I, .L79 # M even: nothing left for this column
  1289. nop
  1290. MTC $0, t11 # clear results registers
  1291. LD a1, 0 * SIZE(AO) # AO points to the beginning address of sa
  1292. LD b1, 0 * SIZE(B) # B points to the beginning address of every panel Bj
  1293. dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
  1294. blez L, .L95 # fewer than four k steps: only the remainder loop applies
  1295. move BO, B # reset BO to B; sits right after the branch, presumably in its delay slot so BO is set on both paths
  1296. .L92: # main k loop: four k steps per pass, L passes
  1297. LD a5, 1 * SIZE(AO) # operands of k step 1
  1298. LD b5, 1 * SIZE(BO)
  1299. MADD t11, t11, a1, b1 # k step 0
  1300. LD a3, 2 * SIZE(AO) # operands of k step 2
  1301. LD b3, 2 * SIZE(BO)
  1302. MADD t11, t11, a5, b5 # k step 1
  1303. LD a7, 3 * SIZE(AO) # operands of k step 3
  1304. LD b7, 3 * SIZE(BO)
  1305. MADD t11, t11, a3, b3 # k step 2
  1306. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  1307. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1308. LD a1, 0 * SIZE(AO) # reload a1/b1 from the advanced pointers for the next k step
  1309. LD b1, 0 * SIZE(BO)
  1310. MADD t11, t11, a7, b7 # k step 3
  1311. daddiu L, L, -1
  1312. bgtz L, .L92 # repeat while passes remain
  1313. NOP
  1314. .L95:
  1315. andi L, KK, 3 # L = KK mod 4: k steps left over by the unrolled loop
  1316. blez L, .L98 # nothing left over: go straight to the triangular part
  1317. NOP
  1318. .align 3
  1319. .L96: # remainder loop: one k step per pass, L passes
  1320. MADD t11, t11, a1, b1
  1321. daddiu AO, AO, 1 * SIZE # AO += 1mr
  1322. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1323. LD a1, 0 * SIZE(AO) # reload operands for the next k step
  1324. LD b1, 0 * SIZE(BO)
  1325. daddiu L, L, -1
  1326. bgtz L, .L96
  1327. NOP
  1328. .align 3
  1329. .L98: # .L98 always deals with the triangular data part
  # NOTE(review): the result is multiplied by BO[0] rather than divided, so
  # BO[0] presumably holds a reciprocal diagonal entry - confirm against the
  # packing routine.
  1330. LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
  1331. SUB t11, b1, t11 # t11 = AO[0] - accumulated product
  1332. LD b1, 0 * SIZE(BO) # BO points to the beginning of the triangular data part of Bj
  1333. MUL t11, b1, t11 # t11 *= BO[0]
  1334. ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
  1335. ST t11, 0 * SIZE(CO1) # write back results
  1336. daddiu CO1, CO1, 1 * SIZE # advance the output pointer past this tile
  1337. dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
  1338. dsll L, TEMP, BASE_SHIFT # mr=1: AO byte offset for TEMP k steps
  1339. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  1340. daddu AO, AO, L # move AO to the end of this panel, which is also the beginning of the next panel
  1341. daddu BO, BO, TEMP # move BO to the end of this panel
  1342. .align 3
  1343. .L79: # end of the single-column (nr=1) panel of B
  1344. move B, BO # B takes the position BO has reached
  1345. daddiu KK, KK, 1 # rectangular data length increases by 1 (the panel width)
  1346. .align 3
  1347. .L999: # common exit: reload saved registers, release the 144-byte frame, return
  # The offsets below mirror the stores made after PROLOGUE at the top of the file.
  # NOTE(review): when __64BIT__ is undefined, $25 and $f20 both use offset 112,
  # exactly as in the prologue, so the two save slots overlap - confirm this is intended.
  # NOTE(review): $f29-$f31 (a6-a8) are written by the k loops of this routine but are
  # neither saved nor reloaded here - check against the callee-saved set of the target ABI.
  1348. LDARG $16, 0($sp)
  1349. LDARG $17, 8($sp)
  1350. LDARG $18, 16($sp)
  1351. LDARG $19, 24($sp)
  1352. LDARG $20, 32($sp)
  1353. LDARG $21, 40($sp)
  1354. ldc1 $f24, 48($sp)
  1355. ldc1 $f25, 56($sp)
  1356. ldc1 $f26, 64($sp)
  1357. ldc1 $f27, 72($sp)
  1358. ldc1 $f28, 80($sp)
  1359. LDARG $22, 88($sp)
  1360. LDARG $23, 96($sp)
  1361. LDARG $24, 104($sp)
  1362. LDARG $25, 112($sp)
  1363. #ifndef __64BIT__
  1364. ldc1 $f20,112($sp)
  1365. ldc1 $f21,120($sp)
  1366. ldc1 $f22,128($sp)
  1367. ldc1 $f23,136($sp)
  1368. #endif
  1369. j $31 # return to the caller
  1370. daddiu $sp, $sp, 144 # release the frame; placed after the jump, so it presumably relies on the branch delay slot
  1371. EPILOGUE