You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_kernel_loongson3a_4x4.S 39 kB


  /* Preamble: entry-point naming, the shared header, the prefetch mnemonic, and two macros that emit hand-encoded instruction words. */
  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h"
  4. #define FETCH ld /* in the visible code FETCH always targets $0, so the loaded value is discarded (used as a prefetch hint) */
  5. #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) /* emits a raw instruction word with opcode field 0x32; base/fq/ft are bare register numbers (R8, F1, ...). The kernel uses it to load the FP register pair (ft, fq) from the address in GPR base; offset 1 fetches the pair that follows offset 0 (a2,a3 after a0,a1). Presumably the Loongson quad-word load - TODO confirm against the ISA manual. */
  6. #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq) /* same field layout as gsLQC1 with opcode field 0x3A; presumably the matching quad-word store - TODO confirm. Not referenced in the visible part of the file. */
  /* Symbolic names for the general-purpose registers used by the kernel. */
  7. #define M $4 /* M dimension on entry; copied to MCO, then reused as the row-block loop counter */
  8. #define N $5 /* N dimension on entry; copied to NCO, then reused as the column-panel loop counter */
  9. #define K $6 /* K dimension on entry; copied to KCO, then reused as the inner-loop counter */
  10. #define A $8 /* current read position in A (same register as R8 in gsLQC1) */
  11. #define B $9 /* current read position in B (same register as R9 in gsLQC1) */
  12. #define C $10 /* start of the next column panel of C */
  13. #define LDC $11 /* leading dimension of C; shifted by BASE_SHIFT in .L0_N4 to become a byte stride */
  14. #define AO $12 /* saved start of A, used to reset A for every column panel */
  15. #define BO $13 /* start of the current panel of B */
  16. #define CO1 $14 /* CO1..CO4: write pointers for up to four consecutive C columns (each LDC apart) */
  17. #define CO2 $15
  18. #define CO3 $16
  19. #define CO4 $17
  20. #define KCO $18 /* saved copy of K */
  21. #define MCO $19 /* saved copy of M */
  22. #define NCO $20 /* saved copy of N */
  23. #define SPANB $21 /* KCO << (2+BASE_SHIFT); added to BO to seed PREB */
  24. #define PREB $23 /* prefetch cursor for B */
  25. #define PREA $24 /* prefetch cursor for A */
  26. #define SPANA $25 /* KCO << (1+BASE_SHIFT); added to AO to seed PREA */
  27. #define ALPHA $f15 /* scale factor; shares $f15 with b7, so it is spilled to 152($sp) and reloaded before each write-back */
  28. #if defined(TRMMKERNEL)
  29. #define OFFSET $2 /* TRMM only: loaded from 160($sp) in .L0_N4 */
  30. #define KK $3 /* TRMM only: running offset derived from OFFSET */
  31. #define TEMP $7 /* TRMM only: scratch for length and address arithmetic */
  32. #endif
  /* Bare GPR numbers for the gsLQC1/gsSQC1 encoders, which need an integer rather than a $n operand. R8/R9 are the registers named A/B; R14..R17 are the registers named CO1..CO4. */
  33. #define R8 8
  34. #define R9 9
  35. #define R14 14
  36. #define R15 15
  37. #define R16 16
  38. #define R17 17
  /* Result accumulators. tIJ collects products of A operand I and B operand J (e.g. t21 += a1*b0, t12 += a0*b1) and is finally stored to element I-1 of column pointer COJ. Together they occupy $f16..$f31. */
  39. #define t11 $f30
  40. #define t21 $f31
  41. #define t31 $f28
  42. #define t41 $f29
  43. #define t12 $f26
  44. #define t22 $f27
  45. #define t32 $f24
  46. #define t42 $f25
  47. #define t13 $f22
  48. #define t23 $f23
  49. #define t33 $f20
  50. #define t43 $f21
  51. #define t14 $f18
  52. #define t24 $f19
  53. #define t34 $f16
  54. #define t44 $f17
  /* Temporaries for existing C values during the GEMM write-back: cIJ is loaded from element I-1 of COJ. They occupy $f0..$f14, the same registers as a0..a7 and b0..b6, so they are only used once the multiply loops are done. */
  55. #define c11 $f0
  56. #define c21 $f1
  57. #define c31 $f2
  58. #define c41 $f3
  59. #define c12 $f4
  60. #define c22 $f5
  61. #define c32 $f6
  62. #define c42 $f7
  63. #define c13 $f8
  64. #define c23 $f9
  65. #define c33 $f10
  66. #define c43 $f11
  67. #define c14 $f12
  68. #define c24 $f13
  69. #define c34 $f14
  70. #define c44 $f0 /* deliberately the same register as c11 ($f15 is taken by ALPHA); the 4x4 write-back consumes c11 before it loads c44 */
  /* Operand registers for the multiply loops: a0..a7 hold values loaded from A, b0..b7 values loaded from B. The unrolled loops alternate between the sets a0-a3/b0-b3 and a4-a7/b4-b7, loading one set while multiplying with the other. */
  71. #define a0 $f0
  72. #define a1 $f1
  73. #define a2 $f2
  74. #define a3 $f3
  75. #define a4 $f4
  76. #define a5 $f5
  77. #define a6 $f6
  78. #define a7 $f7
  79. #define b0 $f8
  80. #define b1 $f9
  81. #define b2 $f10
  82. #define b3 $f11
  83. #define b4 $f12
  84. #define b5 $f13
  85. #define b6 $f14
  86. #define b7 $f15 /* same register as ALPHA: ALPHA must be reloaded from the stack after b7 has been used */
  /* Bare FP register numbers for the gsLQC1/gsSQC1 encoders: Fn is the integer n, standing for $fn. */
  87. #define F31 31
  88. #define F30 30
  89. #define F29 29
  90. #define F28 28
  91. #define F27 27
  92. #define F26 26
  93. #define F25 25
  94. #define F24 24
  95. #define F23 23
  96. #define F22 22
  97. #define F21 21
  98. #define F20 20
  99. #define F19 19
  100. #define F18 18
  101. #define F17 17
  102. #define F16 16
  103. #define F15 15
  104. #define F14 14
  105. #define F13 13
  106. #define F12 12
  107. #define F11 11
  108. #define F10 10
  109. #define F9 9
  110. #define F8 8
  111. #define F7 7
  112. #define F6 6
  113. #define F5 5
  114. #define F4 4
  115. #define F3 3
  116. #define F2 2
  117. #define F1 1
  118. #define F0 0
  /* Entry: reserve a 160-byte frame and spill $16..$25 plus $f20..$f28 into it. Slot 152($sp) is left free here and later holds ALPHA. */
  /* NOTE(review): the kernel also writes $f29..$f31 (t41, t11, t21) and $f16..$f19 (t34, t44, t14, t24), none of which are saved here - confirm against the callee-saved set of the target ABI. */
  119. PROLOGUE
  120. daddiu $sp, $sp, -160 # allocate the 160-byte frame
  121. sd $16, 0($sp) # $16 = CO3
  122. sd $17, 8($sp) # $17 = CO4
  123. sd $18, 16($sp) # $18 = KCO
  124. sd $19, 24($sp) # $19 = MCO
  125. sd $20, 32($sp) # $20 = NCO
  126. sd $21, 40($sp) # $21 = SPANB
  127. sd $22, 48($sp) # $22 has no alias among the macros above
  128. ST $f24, 56($sp) # $f24 = t32
  129. ST $f25, 64($sp) # $f25 = t42
  130. ST $f26, 72($sp) # $f26 = t12
  131. ST $f27, 80($sp) # $f27 = t22
  132. ST $f28, 88($sp) # $f28 = t31
  133. sd $23, 96($sp) # $23 = PREB
  134. sd $24, 104($sp) # $24 = PREA
  135. sd $25, 112($sp) # $25 = SPANA
  136. ST $f20,120($sp) # $f20 = t33
  137. ST $f21,128($sp) # $f21 = t43
  138. ST $f22,136($sp) # $f22 = t13
  139. ST $f23,144($sp) # $f23 = t23
  /* One-time setup: save the arguments, convert LDC to a byte stride, compute the prefetch spans and the number of 4-column panels. */
  140. .align 5
  141. .L0_N4: # setup before the loop over 4-column panels
  142. ST ALPHA,152($sp) # spill ALPHA: $f15 is reused as b7 inside the loops
  143. move MCO,M # Backup M
  144. move NCO,N # Backup N
  145. move KCO,K # Backup K
  146. move AO,A # Backup A_addr
  147. dsra N,NCO,2 # N = NCO/4 (shift right by 2): number of 4-column panels
  148. dsll LDC,LDC,BASE_SHIFT # LDC in bytes (LDC << BASE_SHIFT)
  149. dsll SPANB,KCO,2+BASE_SHIFT # SPANB = KCO * 4(nr) elements, in bytes
  150. #if defined(TRMMKERNEL)
  151. LDARG OFFSET,160($sp) # OFFSET argument, read from just above the 160-byte frame
  152. #endif
  153. #if defined(TRMMKERNEL) && !defined(LEFT)
  154. neg KK,OFFSET # KK = -OFFSET
  155. #endif
  156. move BO,B # Backup B_addr
  157. beq N,$0,.L0_N2 # no full 4-column panel (NCO < 4): go to the nr=2 tail
  158. dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO * 2 elements, in bytes (sits in the branch delay slot, assuming noreorder mode from PROLOGUE/common.h - TODO confirm)
  /* Start of one 4-column panel: set the four C column pointers, rewind A, seed the prefetch cursors and compute the number of 4-row blocks. */
  159. .L0_N4_Lb: # mr=4,nr=4
  160. move CO1,C # CO1 = first column of this panel
  161. dsra M,MCO,2 # M = MCO/4 (shift right by 2): number of 4-row blocks
  162. move A,AO # Reset A
  163. daddu CO2,C,LDC # CO2 = CO1 + LDC
  164. daddu PREB,BO,SPANB # PREB = BO + SPANB, used as the B prefetch cursor
  165. daddu CO3,CO2,LDC # CO3 = CO2 + LDC
  166. daddu PREA,AO,SPANA # PREA = AO + SPANA, used as the A prefetch cursor
  167. daddu CO4,CO3,LDC # CO4 = CO3 + LDC
  168. #if defined(TRMMKERNEL) && defined(LEFT)
  169. move KK,OFFSET
  170. #endif
  171. beqz M,.L14_M2 # fewer than 4 rows: go to the mr=2 tail
  172. daddu C,CO4,LDC # move C to next panel Cj (follows the branch; presumably its delay slot, so it runs on both paths)
  /* Set-up for one 4x4 tile: position A and B, zero the 16 accumulators t11..t44, preload a0..a3 and b0..b3, and compute K = number of 4-step iterations of .L11. The TRMM and GEMM variants differ only in how A, B and the trip count are derived. */
  173. .L10:
  174. #if defined(TRMMKERNEL)
  175. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  176. move B,BO # (SIDE=L and UPLO=L) or (SIDE=R and UPLO=U)
  177. #else
  178. dsll K,KK,2 + BASE_SHIFT # byte offset of KK steps in A (4 elements per step)
  179. dsll TEMP,KK,2 + BASE_SHIFT # byte offset of KK steps in B (4 elements per step)
  180. daddu A,A,K # move A B to data part
  181. daddu B,BO,TEMP
  182. #endif
  183. MTC $0,t11 # t11 = 0; every other accumulator is copied from it
  184. MOV t21,t11
  185. gsLQC1(R8,F1,F0,0) # a0,a1
  186. MOV t31,t11
  187. MOV t41,t11
  188. gsLQC1(R9,F9,F8,0) # b0,b1
  189. MOV t12,t11
  190. MOV t22,t11
  191. gsLQC1(R8,F3,F2,1) # a2,a3
  192. MOV t32,t11
  193. MOV t42,t11
  194. gsLQC1(R9,F11,F10,1) # b2,b3
  195. MOV t13,t11
  196. MOV t23,t11
  197. MOV t33,t11
  198. MOV t43,t11
  199. MOV t14,t11
  200. MOV t24,t11
  201. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  202. dsubu TEMP,KCO,KK # TEMP = KCO - KK: length of the data part
  203. #elif defined(LEFT)
  204. daddiu TEMP, KK, 4 # S=L,U=L
  205. #else
  206. daddiu TEMP, KK, 4 # S=R,U=U; in these two cases KK+4 is the length of the data part
  207. #endif
  208. dsra K,TEMP,2 # K = TEMP/4: number of 4-step iterations
  209. MOV t34,t11
  210. beqz K,.L15 # fewer than 4 steps: skip the unrolled loop
  211. MOV t44,t11 # last accumulator cleared after the branch (presumably its delay slot)
  212. #else
  213. move B,BO # Reset B
  214. MTC $0,t11 # GEMM part NR=4,MR=4
  215. gsLQC1(R8,F1,F0,0) # a0,a1
  216. MOV t21,t11
  217. MOV t31,t11
  218. gsLQC1(R9,F9,F8,0) # b0,b1
  219. MOV t41,t11
  220. MOV t12,t11
  221. gsLQC1(R8,F3,F2,1) # a2,a3
  222. MOV t22,t11
  223. MOV t32,t11
  224. gsLQC1(R9,F11,F10,1) # b2,b3
  225. MOV t42,t11
  226. dsra K,KCO,2 # K = KCO/4: number of 4-step iterations
  227. MOV t13,t11
  228. MOV t23,t11
  229. MOV t33,t11
  230. MOV t43,t11
  231. MOV t14,t11
  232. MOV t24,t11
  233. MOV t34,t11
  234. beqz K,.L15 # fewer than 4 steps: skip the unrolled loop
  235. MOV t44,t11 # clear 16 results registers
  236. #endif
  237. .align 5
  238. .L11: # kr=4
  239. gsLQC1(R8,F5,F4,2)
  240. MADD t11,t11,a0,b0
  241. MADD t21,t21,a1,b0
  242. gsLQC1(R9,F13,F12,2)
  243. MADD t12,t12,a0,b1
  244. MADD t22,t22,a1,b1
  245. gsLQC1(R8,F7,F6,3)
  246. MADD t31,t31,a2,b0
  247. MADD t41,t41,a3,b0
  248. gsLQC1(R9,F15,F14,3)
  249. MADD t32,t32,a2,b1
  250. MADD t42,t42,a3,b1
  251. FETCH $0,(PREB)
  252. MADD t13,t13,a0,b2
  253. MADD t23,t23,a1,b2
  254. MADD t14,t14,a0,b3
  255. MADD t24,t24,a1,b3
  256. FETCH $0,(PREA)
  257. MADD t33,t33,a2,b2
  258. MADD t43,t43,a3,b2
  259. MADD t34,t34,a2,b3
  260. MADD t44,t44,a3,b3
  261. .L12:
  262. gsLQC1(R8,F1,F0,4)
  263. MADD t11,t11,a4,b4
  264. MADD t21,t21,a5,b4
  265. gsLQC1(R9,F9,F8,4)
  266. MADD t12,t12,a4,b5
  267. MADD t22,t22,a5,b5
  268. gsLQC1(R8,F3,F2,5)
  269. MADD t31,t31,a6,b4
  270. MADD t41,t41,a7,b4
  271. gsLQC1(R9,F11,F10,5)
  272. MADD t32,t32,a6,b5
  273. MADD t42,t42,a7,b5
  274. FETCH $0,4*SIZE(PREB)
  275. MADD t13,t13,a4,b6
  276. MADD t23,t23,a5,b6
  277. MADD t14,t14,a4,b7
  278. MADD t24,t24,a5,b7
  279. FETCH $0,4*SIZE(PREA)
  280. MADD t33,t33,a6,b6
  281. MADD t43,t43,a7,b6
  282. MADD t34,t34,a6,b7
  283. MADD t44,t44,a7,b7
  284. .L13:
  285. gsLQC1(R8,F5,F4,6)
  286. MADD t11,t11,a0,b0
  287. MADD t21,t21,a1,b0
  288. gsLQC1(R9,F13,F12,6)
  289. MADD t12,t12,a0,b1
  290. MADD t22,t22,a1,b1
  291. gsLQC1(R8,F7,F6,7)
  292. MADD t31,t31,a2,b0
  293. MADD t41,t41,a3,b0
  294. gsLQC1(R9,F15,F14,7)
  295. MADD t32,t32,a2,b1
  296. MADD t42,t42,a3,b1
  297. daddu A,A,16*SIZE # 4mr*4kr
  298. FETCH $0,8*SIZE(PREB)
  299. MADD t13,t13,a0,b2
  300. MADD t23,t23,a1,b2
  301. daddu B,B,16*SIZE # 4nr*4kr
  302. MADD t14,t14,a0,b3
  303. MADD t24,t24,a1,b3
  304. FETCH $0,8*SIZE(PREA)
  305. MADD t33,t33,a2,b2
  306. MADD t43,t43,a3,b2
  307. MADD t34,t34,a2,b3
  308. MADD t44,t44,a3,b3
  309. .L14:
  310. gsLQC1(R8,F1,F0,0)
  311. MADD t11,t11,a4,b4
  312. MADD t21,t21,a5,b4
  313. gsLQC1(R9,F9,F8,0)
  314. MADD t12,t12,a4,b5
  315. MADD t22,t22,a5,b5
  316. gsLQC1(R8,F3,F2,1)
  317. MADD t31,t31,a6,b4
  318. MADD t41,t41,a7,b4
  319. daddiu K,K,-1
  320. gsLQC1(R9,F11,F10,1)
  321. MADD t32,t32,a6,b5
  322. MADD t42,t42,a7,b5
  323. FETCH $0,12*SIZE(PREB)
  324. MADD t13,t13,a4,b6
  325. MADD t23,t23,a5,b6
  326. FETCH $0,12*SIZE(PREA)
  327. MADD t14,t14,a4,b7
  328. MADD t24,t24,a5,b7
  329. MADD t33,t33,a6,b6
  330. MADD t43,t43,a7,b6
  331. daddu PREB,PREB,16*SIZE
  332. MADD t34,t34,a6,b7
  333. MADD t44,t44,a7,b7
  334. bnez K,.L11
  335. daddu PREA,PREA,16*SIZE
  336. .L15: # kr=2
  337. #ifndef TRMMKERNEL
  338. andi K,KCO,2
  339. #else
  340. andi K,TEMP, 2
  341. #endif
  342. beqz K,.L18
  343. nop
  344. .L16:
  345. gsLQC1(R8,F5,F4,2)
  346. MADD t11,t11,a0,b0
  347. MADD t21,t21,a1,b0
  348. gsLQC1(R9,F13,F12,2)
  349. MADD t12,t12,a0,b1
  350. MADD t22,t22,a1,b1
  351. gsLQC1(R8,F7,F6,3)
  352. MADD t31,t31,a2,b0
  353. MADD t41,t41,a3,b0
  354. gsLQC1(R9,F15,F14,3)
  355. MADD t32,t32,a2,b1
  356. MADD t42,t42,a3,b1
  357. daddu A,A,8*SIZE # 4mr*2kr
  358. FETCH $0,0(PREB)
  359. MADD t13,t13,a0,b2
  360. MADD t23,t23,a1,b2
  361. daddu B,B,8*SIZE # 4nr*2kr
  362. FETCH $0,0(PREA)
  363. MADD t14,t14,a0,b3
  364. MADD t24,t24,a1,b3
  365. MADD t33,t33,a2,b2
  366. MADD t43,t43,a3,b2
  367. MADD t34,t34,a2,b3
  368. MADD t44,t44,a3,b3
  369. .L17:
  370. gsLQC1(R8,F1,F0,0)
  371. MADD t11,t11,a4,b4
  372. MADD t21,t21,a5,b4
  373. gsLQC1(R9,F9,F8,0)
  374. MADD t12,t12,a4,b5
  375. MADD t22,t22,a5,b5
  376. gsLQC1(R8,F3,F2,1)
  377. MADD t31,t31,a6,b4
  378. MADD t41,t41,a7,b4
  379. gsLQC1(R9,F11,F10,1)
  380. MADD t32,t32,a6,b5
  381. MADD t42,t42,a7,b5
  382. FETCH $0,4*SIZE(PREB)
  383. MADD t13,t13,a4,b6
  384. MADD t23,t23,a5,b6
  385. FETCH $0,4*SIZE(PREA)
  386. MADD t14,t14,a4,b7
  387. MADD t24,t24,a5,b7
  388. daddu PREB,PREB,8*SIZE
  389. MADD t33,t33,a6,b6
  390. MADD t43,t43,a7,b6
  391. daddu PREA,PREA,8*SIZE
  392. MADD t34,t34,a6,b7
  393. MADD t44,t44,a7,b7
  394. .L18: # kr=1
  395. #ifndef TRMMKERNEL
  396. andi K,KCO,1
  397. #else
  398. andi K,TEMP,1
  399. #endif
  400. beqz K,.L19
  401. LD ALPHA,152($sp) # Get ALPHA
  402. FETCH $0,0(PREB)
  403. MADD t11,t11,a0,b0
  404. MADD t21,t21,a1,b0
  405. daddu A,A,4*SIZE # 4mr*kr
  406. MADD t12,t12,a0,b1
  407. MADD t22,t22,a1,b1
  408. daddu B,B,4*SIZE # 4nr*kr
  409. FETCH $0,0(PREA)
  410. MADD t31,t31,a2,b0
  411. MADD t41,t41,a3,b0
  412. daddu PREB,PREB,4*SIZE
  413. MADD t32,t32,a2,b1
  414. MADD t42,t42,a3,b1
  415. daddu PREA,PREA,4*SIZE
  416. MADD t13,t13,a0,b2
  417. MADD t23,t23,a1,b2
  418. MADD t14,t14,a0,b3
  419. MADD t24,t24,a1,b3
  420. MADD t33,t33,a2,b2
  421. MADD t43,t43,a3,b2
  422. MADD t34,t34,a2,b3
  423. MADD t44,t44,a3,b3
  424. .L19: # Write Back to C
  425. #ifndef TRMMKERNEL
  426. LD c11,0(CO1) # GEMM write part
  427. LD c21,1*SIZE(CO1) # get 16 C
  428. LD c31,2*SIZE(CO1)
  429. LD c41,3*SIZE(CO1)
  430. LD c12,0(CO2)
  431. MADD t11,c11,t11,ALPHA
  432. LD c22,1*SIZE(CO2)
  433. MADD t21,c21,t21,ALPHA
  434. LD c32,2*SIZE(CO2)
  435. MADD t31,c31,t31,ALPHA
  436. LD c42,3*SIZE(CO2)
  437. MADD t41,c41,t41,ALPHA
  438. LD c13,0(CO3)
  439. MADD t12,c12,t12,ALPHA
  440. LD c23,1*SIZE(CO3)
  441. MADD t22,c22,t22,ALPHA
  442. LD c33,2*SIZE(CO3)
  443. MADD t32,c32,t32,ALPHA
  444. LD c43,3*SIZE(CO3)
  445. MADD t42,c42,t42,ALPHA
  446. LD c14,0(CO4)
  447. MADD t13,c13,t13,ALPHA
  448. LD c24,1*SIZE(CO4)
  449. MADD t23,c23,t23,ALPHA
  450. LD c34,2*SIZE(CO4)
  451. MADD t33,c33,t33,ALPHA
  452. LD c44,3*SIZE(CO4)
  453. MADD t43,c43,t43,ALPHA
  454. ST t11,0(CO1)
  455. MADD t14,c14,t14,ALPHA
  456. ST t21,1*SIZE(CO1)
  457. MADD t24,c24,t24,ALPHA
  458. ST t31,2*SIZE(CO1)
  459. MADD t34,c34,t34,ALPHA
  460. ST t41,3*SIZE(CO1)
  461. MADD t44,c44,t44,ALPHA
  462. daddiu M,M,-1 # M--
  463. ST t12,0(CO2)
  464. ST t22,1*SIZE(CO2)
  465. ST t32,2*SIZE(CO2)
  466. ST t42,3*SIZE(CO2)
  467. ST t13,0(CO3)
  468. ST t23,1*SIZE(CO3)
  469. ST t33,2*SIZE(CO3)
  470. ST t43,3*SIZE(CO3)
  471. FETCH $0,4*SIZE(CO1)
  472. FETCH $0,4*SIZE(CO2)
  473. FETCH $0,4*SIZE(CO3)
  474. FETCH $0,4*SIZE(CO4)
  475. FETCH $0,8*SIZE(CO1)
  476. FETCH $0,8*SIZE(CO2)
  477. FETCH $0,8*SIZE(CO3)
  478. FETCH $0,8*SIZE(CO4)
  479. ST t14,0(CO4)
  480. daddu CO1,CO1,4*SIZE # COi += 4
  481. ST t24,1*SIZE(CO4)
  482. daddu CO2,CO2,4*SIZE
  483. ST t34,2*SIZE(CO4)
  484. daddu CO3,CO3,4*SIZE
  485. ST t44,3*SIZE(CO4)
  486. daddu PREB,BO,SPANB
  487. bnez M,.L10
  488. daddu CO4,CO4,4*SIZE
  489. #else
  490. MUL t11, ALPHA, t11 # TRMM write back part
  491. MUL t21, ALPHA, t21
  492. MUL t31, ALPHA, t31
  493. MUL t41, ALPHA, t41
  494. ST t11, 0 * SIZE(CO1)
  495. MUL t12, ALPHA, t12
  496. ST t21, 1 * SIZE(CO1)
  497. MUL t22, ALPHA, t22
  498. ST t31, 2 * SIZE(CO1)
  499. MUL t32, ALPHA, t32
  500. ST t41, 3 * SIZE(CO1)
  501. MUL t42, ALPHA, t42
  502. ST t12, 0 * SIZE(CO2)
  503. MUL t13, ALPHA, t13
  504. ST t22, 1 * SIZE(CO2)
  505. MUL t23, ALPHA, t23
  506. ST t32, 2 * SIZE(CO2)
  507. MUL t33, ALPHA, t33
  508. ST t42, 3 * SIZE(CO2)
  509. MUL t43, ALPHA, t43
  510. ST t13, 0 * SIZE(CO3)
  511. MUL t14, ALPHA, t14
  512. ST t23, 1 * SIZE(CO3)
  513. MUL t24, ALPHA, t24
  514. ST t33, 2 * SIZE(CO3)
  515. MUL t34, ALPHA, t34
  516. ST t43, 3 * SIZE(CO3)
  517. MUL t44, ALPHA, t44
  518. ST t14, 0 * SIZE(CO4)
  519. daddiu M,M,-1 # M--
  520. ST t24, 1 * SIZE(CO4)
  521. ST t34, 2 * SIZE(CO4)
  522. ST t44, 3 * SIZE(CO4)
  523. daddiu CO1,CO1, 4 * SIZE
  524. daddiu CO2,CO2, 4 * SIZE
  525. daddiu CO3,CO3, 4 * SIZE
  526. daddiu CO4,CO4, 4 * SIZE
  527. FETCH $0,4*SIZE(CO1)
  528. FETCH $0,4*SIZE(CO2)
  529. FETCH $0,4*SIZE(CO3)
  530. FETCH $0,4*SIZE(CO4)
  531. FETCH $0,0(CO1)
  532. FETCH $0,0(CO2)
  533. FETCH $0,0(CO3)
  534. FETCH $0,0(CO4)
  535. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  536. dsubu TEMP,KCO,KK
  537. #ifdef LEFT
  538. daddiu TEMP,TEMP, -4
  539. #else
  540. daddiu TEMP,TEMP, -4
  541. #endif
  542. dsll K,TEMP,2 + BASE_SHIFT
  543. dsll TEMP,TEMP,2 + BASE_SHIFT
  544. daddu A,A,K # mov A to the end of panel Ai
  545. daddu B,B,TEMP # mov B to the end of panel Bj
  546. #endif
  547. #ifdef LEFT
  548. daddiu KK, KK,4
  549. #endif
  550. bnez M,.L10
  551. nop
  552. #endif
  553. .align 3
  554. .L14_M2:
  555. andi M, MCO, 2 # nr=4,mr=2
  556. beqz M,.L14_M1
  557. nop
  558. .L20:
  559. #if defined(TRMMKERNEL)
  560. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  561. move B,BO # Reset B
  562. #else
  563. dsll K,KK,1 + BASE_SHIFT # mr=2
  564. dsll TEMP,KK,2 + BASE_SHIFT # nr=4
  565. daddu A,A,K
  566. daddu B,BO,TEMP
  567. #endif
  568. MTC $0,t11
  569. MOV t21,t11
  570. gsLQC1(R8,F1,F0,0) # a0,a1
  571. MOV t12,t11
  572. MOV t22,t11
  573. gsLQC1(R9,F9,F8,0) # b0,b1
  574. MOV t13,t11
  575. MOV t23,t11
  576. gsLQC1(R9,F11,F10,1) # b2,b3
  577. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  578. dsubu TEMP,KCO,KK
  579. #elif defined(LEFT)
  580. daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
  581. #else
  582. daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
  583. #endif
  584. dsra K,TEMP,2
  585. MOV t14,t11
  586. beqz K,.L25
  587. MOV t24,t11 # clear 2*4=8 results registers
  588. #else
  589. move B,BO # Reset B
  590. MTC $0,t11
  591. gsLQC1(R8,F1,F0,0)
  592. MOV t21,t11
  593. MOV t12,t11
  594. gsLQC1(R9,F9,F8,0)
  595. MOV t22,t11
  596. dsra K,KCO,2
  597. gsLQC1(R9,F11,F10,1)
  598. MOV t13,t11
  599. MOV t23,t11
  600. MOV t14,t11
  601. beqz K,.L25
  602. MOV t24,t11
  603. #endif
  604. .L21: # nr=4,mr=2,kr=4
  605. gsLQC1(R8,F5,F4,1)
  606. MADD t11,t11,a0,b0
  607. MADD t21,t21,a1,b0
  608. gsLQC1(R9,F13,F12,2)
  609. MADD t12,t12,a0,b1
  610. MADD t22,t22,a1,b1
  611. gsLQC1(R9,F15,F14,3)
  612. MADD t13,t13,a0,b2
  613. MADD t23,t23,a1,b2
  614. MADD t14,t14,a0,b3
  615. MADD t24,t24,a1,b3
  616. gsLQC1(R8,F3,F2,2)
  617. MADD t11,t11,a4,b4
  618. MADD t21,t21,a5,b4
  619. gsLQC1(R9,F9,F8,4)
  620. MADD t12,t12,a4,b5
  621. MADD t22,t22,a5,b5
  622. gsLQC1(R9,F11,F10,5)
  623. MADD t13,t13,a4,b6
  624. MADD t23,t23,a5,b6
  625. MADD t14,t14,a4,b7
  626. MADD t24,t24,a5,b7
  627. daddiu K,K,-1
  628. gsLQC1(R8,F7,F6,3)
  629. MADD t11,t11,a2,b0
  630. MADD t21,t21,a3,b0
  631. gsLQC1(R9,F13,F12,6)
  632. MADD t12,t12,a2,b1
  633. MADD t22,t22,a3,b1
  634. gsLQC1(R9,F15,F14,7)
  635. MADD t13,t13,a2,b2
  636. MADD t23,t23,a3,b2
  637. daddu A,A,8*SIZE # 2mr*4kr
  638. MADD t14,t14,a2,b3
  639. MADD t24,t24,a3,b3
  640. daddu B,B,16*SIZE # 4nr*4kr
  641. gsLQC1(R8,F1,F0,0)
  642. MADD t11,t11,a6,b4
  643. MADD t21,t21,a7,b4
  644. gsLQC1(R9,F9,F8,0)
  645. MADD t12,t12,a6,b5
  646. MADD t22,t22,a7,b5
  647. gsLQC1(R9,F11,F10,1)
  648. MADD t13,t13,a6,b6
  649. MADD t23,t23,a7,b6
  650. MADD t14,t14,a6,b7
  651. bnez K,.L21
  652. MADD t24,t24,a7,b7
  653. .L25:
  654. #ifndef TRMMKERNEL
  655. andi K,KCO,2 # kr=2
  656. #else
  657. andi K,TEMP,2
  658. #endif
  659. beqz K,.L28
  660. nop
  661. .L26:
  662. gsLQC1(R8,F5,F4,1)
  663. MADD t11,t11,a0,b0
  664. MADD t21,t21,a1,b0
  665. gsLQC1(R9,F13,F12,2)
  666. MADD t12,t12,a0,b1
  667. MADD t22,t22,a1,b1
  668. gsLQC1(R9,F15,F14,3)
  669. MADD t13,t13,a0,b2
  670. MADD t23,t23,a1,b2
  671. daddu A,A,4*SIZE # 2mr*2kr
  672. MADD t14,t14,a0,b3
  673. MADD t24,t24,a1,b3
  674. daddu B,B,8*SIZE # 4nr*2kr
  675. .L27:
  676. gsLQC1(R8,F1,F0,0)
  677. MADD t11,t11,a4,b4
  678. MADD t21,t21,a5,b4
  679. gsLQC1(R9,F9,F8,0)
  680. MADD t12,t12,a4,b5
  681. MADD t22,t22,a5,b5
  682. gsLQC1(R9,F11,F10,1)
  683. MADD t13,t13,a4,b6
  684. MADD t23,t23,a5,b6
  685. MADD t14,t14,a4,b7
  686. MADD t24,t24,a5,b7
  687. .L28: # kr=1
  688. #ifndef TRMMKERNEL
  689. andi K,KCO,1
  690. #else
  691. andi K,TEMP,1
  692. #endif
  693. beqz K,.L29
  694. LD ALPHA,152($sp) # Get ALPHA
  695. MADD t11,t11,a0,b0
  696. MADD t21,t21,a1,b0
  697. daddu A,A,2*SIZE # 2mr*kr
  698. daddu B,B,4*SIZE # 4nr*kr
  699. MADD t12,t12,a0,b1
  700. MADD t22,t22,a1,b1
  701. MADD t13,t13,a0,b2
  702. MADD t23,t23,a1,b2
  703. MADD t14,t14,a0,b3
  704. MADD t24,t24,a1,b3
  705. .L29: # Write Back to C
  706. #ifndef TRMMKERNEL
  707. LD c11,0(CO1) # GEMM write back part
  708. LD c21,1*SIZE(CO1)
  709. LD c12,0(CO2)
  710. LD c22,1*SIZE(CO2)
  711. LD c13,0(CO3)
  712. MADD t11,c11,t11,ALPHA
  713. LD c23,1*SIZE(CO3)
  714. MADD t21,c21,t21,ALPHA
  715. LD c14,0(CO4)
  716. MADD t12,c12,t12,ALPHA
  717. LD c24,1*SIZE(CO4)
  718. MADD t22,c22,t22,ALPHA
  719. ST t11,0(CO1)
  720. MADD t13,c13,t13,ALPHA
  721. ST t21,1*SIZE(CO1)
  722. MADD t23,c23,t23,ALPHA
  723. ST t12,0(CO2)
  724. MADD t14,c14,t14,ALPHA
  725. ST t22,1*SIZE(CO2)
  726. MADD t24,c24,t24,ALPHA
  727. ST t13,0(CO3)
  728. daddu CO1,CO1,2*SIZE # COi += 2
  729. ST t23,1*SIZE(CO3)
  730. daddu CO2,CO2,2*SIZE
  731. ST t14,0(CO4)
  732. daddu CO3,CO3,2*SIZE
  733. ST t24,1*SIZE(CO4)
  734. daddu CO4,CO4,2*SIZE
  735. FETCH $0,0(CO1)
  736. FETCH $0,0(CO2)
  737. FETCH $0,0(CO3)
  738. FETCH $0,0(CO4)
  739. #else
  740. MUL t11, ALPHA, t11 # TRMM write back part
  741. MUL t21, ALPHA, t21
  742. ST t11, 0 * SIZE(CO1)
  743. MUL t12, ALPHA, t12
  744. ST t21, 1 * SIZE(CO1)
  745. MUL t22, ALPHA, t22
  746. ST t12, 0 * SIZE(CO2)
  747. MUL t13, ALPHA, t13
  748. ST t22, 1 * SIZE(CO2)
  749. MUL t23, ALPHA, t23
  750. ST t13, 0 * SIZE(CO3)
  751. MUL t14, ALPHA, t14
  752. ST t23, 1 * SIZE(CO3)
  753. MUL t24, ALPHA, t24
  754. ST t14, 0 * SIZE(CO4)
  755. ST t24, 1 * SIZE(CO4)
  756. daddiu CO1,CO1, 2 * SIZE
  757. daddiu CO2,CO2, 2 * SIZE
  758. daddiu CO3,CO3, 2 * SIZE
  759. daddiu CO4,CO4, 2 * SIZE
  760. FETCH $0,0(CO1)
  761. FETCH $0,0(CO2)
  762. FETCH $0,0(CO3)
  763. FETCH $0,0(CO4)
  764. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  765. dsubu TEMP,KCO,KK
  766. #ifdef LEFT
  767. daddiu TEMP,TEMP,-2
  768. #else
  769. daddiu TEMP,TEMP,-4
  770. #endif
  771. dsll K,TEMP,1 + BASE_SHIFT
  772. dsll TEMP,TEMP,2 + BASE_SHIFT
  773. daddu A,A,K # move A to next panel Ai
  774. daddu B,B,TEMP # move B to next panel Bj
  775. #endif
  776. #ifdef LEFT
  777. daddiu KK, KK, 2
  778. #endif
  779. #endif
  780. .align 3
  781. .L14_M1:
  782. andi M,MCO,1 # mr=1
  783. beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
  784. nop
  /* Set-up for one 1x4 tile (mr=1, nr=4): position A and B, zero t11..t14, preload a0 and b0..b3, and compute K = number of 4-step iterations of .L31. */
  785. .L30:
  786. #if defined(TRMMKERNEL)
  787. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  788. move B,BO # Reset B
  789. #else
  790. dsll K,KK, 0 + BASE_SHIFT # byte offset of KK steps in A (1 element per step)
  791. dsll TEMP,KK,2 + BASE_SHIFT # byte offset of KK steps in B (4 elements per step)
  792. daddu A,A,K
  793. daddu B,BO,TEMP
  794. #endif
  795. MTC $0,t11 # t11 = 0; t12..t14 are copied from it
  796. MOV t12,t11
  797. LD a0, 0 * SIZE(A) # a0
  798. MOV t13,t11
  799. gsLQC1(R9,F9,F8,0) # b0,b1
  800. MOV t14,t11 # clear result registers
  801. gsLQC1(R9,F11,F10,1) # b2,b3
  802. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  803. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  804. #elif defined(LEFT)
  805. daddiu TEMP, KK, 1 # TEMP = KK + 1 (mr)
  806. #else
  807. daddiu TEMP, KK, 4 # TEMP = KK + 4 (nr)
  808. #endif
  809. dsra K,TEMP, 2 # K = TEMP/4: number of 4-step iterations
  810. nop
  811. beqz K,.L35 # fewer than 4 steps: skip the unrolled loop
  812. nop
  813. #else
  814. move B,BO # Reset B, GEMM part
  815. dsra K,KCO,2 # K = KCO/4: number of 4-step iterations
  816. LD a0, 0 * SIZE(A) # a0
  817. MTC $0,t11 # t11 = 0; t12..t14 are copied from it
  818. MOV t12,t11
  819. gsLQC1(R9,F9,F8,0) # b0,b1
  820. MOV t13,t11
  821. MOV t14,t11
  822. gsLQC1(R9,F11,F10,1) # b2,b3
  823. beqz K,.L35 # fewer than 4 steps: skip the unrolled loop
  824. nop
  825. #endif
  826. .L31: # nr=4,mr=1,kr=4
  827. LD a1, 1*SIZE(A) # load a1
  828. MADD t11,t11,a0,b0
  829. gsLQC1(R9,F13,F12,2) # b4,b5
  830. MADD t12,t12,a0,b1
  831. gsLQC1(R9,F15,F14,3) # b6,b7
  832. MADD t13,t13,a0,b2
  833. MADD t14,t14,a0,b3
  834. LD a2, 2*SIZE(A) # a2
  835. MADD t11,t11,a1,b4
  836. gsLQC1(R9,F9,F8,4)
  837. MADD t12,t12,a1,b5
  838. gsLQC1(R9,F11,F10,5)
  839. MADD t13,t13,a1,b6
  840. MADD t14,t14,a1,b7
  841. daddiu K,K,-1
  842. LD a3, 3*SIZE(A) # a3
  843. MADD t11,t11,a2,b0
  844. gsLQC1(R9,F13,F12,6)
  845. MADD t12,t12,a2,b1
  846. daddu A,A,4*SIZE # 1mr*4kr
  847. gsLQC1(R9,F15,F14,7)
  848. MADD t13,t13,a2,b2
  849. MADD t14,t14,a2,b3
  850. daddu B,B,16*SIZE # 4nr*4kr
  851. LD a0, 0*SIZE(A) # a0
  852. MADD t11,t11,a3,b4
  853. gsLQC1(R9,F9,F8,0)
  854. MADD t12,t12,a3,b5
  855. gsLQC1(R9,F11,F10,1)
  856. MADD t13,t13,a3,b6
  857. bnez K,.L31
  858. MADD t14,t14,a3,b7
  859. .L35: # kr=2
  860. #ifndef TRMMKERNEL
  861. andi K,KCO,2
  862. #else
  863. andi K,TEMP,2
  864. #endif
  865. beqz K,.L38
  866. nop
  867. .L36:
  868. LD a1,1*SIZE(A) # load a1
  869. MADD t11,t11,a0,b0
  870. gsLQC1(R9,F13,F12,2)
  871. MADD t12,t12,a0,b1
  872. daddu A,A,2*SIZE # mr*2kr
  873. gsLQC1(R9,F15,F14,3)
  874. MADD t13,t13,a0,b2
  875. MADD t14,t14,a0,b3
  876. daddu B,B,8*SIZE # 4nr*2kr
  877. .L37:
  878. LD a0,0(A)
  879. MADD t11,t11,a1,b4
  880. gsLQC1(R9,F9,F8,0)
  881. MADD t12,t12,a1,b5
  882. gsLQC1(R9,F11,F10,1)
  883. MADD t13,t13,a1,b6
  884. MADD t14,t14,a1,b7
  885. .L38: # kr=1
  886. #ifndef TRMMKERNEL
  887. andi K,KCO,1
  888. #else
  889. andi K,TEMP,1
  890. #endif
  891. beqz K,.L39
  892. LD ALPHA,152($sp) # Get ALPHA
  893. MADD t11,t11,a0,b0
  894. MADD t12,t12,a0,b1
  895. daddu A,A,1*SIZE
  896. daddu B,B,4*SIZE
  897. MADD t13,t13,a0,b2
  898. MADD t14,t14,a0,b3
  899. .L39: # Write Back
  900. #ifndef TRMMKERNEL
  901. LD c11,0(CO1)
  902. LD c12,0(CO2)
  903. LD c13,0(CO3)
  904. LD c14,0(CO4)
  905. MADD t11,c11,t11,ALPHA
  906. MADD t12,c12,t12,ALPHA
  907. MADD t13,c13,t13,ALPHA
  908. MADD t14,c14,t14,ALPHA
  909. ST t11,0(CO1)
  910. ST t12,0(CO2)
  911. ST t13,0(CO3)
  912. ST t14,0(CO4)
  913. #else
  914. MUL t11, ALPHA, t11
  915. MUL t12, ALPHA, t12
  916. MUL t13, ALPHA, t13
  917. MUL t14, ALPHA, t14
  918. ST t11, 0 * SIZE(CO1)
  919. ST t12, 0 * SIZE(CO2)
  920. ST t13, 0 * SIZE(CO3)
  921. ST t14, 0 * SIZE(CO4)
  922. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  923. dsubu TEMP, KCO, KK
  924. #ifdef LEFT
  925. daddiu TEMP, TEMP, -1
  926. #else
  927. daddiu TEMP, TEMP, -4
  928. #endif
  929. dsll K,TEMP, 0 + BASE_SHIFT
  930. dsll TEMP,TEMP, 2 + BASE_SHIFT
  931. daddu A,A,K
  932. daddu B,B,TEMP
  933. #endif
  934. #ifdef LEFT
  935. daddiu KK, KK, 1
  936. #endif
  937. #endif
  /* End of one 4-column panel: count the panel down and loop back while panels remain. */
  938. .align 3
  939. .L0_N4_Loop: # all row blocks of this panel are done
  940. daddiu N,N,-1 # N--
  941. #if defined(TRMMKERNEL) && !defined(LEFT)
  942. daddiu KK, KK,4 # advance KK by the panel width (nr=4)
  943. #endif
  944. bnez N,.L0_N4_Lb # more 4-column panels to process
  945. move BO,B # Set BO point to next panel Bj (follows the branch; presumably its delay slot, so it also runs on fall-through)
  /* Two-column tail: taken when bit 1 of NCO is set. Sets CO1/CO2, rewinds A, seeds PREA and computes the number of 4-row blocks. */
  946. .align 5
  947. .L0_N2:
  948. andi N,NCO,2 # nr = 2: test bit 1 of NCO
  949. beqz N,.L0_N1 # no 2-column panel: go to the nr=1 tail
  950. nop
  951. .L0_N2_Lb:
  952. move CO1,C # CO1 = first column of this panel
  953. daddu CO2,C,LDC # CO2 = CO1 + LDC
  954. dsra M,MCO,2 # M = MCO/4: number of 4-row blocks
  955. move A,AO # Reset A
  956. daddu PREA,AO,SPANA # PREA = AO + SPANA, used as the A prefetch cursor
  957. daddu C,CO2,LDC # C = start of the next panel
  958. #if defined(TRMMKERNEL) && defined(LEFT)
  959. move KK, OFFSET
  960. #endif
  961. beqz M,.L12_M2 # fewer than 4 rows: go to the mr=2 tail
  962. nop
  /* Set-up for one 4x2 tile (mr=4, nr=2): position A and B, zero t11..t42, preload a0..a3 and b0,b1, and compute K = number of 4-step iterations of .L41. */
  963. .L40:
  964. #if defined(TRMMKERNEL)
  965. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  966. move B,BO # Reset B
  967. #else
  968. dsll K,KK, 2 + BASE_SHIFT # byte offset of KK steps in A (4 elements per step)
  969. dsll TEMP, KK,1 + BASE_SHIFT # byte offset of KK steps in B (2 elements per step)
  970. daddu A,A,K
  971. daddu B,BO,TEMP
  972. #endif
  973. MTC $0,t11 # t11 = 0; the other accumulators are copied from it
  974. MOV t21,t11
  975. gsLQC1(R8,F1,F0,0) # a0,a1
  976. MOV t31,t11
  977. MOV t41,t11
  978. gsLQC1(R9,F9,F8,0) # b0,b1
  979. MOV t12,t11
  980. MOV t22,t11
  981. gsLQC1(R8,F3,F2,1) # a2,a3
  982. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  983. dsubu TEMP,KCO,KK # TEMP = KCO - KK
  984. #elif defined(LEFT)
  985. daddiu TEMP, KK, 4 # TEMP = KK + 4 (mr)
  986. #else
  987. daddiu TEMP, KK, 2 # TEMP = KK + 2 (nr)
  988. #endif
  989. dsra K,TEMP,2 # K = TEMP/4: number of 4-step iterations
  990. MOV t32,t11
  991. beqz K,.L45 # fewer than 4 steps: skip the unrolled loop
  992. MOV t42,t11 # last accumulator cleared after the branch (presumably its delay slot)
  993. #else
  994. move B,BO # Reset B
  995. MTC $0,t11 # gemm part
  996. gsLQC1(R8,F1,F0,0) # a0,a1
  997. MOV t21,t11
  998. MOV t31,t11
  999. gsLQC1(R9,F9,F8,0) # b0,b1
  1000. MOV t41,t11
  1001. dsra K,KCO,2 # K = KCO/4: number of 4-step iterations
  1002. gsLQC1(R8,F3,F2,1) # a2,a3
  1003. MOV t12,t11
  1004. MOV t22,t11
  1005. MOV t32,t11
  1006. beqz K,.L45 # fewer than 4 steps: skip the unrolled loop
  1007. MOV t42,t11 # last accumulator cleared after the branch (presumably its delay slot)
  1008. #endif
  1009. .L41: # nr=2,mr=kr=4
  1010. gsLQC1(R8,F5,F4,2)
  1011. MADD t11,t11,a0,b0
  1012. MADD t21,t21,a1,b0
  1013. gsLQC1(R9,F13,F12,1)
  1014. MADD t12,t12,a0,b1
  1015. MADD t22,t22,a1,b1
  1016. gsLQC1(R8,F7,F6,3)
  1017. MADD t31,t31,a2,b0
  1018. MADD t41,t41,a3,b0
  1019. FETCH $0,(PREA)
  1020. MADD t32,t32,a2,b1
  1021. MADD t42,t42,a3,b1
  1022. .L42:
  1023. gsLQC1(R8,F1,F0,4)
  1024. MADD t11,t11,a4,b4
  1025. MADD t21,t21,a5,b4
  1026. gsLQC1(R9,F11,F10,2)
  1027. MADD t12,t12,a4,b5
  1028. MADD t22,t22,a5,b5
  1029. gsLQC1(R8,F3,F2,5)
  1030. MADD t31,t31,a6,b4
  1031. MADD t41,t41,a7,b4
  1032. FETCH $0,4*SIZE(PREA)
  1033. MADD t32,t32,a6,b5
  1034. MADD t42,t42,a7,b5
  1035. .L43:
  1036. gsLQC1(R8,F5,F4,6)
  1037. MADD t11,t11,a0,b2
  1038. MADD t21,t21,a1,b2
  1039. gsLQC1(R9,F15,F14,3)
  1040. MADD t12,t12,a0,b3
  1041. MADD t22,t22,a1,b3
  1042. gsLQC1(R8,F7,F6,7)
  1043. MADD t31,t31,a2,b2
  1044. MADD t41,t41,a3,b2
  1045. daddu B,B,8*SIZE # 2nr*4kr
  1046. FETCH $0,8*SIZE(PREA)
  1047. MADD t32,t32,a2,b3
  1048. MADD t42,t42,a3,b3
  1049. daddu A,A,16*SIZE # 4mr*4kr
  1050. .L44:
  1051. gsLQC1(R8,F1,F0,0)
  1052. MADD t11,t11,a4,b6
  1053. MADD t21,t21,a5,b6
  1054. daddiu K,K,-1
  1055. gsLQC1(R9,F9,F8,0)
  1056. MADD t12,t12,a4,b7
  1057. MADD t22,t22,a5,b7
  1058. daddu PREA,PREA,16*SIZE
  1059. gsLQC1(R8,F3,F2,1)
  1060. MADD t31,t31,a6,b6
  1061. MADD t41,t41,a7,b6
  1062. FETCH $0,-4*SIZE(PREA)
  1063. MADD t32,t32,a6,b7
  1064. bnez K,.L41
  1065. MADD t42,t42,a7,b7
  1066. .L45: # kr=2
  1067. #ifndef TRMMKERNEL
  1068. andi K,KCO,2
  1069. #else
  1070. andi K,TEMP,2
  1071. #endif
  1072. beqz K,.L48
  1073. nop
  1074. .L46:
  1075. gsLQC1(R8,F5,F4,2)
  1076. MADD t11,t11,a0,b0
  1077. MADD t21,t21,a1,b0
  1078. gsLQC1(R9,F13,F12,1)
  1079. MADD t12,t12,a0,b1
  1080. MADD t22,t22,a1,b1
  1081. gsLQC1(R8,F7,F6,3)
  1082. MADD t31,t31,a2,b0
  1083. MADD t41,t41,a3,b0
  1084. daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
  1085. FETCH $0,0(PREA)
  1086. MADD t32,t32,a2,b1
  1087. MADD t42,t42,a3,b1
  1088. daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
  1089. .L47:
  1090. gsLQC1(R8,F1,F0,0)
  1091. MADD t11,t11,a4,b4
  1092. MADD t21,t21,a5,b4
  1093. gsLQC1(R9,F9,F8,0)
  1094. MADD t12,t12,a4,b5
  1095. MADD t22,t22,a5,b5
  1096. gsLQC1(R8,F3,F2,1)
  1097. MADD t31,t31,a6,b4
  1098. MADD t41,t41,a7,b4
  1099. FETCH $0,4*SIZE(PREA)
  1100. MADD t32,t32,a6,b5
  1101. MADD t42,t42,a7,b5
  1102. daddu PREA,PREA,8*SIZE
  1103. .L48: # kr=1
  1104. #ifndef TRMMKERNEL
  1105. andi K,KCO,1
  1106. #else
  1107. andi K,TEMP,1
  1108. #endif
  1109. beqz K,.L49
  1110. LD ALPHA,152($sp) # Get ALPHA
  1111. FETCH $0,0(PREA)
  1112. MADD t11,t11,a0,b0
  1113. MADD t21,t21,a1,b0
  1114. daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
  1115. MADD t12,t12,a0,b1
  1116. MADD t22,t22,a1,b1
  1117. daddu B,B,2*SIZE
  1118. daddu PREA,PREA,4*SIZE
  1119. MADD t31,t31,a2,b0
  1120. MADD t41,t41,a3,b0
  1121. MADD t32,t32,a2,b1
  1122. MADD t42,t42,a3,b1
  /* Write-back of one 4x2 tile (t11..t41 to CO1, t12..t42 to CO2), then advance to the next 4-row block. */
  /* GEMM path: each C value is loaded, combined with ALPHA and the accumulator through MADD, and stored back. */
  /* TRMM path: the accumulator is multiplied by ALPHA and stored without reading C. */
  /* MADD d,r,s,t is taken to compute d = r + s*t, as in the multiply loops (macro from common.h - TODO confirm). */
  1123. .L49: # Write Back
  1124. #ifndef TRMMKERNEL
  1125. LD c11,0(CO1) # GEMM write-back: load the 8 existing C values (4 rows x 2 columns)
  1126. LD c21,1*SIZE(CO1)
  1127. LD c31,2*SIZE(CO1)
  1128. LD c41,3*SIZE(CO1)
  1129. LD c12,0(CO2)
  1130. MADD t11,c11,t11,ALPHA
  1131. LD c22,1*SIZE(CO2)
  1132. MADD t21,c21,t21,ALPHA
  1133. LD c32,2*SIZE(CO2)
  1134. MADD t31,c31,t31,ALPHA
  1135. LD c42,3*SIZE(CO2)
  1136. MADD t41,c41,t41,ALPHA
  1137. ST t11,0(CO1)
  1138. MADD t12,c12,t12,ALPHA
  1139. ST t21,1*SIZE(CO1)
  1140. MADD t22,c22,t22,ALPHA
  1141. ST t31,2*SIZE(CO1)
  1142. MADD t32,c32,t32,ALPHA
  1143. ST t41,3*SIZE(CO1)
  1144. MADD t42,c42,t42,ALPHA
  1145. daddiu M,M,-1 # one 4-row block done
  1146. ST t12,0(CO2)
  1147. ST t22,1*SIZE(CO2)
  1148. ST t32,2*SIZE(CO2)
  1149. ST t42,3*SIZE(CO2)
  1150. FETCH $0,4*SIZE(CO1) # prefetch the C elements of the following row blocks
  1151. FETCH $0,4*SIZE(CO2)
  1152. FETCH $0,8*SIZE(CO1)
  1153. FETCH $0,8*SIZE(CO2)
  1154. daddu CO1,CO1,4*SIZE # CO1 += 4 elements
  1155. bnez M,.L40 # more 4-row blocks in this panel
  1156. daddu CO2,CO2,4*SIZE # CO2 += 4 elements (follows the branch; presumably its delay slot)
  1157. #else
  1158. MUL t11, ALPHA, t11 # TRMM write-back: scale by ALPHA, C is not read
  1159. MUL t21, ALPHA, t21
  1160. MUL t31, ALPHA, t31
  1161. MUL t41, ALPHA, t41
  1162. MUL t12, ALPHA, t12
  1163. ST t11, 0 * SIZE(CO1)
  1164. MUL t22, ALPHA, t22
  1165. ST t21, 1 * SIZE(CO1)
  1166. MUL t32, ALPHA, t32
  1167. ST t31, 2 * SIZE(CO1)
  1168. MUL t42, ALPHA, t42
  1169. ST t41, 3 * SIZE(CO1)
  1170. ST t12, 0 * SIZE(CO2)
  1171. daddiu M,M,-1 # one 4-row block done
  1172. ST t22, 1 * SIZE(CO2)
  1173. ST t32, 2 * SIZE(CO2)
  1174. ST t42, 3 * SIZE(CO2)
  1175. daddiu CO1,CO1, 4*SIZE # CO1 += 4 elements
  1176. daddiu CO2,CO2, 4*SIZE # CO2 += 4 elements
  1177. FETCH $0,0(CO1) # prefetch the next block of each column
  1178. FETCH $0,0(CO2)
  1179. FETCH $0,4*SIZE(CO1) # offset counted in elements (4*SIZE bytes), matching the 4x4 TRMM write-back
  1180. FETCH $0,4*SIZE(CO2)
  1181. #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1182. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1183. #ifdef LEFT
  1184. daddiu TEMP, TEMP, -4 # minus mr
  1185. #else
  1186. daddiu TEMP, TEMP, -2 # minus nr
  1187. #endif
  1188. dsll K,TEMP, 2 + BASE_SHIFT # remaining steps in A, in bytes (4 elements per step)
  1189. dsll TEMP, TEMP, 1 + BASE_SHIFT # remaining steps in B, in bytes (2 elements per step)
  1190. daddu A,A,K # skip the rest of this A panel
  1191. daddu B,B,TEMP # skip the rest of this B panel
  1192. #endif
  1193. #ifdef LEFT
  1194. daddiu KK, KK, 4 # advance KK by mr
  1195. #endif
  1196. bnez M,.L40 # more 4-row blocks in this panel
  1197. nop
  1198. #endif
  1199. .align 3
  1200. .L12_M2:
  1201. andi M,MCO,2 # mr = 2
  1202. beqz M,.L12_M1
  1203. nop
  /* Set-up for one 2x2 tile (mr=2, nr=2): position A and B, zero t11, t21, t12, t22, preload a0,a1 and b0,b1, and compute K = number of 4-step iterations of .L51. */
  1204. .L50:
  1205. #if defined(TRMMKERNEL)
  1206. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1207. move B,BO
  1208. #else
  1209. dsll K, KK, 1 + BASE_SHIFT # mr=2: byte offset of KK steps in A
  1210. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2: byte offset of KK steps in B
  1211. daddu A, A, K
  1212. daddu B, BO, TEMP
  1213. #endif
  1214. MTC $0,t11 # t11 = 0; the other accumulators are copied from it
  1215. gsLQC1(R8,F1,F0,0) # a0,a1
  1216. MOV t21,t11
  1217. gsLQC1(R9,F9,F8,0) # b0,b1
  1218. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1219. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1220. #elif defined(LEFT)
  1221. daddiu TEMP, KK, 2 # TEMP = KK + 2 (mr)
  1222. #else
  1223. daddiu TEMP, KK, 2 # TEMP = KK + 2 (nr)
  1224. #endif
  1225. dsra K,TEMP,2 # K = TEMP/4: number of 4-step iterations
  1226. MOV t12,t11
  1227. beqz K,.L55 # fewer than 4 steps: skip the unrolled loop
  1228. MOV t22,t11 # last accumulator cleared after the branch (presumably its delay slot)
  1229. #else
  1230. move B,BO
  1231. dsra K,KCO,2 # K = KCO/4: number of 4-step iterations
  1232. gsLQC1(R8,F1,F0,0) # a0,a1
  1233. MTC $0,t11 # t11 = 0; the other accumulators are copied from it
  1234. MOV t21,t11
  1235. gsLQC1(R9,F9,F8,0) # b0,b1
  1236. MOV t12,t11
  1237. beqz K,.L55 # fewer than 4 steps: skip the unrolled loop
  1238. MOV t22,t11 # last accumulator cleared after the branch (presumably its delay slot)
  1239. #endif
  1240. .L51: # nr=2 mr=2,kr=4
  1241. gsLQC1(R8,F5,F4,1)
  1242. MADD t11,t11,a0,b0
  1243. MADD t21,t21,a1,b0
  1244. gsLQC1(R9,F13,F12,1)
  1245. MADD t12,t12,a0,b1
  1246. MADD t22,t22,a1,b1
  1247. gsLQC1(R8,F3,F2,2)
  1248. MADD t11,t11,a4,b4
  1249. MADD t21,t21,a5,b4
  1250. gsLQC1(R9,F11,F10,2)
  1251. MADD t12,t12,a4,b5
  1252. MADD t22,t22,a5,b5
  1253. daddiu K,K,-1
  1254. gsLQC1(R8,F7,F6,3)
  1255. MADD t11,t11,a2,b2
  1256. MADD t21,t21,a3,b2
  1257. daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
  1258. gsLQC1(R9,F15,F14,3)
  1259. MADD t12,t12,a2,b3
  1260. MADD t22,t22,a3,b3
  1261. daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=16*SIZE
  1262. gsLQC1(R8,F1,F0,0)
  1263. MADD t11,t11,a6,b6
  1264. MADD t21,t21,a7,b6
  1265. gsLQC1(R9,F9,F8,0)
  1266. MADD t12,t12,a6,b7
  1267. bnez K,.L51
  1268. MADD t22,t22,a7,b7
  1269. .L55: # kr=2
  1270. #ifndef TRMMKERNEL
  1271. andi K,KCO,2
  1272. #else
  1273. andi K,TEMP,2
  1274. #endif
  1275. beqz K,.L58
  1276. nop
  1277. .L56:
  1278. gsLQC1(R8,F5,F4,1)
  1279. MADD t11,t11,a0,b0
  1280. MADD t21,t21,a1,b0
  1281. daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
  1282. gsLQC1(R9,F13,F12,1)
  1283. MADD t12,t12,a0,b1
  1284. MADD t22,t22,a1,b1
  1285. daddu B,B,4*SIZE # 2nr*2kr
  1286. .L57:
  1287. gsLQC1(R8,F1,F0,0)
  1288. MADD t11,t11,a4,b4
  1289. MADD t21,t21,a5,b4
  1290. gsLQC1(R9,F9,F8,0)
  1291. MADD t12,t12,a4,b5
  1292. MADD t22,t22,a5,b5
  1293. .L58: # kr=1
  1294. #ifndef TRMMKERNEL
  1295. andi K,KCO,1
  1296. #else
  1297. andi K,TEMP, 1
  1298. #endif
  1299. beqz K,.L59
  1300. LD ALPHA,152($sp) # Get ALPHA
  1301. MADD t11,t11,a0,b0
  1302. MADD t21,t21,a1,b0
  1303. daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
  1304. daddu B,B,2*SIZE # 2nr*kr
  1305. MADD t12,t12,a0,b1
  1306. MADD t22,t22,a1,b1
  1307. .L59: # Write Back
  1308. #ifndef TRMMKERNEL
  1309. LD c11,0(CO1) # write gemm part back Fetch 16 C
  1310. LD c21,1*SIZE(CO1)
  1311. LD c12,0(CO2)
  1312. LD c22,1*SIZE(CO2)
  1313. MADD t11,c11,t11,ALPHA
  1314. MADD t21,c21,t21,ALPHA
  1315. MADD t12,c12,t12,ALPHA
  1316. MADD t22,c22,t22,ALPHA
  1317. ST t11,0(CO1)
  1318. ST t21,1*SIZE(CO1)
  1319. ST t12,0(CO2)
  1320. ST t22,1*SIZE(CO2)
  1321. daddu CO1,CO1,2*SIZE
  1322. daddu CO2,CO2,2*SIZE
  1323. FETCH $0,0(CO1)
  1324. FETCH $0,0(CO2)
  1325. #else
  1326. daddiu M, M, -1
  1327. daddiu CO1,CO1, 2 * SIZE
  1328. daddiu CO2,CO2, 2 * SIZE
  1329. MUL t11, ALPHA, t11
  1330. MUL t21, ALPHA, t21
  1331. MUL t12, ALPHA, t12
  1332. MUL t22, ALPHA, t22
  1333. ST t11, -2 * SIZE(CO1)
  1334. ST t21, -1 * SIZE(CO1)
  1335. ST t12, -2 * SIZE(CO2)
  1336. ST t22, -1 * SIZE(CO2)
  1337. FETCH $0,0(CO1)
  1338. FETCH $0,0(CO2)
  1339. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1340. dsubu TEMP, KCO, KK
  1341. #ifdef LEFT
  1342. daddiu TEMP, TEMP, -2
  1343. #else
  1344. daddiu TEMP, TEMP, -2
  1345. #endif
  1346. dsll K, TEMP, 1 + BASE_SHIFT
  1347. dsll TEMP, TEMP, 1 + BASE_SHIFT
  1348. daddu A, A, K
  1349. daddu B, B, TEMP
  1350. #endif
  1351. #ifdef LEFT
  1352. daddiu KK, KK, 2
  1353. #endif
  1354. #endif
  1355. .align 3
  1356. .L12_M1: # nr=2 panel: single-row remainder of the row count MCO
  1357. andi M,MCO,1 # mr = 1: M = bit 0 of MCO
  1358. beqz M,.L0_N2_Loop # no odd row: finish this nr=2 panel
  1359. nop # branch delay slot
  1360. .L60: # set up one 1x2 tile (accumulators t11,t12)
  1361. #if defined(TRMMKERNEL)
  1362. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1363. move B,BO # Reset B
  1364. #else
  1365. dsll K, KK, 0 + BASE_SHIFT # mr=1: K = KK << BASE_SHIFT
  1366. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2: TEMP = KK << (1 + BASE_SHIFT)
  1367. daddu A, A, K # skip A forward by that offset
  1368. daddu B, BO, TEMP # B = BO plus that offset
  1369. #endif
  1370. MTC $0,t11 # t11 <- $0, the other accumulators are copied from it
  1371. LD a0, 0*SIZE(A) # a0 preloaded from A
  1372. MOV t21,t11
  1373. gsLQC1(R9,F9,F8,0) # b0,b1 preloaded from B
  1374. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1375. dsubu TEMP, KCO, KK # TEMP = k steps for this tile = KCO - KK
  1376. #elif defined(LEFT)
  1377. daddiu TEMP, KK, 1 # TEMP = KK + 1 (mr)
  1378. #else
  1379. daddiu TEMP, KK, 2 # TEMP = KK + 2 (nr)
  1380. #endif
  1381. dsra K,TEMP,2 # K = TEMP/4: trips of the kr=4 loop
  1382. MOV t12,t11
  1383. beqz K,.L65 # fewer than 4 k steps: skip the unrolled loop
  1384. MOV t22,t11 # branch delay slot (t22 is not stored by this tile)
  1385. #else
  1386. dsra K,KCO,2 # K = KCO/4: trips of the kr=4 loop
  1387. move B,BO # Reset B
  1388. LD a0,0*SIZE(A) # a0 preloaded from A
  1389. MTC $0,t11 # t11 <- $0, the other accumulators are copied from it
  1390. MOV t21,t11
  1391. gsLQC1(R9,F9,F8,0) # b0,b1 preloaded from B
  1392. MOV t12,t11
  1393. beqz K,.L65 # fewer than 4 k steps: skip the unrolled loop
  1394. MOV t22,t11 # branch delay slot (t22 is not stored by this tile)
  1395. #endif
  1396. .L61: # nr=2,mr=1,kr=4
  1397. LD a4, 1*SIZE(A) # A value for k step 2
  1398. MADD t11,t11,a0,b0 # k step 1
  1399. gsLQC1(R9,F13,F12,1) # b4,b5 for k step 2
  1400. MADD t12,t12,a0,b1
  1401. LD a2, 2*SIZE(A) # A value for k step 3
  1402. MADD t11,t11,a4,b4 # k step 2
  1403. gsLQC1(R9,F11,F10,2) # b2,b3 for k step 3
  1404. MADD t12,t12,a4,b5
  1405. LD a6, 3*SIZE(A) # A value for k step 4
  1406. MADD t11,t11,a2,b2 # k step 3
  1407. daddiu K,K,-1 # one unrolled trip consumed
  1408. gsLQC1(R9,F15,F14,3) # b6,b7 for k step 4
  1409. MADD t12,t12,a2,b3
  1410. daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
  1411. LD a0, 0*SIZE(A) # a0 for the next trip or the tail
  1412. MADD t11,t11,a6,b6 # k step 4
  1413. daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
  1414. gsLQC1(R9,F9,F8,0) # b0,b1 for the next trip or the tail
  1415. bnez K,.L61 # loop while unrolled trips remain
  1416. MADD t12,t12,a6,b7 # branch delay slot: last MADD of k step 4
  1417. .L65: # kr=2 tail
  1418. #ifndef TRMMKERNEL
  1419. andi K,KCO,2 # bit 1 of the k count
  1420. #else
  1421. andi K,TEMP,2 # bit 1 of the tile k count
  1422. #endif
  1423. beqz K,.L68 # no 2-step tail
  1424. nop # branch delay slot
  1425. .L66: # first of the two tail steps
  1426. LD a4, 1*SIZE(A) # A value for the second tail step
  1427. MADD t11,t11,a0,b0
  1428. daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
  1429. gsLQC1(R9,F13,F12,1) # b4,b5 for the second tail step
  1430. MADD t12,t12,a0,b1
  1431. daddu B,B,4*SIZE # B+=2(nr)*2(kr) elements
  1432. .L67: # second tail step
  1433. LD a0,0(A) # a0 for a possible kr=1 step
  1434. MADD t11,t11,a4,b4
  1435. gsLQC1(R9,F9,F8,0) # b0,b1 for a possible kr=1 step
  1436. MADD t12,t12,a4,b5
  1437. .L68: # kr=1 tail
  1438. #ifndef TRMMKERNEL
  1439. andi K,KCO,1 # bit 0 of the k count
  1440. #else
  1441. andi K,TEMP,1 # bit 0 of the tile k count
  1442. #endif
  1443. beqz K,.L69 # no single-step tail
  1444. LD ALPHA,152($sp) # Get ALPHA (branch delay slot, runs on both paths)
  1445. MADD t11,t11,a0,b0
  1446. MADD t12,t12,a0,b1
  1447. daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=8
  1448. daddu B,B,2*SIZE # B+=2(nr)*1(kr) elements
  1449. .L69: # Write Back
  1450. #ifndef TRMMKERNEL
  1451. LD c11,0(CO1) # gemm write back: load the 1x2 block of C (2 values)
  1452. LD c12,0(CO2)
  1453. MADD t11,c11,t11,ALPHA # combine the loaded C value with ALPHA and the accumulator
  1454. MADD t12,c12,t12,ALPHA
  1455. ST t11,0(CO1)
  1456. ST t12,0(CO2)
  1457. daddu CO1,CO1,1*SIZE # step both C pointers past the row
  1458. daddu CO2,CO2,1*SIZE
  1459. #else
  1460. MUL t11, ALPHA, t11 # trmm: scale by ALPHA only, C is not read
  1461. MUL t12, ALPHA, t12
  1462. ST t11, 0 * SIZE(CO1)
  1463. ST t12, 0 * SIZE(CO2)
  1464. daddu CO1,CO1,1*SIZE # step both C pointers past the row
  1465. daddu CO2,CO2,1*SIZE
  1466. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1467. dsubu TEMP, KCO, KK # k steps not covered by this tile
  1468. #ifdef LEFT
  1469. daddiu TEMP, TEMP, -1 # minus mr
  1470. #else
  1471. daddiu TEMP, TEMP, -2 # minus nr
  1472. #endif
  1473. dsll K, TEMP, 0 + BASE_SHIFT # mr=1 scaling for A
  1474. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2 scaling for B
  1475. daddu A, A, K # skip the remaining part of A
  1476. daddu B, B, TEMP # skip the remaining part of B
  1477. #endif
  1478. #ifdef LEFT
  1479. daddiu KK, KK, 1 # KK advances by the row just handled
  1480. #endif
  1481. #endif
  1482. .L0_N2_Loop: # end of the nr=2 panel
  1483. #if defined(TRMMKERNEL) && !defined(LEFT)
  1484. daddiu KK, KK, 2 # KK advances by the 2 columns just handled
  1485. #endif
  1486. move BO, B # the next panel starts where B stopped
  1487. .align 5
  1488. .L0_N1: # single remaining column of the column count NCO
  1489. andi N,NCO,1 # nr = 1: N = bit 0 of NCO
  1490. beqz N,.L999 # no odd column: go to the epilogue
  1491. nop # branch delay slot
  1492. move CO1,C # CO1 walks the single C column
  1493. dsra M,MCO,2 # M = MCO/4: number of 4-row tiles
  1494. move A,AO # Reset A
  1495. daddu PREA,AO,SPANA # PREA = AO + SPANA, used only by the FETCH loads below
  1496. #if defined(TRMMKERNEL) && defined(LEFT)
  1497. move KK, OFFSET # restart KK from OFFSET for this column
  1498. #endif
  1499. beqz M,.L11_M2 # no full 4-row tile: go to the 2-row remainder
  1500. daddu C,CO1,LDC # branch delay slot: C += LDC on both paths
  1501. .L70: # set up one 4x1 tile (accumulators t11,t21,t31,t41)
  1502. #if defined(TRMMKERNEL)
  1503. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1504. move B, BO # Reset B
  1505. #else
  1506. dsll K, KK, 2 + BASE_SHIFT # mr=4: K = KK << (2 + BASE_SHIFT)
  1507. dsll TEMP, KK, 0 + BASE_SHIFT # nr=1: TEMP = KK << BASE_SHIFT
  1508. daddu A, A, K # skip A forward by that offset
  1509. daddu B, BO, TEMP # B = BO plus that offset
  1510. #endif
  1511. MTC $0,t11 # t11 <- $0, the other accumulators are copied from it
  1512. LD b0, 0*SIZE(B) # b0 preloaded from B
  1513. MOV t21,t11
  1514. gsLQC1(R8,F1,F0,0) #a0,a1
  1515. MOV t31,t11
  1516. gsLQC1(R8,F3,F2,1) #a2,a3
  1517. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1518. dsubu TEMP, KCO, KK # TEMP = k steps for this tile = KCO - KK
  1519. #elif defined(LEFT)
  1520. daddiu TEMP, KK, 4 # TEMP = KK + 4 (mr)
  1521. #else
  1522. daddiu TEMP, KK, 1 # TEMP = KK + 1 (nr)
  1523. #endif
  1524. dsra K,TEMP,2 # K = TEMP/4: trips of the kr=4 loop
  1525. MOV t41,t11
  1526. beqz K,.L75 # fewer than 4 k steps: skip the unrolled loop
  1527. nop # branch delay slot
  1528. #else
  1529. move B, BO # Reset B
  1530. dsra K,KCO,2 # K = KCO/4: trips of the kr=4 loop
  1531. LD b0, 0*SIZE(B) # b0 preloaded from B
  1532. MTC $0,t11 # t11 <- $0, the other accumulators are copied from it
  1533. MOV t21,t11
  1534. gsLQC1(R8,F1,F0,0) #a0,a1
  1535. MOV t31,t11
  1536. MOV t41,t11
  1537. gsLQC1(R8,F3,F2,1) #a2,a3
  1538. beqz K,.L75 # fewer than 4 k steps: skip the unrolled loop
  1539. nop # branch delay slot
  1540. #endif
  1541. .L71: # nr=1,mr=kr=4: k step 1
  1542. LD b4, 1*SIZE(B) # B value for k step 2
  1543. MADD t11,t11,a0,b0
  1544. gsLQC1(R8,F5,F4,2) # a4,a5 for k step 2
  1545. MADD t21,t21,a1,b0
  1546. gsLQC1(R8,F7,F6,3) # a6,a7 for k step 2
  1547. FETCH $0,(PREA) # load into $0 (result discarded): touch memory at PREA
  1548. MADD t31,t31,a2,b0
  1549. MADD t41,t41,a3,b0
  1550. .L72: # k step 2
  1551. LD b2, 2*SIZE(B) # B value for k step 3
  1552. MADD t11,t11,a4,b4
  1553. gsLQC1(R8,F1,F0,4) # a0,a1 for k step 3
  1554. MADD t21,t21,a5,b4
  1555. gsLQC1(R8,F3,F2,5) # a2,a3 for k step 3
  1556. FETCH $0,4*SIZE(PREA)
  1557. MADD t31,t31,a6,b4
  1558. MADD t41,t41,a7,b4
  1559. .L73: # k step 3
  1560. LD b6, 3*SIZE(B) # B value for k step 4
  1561. MADD t11,t11,a0,b2
  1562. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1563. gsLQC1(R8,F5,F4,6) # a4,a5 for k step 4
  1564. MADD t21,t21,a1,b2
  1565. FETCH $0,8*SIZE(PREA)
  1566. gsLQC1(R8,F7,F6,7) # a6,a7 for k step 4
  1567. MADD t31,t31,a2,b2
  1568. MADD t41,t41,a3,b2
  1569. daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
  1570. .L74: # k step 4, with preloads for the next trip or the tail
  1571. LD b0, 0*SIZE(B)
  1572. MADD t11,t11,a4,b6
  1573. daddu PREA,PREA,16*SIZE # PREA advances by the same 16 elements as A
  1574. gsLQC1(R8,F1,F0,0) # a0,a1
  1575. MADD t21,t21,a5,b6
  1576. daddiu K,K,-1 # one unrolled trip consumed
  1577. FETCH $0,-32(PREA) # 32 bytes below the already advanced PREA
  1578. gsLQC1(R8,F3,F2,1) # a2,a3
  1579. MADD t31,t31,a6,b6
  1580. bnez K,.L71 # loop while unrolled trips remain
  1581. MADD t41,t41,a7,b6 # branch delay slot: last MADD of k step 4
  1582. .L75: # kr=2 tail
  1583. #ifndef TRMMKERNEL
  1584. andi K,KCO,2 # bit 1 of the k count
  1585. #else
  1586. andi K,TEMP,2 # bit 1 of the tile k count
  1587. #endif
  1588. beqz K,.L78 # no 2-step tail
  1589. nop # branch delay slot
  1590. .L76: # first of the two tail steps
  1591. LD b4, 1*SIZE(B) # B value for the second tail step
  1592. MADD t11,t11,a0,b0
  1593. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  1594. gsLQC1(R8,F5,F4,2) # a4,a5 for the second tail step
  1595. MADD t21,t21,a1,b0
  1596. FETCH $0,0(PREA)
  1597. gsLQC1(R8,F7,F6,3) # a6,a7 for the second tail step
  1598. MADD t31,t31,a2,b0
  1599. MADD t41,t41,a3,b0
  1600. daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
  1601. .L77: # second tail step
  1602. LD b0,0(B) # b0 for a possible kr=1 step
  1603. MADD t11,t11,a4,b4
  1604. gsLQC1(R8,F1,F0,0) # a0,a1 for a possible kr=1 step
  1605. MADD t21,t21,a5,b4
  1606. FETCH $0,4*SIZE(PREA)
  1607. gsLQC1(R8,F3,F2,1) # a2,a3 for a possible kr=1 step
  1608. MADD t31,t31,a6,b4
  1609. MADD t41,t41,a7,b4
  1610. daddu PREA,PREA,8*SIZE # PREA advances by the same 8 elements as A
  1611. .L78: # kr=1 tail
  1612. #ifndef TRMMKERNEL
  1613. andi K,KCO,1 # bit 0 of the k count
  1614. #else
  1615. andi K,TEMP,1 # bit 0 of the tile k count
  1616. #endif
  1617. beqz K,.L79 # no single-step tail
  1618. LD ALPHA,152($sp) # Get ALPHA (branch delay slot, runs on both paths)
  1619. FETCH $0,0(PREA)
  1620. MADD t11,t11,a0,b0
  1621. MADD t21,t21,a1,b0
  1622. daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
  1623. MADD t31,t31,a2,b0
  1624. MADD t41,t41,a3,b0
  1625. daddu B,B,1*SIZE # B+=1(nr)*1(kr) element
  1626. daddu PREA,PREA,4*SIZE # PREA advances by the same 4 elements as A
  1627. .L79: # Write Back
  1628. #ifndef TRMMKERNEL
  1629. LD c11,0(CO1) # gemm write back: load the 4x1 block of C (4 values)
  1630. LD c21,1*SIZE(CO1)
  1631. LD c31,2*SIZE(CO1)
  1632. LD c41,3*SIZE(CO1)
  1633. MADD t11,c11,t11,ALPHA # combine the loaded C value with ALPHA and the accumulator
  1634. MADD t21,c21,t21,ALPHA
  1635. MADD t31,c31,t31,ALPHA
  1636. MADD t41,c41,t41,ALPHA
  1637. ST t11,0(CO1)
  1638. ST t21,1*SIZE(CO1)
  1639. ST t31,2*SIZE(CO1)
  1640. ST t41,3*SIZE(CO1)
  1641. daddiu M,M,-1 # M--
  1642. FETCH $0,4*SIZE(CO1) # load into $0 (result discarded): touch C ahead of CO1
  1643. FETCH $0,8*SIZE(CO1)
  1644. bnez M,.L70 # M!=0
  1645. daddu CO1,CO1,4*SIZE # COx += 4*8Byte (branch delay slot)
  1646. #else
  1647. daddiu M,M,-1 # M--
  1648. MUL t11, ALPHA, t11 # trmm: scale by ALPHA only, C is not read
  1649. MUL t21, ALPHA, t21
  1650. MUL t31, ALPHA, t31
  1651. MUL t41, ALPHA, t41
  1652. ST t11,0(CO1)
  1653. ST t21,1*SIZE(CO1)
  1654. ST t31,2*SIZE(CO1)
  1655. ST t41,3*SIZE(CO1)
  1656. FETCH $0,4*SIZE(CO1) # load into $0 (result discarded): touch C ahead of CO1
  1657. FETCH $0,8*SIZE(CO1)
  1658. daddu CO1,CO1,4*SIZE # COx += 4*8Byte
  1659. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1660. dsubu TEMP, KCO, KK # k steps not covered by this tile
  1661. #ifdef LEFT
  1662. daddiu TEMP, TEMP, -4 # minus mr
  1663. #else
  1664. daddiu TEMP, TEMP, -1 # minus nr
  1665. #endif
  1666. dsll K, TEMP, 2 + BASE_SHIFT # mr=4 scaling for A
  1667. dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1 scaling for B
  1668. daddu A, A,K # skip the remaining part of A
  1669. daddu B, B, TEMP # skip the remaining part of B
  1670. #endif
  1671. #ifdef LEFT
  1672. daddiu KK, KK, 4 # KK advances by the 4 rows just handled
  1673. #endif
  1674. bnez M,.L70 # more 4-row tiles remain
  1675. nop # branch delay slot
  1676. #endif
  1677. .align 3
  1678. .L11_M2: # nr=1 column: 2-row remainder of the row count MCO
  1679. andi M,MCO,2 # mr = 2: M = bit 1 of MCO
  1680. beqz M,.L11_M1 # no 2-row remainder: go to the 1-row case
  1681. nop # branch delay slot
  1682. .L80: # set up one 2x1 tile (accumulators t11,t21)
  1683. #if defined(TRMMKERNEL)
  1684. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1685. move B, BO # B restarts at the panel base BO
  1686. #else
  1687. dsll K, KK, 1 + BASE_SHIFT # mr=2: K = KK << (1 + BASE_SHIFT)
  1688. dsll TEMP, KK, 0 + BASE_SHIFT # nr=1: TEMP = KK << BASE_SHIFT
  1689. daddu A, A, K # skip A forward by that offset
  1690. daddu B, BO, TEMP # B = BO plus that offset
  1691. #endif
  1692. LD b0, 0*SIZE(B) # b0 preloaded from B
  1693. MTC $0,t11 # t11 <- $0, t21 is copied from it
  1694. gsLQC1(R8,F1,F0,0) #a0,a1
  1695. MOV t21,t11
  1696. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1697. dsubu TEMP, KCO, KK # TEMP = k steps for this tile = KCO - KK
  1698. #elif defined(LEFT)
  1699. daddiu TEMP, KK, 2 # TEMP = KK + 2 (mr)
  1700. #else
  1701. daddiu TEMP, KK, 1 # TEMP = KK + 1 (nr)
  1702. #endif
  1703. dsra K,TEMP,2 # K=TEMP/4: trips of the kr=4 loop
  1704. beqz K,.L85 # fewer than 4 k steps: skip the unrolled loop
  1705. nop # branch delay slot
  1706. #else
  1707. move B, BO # B restarts at the panel base BO
  1708. dsra K,KCO,2 # K = KCO/4: trips of the kr=4 loop
  1709. LD b0, 0*SIZE(B) # b0 preloaded from B
  1710. MTC $0,t11 # t11 <- $0, t21 is copied from it
  1711. MOV t21,t11
  1712. gsLQC1(R8,F1,F0,0) #a0,a1
  1713. beqz K,.L85 # fewer than 4 k steps: skip the unrolled loop
  1714. nop # branch delay slot
  1715. #endif
  1716. .L81: # nr=1,mr=2,kr=4
  1717. LD b4, 1*SIZE(B) # B value for k step 2
  1718. gsLQC1(R8,F5,F4,1) # a4,a5 for k step 2
  1719. MADD t11,t11,a0,b0 # k step 1
  1720. MADD t21,t21,a1,b0
  1721. LD b2, 2*SIZE(B) # B value for k step 3
  1722. gsLQC1(R8,F3,F2,2) # a2,a3 for k step 3
  1723. MADD t11,t11,a4,b4 # k step 2
  1724. MADD t21,t21,a5,b4
  1725. LD b6, 3*SIZE(B) # B value for k step 4
  1726. gsLQC1(R8,F7,F6,3) # a6,a7 for k step 4
  1727. MADD t11,t11,a2,b2 # k step 3
  1728. MADD t21,t21,a3,b2
  1729. daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
  1730. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1731. LD b0, 0*SIZE(B) # b0 for the next trip or the tail
  1732. gsLQC1(R8,F1,F0,0) # a0,a1 for the next trip or the tail
  1733. MADD t11,t11,a6,b6 # k step 4
  1734. MADD t21,t21,a7,b6
  1735. daddiu K,K,-1 # one unrolled trip consumed
  1736. bnez K,.L81 # loop while unrolled trips remain
  1737. nop # branch delay slot
  1738. .L85: # kr=2 tail
  1739. #ifndef TRMMKERNEL
  1740. andi K,KCO,2 # bit 1 of the k count
  1741. #else
  1742. andi K,TEMP,2 # bit 1 of the tile k count
  1743. #endif
  1744. beqz K,.L88 # no 2-step tail
  1745. nop # branch delay slot
  1746. .L86: # both tail steps
  1747. gsLQC1(R8,F5,F4,1) # a4,a5 for the second tail step
  1748. LD b4, 1*SIZE(B) # B value for the second tail step
  1749. MADD t11,t11,a0,b0
  1750. MADD t21,t21,a1,b0
  1751. daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
  1752. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  1753. gsLQC1(R8,F1,F0,0) # a0,a1 for a possible kr=1 step
  1754. LD b0,0(B) # b0 for a possible kr=1 step
  1755. MADD t11,t11,a4,b4
  1756. MADD t21,t21,a5,b4
  1757. .L88: # kr=1 tail
  1758. #ifndef TRMMKERNEL
  1759. andi K,KCO,1 # bit 0 of the k count
  1760. #else
  1761. andi K,TEMP,1 # bit 0 of the tile k count
  1762. #endif
  1763. beqz K,.L89 # no single-step tail
  1764. LD ALPHA,152($sp) # Get ALPHA (branch delay slot, runs on both paths)
  1765. MADD t11,t11,a0,b0
  1766. MADD t21,t21,a1,b0
  1767. daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
  1768. daddu B,B,1*SIZE # B+=1(nr)*1(kr) element
  1769. .L89: # Write Back
  1770. #ifndef TRMMKERNEL
  1771. LD c11,0(CO1) # gemm write back: load the 2x1 block of C (2 values)
  1772. LD c21,1*SIZE(CO1)
  1773. MADD t11,c11,t11,ALPHA # combine the loaded C value with ALPHA and the accumulator
  1774. MADD t21,c21,t21,ALPHA
  1775. ST t11,0(CO1)
  1776. ST t21,1*SIZE(CO1)
  1777. FETCH $0,2*SIZE(CO1) # load into $0 (result discarded): touch the next C location
  1778. daddu CO1,CO1,2*SIZE # COx += 2*8Byte
  1779. #else
  1780. daddu CO1,CO1,2*SIZE # COx += 2*8Byte, the stores below use negative offsets
  1781. MUL t11, ALPHA, t11 # trmm: scale by ALPHA only, C is not read
  1782. MUL t21, ALPHA, t21
  1783. FETCH $0,0(CO1) # load into $0 (result discarded): touch the next C location
  1784. ST t11, -2 * SIZE(CO1)
  1785. ST t21, -1 * SIZE(CO1)
  1786. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1787. dsubu TEMP, KCO, KK # k steps not covered by this tile
  1788. #ifdef LEFT
  1789. daddiu TEMP, TEMP, -2 # minus mr
  1790. #else
  1791. daddiu TEMP, TEMP, -1 # minus nr
  1792. #endif
  1793. dsll K, TEMP, 1 + BASE_SHIFT # mr=2 scaling for A
  1794. dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1 scaling for B
  1795. daddu A, A, K # skip the remaining part of A
  1796. daddu B, B, TEMP # skip the remaining part of B
  1797. #endif
  1798. #ifdef LEFT
  1799. daddiu KK, KK, 2 # KK advances by the 2 rows just handled
  1800. #endif
  1801. #endif
  1802. .align 3
  1803. .L11_M1: # nr=1 column: single-row remainder of the row count MCO
  1804. andi M,MCO,1 # mr = 1: M = bit 0 of MCO
  1805. beqz M,.L999 # no odd row: go to the epilogue
  1806. nop # branch delay slot
  1807. .L90: # set up the 1x1 tile (accumulator t11)
  1808. #if defined(TRMMKERNEL)
  1809. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1810. move B, BO # B restarts at the panel base BO
  1811. #else
  1812. dsll K, KK, 0 + BASE_SHIFT # mr=1: K = KK << BASE_SHIFT
  1813. dsll TEMP, KK, 0 + BASE_SHIFT # nr=1: TEMP = KK << BASE_SHIFT
  1814. daddu A, A, K # skip A forward by that offset
  1815. daddu B, BO, TEMP # B = BO plus that offset
  1816. #endif
  1817. LD a0, 0*SIZE(A) # a0 preloaded from A
  1818. LD b0, 0*SIZE(B) # b0 preloaded from B
  1819. MTC $0,t11 # t11 <- $0
  1820. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1821. dsubu TEMP, KCO, KK # TEMP = k steps for this tile = KCO - KK
  1822. #elif defined(LEFT)
  1823. daddiu TEMP, KK, 1 # TEMP = KK + 1 (mr)
  1824. #else
  1825. daddiu TEMP, KK, 1 # TEMP = KK + 1 (nr)
  1826. #endif
  1827. dsra K, TEMP, 2 # K = TEMP/4: trips of the kr=4 loop
  1828. beqz K,.L95 # fewer than 4 k steps: skip the unrolled loop
  1829. nop # branch delay slot
  1830. #else
  1831. move B, BO # B restarts at the panel base BO
  1832. LD a0, 0*SIZE(A) # a0 preloaded from A
  1833. LD b0, 0*SIZE(B) # b0 preloaded from B
  1834. dsra K,KCO,2 # K = KCO/4: trips of the kr=4 loop
  1835. beqz K,.L95 # fewer than 4 k steps: skip the unrolled loop
  1836. MTC $0,t11 # branch delay slot: t11 <- $0 on both paths
  1837. #endif
  1838. .L91: # nr=mr=1,kr=4
  1839. LD a4, 1*SIZE(A) # operands for k step 2
  1840. LD b4, 1*SIZE(B)
  1841. MADD t11,t11,a0,b0 # k step 1
  1842. LD a2, 2*SIZE(A) # operands for k step 3
  1843. LD b2, 2*SIZE(B)
  1844. MADD t11,t11,a4,b4 # k step 2
  1845. LD a6, 3*SIZE(A) # operands for k step 4
  1846. LD b6, 3*SIZE(B)
  1847. MADD t11,t11,a2,b2 # k step 3
  1848. daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
  1849. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1850. LD a0, 0*SIZE(A) # operands for the next trip or the tail
  1851. LD b0, 0*SIZE(B)
  1852. MADD t11,t11,a6,b6 # k step 4
  1853. daddiu K,K,-1 # one unrolled trip consumed
  1854. bnez K,.L91 # loop while unrolled trips remain
  1855. nop # branch delay slot
  1856. .L95: # kr=2 tail
  1857. #ifndef TRMMKERNEL
  1858. andi K,KCO,2 # bit 1 of the k count
  1859. #else
  1860. andi K,TEMP,2 # bit 1 of the tile k count
  1861. #endif
  1862. beqz K,.L98 # no 2-step tail
  1863. nop # branch delay slot
  1864. .L96: # both tail steps
  1865. LD a4, 1*SIZE(A) # operands for the second tail step
  1866. LD b4, 1*SIZE(B)
  1867. MADD t11,t11,a0,b0
  1868. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  1869. daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
  1870. LD b0,0(B) # operands for a possible kr=1 step
  1871. LD a0,0(A)
  1872. MADD t11,t11,a4,b4
  1873. .L98: # kr=1 tail
  1874. #ifndef TRMMKERNEL
  1875. andi K,KCO,1 # bit 0 of the k count
  1876. #else
  1877. andi K,TEMP,1 # bit 0 of the tile k count
  1878. #endif
  1879. beqz K,.L99 # no single-step tail
  1880. LD ALPHA,152($sp) # Get ALPHA (branch delay slot, runs on both paths)
  1881. MADD t11,t11,a0,b0 # last tile: A and B are not advanced afterwards
  1882. .L99: # Write Back
  1883. #ifndef TRMMKERNEL
  1884. LD c11,0(CO1) # gemm write back: load the single C value
  1885. MADD t11,c11,t11,ALPHA # combine the loaded C value with ALPHA and the accumulator
  1886. ST t11,0(CO1)
  1887. #else
  1888. MUL t11, ALPHA, t11 # trmm: scale by ALPHA only, C is not read
  1889. ST t11, 0 * SIZE(CO1)
  1890. #endif
  1891. .L999: # End: reload registers from the 160-byte stack frame and return
  1892. ld $16, 0($sp) # $16 is used as CO3; presumably saved by the prologue (not visible here)
  1893. ld $17, 8($sp) # $17 is used as CO4
  1894. ld $18, 16($sp) # $18 is used as KCO
  1895. ld $19, 24($sp) # $19 is used as MCO
  1896. ld $20, 32($sp) # $20 is used as NCO
  1897. ld $21, 40($sp) # $21 is used as SPANB
  1898. ld $22, 48($sp)
  1899. LD $f24, 56($sp) # $f24..$f28 are used as accumulators t32,t42,t12,t22,t31
  1900. LD $f25, 64($sp)
  1901. LD $f26, 72($sp)
  1902. LD $f27, 80($sp)
  1903. LD $f28, 88($sp)
  1904. ld $23, 96($sp) # $23 is used as PREB
  1905. ld $24, 104($sp) # $24 is used as PREA
  1906. ld $25, 112($sp) # $25 is used as SPANA
  1907. LD $f20,120($sp) # $f20..$f23 are used as accumulators t33,t43,t13,t23
  1908. LD $f21,128($sp)
  1909. LD $f22,136($sp)
  1910. LD $f23,144($sp)
  1911. j $31 # return to the address in $31
  1912. daddiu $sp, $sp, 160 # branch delay slot: release the 160-byte frame
  1913. EPILOGUE