You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Integer register aliases. M, N, K are the loop bounds used below; */
  /* A, B, C are the base pointers; LDC is loaded from the stack in the */
  /* prologue and used as the step between the CO1..CO4 column pointers. */
  40. #define M $4
  41. #define N $5
  42. #define K $6
  43. #define A $9
  44. #define B $10
  45. #define C $11
  46. #define LDC $8
  /* Running pointers into A and B used by the inner loops. */
  47. #define AO $12
  48. #define BO $13
  /* Loop counters: I starts from M, J from N, L from K (or TEMP). */
  49. #define I $2
  50. #define J $3
  51. #define L $7
  /* Output pointers, one per column of C handled in the current pass. */
  /* $16 and $17 are saved and restored by the prologue and epilogue. */
  52. #define CO1 $14
  53. #define CO2 $15
  54. #define CO3 $16
  55. #define CO4 $17
  /* Extra integer registers used only in the TRMMKERNEL build. */
  56. #if defined(TRMMKERNEL)
  57. #define OFFSET $18
  58. #define KK $19
  59. #define TEMP $20
  60. #endif
  /* Floating-point registers holding values loaded from AO. */
  61. #define a1 $f0
  62. #define a2 $f1
  63. #define a3 $f28
  64. #define a4 $f29
  /* Floating-point registers holding values loaded from BO; the */
  /* write-back code also reuses them for values loaded from C. */
  65. #define b1 $f2
  66. #define b2 $f3
  67. #define b3 $f4
  68. #define b4 $f5
  69. #define b5 $f6
  70. #define b6 $f7
  71. #define b7 $f8
  72. #define b8 $f9
  /* a5 shares a register with b8; it is not referenced in the code shown here. */
  73. #define a5 b8
  /* Accumulators. Note that c32 jumps to $f17 because $f15 and $f16 are */
  /* taken by ALPHA_R and ALPHA_I below. */
  74. #define c11 $f10
  75. #define c12 $f11
  76. #define c21 $f12
  77. #define c22 $f13
  78. #define c31 $f14
  79. #define c32 $f17
  80. #define c41 $f18
  81. #define c42 $f19
  82. #define c51 $f20
  83. #define c52 $f21
  84. #define c61 $f22
  85. #define c62 $f23
  86. #define c71 $f24
  87. #define c72 $f25
  88. #define c81 $f26
  89. #define c82 $f27
  /* Scale factors applied in the write-back sections. */
  /* NOTE(review): presumably the real and imaginary parts of alpha - confirm. */
  90. #define ALPHA_R $f15
  91. #define ALPHA_I $f16
  /* Select the operation behind MADD1..MADD4 for each build variant. */
  /* MADD1 is MADD in every group; MADD2, MADD3 and MADD4 switch between */
  /* MADD and NMSUB. MADD and NMSUB themselves are defined outside this file. */
  /* NOTE(review): the flag names look like transpose/conjugate combinations */
  /* of A and B - confirm against the build system. */
  92. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  93. #define MADD1 MADD
  94. #define MADD2 MADD
  95. #define MADD3 MADD
  96. #define MADD4 NMSUB
  97. #endif
  98. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  99. #define MADD1 MADD
  100. #define MADD2 MADD
  101. #define MADD3 NMSUB
  102. #define MADD4 MADD
  103. #endif
  104. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  105. #define MADD1 MADD
  106. #define MADD2 NMSUB
  107. #define MADD3 MADD
  108. #define MADD4 MADD
  109. #endif
  110. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  111. #define MADD1 MADD
  112. #define MADD2 NMSUB
  113. #define MADD3 NMSUB
  114. #define MADD4 NMSUB
  115. #endif
  /* Function entry: load LDC from the caller stack, open a 128-byte frame, */
  /* save the registers restored at .L999, scale LDC, and enter the loop */
  /* over groups of four columns. */
  /* NOTE(review): PROLOGUE, LDARG and SDARG are macros defined outside this */
  /* file (presumably in common.h) - confirm their exact expansion. */
  116. PROLOGUE
  /* LDC is read at 0($sp) before the frame is allocated. */
  117. LDARG LDC, 0($sp)
  118. daddiu $sp, $sp, -128
  /* $16/$17 are CO3/CO4; $f24-$f29 are c71, c72, c81, c82, a3, a4. */
  119. SDARG $16, 0($sp)
  120. SDARG $17, 8($sp)
  121. sdc1 $f24, 16($sp)
  122. sdc1 $f25, 24($sp)
  123. sdc1 $f26, 32($sp)
  124. sdc1 $f27, 40($sp)
  125. sdc1 $f28, 48($sp)
  126. sdc1 $f29, 56($sp)
  127. #if defined(TRMMKERNEL)
  /* $18/$19/$20 are OFFSET/KK/TEMP. */
  128. SDARG $18, 64($sp)
  129. SDARG $19, 72($sp)
  130. SDARG $20, 80($sp)
  /* OFFSET sits 8 bytes above the stack pointer value seen on entry. */
  131. LDARG OFFSET, 128 + 8($sp)
  132. #endif
  133. #ifndef __64BIT__
  /* $f20-$f23 are c51, c52, c61, c62; saved only when __64BIT__ is not defined. */
  134. sdc1 $f20, 88($sp)
  135. sdc1 $f21, 96($sp)
  136. sdc1 $f22,104($sp)
  137. sdc1 $f23,112($sp)
  138. #endif
  /* NOTE(review): presumably converts LDC from elements to bytes; */
  /* ZBASE_SHIFT is defined outside this file - confirm. */
  139. dsll LDC, LDC, ZBASE_SHIFT
  140. #if defined(TRMMKERNEL) && !defined(LEFT)
  141. neg KK, OFFSET
  142. #endif
  /* J = N / 4; skip the four-column section when it is zero. */
  143. dsra J, N, 2
  144. blez J, .L20
  145. nop
  /* .L10: start of one pass over a group of four columns of C. */
  /* Sets CO1..CO4 to the four columns, rewinds AO to A, decrements J, */
  /* and begins clearing the accumulators. */
  146. .L10:
  147. move CO1, C
  /* NOTE(review): MTC is defined outside this file; presumably it copies */
  /* the integer zero register into c11 - confirm. */
  148. MTC $0, c11
  149. daddu CO2, C, LDC
  150. move AO, A
  151. daddu CO3, CO2, LDC
  152. daddiu J, J, -1
  153. daddu CO4, CO3, LDC
  154. MOV c21, c11
  155. MOV c31, c11
  156. #if defined(TRMMKERNEL) && defined(LEFT)
  157. move KK, OFFSET
  158. #endif
  159. MOV c41, c11
  160. MOV c51, c11
  161. move I, M
  /* C now points past these four columns, ready for the next pass. */
  162. daddu C, CO4, LDC
  /* Skip the row loop when M <= 0. The MOV after the branch is expected to */
  /* run in the branch delay slot (assumes noreorder mode - TODO confirm). */
  163. blez I, .L19
  164. MOV c61, c11
  /* .L11: per-row setup for the four-column case. Positions AO/BO, preloads */
  /* the first A and B values, clears the remaining accumulators, computes */
  /* the unrolled trip count L, and issues the first four MADDx operations. */
  165. .L11:
  166. #if defined(TRMMKERNEL)
  167. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  168. move BO, B
  169. #else
  /* Advance AO by KK << ZBASE_SHIFT and set BO = B + (KK << (2 + ZBASE_SHIFT)). */
  170. dsll L, KK, ZBASE_SHIFT
  171. dsll TEMP, KK, 2 + ZBASE_SHIFT
  172. daddu AO, AO, L
  173. daddu BO, B, TEMP
  174. #endif
  175. LD a1, 0 * SIZE(AO)
  176. MOV c71, c11
  177. LD b1, 0 * SIZE(BO)
  178. MOV c81, c11
  179. LD a3, 4 * SIZE(AO)
  180. MOV c12, c11
  181. LD b2, 1 * SIZE(BO)
  182. MOV c22, c11
  183. MOV c32, c11
  184. LD b3, 2 * SIZE(BO)
  185. MOV c42, c11
  186. LD b4, 3 * SIZE(BO)
  187. MOV c52, c11
  188. LD b5, 4 * SIZE(BO)
  189. MOV c62, c11
  190. LD b6, 8 * SIZE(BO)
  191. MOV c72, c11
  192. LD b7, 12 * SIZE(BO)
  193. MOV c82, c11
  /* TEMP is the inner trip count for this tile: K - KK, KK + 1, or KK + 4. */
  194. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  195. dsubu TEMP, K, KK
  196. #elif defined(LEFT)
  197. daddiu TEMP, KK, 1
  198. #else
  199. daddiu TEMP, KK, 4
  200. #endif
  /* L = TEMP / 4 unrolled passes; go to the remainder loop if there are none. */
  201. dsra L, TEMP, 2
  202. blez L, .L15
  203. NOP
  204. #else
  /* Non-TRMM build: B is read directly and L = K / 4. */
  205. LD a1, 0 * SIZE(AO)
  206. MOV c71, c11
  207. LD b1, 0 * SIZE(B)
  208. MOV c81, c11
  209. LD a3, 4 * SIZE(AO)
  210. MOV c12, c11
  211. LD b2, 1 * SIZE(B)
  212. MOV c22, c11
  213. dsra L, K, 2
  214. MOV c32, c11
  215. LD b3, 2 * SIZE(B)
  216. MOV c42, c11
  217. LD b4, 3 * SIZE(B)
  218. MOV c52, c11
  219. LD b5, 4 * SIZE(B)
  220. MOV c62, c11
  221. LD b6, 8 * SIZE(B)
  222. MOV c72, c11
  223. LD b7, 12 * SIZE(B)
  224. MOV c82, c11
  /* BO = B is set by the instruction after the branch; .L15 and .L16 rely on */
  /* BO, so this is expected to execute on both paths (delay slot - TODO confirm). */
  225. blez L, .L15
  226. move BO, B
  227. #endif
  /* First four operations of the first pass; .L12 and .L13 continue from here. */
  228. MADD1 c11, c11, a1, b1
  229. LD a2, 1 * SIZE(AO)
  230. MADD3 c21, c21, a1, b2
  231. daddiu L, L, -1
  232. MADD1 c31, c31, a1, b3
  233. NOP
  /* With only one pass left, go straight to the final-pass code at .L13. */
  234. blez L, .L13
  235. MADD3 c41, c41, a1, b4
  236. .align 3
  /* .L12: steady-state unrolled loop for the four-column case. */
  /* Each pass advances AO by 8 * SIZE and BO by 32 * SIZE, decrements L, and */
  /* ends by issuing the first four operations of the next pass before */
  /* branching back. Loads are interleaved with the MADDx operations and */
  /* overwrite b1..b7 and a1..a4 for later use, so instruction order matters. */
  237. .L12:
  238. MADD2 c12, c12, a2, b1
  239. LD b1, 16 * SIZE(BO)
  240. MADD4 c22, c22, a2, b2
  241. LD b2, 5 * SIZE(BO)
  242. MADD2 c32, c32, a2, b3
  243. LD b3, 6 * SIZE(BO)
  244. MADD4 c42, c42, a2, b4
  245. LD b4, 7 * SIZE(BO)
  246. MADD1 c51, c51, a1, b5
  247. NOP
  248. MADD3 c61, c61, a1, b2
  249. LD a4, 2 * SIZE(AO)
  250. MADD1 c71, c71, a1, b3
  251. NOP
  252. MADD3 c81, c81, a1, b4
  253. LD a1, 8 * SIZE(AO)
  254. MADD2 c52, c52, a2, b5
  255. LD b5, 20 * SIZE(BO)
  256. MADD4 c62, c62, a2, b2
  257. LD b2, 9 * SIZE(BO)
  258. MADD2 c72, c72, a2, b3
  259. LD b3, 10 * SIZE(BO)
  260. MADD4 c82, c82, a2, b4
  261. LD b4, 11 * SIZE(BO)
  /* Second group: A values in a4 and a2 (offsets 2 and 3 from AO). */
  262. MADD1 c11, c11, a4, b6
  263. LD a2, 3 * SIZE(AO)
  264. MADD3 c21, c21, a4, b2
  265. NOP
  266. MADD1 c31, c31, a4, b3
  267. NOP
  268. MADD3 c41, c41, a4, b4
  269. NOP
  270. MADD2 c12, c12, a2, b6
  271. LD b6, 24 * SIZE(BO)
  272. MADD4 c22, c22, a2, b2
  273. LD b2, 13 * SIZE(BO)
  274. MADD2 c32, c32, a2, b3
  275. LD b3, 14 * SIZE(BO)
  276. MADD4 c42, c42, a2, b4
  277. LD b4, 15 * SIZE(BO)
  278. MADD1 c51, c51, a4, b7
  279. NOP
  280. MADD3 c61, c61, a4, b2
  281. NOP
  282. MADD1 c71, c71, a4, b3
  283. NOP
  284. MADD3 c81, c81, a4, b4
  285. NOP
  286. MADD2 c52, c52, a2, b7
  287. LD b7, 28 * SIZE(BO)
  288. MADD4 c62, c62, a2, b2
  289. LD b2, 17 * SIZE(BO)
  290. MADD2 c72, c72, a2, b3
  291. LD b3, 18 * SIZE(BO)
  292. MADD4 c82, c82, a2, b4
  293. LD b4, 19 * SIZE(BO)
  /* Third group: A values in a3 and a2 (offsets 4 and 5 from AO). */
  294. MADD1 c11, c11, a3, b1
  295. LD a2, 5 * SIZE(AO)
  296. MADD3 c21, c21, a3, b2
  297. NOP
  298. MADD1 c31, c31, a3, b3
  299. NOP
  300. MADD3 c41, c41, a3, b4
  301. NOP
  302. MADD2 c12, c12, a2, b1
  303. LD b1, 32 * SIZE(BO)
  304. MADD4 c22, c22, a2, b2
  305. LD b2, 21 * SIZE(BO)
  306. MADD2 c32, c32, a2, b3
  307. LD b3, 22 * SIZE(BO)
  308. MADD4 c42, c42, a2, b4
  309. LD b4, 23 * SIZE(BO)
  310. MADD1 c51, c51, a3, b5
  311. NOP
  312. MADD3 c61, c61, a3, b2
  313. LD a4, 6 * SIZE(AO)
  314. MADD1 c71, c71, a3, b3
  315. NOP
  316. MADD3 c81, c81, a3, b4
  317. LD a3, 12 * SIZE(AO)
  318. MADD2 c52, c52, a2, b5
  319. LD b5, 36 * SIZE(BO)
  320. MADD4 c62, c62, a2, b2
  321. LD b2, 25 * SIZE(BO)
  322. MADD2 c72, c72, a2, b3
  323. LD b3, 26 * SIZE(BO)
  324. MADD4 c82, c82, a2, b4
  325. LD b4, 27 * SIZE(BO)
  /* Fourth group: A values in a4 and a2 (offsets 6 and 7 from AO). */
  326. MADD1 c11, c11, a4, b6
  327. LD a2, 7 * SIZE(AO)
  328. MADD3 c21, c21, a4, b2
  329. NOP
  330. MADD1 c31, c31, a4, b3
  331. NOP
  332. MADD3 c41, c41, a4, b4
  333. daddiu L, L, -1
  334. MADD2 c12, c12, a2, b6
  335. LD b6, 40 * SIZE(BO)
  336. MADD4 c22, c22, a2, b2
  337. LD b2, 29 * SIZE(BO)
  338. MADD2 c32, c32, a2, b3
  339. LD b3, 30 * SIZE(BO)
  340. MADD4 c42, c42, a2, b4
  341. LD b4, 31 * SIZE(BO)
  342. MADD1 c51, c51, a4, b7
  /* Pointers move here; the loads that follow use offsets from the new BO/AO. */
  343. daddiu BO, BO, 32 * SIZE
  344. MADD3 c61, c61, a4, b2
  345. daddiu AO, AO, 8 * SIZE
  346. MADD1 c71, c71, a4, b3
  347. NOP
  348. MADD3 c81, c81, a4, b4
  349. NOP
  350. MADD2 c52, c52, a2, b7
  351. LD b7, 12 * SIZE(BO)
  352. MADD4 c62, c62, a2, b2
  353. LD b2, 1 * SIZE(BO)
  354. MADD2 c72, c72, a2, b3
  355. LD b3, 2 * SIZE(BO)
  356. MADD4 c82, c82, a2, b4
  357. LD b4, 3 * SIZE(BO)
  /* Look-ahead: first four operations of the next pass (same as the end of .L11). */
  358. MADD1 c11, c11, a1, b1
  359. LD a2, 1 * SIZE(AO)
  360. MADD3 c21, c21, a1, b2
  361. NOP
  362. MADD1 c31, c31, a1, b3
  363. NOP
  364. bgtz L, .L12
  365. MADD3 c41, c41, a1, b4
  366. .align 3
  /* .L13: final unrolled pass for the four-column case. */
  /* Same operation sequence as .L12 but without the counter decrement, the */
  /* look-ahead operations, or the backward branch; it falls through to .L15 */
  /* with AO advanced by 8 * SIZE and BO by 32 * SIZE. */
  367. .L13:
  368. MADD2 c12, c12, a2, b1
  369. LD b1, 16 * SIZE(BO)
  370. MADD4 c22, c22, a2, b2
  371. LD b2, 5 * SIZE(BO)
  372. MADD2 c32, c32, a2, b3
  373. LD b3, 6 * SIZE(BO)
  374. MADD4 c42, c42, a2, b4
  375. LD b4, 7 * SIZE(BO)
  376. MADD1 c51, c51, a1, b5
  377. NOP
  378. MADD3 c61, c61, a1, b2
  379. LD a4, 2 * SIZE(AO)
  380. MADD1 c71, c71, a1, b3
  381. NOP
  382. MADD3 c81, c81, a1, b4
  383. LD a1, 8 * SIZE(AO)
  384. MADD2 c52, c52, a2, b5
  385. LD b5, 20 * SIZE(BO)
  386. MADD4 c62, c62, a2, b2
  387. LD b2, 9 * SIZE(BO)
  388. MADD2 c72, c72, a2, b3
  389. LD b3, 10 * SIZE(BO)
  390. MADD4 c82, c82, a2, b4
  391. LD b4, 11 * SIZE(BO)
  /* Second group: A values in a4 and a2. */
  392. MADD1 c11, c11, a4, b6
  393. LD a2, 3 * SIZE(AO)
  394. MADD3 c21, c21, a4, b2
  395. NOP
  396. MADD1 c31, c31, a4, b3
  397. NOP
  398. MADD3 c41, c41, a4, b4
  399. NOP
  400. MADD2 c12, c12, a2, b6
  401. LD b6, 24 * SIZE(BO)
  402. MADD4 c22, c22, a2, b2
  403. LD b2, 13 * SIZE(BO)
  404. MADD2 c32, c32, a2, b3
  405. LD b3, 14 * SIZE(BO)
  406. MADD4 c42, c42, a2, b4
  407. LD b4, 15 * SIZE(BO)
  408. MADD1 c51, c51, a4, b7
  409. NOP
  410. MADD3 c61, c61, a4, b2
  411. NOP
  412. MADD1 c71, c71, a4, b3
  413. NOP
  414. MADD3 c81, c81, a4, b4
  415. NOP
  416. MADD2 c52, c52, a2, b7
  417. LD b7, 28 * SIZE(BO)
  418. MADD4 c62, c62, a2, b2
  419. LD b2, 17 * SIZE(BO)
  420. MADD2 c72, c72, a2, b3
  421. LD b3, 18 * SIZE(BO)
  422. MADD4 c82, c82, a2, b4
  423. LD b4, 19 * SIZE(BO)
  /* Third group: A values in a3 and a2. */
  424. MADD1 c11, c11, a3, b1
  425. LD a2, 5 * SIZE(AO)
  426. MADD3 c21, c21, a3, b2
  427. NOP
  428. MADD1 c31, c31, a3, b3
  429. NOP
  430. MADD3 c41, c41, a3, b4
  431. NOP
  432. MADD2 c12, c12, a2, b1
  433. LD b1, 32 * SIZE(BO)
  434. MADD4 c22, c22, a2, b2
  435. LD b2, 21 * SIZE(BO)
  436. MADD2 c32, c32, a2, b3
  437. LD b3, 22 * SIZE(BO)
  438. MADD4 c42, c42, a2, b4
  439. LD b4, 23 * SIZE(BO)
  440. MADD1 c51, c51, a3, b5
  441. NOP
  442. MADD3 c61, c61, a3, b2
  443. LD a4, 6 * SIZE(AO)
  444. MADD1 c71, c71, a3, b3
  445. NOP
  446. MADD3 c81, c81, a3, b4
  447. LD a3, 12 * SIZE(AO)
  448. MADD2 c52, c52, a2, b5
  449. LD b5, 36 * SIZE(BO)
  450. MADD4 c62, c62, a2, b2
  451. LD b2, 25 * SIZE(BO)
  452. MADD2 c72, c72, a2, b3
  453. LD b3, 26 * SIZE(BO)
  454. MADD4 c82, c82, a2, b4
  455. LD b4, 27 * SIZE(BO)
  /* Fourth group: A values in a4 and a2. */
  456. MADD1 c11, c11, a4, b6
  457. LD a2, 7 * SIZE(AO)
  458. MADD3 c21, c21, a4, b2
  459. NOP
  460. MADD1 c31, c31, a4, b3
  461. NOP
  462. MADD3 c41, c41, a4, b4
  463. NOP
  464. MADD2 c12, c12, a2, b6
  465. LD b6, 40 * SIZE(BO)
  466. MADD4 c22, c22, a2, b2
  467. LD b2, 29 * SIZE(BO)
  468. MADD2 c32, c32, a2, b3
  469. LD b3, 30 * SIZE(BO)
  470. MADD4 c42, c42, a2, b4
  471. LD b4, 31 * SIZE(BO)
  472. MADD1 c51, c51, a4, b7
  /* Pointers move here; the loads that follow use offsets from the new BO/AO */
  /* and leave b1..b5, b7 and a1 primed for the remainder loop at .L16. */
  473. daddiu BO, BO, 32 * SIZE
  474. MADD3 c61, c61, a4, b2
  475. daddiu AO, AO, 8 * SIZE
  476. MADD1 c71, c71, a4, b3
  477. NOP
  478. MADD3 c81, c81, a4, b4
  479. NOP
  480. MADD2 c52, c52, a2, b7
  481. LD b7, 12 * SIZE(BO)
  482. MADD4 c62, c62, a2, b2
  483. LD b2, 1 * SIZE(BO)
  484. MADD2 c72, c72, a2, b3
  485. LD b3, 2 * SIZE(BO)
  486. MADD4 c82, c82, a2, b4
  487. LD b4, 3 * SIZE(BO)
  488. .align 3
  /* .L15: remainder of the inner loop for the four-column case. */
  /* L = (K or TEMP) & 3 leftover steps; skip to the write-back when zero. */
  489. .L15:
  490. #ifndef TRMMKERNEL
  491. andi L, K, 3
  492. #else
  493. andi L, TEMP, 3
  494. #endif
  495. NOP
  496. blez L, .L18
  497. NOP
  498. .align 3
  /* .L16: one step per pass; AO advances by 2 * SIZE and BO by 8 * SIZE. */
  /* a1 and b1..b5 are expected to be preloaded on entry and are reloaded */
  /* at the end of each pass for the next one. */
  499. .L16:
  500. MADD1 c11, c11, a1, b1
  501. LD a2, 1 * SIZE(AO)
  502. MADD3 c21, c21, a1, b2
  503. NOP
  504. MADD1 c31, c31, a1, b3
  505. NOP
  506. MADD3 c41, c41, a1, b4
  507. NOP
  508. MADD2 c12, c12, a2, b1
  509. LD b1, 8 * SIZE(BO)
  510. MADD4 c22, c22, a2, b2
  511. LD b2, 5 * SIZE(BO)
  512. MADD2 c32, c32, a2, b3
  513. LD b3, 6 * SIZE(BO)
  514. MADD4 c42, c42, a2, b4
  515. LD b4, 7 * SIZE(BO)
  516. MADD1 c51, c51, a1, b5
  517. daddiu L, L, -1
  518. MADD3 c61, c61, a1, b2
  519. daddiu AO, AO, 2 * SIZE
  520. MADD1 c71, c71, a1, b3
  521. daddiu BO, BO, 8 * SIZE
  522. MADD3 c81, c81, a1, b4
  /* From here on the offsets are relative to the advanced AO/BO. */
  523. LD a1, 0 * SIZE(AO)
  524. MADD2 c52, c52, a2, b5
  525. LD b5, 4 * SIZE(BO)
  526. MADD4 c62, c62, a2, b2
  527. LD b2, 1 * SIZE(BO)
  528. MADD2 c72, c72, a2, b3
  529. LD b3, 2 * SIZE(BO)
  530. MADD4 c82, c82, a2, b4
  531. bgtz L, .L16
  532. LD b4, 3 * SIZE(BO)
  /* .L18: write-back for the four-column case. */
  /* Folds the accumulator pairs together (c11 += c22, c12 += c21, ...), */
  /* scales by ALPHA_R / ALPHA_I, stores two values to each of CO1..CO4, */
  /* advances the CO pointers by 2 * SIZE, decrements I, and re-clears the */
  /* accumulators for the next row. */
  /* NOTE(review): MADD/NMSUB/MUL/ADD are macros defined outside this file; */
  /* presumably MADD d, a, b, c computes a + b * c and NMSUB a - b * c - confirm. */
  533. .L18:
  534. #ifndef TRMMKERNEL
  /* Non-TRMM build: the existing C values are loaded and accumulated into. */
  535. LD b1, 0 * SIZE(CO1)
  536. ADD c11, c11, c22
  537. LD b2, 1 * SIZE(CO1)
  538. ADD c12, c12, c21
  539. LD b3, 0 * SIZE(CO2)
  540. ADD c31, c31, c42
  541. LD b4, 1 * SIZE(CO2)
  542. ADD c32, c32, c41
  543. LD b5, 0 * SIZE(CO3)
  544. ADD c51, c51, c62
  545. LD b6, 1 * SIZE(CO3)
  546. ADD c52, c52, c61
  547. LD b7, 0 * SIZE(CO4)
  548. ADD c71, c71, c82
  549. LD b8, 1 * SIZE(CO4)
  550. ADD c72, c72, c81
  551. MADD b1, b1, ALPHA_R, c11
  552. daddiu CO1,CO1, 2 * SIZE
  553. MADD b2, b2, ALPHA_R, c12
  554. daddiu CO2,CO2, 2 * SIZE
  555. MADD b3, b3, ALPHA_R, c31
  556. daddiu CO3,CO3, 2 * SIZE
  557. MADD b4, b4, ALPHA_R, c32
  558. daddiu CO4,CO4, 2 * SIZE
  559. MADD b5, b5, ALPHA_R, c51
  560. daddiu I, I, -1
  561. MADD b6, b6, ALPHA_R, c52
  562. NOP
  563. MADD b7, b7, ALPHA_R, c71
  564. NOP
  565. MADD b8, b8, ALPHA_R, c72
  566. NOP
  567. NMSUB b1, b1, ALPHA_I, c12
  568. NOP
  569. MADD b2, b2, ALPHA_I, c11
  /* c11 has been consumed; clear it so the MOVs below reset the others. */
  570. MTC $0, c11
  571. NMSUB b3, b3, ALPHA_I, c32
  572. NOP
  573. MADD b4, b4, ALPHA_I, c31
  574. NOP
  /* Stores use negative offsets because CO1..CO4 were already advanced. */
  575. ST b1, -2 * SIZE(CO1)
  576. NMSUB b5, b5, ALPHA_I, c52
  577. ST b2, -1 * SIZE(CO1)
  578. MADD b6, b6, ALPHA_I, c51
  579. ST b3, -2 * SIZE(CO2)
  580. NMSUB b7, b7, ALPHA_I, c72
  581. ST b4, -1 * SIZE(CO2)
  582. MADD b8, b8, ALPHA_I, c71
  583. ST b5, -2 * SIZE(CO3)
  584. MOV c21, c11
  585. ST b6, -1 * SIZE(CO3)
  586. MOV c31, c11
  587. ST b7, -2 * SIZE(CO4)
  588. MOV c41, c11
  589. ST b8, -1 * SIZE(CO4)
  590. MOV c51, c11
  591. #else
  /* TRMM build: C is not read; the result starts from ALPHA_R * c via MUL. */
  592. ADD c11, c11, c22
  593. daddiu CO1,CO1, 2 * SIZE
  594. ADD c12, c12, c21
  595. daddiu CO2,CO2, 2 * SIZE
  596. ADD c31, c31, c42
  597. daddiu CO3,CO3, 2 * SIZE
  598. ADD c32, c32, c41
  599. daddiu CO4,CO4, 2 * SIZE
  600. ADD c51, c51, c62
  601. daddiu I, I, -1
  602. ADD c52, c52, c61
  603. ADD c71, c71, c82
  604. ADD c72, c72, c81
  605. MUL b1, ALPHA_R, c11
  606. MUL b2, ALPHA_R, c12
  607. MUL b3, ALPHA_R, c31
  608. MUL b4, ALPHA_R, c32
  609. MUL b5, ALPHA_R, c51
  610. MUL b6, ALPHA_R, c52
  611. MUL b7, ALPHA_R, c71
  612. MUL b8, ALPHA_R, c72
  613. NMSUB b1, b1, ALPHA_I, c12
  614. NOP
  615. MADD b2, b2, ALPHA_I, c11
  616. MTC $0, c11
  617. NMSUB b3, b3, ALPHA_I, c32
  618. NOP
  619. MADD b4, b4, ALPHA_I, c31
  620. NOP
  621. ST b1, -2 * SIZE(CO1)
  622. NMSUB b5, b5, ALPHA_I, c52
  623. ST b2, -1 * SIZE(CO1)
  624. MADD b6, b6, ALPHA_I, c51
  625. ST b3, -2 * SIZE(CO2)
  626. NMSUB b7, b7, ALPHA_I, c72
  627. ST b4, -1 * SIZE(CO2)
  628. MADD b8, b8, ALPHA_I, c71
  629. ST b5, -2 * SIZE(CO3)
  630. MOV c21, c11
  631. ST b6, -1 * SIZE(CO3)
  632. MOV c31, c11
  633. ST b7, -2 * SIZE(CO4)
  634. MOV c41, c11
  635. ST b8, -1 * SIZE(CO4)
  636. MOV c51, c11
  /* For these variants, advance AO and BO past the K - KK - (1 or 4) */
  /* steps that the inner loop did not consume. */
  637. #if ( defined(LEFT) && defined(TRANSA)) || \
  638. (!defined(LEFT) && !defined(TRANSA))
  639. dsubu TEMP, K, KK
  640. #ifdef LEFT
  641. daddiu TEMP, TEMP, -1
  642. #else
  643. daddiu TEMP, TEMP, -4
  644. #endif
  645. dsll L, TEMP, ZBASE_SHIFT
  646. dsll TEMP, TEMP, 2 + ZBASE_SHIFT
  647. daddu AO, AO, L
  648. daddu BO, BO, TEMP
  649. #endif
  650. #ifdef LEFT
  651. daddiu KK, KK, 1
  652. #endif
  653. #endif
  /* Next row while I > 0; c61 is cleared by the instruction after the branch. */
  654. bgtz I, .L11
  655. MOV c61, c11
  656. .align 3
  /* .L19: end of one four-column pass. Bumps KK by 4 in the TRMM/!LEFT build, */
  /* then loops back to .L10 while J > 0. B is set to BO by the instruction */
  /* after the branch. */
  657. .L19:
  658. #if defined(TRMMKERNEL) && !defined(LEFT)
  659. daddiu KK, KK, 4
  660. #endif
  661. bgtz J, .L10
  662. move B, BO
  663. .align 3
  /* .L20: two-column section, taken when bit 1 of N is set. */
  664. .L20:
  665. andi J, N, 2
  666. MTC $0, c11
  667. blez J, .L30
  668. move CO1, C
  /* CO1/CO2 address the two columns; C moves past them. */
  669. daddu CO2, C, LDC
  670. daddu C, CO2, LDC
  671. #if defined(TRMMKERNEL) && defined(LEFT)
  672. move KK, OFFSET
  673. #endif
  674. move I, M
  /* Skip the row loop when M <= 0; AO = A is set after the branch. */
  675. blez I, .L29
  676. move AO, A
  677. .align 3
  /* .L21: per-row setup for the two-column case. Positions AO/BO, preloads */
  /* a1, a3 and b1..b5, clears c21..c42 from c11, and computes the unrolled */
  /* trip count L. */
  678. .L21:
  679. #if defined(TRMMKERNEL)
  680. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  681. move BO, B
  682. #else
  /* Advance AO by KK << ZBASE_SHIFT and set BO = B + (KK << (1 + ZBASE_SHIFT)). */
  683. dsll L, KK, ZBASE_SHIFT
  684. dsll TEMP, KK, 1 + ZBASE_SHIFT
  685. daddu AO, AO, L
  686. daddu BO, B, TEMP
  687. #endif
  688. LD a1, 0 * SIZE(AO)
  689. MOV c21, c11
  690. LD b1, 0 * SIZE(BO)
  691. MOV c31, c11
  692. LD a3, 4 * SIZE(AO)
  693. MOV c41, c11
  694. LD b2, 1 * SIZE(BO)
  695. LD b3, 2 * SIZE(BO)
  696. MOV c12, c11
  697. LD b4, 3 * SIZE(BO)
  698. MOV c22, c11
  699. LD b5, 4 * SIZE(BO)
  700. MOV c32, c11
  /* TEMP is the inner trip count for this tile: K - KK, KK + 1, or KK + 2. */
  701. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  702. dsubu TEMP, K, KK
  703. #elif defined(LEFT)
  704. daddiu TEMP, KK, 1
  705. #else
  706. daddiu TEMP, KK, 2
  707. #endif
  708. dsra L, TEMP, 2
  709. blez L, .L25
  710. MOV c42, c11
  711. #else
  /* Non-TRMM build: B is read directly and L = K / 4. */
  712. LD a1, 0 * SIZE(AO)
  713. MOV c21, c11
  714. LD b1, 0 * SIZE(B)
  715. MOV c31, c11
  716. LD a3, 4 * SIZE(AO)
  717. MOV c41, c11
  718. LD b2, 1 * SIZE(B)
  719. dsra L, K, 2
  720. LD b3, 2 * SIZE(B)
  721. MOV c12, c11
  722. LD b4, 3 * SIZE(B)
  723. MOV c22, c11
  724. LD b5, 4 * SIZE(B)
  725. MOV c32, c11
  726. NOP
  727. MOV c42, c11
  /* BO = B is set by the instruction after the branch. */
  728. blez L, .L25
  729. move BO, B
  730. #endif
  731. .align 3
  /* .L22: unrolled inner loop for the two-column case. */
  /* Each pass advances AO by 8 * SIZE and BO by 16 * SIZE and decrements L. */
  /* Loads are interleaved with the MADDx operations and overwrite a1..a3 and */
  /* b1..b5 for later use, so instruction order matters. */
  732. .L22:
  733. MADD1 c11, c11, a1, b1
  734. LD a2, 1 * SIZE(AO)
  735. MADD3 c21, c21, a1, b2
  736. daddiu L, L, -1
  737. MADD1 c31, c31, a1, b3
  738. NOP
  739. MADD3 c41, c41, a1, b4
  740. LD a1, 2 * SIZE(AO)
  741. MADD2 c12, c12, a2, b1
  742. LD b1, 8 * SIZE(BO)
  743. MADD4 c22, c22, a2, b2
  744. LD b2, 5 * SIZE(BO)
  745. MADD2 c32, c32, a2, b3
  746. LD b3, 6 * SIZE(BO)
  747. MADD4 c42, c42, a2, b4
  748. LD b4, 7 * SIZE(BO)
  /* Second group: A values from offsets 2 and 3, B values from offsets 4..7. */
  749. MADD1 c11, c11, a1, b5
  750. LD a2, 3 * SIZE(AO)
  751. MADD3 c21, c21, a1, b2
  752. NOP
  753. MADD1 c31, c31, a1, b3
  754. NOP
  755. MADD3 c41, c41, a1, b4
  756. LD a1, 8 * SIZE(AO)
  757. MADD2 c12, c12, a2, b5
  758. LD b5, 12 * SIZE(BO)
  759. MADD4 c22, c22, a2, b2
  760. LD b2, 9 * SIZE(BO)
  761. MADD2 c32, c32, a2, b3
  762. LD b3, 10 * SIZE(BO)
  763. MADD4 c42, c42, a2, b4
  764. LD b4, 11 * SIZE(BO)
  /* Third group: A values from offsets 4 and 5, B values from offsets 8..11. */
  765. MADD1 c11, c11, a3, b1
  766. LD a2, 5 * SIZE(AO)
  767. MADD3 c21, c21, a3, b2
  768. NOP
  769. MADD1 c31, c31, a3, b3
  770. NOP
  771. MADD3 c41, c41, a3, b4
  772. LD a3, 6 * SIZE(AO)
  773. MADD2 c12, c12, a2, b1
  774. LD b1, 16 * SIZE(BO)
  775. MADD4 c22, c22, a2, b2
  776. LD b2, 13 * SIZE(BO)
  777. MADD2 c32, c32, a2, b3
  778. LD b3, 14 * SIZE(BO)
  779. MADD4 c42, c42, a2, b4
  780. LD b4, 15 * SIZE(BO)
  /* Fourth group: A values from offsets 6 and 7, B values from offsets 12..15. */
  781. MADD1 c11, c11, a3, b5
  782. LD a2, 7 * SIZE(AO)
  783. MADD3 c21, c21, a3, b2
  /* AO moves here; the a3 load below is relative to the advanced AO. */
  784. daddiu AO, AO, 8 * SIZE
  785. MADD1 c31, c31, a3, b3
  786. NOP
  787. MADD3 c41, c41, a3, b4
  788. LD a3, 4 * SIZE(AO)
  789. MADD2 c12, c12, a2, b5
  790. LD b5, 20 * SIZE(BO)
  791. MADD4 c22, c22, a2, b2
  792. LD b2, 17 * SIZE(BO)
  793. MADD2 c32, c32, a2, b3
  794. LD b3, 18 * SIZE(BO)
  795. MADD4 c42, c42, a2, b4
  796. LD b4, 19 * SIZE(BO)
  /* BO is advanced by the instruction after the branch. */
  797. bgtz L, .L22
  798. daddiu BO, BO, 16 * SIZE
  799. .align 3
  /* .L25: remainder of the inner loop for the two-column case. */
  /* L = (K or TEMP) & 3 leftover steps; skip to the write-back when zero. */
  800. .L25:
  801. #ifndef TRMMKERNEL
  802. andi L, K, 3
  803. #else
  804. andi L, TEMP, 3
  805. #endif
  806. NOP
  807. blez L, .L28
  808. NOP
  809. .align 3
  /* .L26: one step per pass; BO advances by 4 * SIZE and AO by 2 * SIZE. */
  810. .L26:
  811. MADD1 c11, c11, a1, b1
  812. LD a2, 1 * SIZE(AO)
  813. MADD3 c21, c21, a1, b2
  814. daddiu L, L, -1
  815. MADD1 c31, c31, a1, b3
  /* BO moves here, so the b1..b4 reloads below use offsets 0..3 from the new BO. */
  816. daddiu BO, BO, 4 * SIZE
  817. MADD3 c41, c41, a1, b4
  818. LD a1, 2 * SIZE(AO)
  819. MADD2 c12, c12, a2, b1
  820. LD b1, 0 * SIZE(BO)
  821. MADD4 c22, c22, a2, b2
  822. LD b2, 1 * SIZE(BO)
  823. MADD2 c32, c32, a2, b3
  824. LD b3, 2 * SIZE(BO)
  825. MADD4 c42, c42, a2, b4
  826. LD b4, 3 * SIZE(BO)
  /* AO is advanced by the instruction after the branch. */
  827. bgtz L, .L26
  828. daddiu AO, AO, 2 * SIZE
  /* .L28: write-back for the two-column case. */
  /* Folds the accumulator pairs together, scales by ALPHA_R / ALPHA_I, */
  /* stores two values to each of CO1 and CO2, advances both pointers by */
  /* 2 * SIZE, decrements I, and clears c11 for the next row. */
  /* NOTE(review): MADD/NMSUB/MUL/ADD are macros defined outside this file; */
  /* presumably MADD d, a, b, c computes a + b * c and NMSUB a - b * c - confirm. */
  829. .L28:
  830. #ifndef TRMMKERNEL
  /* Non-TRMM build: the existing C values are loaded and accumulated into. */
  831. LD b1, 0 * SIZE(CO1)
  832. ADD c11, c11, c22
  833. LD b2, 1 * SIZE(CO1)
  834. ADD c12, c12, c21
  835. LD b3, 0 * SIZE(CO2)
  836. ADD c31, c31, c42
  837. LD b4, 1 * SIZE(CO2)
  838. ADD c32, c32, c41
  839. MADD b1, b1, ALPHA_R, c11
  840. daddiu CO1,CO1, 2 * SIZE
  841. MADD b2, b2, ALPHA_R, c12
  842. daddiu CO2,CO2, 2 * SIZE
  843. MADD b3, b3, ALPHA_R, c31
  844. daddiu I, I, -1
  845. MADD b4, b4, ALPHA_R, c32
  846. NMSUB b1, b1, ALPHA_I, c12
  847. NOP
  848. MADD b2, b2, ALPHA_I, c11
  849. MTC $0, c11
  850. NMSUB b3, b3, ALPHA_I, c32
  851. NOP
  852. MADD b4, b4, ALPHA_I, c31
  853. NOP
  /* Stores use negative offsets because CO1/CO2 were already advanced. */
  854. ST b1, -2 * SIZE(CO1)
  855. ST b2, -1 * SIZE(CO1)
  856. ST b3, -2 * SIZE(CO2)
  857. #else
  /* TRMM build: C is not read; the result starts from ALPHA_R * c via MUL. */
  858. ADD c11, c11, c22
  859. ADD c12, c12, c21
  860. ADD c31, c31, c42
  861. ADD c32, c32, c41
  862. MUL b1, ALPHA_R, c11
  863. daddiu CO1,CO1, 2 * SIZE
  864. MUL b2, ALPHA_R, c12
  865. daddiu CO2,CO2, 2 * SIZE
  866. MUL b3, ALPHA_R, c31
  867. daddiu I, I, -1
  868. MUL b4, ALPHA_R, c32
  869. NMSUB b1, b1, ALPHA_I, c12
  870. NOP
  871. MADD b2, b2, ALPHA_I, c11
  872. MTC $0, c11
  873. NMSUB b3, b3, ALPHA_I, c32
  874. NOP
  875. MADD b4, b4, ALPHA_I, c31
  876. NOP
  877. ST b1, -2 * SIZE(CO1)
  878. ST b2, -1 * SIZE(CO1)
  879. ST b3, -2 * SIZE(CO2)
  /* For these variants, advance AO and BO past the K - KK - (1 or 2) */
  /* steps that the inner loop did not consume. */
  880. #if ( defined(LEFT) && defined(TRANSA)) || \
  881. (!defined(LEFT) && !defined(TRANSA))
  882. dsubu TEMP, K, KK
  883. #ifdef LEFT
  884. daddiu TEMP, TEMP, -1
  885. #else
  886. daddiu TEMP, TEMP, -2
  887. #endif
  888. dsll L, TEMP, ZBASE_SHIFT
  889. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  890. daddu AO, AO, L
  891. daddu BO, BO, TEMP
  892. #endif
  893. #ifdef LEFT
  894. daddiu KK, KK, 1
  895. #endif
  896. #endif
  /* Next row while I > 0. The fourth store (b4) is issued by the instruction */
  /* after the branch, so it is expected to run whether or not the branch is */
  /* taken (delay slot - TODO confirm). */
  897. bgtz I, .L21
  898. ST b4, -1 * SIZE(CO2)
  899. .align 3
  /* .L29: end of the two-column section; bump KK by 2 (TRMM/!LEFT) and */
  /* carry BO forward as the new B. */
  900. .L29:
  901. #if defined(TRMMKERNEL) && !defined(LEFT)
  902. daddiu KK, KK, 2
  903. #endif
  904. move B, BO
  905. .align 3
  /* .L30: single-column section, taken when bit 0 of N is set; otherwise */
  /* control goes straight to the epilogue at .L999. */
  906. .L30:
  907. andi J, N, 1
  908. MTC $0, c11
  /* CO1 = C is set by the instruction after the branch. */
  909. blez J, .L999
  910. move CO1, C
  911. #if defined(TRMMKERNEL) && defined(LEFT)
  912. move KK, OFFSET
  913. #endif
  914. move I, M
  915. daddu C, CO1, LDC
  /* Skip the row loop when M <= 0; AO = A is set after the branch. */
  916. blez I, .L39
  917. move AO, A
  918. .align 3
  /* .L31: per-row setup for the single-column case. Positions AO/BO, */
  /* preloads a1, a2, a3 and b1, b2, b3, clears c21..c42 from c11, and */
  /* computes the unrolled trip count L. */
  919. .L31:
  920. #if defined(TRMMKERNEL)
  921. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  922. move BO, B
  923. #else
  /* AO and BO are offset by the same amount, KK << ZBASE_SHIFT. */
  924. dsll TEMP, KK, ZBASE_SHIFT
  925. daddu AO, AO, TEMP
  926. daddu BO, B, TEMP
  927. #endif
  928. LD a1, 0 * SIZE(AO)
  929. MOV c21, c11
  930. LD b1, 0 * SIZE(BO)
  931. MOV c31, c11
  932. LD a2, 1 * SIZE(AO)
  933. MOV c41, c11
  934. LD b2, 1 * SIZE(BO)
  935. MOV c12, c11
  936. NOP
  937. MOV c22, c11
  938. LD a3, 4 * SIZE(AO)
  939. MOV c32, c11
  940. LD b3, 4 * SIZE(BO)
  /* TEMP is the inner trip count: K - KK, or KK + 1 (both remaining */
  /* branches add 1 here). */
  941. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  942. dsubu TEMP, K, KK
  943. #elif defined(LEFT)
  944. daddiu TEMP, KK, 1
  945. #else
  946. daddiu TEMP, KK, 1
  947. #endif
  948. dsra L, TEMP, 2
  949. blez L, .L35
  950. MOV c42, c11
  951. #else
  /* Non-TRMM build: B is read directly and L = K / 4. */
  952. LD a1, 0 * SIZE(AO)
  953. MOV c21, c11
  954. LD b1, 0 * SIZE(B)
  955. MOV c31, c11
  956. LD a2, 1 * SIZE(AO)
  957. MOV c41, c11
  958. LD b2, 1 * SIZE(B)
  959. MOV c12, c11
  960. dsra L, K, 2
  961. MOV c22, c11
  962. LD a3, 4 * SIZE(AO)
  963. MOV c32, c11
  964. LD b3, 4 * SIZE(B)
  965. NOP
  966. MOV c42, c11
  /* BO = B is set by the instruction after the branch. */
  967. blez L, .L35
  968. move BO, B
  969. #endif
  970. .align 3
  /* .L32: unrolled inner loop for the single-column case. */
  /* Each pass advances both AO and BO by 8 * SIZE and decrements L. */
  /* Only c11, c12, c21 and c22 are accumulated here. Loads are interleaved */
  /* with the MADDx operations and overwrite a1..a3 and b1..b4 for later */
  /* use, so instruction order matters. */
  971. .L32:
  972. MADD1 c11, c11, a1, b1
  973. LD b4, 3 * SIZE(BO)
  974. MADD3 c21, c21, a1, b2
  975. LD a1, 2 * SIZE(AO)
  976. MADD2 c12, c12, a2, b1
  977. LD b1, 2 * SIZE(BO)
  978. MADD4 c22, c22, a2, b2
  979. LD a2, 3 * SIZE(AO)
  /* Second group: values from offsets 2 and 3 of AO and BO. */
  980. MADD1 c11, c11, a1, b1
  981. LD b2, 5 * SIZE(BO)
  982. MADD3 c21, c21, a1, b4
  983. LD a1, 8 * SIZE(AO)
  984. MADD2 c12, c12, a2, b1
  985. LD b1, 8 * SIZE(BO)
  986. MADD4 c22, c22, a2, b4
  987. LD a2, 5 * SIZE(AO)
  /* Third group: values from offsets 4 and 5 of AO and BO. */
  988. MADD1 c11, c11, a3, b3
  989. LD b4, 7 * SIZE(BO)
  990. MADD3 c21, c21, a3, b2
  991. LD a3, 6 * SIZE(AO)
  992. MADD2 c12, c12, a2, b3
  993. LD b3, 6 * SIZE(BO)
  994. MADD4 c22, c22, a2, b2
  995. LD a2, 7 * SIZE(AO)
  /* Fourth group: values from offsets 6 and 7 of AO and BO. */
  996. MADD1 c11, c11, a3, b3
  997. LD b2, 9 * SIZE(BO)
  998. MADD3 c21, c21, a3, b4
  999. LD a3, 12 * SIZE(AO)
  1000. MADD2 c12, c12, a2, b3
  1001. LD b3, 12 * SIZE(BO)
  1002. MADD4 c22, c22, a2, b4
  1003. LD a2, 9 * SIZE(AO)
  1004. daddiu AO, AO, 8 * SIZE
  1005. daddiu L, L, -1
  /* BO is advanced by the instruction after the branch. */
  1006. bgtz L, .L32
  1007. daddiu BO, BO, 8 * SIZE
  1008. .align 3
  /* .L35: remainder of the inner loop for the single-column case. */
  /* L = (K or TEMP) & 3 leftover steps; skip to the write-back when zero. */
  1009. .L35:
  1010. #ifndef TRMMKERNEL
  1011. andi L, K, 3
  1012. #else
  1013. andi L, TEMP, 3
  1014. #endif
  1015. NOP
  1016. blez L, .L38
  1017. NOP
  1018. .align 3
  /* .L36: one step per pass; AO and BO each advance by 2 * SIZE. */
  /* a1, a2, b1, b2 are reloaded from offsets 2 and 3 before the pointers move. */
  1019. .L36:
  1020. MADD1 c11, c11, a1, b1
  1021. daddiu L, L, -1
  1022. MADD3 c21, c21, a1, b2
  1023. LD a1, 2 * SIZE(AO)
  1024. MADD2 c12, c12, a2, b1
  1025. LD b1, 2 * SIZE(BO)
  1026. MADD4 c22, c22, a2, b2
  1027. LD a2, 3 * SIZE(AO)
  1028. LD b2, 3 * SIZE(BO)
  1029. daddiu BO, BO, 2 * SIZE
  /* AO is advanced by the instruction after the branch. */
  1030. bgtz L, .L36
  1031. daddiu AO, AO, 2 * SIZE
  /* .L38: write-back for the single-column case. */
  /* Folds c22 into c11 and c21 into c12, scales by ALPHA_R / ALPHA_I, stores */
  /* two values at CO1, advances CO1 by 2 * SIZE, decrements I, and clears */
  /* c11 for the next row. Each build variant carries its own loop branch. */
  /* NOTE(review): MADD/NMSUB/MUL/ADD are macros defined outside this file; */
  /* presumably MADD d, a, b, c computes a + b * c and NMSUB a - b * c - confirm. */
  1032. .L38:
  1033. #ifndef TRMMKERNEL
  /* Non-TRMM build: the existing C values are loaded and accumulated into. */
  1034. LD b1, 0 * SIZE(CO1)
  1035. ADD c11, c11, c22
  1036. LD b2, 1 * SIZE(CO1)
  1037. ADD c12, c12, c21
  1038. MADD b1, b1, ALPHA_R, c11
  1039. daddiu CO1,CO1, 2 * SIZE
  1040. MADD b2, b2, ALPHA_R, c12
  1041. daddiu I, I, -1
  1042. NMSUB b1, b1, ALPHA_I, c12
  1043. NOP
  1044. MADD b2, b2, ALPHA_I, c11
  1045. MTC $0, c11
  1046. ST b1, -2 * SIZE(CO1)
  1047. NOP
  /* The second store is issued by the instruction after the branch. */
  1048. bgtz I, .L31
  1049. ST b2, -1 * SIZE(CO1)
  1050. #else
  /* TRMM build: C is not read; the result starts from ALPHA_R * c via MUL. */
  1051. ADD c11, c11, c22
  1052. ADD c12, c12, c21
  1053. MUL b1, ALPHA_R, c11
  1054. daddiu CO1,CO1, 2 * SIZE
  1055. MUL b2, ALPHA_R, c12
  1056. daddiu I, I, -1
  1057. NMSUB b1, b1, ALPHA_I, c12
  1058. NOP
  1059. MADD b2, b2, ALPHA_I, c11
  1060. MTC $0, c11
  /* For these variants, advance AO and BO past the K - KK - 1 steps that */
  /* the inner loop did not consume (both LEFT branches subtract 1). */
  1061. #if ( defined(LEFT) && defined(TRANSA)) || \
  1062. (!defined(LEFT) && !defined(TRANSA))
  1063. dsubu TEMP, K, KK
  1064. #ifdef LEFT
  1065. daddiu TEMP, TEMP, -1
  1066. #else
  1067. daddiu TEMP, TEMP, -1
  1068. #endif
  1069. dsll TEMP, TEMP, ZBASE_SHIFT
  1070. daddu AO, AO, TEMP
  1071. daddu BO, BO, TEMP
  1072. #endif
  1073. #ifdef LEFT
  1074. daddiu KK, KK, 1
  1075. #endif
  1076. ST b1, -2 * SIZE(CO1)
  1077. NOP
  /* The second store is issued by the instruction after the branch. */
  1078. bgtz I, .L31
  1079. ST b2, -1 * SIZE(CO1)
  1080. #endif
  1081. .align 3
  /* .L39: end of the single-column section; bump KK by 1 (TRMM/!LEFT) and */
  /* carry BO forward as the new B. */
  1082. .L39:
  1083. #if defined(TRMMKERNEL) && !defined(LEFT)
  1084. daddiu KK, KK, 1
  1085. #endif
  1086. move B, BO
  1087. .align 3
  /* .L999: function exit. Restores every register saved in the prologue */
  /* from the same frame offsets, then returns through $31. */
  1088. .L999:
  1089. LDARG $16, 0($sp)
  1090. LDARG $17, 8($sp)
  1091. ldc1 $f24, 16($sp)
  1092. ldc1 $f25, 24($sp)
  1093. ldc1 $f26, 32($sp)
  1094. ldc1 $f27, 40($sp)
  1095. ldc1 $f28, 48($sp)
  1096. ldc1 $f29, 56($sp)
  1097. #if defined(TRMMKERNEL)
  1098. LDARG $18, 64($sp)
  1099. LDARG $19, 72($sp)
  1100. LDARG $20, 80($sp)
  1101. #endif
  1102. #ifndef __64BIT__
  1103. ldc1 $f20, 88($sp)
  1104. ldc1 $f21, 96($sp)
  1105. ldc1 $f22,104($sp)
  1106. ldc1 $f23,112($sp)
  1107. #endif
  /* The 128-byte frame is released by the instruction after the jump */
  /* (expected to run in the jump delay slot - TODO confirm noreorder mode). */
  1108. j $31
  1109. daddiu $sp, $sp, 128
  /* NOTE(review): EPILOGUE is a macro defined outside this file - confirm. */
  1110. EPILOGUE