You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_2x2.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
/* Exactly one Alpha implementation (EV4 / EV5 / EV6) must be selected at
   build time; the prefetch distance and the pipeline-filler idiom below
   depend on it. */
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  41. #error "Architecture is not specified."
  42. #endif
  43. #ifdef EV6
/* PREFETCHSIZE is measured in SIZE-byte elements, not bytes -- see the
   "ldq/ldl $31, PREFETCHSIZE * SIZE(AO/BO)" prefetch loads in the main
   loop body. */
  44. #define PREFETCHSIZE 56
/* On EV6 an explicit "unop" is emitted to pad issue slots in the
   hand-scheduled loop. */
  45. #define UNOP unop
  46. #endif
  47. #ifdef EV5
  48. #define PREFETCHSIZE 48
/* On EV4/EV5, UNOP expands to nothing -- no slot padding is wanted. */
  49. #define UNOP
  50. #endif
  51. #ifdef EV4
  52. #define UNOP
  53. #endif
/* "noat" is required because AO below is mapped onto $at, the assembler
   temporary register; "noreorder" preserves the hand-tuned schedule. */
  54. .set noat
  55. .set noreorder
  56. .arch ev6
  57. .text
  58. .align 5
  59. .globl CNAME
  60. .ent CNAME
/* Stack frame: $f2-$f9 are saved at offsets 0..56, and alpha is spilled
   to ALPHA_R/ALPHA_I at offsets 64/72 (see the prologue), for 80 bytes. */
  61. #define STACKSIZE 80
/* Integer register roles.  M/N/K and A arrive in argument registers;
   B, C, LDC (and OFFSET for the TRMM variant) are reloaded from the
   caller's stack in the prologue. */
  62. #define M $16
  63. #define N $17
  64. #define K $18
  65. #define A $21
  66. #define B $22
  67. #define C $20
  68. #define LDC $23
/* C1/C2: pointers to the two output columns of the current 2-wide panel. */
  69. #define C1 $19
  70. #define C2 $24
/* AO/BO: running pointers into the A and B panels (AO is $at; hence the
   ".set noat" above). */
  71. #define AO $at
  72. #define BO $5
/* I: row-block counter; J: column-block counter; L: K-loop counter. */
  73. #define I $6
  74. #define J $7
  75. #define L $8
/* FP register roles: a1..a6 and b1..b5 hold streamed A/B elements,
   t1..t4 are the four multiply pipelines, and c01..c16 form the
   2x2-complex accumulator block. */
  76. #define a1 $f16
  77. #define a2 $f17
  78. #define a3 $f18
  79. #define a4 $f19
  80. #define b1 $f20
  81. #define b2 $f21
  82. #define b3 $f22
  83. #define b4 $f23
  84. #define t1 $f24
  85. #define t2 $f25
  86. #define t3 $f26
  87. #define t4 $f27
  88. #define a5 $f28
/* NOTE(review): a6/alpha_r share $f30 and b5/alpha_i share $f29.  The
   streaming names are dead by the time alpha is loaded in the write-back
   code ("ldt alpha_r, ALPHA_R" at the $L15/$L25/$L45 tails), so the
   aliasing is intentional. */
  89. #define a6 $f30
  90. #define b5 $f29
  91. #define alpha_i $f29
  92. #define alpha_r $f30
  93. #define c01 $f0
  94. #define c02 $f1
  95. #define c03 $f2
  96. #define c04 $f3
  97. #define c05 $f4
  98. #define c06 $f5
  99. #define c07 $f6
  100. #define c08 $f7
  101. #define c09 $f8
  102. #define c10 $f9
  103. #define c11 $f10
  104. #define c12 $f11
  105. #define c13 $f12
  106. #define c14 $f13
  107. #define c15 $f14
  108. #define c16 $f15
/* TMP1/TMP2: integer scratch; KK/OFFSET: TRMM diagonal bookkeeping;
   BB: prefetch pointer that runs ahead into the B panel. */
  109. #define TMP1 $0
  110. #define TMP2 $1
  111. #define KK $2
  112. #define BB $3
  113. #define OFFSET $4
/* Stack slots where alpha (passed in $f19/$f20, which double as a4/b1)
   is spilled by the prologue before those registers are reused. */
  114. #define ALPHA_R 64($sp)
  115. #define ALPHA_I 72($sp)
/* ADD1..ADD4 select the +/- pattern applied to the four partial products
   of each complex multiply-accumulate.  The four cases correspond to the
   conjugation combinations of the A and B operands -- presumably the
   standard OpenBLAS zgemm naming (NN..TT: neither conjugated; RN..CT:
   conj(A); NR..TC: conj(B); else both) -- TODO confirm against the
   kernel build lists. */
  116. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  117. #define ADD1 ADD
  118. #define ADD2 SUB
  119. #define ADD3 ADD
  120. #define ADD4 ADD
  121. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  122. #define ADD1 ADD
  123. #define ADD2 ADD
  124. #define ADD3 SUB
  125. #define ADD4 ADD
  126. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  127. #define ADD1 ADD
  128. #define ADD2 ADD
  129. #define ADD3 ADD
  130. #define ADD4 SUB
  131. #else
  132. #define ADD1 ADD
  133. #define ADD2 SUB
  134. #define ADD3 SUB
  135. #define ADD4 SUB
  136. #endif
  137. CNAME:
  138. .frame $sp, STACKSIZE, $26, 0
  139. #ifdef PROFILE
  140. ldgp $gp, 0($27)
  141. lda $at, _mcount
  142. jsr $at, ($at), _mcount
  143. #endif
  144. #ifndef PROFILE
  145. .prologue 0
  146. #else
  147. .prologue 1
  148. #endif
  149. lda $sp, -STACKSIZE($sp)
  150. ldq B, 0 + STACKSIZE($sp)
  151. ldq C, 8 + STACKSIZE($sp)
  152. ldq LDC, 16 + STACKSIZE($sp)
  153. #ifdef TRMMKERNEL
  154. ldq OFFSET, 24 + STACKSIZE($sp)
  155. #endif
  156. sll LDC, ZBASE_SHIFT, LDC
  157. stt $f2, 0($sp)
  158. stt $f3, 8($sp)
  159. stt $f4, 16($sp)
  160. stt $f5, 24($sp)
  161. stt $f6, 32($sp)
  162. stt $f7, 40($sp)
  163. stt $f8, 48($sp)
  164. stt $f9, 56($sp)
  165. stt $f19, ALPHA_R
  166. stt $f20, ALPHA_I
  167. cmple M, 0, $0
  168. cmple N, 0, $1
  169. cmple K, 0, $2
  170. or $0, $1, $0
  171. or $0, $2, $0
  172. bne $0, $L999
  173. #if defined(TRMMKERNEL) && !defined(LEFT)
  174. subq $31, OFFSET, KK
  175. #endif
  176. sra N, 1, J
  177. ble J, $L30
  178. .align 4
  179. $L01:
  180. mov C, C1
  181. addq C, LDC, C2
  182. mov A, AO
  183. s4addq K, 0, BB
  184. #if defined(TRMMKERNEL) && defined(LEFT)
  185. mov OFFSET, KK
  186. #endif
  187. SXADDQ BB, B, BB
  188. addq C2, LDC, C
  189. unop
  190. sra M, 1, I
  191. fclr t1
  192. fclr t2
  193. fclr t3
  194. fclr t4
  195. fclr c01
  196. fclr c05
  197. ble I, $L20
  198. .align 4
  199. $L11:
  200. #ifndef EV4
  201. ldl $31, 0 * SIZE(BB)
  202. ldl $31, 8 * SIZE(BB)
  203. unop
  204. lda BB, 16 * SIZE(BB)
  205. #endif
  206. #if !defined(TRMMKERNEL) || \
  207. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  208. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  209. #ifdef TRMMKERNEL
  210. #ifdef LEFT
  211. addq KK, 2, TMP1
  212. #else
  213. addq KK, 2, TMP1
  214. #endif
  215. #endif
  216. LD a1, 0 * SIZE(AO)
  217. fclr c09
  218. LD a2, 1 * SIZE(AO)
  219. fclr c13
  220. LD a3, 2 * SIZE(AO)
  221. fclr c02
  222. LD a4, 3 * SIZE(AO)
  223. fclr c06
  224. LD b1, 0 * SIZE(B)
  225. fclr c10
  226. LD b2, 1 * SIZE(B)
  227. fclr c14
  228. LD b3, 2 * SIZE(B)
  229. fclr c03
  230. LD b4, 3 * SIZE(B)
  231. fclr c07
  232. lda BO, 4 * SIZE(B)
  233. fclr c11
  234. lda AO, 4 * SIZE(AO)
  235. fclr c15
  236. lds $f31, 4 * SIZE(C1)
  237. fclr c04
  238. #ifndef TRMMKERNEL
  239. lda L, -2(K)
  240. #else
  241. lda L, -2(TMP1)
  242. #endif
  243. fclr c08
  244. lds $f31, 4 * SIZE(C2)
  245. fclr c12
  246. fclr c16
  247. ble L, $L15
  248. #else
  249. sll KK, ZBASE_SHIFT + 1, TMP1
  250. addq AO, TMP1, AO
  251. addq B, TMP1, BO
  252. subq K, KK, TMP1
  253. LD a1, 0 * SIZE(AO)
  254. fclr c09
  255. LD a2, 1 * SIZE(AO)
  256. fclr c13
  257. LD a3, 2 * SIZE(AO)
  258. fclr c02
  259. LD a4, 3 * SIZE(AO)
  260. fclr c06
  261. LD b1, 0 * SIZE(BO)
  262. fclr c10
  263. LD b2, 1 * SIZE(BO)
  264. fclr c14
  265. LD b3, 2 * SIZE(BO)
  266. fclr c03
  267. LD b4, 3 * SIZE(BO)
  268. fclr c07
  269. lda BO, 4 * SIZE(BO)
  270. fclr c11
  271. lda AO, 4 * SIZE(AO)
  272. fclr c15
  273. lds $f31, 4 * SIZE(C1)
  274. fclr c04
  275. lda L, -2(TMP1)
  276. fclr c08
  277. lds $f31, 4 * SIZE(C2)
  278. fclr c12
  279. fclr c16
  280. ble L, $L15
  281. #endif
  282. .align 5
  283. $L12:
  284. /* 1 */
  285. ADD1 c11, t1, c11
  286. #ifndef EV4
  287. ldq $31, PREFETCHSIZE * SIZE(AO)
  288. #else
  289. unop
  290. #endif
  291. MUL b1, a1, t1
  292. #ifndef EV4
  293. ldl $31, PREFETCHSIZE * SIZE(BO)
  294. #else
  295. unop
  296. #endif
  297. ADD3 c12, t2, c12
  298. unop
  299. MUL b1, a2, t2
  300. unop
  301. ADD2 c16, t3, c16
  302. unop
  303. MUL b2, a2, t3
  304. LD a5, 0 * SIZE(AO)
  305. ADD4 c15, t4, c15
  306. unop
  307. MUL b2, a1, t4
  308. LD b5, 0 * SIZE(BO)
  309. /* 2 */
  310. ADD1 c01, t1, c01
  311. UNOP
  312. MUL b1, a3, t1
  313. UNOP
  314. ADD3 c02, t2, c02
  315. UNOP
  316. MUL b1, a4, t2
  317. UNOP
  318. ADD2 c06, t3, c06
  319. unop
  320. MUL b2, a4, t3
  321. unop
  322. ADD4 c05, t4, c05
  323. unop
  324. MUL b4, a1, t4
  325. unop
  326. /* 3 */
  327. ADD1 c03, t1, c03
  328. unop
  329. MUL b3, a1, t1
  330. unop
  331. ADD3 c04, t2, c04
  332. unop
  333. MUL b3, a2, t2
  334. unop
  335. ADD2 c08, t3, c08
  336. unop
  337. MUL b4, a2, t3
  338. LD a2, 1 * SIZE(AO)
  339. ADD4 c13, t4, c13
  340. unop
  341. MUL b2, a3, t4
  342. LD b2, 1 * SIZE(BO)
  343. /* 4 */
  344. ADD1 c09, t1, c09
  345. unop
  346. MUL b3, a3, t1
  347. LD a6, 2 * SIZE(AO)
  348. ADD3 c10, t2, c10
  349. unop
  350. MUL b3, a4, t2
  351. LD b3, 2 * SIZE(BO)
  352. ADD2 c14, t3, c14
  353. unop
  354. MUL b4, a4, t3
  355. LD a4, 3 * SIZE(AO)
  356. ADD4 c07, t4, c07
  357. unop
  358. MUL b4, a3, t4
  359. LD b4, 3 * SIZE(BO)
  360. /* 5 */
  361. ADD1 c11, t1, c11
  362. unop
  363. MUL b5, a5, t1
  364. LD a1, 4 * SIZE(AO)
  365. ADD3 c12, t2, c12
  366. lda L, -2(L)
  367. MUL b5, a2, t2
  368. LD b1, 4 * SIZE(BO)
  369. ADD2 c16, t3, c16
  370. unop
  371. MUL b2, a2, t3
  372. unop
  373. ADD4 c15, t4, c15
  374. unop
  375. MUL b2, a5, t4
  376. unop
  377. /* 6 */
  378. ADD1 c01, t1, c01
  379. unop
  380. MUL b5, a6, t1
  381. unop
  382. ADD3 c02, t2, c02
  383. unop
  384. MUL b5, a4, t2
  385. unop
  386. ADD2 c06, t3, c06
  387. unop
  388. MUL b2, a4, t3
  389. unop
  390. ADD4 c05, t4, c05
  391. unop
  392. MUL b4, a5, t4
  393. unop
  394. /* 7 */
  395. ADD1 c03, t1, c03
  396. lda AO, 8 * SIZE(AO)
  397. MUL b3, a5, t1
  398. unop
  399. ADD3 c04, t2, c04
  400. lda BO, 8 * SIZE(BO)
  401. MUL b3, a2, t2
  402. unop
  403. ADD2 c08, t3, c08
  404. unop
  405. MUL b4, a2, t3
  406. LD a2, -3 * SIZE(AO)
  407. ADD4 c13, t4, c13
  408. unop
  409. MUL b2, a6, t4
  410. LD b2, -3 * SIZE(BO)
  411. /* 8 */
  412. ADD1 c09, t1, c09
  413. unop
  414. MUL b3, a6, t1
  415. LD a3, -2 * SIZE(AO)
  416. ADD3 c10, t2, c10
  417. unop
  418. MUL b3, a4, t2
  419. LD b3, -2 * SIZE(BO)
  420. ADD2 c14, t3, c14
  421. unop
  422. MUL b4, a4, t3
  423. LD a4, -1 * SIZE(AO)
  424. ADD4 c07, t4, c07
  425. MUL b4, a6, t4
  426. LD b4, -1 * SIZE(BO)
  427. bgt L, $L12
  428. .align 4
  429. $L15:
  430. ADD1 c11, t1, c11
  431. ldt alpha_r, ALPHA_R
  432. MUL b1, a1, t1
  433. #ifndef TRMMKERNEL
  434. blbs K, $L18
  435. #else
  436. blbs TMP1, $L18
  437. #endif
  438. .align 4
  439. ADD3 c12, t2, c12
  440. MUL b1, a2, t2
  441. ADD2 c16, t3, c16
  442. MUL b2, a2, t3
  443. ADD4 c15, t4, c15
  444. MUL b2, a1, t4
  445. ADD1 c01, t1, c01
  446. MUL b1, a3, t1
  447. ADD3 c02, t2, c02
  448. unop
  449. MUL b1, a4, t2
  450. LD b1, 0 * SIZE(BO)
  451. ADD2 c06, t3, c06
  452. MUL b2, a4, t3
  453. ADD4 c05, t4, c05
  454. MUL b4, a1, t4
  455. ADD1 c03, t1, c03
  456. unop
  457. MUL b3, a1, t1
  458. LD a1, 0 * SIZE(AO)
  459. ADD3 c04, t2, c04
  460. unop
  461. MUL b3, a2, t2
  462. unop
  463. ADD2 c08, t3, c08
  464. unop
  465. MUL b4, a2, t3
  466. LD a2, 1 * SIZE(AO)
  467. ADD4 c13, t4, c13
  468. unop
  469. MUL b2, a3, t4
  470. LD b2, 1 * SIZE(BO)
  471. ADD1 c09, t1, c09
  472. unop
  473. MUL b3, a3, t1
  474. lda AO, 4 * SIZE(AO)
  475. ADD3 c10, t2, c10
  476. unop
  477. MUL b3, a4, t2
  478. LD b3, 2 * SIZE(BO)
  479. ADD2 c14, t3, c14
  480. unop
  481. MUL b4, a4, t3
  482. LD a4, -1 * SIZE(AO)
  483. ADD4 c07, t4, c07
  484. unop
  485. MUL b4, a3, t4
  486. LD a3, -2 * SIZE(AO)
  487. ADD1 c11, t1, c11
  488. LD b4, 3 * SIZE(BO)
  489. MUL b1, a1, t1
  490. lda BO, 4 * SIZE(BO)
  491. .align 4
  492. $L18:
  493. ADD3 c12, t2, c12
  494. unop
  495. MUL b1, a2, t2
  496. ldt alpha_i, ALPHA_I
  497. ADD2 c16, t3, c16
  498. unop
  499. MUL b2, a2, t3
  500. #ifndef TRMMKERNEL
  501. LD a5, 0 * SIZE(C1)
  502. #else
  503. unop
  504. #endif
  505. ADD4 c15, t4, c15
  506. MUL b2, a1, t4
  507. ADD1 c01, t1, c01
  508. MUL b1, a3, t1
  509. ADD3 c02, t2, c02
  510. unop
  511. MUL b1, a4, t2
  512. #ifndef TRMMKERNEL
  513. LD b1, 1 * SIZE(C1)
  514. #else
  515. unop
  516. #endif
  517. ADD2 c06, t3, c06
  518. MUL b2, a4, t3
  519. ADD4 c05, t4, c05
  520. MUL b4, a1, t4
  521. ADD1 c03, t1, c03
  522. unop
  523. MUL b3, a1, t1
  524. #ifndef TRMMKERNEL
  525. LD a1, 2 * SIZE(C1)
  526. #else
  527. unop
  528. #endif
  529. ADD3 c04, t2, c04
  530. unop
  531. MUL b3, a2, t2
  532. unop
  533. ADD2 c08, t3, c08
  534. unop
  535. MUL b4, a2, t3
  536. #ifndef TRMMKERNEL
  537. LD a2, 3 * SIZE(C1)
  538. #else
  539. unop
  540. #endif
  541. ADD4 c13, t4, c13
  542. unop
  543. MUL b2, a3, t4
  544. #ifndef TRMMKERNEL
  545. LD b2, 0 * SIZE(C2)
  546. #else
  547. unop
  548. #endif
  549. ADD1 c09, t1, c09
  550. lda I, -1(I)
  551. MUL b3, a3, t1
  552. unop
  553. ADD3 c10, t2, c10
  554. unop
  555. MUL b3, a4, t2
  556. #ifndef TRMMKERNEL
  557. LD b3, 1 * SIZE(C2)
  558. #else
  559. unop
  560. #endif
  561. ADD2 c14, t3, c14
  562. unop
  563. MUL b4, a4, t3
  564. #ifndef TRMMKERNEL
  565. LD a4, 2 * SIZE(C2)
  566. #else
  567. unop
  568. #endif
  569. ADD4 c07, t4, c07
  570. unop
  571. MUL b4, a3, t4
  572. #ifndef TRMMKERNEL
  573. LD a3, 3 * SIZE(C2)
  574. #else
  575. unop
  576. #endif
  577. ADD1 c11, t1, c11
  578. ADD3 c12, t2, c12
  579. ADD2 c16, t3, c16
  580. ADD4 c15, t4, c15
  581. ADD c01, c06, c01
  582. ADD c02, c05, c02
  583. ADD c03, c08, c03
  584. ADD c04, c07, c04
  585. ADD c09, c14, c09
  586. MUL alpha_r, c01, t1
  587. ADD c10, c13, c10
  588. MUL alpha_r, c02, t2
  589. ADD c11, c16, c11
  590. MUL alpha_r, c03, t3
  591. ADD c12, c15, c12
  592. MUL alpha_r, c04, t4
  593. #ifndef TRMMKERNEL
  594. ADD a5, t1, a5
  595. MUL alpha_i, c02, t1
  596. ADD b1, t2, b1
  597. MUL alpha_i, c01, t2
  598. ADD a1, t3, a1
  599. MUL alpha_i, c04, t3
  600. ADD a2, t4, a2
  601. MUL alpha_i, c03, t4
  602. #else
  603. ADD $f31, t1, a5
  604. MUL alpha_i, c02, t1
  605. ADD $f31, t2, b1
  606. MUL alpha_i, c01, t2
  607. ADD $f31, t3, a1
  608. MUL alpha_i, c04, t3
  609. ADD $f31, t4, a2
  610. MUL alpha_i, c03, t4
  611. #endif
  612. SUB a5, t1, a5
  613. MUL alpha_r, c09, t1
  614. ADD b1, t2, b1
  615. MUL alpha_r, c10, t2
  616. SUB a1, t3, a1
  617. MUL alpha_r, c11, t3
  618. ADD a2, t4, a2
  619. MUL alpha_r, c12, t4
  620. #ifndef TRMMKERNEL
  621. ADD b2, t1, b2
  622. MUL alpha_i, c10, t1
  623. ADD b3, t2, b3
  624. MUL alpha_i, c09, t2
  625. ADD a4, t3, a4
  626. MUL alpha_i, c12, t3
  627. ADD a3, t4, a3
  628. MUL alpha_i, c11, t4
  629. #else
  630. ADD $f31, t1, b2
  631. MUL alpha_i, c10, t1
  632. ADD $f31, t2, b3
  633. MUL alpha_i, c09, t2
  634. ADD $f31, t3, a4
  635. MUL alpha_i, c12, t3
  636. ADD $f31, t4, a3
  637. MUL alpha_i, c11, t4
  638. #endif
  639. SUB b2, t1, b2
  640. ST a5, 0 * SIZE(C1)
  641. fclr t1
  642. unop
  643. ADD b3, t2, b3
  644. ST b1, 1 * SIZE(C1)
  645. fclr t2
  646. unop
  647. SUB a4, t3, a4
  648. ST a1, 2 * SIZE(C1)
  649. fclr t3
  650. unop
  651. ADD a3, t4, a3
  652. ST a2, 3 * SIZE(C1)
  653. fclr t4
  654. unop
  655. ST b2, 0 * SIZE(C2)
  656. fclr c01
  657. ST b3, 1 * SIZE(C2)
  658. fclr c05
  659. ST a4, 2 * SIZE(C2)
  660. lda C1, 4 * SIZE(C1)
  661. ST a3, 3 * SIZE(C2)
  662. lda C2, 4 * SIZE(C2)
  663. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  664. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  665. subq K, KK, TMP1
  666. #ifdef LEFT
  667. subq TMP1, 2, TMP1
  668. #else
  669. subq TMP1, 2, TMP1
  670. #endif
  671. sll TMP1, ZBASE_SHIFT + 1, TMP1
  672. addq AO, TMP1, AO
  673. addq BO, TMP1, BO
  674. #endif
  675. #if defined(TRMMKERNEL) && defined(LEFT)
  676. addq KK, 2, KK
  677. #endif
  678. bgt I, $L11
  679. .align 4
  680. $L20:
  681. and M, 1, I
  682. ble I, $L29
  683. #if !defined(TRMMKERNEL) || \
  684. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  685. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  686. #ifdef TRMMKERNEL
  687. #ifdef LEFT
  688. addq KK, 1, TMP1
  689. #else
  690. addq KK, 2, TMP1
  691. #endif
  692. #endif
  693. LD a1, 0 * SIZE(AO)
  694. fclr c09
  695. LD a2, 1 * SIZE(AO)
  696. fclr c13
  697. LD a3, 2 * SIZE(AO)
  698. fclr c02
  699. LD a4, 3 * SIZE(AO)
  700. fclr c06
  701. LD b1, 0 * SIZE(B)
  702. fclr c10
  703. LD b2, 1 * SIZE(B)
  704. fclr c14
  705. LD b3, 2 * SIZE(B)
  706. lda AO, 2 * SIZE(AO)
  707. LD b4, 3 * SIZE(B)
  708. lda BO, 4 * SIZE(B)
  709. #ifndef TRMMKERNEL
  710. lda L, -2(K)
  711. #else
  712. lda L, -2(TMP1)
  713. #endif
  714. ble L, $L25
  715. #else
  716. sll KK, ZBASE_SHIFT + 0, TMP1
  717. addq AO, TMP1, AO
  718. sll KK, ZBASE_SHIFT + 1, TMP1
  719. addq B, TMP1, BO
  720. subq K, KK, TMP1
  721. LD a1, 0 * SIZE(AO)
  722. fclr c09
  723. LD a2, 1 * SIZE(AO)
  724. fclr c13
  725. LD a3, 2 * SIZE(AO)
  726. fclr c02
  727. LD a4, 3 * SIZE(AO)
  728. fclr c06
  729. LD b1, 0 * SIZE(BO)
  730. fclr c10
  731. LD b2, 1 * SIZE(BO)
  732. fclr c14
  733. LD b3, 2 * SIZE(BO)
  734. lda AO, 2 * SIZE(AO)
  735. LD b4, 3 * SIZE(BO)
  736. lda BO, 4 * SIZE(BO)
  737. lda L, -2(TMP1)
  738. ble L, $L25
  739. #endif
  740. .align 5
  741. $L22:
  742. ADD1 c09, t1, c09
  743. unop
  744. MUL a1, b1, t1
  745. unop
  746. ADD3 c10, t2, c10
  747. unop
  748. MUL a2, b1, t2
  749. LD b1, 0 * SIZE(BO)
  750. ADD4 c13, t3, c13
  751. unop
  752. MUL a1, b2, t3
  753. lda BO, 8 * SIZE(BO)
  754. ADD2 c14, t4, c14
  755. unop
  756. MUL a2, b2, t4
  757. LD b2, -7 * SIZE(BO)
  758. ADD1 c01, t1, c01
  759. unop
  760. MUL a1, b3, t1
  761. unop
  762. ADD3 c02, t2, c02
  763. unop
  764. MUL a2, b3, t2
  765. LD b3, -6 * SIZE(BO)
  766. ADD4 c05, t3, c05
  767. unop
  768. MUL a1, b4, t3
  769. LD a1, 2 * SIZE(AO)
  770. ADD2 c06, t4, c06
  771. MUL a2, b4, t4
  772. LD b5, -5 * SIZE(BO)
  773. ADD1 c09, t1, c09
  774. unop
  775. MUL a3, b1, t1
  776. LD a2, 3 * SIZE(AO)
  777. ADD3 c10, t2, c10
  778. unop
  779. MUL a4, b1, t2
  780. LD b1, -4 * SIZE(BO)
  781. ADD4 c13, t3, c13
  782. unop
  783. MUL a3, b2, t3
  784. lda AO, 4 * SIZE(AO)
  785. ADD2 c14, t4, c14
  786. MUL a4, b2, t4
  787. LD b2, -3 * SIZE(BO)
  788. ADD1 c01, t1, c01
  789. lda L, -2(L)
  790. MUL a3, b3, t1
  791. LD b4, -1 * SIZE(BO)
  792. ADD3 c02, t2, c02
  793. unop
  794. MUL a4, b3, t2
  795. LD b3, -2 * SIZE(BO)
  796. ADD4 c05, t3, c05
  797. unop
  798. MUL a3, b5, t3
  799. LD a3, 0 * SIZE(AO)
  800. ADD2 c06, t4, c06
  801. MUL a4, b5, t4
  802. LD a4, 1 * SIZE(AO)
  803. bgt L, $L22
  804. .align 4
  805. $L25:
  806. ADD1 c09, t1, c09
  807. ldt alpha_r, ALPHA_R
  808. MUL a1, b1, t1
  809. #ifndef TRMMKERNEL
  810. blbs K, $L28
  811. #else
  812. blbs TMP1, $L28
  813. #endif
  814. .align 4
  815. ADD3 c10, t2, c10
  816. unop
  817. MUL a2, b1, t2
  818. LD b1, 0 * SIZE(BO)
  819. ADD4 c13, t3, c13
  820. unop
  821. MUL a1, b2, t3
  822. unop
  823. ADD2 c14, t4, c14
  824. unop
  825. MUL a2, b2, t4
  826. LD b2, 1 * SIZE(BO)
  827. ADD1 c01, t1, c01
  828. unop
  829. MUL a1, b3, t1
  830. lda AO, 2 * SIZE(AO)
  831. ADD3 c02, t2, c02
  832. unop
  833. MUL a2, b3, t2
  834. LD b3, 2 * SIZE(BO)
  835. ADD4 c05, t3, c05
  836. unop
  837. MUL a1, b4, t3
  838. LD a1, -2 * SIZE(AO)
  839. ADD2 c06, t4, c06
  840. unop
  841. MUL a2, b4, t4
  842. LD a2, -1 * SIZE(AO)
  843. ADD1 c09, t1, c09
  844. LD b4, 3 * SIZE(BO)
  845. MUL a1, b1, t1
  846. lda BO, 4 * SIZE(BO)
  847. .align 4
  848. $L28:
  849. ADD3 c10, t2, c10
  850. unop
  851. MUL a2, b1, t2
  852. ldt alpha_i, ALPHA_I
  853. ADD4 c13, t3, c13
  854. unop
  855. MUL a1, b2, t3
  856. #ifndef TRMMKERNEL
  857. LD c03, 0 * SIZE(C1)
  858. #else
  859. unop
  860. #endif
  861. ADD2 c14, t4, c14
  862. unop
  863. MUL a2, b2, t4
  864. #ifndef TRMMKERNEL
  865. LD c04, 1 * SIZE(C1)
  866. #else
  867. unop
  868. #endif
  869. ADD1 c01, t1, c01
  870. unop
  871. MUL a1, b3, t1
  872. #ifndef TRMMKERNEL
  873. LD c11, 0 * SIZE(C2)
  874. #else
  875. unop
  876. #endif
  877. ADD3 c02, t2, c02
  878. unop
  879. MUL a2, b3, t2
  880. #ifndef TRMMKERNEL
  881. LD c12, 1 * SIZE(C2)
  882. #else
  883. unop
  884. #endif
  885. ADD4 c05, t3, c05
  886. MUL a1, b4, t3
  887. ADD2 c06, t4, c06
  888. MUL a2, b4, t4
  889. ADD1 c09, t1, c09
  890. ADD3 c10, t2, c10
  891. ADD4 c13, t3, c13
  892. ADD2 c14, t4, c14
  893. ADD c01, c06, c01
  894. ADD c02, c05, c02
  895. ADD c09, c14, c09
  896. ADD c10, c13, c10
  897. MUL alpha_r, c01, t1
  898. MUL alpha_r, c02, t2
  899. MUL alpha_r, c09, t3
  900. MUL alpha_r, c10, t4
  901. #ifndef TRMMKERNEL
  902. ADD c03, t1, c03
  903. MUL alpha_i, c02, t1
  904. ADD c04, t2, c04
  905. MUL alpha_i, c01, t2
  906. ADD c11, t3, c11
  907. MUL alpha_i, c10, t3
  908. ADD c12, t4, c12
  909. MUL alpha_i, c09, t4
  910. #else
  911. ADD $f31, t1, c03
  912. MUL alpha_i, c02, t1
  913. ADD $f31, t2, c04
  914. MUL alpha_i, c01, t2
  915. ADD $f31, t3, c11
  916. MUL alpha_i, c10, t3
  917. ADD $f31, t4, c12
  918. MUL alpha_i, c09, t4
  919. #endif
  920. SUB c03, t1, c03
  921. ADD c04, t2, c04
  922. SUB c11, t3, c11
  923. ADD c12, t4, c12
  924. ST c03, 0 * SIZE(C1)
  925. ST c04, 1 * SIZE(C1)
  926. ST c11, 0 * SIZE(C2)
  927. ST c12, 1 * SIZE(C2)
  928. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  929. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  930. subq K, KK, TMP1
  931. #ifdef LEFT
  932. subq TMP1, 1, TMP1
  933. #else
  934. subq TMP1, 2, TMP1
  935. #endif
  936. sll TMP1, ZBASE_SHIFT + 0, TMP2
  937. addq AO, TMP2, AO
  938. sll TMP1, ZBASE_SHIFT + 1, TMP2
  939. addq BO, TMP2, BO
  940. #endif
  941. #if defined(TRMMKERNEL) && defined(LEFT)
  942. addq KK, 1, KK
  943. #endif
  944. .align 4
  945. $L29:
  946. mov BO, B
  947. lda J, -1(J)
  948. #if defined(TRMMKERNEL) && !defined(LEFT)
  949. addq KK, 2, KK
  950. #else
  951. unop
  952. #endif
  953. bgt J, $L01
  954. .align 4
  955. $L30:
  956. and N, 1, J
  957. ble J, $L999
  958. mov C, C1
  959. mov A, AO
  960. #if defined(TRMMKERNEL) && defined(LEFT)
  961. mov OFFSET, KK
  962. #endif
  963. sra M, 1, I
  964. ble I, $L50
  965. .align 4
  966. $L41:
  967. #if !defined(TRMMKERNEL) || \
  968. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  969. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  970. #ifdef TRMMKERNEL
  971. #ifdef LEFT
  972. addq KK, 2, TMP1
  973. #else
  974. addq KK, 1, TMP1
  975. #endif
  976. #endif
  977. LD a1, 0 * SIZE(AO)
  978. fclr t1
  979. LD a2, 1 * SIZE(AO)
  980. fclr t2
  981. LD a3, 2 * SIZE(AO)
  982. fclr t3
  983. LD a4, 3 * SIZE(AO)
  984. fclr t4
  985. LD b1, 0 * SIZE(B)
  986. fclr c01
  987. LD b2, 1 * SIZE(B)
  988. fclr c05
  989. LD b3, 2 * SIZE(B)
  990. fclr c02
  991. LD b4, 3 * SIZE(B)
  992. fclr c06
  993. lda BO, 2 * SIZE(B)
  994. fclr c03
  995. lda AO, 4 * SIZE(AO)
  996. fclr c07
  997. #ifndef TRMMKERNEL
  998. lda L, -2(K)
  999. #else
  1000. lda L, -2(TMP1)
  1001. #endif
  1002. fclr c04
  1003. fclr c08
  1004. ble L, $L45
  1005. #else
  1006. sll KK, ZBASE_SHIFT + 1, TMP1
  1007. addq AO, TMP1, AO
  1008. sll KK, ZBASE_SHIFT + 0, TMP1
  1009. addq B, TMP1, BO
  1010. subq K, KK, TMP1
  1011. LD a1, 0 * SIZE(AO)
  1012. fclr t1
  1013. LD a2, 1 * SIZE(AO)
  1014. fclr t2
  1015. LD a3, 2 * SIZE(AO)
  1016. fclr t3
  1017. LD a4, 3 * SIZE(AO)
  1018. fclr t4
  1019. LD b1, 0 * SIZE(BO)
  1020. fclr c01
  1021. LD b2, 1 * SIZE(BO)
  1022. fclr c05
  1023. LD b3, 2 * SIZE(BO)
  1024. fclr c02
  1025. LD b4, 3 * SIZE(BO)
  1026. fclr c06
  1027. lda BO, 2 * SIZE(BO)
  1028. fclr c03
  1029. lda AO, 4 * SIZE(AO)
  1030. fclr c07
  1031. lda L, -2(TMP1)
  1032. fclr c04
  1033. fclr c08
  1034. ble L, $L45
  1035. #endif
  1036. .align 5
  1037. $L42:
  1038. ADD4 c05, t1, c05
  1039. unop
  1040. MUL a1, b1, t1
  1041. unop
  1042. ADD2 c06, t2, c06
  1043. lda L, -2(L)
  1044. MUL a2, b1, t2
  1045. unop
  1046. ADD4 c07, t3, c07
  1047. unop
  1048. MUL a3, b1, t3
  1049. unop
  1050. ADD2 c08, t4, c08
  1051. unop
  1052. MUL a4, b1, t4
  1053. LD b1, 2 * SIZE(BO)
  1054. ADD1 c01, t1, c01
  1055. unop
  1056. MUL a1, b2, t1
  1057. LD a1, 0 * SIZE(AO)
  1058. ADD3 c02, t2, c02
  1059. lda BO, 4 * SIZE(BO)
  1060. MUL a2, b2, t2
  1061. LD a2, 1 * SIZE(AO)
  1062. ADD1 c03, t3, c03
  1063. unop
  1064. MUL a3, b2, t3
  1065. LD a3, 2 * SIZE(AO)
  1066. ADD3 c04, t4, c04
  1067. unop
  1068. MUL a4, b2, t4
  1069. LD a5, 3 * SIZE(AO)
  1070. ADD4 c05, t1, c05
  1071. unop
  1072. MUL a1, b3, t1
  1073. LD b2, -1 * SIZE(BO)
  1074. ADD2 c06, t2, c06
  1075. unop
  1076. MUL a2, b3, t2
  1077. unop
  1078. ADD4 c07, t3, c07
  1079. unop
  1080. MUL a3, b3, t3
  1081. lda AO, 8 * SIZE(AO)
  1082. ADD2 c08, t4, c08
  1083. unop
  1084. MUL a5, b3, t4
  1085. LD b3, 0 * SIZE(BO)
  1086. ADD1 c01, t1, c01
  1087. unop
  1088. MUL a1, b4, t1
  1089. LD a1, -4 * SIZE(AO)
  1090. ADD3 c02, t2, c02
  1091. unop
  1092. MUL a2, b4, t2
  1093. LD a2, -3 * SIZE(AO)
  1094. ADD1 c03, t3, c03
  1095. LD a4, -1 * SIZE(AO)
  1096. MUL a3, b4, t3
  1097. LD a3, -2 * SIZE(AO)
  1098. ADD3 c04, t4, c04
  1099. MUL a5, b4, t4
  1100. LD b4, 1 * SIZE(BO)
  1101. bgt L, $L42
  1102. .align 4
1103. $L45:
/* Pipeline drain.  One pending set of products (t1..t4) is still in
   flight when the loop exits.  blbs tests the low bit of the trip count:
   when K (or TMP1 for TRMM) is odd, branch straight to $L48; on the
   fall-through path one extra K iteration's worth of multiply/accumulate
   is issued first.  alpha_r is loaded here, early, to hide the ldt
   latency before it is needed in the epilogue. */
1104. ADD4 c05, t1, c05
1105. ldt alpha_r, ALPHA_R
1106. MUL b1, a1, t1
1107. #ifndef TRMMKERNEL
1108. blbs K, $L48
1109. #else
1110. blbs TMP1, $L48
1111. #endif
1112. .align 4
/* Extra (even-trip-count) drain step: finish the pending accumulation and
   issue one more full K iteration. */
1113. ADD2 c06, t2, c06
1114. MUL a2, b1, t2
1115. ADD4 c07, t3, c07
1116. MUL a3, b1, t3
1117. ADD2 c08, t4, c08
1118. unop
1119. MUL a4, b1, t4
1120. LD b1, 0 * SIZE(BO)
1121. ADD1 c01, t1, c01
1122. unop
1123. MUL a1, b2, t1
1124. LD a1, 0 * SIZE(AO)
1125. ADD3 c02, t2, c02
1126. unop
1127. MUL a2, b2, t2
1128. LD a2, 1 * SIZE(AO)
1129. ADD1 c03, t3, c03
1130. unop
1131. MUL a3, b2, t3
1132. LD a3, 2 * SIZE(AO)
1133. ADD3 c04, t4, c04
1134. MUL a4, b2, t4
1135. LD a4, 3 * SIZE(AO)
1136. lda AO, 4 * SIZE(AO)              /* consume one K iteration: 2 complex A elements */
1137. ADD4 c05, t1, c05
1138. LD b2, 1 * SIZE(BO)
1139. MUL a1, b1, t1
1140. lda BO, 2 * SIZE(BO)              /* ... and 1 complex B element */
1141. .align 4
1142. $L48:
/* Epilogue for the 2-row tile: retire the last in-flight products, fold
   the split accumulators, scale by complex alpha and write two complex
   results to C1.  Loads of the existing C values (c09..c12) are
   interleaved with the final arithmetic to hide their latency; for TRMM
   C is not read (result overwrites instead of accumulating). */
1143. ADD2 c06, t2, c06
1144. unop
1145. MUL a2, b1, t2
1146. ldt alpha_i, ALPHA_I
1147. ADD4 c07, t3, c07
1148. lda I, -1(I)                      /* one fewer 2-row tile remaining */
1149. MUL a3, b1, t3
1150. #ifndef TRMMKERNEL
1151. LD c09, 0 * SIZE(C1)
1152. #else
1153. unop
1154. #endif
1155. ADD2 c08, t4, c08
1156. unop
1157. MUL a4, b1, t4
1158. #ifndef TRMMKERNEL
1159. LD c10, 1 * SIZE(C1)
1160. #else
1161. unop
1162. #endif
1163. ADD1 c01, t1, c01
1164. unop
1165. MUL a1, b2, t1
1166. #ifndef TRMMKERNEL
1167. LD c11, 2 * SIZE(C1)
1168. #else
1169. unop
1170. #endif
1171. ADD3 c02, t2, c02
1172. unop
1173. MUL a2, b2, t2
1174. #ifndef TRMMKERNEL
1175. LD c12, 3 * SIZE(C1)
1176. #else
1177. unop
1178. #endif
1179. ADD1 c03, t3, c03
1180. MUL a3, b2, t3
1181. ADD3 c04, t4, c04
1182. MUL a4, b2, t4
1183. ADD4 c05, t1, c05
1184. ADD2 c06, t2, c06
1185. ADD4 c07, t3, c07
1186. ADD2 c08, t4, c08
/* Fold the two accumulator banks: c01/c02 and c03/c04 become the
   real/imag parts of the two complex dot products. */
1187. ADD c01, c06, c01
1188. ADD c02, c05, c02
1189. ADD c03, c08, c03
1190. ADD c04, c07, c04
/* Complex scale by alpha:
     re_out = [C +] alpha_r*re - alpha_i*im
     im_out = [C +] alpha_r*im + alpha_i*re  */
1191. MUL alpha_r, c01, t1
1192. MUL alpha_r, c02, t2
1193. MUL alpha_r, c03, t3
1194. MUL alpha_r, c04, t4
1195. #ifndef TRMMKERNEL
1196. ADD c09, t1, c09
1197. MUL alpha_i, c02, t1
1198. ADD c10, t2, c10
1199. MUL alpha_i, c01, t2
1200. ADD c11, t3, c11
1201. MUL alpha_i, c04, t3
1202. ADD c12, t4, c12
1203. MUL alpha_i, c03, t4
1204. #else
/* $f31 reads as +0.0 on Alpha, so these ADDs simply move t1..t4:
   TRMM overwrites C rather than accumulating into it. */
1205. ADD $f31, t1, c09
1206. MUL alpha_i, c02, t1
1207. ADD $f31, t2, c10
1208. MUL alpha_i, c01, t2
1209. ADD $f31, t3, c11
1210. MUL alpha_i, c04, t3
1211. ADD $f31, t4, c12
1212. MUL alpha_i, c03, t4
1213. #endif
1214. SUB c09, t1, c09
1215. ADD c10, t2, c10
1216. SUB c11, t3, c11
1217. ADD c12, t4, c12
1218. ST c09, 0 * SIZE(C1)
1219. ST c10, 1 * SIZE(C1)
1220. ST c11, 2 * SIZE(C1)
1221. ST c12, 3 * SIZE(C1)
1222. lda C1, 4 * SIZE(C1)              /* advance C by two complex elements */
/* TRMM bookkeeping: step AO/BO past the part of K that this tile skipped
   (2 rows of A, 1 column of B — hence the +1 / +0 shift amounts). */
1223. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1224. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1225. subq K, KK, TMP1
1226. #ifdef LEFT
1227. subq TMP1, 2, TMP1
1228. #else
1229. subq TMP1, 1, TMP1
1230. #endif
1231. sll TMP1, ZBASE_SHIFT + 1, TMP2
1232. addq AO, TMP2, AO
1233. sll TMP1, ZBASE_SHIFT + 0, TMP2
1234. addq BO, TMP2, BO
1235. #endif
1236. #if defined(TRMMKERNEL) && defined(LEFT)
1237. addq KK, 2, KK                    /* two more rows of the triangular factor done */
1238. #endif
1239. bgt I, $L41                       /* next 2-row tile */
1240. .align 4
1241. $L50:
/* Remainder tile: last single row of A (M odd) against the single
   remaining B column.  Two variants of the same setup: the first for
   plain GEMM / TRMM cases that read B from the start, the second for
   TRMM cases that must first offset AO/BO by KK elements.  Both preload
   two K iterations of operands, clear the accumulators and temporaries,
   and skip the unrolled loop when fewer than two K iterations remain. */
1242. and M, 1, I
1243. ble I, $L999                      /* M even: nothing left to do */
1244. #if !defined(TRMMKERNEL) || \
1245. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1246. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1247. #ifdef TRMMKERNEL
1248. #ifdef LEFT
1249. addq KK, 1, TMP1                  /* NOTE(review): both branches add 1; the
                                          #ifdef LEFT split looks redundant here —
                                          kept for symmetry with other tiles? */
1250. #else
1251. addq KK, 1, TMP1
1252. #endif
1253. #endif
1254. LD a1, 0 * SIZE(AO)
1255. fclr t1
1256. LD a2, 1 * SIZE(AO)
1257. fclr t2
1258. LD a3, 2 * SIZE(AO)
1259. fclr t3
1260. LD a4, 3 * SIZE(AO)
1261. fclr t4
1262. LD b1, 0 * SIZE(B)
1263. fclr c01
1264. LD b2, 1 * SIZE(B)
1265. fclr c05
1266. LD b3, 2 * SIZE(B)
1267. fclr c02
1268. LD b4, 3 * SIZE(B)
1269. fclr c06
1270. lda AO, 2 * SIZE(AO)
1271. lda BO, 2 * SIZE(B)
1272. #ifndef TRMMKERNEL
1273. lda L, -2(K)
1274. #else
1275. lda L, -2(TMP1)
1276. #endif
1277. ble L, $L55
1278. #else
/* TRMM variant: skip KK complex elements in both A and B first. */
1279. sll KK, ZBASE_SHIFT + 0, TMP1
1280. addq AO, TMP1, AO
1281. addq B, TMP1, BO
1282. subq K, KK, TMP1                  /* remaining K iterations for this tile */
1283. LD a1, 0 * SIZE(AO)
1284. fclr t1
1285. LD a2, 1 * SIZE(AO)
1286. fclr t2
1287. LD a3, 2 * SIZE(AO)
1288. fclr t3
1289. LD a4, 3 * SIZE(AO)
1290. fclr t4
1291. LD b1, 0 * SIZE(BO)
1292. fclr c01
1293. LD b2, 1 * SIZE(BO)
1294. fclr c05
1295. LD b3, 2 * SIZE(BO)
1296. fclr c02
1297. LD b4, 3 * SIZE(BO)
1298. fclr c06
1299. lda AO, 2 * SIZE(AO)
1300. lda BO, 2 * SIZE(BO)
1301. lda L, -2(TMP1)
1302. ble L, $L55
1303. #endif
1304. .align 5
1305. $L52:
/* 1x1 software-pipelined K loop, unrolled by 2: one complex A element
   against one complex B element per K iteration (AO and BO each advance
   4*SIZE per pass).  As in $L42, the ADDs retire the previous
   half-iteration while the MULs and loads feed the next. */
1306. ADD1 c01, t1, c01
1307. unop
1308. MUL a1, b1, t1
1309. unop
1310. ADD3 c02, t2, c02
1311. lda AO, 4 * SIZE(AO)              /* 2 complex A elements per pass */
1312. MUL a2, b1, t2
1313. LD b1, 2 * SIZE(BO)
1314. ADD4 c05, t3, c05
1315. lda L, -2(L)                      /* two K iterations per pass */
1316. MUL a1, b2, t3
1317. LD a1, -2 * SIZE(AO)
1318. ADD2 c06, t4, c06
1319. unop
1320. MUL a2, b2, t4
1321. LD a2, -1 * SIZE(AO)
1322. ADD1 c01, t1, c01
1323. LD b2, 3 * SIZE(BO)
1324. MUL a3, b3, t1
1325. lda BO, 4 * SIZE(BO)              /* 2 complex B elements per pass */
1326. ADD3 c02, t2, c02
1327. unop
1328. MUL a4, b3, t2
1329. LD b3, 0 * SIZE(BO)
1330. ADD4 c05, t3, c05
1331. unop
1332. MUL a3, b4, t3
1333. LD a3, 0 * SIZE(AO)
1334. ADD2 c06, t4, c06
1335. MUL a4, b4, t4
1336. LD b4, 1 * SIZE(BO)
1337. unop
1338. LD a4, 1 * SIZE(AO)
1339. unop
1340. unop
1341. bgt L, $L52
1342. .align 4
1343. $L55:
/* Drain for the 1x1 loop: retire the pending products; if the trip count
   is odd (low bit set) jump straight to $L58, otherwise execute one more
   K iteration on the fall-through path.  alpha_r is loaded early to hide
   its latency before the epilogue uses it. */
1344. ADD1 c01, t1, c01
1345. ldt alpha_r, ALPHA_R
1346. MUL a1, b1, t1
1347. #ifndef TRMMKERNEL
1348. blbs K, $L58
1349. #else
1350. blbs TMP1, $L58
1351. #endif
1352. .align 4
/* Extra (even-trip-count) drain step: one more complex multiply-accumulate. */
1353. ADD3 c02, t2, c02
1354. unop
1355. MUL a2, b1, t2
1356. LD b1, 0 * SIZE(BO)
1357. ADD4 c05, t3, c05
1358. lda BO, 2 * SIZE(BO)              /* consume 1 complex B element */
1359. MUL a1, b2, t3
1360. LD a1, 0 * SIZE(AO)
1361. ADD2 c06, t4, c06
1362. unop
1363. MUL a2, b2, t4
1364. LD a2, 1 * SIZE(AO)
1365. ADD1 c01, t1, c01
1366. LD b2, -1 * SIZE(BO)
1367. MUL a1, b1, t1
1368. lda AO, 2 * SIZE(AO)              /* ... and 1 complex A element */
1369. .align 4
1370. $L58:
/* Epilogue for the 1x1 remainder: retire the last products, fold the
   accumulators into one complex value (c01 = re, c02 = im), scale by
   complex alpha and store to C1.  Existing C values (c03/c04) are loaded
   only in the non-TRMM path, interleaved with the arithmetic. */
1371. ADD3 c02, t2, c02
1372. unop
1373. MUL a2, b1, t2
1374. ldt alpha_i, ALPHA_I
1375. ADD4 c05, t3, c05
1376. unop
1377. MUL a1, b2, t3
1378. #ifndef TRMMKERNEL
1379. LD c03, 0 * SIZE(C1)
1380. #else
1381. unop
1382. #endif
1383. ADD2 c06, t4, c06
1384. unop
1385. MUL a2, b2, t4
1386. #ifndef TRMMKERNEL
1387. LD c04, 1 * SIZE(C1)
1388. #else
1389. unop
1390. #endif
1391. ADD1 c01, t1, c01
1392. ADD3 c02, t2, c02
1393. ADD4 c05, t3, c05
1394. ADD2 c06, t4, c06
/* Fold accumulator banks into the final real/imag parts. */
1395. ADD c01, c06, c01
1396. ADD c02, c05, c02
/* re_out = [C +] alpha_r*re - alpha_i*im ; im_out = [C +] alpha_r*im + alpha_i*re */
1397. MUL alpha_r, c01, t1
1398. MUL alpha_r, c02, t2
1399. MUL alpha_i, c02, t3
1400. MUL alpha_i, c01, t4
1401. #ifndef TRMMKERNEL
1402. ADD c03, t1, c03
1403. ADD c04, t2, c04
1404. #else
/* $f31 reads as +0.0 on Alpha: TRMM overwrites C rather than accumulating. */
1405. ADD $f31, t1, c03
1406. ADD $f31, t2, c04
1407. #endif
1408. SUB c03, t3, c03
1409. ADD c04, t4, c04
1410. ST c03, 0 * SIZE(C1)
1411. ST c04, 1 * SIZE(C1)
1412. .align 4
1413. $L999:
/* Function epilogue: restore the callee-saved floating registers
   $f2..$f9 spilled by the prologue (outside this chunk), set the integer
   return value to 0, pop the stack frame and return. */
1414. ldt $f2, 0($sp)
1415. ldt $f3, 8($sp)
1416. ldt $f4, 16($sp)
1417. ldt $f5, 24($sp)
1418. ldt $f6, 32($sp)
1419. ldt $f7, 40($sp)
1420. ldt $f8, 48($sp)
1421. ldt $f9, 56($sp)
1422. clr $0                            /* return 0 in $0 */
1423. lda $sp, STACKSIZE($sp)           /* deallocate frame */
1424. ret
1425. .ident VERSION
1426. .end CNAME