You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_g4.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD selects an integer load mnemonic by word size: lwz when          */
  /* __64BIT__ is not defined, ld when it is.                              */
  /* NOTE(review): LOAD is not referenced in the portion of the file       */
  /* visible here (the prologue spells out ld/lwz directly) - confirm it   */
  /* is still needed.                                                      */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout, chosen by word size.                              */
  /*   STACKSIZE - amount the prologue subtracts from SP on entry.         */
  /*   ALPHA_R   - SP-relative slot that receives f1 in the prologue and   */
  /*               is reloaded into f30 before results are scaled.         */
  /*   ALPHA_I   - same for f2, reloaded into f31.                         */
  /*   FZERO     - slot that receives a zero word (stw of r0 after         */
  /*               li r0, 0) and is read back with lfs to clear the        */
  /*               accumulator registers.                                  */
  /* The prologue stores f14-f31 at 0..136(SP) and the integer registers   */
  /* from 144(SP) upward; the three slots below sit above that save area   */
  /* in both layouts.                                                      */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Register names for the kernel arguments.                              */
  /* M, N and K are always r3, r4 and r5; the prologue exits early when    */
  /* any of them is <= 0, and the loop counts are derived from them.       */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  /* A, B, C, LDC and OFFSET depend on the platform and word size.         */
  /* Some of these names are not incoming argument registers but the       */
  /* destination of a load from the caller's frame in the prologue:        */
  /*   - 64-bit (either platform group): LDC, and OFFSET when TRMMKERNEL   */
  /*     is defined, are loaded via FRAMESLOT.                             */
  /*   - 32-bit AIX/Apple with DOUBLE: B, C, LDC, and OFFSET under         */
  /*     TRMMKERNEL, are loaded via FRAMESLOT.                             */
  /*   - 32-bit AIX/Apple without DOUBLE: LDC, and OFFSET under            */
  /*     TRMMKERNEL, are loaded via FRAMESLOT.                             */
  /*   - 32-bit linux/FreeBSD: the prologue performs no such loads.        */
  59. #if defined(linux) || defined(__FreeBSD__)
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  /* AIX / Apple mapping. If none of linux, __FreeBSD__, _AIX or           */
  /* __APPLE__ is defined, A/B/C/LDC/OFFSET are left undefined here.       */
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Integer working registers r22-r31. The prologue saves r24-r31         */
  /* unconditionally and r22/r23 only when TRMMKERNEL is defined; in the   */
  /* visible code TEMP and KK are used only inside TRMMKERNEL-guarded      */
  /* sections.                                                             */
  /*   TEMP - scratch for trip counts and byte offsets (TRMM paths).       */
  /*   KK   - running offset initialised from OFFSET (TRMM paths).         */
  /*   I    - counter derived from M (M >> 1, then M & 1).                 */
  /*   J    - counter derived from N (N >> 1, then N & 1).                 */
  /*   AO   - running pointer into A (reset with mr AO, A).                */
  /*   BO   - running pointer into B (set from B, or B plus an offset).    */
  /*   CO1  - output pointer, initialised to C.                            */
  /*   CO2  - output pointer, initialised to C + LDC.                      */
  /*   PREA - index operand for dcbt on AO/BO; set to 8 * 8 * SIZE.        */
  /*   PREC - index operand for dcbtst on CO1/CO2; set to 3 * SIZE.        */
  89. #define TEMP r22
  90. #define KK r23
  91. #define I r24
  92. #define J r25
  93. #define AO r26
  94. #define BO r27
  95. #define CO1 r28
  96. #define CO2 r29
  97. #define PREA r30
  98. #define PREC r31
  /* Floating-point operand aliases over f16-f31 (the prologue stores      */
  /* f14-f31 to the frame before they are used).                           */
  /* In the .L11/.L12/.L16 path A1-A5 are loaded through AO and B1-B8      */
  /* through BO; f0-f15 serve as the accumulators there.                   */
  /* Other paths in this file address the same registers by their raw      */
  /* f16-f27 names, and f30/f31 (aliased here as B9/B10) are loaded with   */
  /* ALPHA_R/ALPHA_I before the results are scaled.                        */
  /* NOTE(review): A6, B9 and B10 are not referenced by these names in     */
  /* the visible portion of the file - confirm before removing.            */
  99. #define A1 f16
  100. #define A2 f17
  101. #define A3 f18
  102. #define A4 f19
  103. #define A5 f20
  104. #define A6 f21
  105. #define B1 f22
  106. #define B2 f23
  107. #define B3 f24
  108. #define B4 f25
  109. #define B5 f26
  110. #define B6 f27
  111. #define B7 f28
  112. #define B8 f29
  113. #define B9 f30
  114. #define B10 f31
  115. #ifndef NEEDPARAM
  116. PROLOGUE
  117. PROFCODE
  118. addi SP, SP, -STACKSIZE
  119. li r0, 0
  120. stfd f14, 0(SP)
  121. stfd f15, 8(SP)
  122. stfd f16, 16(SP)
  123. stfd f17, 24(SP)
  124. stfd f18, 32(SP)
  125. stfd f19, 40(SP)
  126. stfd f20, 48(SP)
  127. stfd f21, 56(SP)
  128. stfd f22, 64(SP)
  129. stfd f23, 72(SP)
  130. stfd f24, 80(SP)
  131. stfd f25, 88(SP)
  132. stfd f26, 96(SP)
  133. stfd f27, 104(SP)
  134. stfd f28, 112(SP)
  135. stfd f29, 120(SP)
  136. stfd f30, 128(SP)
  137. stfd f31, 136(SP)
  138. #ifdef __64BIT__
  139. std r31, 144(SP)
  140. std r30, 152(SP)
  141. std r29, 160(SP)
  142. std r28, 168(SP)
  143. std r27, 176(SP)
  144. std r26, 184(SP)
  145. std r25, 192(SP)
  146. std r24, 200(SP)
  147. #ifdef TRMMKERNEL
  148. std r23, 208(SP)
  149. std r22, 216(SP)
  150. #endif
  151. #else
  152. stw r31, 144(SP)
  153. stw r30, 148(SP)
  154. stw r29, 152(SP)
  155. stw r28, 156(SP)
  156. stw r27, 160(SP)
  157. stw r26, 164(SP)
  158. stw r25, 168(SP)
  159. stw r24, 172(SP)
  160. #ifdef TRMMKERNEL
  161. stw r23, 176(SP)
  162. stw r22, 180(SP)
  163. #endif
  164. #endif
  165. stfd f1, ALPHA_R
  166. stfd f2, ALPHA_I
  167. stw r0, FZERO
  168. #if defined(linux) || defined(__FreeBSD__)
  169. #ifdef __64BIT__
  170. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  171. #endif
  172. #endif
  173. #if defined(_AIX) || defined(__APPLE__)
  174. #ifdef __64BIT__
  175. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  176. #else
  177. #ifdef DOUBLE
  178. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  179. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  180. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  181. #else
  182. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  183. #endif
  184. #endif
  185. #endif
  186. #ifdef TRMMKERNEL
  187. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  188. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  189. #endif
  190. #if defined(_AIX) || defined(__APPLE__)
  191. #ifdef __64BIT__
  192. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  193. #else
  194. #ifdef DOUBLE
  195. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  196. #else
  197. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  198. #endif
  199. #endif
  200. #endif
  201. #if defined(TRMMKERNEL) && !defined(LEFT)
  202. neg KK, OFFSET
  203. #endif
  204. #endif
  205. slwi LDC, LDC, ZBASE_SHIFT
  206. li PREA, 8 * 8 * SIZE
  207. li PREC, 3 * SIZE
  208. cmpwi cr0, M, 0
  209. ble .L999
  210. cmpwi cr0, N, 0
  211. ble .L999
  212. cmpwi cr0, K, 0
  213. ble .L999
  214. lfs f0, FZERO
  215. srawi. J, N, 1
  216. ble .L30
  217. .align 4
  218. .L10:
  219. fmr f1, f0
  220. fmr f2, f0
  221. fmr f3, f0
  222. fmr f4, f0
  223. fmr f5, f0
  224. fmr f6, f0
  225. fmr f7, f0
  226. fmr f8, f0
  227. fmr f9, f0
  228. fmr f10, f0
  229. fmr f11, f0
  230. fmr f12, f0
  231. fmr f13, f0
  232. fmr f14, f0
  233. fmr f15, f0
  234. mr CO1, C
  235. add CO2, C, LDC
  236. add C, CO2, LDC
  237. #if defined(TRMMKERNEL) && defined(LEFT)
  238. mr KK, OFFSET
  239. #endif
  240. srawi. I, M, 1
  241. mr AO, A
  242. ble .L20
  243. .align 4
  244. .L11:
  245. #ifndef TRMMKERNEL
  246. LFD A1, 0 * SIZE(AO)
  247. LFD A2, 1 * SIZE(AO)
  248. LFD A3, 2 * SIZE(AO)
  249. LFDU A5, 4 * SIZE(AO)
  250. LFD B1, 0 * SIZE(B)
  251. LFD B2, 1 * SIZE(B)
  252. LFD B3, 2 * SIZE(B)
  253. LFD B4, 3 * SIZE(B)
  254. dcbtst CO1, PREC
  255. dcbtst CO2, PREC
  256. srawi. r0, K, 1
  257. mr BO, B
  258. mtspr CTR, r0
  259. ble .L15
  260. #else
  261. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  262. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  263. LFD A1, 0 * SIZE(AO)
  264. LFD A2, 1 * SIZE(AO)
  265. LFD A3, 2 * SIZE(AO)
  266. LFDU A5, 4 * SIZE(AO)
  267. LFD B1, 0 * SIZE(B)
  268. LFD B2, 1 * SIZE(B)
  269. LFD B3, 2 * SIZE(B)
  270. LFD B4, 3 * SIZE(B)
  271. mr BO, B
  272. #else
  273. slwi r0, KK, 1 + ZBASE_SHIFT
  274. add AO, AO, r0
  275. add BO, B, r0
  276. LFD A1, 0 * SIZE(AO)
  277. LFD A2, 1 * SIZE(AO)
  278. LFD A3, 2 * SIZE(AO)
  279. LFDU A5, 4 * SIZE(AO)
  280. LFD B1, 0 * SIZE(BO)
  281. LFD B2, 1 * SIZE(BO)
  282. LFD B3, 2 * SIZE(BO)
  283. LFD B4, 3 * SIZE(BO)
  284. #endif
  285. dcbtst CO1, PREC
  286. dcbtst CO2, PREC
  287. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  288. sub TEMP, K, KK
  289. #elif defined(LEFT)
  290. addi TEMP, KK, 2
  291. #else
  292. addi TEMP, KK, 2
  293. #endif
  294. srawi. TEMP, TEMP, 1
  295. mtspr CTR, TEMP
  296. ble .L15
  297. #endif
  298. .align 4
  299. .L12:
  300. FMADD f0, A1, B1, f0
  301. dcbt AO, PREA
  302. FMADD f4, A1, B2, f4
  303. LFDU B5, 4 * SIZE(BO)
  304. FMADD f8, A1, B3, f8
  305. dcbt BO, PREA
  306. FMADD f12, A1, B4, f12
  307. LFD A4, -1 * SIZE(AO)
  308. FMADD f1, A2, B1, f1
  309. nop
  310. FMADD f5, A2, B2, f5
  311. LFD B6, 1 * SIZE(BO)
  312. FMADD f9, A2, B3, f9
  313. LFDU A1, 4 * SIZE(AO)
  314. FMADD f13, A2, B4, f13
  315. nop
  316. FMADD f2, A3, B1, f2
  317. nop
  318. FMADD f6, A3, B2, f6
  319. LFD B7, 2 * SIZE(BO)
  320. FMADD f10, A3, B3, f10
  321. LFD A2, -3 * SIZE(AO)
  322. FMADD f14, A3, B4, f14
  323. nop
  324. FMADD f3, A4, B1, f3
  325. nop
  326. FMADD f7, A4, B2, f7
  327. LFD B8, 3 * SIZE(BO)
  328. FMADD f11, A4, B3, f11
  329. LFD A3, -2 * SIZE(AO)
  330. FMADD f15, A4, B4, f15
  331. nop
  332. FMADD f0, A5, B5, f0
  333. #ifdef DOUBLE
  334. dcbt AO, PREA
  335. #else
  336. nop
  337. #endif
  338. FMADD f4, A5, B6, f4
  339. LFDU B1, 4 * SIZE(BO)
  340. FMADD f8, A5, B7, f8
  341. #ifdef DOUBLE
  342. dcbt BO, PREA
  343. #else
  344. nop
  345. #endif
  346. FMADD f12, A5, B8, f12
  347. LFD A4, -1 * SIZE(AO)
  348. FMADD f1, A2, B5, f1
  349. nop
  350. FMADD f5, A2, B6, f5
  351. LFD B2, 1 * SIZE(BO)
  352. FMADD f9, A2, B7, f9
  353. LFDU A5, 4 * SIZE(AO)
  354. FMADD f13, A2, B8, f13
  355. nop
  356. FMADD f2, A3, B5, f2
  357. nop
  358. FMADD f6, A3, B6, f6
  359. LFD B3, 2 * SIZE(BO)
  360. FMADD f10, A3, B7, f10
  361. LFD A2, -3 * SIZE(AO)
  362. FMADD f14, A3, B8, f14
  363. nop
  364. FMADD f3, A4, B5, f3
  365. nop
  366. FMADD f7, A4, B6, f7
  367. LFD B4, 3 * SIZE(BO)
  368. FMADD f11, A4, B7, f11
  369. LFD A3, -2 * SIZE(AO)
  370. FMADD f15, A4, B8, f15
  371. bdnz .L12
  372. .align 4
  373. .align 4
  374. .L15:
  375. addi AO, AO, -4 * SIZE
  376. #ifndef TRMMKERNEL
  377. andi. r0, K, 1
  378. lfd f30, ALPHA_R
  379. lfd f31, ALPHA_I
  380. ble .LKERNEL_MainFinish
  381. #else
  382. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  383. sub TEMP, K, KK
  384. #elif defined(LEFT)
  385. addi TEMP, KK, 2
  386. #else
  387. addi TEMP, KK, 2
  388. #endif
  389. andi. TEMP, TEMP, 1
  390. lfd f30, ALPHA_R
  391. lfd f31, ALPHA_I
  392. ble .LKERNEL_MainFinish
  393. #endif
  394. .align 4
  395. .L16:
  396. FMADD f0, A1, B1, f0
  397. LFD A4, 3 * SIZE(AO)
  398. FMADD f4, A1, B2, f4
  399. FMADD f8, A1, B3, f8
  400. FMADD f12, A1, B4, f12
  401. FMADD f1, A2, B1, f1
  402. FMADD f5, A2, B2, f5
  403. FMADD f9, A2, B3, f9
  404. FMADD f13, A2, B4, f13
  405. FMADD f2, A3, B1, f2
  406. FMADD f6, A3, B2, f6
  407. FMADD f10, A3, B3, f10
  408. FMADD f14, A3, B4, f14
  409. FMADD f3, A4, B1, f3
  410. FMADD f7, A4, B2, f7
  411. FMADD f11, A4, B3, f11
  412. addi AO, AO, 4 * SIZE
  413. FMADD f15, A4, B4, f15
  414. addi BO, BO, 4 * SIZE
  415. .align 4
  416. .LKERNEL_MainFinish:
  417. #ifndef TRMMKERNEL
  418. LFD f16, 0 * SIZE(CO1)
  419. LFD f17, 1 * SIZE(CO1)
  420. LFD f18, 2 * SIZE(CO1)
  421. LFD f19, 3 * SIZE(CO1)
  422. #endif
  423. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  424. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  425. FSUB f0, f0, f5
  426. FADD f1, f1, f4
  427. FSUB f2, f2, f7
  428. FADD f3, f3, f6
  429. #ifndef TRMMKERNEL
  430. LFD f20, 0 * SIZE(CO2)
  431. LFD f21, 1 * SIZE(CO2)
  432. LFD f22, 2 * SIZE(CO2)
  433. LFD f23, 3 * SIZE(CO2)
  434. #endif
  435. FSUB f8, f8, f13
  436. FADD f9, f9, f12
  437. FSUB f10, f10, f15
  438. FADD f11, f11, f14
  439. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  440. FADD f0, f0, f5
  441. FSUB f1, f1, f4
  442. FADD f2, f2, f7
  443. FSUB f3, f3, f6
  444. #ifndef TRMMKERNEL
  445. LFD f20, 0 * SIZE(CO2)
  446. LFD f21, 1 * SIZE(CO2)
  447. LFD f22, 2 * SIZE(CO2)
  448. LFD f23, 3 * SIZE(CO2)
  449. #endif
  450. FADD f8, f8, f13
  451. FSUB f9, f9, f12
  452. FADD f10, f10, f15
  453. FSUB f11, f11, f14
  454. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  455. FADD f0, f0, f5
  456. FSUB f1, f4, f1
  457. FADD f2, f2, f7
  458. FSUB f3, f6, f3
  459. #ifndef TRMMKERNEL
  460. LFD f20, 0 * SIZE(CO2)
  461. LFD f21, 1 * SIZE(CO2)
  462. LFD f22, 2 * SIZE(CO2)
  463. LFD f23, 3 * SIZE(CO2)
  464. #endif
  465. FADD f8, f8, f13
  466. FSUB f9, f12, f9
  467. FADD f10, f10, f15
  468. FSUB f11, f14, f11
  469. #endif
  470. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  471. #ifndef TRMMKERNEL
  472. FMADD f16, f30, f0, f16
  473. FMADD f17, f30, f1, f17
  474. FMADD f18, f30, f2, f18
  475. FMADD f19, f30, f3, f19
  476. FMADD f20, f30, f8, f20
  477. FMADD f21, f30, f9, f21
  478. FMADD f22, f30, f10, f22
  479. FMADD f23, f30, f11, f23
  480. #else
  481. FMUL f16, f30, f0
  482. FMUL f17, f30, f1
  483. FMUL f18, f30, f2
  484. FMUL f19, f30, f3
  485. FMUL f20, f30, f8
  486. FMUL f21, f30, f9
  487. FMUL f22, f30, f10
  488. FMUL f23, f30, f11
  489. #endif
  490. FNMSUB f16, f31, f1, f16
  491. FMADD f17, f31, f0, f17
  492. FNMSUB f18, f31, f3, f18
  493. FMADD f19, f31, f2, f19
  494. FNMSUB f20, f31, f9, f20
  495. FMADD f21, f31, f8, f21
  496. FNMSUB f22, f31, f11, f22
  497. FMADD f23, f31, f10, f23
  498. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  499. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  500. /* defined(RC)|| defined(RR) */
  501. #ifndef TRMMKERNEL
  502. FMADD f16, f30, f0, f16
  503. FNMSUB f17, f30, f1, f17
  504. FMADD f18, f30, f2, f18
  505. FNMSUB f19, f30, f3, f19
  506. FMADD f20, f30, f8, f20
  507. FNMSUB f21, f30, f9, f21
  508. FMADD f22, f30, f10, f22
  509. FNMSUB f23, f30, f11, f23
  510. FMADD f16, f31, f1, f16
  511. FMADD f17, f31, f0, f17
  512. FMADD f18, f31, f3, f18
  513. FMADD f19, f31, f2, f19
  514. FMADD f20, f31, f9, f20
  515. FMADD f21, f31, f8, f21
  516. FMADD f22, f31, f11, f22
  517. FMADD f23, f31, f10, f23
  518. #else
  519. FMUL f16, f30, f0
  520. FMUL f17, f30, f1
  521. FMUL f18, f30, f2
  522. FMUL f19, f30, f3
  523. FMUL f20, f30, f8
  524. FMUL f21, f30, f9
  525. FMUL f22, f30, f10
  526. FMUL f23, f30, f11
  527. FMADD f16, f31, f1, f16
  528. FNMADD f17, f31, f0, f17
  529. FMADD f18, f31, f3, f18
  530. FNMADD f19, f31, f2, f19
  531. FMADD f20, f31, f9, f20
  532. FNMADD f21, f31, f8, f21
  533. FMADD f22, f31, f11, f22
  534. FNMADD f23, f31, f10, f23
  535. #endif
  536. #endif
  537. STFD f16, 0 * SIZE(CO1)
  538. STFD f17, 1 * SIZE(CO1)
  539. STFD f18, 2 * SIZE(CO1)
  540. STFD f19, 3 * SIZE(CO1)
  541. lfs f0, FZERO
  542. fmr f1, f0
  543. fmr f2, f0
  544. fmr f3, f0
  545. STFD f20, 0 * SIZE(CO2)
  546. STFD f21, 1 * SIZE(CO2)
  547. STFD f22, 2 * SIZE(CO2)
  548. STFD f23, 3 * SIZE(CO2)
  549. fmr f4, f0
  550. fmr f5, f0
  551. fmr f6, f0
  552. fmr f7, f0
  553. fmr f8, f0
  554. fmr f9, f0
  555. fmr f10, f0
  556. fmr f11, f0
  557. fmr f12, f0
  558. fmr f13, f0
  559. fmr f14, f0
  560. fmr f15, f0
  561. addi CO1, CO1, 4 * SIZE
  562. addi CO2, CO2, 4 * SIZE
  563. #ifdef TRMMKERNEL
  564. #if ( defined(LEFT) && defined(TRANSA)) || \
  565. (!defined(LEFT) && !defined(TRANSA))
  566. sub TEMP, K, KK
  567. #ifdef LEFT
  568. addi TEMP, TEMP, -2
  569. #else
  570. addi TEMP, TEMP, -2
  571. #endif
  572. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  573. add AO, AO, TEMP
  574. add BO, BO, TEMP
  575. #endif
  576. #ifdef LEFT
  577. addi KK, KK, 2
  578. #endif
  579. #endif
  580. addic. I, I, -1
  581. bgt .L11
  582. .align 4
  583. .L20:
  584. andi. I, M, 1
  585. ble .L29
  586. #ifndef TRMMKERNEL
  587. LFD f16, 0 * SIZE(AO)
  588. LFD f17, 1 * SIZE(AO)
  589. LFD f18, 2 * SIZE(AO)
  590. LFD f19, 3 * SIZE(AO)
  591. LFD f20, 0 * SIZE(B)
  592. LFD f21, 1 * SIZE(B)
  593. LFD f22, 2 * SIZE(B)
  594. LFD f23, 3 * SIZE(B)
  595. LFD f24, 4 * SIZE(B)
  596. LFD f25, 5 * SIZE(B)
  597. LFD f26, 6 * SIZE(B)
  598. LFD f27, 7 * SIZE(B)
  599. lfs f0, FZERO
  600. fmr f1, f0
  601. fmr f2, f0
  602. fmr f3, f0
  603. fmr f4, f0
  604. fmr f5, f0
  605. fmr f6, f0
  606. fmr f7, f0
  607. srawi. r0, K, 2
  608. mr BO, B
  609. mtspr CTR, r0
  610. ble .L25
  611. #else
  612. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  613. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  614. LFD f16, 0 * SIZE(AO)
  615. LFD f17, 1 * SIZE(AO)
  616. LFD f18, 2 * SIZE(AO)
  617. LFD f19, 3 * SIZE(AO)
  618. LFD f20, 0 * SIZE(B)
  619. LFD f21, 1 * SIZE(B)
  620. LFD f22, 2 * SIZE(B)
  621. LFD f23, 3 * SIZE(B)
  622. LFD f24, 4 * SIZE(B)
  623. LFD f25, 5 * SIZE(B)
  624. LFD f26, 6 * SIZE(B)
  625. LFD f27, 7 * SIZE(B)
  626. mr BO, B
  627. #else
  628. slwi r0, KK, 0 + ZBASE_SHIFT
  629. slwi TEMP, KK, 1 + ZBASE_SHIFT
  630. add AO, AO, r0
  631. add BO, B, TEMP
  632. LFD f16, 0 * SIZE(AO)
  633. LFD f17, 1 * SIZE(AO)
  634. LFD f18, 2 * SIZE(AO)
  635. LFD f19, 3 * SIZE(AO)
  636. LFD f20, 0 * SIZE(BO)
  637. LFD f21, 1 * SIZE(BO)
  638. LFD f22, 2 * SIZE(BO)
  639. LFD f23, 3 * SIZE(BO)
  640. LFD f24, 4 * SIZE(BO)
  641. LFD f25, 5 * SIZE(BO)
  642. LFD f26, 6 * SIZE(BO)
  643. LFD f27, 7 * SIZE(BO)
  644. #endif
  645. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  646. sub TEMP, K, KK
  647. #elif defined(LEFT)
  648. addi TEMP, KK, 1
  649. #else
  650. addi TEMP, KK, 2
  651. #endif
  652. srawi. TEMP, TEMP, 2
  653. mtspr CTR, TEMP
  654. ble .L25
  655. #endif
  656. .align 4
  657. .L22:
  658. fmadd f0, f16, f20, f0
  659. LFD f27, 7 * SIZE(BO)
  660. fmadd f1, f16, f21, f1
  661. LFD f19, 3 * SIZE(AO)
  662. fmadd f2, f16, f22, f2
  663. nop
  664. fmadd f3, f16, f23, f3
  665. LFD f16, 4 * SIZE(AO)
  666. fmadd f4, f17, f20, f4
  667. LFD f20, 8 * SIZE(BO)
  668. fmadd f5, f17, f21, f5
  669. LFD f21, 9 * SIZE(BO)
  670. fmadd f6, f17, f22, f6
  671. LFD f22, 10 * SIZE(BO)
  672. fmadd f7, f17, f23, f7
  673. LFD f23, 11 * SIZE(BO)
  674. fmadd f0, f18, f24, f0
  675. LFD f17, 5 * SIZE(AO)
  676. fmadd f1, f18, f25, f1
  677. nop
  678. fmadd f2, f18, f26, f2
  679. nop
  680. fmadd f3, f18, f27, f3
  681. LFD f18, 6 * SIZE(AO)
  682. fmadd f4, f19, f24, f4
  683. LFD f24, 12 * SIZE(BO)
  684. fmadd f5, f19, f25, f5
  685. LFD f25, 13 * SIZE(BO)
  686. fmadd f6, f19, f26, f6
  687. LFD f26, 14 * SIZE(BO)
  688. fmadd f7, f19, f27, f7
  689. LFD f27, 15 * SIZE(BO)
  690. fmadd f0, f16, f20, f0
  691. LFD f19, 7 * SIZE(AO)
  692. fmadd f1, f16, f21, f1
  693. nop
  694. fmadd f2, f16, f22, f2
  695. nop
  696. fmadd f3, f16, f23, f3
  697. LFDU f16, 8 * SIZE(AO)
  698. fmadd f4, f17, f20, f4
  699. LFDU f20, 16 * SIZE(BO)
  700. fmadd f5, f17, f21, f5
  701. LFD f21, 1 * SIZE(BO)
  702. fmadd f6, f17, f22, f6
  703. LFD f22, 2 * SIZE(BO)
  704. fmadd f7, f17, f23, f7
  705. LFD f23, 3 * SIZE(BO)
  706. fmadd f0, f18, f24, f0
  707. LFD f17, 1 * SIZE(AO)
  708. fmadd f1, f18, f25, f1
  709. nop
  710. fmadd f2, f18, f26, f2
  711. nop
  712. fmadd f3, f18, f27, f3
  713. LFD f18, 2 * SIZE(AO)
  714. fmadd f4, f19, f24, f4
  715. LFD f24, 4 * SIZE(BO)
  716. fmadd f5, f19, f25, f5
  717. LFD f25, 5 * SIZE(BO)
  718. fmadd f6, f19, f26, f6
  719. LFD f26, 6 * SIZE(BO)
  720. fmadd f7, f19, f27, f7
  721. bdnz .L22
  722. .align 4
  723. .L25:
  724. #ifndef TRMMKERNEL
  725. andi. r0, K, 3
  726. lfd f30, ALPHA_R
  727. lfd f31, ALPHA_I
  728. mtspr CTR, r0
  729. ble .L27
  730. #else
  731. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  732. sub TEMP, K, KK
  733. #elif defined(LEFT)
  734. addi TEMP, KK, 1
  735. #else
  736. addi TEMP, KK, 2
  737. #endif
  738. andi. TEMP, TEMP, 3
  739. lfd f30, ALPHA_R
  740. lfd f31, ALPHA_I
  741. mtspr CTR, TEMP
  742. ble .L27
  743. #endif
  744. .align 4
  745. .L26:
  746. fmadd f0, f16, f20, f0
  747. fmadd f1, f16, f21, f1
  748. fmadd f2, f16, f22, f2
  749. fmadd f3, f16, f23, f3
  750. LFDU f16, 2 * SIZE(AO)
  751. fmadd f4, f17, f20, f4
  752. LFDU f20, 4 * SIZE(BO)
  753. fmadd f5, f17, f21, f5
  754. LFD f21, 1 * SIZE(BO)
  755. fmadd f6, f17, f22, f6
  756. LFD f22, 2 * SIZE(BO)
  757. fmadd f7, f17, f23, f7
  758. LFD f23, 3 * SIZE(BO)
  759. LFD f17, 1 * SIZE(AO)
  760. bdnz .L26
  761. .align 4
  762. .L27:
  763. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  764. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  765. FSUB f0, f0, f5
  766. FADD f1, f1, f4
  767. FSUB f2, f2, f7
  768. FADD f3, f3, f6
  769. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  770. FADD f0, f0, f5
  771. FSUB f1, f4, f1
  772. FADD f2, f2, f7
  773. FSUB f3, f6, f3
  774. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  775. FADD f0, f0, f5
  776. FSUB f1, f1, f4
  777. FADD f2, f2, f7
  778. FSUB f3, f3, f6
  779. #endif
  780. #ifndef TRMMKERNEL
  781. LFD f16, 0 * SIZE(CO1)
  782. LFD f17, 1 * SIZE(CO1)
  783. LFD f18, 0 * SIZE(CO2)
  784. LFD f19, 1 * SIZE(CO2)
  785. #endif
  786. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  787. #ifndef TRMMKERNEL
  788. FMADD f16, f30, f0, f16
  789. FMADD f17, f30, f1, f17
  790. FMADD f18, f30, f2, f18
  791. FMADD f19, f30, f3, f19
  792. #else
  793. FMUL f16, f30, f0
  794. FMUL f17, f30, f1
  795. FMUL f18, f30, f2
  796. FMUL f19, f30, f3
  797. #endif
  798. FNMSUB f16, f31, f1, f16
  799. FMADD f17, f31, f0, f17
  800. FNMSUB f18, f31, f3, f18
  801. FMADD f19, f31, f2, f19
  802. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  803. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  804. /* defined(RC)|| defined(RR) */
  805. #ifndef TRMMKERNEL
  806. FMADD f16, f30, f0, f16
  807. FNMSUB f17, f30, f1, f17
  808. FMADD f18, f30, f2, f18
  809. FNMSUB f19, f30, f3, f19
  810. FMADD f16, f31, f1, f16
  811. FMADD f17, f31, f0, f17
  812. FMADD f18, f31, f3, f18
  813. FMADD f19, f31, f2, f19
  814. #else
  815. FMUL f16, f30, f0
  816. FMUL f17, f30, f1
  817. FMUL f18, f30, f2
  818. FMUL f19, f30, f3
  819. FMADD f16, f31, f1, f16
  820. FNMADD f17, f31, f0, f17
  821. FMADD f18, f31, f3, f18
  822. FNMADD f19, f31, f2, f19
  823. #endif
  824. #endif
  825. STFD f16, 0 * SIZE(CO1)
  826. STFD f17, 1 * SIZE(CO1)
  827. STFD f18, 0 * SIZE(CO2)
  828. STFD f19, 1 * SIZE(CO2)
  829. addi CO1, CO1, 2 * SIZE
  830. addi CO2, CO2, 2 * SIZE
  831. #ifdef TRMMKERNEL
  832. #if ( defined(LEFT) && defined(TRANSA)) || \
  833. (!defined(LEFT) && !defined(TRANSA))
  834. sub TEMP, K, KK
  835. #ifdef LEFT
  836. addi TEMP, TEMP, -1
  837. #else
  838. addi TEMP, TEMP, -2
  839. #endif
  840. slwi r0, TEMP, 0 + ZBASE_SHIFT
  841. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  842. add AO, AO, r0
  843. add BO, BO, TEMP
  844. #endif
  845. #ifdef LEFT
  846. addi KK, KK, 1
  847. #endif
  848. #endif
  849. .align 4
  850. .L29:
  851. #if defined(TRMMKERNEL) && !defined(LEFT)
  852. addi KK, KK, 2
  853. #endif
  854. mr B, BO
  855. addic. J, J, -1
  856. lfs f0, FZERO
  857. bgt .L10
  858. .align 4
  859. .L30:
  860. andi. J, N, 1
  861. ble .L999
  862. #if defined(TRMMKERNEL) && defined(LEFT)
  863. mr KK, OFFSET
  864. #endif
  865. srawi. I, M, 1
  866. mr CO1, C
  867. add C, C, LDC
  868. mr AO, A
  869. ble .L40
  870. .align 4
  871. .L31:
  872. #ifndef TRMMKERNEL
  873. LFD f20, 0 * SIZE(AO)
  874. LFD f21, 1 * SIZE(AO)
  875. LFD f22, 2 * SIZE(AO)
  876. LFD f23, 3 * SIZE(AO)
  877. LFD f24, 4 * SIZE(AO)
  878. LFD f25, 5 * SIZE(AO)
  879. LFD f26, 6 * SIZE(AO)
  880. LFD f27, 7 * SIZE(AO)
  881. LFD f16, 0 * SIZE(B)
  882. LFD f17, 1 * SIZE(B)
  883. LFD f18, 2 * SIZE(B)
  884. LFD f19, 3 * SIZE(B)
  885. lfs f0, FZERO
  886. fmr f1, f0
  887. fmr f2, f0
  888. fmr f3, f0
  889. fmr f4, f0
  890. fmr f5, f0
  891. fmr f6, f0
  892. fmr f7, f0
  893. srawi. r0, K, 2
  894. mr BO, B
  895. mtspr CTR, r0
  896. ble .L35
  897. #else
  898. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  899. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  900. LFD f20, 0 * SIZE(AO)
  901. LFD f21, 1 * SIZE(AO)
  902. LFD f22, 2 * SIZE(AO)
  903. LFD f23, 3 * SIZE(AO)
  904. LFD f24, 4 * SIZE(AO)
  905. LFD f25, 5 * SIZE(AO)
  906. LFD f26, 6 * SIZE(AO)
  907. LFD f27, 7 * SIZE(AO)
  908. LFD f16, 0 * SIZE(B)
  909. LFD f17, 1 * SIZE(B)
  910. LFD f18, 2 * SIZE(B)
  911. LFD f19, 3 * SIZE(B)
  912. lfs f0, FZERO
  913. fmr f1, f0
  914. fmr f2, f0
  915. fmr f3, f0
  916. fmr f4, f0
  917. fmr f5, f0
  918. fmr f6, f0
  919. fmr f7, f0
  920. mr BO, B
  921. #else
  922. slwi r0, KK, 1 + ZBASE_SHIFT
  923. slwi TEMP, KK, 0 + ZBASE_SHIFT
  924. add AO, AO, r0
  925. add BO, B, TEMP
  926. LFD f20, 0 * SIZE(AO)
  927. LFD f21, 1 * SIZE(AO)
  928. LFD f22, 2 * SIZE(AO)
  929. LFD f23, 3 * SIZE(AO)
  930. LFD f24, 4 * SIZE(AO)
  931. LFD f25, 5 * SIZE(AO)
  932. LFD f26, 6 * SIZE(AO)
  933. LFD f27, 7 * SIZE(AO)
  934. LFD f16, 0 * SIZE(BO)
  935. LFD f17, 1 * SIZE(BO)
  936. LFD f18, 2 * SIZE(BO)
  937. LFD f19, 3 * SIZE(BO)
  938. lfs f0, FZERO
  939. fmr f1, f0
  940. fmr f2, f0
  941. fmr f3, f0
  942. fmr f4, f0
  943. fmr f5, f0
  944. fmr f6, f0
  945. fmr f7, f0
  946. #endif
  947. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  948. sub TEMP, K, KK
  949. #elif defined(LEFT)
  950. addi TEMP, KK, 2
  951. #else
  952. addi TEMP, KK, 1
  953. #endif
  954. srawi. TEMP, TEMP, 2
  955. mtspr CTR, TEMP
  956. ble .L35
  957. #endif
  /*
   * .L32: main k loop of the two-element tile, unrolled four k-steps per
   * trip; CTR holds the trip count and bdnz closes the loop.
   *
   * Per trip: 32 fmadd, AO advances 16 * SIZE and BO advances 8 * SIZE (the
   * pointer bumps come from the two LFDU update-form loads near the end).
   *
   * Register roles inside the loop:
   *   f20-f23 / f24-f27  four consecutive A values for even / odd k-steps
   *   f16, f18           B value at the even offset of a k-step
   *   f17, f19           B value at the odd offset of a k-step
   *   f0-f3              sums of A[0..3] * (even-offset B value)
   *   f4-f7              sums of A[0..3] * (odd-offset B value)
   *
   * Every load is slotted between two fmadds and refills a register only
   * after its last use; nop fills the slots where no load is due
   * (presumably to keep the issue pattern regular -- not verifiable here).
   */
  958. .align 4
  959. .L32:
  /* k+0: A[0..3] x B[0..1] */
  960. fmadd f0, f16, f20, f0
  961. LFD f27, 7 * SIZE(AO) /* the loop bottom refills f24-f26 only */
  962. fmadd f1, f16, f21, f1
  963. LFD f19, 3 * SIZE(BO) /* the loop bottom refills f17/f18 only */
  964. fmadd f2, f16, f22, f2
  965. nop
  966. fmadd f3, f16, f23, f3
  967. LFD f16, 4 * SIZE(BO)
  968. fmadd f4, f17, f20, f4
  969. LFD f20, 8 * SIZE(AO)
  970. fmadd f5, f17, f21, f5
  971. LFD f21, 9 * SIZE(AO)
  972. fmadd f6, f17, f22, f6
  973. LFD f22, 10 * SIZE(AO)
  974. fmadd f7, f17, f23, f7
  975. LFD f23, 11 * SIZE(AO)
  /* k+1: A[4..7] x B[2..3] */
  976. fmadd f0, f18, f24, f0
  977. LFD f17, 5 * SIZE(BO)
  978. fmadd f1, f18, f25, f1
  979. nop
  980. fmadd f2, f18, f26, f2
  981. nop
  982. fmadd f3, f18, f27, f3
  983. LFD f18, 6 * SIZE(BO)
  984. fmadd f4, f19, f24, f4
  985. LFD f24, 12 * SIZE(AO)
  986. fmadd f5, f19, f25, f5
  987. LFD f25, 13 * SIZE(AO)
  988. fmadd f6, f19, f26, f6
  989. LFD f26, 14 * SIZE(AO)
  990. fmadd f7, f19, f27, f7
  991. LFD f27, 15 * SIZE(AO)
  /* k+2: A[8..11] x B[4..5] */
  992. fmadd f0, f16, f20, f0
  993. LFD f19, 7 * SIZE(BO)
  994. fmadd f1, f16, f21, f1
  995. nop
  996. fmadd f2, f16, f22, f2
  997. nop
  998. fmadd f3, f16, f23, f3
  999. LFDU f16, 8 * SIZE(BO) /* BO += 8 * SIZE; later BO offsets are relative to the new BO */
  1000. fmadd f4, f17, f20, f4
  1001. LFDU f20, 16 * SIZE(AO) /* AO += 16 * SIZE; later AO offsets are relative to the new AO */
  1002. fmadd f5, f17, f21, f5
  1003. LFD f21, 1 * SIZE(AO)
  1004. fmadd f6, f17, f22, f6
  1005. LFD f22, 2 * SIZE(AO)
  1006. fmadd f7, f17, f23, f7
  1007. LFD f23, 3 * SIZE(AO)
  /* k+3: A[12..15] x B[6..7], both loaded before the pointer bumps above */
  1008. fmadd f0, f18, f24, f0
  1009. LFD f17, 1 * SIZE(BO)
  1010. fmadd f1, f18, f25, f1
  1011. nop
  1012. fmadd f2, f18, f26, f2
  1013. nop
  1014. fmadd f3, f18, f27, f3
  1015. LFD f18, 2 * SIZE(BO)
  1016. fmadd f4, f19, f24, f4
  1017. LFD f24, 4 * SIZE(AO)
  1018. fmadd f5, f19, f25, f5
  1019. LFD f25, 5 * SIZE(AO)
  1020. fmadd f6, f19, f26, f6
  1021. LFD f26, 6 * SIZE(AO)
  1022. fmadd f7, f19, f27, f7
  1023. bdnz .L32 /* decrement CTR, loop while it is non-zero */
  /*
   * .L35: set up the k remainder of the two-element tile.
   *
   * CTR = (k count) & 3, i.e. the 0..3 steps the four-way unrolled .L32 loop
   * did not cover.  f30/f31 are loaded with ALPHA_R/ALPHA_I for the
   * write-back at .L37.  andi. records the result in CR0 and the lfd/mtspr
   * that follow do not touch CR0, so the closing ble skips .L36 when the
   * remainder is zero.
   */
  1024. .align 4
  1025. .L35:
  1026. #ifndef TRMMKERNEL
  1027. andi. r0, K, 3
  1028. lfd f30, ALPHA_R
  1029. lfd f31, ALPHA_I
  1030. mtspr CTR, r0
  1031. ble .L37
  1032. #else
  /* TRMM: rebuild the same k count in TEMP that sized the unrolled loop. */
  1033. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1034. sub TEMP, K, KK
  1035. #elif defined(LEFT)
  1036. addi TEMP, KK, 2
  1037. #else
  1038. addi TEMP, KK, 1
  1039. #endif
  1040. andi. TEMP, TEMP, 3
  1041. lfd f30, ALPHA_R
  1042. lfd f31, ALPHA_I
  1043. mtspr CTR, TEMP
  1044. ble .L37
  1045. #endif
  /*
   * .L36: remainder k loop of the two-element tile, one k-step per trip
   * (CTR trips, 1..3 as set up at .L35).
   *
   * Per trip: f0-f3 += A[0..3] * f16 (even-offset B value) and
   * f4-f7 += A[0..3] * f17 (odd-offset B value); AO advances 4 * SIZE and
   * BO advances 2 * SIZE through the LFDU update-form loads.  Each register
   * is reloaded only after its last use in the current step.
   */
  1046. .align 4
  1047. .L36:
  1048. fmadd f0, f16, f20, f0
  1049. fmadd f1, f16, f21, f1
  1050. fmadd f2, f16, f22, f2
  1051. fmadd f3, f16, f23, f3
  1052. LFDU f16, 2 * SIZE(BO) /* BO += 2 * SIZE */
  1053. fmadd f4, f17, f20, f4
  1054. LFDU f20, 4 * SIZE(AO) /* AO += 4 * SIZE */
  1055. fmadd f5, f17, f21, f5
  1056. LFD f21, 1 * SIZE(AO)
  1057. fmadd f6, f17, f22, f6
  1058. LFD f22, 2 * SIZE(AO)
  1059. fmadd f7, f17, f23, f7
  1060. LFD f23, 3 * SIZE(AO)
  1061. LFD f17, 1 * SIZE(BO)
  1062. bdnz .L36
  /*
   * .L37: write-back of the two-element tile (four values at CO1).
   *
   * Entry state (see .L32/.L36):
   *   f0-f3  sums of A[0..3] * (even-offset B value)
   *   f4-f7  sums of A[0..3] * (odd-offset B value)
   *   f30    ALPHA_R, f31 = ALPHA_I
   *
   * Step 1 folds f4-f7 into f0-f3; the sign pattern depends on the variant
   * macro.  Step 2 scales by alpha:
   *   NN/NT/TN/TT:      out0 = ALPHA_R*f0 - ALPHA_I*f1
   *                     out1 = ALPHA_R*f1 + ALPHA_I*f0
   *   all other cases:  out0 = ALPHA_R*f0 + ALPHA_I*f1
   *                     out1 = ALPHA_I*f0 - ALPHA_R*f1
   * (likewise for f2/f3 -> out2/out3).  Read as complex arithmetic on
   * interleaved (re, im) data, the second form treats f1/f3 as negated
   * imaginary parts.  Without TRMMKERNEL the old contents of CO1 are added
   * to the result; with TRMMKERNEL the result overwrites CO1.
   *
   * Step 3 stores, advances CO1 by 4 * SIZE, performs the TRMM pointer/KK
   * bookkeeping and loops back to .L31 while I stays positive.
   */
  1063. .align 4
  1064. .L37:
  1065. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1066. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  1067. FSUB f0, f0, f5
  1068. FADD f1, f1, f4
  1069. FSUB f2, f2, f7
  1070. FADD f3, f3, f6
  1071. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  1072. FADD f0, f0, f5
  1073. FSUB f1, f1, f4
  1074. FADD f2, f2, f7
  1075. FSUB f3, f3, f6
  1076. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  1077. FADD f0, f0, f5
  1078. FSUB f1, f4, f1
  1079. FADD f2, f2, f7
  1080. FSUB f3, f6, f3
  1081. #endif
  1082. #ifndef TRMMKERNEL
  /* GEMM accumulates into C, so fetch the current tile first. */
  1083. LFD f16, 0 * SIZE(CO1)
  1084. LFD f17, 1 * SIZE(CO1)
  1085. LFD f18, 2 * SIZE(CO1)
  1086. LFD f19, 3 * SIZE(CO1)
  1087. #endif
  1088. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1089. #ifndef TRMMKERNEL
  1090. FMADD f16, f30, f0, f16
  1091. FMADD f17, f30, f1, f17
  1092. FMADD f18, f30, f2, f18
  1093. FMADD f19, f30, f3, f19
  1094. #else
  1095. FMUL f16, f30, f0
  1096. FMUL f17, f30, f1
  1097. FMUL f18, f30, f2
  1098. FMUL f19, f30, f3
  1099. #endif
  /* FNMSUB d, a, c, b computes d = b - a*c, i.e. subtracts ALPHA_I * f1. */
  1100. FNMSUB f16, f31, f1, f16
  1101. FMADD f17, f31, f0, f17
  1102. FNMSUB f18, f31, f3, f18
  1103. FMADD f19, f31, f2, f19
  1104. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  1105. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  1106. /* defined(RC)|| defined(RR) */
  1107. #ifndef TRMMKERNEL
  1108. FMADD f16, f30, f0, f16
  1109. FNMSUB f17, f30, f1, f17
  1110. FMADD f18, f30, f2, f18
  1111. FNMSUB f19, f30, f3, f19
  1112. FMADD f16, f31, f1, f16
  1113. FMADD f17, f31, f0, f17
  1114. FMADD f18, f31, f3, f18
  1115. FMADD f19, f31, f2, f19
  1116. #else
  /*
   * Same formula as the non-TRMM branch above, minus the old C term:
   *   f16 = ALPHA_R*f0 + ALPHA_I*f1      f17 = ALPHA_I*f0 - ALPHA_R*f1
   * f17/f19 start from the ALPHA_I product so that FNMSUB (b - a*c) can
   * subtract the ALPHA_R product; an FNMADD here would negate the ALPHA_I
   * term as well and disagree with the non-TRMM branch for ALPHA_I != 0.
   */
  1117. FMUL f16, f30, f0
  1118. FMUL f17, f31, f0
  1119. FMUL f18, f30, f2
  1120. FMUL f19, f31, f2
  1121. FMADD f16, f31, f1, f16
  1122. FNMSUB f17, f30, f1, f17
  1123. FMADD f18, f31, f3, f18
  1124. FNMSUB f19, f30, f3, f19
  1125. #endif
  1126. #endif
  1127. STFD f16, 0 * SIZE(CO1)
  1128. STFD f17, 1 * SIZE(CO1)
  1129. STFD f18, 2 * SIZE(CO1)
  1130. STFD f19, 3 * SIZE(CO1)
  1131. addi CO1, CO1, 4 * SIZE
  1132. #ifdef TRMMKERNEL
  1133. #if ( defined(LEFT) && defined(TRANSA)) || \
  1134. (!defined(LEFT) && !defined(TRANSA))
  /*
   * TEMP = K - KK - 2 (LEFT) or K - KK - 1 (otherwise); AO is advanced by
   * TEMP << (1 + ZBASE_SHIFT) bytes and BO by TEMP << ZBASE_SHIFT bytes.
   * NOTE(review): presumably this steps over the part of the packed panels
   * this tile did not read -- confirm against the packing routines.
   */
  1135. sub TEMP, K, KK
  1136. #ifdef LEFT
  1137. addi TEMP, TEMP, -2
  1138. #else
  1139. addi TEMP, TEMP, -1
  1140. #endif
  1141. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1142. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1143. add AO, AO, r0
  1144. add BO, BO, TEMP
  1145. #endif
  1146. #ifdef LEFT
  1147. addi KK, KK, 2
  1148. #endif
  1149. #endif
  1150. addic. I, I, -1 /* one tile done; CR0 reflects the new I */
  1151. bgt .L31
  /*
   * .L40: tail for odd M -- a single-element tile.
   *
   * Leaves through .L999 when bit 0 of M is clear.  Otherwise it preloads
   * f16-f19 from AO[0..3] and f20-f23 from the B panel [0..3] (two k-steps
   * of two values each), clears the accumulators f0-f7 by loading FZERO
   * into f0 and copying it (FZERO presumably holds 0.0 -- defined outside
   * this chunk), and loads CTR with the k count divided by four for the
   * unrolled loop at .L42.  srawi. sets CR0, so the closing ble goes
   * straight to .L45 when there are fewer than four k-steps.
   */
  1152. .align 4
  1153. .L40:
  1154. andi. I, M, 1
  1155. ble .L999
  1156. #ifndef TRMMKERNEL
  1157. LFD f16, 0 * SIZE(AO)
  1158. LFD f17, 1 * SIZE(AO)
  1159. LFD f18, 2 * SIZE(AO)
  1160. LFD f19, 3 * SIZE(AO)
  1161. LFD f20, 0 * SIZE(B)
  1162. LFD f21, 1 * SIZE(B)
  1163. LFD f22, 2 * SIZE(B)
  1164. LFD f23, 3 * SIZE(B)
  1165. lfs f0, FZERO
  1166. fmr f1, f0
  1167. fmr f2, f0
  1168. fmr f3, f0
  1169. fmr f4, f0
  1170. fmr f5, f0
  1171. fmr f6, f0
  1172. fmr f7, f0
  1173. srawi. r0, K, 2
  1174. mr BO, B
  1175. mtspr CTR, r0
  1176. ble .L45
  1177. #else
  1178. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1179. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* TRMM, these variants: both panels are read from their current start. */
  1180. LFD f16, 0 * SIZE(AO)
  1181. LFD f17, 1 * SIZE(AO)
  1182. LFD f18, 2 * SIZE(AO)
  1183. LFD f19, 3 * SIZE(AO)
  1184. LFD f20, 0 * SIZE(B)
  1185. LFD f21, 1 * SIZE(B)
  1186. LFD f22, 2 * SIZE(B)
  1187. LFD f23, 3 * SIZE(B)
  1188. lfs f0, FZERO
  1189. fmr f1, f0
  1190. fmr f2, f0
  1191. fmr f3, f0
  1192. fmr f4, f0
  1193. fmr f5, f0
  1194. fmr f6, f0
  1195. fmr f7, f0
  1196. mr BO, B
  1197. #else
  /* TRMM, remaining variants: start KK << ZBASE_SHIFT bytes into both panels. */
  1198. slwi r0, KK, 0 + ZBASE_SHIFT
  1199. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1200. add AO, AO, r0
  1201. add BO, B, TEMP
  1202. LFD f16, 0 * SIZE(AO)
  1203. LFD f17, 1 * SIZE(AO)
  1204. LFD f18, 2 * SIZE(AO)
  1205. LFD f19, 3 * SIZE(AO)
  1206. LFD f20, 0 * SIZE(BO)
  1207. LFD f21, 1 * SIZE(BO)
  1208. LFD f22, 2 * SIZE(BO)
  1209. LFD f23, 3 * SIZE(BO)
  1210. lfs f0, FZERO
  1211. fmr f1, f0
  1212. fmr f2, f0
  1213. fmr f3, f0
  1214. fmr f4, f0
  1215. fmr f5, f0
  1216. fmr f6, f0
  1217. fmr f7, f0
  1218. #endif
  /* TRMM k count for this tile: K - KK, or KK + 1 (both remaining branches are identical here). */
  1219. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1220. sub TEMP, K, KK
  1221. #elif defined(LEFT)
  1222. addi TEMP, KK, 1
  1223. #else
  1224. addi TEMP, KK, 1
  1225. #endif
  1226. srawi. TEMP, TEMP, 2
  1227. mtspr CTR, TEMP
  1228. ble .L45
  1229. #endif
  /*
   * .L42: main k loop of the single-element tile, unrolled four k-steps per
   * trip (CTR trips).
   *
   * Per trip: 16 fmadd; AO and BO each advance 8 * SIZE through the LFDU
   * update-form loads.  Two accumulator sets alternate between k-steps:
   *   even steps (f16,f17 x f20,f21): f0 += a0*b0, f3 += a0*b1,
   *                                   f2 += a1*b0, f1 += a1*b1
   *   odd steps  (f18,f19 x f22,f23): f4 += a0*b0, f7 += a0*b1,
   *                                   f6 += a1*b0, f5 += a1*b1
   * where a0/a1 and b0/b1 are the two consecutive A and B values of a step.
   * The two sets are merged at .L45.
   */
  1230. .align 4
  1231. .L42:
  /* k+0 */
  1232. fmadd f0, f16, f20, f0
  1233. LFD f23, 3 * SIZE(BO) /* the loop bottom refills f21/f22 but not f23 */
  1234. fmadd f3, f16, f21, f3
  1235. LFD f16, 4 * SIZE(AO)
  1236. fmadd f2, f17, f20, f2
  1237. LFD f20, 4 * SIZE(BO)
  1238. fmadd f1, f17, f21, f1
  1239. LFD f17, 5 * SIZE(AO)
  /* k+1 */
  1240. fmadd f4, f18, f22, f4
  1241. LFD f21, 5 * SIZE(BO)
  1242. fmadd f7, f18, f23, f7
  1243. LFD f18, 6 * SIZE(AO)
  1244. fmadd f6, f19, f22, f6
  1245. LFD f22, 6 * SIZE(BO)
  1246. fmadd f5, f19, f23, f5
  1247. LFD f19, 7 * SIZE(AO)
  /* k+2 */
  1248. fmadd f0, f16, f20, f0
  1249. LFD f23, 7 * SIZE(BO)
  1250. fmadd f3, f16, f21, f3
  1251. LFDU f16, 8 * SIZE(AO) /* AO += 8 * SIZE */
  1252. fmadd f2, f17, f20, f2
  1253. LFDU f20, 8 * SIZE(BO) /* BO += 8 * SIZE */
  1254. fmadd f1, f17, f21, f1
  1255. LFD f17, 1 * SIZE(AO)
  /* k+3: operands were loaded before the pointer bumps above */
  1256. fmadd f4, f18, f22, f4
  1257. LFD f21, 1 * SIZE(BO)
  1258. fmadd f7, f18, f23, f7
  1259. LFD f18, 2 * SIZE(AO)
  1260. fmadd f6, f19, f22, f6
  1261. LFD f22, 2 * SIZE(BO)
  1262. fmadd f5, f19, f23, f5
  1263. LFD f19, 3 * SIZE(AO)
  1264. bdnz .L42
  /*
   * .L45: merge the two accumulator sets of .L42 (f4-f7 into f0-f3) and set
   * up the k remainder of the single-element tile.
   *
   * CTR = (k count) & 3; f30/f31 = ALPHA_R/ALPHA_I for the write-back at
   * .L47.  andi. sets CR0 and the lfd/mtspr after it leave CR0 alone, so
   * the closing ble skips .L46 when no k-steps remain.
   */
  1265. .align 4
  1266. .L45:
  1267. fadd f0, f0, f4
  1268. fadd f1, f1, f5
  1269. fadd f2, f2, f6
  1270. fadd f3, f3, f7
  1271. #ifndef TRMMKERNEL
  1272. andi. r0, K, 3
  1273. lfd f30, ALPHA_R
  1274. lfd f31, ALPHA_I
  1275. mtspr CTR,r0
  1276. ble .L47
  1277. #else
  /* TRMM: rebuild the k count used at .L40 (K - KK, or KK + 1). */
  1278. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1279. sub TEMP, K, KK
  1280. #elif defined(LEFT)
  1281. addi TEMP, KK, 1
  1282. #else
  1283. addi TEMP, KK, 1
  1284. #endif
  1285. andi. TEMP, TEMP, 3
  1286. lfd f30, ALPHA_R
  1287. lfd f31, ALPHA_I
  1288. mtspr CTR,TEMP
  1289. ble .L47
  1290. #endif
  /*
   * .L46: remainder k loop of the single-element tile, one k-step per trip
   * (CTR trips, 1..3 as set up at .L45).
   *
   * With a0/a1 = f16/f17 and b0/b1 = f20/f21:
   *   f0 += a0*b0, f3 += a0*b1, f2 += a1*b0, f1 += a1*b1
   * AO and BO each advance 2 * SIZE through the LFDU update-form loads.
   */
  1291. .align 4
  1292. .L46:
  1293. fmadd f0, f16, f20, f0
  1294. fmadd f3, f16, f21, f3
  1295. LFDU f16, 2 * SIZE(AO) /* AO += 2 * SIZE */
  1296. fmadd f2, f17, f20, f2
  1297. LFDU f20, 2 * SIZE(BO) /* BO += 2 * SIZE */
  1298. fmadd f1, f17, f21, f1
  1299. LFD f17, 1 * SIZE(AO)
  1300. LFD f21, 1 * SIZE(BO)
  1301. bdnz .L46
  /*
   * .L47: write-back of the single-element tile (two values at CO1).
   *
   * Entry state (see .L45/.L46), with a0/a1 and b0/b1 the two consecutive
   * A and B values of each k-step:
   *   f0 = sum a0*b0, f1 = sum a1*b1, f2 = sum a1*b0, f3 = sum a0*b1
   *   f30 = ALPHA_R, f31 = ALPHA_I
   *
   * Step 1 folds f1 into f0 and f3 into f2 with variant-dependent signs.
   * Step 2 scales by alpha:
   *   NN/NT/TN/TT:      out0 = ALPHA_R*f0 - ALPHA_I*f2
   *                     out1 = ALPHA_R*f2 + ALPHA_I*f0
   *   all other cases:  out0 = ALPHA_R*f0 + ALPHA_I*f2
   *                     out1 = ALPHA_I*f0 - ALPHA_R*f2
   * Read as complex arithmetic on interleaved (re, im) data, the second
   * form treats f2 as a negated imaginary part.  Without TRMMKERNEL the old
   * contents of CO1 are added to the result; with TRMMKERNEL the result
   * overwrites CO1.  Execution then falls through to .L999.
   */
  1302. .align 4
  1303. .L47:
  1304. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1305. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  1306. fsub f0, f0, f1
  1307. fadd f2, f2, f3
  1308. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  1309. fadd f0, f0, f1
  1310. fsub f2, f2, f3
  1311. #else
  1312. fadd f0, f0, f1
  1313. fsub f2, f3, f2
  1314. #endif
  1315. #ifndef TRMMKERNEL
  /* GEMM accumulates into C, so fetch the current values first. */
  1316. LFD f16, 0 * SIZE(CO1)
  1317. LFD f17, 1 * SIZE(CO1)
  1318. #endif
  1319. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1320. #ifndef TRMMKERNEL
  1321. FMADD f16, f30, f0, f16
  1322. FMADD f17, f30, f2, f17
  1323. #else
  1324. FMUL f16, f30, f0
  1325. FMUL f17, f30, f2
  1326. #endif
  /* FNMSUB d, a, c, b computes d = b - a*c, i.e. subtracts ALPHA_I * f2. */
  1327. FNMSUB f16, f31, f2, f16
  1328. FMADD f17, f31, f0, f17
  1329. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  1330. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  1331. /* defined(RC) || defined(RR) */
  1332. #ifndef TRMMKERNEL
  1333. FMADD f16, f30, f0, f16
  1334. FNMSUB f17, f30, f2, f17
  1335. FMADD f16, f31, f2, f16
  1336. FMADD f17, f31, f0, f17
  1337. #else
  /*
   * Same formula as the non-TRMM branch above, minus the old C term:
   *   f16 = ALPHA_R*f0 + ALPHA_I*f2      f17 = ALPHA_I*f0 - ALPHA_R*f2
   * f17 starts from the ALPHA_I product so that FNMSUB (b - a*c) can
   * subtract the ALPHA_R product; an FNMADD here would negate the ALPHA_I
   * term as well and disagree with the non-TRMM branch for ALPHA_I != 0.
   */
  1338. FMUL f16, f30, f0
  1339. FMUL f17, f31, f0
  1340. FMADD f16, f31, f2, f16
  1341. FNMSUB f17, f30, f2, f17
  1342. #endif
  1343. #endif
  1344. STFD f16, 0 * SIZE(CO1)
  1345. STFD f17, 1 * SIZE(CO1)
  /*
   * .L999: common exit.
   *
   * Sets r3 to 0 (r3 is the integer return register in the PowerPC ABIs),
   * reloads f14-f31 from 0..136(SP), reloads r24-r31 -- plus r22/r23 when
   * built with TRMMKERNEL -- from the slots starting at 144(SP), releases
   * the STACKSIZE-byte frame and returns with blr.  GPR slots are 8 bytes
   * wide (ld) under __64BIT__ and 4 bytes wide (lwz) otherwise.
   */
  1346. .align 4
  1347. .L999:
  1348. addi r3, 0, 0 /* an RA operand of 0 means the literal 0, so r3 = 0 */
  1349. lfd f14, 0(SP)
  1350. lfd f15, 8(SP)
  1351. lfd f16, 16(SP)
  1352. lfd f17, 24(SP)
  1353. lfd f18, 32(SP)
  1354. lfd f19, 40(SP)
  1355. lfd f20, 48(SP)
  1356. lfd f21, 56(SP)
  1357. lfd f22, 64(SP)
  1358. lfd f23, 72(SP)
  1359. lfd f24, 80(SP)
  1360. lfd f25, 88(SP)
  1361. lfd f26, 96(SP)
  1362. lfd f27, 104(SP)
  1363. lfd f28, 112(SP)
  1364. lfd f29, 120(SP)
  1365. lfd f30, 128(SP)
  1366. lfd f31, 136(SP)
  1367. #ifdef __64BIT__
  1368. ld r31, 144(SP)
  1369. ld r30, 152(SP)
  1370. ld r29, 160(SP)
  1371. ld r28, 168(SP)
  1372. ld r27, 176(SP)
  1373. ld r26, 184(SP)
  1374. ld r25, 192(SP)
  1375. ld r24, 200(SP)
  1376. #ifdef TRMMKERNEL
  1377. ld r23, 208(SP)
  1378. ld r22, 216(SP)
  1379. #endif
  1380. #else
  1381. lwz r31, 144(SP)
  1382. lwz r30, 148(SP)
  1383. lwz r29, 152(SP)
  1384. lwz r28, 156(SP)
  1385. lwz r27, 160(SP)
  1386. lwz r26, 164(SP)
  1387. lwz r25, 168(SP)
  1388. lwz r24, 172(SP)
  1389. #ifdef TRMMKERNEL
  1390. lwz r23, 176(SP)
  1391. lwz r22, 180(SP)
  1392. #endif
  1393. #endif
  1394. addi SP, SP, STACKSIZE /* pop the frame */
  1395. blr
  1396. EPILOGUE
  1397. #endif