You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_ppc440.S 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic selected by word size. It expands to lwz */
  /* when __64BIT__ is not defined and to ld when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. The prologue lowers SP by STACKSIZE, stores */
  /* f14-f31 at offsets 0..136 and the saved GPRs from offset 144 upward; */
  /* the three slots named here sit above that save area. */
  /*   ALPHA_R / ALPHA_I : receive f1 / f2 via stfd in the prologue and are */
  /*                       read back into f30 / f31 before C is updated. */
  /*   FZERO             : receives a 32-bit zero word (li r0, 0 ; stw) and */
  /*                       is read with lfs to get a floating-point zero for */
  /*                       clearing the accumulators. */
  /* The offsets differ between the 64-bit and 32-bit frames because the */
  /* GPR save slots are 8 bytes (std) versus 4 bytes (stw) wide. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA_R 296(SP)
  48. #define ALPHA_I 304(SP)
  49. #define FZERO 312(SP)
  50. #else
  51. #define STACKSIZE 256
  52. #define ALPHA_R 224(SP)
  53. #define ALPHA_I 232(SP)
  54. #define FZERO 240(SP)
  55. #endif
  /* Argument registers. M, N and K are the three problem dimensions; the */
  /* prologue branches to .L999 when any of them is <= 0. */
  56. #define M r3
  57. #define N r4
  58. #define K r5
  /* The registers for A, B, C, LDC and OFFSET depend on platform and word */
  /* size. Where a name is not delivered in a register, the prologue loads */
  /* it from the caller's frame at FRAMESLOT(n) + STACKSIZE(SP). */
  /* NOTE(review): the differing layouts presumably follow from how each */
  /* ABI lets the two floating-point alpha arguments consume GPR slots -- */
  /* confirm against the relevant ABI supplement. */
  /* Linux / FreeBSD: */
  /*   32-bit: all five names map to r6-r10; nothing is loaded from the */
  /*           stack for them in the prologue. */
  /*   64-bit: A, B, C map to r8-r10; LDC (r6) is loaded from FRAMESLOT(0) */
  /*           and, under TRMMKERNEL, OFFSET (r7) from FRAMESLOT(1). */
  59. #if defined(linux) || defined(__FreeBSD__)
  60. #ifndef __64BIT__
  61. #define A r6
  62. #define B r7
  63. #define C r8
  64. #define LDC r9
  65. #define OFFSET r10
  66. #else
  67. #define A r8
  68. #define B r9
  69. #define C r10
  70. #define LDC r6
  71. #define OFFSET r7
  72. #endif
  73. #endif
  /* AIX / Apple: */
  /*   32-bit with DOUBLE: only A maps to an unloaded register (r10); B, C */
  /*           and LDC are loaded from FRAMESLOT(0..2) and, under */
  /*           TRMMKERNEL, OFFSET from FRAMESLOT(3). */
  /*   otherwise: A, B, C map to r8-r10; LDC is loaded from FRAMESLOT(0) */
  /*           and, under TRMMKERNEL, OFFSET from FRAMESLOT(1). */
  74. #if defined(_AIX) || defined(__APPLE__)
  75. #if !defined(__64BIT__) && defined(DOUBLE)
  76. #define A r10
  77. #define B r6
  78. #define C r7
  79. #define LDC r8
  80. #define OFFSET r9
  81. #else
  82. #define A r8
  83. #define B r9
  84. #define C r10
  85. #define LDC r6
  86. #define OFFSET r7
  87. #endif
  88. #endif
  /* Working integer registers. */
  /*   TEMP, KK : used only inside TRMMKERNEL sections; the prologue saves */
  /*              r22 / r23 only when TRMMKERNEL is defined. KK is seeded */
  /*              from OFFSET (negated when LEFT is not defined). */
  /*   I        : row-block counter, initialised from M (M >> 1, then M & 1). */
  /*   J        : column-block counter, initialised from N (N >> 1, then N & 1). */
  /*   AO, BO   : running pointers into the A and B panels. */
  /*   CO1, CO2 : pointers to the current C column and to the one LDC bytes */
  /*              after it (LDC is scaled by ZBASE_SHIFT in the prologue). */
  89. #define TEMP r22
  90. #define KK r23
  91. #define I r24
  92. #define J r25
  93. #define AO r26
  94. #define BO r27
  95. #define CO1 r28
  96. #define CO2 r29
  /* Floating-point operand registers for the unrolled 2x2 loop (.L12): */
  /* A1-A6 hold values loaded from AO, B1-B10 hold values loaded from BO. */
  /* The products accumulate in f0-f15. */
  /* Register reuse to keep in mind when editing: */
  /*   - f30 / f31 (B9 / B10) are overwritten with ALPHA_R / ALPHA_I at */
  /*     .L15, after the unrolled loop has finished with them. */
  /*   - f16-f23 (A1-A6, B1, B2) are reused at .LKERNEL_MainFinish to hold */
  /*     the C values being updated, so they must be reloaded at .L11. */
  97. #define A1 f16
  98. #define A2 f17
  99. #define A3 f18
  100. #define A4 f19
  101. #define A5 f20
  102. #define A6 f21
  103. #define B1 f22
  104. #define B2 f23
  105. #define B3 f24
  106. #define B4 f25
  107. #define B5 f26
  108. #define B6 f27
  109. #define B7 f28
  110. #define B8 f29
  111. #define B9 f30
  112. #define B10 f31
  113. #ifndef NEEDPARAM
  114. PROLOGUE
  115. PROFCODE
  116. addi SP, SP, -STACKSIZE
  117. li r0, 0
  118. stfd f14, 0(SP)
  119. stfd f15, 8(SP)
  120. stfd f16, 16(SP)
  121. stfd f17, 24(SP)
  122. stfd f18, 32(SP)
  123. stfd f19, 40(SP)
  124. stfd f20, 48(SP)
  125. stfd f21, 56(SP)
  126. stfd f22, 64(SP)
  127. stfd f23, 72(SP)
  128. stfd f24, 80(SP)
  129. stfd f25, 88(SP)
  130. stfd f26, 96(SP)
  131. stfd f27, 104(SP)
  132. stfd f28, 112(SP)
  133. stfd f29, 120(SP)
  134. stfd f30, 128(SP)
  135. stfd f31, 136(SP)
  136. #ifdef __64BIT__
  137. std r31, 144(SP)
  138. std r30, 152(SP)
  139. std r29, 160(SP)
  140. std r28, 168(SP)
  141. std r27, 176(SP)
  142. std r26, 184(SP)
  143. std r25, 192(SP)
  144. std r24, 200(SP)
  145. #ifdef TRMMKERNEL
  146. std r23, 208(SP)
  147. std r22, 216(SP)
  148. #endif
  149. #else
  150. stw r31, 144(SP)
  151. stw r30, 148(SP)
  152. stw r29, 152(SP)
  153. stw r28, 156(SP)
  154. stw r27, 160(SP)
  155. stw r26, 164(SP)
  156. stw r25, 168(SP)
  157. stw r24, 172(SP)
  158. #ifdef TRMMKERNEL
  159. stw r23, 176(SP)
  160. stw r22, 180(SP)
  161. #endif
  162. #endif
  163. stfd f1, ALPHA_R
  164. stfd f2, ALPHA_I
  165. stw r0, FZERO
  166. #if defined(linux) || defined(__FreeBSD__)
  167. #ifdef __64BIT__
  168. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  169. #endif
  170. #endif
  171. #if defined(_AIX) || defined(__APPLE__)
  172. #ifdef __64BIT__
  173. ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
  174. #else
  175. #ifdef DOUBLE
  176. lwz B, FRAMESLOT(0) + STACKSIZE(SP)
  177. lwz C, FRAMESLOT(1) + STACKSIZE(SP)
  178. lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
  179. #else
  180. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  181. #endif
  182. #endif
  183. #endif
  184. #ifdef TRMMKERNEL
  185. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  186. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  187. #endif
  188. #if defined(_AIX) || defined(__APPLE__)
  189. #ifdef __64BIT__
  190. ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  191. #else
  192. #ifdef DOUBLE
  193. lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
  194. #else
  195. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  196. #endif
  197. #endif
  198. #endif
  199. #if defined(TRMMKERNEL) && !defined(LEFT)
  200. neg KK, OFFSET
  201. #endif
  202. #endif
  203. slwi LDC, LDC, ZBASE_SHIFT
  204. cmpwi cr0, M, 0
  205. ble .L999
  206. cmpwi cr0, N, 0
  207. ble .L999
  208. cmpwi cr0, K, 0
  209. ble .L999
  210. lfs f0, FZERO
  211. srawi. J, N, 1
  212. ble .L30
  213. .align 4
  214. .L10:
  215. fmr f1, f0
  216. fmr f2, f0
  217. fmr f3, f0
  218. fmr f4, f0
  219. fmr f5, f0
  220. fmr f6, f0
  221. fmr f7, f0
  222. fmr f8, f0
  223. fmr f9, f0
  224. fmr f10, f0
  225. fmr f11, f0
  226. fmr f12, f0
  227. fmr f13, f0
  228. fmr f14, f0
  229. fmr f15, f0
  230. mr CO1, C
  231. add CO2, C, LDC
  232. add C, CO2, LDC
  233. #if defined(TRMMKERNEL) && defined(LEFT)
  234. mr KK, OFFSET
  235. #endif
  236. srawi. I, M, 1
  237. mr AO, A
  238. ble .L20
  239. .align 4
  240. .L11:
  241. #ifndef TRMMKERNEL
  242. LFD A1, 0 * SIZE(AO) ###
  243. LFD A2, 1 * SIZE(AO)
  244. LFD A4, 4 * SIZE(AO) ###
  245. LFD A5, 8 * SIZE(AO) ###
  246. LFD B1, 0 * SIZE(B) ###
  247. LFD B2, 1 * SIZE(B)
  248. LFD B3, 2 * SIZE(B)
  249. LFD B4, 3 * SIZE(B)
  250. LFD B5, 4 * SIZE(B) ###
  251. LFD B6, 8 * SIZE(B) ###
  252. LFD B7, 12 * SIZE(B) ###
  253. srawi. r0, K, 2
  254. mr BO, B
  255. mtspr CTR, r0
  256. ble .L15
  257. #else
  258. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  259. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  260. LFD A1, 0 * SIZE(AO) ###
  261. LFD A2, 1 * SIZE(AO)
  262. LFD A4, 4 * SIZE(AO) ###
  263. LFD A5, 8 * SIZE(AO) ###
  264. LFD B1, 0 * SIZE(B) ###
  265. LFD B2, 1 * SIZE(B)
  266. LFD B3, 2 * SIZE(B)
  267. LFD B4, 3 * SIZE(B)
  268. LFD B5, 4 * SIZE(B) ###
  269. LFD B6, 8 * SIZE(B) ###
  270. LFD B7, 12 * SIZE(B) ###
  271. mr BO, B
  272. #else
  273. slwi r0, KK, 1 + ZBASE_SHIFT
  274. add AO, AO, r0
  275. add BO, B, r0
  276. LFD A1, 0 * SIZE(AO) ###
  277. LFD A2, 1 * SIZE(AO)
  278. LFD A4, 4 * SIZE(AO) ###
  279. LFD A5, 8 * SIZE(AO) ###
  280. LFD B1, 0 * SIZE(BO) ###
  281. LFD B2, 1 * SIZE(BO)
  282. LFD B3, 2 * SIZE(BO)
  283. LFD B4, 3 * SIZE(BO)
  284. LFD B5, 4 * SIZE(BO) ###
  285. LFD B6, 8 * SIZE(BO) ###
  286. LFD B7, 12 * SIZE(BO) ###
  287. #endif
  288. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  289. sub TEMP, K, KK
  290. #elif defined(LEFT)
  291. addi TEMP, KK, 2
  292. #else
  293. addi TEMP, KK, 2
  294. #endif
  295. srawi. TEMP, TEMP, 2
  296. mtspr CTR, TEMP
  297. ble .L15
  298. #endif
  299. .align 4
  300. .L12:
  301. FMADD f0, A1, B1, f0
  302. LFD A3, 2 * SIZE(AO)
  303. FMADD f4, A1, B2, f4
  304. LFD A6, 12 * SIZE(AO) ###
  305. FMADD f8, A1, B3, f8
  306. nop
  307. FMADD f12, A1, B4, f12
  308. nop
  309. FMADD f1, A2, B1, f1
  310. LFD A1, 3 * SIZE(AO)
  311. FMADD f5, A2, B2, f5
  312. nop
  313. FMADD f9, A2, B3, f9
  314. nop
  315. FMADD f13, A2, B4, f13
  316. nop
  317. FMADD f2, A3, B1, f2
  318. nop
  319. FMADD f6, A3, B2, f6
  320. LFD B8, 5 * SIZE(BO)
  321. FMADD f10, A3, B3, f10
  322. LFD B9, 6 * SIZE(BO)
  323. FMADD f14, A3, B4, f14
  324. LFD B10, 7 * SIZE(BO)
  325. FMADD f3, A1, B1, f3
  326. LFD A2, 5 * SIZE(AO)
  327. FMADD f7, A1, B2, f7
  328. LFD B1, 16 * SIZE(BO) ###
  329. FMADD f11, A1, B3, f11
  330. nop
  331. FMADD f15, A1, B4, f15
  332. nop
  333. ############
  334. FMADD f0, A4, B5, f0
  335. LFD A3, 6 * SIZE(AO)
  336. FMADD f4, A4, B8, f4
  337. LFD A1, 16 * SIZE(AO) ###
  338. FMADD f8, A4, B9, f8
  339. nop
  340. FMADD f12, A4, B10, f12
  341. nop
  342. FMADD f1, A2, B5, f1
  343. LFD A4, 7 * SIZE(AO)
  344. FMADD f5, A2, B8, f5
  345. nop
  346. FMADD f9, A2, B9, f9
  347. nop
  348. FMADD f13, A2, B10, f13
  349. nop
  350. FMADD f2, A3, B5, f2
  351. nop
  352. FMADD f6, A3, B8, f6
  353. LFD B2, 9 * SIZE(BO)
  354. FMADD f10, A3, B9, f10
  355. LFD B3, 10 * SIZE(BO)
  356. FMADD f14, A3, B10, f14
  357. LFD B4, 11 * SIZE(BO)
  358. FMADD f3, A4, B5, f3
  359. LFD A2, 9 * SIZE(AO)
  360. FMADD f7, A4, B8, f7
  361. LFD B5, 20 * SIZE(BO) ###
  362. FMADD f11, A4, B9, f11
  363. nop
  364. FMADD f15, A4, B10, f15
  365. nop
  366. ############
  367. FMADD f0, A5, B6, f0
  368. LFD A3, 10 * SIZE(AO)
  369. FMADD f4, A5, B2, f4
  370. LFD A4, 20 * SIZE(AO) ###
  371. FMADD f8, A5, B3, f8
  372. nop
  373. FMADD f12, A5, B4, f12
  374. nop
  375. FMADD f1, A2, B6, f1
  376. LFD A5, 11 * SIZE(AO)
  377. FMADD f5, A2, B2, f5
  378. nop
  379. FMADD f9, A2, B3, f9
  380. nop
  381. FMADD f13, A2, B4, f13
  382. nop
  383. FMADD f2, A3, B6, f2
  384. nop
  385. FMADD f6, A3, B2, f6
  386. LFD B8, 13 * SIZE(BO)
  387. FMADD f10, A3, B3, f10
  388. LFD B9, 14 * SIZE(BO)
  389. FMADD f14, A3, B4, f14
  390. LFD B10,15 * SIZE(BO)
  391. FMADD f3, A5, B6, f3
  392. LFD A2, 13 * SIZE(AO)
  393. FMADD f7, A5, B2, f7
  394. LFD B6, 24 * SIZE(BO) ###
  395. FMADD f11, A5, B3, f11
  396. nop
  397. FMADD f15, A5, B4, f15
  398. nop
  399. ############
  400. FMADD f0, A6, B7, f0
  401. LFD A3, 14 * SIZE(AO)
  402. FMADD f4, A6, B8, f4
  403. LFD A5, 24 * SIZE(AO) ###
  404. FMADD f8, A6, B9, f8
  405. nop
  406. FMADD f12, A6, B10, f12
  407. nop
  408. FMADD f1, A2, B7, f1
  409. LFD A6, 15 * SIZE(AO)
  410. FMADD f5, A2, B8, f5
  411. nop
  412. FMADD f9, A2, B9, f9
  413. nop
  414. FMADD f13, A2, B10, f13
  415. nop
  416. FMADD f2, A3, B7, f2
  417. addi AO, AO, 16 * SIZE
  418. FMADD f6, A3, B8, f6
  419. LFD B2, 17 * SIZE(BO)
  420. FMADD f10, A3, B9, f10
  421. LFD B3, 18 * SIZE(BO)
  422. FMADD f14, A3, B10, f14
  423. LFD B4, 19 * SIZE(BO)
  424. FMADD f3, A6, B7, f3
  425. LFD A2, 1 * SIZE(AO)
  426. FMADD f7, A6, B8, f7
  427. LFD B7, 28 * SIZE(BO) ###
  428. FMADD f11, A6, B9, f11
  429. addi BO, BO, 16 * SIZE
  430. FMADD f15, A6, B10, f15
  431. bdnz .L12
  432. .align 4
  433. .L15:
  434. #ifndef TRMMKERNEL
  435. andi. r0, K, 3
  436. lfd f30, ALPHA_R
  437. lfd f31, ALPHA_I
  438. mtspr CTR, r0
  439. ble .LKERNEL_MainFinish
  440. #else
  441. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  442. sub TEMP, K, KK
  443. #elif defined(LEFT)
  444. addi TEMP, KK, 2
  445. #else
  446. addi TEMP, KK, 2
  447. #endif
  448. andi. TEMP, TEMP, 3
  449. lfd f30, ALPHA_R
  450. lfd f31, ALPHA_I
  451. mtspr CTR, TEMP
  452. ble .LKERNEL_MainFinish
  453. #endif
  454. .align 4
  455. .L16:
  456. FMADD f0, A1, B1, f0
  457. LFD A3, 2 * SIZE(AO)
  458. FMADD f4, A1, B2, f4
  459. FMADD f8, A1, B3, f8
  460. FMADD f12, A1, B4, f12
  461. LFD A4, 3 * SIZE(AO)
  462. FMADD f1, A2, B1, f1
  463. FMADD f5, A2, B2, f5
  464. FMADD f9, A2, B3, f9
  465. FMADD f13, A2, B4, f13
  466. LFDU A1, 4 * SIZE(AO)
  467. FMADD f2, A3, B1, f2
  468. FMADD f6, A3, B2, f6
  469. FMADD f10, A3, B3, f10
  470. FMADD f14, A3, B4, f14
  471. LFD A2, 1 * SIZE(AO)
  472. FMADD f3, A4, B1, f3
  473. LFDU B1, 4 * SIZE(BO)
  474. FMADD f7, A4, B2, f7
  475. LFD B2, 1 * SIZE(BO)
  476. FMADD f11, A4, B3, f11
  477. LFD B3, 2 * SIZE(BO)
  478. FMADD f15, A4, B4, f15
  479. LFD B4, 3 * SIZE(BO)
  480. bdnz .L16
  481. .align 4
  482. .LKERNEL_MainFinish:
  483. #ifndef TRMMKERNEL
  484. LFD f16, 0 * SIZE(CO1)
  485. LFD f17, 1 * SIZE(CO1)
  486. LFD f18, 2 * SIZE(CO1)
  487. LFD f19, 3 * SIZE(CO1)
  488. #endif
  489. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  490. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  491. FSUB f0, f0, f5
  492. FADD f1, f1, f4
  493. FSUB f2, f2, f7
  494. FADD f3, f3, f6
  495. #ifndef TRMMKERNEL
  496. LFD f20, 0 * SIZE(CO2)
  497. LFD f21, 1 * SIZE(CO2)
  498. LFD f22, 2 * SIZE(CO2)
  499. LFD f23, 3 * SIZE(CO2)
  500. #endif
  501. FSUB f8, f8, f13
  502. FADD f9, f9, f12
  503. FSUB f10, f10, f15
  504. FADD f11, f11, f14
  505. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  506. FADD f0, f0, f5
  507. FSUB f1, f1, f4
  508. FADD f2, f2, f7
  509. FSUB f3, f3, f6
  510. #ifndef TRMMKERNEL
  511. LFD f20, 0 * SIZE(CO2)
  512. LFD f21, 1 * SIZE(CO2)
  513. LFD f22, 2 * SIZE(CO2)
  514. LFD f23, 3 * SIZE(CO2)
  515. #endif
  516. FADD f8, f8, f13
  517. FSUB f9, f9, f12
  518. FADD f10, f10, f15
  519. FSUB f11, f11, f14
  520. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  521. FADD f0, f0, f5
  522. FSUB f1, f4, f1
  523. FADD f2, f2, f7
  524. FSUB f3, f6, f3
  525. #ifndef TRMMKERNEL
  526. LFD f20, 0 * SIZE(CO2)
  527. LFD f21, 1 * SIZE(CO2)
  528. LFD f22, 2 * SIZE(CO2)
  529. LFD f23, 3 * SIZE(CO2)
  530. #endif
  531. FADD f8, f8, f13
  532. FSUB f9, f12, f9
  533. FADD f10, f10, f15
  534. FSUB f11, f14, f11
  535. #endif
  536. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  537. #ifndef TRMMKERNEL
  538. FMADD f16, f30, f0, f16
  539. FMADD f17, f30, f1, f17
  540. FMADD f18, f30, f2, f18
  541. FMADD f19, f30, f3, f19
  542. FMADD f20, f30, f8, f20
  543. FMADD f21, f30, f9, f21
  544. FMADD f22, f30, f10, f22
  545. FMADD f23, f30, f11, f23
  546. #else
  547. FMUL f16, f30, f0
  548. FMUL f17, f30, f1
  549. FMUL f18, f30, f2
  550. FMUL f19, f30, f3
  551. FMUL f20, f30, f8
  552. FMUL f21, f30, f9
  553. FMUL f22, f30, f10
  554. FMUL f23, f30, f11
  555. #endif
  556. FNMSUB f16, f31, f1, f16
  557. FMADD f17, f31, f0, f17
  558. FNMSUB f18, f31, f3, f18
  559. FMADD f19, f31, f2, f19
  560. FNMSUB f20, f31, f9, f20
  561. FMADD f21, f31, f8, f21
  562. FNMSUB f22, f31, f11, f22
  563. FMADD f23, f31, f10, f23
  564. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  565. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  566. /* defined(RC)|| defined(RR) */
  567. #ifndef TRMMKERNEL
  568. FMADD f16, f30, f0, f16
  569. FNMSUB f17, f30, f1, f17
  570. FMADD f18, f30, f2, f18
  571. FNMSUB f19, f30, f3, f19
  572. FMADD f20, f30, f8, f20
  573. FNMSUB f21, f30, f9, f21
  574. FMADD f22, f30, f10, f22
  575. FNMSUB f23, f30, f11, f23
  576. FMADD f16, f31, f1, f16
  577. FMADD f17, f31, f0, f17
  578. FMADD f18, f31, f3, f18
  579. FMADD f19, f31, f2, f19
  580. FMADD f20, f31, f9, f20
  581. FMADD f21, f31, f8, f21
  582. FMADD f22, f31, f11, f22
  583. FMADD f23, f31, f10, f23
  584. #else
  585. FMUL f16, f30, f0
  586. FMUL f17, f30, f1
  587. FMUL f18, f30, f2
  588. FMUL f19, f30, f3
  589. FMUL f20, f30, f8
  590. FMUL f21, f30, f9
  591. FMUL f22, f30, f10
  592. FMUL f23, f30, f11
  593. FMADD f16, f31, f1, f16
  594. FNMADD f17, f31, f0, f17
  595. FMADD f18, f31, f3, f18
  596. FNMADD f19, f31, f2, f19
  597. FMADD f20, f31, f9, f20
  598. FNMADD f21, f31, f8, f21
  599. FMADD f22, f31, f11, f22
  600. FNMADD f23, f31, f10, f23
  601. #endif
  602. #endif
  603. STFD f16, 0 * SIZE(CO1)
  604. STFD f17, 1 * SIZE(CO1)
  605. STFD f18, 2 * SIZE(CO1)
  606. STFD f19, 3 * SIZE(CO1)
  607. lfs f0, FZERO
  608. fmr f1, f0
  609. fmr f2, f0
  610. fmr f3, f0
  611. STFD f20, 0 * SIZE(CO2)
  612. STFD f21, 1 * SIZE(CO2)
  613. STFD f22, 2 * SIZE(CO2)
  614. STFD f23, 3 * SIZE(CO2)
  615. fmr f4, f0
  616. fmr f5, f0
  617. fmr f6, f0
  618. fmr f7, f0
  619. fmr f8, f0
  620. fmr f9, f0
  621. fmr f10, f0
  622. fmr f11, f0
  623. fmr f12, f0
  624. fmr f13, f0
  625. fmr f14, f0
  626. fmr f15, f0
  627. addi CO1, CO1, 4 * SIZE
  628. addi CO2, CO2, 4 * SIZE
  629. #ifdef TRMMKERNEL
  630. #if ( defined(LEFT) && defined(TRANSA)) || \
  631. (!defined(LEFT) && !defined(TRANSA))
  632. sub TEMP, K, KK
  633. #ifdef LEFT
  634. addi TEMP, TEMP, -2
  635. #else
  636. addi TEMP, TEMP, -2
  637. #endif
  638. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  639. add AO, AO, TEMP
  640. add BO, BO, TEMP
  641. #endif
  642. #ifdef LEFT
  643. addi KK, KK, 2
  644. #endif
  645. #endif
  646. addic. I, I, -1
  647. bgt .L11
  648. .align 4
  649. .L20:
  650. andi. I, M, 1
  651. ble .L29
  652. #ifndef TRMMKERNEL
  653. LFD f16, 0 * SIZE(AO)
  654. LFD f17, 1 * SIZE(AO)
  655. LFD f18, 2 * SIZE(AO)
  656. LFD f19, 3 * SIZE(AO)
  657. LFD f20, 0 * SIZE(B)
  658. LFD f21, 1 * SIZE(B)
  659. LFD f22, 2 * SIZE(B)
  660. LFD f23, 3 * SIZE(B)
  661. LFD f24, 4 * SIZE(B)
  662. LFD f25, 5 * SIZE(B)
  663. LFD f26, 6 * SIZE(B)
  664. LFD f27, 7 * SIZE(B)
  665. lfs f0, FZERO
  666. fmr f1, f0
  667. fmr f2, f0
  668. fmr f3, f0
  669. fmr f4, f0
  670. fmr f5, f0
  671. fmr f6, f0
  672. fmr f7, f0
  673. srawi. r0, K, 2
  674. mr BO, B
  675. mtspr CTR, r0
  676. ble .L25
  677. #else
  678. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  679. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  680. LFD f16, 0 * SIZE(AO)
  681. LFD f17, 1 * SIZE(AO)
  682. LFD f18, 2 * SIZE(AO)
  683. LFD f19, 3 * SIZE(AO)
  684. LFD f20, 0 * SIZE(B)
  685. LFD f21, 1 * SIZE(B)
  686. LFD f22, 2 * SIZE(B)
  687. LFD f23, 3 * SIZE(B)
  688. LFD f24, 4 * SIZE(B)
  689. LFD f25, 5 * SIZE(B)
  690. LFD f26, 6 * SIZE(B)
  691. LFD f27, 7 * SIZE(B)
  692. mr BO, B
  693. #else
  694. slwi r0, KK, 0 + ZBASE_SHIFT
  695. slwi TEMP, KK, 1 + ZBASE_SHIFT
  696. add AO, AO, r0
  697. add BO, B, TEMP
  698. LFD f16, 0 * SIZE(AO)
  699. LFD f17, 1 * SIZE(AO)
  700. LFD f18, 2 * SIZE(AO)
  701. LFD f19, 3 * SIZE(AO)
  702. LFD f20, 0 * SIZE(BO)
  703. LFD f21, 1 * SIZE(BO)
  704. LFD f22, 2 * SIZE(BO)
  705. LFD f23, 3 * SIZE(BO)
  706. LFD f24, 4 * SIZE(BO)
  707. LFD f25, 5 * SIZE(BO)
  708. LFD f26, 6 * SIZE(BO)
  709. LFD f27, 7 * SIZE(BO)
  710. #endif
  711. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  712. sub TEMP, K, KK
  713. #elif defined(LEFT)
  714. addi TEMP, KK, 1
  715. #else
  716. addi TEMP, KK, 2
  717. #endif
  718. srawi. TEMP, TEMP, 2
  719. mtspr CTR, TEMP
  720. ble .L25
  721. #endif
  722. .align 4
  723. .L22:
  724. fmadd f0, f16, f20, f0
  725. LFD f27, 7 * SIZE(BO)
  726. fmadd f1, f16, f21, f1
  727. LFD f19, 3 * SIZE(AO)
  728. fmadd f2, f16, f22, f2
  729. nop
  730. fmadd f3, f16, f23, f3
  731. LFD f16, 4 * SIZE(AO)
  732. fmadd f4, f17, f20, f4
  733. LFD f20, 8 * SIZE(BO)
  734. fmadd f5, f17, f21, f5
  735. LFD f21, 9 * SIZE(BO)
  736. fmadd f6, f17, f22, f6
  737. LFD f22, 10 * SIZE(BO)
  738. fmadd f7, f17, f23, f7
  739. LFD f23, 11 * SIZE(BO)
  740. fmadd f0, f18, f24, f0
  741. LFD f17, 5 * SIZE(AO)
  742. fmadd f1, f18, f25, f1
  743. nop
  744. fmadd f2, f18, f26, f2
  745. nop
  746. fmadd f3, f18, f27, f3
  747. LFD f18, 6 * SIZE(AO)
  748. fmadd f4, f19, f24, f4
  749. LFD f24, 12 * SIZE(BO)
  750. fmadd f5, f19, f25, f5
  751. LFD f25, 13 * SIZE(BO)
  752. fmadd f6, f19, f26, f6
  753. LFD f26, 14 * SIZE(BO)
  754. fmadd f7, f19, f27, f7
  755. LFD f27, 15 * SIZE(BO)
  756. fmadd f0, f16, f20, f0
  757. LFD f19, 7 * SIZE(AO)
  758. fmadd f1, f16, f21, f1
  759. nop
  760. fmadd f2, f16, f22, f2
  761. nop
  762. fmadd f3, f16, f23, f3
  763. LFDU f16, 8 * SIZE(AO)
  764. fmadd f4, f17, f20, f4
  765. LFDU f20, 16 * SIZE(BO)
  766. fmadd f5, f17, f21, f5
  767. LFD f21, 1 * SIZE(BO)
  768. fmadd f6, f17, f22, f6
  769. LFD f22, 2 * SIZE(BO)
  770. fmadd f7, f17, f23, f7
  771. LFD f23, 3 * SIZE(BO)
  772. fmadd f0, f18, f24, f0
  773. LFD f17, 1 * SIZE(AO)
  774. fmadd f1, f18, f25, f1
  775. nop
  776. fmadd f2, f18, f26, f2
  777. nop
  778. fmadd f3, f18, f27, f3
  779. LFD f18, 2 * SIZE(AO)
  780. fmadd f4, f19, f24, f4
  781. LFD f24, 4 * SIZE(BO)
  782. fmadd f5, f19, f25, f5
  783. LFD f25, 5 * SIZE(BO)
  784. fmadd f6, f19, f26, f6
  785. LFD f26, 6 * SIZE(BO)
  786. fmadd f7, f19, f27, f7
  787. bdnz .L22
  788. .align 4
  789. .L25:
  790. #ifndef TRMMKERNEL
  791. andi. r0, K, 3
  792. lfd f30, ALPHA_R
  793. lfd f31, ALPHA_I
  794. mtspr CTR, r0
  795. ble .L27
  796. #else
  797. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  798. sub TEMP, K, KK
  799. #elif defined(LEFT)
  800. addi TEMP, KK, 1
  801. #else
  802. addi TEMP, KK, 2
  803. #endif
  804. andi. TEMP, TEMP, 3
  805. lfd f30, ALPHA_R
  806. lfd f31, ALPHA_I
  807. mtspr CTR, TEMP
  808. ble .L27
  809. #endif
  810. .align 4
  811. .L26:
  812. fmadd f0, f16, f20, f0
  813. fmadd f1, f16, f21, f1
  814. fmadd f2, f16, f22, f2
  815. fmadd f3, f16, f23, f3
  816. LFDU f16, 2 * SIZE(AO)
  817. fmadd f4, f17, f20, f4
  818. LFDU f20, 4 * SIZE(BO)
  819. fmadd f5, f17, f21, f5
  820. LFD f21, 1 * SIZE(BO)
  821. fmadd f6, f17, f22, f6
  822. LFD f22, 2 * SIZE(BO)
  823. fmadd f7, f17, f23, f7
  824. LFD f23, 3 * SIZE(BO)
  825. LFD f17, 1 * SIZE(AO)
  826. bdnz .L26
  827. .align 4
  828. .L27:
  829. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  830. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  831. FSUB f0, f0, f5
  832. FADD f1, f1, f4
  833. FSUB f2, f2, f7
  834. FADD f3, f3, f6
  835. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  836. FADD f0, f0, f5
  837. FSUB f1, f4, f1
  838. FADD f2, f2, f7
  839. FSUB f3, f6, f3
  840. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  841. FADD f0, f0, f5
  842. FSUB f1, f1, f4
  843. FADD f2, f2, f7
  844. FSUB f3, f3, f6
  845. #endif
  846. #ifndef TRMMKERNEL
  847. LFD f16, 0 * SIZE(CO1)
  848. LFD f17, 1 * SIZE(CO1)
  849. LFD f18, 0 * SIZE(CO2)
  850. LFD f19, 1 * SIZE(CO2)
  851. #endif
  852. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  853. #ifndef TRMMKERNEL
  854. FMADD f16, f30, f0, f16
  855. FMADD f17, f30, f1, f17
  856. FMADD f18, f30, f2, f18
  857. FMADD f19, f30, f3, f19
  858. #else
  859. FMUL f16, f30, f0
  860. FMUL f17, f30, f1
  861. FMUL f18, f30, f2
  862. FMUL f19, f30, f3
  863. #endif
  864. FNMSUB f16, f31, f1, f16
  865. FMADD f17, f31, f0, f17
  866. FNMSUB f18, f31, f3, f18
  867. FMADD f19, f31, f2, f19
  868. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  869. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  870. /* defined(RC)|| defined(RR) */
  871. #ifndef TRMMKERNEL
  872. FMADD f16, f30, f0, f16
  873. FNMSUB f17, f30, f1, f17
  874. FMADD f18, f30, f2, f18
  875. FNMSUB f19, f30, f3, f19
  876. FMADD f16, f31, f1, f16
  877. FMADD f17, f31, f0, f17
  878. FMADD f18, f31, f3, f18
  879. FMADD f19, f31, f2, f19
  880. #else
  881. FMUL f16, f30, f0
  882. FMUL f17, f30, f1
  883. FMUL f18, f30, f2
  884. FMUL f19, f30, f3
  885. FMADD f16, f31, f1, f16
  886. FNMADD f17, f31, f0, f17
  887. FMADD f18, f31, f3, f18
  888. FNMADD f19, f31, f2, f19
  889. #endif
  890. #endif
  891. STFD f16, 0 * SIZE(CO1)
  892. STFD f17, 1 * SIZE(CO1)
  893. STFD f18, 0 * SIZE(CO2)
  894. STFD f19, 1 * SIZE(CO2)
  895. addi CO1, CO1, 2 * SIZE
  896. addi CO2, CO2, 2 * SIZE
  897. #ifdef TRMMKERNEL
  898. #if ( defined(LEFT) && defined(TRANSA)) || \
  899. (!defined(LEFT) && !defined(TRANSA))
  900. sub TEMP, K, KK
  901. #ifdef LEFT
  902. addi TEMP, TEMP, -1
  903. #else
  904. addi TEMP, TEMP, -2
  905. #endif
  906. slwi r0, TEMP, 0 + ZBASE_SHIFT
  907. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  908. add AO, AO, r0
  909. add BO, BO, TEMP
  910. #endif
  911. #ifdef LEFT
  912. addi KK, KK, 1
  913. #endif
  914. #endif
  915. .align 4
  916. .L29:
  917. #if defined(TRMMKERNEL) && !defined(LEFT)
  918. addi KK, KK, 2
  919. #endif
  920. mr B, BO
  921. addic. J, J, -1
  922. lfs f0, FZERO
  923. bgt .L10
  924. .align 4
  925. .L30:
  926. andi. J, N, 1
  927. ble .L999
  928. #if defined(TRMMKERNEL) && defined(LEFT)
  929. mr KK, OFFSET
  930. #endif
  931. srawi. I, M, 1
  932. mr CO1, C
  933. add C, C, LDC
  934. mr AO, A
  935. ble .L40
  /* .L31: set-up for one pass of the I loop.  Every build variant preloads
     f20-f27 from AO[0..7] and f16-f19 from four consecutive B/BO slots,
     copies the FZERO value into f0-f7, and puts (step count >> 2) into CTR
     for the loop at .L32.  LFD is a macro defined outside this excerpt
     (presumably a precision-selected load - confirm). */
  936. .align 4
  937. .L31:
  938. #ifndef TRMMKERNEL
  /* Non-TRMM build: read from AO and B as they stand. */
  939. LFD f20, 0 * SIZE(AO)
  940. LFD f21, 1 * SIZE(AO)
  941. LFD f22, 2 * SIZE(AO)
  942. LFD f23, 3 * SIZE(AO)
  943. LFD f24, 4 * SIZE(AO)
  944. LFD f25, 5 * SIZE(AO)
  945. LFD f26, 6 * SIZE(AO)
  946. LFD f27, 7 * SIZE(AO)
  947. LFD f16, 0 * SIZE(B)
  948. LFD f17, 1 * SIZE(B)
  949. LFD f18, 2 * SIZE(B)
  950. LFD f19, 3 * SIZE(B)
  /* Clear the eight accumulators f0-f7. */
  951. lfs f0, FZERO
  952. fmr f1, f0
  953. fmr f2, f0
  954. fmr f3, f0
  955. fmr f4, f0
  956. fmr f5, f0
  957. fmr f6, f0
  958. fmr f7, f0
  /* CTR = K >> 2; CR0 from srawi. survives mr/mtspr and feeds the ble. */
  959. srawi. r0, K, 2
  960. mr BO, B
  961. mtspr CTR, r0
  /* Fewer than four steps: go straight to the remainder handling. */
  962. ble .L35
  963. #else
  964. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  965. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* TRMM, LEFT+TRANSA or neither: no KK offset is applied; BO = B. */
  966. LFD f20, 0 * SIZE(AO)
  967. LFD f21, 1 * SIZE(AO)
  968. LFD f22, 2 * SIZE(AO)
  969. LFD f23, 3 * SIZE(AO)
  970. LFD f24, 4 * SIZE(AO)
  971. LFD f25, 5 * SIZE(AO)
  972. LFD f26, 6 * SIZE(AO)
  973. LFD f27, 7 * SIZE(AO)
  974. LFD f16, 0 * SIZE(B)
  975. LFD f17, 1 * SIZE(B)
  976. LFD f18, 2 * SIZE(B)
  977. LFD f19, 3 * SIZE(B)
  978. lfs f0, FZERO
  979. fmr f1, f0
  980. fmr f2, f0
  981. fmr f3, f0
  982. fmr f4, f0
  983. fmr f5, f0
  984. fmr f6, f0
  985. fmr f7, f0
  986. mr BO, B
  987. #else
  /* Remaining TRMM variants: offset both pointers by KK first.
     AO += KK << (1 + ZBASE_SHIFT); BO = B + (KK << ZBASE_SHIFT). */
  988. slwi r0, KK, 1 + ZBASE_SHIFT
  989. slwi TEMP, KK, 0 + ZBASE_SHIFT
  990. add AO, AO, r0
  991. add BO, B, TEMP
  992. LFD f20, 0 * SIZE(AO)
  993. LFD f21, 1 * SIZE(AO)
  994. LFD f22, 2 * SIZE(AO)
  995. LFD f23, 3 * SIZE(AO)
  996. LFD f24, 4 * SIZE(AO)
  997. LFD f25, 5 * SIZE(AO)
  998. LFD f26, 6 * SIZE(AO)
  999. LFD f27, 7 * SIZE(AO)
  1000. LFD f16, 0 * SIZE(BO)
  1001. LFD f17, 1 * SIZE(BO)
  1002. LFD f18, 2 * SIZE(BO)
  1003. LFD f19, 3 * SIZE(BO)
  1004. lfs f0, FZERO
  1005. fmr f1, f0
  1006. fmr f2, f0
  1007. fmr f3, f0
  1008. fmr f4, f0
  1009. fmr f5, f0
  1010. fmr f6, f0
  1011. fmr f7, f0
  1012. #endif
  /* TRMM step count in TEMP: K - KK, or KK + 2 (LEFT), or KK + 1. */
  1013. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1014. sub TEMP, K, KK
  1015. #elif defined(LEFT)
  1016. addi TEMP, KK, 2
  1017. #else
  1018. addi TEMP, KK, 1
  1019. #endif
  /* CTR = TEMP >> 2; skip the unrolled loop when that is not positive. */
  1020. srawi. TEMP, TEMP, 2
  1021. mtspr CTR, TEMP
  1022. ble .L35
  1023. #endif
  /* .L32: main accumulation loop, unrolled four times and repeated CTR times
     (bdnz).  In each of the four steps one B-side register (f16 or f18) is
     multiplied into f0-f3 and the other (f17 or f19) into f4-f7, against
     four A-side registers (f20-f23 or f24-f27).  Loads for later steps are
     interleaved with the fmadds.  Assuming LFDU is the update-form load
     (macro defined outside this excerpt), one iteration moves BO forward by
     8 * SIZE and AO by 16 * SIZE.  The nop lines are kept exactly as found;
     presumably issue-slot padding - not verifiable from here. */
  1024. .align 4
  1025. .L32:
  /* Step 1: f16/f17 against f20-f23.  f27 and f19 are (re)loaded first
     because the end of the loop body does not refresh them. */
  1026. fmadd f0, f16, f20, f0
  1027. LFD f27, 7 * SIZE(AO)
  1028. fmadd f1, f16, f21, f1
  1029. LFD f19, 3 * SIZE(BO)
  1030. fmadd f2, f16, f22, f2
  1031. nop
  1032. fmadd f3, f16, f23, f3
  1033. LFD f16, 4 * SIZE(BO)
  1034. fmadd f4, f17, f20, f4
  1035. LFD f20, 8 * SIZE(AO)
  1036. fmadd f5, f17, f21, f5
  1037. LFD f21, 9 * SIZE(AO)
  1038. fmadd f6, f17, f22, f6
  1039. LFD f22, 10 * SIZE(AO)
  1040. fmadd f7, f17, f23, f7
  1041. LFD f23, 11 * SIZE(AO)
  /* Step 2: f18/f19 against f24-f27. */
  1042. fmadd f0, f18, f24, f0
  1043. LFD f17, 5 * SIZE(BO)
  1044. fmadd f1, f18, f25, f1
  1045. nop
  1046. fmadd f2, f18, f26, f2
  1047. nop
  1048. fmadd f3, f18, f27, f3
  1049. LFD f18, 6 * SIZE(BO)
  1050. fmadd f4, f19, f24, f4
  1051. LFD f24, 12 * SIZE(AO)
  1052. fmadd f5, f19, f25, f5
  1053. LFD f25, 13 * SIZE(AO)
  1054. fmadd f6, f19, f26, f6
  1055. LFD f26, 14 * SIZE(AO)
  1056. fmadd f7, f19, f27, f7
  1057. LFD f27, 15 * SIZE(AO)
  /* Step 3: f16/f17 (from BO[4..5]) against f20-f23 (from AO[8..11]).
     The two LFDU lines re-base BO and AO; later offsets are relative to
     the new bases. */
  1058. fmadd f0, f16, f20, f0
  1059. LFD f19, 7 * SIZE(BO)
  1060. fmadd f1, f16, f21, f1
  1061. nop
  1062. fmadd f2, f16, f22, f2
  1063. nop
  1064. fmadd f3, f16, f23, f3
  1065. LFDU f16, 8 * SIZE(BO)
  1066. fmadd f4, f17, f20, f4
  1067. LFDU f20, 16 * SIZE(AO)
  1068. fmadd f5, f17, f21, f5
  1069. LFD f21, 1 * SIZE(AO)
  1070. fmadd f6, f17, f22, f6
  1071. LFD f22, 2 * SIZE(AO)
  1072. fmadd f7, f17, f23, f7
  1073. LFD f23, 3 * SIZE(AO)
  /* Step 4: f18/f19 against f24-f27, while the registers for the next
     iteration (or for .L36) are being refilled. */
  1074. fmadd f0, f18, f24, f0
  1075. LFD f17, 1 * SIZE(BO)
  1076. fmadd f1, f18, f25, f1
  1077. nop
  1078. fmadd f2, f18, f26, f2
  1079. nop
  1080. fmadd f3, f18, f27, f3
  1081. LFD f18, 2 * SIZE(BO)
  1082. fmadd f4, f19, f24, f4
  1083. LFD f24, 4 * SIZE(AO)
  1084. fmadd f5, f19, f25, f5
  1085. LFD f25, 5 * SIZE(AO)
  1086. fmadd f6, f19, f26, f6
  1087. LFD f26, 6 * SIZE(AO)
  1088. fmadd f7, f19, f27, f7
  /* Decrement CTR and repeat while it is non-zero. */
  1089. bdnz .L32
  /* .L35: prepare the remainder loop .L36.  CTR receives the step count
     modulo 4, and f30/f31 are loaded from ALPHA_R/ALPHA_I for the scaling
     done at .L37.  The lfd and mtspr lines do not change CR0, so the ble
     tests the andi. result. */
  1090. .align 4
  1091. .L35:
  1092. #ifndef TRMMKERNEL
  /* Non-TRMM build: remainder = K & 3. */
  1093. andi. r0, K, 3
  1094. lfd f30, ALPHA_R
  1095. lfd f31, ALPHA_I
  1096. mtspr CTR, r0
  /* No leftover steps: skip .L36. */
  1097. ble .L37
  1098. #else
  /* TRMM build: recompute the step count (K - KK, KK + 2 or KK + 1) with the
     same conditions used before .L32, then keep its low two bits. */
  1099. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1100. sub TEMP, K, KK
  1101. #elif defined(LEFT)
  1102. addi TEMP, KK, 2
  1103. #else
  1104. addi TEMP, KK, 1
  1105. #endif
  1106. andi. TEMP, TEMP, 3
  1107. lfd f30, ALPHA_R
  1108. lfd f31, ALPHA_I
  1109. mtspr CTR, TEMP
  1110. ble .L37
  1111. #endif
  /* .L36: remainder loop, one step per iteration, repeated CTR times.
     f16 is multiplied into f0-f3 and f17 into f4-f7, each against f20-f23.
     Assuming LFDU is the update-form load, BO moves by 2 * SIZE and AO by
     4 * SIZE per iteration, and the following LFD offsets are relative to
     the updated pointers. */
  1112. .align 4
  1113. .L36:
  1114. fmadd f0, f16, f20, f0
  1115. fmadd f1, f16, f21, f1
  1116. fmadd f2, f16, f22, f2
  1117. fmadd f3, f16, f23, f3
  /* f16 has been consumed: fetch the next value and advance BO. */
  1118. LFDU f16, 2 * SIZE(BO)
  1119. fmadd f4, f17, f20, f4
  /* f20 has been consumed: fetch the next value and advance AO. */
  1120. LFDU f20, 4 * SIZE(AO)
  1121. fmadd f5, f17, f21, f5
  1122. LFD f21, 1 * SIZE(AO)
  1123. fmadd f6, f17, f22, f6
  1124. LFD f22, 2 * SIZE(AO)
  1125. fmadd f7, f17, f23, f7
  1126. LFD f23, 3 * SIZE(AO)
  1127. LFD f17, 1 * SIZE(BO)
  /* Decrement CTR and repeat while it is non-zero. */
  1128. bdnz .L36
  /* .L37: finish one pass of the I loop.  Folds the eight partial sums
     f0-f7 into f0-f3, scales them by f30/f31 (loaded from ALPHA_R/ALPHA_I at
     .L35), stores four values at CO1, fixes up AO/BO/KK for TRMM builds and
     loops back to .L31 while I > 0.  The upper-case mnemonics (FADD, FSUB,
     FMUL, FMADD, FNMSUB, FNMADD, LFD, STFD) are macros defined outside this
     excerpt; the formulas below assume they expand to the like-named PowerPC
     instructions - confirm. */
  1129. .align 4
  1130. .L37:
  /* Fold step.  Which of the NN/CN/NC/... macros is defined selects the
     signs used when f5/f4 are merged into f0/f1 and f7/f6 into f2/f3. */
  1131. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1132. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  1133. FSUB f0, f0, f5
  1134. FADD f1, f1, f4
  1135. FSUB f2, f2, f7
  1136. FADD f3, f3, f6
  1137. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  1138. FADD f0, f0, f5
  1139. FSUB f1, f1, f4
  1140. FADD f2, f2, f7
  1141. FSUB f3, f3, f6
  1142. #else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */
  1143. FADD f0, f0, f5
  1144. FSUB f1, f4, f1
  1145. FADD f2, f2, f7
  1146. FSUB f3, f6, f3
  1147. #endif
  1148. #ifndef TRMMKERNEL
  /* Non-TRMM build: fetch the four values currently at CO1 so the scaled
     result is added to them.  TRMM builds overwrite instead. */
  1149. LFD f16, 0 * SIZE(CO1)
  1150. LFD f17, 1 * SIZE(CO1)
  1151. LFD f18, 2 * SIZE(CO1)
  1152. LFD f19, 3 * SIZE(CO1)
  1153. #endif
  1154. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* f16 = [f16 +] f30*f0 - f31*f1 and f17 = [f17 +] f30*f1 + f31*f0;
     f18/f19 likewise from f2/f3.  The bracketed term exists only in the
     non-TRMM build. */
  1155. #ifndef TRMMKERNEL
  1156. FMADD f16, f30, f0, f16
  1157. FMADD f17, f30, f1, f17
  1158. FMADD f18, f30, f2, f18
  1159. FMADD f19, f30, f3, f19
  1160. #else
  1161. FMUL f16, f30, f0
  1162. FMUL f17, f30, f1
  1163. FMUL f18, f30, f2
  1164. FMUL f19, f30, f3
  1165. #endif
  1166. FNMSUB f16, f31, f1, f16
  1167. FMADD f17, f31, f0, f17
  1168. FNMSUB f18, f31, f3, f18
  1169. FMADD f19, f31, f2, f19
  1170. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  1171. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  1172. /* defined(RC)|| defined(RR) */
  1173. #ifndef TRMMKERNEL
  /* Non-TRMM: f16 = f16 + f30*f0 + f31*f1, f17 = f17 - f30*f1 + f31*f0;
     f18/f19 likewise from f2/f3. */
  1174. FMADD f16, f30, f0, f16
  1175. FNMSUB f17, f30, f1, f17
  1176. FMADD f18, f30, f2, f18
  1177. FNMSUB f19, f30, f3, f19
  1178. FMADD f16, f31, f1, f16
  1179. FMADD f17, f31, f0, f17
  1180. FMADD f18, f31, f3, f18
  1181. FMADD f19, f31, f2, f19
  1182. #else
  /* TRMM: f16 = f30*f0 + f31*f1, f17 = -(f31*f0 + f30*f1).
     NOTE(review): the f31*f0 term has the opposite sign to the non-TRMM
     path just above, so the two paths agree only when f31 (ALPHA_I) is
     zero.  Confirm that TRMM callers always pass a zero imaginary alpha. */
  1183. FMUL f16, f30, f0
  1184. FMUL f17, f30, f1
  1185. FMUL f18, f30, f2
  1186. FMUL f19, f30, f3
  1187. FMADD f16, f31, f1, f16
  1188. FNMADD f17, f31, f0, f17
  1189. FMADD f18, f31, f3, f18
  1190. FNMADD f19, f31, f2, f19
  1191. #endif
  1192. #endif
  /* Write the four results and advance CO1 past them. */
  1193. STFD f16, 0 * SIZE(CO1)
  1194. STFD f17, 1 * SIZE(CO1)
  1195. STFD f18, 2 * SIZE(CO1)
  1196. STFD f19, 3 * SIZE(CO1)
  1197. addi CO1, CO1, 4 * SIZE
  1198. #ifdef TRMMKERNEL
  1199. #if ( defined(LEFT) && defined(TRANSA)) || \
  1200. (!defined(LEFT) && !defined(TRANSA))
  /* In these variants the loops ran KK + 2 (LEFT) or KK + 1 steps.
     TEMP = K minus that count, then AO += TEMP << (1 + ZBASE_SHIFT) and
     BO += TEMP << ZBASE_SHIFT. */
  1201. sub TEMP, K, KK
  1202. #ifdef LEFT
  1203. addi TEMP, TEMP, -2
  1204. #else
  1205. addi TEMP, TEMP, -1
  1206. #endif
  1207. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1208. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1209. add AO, AO, r0
  1210. add BO, BO, TEMP
  1211. #endif
  1212. #ifdef LEFT
  /* LEFT build: KK advances by 2 for the next pass. */
  1213. addi KK, KK, 2
  1214. #endif
  1215. #endif
  /* I -= 1; go round again while I is still positive. */
  1216. addic. I, I, -1
  1217. bgt .L31
  /* .L40: set-up for the work that remains when M is odd (M & 1).  Preloads
     f16-f19 from AO[0..3] and f20-f23 from four consecutive B/BO slots,
     copies the FZERO value into f0-f7, and puts (step count >> 2) into CTR
     for the loop at .L42.  LFD is a macro defined outside this excerpt. */
  1218. .align 4
  1219. .L40:
  /* I = M & 1; leave through .L999 when the result is zero. */
  1220. andi. I, M, 1
  1221. ble .L999
  1222. #ifndef TRMMKERNEL
  /* Non-TRMM build: read from AO and B as they stand. */
  1223. LFD f16, 0 * SIZE(AO)
  1224. LFD f17, 1 * SIZE(AO)
  1225. LFD f18, 2 * SIZE(AO)
  1226. LFD f19, 3 * SIZE(AO)
  1227. LFD f20, 0 * SIZE(B)
  1228. LFD f21, 1 * SIZE(B)
  1229. LFD f22, 2 * SIZE(B)
  1230. LFD f23, 3 * SIZE(B)
  /* Clear the eight accumulators f0-f7. */
  1231. lfs f0, FZERO
  1232. fmr f1, f0
  1233. fmr f2, f0
  1234. fmr f3, f0
  1235. fmr f4, f0
  1236. fmr f5, f0
  1237. fmr f6, f0
  1238. fmr f7, f0
  /* CTR = K >> 2; CR0 from srawi. survives mr/mtspr and feeds the ble. */
  1239. srawi. r0, K, 2
  1240. mr BO, B
  1241. mtspr CTR, r0
  1242. ble .L45
  1243. #else
  1244. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1245. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  /* TRMM, LEFT+TRANSA or neither: no KK offset is applied; BO = B. */
  1246. LFD f16, 0 * SIZE(AO)
  1247. LFD f17, 1 * SIZE(AO)
  1248. LFD f18, 2 * SIZE(AO)
  1249. LFD f19, 3 * SIZE(AO)
  1250. LFD f20, 0 * SIZE(B)
  1251. LFD f21, 1 * SIZE(B)
  1252. LFD f22, 2 * SIZE(B)
  1253. LFD f23, 3 * SIZE(B)
  1254. lfs f0, FZERO
  1255. fmr f1, f0
  1256. fmr f2, f0
  1257. fmr f3, f0
  1258. fmr f4, f0
  1259. fmr f5, f0
  1260. fmr f6, f0
  1261. fmr f7, f0
  1262. mr BO, B
  1263. #else
  /* Remaining TRMM variants: both pointers are offset by the same amount,
     KK << ZBASE_SHIFT (AO += ..., BO = B + ...). */
  1264. slwi r0, KK, 0 + ZBASE_SHIFT
  1265. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1266. add AO, AO, r0
  1267. add BO, B, TEMP
  1268. LFD f16, 0 * SIZE(AO)
  1269. LFD f17, 1 * SIZE(AO)
  1270. LFD f18, 2 * SIZE(AO)
  1271. LFD f19, 3 * SIZE(AO)
  1272. LFD f20, 0 * SIZE(BO)
  1273. LFD f21, 1 * SIZE(BO)
  1274. LFD f22, 2 * SIZE(BO)
  1275. LFD f23, 3 * SIZE(BO)
  1276. lfs f0, FZERO
  1277. fmr f1, f0
  1278. fmr f2, f0
  1279. fmr f3, f0
  1280. fmr f4, f0
  1281. fmr f5, f0
  1282. fmr f6, f0
  1283. fmr f7, f0
  1284. #endif
  /* TRMM step count in TEMP: K - KK, otherwise KK + 1.  The #elif and #else
     arms are identical here. */
  1285. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1286. sub TEMP, K, KK
  1287. #elif defined(LEFT)
  1288. addi TEMP, KK, 1
  1289. #else
  1290. addi TEMP, KK, 1
  1291. #endif
  /* CTR = TEMP >> 2; skip the unrolled loop when that is not positive. */
  1292. srawi. TEMP, TEMP, 2
  1293. mtspr CTR, TEMP
  1294. ble .L45
  1295. #endif
  /* .L42: accumulation loop for the M & 1 case, unrolled four times and
     repeated CTR times (bdnz).  Two independent accumulator sets alternate:
     steps using f16/f17 with f20/f21 add into f0, f3, f2, f1, and steps
     using f18/f19 with f22/f23 add into f4, f7, f6, f5.  The two sets are
     summed at .L45.  Assuming LFDU is the update-form load (macro defined
     outside this excerpt), AO and BO each advance by 8 * SIZE per
     iteration. */
  1296. .align 4
  1297. .L42:
  /* Step 1: f16/f17 x f20/f21.  f23 is (re)loaded first because the end of
     the loop body does not refresh it. */
  1298. fmadd f0, f16, f20, f0
  1299. LFD f23, 3 * SIZE(BO)
  1300. fmadd f3, f16, f21, f3
  1301. LFD f16, 4 * SIZE(AO)
  1302. fmadd f2, f17, f20, f2
  1303. LFD f20, 4 * SIZE(BO)
  1304. fmadd f1, f17, f21, f1
  1305. LFD f17, 5 * SIZE(AO)
  /* Step 2: f18/f19 x f22/f23 into the second accumulator set. */
  1306. fmadd f4, f18, f22, f4
  1307. LFD f21, 5 * SIZE(BO)
  1308. fmadd f7, f18, f23, f7
  1309. LFD f18, 6 * SIZE(AO)
  1310. fmadd f6, f19, f22, f6
  1311. LFD f22, 6 * SIZE(BO)
  1312. fmadd f5, f19, f23, f5
  1313. LFD f19, 7 * SIZE(AO)
  /* Step 3: same shape as step 1.  The two LFDU lines re-base AO and BO;
     later offsets are relative to the new bases. */
  1314. fmadd f0, f16, f20, f0
  1315. LFD f23, 7 * SIZE(BO)
  1316. fmadd f3, f16, f21, f3
  1317. LFDU f16, 8 * SIZE(AO)
  1318. fmadd f2, f17, f20, f2
  1319. LFDU f20, 8 * SIZE(BO)
  1320. fmadd f1, f17, f21, f1
  1321. LFD f17, 1 * SIZE(AO)
  /* Step 4: same shape as step 2, while refilling for the next iteration. */
  1322. fmadd f4, f18, f22, f4
  1323. LFD f21, 1 * SIZE(BO)
  1324. fmadd f7, f18, f23, f7
  1325. LFD f18, 2 * SIZE(AO)
  1326. fmadd f6, f19, f22, f6
  1327. LFD f22, 2 * SIZE(BO)
  1328. fmadd f5, f19, f23, f5
  1329. LFD f19, 3 * SIZE(AO)
  /* Decrement CTR and repeat while it is non-zero. */
  1330. bdnz .L42
  /* .L45: sum the two accumulator sets kept by .L42 (f4-f7 into f0-f3), then
     prepare the remainder loop .L46.  CTR receives the step count modulo 4,
     and f30/f31 are loaded from ALPHA_R/ALPHA_I for the scaling at .L47. */
  1331. .align 4
  1332. .L45:
  1333. fadd f0, f0, f4
  1334. fadd f1, f1, f5
  1335. fadd f2, f2, f6
  1336. fadd f3, f3, f7
  1337. #ifndef TRMMKERNEL
  /* Non-TRMM build: remainder = K & 3.  The lfd and mtspr lines leave CR0
     untouched, so the ble tests the andi. result. */
  1338. andi. r0, K, 3
  1339. lfd f30, ALPHA_R
  1340. lfd f31, ALPHA_I
  1341. mtspr CTR,r0
  1342. ble .L47
  1343. #else
  /* TRMM build: recompute the step count (K - KK, otherwise KK + 1) with the
     same conditions used before .L42, then keep its low two bits. */
  1344. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1345. sub TEMP, K, KK
  1346. #elif defined(LEFT)
  1347. addi TEMP, KK, 1
  1348. #else
  1349. addi TEMP, KK, 1
  1350. #endif
  1351. andi. TEMP, TEMP, 3
  1352. lfd f30, ALPHA_R
  1353. lfd f31, ALPHA_I
  1354. mtspr CTR,TEMP
  1355. ble .L47
  1356. #endif
  /* .L46: remainder loop for the M & 1 case, one step per iteration and
     repeated CTR times.  Accumulates f16*f20 into f0, f16*f21 into f3,
     f17*f20 into f2 and f17*f21 into f1.  Assuming LFDU is the update-form
     load (macro defined outside this excerpt), AO and BO each advance by
     2 * SIZE per iteration. */
  1357. .align 4
  1358. .L46:
  1359. fmadd f0, f16, f20, f0
  1360. fmadd f3, f16, f21, f3
  /* f16 has been consumed: fetch the next value and advance AO. */
  1361. LFDU f16, 2 * SIZE(AO)
  1362. fmadd f2, f17, f20, f2
  /* f20 has been consumed: fetch the next value and advance BO. */
  1363. LFDU f20, 2 * SIZE(BO)
  1364. fmadd f1, f17, f21, f1
  1365. LFD f17, 1 * SIZE(AO)
  1366. LFD f21, 1 * SIZE(BO)
  /* Decrement CTR and repeat while it is non-zero. */
  1367. bdnz .L46
  /* .L47: finish the M & 1 case.  Folds f0-f3 into the pair f0/f2, scales
     it by f30/f31 (loaded from ALPHA_R/ALPHA_I at .L45), stores two values
     at CO1 and falls through to .L999.  The upper-case mnemonics (FMUL,
     FMADD, FNMSUB, FNMADD, LFD, STFD) are macros defined outside this
     excerpt; the formulas below assume they expand to the like-named
     PowerPC instructions - confirm. */
  1368. .align 4
  1369. .L47:
  /* Fold step.  Which of the NN/CN/NC/... macros is defined selects the
     signs used when f1 is merged into f0 and f3 into f2. */
  1370. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1371. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  1372. fsub f0, f0, f1
  1373. fadd f2, f2, f3
  1374. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  1375. fadd f0, f0, f1
  1376. fsub f2, f2, f3
  1377. #else
  1378. fadd f0, f0, f1
  1379. fsub f2, f3, f2
  1380. #endif
  1381. #ifndef TRMMKERNEL
  /* Non-TRMM build: fetch the two values currently at CO1 so the scaled
     result is added to them.  TRMM builds overwrite instead. */
  1382. LFD f16, 0 * SIZE(CO1)
  1383. LFD f17, 1 * SIZE(CO1)
  1384. #endif
  1385. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  /* f16 = [f16 +] f30*f0 - f31*f2 and f17 = [f17 +] f30*f2 + f31*f0.  The
     bracketed term exists only in the non-TRMM build. */
  1386. #ifndef TRMMKERNEL
  1387. FMADD f16, f30, f0, f16
  1388. FMADD f17, f30, f2, f17
  1389. #else
  1390. FMUL f16, f30, f0
  1391. FMUL f17, f30, f2
  1392. #endif
  1393. FNMSUB f16, f31, f2, f16
  1394. FMADD f17, f31, f0, f17
  1395. #else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */
  1396. /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */
  1397. /* defined(RC) || defined(RR) */
  1398. #ifndef TRMMKERNEL
  /* Non-TRMM: f16 = f16 + f30*f0 + f31*f2, f17 = f17 - f30*f2 + f31*f0. */
  1399. FMADD f16, f30, f0, f16
  1400. FNMSUB f17, f30, f2, f17
  1401. FMADD f16, f31, f2, f16
  1402. FMADD f17, f31, f0, f17
  1403. #else
  /* TRMM: f16 = f30*f0 + f31*f2, f17 = -(f31*f0 + f30*f2).
     NOTE(review): the f31*f0 term has the opposite sign to the non-TRMM
     path just above, so the two paths agree only when f31 (ALPHA_I) is
     zero.  Confirm that TRMM callers always pass a zero imaginary alpha. */
  1404. FMUL f16, f30, f0
  1405. FMUL f17, f30, f2
  1406. FMADD f16, f31, f2, f16
  1407. FNMADD f17, f31, f0, f17
  1408. #endif
  1409. #endif
  /* Write the two results; CO1 is not advanced, as this is the last tile. */
  1410. STFD f16, 0 * SIZE(CO1)
  1411. STFD f17, 1 * SIZE(CO1)
  /* .L999: common exit.  Sets r3 to zero, reloads f14-f31 and r24-r31 (plus
     r22/r23 in TRMM builds) from the stack frame, releases the frame and
     returns.  The matching saves are outside this excerpt. */
  1412. .align 4
  1413. .L999:
  /* r3 = 0: with 0 in the RA field, addi uses the literal zero rather than
     a register.  Presumably this is the routine's return value. */
  1414. addi r3, 0, 0
  /* Floating-point registers f14-f31, eight bytes apart from SP + 0. */
  1415. lfd f14, 0(SP)
  1416. lfd f15, 8(SP)
  1417. lfd f16, 16(SP)
  1418. lfd f17, 24(SP)
  1419. lfd f18, 32(SP)
  1420. lfd f19, 40(SP)
  1421. lfd f20, 48(SP)
  1422. lfd f21, 56(SP)
  1423. lfd f22, 64(SP)
  1424. lfd f23, 72(SP)
  1425. lfd f24, 80(SP)
  1426. lfd f25, 88(SP)
  1427. lfd f26, 96(SP)
  1428. lfd f27, 104(SP)
  1429. lfd f28, 112(SP)
  1430. lfd f29, 120(SP)
  1431. lfd f30, 128(SP)
  1432. lfd f31, 136(SP)
  1433. #ifdef __64BIT__
  /* 64-bit build: integer registers in 8-byte slots from SP + 144. */
  1434. ld r31, 144(SP)
  1435. ld r30, 152(SP)
  1436. ld r29, 160(SP)
  1437. ld r28, 168(SP)
  1438. ld r27, 176(SP)
  1439. ld r26, 184(SP)
  1440. ld r25, 192(SP)
  1441. ld r24, 200(SP)
  1442. #ifdef TRMMKERNEL
  /* TRMM builds restore two more registers. */
  1443. ld r23, 208(SP)
  1444. ld r22, 216(SP)
  1445. #endif
  1446. #else
  /* 32-bit build: integer registers in 4-byte slots from SP + 144. */
  1447. lwz r31, 144(SP)
  1448. lwz r30, 148(SP)
  1449. lwz r29, 152(SP)
  1450. lwz r28, 156(SP)
  1451. lwz r27, 160(SP)
  1452. lwz r26, 164(SP)
  1453. lwz r25, 168(SP)
  1454. lwz r24, 172(SP)
  1455. #ifdef TRMMKERNEL
  /* TRMM builds restore two more registers. */
  1456. lwz r23, 176(SP)
  1457. lwz r22, 180(SP)
  1458. #endif
  1459. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  1460. addi SP, SP, STACKSIZE
  1461. blr
  /* EPILOGUE is a macro defined outside this excerpt. */
  1462. EPILOGUE
  1463. #endif