You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zsymv_L.S 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Linux: symbolic names for the general-purpose registers that carry the
     routine's arguments.  BUFFER (r14) is not an incoming register in either
     variant; the prologue loads it from an offset above this routine's frame
     (offset + STACKSIZE from SP). */
  40. #ifdef linux
  41. #ifndef __64BIT__
  /* 32-bit: A, LDA, X, INCX, Y, INCY are named r5..r10 in that order.
     The prologue loads BUFFER from 56 + STACKSIZE(SP). */
  42. #define M r3
  43. #define N r4
  44. #define A r5
  45. #define LDA r6
  46. #define X r7
  47. #define INCX r8
  48. #define Y r9
  49. #define INCY r10
  50. #define BUFFER r14
  51. #else
  /* 64-bit: A, LDA, X, INCX are named r7..r10.  Y and INCY are given the
     names r5 and r6 here, and the prologue fills them (together with BUFFER)
     with loads from 112, 120 and 128 + STACKSIZE(SP). */
  52. #define M r3
  53. #define N r4
  54. #define A r7
  55. #define LDA r8
  56. #define X r9
  57. #define INCX r10
  58. #define Y r5
  59. #define INCY r6
  60. #define BUFFER r14
  61. #endif
  62. #endif
  /* AIX / Apple: symbolic names for the general-purpose registers that carry
     the routine's arguments.  BUFFER is always r14 and is loaded by the
     prologue from an offset above this routine's frame. */
  63. #if defined(_AIX) || defined(__APPLE__)
  64. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit with DOUBLE: only A and LDA (r9, r10) are left as-is by the
     prologue.  X, INCX, Y, INCY (named r5..r8 here) and BUFFER are loaded by
     the prologue from 56, 60, 64, 68 and 72 + STACKSIZE(SP). */
  65. #define M r3
  66. #define N r4
  67. #define A r9
  68. #define LDA r10
  69. #define X r5
  70. #define INCX r6
  71. #define Y r7
  72. #define INCY r8
  73. #define BUFFER r14
  74. #else
  /* 64-bit, or 32-bit without DOUBLE: A, LDA, X, INCX are named r7..r10.
     Y, INCY (named r5, r6) and BUFFER are loaded by the prologue from the
     stack: 56, 60, 64 + STACKSIZE(SP) in the 32-bit case and
     112, 120, 128 + STACKSIZE(SP) in the 64-bit case. */
  75. #define M r3
  76. #define N r4
  77. #define A r7
  78. #define LDA r8
  79. #define X r9
  80. #define INCX r10
  81. #define Y r5
  82. #define INCY r6
  83. #define BUFFER r14
  84. #endif
  85. #endif
  /* Working general-purpose registers.  The prologue saves r14..r27, so
     AO1..IS (r15..r25) are preserved for the caller; I (r11) and J (r12) are
     outside that saved range. */
  86. #define I r11
  87. #define J r12
  /* AO1 is set to A and AO2 to A + LDA at the top of the LL(11) loop. */
  88. #define AO1 r15
  89. #define AO2 r16
  90. #define AO3 r17
  91. #define AO4 r18
  /* XX and YY are set to X + offset and NEW_Y + offset in LL(11). */
  92. #define XX r19
  93. #define YY r20
  /* NEW_Y is Y itself, or BUFFER when INCY is not 2 * SIZE (see LL(05)). */
  94. #define NEW_Y r21
  95. #define TEMP r22
  /* PREA receives PREFETCHSIZE_A * SIZE and is the offset used by the
     dcbt / dcbtst / DCBT prefetch instructions. */
  96. #define PREA r24
  97. #define IS r25
  /* Floating-point registers f0..f7: values loaded from and stored to YY. */
  98. #define y01 f0
  99. #define y02 f1
  100. #define y03 f2
  101. #define y04 f3
  102. #define y05 f4
  103. #define y06 f5
  104. #define y07 f6
  105. #define y08 f7
  /* f8..f15: values loaded from XX inside the unrolled loop.
     Of these, f14 and f15 are saved by the prologue. */
  106. #define xtemp1 f8
  107. #define xtemp2 f9
  108. #define xtemp3 f10
  109. #define xtemp4 f11
  110. #define xtemp5 f12
  111. #define xtemp6 f13
  112. #define xtemp7 f14
  113. #define xtemp8 f15
  /* f16..f19: the four X values loaded at the top of LL(11), later
     overwritten with their products with ALPHA_R / ALPHA_I. */
  114. #define atemp1 f16
  115. #define atemp2 f17
  116. #define atemp3 f18
  117. #define atemp4 f19
  /* f20..f23: running sums accumulated across the LL(12) loop. */
  118. #define xsum1 f20
  119. #define xsum2 f21
  120. #define xsum3 f22
  121. #define xsum4 f23
  /* f24..f31: values loaded from AO1 (a1..a4) and AO2 (a5..a8). */
  122. #define a1 f24
  123. #define a2 f25
  124. #define a3 f26
  125. #define a4 f27
  126. #define a5 f28
  127. #define a6 f29
  128. #define a7 f30
  129. #define a8 f31
  /* alpha_r / alpha_i name the same registers as y02 / y03 (f1, f2).  The
     prologue stores both to ALPHA_R / ALPHA_I on the stack before f1 and f2
     are reused as y02 / y03. */
  130. #define alpha_r f1
  131. #define alpha_i f2
  /* Per-CPU prefetch distance.  The routine loads PREFETCHSIZE_A * SIZE into
     PREA and uses PREA as the offset operand of its dcbt / dcbtst / DCBT
     instructions.
     NOTE(review): there is no fallback definition here.  If none of the CPU
     macros below is defined, PREFETCHSIZE_A stays undefined at its use in
     "li PREA, PREFETCHSIZE_A * SIZE" unless another header supplies it. */
  132. #if defined(PPCG4)
  133. #define PREFETCHSIZE_A 24
  134. #endif
  135. #if defined(PPC440) || defined(PPC440FP2)
  136. #define PREFETCHSIZE_A 24
  137. #endif
  138. #ifdef PPC970
  139. #define PREFETCHSIZE_A 32
  140. #endif
  141. #ifdef CELL
  142. #define PREFETCHSIZE_A 72
  143. #endif
  144. #ifdef POWER4
  145. #define PREFETCHSIZE_A 16
  146. #endif
  147. #ifdef POWER5
  148. #define PREFETCHSIZE_A 96
  149. #endif
  150. #ifdef POWER6
  151. #define PREFETCHSIZE_A 112
  152. #endif
  /* NOP1 / NOP2 are filler slots placed between floating-point instructions
     in the unrolled loops.  On POWER4, POWER5, POWER6 and PPC970 they expand
     to nothing.  On every other target they expand to a register move onto
     itself (LDA to LDA, INCX to INCX), which leaves the register value
     unchanged.  Presumably this is instruction-scheduling padding; confirm
     before removing. */
  153. #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
  154. #define NOP1
  155. #define NOP2
  156. #else
  157. #define NOP1 mr LDA, LDA
  158. #define NOP2 mr INCX, INCX
  159. #endif
  160. #ifndef NEEDPARAM
  /* Stack frame layout.  The prologue lowers SP by STACKSIZE, stores f14..f31
     at 0..136(SP), and stores r14..r27 starting at 144(SP).  The three slots
     named here sit directly after the saved general-purpose registers:
       ALPHA_R, ALPHA_I - copies of alpha_r / alpha_i written by the prologue
       FZERO            - filled with zero by the prologue, later read back
                          with lfd to obtain a floating-point zero */
  161. #ifndef __64BIT__
  /* 32-bit: r14..r27 are stored with stw at 144..196(SP). */
  162. #define STACKSIZE 224
  163. #define ALPHA_R 200(SP)
  164. #define ALPHA_I 208(SP)
  165. #define FZERO 216(SP)
  166. #else
  /* 64-bit: r14..r27 are stored with std at 144..248(SP). */
  167. #define STACKSIZE 280
  168. #define ALPHA_R 256(SP)
  169. #define ALPHA_I 264(SP)
  170. #define FZERO 272(SP)
  171. #endif
  /* FMADD1 / FMADD2 select which of FNMSUB / FMADD is used for the paired
     cross-term updates of the xsum accumulators.  Without HEMV, FMADD1 is
     FNMSUB and FMADD2 is FMADD; with HEMV the two are swapped.
     FMADD and FNMSUB themselves are not defined in this file (presumably
     they come from common.h - confirm).
     NOTE(review): the swap presumably implements the conjugation needed for
     the Hermitian variant; verify against the HEMV build. */
  172. #ifndef HEMV
  173. #define FMADD1 FNMSUB
  174. #define FMADD2 FMADD
  175. #else
  176. #define FMADD1 FMADD
  177. #define FMADD2 FNMSUB
  178. #endif
  179. PROLOGUE
  180. PROFCODE
  181. addi SP, SP, -STACKSIZE
  182. li r0, 0
  183. stfd f14, 0(SP)
  184. stfd f15, 8(SP)
  185. stfd f16, 16(SP)
  186. stfd f17, 24(SP)
  187. stfd f18, 32(SP)
  188. stfd f19, 40(SP)
  189. stfd f20, 48(SP)
  190. stfd f21, 56(SP)
  191. stfd f22, 64(SP)
  192. stfd f23, 72(SP)
  193. stfd f24, 80(SP)
  194. stfd f25, 88(SP)
  195. stfd f26, 96(SP)
  196. stfd f27, 104(SP)
  197. stfd f28, 112(SP)
  198. stfd f29, 120(SP)
  199. stfd f30, 128(SP)
  200. stfd f31, 136(SP)
  201. #ifdef __64BIT__
  202. std r0, FZERO
  203. std r14, 144(SP)
  204. std r15, 152(SP)
  205. std r16, 160(SP)
  206. std r17, 168(SP)
  207. std r18, 176(SP)
  208. std r19, 184(SP)
  209. std r20, 192(SP)
  210. std r21, 200(SP)
  211. std r22, 208(SP)
  212. std r23, 216(SP)
  213. std r24, 224(SP)
  214. std r25, 232(SP)
  215. std r26, 240(SP)
  216. std r27, 248(SP)
  217. #else
  218. stw r0, 0 + FZERO
  219. stw r0, 4 + FZERO
  220. stw r14, 144(SP)
  221. stw r15, 148(SP)
  222. stw r16, 152(SP)
  223. stw r17, 156(SP)
  224. stw r18, 160(SP)
  225. stw r19, 164(SP)
  226. stw r20, 168(SP)
  227. stw r21, 172(SP)
  228. stw r22, 176(SP)
  229. stw r23, 180(SP)
  230. stw r24, 184(SP)
  231. stw r25, 188(SP)
  232. stw r26, 192(SP)
  233. stw r27, 196(SP)
  234. #endif
  235. #ifdef linux
  236. #ifndef __64BIT__
  237. lwz BUFFER, 56 + STACKSIZE(SP)
  238. #else
  239. ld Y, 112 + STACKSIZE(SP)
  240. ld INCY, 120 + STACKSIZE(SP)
  241. ld BUFFER, 128 + STACKSIZE(SP)
  242. #endif
  243. #endif
  244. #if defined(_AIX) || defined(__APPLE__)
  245. #ifndef __64BIT__
  246. #ifdef DOUBLE
  247. lwz X, 56 + STACKSIZE(SP)
  248. lwz INCX, 60 + STACKSIZE(SP)
  249. lwz Y, 64 + STACKSIZE(SP)
  250. lwz INCY, 68 + STACKSIZE(SP)
  251. lwz BUFFER, 72 + STACKSIZE(SP)
  252. #else
  253. lwz Y, 56 + STACKSIZE(SP)
  254. lwz INCY, 60 + STACKSIZE(SP)
  255. lwz BUFFER, 64 + STACKSIZE(SP)
  256. #endif
  257. #else
  258. ld Y, 112 + STACKSIZE(SP)
  259. ld INCY, 120 + STACKSIZE(SP)
  260. ld BUFFER, 128 + STACKSIZE(SP)
  261. #endif
  262. #endif
  263. STFD alpha_r, ALPHA_R
  264. STFD alpha_i, ALPHA_I
  265. slwi LDA, LDA, ZBASE_SHIFT
  266. slwi INCX, INCX, ZBASE_SHIFT
  267. slwi INCY, INCY, ZBASE_SHIFT
  268. li PREA, PREFETCHSIZE_A * SIZE
  269. cmpwi cr0, M, 0
  270. ble- LL(999)
  271. cmpwi cr0, INCX, 2 * SIZE
  272. beq LL(05)
  273. mr XX, X
  274. mr X, BUFFER
  275. srawi. r0, M, 2
  276. mtspr CTR, r0
  277. ble LL(03)
  278. .align 4
  279. LL(01):
  280. LFD a1, 0 * SIZE(XX)
  281. LFD a2, 1 * SIZE(XX)
  282. add XX, XX, INCX
  283. LFD a3, 0 * SIZE(XX)
  284. LFD a4, 1 * SIZE(XX)
  285. add XX, XX, INCX
  286. LFD a5, 0 * SIZE(XX)
  287. LFD a6, 1 * SIZE(XX)
  288. add XX, XX, INCX
  289. LFD a7, 0 * SIZE(XX)
  290. LFD a8, 1 * SIZE(XX)
  291. add XX, XX, INCX
  292. dcbt XX, PREA
  293. dcbtst BUFFER, PREA
  294. STFD a1, 0 * SIZE(BUFFER)
  295. STFD a2, 1 * SIZE(BUFFER)
  296. STFD a3, 2 * SIZE(BUFFER)
  297. STFD a4, 3 * SIZE(BUFFER)
  298. STFD a5, 4 * SIZE(BUFFER)
  299. STFD a6, 5 * SIZE(BUFFER)
  300. STFD a7, 6 * SIZE(BUFFER)
  301. STFD a8, 7 * SIZE(BUFFER)
  302. addi BUFFER, BUFFER, 8 * SIZE
  303. bdnz LL(01)
  304. .align 4
  305. LL(03):
  306. andi. r0, M, 3
  307. mtspr CTR, r0
  308. ble LL(05)
  309. .align 4
  310. LL(04):
  311. LFD a1, 0 * SIZE(XX)
  312. LFD a2, 1 * SIZE(XX)
  313. add XX, XX, INCX
  314. STFD a1, 0 * SIZE(BUFFER)
  315. STFD a2, 1 * SIZE(BUFFER)
  316. addi BUFFER, BUFFER, 2 * SIZE
  317. bdnz LL(04)
  318. .align 4
  319. LL(05):
  320. mr NEW_Y, Y
  321. lfd f0, FZERO
  322. cmpwi cr0, INCY, 2 * SIZE
  323. beq LL(10)
  324. mr NEW_Y, BUFFER
  325. addi r0, M, 3
  326. srawi. r0, r0, 2
  327. mtspr CTR, r0
  328. .align 4
  329. LL(06):
  330. STFD f0, 0 * SIZE(BUFFER)
  331. STFD f0, 1 * SIZE(BUFFER)
  332. STFD f0, 2 * SIZE(BUFFER)
  333. STFD f0, 3 * SIZE(BUFFER)
  334. STFD f0, 4 * SIZE(BUFFER)
  335. STFD f0, 5 * SIZE(BUFFER)
  336. STFD f0, 6 * SIZE(BUFFER)
  337. STFD f0, 7 * SIZE(BUFFER)
  338. addi BUFFER, BUFFER, 8 * SIZE
  339. bdnz LL(06)
  340. .align 4
  341. LL(10):
  342. li IS, 0
  343. cmpwi cr0, N, 2
  344. blt LL(20)
  345. .align 4
  346. LL(11):
  347. mr AO1, A
  348. slwi TEMP, IS, ZBASE_SHIFT
  349. add AO2, A, LDA
  350. add XX, X, TEMP
  351. add A, AO2, LDA
  352. add YY, NEW_Y, TEMP
  353. addi A, A, 4 * SIZE
  354. NOP2
  355. LFD y05, ALPHA_R
  356. LFD y06, ALPHA_I
  357. LFD atemp1, 0 * SIZE(XX)
  358. LFD atemp2, 1 * SIZE(XX)
  359. LFD atemp3, 2 * SIZE(XX)
  360. LFD atemp4, 3 * SIZE(XX)
  361. LFD a1, 0 * SIZE(AO1)
  362. LFD a2, 1 * SIZE(AO1)
  363. LFD a3, 2 * SIZE(AO1)
  364. LFD a4, 3 * SIZE(AO1)
  365. LFD a7, 2 * SIZE(AO2)
  366. LFD a8, 3 * SIZE(AO2)
  367. FMUL xsum1, atemp1, a1
  368. addi AO2, AO2, 4 * SIZE
  369. FMUL xsum2, atemp2, a1
  370. LFD a1, 4 * SIZE(AO1)
  371. FMUL xsum3, atemp1, a3
  372. addi AO1, AO1, 4 * SIZE
  373. FMUL xsum4, atemp2, a3
  374. LFD a5, 0 * SIZE(AO2)
  375. #ifndef HEMV
  376. FNMSUB xsum1, atemp2, a2, xsum1
  377. #endif
  378. addi XX, XX, 4 * SIZE
  379. #ifndef HEMV
  380. FMADD xsum2, atemp1, a2, xsum2
  381. #endif
  382. LFD a2, 1 * SIZE(AO1)
  383. FNMSUB xsum3, atemp2, a4, xsum3
  384. addi YY, YY, 4 * SIZE
  385. FMADD xsum4, atemp1, a4, xsum4
  386. LFD a6, 1 * SIZE(AO2)
  387. FMADD xsum1, atemp3, a3, xsum1
  388. sub TEMP, M, IS
  389. FMADD xsum2, atemp4, a3, xsum2
  390. LFD a3, 2 * SIZE(AO1)
  391. FMADD xsum3, atemp3, a7, xsum3
  392. addi TEMP, TEMP, -2
  393. FMADD xsum4, atemp4, a7, xsum4
  394. LFD a7, 2 * SIZE(AO2)
  395. FMADD1 xsum1, atemp4, a4, xsum1
  396. srawi. r0, TEMP, 3
  397. FMADD2 xsum2, atemp3, a4, xsum2
  398. LFD a4, 3 * SIZE(AO1)
  399. #ifndef HEMV
  400. FMADD1 xsum3, atemp4, a8, xsum3
  401. #endif
  402. mtspr CTR, r0
  403. #ifndef HEMV
  404. FMADD2 xsum4, atemp3, a8, xsum4
  405. #endif
  406. LFD a8, 3 * SIZE(AO2)
  407. FMUL xtemp1, y05, atemp1
  408. LFD y01, 0 * SIZE(YY)
  409. FMUL xtemp2, y06, atemp1
  410. LFD y02, 1 * SIZE(YY)
  411. FMUL xtemp3, y05, atemp3
  412. LFD y03, 2 * SIZE(YY)
  413. FMUL xtemp4, y06, atemp3
  414. LFD y04, 3 * SIZE(YY)
  415. FNMSUB atemp1, y06, atemp2, xtemp1
  416. LFD xtemp1, 0 * SIZE(XX)
  417. FMADD atemp2, y05, atemp2, xtemp2
  418. LFD xtemp2, 1 * SIZE(XX)
  419. FNMSUB atemp3, y06, atemp4, xtemp3
  420. LFD xtemp3, 2 * SIZE(XX)
  421. FMADD atemp4, y05, atemp4, xtemp4
  422. LFD xtemp4, 3 * SIZE(XX)
  423. NOP1
  424. ble LL(15)
  425. FMADD xsum1, xtemp1, a1, xsum1
  426. DCBT(AO1, PREA)
  427. FMADD y01, atemp1, a1, y01
  428. NOP2
  429. FMADD xsum2, xtemp2, a1, xsum2
  430. NOP1
  431. FMADD y02, atemp2, a1, y02
  432. LFD a1, 4 * SIZE(AO1)
  433. FMADD xsum3, xtemp1, a5, xsum3
  434. NOP1
  435. FMADD y03, atemp1, a3, y03
  436. NOP2
  437. FMADD xsum4, xtemp2, a5, xsum4
  438. NOP1
  439. FMADD y04, atemp2, a3, y04
  440. NOP2
  441. FMADD1 xsum1, xtemp2, a2, xsum1
  442. LFD y05, 4 * SIZE(YY)
  443. FNMSUB y01, atemp2, a2, y01
  444. NOP2
  445. FMADD2 xsum2, xtemp1, a2, xsum2
  446. LFD y06, 5 * SIZE(YY)
  447. FMADD y02, atemp1, a2, y02
  448. LFD a2, 5 * SIZE(AO1)
  449. FMADD1 xsum3, xtemp2, a6, xsum3
  450. LFD xtemp2, 5 * SIZE(XX)
  451. FNMSUB y03, atemp2, a4, y03
  452. NOP2
  453. FMADD2 xsum4, xtemp1, a6, xsum4
  454. LFD xtemp1, 4 * SIZE(XX)
  455. FMADD y04, atemp1, a4, y04
  456. NOP2
  457. FMADD xsum1, xtemp3, a3, xsum1
  458. LFD y07, 6 * SIZE(YY)
  459. FMADD y01, atemp3, a5, y01
  460. NOP2
  461. FMADD xsum2, xtemp4, a3, xsum2
  462. LFD a3, 6 * SIZE(AO1)
  463. FMADD y02, atemp4, a5, y02
  464. LFD a5, 4 * SIZE(AO2)
  465. FMADD xsum3, xtemp3, a7, xsum3
  466. LFD y08, 7 * SIZE(YY)
  467. FMADD y03, atemp3, a7, y03
  468. NOP2
  469. FMADD xsum4, xtemp4, a7, xsum4
  470. NOP1
  471. FMADD y04, atemp4, a7, y04
  472. LFD a7, 6 * SIZE(AO2)
  473. FMADD1 xsum1, xtemp4, a4, xsum1
  474. NOP1
  475. FNMSUB y01, atemp4, a6, y01
  476. # DCBT(X, PREX)
  477. NOP2
  478. FMADD2 xsum2, xtemp3, a4, xsum2
  479. LFD a4, 7 * SIZE(AO1)
  480. FMADD y02, atemp3, a6, y02
  481. LFD a6, 5 * SIZE(AO2)
  482. FMADD1 xsum3, xtemp4, a8, xsum3
  483. LFD xtemp4, 7 * SIZE(XX)
  484. FNMSUB y03, atemp4, a8, y03
  485. NOP2
  486. FMADD2 xsum4, xtemp3, a8, xsum4
  487. LFD xtemp3, 6 * SIZE(XX)
  488. FMADD y04, atemp3, a8, y04
  489. LFD a8, 7 * SIZE(AO2)
  490. FMADD xsum1, xtemp1, a1, xsum1
  491. STFD y01, 0 * SIZE(YY)
  492. FMADD y05, atemp1, a1, y05
  493. NOP2
  494. FMADD xsum2, xtemp2, a1, xsum2
  495. STFD y02, 1 * SIZE(YY)
  496. FMADD y06, atemp2, a1, y06
  497. LFD a1, 8 * SIZE(AO1)
  498. FMADD xsum3, xtemp1, a5, xsum3
  499. STFD y03, 2 * SIZE(YY)
  500. FMADD y07, atemp1, a3, y07
  501. NOP2
  502. FMADD xsum4, xtemp2, a5, xsum4
  503. STFD y04, 3 * SIZE(YY)
  504. FMADD y08, atemp2, a3, y08
  505. NOP2
  506. FMADD1 xsum1, xtemp2, a2, xsum1
  507. LFD y01, 8 * SIZE(YY)
  508. FNMSUB y05, atemp2, a2, y05
  509. NOP2
  510. FMADD2 xsum2, xtemp1, a2, xsum2
  511. LFD y02, 9 * SIZE(YY)
  512. FMADD y06, atemp1, a2, y06
  513. LFD a2, 9 * SIZE(AO1)
  514. FMADD1 xsum3, xtemp2, a6, xsum3
  515. LFD xtemp2, 9 * SIZE(XX)
  516. FNMSUB y07, atemp2, a4, y07
  517. NOP2
  518. FMADD2 xsum4, xtemp1, a6, xsum4
  519. LFD xtemp1, 8 * SIZE(XX)
  520. FMADD y08, atemp1, a4, y08
  521. NOP2
  522. FMADD xsum1, xtemp3, a3, xsum1
  523. LFD y03, 10 * SIZE(YY)
  524. FMADD y05, atemp3, a5, y05
  525. NOP2
  526. FMADD xsum2, xtemp4, a3, xsum2
  527. LFD a3, 10 * SIZE(AO1)
  528. FMADD y06, atemp4, a5, y06
  529. LFD a5, 8 * SIZE(AO2)
  530. FMADD xsum3, xtemp3, a7, xsum3
  531. LFD y04, 11 * SIZE(YY)
  532. FMADD y07, atemp3, a7, y07
  533. NOP2
  534. FMADD xsum4, xtemp4, a7, xsum4
  535. NOP1
  536. FMADD y08, atemp4, a7, y08
  537. LFD a7, 10 * SIZE(AO2)
  538. FMADD1 xsum1, xtemp4, a4, xsum1
  539. NOP1
  540. FNMSUB y05, atemp4, a6, y05
  541. NOP2
  542. FMADD2 xsum2, xtemp3, a4, xsum2
  543. LFD a4, 11 * SIZE(AO1)
  544. FMADD y06, atemp3, a6, y06
  545. LFD a6, 9 * SIZE(AO2)
  546. FMADD1 xsum3, xtemp4, a8, xsum3
  547. LFD xtemp4, 11 * SIZE(XX)
  548. FNMSUB y07, atemp4, a8, y07
  549. bdz LL(13)
  550. .align 4
  551. LL(12):
  552. FMADD2 xsum4, xtemp3, a8, xsum4
  553. LFD xtemp3, 10 * SIZE(XX)
  554. FMADD y08, atemp3, a8, y08
  555. LFD a8, 11 * SIZE(AO2)
  556. FMADD xsum1, xtemp1, a1, xsum1
  557. STFD y05, 4 * SIZE(YY)
  558. FMADD y01, atemp1, a1, y01
  559. DCBT(AO2, PREA)
  560. FMADD xsum2, xtemp2, a1, xsum2
  561. STFD y06, 5 * SIZE(YY)
  562. FMADD y02, atemp2, a1, y02
  563. LFD a1, 12 * SIZE(AO1)
  564. FMADD xsum3, xtemp1, a5, xsum3
  565. STFD y07, 6 * SIZE(YY)
  566. FMADD y03, atemp1, a3, y03
  567. NOP2
  568. FMADD xsum4, xtemp2, a5, xsum4
  569. STFD y08, 7 * SIZE(YY)
  570. FMADD y04, atemp2, a3, y04
  571. NOP2
  572. FMADD1 xsum1, xtemp2, a2, xsum1
  573. LFD y05, 12 * SIZE(YY)
  574. FNMSUB y01, atemp2, a2, y01
  575. NOP2
  576. FMADD2 xsum2, xtemp1, a2, xsum2
  577. LFD y06, 13 * SIZE(YY)
  578. FMADD y02, atemp1, a2, y02
  579. LFD a2, 13 * SIZE(AO1)
  580. FMADD1 xsum3, xtemp2, a6, xsum3
  581. LFD xtemp2, 13 * SIZE(XX)
  582. FNMSUB y03, atemp2, a4, y03
  583. NOP2
  584. FMADD2 xsum4, xtemp1, a6, xsum4
  585. LFD xtemp1, 12 * SIZE(XX)
  586. FMADD y04, atemp1, a4, y04
  587. NOP2
  588. FMADD xsum1, xtemp3, a3, xsum1
  589. LFD y07, 14 * SIZE(YY)
  590. FMADD y01, atemp3, a5, y01
  591. NOP2
  592. FMADD xsum2, xtemp4, a3, xsum2
  593. LFD a3, 14 * SIZE(AO1)
  594. FMADD y02, atemp4, a5, y02
  595. LFD a5, 12 * SIZE(AO2)
  596. FMADD xsum3, xtemp3, a7, xsum3
  597. LFD y08, 15 * SIZE(YY)
  598. FMADD y03, atemp3, a7, y03
  599. NOP2
  600. FMADD xsum4, xtemp4, a7, xsum4
  601. NOP1
  602. FMADD y04, atemp4, a7, y04
  603. LFD a7, 14 * SIZE(AO2)
  604. FMADD1 xsum1, xtemp4, a4, xsum1
  605. NOP1
  606. FNMSUB y01, atemp4, a6, y01
  607. # DCBT(Y1, PREY)
  608. NOP2
  609. FMADD2 xsum2, xtemp3, a4, xsum2
  610. LFD a4, 15 * SIZE(AO1)
  611. FMADD y02, atemp3, a6, y02
  612. LFD a6, 13 * SIZE(AO2)
  613. FMADD1 xsum3, xtemp4, a8, xsum3
  614. LFD xtemp4, 15 * SIZE(XX)
  615. FNMSUB y03, atemp4, a8, y03
  616. NOP2
  617. FMADD2 xsum4, xtemp3, a8, xsum4
  618. LFD xtemp3, 14 * SIZE(XX)
  619. FMADD y04, atemp3, a8, y04
  620. LFD a8, 15 * SIZE(AO2)
  621. FMADD xsum1, xtemp1, a1, xsum1
  622. STFD y01, 8 * SIZE(YY)
  623. FMADD y05, atemp1, a1, y05
  624. NOP2
  625. FMADD xsum2, xtemp2, a1, xsum2
  626. STFD y02, 9 * SIZE(YY)
  627. FMADD y06, atemp2, a1, y06
  628. LFD a1, 16 * SIZE(AO1)
  629. FMADD xsum3, xtemp1, a5, xsum3
  630. STFD y03, 10 * SIZE(YY)
  631. FMADD y07, atemp1, a3, y07
  632. NOP2
  633. FMADD xsum4, xtemp2, a5, xsum4
  634. STFD y04, 11 * SIZE(YY)
  635. FMADD y08, atemp2, a3, y08
  636. NOP2
  637. FMADD1 xsum1, xtemp2, a2, xsum1
  638. LFD y01, 16 * SIZE(YY)
  639. FNMSUB y05, atemp2, a2, y05
  640. NOP2
  641. FMADD2 xsum2, xtemp1, a2, xsum2
  642. LFD y02, 17 * SIZE(YY)
  643. FMADD y06, atemp1, a2, y06
  644. LFD a2, 17 * SIZE(AO1)
  645. FMADD1 xsum3, xtemp2, a6, xsum3
  646. LFD xtemp2, 17 * SIZE(XX)
  647. FNMSUB y07, atemp2, a4, y07
  648. NOP2
  649. FMADD2 xsum4, xtemp1, a6, xsum4
  650. LFD xtemp1, 16 * SIZE(XX)
  651. FMADD y08, atemp1, a4, y08
  652. addi AO2, AO2, 16 * SIZE
  653. FMADD xsum1, xtemp3, a3, xsum1
  654. LFD y03, 18 * SIZE(YY)
  655. FMADD y05, atemp3, a5, y05
  656. addi XX, XX, 16 * SIZE
  657. FMADD xsum2, xtemp4, a3, xsum2
  658. LFD a3, 18 * SIZE(AO1)
  659. FMADD y06, atemp4, a5, y06
  660. LFD a5, 0 * SIZE(AO2)
  661. FMADD xsum3, xtemp3, a7, xsum3
  662. LFD y04, 19 * SIZE(YY)
  663. FMADD y07, atemp3, a7, y07
  664. NOP2
  665. FMADD xsum4, xtemp4, a7, xsum4
  666. addi AO1, AO1, 16 * SIZE
  667. FMADD y08, atemp4, a7, y08
  668. LFD a7, 2 * SIZE(AO2)
  669. FMADD1 xsum1, xtemp4, a4, xsum1
  670. addi YY, YY, 16 * SIZE
  671. FNMSUB y05, atemp4, a6, y05
  672. NOP2
  673. FMADD2 xsum2, xtemp3, a4, xsum2
  674. LFD a4, 3 * SIZE(AO1)
  675. FMADD y06, atemp3, a6, y06
  676. LFD a6, 1 * SIZE(AO2)
  677. FMADD1 xsum3, xtemp4, a8, xsum3
  678. LFD xtemp4, 3 * SIZE(XX)
  679. FNMSUB y07, atemp4, a8, y07
  680. NOP2
  681. FMADD2 xsum4, xtemp3, a8, xsum4
  682. LFD xtemp3, 2 * SIZE(XX)
  683. FMADD y08, atemp3, a8, y08
  684. LFD a8, 3 * SIZE(AO2)
  685. FMADD xsum1, xtemp1, a1, xsum1
  686. STFD y05, -4 * SIZE(YY)
  687. FMADD y01, atemp1, a1, y01
  688. DCBT(AO1, PREA)
  689. FMADD xsum2, xtemp2, a1, xsum2
  690. STFD y06, -3 * SIZE(YY)
  691. FMADD y02, atemp2, a1, y02
  692. LFD a1, 4 * SIZE(AO1)
  693. FMADD xsum3, xtemp1, a5, xsum3
  694. STFD y07, -2 * SIZE(YY)
  695. FMADD y03, atemp1, a3, y03
  696. NOP2
  697. FMADD xsum4, xtemp2, a5, xsum4
  698. STFD y08, -1 * SIZE(YY)
  699. FMADD y04, atemp2, a3, y04
  700. NOP2
  701. FMADD1 xsum1, xtemp2, a2, xsum1
  702. LFD y05, 4 * SIZE(YY)
  703. FNMSUB y01, atemp2, a2, y01
  704. NOP2
  705. FMADD2 xsum2, xtemp1, a2, xsum2
  706. LFD y06, 5 * SIZE(YY)
  707. FMADD y02, atemp1, a2, y02
  708. LFD a2, 5 * SIZE(AO1)
  709. FMADD1 xsum3, xtemp2, a6, xsum3
  710. LFD xtemp2, 5 * SIZE(XX)
  711. FNMSUB y03, atemp2, a4, y03
  712. NOP2
  713. FMADD2 xsum4, xtemp1, a6, xsum4
  714. LFD xtemp1, 4 * SIZE(XX)
  715. FMADD y04, atemp1, a4, y04
  716. NOP2
  717. FMADD xsum1, xtemp3, a3, xsum1
  718. LFD y07, 6 * SIZE(YY)
  719. FMADD y01, atemp3, a5, y01
  720. NOP2
  721. FMADD xsum2, xtemp4, a3, xsum2
  722. LFD a3, 6 * SIZE(AO1)
  723. FMADD y02, atemp4, a5, y02
  724. LFD a5, 4 * SIZE(AO2)
  725. FMADD xsum3, xtemp3, a7, xsum3
  726. LFD y08, 7 * SIZE(YY)
  727. FMADD y03, atemp3, a7, y03
  728. NOP2
  729. FMADD xsum4, xtemp4, a7, xsum4
  730. NOP1
  731. FMADD y04, atemp4, a7, y04
  732. LFD a7, 6 * SIZE(AO2)
  733. FMADD1 xsum1, xtemp4, a4, xsum1
  734. NOP1
  735. FNMSUB y01, atemp4, a6, y01
  736. # DCBT(X, PREX)
  737. NOP2
  738. FMADD2 xsum2, xtemp3, a4, xsum2
  739. LFD a4, 7 * SIZE(AO1)
  740. FMADD y02, atemp3, a6, y02
  741. LFD a6, 5 * SIZE(AO2)
  742. FMADD1 xsum3, xtemp4, a8, xsum3
  743. LFD xtemp4, 7 * SIZE(XX)
  744. FNMSUB y03, atemp4, a8, y03
  745. NOP2
  746. FMADD2 xsum4, xtemp3, a8, xsum4
  747. LFD xtemp3, 6 * SIZE(XX)
  748. FMADD y04, atemp3, a8, y04
  749. LFD a8, 7 * SIZE(AO2)
  750. FMADD xsum1, xtemp1, a1, xsum1
  751. STFD y01, 0 * SIZE(YY)
  752. FMADD y05, atemp1, a1, y05
  753. NOP2
  754. FMADD xsum2, xtemp2, a1, xsum2
  755. STFD y02, 1 * SIZE(YY)
  756. FMADD y06, atemp2, a1, y06
  757. LFD a1, 8 * SIZE(AO1)
  758. FMADD xsum3, xtemp1, a5, xsum3
  759. STFD y03, 2 * SIZE(YY)
  760. FMADD y07, atemp1, a3, y07
  761. NOP2
  762. FMADD xsum4, xtemp2, a5, xsum4
  763. STFD y04, 3 * SIZE(YY)
  764. FMADD y08, atemp2, a3, y08
  765. NOP2
  766. FMADD1 xsum1, xtemp2, a2, xsum1
  767. LFD y01, 8 * SIZE(YY)
  768. FNMSUB y05, atemp2, a2, y05
  769. NOP2
  770. FMADD2 xsum2, xtemp1, a2, xsum2
  771. LFD y02, 9 * SIZE(YY)
  772. FMADD y06, atemp1, a2, y06
  773. LFD a2, 9 * SIZE(AO1)
  774. FMADD1 xsum3, xtemp2, a6, xsum3
  775. LFD xtemp2, 9 * SIZE(XX)
  776. FNMSUB y07, atemp2, a4, y07
  777. NOP2
  778. FMADD2 xsum4, xtemp1, a6, xsum4
  779. LFD xtemp1, 8 * SIZE(XX)
  780. FMADD y08, atemp1, a4, y08
  781. NOP2
  782. FMADD xsum1, xtemp3, a3, xsum1
  783. LFD y03, 10 * SIZE(YY)
  784. FMADD y05, atemp3, a5, y05
  785. NOP2
  786. FMADD xsum2, xtemp4, a3, xsum2
  787. LFD a3, 10 * SIZE(AO1)
  788. FMADD y06, atemp4, a5, y06
  789. LFD a5, 8 * SIZE(AO2)
  790. FMADD xsum3, xtemp3, a7, xsum3
  791. LFD y04, 11 * SIZE(YY)
  792. FMADD y07, atemp3, a7, y07
  793. NOP2
  794. FMADD xsum4, xtemp4, a7, xsum4
  795. NOP1
  796. FMADD y08, atemp4, a7, y08
  797. LFD a7, 10 * SIZE(AO2)
  798. FMADD1 xsum1, xtemp4, a4, xsum1
  799. NOP1
  800. FNMSUB y05, atemp4, a6, y05
  801. NOP2
  802. FMADD2 xsum2, xtemp3, a4, xsum2
  803. LFD a4, 11 * SIZE(AO1)
  804. FMADD y06, atemp3, a6, y06
  805. LFD a6, 9 * SIZE(AO2)
  806. FMADD1 xsum3, xtemp4, a8, xsum3
  807. LFD xtemp4, 11 * SIZE(XX)
  808. FNMSUB y07, atemp4, a8, y07
  809. bdnz LL(12)
  810. .align 4
  811. LL(13):
  812. FMADD2 xsum4, xtemp3, a8, xsum4
  813. LFD xtemp3, 10 * SIZE(XX)
  814. FMADD y08, atemp3, a8, y08
  815. LFD a8, 11 * SIZE(AO2)
  816. FMADD xsum1, xtemp1, a1, xsum1
  817. STFD y05, 4 * SIZE(YY)
  818. FMADD y01, atemp1, a1, y01
  819. NOP2
  820. FMADD xsum2, xtemp2, a1, xsum2
  821. STFD y06, 5 * SIZE(YY)
  822. FMADD y02, atemp2, a1, y02
  823. LFD a1, 12 * SIZE(AO1)
  824. FMADD xsum3, xtemp1, a5, xsum3
  825. STFD y07, 6 * SIZE(YY)
  826. FMADD y03, atemp1, a3, y03
  827. NOP2
  828. FMADD xsum4, xtemp2, a5, xsum4
  829. STFD y08, 7 * SIZE(YY)
  830. FMADD y04, atemp2, a3, y04
  831. NOP2
  832. FMADD1 xsum1, xtemp2, a2, xsum1
  833. LFD y05, 12 * SIZE(YY)
  834. FNMSUB y01, atemp2, a2, y01
  835. NOP2
  836. FMADD2 xsum2, xtemp1, a2, xsum2
  837. LFD y06, 13 * SIZE(YY)
  838. FMADD y02, atemp1, a2, y02
  839. LFD a2, 13 * SIZE(AO1)
  840. FMADD1 xsum3, xtemp2, a6, xsum3
  841. LFD xtemp2, 13 * SIZE(XX)
  842. FNMSUB y03, atemp2, a4, y03
  843. NOP2
  844. FMADD2 xsum4, xtemp1, a6, xsum4
  845. LFD xtemp1, 12 * SIZE(XX)
  846. FMADD y04, atemp1, a4, y04
  847. NOP2
  848. FMADD xsum1, xtemp3, a3, xsum1
  849. LFD y07, 14 * SIZE(YY)
  850. FMADD y01, atemp3, a5, y01
  851. NOP2
  852. FMADD xsum2, xtemp4, a3, xsum2
  853. LFD a3, 14 * SIZE(AO1)
  854. FMADD y02, atemp4, a5, y02
  855. LFD a5, 12 * SIZE(AO2)
  856. FMADD xsum3, xtemp3, a7, xsum3
  857. LFD y08, 15 * SIZE(YY)
  858. FMADD y03, atemp3, a7, y03
  859. NOP2
  860. FMADD xsum4, xtemp4, a7, xsum4
  861. NOP1
  862. FMADD y04, atemp4, a7, y04
  863. LFD a7, 14 * SIZE(AO2)
  864. FMADD1 xsum1, xtemp4, a4, xsum1
  865. NOP1
  866. FNMSUB y01, atemp4, a6, y01
  867. NOP2
  868. FMADD2 xsum2, xtemp3, a4, xsum2
  869. LFD a4, 15 * SIZE(AO1)
  870. FMADD y02, atemp3, a6, y02
  871. LFD a6, 13 * SIZE(AO2)
  872. FMADD1 xsum3, xtemp4, a8, xsum3
  873. LFD xtemp4, 15 * SIZE(XX)
  874. FNMSUB y03, atemp4, a8, y03
  875. NOP2
  876. FMADD2 xsum4, xtemp3, a8, xsum4
  877. LFD xtemp3, 14 * SIZE(XX)
  878. FMADD y04, atemp3, a8, y04
  879. LFD a8, 15 * SIZE(AO2)
  880. FMADD xsum1, xtemp1, a1, xsum1
  881. STFD y01, 8 * SIZE(YY)
  882. FMADD y05, atemp1, a1, y05
  883. NOP2
  884. FMADD xsum2, xtemp2, a1, xsum2
  885. STFD y02, 9 * SIZE(YY)
  886. FMADD y06, atemp2, a1, y06
  887. LFD a1, 16 * SIZE(AO1)
  888. FMADD xsum3, xtemp1, a5, xsum3
  889. STFD y03, 10 * SIZE(YY)
  890. FMADD y07, atemp1, a3, y07
  891. NOP2
  892. FMADD xsum4, xtemp2, a5, xsum4
  893. STFD y04, 11 * SIZE(YY)
  894. FMADD y08, atemp2, a3, y08
  895. NOP2
  896. FMADD1 xsum1, xtemp2, a2, xsum1
  897. LFD y01, 16 * SIZE(YY)
  898. FNMSUB y05, atemp2, a2, y05
  899. NOP2
  900. FMADD2 xsum2, xtemp1, a2, xsum2
  901. LFD y02, 17 * SIZE(YY)
  902. FMADD y06, atemp1, a2, y06
  903. LFD a2, 17 * SIZE(AO1)
  904. FMADD1 xsum3, xtemp2, a6, xsum3
  905. LFD xtemp2, 17 * SIZE(XX)
  906. FNMSUB y07, atemp2, a4, y07
  907. NOP2
  908. FMADD2 xsum4, xtemp1, a6, xsum4
  909. LFD xtemp1, 16 * SIZE(XX)
  910. FMADD y08, atemp1, a4, y08
  911. addi AO2, AO2, 16 * SIZE
  912. FMADD xsum1, xtemp3, a3, xsum1
  913. LFD y03, 18 * SIZE(YY)
  914. FMADD y05, atemp3, a5, y05
  915. addi XX, XX, 16 * SIZE
  916. FMADD xsum2, xtemp4, a3, xsum2
  917. LFD a3, 18 * SIZE(AO1)
  918. FMADD y06, atemp4, a5, y06
  919. LFD a5, 0 * SIZE(AO2)
  920. FMADD xsum3, xtemp3, a7, xsum3
  921. LFD y04, 19 * SIZE(YY)
  922. FMADD y07, atemp3, a7, y07
  923. NOP2
  924. FMADD xsum4, xtemp4, a7, xsum4
  925. addi AO1, AO1, 16 * SIZE
  926. FMADD y08, atemp4, a7, y08
  927. LFD a7, 2 * SIZE(AO2)
  928. FMADD1 xsum1, xtemp4, a4, xsum1
  929. addi YY, YY, 16 * SIZE
  930. FNMSUB y05, atemp4, a6, y05
  931. NOP2
  932. FMADD2 xsum2, xtemp3, a4, xsum2
  933. LFD a4, 3 * SIZE(AO1)
  934. FMADD y06, atemp3, a6, y06
  935. LFD a6, 1 * SIZE(AO2)
  936. FMADD1 xsum3, xtemp4, a8, xsum3
  937. LFD xtemp4, 3 * SIZE(XX)
  938. FNMSUB y07, atemp4, a8, y07
  939. NOP2
  940. FMADD2 xsum4, xtemp3, a8, xsum4
  941. LFD xtemp3, 2 * SIZE(XX)
  942. FMADD y08, atemp3, a8, y08
  943. LFD a8, 3 * SIZE(AO2)
  944. STFD y05, -4 * SIZE(YY)
  945. STFD y06, -3 * SIZE(YY)
  946. STFD y07, -2 * SIZE(YY)
  947. STFD y08, -1 * SIZE(YY)
  948. .align 4
  /*
   * LL(15): remainder step, executed only when bit 2 of TEMP is set
   * (andi. yields non-zero); otherwise control goes straight to LL(16).
   *
   * Consumes the operands already held in a1..a8, xtemp1..xtemp4 and
   * y01..y04, accumulates into xsum1..xsum4 and y01..y08, writes the
   * eight updated y values back through YY, and advances AO1, AO2, XX
   * and YY by 8 * SIZE each.  On exit a1..a8, xtemp1..xtemp4 and
   * y01..y04 have been reloaded relative to the advanced pointers for
   * the code that follows.
   *
   * FMADD1 / FMADD2 / NOP1 / NOP2 are macros defined outside this
   * section; their exact expansion is not visible here.
   */
  949. LL(15):
  950. andi. r0, TEMP, 4
  951. ble LL(16)
  /* First half: update y01..y04 (YY[0..3]); y05..y08 are loaded on the way. */
  952. FMADD xsum1, xtemp1, a1, xsum1
  953. NOP1
  954. FMADD y01, atemp1, a1, y01
  955. NOP2
  956. FMADD xsum2, xtemp2, a1, xsum2
  957. NOP1
  958. FMADD y02, atemp2, a1, y02
  959. LFD a1, 4 * SIZE(AO1)
  960. FMADD xsum3, xtemp1, a5, xsum3
  961. NOP1
  962. FMADD y03, atemp1, a3, y03
  963. NOP2
  964. FMADD xsum4, xtemp2, a5, xsum4
  965. NOP1
  966. FMADD y04, atemp2, a3, y04
  967. NOP2
  968. FMADD1 xsum1, xtemp2, a2, xsum1
  969. LFD y05, 4 * SIZE(YY)
  970. FNMSUB y01, atemp2, a2, y01
  971. NOP2
  972. FMADD2 xsum2, xtemp1, a2, xsum2
  973. LFD y06, 5 * SIZE(YY)
  974. FMADD y02, atemp1, a2, y02
  975. LFD a2, 5 * SIZE(AO1)
  976. FMADD1 xsum3, xtemp2, a6, xsum3
  977. LFD xtemp2, 5 * SIZE(XX)
  978. FNMSUB y03, atemp2, a4, y03
  979. NOP2
  980. FMADD2 xsum4, xtemp1, a6, xsum4
  981. LFD xtemp1, 4 * SIZE(XX)
  982. FMADD y04, atemp1, a4, y04
  983. NOP2
  984. FMADD xsum1, xtemp3, a3, xsum1
  985. LFD y07, 6 * SIZE(YY)
  986. FMADD y01, atemp3, a5, y01
  987. NOP2
  988. FMADD xsum2, xtemp4, a3, xsum2
  989. LFD a3, 6 * SIZE(AO1)
  990. FMADD y02, atemp4, a5, y02
  991. LFD a5, 4 * SIZE(AO2)
  992. FMADD xsum3, xtemp3, a7, xsum3
  993. LFD y08, 7 * SIZE(YY)
  994. FMADD y03, atemp3, a7, y03
  995. NOP2
  996. FMADD xsum4, xtemp4, a7, xsum4
  997. NOP1
  998. FMADD y04, atemp4, a7, y04
  999. LFD a7, 6 * SIZE(AO2)
  1000. FMADD1 xsum1, xtemp4, a4, xsum1
  1001. NOP1
  1002. FNMSUB y01, atemp4, a6, y01
  1003. NOP2
  1004. FMADD2 xsum2, xtemp3, a4, xsum2
  1005. LFD a4, 7 * SIZE(AO1)
  1006. FMADD y02, atemp3, a6, y02
  1007. LFD a6, 5 * SIZE(AO2)
  1008. FMADD1 xsum3, xtemp4, a8, xsum3
  1009. LFD xtemp4, 7 * SIZE(XX)
  1010. FNMSUB y03, atemp4, a8, y03
  1011. NOP2
  1012. FMADD2 xsum4, xtemp3, a8, xsum4
  1013. LFD xtemp3, 6 * SIZE(XX)
  1014. FMADD y04, atemp3, a8, y04
  1015. LFD a8, 7 * SIZE(AO2)
  /* Second half: update y05..y08 while y01..y04 are stored to YY[0..3]
     and then reloaded from YY[8..11] for the next step. */
  1016. FMADD xsum1, xtemp1, a1, xsum1
  1017. STFD y01, 0 * SIZE(YY)
  1018. FMADD y05, atemp1, a1, y05
  1019. NOP2
  1020. FMADD xsum2, xtemp2, a1, xsum2
  1021. STFD y02, 1 * SIZE(YY)
  1022. FMADD y06, atemp2, a1, y06
  1023. LFD a1, 8 * SIZE(AO1)
  1024. FMADD xsum3, xtemp1, a5, xsum3
  1025. STFD y03, 2 * SIZE(YY)
  1026. FMADD y07, atemp1, a3, y07
  1027. NOP2
  1028. FMADD xsum4, xtemp2, a5, xsum4
  1029. STFD y04, 3 * SIZE(YY)
  1030. FMADD y08, atemp2, a3, y08
  1031. NOP2
  1032. FMADD1 xsum1, xtemp2, a2, xsum1
  1033. LFD y01, 8 * SIZE(YY)
  1034. FNMSUB y05, atemp2, a2, y05
  1035. NOP2
  1036. FMADD2 xsum2, xtemp1, a2, xsum2
  1037. LFD y02, 9 * SIZE(YY)
  1038. FMADD y06, atemp1, a2, y06
  1039. LFD a2, 9 * SIZE(AO1)
  1040. FMADD1 xsum3, xtemp2, a6, xsum3
  1041. LFD xtemp2, 9 * SIZE(XX)
  1042. FNMSUB y07, atemp2, a4, y07
  1043. NOP2
  1044. FMADD2 xsum4, xtemp1, a6, xsum4
  1045. LFD xtemp1, 8 * SIZE(XX)
  1046. FMADD y08, atemp1, a4, y08
  1047. NOP2
  1048. FMADD xsum1, xtemp3, a3, xsum1
  1049. LFD y03, 10 * SIZE(YY)
  1050. FMADD y05, atemp3, a5, y05
  1051. NOP2
  1052. FMADD xsum2, xtemp4, a3, xsum2
  1053. LFD a3, 10 * SIZE(AO1)
  1054. FMADD y06, atemp4, a5, y06
  1055. LFD a5, 8 * SIZE(AO2)
  1056. FMADD xsum3, xtemp3, a7, xsum3
  1057. LFD y04, 11 * SIZE(YY)
  1058. FMADD y07, atemp3, a7, y07
  1059. NOP2
  1060. FMADD xsum4, xtemp4, a7, xsum4
  /* Pointer bumps are interleaved with the arithmetic; loads issued after
     a bump use offsets relative to the already-advanced pointer. */
  1061. addi YY, YY, 8 * SIZE
  1062. FMADD y08, atemp4, a7, y08
  1063. LFD a7, 10 * SIZE(AO2)
  1064. FMADD1 xsum1, xtemp4, a4, xsum1
  1065. addi AO2, AO2, 8 * SIZE
  1066. FNMSUB y05, atemp4, a6, y05
  1067. addi XX, XX, 8 * SIZE
  1068. FMADD2 xsum2, xtemp3, a4, xsum2
  1069. LFD a4, 11 * SIZE(AO1)
  1070. FMADD y06, atemp3, a6, y06
  1071. LFD a6, 1 * SIZE(AO2)
  1072. FMADD1 xsum3, xtemp4, a8, xsum3
  1073. LFD xtemp4, 3 * SIZE(XX)
  1074. FNMSUB y07, atemp4, a8, y07
  1075. addi AO1, AO1, 8 * SIZE
  1076. FMADD2 xsum4, xtemp3, a8, xsum4
  1077. LFD xtemp3, 2 * SIZE(XX)
  1078. FMADD y08, atemp3, a8, y08
  1079. LFD a8, 3 * SIZE(AO2)
  /* YY was already advanced by 8 * SIZE, so -4..-1 are the old slots 4..7. */
  1080. STFD y05, -4 * SIZE(YY)
  1081. STFD y06, -3 * SIZE(YY)
  1082. STFD y07, -2 * SIZE(YY)
  1083. STFD y08, -1 * SIZE(YY)
  1084. .align 4
  /*
   * LL(16): remainder step, executed only when bit 1 of TEMP is set;
   * otherwise control goes straight to LL(17).
   *
   * Accumulates into xsum1..xsum4 and y01..y04 from the operands already
   * in registers, stores y01..y04 to the four slots just below the
   * advanced YY, and advances AO1, AO2 and YY by 4 * SIZE.  XX is not
   * advanced in this block.
   *
   * Only a1, a2, a5, a6, xtemp1, xtemp2, y01 and y02 are reloaded here;
   * those are exactly the loaded operands LL(17) reads.  y03 / y04 keep
   * the values that were just stored.
   */
  1085. LL(16):
  1086. andi. r0, TEMP, 2
  1087. ble LL(17)
  1088. FMADD xsum1, xtemp1, a1, xsum1
  1089. NOP1
  1090. FMADD y01, atemp1, a1, y01
  1091. NOP2
  1092. FMADD xsum2, xtemp2, a1, xsum2
  1093. NOP1
  1094. FMADD y02, atemp2, a1, y02
  1095. LFD a1, 4 * SIZE(AO1)
  1096. FMADD xsum3, xtemp1, a5, xsum3
  1097. FMADD y03, atemp1, a3, y03
  1098. FMADD xsum4, xtemp2, a5, xsum4
  1099. FMADD y04, atemp2, a3, y04
  1100. FMADD1 xsum1, xtemp2, a2, xsum1
  1101. NOP1
  1102. FNMSUB y01, atemp2, a2, y01
  1103. NOP2
  1104. FMADD2 xsum2, xtemp1, a2, xsum2
  1105. NOP1
  1106. FMADD y02, atemp1, a2, y02
  1107. LFD a2, 5 * SIZE(AO1)
  1108. FMADD1 xsum3, xtemp2, a6, xsum3
  1109. LFD xtemp2, 5 * SIZE(XX)
  1110. FNMSUB y03, atemp2, a4, y03
  1111. NOP2
  1112. FMADD2 xsum4, xtemp1, a6, xsum4
  1113. LFD xtemp1, 4 * SIZE(XX)
  1114. FMADD y04, atemp1, a4, y04
  1115. NOP2
  1116. FMADD xsum1, xtemp3, a3, xsum1
  1117. NOP1
  1118. FMADD y01, atemp3, a5, y01
  1119. NOP2
  1120. FMADD xsum2, xtemp4, a3, xsum2
  1121. NOP1
  1122. FMADD y02, atemp4, a5, y02
  1123. LFD a5, 4 * SIZE(AO2)
  1124. FMADD xsum3, xtemp3, a7, xsum3
  1125. FMADD y03, atemp3, a7, y03
  1126. FMADD xsum4, xtemp4, a7, xsum4
  1127. FMADD y04, atemp4, a7, y04
  1128. FMADD1 xsum1, xtemp4, a4, xsum1
  1129. NOP1
  1130. FNMSUB y01, atemp4, a6, y01
  1131. NOP2
  1132. FMADD2 xsum2, xtemp3, a4, xsum2
  1133. NOP1
  1134. FMADD y02, atemp3, a6, y02
  1135. LFD a6, 5 * SIZE(AO2)
  1136. FMADD1 xsum3, xtemp4, a8, xsum3
  1137. addi AO1, AO1, 4 * SIZE
  1138. FNMSUB y03, atemp4, a8, y03
  1139. addi AO2, AO2, 4 * SIZE
  1140. FMADD2 xsum4, xtemp3, a8, xsum4
  1141. addi YY, YY, 4 * SIZE
  1142. FMADD y04, atemp3, a8, y04
  1143. NOP2
  /* Write back the four results, interleaved with the reload of y01/y02
     from the new YY position. */
  1144. STFD y01, -4 * SIZE(YY)
  1145. LFD y01, 0 * SIZE(YY)
  1146. STFD y02, -3 * SIZE(YY)
  1147. LFD y02, 1 * SIZE(YY)
  1148. STFD y03, -2 * SIZE(YY)
  1149. STFD y04, -1 * SIZE(YY)
  1150. .align 4
  /*
   * LL(17): final single-element remainder, executed only when M is odd;
   * otherwise control goes straight to LL(18).
   *
   * Note that the test is on M, whereas LL(15) / LL(16) test TEMP.
   * Presumably TEMP and M have the same parity at this point; TEMP's
   * definition is outside this section -- TODO confirm.
   *
   * Accumulates into xsum1..xsum4 and y01 / y02 using a1, a2, a5, a6,
   * xtemp1, xtemp2 and atemp1..atemp4, then stores y01..y04 to YY[0..3].
   * No pointer is advanced.
   */
  1151. LL(17):
  1152. andi. r0, M, 1
  1153. ble LL(18)
  1154. FMADD xsum1, xtemp1, a1, xsum1
  1155. FMADD y01, atemp1, a1, y01
  1156. FMADD xsum2, xtemp2, a1, xsum2
  1157. FMADD y02, atemp2, a1, y02
  1158. FMADD xsum3, xtemp1, a5, xsum3
  1159. FNMSUB y01, atemp2, a2, y01
  1160. FMADD xsum4, xtemp2, a5, xsum4
  1161. FMADD y02, atemp1, a2, y02
  1162. FMADD1 xsum1, xtemp2, a2, xsum1
  1163. FMADD y01, atemp3, a5, y01
  1164. FMADD2 xsum2, xtemp1, a2, xsum2
  1165. FMADD y02, atemp4, a5, y02
  1166. FMADD1 xsum3, xtemp2, a6, xsum3
  1167. FNMSUB y01, atemp4, a6, y01
  1168. FMADD2 xsum4, xtemp1, a6, xsum4
  1169. FMADD y02, atemp3, a6, y02
  1170. STFD y01, 0 * SIZE(YY)
  1171. STFD y02, 1 * SIZE(YY)
  /*
   * NOTE(review): y03 / y04 are not modified anywhere in this block.
   * When LL(16) ran just before, it reloaded only y01 / y02, so y03 / y04
   * still hold the values LL(16) stored at the previous position and are
   * written here to YY[2..3], one slot past the element updated above.
   * Whether that slot is guaranteed to be scratch space depends on how
   * NEW_Y is set up outside this section -- verify.
   */
  1172. STFD y03, 2 * SIZE(YY)
  1173. STFD y04, 3 * SIZE(YY)
  1174. .align 4
  /*
   * LL(18): fold the accumulated sums into the result vector and close
   * the two-column outer iteration.
   *
   * YY is repositioned to NEW_Y + (IS << ZBASE_SHIFT) and y01..y04 are
   * reloaded from there.  (xsum1, xsum2) and (xsum3, xsum4) are each
   * scaled by (ALPHA_R, ALPHA_I) as a real/imaginary pair and added to
   * (y01, y02) and (y03, y04) respectively, then stored back.
   * The scaling assumes FNMSUB d, a, c, b computes -(a * c - b), as the
   * PowerPC fnmsub instruction does; the macro itself is defined outside
   * this section.
   *
   * IS is advanced by 2.  TEMP holds the old IS + 4, and the loop
   * branches back to LL(11) (outside this section) while TEMP <= N.
   */
  1175. LL(18):
  1176. LFD y05, ALPHA_R
  1177. LFD y06, ALPHA_I
  1178. slwi TEMP, IS, ZBASE_SHIFT
  1179. add YY, NEW_Y, TEMP
  1180. LFD y01, 0 * SIZE(YY)
  1181. LFD y02, 1 * SIZE(YY)
  1182. LFD y03, 2 * SIZE(YY)
  1183. LFD y04, 3 * SIZE(YY)
  /* xtemp1..xtemp4 serve as scratch for the alpha * xsum products. */
  1184. FMUL xtemp1, y05, xsum1
  1185. FMUL xtemp2, y06, xsum1
  1186. FMUL xtemp3, y05, xsum3
  1187. FMUL xtemp4, y06, xsum3
  1188. FNMSUB xsum1, y06, xsum2, xtemp1
  1189. FMADD xsum2, y05, xsum2, xtemp2
  1190. FNMSUB xsum3, y06, xsum4, xtemp3
  1191. FMADD xsum4, y05, xsum4, xtemp4
  1192. FADD y01, y01, xsum1
  1193. FADD y02, y02, xsum2
  1194. FADD y03, y03, xsum3
  1195. FADD y04, y04, xsum4
  /* Stores are interleaved with the loop-control arithmetic. */
  1196. STFD y01, 0 * SIZE(YY)
  1197. addi TEMP, IS, 4
  1198. STFD y02, 1 * SIZE(YY)
  1199. addi IS, IS, 2
  1200. STFD y03, 2 * SIZE(YY)
  1201. cmpw cr0, TEMP, N
  1202. STFD y04, 3 * SIZE(YY)
  1203. ble LL(11)
  1204. .align 4
  /*
   * LL(20): handle the single leftover column when N is odd; when N is
   * even control goes straight to LL(990).
   *
   * XX and YY are positioned at offset IS << ZBASE_SHIFT in X and NEW_Y.
   * The product of (atemp1, atemp2) loaded from XX with the entry
   * (a1, a2) loaded from A is formed in (xsum1, xsum2); when HEMV is
   * defined the a2 terms are omitted.  The result is scaled by
   * (ALPHA_R, ALPHA_I) and added to the pair at YY.
   */
  1205. LL(20):
  1206. andi. TEMP, N, 1
  1207. ble LL(990)
  1208. slwi TEMP, IS, ZBASE_SHIFT
  1209. add XX, X, TEMP
  1210. add YY, NEW_Y, TEMP
  1211. LFD y05, ALPHA_R
  1212. LFD y06, ALPHA_I
  1213. LFD atemp1, 0 * SIZE(XX)
  1214. LFD atemp2, 1 * SIZE(XX)
  1215. LFD a1, 0 * SIZE(A)
  1216. LFD a2, 1 * SIZE(A)
  1217. FMUL xsum1, atemp1, a1
  1218. FMUL xsum2, atemp2, a1
  1219. #ifndef HEMV
  1220. FNMSUB xsum1, atemp2, a2, xsum1
  1221. FMADD xsum2, atemp1, a2, xsum2
  1222. #endif
  /*
   * NOTE(review): the next four instructions scale (atemp1, atemp2) by
   * alpha, but atemp1 / atemp2 are not read again before the routine
   * returns, and xtemp1 / xtemp2 are overwritten below before use.
   * The ALPHA_R / ALPHA_I reload that follows is also redundant because
   * y05 / y06 are unchanged since the loads above.  Left untouched here.
   */
  1223. FMUL xtemp1, y05, atemp1
  1224. FMUL xtemp2, y06, atemp1
  1225. FNMSUB atemp1, y06, atemp2, xtemp1
  1226. FMADD atemp2, y05, atemp2, xtemp2
  1227. LFD y05, ALPHA_R
  1228. LFD y06, ALPHA_I
  1229. LFD y01, 0 * SIZE(YY)
  1230. LFD y02, 1 * SIZE(YY)
  /* (xsum1, xsum2) *= alpha, then accumulate into (y01, y02). */
  1231. FMUL xtemp1, y05, xsum1
  1232. FMUL xtemp2, y06, xsum1
  1233. FNMSUB xsum1, y06, xsum2, xtemp1
  1234. FMADD xsum2, y05, xsum2, xtemp2
  1235. FADD y01, y01, xsum1
  1236. FADD y02, y02, xsum2
  1237. STFD y01, 0 * SIZE(YY)
  1238. STFD y02, 1 * SIZE(YY)
  1239. .align 4
  /*
   * LL(990): write-back of the result when Y is strided.
   *
   * If INCY equals 2 * SIZE the whole write-back is skipped (branch to
   * LL(999)).  Otherwise YY is set to Y as the store cursor and CTR is
   * loaded with M >> 2; when that count is not positive the unrolled
   * loop is bypassed via LL(995).
   */
  1240. LL(990):
  1241. cmpwi cr0, INCY, 2 * SIZE
  1242. beq LL(999)
  1243. mr YY, Y
  1244. srawi. r0, M, 2
  1245. mtspr CTR, r0
  1246. ble LL(995)
  1247. .align 4
  /*
   * LL(991): one iteration handles four two-value elements.
   * Reads four pairs from Y (stepping Y by INCY after each), reads eight
   * contiguous values from NEW_Y (then advances NEW_Y by 8 * SIZE), adds
   * them pairwise, and stores the sums through YY with the same INCY
   * stride.  f0..f15 are used directly as scratch registers.
   */
  1248. LL(991):
  1249. LFD f0, 0 * SIZE(Y)
  1250. LFD f1, 1 * SIZE(Y)
  1251. add Y, Y, INCY
  1252. LFD f2, 0 * SIZE(Y)
  1253. LFD f3, 1 * SIZE(Y)
  1254. add Y, Y, INCY
  1255. LFD f4, 0 * SIZE(Y)
  1256. LFD f5, 1 * SIZE(Y)
  1257. add Y, Y, INCY
  1258. LFD f6, 0 * SIZE(Y)
  1259. LFD f7, 1 * SIZE(Y)
  1260. add Y, Y, INCY
  1261. LFD f8, 0 * SIZE(NEW_Y)
  1262. LFD f9, 1 * SIZE(NEW_Y)
  1263. LFD f10, 2 * SIZE(NEW_Y)
  1264. LFD f11, 3 * SIZE(NEW_Y)
  1265. LFD f12, 4 * SIZE(NEW_Y)
  1266. LFD f13, 5 * SIZE(NEW_Y)
  1267. LFD f14, 6 * SIZE(NEW_Y)
  1268. LFD f15, 7 * SIZE(NEW_Y)
  1269. addi NEW_Y, NEW_Y, 8 * SIZE
  1270. FADD f8, f8, f0
  1271. FADD f9, f9, f1
  1272. FADD f10, f10, f2
  1273. FADD f11, f11, f3
  1274. FADD f12, f12, f4
  1275. FADD f13, f13, f5
  1276. FADD f14, f14, f6
  1277. FADD f15, f15, f7
  1278. STFD f8, 0 * SIZE(YY)
  1279. STFD f9, 1 * SIZE(YY)
  1280. add YY, YY, INCY
  1281. STFD f10, 0 * SIZE(YY)
  1282. STFD f11, 1 * SIZE(YY)
  1283. add YY, YY, INCY
  1284. STFD f12, 0 * SIZE(YY)
  1285. STFD f13, 1 * SIZE(YY)
  1286. add YY, YY, INCY
  1287. STFD f14, 0 * SIZE(YY)
  1288. STFD f15, 1 * SIZE(YY)
  1289. add YY, YY, INCY
  1290. bdnz LL(991)
  1291. .align 4
  /*
   * LL(995): strided write-back remainder for two elements, executed
   * only when bit 1 of M is set; otherwise control goes to LL(996).
   * Same scheme as the LL(991) loop: two pairs read from Y (stride INCY),
   * four contiguous values read from NEW_Y (advanced by 4 * SIZE), the
   * sums stored through YY (stride INCY).  J receives the andi. result
   * and is not read in this block.
   */
  1292. LL(995):
  1293. andi. J, M, 2
  1294. ble LL(996)
  1295. LFD f0, 0 * SIZE(Y)
  1296. LFD f1, 1 * SIZE(Y)
  1297. add Y, Y, INCY
  1298. LFD f2, 0 * SIZE(Y)
  1299. LFD f3, 1 * SIZE(Y)
  1300. add Y, Y, INCY
  1301. LFD f8, 0 * SIZE(NEW_Y)
  1302. LFD f9, 1 * SIZE(NEW_Y)
  1303. LFD f10, 2 * SIZE(NEW_Y)
  1304. LFD f11, 3 * SIZE(NEW_Y)
  1305. addi NEW_Y, NEW_Y, 4 * SIZE
  1306. FADD f8, f8, f0
  1307. FADD f9, f9, f1
  1308. FADD f10, f10, f2
  1309. FADD f11, f11, f3
  1310. STFD f8, 0 * SIZE(YY)
  1311. STFD f9, 1 * SIZE(YY)
  1312. add YY, YY, INCY
  1313. STFD f10, 0 * SIZE(YY)
  1314. STFD f11, 1 * SIZE(YY)
  1315. add YY, YY, INCY
  1316. .align 4
  /*
   * LL(996): strided write-back remainder for the last element, executed
   * only when M is odd; otherwise control goes to LL(999).
   * Adds the pair at Y to the pair at NEW_Y and stores the sum at YY.
   * No pointer is advanced because nothing follows but the epilogue.
   */
  1317. LL(996):
  1318. andi. J, M, 1
  1319. ble LL(999)
  1320. LFD f0, 0 * SIZE(Y)
  1321. LFD f1, 1 * SIZE(Y)
  1322. LFD f8, 0 * SIZE(NEW_Y)
  1323. LFD f9, 1 * SIZE(NEW_Y)
  1324. FADD f8, f8, f0
  1325. FADD f9, f9, f1
  1326. STFD f8, 0 * SIZE(YY)
  1327. STFD f9, 1 * SIZE(YY)
  1328. .align 4
  /*
   * LL(999): common exit path.
   * Sets the return value in r3 to 0, reloads f14..f31 and r14..r27 from
   * the stack frame, releases the frame by adding STACKSIZE to SP, and
   * returns.  The matching saves are in the prologue, outside this
   * section.
   */
  1329. LL(999):
  1330. li r3, 0
  /* Floating-point registers: 8 bytes each at SP + 0 .. SP + 136. */
  1331. lfd f14, 0(SP)
  1332. lfd f15, 8(SP)
  1333. lfd f16, 16(SP)
  1334. lfd f17, 24(SP)
  1335. lfd f18, 32(SP)
  1336. lfd f19, 40(SP)
  1337. lfd f20, 48(SP)
  1338. lfd f21, 56(SP)
  1339. lfd f22, 64(SP)
  1340. lfd f23, 72(SP)
  1341. lfd f24, 80(SP)
  1342. lfd f25, 88(SP)
  1343. lfd f26, 96(SP)
  1344. lfd f27, 104(SP)
  1345. lfd f28, 112(SP)
  1346. lfd f29, 120(SP)
  1347. lfd f30, 128(SP)
  1348. lfd f31, 136(SP)
  /* General-purpose registers start at SP + 144: 8-byte slots (ld) in
     64-bit builds, 4-byte slots (lwz) otherwise. */
  1349. #ifdef __64BIT__
  1350. ld r14, 144(SP)
  1351. ld r15, 152(SP)
  1352. ld r16, 160(SP)
  1353. ld r17, 168(SP)
  1354. ld r18, 176(SP)
  1355. ld r19, 184(SP)
  1356. ld r20, 192(SP)
  1357. ld r21, 200(SP)
  1358. ld r22, 208(SP)
  1359. ld r23, 216(SP)
  1360. ld r24, 224(SP)
  1361. ld r25, 232(SP)
  1362. ld r26, 240(SP)
  1363. ld r27, 248(SP)
  1364. #else
  1365. lwz r14, 144(SP)
  1366. lwz r15, 148(SP)
  1367. lwz r16, 152(SP)
  1368. lwz r17, 156(SP)
  1369. lwz r18, 160(SP)
  1370. lwz r19, 164(SP)
  1371. lwz r20, 168(SP)
  1372. lwz r21, 172(SP)
  1373. lwz r22, 176(SP)
  1374. lwz r23, 180(SP)
  1375. lwz r24, 184(SP)
  1376. lwz r25, 188(SP)
  1377. lwz r26, 192(SP)
  1378. lwz r27, 196(SP)
  1379. #endif
  1380. addi SP, SP, STACKSIZE
  1381. blr
  /* EPILOGUE is a macro defined outside this section; the #endif below
     closes a conditional opened earlier in the file. */
  1382. EPILOGUE
  1383. #endif