You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zsymv_U.S 33 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Argument-to-register mapping.
     Each platform / word-size combination below binds the symbolic names
     M, IS, A, LDA, X, INCX, Y and INCY to general-purpose registers.
     BUFFER is r14 in every variant and is always loaded from the caller's
     frame by the prologue. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  /* linux/FreeBSD, 32-bit: the eight names occupy r3-r10 in order; the
     prologue loads only BUFFER from FRAMESLOT(0). */
  42. #define M r3
  43. #define IS r4
  44. #define A r5
  45. #define LDA r6
  46. #define X r7
  47. #define INCX r8
  48. #define Y r9
  49. #define INCY r10
  50. #define BUFFER r14
  51. #else
  /* linux/FreeBSD, 64-bit: A, LDA, X, INCX sit in r7-r10; Y, INCY and
     BUFFER are loaded by the prologue from FRAMESLOT(0..2), with Y/INCY
     landing in r5/r6. */
  52. #define M r3
  53. #define IS r4
  54. #define A r7
  55. #define LDA r8
  56. #define X r9
  57. #define INCX r10
  58. #define Y r5
  59. #define INCY r6
  60. #define BUFFER r14
  61. #endif
  62. #endif
  63. #if defined(_AIX) || defined(__APPLE__)
  64. #if !defined(__64BIT__) && defined(DOUBLE)
  /* AIX/Apple, 32-bit DOUBLE: only A and LDA (r9, r10) are taken as they
     arrive; X, INCX, Y, INCY and BUFFER are loaded by the prologue from
     FRAMESLOT(0..4). */
  65. #define M r3
  66. #define IS r4
  67. #define A r9
  68. #define LDA r10
  69. #define X r5
  70. #define INCX r6
  71. #define Y r7
  72. #define INCY r8
  73. #define BUFFER r14
  74. #else
  /* AIX/Apple, all other cases: same assignment as linux 64-bit; the
     prologue loads Y, INCY and BUFFER from FRAMESLOT(0..2). */
  75. #define M r3
  76. #define IS r4
  77. #define A r7
  78. #define LDA r8
  79. #define X r9
  80. #define INCX r10
  81. #define Y r5
  82. #define INCY r6
  83. #define BUFFER r14
  84. #endif
  85. #endif
  /* Integer scratch registers.
     AO1/AO2 : set to A and A + LDA at the top of LL(11), then advanced
               through the unrolled loop.
     XX/YY   : running pointers; XX starts at X, YY starts at NEW_Y.
     NEW_Y   : Y itself when INCY == 2 * SIZE, otherwise BUFFER (see LL(05)).
     TEMP    : general temporary (offsets, comparisons).
     PREA    : prefetch offset, PREFETCHSIZE_A * SIZE.
     I, J    : not referenced in the portion of the file reviewed here. */
  86. #define I r11
  87. #define J r12
  88. #define AO1 r15
  89. #define AO2 r16
  90. #define XX r19
  91. #define YY r20
  92. #define NEW_Y r21
  93. #define TEMP r22
  94. #define PREA r24
  /* y01-y08 (f0-f7): values loaded from YY, updated with FMADD/FNMSUB and
     stored back to YY. */
  95. #define y01 f0
  96. #define y02 f1
  97. #define y03 f2
  98. #define y04 f3
  99. #define y05 f4
  100. #define y06 f5
  101. #define y07 f6
  102. #define y08 f7
  /* xtemp1-xtemp8 (f8-f15): values loaded from XX (or from X at offset IS
     while forming atemp); only xtemp1-xtemp4 appear in the reviewed portion. */
  103. #define xtemp1 f8
  104. #define xtemp2 f9
  105. #define xtemp3 f10
  106. #define xtemp4 f11
  107. #define xtemp5 f12
  108. #define xtemp6 f13
  109. #define xtemp7 f14
  110. #define xtemp8 f15
  /* atemp1-atemp4 (f16-f19): products of (ALPHA_R, ALPHA_I) with two
     consecutive (real, imag) pairs of X, computed at the top of LL(11). */
  111. #define atemp1 f16
  112. #define atemp2 f17
  113. #define atemp3 f18
  114. #define atemp4 f19
  /* xsum1-xsum4 (f20-f23): accumulators, zeroed from FZERO in LL(11). */
  115. #define xsum1 f20
  116. #define xsum2 f21
  117. #define xsum3 f22
  118. #define xsum4 f23
  /* a1-a8 (f24-f31): elements loaded from AO1 (a1-a4) and AO2 (a5-a8).
     The X-packing loops LL(01)/LL(04) also reuse them as copy temporaries. */
  119. #define a1 f24
  120. #define a2 f25
  121. #define a3 f26
  122. #define a4 f27
  123. #define a5 f28
  124. #define a6 f29
  125. #define a7 f30
  126. #define a8 f31
  /* alpha_r/alpha_i alias f1/f2, the same registers as y02/y03. The prologue
     stores both to ALPHA_R/ALPHA_I on the stack before f1/f2 are reused. */
  127. #define alpha_r f1
  128. #define alpha_i f2
  /* Per-CPU prefetch distance. PREFETCHSIZE_A is multiplied by SIZE to form
     the offset held in PREA, which is the index operand of the
     dcbt/dcbtst/DCBT prefetches in the routine body.
     NOTE(review): no fallback value is defined in this section; if none of
     the CPU macros below is set, PREFETCHSIZE_A is presumably expected from
     elsewhere (or the build fails) - confirm against common.h. */
  129. #if defined(PPCG4)
  130. #define PREFETCHSIZE_A 24
  131. #endif
  132. #if defined(PPC440) || defined(PPC440FP2)
  133. #define PREFETCHSIZE_A 24
  134. #endif
  135. #ifdef PPC970
  136. #define PREFETCHSIZE_A 32
  137. #endif
  138. #ifdef CELL
  139. #define PREFETCHSIZE_A 72
  140. #endif
  141. #ifdef POWER4
  142. #define PREFETCHSIZE_A 16
  143. #endif
  144. #ifdef POWER5
  145. #define PREFETCHSIZE_A 96
  146. #endif
  147. #ifdef POWER6
  148. #define PREFETCHSIZE_A 112
  149. #endif
  /* NOP1/NOP2 expand to nothing on POWER4/5/6 and PPC970. On every other
     target they become register self-moves (mr LDA, LDA / mr INCX, INCX),
     which leave the register value unchanged; presumably they pad issue
     slots between the floating-point operations - confirm. */
  150. #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
  151. #define NOP1
  152. #define NOP2
  153. #else
  154. #define NOP1 mr LDA, LDA
  155. #define NOP2 mr INCX, INCX
  156. #endif
  157. #ifndef NEEDPARAM
  /* Stack frame layout. The prologue saves f14-f31 at 0..136(SP) and r14-r27
     from 144(SP) upward (4-byte slots in 32-bit mode, 8-byte slots in 64-bit
     mode). The three 8-byte locals below sit directly above that save area:
       ALPHA_R / ALPHA_I : spilled copies of alpha_r / alpha_i
       FZERO             : a zeroed doubleword (r0 = 0 is stored there), read
                           back with lfd to clear floating-point registers. */
  158. #ifndef __64BIT__
  /* 32-bit: r27 is saved at 196(SP), so the locals start at 200(SP). */
  159. #define STACKSIZE 224
  160. #define ALPHA_R 200(SP)
  161. #define ALPHA_I 208(SP)
  162. #define FZERO 216(SP)
  163. #else
  /* 64-bit: r27 is saved at 248(SP), so the locals start at 256(SP). */
  164. #define STACKSIZE 280
  165. #define ALPHA_R 256(SP)
  166. #define ALPHA_I 264(SP)
  167. #define FZERO 272(SP)
  168. #endif
  /* FMADD1/FMADD2 are the two operations used for the cross terms of the
     xsum accumulations. Without HEMV, FMADD1 is FNMSUB and FMADD2 is FMADD;
     with HEMV the two are swapped.
     NOTE(review): presumably this selects a conjugated product for the
     Hermitian build - confirm against the HEMV callers. */
  169. #ifndef HEMV
  170. #define FMADD1 FNMSUB
  171. #define FMADD2 FMADD
  172. #else
  173. #define FMADD1 FMADD
  174. #define FMADD2 FNMSUB
  175. #endif
  176. PROLOGUE
  177. PROFCODE
  178. addi SP, SP, -STACKSIZE
  179. li r0, 0
  180. stfd f14, 0(SP)
  181. stfd f15, 8(SP)
  182. stfd f16, 16(SP)
  183. stfd f17, 24(SP)
  184. stfd f18, 32(SP)
  185. stfd f19, 40(SP)
  186. stfd f20, 48(SP)
  187. stfd f21, 56(SP)
  188. stfd f22, 64(SP)
  189. stfd f23, 72(SP)
  190. stfd f24, 80(SP)
  191. stfd f25, 88(SP)
  192. stfd f26, 96(SP)
  193. stfd f27, 104(SP)
  194. stfd f28, 112(SP)
  195. stfd f29, 120(SP)
  196. stfd f30, 128(SP)
  197. stfd f31, 136(SP)
  198. #ifdef __64BIT__
  199. std r0, FZERO
  200. std r14, 144(SP)
  201. std r15, 152(SP)
  202. std r16, 160(SP)
  203. std r17, 168(SP)
  204. std r18, 176(SP)
  205. std r19, 184(SP)
  206. std r20, 192(SP)
  207. std r21, 200(SP)
  208. std r22, 208(SP)
  209. std r23, 216(SP)
  210. std r24, 224(SP)
  211. std r25, 232(SP)
  212. std r26, 240(SP)
  213. std r27, 248(SP)
  214. #else
  215. stw r0, 0 + FZERO
  216. stw r0, 4 + FZERO
  217. stw r14, 144(SP)
  218. stw r15, 148(SP)
  219. stw r16, 152(SP)
  220. stw r17, 156(SP)
  221. stw r18, 160(SP)
  222. stw r19, 164(SP)
  223. stw r20, 168(SP)
  224. stw r21, 172(SP)
  225. stw r22, 176(SP)
  226. stw r23, 180(SP)
  227. stw r24, 184(SP)
  228. stw r25, 188(SP)
  229. stw r26, 192(SP)
  230. stw r27, 196(SP)
  231. #endif
  232. #if defined(linux) || defined(__FreeBSD__)
  233. #ifndef __64BIT__
  234. lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
  235. #else
  236. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  237. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  238. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  239. #endif
  240. #endif
  241. #if defined(_AIX) || defined(__APPLE__)
  242. #ifndef __64BIT__
  243. #ifdef DOUBLE
  244. lwz X, FRAMESLOT(0) + STACKSIZE(SP)
  245. lwz INCX, FRAMESLOT(1) + STACKSIZE(SP)
  246. lwz Y, FRAMESLOT(2) + STACKSIZE(SP)
  247. lwz INCY, FRAMESLOT(3) + STACKSIZE(SP)
  248. lwz BUFFER, FRAMESLOT(4) + STACKSIZE(SP)
  249. #else
  250. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  251. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  252. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  253. #endif
  254. #else
  255. ld Y, FRAMESLOT(0) + STACKSIZE(SP)
  256. ld INCY, FRAMESLOT(1) + STACKSIZE(SP)
  257. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  258. #endif
  259. #endif
  260. STFD alpha_r, ALPHA_R
  261. STFD alpha_i, ALPHA_I
  262. slwi LDA, LDA, ZBASE_SHIFT
  263. slwi INCX, INCX, ZBASE_SHIFT
  264. slwi INCY, INCY, ZBASE_SHIFT
  265. li PREA, PREFETCHSIZE_A * SIZE
  266. sub IS, M, IS
  267. cmpwi cr0, M, 0
  268. ble- LL(999)
  269. mullw TEMP, IS, LDA
  270. add A, A, TEMP
  271. cmpwi cr0, INCX, 2 * SIZE
  272. beq LL(05)
  273. mr XX, X
  274. mr X, BUFFER
  275. srawi. r0, M, 2
  276. mtspr CTR, r0
  277. ble LL(03)
  278. .align 4
  279. LL(01):
  280. LFD a1, 0 * SIZE(XX)
  281. LFD a2, 1 * SIZE(XX)
  282. add XX, XX, INCX
  283. LFD a3, 0 * SIZE(XX)
  284. LFD a4, 1 * SIZE(XX)
  285. add XX, XX, INCX
  286. LFD a5, 0 * SIZE(XX)
  287. LFD a6, 1 * SIZE(XX)
  288. add XX, XX, INCX
  289. LFD a7, 0 * SIZE(XX)
  290. LFD a8, 1 * SIZE(XX)
  291. add XX, XX, INCX
  292. dcbt XX, PREA
  293. dcbtst BUFFER, PREA
  294. STFD a1, 0 * SIZE(BUFFER)
  295. STFD a2, 1 * SIZE(BUFFER)
  296. STFD a3, 2 * SIZE(BUFFER)
  297. STFD a4, 3 * SIZE(BUFFER)
  298. STFD a5, 4 * SIZE(BUFFER)
  299. STFD a6, 5 * SIZE(BUFFER)
  300. STFD a7, 6 * SIZE(BUFFER)
  301. STFD a8, 7 * SIZE(BUFFER)
  302. addi BUFFER, BUFFER, 8 * SIZE
  303. bdnz LL(01)
  304. .align 4
  305. LL(03):
  306. andi. r0, M, 3
  307. mtspr CTR, r0
  308. ble LL(05)
  309. .align 4
  310. LL(04):
  311. LFD a1, 0 * SIZE(XX)
  312. LFD a2, 1 * SIZE(XX)
  313. add XX, XX, INCX
  314. STFD a1, 0 * SIZE(BUFFER)
  315. STFD a2, 1 * SIZE(BUFFER)
  316. addi BUFFER, BUFFER, 2 * SIZE
  317. bdnz LL(04)
  318. .align 4
  319. LL(05):
  320. mr NEW_Y, Y
  321. lfd f0, FZERO
  322. cmpwi cr0, INCY, 2 * SIZE
  323. beq LL(10)
  324. mr NEW_Y, BUFFER
  325. addi r0, M, 3
  326. srawi. r0, r0, 2
  327. mtspr CTR, r0
  328. .align 4
  329. LL(06):
  330. STFD f0, 0 * SIZE(BUFFER)
  331. STFD f0, 1 * SIZE(BUFFER)
  332. STFD f0, 2 * SIZE(BUFFER)
  333. STFD f0, 3 * SIZE(BUFFER)
  334. STFD f0, 4 * SIZE(BUFFER)
  335. STFD f0, 5 * SIZE(BUFFER)
  336. STFD f0, 6 * SIZE(BUFFER)
  337. STFD f0, 7 * SIZE(BUFFER)
  338. addi BUFFER, BUFFER, 8 * SIZE
  339. bdnz LL(06)
  340. .align 4
  341. LL(10):
  342. addi TEMP, IS, 2
  343. cmpw cr0, TEMP, M
  344. bgt LL(20)
  345. .align 4
  346. LL(11):
  347. mr AO1, A
  348. add AO2, A, LDA
  349. add A, AO2, LDA
  350. slwi TEMP, IS, ZBASE_SHIFT
  351. add TEMP, X, TEMP
  352. LFD y05, ALPHA_R
  353. LFD y06, ALPHA_I
  354. LFD xtemp1, 0 * SIZE(TEMP)
  355. LFD xtemp2, 1 * SIZE(TEMP)
  356. LFD xtemp3, 2 * SIZE(TEMP)
  357. LFD xtemp4, 3 * SIZE(TEMP)
  358. FMUL atemp1, y05, xtemp1
  359. FMUL atemp2, y06, xtemp1
  360. FMUL atemp3, y05, xtemp3
  361. FMUL atemp4, y06, xtemp3
  362. FNMSUB atemp1, y06, xtemp2, atemp1
  363. FMADD atemp2, y05, xtemp2, atemp2
  364. FNMSUB atemp3, y06, xtemp4, atemp3
  365. FMADD atemp4, y05, xtemp4, atemp4
  366. lfd xsum1, FZERO
  367. fmr xsum2, xsum1
  368. fmr xsum3, xsum1
  369. fmr xsum4, xsum1
  370. mr XX, X
  371. mr YY, NEW_Y
  372. LFD a1, 0 * SIZE(AO1)
  373. LFD a2, 1 * SIZE(AO1)
  374. LFD a3, 2 * SIZE(AO1)
  375. LFD a4, 3 * SIZE(AO1)
  376. LFD a5, 0 * SIZE(AO2)
  377. LFD a6, 1 * SIZE(AO2)
  378. LFD a7, 2 * SIZE(AO2)
  379. LFD a8, 3 * SIZE(AO2)
  380. LFD xtemp1, 0 * SIZE(XX)
  381. LFD xtemp2, 1 * SIZE(XX)
  382. LFD xtemp3, 2 * SIZE(XX)
  383. LFD xtemp4, 3 * SIZE(XX)
  384. LFD y01, 0 * SIZE(YY)
  385. LFD y02, 1 * SIZE(YY)
  386. LFD y03, 2 * SIZE(YY)
  387. LFD y04, 3 * SIZE(YY)
  388. srawi. r0, IS, 3
  389. mtspr CTR, r0
  390. ble LL(15)
  391. FMADD xsum1, xtemp1, a1, xsum1
  392. DCBT(AO1, PREA)
  393. FMADD y01, atemp1, a1, y01
  394. NOP2
  395. FMADD xsum2, xtemp2, a1, xsum2
  396. NOP1
  397. FMADD y02, atemp2, a1, y02
  398. LFD a1, 4 * SIZE(AO1)
  399. FMADD xsum3, xtemp1, a5, xsum3
  400. NOP1
  401. FMADD y03, atemp1, a3, y03
  402. NOP2
  403. FMADD xsum4, xtemp2, a5, xsum4
  404. NOP1
  405. FMADD y04, atemp2, a3, y04
  406. NOP2
  407. FMADD1 xsum1, xtemp2, a2, xsum1
  408. LFD y05, 4 * SIZE(YY)
  409. FNMSUB y01, atemp2, a2, y01
  410. NOP2
  411. FMADD2 xsum2, xtemp1, a2, xsum2
  412. LFD y06, 5 * SIZE(YY)
  413. FMADD y02, atemp1, a2, y02
  414. LFD a2, 5 * SIZE(AO1)
  415. FMADD1 xsum3, xtemp2, a6, xsum3
  416. LFD xtemp2, 5 * SIZE(XX)
  417. FNMSUB y03, atemp2, a4, y03
  418. NOP2
  419. FMADD2 xsum4, xtemp1, a6, xsum4
  420. LFD xtemp1, 4 * SIZE(XX)
  421. FMADD y04, atemp1, a4, y04
  422. NOP2
  423. FMADD xsum1, xtemp3, a3, xsum1
  424. LFD y07, 6 * SIZE(YY)
  425. FMADD y01, atemp3, a5, y01
  426. NOP2
  427. FMADD xsum2, xtemp4, a3, xsum2
  428. LFD a3, 6 * SIZE(AO1)
  429. FMADD y02, atemp4, a5, y02
  430. LFD a5, 4 * SIZE(AO2)
  431. FMADD xsum3, xtemp3, a7, xsum3
  432. LFD y08, 7 * SIZE(YY)
  433. FMADD y03, atemp3, a7, y03
  434. NOP2
  435. FMADD xsum4, xtemp4, a7, xsum4
  436. NOP1
  437. FMADD y04, atemp4, a7, y04
  438. LFD a7, 6 * SIZE(AO2)
  439. FMADD1 xsum1, xtemp4, a4, xsum1
  440. NOP1
  441. FNMSUB y01, atemp4, a6, y01
  442. # DCBT(X, PREX)
  443. NOP2
  444. FMADD2 xsum2, xtemp3, a4, xsum2
  445. LFD a4, 7 * SIZE(AO1)
  446. FMADD y02, atemp3, a6, y02
  447. LFD a6, 5 * SIZE(AO2)
  448. FMADD1 xsum3, xtemp4, a8, xsum3
  449. LFD xtemp4, 7 * SIZE(XX)
  450. FNMSUB y03, atemp4, a8, y03
  451. NOP2
  452. FMADD2 xsum4, xtemp3, a8, xsum4
  453. LFD xtemp3, 6 * SIZE(XX)
  454. FMADD y04, atemp3, a8, y04
  455. LFD a8, 7 * SIZE(AO2)
  456. FMADD xsum1, xtemp1, a1, xsum1
  457. STFD y01, 0 * SIZE(YY)
  458. FMADD y05, atemp1, a1, y05
  459. NOP2
  460. FMADD xsum2, xtemp2, a1, xsum2
  461. STFD y02, 1 * SIZE(YY)
  462. FMADD y06, atemp2, a1, y06
  463. LFD a1, 8 * SIZE(AO1)
  464. FMADD xsum3, xtemp1, a5, xsum3
  465. STFD y03, 2 * SIZE(YY)
  466. FMADD y07, atemp1, a3, y07
  467. NOP2
  468. FMADD xsum4, xtemp2, a5, xsum4
  469. STFD y04, 3 * SIZE(YY)
  470. FMADD y08, atemp2, a3, y08
  471. NOP2
  472. FMADD1 xsum1, xtemp2, a2, xsum1
  473. LFD y01, 8 * SIZE(YY)
  474. FNMSUB y05, atemp2, a2, y05
  475. NOP2
  476. FMADD2 xsum2, xtemp1, a2, xsum2
  477. LFD y02, 9 * SIZE(YY)
  478. FMADD y06, atemp1, a2, y06
  479. LFD a2, 9 * SIZE(AO1)
  480. FMADD1 xsum3, xtemp2, a6, xsum3
  481. LFD xtemp2, 9 * SIZE(XX)
  482. FNMSUB y07, atemp2, a4, y07
  483. NOP2
  484. FMADD2 xsum4, xtemp1, a6, xsum4
  485. LFD xtemp1, 8 * SIZE(XX)
  486. FMADD y08, atemp1, a4, y08
  487. NOP2
  488. FMADD xsum1, xtemp3, a3, xsum1
  489. LFD y03, 10 * SIZE(YY)
  490. FMADD y05, atemp3, a5, y05
  491. NOP2
  492. FMADD xsum2, xtemp4, a3, xsum2
  493. LFD a3, 10 * SIZE(AO1)
  494. FMADD y06, atemp4, a5, y06
  495. LFD a5, 8 * SIZE(AO2)
  496. FMADD xsum3, xtemp3, a7, xsum3
  497. LFD y04, 11 * SIZE(YY)
  498. FMADD y07, atemp3, a7, y07
  499. NOP2
  500. FMADD xsum4, xtemp4, a7, xsum4
  501. NOP1
  502. FMADD y08, atemp4, a7, y08
  503. LFD a7, 10 * SIZE(AO2)
  504. FMADD1 xsum1, xtemp4, a4, xsum1
  505. NOP1
  506. FNMSUB y05, atemp4, a6, y05
  507. NOP2
  508. FMADD2 xsum2, xtemp3, a4, xsum2
  509. LFD a4, 11 * SIZE(AO1)
  510. FMADD y06, atemp3, a6, y06
  511. LFD a6, 9 * SIZE(AO2)
  512. FMADD1 xsum3, xtemp4, a8, xsum3
  513. LFD xtemp4, 11 * SIZE(XX)
  514. FNMSUB y07, atemp4, a8, y07
  515. bdz LL(13)
  516. .align 4
  517. LL(12):
  518. FMADD2 xsum4, xtemp3, a8, xsum4
  519. LFD xtemp3, 10 * SIZE(XX)
  520. FMADD y08, atemp3, a8, y08
  521. LFD a8, 11 * SIZE(AO2)
  522. FMADD xsum1, xtemp1, a1, xsum1
  523. STFD y05, 4 * SIZE(YY)
  524. FMADD y01, atemp1, a1, y01
  525. DCBT(AO2, PREA)
  526. FMADD xsum2, xtemp2, a1, xsum2
  527. STFD y06, 5 * SIZE(YY)
  528. FMADD y02, atemp2, a1, y02
  529. LFD a1, 12 * SIZE(AO1)
  530. FMADD xsum3, xtemp1, a5, xsum3
  531. STFD y07, 6 * SIZE(YY)
  532. FMADD y03, atemp1, a3, y03
  533. NOP2
  534. FMADD xsum4, xtemp2, a5, xsum4
  535. STFD y08, 7 * SIZE(YY)
  536. FMADD y04, atemp2, a3, y04
  537. NOP2
  538. FMADD1 xsum1, xtemp2, a2, xsum1
  539. LFD y05, 12 * SIZE(YY)
  540. FNMSUB y01, atemp2, a2, y01
  541. NOP2
  542. FMADD2 xsum2, xtemp1, a2, xsum2
  543. LFD y06, 13 * SIZE(YY)
  544. FMADD y02, atemp1, a2, y02
  545. LFD a2, 13 * SIZE(AO1)
  546. FMADD1 xsum3, xtemp2, a6, xsum3
  547. LFD xtemp2, 13 * SIZE(XX)
  548. FNMSUB y03, atemp2, a4, y03
  549. NOP2
  550. FMADD2 xsum4, xtemp1, a6, xsum4
  551. LFD xtemp1, 12 * SIZE(XX)
  552. FMADD y04, atemp1, a4, y04
  553. NOP2
  554. FMADD xsum1, xtemp3, a3, xsum1
  555. LFD y07, 14 * SIZE(YY)
  556. FMADD y01, atemp3, a5, y01
  557. NOP2
  558. FMADD xsum2, xtemp4, a3, xsum2
  559. LFD a3, 14 * SIZE(AO1)
  560. FMADD y02, atemp4, a5, y02
  561. LFD a5, 12 * SIZE(AO2)
  562. FMADD xsum3, xtemp3, a7, xsum3
  563. LFD y08, 15 * SIZE(YY)
  564. FMADD y03, atemp3, a7, y03
  565. NOP2
  566. FMADD xsum4, xtemp4, a7, xsum4
  567. NOP1
  568. FMADD y04, atemp4, a7, y04
  569. LFD a7, 14 * SIZE(AO2)
  570. FMADD1 xsum1, xtemp4, a4, xsum1
  571. NOP1
  572. FNMSUB y01, atemp4, a6, y01
  573. # DCBT(Y1, PREY)
  574. NOP2
  575. FMADD2 xsum2, xtemp3, a4, xsum2
  576. LFD a4, 15 * SIZE(AO1)
  577. FMADD y02, atemp3, a6, y02
  578. LFD a6, 13 * SIZE(AO2)
  579. FMADD1 xsum3, xtemp4, a8, xsum3
  580. LFD xtemp4, 15 * SIZE(XX)
  581. FNMSUB y03, atemp4, a8, y03
  582. NOP2
  583. FMADD2 xsum4, xtemp3, a8, xsum4
  584. LFD xtemp3, 14 * SIZE(XX)
  585. FMADD y04, atemp3, a8, y04
  586. LFD a8, 15 * SIZE(AO2)
  587. FMADD xsum1, xtemp1, a1, xsum1
  588. STFD y01, 8 * SIZE(YY)
  589. FMADD y05, atemp1, a1, y05
  590. NOP2
  591. FMADD xsum2, xtemp2, a1, xsum2
  592. STFD y02, 9 * SIZE(YY)
  593. FMADD y06, atemp2, a1, y06
  594. LFD a1, 16 * SIZE(AO1)
  595. FMADD xsum3, xtemp1, a5, xsum3
  596. STFD y03, 10 * SIZE(YY)
  597. FMADD y07, atemp1, a3, y07
  598. NOP2
  599. FMADD xsum4, xtemp2, a5, xsum4
  600. STFD y04, 11 * SIZE(YY)
  601. FMADD y08, atemp2, a3, y08
  602. NOP2
  603. FMADD1 xsum1, xtemp2, a2, xsum1
  604. LFD y01, 16 * SIZE(YY)
  605. FNMSUB y05, atemp2, a2, y05
  606. NOP2
  607. FMADD2 xsum2, xtemp1, a2, xsum2
  608. LFD y02, 17 * SIZE(YY)
  609. FMADD y06, atemp1, a2, y06
  610. LFD a2, 17 * SIZE(AO1)
  611. FMADD1 xsum3, xtemp2, a6, xsum3
  612. LFD xtemp2, 17 * SIZE(XX)
  613. FNMSUB y07, atemp2, a4, y07
  614. NOP2
  615. FMADD2 xsum4, xtemp1, a6, xsum4
  616. LFD xtemp1, 16 * SIZE(XX)
  617. FMADD y08, atemp1, a4, y08
  618. addi AO2, AO2, 16 * SIZE
  619. FMADD xsum1, xtemp3, a3, xsum1
  620. LFD y03, 18 * SIZE(YY)
  621. FMADD y05, atemp3, a5, y05
  622. addi XX, XX, 16 * SIZE
  623. FMADD xsum2, xtemp4, a3, xsum2
  624. LFD a3, 18 * SIZE(AO1)
  625. FMADD y06, atemp4, a5, y06
  626. LFD a5, 0 * SIZE(AO2)
  627. FMADD xsum3, xtemp3, a7, xsum3
  628. LFD y04, 19 * SIZE(YY)
  629. FMADD y07, atemp3, a7, y07
  630. NOP2
  631. FMADD xsum4, xtemp4, a7, xsum4
  632. addi AO1, AO1, 16 * SIZE
  633. FMADD y08, atemp4, a7, y08
  634. LFD a7, 2 * SIZE(AO2)
  635. FMADD1 xsum1, xtemp4, a4, xsum1
  636. addi YY, YY, 16 * SIZE
  637. FNMSUB y05, atemp4, a6, y05
  638. NOP2
  639. FMADD2 xsum2, xtemp3, a4, xsum2
  640. LFD a4, 3 * SIZE(AO1)
  641. FMADD y06, atemp3, a6, y06
  642. LFD a6, 1 * SIZE(AO2)
  643. FMADD1 xsum3, xtemp4, a8, xsum3
  644. LFD xtemp4, 3 * SIZE(XX)
  645. FNMSUB y07, atemp4, a8, y07
  646. NOP2
  647. FMADD2 xsum4, xtemp3, a8, xsum4
  648. LFD xtemp3, 2 * SIZE(XX)
  649. FMADD y08, atemp3, a8, y08
  650. LFD a8, 3 * SIZE(AO2)
  651. FMADD xsum1, xtemp1, a1, xsum1
  652. STFD y05, -4 * SIZE(YY)
  653. FMADD y01, atemp1, a1, y01
  654. DCBT(AO1, PREA)
  655. FMADD xsum2, xtemp2, a1, xsum2
  656. STFD y06, -3 * SIZE(YY)
  657. FMADD y02, atemp2, a1, y02
  658. LFD a1, 4 * SIZE(AO1)
  659. FMADD xsum3, xtemp1, a5, xsum3
  660. STFD y07, -2 * SIZE(YY)
  661. FMADD y03, atemp1, a3, y03
  662. NOP2
  663. FMADD xsum4, xtemp2, a5, xsum4
  664. STFD y08, -1 * SIZE(YY)
  665. FMADD y04, atemp2, a3, y04
  666. NOP2
  667. FMADD1 xsum1, xtemp2, a2, xsum1
  668. LFD y05, 4 * SIZE(YY)
  669. FNMSUB y01, atemp2, a2, y01
  670. NOP2
  671. FMADD2 xsum2, xtemp1, a2, xsum2
  672. LFD y06, 5 * SIZE(YY)
  673. FMADD y02, atemp1, a2, y02
  674. LFD a2, 5 * SIZE(AO1)
  675. FMADD1 xsum3, xtemp2, a6, xsum3
  676. LFD xtemp2, 5 * SIZE(XX)
  677. FNMSUB y03, atemp2, a4, y03
  678. NOP2
  679. FMADD2 xsum4, xtemp1, a6, xsum4
  680. LFD xtemp1, 4 * SIZE(XX)
  681. FMADD y04, atemp1, a4, y04
  682. NOP2
  683. FMADD xsum1, xtemp3, a3, xsum1
  684. LFD y07, 6 * SIZE(YY)
  685. FMADD y01, atemp3, a5, y01
  686. NOP2
  687. FMADD xsum2, xtemp4, a3, xsum2
  688. LFD a3, 6 * SIZE(AO1)
  689. FMADD y02, atemp4, a5, y02
  690. LFD a5, 4 * SIZE(AO2)
  691. FMADD xsum3, xtemp3, a7, xsum3
  692. LFD y08, 7 * SIZE(YY)
  693. FMADD y03, atemp3, a7, y03
  694. NOP2
  695. FMADD xsum4, xtemp4, a7, xsum4
  696. NOP1
  697. FMADD y04, atemp4, a7, y04
  698. LFD a7, 6 * SIZE(AO2)
  699. FMADD1 xsum1, xtemp4, a4, xsum1
  700. NOP1
  701. FNMSUB y01, atemp4, a6, y01
  702. # DCBT(X, PREX)
  703. NOP2
  704. FMADD2 xsum2, xtemp3, a4, xsum2
  705. LFD a4, 7 * SIZE(AO1)
  706. FMADD y02, atemp3, a6, y02
  707. LFD a6, 5 * SIZE(AO2)
  708. FMADD1 xsum3, xtemp4, a8, xsum3
  709. LFD xtemp4, 7 * SIZE(XX)
  710. FNMSUB y03, atemp4, a8, y03
  711. NOP2
  712. FMADD2 xsum4, xtemp3, a8, xsum4
  713. LFD xtemp3, 6 * SIZE(XX)
  714. FMADD y04, atemp3, a8, y04
  715. LFD a8, 7 * SIZE(AO2)
  716. FMADD xsum1, xtemp1, a1, xsum1
  717. STFD y01, 0 * SIZE(YY)
  718. FMADD y05, atemp1, a1, y05
  719. NOP2
  720. FMADD xsum2, xtemp2, a1, xsum2
  721. STFD y02, 1 * SIZE(YY)
  722. FMADD y06, atemp2, a1, y06
  723. LFD a1, 8 * SIZE(AO1)
  724. FMADD xsum3, xtemp1, a5, xsum3
  725. STFD y03, 2 * SIZE(YY)
  726. FMADD y07, atemp1, a3, y07
  727. NOP2
  728. FMADD xsum4, xtemp2, a5, xsum4
  729. STFD y04, 3 * SIZE(YY)
  730. FMADD y08, atemp2, a3, y08
  731. NOP2
  732. FMADD1 xsum1, xtemp2, a2, xsum1
  733. LFD y01, 8 * SIZE(YY)
  734. FNMSUB y05, atemp2, a2, y05
  735. NOP2
  736. FMADD2 xsum2, xtemp1, a2, xsum2
  737. LFD y02, 9 * SIZE(YY)
  738. FMADD y06, atemp1, a2, y06
  739. LFD a2, 9 * SIZE(AO1)
  740. FMADD1 xsum3, xtemp2, a6, xsum3
  741. LFD xtemp2, 9 * SIZE(XX)
  742. FNMSUB y07, atemp2, a4, y07
  743. NOP2
  744. FMADD2 xsum4, xtemp1, a6, xsum4
  745. LFD xtemp1, 8 * SIZE(XX)
  746. FMADD y08, atemp1, a4, y08
  747. NOP2
  748. FMADD xsum1, xtemp3, a3, xsum1
  749. LFD y03, 10 * SIZE(YY)
  750. FMADD y05, atemp3, a5, y05
  751. NOP2
  752. FMADD xsum2, xtemp4, a3, xsum2
  753. LFD a3, 10 * SIZE(AO1)
  754. FMADD y06, atemp4, a5, y06
  755. LFD a5, 8 * SIZE(AO2)
  756. FMADD xsum3, xtemp3, a7, xsum3
  757. LFD y04, 11 * SIZE(YY)
  758. FMADD y07, atemp3, a7, y07
  759. NOP2
  760. FMADD xsum4, xtemp4, a7, xsum4
  761. NOP1
  762. FMADD y08, atemp4, a7, y08
  763. LFD a7, 10 * SIZE(AO2)
  764. FMADD1 xsum1, xtemp4, a4, xsum1
  765. NOP1
  766. FNMSUB y05, atemp4, a6, y05
  767. NOP2
  768. FMADD2 xsum2, xtemp3, a4, xsum2
  769. LFD a4, 11 * SIZE(AO1)
  770. FMADD y06, atemp3, a6, y06
  771. LFD a6, 9 * SIZE(AO2)
  772. FMADD1 xsum3, xtemp4, a8, xsum3
  773. LFD xtemp4, 11 * SIZE(XX)
  774. FNMSUB y07, atemp4, a8, y07
  775. bdnz LL(12)
  776. .align 4
  777. LL(13):
  778. FMADD2 xsum4, xtemp3, a8, xsum4
  779. LFD xtemp3, 10 * SIZE(XX)
  780. FMADD y08, atemp3, a8, y08
  781. LFD a8, 11 * SIZE(AO2)
  782. FMADD xsum1, xtemp1, a1, xsum1
  783. STFD y05, 4 * SIZE(YY)
  784. FMADD y01, atemp1, a1, y01
  785. NOP2
  786. FMADD xsum2, xtemp2, a1, xsum2
  787. STFD y06, 5 * SIZE(YY)
  788. FMADD y02, atemp2, a1, y02
  789. LFD a1, 12 * SIZE(AO1)
  790. FMADD xsum3, xtemp1, a5, xsum3
  791. STFD y07, 6 * SIZE(YY)
  792. FMADD y03, atemp1, a3, y03
  793. NOP2
  794. FMADD xsum4, xtemp2, a5, xsum4
  795. STFD y08, 7 * SIZE(YY)
  796. FMADD y04, atemp2, a3, y04
  797. NOP2
  798. FMADD1 xsum1, xtemp2, a2, xsum1
  799. LFD y05, 12 * SIZE(YY)
  800. FNMSUB y01, atemp2, a2, y01
  801. NOP2
  802. FMADD2 xsum2, xtemp1, a2, xsum2
  803. LFD y06, 13 * SIZE(YY)
  804. FMADD y02, atemp1, a2, y02
  805. LFD a2, 13 * SIZE(AO1)
  806. FMADD1 xsum3, xtemp2, a6, xsum3
  807. LFD xtemp2, 13 * SIZE(XX)
  808. FNMSUB y03, atemp2, a4, y03
  809. NOP2
  810. FMADD2 xsum4, xtemp1, a6, xsum4
  811. LFD xtemp1, 12 * SIZE(XX)
  812. FMADD y04, atemp1, a4, y04
  813. NOP2
  814. FMADD xsum1, xtemp3, a3, xsum1
  815. LFD y07, 14 * SIZE(YY)
  816. FMADD y01, atemp3, a5, y01
  817. NOP2
  818. FMADD xsum2, xtemp4, a3, xsum2
  819. LFD a3, 14 * SIZE(AO1)
  820. FMADD y02, atemp4, a5, y02
  821. LFD a5, 12 * SIZE(AO2)
  822. FMADD xsum3, xtemp3, a7, xsum3
  823. LFD y08, 15 * SIZE(YY)
  824. FMADD y03, atemp3, a7, y03
  825. NOP2
  826. FMADD xsum4, xtemp4, a7, xsum4
  827. NOP1
  828. FMADD y04, atemp4, a7, y04
  829. LFD a7, 14 * SIZE(AO2)
  830. FMADD1 xsum1, xtemp4, a4, xsum1
  831. NOP1
  832. FNMSUB y01, atemp4, a6, y01
  833. NOP2
  834. FMADD2 xsum2, xtemp3, a4, xsum2
  835. LFD a4, 15 * SIZE(AO1)
  836. FMADD y02, atemp3, a6, y02
  837. LFD a6, 13 * SIZE(AO2)
  838. FMADD1 xsum3, xtemp4, a8, xsum3
  839. LFD xtemp4, 15 * SIZE(XX)
  840. FNMSUB y03, atemp4, a8, y03
  841. NOP2
  842. FMADD2 xsum4, xtemp3, a8, xsum4
  843. LFD xtemp3, 14 * SIZE(XX)
  844. FMADD y04, atemp3, a8, y04
  845. LFD a8, 15 * SIZE(AO2)
  846. FMADD xsum1, xtemp1, a1, xsum1
  847. STFD y01, 8 * SIZE(YY)
  848. FMADD y05, atemp1, a1, y05
  849. NOP2
  850. FMADD xsum2, xtemp2, a1, xsum2
  851. STFD y02, 9 * SIZE(YY)
  852. FMADD y06, atemp2, a1, y06
  853. LFD a1, 16 * SIZE(AO1)
  854. FMADD xsum3, xtemp1, a5, xsum3
  855. STFD y03, 10 * SIZE(YY)
  856. FMADD y07, atemp1, a3, y07
  857. NOP2
  858. FMADD xsum4, xtemp2, a5, xsum4
  859. STFD y04, 11 * SIZE(YY)
  860. FMADD y08, atemp2, a3, y08
  861. NOP2
  862. FMADD1 xsum1, xtemp2, a2, xsum1
  863. LFD y01, 16 * SIZE(YY)
  864. FNMSUB y05, atemp2, a2, y05
  865. NOP2
  866. FMADD2 xsum2, xtemp1, a2, xsum2
  867. LFD y02, 17 * SIZE(YY)
  868. FMADD y06, atemp1, a2, y06
  869. LFD a2, 17 * SIZE(AO1)
  870. FMADD1 xsum3, xtemp2, a6, xsum3
  871. LFD xtemp2, 17 * SIZE(XX)
  872. FNMSUB y07, atemp2, a4, y07
  873. NOP2
  874. FMADD2 xsum4, xtemp1, a6, xsum4
  875. LFD xtemp1, 16 * SIZE(XX)
  876. FMADD y08, atemp1, a4, y08
  877. addi AO2, AO2, 16 * SIZE
  878. FMADD xsum1, xtemp3, a3, xsum1
  879. LFD y03, 18 * SIZE(YY)
  880. FMADD y05, atemp3, a5, y05
  881. addi XX, XX, 16 * SIZE
  882. FMADD xsum2, xtemp4, a3, xsum2
  883. LFD a3, 18 * SIZE(AO1)
  884. FMADD y06, atemp4, a5, y06
  885. LFD a5, 0 * SIZE(AO2)
  886. FMADD xsum3, xtemp3, a7, xsum3
  887. LFD y04, 19 * SIZE(YY)
  888. FMADD y07, atemp3, a7, y07
  889. NOP2
  890. FMADD xsum4, xtemp4, a7, xsum4
  891. addi AO1, AO1, 16 * SIZE
  892. FMADD y08, atemp4, a7, y08
  893. LFD a7, 2 * SIZE(AO2)
  894. FMADD1 xsum1, xtemp4, a4, xsum1
  895. addi YY, YY, 16 * SIZE
  896. FNMSUB y05, atemp4, a6, y05
  897. NOP2
  898. FMADD2 xsum2, xtemp3, a4, xsum2
  899. LFD a4, 3 * SIZE(AO1)
  900. FMADD y06, atemp3, a6, y06
  901. LFD a6, 1 * SIZE(AO2)
  902. FMADD1 xsum3, xtemp4, a8, xsum3
  903. LFD xtemp4, 3 * SIZE(XX)
  904. FNMSUB y07, atemp4, a8, y07
  905. NOP2
  906. FMADD2 xsum4, xtemp3, a8, xsum4
  907. LFD xtemp3, 2 * SIZE(XX)
  908. FMADD y08, atemp3, a8, y08
  909. LFD a8, 3 * SIZE(AO2)
  910. STFD y05, -4 * SIZE(YY)
  911. STFD y06, -3 * SIZE(YY)
  912. STFD y07, -2 * SIZE(YY)
  913. STFD y08, -1 * SIZE(YY)
  914. .align 4
  /*
   * LL(15): remainder section executed only when bit 2 of IS is set.
   * andi. yields 0 or 4 (never negative), so the following ble is taken
   * exactly when the bit is clear.
   *
   * On entry a1..a8, xtemp1..xtemp4 and y01..y04 already hold the operands
   * for offsets 0..3 (they are consumed below before any reload).
   * The section updates eight SIZE-sized slots of YY (offsets 0..7),
   * accumulates into xsum1..xsum4, preloads the operands at offsets 8..11
   * for whatever runs next, and finally advances AO1, AO2, XX and YY by
   * 8 * SIZE.
   *
   * FMADD/FMADD1/FMADD2/FNMSUB, NOP1/NOP2, LFD/STFD and SIZE are macros
   * defined outside this section; NOP1/NOP2 are presumably scheduling
   * padding -- TODO confirm against their definitions.
   */
  915. LL(15):
  916. andi. r0, IS, 4
  917. ble LL(16)
  /* First half: update y01..y04 (YY offsets 0..3) and xsum1..xsum4 while
     loading y05..y08 (YY offsets 4..7) and the a / xtemp operands at
     offsets 4..7. */
  918. FMADD xsum1, xtemp1, a1, xsum1
  919. NOP1
  920. FMADD y01, atemp1, a1, y01
  921. NOP2
  922. FMADD xsum2, xtemp2, a1, xsum2
  923. NOP1
  924. FMADD y02, atemp2, a1, y02
  925. LFD a1, 4 * SIZE(AO1)
  926. FMADD xsum3, xtemp1, a5, xsum3
  927. NOP1
  928. FMADD y03, atemp1, a3, y03
  929. NOP2
  930. FMADD xsum4, xtemp2, a5, xsum4
  931. NOP1
  932. FMADD y04, atemp2, a3, y04
  933. NOP2
  934. FMADD1 xsum1, xtemp2, a2, xsum1
  935. LFD y05, 4 * SIZE(YY)
  936. FNMSUB y01, atemp2, a2, y01
  937. NOP2
  938. FMADD2 xsum2, xtemp1, a2, xsum2
  939. LFD y06, 5 * SIZE(YY)
  940. FMADD y02, atemp1, a2, y02
  941. LFD a2, 5 * SIZE(AO1)
  942. FMADD1 xsum3, xtemp2, a6, xsum3
  943. LFD xtemp2, 5 * SIZE(XX)
  944. FNMSUB y03, atemp2, a4, y03
  945. NOP2
  946. FMADD2 xsum4, xtemp1, a6, xsum4
  947. LFD xtemp1, 4 * SIZE(XX)
  948. FMADD y04, atemp1, a4, y04
  949. NOP2
  950. FMADD xsum1, xtemp3, a3, xsum1
  951. LFD y07, 6 * SIZE(YY)
  952. FMADD y01, atemp3, a5, y01
  953. NOP2
  954. FMADD xsum2, xtemp4, a3, xsum2
  955. LFD a3, 6 * SIZE(AO1)
  956. FMADD y02, atemp4, a5, y02
  957. LFD a5, 4 * SIZE(AO2)
  958. FMADD xsum3, xtemp3, a7, xsum3
  959. LFD y08, 7 * SIZE(YY)
  960. FMADD y03, atemp3, a7, y03
  961. NOP2
  962. FMADD xsum4, xtemp4, a7, xsum4
  963. NOP1
  964. FMADD y04, atemp4, a7, y04
  965. LFD a7, 6 * SIZE(AO2)
  966. FMADD1 xsum1, xtemp4, a4, xsum1
  967. NOP1
  968. FNMSUB y01, atemp4, a6, y01
  969. NOP2
  970. FMADD2 xsum2, xtemp3, a4, xsum2
  971. LFD a4, 7 * SIZE(AO1)
  972. FMADD y02, atemp3, a6, y02
  973. LFD a6, 5 * SIZE(AO2)
  974. FMADD1 xsum3, xtemp4, a8, xsum3
  975. LFD xtemp4, 7 * SIZE(XX)
  976. FNMSUB y03, atemp4, a8, y03
  977. NOP2
  978. FMADD2 xsum4, xtemp3, a8, xsum4
  979. LFD xtemp3, 6 * SIZE(XX)
  980. FMADD y04, atemp3, a8, y04
  981. LFD a8, 7 * SIZE(AO2)
  /* Second half: write back y01..y04 to YY offsets 0..3, update y05..y08
     (YY offsets 4..7), and preload y01..y04 plus the a / xtemp operands
     from offsets 8..11. */
  982. FMADD xsum1, xtemp1, a1, xsum1
  983. STFD y01, 0 * SIZE(YY)
  984. FMADD y05, atemp1, a1, y05
  985. NOP2
  986. FMADD xsum2, xtemp2, a1, xsum2
  987. STFD y02, 1 * SIZE(YY)
  988. FMADD y06, atemp2, a1, y06
  989. LFD a1, 8 * SIZE(AO1)
  990. FMADD xsum3, xtemp1, a5, xsum3
  991. STFD y03, 2 * SIZE(YY)
  992. FMADD y07, atemp1, a3, y07
  993. NOP2
  994. FMADD xsum4, xtemp2, a5, xsum4
  995. STFD y04, 3 * SIZE(YY)
  996. FMADD y08, atemp2, a3, y08
  997. NOP2
  998. FMADD1 xsum1, xtemp2, a2, xsum1
  999. LFD y01, 8 * SIZE(YY)
  1000. FNMSUB y05, atemp2, a2, y05
  1001. NOP2
  1002. FMADD2 xsum2, xtemp1, a2, xsum2
  1003. LFD y02, 9 * SIZE(YY)
  1004. FMADD y06, atemp1, a2, y06
  1005. LFD a2, 9 * SIZE(AO1)
  1006. FMADD1 xsum3, xtemp2, a6, xsum3
  1007. LFD xtemp2, 9 * SIZE(XX)
  1008. FNMSUB y07, atemp2, a4, y07
  1009. NOP2
  1010. FMADD2 xsum4, xtemp1, a6, xsum4
  1011. LFD xtemp1, 8 * SIZE(XX)
  1012. FMADD y08, atemp1, a4, y08
  1013. NOP2
  1014. FMADD xsum1, xtemp3, a3, xsum1
  1015. LFD y03, 10 * SIZE(YY)
  1016. FMADD y05, atemp3, a5, y05
  1017. NOP2
  1018. FMADD xsum2, xtemp4, a3, xsum2
  1019. LFD a3, 10 * SIZE(AO1)
  1020. FMADD y06, atemp4, a5, y06
  1021. LFD a5, 8 * SIZE(AO2)
  1022. FMADD xsum3, xtemp3, a7, xsum3
  1023. LFD y04, 11 * SIZE(YY)
  1024. FMADD y07, atemp3, a7, y07
  1025. NOP2
  1026. FMADD xsum4, xtemp4, a7, xsum4
  1027. NOP1
  1028. FMADD y08, atemp4, a7, y08
  1029. LFD a7, 10 * SIZE(AO2)
  1030. FMADD1 xsum1, xtemp4, a4, xsum1
  1031. NOP1
  1032. FNMSUB y05, atemp4, a6, y05
  1033. NOP2
  1034. FMADD2 xsum2, xtemp3, a4, xsum2
  1035. LFD a4, 11 * SIZE(AO1)
  1036. FMADD y06, atemp3, a6, y06
  1037. LFD a6, 9 * SIZE(AO2)
  1038. FMADD1 xsum3, xtemp4, a8, xsum3
  1039. LFD xtemp4, 11 * SIZE(XX)
  /* NOTE(review): unlike the matching FNMSUB in the first half, no NOP2
     follows this one; presumably this only affects instruction pairing,
     not results -- confirm against the NOP2 definition. */
  1040. FNMSUB y07, atemp4, a8, y07
  1041. FMADD2 xsum4, xtemp3, a8, xsum4
  1042. LFD xtemp3, 10 * SIZE(XX)
  1043. FMADD y08, atemp3, a8, y08
  1044. LFD a8, 11 * SIZE(AO2)
  /* Write back y05..y08 to YY offsets 4..7, then step all four pointers
     past the eight slots just processed so the preloaded operands
     correspond to offsets 0..3 of the advanced pointers. */
  1045. STFD y05, 4 * SIZE(YY)
  1046. STFD y06, 5 * SIZE(YY)
  1047. STFD y07, 6 * SIZE(YY)
  1048. STFD y08, 7 * SIZE(YY)
  1049. addi AO1, AO1, 8 * SIZE
  1050. addi AO2, AO2, 8 * SIZE
  1051. addi XX, XX, 8 * SIZE
  1052. addi YY, YY, 8 * SIZE
  1053. .align 4
  /*
   * LL(16): remainder section executed only when bit 1 of IS is set
   * (andi. yields 0 or 2, so ble is taken exactly when the bit is clear).
   *
   * Uses the already-loaded a1..a8, xtemp1..xtemp4 and y01..y04 to update
   * YY offsets 0..3 and accumulate into xsum1..xsum4, with no loads
   * interleaved.  Afterwards it reloads a1, a2, a5..a8 and y01..y04 from
   * offsets 4..7 and advances only YY (by 4 * SIZE).  AO1, AO2 and XX are
   * left unchanged, and a3, a4 and xtemp1..xtemp4 are not reloaded; the
   * LL(18) code that follows does not read a3/a4 and overwrites
   * xtemp1..xtemp4 before using them.
   */
  1054. LL(16):
  1055. andi. r0, IS, 2
  1056. ble LL(18)
  1057. FMADD xsum1, xtemp1, a1, xsum1
  1058. FMADD y01, atemp1, a1, y01
  1059. FMADD xsum2, xtemp2, a1, xsum2
  1060. FMADD y02, atemp2, a1, y02
  1061. FMADD xsum3, xtemp1, a5, xsum3
  1062. FMADD y03, atemp1, a3, y03
  1063. FMADD xsum4, xtemp2, a5, xsum4
  1064. FMADD y04, atemp2, a3, y04
  1065. FMADD1 xsum1, xtemp2, a2, xsum1
  1066. FNMSUB y01, atemp2, a2, y01
  1067. FMADD2 xsum2, xtemp1, a2, xsum2
  1068. FMADD y02, atemp1, a2, y02
  1069. FMADD1 xsum3, xtemp2, a6, xsum3
  1070. FNMSUB y03, atemp2, a4, y03
  1071. FMADD2 xsum4, xtemp1, a6, xsum4
  1072. FMADD y04, atemp1, a4, y04
  1073. FMADD xsum1, xtemp3, a3, xsum1
  1074. FMADD y01, atemp3, a5, y01
  1075. FMADD xsum2, xtemp4, a3, xsum2
  1076. FMADD y02, atemp4, a5, y02
  1077. FMADD xsum3, xtemp3, a7, xsum3
  1078. FMADD y03, atemp3, a7, y03
  1079. FMADD xsum4, xtemp4, a7, xsum4
  1080. FMADD y04, atemp4, a7, y04
  1081. FMADD1 xsum1, xtemp4, a4, xsum1
  1082. FNMSUB y01, atemp4, a6, y01
  1083. FMADD2 xsum2, xtemp3, a4, xsum2
  1084. FMADD y02, atemp3, a6, y02
  1085. FMADD1 xsum3, xtemp4, a8, xsum3
  1086. FNMSUB y03, atemp4, a8, y03
  1087. FMADD2 xsum4, xtemp3, a8, xsum4
  1088. FMADD y04, atemp3, a8, y04
  /* Write back the four updated YY slots. */
  1089. STFD y01, 0 * SIZE(YY)
  1090. STFD y02, 1 * SIZE(YY)
  1091. STFD y03, 2 * SIZE(YY)
  1092. STFD y04, 3 * SIZE(YY)
  /* Reload the operands LL(18) reads, taken from the next four slots. */
  1093. LFD a1, 4 * SIZE(AO1)
  1094. LFD a2, 5 * SIZE(AO1)
  1095. LFD a5, 4 * SIZE(AO2)
  1096. LFD a6, 5 * SIZE(AO2)
  1097. LFD a7, 6 * SIZE(AO2)
  1098. LFD a8, 7 * SIZE(AO2)
  1099. LFD y01, 4 * SIZE(YY)
  1100. LFD y02, 5 * SIZE(YY)
  1101. LFD y03, 6 * SIZE(YY)
  1102. LFD y04, 7 * SIZE(YY)
  /* YY now points at the slots just loaded into y01..y04. */
  1103. addi YY, YY, 4 * SIZE
  1104. .align 4
  /*
   * LL(18): finish the current pass.
   *   1. Scale the accumulated sums by alpha: the pairs (xsum1, xsum2) and
   *      (xsum3, xsum4) are each combined with (ALPHA_R, ALPHA_I).
   *      Assuming FMADD d,a,b,c = a*b + c and FNMSUB d,a,b,c = c - a*b
   *      (macros defined outside this section -- TODO confirm), this is a
   *      complex multiply by alpha.
   *   2. Add the remaining atemp * a terms using a1, a2, a5..a8.  Two
   *      pairs of those terms are omitted when HEMV is defined.
   *   3. Add the sums into y01..y04 and store them at YY offsets 0..3.
   *   4. Loop control: TEMP = IS + 4 (using the old IS), IS += 2, and
   *      branch back to LL(11) (outside this section) while TEMP <= M.
   * The integer instructions are interleaved with the stores; the branch
   * consumes the cr0 result of the cmpw.
   */
  1105. LL(18):
  /* y05/y06 are reused here as scratch for alpha; xtemp1..xtemp4 are
     overwritten with partial products. */
  1106. LFD y05, ALPHA_R
  1107. LFD y06, ALPHA_I
  1108. FMUL xtemp1, y05, xsum1
  1109. FMUL xtemp2, y06, xsum1
  1110. FMUL xtemp3, y05, xsum3
  1111. FMUL xtemp4, y06, xsum3
  1112. FNMSUB xsum1, y06, xsum2, xtemp1
  1113. FMADD xsum2, y05, xsum2, xtemp2
  1114. FNMSUB xsum3, y06, xsum4, xtemp3
  1115. FMADD xsum4, y05, xsum4, xtemp4
  /* Remaining atemp * a contributions. */
  1116. FMADD xsum1, atemp1, a1, xsum1
  1117. FMADD xsum2, atemp2, a1, xsum2
  1118. FMADD xsum3, atemp1, a5, xsum3
  1119. FMADD xsum4, atemp2, a5, xsum4
  /* The a2 terms are skipped for HEMV builds. */
  1120. #ifndef HEMV
  1121. FMADD1 xsum1, atemp2, a2, xsum1
  1122. FMADD2 xsum2, atemp1, a2, xsum2
  1123. #endif
  1124. FMADD1 xsum3, atemp2, a6, xsum3
  1125. FMADD2 xsum4, atemp1, a6, xsum4
  1126. FMADD xsum1, atemp3, a5, xsum1
  1127. FMADD xsum2, atemp4, a5, xsum2
  1128. FMADD xsum3, atemp3, a7, xsum3
  1129. FMADD xsum4, atemp4, a7, xsum4
  1130. FNMSUB xsum1, atemp4, a6, xsum1
  1131. FMADD xsum2, atemp3, a6, xsum2
  /* The a8 terms are skipped for HEMV builds. */
  1132. #ifndef HEMV
  1133. FNMSUB xsum3, atemp4, a8, xsum3
  1134. FMADD xsum4, atemp3, a8, xsum4
  1135. #endif
  /* Fold the sums into the four pending YY values and store them. */
  1136. FADD y01, y01, xsum1
  1137. FADD y02, y02, xsum2
  1138. FADD y03, y03, xsum3
  1139. FADD y04, y04, xsum4
  1140. STFD y01, 0 * SIZE(YY)
  1141. addi TEMP, IS, 4
  1142. STFD y02, 1 * SIZE(YY)
  1143. addi IS, IS, 2
  1144. STFD y03, 2 * SIZE(YY)
  1145. cmpw cr0, TEMP, M
  1146. STFD y04, 3 * SIZE(YY)
  1147. ble LL(11)
  1148. .align 4
  /*
   * LL(20): set-up for the path taken only when M is odd.
   * andi. yields 0 or 1, so ble jumps to LL(990) exactly when bit 0 of M
   * is clear.
   *
   * Otherwise:
   *   - AO1 = A.
   *   - TEMP = X + (IS << ZBASE_SHIFT); the pair loaded from TEMP is
   *     combined with (ALPHA_R, ALPHA_I) into atemp1/atemp2.  Assuming
   *     FNMSUB d,a,b,c = c - a*b and FMADD d,a,b,c = a*b + c (macros
   *     defined outside this section -- TODO confirm), this is a complex
   *     multiply by alpha.
   *   - xsum1 = xsum2 = value loaded from FZERO.
   *   - XX = X, YY = NEW_Y, and the first a / xtemp / y operands are
   *     preloaded from offsets 0 and 1.
   *   - CTR = IS; if IS <= 0 the LL(22) loop is skipped entirely.
   */
  1149. LL(20):
  1150. andi. TEMP, M, 1
  1151. ble LL(990)
  1152. mr AO1, A
  1153. slwi TEMP, IS, ZBASE_SHIFT
  1154. add TEMP, X, TEMP
  /* y05/y06 serve as scratch for alpha here. */
  1155. LFD y05, ALPHA_R
  1156. LFD y06, ALPHA_I
  1157. LFD xtemp1, 0 * SIZE(TEMP)
  1158. LFD xtemp2, 1 * SIZE(TEMP)
  1159. FMUL atemp1, y05, xtemp1
  1160. FMUL atemp2, y06, xtemp1
  1161. FNMSUB atemp1, y06, xtemp2, atemp1
  1162. FMADD atemp2, y05, xtemp2, atemp2
  /* Clear the two accumulators. */
  1163. lfd xsum1, FZERO
  1164. fmr xsum2, xsum1
  1165. mr XX, X
  1166. mr YY, NEW_Y
  /* Preload operands for the first loop iteration (or for LL(28) when the
     loop is skipped). */
  1167. LFD a1, 0 * SIZE(AO1)
  1168. LFD a2, 1 * SIZE(AO1)
  1169. LFD xtemp1, 0 * SIZE(XX)
  1170. LFD xtemp2, 1 * SIZE(XX)
  1171. LFD y01, 0 * SIZE(YY)
  1172. LFD y02, 1 * SIZE(YY)
  /* Loop count = IS; guard against a non-positive count before bdnz. */
  1173. mtspr CTR, IS
  1174. cmpwi cr0, IS, 0
  1175. ble LL(28)
  1176. .align 4
  /*
   * LL(22): counted loop (CTR is loaded from IS just before this label)
   * that handles one pair of SIZE-sized slots per iteration.
   *
   * Each iteration:
   *   - accumulates xtemp1/xtemp2 times a1/a2 into xsum1/xsum2,
   *   - accumulates atemp1/atemp2 times a1/a2 into y01/y02,
   *   - loads the next a1/a2 and xtemp1/xtemp2 from offsets 2 and 3,
   *   - advances AO1, XX and YY by 2 * SIZE,
   *   - stores the updated y01/y02 at the pre-advance position (offsets
   *     -2/-1 after the bump) and loads the next pair from offsets 0/1.
   * bdnz decrements CTR and repeats while it is non-zero.
   */
  1177. LL(22):
  1178. FMADD xsum1, xtemp1, a1, xsum1
  1179. FMADD y01, atemp1, a1, y01
  1180. FMADD xsum2, xtemp2, a1, xsum2
  1181. FMADD y02, atemp2, a1, y02
  1182. LFD a1, 2 * SIZE(AO1)
  1183. FMADD1 xsum1, xtemp2, a2, xsum1
  1184. LFD xtemp2, 3 * SIZE(XX)
  1185. FNMSUB y01, atemp2, a2, y01
  1186. FMADD2 xsum2, xtemp1, a2, xsum2
  1187. LFD xtemp1, 2 * SIZE(XX)
  1188. FMADD y02, atemp1, a2, y02
  1189. LFD a2, 3 * SIZE(AO1)
  1190. addi AO1, AO1, 2 * SIZE
  1191. addi XX, XX, 2 * SIZE
  1192. addi YY, YY, 2 * SIZE
  /* Negative offsets address the pair that was current before the bump. */
  1193. STFD y01, -2 * SIZE(YY)
  1194. LFD y01, 0 * SIZE(YY)
  1195. STFD y02, -1 * SIZE(YY)
  1196. LFD y02, 1 * SIZE(YY)
  1197. bdnz LL(22)
  1198. .align 4
  /*
   * LL(28): finish the odd-M path.
   * Combines (xsum1, xsum2) with (ALPHA_R, ALPHA_I), adds the atemp * a1
   * terms (and, unless HEMV is defined, the atemp * a2 terms), adds the
   * result into y01/y02 and stores the pair at YY offsets 0 and 1.
   * Assuming FNMSUB d,a,b,c = c - a*b and FMADD d,a,b,c = a*b + c (macros
   * defined outside this section -- TODO confirm), the first step is a
   * complex multiply by alpha.
   */
  1199. LL(28):
  /* y05/y06 are scratch for alpha; xtemp1/xtemp2 hold partial products. */
  1200. LFD y05, ALPHA_R
  1201. LFD y06, ALPHA_I
  1202. FMUL xtemp1, y05, xsum1
  1203. FMUL xtemp2, y06, xsum1
  1204. FNMSUB xsum1, y06, xsum2, xtemp1
  1205. FMADD xsum2, y05, xsum2, xtemp2
  1206. FMADD xsum1, atemp1, a1, xsum1
  1207. FMADD xsum2, atemp2, a1, xsum2
  /* The a2 terms are skipped for HEMV builds. */
  1208. #ifndef HEMV
  1209. FNMSUB xsum1, atemp2, a2, xsum1
  1210. FMADD xsum2, atemp1, a2, xsum2
  1211. #endif
  1212. FADD y01, y01, xsum1
  1213. FADD y02, y02, xsum2
  1214. STFD y01, 0 * SIZE(YY)
  1215. STFD y02, 1 * SIZE(YY)
  1216. .align 4
  /*
   * LL(990): decide whether results must be copied back through a stride.
   * If INCY equals 2 * SIZE the copy-back is skipped and control goes
   * straight to the exit at LL(999).  Otherwise YY is set to Y (Y becomes
   * the read cursor and YY the write cursor in the code below) and CTR is
   * loaded with M >> 2, the number of four-pair iterations; srawi. also
   * sets cr0, so ble skips the LL(991) loop when that count is <= 0.
   */
  1217. LL(990):
  1218. cmpwi cr0, INCY, 2 * SIZE
  1219. beq LL(999)
  1220. mr YY, Y
  1221. srawi. r0, M, 2
  1222. mtspr CTR, r0
  1223. ble LL(995)
  1224. .align 4
  /*
   * LL(991): strided copy-back loop, four pairs per iteration.
   *   - f0..f7  : four pairs read from Y, stepping Y by INCY after each.
   *   - f8..f15 : eight consecutive values read from NEW_Y, which is then
   *               advanced by 8 * SIZE.
   *   - The sums f8..f15 are written as four pairs through YY, stepping YY
   *     by INCY after each pair.
   * bdnz repeats while the CTR count (loaded before this label) is
   * non-zero.
   */
  1225. LL(991):
  1226. LFD f0, 0 * SIZE(Y)
  1227. LFD f1, 1 * SIZE(Y)
  1228. add Y, Y, INCY
  1229. LFD f2, 0 * SIZE(Y)
  1230. LFD f3, 1 * SIZE(Y)
  1231. add Y, Y, INCY
  1232. LFD f4, 0 * SIZE(Y)
  1233. LFD f5, 1 * SIZE(Y)
  1234. add Y, Y, INCY
  1235. LFD f6, 0 * SIZE(Y)
  1236. LFD f7, 1 * SIZE(Y)
  1237. add Y, Y, INCY
  1238. LFD f8, 0 * SIZE(NEW_Y)
  1239. LFD f9, 1 * SIZE(NEW_Y)
  1240. LFD f10, 2 * SIZE(NEW_Y)
  1241. LFD f11, 3 * SIZE(NEW_Y)
  1242. LFD f12, 4 * SIZE(NEW_Y)
  1243. LFD f13, 5 * SIZE(NEW_Y)
  1244. LFD f14, 6 * SIZE(NEW_Y)
  1245. LFD f15, 7 * SIZE(NEW_Y)
  1246. addi NEW_Y, NEW_Y, 8 * SIZE
  /* Element-wise sum of the NEW_Y values and the values read from Y. */
  1247. FADD f8, f8, f0
  1248. FADD f9, f9, f1
  1249. FADD f10, f10, f2
  1250. FADD f11, f11, f3
  1251. FADD f12, f12, f4
  1252. FADD f13, f13, f5
  1253. FADD f14, f14, f6
  1254. FADD f15, f15, f7
  /* Write the sums back through the strided cursor YY. */
  1255. STFD f8, 0 * SIZE(YY)
  1256. STFD f9, 1 * SIZE(YY)
  1257. add YY, YY, INCY
  1258. STFD f10, 0 * SIZE(YY)
  1259. STFD f11, 1 * SIZE(YY)
  1260. add YY, YY, INCY
  1261. STFD f12, 0 * SIZE(YY)
  1262. STFD f13, 1 * SIZE(YY)
  1263. add YY, YY, INCY
  1264. STFD f14, 0 * SIZE(YY)
  1265. STFD f15, 1 * SIZE(YY)
  1266. add YY, YY, INCY
  1267. bdnz LL(991)
  1268. .align 4
  /*
   * LL(995): copy-back remainder for two pairs, executed only when bit 1
   * of M is set (andi. yields 0 or 2, so ble is taken when the bit is
   * clear).  Reads two pairs from Y (stride INCY) and four consecutive
   * values from NEW_Y, adds them, and writes the two pairs through YY
   * (stride INCY).  Y, NEW_Y and YY are all left advanced past the data
   * consumed.
   */
  1269. LL(995):
  1270. andi. J, M, 2
  1271. ble LL(996)
  1272. LFD f0, 0 * SIZE(Y)
  1273. LFD f1, 1 * SIZE(Y)
  1274. add Y, Y, INCY
  1275. LFD f2, 0 * SIZE(Y)
  1276. LFD f3, 1 * SIZE(Y)
  1277. add Y, Y, INCY
  1278. LFD f8, 0 * SIZE(NEW_Y)
  1279. LFD f9, 1 * SIZE(NEW_Y)
  1280. LFD f10, 2 * SIZE(NEW_Y)
  1281. LFD f11, 3 * SIZE(NEW_Y)
  1282. addi NEW_Y, NEW_Y, 4 * SIZE
  1283. FADD f8, f8, f0
  1284. FADD f9, f9, f1
  1285. FADD f10, f10, f2
  1286. FADD f11, f11, f3
  1287. STFD f8, 0 * SIZE(YY)
  1288. STFD f9, 1 * SIZE(YY)
  1289. add YY, YY, INCY
  1290. STFD f10, 0 * SIZE(YY)
  1291. STFD f11, 1 * SIZE(YY)
  1292. add YY, YY, INCY
  1293. .align 4
  /*
   * LL(996): copy-back remainder for the final pair, executed only when
   * bit 0 of M is set (andi. yields 0 or 1, so ble is taken when the bit
   * is clear).  Adds one pair from Y to one pair from NEW_Y and stores the
   * result through YY.  No pointers are advanced because nothing follows.
   */
  1294. LL(996):
  1295. andi. J, M, 1
  1296. ble LL(999)
  1297. LFD f0, 0 * SIZE(Y)
  1298. LFD f1, 1 * SIZE(Y)
  1299. LFD f8, 0 * SIZE(NEW_Y)
  1300. LFD f9, 1 * SIZE(NEW_Y)
  1301. FADD f8, f8, f0
  1302. FADD f9, f9, f1
  1303. STFD f8, 0 * SIZE(YY)
  1304. STFD f9, 1 * SIZE(YY)
  1305. .align 4
  /*
   * LL(999): common exit.
   * Sets r3 to 0 (r3 is the integer return register under the PowerPC
   * ABIs), reloads f14..f31 from SP+0..SP+136 and r14..r27 from the save
   * area starting at SP+144, releases the STACKSIZE-byte frame and
   * returns with blr.  The matching stores are in the prologue, outside
   * this section.
   */
  1306. LL(999):
  1307. li r3, 0
  /* Floating-point registers: 8 bytes each, SP+0 .. SP+136. */
  1308. lfd f14, 0(SP)
  1309. lfd f15, 8(SP)
  1310. lfd f16, 16(SP)
  1311. lfd f17, 24(SP)
  1312. lfd f18, 32(SP)
  1313. lfd f19, 40(SP)
  1314. lfd f20, 48(SP)
  1315. lfd f21, 56(SP)
  1316. lfd f22, 64(SP)
  1317. lfd f23, 72(SP)
  1318. lfd f24, 80(SP)
  1319. lfd f25, 88(SP)
  1320. lfd f26, 96(SP)
  1321. lfd f27, 104(SP)
  1322. lfd f28, 112(SP)
  1323. lfd f29, 120(SP)
  1324. lfd f30, 128(SP)
  1325. lfd f31, 136(SP)
  /* General-purpose registers: 8-byte slots (ld) on 64-bit builds,
     4-byte slots (lwz) otherwise. */
  1326. #ifdef __64BIT__
  1327. ld r14, 144(SP)
  1328. ld r15, 152(SP)
  1329. ld r16, 160(SP)
  1330. ld r17, 168(SP)
  1331. ld r18, 176(SP)
  1332. ld r19, 184(SP)
  1333. ld r20, 192(SP)
  1334. ld r21, 200(SP)
  1335. ld r22, 208(SP)
  1336. ld r23, 216(SP)
  1337. ld r24, 224(SP)
  1338. ld r25, 232(SP)
  1339. ld r26, 240(SP)
  1340. ld r27, 248(SP)
  1341. #else
  1342. lwz r14, 144(SP)
  1343. lwz r15, 148(SP)
  1344. lwz r16, 152(SP)
  1345. lwz r17, 156(SP)
  1346. lwz r18, 160(SP)
  1347. lwz r19, 164(SP)
  1348. lwz r20, 168(SP)
  1349. lwz r21, 172(SP)
  1350. lwz r22, 176(SP)
  1351. lwz r23, 180(SP)
  1352. lwz r24, 184(SP)
  1353. lwz r25, 188(SP)
  1354. lwz r26, 192(SP)
  1355. lwz r27, 196(SP)
  1356. #endif
  /* Pop the frame and return to the caller. */
  1357. addi SP, SP, STACKSIZE
  1358. blr
  1359. EPILOGUE
  1360. #endif