You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

symv_U.S 29 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* set before common.h is included; presumably selects its assembler-side definitions - TODO confirm */
  39. #include "common.h" /* presumably supplies the macros used below but not defined in this file (SIZE, BASE_SHIFT, LFD, STFD, FMADD, PROLOGUE, ...) - TODO confirm */
  40. #if defined(linux) || defined(__FreeBSD__) /* argument-register names for linux / FreeBSD builds */
  41. #ifndef __64BIT__ /* branch taken when __64BIT__ is not defined */
  42. #define M r3 /* element count; the prologue branches to LL(999) when M <= 0 */
  43. #define IS r4 /* offset argument; the prologue rewrites it as M - IS before use */
  44. #define A r5 /* base pointer of the matrix data; AO1..AO4 are derived from it in steps of LDA */
  45. #define LDA r6 /* step between AO1..AO4; shifted left by BASE_SHIFT in the prologue */
  46. #define X r7 /* X pointer; replaced by BUFFER when the shifted INCX differs from SIZE */
  47. #define INCX r8 /* step between X elements; shifted left by BASE_SHIFT in the prologue */
  48. #define Y r9 /* Y pointer; copied to NEW_Y at LL(05) */
  49. #define INCY r10 /* step between Y elements; shifted left by BASE_SHIFT, then compared with SIZE */
  50. #define BUFFER r14 /* work area; loaded from the caller's frame (FRAMESLOT) in the prologue, r14 is saved first */
  51. #else /* __64BIT__ defined: same roles, different registers */
  52. #define M r3
  53. #define IS r4
  54. #define A r6
  55. #define LDA r7
  56. #define X r8
  57. #define INCX r9
  58. #define Y r10
  59. #define INCY r5 /* loaded from FRAMESLOT(0) of the caller's frame in the prologue */
  60. #define BUFFER r14 /* loaded from FRAMESLOT(1) of the caller's frame in the prologue */
  61. #endif
  62. #endif
  63. #if defined(_AIX) || defined(__APPLE__) /* argument-register names for AIX / Apple builds */
  64. #if !defined(__64BIT__) && defined(DOUBLE) /* DOUBLE without __64BIT__: Y, INCY and BUFFER all come from the caller's frame */
  65. #define M r3
  66. #define IS r4
  67. #define A r7
  68. #define LDA r8
  69. #define X r9
  70. #define INCX r10
  71. #define Y r5 /* loaded from FRAMESLOT(0) in the prologue */
  72. #define INCY r6 /* loaded from FRAMESLOT(1) in the prologue */
  73. #define BUFFER r14 /* loaded from FRAMESLOT(2) in the prologue */
  74. #else /* every other AIX / Apple configuration */
  75. #define M r3
  76. #define IS r4
  77. #define A r6
  78. #define LDA r7
  79. #define X r8
  80. #define INCX r9
  81. #define Y r10
  82. #define INCY r5 /* loaded from FRAMESLOT(0) in the prologue */
  83. #define BUFFER r14 /* loaded from FRAMESLOT(1) in the prologue */
  84. #endif
  85. #endif
  86. #define I r11
  87. #define J r12
  88. #define AO1 r15 /* AO1..AO4: four matrix pointers spaced LDA apart, starting at A (set up at LL(11)) */
  89. #define AO2 r16
  90. #define AO3 r17
  91. #define AO4 r18
  92. #define XX r19 /* running pointer into X (source pointer while X is being copied into BUFFER) */
  93. #define YY r20 /* running pointer into NEW_Y */
  94. #define NEW_Y r21 /* Y itself when the shifted INCY equals SIZE, otherwise the zero-filled area in BUFFER */
  95. #define TEMP r22 /* scratch for address and bound computations */
  96. #define PREA r24 /* index operand for dcbt / DCBT; loaded with PREFETCHSIZE_A * SIZE */
  97. #define y01 f0 /* y01..y04: four consecutive values loaded from / stored to YY */
  98. #define y02 f1 /* shares f1 with alpha (see the alpha define) */
  99. #define y03 f2
  100. #define y04 f3
  101. #define atemp1 f4 /* atemp1..atemp4: four X values at offset IS, multiplied by ALPHA at LL(11) */
  102. #define atemp2 f5
  103. #define atemp3 f6
  104. #define atemp4 f7
  105. #define xtemp1 f8 /* xtemp1..xtemp4: four X values streamed through XX */
  106. #define xtemp2 f9
  107. #define xtemp3 f10
  108. #define xtemp4 f11
  109. #define xsum1 f12 /* xsum1..xsum4: accumulators initialised from FZERO */
  110. #define xsum2 f13
  111. #define xsum3 f14 /* f14..f31 are saved on the stack by the prologue */
  112. #define xsum4 f15
  113. #define a1 f16 /* a1..a4: values loaded from AO1 (a1..a8 also serve as temporaries while copying X) */
  114. #define a2 f17
  115. #define a3 f18
  116. #define a4 f19
  117. #define a5 f20 /* a5..a8: values loaded from AO2 */
  118. #define a6 f21
  119. #define a7 f22
  120. #define a8 f23
  121. #define a9 f24 /* a9..a12: values loaded from AO3 */
  122. #define a10 f25
  123. #define a11 f26
  124. #define a12 f27
  125. #define a13 f28 /* a13..a16: values loaded from AO4 */
  126. #define a14 f29
  127. #define a15 f30
  128. #define a16 f31 /* also holds ALPHA briefly at LL(11) before being reloaded from AO4 */
  129. #define alpha f1 /* incoming scalar; same register as y02, so the prologue stores it to ALPHA before y02 is loaded */
  130. #if defined(PPCG4) /* PREFETCHSIZE_A: per-CPU value that is multiplied by SIZE to form PREA */
  131. #define PREFETCHSIZE_A 24
  132. #endif
  133. #if defined(PPC440) || defined(PPC440FP2)
  134. #define PREFETCHSIZE_A 24
  135. #endif
  136. #ifdef PPC970
  137. #define PREFETCHSIZE_A 64
  138. #endif
  139. #ifdef CELL
  140. #define PREFETCHSIZE_A 72
  141. #endif
  142. #ifdef POWER4
  143. #define PREFETCHSIZE_A 16
  144. #endif
  145. #ifdef POWER5
  146. #define PREFETCHSIZE_A 96
  147. #endif
  148. #ifdef POWER6
  149. #define PREFETCHSIZE_A 40
  150. #endif /* NOTE(review): no default PREFETCHSIZE_A is defined here when none of the CPU macros above is set - confirm the build always sets one */
  151. #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) /* NOP1 / NOP2 expand to nothing on these CPUs */
  152. #define NOP1
  153. #define NOP2
  154. #else /* elsewhere they are register-to-itself moves that leave the value unchanged; presumably scheduling filler between FMADDs - TODO confirm */
  155. #define NOP1 mr LDA, LDA
  156. #define NOP2 mr INCX, INCX
  157. #endif
  158. #ifndef NEEDPARAM
  159. #ifndef __64BIT__
  160. #define STACKSIZE 224
  161. #define ALPHA 200(SP)
  162. #define FZERO 208(SP)
  163. #else
  164. #define STACKSIZE 280
  165. #define ALPHA 256(SP)
  166. #define FZERO 264(SP)
  167. #endif
  168. PROLOGUE
  169. PROFCODE
  170. addi SP, SP, -STACKSIZE
  171. li r0, 0
  172. stfd f14, 0(SP)
  173. stfd f15, 8(SP)
  174. stfd f16, 16(SP)
  175. stfd f17, 24(SP)
  176. stfd f18, 32(SP)
  177. stfd f19, 40(SP)
  178. stfd f20, 48(SP)
  179. stfd f21, 56(SP)
  180. stfd f22, 64(SP)
  181. stfd f23, 72(SP)
  182. stfd f24, 80(SP)
  183. stfd f25, 88(SP)
  184. stfd f26, 96(SP)
  185. stfd f27, 104(SP)
  186. stfd f28, 112(SP)
  187. stfd f29, 120(SP)
  188. stfd f30, 128(SP)
  189. stfd f31, 136(SP)
  190. #ifdef __64BIT__
  191. std r0, FZERO
  192. std r14, 144(SP)
  193. std r15, 152(SP)
  194. std r16, 160(SP)
  195. std r17, 168(SP)
  196. std r18, 176(SP)
  197. std r19, 184(SP)
  198. std r20, 192(SP)
  199. std r21, 200(SP)
  200. std r22, 208(SP)
  201. std r23, 216(SP)
  202. std r24, 224(SP)
  203. std r25, 232(SP)
  204. std r26, 240(SP)
  205. std r27, 248(SP)
  206. #else
  207. stw r0, 0 + FZERO
  208. stw r0, 4 + FZERO
  209. stw r14, 144(SP)
  210. stw r15, 148(SP)
  211. stw r16, 152(SP)
  212. stw r17, 156(SP)
  213. stw r18, 160(SP)
  214. stw r19, 164(SP)
  215. stw r20, 168(SP)
  216. stw r21, 172(SP)
  217. stw r22, 176(SP)
  218. stw r23, 180(SP)
  219. stw r24, 184(SP)
  220. stw r25, 188(SP)
  221. stw r26, 192(SP)
  222. stw r27, 196(SP)
  223. #endif
  224. #if defined(linux) || defined(__FreeBSD__)
  225. #ifndef __64BIT__
  226. lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
  227. #else
  228. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  229. ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  230. #endif
  231. #endif
  232. #if defined(_AIX) || defined(__APPLE__)
  233. #ifndef __64BIT__
  234. #ifdef DOUBLE
  235. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  236. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  237. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  238. #else
  239. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  240. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  241. #endif
  242. #else
  243. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  244. ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  245. #endif
  246. #endif
  247. STFD alpha, ALPHA
  248. slwi LDA, LDA, BASE_SHIFT
  249. slwi INCX, INCX, BASE_SHIFT
  250. slwi INCY, INCY, BASE_SHIFT
  251. li PREA, PREFETCHSIZE_A * SIZE
  252. sub IS, M, IS
  253. cmpwi cr0, M, 0
  254. ble- LL(999)
  255. mullw TEMP, IS, LDA
  256. add A, A, TEMP
  257. cmpwi cr0, INCX, SIZE
  258. beq LL(05)
  259. mr XX, X
  260. mr X, BUFFER
  261. srawi. r0, M, 3
  262. mtspr CTR, r0
  263. ble LL(03)
  264. .align 4
  265. LL(01):
  266. LFD a1, 0 * SIZE(XX)
  267. add XX, XX, INCX
  268. LFD a2, 0 * SIZE(XX)
  269. add XX, XX, INCX
  270. LFD a3, 0 * SIZE(XX)
  271. add XX, XX, INCX
  272. LFD a4, 0 * SIZE(XX)
  273. add XX, XX, INCX
  274. LFD a5, 0 * SIZE(XX)
  275. add XX, XX, INCX
  276. LFD a6, 0 * SIZE(XX)
  277. add XX, XX, INCX
  278. LFD a7, 0 * SIZE(XX)
  279. add XX, XX, INCX
  280. LFD a8, 0 * SIZE(XX)
  281. add XX, XX, INCX
  282. dcbt XX, PREA
  283. dcbtst BUFFER, PREA
  284. STFD a1, 0 * SIZE(BUFFER)
  285. STFD a2, 1 * SIZE(BUFFER)
  286. STFD a3, 2 * SIZE(BUFFER)
  287. STFD a4, 3 * SIZE(BUFFER)
  288. STFD a5, 4 * SIZE(BUFFER)
  289. STFD a6, 5 * SIZE(BUFFER)
  290. STFD a7, 6 * SIZE(BUFFER)
  291. STFD a8, 7 * SIZE(BUFFER)
  292. addi BUFFER, BUFFER, 8 * SIZE
  293. bdnz LL(01)
  294. .align 4
  295. LL(03):
  296. andi. r0, M, 7
  297. mtspr CTR, r0
  298. ble LL(05)
  299. .align 4
  300. LL(04):
  301. LFD a1, 0 * SIZE(XX)
  302. add XX, XX, INCX
  303. STFD a1, 0 * SIZE(BUFFER)
  304. addi BUFFER, BUFFER, 1 * SIZE
  305. bdnz LL(04)
  306. .align 4
  307. LL(05):
  308. mr NEW_Y, Y
  309. lfd f0, FZERO
  310. cmpwi cr0, INCY, SIZE
  311. beq LL(10)
  312. mr NEW_Y, BUFFER
  313. addi r0, M, 7
  314. srawi. r0, r0, 3
  315. mtspr CTR, r0
  316. .align 4
  317. LL(06):
  318. STFD f0, 0 * SIZE(BUFFER)
  319. STFD f0, 1 * SIZE(BUFFER)
  320. STFD f0, 2 * SIZE(BUFFER)
  321. STFD f0, 3 * SIZE(BUFFER)
  322. STFD f0, 4 * SIZE(BUFFER)
  323. STFD f0, 5 * SIZE(BUFFER)
  324. STFD f0, 6 * SIZE(BUFFER)
  325. STFD f0, 7 * SIZE(BUFFER)
  326. addi BUFFER, BUFFER, 8 * SIZE
  327. bdnz LL(06)
  328. .align 4
  329. LL(10):
  330. addi TEMP, IS, 4
  331. cmpw cr0, TEMP, M
  332. bgt LL(20)
  333. .align 4
  334. LL(11):
  335. mr AO1, A
  336. add AO2, A, LDA
  337. add AO3, AO2, LDA
  338. add AO4, AO3, LDA
  339. add A, AO4, LDA
  340. slwi TEMP, IS, BASE_SHIFT
  341. add TEMP, X, TEMP
  342. LFD a16, ALPHA
  343. lfd xsum1, FZERO
  344. LFD atemp1, 0 * SIZE(TEMP)
  345. LFD atemp2, 1 * SIZE(TEMP)
  346. LFD atemp3, 2 * SIZE(TEMP)
  347. LFD atemp4, 3 * SIZE(TEMP)
  348. LFD xtemp1, 0 * SIZE(X)
  349. LFD xtemp2, 1 * SIZE(X)
  350. LFD xtemp3, 2 * SIZE(X)
  351. LFD xtemp4, 3 * SIZE(X)
  352. LFD y01, 0 * SIZE(NEW_Y)
  353. LFD y02, 1 * SIZE(NEW_Y)
  354. LFD y03, 2 * SIZE(NEW_Y)
  355. LFD y04, 3 * SIZE(NEW_Y)
  356. LFD a1, 0 * SIZE(AO1)
  357. FMUL atemp1, a16, atemp1
  358. LFD a2, 1 * SIZE(AO1)
  359. FMUL atemp2, a16, atemp2
  360. LFD a3, 2 * SIZE(AO1)
  361. FMUL atemp3, a16, atemp3
  362. LFD a4, 3 * SIZE(AO1)
  363. FMUL atemp4, a16, atemp4
  364. LFD a5, 0 * SIZE(AO2)
  365. fmr xsum2, xsum1
  366. LFD a6, 1 * SIZE(AO2)
  367. fmr xsum3, xsum1
  368. LFD a7, 2 * SIZE(AO2)
  369. fmr xsum4, xsum1
  370. LFD a8, 3 * SIZE(AO2)
  371. LFD a9, 0 * SIZE(AO3)
  372. LFD a10, 1 * SIZE(AO3)
  373. LFD a11, 2 * SIZE(AO3)
  374. LFD a12, 3 * SIZE(AO3)
  375. LFD a13, 0 * SIZE(AO4)
  376. LFD a14, 1 * SIZE(AO4)
  377. LFD a15, 2 * SIZE(AO4)
  378. LFD a16, 3 * SIZE(AO4)
  379. mr XX, X
  380. mr YY, NEW_Y
  381. srawi. r0, IS, 4
  382. mtspr CTR, r0
  383. ble LL(14)
  384. .align 4
  385. LL(12):
  386. FMADD xsum1, xtemp1, a1, xsum1
  387. DCBT(AO1, PREA)
  388. FMADD y01, atemp1, a1, y01
  389. LFD a1, 4 * SIZE(AO1)
  390. FMADD xsum2, xtemp1, a5, xsum2
  391. NOP1
  392. FMADD y02, atemp1, a2, y02
  393. NOP2
  394. FMADD xsum3, xtemp1, a9, xsum3
  395. NOP1
  396. FMADD y03, atemp1, a3, y03
  397. NOP2
  398. FMADD xsum4, xtemp1, a13, xsum4
  399. LFD xtemp1, 4 * SIZE(XX)
  400. FMADD y04, atemp1, a4, y04
  401. NOP2
  402. FMADD xsum1, xtemp2, a2, xsum1
  403. LFD a2, 5 * SIZE(AO1)
  404. FMADD y01, atemp2, a5, y01
  405. LFD a5, 4 * SIZE(AO2)
  406. FMADD xsum2, xtemp2, a6, xsum2
  407. NOP1
  408. FMADD y02, atemp2, a6, y02
  409. LFD a6, 5 * SIZE(AO2)
  410. FMADD xsum3, xtemp2, a10, xsum3
  411. NOP1
  412. FMADD y03, atemp2, a7, y03
  413. NOP2
  414. FMADD xsum4, xtemp2, a14, xsum4
  415. LFD xtemp2, 5 * SIZE(XX)
  416. FMADD y04, atemp2, a8, y04
  417. # DCBT(X, PREX)
  418. NOP2
  419. FMADD xsum1, xtemp3, a3, xsum1
  420. LFD a3, 6 * SIZE(AO1)
  421. FMADD y01, atemp3, a9, y01
  422. LFD a9, 4 * SIZE(AO3)
  423. FMADD xsum2, xtemp3, a7, xsum2
  424. LFD a7, 6 * SIZE(AO2)
  425. FMADD y02, atemp3, a10, y02
  426. LFD a10, 5 * SIZE(AO3)
  427. FMADD xsum3, xtemp3, a11, xsum3
  428. NOP1
  429. FMADD y03, atemp3, a11, y03
  430. LFD a11, 6 * SIZE(AO3)
  431. FMADD xsum4, xtemp3, a15, xsum4
  432. LFD xtemp3, 6 * SIZE(XX)
  433. FMADD y04, atemp3, a12, y04
  434. NOP2
  435. FMADD xsum1, xtemp4, a4, xsum1
  436. LFD a4, 7 * SIZE(AO1)
  437. FMADD y01, atemp4, a13, y01
  438. LFD a13, 4 * SIZE(AO4)
  439. FMADD xsum2, xtemp4, a8, xsum2
  440. LFD a8, 7 * SIZE(AO2)
  441. FMADD y02, atemp4, a14, y02
  442. LFD a14, 5 * SIZE(AO4)
  443. FMADD xsum3, xtemp4, a12, xsum3
  444. LFD a12, 7 * SIZE(AO3)
  445. FMADD y03, atemp4, a15, y03
  446. LFD a15, 6 * SIZE(AO4)
  447. FMADD xsum4, xtemp4, a16, xsum4
  448. LFD xtemp4, 7 * SIZE(XX)
  449. FMADD y04, atemp4, a16, y04
  450. LFD a16, 7 * SIZE(AO4)
  451. STFD y01, 0 * SIZE(YY)
  452. LFD y01, 4 * SIZE(YY)
  453. STFD y02, 1 * SIZE(YY)
  454. LFD y02, 5 * SIZE(YY)
  455. STFD y03, 2 * SIZE(YY)
  456. LFD y03, 6 * SIZE(YY)
  457. STFD y04, 3 * SIZE(YY)
  458. LFD y04, 7 * SIZE(YY)
  459. FMADD xsum1, xtemp1, a1, xsum1
  460. DCBT(AO2, PREA)
  461. FMADD y01, atemp1, a1, y01
  462. LFD a1, 8 * SIZE(AO1)
  463. FMADD xsum2, xtemp1, a5, xsum2
  464. NOP1
  465. FMADD y02, atemp1, a2, y02
  466. NOP2
  467. FMADD xsum3, xtemp1, a9, xsum3
  468. NOP1
  469. FMADD y03, atemp1, a3, y03
  470. NOP2
  471. FMADD xsum4, xtemp1, a13, xsum4
  472. LFD xtemp1, 8 * SIZE(XX)
  473. FMADD y04, atemp1, a4, y04
  474. NOP2
  475. FMADD xsum1, xtemp2, a2, xsum1
  476. LFD a2, 9 * SIZE(AO1)
  477. FMADD y01, atemp2, a5, y01
  478. LFD a5, 8 * SIZE(AO2)
  479. FMADD xsum2, xtemp2, a6, xsum2
  480. NOP1
  481. FMADD y02, atemp2, a6, y02
  482. LFD a6, 9 * SIZE(AO2)
  483. FMADD xsum3, xtemp2, a10, xsum3
  484. NOP1
  485. FMADD y03, atemp2, a7, y03
  486. NOP2
  487. FMADD xsum4, xtemp2, a14, xsum4
  488. LFD xtemp2, 9 * SIZE(XX)
  489. FMADD y04, atemp2, a8, y04
  490. NOP2
  491. FMADD xsum1, xtemp3, a3, xsum1
  492. LFD a3, 10 * SIZE(AO1)
  493. FMADD y01, atemp3, a9, y01
  494. LFD a9, 8 * SIZE(AO3)
  495. FMADD xsum2, xtemp3, a7, xsum2
  496. LFD a7, 10 * SIZE(AO2)
  497. FMADD y02, atemp3, a10, y02
  498. LFD a10, 9 * SIZE(AO3)
  499. FMADD xsum3, xtemp3, a11, xsum3
  500. NOP1
  501. FMADD y03, atemp3, a11, y03
  502. LFD a11, 10 * SIZE(AO3)
  503. FMADD xsum4, xtemp3, a15, xsum4
  504. LFD xtemp3, 10 * SIZE(XX)
  505. FMADD y04, atemp3, a12, y04
  506. NOP2
  507. FMADD xsum1, xtemp4, a4, xsum1
  508. LFD a4, 11 * SIZE(AO1)
  509. FMADD y01, atemp4, a13, y01
  510. LFD a13, 8 * SIZE(AO4)
  511. FMADD xsum2, xtemp4, a8, xsum2
  512. LFD a8, 11 * SIZE(AO2)
  513. FMADD y02, atemp4, a14, y02
  514. LFD a14, 9 * SIZE(AO4)
  515. FMADD xsum3, xtemp4, a12, xsum3
  516. LFD a12, 11 * SIZE(AO3)
  517. FMADD y03, atemp4, a15, y03
  518. LFD a15, 10 * SIZE(AO4)
  519. FMADD xsum4, xtemp4, a16, xsum4
  520. LFD xtemp4, 11 * SIZE(XX)
  521. FMADD y04, atemp4, a16, y04
  522. LFD a16, 11 * SIZE(AO4)
  523. STFD y01, 4 * SIZE(YY)
  524. LFD y01, 8 * SIZE(YY)
  525. STFD y02, 5 * SIZE(YY)
  526. LFD y02, 9 * SIZE(YY)
  527. STFD y03, 6 * SIZE(YY)
  528. LFD y03, 10 * SIZE(YY)
  529. STFD y04, 7 * SIZE(YY)
  530. LFD y04, 11 * SIZE(YY)
  531. FMADD xsum1, xtemp1, a1, xsum1
  532. DCBT(AO3, PREA)
  533. FMADD y01, atemp1, a1, y01
  534. LFD a1, 12 * SIZE(AO1)
  535. FMADD xsum2, xtemp1, a5, xsum2
  536. NOP1
  537. FMADD y02, atemp1, a2, y02
  538. NOP2
  539. FMADD xsum3, xtemp1, a9, xsum3
  540. NOP1
  541. FMADD y03, atemp1, a3, y03
  542. NOP2
  543. FMADD xsum4, xtemp1, a13, xsum4
  544. LFD xtemp1, 12 * SIZE(XX)
  545. FMADD y04, atemp1, a4, y04
  546. NOP2
  547. FMADD xsum1, xtemp2, a2, xsum1
  548. LFD a2, 13 * SIZE(AO1)
  549. FMADD y01, atemp2, a5, y01
  550. LFD a5, 12 * SIZE(AO2)
  551. FMADD xsum2, xtemp2, a6, xsum2
  552. NOP1
  553. FMADD y02, atemp2, a6, y02
  554. LFD a6, 13 * SIZE(AO2)
  555. FMADD xsum3, xtemp2, a10, xsum3
  556. NOP1
  557. FMADD y03, atemp2, a7, y03
  558. # DCBT(Y1, PREY)
  559. NOP2
  560. FMADD xsum4, xtemp2, a14, xsum4
  561. LFD xtemp2, 13 * SIZE(XX)
  562. FMADD y04, atemp2, a8, y04
  563. NOP2
  564. FMADD xsum1, xtemp3, a3, xsum1
  565. LFD a3, 14 * SIZE(AO1)
  566. FMADD y01, atemp3, a9, y01
  567. LFD a9, 12 * SIZE(AO3)
  568. FMADD xsum2, xtemp3, a7, xsum2
  569. LFD a7, 14 * SIZE(AO2)
  570. FMADD y02, atemp3, a10, y02
  571. LFD a10,13 * SIZE(AO3)
  572. FMADD xsum3, xtemp3, a11, xsum3
  573. NOP1
  574. FMADD y03, atemp3, a11, y03
  575. LFD a11, 14 * SIZE(AO3)
  576. FMADD xsum4, xtemp3, a15, xsum4
  577. LFD xtemp3, 14 * SIZE(XX)
  578. FMADD y04, atemp3, a12, y04
  579. NOP2
  580. FMADD xsum1, xtemp4, a4, xsum1
  581. LFD a4, 15 * SIZE(AO1)
  582. FMADD y01, atemp4, a13, y01
  583. LFD a13,12 * SIZE(AO4)
  584. FMADD xsum2, xtemp4, a8, xsum2
  585. LFD a8, 15 * SIZE(AO2)
  586. FMADD y02, atemp4, a14, y02
  587. LFD a14, 13 * SIZE(AO4)
  588. FMADD xsum3, xtemp4, a12, xsum3
  589. LFD a12, 15 * SIZE(AO3)
  590. FMADD y03, atemp4, a15, y03
  591. LFD a15, 14 * SIZE(AO4)
  592. FMADD xsum4, xtemp4, a16, xsum4
  593. LFD xtemp4, 15 * SIZE(XX)
  594. FMADD y04, atemp4, a16, y04
  595. LFD a16, 15 * SIZE(AO4)
  596. STFD y01, 8 * SIZE(YY)
  597. LFD y01, 12 * SIZE(YY)
  598. STFD y02, 9 * SIZE(YY)
  599. LFD y02, 13 * SIZE(YY)
  600. STFD y03, 10 * SIZE(YY)
  601. LFD y03, 14 * SIZE(YY)
  602. STFD y04, 11 * SIZE(YY)
  603. LFD y04, 15 * SIZE(YY)
  604. FMADD xsum1, xtemp1, a1, xsum1
  605. DCBT(AO4, PREA)
  606. FMADD y01, atemp1, a1, y01
  607. LFD a1, 16 * SIZE(AO1)
  608. FMADD xsum2, xtemp1, a5, xsum2
  609. NOP1
  610. FMADD y02, atemp1, a2, y02
  611. NOP2
  612. FMADD xsum3, xtemp1, a9, xsum3
  613. NOP1
  614. FMADD y03, atemp1, a3, y03
  615. NOP2
  616. FMADD xsum4, xtemp1, a13, xsum4
  617. LFD xtemp1, 16 * SIZE(XX)
  618. FMADD y04, atemp1, a4, y04
  619. addi YY, YY, 16 * SIZE
  620. FMADD xsum1, xtemp2, a2, xsum1
  621. LFD a2, 17 * SIZE(AO1)
  622. FMADD y01, atemp2, a5, y01
  623. LFD a5, 16 * SIZE(AO2)
  624. FMADD xsum2, xtemp2, a6, xsum2
  625. addi AO3, AO3, 16 * SIZE
  626. FMADD y02, atemp2, a6, y02
  627. LFD a6, 17 * SIZE(AO2)
  628. FMADD xsum3, xtemp2, a10, xsum3
  629. addi AO1, AO1, 16 * SIZE
  630. FMADD y03, atemp2, a7, y03
  631. addi AO2, AO2, 16 * SIZE
  632. FMADD xsum4, xtemp2, a14, xsum4
  633. LFD xtemp2, 17 * SIZE(XX)
  634. FMADD y04, atemp2, a8, y04
  635. addi AO4, AO4, 16 * SIZE
  636. FMADD xsum1, xtemp3, a3, xsum1
  637. LFD a3, 2 * SIZE(AO1)
  638. FMADD y01, atemp3, a9, y01
  639. LFD a9, 0 * SIZE(AO3)
  640. FMADD xsum2, xtemp3, a7, xsum2
  641. LFD a7, 2 * SIZE(AO2)
  642. FMADD y02, atemp3, a10, y02
  643. LFD a10, 1 * SIZE(AO3)
  644. FMADD xsum3, xtemp3, a11, xsum3
  645. NOP1
  646. FMADD y03, atemp3, a11, y03
  647. LFD a11, 2 * SIZE(AO3)
  648. FMADD xsum4, xtemp3, a15, xsum4
  649. LFD xtemp3, 18 * SIZE(XX)
  650. FMADD y04, atemp3, a12, y04
  651. addi XX, XX, 16 * SIZE
  652. FMADD xsum1, xtemp4, a4, xsum1
  653. LFD a4, 3 * SIZE(AO1)
  654. FMADD y01, atemp4, a13, y01
  655. LFD a13, 0 * SIZE(AO4)
  656. FMADD xsum2, xtemp4, a8, xsum2
  657. LFD a8, 3 * SIZE(AO2)
  658. FMADD y02, atemp4, a14, y02
  659. LFD a14, 1 * SIZE(AO4)
  660. FMADD xsum3, xtemp4, a12, xsum3
  661. LFD a12, 3 * SIZE(AO3)
  662. FMADD y03, atemp4, a15, y03
  663. LFD a15, 2 * SIZE(AO4)
  664. FMADD xsum4, xtemp4, a16, xsum4
  665. LFD xtemp4, 3 * SIZE(XX)
  666. FMADD y04, atemp4, a16, y04
  667. LFD a16, 3 * SIZE(AO4)
  668. STFD y01, -4 * SIZE(YY)
  669. LFD y01, 0 * SIZE(YY)
  670. STFD y02, -3 * SIZE(YY)
  671. LFD y02, 1 * SIZE(YY)
  672. STFD y03, -2 * SIZE(YY)
  673. LFD y03, 2 * SIZE(YY)
  674. STFD y04, -1 * SIZE(YY)
  675. LFD y04, 3 * SIZE(YY)
  676. bdnz LL(12)
  677. .align 4
  678. LL(14):
  679. andi. r0, IS, 8
  680. ble LL(15)
  681. FMADD xsum1, xtemp1, a1, xsum1
  682. NOP1
  683. FMADD y01, atemp1, a1, y01
  684. LFD a1, 4 * SIZE(AO1)
  685. FMADD xsum2, xtemp1, a5, xsum2
  686. NOP1
  687. FMADD y02, atemp1, a2, y02
  688. NOP2
  689. FMADD xsum3, xtemp1, a9, xsum3
  690. NOP1
  691. FMADD y03, atemp1, a3, y03
  692. NOP2
  693. FMADD xsum4, xtemp1, a13, xsum4
  694. LFD xtemp1, 4 * SIZE(XX)
  695. FMADD y04, atemp1, a4, y04
  696. NOP2
  697. FMADD xsum1, xtemp2, a2, xsum1
  698. LFD a2, 5 * SIZE(AO1)
  699. FMADD y01, atemp2, a5, y01
  700. LFD a5, 4 * SIZE(AO2)
  701. FMADD xsum2, xtemp2, a6, xsum2
  702. NOP1
  703. FMADD y02, atemp2, a6, y02
  704. LFD a6, 5 * SIZE(AO2)
  705. FMADD xsum3, xtemp2, a10, xsum3
  706. NOP1
  707. FMADD y03, atemp2, a7, y03
  708. NOP2
  709. FMADD xsum4, xtemp2, a14, xsum4
  710. LFD xtemp2, 5 * SIZE(XX)
  711. FMADD y04, atemp2, a8, y04
  712. NOP2
  713. FMADD xsum1, xtemp3, a3, xsum1
  714. LFD a3, 6 * SIZE(AO1)
  715. FMADD y01, atemp3, a9, y01
  716. LFD a9, 4 * SIZE(AO3)
  717. FMADD xsum2, xtemp3, a7, xsum2
  718. LFD a7, 6 * SIZE(AO2)
  719. FMADD y02, atemp3, a10, y02
  720. LFD a10, 5 * SIZE(AO3)
  721. FMADD xsum3, xtemp3, a11, xsum3
  722. NOP1
  723. FMADD y03, atemp3, a11, y03
  724. LFD a11, 6 * SIZE(AO3)
  725. FMADD xsum4, xtemp3, a15, xsum4
  726. LFD xtemp3, 6 * SIZE(XX)
  727. FMADD y04, atemp3, a12, y04
  728. NOP2
  729. FMADD xsum1, xtemp4, a4, xsum1
  730. LFD a4, 7 * SIZE(AO1)
  731. FMADD y01, atemp4, a13, y01
  732. LFD a13, 4 * SIZE(AO4)
  733. FMADD xsum2, xtemp4, a8, xsum2
  734. LFD a8, 7 * SIZE(AO2)
  735. FMADD y02, atemp4, a14, y02
  736. LFD a14, 5 * SIZE(AO4)
  737. FMADD xsum3, xtemp4, a12, xsum3
  738. LFD a12, 7 * SIZE(AO3)
  739. FMADD y03, atemp4, a15, y03
  740. LFD a15, 6 * SIZE(AO4)
  741. FMADD xsum4, xtemp4, a16, xsum4
  742. LFD xtemp4, 7 * SIZE(XX)
  743. FMADD y04, atemp4, a16, y04
  744. LFD a16, 7 * SIZE(AO4)
  745. STFD y01, 0 * SIZE(YY)
  746. LFD y01, 4 * SIZE(YY)
  747. STFD y02, 1 * SIZE(YY)
  748. LFD y02, 5 * SIZE(YY)
  749. STFD y03, 2 * SIZE(YY)
  750. LFD y03, 6 * SIZE(YY)
  751. STFD y04, 3 * SIZE(YY)
  752. LFD y04, 7 * SIZE(YY)
  753. FMADD xsum1, xtemp1, a1, xsum1
  754. NOP1
  755. FMADD y01, atemp1, a1, y01
  756. LFD a1, 8 * SIZE(AO1)
  757. FMADD xsum2, xtemp1, a5, xsum2
  758. NOP1
  759. FMADD y02, atemp1, a2, y02
  760. NOP2
  761. FMADD xsum3, xtemp1, a9, xsum3
  762. NOP1
  763. FMADD y03, atemp1, a3, y03
  764. NOP2
  765. FMADD xsum4, xtemp1, a13, xsum4
  766. LFD xtemp1, 8 * SIZE(XX)
  767. FMADD y04, atemp1, a4, y04
  768. NOP2
  769. FMADD xsum1, xtemp2, a2, xsum1
  770. LFD a2, 9 * SIZE(AO1)
  771. FMADD y01, atemp2, a5, y01
  772. LFD a5, 8 * SIZE(AO2)
  773. FMADD xsum2, xtemp2, a6, xsum2
  774. NOP1
  775. FMADD y02, atemp2, a6, y02
  776. LFD a6, 9 * SIZE(AO2)
  777. FMADD xsum3, xtemp2, a10, xsum3
  778. NOP1
  779. FMADD y03, atemp2, a7, y03
  780. NOP2
  781. FMADD xsum4, xtemp2, a14, xsum4
  782. LFD xtemp2, 9 * SIZE(XX)
  783. FMADD y04, atemp2, a8, y04
  784. NOP2
  785. FMADD xsum1, xtemp3, a3, xsum1
  786. LFD a3, 10 * SIZE(AO1)
  787. FMADD y01, atemp3, a9, y01
  788. LFD a9, 8 * SIZE(AO3)
  789. FMADD xsum2, xtemp3, a7, xsum2
  790. LFD a7, 10 * SIZE(AO2)
  791. FMADD y02, atemp3, a10, y02
  792. LFD a10, 9 * SIZE(AO3)
  793. FMADD xsum3, xtemp3, a11, xsum3
  794. NOP1
  795. FMADD y03, atemp3, a11, y03
  796. LFD a11, 10 * SIZE(AO3)
  797. FMADD xsum4, xtemp3, a15, xsum4
  798. LFD xtemp3, 10 * SIZE(XX)
  799. FMADD y04, atemp3, a12, y04
  800. NOP2
  801. FMADD xsum1, xtemp4, a4, xsum1
  802. LFD a4, 11 * SIZE(AO1)
  803. FMADD y01, atemp4, a13, y01
  804. LFD a13, 8 * SIZE(AO4)
  805. FMADD xsum2, xtemp4, a8, xsum2
  806. LFD a8, 11 * SIZE(AO2)
  807. FMADD y02, atemp4, a14, y02
  808. LFD a14, 9 * SIZE(AO4)
  809. FMADD xsum3, xtemp4, a12, xsum3
  810. LFD a12, 11 * SIZE(AO3)
  811. FMADD y03, atemp4, a15, y03
  812. LFD a15, 10 * SIZE(AO4)
  813. FMADD xsum4, xtemp4, a16, xsum4
  814. LFD xtemp4, 11 * SIZE(XX)
  815. FMADD y04, atemp4, a16, y04
  816. LFD a16, 11 * SIZE(AO4)
  817. addi AO1, AO1, 8 * SIZE
  818. addi AO2, AO2, 8 * SIZE
  819. addi AO3, AO3, 8 * SIZE
  820. addi AO4, AO4, 8 * SIZE
  821. STFD y01, 4 * SIZE(YY)
  822. LFD y01, 8 * SIZE(YY)
  823. STFD y02, 5 * SIZE(YY)
  824. LFD y02, 9 * SIZE(YY)
  825. STFD y03, 6 * SIZE(YY)
  826. LFD y03, 10 * SIZE(YY)
  827. STFD y04, 7 * SIZE(YY)
  828. LFD y04, 11 * SIZE(YY)
  829. addi XX, XX, 8 * SIZE
  830. addi YY, YY, 8 * SIZE
  831. .align 4
  832. LL(15):
  833. andi. r0, IS, 4
  834. ble LL(18)
  835. FMADD xsum1, xtemp1, a1, xsum1
  836. NOP1
  837. FMADD y01, atemp1, a1, y01
  838. LFD a1, 4 * SIZE(AO1)
  839. FMADD xsum2, xtemp1, a5, xsum2
  840. NOP1
  841. FMADD y02, atemp1, a2, y02
  842. NOP2
  843. FMADD xsum3, xtemp1, a9, xsum3
  844. NOP1
  845. FMADD y03, atemp1, a3, y03
  846. NOP2
  847. FMADD xsum4, xtemp1, a13, xsum4
  848. LFD xtemp1, 4 * SIZE(XX)
  849. FMADD y04, atemp1, a4, y04
  850. NOP2
  851. FMADD xsum1, xtemp2, a2, xsum1
  852. LFD a2, 5 * SIZE(AO1)
  853. FMADD y01, atemp2, a5, y01
  854. LFD a5, 4 * SIZE(AO2)
  855. FMADD xsum2, xtemp2, a6, xsum2
  856. NOP1
  857. FMADD y02, atemp2, a6, y02
  858. LFD a6, 5 * SIZE(AO2)
  859. FMADD xsum3, xtemp2, a10, xsum3
  860. NOP1
  861. FMADD y03, atemp2, a7, y03
  862. NOP2
  863. FMADD xsum4, xtemp2, a14, xsum4
  864. LFD xtemp2, 5 * SIZE(XX)
  865. FMADD y04, atemp2, a8, y04
  866. NOP2
  867. FMADD xsum1, xtemp3, a3, xsum1
  868. LFD a3, 6 * SIZE(AO1)
  869. FMADD y01, atemp3, a9, y01
  870. LFD a9, 4 * SIZE(AO3)
  871. FMADD xsum2, xtemp3, a7, xsum2
  872. LFD a7, 6 * SIZE(AO2)
  873. FMADD y02, atemp3, a10, y02
  874. LFD a10, 5 * SIZE(AO3)
  875. FMADD xsum3, xtemp3, a11, xsum3
  876. NOP1
  877. FMADD y03, atemp3, a11, y03
  878. LFD a11, 6 * SIZE(AO3)
  879. FMADD xsum4, xtemp3, a15, xsum4
  880. LFD xtemp3, 6 * SIZE(XX)
  881. FMADD y04, atemp3, a12, y04
  882. NOP2
  883. FMADD xsum1, xtemp4, a4, xsum1
  884. LFD a4, 7 * SIZE(AO1)
  885. FMADD y01, atemp4, a13, y01
  886. LFD a13, 4 * SIZE(AO4)
  887. FMADD xsum2, xtemp4, a8, xsum2
  888. LFD a8, 7 * SIZE(AO2)
  889. FMADD y02, atemp4, a14, y02
  890. LFD a14, 5 * SIZE(AO4)
  891. FMADD xsum3, xtemp4, a12, xsum3
  892. LFD a12, 7 * SIZE(AO3)
  893. FMADD y03, atemp4, a15, y03
  894. LFD a15, 6 * SIZE(AO4)
  895. FMADD xsum4, xtemp4, a16, xsum4
  896. LFD xtemp4, 7 * SIZE(XX)
  897. FMADD y04, atemp4, a16, y04
  898. LFD a16, 7 * SIZE(AO4)
  899. addi AO1, AO1, 4 * SIZE
  900. addi AO2, AO2, 4 * SIZE
  901. addi AO3, AO3, 4 * SIZE
  902. addi AO4, AO4, 4 * SIZE
  903. STFD y01, 0 * SIZE(YY)
  904. LFD y01, 4 * SIZE(YY)
  905. STFD y02, 1 * SIZE(YY)
  906. LFD y02, 5 * SIZE(YY)
  907. STFD y03, 2 * SIZE(YY)
  908. LFD y03, 6 * SIZE(YY)
  909. STFD y04, 3 * SIZE(YY)
  910. LFD y04, 7 * SIZE(YY)
  911. addi XX, XX, 4 * SIZE
  912. addi YY, YY, 4 * SIZE
  /* LL(18): finish the current 4-column panel and decide whether to start another. */
  /* NOTE(review): LFD/STFD/FMUL/FMADD/FADD are macros defined outside this chunk; */
  /* the comments here assume FMADD d, a, b, c computes d = a * b + c. */
  913. .align 4
  914. LL(18):
  /* Scale the four running sums by the value loaded from ALPHA. */
  915. LFD xtemp1, ALPHA
  916. FMUL xsum1, xtemp1, xsum1
  917. FMUL xsum2, xtemp1, xsum2
  918. FMUL xsum3, xtemp1, xsum3
  919. FMUL xsum4, xtemp1, xsum4
  /* Add atemp1..atemp4 times the a-values still held in registers. */
  /* Only a1, a5-a6, a9-a11 and a13-a16 are read, and the off-diagonal ones are */
  /* used twice (e.g. a5 feeds both xsum1 and xsum2). Presumably this mirrors the */
  /* upper triangle of the 4x4 diagonal block - confirm against the earlier loads. */
  920. FMADD xsum1, atemp1, a1, xsum1
  921. FMADD xsum2, atemp1, a5, xsum2
  922. FMADD xsum3, atemp1, a9, xsum3
  923. FMADD xsum4, atemp1, a13, xsum4
  924. FMADD xsum1, atemp2, a5, xsum1
  925. FMADD xsum2, atemp2, a6, xsum2
  926. FMADD xsum3, atemp2, a10, xsum3
  927. FMADD xsum4, atemp2, a14, xsum4
  928. FMADD xsum1, atemp3, a9, xsum1
  929. FMADD xsum2, atemp3, a10, xsum2
  930. FMADD xsum3, atemp3, a11, xsum3
  931. FMADD xsum4, atemp3, a15, xsum4
  932. FMADD xsum1, atemp4, a13, xsum1
  933. FMADD xsum2, atemp4, a14, xsum2
  934. FMADD xsum3, atemp4, a15, xsum3
  935. FMADD xsum4, atemp4, a16, xsum4
  /* Fold the sums into y01..y04 and store them at YY[0..3]. */
  936. FADD y01, y01, xsum1
  937. FADD y02, y02, xsum2
  938. FADD y03, y03, xsum3
  939. FADD y04, y04, xsum4
  940. STFD y01, 0 * SIZE(YY)
  941. STFD y02, 1 * SIZE(YY)
  942. STFD y03, 2 * SIZE(YY)
  943. STFD y04, 3 * SIZE(YY)
  /* TEMP = IS + 8 is taken before IS is advanced by 4, so the signed test below */
  /* is (new IS + 4) <= M: branch back to LL(11) while 4 more columns remain. */
  944. addi TEMP, IS, 8
  945. addi IS, IS, 4
  946. cmpw cr0, TEMP, M
  947. ble LL(11)
  /* LL(20): set up the 2-column remainder; skipped entirely when bit 1 of M is clear. */
  948. .align 4
  949. LL(20):
  /* andi. sets CR0 from (M & 2), which is either 0 or 2, so ble is taken only on 0. */
  950. andi. TEMP, M, 2
  951. ble LL(30)
  /* AO1 and AO2 point at two consecutive columns (LDA apart); A moves past both. */
  952. mr AO1, A
  953. add AO2, A, LDA
  954. add A, AO2, LDA
  /* atemp1/atemp2 = ALPHA value times the two values at X + (IS << BASE_SHIFT). */
  955. slwi TEMP, IS, BASE_SHIFT
  956. add TEMP, X, TEMP
  957. LFD atemp1, 0 * SIZE(TEMP)
  958. LFD atemp2, 1 * SIZE(TEMP)
  /* a1 is only a scratch register for ALPHA here; it is reloaded from AO1 below. */
  959. LFD a1, ALPHA
  960. FMUL atemp1, a1, atemp1
  961. FMUL atemp2, a1, atemp2
  /* Start both running sums from the value stored at FZERO. */
  962. lfd xsum1, FZERO
  963. fmr xsum2, xsum1
  /* Walk X and NEW_Y from their beginnings. */
  964. mr XX, X
  965. mr YY, NEW_Y
  /* Preload the first pair of x, y and column values for the loop at LL(22). */
  966. LFD xtemp1, 0 * SIZE(XX)
  967. LFD xtemp2, 1 * SIZE(XX)
  968. LFD y01, 0 * SIZE(YY)
  969. LFD y02, 1 * SIZE(YY)
  970. LFD a1, 0 * SIZE(AO1)
  971. LFD a2, 1 * SIZE(AO1)
  972. LFD a5, 0 * SIZE(AO2)
  973. LFD a6, 1 * SIZE(AO2)
  /* Loop count = IS >> 1. The ble tests CR0 as set by srawi. (mtspr leaves CR0 */
  /* alone), so the loop is bypassed when the count is not positive. */
  974. srawi. r0, IS, 1
  975. mtspr CTR, r0
  976. ble LL(28)
  /* LL(22): 2-column inner loop; each pass consumes two elements of XX/YY and of */
  /* both columns, and runs CTR times. */
  /* NOTE(review): assumes the FMADD macro computes d = a * b + c for FMADD d, a, b, c. */
  977. .align 4
  978. LL(22):
  /* xsum1 += xtemp1*a1 + xtemp2*a2 (column AO1); xsum2 += xtemp1*a5 + xtemp2*a6 (column AO2). */
  979. FMADD xsum1, xtemp1, a1, xsum1
  980. FMADD xsum2, xtemp1, a5, xsum2
  981. FMADD xsum1, xtemp2, a2, xsum1
  982. FMADD xsum2, xtemp2, a6, xsum2
  /* y01 += atemp1*a1 + atemp2*a5; y02 += atemp1*a2 + atemp2*a6. */
  983. FMADD y01, atemp1, a1, y01
  984. FMADD y02, atemp1, a2, y02
  985. FMADD y01, atemp2, a5, y01
  986. FMADD y02, atemp2, a6, y02
  /* Fetch the next pair of x and column values (offsets 2 and 3, before the pointers move). */
  987. LFD xtemp1, 2 * SIZE(XX)
  988. LFD xtemp2, 3 * SIZE(XX)
  989. LFD a1, 2 * SIZE(AO1)
  990. LFD a2, 3 * SIZE(AO1)
  991. LFD a5, 2 * SIZE(AO2)
  992. LFD a6, 3 * SIZE(AO2)
  /* Write back the two updated y values, then fetch the next two. */
  993. STFD y01, 0 * SIZE(YY)
  994. STFD y02, 1 * SIZE(YY)
  995. LFD y01, 2 * SIZE(YY)
  996. LFD y02, 3 * SIZE(YY)
  /* Advance all four pointers by two elements and repeat while CTR is non-zero. */
  997. addi AO1, AO1, 2 * SIZE
  998. addi AO2, AO2, 2 * SIZE
  999. addi XX, XX, 2 * SIZE
  1000. addi YY, YY, 2 * SIZE
  1001. bdnz LL(22)
  /* LL(28): finish the 2-column remainder using the values left in registers. */
  1002. .align 4
  1003. LL(28):
  /* Scale both running sums by the value loaded from ALPHA. */
  1004. LFD xtemp1, ALPHA
  1005. FMUL xsum1, xtemp1, xsum1
  1006. FMUL xsum2, xtemp1, xsum2
  /* xsum1 += atemp1*a1 + atemp2*a5; xsum2 += atemp1*a5 + atemp2*a6. */
  /* a2 is not read here: a5 serves both off-diagonal positions. Presumably this is */
  /* the mirrored upper triangle of the 2x2 diagonal block - confirm against caller. */
  1007. FMADD xsum1, atemp1, a1, xsum1
  1008. FMADD xsum2, atemp1, a5, xsum2
  1009. FMADD xsum1, atemp2, a5, xsum1
  1010. FMADD xsum2, atemp2, a6, xsum2
  /* Fold the sums into y01/y02 and store them at YY[0..1]. */
  1011. FADD y01, y01, xsum1
  1012. FADD y02, y02, xsum2
  1013. STFD y01, 0 * SIZE(YY)
  1014. STFD y02, 1 * SIZE(YY)
  /* Two more columns are done. */
  1015. addi IS, IS, 2
  /* LL(30): set up the single-column remainder; skipped when bit 0 of M is clear. */
  1016. .align 4
  1017. LL(30):
  /* andi. sets CR0 from (M & 1), which is 0 or 1, so ble is taken only on 0. */
  1018. andi. TEMP, M, 1
  1019. ble LL(990)
  1020. mr AO1, A
  /* atemp1 = ALPHA value times the value at X + (IS << BASE_SHIFT). */
  1021. slwi TEMP, IS, BASE_SHIFT
  1022. add TEMP, X, TEMP
  1023. LFD atemp1, 0 * SIZE(TEMP)
  /* a1 is only a scratch register for ALPHA here; it is reloaded from AO1 below. */
  1024. LFD a1, ALPHA
  1025. FMUL atemp1, a1, atemp1
  /* Start the running sum from the value stored at FZERO. */
  1026. lfd xsum1, FZERO
  /* Walk X and NEW_Y from their beginnings and preload the first x, y and column value. */
  1027. mr XX, X
  1028. mr YY, NEW_Y
  1029. LFD xtemp1, 0 * SIZE(XX)
  1030. LFD y01, 0 * SIZE(YY)
  1031. LFD a1, 0 * SIZE(AO1)
  /* Loop count = IS; bypass the loop at LL(32) when IS is not positive. */
  1032. mtspr CTR, IS
  1033. cmpwi cr0, IS, 0
  1034. ble LL(38)
  /* LL(32): single-column inner loop, one element per pass, CTR passes. */
  /* NOTE(review): assumes the FMADD macro computes d = a * b + c for FMADD d, a, b, c. */
  1035. .align 4
  1036. LL(32):
  /* xsum1 += xtemp1*a1; y01 += atemp1*a1. */
  1037. FMADD xsum1, xtemp1, a1, xsum1
  1038. FMADD y01, atemp1, a1, y01
  /* Fetch the next x and column value (offset 1, before the pointers move). */
  1039. LFD xtemp1, 1 * SIZE(XX)
  1040. LFD a1, 1 * SIZE(AO1)
  /* Write back the updated y value, then fetch the next one. */
  1041. STFD y01, 0 * SIZE(YY)
  1042. LFD y01, 1 * SIZE(YY)
  /* Advance the three pointers by one element and repeat while CTR is non-zero. */
  1043. addi AO1, AO1, 1 * SIZE
  1044. addi XX, XX, 1 * SIZE
  1045. addi YY, YY, 1 * SIZE
  1046. bdnz LL(32)
  /* LL(38): finish the single-column remainder. */
  1047. .align 4
  1048. LL(38):
  /* Scale the running sum by the value loaded from ALPHA. */
  1049. LFD xtemp1, ALPHA
  1050. FMUL xsum1, xtemp1, xsum1
  /* Add atemp1 times the a1 value left in the register by the preload/loop above. */
  1051. FMADD xsum1, atemp1, a1, xsum1
  /* Fold the sum into y01 and store it at YY[0]. */
  1052. FADD y01, y01, xsum1
  1053. STFD y01, 0 * SIZE(YY)
  /* LL(990): entry to the copy-back phase, which adds the contiguous NEW_Y values */
  /* into Y at stride INCY. Nothing is copied when INCY equals SIZE. */
  1054. .align 4
  1055. LL(990):
  1056. cmpwi cr0, INCY, SIZE
  1057. beq LL(999)
  /* YY keeps the write position; Y itself is advanced by the reads that follow. */
  1058. mr YY, Y
  /* Loop count for the 8-element loop = M >> 3; bypass it when not positive. */
  /* The ble tests CR0 as set by srawi. (mtspr leaves CR0 alone). */
  1059. srawi. r0, M, 3
  1060. mtspr CTR, r0
  1061. ble LL(995)
  /* LL(991): copy-back loop, 8 elements per pass, CTR passes. */
  /* Each pass stores NEW_Y[k] + (value read from Y) through YY, for k = 0..7. */
  1062. .align 4
  1063. LL(991):
  /* Read 8 values from Y into f0..f7, stepping Y by INCY after each one. */
  1064. LFD f0, 0 * SIZE(Y)
  1065. add Y, Y, INCY
  1066. LFD f1, 0 * SIZE(Y)
  1067. add Y, Y, INCY
  1068. LFD f2, 0 * SIZE(Y)
  1069. add Y, Y, INCY
  1070. LFD f3, 0 * SIZE(Y)
  1071. add Y, Y, INCY
  1072. LFD f4, 0 * SIZE(Y)
  1073. add Y, Y, INCY
  1074. LFD f5, 0 * SIZE(Y)
  1075. add Y, Y, INCY
  1076. LFD f6, 0 * SIZE(Y)
  1077. add Y, Y, INCY
  1078. LFD f7, 0 * SIZE(Y)
  1079. add Y, Y, INCY
  /* Read 8 contiguous values from NEW_Y into f8..f15 and move NEW_Y past them. */
  1080. LFD f8, 0 * SIZE(NEW_Y)
  1081. LFD f9, 1 * SIZE(NEW_Y)
  1082. LFD f10, 2 * SIZE(NEW_Y)
  1083. LFD f11, 3 * SIZE(NEW_Y)
  1084. LFD f12, 4 * SIZE(NEW_Y)
  1085. LFD f13, 5 * SIZE(NEW_Y)
  1086. LFD f14, 6 * SIZE(NEW_Y)
  1087. LFD f15, 7 * SIZE(NEW_Y)
  1088. addi NEW_Y, NEW_Y, 8 * SIZE
  /* Element-wise sum of the two sets. */
  1089. FADD f8, f8, f0
  1090. FADD f9, f9, f1
  1091. FADD f10, f10, f2
  1092. FADD f11, f11, f3
  1093. FADD f12, f12, f4
  1094. FADD f13, f13, f5
  1095. FADD f14, f14, f6
  1096. FADD f15, f15, f7
  /* Write the 8 sums through YY, stepping YY by INCY after each one. */
  1097. STFD f8, 0 * SIZE(YY)
  1098. add YY, YY, INCY
  1099. STFD f9, 0 * SIZE(YY)
  1100. add YY, YY, INCY
  1101. STFD f10, 0 * SIZE(YY)
  1102. add YY, YY, INCY
  1103. STFD f11, 0 * SIZE(YY)
  1104. add YY, YY, INCY
  1105. STFD f12, 0 * SIZE(YY)
  1106. add YY, YY, INCY
  1107. STFD f13, 0 * SIZE(YY)
  1108. add YY, YY, INCY
  1109. STFD f14, 0 * SIZE(YY)
  1110. add YY, YY, INCY
  1111. STFD f15, 0 * SIZE(YY)
  1112. add YY, YY, INCY
  1113. bdnz LL(991)
  /* LL(995): copy-back of 4 more elements, done only when bit 2 of M is set. */
  1114. .align 4
  1115. LL(995):
  /* andi. sets CR0 from (M & 4), which is 0 or 4, so ble is taken only on 0. */
  1116. andi. J, M, 4
  1117. ble LL(996)
  /* Read 4 values from Y into f0..f3, stepping Y by INCY after each one. */
  1118. LFD f0, 0 * SIZE(Y)
  1119. add Y, Y, INCY
  1120. LFD f1, 0 * SIZE(Y)
  1121. add Y, Y, INCY
  1122. LFD f2, 0 * SIZE(Y)
  1123. add Y, Y, INCY
  1124. LFD f3, 0 * SIZE(Y)
  1125. add Y, Y, INCY
  /* Read 4 contiguous values from NEW_Y into f8..f11 and move NEW_Y past them. */
  1126. LFD f8, 0 * SIZE(NEW_Y)
  1127. LFD f9, 1 * SIZE(NEW_Y)
  1128. LFD f10, 2 * SIZE(NEW_Y)
  1129. LFD f11, 3 * SIZE(NEW_Y)
  1130. addi NEW_Y, NEW_Y, 4 * SIZE
  /* Element-wise sum of the two sets. */
  1131. FADD f8, f8, f0
  1132. FADD f9, f9, f1
  1133. FADD f10, f10, f2
  1134. FADD f11, f11, f3
  /* Write the 4 sums through YY, stepping YY by INCY after each one. */
  1135. STFD f8, 0 * SIZE(YY)
  1136. add YY, YY, INCY
  1137. STFD f9, 0 * SIZE(YY)
  1138. add YY, YY, INCY
  1139. STFD f10, 0 * SIZE(YY)
  1140. add YY, YY, INCY
  1141. STFD f11, 0 * SIZE(YY)
  1142. add YY, YY, INCY
  /* LL(996): copy-back of 2 more elements, done only when bit 1 of M is set. */
  1143. .align 4
  1144. LL(996):
  /* andi. sets CR0 from (M & 2), which is 0 or 2, so ble is taken only on 0. */
  1145. andi. J, M, 2
  1146. ble LL(997)
  /* Read 2 values from Y into f0/f1, stepping Y by INCY after each one. */
  1147. LFD f0, 0 * SIZE(Y)
  1148. add Y, Y, INCY
  1149. LFD f1, 0 * SIZE(Y)
  1150. add Y, Y, INCY
  /* Read 2 contiguous values from NEW_Y into f8/f9 and move NEW_Y past them. */
  1151. LFD f8, 0 * SIZE(NEW_Y)
  1152. LFD f9, 1 * SIZE(NEW_Y)
  1153. addi NEW_Y, NEW_Y, 2 * SIZE
  /* Sum and write through YY, stepping YY by INCY after each store. */
  1154. FADD f8, f8, f0
  1155. FADD f9, f9, f1
  1156. STFD f8, 0 * SIZE(YY)
  1157. add YY, YY, INCY
  1158. STFD f9, 0 * SIZE(YY)
  1159. add YY, YY, INCY
  /* LL(997): copy-back of the last element, done only when bit 0 of M is set. */
  /* No pointer is advanced afterwards; control falls through to LL(999). */
  1160. .align 4
  1161. LL(997):
  /* andi. sets CR0 from (M & 1), which is 0 or 1, so ble is taken only on 0. */
  1162. andi. J, M, 1
  1163. ble LL(999)
  /* Store (value at Y) + (value at NEW_Y) through YY. */
  1164. LFD f0, 0 * SIZE(Y)
  1165. LFD f8, 0 * SIZE(NEW_Y)
  1166. FADD f8, f8, f0
  1167. STFD f8, 0 * SIZE(YY)
  /* LL(999): common exit - set the return value, restore saved registers, release */
  /* the stack frame and return. */
  1168. .align 4
  1169. LL(999):
  /* r3 = 0 (r3 is the integer return register in the PowerPC ABIs). */
  1170. li r3, 0
  /* Reload f14..f31 from the 8-byte slots at 0..136(SP). */
  1171. lfd f14, 0(SP)
  1172. lfd f15, 8(SP)
  1173. lfd f16, 16(SP)
  1174. lfd f17, 24(SP)
  1175. lfd f18, 32(SP)
  1176. lfd f19, 40(SP)
  1177. lfd f20, 48(SP)
  1178. lfd f21, 56(SP)
  1179. lfd f22, 64(SP)
  1180. lfd f23, 72(SP)
  1181. lfd f24, 80(SP)
  1182. lfd f25, 88(SP)
  1183. lfd f26, 96(SP)
  1184. lfd f27, 104(SP)
  1185. lfd f28, 112(SP)
  1186. lfd f29, 120(SP)
  1187. lfd f30, 128(SP)
  1188. lfd f31, 136(SP)
  /* Reload r14..r27 starting at 144(SP): 8-byte slots (ld) in 64-bit builds, */
  /* 4-byte slots (lwz) otherwise. */
  1189. #ifdef __64BIT__
  1190. ld r14, 144(SP)
  1191. ld r15, 152(SP)
  1192. ld r16, 160(SP)
  1193. ld r17, 168(SP)
  1194. ld r18, 176(SP)
  1195. ld r19, 184(SP)
  1196. ld r20, 192(SP)
  1197. ld r21, 200(SP)
  1198. ld r22, 208(SP)
  1199. ld r23, 216(SP)
  1200. ld r24, 224(SP)
  1201. ld r25, 232(SP)
  1202. ld r26, 240(SP)
  1203. ld r27, 248(SP)
  1204. #else
  1205. lwz r14, 144(SP)
  1206. lwz r15, 148(SP)
  1207. lwz r16, 152(SP)
  1208. lwz r17, 156(SP)
  1209. lwz r18, 160(SP)
  1210. lwz r19, 164(SP)
  1211. lwz r20, 168(SP)
  1212. lwz r21, 172(SP)
  1213. lwz r22, 176(SP)
  1214. lwz r23, 180(SP)
  1215. lwz r24, 184(SP)
  1216. lwz r25, 188(SP)
  1217. lwz r26, 192(SP)
  1218. lwz r27, 196(SP)
  1219. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. */
  1220. addi SP, SP, STACKSIZE
  1221. blr
  /* EPILOGUE is a macro defined outside this chunk. */
  1222. EPILOGUE
  /* Closes a preprocessor conditional opened before this chunk. */
  1223. #endif