You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

symv_L.S 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register names for the routine's operands M, N, A, LDA, X, INCX, Y, */
  /* INCY and BUFFER. The mapping is chosen by the target OS macro and by */
  /* __64BIT__ / DOUBLE. In every configuration BUFFER (r14) is loaded by */
  /* the prologue from the caller's frame through FRAMESLOT(); some */
  /* configurations load Y and/or INCY the same way (noted per case). */
  /* NOTE(review): if neither OS group below is defined, this section */
  /* defines none of these names - confirm that is intended. */
  40. #if defined(linux) || defined(__FreeBSD__)
  41. #ifndef __64BIT__
  /* 32-bit: only BUFFER is loaded from the caller's frame. */
  42. #define M r3
  43. #define N r4
  44. #define A r5
  45. #define LDA r6
  46. #define X r7
  47. #define INCX r8
  48. #define Y r9
  49. #define INCY r10
  50. #define BUFFER r14
  51. #else
  /* 64-bit: the prologue loads INCY (into r5) and BUFFER from the */
  /* caller's frame. */
  52. #define M r3
  53. #define N r4
  54. #define A r6
  55. #define LDA r7
  56. #define X r8
  57. #define INCX r9
  58. #define Y r10
  59. #define INCY r5
  60. #define BUFFER r14
  61. #endif
  62. #endif
  63. #if defined(_AIX) || defined(__APPLE__)
  64. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit DOUBLE: the prologue loads Y (r5), INCY (r6) and BUFFER from */
  /* the caller's frame. */
  65. #define M r3
  66. #define N r4
  67. #define A r7
  68. #define LDA r8
  69. #define X r9
  70. #define INCX r10
  71. #define Y r5
  72. #define INCY r6
  73. #define BUFFER r14
  74. #else
  /* All other AIX/Apple builds: the prologue loads INCY (r5) and BUFFER */
  /* from the caller's frame. */
  75. #define M r3
  76. #define N r4
  77. #define A r6
  78. #define LDA r7
  79. #define X r8
  80. #define INCX r9
  81. #define Y r10
  82. #define INCY r5
  83. #define BUFFER r14
  84. #endif
  85. #endif
  /* Integer work registers. */
  /* NOTE(review): I and J are presumably loop indices - confirm against */
  /* their uses. */
  86. #define I r11
  87. #define J r12
  /* AO1..AO4: pointers to four LDA-strided columns of A */
  /* (A, A+LDA, A+2*LDA, A+3*LDA after LDA is scaled by BASE_SHIFT). */
  88. #define AO1 r15
  89. #define AO2 r16
  90. #define AO3 r17
  91. #define AO4 r18
  /* XX / YY: running pointers into the x data and the y work area; */
  /* each 4-column pass starts them at X and NEW_Y plus IS elements. */
  /* XX is also the source pointer while a strided X is copied to BUFFER. */
  92. #define XX r19
  93. #define YY r20
  /* NEW_Y: Y itself when the scaled INCY equals SIZE, otherwise the */
  /* zero-filled area of BUFFER that the routine accumulates into. */
  94. #define NEW_Y r21
  /* TEMP: scratch for offset and remaining-length computations. */
  95. #define TEMP r22
  /* PREA: prefetch offset, loaded with PREFETCHSIZE_A * SIZE. */
  96. #define PREA r24
  /* IS: element offset of the current block; initialised to 0. */
  97. #define IS r25
  /* Floating-point registers. */
  /* y01..y04: four y elements loaded from YY, updated and stored back. */
  98. #define y01 f0
  99. #define y02 f1
  100. #define y03 f2
  101. #define y04 f3
  /* atemp1..4: four x elements at XX; after the diagonal block they are */
  /* multiplied by alpha and used to update y. */
  102. #define atemp1 f4
  103. #define atemp2 f5
  104. #define atemp3 f6
  105. #define atemp4 f7
  /* xtemp1..4: the x elements that follow, streamed through the loops. */
  106. #define xtemp1 f8
  107. #define xtemp2 f9
  108. #define xtemp3 f10
  109. #define xtemp4 f11
  /* xsum1..4: accumulators for the x * a products, one per column. */
  110. #define xsum1 f12
  111. #define xsum2 f13
  112. #define xsum3 f14
  113. #define xsum4 f15
  /* a1..a16: elements of A, four per column pointer (a1-a4 from AO1, */
  /* a5-a8 from AO2, a9-a12 from AO3, a13-a16 from AO4). a1-a8 double as */
  /* scratch while X is copied, and a5 briefly holds alpha. */
  /* f14-f31 are saved on the stack by the prologue. */
  114. #define a1 f16
  115. #define a2 f17
  116. #define a3 f18
  117. #define a4 f19
  118. #define a5 f20
  119. #define a6 f21
  120. #define a7 f22
  121. #define a8 f23
  122. #define a9 f24
  123. #define a10 f25
  124. #define a11 f26
  125. #define a12 f27
  126. #define a13 f28
  127. #define a14 f29
  128. #define a15 f30
  129. #define a16 f31
  /* alpha shares f1 with y02: the prologue stores it to the ALPHA stack */
  /* slot before f1 is reused, and it is reloaded from there when needed. */
  130. #define alpha f1
  /* Prefetch distance, in units of SIZE, selected per CPU target. The */
  /* routine loads PREFETCHSIZE_A * SIZE into PREA and uses it as the */
  /* offset operand of its dcbt / dcbtst / DCBT prefetches. */
  /* NOTE(review): there is no fallback here; for a target not listed */
  /* below PREFETCHSIZE_A stays undefined unless common.h provides it. */
  131. #if defined(PPCG4)
  132. #define PREFETCHSIZE_A 24
  133. #endif
  134. #if defined(PPC440) || defined(PPC440FP2)
  135. #define PREFETCHSIZE_A 24
  136. #endif
  137. #ifdef PPC970
  138. #define PREFETCHSIZE_A 64
  139. #endif
  140. #ifdef CELL
  141. #define PREFETCHSIZE_A 72
  142. #endif
  143. #ifdef POWER4
  144. #define PREFETCHSIZE_A 16
  145. #endif
  146. #ifdef POWER5
  147. #define PREFETCHSIZE_A 96
  148. #endif
  149. #ifdef POWER6
  150. #define PREFETCHSIZE_A 40
  151. #endif
  /* NOP1 / NOP2 are placed between the FMADDs of the unrolled loops. */
  /* On POWER4/5/6 and PPC970 they expand to nothing; on other targets */
  /* they expand to a register-to-itself move (mr LDA, LDA and */
  /* mr INCX, INCX), which leaves the register contents unchanged. */
  /* NOTE(review): presumably issue-slot padding for scheduling - confirm. */
  152. #if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
  153. #define NOP1
  154. #define NOP2
  155. #else
  156. #define NOP1 mr LDA, LDA
  157. #define NOP2 mr INCX, INCX
  158. #endif
  159. #ifndef NEEDPARAM
  /* Stack frame layout. The prologue lowers SP by STACKSIZE, saves */
  /* f14-f31 at offsets 0..136 and r14-r27 starting at offset 144. */
  /* ALPHA is the slot that receives the alpha scalar; FZERO is an */
  /* 8-byte slot the prologue fills with zero bits and later loads as */
  /* a floating-point 0.0. */
  160. #ifndef __64BIT__
  /* 32-bit: GPRs are stored as 4-byte words at 144..196, so the scalar */
  /* slots start at 200. */
  161. #define STACKSIZE 224
  162. #define ALPHA 200(SP)
  163. #define FZERO 208(SP)
  164. #else
  /* 64-bit: GPRs are stored as 8-byte doublewords at 144..248, so the */
  /* scalar slots start at 256. */
  165. #define STACKSIZE 280
  166. #define ALPHA 256(SP)
  167. #define FZERO 264(SP)
  168. #endif
  169. PROLOGUE
  170. PROFCODE
  171. addi SP, SP, -STACKSIZE
  172. li r0, 0
  173. stfd f14, 0(SP)
  174. stfd f15, 8(SP)
  175. stfd f16, 16(SP)
  176. stfd f17, 24(SP)
  177. stfd f18, 32(SP)
  178. stfd f19, 40(SP)
  179. stfd f20, 48(SP)
  180. stfd f21, 56(SP)
  181. stfd f22, 64(SP)
  182. stfd f23, 72(SP)
  183. stfd f24, 80(SP)
  184. stfd f25, 88(SP)
  185. stfd f26, 96(SP)
  186. stfd f27, 104(SP)
  187. stfd f28, 112(SP)
  188. stfd f29, 120(SP)
  189. stfd f30, 128(SP)
  190. stfd f31, 136(SP)
  191. #ifdef __64BIT__
  192. std r0, FZERO
  193. std r14, 144(SP)
  194. std r15, 152(SP)
  195. std r16, 160(SP)
  196. std r17, 168(SP)
  197. std r18, 176(SP)
  198. std r19, 184(SP)
  199. std r20, 192(SP)
  200. std r21, 200(SP)
  201. std r22, 208(SP)
  202. std r23, 216(SP)
  203. std r24, 224(SP)
  204. std r25, 232(SP)
  205. std r26, 240(SP)
  206. std r27, 248(SP)
  207. #else
  208. stw r0, 0 + FZERO
  209. stw r0, 4 + FZERO
  210. stw r14, 144(SP)
  211. stw r15, 148(SP)
  212. stw r16, 152(SP)
  213. stw r17, 156(SP)
  214. stw r18, 160(SP)
  215. stw r19, 164(SP)
  216. stw r20, 168(SP)
  217. stw r21, 172(SP)
  218. stw r22, 176(SP)
  219. stw r23, 180(SP)
  220. stw r24, 184(SP)
  221. stw r25, 188(SP)
  222. stw r26, 192(SP)
  223. stw r27, 196(SP)
  224. #endif
  225. #if defined(linux) || defined(__FreeBSD__)
  226. #ifndef __64BIT__
  227. lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP)
  228. #else
  229. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  230. ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  231. #endif
  232. #endif
  233. #if defined(_AIX) || defined(__APPLE__)
  234. #ifndef __64BIT__
  235. #ifdef DOUBLE
  236. lwz Y, FRAMESLOT(0) + STACKSIZE(SP)
  237. lwz INCY, FRAMESLOT(1) + STACKSIZE(SP)
  238. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  239. #else
  240. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  241. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  242. #endif
  243. #else
  244. ld INCY, FRAMESLOT(0) + STACKSIZE(SP)
  245. ld BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  246. #endif
  247. #endif
  248. STFD alpha, ALPHA
  249. slwi LDA, LDA, BASE_SHIFT
  250. slwi INCX, INCX, BASE_SHIFT
  251. slwi INCY, INCY, BASE_SHIFT
  252. li PREA, PREFETCHSIZE_A * SIZE
  253. cmpwi cr0, M, 0
  254. ble- LL(999)
  255. cmpwi cr0, INCX, SIZE
  256. beq LL(05)
  257. mr XX, X
  258. mr X, BUFFER
  259. srawi. r0, M, 3
  260. mtspr CTR, r0
  261. ble LL(03)
  262. .align 4
  263. LL(01):
  264. LFD a1, 0 * SIZE(XX)
  265. add XX, XX, INCX
  266. LFD a2, 0 * SIZE(XX)
  267. add XX, XX, INCX
  268. LFD a3, 0 * SIZE(XX)
  269. add XX, XX, INCX
  270. LFD a4, 0 * SIZE(XX)
  271. add XX, XX, INCX
  272. LFD a5, 0 * SIZE(XX)
  273. add XX, XX, INCX
  274. LFD a6, 0 * SIZE(XX)
  275. add XX, XX, INCX
  276. LFD a7, 0 * SIZE(XX)
  277. add XX, XX, INCX
  278. LFD a8, 0 * SIZE(XX)
  279. add XX, XX, INCX
  280. dcbt XX, PREA
  281. dcbtst BUFFER, PREA
  282. STFD a1, 0 * SIZE(BUFFER)
  283. STFD a2, 1 * SIZE(BUFFER)
  284. STFD a3, 2 * SIZE(BUFFER)
  285. STFD a4, 3 * SIZE(BUFFER)
  286. STFD a5, 4 * SIZE(BUFFER)
  287. STFD a6, 5 * SIZE(BUFFER)
  288. STFD a7, 6 * SIZE(BUFFER)
  289. STFD a8, 7 * SIZE(BUFFER)
  290. addi BUFFER, BUFFER, 8 * SIZE
  291. bdnz LL(01)
  292. .align 4
  293. LL(03):
  294. andi. r0, M, 7
  295. mtspr CTR, r0
  296. ble LL(05)
  297. .align 4
  298. LL(04):
  299. LFD a1, 0 * SIZE(XX)
  300. add XX, XX, INCX
  301. STFD a1, 0 * SIZE(BUFFER)
  302. addi BUFFER, BUFFER, 1 * SIZE
  303. bdnz LL(04)
  304. .align 4
  305. LL(05):
  306. mr NEW_Y, Y
  307. lfd f0, FZERO
  308. cmpwi cr0, INCY, SIZE
  309. beq LL(10)
  310. mr NEW_Y, BUFFER
  311. addi r0, M, 7
  312. srawi. r0, r0, 3
  313. mtspr CTR, r0
  314. .align 4
  315. LL(06):
  316. STFD f0, 0 * SIZE(BUFFER)
  317. STFD f0, 1 * SIZE(BUFFER)
  318. STFD f0, 2 * SIZE(BUFFER)
  319. STFD f0, 3 * SIZE(BUFFER)
  320. STFD f0, 4 * SIZE(BUFFER)
  321. STFD f0, 5 * SIZE(BUFFER)
  322. STFD f0, 6 * SIZE(BUFFER)
  323. STFD f0, 7 * SIZE(BUFFER)
  324. addi BUFFER, BUFFER, 8 * SIZE
  325. bdnz LL(06)
  326. .align 4
  327. LL(10):
  328. li IS, 0
  329. cmpwi cr0, N, 4
  330. blt LL(20)
  331. .align 4
  332. LL(11):
  333. mr AO1, A
  334. add AO2, A, LDA
  335. add AO3, AO2, LDA
  336. add AO4, AO3, LDA
  337. add A, AO4, LDA
  338. addi A, A, 4 * SIZE
  339. slwi TEMP, IS, BASE_SHIFT
  340. add XX, X, TEMP
  341. add YY, NEW_Y, TEMP
  342. LFD atemp1, 0 * SIZE(XX)
  343. LFD atemp2, 1 * SIZE(XX)
  344. LFD atemp3, 2 * SIZE(XX)
  345. LFD atemp4, 3 * SIZE(XX)
  346. LFD a1, 0 * SIZE(AO1)
  347. LFD a2, 1 * SIZE(AO1)
  348. LFD a3, 2 * SIZE(AO1)
  349. LFD a4, 3 * SIZE(AO1)
  350. LFD a6, 1 * SIZE(AO2)
  351. LFD a7, 2 * SIZE(AO2)
  352. LFD a8, 3 * SIZE(AO2)
  353. LFD a11, 2 * SIZE(AO3)
  354. LFD a12, 3 * SIZE(AO3)
  355. LFD a16, 3 * SIZE(AO4)
  356. LFD a5, ALPHA
  357. FMUL xsum1, atemp1, a1
  358. FMUL xsum2, atemp1, a2
  359. FMUL xsum3, atemp1, a3
  360. FMUL xsum4, atemp1, a4
  361. FMADD xsum1, atemp2, a2, xsum1
  362. FMADD xsum2, atemp2, a6, xsum2
  363. FMADD xsum3, atemp2, a7, xsum3
  364. FMADD xsum4, atemp2, a8, xsum4
  365. FMADD xsum1, atemp3, a3, xsum1
  366. FMADD xsum2, atemp3, a7, xsum2
  367. FMADD xsum3, atemp3, a11, xsum3
  368. FMADD xsum4, atemp3, a12, xsum4
  369. FMADD xsum1, atemp4, a4, xsum1
  370. FMADD xsum2, atemp4, a8, xsum2
  371. FMADD xsum3, atemp4, a12, xsum3
  372. FMADD xsum4, atemp4, a16, xsum4
  373. FMUL atemp1, a5, atemp1
  374. FMUL atemp2, a5, atemp2
  375. FMUL atemp3, a5, atemp3
  376. FMUL atemp4, a5, atemp4
  377. LFD xtemp1, 4 * SIZE(XX)
  378. LFD xtemp2, 5 * SIZE(XX)
  379. LFD xtemp3, 6 * SIZE(XX)
  380. LFD xtemp4, 7 * SIZE(XX)
  381. LFD y01, 4 * SIZE(YY)
  382. LFD y02, 5 * SIZE(YY)
  383. LFD y03, 6 * SIZE(YY)
  384. LFD y04, 7 * SIZE(YY)
  385. LFD a1, 4 * SIZE(AO1)
  386. LFD a2, 5 * SIZE(AO1)
  387. LFD a3, 6 * SIZE(AO1)
  388. LFD a4, 7 * SIZE(AO1)
  389. LFD a5, 4 * SIZE(AO2)
  390. LFD a6, 5 * SIZE(AO2)
  391. LFD a7, 6 * SIZE(AO2)
  392. LFD a8, 7 * SIZE(AO2)
  393. LFD a9, 4 * SIZE(AO3)
  394. LFD a10, 5 * SIZE(AO3)
  395. LFD a11, 6 * SIZE(AO3)
  396. LFD a12, 7 * SIZE(AO3)
  397. LFD a13, 4 * SIZE(AO4)
  398. LFD a14, 5 * SIZE(AO4)
  399. LFD a15, 6 * SIZE(AO4)
  400. LFD a16, 7 * SIZE(AO4)
  401. addi AO1, AO1, 4 * SIZE
  402. addi AO2, AO2, 4 * SIZE
  403. addi AO3, AO3, 4 * SIZE
  404. addi AO4, AO4, 4 * SIZE
  405. addi XX, XX, 4 * SIZE
  406. addi YY, YY, 4 * SIZE
  407. sub TEMP, M, IS
  408. addi TEMP, TEMP, -4
  409. srawi. r0, TEMP, 4
  410. mtspr CTR, r0
  411. ble LL(14)
  412. .align 4
  413. LL(12):
  414. FMADD xsum1, xtemp1, a1, xsum1
  415. DCBT(AO1, PREA)
  416. FMADD y01, atemp1, a1, y01
  417. LFD a1, 4 * SIZE(AO1)
  418. FMADD xsum2, xtemp1, a5, xsum2
  419. NOP1
  420. FMADD y02, atemp1, a2, y02
  421. NOP2
  422. FMADD xsum3, xtemp1, a9, xsum3
  423. NOP1
  424. FMADD y03, atemp1, a3, y03
  425. NOP2
  426. FMADD xsum4, xtemp1, a13, xsum4
  427. LFD xtemp1, 4 * SIZE(XX)
  428. FMADD y04, atemp1, a4, y04
  429. NOP2
  430. FMADD xsum1, xtemp2, a2, xsum1
  431. LFD a2, 5 * SIZE(AO1)
  432. FMADD y01, atemp2, a5, y01
  433. LFD a5, 4 * SIZE(AO2)
  434. FMADD xsum2, xtemp2, a6, xsum2
  435. NOP1
  436. FMADD y02, atemp2, a6, y02
  437. LFD a6, 5 * SIZE(AO2)
  438. FMADD xsum3, xtemp2, a10, xsum3
  439. NOP1
  440. FMADD y03, atemp2, a7, y03
  441. NOP2
  442. FMADD xsum4, xtemp2, a14, xsum4
  443. LFD xtemp2, 5 * SIZE(XX)
  444. FMADD y04, atemp2, a8, y04
  445. # DCBT(X, PREX)
  446. NOP2
  447. FMADD xsum1, xtemp3, a3, xsum1
  448. LFD a3, 6 * SIZE(AO1)
  449. FMADD y01, atemp3, a9, y01
  450. LFD a9, 4 * SIZE(AO3)
  451. FMADD xsum2, xtemp3, a7, xsum2
  452. LFD a7, 6 * SIZE(AO2)
  453. FMADD y02, atemp3, a10, y02
  454. LFD a10, 5 * SIZE(AO3)
  455. FMADD xsum3, xtemp3, a11, xsum3
  456. NOP1
  457. FMADD y03, atemp3, a11, y03
  458. LFD a11, 6 * SIZE(AO3)
  459. FMADD xsum4, xtemp3, a15, xsum4
  460. LFD xtemp3, 6 * SIZE(XX)
  461. FMADD y04, atemp3, a12, y04
  462. NOP2
  463. FMADD xsum1, xtemp4, a4, xsum1
  464. LFD a4, 7 * SIZE(AO1)
  465. FMADD y01, atemp4, a13, y01
  466. LFD a13, 4 * SIZE(AO4)
  467. FMADD xsum2, xtemp4, a8, xsum2
  468. LFD a8, 7 * SIZE(AO2)
  469. FMADD y02, atemp4, a14, y02
  470. LFD a14, 5 * SIZE(AO4)
  471. FMADD xsum3, xtemp4, a12, xsum3
  472. LFD a12, 7 * SIZE(AO3)
  473. FMADD y03, atemp4, a15, y03
  474. LFD a15, 6 * SIZE(AO4)
  475. FMADD xsum4, xtemp4, a16, xsum4
  476. LFD xtemp4, 7 * SIZE(XX)
  477. FMADD y04, atemp4, a16, y04
  478. LFD a16, 7 * SIZE(AO4)
  479. STFD y01, 0 * SIZE(YY)
  480. LFD y01, 4 * SIZE(YY)
  481. STFD y02, 1 * SIZE(YY)
  482. LFD y02, 5 * SIZE(YY)
  483. STFD y03, 2 * SIZE(YY)
  484. LFD y03, 6 * SIZE(YY)
  485. STFD y04, 3 * SIZE(YY)
  486. LFD y04, 7 * SIZE(YY)
  487. FMADD xsum1, xtemp1, a1, xsum1
  488. DCBT(AO2, PREA)
  489. FMADD y01, atemp1, a1, y01
  490. LFD a1, 8 * SIZE(AO1)
  491. FMADD xsum2, xtemp1, a5, xsum2
  492. NOP1
  493. FMADD y02, atemp1, a2, y02
  494. NOP2
  495. FMADD xsum3, xtemp1, a9, xsum3
  496. NOP1
  497. FMADD y03, atemp1, a3, y03
  498. NOP2
  499. FMADD xsum4, xtemp1, a13, xsum4
  500. LFD xtemp1, 8 * SIZE(XX)
  501. FMADD y04, atemp1, a4, y04
  502. NOP2
  503. FMADD xsum1, xtemp2, a2, xsum1
  504. LFD a2, 9 * SIZE(AO1)
  505. FMADD y01, atemp2, a5, y01
  506. LFD a5, 8 * SIZE(AO2)
  507. FMADD xsum2, xtemp2, a6, xsum2
  508. NOP1
  509. FMADD y02, atemp2, a6, y02
  510. LFD a6, 9 * SIZE(AO2)
  511. FMADD xsum3, xtemp2, a10, xsum3
  512. NOP1
  513. FMADD y03, atemp2, a7, y03
  514. NOP2
  515. FMADD xsum4, xtemp2, a14, xsum4
  516. LFD xtemp2, 9 * SIZE(XX)
  517. FMADD y04, atemp2, a8, y04
  518. NOP2
  519. FMADD xsum1, xtemp3, a3, xsum1
  520. LFD a3, 10 * SIZE(AO1)
  521. FMADD y01, atemp3, a9, y01
  522. LFD a9, 8 * SIZE(AO3)
  523. FMADD xsum2, xtemp3, a7, xsum2
  524. LFD a7, 10 * SIZE(AO2)
  525. FMADD y02, atemp3, a10, y02
  526. LFD a10, 9 * SIZE(AO3)
  527. FMADD xsum3, xtemp3, a11, xsum3
  528. NOP1
  529. FMADD y03, atemp3, a11, y03
  530. LFD a11, 10 * SIZE(AO3)
  531. FMADD xsum4, xtemp3, a15, xsum4
  532. LFD xtemp3, 10 * SIZE(XX)
  533. FMADD y04, atemp3, a12, y04
  534. NOP2
  535. FMADD xsum1, xtemp4, a4, xsum1
  536. LFD a4, 11 * SIZE(AO1)
  537. FMADD y01, atemp4, a13, y01
  538. LFD a13, 8 * SIZE(AO4)
  539. FMADD xsum2, xtemp4, a8, xsum2
  540. LFD a8, 11 * SIZE(AO2)
  541. FMADD y02, atemp4, a14, y02
  542. LFD a14, 9 * SIZE(AO4)
  543. FMADD xsum3, xtemp4, a12, xsum3
  544. LFD a12, 11 * SIZE(AO3)
  545. FMADD y03, atemp4, a15, y03
  546. LFD a15, 10 * SIZE(AO4)
  547. FMADD xsum4, xtemp4, a16, xsum4
  548. LFD xtemp4, 11 * SIZE(XX)
  549. FMADD y04, atemp4, a16, y04
  550. LFD a16, 11 * SIZE(AO4)
  551. STFD y01, 4 * SIZE(YY)
  552. LFD y01, 8 * SIZE(YY)
  553. STFD y02, 5 * SIZE(YY)
  554. LFD y02, 9 * SIZE(YY)
  555. STFD y03, 6 * SIZE(YY)
  556. LFD y03, 10 * SIZE(YY)
  557. STFD y04, 7 * SIZE(YY)
  558. LFD y04, 11 * SIZE(YY)
  559. FMADD xsum1, xtemp1, a1, xsum1
  560. DCBT(AO3, PREA)
  561. FMADD y01, atemp1, a1, y01
  562. LFD a1, 12 * SIZE(AO1)
  563. FMADD xsum2, xtemp1, a5, xsum2
  564. NOP1
  565. FMADD y02, atemp1, a2, y02
  566. NOP2
  567. FMADD xsum3, xtemp1, a9, xsum3
  568. NOP1
  569. FMADD y03, atemp1, a3, y03
  570. NOP2
  571. FMADD xsum4, xtemp1, a13, xsum4
  572. LFD xtemp1, 12 * SIZE(XX)
  573. FMADD y04, atemp1, a4, y04
  574. NOP2
  575. FMADD xsum1, xtemp2, a2, xsum1
  576. LFD a2, 13 * SIZE(AO1)
  577. FMADD y01, atemp2, a5, y01
  578. LFD a5, 12 * SIZE(AO2)
  579. FMADD xsum2, xtemp2, a6, xsum2
  580. NOP1
  581. FMADD y02, atemp2, a6, y02
  582. LFD a6, 13 * SIZE(AO2)
  583. FMADD xsum3, xtemp2, a10, xsum3
  584. NOP1
  585. FMADD y03, atemp2, a7, y03
  586. # DCBT(Y1, PREY)
  587. NOP2
  588. FMADD xsum4, xtemp2, a14, xsum4
  589. LFD xtemp2, 13 * SIZE(XX)
  590. FMADD y04, atemp2, a8, y04
  591. NOP2
  592. FMADD xsum1, xtemp3, a3, xsum1
  593. LFD a3, 14 * SIZE(AO1)
  594. FMADD y01, atemp3, a9, y01
  595. LFD a9, 12 * SIZE(AO3)
  596. FMADD xsum2, xtemp3, a7, xsum2
  597. LFD a7, 14 * SIZE(AO2)
  598. FMADD y02, atemp3, a10, y02
  599. LFD a10,13 * SIZE(AO3)
  600. FMADD xsum3, xtemp3, a11, xsum3
  601. NOP1
  602. FMADD y03, atemp3, a11, y03
  603. LFD a11, 14 * SIZE(AO3)
  604. FMADD xsum4, xtemp3, a15, xsum4
  605. LFD xtemp3, 14 * SIZE(XX)
  606. FMADD y04, atemp3, a12, y04
  607. NOP2
  608. FMADD xsum1, xtemp4, a4, xsum1
  609. LFD a4, 15 * SIZE(AO1)
  610. FMADD y01, atemp4, a13, y01
  611. LFD a13,12 * SIZE(AO4)
  612. FMADD xsum2, xtemp4, a8, xsum2
  613. LFD a8, 15 * SIZE(AO2)
  614. FMADD y02, atemp4, a14, y02
  615. LFD a14, 13 * SIZE(AO4)
  616. FMADD xsum3, xtemp4, a12, xsum3
  617. LFD a12, 15 * SIZE(AO3)
  618. FMADD y03, atemp4, a15, y03
  619. LFD a15, 14 * SIZE(AO4)
  620. FMADD xsum4, xtemp4, a16, xsum4
  621. LFD xtemp4, 15 * SIZE(XX)
  622. FMADD y04, atemp4, a16, y04
  623. LFD a16, 15 * SIZE(AO4)
  624. STFD y01, 8 * SIZE(YY)
  625. LFD y01, 12 * SIZE(YY)
  626. STFD y02, 9 * SIZE(YY)
  627. LFD y02, 13 * SIZE(YY)
  628. STFD y03, 10 * SIZE(YY)
  629. LFD y03, 14 * SIZE(YY)
  630. STFD y04, 11 * SIZE(YY)
  631. LFD y04, 15 * SIZE(YY)
  632. FMADD xsum1, xtemp1, a1, xsum1
  633. DCBT(AO4, PREA)
  634. FMADD y01, atemp1, a1, y01
  635. LFD a1, 16 * SIZE(AO1)
  636. FMADD xsum2, xtemp1, a5, xsum2
  637. NOP1
  638. FMADD y02, atemp1, a2, y02
  639. NOP2
  640. FMADD xsum3, xtemp1, a9, xsum3
  641. NOP1
  642. FMADD y03, atemp1, a3, y03
  643. NOP2
  644. FMADD xsum4, xtemp1, a13, xsum4
  645. LFD xtemp1, 16 * SIZE(XX)
  646. FMADD y04, atemp1, a4, y04
  647. addi YY, YY, 16 * SIZE
  648. FMADD xsum1, xtemp2, a2, xsum1
  649. LFD a2, 17 * SIZE(AO1)
  650. FMADD y01, atemp2, a5, y01
  651. LFD a5, 16 * SIZE(AO2)
  652. FMADD xsum2, xtemp2, a6, xsum2
  653. addi AO3, AO3, 16 * SIZE
  654. FMADD y02, atemp2, a6, y02
  655. LFD a6, 17 * SIZE(AO2)
  656. FMADD xsum3, xtemp2, a10, xsum3
  657. addi AO1, AO1, 16 * SIZE
  658. FMADD y03, atemp2, a7, y03
  659. addi AO2, AO2, 16 * SIZE
  660. FMADD xsum4, xtemp2, a14, xsum4
  661. LFD xtemp2, 17 * SIZE(XX)
  662. FMADD y04, atemp2, a8, y04
  663. addi AO4, AO4, 16 * SIZE
  664. FMADD xsum1, xtemp3, a3, xsum1
  665. LFD a3, 2 * SIZE(AO1)
  666. FMADD y01, atemp3, a9, y01
  667. LFD a9, 0 * SIZE(AO3)
  668. FMADD xsum2, xtemp3, a7, xsum2
  669. LFD a7, 2 * SIZE(AO2)
  670. FMADD y02, atemp3, a10, y02
  671. LFD a10, 1 * SIZE(AO3)
  672. FMADD xsum3, xtemp3, a11, xsum3
  673. NOP1
  674. FMADD y03, atemp3, a11, y03
  675. LFD a11, 2 * SIZE(AO3)
  676. FMADD xsum4, xtemp3, a15, xsum4
  677. LFD xtemp3, 18 * SIZE(XX)
  678. FMADD y04, atemp3, a12, y04
  679. addi XX, XX, 16 * SIZE
  680. FMADD xsum1, xtemp4, a4, xsum1
  681. LFD a4, 3 * SIZE(AO1)
  682. FMADD y01, atemp4, a13, y01
  683. LFD a13, 0 * SIZE(AO4)
  684. FMADD xsum2, xtemp4, a8, xsum2
  685. LFD a8, 3 * SIZE(AO2)
  686. FMADD y02, atemp4, a14, y02
  687. LFD a14, 1 * SIZE(AO4)
  688. FMADD xsum3, xtemp4, a12, xsum3
  689. LFD a12, 3 * SIZE(AO3)
  690. FMADD y03, atemp4, a15, y03
  691. LFD a15, 2 * SIZE(AO4)
  692. FMADD xsum4, xtemp4, a16, xsum4
  693. LFD xtemp4, 3 * SIZE(XX)
  694. FMADD y04, atemp4, a16, y04
  695. LFD a16, 3 * SIZE(AO4)
  696. STFD y01, -4 * SIZE(YY)
  697. LFD y01, 0 * SIZE(YY)
  698. STFD y02, -3 * SIZE(YY)
  699. LFD y02, 1 * SIZE(YY)
  700. STFD y03, -2 * SIZE(YY)
  701. LFD y03, 2 * SIZE(YY)
  702. STFD y04, -1 * SIZE(YY)
  703. LFD y04, 3 * SIZE(YY)
  704. bdnz LL(12)
  705. .align 4
  706. LL(14):
  707. sub TEMP, M, IS
  708. addi TEMP, TEMP, -4
  709. andi. r0, TEMP, 8
  710. ble LL(15)
  711. FMADD xsum1, xtemp1, a1, xsum1
  712. NOP1
  713. FMADD y01, atemp1, a1, y01
  714. LFD a1, 4 * SIZE(AO1)
  715. FMADD xsum2, xtemp1, a5, xsum2
  716. NOP1
  717. FMADD y02, atemp1, a2, y02
  718. NOP2
  719. FMADD xsum3, xtemp1, a9, xsum3
  720. NOP1
  721. FMADD y03, atemp1, a3, y03
  722. NOP2
  723. FMADD xsum4, xtemp1, a13, xsum4
  724. LFD xtemp1, 4 * SIZE(XX)
  725. FMADD y04, atemp1, a4, y04
  726. NOP2
  727. FMADD xsum1, xtemp2, a2, xsum1
  728. LFD a2, 5 * SIZE(AO1)
  729. FMADD y01, atemp2, a5, y01
  730. LFD a5, 4 * SIZE(AO2)
  731. FMADD xsum2, xtemp2, a6, xsum2
  732. NOP1
  733. FMADD y02, atemp2, a6, y02
  734. LFD a6, 5 * SIZE(AO2)
  735. FMADD xsum3, xtemp2, a10, xsum3
  736. NOP1
  737. FMADD y03, atemp2, a7, y03
  738. NOP2
  739. FMADD xsum4, xtemp2, a14, xsum4
  740. LFD xtemp2, 5 * SIZE(XX)
  741. FMADD y04, atemp2, a8, y04
  742. NOP2
  743. FMADD xsum1, xtemp3, a3, xsum1
  744. LFD a3, 6 * SIZE(AO1)
  745. FMADD y01, atemp3, a9, y01
  746. LFD a9, 4 * SIZE(AO3)
  747. FMADD xsum2, xtemp3, a7, xsum2
  748. LFD a7, 6 * SIZE(AO2)
  749. FMADD y02, atemp3, a10, y02
  750. LFD a10, 5 * SIZE(AO3)
  751. FMADD xsum3, xtemp3, a11, xsum3
  752. NOP1
  753. FMADD y03, atemp3, a11, y03
  754. LFD a11, 6 * SIZE(AO3)
  755. FMADD xsum4, xtemp3, a15, xsum4
  756. LFD xtemp3, 6 * SIZE(XX)
  757. FMADD y04, atemp3, a12, y04
  758. NOP2
  759. FMADD xsum1, xtemp4, a4, xsum1
  760. LFD a4, 7 * SIZE(AO1)
  761. FMADD y01, atemp4, a13, y01
  762. LFD a13, 4 * SIZE(AO4)
  763. FMADD xsum2, xtemp4, a8, xsum2
  764. LFD a8, 7 * SIZE(AO2)
  765. FMADD y02, atemp4, a14, y02
  766. LFD a14, 5 * SIZE(AO4)
  767. FMADD xsum3, xtemp4, a12, xsum3
  768. LFD a12, 7 * SIZE(AO3)
  769. FMADD y03, atemp4, a15, y03
  770. LFD a15, 6 * SIZE(AO4)
  771. FMADD xsum4, xtemp4, a16, xsum4
  772. LFD xtemp4, 7 * SIZE(XX)
  773. FMADD y04, atemp4, a16, y04
  774. LFD a16, 7 * SIZE(AO4)
  775. STFD y01, 0 * SIZE(YY)
  776. LFD y01, 4 * SIZE(YY)
  777. STFD y02, 1 * SIZE(YY)
  778. LFD y02, 5 * SIZE(YY)
  779. STFD y03, 2 * SIZE(YY)
  780. LFD y03, 6 * SIZE(YY)
  781. STFD y04, 3 * SIZE(YY)
  782. LFD y04, 7 * SIZE(YY)
  783. FMADD xsum1, xtemp1, a1, xsum1
  784. NOP1
  785. FMADD y01, atemp1, a1, y01
  786. LFD a1, 8 * SIZE(AO1)
  787. FMADD xsum2, xtemp1, a5, xsum2
  788. NOP1
  789. FMADD y02, atemp1, a2, y02
  790. NOP2
  791. FMADD xsum3, xtemp1, a9, xsum3
  792. NOP1
  793. FMADD y03, atemp1, a3, y03
  794. NOP2
  795. FMADD xsum4, xtemp1, a13, xsum4
  796. LFD xtemp1, 8 * SIZE(XX)
  797. FMADD y04, atemp1, a4, y04
  798. NOP2
  799. FMADD xsum1, xtemp2, a2, xsum1
  800. LFD a2, 9 * SIZE(AO1)
  801. FMADD y01, atemp2, a5, y01
  802. LFD a5, 8 * SIZE(AO2)
  803. FMADD xsum2, xtemp2, a6, xsum2
  804. NOP1
  805. FMADD y02, atemp2, a6, y02
  806. LFD a6, 9 * SIZE(AO2)
  807. FMADD xsum3, xtemp2, a10, xsum3
  808. NOP1
  809. FMADD y03, atemp2, a7, y03
  810. NOP2
  811. FMADD xsum4, xtemp2, a14, xsum4
  812. LFD xtemp2, 9 * SIZE(XX)
  813. FMADD y04, atemp2, a8, y04
  814. NOP2
  815. FMADD xsum1, xtemp3, a3, xsum1
  816. LFD a3, 10 * SIZE(AO1)
  817. FMADD y01, atemp3, a9, y01
  818. LFD a9, 8 * SIZE(AO3)
  819. FMADD xsum2, xtemp3, a7, xsum2
  820. LFD a7, 10 * SIZE(AO2)
  821. FMADD y02, atemp3, a10, y02
  822. LFD a10, 9 * SIZE(AO3)
  823. FMADD xsum3, xtemp3, a11, xsum3
  824. NOP1
  825. FMADD y03, atemp3, a11, y03
  826. LFD a11, 10 * SIZE(AO3)
  827. FMADD xsum4, xtemp3, a15, xsum4
  828. LFD xtemp3, 10 * SIZE(XX)
  829. FMADD y04, atemp3, a12, y04
  830. NOP2
  831. FMADD xsum1, xtemp4, a4, xsum1
  832. LFD a4, 11 * SIZE(AO1)
  833. FMADD y01, atemp4, a13, y01
  834. LFD a13, 8 * SIZE(AO4)
  835. FMADD xsum2, xtemp4, a8, xsum2
  836. LFD a8, 11 * SIZE(AO2)
  837. FMADD y02, atemp4, a14, y02
  838. LFD a14, 9 * SIZE(AO4)
  839. FMADD xsum3, xtemp4, a12, xsum3
  840. LFD a12, 11 * SIZE(AO3)
  841. FMADD y03, atemp4, a15, y03
  842. LFD a15, 10 * SIZE(AO4)
  843. FMADD xsum4, xtemp4, a16, xsum4
  844. LFD xtemp4, 11 * SIZE(XX)
  845. FMADD y04, atemp4, a16, y04
  846. LFD a16, 11 * SIZE(AO4)
  847. addi AO1, AO1, 8 * SIZE
  848. addi AO2, AO2, 8 * SIZE
  849. addi AO3, AO3, 8 * SIZE
  850. addi AO4, AO4, 8 * SIZE
  851. STFD y01, 4 * SIZE(YY)
  852. LFD y01, 8 * SIZE(YY)
  853. STFD y02, 5 * SIZE(YY)
  854. LFD y02, 9 * SIZE(YY)
  855. STFD y03, 6 * SIZE(YY)
  856. LFD y03, 10 * SIZE(YY)
  857. STFD y04, 7 * SIZE(YY)
  858. LFD y04, 11 * SIZE(YY)
  859. addi XX, XX, 8 * SIZE
  860. addi YY, YY, 8 * SIZE
  861. .align 4
  /*
   * LL(15): remainder step for a group of four rows.
   *
   * TEMP = M - IS - 4 is formed and its bit of value 4 is tested; when that
   * bit is clear the whole step is skipped (branch to LL(16)).
   *
   * FMADD/LFD/STFD/NOP1/NOP2 are macros defined outside this chunk. The
   * comments assume FMADD d, a, b, c computes d = a * b + c and that
   * NOP1/NOP2 are scheduling filler -- TODO confirm against the macro
   * definitions.
   *
   * Register pairing visible in this block:
   *   xsum1 += xtemp1*a1  + xtemp2*a2  + xtemp3*a3  + xtemp4*a4
   *   xsum2 += xtemp1*a5  + xtemp2*a6  + xtemp3*a7  + xtemp4*a8
   *   xsum3 += xtemp1*a9  + xtemp2*a10 + xtemp3*a11 + xtemp4*a12
   *   xsum4 += xtemp1*a13 + xtemp2*a14 + xtemp3*a15 + xtemp4*a16
   *   y01   += atemp1*a1  + atemp2*a5  + atemp3*a9  + atemp4*a13
   *   y02   += atemp1*a2  + atemp2*a6  + atemp3*a10 + atemp4*a14
   *   y03   += atemp1*a3  + atemp2*a7  + atemp3*a11 + atemp4*a15
   *   y04   += atemp1*a4  + atemp2*a8  + atemp3*a12 + atemp4*a16
   *
   * Each a-register and xtemp-register is reloaded from element offsets
   * 4..7 of AO1..AO4 / XX right after its last use, so the following step
   * finds the next rows already in registers. The values consumed here were
   * loaded before this block (outside the visible chunk).
   */
  862. LL(15):
  863. sub TEMP, M, IS
  864. addi TEMP, TEMP, -4
  865. andi. r0, TEMP, 4
  /* andi. yields 0 or 4, so ble is taken exactly when the bit is clear. */
  866. ble LL(16)
  /* Terms that use xtemp1 (for xsum1..4) and atemp1 (for y01..y04). */
  867. FMADD xsum1, xtemp1, a1, xsum1
  868. NOP1
  869. FMADD y01, atemp1, a1, y01
  870. LFD a1, 4 * SIZE(AO1)
  871. FMADD xsum2, xtemp1, a5, xsum2
  872. NOP1
  873. FMADD y02, atemp1, a2, y02
  874. NOP2
  875. FMADD xsum3, xtemp1, a9, xsum3
  876. NOP1
  877. FMADD y03, atemp1, a3, y03
  878. NOP2
  879. FMADD xsum4, xtemp1, a13, xsum4
  880. LFD xtemp1, 4 * SIZE(XX)
  881. FMADD y04, atemp1, a4, y04
  882. NOP2
  /* Terms that use xtemp2 and atemp2. */
  883. FMADD xsum1, xtemp2, a2, xsum1
  884. LFD a2, 5 * SIZE(AO1)
  885. FMADD y01, atemp2, a5, y01
  886. LFD a5, 4 * SIZE(AO2)
  887. FMADD xsum2, xtemp2, a6, xsum2
  888. NOP1
  889. FMADD y02, atemp2, a6, y02
  890. LFD a6, 5 * SIZE(AO2)
  891. FMADD xsum3, xtemp2, a10, xsum3
  892. NOP1
  893. FMADD y03, atemp2, a7, y03
  894. NOP2
  895. FMADD xsum4, xtemp2, a14, xsum4
  896. LFD xtemp2, 5 * SIZE(XX)
  897. FMADD y04, atemp2, a8, y04
  898. NOP2
  /* Terms that use xtemp3 and atemp3. */
  899. FMADD xsum1, xtemp3, a3, xsum1
  900. LFD a3, 6 * SIZE(AO1)
  901. FMADD y01, atemp3, a9, y01
  902. LFD a9, 4 * SIZE(AO3)
  903. FMADD xsum2, xtemp3, a7, xsum2
  904. LFD a7, 6 * SIZE(AO2)
  905. FMADD y02, atemp3, a10, y02
  906. LFD a10, 5 * SIZE(AO3)
  907. FMADD xsum3, xtemp3, a11, xsum3
  908. NOP1
  909. FMADD y03, atemp3, a11, y03
  910. LFD a11, 6 * SIZE(AO3)
  911. FMADD xsum4, xtemp3, a15, xsum4
  912. LFD xtemp3, 6 * SIZE(XX)
  913. FMADD y04, atemp3, a12, y04
  914. NOP2
  /* Terms that use xtemp4 and atemp4. */
  915. FMADD xsum1, xtemp4, a4, xsum1
  916. LFD a4, 7 * SIZE(AO1)
  917. FMADD y01, atemp4, a13, y01
  918. LFD a13, 4 * SIZE(AO4)
  919. FMADD xsum2, xtemp4, a8, xsum2
  920. LFD a8, 7 * SIZE(AO2)
  921. FMADD y02, atemp4, a14, y02
  922. LFD a14, 5 * SIZE(AO4)
  923. FMADD xsum3, xtemp4, a12, xsum3
  924. LFD a12, 7 * SIZE(AO3)
  925. FMADD y03, atemp4, a15, y03
  926. LFD a15, 6 * SIZE(AO4)
  927. FMADD xsum4, xtemp4, a16, xsum4
  928. LFD xtemp4, 7 * SIZE(XX)
  929. FMADD y04, atemp4, a16, y04
  930. LFD a16, 7 * SIZE(AO4)
  /* Advance the four column pointers by the four rows just consumed. */
  931. addi AO1, AO1, 4 * SIZE
  932. addi AO2, AO2, 4 * SIZE
  933. addi AO3, AO3, 4 * SIZE
  934. addi AO4, AO4, 4 * SIZE
  /* Write back the four updated y values and preload the next four. */
  935. STFD y01, 0 * SIZE(YY)
  936. LFD y01, 4 * SIZE(YY)
  937. STFD y02, 1 * SIZE(YY)
  938. LFD y02, 5 * SIZE(YY)
  939. STFD y03, 2 * SIZE(YY)
  940. LFD y03, 6 * SIZE(YY)
  941. STFD y04, 3 * SIZE(YY)
  942. LFD y04, 7 * SIZE(YY)
  943. addi XX, XX, 4 * SIZE
  944. addi YY, YY, 4 * SIZE
  945. .align 4
  /*
   * LL(16): remainder step for two rows, taken when (M & 2) is non-zero;
   * otherwise control goes straight to LL(17).
   *
   * Assuming FMADD d, a, b, c computes d = a * b + c (macro defined outside
   * this chunk), the block performs:
   *   xsum1 += xtemp1*a1  + xtemp2*a2
   *   xsum2 += xtemp1*a5  + xtemp2*a6
   *   xsum3 += xtemp1*a9  + xtemp2*a10
   *   xsum4 += xtemp1*a13 + xtemp2*a14
   *   y01   += atemp1*a1 + atemp2*a5 + atemp3*a9  + atemp4*a13
   *   y02   += atemp1*a2 + atemp2*a6 + atemp3*a10 + atemp4*a14
   *
   * a1, a5, a9, a13, xtemp1 and y01 are reloaded from element offset 2 so
   * that LL(17) can consume them. Only YY is advanced here; AO1..AO4 and XX
   * keep their values.
   */
  946. LL(16):
  947. andi. r0, M, 2
  948. ble LL(17)
  949. FMADD xsum1, xtemp1, a1, xsum1
  950. FMADD y01, atemp1, a1, y01
  951. LFD a1, 2 * SIZE(AO1)
  952. FMADD xsum2, xtemp1, a5, xsum2
  953. FMADD y02, atemp1, a2, y02
  954. FMADD xsum3, xtemp1, a9, xsum3
  955. FMADD y01, atemp2, a5, y01
  956. LFD a5, 2 * SIZE(AO2)
  957. FMADD xsum4, xtemp1, a13, xsum4
  958. LFD xtemp1, 2 * SIZE(XX)
  959. FMADD y02, atemp2, a6, y02
  960. FMADD xsum1, xtemp2, a2, xsum1
  961. FMADD y01, atemp3, a9, y01
  962. LFD a9, 2 * SIZE(AO3)
  963. FMADD xsum2, xtemp2, a6, xsum2
  964. FMADD y02, atemp3, a10, y02
  965. FMADD xsum3, xtemp2, a10, xsum3
  966. FMADD y01, atemp4, a13, y01
  967. LFD a13, 2 * SIZE(AO4)
  968. FMADD xsum4, xtemp2, a14, xsum4
  969. FMADD y02, atemp4, a14, y02
  /* Store the two finished y values; y01 is refilled with the element at
     offset 2 before YY moves forward by two elements. */
  970. STFD y01, 0 * SIZE(YY)
  971. LFD y01, 2 * SIZE(YY)
  972. STFD y02, 1 * SIZE(YY)
  973. addi YY, YY, 2 * SIZE
  974. .align 4
  /*
   * LL(17): remainder step for a single row, taken when (M & 1) is
   * non-zero; otherwise control goes to LL(18).
   *
   * Assuming FMADD d, a, b, c computes d = a * b + c (macro defined outside
   * this chunk):
   *   xsum1 += xtemp1*a1,  xsum2 += xtemp1*a5,
   *   xsum3 += xtemp1*a9,  xsum4 += xtemp1*a13
   *   y01   += atemp1*a1 + atemp2*a5 + atemp3*a9 + atemp4*a13
   * The result y01 is stored at YY[0]; no pointer is advanced.
   */
  975. LL(17):
  976. andi. r0, M, 1
  977. ble LL(18)
  978. FMADD xsum1, xtemp1, a1, xsum1
  979. FMADD y01, atemp1, a1, y01
  980. FMADD xsum2, xtemp1, a5, xsum2
  981. FMADD y01, atemp2, a5, y01
  982. FMADD xsum3, xtemp1, a9, xsum3
  983. FMADD y01, atemp3, a9, y01
  984. FMADD xsum4, xtemp1, a13, xsum4
  985. FMADD y01, atemp4, a13, y01
  986. STFD y01, 0 * SIZE(YY)
  987. .align 4
  /*
   * LL(18): fold the four accumulated sums into the y buffer and decide
   * whether to run another four-column pass.
   *
   * YY is reset to NEW_Y + (IS << BASE_SHIFT). The four values at YY[0..3]
   * are loaded, xsum1..xsum4 are each multiplied by the value loaded from
   * ALPHA, added to them, and the results are stored back to YY[0..3].
   */
  988. LL(18):
  989. slwi TEMP, IS, BASE_SHIFT
  990. add YY, NEW_Y, TEMP
  991. LFD y01, 0 * SIZE(YY)
  992. LFD y02, 1 * SIZE(YY)
  993. LFD y03, 2 * SIZE(YY)
  994. LFD y04, 3 * SIZE(YY)
  /* xtemp1 is reused as a scratch register for the ALPHA value. */
  995. LFD xtemp1, ALPHA
  996. FMUL xsum1, xtemp1, xsum1
  997. FMUL xsum2, xtemp1, xsum2
  998. FMUL xsum3, xtemp1, xsum3
  999. FMUL xsum4, xtemp1, xsum4
  1000. FADD y01, y01, xsum1
  1001. FADD y02, y02, xsum2
  1002. FADD y03, y03, xsum3
  1003. FADD y04, y04, xsum4
  1004. STFD y01, 0 * SIZE(YY)
  1005. STFD y02, 1 * SIZE(YY)
  1006. STFD y03, 2 * SIZE(YY)
  1007. STFD y04, 3 * SIZE(YY)
  /* TEMP = old IS + 8 and IS advances by 4, so the branch back to LL(11)
     (defined outside this chunk) is taken while new IS + 4 <= N. */
  1008. addi TEMP, IS, 8
  1009. addi IS, IS, 4
  1010. cmpw cr0, TEMP, N
  1011. ble LL(11)
  1012. .align 4
  /*
   * LL(20): two-column remainder, taken when (N & 2) is non-zero; otherwise
   * control goes to LL(30).
   *
   * Pointer setup: AO1 = A, AO2 = A + LDA, and A is moved to
   * AO2 + LDA + 2 * SIZE. XX and YY are X and NEW_Y offset by
   * IS << BASE_SHIFT.
   *
   * The first two rows are handled directly from AO1[0], AO1[1] and AO2[1];
   * AO2[0] is never read and AO1[1] is used in both sums:
   *   xsum1 = XX[0]*AO1[0] + XX[1]*AO1[1]
   *   xsum2 = XX[0]*AO1[1] + XX[1]*AO2[1]
   * (assuming FMADD d, a, b, c computes d = a * b + c; macro defined
   * outside this chunk). atemp1/atemp2 are then multiplied by the value
   * loaded from ALPHA.
   *
   * One further row is processed when (M & 1) is non-zero.
   */
  1013. LL(20):
  1014. andi. TEMP, N, 2
  1015. ble LL(30)
  1016. mr AO1, A
  1017. add AO2, A, LDA
  1018. add A, AO2, LDA
  1019. addi A, A, 2 * SIZE
  1020. slwi TEMP, IS, BASE_SHIFT
  1021. add XX, X, TEMP
  1022. add YY, NEW_Y, TEMP
  1023. LFD atemp1, 0 * SIZE(XX)
  1024. LFD atemp2, 1 * SIZE(XX)
  1025. LFD a1, 0 * SIZE(AO1)
  1026. LFD a2, 1 * SIZE(AO1)
  1027. LFD a6, 1 * SIZE(AO2)
  /* a5 temporarily holds the ALPHA value; it is overwritten with AO2[2]
     further down. */
  1028. LFD a5, ALPHA
  1029. FMUL xsum1, atemp1, a1
  1030. FMUL xsum2, atemp1, a2
  1031. FMADD xsum1, atemp2, a2, xsum1
  1032. FMADD xsum2, atemp2, a6, xsum2
  1033. FMUL atemp1, a5, atemp1
  1034. FMUL atemp2, a5, atemp2
  /* Operands for the optional third row. These four loads execute even
     when the (M & 1) test below skips the row. */
  1035. LFD xtemp1, 2 * SIZE(XX)
  1036. LFD y01, 2 * SIZE(YY)
  1037. LFD a1, 2 * SIZE(AO1)
  1038. LFD a5, 2 * SIZE(AO2)
  1039. andi. r0, M, 1
  1040. ble LL(28)
  /* Third row: extend both sums and update YY[2]. */
  1041. FMADD xsum1, xtemp1, a1, xsum1
  1042. FMADD y01, atemp1, a1, y01
  1043. FMADD xsum2, xtemp1, a5, xsum2
  1044. FMADD y01, atemp2, a5, y01
  1045. STFD y01, 2 * SIZE(YY)
  1046. .align 4
  /*
   * LL(28): finish the two-column remainder.
   *
   * YY is recomputed as NEW_Y + (IS << BASE_SHIFT). xsum1 and xsum2 are
   * multiplied by the value loaded from ALPHA and added into YY[0] and
   * YY[1]. IS is then advanced by 2.
   */
  1047. LL(28):
  1048. slwi TEMP, IS, BASE_SHIFT
  1049. add YY, NEW_Y, TEMP
  1050. LFD y01, 0 * SIZE(YY)
  1051. LFD y02, 1 * SIZE(YY)
  1052. LFD xtemp1, ALPHA
  1053. FMUL xsum1, xtemp1, xsum1
  1054. FMUL xsum2, xtemp1, xsum2
  1055. FADD y01, y01, xsum1
  1056. FADD y02, y02, xsum2
  1057. STFD y01, 0 * SIZE(YY)
  1058. STFD y02, 1 * SIZE(YY)
  1059. addi IS, IS, 2
  1060. .align 4
  /*
   * LL(30): single-column remainder, taken when (N & 1) is non-zero;
   * otherwise control goes to LL(990).
   *
   * With AO1 = A and XX/YY equal to X/NEW_Y offset by IS << BASE_SHIFT,
   * the block performs YY[0] += ALPHA * (XX[0] * AO1[0]), where ALPHA is
   * the value loaded by LFD from the ALPHA operand.
   */
  1061. LL(30):
  1062. andi. TEMP, N, 1
  1063. ble LL(990)
  1064. mr AO1, A
  1065. slwi TEMP, IS, BASE_SHIFT
  1066. add XX, X, TEMP
  1067. add YY, NEW_Y, TEMP
  1068. LFD atemp1, 0 * SIZE(XX)
  1069. LFD a1, 0 * SIZE(AO1)
  1070. LFD xtemp1, ALPHA
  1071. LFD y01, 0 * SIZE(YY)
  1072. FMUL xsum1, atemp1, a1
  1073. FMUL xsum1, xtemp1, xsum1
  1074. FADD y01, y01, xsum1
  1075. STFD y01, 0 * SIZE(YY)
  1076. .align 4
  /*
   * LL(990): entry to the strided write-back.
   *
   * When INCY equals SIZE the write-back is skipped and control jumps to
   * the exit path at LL(999). Otherwise YY is set to Y (YY is the store
   * cursor, Y the load cursor in the code that follows), and CTR is loaded
   * with M >> 3, the number of 8-element iterations of LL(991). A
   * non-positive count skips the loop and goes to LL(995).
   */
  1077. LL(990):
  1078. cmpwi cr0, INCY, SIZE
  1079. beq LL(999)
  1080. mr YY, Y
  1081. srawi. r0, M, 3
  1082. mtspr CTR, r0
  1083. ble LL(995)
  1084. .align 4
  /*
   * LL(991): write-back loop, eight elements per iteration, repeated CTR
   * times via bdnz.
   *
   * Each iteration reads eight values through Y (stepping by INCY after
   * every load), reads eight consecutive values from NEW_Y, adds them
   * pairwise, and stores the sums through YY (stepping by INCY after every
   * store). NEW_Y advances by 8 * SIZE per iteration.
   *
   * f14 and f15 are used as scratch here; the exit path at LL(999) reloads
   * f14..f31 from the stack.
   */
  1085. LL(991):
  /* Eight strided loads through Y. */
  1086. LFD f0, 0 * SIZE(Y)
  1087. add Y, Y, INCY
  1088. LFD f1, 0 * SIZE(Y)
  1089. add Y, Y, INCY
  1090. LFD f2, 0 * SIZE(Y)
  1091. add Y, Y, INCY
  1092. LFD f3, 0 * SIZE(Y)
  1093. add Y, Y, INCY
  1094. LFD f4, 0 * SIZE(Y)
  1095. add Y, Y, INCY
  1096. LFD f5, 0 * SIZE(Y)
  1097. add Y, Y, INCY
  1098. LFD f6, 0 * SIZE(Y)
  1099. add Y, Y, INCY
  1100. LFD f7, 0 * SIZE(Y)
  1101. add Y, Y, INCY
  /* Eight contiguous loads from NEW_Y. */
  1102. LFD f8, 0 * SIZE(NEW_Y)
  1103. LFD f9, 1 * SIZE(NEW_Y)
  1104. LFD f10, 2 * SIZE(NEW_Y)
  1105. LFD f11, 3 * SIZE(NEW_Y)
  1106. LFD f12, 4 * SIZE(NEW_Y)
  1107. LFD f13, 5 * SIZE(NEW_Y)
  1108. LFD f14, 6 * SIZE(NEW_Y)
  1109. LFD f15, 7 * SIZE(NEW_Y)
  1110. addi NEW_Y, NEW_Y, 8 * SIZE
  /* Pairwise sums. */
  1111. FADD f8, f8, f0
  1112. FADD f9, f9, f1
  1113. FADD f10, f10, f2
  1114. FADD f11, f11, f3
  1115. FADD f12, f12, f4
  1116. FADD f13, f13, f5
  1117. FADD f14, f14, f6
  1118. FADD f15, f15, f7
  /* Eight strided stores through YY. */
  1119. STFD f8, 0 * SIZE(YY)
  1120. add YY, YY, INCY
  1121. STFD f9, 0 * SIZE(YY)
  1122. add YY, YY, INCY
  1123. STFD f10, 0 * SIZE(YY)
  1124. add YY, YY, INCY
  1125. STFD f11, 0 * SIZE(YY)
  1126. add YY, YY, INCY
  1127. STFD f12, 0 * SIZE(YY)
  1128. add YY, YY, INCY
  1129. STFD f13, 0 * SIZE(YY)
  1130. add YY, YY, INCY
  1131. STFD f14, 0 * SIZE(YY)
  1132. add YY, YY, INCY
  1133. STFD f15, 0 * SIZE(YY)
  1134. add YY, YY, INCY
  1135. bdnz LL(991)
  1136. .align 4
  /*
   * LL(995): write-back tail for four elements, taken when (M & 4) is
   * non-zero; otherwise control goes to LL(996). J only receives the andi.
   * result and is not read in this block.
   *
   * Four values are read through Y (stride INCY), four consecutive values
   * are read from NEW_Y, the pairs are added, and the sums are stored
   * through YY (stride INCY). NEW_Y advances by 4 * SIZE.
   */
  1137. LL(995):
  1138. andi. J, M, 4
  1139. ble LL(996)
  1140. LFD f0, 0 * SIZE(Y)
  1141. add Y, Y, INCY
  1142. LFD f1, 0 * SIZE(Y)
  1143. add Y, Y, INCY
  1144. LFD f2, 0 * SIZE(Y)
  1145. add Y, Y, INCY
  1146. LFD f3, 0 * SIZE(Y)
  1147. add Y, Y, INCY
  1148. LFD f8, 0 * SIZE(NEW_Y)
  1149. LFD f9, 1 * SIZE(NEW_Y)
  1150. LFD f10, 2 * SIZE(NEW_Y)
  1151. LFD f11, 3 * SIZE(NEW_Y)
  1152. addi NEW_Y, NEW_Y, 4 * SIZE
  1153. FADD f8, f8, f0
  1154. FADD f9, f9, f1
  1155. FADD f10, f10, f2
  1156. FADD f11, f11, f3
  1157. STFD f8, 0 * SIZE(YY)
  1158. add YY, YY, INCY
  1159. STFD f9, 0 * SIZE(YY)
  1160. add YY, YY, INCY
  1161. STFD f10, 0 * SIZE(YY)
  1162. add YY, YY, INCY
  1163. STFD f11, 0 * SIZE(YY)
  1164. add YY, YY, INCY
  1165. .align 4
  /*
   * LL(996): write-back tail for two elements, taken when (M & 2) is
   * non-zero; otherwise control goes to LL(997).
   *
   * Two values read through Y (stride INCY) are added to two consecutive
   * values from NEW_Y and stored through YY (stride INCY). NEW_Y advances
   * by 2 * SIZE.
   */
  1166. LL(996):
  1167. andi. J, M, 2
  1168. ble LL(997)
  1169. LFD f0, 0 * SIZE(Y)
  1170. add Y, Y, INCY
  1171. LFD f1, 0 * SIZE(Y)
  1172. add Y, Y, INCY
  1173. LFD f8, 0 * SIZE(NEW_Y)
  1174. LFD f9, 1 * SIZE(NEW_Y)
  1175. addi NEW_Y, NEW_Y, 2 * SIZE
  1176. FADD f8, f8, f0
  1177. FADD f9, f9, f1
  1178. STFD f8, 0 * SIZE(YY)
  1179. add YY, YY, INCY
  1180. STFD f9, 0 * SIZE(YY)
  1181. add YY, YY, INCY
  1182. .align 4
  /*
   * LL(997): write-back tail for the final element, taken when (M & 1) is
   * non-zero; otherwise control goes to LL(999).
   *
   * The value at Y[0] is added to the value at NEW_Y[0] and stored at
   * YY[0]. No cursor is advanced afterwards.
   */
  1183. LL(997):
  1184. andi. J, M, 1
  1185. ble LL(999)
  1186. LFD f0, 0 * SIZE(Y)
  1187. LFD f8, 0 * SIZE(NEW_Y)
  1188. FADD f8, f8, f0
  1189. STFD f8, 0 * SIZE(YY)
  1190. .align 4
  /*
   * LL(999): common exit path.
   *
   * Sets r3 to 0, reloads f14..f31 from SP+0 .. SP+136 and r14..r27 from
   * SP+144 onward, releases the frame by adding STACKSIZE to SP, and
   * returns with blr. The matching saves are expected in the prologue,
   * which lies outside this chunk.
   */
  1191. LL(999):
  1192. li r3, 0
  /* Floating-point registers, 8 bytes per slot. */
  1193. lfd f14, 0(SP)
  1194. lfd f15, 8(SP)
  1195. lfd f16, 16(SP)
  1196. lfd f17, 24(SP)
  1197. lfd f18, 32(SP)
  1198. lfd f19, 40(SP)
  1199. lfd f20, 48(SP)
  1200. lfd f21, 56(SP)
  1201. lfd f22, 64(SP)
  1202. lfd f23, 72(SP)
  1203. lfd f24, 80(SP)
  1204. lfd f25, 88(SP)
  1205. lfd f26, 96(SP)
  1206. lfd f27, 104(SP)
  1207. lfd f28, 112(SP)
  1208. lfd f29, 120(SP)
  1209. lfd f30, 128(SP)
  1210. lfd f31, 136(SP)
  /* General-purpose registers: 8-byte slots (ld) when __64BIT__ is
     defined, 4-byte slots (lwz) otherwise. */
  1211. #ifdef __64BIT__
  1212. ld r14, 144(SP)
  1213. ld r15, 152(SP)
  1214. ld r16, 160(SP)
  1215. ld r17, 168(SP)
  1216. ld r18, 176(SP)
  1217. ld r19, 184(SP)
  1218. ld r20, 192(SP)
  1219. ld r21, 200(SP)
  1220. ld r22, 208(SP)
  1221. ld r23, 216(SP)
  1222. ld r24, 224(SP)
  1223. ld r25, 232(SP)
  1224. ld r26, 240(SP)
  1225. ld r27, 248(SP)
  1226. #else
  1227. lwz r14, 144(SP)
  1228. lwz r15, 148(SP)
  1229. lwz r16, 152(SP)
  1230. lwz r17, 156(SP)
  1231. lwz r18, 160(SP)
  1232. lwz r19, 164(SP)
  1233. lwz r20, 168(SP)
  1234. lwz r21, 172(SP)
  1235. lwz r22, 176(SP)
  1236. lwz r23, 180(SP)
  1237. lwz r24, 184(SP)
  1238. lwz r25, 188(SP)
  1239. lwz r26, 192(SP)
  1240. lwz r27, 196(SP)
  1241. #endif
  1242. addi SP, SP, STACKSIZE
  1243. blr
  /* EPILOGUE is a macro defined outside this chunk. */
  1244. EPILOGUE
  1245. #endif