You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemv_t.S 31 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time constants: chunk length and stack frame size. */
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects its assembler-side definitions - confirm */
  39. #include "common.h"
  40. #define P 2048 /* upper bound on MIN_N, the chunk length taken from M per LL(ISLoop) pass */
  41. #ifndef __64BIT__
  42. #define STACKSIZE 224 /* bytes subtracted from SP at entry (32-bit build) */
  43. #else
  44. #define STACKSIZE 304 /* bytes subtracted from SP at entry (64-bit build) */
  45. #endif
  /* Register aliases for the routine's arguments. The mapping depends on the */
  /* OS and word size. Arguments that are not delivered in a register are */
  /* loaded from the caller's frame (FRAMESLOT) by the prologue below. */
  46. #if defined(linux) || defined(__FreeBSD__)
  47. #ifndef __64BIT__
  48. #define M r3
  49. #define N r4
  50. #define A r6
  51. #define LDA r7
  52. #define X r8
  53. #define INCX r9
  54. #define Y r10
  55. #define INCY r5 /* loaded from FRAMESLOT(0) in the prologue */
  56. #else
  57. #define M r3
  58. #define N r4
  59. #define A r8
  60. #define LDA r9
  61. #define X r10
  62. #define INCX r5 /* INCX, Y and INCY are loaded from FRAMESLOT(0..2) in the prologue */
  63. #define Y r6
  64. #define INCY r7
  65. #endif
  66. #endif
  67. #if defined(_AIX) || defined(__APPLE__)
  68. #if !defined(__64BIT__) && defined(DOUBLE)
  69. #define M r3
  70. #define N r4
  71. #define A r10
  72. #define LDA r5 /* LDA, X, INCX, Y and INCY are loaded from FRAMESLOT(0..4) in the prologue */
  73. #define X r6
  74. #define INCX r7
  75. #define Y r8
  76. #define INCY r9
  77. #else
  78. #define M r3
  79. #define N r4
  80. #define A r8
  81. #define LDA r9
  82. #define X r10
  83. #define INCX r5 /* INCX, Y and INCY are loaded from FRAMESLOT(0..2) in the prologue */
  84. #define Y r6
  85. #define INCY r7
  86. #endif
  87. #endif
  /* Register aliases for the routine's working state. r14-r25 are saved to */
  /* the stack frame by the prologue before they are used. */
  88. #define BUFFER r11 /* loaded from the caller's frame; x is packed here when the scaled INCX is not 2 * SIZE */
  89. #define XP r12 /* x data used by the kernels (X + offset, or BUFFER); briefly holds P in the prologue */
  90. #define MIN_N r14 /* length of the current chunk: min(M - IS, P) */
  91. #define J r15 /* loop counter: N >> 2 for LL(MainHead), N & 3 for LL(RemainHead) */
  92. #define CO r16 /* y read pointer; also the store pointer while packing x into BUFFER */
  93. #define BO r17 /* x read pointer inside the kernels; y store pointer in LL(MainFinish)/LL(FinishN1) */
  94. #define PLDA_M r18 /* (LDA * N - P) << ZBASE_SHIFT, computed in the prologue */
  95. #define AO1 r19 /* AO1..AO4 = A, A + LDA, A + 2 * LDA, A + 3 * LDA in LL(MainHead) */
  96. #define AO2 r20
  97. #define AO3 r21
  98. #define AO4 r22
  99. #define IS r23 /* offset of the current chunk; initialised to 0 */
  100. #define PREA r24 /* PREFETCHSIZE_A * SIZE, index operand of DCBT on AO1..AO4 */
  101. #define PREC r25 /* PREFETCHSIZE_C * SIZE, index operand of dcbtst on CO */
  102. #define Y1 r23 /* dummy; should be same as gemv_n.S (same register as IS) */
  103. #define Y2 r24 /* dummy; should be same as gemv_n.S (same register as PREA) */
  /* Per-CPU prefetch distances. Both values are multiplied by SIZE and loaded */
  /* into PREA / PREC, which index the DCBT (A columns) and dcbtst (y) hints. */
  /* NOTE(review): there is no fallback here; if none of the CPU macros below */
  /* is defined, PREFETCHSIZE_A / PREFETCHSIZE_C stay undefined where PREA and */
  /* PREC are loaded - confirm every supported target is covered. */
  104. #if defined(PPCG4)
  105. #define PREFETCHSIZE_A 34
  106. #define PREFETCHSIZE_C 16
  107. #endif
  108. #if defined(PPC440) || defined(PPC440FP2)
  109. #define PREFETCHSIZE_A 34
  110. #define PREFETCHSIZE_C 16
  111. #endif
  112. #ifdef PPC970
  113. #define PREFETCHSIZE_A 56
  114. #define PREFETCHSIZE_C 16
  115. #endif
  116. #ifdef CELL
  117. #define PREFETCHSIZE_A 56
  118. #define PREFETCHSIZE_C 16
  119. #endif
  120. #ifdef POWER3
  121. #define PREFETCHSIZE_A 34
  122. #define PREFETCHSIZE_C 16
  123. #endif
  124. #ifdef POWER4
  125. #define PREFETCHSIZE_A 34
  126. #define PREFETCHSIZE_C 16
  127. #endif
  128. #ifdef POWER5
  129. #define PREFETCHSIZE_A 40
  130. #define PREFETCHSIZE_C 8
  131. #endif
  132. #ifdef POWER6
  133. #define PREFETCHSIZE_A 24
  134. #define PREFETCHSIZE_C 8
  135. #endif
  136. #ifdef POWER8
  137. #define PREFETCHSIZE_A 24
  138. #define PREFETCHSIZE_C 8
  139. #endif
  /* FMADDR / FMSUBR are the two sign-dependent multiply-add forms used when */
  /* alpha is applied to the accumulated sums in LL(MainFinish) and */
  /* LL(FinishN1). Their roles are swapped only when both CONJ and XCONJ are */
  /* defined. */
  140. #if !(defined(CONJ) && defined(XCONJ))
  141. #define FMADDR FMADD
  142. #define FMSUBR FNMSUB
  143. #else
  144. #define FMADDR FNMSUB
  145. #define FMSUBR FMADD
  146. #endif
  147. #ifndef NEEDPARAM
  /* Scratch slots inside the stack frame, addressed off SP after the frame is */
  /* allocated. They sit above the saved f14-f31 and r14-r25 areas. */
  148. #ifndef __64BIT__
  149. #define FZERO 200(SP) /* 8 bytes of zero (written from r0), reloaded with lfd to clear the accumulators */
  150. #define ALPHA_R 208(SP) /* f1 is stored here on entry and reloaded into f30 in LL(MainFinish) */
  151. #define ALPHA_I 216(SP) /* f2 is stored here on entry and reloaded into f31 in LL(MainFinish) */
  152. #else
  153. #define FZERO 256(SP) /* 8 bytes of zero (written from r0), reloaded with lfd to clear the accumulators */
  154. #define ALPHA_R 264(SP) /* f1 is stored here on entry and reloaded into f30 in LL(MainFinish) */
  155. #define ALPHA_I 272(SP) /* f2 is stored here on entry and reloaded into f31 in LL(MainFinish) */
  156. #endif
  157. PROLOGUE
  158. PROFCODE
  159. addi SP, SP, -STACKSIZE
  160. li r0, 0
  161. stfd f14, 0(SP)
  162. stfd f15, 8(SP)
  163. stfd f16, 16(SP)
  164. stfd f17, 24(SP)
  165. stfd f18, 32(SP)
  166. stfd f19, 40(SP)
  167. stfd f20, 48(SP)
  168. stfd f21, 56(SP)
  169. stfd f22, 64(SP)
  170. stfd f23, 72(SP)
  171. stfd f24, 80(SP)
  172. stfd f25, 88(SP)
  173. stfd f26, 96(SP)
  174. stfd f27, 104(SP)
  175. stfd f28, 112(SP)
  176. stfd f29, 120(SP)
  177. stfd f30, 128(SP)
  178. stfd f31, 136(SP)
  179. #ifdef __64BIT__
  180. std r14, 144(SP)
  181. std r15, 152(SP)
  182. std r16, 160(SP)
  183. std r17, 168(SP)
  184. std r18, 176(SP)
  185. std r19, 184(SP)
  186. std r20, 192(SP)
  187. std r21, 200(SP)
  188. std r22, 208(SP)
  189. std r23, 216(SP)
  190. std r24, 224(SP)
  191. std r25, 232(SP)
  192. std r0, FZERO
  193. #else
  194. stw r14, 144(SP)
  195. stw r15, 148(SP)
  196. stw r16, 152(SP)
  197. stw r17, 156(SP)
  198. stw r18, 160(SP)
  199. stw r19, 164(SP)
  200. stw r20, 168(SP)
  201. stw r21, 172(SP)
  202. stw r22, 176(SP)
  203. stw r23, 180(SP)
  204. stw r24, 184(SP)
  205. stw r25, 188(SP)
  206. stw r0, FZERO
  207. stw r0, 4 + FZERO
  208. #endif
  209. #if defined(linux) || defined(__FreeBSD__)
  210. #ifndef __64BIT__
  211. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  212. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  213. #else
  214. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  215. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  216. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  217. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  218. #endif
  219. #endif
  220. #if defined(_AIX) || defined(__APPLE__)
  221. #ifndef __64BIT__
  222. #ifdef DOUBLE
  223. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  224. lwz X, FRAMESLOT(1) + STACKSIZE(SP)
  225. lwz INCX, FRAMESLOT(2) + STACKSIZE(SP)
  226. lwz Y, FRAMESLOT(3) + STACKSIZE(SP)
  227. lwz INCY, FRAMESLOT(4) + STACKSIZE(SP)
  228. lwz BUFFER, FRAMESLOT(5) + STACKSIZE(SP)
  229. #else
  230. lwz INCX, FRAMESLOT(0) + STACKSIZE(SP)
  231. lwz Y, FRAMESLOT(1) + STACKSIZE(SP)
  232. lwz INCY, FRAMESLOT(2) + STACKSIZE(SP)
  233. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  234. #endif
  235. #else
  236. ld INCX, FRAMESLOT(0) + STACKSIZE(SP)
  237. ld Y, FRAMESLOT(1) + STACKSIZE(SP)
  238. ld INCY, FRAMESLOT(2) + STACKSIZE(SP)
  239. ld BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  240. #endif
  241. #endif
  242. stfd f1, ALPHA_R
  243. stfd f2, ALPHA_I
  244. mullw PLDA_M, LDA, N
  245. li XP, P
  246. subf PLDA_M, XP, PLDA_M
  247. slwi PLDA_M, PLDA_M, ZBASE_SHIFT
  248. slwi LDA, LDA, ZBASE_SHIFT
  249. slwi INCX, INCX, ZBASE_SHIFT
  250. slwi INCY, INCY, ZBASE_SHIFT
  251. li IS, 0
  252. li PREA, PREFETCHSIZE_A * SIZE
  253. li PREC, PREFETCHSIZE_C * SIZE
  254. cmpwi cr0, M, 0
  255. ble LL(End)
  256. cmpwi cr0, N, 0
  257. ble LL(End)
  258. .align 4
  259. LL(ISLoop):
  260. subf MIN_N, IS, M
  261. slwi r0, IS, ZBASE_SHIFT
  262. cmpi cr0, 0, MIN_N, P
  263. ble+ LL(min_nP)
  264. li MIN_N, P
  265. LL(min_nP):
  266. add XP, X, r0
  267. cmpwi cr0, INCX, 2 * SIZE
  268. beq LL(Main)
  269. mr XP, BUFFER
  270. addi CO, BUFFER, -SIZE
  271. srawi. r0, MIN_N, 2
  272. mtspr CTR, r0
  273. ble LL(CopyRemain)
  274. .align 4
  275. LL(CopyKernel):
  276. LFD f0, 0 * SIZE(X)
  277. LFD f1, 1 * SIZE(X)
  278. add X, X, INCX
  279. LFD f2, 0 * SIZE(X)
  280. LFD f3, 1 * SIZE(X)
  281. add X, X, INCX
  282. LFD f4, 0 * SIZE(X)
  283. LFD f5, 1 * SIZE(X)
  284. add X, X, INCX
  285. LFD f6, 0 * SIZE(X)
  286. LFD f7, 1 * SIZE(X)
  287. add X, X, INCX
  288. STFD f0, 1 * SIZE(CO)
  289. STFD f1, 2 * SIZE(CO)
  290. STFD f2, 3 * SIZE(CO)
  291. STFD f3, 4 * SIZE(CO)
  292. STFD f4, 5 * SIZE(CO)
  293. STFD f5, 6 * SIZE(CO)
  294. STFD f6, 7 * SIZE(CO)
  295. STFDU f7, 8 * SIZE(CO)
  296. bdnz LL(CopyKernel)
  297. .align 4
  298. LL(CopyRemain):
  299. andi. r0, MIN_N, 3
  300. mtspr CTR, r0
  301. ble LL(Main)
  302. .align 4
  303. LL(CopySub):
  304. LFD f0, 0 * SIZE(X)
  305. LFD f1, 1 * SIZE(X)
  306. add X, X, INCX
  307. STFD f0, 1 * SIZE(CO)
  308. STFDU f1, 2 * SIZE(CO)
  309. bdnz LL(CopySub)
  310. .align 4
  311. LL(Main):
  312. mr CO, Y
  313. addi XP, XP, -SIZE
  314. srawi. J, N, 2
  315. ble LL(Remain)
  316. .align 4
  317. LL(MainHead):
  318. mr AO1, A
  319. add AO2, A, LDA
  320. add AO3, AO2, LDA
  321. add AO4, AO3, LDA
  322. add A, AO4, LDA
  323. mr BO, XP
  324. lfd f0, FZERO
  325. fmr f1, f0
  326. fmr f2, f0
  327. fmr f3, f0
  328. fmr f4, f0
  329. fmr f5, f0
  330. fmr f6, f0
  331. fmr f7, f0
  332. fmr f8, f0
  333. fmr f9, f0
  334. fmr f10, f0
  335. fmr f11, f0
  336. fmr f12, f0
  337. fmr f13, f0
  338. fmr f14, f0
  339. fmr f15, f0
  340. dcbtst PREC, CO
  341. srawi. r0, MIN_N, 3
  342. mtspr CTR, r0
  343. ble LL(MainN3)
  344. LFD f16, 0 * SIZE(AO1)
  345. LFD f17, 1 * SIZE(AO1)
  346. LFD f18, 0 * SIZE(AO2)
  347. LFD f19, 1 * SIZE(AO2)
  348. LFD f20, 0 * SIZE(AO3)
  349. LFD f21, 1 * SIZE(AO3)
  350. LFD f22, 0 * SIZE(AO4)
  351. LFD f23, 1 * SIZE(AO4)
  352. LFD f24, 1 * SIZE(BO)
  353. LFD f25, 2 * SIZE(BO)
  354. LFD f26, 3 * SIZE(BO)
  355. LFD f27, 4 * SIZE(BO)
  356. LFD f28, 5 * SIZE(BO)
  357. LFD f29, 6 * SIZE(BO)
  358. LFD f30, 7 * SIZE(BO)
  359. LFD f31, 8 * SIZE(BO)
  360. bdz LL(MainKernelSkip)
  361. .align 5
  362. LL(MainKernel):
  363. FMADD f0, f16, f24, f0
  364. FMADD f1, f16, f25, f1
  365. FMADD f2, f17, f24, f2
  366. FMADD f3, f17, f25, f3
  367. FMADD f4, f18, f24, f4
  368. FMADD f5, f18, f25, f5
  369. FMADD f6, f19, f24, f6
  370. FMADD f7, f19, f25, f7
  371. LFD f16, 2 * SIZE(AO1)
  372. LFD f17, 3 * SIZE(AO1)
  373. LFD f18, 2 * SIZE(AO2)
  374. LFD f19, 3 * SIZE(AO2)
  375. FMADD f8, f20, f24, f8
  376. FMADD f9, f20, f25, f9
  377. FMADD f10, f21, f24, f10
  378. FMADD f11, f21, f25, f11
  379. FMADD f12, f22, f24, f12
  380. FMADD f13, f22, f25, f13
  381. FMADD f14, f23, f24, f14
  382. FMADD f15, f23, f25, f15
  383. LFD f20, 2 * SIZE(AO3)
  384. LFD f21, 3 * SIZE(AO3)
  385. LFD f22, 2 * SIZE(AO4)
  386. LFD f23, 3 * SIZE(AO4)
  387. FMADD f0, f16, f26, f0
  388. FMADD f1, f16, f27, f1
  389. FMADD f2, f17, f26, f2
  390. FMADD f3, f17, f27, f3
  391. FMADD f4, f18, f26, f4
  392. FMADD f5, f18, f27, f5
  393. FMADD f6, f19, f26, f6
  394. FMADD f7, f19, f27, f7
  395. LFD f16, 4 * SIZE(AO1)
  396. LFD f17, 5 * SIZE(AO1)
  397. LFD f18, 4 * SIZE(AO2)
  398. LFD f19, 5 * SIZE(AO2)
  399. FMADD f8, f20, f26, f8
  400. FMADD f9, f20, f27, f9
  401. FMADD f10, f21, f26, f10
  402. FMADD f11, f21, f27, f11
  403. FMADD f12, f22, f26, f12
  404. FMADD f13, f22, f27, f13
  405. FMADD f14, f23, f26, f14
  406. FMADD f15, f23, f27, f15
  407. LFD f20, 4 * SIZE(AO3)
  408. LFD f21, 5 * SIZE(AO3)
  409. LFD f22, 4 * SIZE(AO4)
  410. LFD f23, 5 * SIZE(AO4)
  411. LFD f24, 9 * SIZE(BO)
  412. LFD f25, 10 * SIZE(BO)
  413. LFD f26, 11 * SIZE(BO)
  414. LFD f27, 12 * SIZE(BO)
  415. FMADD f0, f16, f28, f0
  416. FMADD f1, f16, f29, f1
  417. FMADD f2, f17, f28, f2
  418. FMADD f3, f17, f29, f3
  419. FMADD f4, f18, f28, f4
  420. FMADD f5, f18, f29, f5
  421. FMADD f6, f19, f28, f6
  422. FMADD f7, f19, f29, f7
  423. LFD f16, 6 * SIZE(AO1)
  424. LFD f17, 7 * SIZE(AO1)
  425. LFD f18, 6 * SIZE(AO2)
  426. LFD f19, 7 * SIZE(AO2)
  427. FMADD f8, f20, f28, f8
  428. FMADD f9, f20, f29, f9
  429. FMADD f10, f21, f28, f10
  430. FMADD f11, f21, f29, f11
  431. FMADD f12, f22, f28, f12
  432. FMADD f13, f22, f29, f13
  433. FMADD f14, f23, f28, f14
  434. FMADD f15, f23, f29, f15
  435. LFD f20, 6 * SIZE(AO3)
  436. LFD f21, 7 * SIZE(AO3)
  437. LFD f22, 6 * SIZE(AO4)
  438. LFD f23, 7 * SIZE(AO4)
  439. FMADD f0, f16, f30, f0
  440. FMADD f1, f16, f31, f1
  441. FMADD f2, f17, f30, f2
  442. FMADD f3, f17, f31, f3
  443. FMADD f4, f18, f30, f4
  444. FMADD f5, f18, f31, f5
  445. FMADD f6, f19, f30, f6
  446. FMADD f7, f19, f31, f7
  447. LFD f16, 8 * SIZE(AO1)
  448. LFD f17, 9 * SIZE(AO1)
  449. LFD f18, 8 * SIZE(AO2)
  450. LFD f19, 9 * SIZE(AO2)
  451. FMADD f8, f20, f30, f8
  452. FMADD f9, f20, f31, f9
  453. FMADD f10, f21, f30, f10
  454. FMADD f11, f21, f31, f11
  455. FMADD f12, f22, f30, f12
  456. FMADD f13, f22, f31, f13
  457. FMADD f14, f23, f30, f14
  458. FMADD f15, f23, f31, f15
  459. LFD f20, 8 * SIZE(AO3)
  460. LFD f21, 9 * SIZE(AO3)
  461. LFD f22, 8 * SIZE(AO4)
  462. LFD f23, 9 * SIZE(AO4)
  463. LFD f28, 13 * SIZE(BO)
  464. LFD f29, 14 * SIZE(BO)
  465. LFD f30, 15 * SIZE(BO)
  466. LFD f31, 16 * SIZE(BO)
  467. FMADD f0, f16, f24, f0
  468. FMADD f1, f16, f25, f1
  469. FMADD f2, f17, f24, f2
  470. FMADD f3, f17, f25, f3
  471. FMADD f4, f18, f24, f4
  472. FMADD f5, f18, f25, f5
  473. FMADD f6, f19, f24, f6
  474. FMADD f7, f19, f25, f7
  475. LFD f16, 10 * SIZE(AO1)
  476. LFD f17, 11 * SIZE(AO1)
  477. LFD f18, 10 * SIZE(AO2)
  478. LFD f19, 11 * SIZE(AO2)
  479. FMADD f8, f20, f24, f8
  480. FMADD f9, f20, f25, f9
  481. FMADD f10, f21, f24, f10
  482. FMADD f11, f21, f25, f11
  483. FMADD f12, f22, f24, f12
  484. FMADD f13, f22, f25, f13
  485. FMADD f14, f23, f24, f14
  486. FMADD f15, f23, f25, f15
  487. LFD f20, 10 * SIZE(AO3)
  488. LFD f21, 11 * SIZE(AO3)
  489. LFD f22, 10 * SIZE(AO4)
  490. LFD f23, 11 * SIZE(AO4)
  491. FMADD f0, f16, f26, f0
  492. FMADD f1, f16, f27, f1
  493. FMADD f2, f17, f26, f2
  494. FMADD f3, f17, f27, f3
  495. FMADD f4, f18, f26, f4
  496. FMADD f5, f18, f27, f5
  497. FMADD f6, f19, f26, f6
  498. FMADD f7, f19, f27, f7
  499. LFD f16, 12 * SIZE(AO1)
  500. LFD f17, 13 * SIZE(AO1)
  501. LFD f18, 12 * SIZE(AO2)
  502. LFD f19, 13 * SIZE(AO2)
  503. FMADD f8, f20, f26, f8
  504. FMADD f9, f20, f27, f9
  505. FMADD f10, f21, f26, f10
  506. FMADD f11, f21, f27, f11
  507. FMADD f12, f22, f26, f12
  508. FMADD f13, f22, f27, f13
  509. FMADD f14, f23, f26, f14
  510. FMADD f15, f23, f27, f15
  511. LFD f20, 12 * SIZE(AO3)
  512. LFD f21, 13 * SIZE(AO3)
  513. LFD f22, 12 * SIZE(AO4)
  514. LFD f23, 13 * SIZE(AO4)
  515. LFD f24, 17 * SIZE(BO)
  516. LFD f25, 18 * SIZE(BO)
  517. LFD f26, 19 * SIZE(BO)
  518. LFD f27, 20 * SIZE(BO)
  519. FMADD f0, f16, f28, f0
  520. FMADD f1, f16, f29, f1
  521. FMADD f2, f17, f28, f2
  522. FMADD f3, f17, f29, f3
  523. FMADD f4, f18, f28, f4
  524. FMADD f5, f18, f29, f5
  525. FMADD f6, f19, f28, f6
  526. FMADD f7, f19, f29, f7
  527. LFD f16, 14 * SIZE(AO1)
  528. LFD f17, 15 * SIZE(AO1)
  529. LFD f18, 14 * SIZE(AO2)
  530. LFD f19, 15 * SIZE(AO2)
  531. FMADD f8, f20, f28, f8
  532. FMADD f9, f20, f29, f9
  533. FMADD f10, f21, f28, f10
  534. FMADD f11, f21, f29, f11
  535. FMADD f12, f22, f28, f12
  536. FMADD f13, f22, f29, f13
  537. FMADD f14, f23, f28, f14
  538. FMADD f15, f23, f29, f15
  539. LFD f20, 14 * SIZE(AO3)
  540. LFD f21, 15 * SIZE(AO3)
  541. LFD f22, 14 * SIZE(AO4)
  542. LFD f23, 15 * SIZE(AO4)
  543. FMADD f0, f16, f30, f0
  544. FMADD f1, f16, f31, f1
  545. FMADD f2, f17, f30, f2
  546. FMADD f3, f17, f31, f3
  547. FMADD f4, f18, f30, f4
  548. FMADD f5, f18, f31, f5
  549. FMADD f6, f19, f30, f6
  550. FMADD f7, f19, f31, f7
  551. LFD f16, 16 * SIZE(AO1)
  552. LFD f17, 17 * SIZE(AO1)
  553. LFD f18, 16 * SIZE(AO2)
  554. LFD f19, 17 * SIZE(AO2)
  555. addi AO1, AO1, 16 * SIZE
  556. addi AO2, AO2, 16 * SIZE
  557. DCBT(AO1, PREA)
  558. DCBT(AO2, PREA)
  559. FMADD f8, f20, f30, f8
  560. FMADD f9, f20, f31, f9
  561. FMADD f10, f21, f30, f10
  562. FMADD f11, f21, f31, f11
  563. FMADD f12, f22, f30, f12
  564. FMADD f13, f22, f31, f13
  565. FMADD f14, f23, f30, f14
  566. FMADD f15, f23, f31, f15
  567. LFD f20, 16 * SIZE(AO3)
  568. LFD f21, 17 * SIZE(AO3)
  569. LFD f22, 16 * SIZE(AO4)
  570. LFD f23, 17 * SIZE(AO4)
  571. LFD f28, 21 * SIZE(BO)
  572. LFD f29, 22 * SIZE(BO)
  573. LFD f30, 23 * SIZE(BO)
  574. LFD f31, 24 * SIZE(BO)
  575. addi AO3, AO3, 16 * SIZE
  576. addi AO4, AO4, 16 * SIZE
  577. DCBT(AO3, PREA)
  578. DCBT(AO4, PREA)
  579. addi BO, BO, 16 * SIZE
  580. bdnz LL(MainKernel)
  581. .align 4
  582. LL(MainKernelSkip):
  583. FMADD f0, f16, f24, f0
  584. FMADD f1, f16, f25, f1
  585. FMADD f2, f17, f24, f2
  586. FMADD f3, f17, f25, f3
  587. FMADD f4, f18, f24, f4
  588. FMADD f5, f18, f25, f5
  589. FMADD f6, f19, f24, f6
  590. FMADD f7, f19, f25, f7
  591. LFD f16, 2 * SIZE(AO1)
  592. LFD f17, 3 * SIZE(AO1)
  593. LFD f18, 2 * SIZE(AO2)
  594. LFD f19, 3 * SIZE(AO2)
  595. FMADD f8, f20, f24, f8
  596. FMADD f9, f20, f25, f9
  597. FMADD f10, f21, f24, f10
  598. FMADD f11, f21, f25, f11
  599. FMADD f12, f22, f24, f12
  600. FMADD f13, f22, f25, f13
  601. FMADD f14, f23, f24, f14
  602. FMADD f15, f23, f25, f15
  603. LFD f20, 2 * SIZE(AO3)
  604. LFD f21, 3 * SIZE(AO3)
  605. LFD f22, 2 * SIZE(AO4)
  606. LFD f23, 3 * SIZE(AO4)
  607. FMADD f0, f16, f26, f0
  608. FMADD f1, f16, f27, f1
  609. FMADD f2, f17, f26, f2
  610. FMADD f3, f17, f27, f3
  611. FMADD f4, f18, f26, f4
  612. FMADD f5, f18, f27, f5
  613. FMADD f6, f19, f26, f6
  614. FMADD f7, f19, f27, f7
  615. LFD f16, 4 * SIZE(AO1)
  616. LFD f17, 5 * SIZE(AO1)
  617. LFD f18, 4 * SIZE(AO2)
  618. LFD f19, 5 * SIZE(AO2)
  619. FMADD f8, f20, f26, f8
  620. FMADD f9, f20, f27, f9
  621. FMADD f10, f21, f26, f10
  622. FMADD f11, f21, f27, f11
  623. FMADD f12, f22, f26, f12
  624. FMADD f13, f22, f27, f13
  625. FMADD f14, f23, f26, f14
  626. FMADD f15, f23, f27, f15
  627. LFD f20, 4 * SIZE(AO3)
  628. LFD f21, 5 * SIZE(AO3)
  629. LFD f22, 4 * SIZE(AO4)
  630. LFD f23, 5 * SIZE(AO4)
  631. FMADD f0, f16, f28, f0
  632. FMADD f1, f16, f29, f1
  633. FMADD f2, f17, f28, f2
  634. FMADD f3, f17, f29, f3
  635. FMADD f4, f18, f28, f4
  636. FMADD f5, f18, f29, f5
  637. FMADD f6, f19, f28, f6
  638. FMADD f7, f19, f29, f7
  639. LFD f16, 6 * SIZE(AO1)
  640. LFD f17, 7 * SIZE(AO1)
  641. LFD f18, 6 * SIZE(AO2)
  642. LFD f19, 7 * SIZE(AO2)
  643. FMADD f8, f20, f28, f8
  644. FMADD f9, f20, f29, f9
  645. FMADD f10, f21, f28, f10
  646. FMADD f11, f21, f29, f11
  647. FMADD f12, f22, f28, f12
  648. FMADD f13, f22, f29, f13
  649. FMADD f14, f23, f28, f14
  650. FMADD f15, f23, f29, f15
  651. LFD f20, 6 * SIZE(AO3)
  652. LFD f21, 7 * SIZE(AO3)
  653. LFD f22, 6 * SIZE(AO4)
  654. LFD f23, 7 * SIZE(AO4)
  655. FMADD f0, f16, f30, f0
  656. FMADD f1, f16, f31, f1
  657. FMADD f2, f17, f30, f2
  658. FMADD f3, f17, f31, f3
  659. FMADD f4, f18, f30, f4
  660. FMADD f5, f18, f31, f5
  661. FMADD f6, f19, f30, f6
  662. FMADD f7, f19, f31, f7
  663. LFD f16, 8 * SIZE(AO1)
  664. LFD f17, 9 * SIZE(AO1)
  665. LFD f18, 8 * SIZE(AO2)
  666. LFD f19, 9 * SIZE(AO2)
  667. FMADD f8, f20, f30, f8
  668. FMADD f9, f20, f31, f9
  669. FMADD f10, f21, f30, f10
  670. FMADD f11, f21, f31, f11
  671. FMADD f12, f22, f30, f12
  672. FMADD f13, f22, f31, f13
  673. FMADD f14, f23, f30, f14
  674. FMADD f15, f23, f31, f15
  675. LFD f20, 8 * SIZE(AO3)
  676. LFD f21, 9 * SIZE(AO3)
  677. LFD f22, 8 * SIZE(AO4)
  678. LFD f23, 9 * SIZE(AO4)
  679. LFD f24, 9 * SIZE(BO)
  680. LFD f25, 10 * SIZE(BO)
  681. LFD f26, 11 * SIZE(BO)
  682. LFD f27, 12 * SIZE(BO)
  683. LFD f28, 13 * SIZE(BO)
  684. LFD f29, 14 * SIZE(BO)
  685. LFD f30, 15 * SIZE(BO)
  686. LFDU f31, 16 * SIZE(BO)
  687. FMADD f0, f16, f24, f0
  688. FMADD f1, f16, f25, f1
  689. FMADD f2, f17, f24, f2
  690. FMADD f3, f17, f25, f3
  691. FMADD f4, f18, f24, f4
  692. FMADD f5, f18, f25, f5
  693. FMADD f6, f19, f24, f6
  694. FMADD f7, f19, f25, f7
  695. LFD f16, 10 * SIZE(AO1)
  696. LFD f17, 11 * SIZE(AO1)
  697. LFD f18, 10 * SIZE(AO2)
  698. LFD f19, 11 * SIZE(AO2)
  699. FMADD f8, f20, f24, f8
  700. FMADD f9, f20, f25, f9
  701. FMADD f10, f21, f24, f10
  702. FMADD f11, f21, f25, f11
  703. FMADD f12, f22, f24, f12
  704. FMADD f13, f22, f25, f13
  705. FMADD f14, f23, f24, f14
  706. FMADD f15, f23, f25, f15
  707. LFD f20, 10 * SIZE(AO3)
  708. LFD f21, 11 * SIZE(AO3)
  709. LFD f22, 10 * SIZE(AO4)
  710. LFD f23, 11 * SIZE(AO4)
  711. FMADD f0, f16, f26, f0
  712. FMADD f1, f16, f27, f1
  713. FMADD f2, f17, f26, f2
  714. FMADD f3, f17, f27, f3
  715. FMADD f4, f18, f26, f4
  716. FMADD f5, f18, f27, f5
  717. FMADD f6, f19, f26, f6
  718. FMADD f7, f19, f27, f7
  719. LFD f16, 12 * SIZE(AO1)
  720. LFD f17, 13 * SIZE(AO1)
  721. LFD f18, 12 * SIZE(AO2)
  722. LFD f19, 13 * SIZE(AO2)
  723. FMADD f8, f20, f26, f8
  724. FMADD f9, f20, f27, f9
  725. FMADD f10, f21, f26, f10
  726. FMADD f11, f21, f27, f11
  727. FMADD f12, f22, f26, f12
  728. FMADD f13, f22, f27, f13
  729. FMADD f14, f23, f26, f14
  730. FMADD f15, f23, f27, f15
  731. LFD f20, 12 * SIZE(AO3)
  732. LFD f21, 13 * SIZE(AO3)
  733. LFD f22, 12 * SIZE(AO4)
  734. LFD f23, 13 * SIZE(AO4)
  735. FMADD f0, f16, f28, f0
  736. FMADD f1, f16, f29, f1
  737. FMADD f2, f17, f28, f2
  738. FMADD f3, f17, f29, f3
  739. FMADD f4, f18, f28, f4
  740. FMADD f5, f18, f29, f5
  741. FMADD f6, f19, f28, f6
  742. FMADD f7, f19, f29, f7
  743. LFD f16, 14 * SIZE(AO1)
  744. LFD f17, 15 * SIZE(AO1)
  745. LFD f18, 14 * SIZE(AO2)
  746. LFD f19, 15 * SIZE(AO2)
  747. FMADD f8, f20, f28, f8
  748. FMADD f9, f20, f29, f9
  749. FMADD f10, f21, f28, f10
  750. FMADD f11, f21, f29, f11
  751. FMADD f12, f22, f28, f12
  752. FMADD f13, f22, f29, f13
  753. FMADD f14, f23, f28, f14
  754. FMADD f15, f23, f29, f15
  755. LFD f20, 14 * SIZE(AO3)
  756. LFD f21, 15 * SIZE(AO3)
  757. LFD f22, 14 * SIZE(AO4)
  758. LFD f23, 15 * SIZE(AO4)
  759. addi AO1, AO1, 16 * SIZE
  760. addi AO2, AO2, 16 * SIZE
  761. addi AO3, AO3, 16 * SIZE
  762. addi AO4, AO4, 16 * SIZE
  763. FMADD f0, f16, f30, f0
  764. FMADD f1, f16, f31, f1
  765. FMADD f2, f17, f30, f2
  766. FMADD f3, f17, f31, f3
  767. FMADD f4, f18, f30, f4
  768. FMADD f5, f18, f31, f5
  769. FMADD f6, f19, f30, f6
  770. FMADD f7, f19, f31, f7
  771. FMADD f8, f20, f30, f8
  772. FMADD f9, f20, f31, f9
  773. FMADD f10, f21, f30, f10
  774. FMADD f11, f21, f31, f11
  775. FMADD f12, f22, f30, f12
  776. FMADD f13, f22, f31, f13
  777. FMADD f14, f23, f30, f14
  778. FMADD f15, f23, f31, f15
  779. .align 4
  780. LL(MainN3):
  781. andi. r0, MIN_N, 7
  782. mtspr CTR, r0
  783. ble LL(MainFinish)
  784. .align 4
  785. LFD f16, 0 * SIZE(AO1)
  786. LFD f17, 1 * SIZE(AO1)
  787. LFD f18, 0 * SIZE(AO2)
  788. LFD f19, 1 * SIZE(AO2)
  789. LFD f20, 0 * SIZE(AO3)
  790. LFD f21, 1 * SIZE(AO3)
  791. LFD f22, 0 * SIZE(AO4)
  792. LFD f23, 1 * SIZE(AO4)
  793. LFD f24, 1 * SIZE(BO)
  794. LFDU f25, 2 * SIZE(BO)
  795. addi AO1, AO1, 2 * SIZE
  796. addi AO2, AO2, 2 * SIZE
  797. addi AO3, AO3, 2 * SIZE
  798. addi AO4, AO4, 2 * SIZE
  799. bdz LL(MainN3KernelSkip)
  800. .align 4
  801. LL(MainN3Kernel):
  802. FMADD f0, f16, f24, f0
  803. FMADD f1, f16, f25, f1
  804. FMADD f2, f17, f24, f2
  805. FMADD f3, f17, f25, f3
  806. FMADD f4, f18, f24, f4
  807. FMADD f5, f18, f25, f5
  808. FMADD f6, f19, f24, f6
  809. FMADD f7, f19, f25, f7
  810. LFD f16, 0 * SIZE(AO1)
  811. LFD f17, 1 * SIZE(AO1)
  812. LFD f18, 0 * SIZE(AO2)
  813. LFD f19, 1 * SIZE(AO2)
  814. FMADD f8, f20, f24, f8
  815. FMADD f9, f20, f25, f9
  816. FMADD f10, f21, f24, f10
  817. FMADD f11, f21, f25, f11
  818. FMADD f12, f22, f24, f12
  819. FMADD f13, f22, f25, f13
  820. FMADD f14, f23, f24, f14
  821. FMADD f15, f23, f25, f15
  822. LFD f20, 0 * SIZE(AO3)
  823. LFD f21, 1 * SIZE(AO3)
  824. LFD f22, 0 * SIZE(AO4)
  825. LFD f23, 1 * SIZE(AO4)
  826. LFD f24, 1 * SIZE(BO)
  827. LFDU f25, 2 * SIZE(BO)
  828. addi AO1, AO1, 2 * SIZE
  829. addi AO2, AO2, 2 * SIZE
  830. addi AO3, AO3, 2 * SIZE
  831. addi AO4, AO4, 2 * SIZE
  832. bdnz LL(MainN3Kernel)
  833. .align 4
  834. LL(MainN3KernelSkip):
  835. FMADD f0, f16, f24, f0
  836. FMADD f1, f16, f25, f1
  837. FMADD f2, f17, f24, f2
  838. FMADD f3, f17, f25, f3
  839. FMADD f4, f18, f24, f4
  840. FMADD f5, f18, f25, f5
  841. FMADD f6, f19, f24, f6
  842. FMADD f7, f19, f25, f7
  843. FMADD f8, f20, f24, f8
  844. FMADD f9, f20, f25, f9
  845. FMADD f10, f21, f24, f10
  846. FMADD f11, f21, f25, f11
  847. FMADD f12, f22, f24, f12
  848. FMADD f13, f22, f25, f13
  849. FMADD f14, f23, f24, f14
  850. FMADD f15, f23, f25, f15
  851. .align 4
  /*
   * MainFinish: end of one pass of the 4-result main loop.
   * On entry f0..f15 hold four groups of four partial sums
   * (f0-f3, f4-f7, f8-f11, f12-f15).  Each group is folded into a pair
   * (f0/f1, f4/f5, f8/f9, f12/f13), scaled by the two values loaded from
   * ALPHA_R / ALPHA_I and accumulated into the output pointed to by CO.
   * NOTE(review): FMADD/FMADDR/FMSUBR/LFD/STFD are macros defined outside
   * this chunk; comments below assume FMADD d,a,b,c computes a*b+c and that
   * FMADDR/FMSUBR choose their sign from the conjugation mode - confirm.
   */
  852. LL(MainFinish):
  /* f30/f31 <- scaling factors (presumably real/imaginary part of alpha). */
  853. lfd f30, ALPHA_R
  854. lfd f31, ALPHA_I
  /*
   * Fold the cross products.  In every variant the first register of a
   * group is combined with the fourth, and the second with the third;
   * only the signs differ between the XCONJ/CONJ combinations.
   */
  855. #ifndef XCONJ
  856. #ifndef CONJ
  /* neither XCONJ nor CONJ: first = first - fourth, second = second + third */
  857. FSUB f0, f0, f3
  858. FADD f1, f1, f2
  859. FSUB f4, f4, f7
  860. FADD f5, f5, f6
  861. FSUB f8, f8, f11
  862. FADD f9, f9, f10
  863. FSUB f12, f12, f15
  864. FADD f13, f13, f14
  865. #else
  /* CONJ only: first = first + fourth, second = second - third */
  866. FADD f0, f0, f3
  867. FSUB f1, f1, f2
  868. FADD f4, f4, f7
  869. FSUB f5, f5, f6
  870. FADD f8, f8, f11
  871. FSUB f9, f9, f10
  872. FADD f12, f12, f15
  873. FSUB f13, f13, f14
  874. #endif
  875. #else
  876. #ifndef CONJ
  /* XCONJ only: first = first + fourth, second = third - second */
  877. FADD f0, f0, f3
  878. FSUB f1, f2, f1
  879. FADD f4, f4, f7
  880. FSUB f5, f6, f5
  881. FADD f8, f8, f11
  882. FSUB f9, f10, f9
  883. FADD f12, f12, f15
  884. FSUB f13, f14, f13
  885. #else
  /* XCONJ and CONJ: same sign pattern as the non-conjugated case */
  886. FSUB f0, f0, f3
  887. FADD f1, f1, f2
  888. FSUB f4, f4, f7
  889. FADD f5, f5, f6
  890. FSUB f8, f8, f11
  891. FADD f9, f9, f10
  892. FSUB f12, f12, f15
  893. FADD f13, f13, f14
  894. #endif
  895. #endif
  /* Keep a copy of the output pointer; the strided path stores through BO. */
  896. mr BO, CO
  /* Fast path only when INCY equals 2 * SIZE (output entries adjacent). */
  897. cmpwi cr0, INCY, 2 * SIZE
  898. bne LL(FinishN1)
  /* Contiguous path: read the four current output pairs. */
  899. LFD f16, 0 * SIZE(CO)
  900. LFD f17, 1 * SIZE(CO)
  901. LFD f18, 2 * SIZE(CO)
  902. LFD f19, 3 * SIZE(CO)
  903. LFD f20, 4 * SIZE(CO)
  904. LFD f21, 5 * SIZE(CO)
  905. LFD f22, 6 * SIZE(CO)
  906. LFD f23, 7 * SIZE(CO)
  /* Accumulate the f30-scaled contribution of each folded pair. */
  907. FMADD f16, f30, f0, f16
  908. FMADDR f17, f30, f1, f17
  909. FMADD f18, f30, f4, f18
  910. FMADDR f19, f30, f5, f19
  911. FMADD f20, f30, f8, f20
  912. FMADDR f21, f30, f9, f21
  913. FMADD f22, f30, f12, f22
  914. FMADDR f23, f30, f13, f23
  /* Accumulate the f31-scaled cross contribution (operands of a pair swapped). */
  915. FMSUBR f16, f31, f1, f16
  916. FMADD f17, f31, f0, f17
  917. FMSUBR f18, f31, f5, f18
  918. FMADD f19, f31, f4, f19
  919. FMSUBR f20, f31, f9, f20
  920. FMADD f21, f31, f8, f21
  921. FMSUBR f22, f31, f13, f22
  922. FMADD f23, f31, f12, f23
  /* Write the four updated pairs back. */
  923. STFD f16, 0 * SIZE(CO)
  924. STFD f17, 1 * SIZE(CO)
  925. STFD f18, 2 * SIZE(CO)
  926. STFD f19, 3 * SIZE(CO)
  927. STFD f20, 4 * SIZE(CO)
  928. STFD f21, 5 * SIZE(CO)
  929. STFD f22, 6 * SIZE(CO)
  930. STFD f23, 7 * SIZE(CO)
  /* Advance past the four pairs just written and count down J. */
  931. addi CO, CO, 8 * SIZE
  932. addi J, J, -1
  933. cmpwi cr0, J, 0
  /* More passes pending: back to MainHead (defined above this chunk). */
  934. bgt LL(MainHead)
  /* Otherwise jump over the strided variant to the remainder handling. */
  935. b LL(Remain)
  936. .align 4
  /*
   * FinishN1: same output update as the contiguous path of MainFinish, for
   * the case where INCY differs from 2 * SIZE.  INCY is added directly to
   * the pointers, so it is used as a byte stride here.
   * Loads walk CO forward by INCY four times; stores walk BO, which
   * MainFinish set to the value CO had before the loads.  After the block CO
   * therefore already points past the four entries just updated.
   * NOTE(review): FMADDR/FMSUBR are macros defined outside this chunk.
   */
  937. LL(FinishN1):
  /* Read the four current output pairs, one stride apart. */
  938. LFD f16, 0 * SIZE(CO)
  939. LFD f17, 1 * SIZE(CO)
  940. add CO, CO, INCY
  941. LFD f18, 0 * SIZE(CO)
  942. LFD f19, 1 * SIZE(CO)
  943. add CO, CO, INCY
  944. LFD f20, 0 * SIZE(CO)
  945. LFD f21, 1 * SIZE(CO)
  946. add CO, CO, INCY
  947. LFD f22, 0 * SIZE(CO)
  948. LFD f23, 1 * SIZE(CO)
  949. add CO, CO, INCY
  /* Accumulate the f30-scaled contribution of each folded pair. */
  950. FMADD f16, f30, f0, f16
  951. FMADDR f17, f30, f1, f17
  952. FMADD f18, f30, f4, f18
  953. FMADDR f19, f30, f5, f19
  954. FMADD f20, f30, f8, f20
  955. FMADDR f21, f30, f9, f21
  956. FMADD f22, f30, f12, f22
  957. FMADDR f23, f30, f13, f23
  /* Accumulate the f31-scaled cross contribution. */
  958. FMSUBR f16, f31, f1, f16
  959. FMADD f17, f31, f0, f17
  960. FMSUBR f18, f31, f5, f18
  961. FMADD f19, f31, f4, f19
  962. FMSUBR f20, f31, f9, f20
  963. FMADD f21, f31, f8, f21
  964. FMSUBR f22, f31, f13, f22
  965. FMADD f23, f31, f12, f23
  /* Store through BO; it is not advanced after the last pair. */
  966. STFD f16, 0 * SIZE(BO)
  967. STFD f17, 1 * SIZE(BO)
  968. add BO, BO, INCY
  969. STFD f18, 0 * SIZE(BO)
  970. STFD f19, 1 * SIZE(BO)
  971. add BO, BO, INCY
  972. STFD f20, 0 * SIZE(BO)
  973. STFD f21, 1 * SIZE(BO)
  974. add BO, BO, INCY
  975. STFD f22, 0 * SIZE(BO)
  976. STFD f23, 1 * SIZE(BO)
  /* Count down J; loop to MainHead while positive, else fall into Remain. */
  977. addi J, J, -1
  978. cmpwi cr0, J, 0
  979. bgt LL(MainHead)
  980. .align 4
  /*
   * Remain: handles the N & 3 results left over after the 4-at-a-time main
   * loop, one result per pass of RemainHead .. RemainFinish.
   */
  981. LL(Remain):
  /* J = N & 3; the result is never negative, so ble branches only on zero. */
  982. andi. J, N, 3
  983. ble LL(ISEnd)
  984. .align 4
  /*
   * RemainHead: set up one single-result pass.
   *   AO1 <- A, then A advances by LDA for the next pass
   *   BO  <- XP (start of the second operand for this pass)
   *   f0..f15 <- value loaded from FZERO (presumably 0.0) to clear all
   *              sixteen accumulators
   */
  985. LL(RemainHead):
  986. mr AO1, A
  987. add A, A, LDA
  988. mr BO, XP
  989. lfd f0, FZERO
  990. fmr f1, f0
  991. fmr f2, f0
  992. fmr f3, f0
  993. fmr f4, f0
  994. fmr f5, f0
  995. fmr f6, f0
  996. fmr f7, f0
  997. fmr f8, f0
  998. fmr f9, f0
  999. fmr f10, f0
  1000. fmr f11, f0
  1001. fmr f12, f0
  1002. fmr f13, f0
  1003. fmr f14, f0
  1004. fmr f15, f0
  /*
   * CTR = MIN_N >> 3: number of unrolled iterations, each consuming
   * 16 * SIZE from AO1 and BO.  The branch tests the srawi. result
   * (mtspr does not touch cr0); with no full iteration go to RemainN3.
   */
  1005. srawi. r0 , MIN_N, 3
  1006. mtspr CTR, r0
  1007. ble LL(RemainN3)
  /* Preload the first 8 values from AO1 ... */
  1008. LFD f16, 0 * SIZE(AO1)
  1009. LFD f17, 1 * SIZE(AO1)
  1010. LFD f18, 2 * SIZE(AO1)
  1011. LFD f19, 3 * SIZE(AO1)
  1012. LFD f20, 4 * SIZE(AO1)
  1013. LFD f21, 5 * SIZE(AO1)
  1014. LFD f22, 6 * SIZE(AO1)
  1015. LFD f23, 7 * SIZE(AO1)
  /* ... and the first 8 values from BO, which are read at offsets 1..8. */
  1016. LFD f24, 1 * SIZE(BO)
  1017. LFD f25, 2 * SIZE(BO)
  1018. LFD f26, 3 * SIZE(BO)
  1019. LFD f27, 4 * SIZE(BO)
  1020. LFD f28, 5 * SIZE(BO)
  1021. LFD f29, 6 * SIZE(BO)
  1022. LFD f30, 7 * SIZE(BO)
  1023. LFD f31, 8 * SIZE(BO)
  /* Decrement CTR; if only one iteration was requested skip the loop body. */
  1024. bdz LL(RemainKernelSkip)
  1025. .align 4
  /*
   * RemainKernel: software-pipelined inner loop of the single-result pass.
   * Register roles: f16..f23 hold values loaded from AO1, f24..f31 values
   * loaded from BO, f0..f15 are the accumulators.  Each iteration consumes
   * 16 * SIZE from both pointers in two rounds of 8; the multiply-adds on
   * already-loaded registers are interleaved with the loads for the next
   * round, so the instruction order must not be changed.
   * On loop entry the first round's operands are already in registers.
   */
  1026. LL(RemainKernel):
  /* Round 1, first half: products of f16..f19 with f24..f27. */
  1027. FMADD f0, f16, f24, f0
  1028. FMADD f1, f16, f25, f1
  1029. FMADD f2, f17, f24, f2
  1030. FMADD f3, f17, f25, f3
  1031. FMADD f4, f18, f26, f4
  1032. FMADD f5, f18, f27, f5
  1033. FMADD f6, f19, f26, f6
  1034. FMADD f7, f19, f27, f7
  /* Refill the registers just consumed with round-2 operands. */
  1035. LFD f16, 8 * SIZE(AO1)
  1036. LFD f17, 9 * SIZE(AO1)
  1037. LFD f18, 10 * SIZE(AO1)
  1038. LFD f19, 11 * SIZE(AO1)
  1039. LFD f24, 9 * SIZE(BO)
  1040. LFD f25, 10 * SIZE(BO)
  1041. LFD f26, 11 * SIZE(BO)
  1042. LFD f27, 12 * SIZE(BO)
  /* Round 1, second half: products of f20..f23 with f28..f31. */
  1043. FMADD f8, f20, f28, f8
  1044. FMADD f9, f20, f29, f9
  1045. FMADD f10, f21, f28, f10
  1046. FMADD f11, f21, f29, f11
  1047. FMADD f12, f22, f30, f12
  1048. FMADD f13, f22, f31, f13
  1049. FMADD f14, f23, f30, f14
  1050. FMADD f15, f23, f31, f15
  1051. LFD f20, 12 * SIZE(AO1)
  1052. LFD f21, 13 * SIZE(AO1)
  1053. LFD f22, 14 * SIZE(AO1)
  1054. LFD f23, 15 * SIZE(AO1)
  1055. LFD f28, 13 * SIZE(BO)
  1056. LFD f29, 14 * SIZE(BO)
  1057. LFD f30, 15 * SIZE(BO)
  1058. LFD f31, 16 * SIZE(BO)
  /* Round 2, first half. */
  1059. FMADD f0, f16, f24, f0
  1060. FMADD f1, f16, f25, f1
  1061. FMADD f2, f17, f24, f2
  1062. FMADD f3, f17, f25, f3
  1063. FMADD f4, f18, f26, f4
  1064. FMADD f5, f18, f27, f5
  1065. FMADD f6, f19, f26, f6
  1066. FMADD f7, f19, f27, f7
  /* Preload round-1 operands of the NEXT iteration (offsets 16 and up). */
  1067. LFD f16, 16 * SIZE(AO1)
  1068. LFD f17, 17 * SIZE(AO1)
  1069. LFD f18, 18 * SIZE(AO1)
  1070. LFD f19, 19 * SIZE(AO1)
  1071. LFD f24, 17 * SIZE(BO)
  1072. LFD f25, 18 * SIZE(BO)
  1073. LFD f26, 19 * SIZE(BO)
  1074. LFD f27, 20 * SIZE(BO)
  /* Round 2, second half. */
  1075. FMADD f8, f20, f28, f8
  1076. FMADD f9, f20, f29, f9
  1077. FMADD f10, f21, f28, f10
  1078. FMADD f11, f21, f29, f11
  1079. FMADD f12, f22, f30, f12
  1080. FMADD f13, f22, f31, f13
  1081. FMADD f14, f23, f30, f14
  1082. FMADD f15, f23, f31, f15
  1083. LFD f20, 20 * SIZE(AO1)
  1084. LFD f21, 21 * SIZE(AO1)
  1085. LFD f22, 22 * SIZE(AO1)
  1086. LFD f23, 23 * SIZE(AO1)
  1087. LFD f28, 21 * SIZE(BO)
  1088. LFD f29, 22 * SIZE(BO)
  1089. LFD f30, 23 * SIZE(BO)
  1090. LFD f31, 24 * SIZE(BO)
  /* Step both streams past the 16 values consumed by this iteration. */
  1091. addi AO1, AO1, 16 * SIZE
  1092. addi BO, BO, 16 * SIZE
  /* DCBT is a macro defined elsewhere; presumably a cache-touch (prefetch)
     hint for AO1 + PREA - confirm against its definition. */
  1093. DCBT(AO1, PREA)
  /* Decrement CTR and repeat while it is non-zero. */
  1094. bdnz LL(RemainKernel)
  1095. .align 4
  /*
   * RemainKernelSkip: drain stage of the pipelined loop.  Processes the last
   * group of 16 values per stream: round 1 uses the operands already held in
   * f16..f31, round 2 is loaded here, and nothing beyond offset 15 of AO1 /
   * 16 of BO is read (unlike the loop body, which preloads the next group).
   */
  1096. LL(RemainKernelSkip):
  /* Round 1, first half. */
  1097. FMADD f0, f16, f24, f0
  1098. FMADD f1, f16, f25, f1
  1099. FMADD f2, f17, f24, f2
  1100. FMADD f3, f17, f25, f3
  1101. FMADD f4, f18, f26, f4
  1102. FMADD f5, f18, f27, f5
  1103. FMADD f6, f19, f26, f6
  1104. FMADD f7, f19, f27, f7
  /* Load round-2 operands for the first half. */
  1105. LFD f16, 8 * SIZE(AO1)
  1106. LFD f17, 9 * SIZE(AO1)
  1107. LFD f18, 10 * SIZE(AO1)
  1108. LFD f19, 11 * SIZE(AO1)
  1109. LFD f24, 9 * SIZE(BO)
  1110. LFD f25, 10 * SIZE(BO)
  1111. LFD f26, 11 * SIZE(BO)
  1112. LFD f27, 12 * SIZE(BO)
  /* Round 1, second half. */
  1113. FMADD f8, f20, f28, f8
  1114. FMADD f9, f20, f29, f9
  1115. FMADD f10, f21, f28, f10
  1116. FMADD f11, f21, f29, f11
  1117. FMADD f12, f22, f30, f12
  1118. FMADD f13, f22, f31, f13
  1119. FMADD f14, f23, f30, f14
  1120. FMADD f15, f23, f31, f15
  1121. LFD f20, 12 * SIZE(AO1)
  1122. LFD f21, 13 * SIZE(AO1)
  1123. LFD f22, 14 * SIZE(AO1)
  1124. LFD f23, 15 * SIZE(AO1)
  1125. LFD f28, 13 * SIZE(BO)
  1126. LFD f29, 14 * SIZE(BO)
  1127. LFD f30, 15 * SIZE(BO)
  /* LFDU (presumably the update-form load) also advances BO by 16 * SIZE,
     replacing the explicit addi used in the loop body. */
  1128. LFDU f31, 16 * SIZE(BO)
  /* Round 2: all sixteen multiply-adds, no further loads. */
  1129. FMADD f0, f16, f24, f0
  1130. FMADD f1, f16, f25, f1
  1131. FMADD f2, f17, f24, f2
  1132. FMADD f3, f17, f25, f3
  1133. FMADD f4, f18, f26, f4
  1134. FMADD f5, f18, f27, f5
  1135. FMADD f6, f19, f26, f6
  1136. FMADD f7, f19, f27, f7
  1137. FMADD f8, f20, f28, f8
  1138. FMADD f9, f20, f29, f9
  1139. FMADD f10, f21, f28, f10
  1140. FMADD f11, f21, f29, f11
  1141. FMADD f12, f22, f30, f12
  1142. FMADD f13, f22, f31, f13
  1143. FMADD f14, f23, f30, f14
  1144. FMADD f15, f23, f31, f15
  /* Step AO1 past the group so RemainN3 continues right after it. */
  1145. addi AO1, AO1, 16 * SIZE
  1146. .align 4
  /*
   * RemainN3: tail of the single-result pass.  Handles the MIN_N & 7 entries
   * not covered by the 8-way unrolled loop, one pair of values per stream
   * per iteration, accumulating into f0..f3 only.
   */
  1147. LL(RemainN3):
  /* CTR = MIN_N & 7; the result is never negative, so ble means "none left". */
  1148. andi. r0, MIN_N, 7
  1149. mtspr CTR, r0
  1150. ble LL(RemainFinish)
  1151. .align 4
  /* Preload the first pair from each stream.  LFDU also advances BO by
     2 * SIZE; AO1 is advanced explicitly. */
  1152. LFD f16, 0 * SIZE(AO1)
  1153. LFD f17, 1 * SIZE(AO1)
  1154. LFD f24, 1 * SIZE(BO)
  1155. LFDU f25, 2 * SIZE(BO)
  1156. addi AO1, AO1, 2 * SIZE
  /* Decrement CTR; with a single entry go straight to the final products. */
  1157. bdz LL(RemainN3KernelSkip)
  1158. .align 4
  /* Loop body: four products for the pair in registers, then load the next. */
  1159. LL(RemainN3Kernel):
  1160. FMADD f0, f16, f24, f0
  1161. FMADD f1, f16, f25, f1
  1162. FMADD f2, f17, f24, f2
  1163. FMADD f3, f17, f25, f3
  1164. LFD f16, 0 * SIZE(AO1)
  1165. LFD f17, 1 * SIZE(AO1)
  1166. LFD f24, 1 * SIZE(BO)
  1167. LFDU f25, 2 * SIZE(BO)
  1168. addi AO1, AO1, 2 * SIZE
  1169. bdnz LL(RemainN3Kernel)
  1170. .align 4
  /* Drain: products for the last pair loaded, no further loads. */
  1171. LL(RemainN3KernelSkip):
  1172. FMADD f0, f16, f24, f0
  1173. FMADD f1, f16, f25, f1
  1174. FMADD f2, f17, f24, f2
  1175. FMADD f3, f17, f25, f3
  1176. .align 4
  /*
   * RemainFinish: end of one single-result pass.  Reduces the sixteen
   * accumulators to one pair, scales it by the values loaded from
   * ALPHA_R / ALPHA_I and accumulates it into the output pair at CO.
   * NOTE(review): FMADDR/FMSUBR are macros defined outside this chunk;
   * presumably their sign depends on the conjugation mode - confirm.
   */
  1177. LL(RemainFinish):
  /* f30/f31 were used as operand registers by the kernel; reload the
     scaling factors. */
  1178. lfd f30, ALPHA_R
  1179. lfd f31, ALPHA_I
  /* Current output pair. */
  1180. LFD f16, 0 * SIZE(CO)
  1181. LFD f17, 1 * SIZE(CO)
  /* Tree reduction: (f4..f7 -> f0..f3), (f12..f15 -> f8..f11), then
     (f8..f11 -> f0..f3).  Afterwards f0..f3 hold the four partial sums. */
  1182. FADD f0, f0, f4
  1183. FADD f1, f1, f5
  1184. FADD f2, f2, f6
  1185. FADD f3, f3, f7
  1186. FADD f8, f8, f12
  1187. FADD f9, f9, f13
  1188. FADD f10, f10, f14
  1189. FADD f11, f11, f15
  1190. FADD f0, f0, f8
  1191. FADD f1, f1, f9
  1192. FADD f2, f2, f10
  1193. FADD f3, f3, f11
  /* Fold f0..f3 into the pair f0/f1; the sign pattern per XCONJ/CONJ
     combination is the same one used in MainFinish. */
  1194. #ifndef XCONJ
  1195. #ifndef CONJ
  /* neither XCONJ nor CONJ */
  1196. FSUB f0, f0, f3
  1197. FADD f1, f1, f2
  1198. #else
  /* CONJ only */
  1199. FADD f0, f0, f3
  1200. FSUB f1, f1, f2
  1201. #endif
  1202. #else
  1203. #ifndef CONJ
  /* XCONJ only */
  1204. FADD f0, f0, f3
  1205. FSUB f1, f2, f1
  1206. #else
  /* XCONJ and CONJ */
  1207. FSUB f0, f0, f3
  1208. FADD f1, f1, f2
  1209. #endif
  1210. #endif
  /* Scale by f30/f31 and accumulate into the loaded output pair. */
  1211. FMADD f16, f30, f0, f16
  1212. FMADDR f17, f30, f1, f17
  1213. FMSUBR f16, f31, f1, f16
  1214. FMADD f17, f31, f0, f17
  1215. STFD f16, 0 * SIZE(CO)
  1216. STFD f17, 1 * SIZE(CO)
  /* Next output entry (INCY is added directly, i.e. used as a byte stride). */
  1217. add CO, CO, INCY
  1218. addi J, J, -1
  /* Explicit-L form of the compare: L = 0 selects a 32-bit signed compare. */
  1219. cmpi cr0, 0, J, 0
  1220. bgt LL(RemainHead)
  1221. .align 4
  /*
   * ISEnd: bottom of the outer IS loop (its head, ISLoop, is above this
   * chunk).  Subtracts PLDA_M from A (subf A, PLDA_M, A computes
   * A - PLDA_M), advances IS by the constant P and repeats while IS < M.
   * NOTE(review): PLDA_M is computed outside this chunk; presumably it
   * rewinds the per-result LDA steps and moves A to the next block - confirm.
   */
  1222. LL(ISEnd):
  1223. subf A, PLDA_M, A
  1224. addi IS, IS, P
  /* L = 0: 32-bit signed compare of IS against M. */
  1225. cmp cr0, 0, IS, M
  1226. blt LL(ISLoop)
  1227. .align 4
  /*
   * End: common exit path.  Sets the return value to 0, reloads f14..f31
   * and r14..r25 from the stack frame, releases the frame and returns.
   * The matching saves are in the prologue, which is outside this chunk;
   * the offsets here presumably mirror it - confirm when editing either side.
   */
  1228. LL(End):
  /* r3 = 0: value handed back to the caller. */
  1229. li r3, 0
  /* Floating-point registers f14..f31, 8 bytes each at SP + 0 .. SP + 136. */
  1230. lfd f14, 0(SP)
  1231. lfd f15, 8(SP)
  1232. lfd f16, 16(SP)
  1233. lfd f17, 24(SP)
  1234. lfd f18, 32(SP)
  1235. lfd f19, 40(SP)
  1236. lfd f20, 48(SP)
  1237. lfd f21, 56(SP)
  1238. lfd f22, 64(SP)
  1239. lfd f23, 72(SP)
  1240. lfd f24, 80(SP)
  1241. lfd f25, 88(SP)
  1242. lfd f26, 96(SP)
  1243. lfd f27, 104(SP)
  1244. lfd f28, 112(SP)
  1245. lfd f29, 120(SP)
  1246. lfd f30, 128(SP)
  1247. lfd f31, 136(SP)
  /* General-purpose registers r14..r25, starting at SP + 144. */
  1248. #ifdef __64BIT__
  /* 64-bit build: doubleword loads, 8-byte slots. */
  1249. ld r14, 144(SP)
  1250. ld r15, 152(SP)
  1251. ld r16, 160(SP)
  1252. ld r17, 168(SP)
  1253. ld r18, 176(SP)
  1254. ld r19, 184(SP)
  1255. ld r20, 192(SP)
  1256. ld r21, 200(SP)
  1257. ld r22, 208(SP)
  1258. ld r23, 216(SP)
  1259. ld r24, 224(SP)
  1260. ld r25, 232(SP)
  1261. #else
  /* 32-bit build: word loads, 4-byte slots. */
  1262. lwz r14, 144(SP)
  1263. lwz r15, 148(SP)
  1264. lwz r16, 152(SP)
  1265. lwz r17, 156(SP)
  1266. lwz r18, 160(SP)
  1267. lwz r19, 164(SP)
  1268. lwz r20, 168(SP)
  1269. lwz r21, 172(SP)
  1270. lwz r22, 176(SP)
  1271. lwz r23, 180(SP)
  1272. lwz r24, 184(SP)
  1273. lwz r25, 188(SP)
  1274. #endif
  /* Pop the STACKSIZE-byte frame and return to the caller. */
  1275. addi SP, SP, STACKSIZE
  1276. blr
  /* EPILOGUE is a macro defined elsewhere; presumably it closes the
     function symbol opened by the prologue macro - confirm. */
  1277. EPILOGUE
  1278. #endif