You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ger.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef NEEDPARAM
  41. #ifndef DOUBLE
  42. #include "sparam.h"
  43. #else
  44. #include "dparam.h"
  45. #endif
  46. #endif
  /* Argument registers.                                               */
  /* M, N, X, INCX, Y, INCY, A and LDA name the general-purpose        */
  /* registers used for the routine's arguments. The assignment        */
  /* depends on the OS and on __64BIT__ / DOUBLE. Arguments that do    */
  /* not arrive in their register are loaded from the caller's frame   */
  /* (FRAMESLOT) in the prologue.                                      */
  47. #if defined(linux) || defined(__FreeBSD__)
  48. #ifndef __64BIT__
  /* Linux/FreeBSD, 32-bit: LDA is fetched from the frame in the prologue. */
  49. #define M r3
  50. #define N r4
  51. #define X r6
  52. #define INCX r7
  53. #define Y r8
  54. #define INCY r9
  55. #define A r10
  56. #define LDA r5
  57. #else
  /* Linux/FreeBSD, 64-bit: A and LDA are fetched from the frame in the prologue. */
  58. #define M r3
  59. #define N r4
  60. #define X r7
  61. #define INCX r8
  62. #define Y r9
  63. #define INCY r10
  64. #define A r5
  65. #define LDA r6
  66. #endif
  67. #endif
  68. #if defined(_AIX) || defined(__APPLE__)
  69. #if !defined(__64BIT__) && defined(DOUBLE)
  /* AIX/Apple, 32-bit double precision: INCY, A and LDA are fetched from the frame. */
  70. #define M r3
  71. #define N r4
  72. #define X r8
  73. #define INCX r9
  74. #define Y r10
  75. #define INCY r5
  76. #define A r6
  77. #define LDA r7
  78. #else
  /* AIX/Apple, all other configurations: A and LDA are fetched from the frame. */
  79. #define M r3
  80. #define N r4
  81. #define X r7
  82. #define INCX r8
  83. #define Y r9
  84. #define INCY r10
  85. #define A r5
  86. #define LDA r6
  87. #endif
  88. #endif
  /* Working general-purpose registers.                                */
  /* J counts the remaining column pairs. I is defined here but not    */
  /* referenced by the code in this file.                              */
  89. #define I r11
  90. #define J r12
  /* Column pointers into A. Only AO1 and AO2 are used by the code in  */
  /* this file; AO3..AO8 are defined but not referenced.               */
  91. #define AO1 r14
  92. #define AO2 r15
  93. #define AO3 r16
  94. #define AO4 r17
  95. #define AO5 r18
  96. #define AO6 r19
  97. #define AO7 r20
  98. #define AO8 r21
  /* X1: running pointer into the (possibly packed) x vector.          */
  /* PREA / PREC: prefetch distances in bytes, set in the prologue.    */
  /* XX: start of the x data used by the compute loops (X or BUFFER).  */
  /* BUFFER: scratch area, loaded from the caller's frame.             */
  99. #define X1 r22
  100. #define PREA r23
  101. #define PREC r24
  102. #define XX r25
  103. #define BUFFER r26
  /* y01..y08 hold eight consecutive values loaded through X1, i.e.    */
  /* elements of x despite the "y" in the name.                        */
  104. #define y01 f0
  105. #define y02 f1
  106. #define y03 f2
  107. #define y04 f3
  108. #define y05 f4
  109. #define y06 f5
  110. #define y07 f6
  111. #define y08 f7
  /* alpha1 / alpha2: alpha times the y element of the current column  */
  /* and of the next column.                                           */
  112. #define alpha1 f8
  113. #define alpha2 f9
  /* a1..a16 hold elements of A. In the two-column 8/16-row code       */
  /* a1..a8 come from AO1 and a9..a16 from AO2; the shorter tails      */
  /* reuse the low-numbered registers for both columns.                */
  114. #define a1 f12
  115. #define a2 f13
  116. #define a3 f14
  117. #define a4 f15
  118. #define a5 f16
  119. #define a6 f17
  120. #define a7 f18
  121. #define a8 f19
  122. #define a9 f20
  123. #define a10 f21
  124. #define a11 f22
  125. #define a12 f23
  126. #define a13 f24
  127. #define a14 f25
  128. #define a15 f26
  129. #define a16 f27
  /* alpha: the scalar, copied out of f1 in the prologue.              */
  130. #define alpha f31
  /* Prefetch distances, in elements, per CPU model. The prologue      */
  /* multiplies them by SIZE to obtain the byte offsets PREA and PREC. */
  /* NOTE(review): only the CPU models below get a value here; any     */
  /* other target must define PREFETCHSIZE_A / PREFETCHSIZE_C          */
  /* elsewhere - confirm against the build configuration.              */
  131. #if defined(PPC440) || defined(PPC440FP2)
  132. #define PREFETCHSIZE_A 24
  133. #define PREFETCHSIZE_C 16
  134. #endif
  135. #ifdef PPC970
  136. #define PREFETCHSIZE_A 16
  137. #define PREFETCHSIZE_C 16
  138. #endif
  139. #ifdef POWER4
  140. #define PREFETCHSIZE_A 16
  141. #define PREFETCHSIZE_C 16
  142. #endif
  143. #ifdef POWER5
  144. #define PREFETCHSIZE_A 16
  145. #define PREFETCHSIZE_C 16
  146. #endif
  /* Routine body; compiled only when NEEDPARAM is not defined.        */
  /*                                                                   */
  /* For every column j of A the code below computes                   */
  /*     A(0..M-1, j) += (alpha * y[j]) * x(0..M-1)                    */
  /* with columns LDA elements apart, i.e. a rank-1 update of A.       */
  /* alpha arrives in f1; the other arguments use the registers named  */
  /* by the macros above. r3 is set to 0 on exit.                      */
  147. #ifndef NEEDPARAM
  /* Frame layout: f14..f31 at 0..143(SP), then r14..r27 from 144(SP). */
  148. #ifndef __64BIT__
  149. #define STACKSIZE 224
  150. #else
  151. #define STACKSIZE 280
  152. #endif
  153. PROLOGUE
  154. PROFCODE
  /* Allocate the frame and save the floating-point registers f14..f31. */
  155. addi SP, SP, -STACKSIZE
  156. stfd f14, 0(SP)
  157. stfd f15, 8(SP)
  158. stfd f16, 16(SP)
  159. stfd f17, 24(SP)
  160. stfd f18, 32(SP)
  161. stfd f19, 40(SP)
  162. stfd f20, 48(SP)
  163. stfd f21, 56(SP)
  164. stfd f22, 64(SP)
  165. stfd f23, 72(SP)
  166. stfd f24, 80(SP)
  167. stfd f25, 88(SP)
  168. stfd f26, 96(SP)
  169. stfd f27, 104(SP)
  170. stfd f28, 112(SP)
  171. stfd f29, 120(SP)
  172. stfd f30, 128(SP)
  173. stfd f31, 136(SP)
  /* Save the general-purpose registers r14..r27 (8-byte slots in      */
  /* 64-bit mode, 4-byte slots otherwise).                             */
  174. #ifdef __64BIT__
  175. std r14, 144(SP)
  176. std r15, 152(SP)
  177. std r16, 160(SP)
  178. std r17, 168(SP)
  179. std r18, 176(SP)
  180. std r19, 184(SP)
  181. std r20, 192(SP)
  182. std r21, 200(SP)
  183. std r22, 208(SP)
  184. std r23, 216(SP)
  185. std r24, 224(SP)
  186. std r25, 232(SP)
  187. std r26, 240(SP)
  188. std r27, 248(SP)
  189. #else
  190. stw r14, 144(SP)
  191. stw r15, 148(SP)
  192. stw r16, 152(SP)
  193. stw r17, 156(SP)
  194. stw r18, 160(SP)
  195. stw r19, 164(SP)
  196. stw r20, 168(SP)
  197. stw r21, 172(SP)
  198. stw r22, 176(SP)
  199. stw r23, 180(SP)
  200. stw r24, 184(SP)
  201. stw r25, 188(SP)
  202. stw r26, 192(SP)
  203. stw r27, 196(SP)
  204. #endif
  /* Fetch the arguments that were passed on the stack. The offsets    */
  /* add STACKSIZE because SP has already been lowered by that amount. */
  /* FRAMESLOT is defined outside this file.                           */
  205. #if defined(linux) || defined(__FreeBSD__)
  206. #ifndef __64BIT__
  207. lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
  208. lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
  209. #else
  210. ld A, FRAMESLOT(0) + STACKSIZE(SP)
  211. ld LDA, FRAMESLOT(1) + STACKSIZE(SP)
  212. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  213. #endif
  214. #endif
  215. #if defined(_AIX) || defined(__APPLE__)
  216. #ifndef __64BIT__
  217. #ifdef DOUBLE
  218. lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
  219. lwz A, FRAMESLOT(1) + STACKSIZE(SP)
  220. lwz LDA, FRAMESLOT(2) + STACKSIZE(SP)
  221. lwz BUFFER, FRAMESLOT(3) + STACKSIZE(SP)
  222. #else
  223. lwz A, FRAMESLOT(0) + STACKSIZE(SP)
  224. lwz LDA, FRAMESLOT(1) + STACKSIZE(SP)
  225. lwz BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  226. #endif
  227. #else
  228. ld A, FRAMESLOT(0) + STACKSIZE(SP)
  229. ld LDA, FRAMESLOT(1) + STACKSIZE(SP)
  230. ld BUFFER, FRAMESLOT(2) + STACKSIZE(SP)
  231. #endif
  232. #endif
  /* Keep alpha in f31: f1 is also y02 and gets overwritten later.     */
  233. fmr alpha, f1
  /* Shift the strides left by BASE_SHIFT (defined outside this file;  */
  /* presumably log2 of the element size, turning element counts into  */
  /* byte offsets - confirm).                                          */
  234. slwi LDA, LDA, BASE_SHIFT
  235. slwi INCX, INCX, BASE_SHIFT
  236. slwi INCY, INCY, BASE_SHIFT
  /* Prefetch distances in bytes.                                      */
  237. li PREA, PREFETCHSIZE_A * SIZE
  238. li PREC, PREFETCHSIZE_C * SIZE
  /* Nothing to do when M <= 0 or N <= 0.                              */
  239. cmpwi cr0, M, 0
  240. ble- LL(999)
  241. cmpwi cr0, N, 0
  242. ble- LL(999)
  /* Choose the source of x for the compute loops.                     */
  /* If the scaled INCX equals SIZE (one element), x is read in place  */
  /* (XX = X). Otherwise the M elements of x are gathered with stride  */
  /* INCX into BUFFER as consecutive elements, and XX = BUFFER.        */
  243. mr XX, X
  244. cmpi cr0, 0, INCX, SIZE
  245. beq LL(10)
  246. mr XX, BUFFER
  247. mr X1, BUFFER
  /* CTR = M / 8 blocks of eight elements; skip the block loop if none. */
  248. srawi. r0, M, 3
  249. mtspr CTR, r0
  250. ble LL(05)
  251. .align 4
  /* Gather eight strided elements of x, then store them back to back. */
  252. LL(01):
  253. LFD a1, 0 * SIZE(X)
  254. add X, X, INCX
  255. LFD a2, 0 * SIZE(X)
  256. add X, X, INCX
  257. LFD a3, 0 * SIZE(X)
  258. add X, X, INCX
  259. LFD a4, 0 * SIZE(X)
  260. add X, X, INCX
  261. LFD a5, 0 * SIZE(X)
  262. add X, X, INCX
  263. LFD a6, 0 * SIZE(X)
  264. add X, X, INCX
  265. LFD a7, 0 * SIZE(X)
  266. add X, X, INCX
  267. LFD a8, 0 * SIZE(X)
  268. add X, X, INCX
  269. STFD a1, 0 * SIZE(X1)
  270. STFD a2, 1 * SIZE(X1)
  271. STFD a3, 2 * SIZE(X1)
  272. STFD a4, 3 * SIZE(X1)
  273. STFD a5, 4 * SIZE(X1)
  274. STFD a6, 5 * SIZE(X1)
  275. STFD a7, 6 * SIZE(X1)
  276. STFD a8, 7 * SIZE(X1)
  277. addi X1, X1, 8 * SIZE
  278. bdnz+ LL(01)
  279. .align 4
  /* Remaining M mod 8 elements, copied one per iteration.             */
  280. LL(05):
  281. andi. r0, M, 7
  282. mtspr CTR, r0
  283. ble LL(10)
  284. .align 4
  285. LL(06):
  286. LFD a1, 0 * SIZE(X)
  287. add X, X, INCX
  288. STFD a1, 0 * SIZE(X1)
  289. addi X1, X1, SIZE
  290. bdnz+ LL(06)
  291. .align 4
  /* Two-column phase: J = N / 2 column pairs. Skip to the single      */
  /* column code at LL(20) when there is no full pair.                 */
  292. LL(10):
  293. srawi. J, N, 1
  294. ble LL(20)
  295. .align 4
  /* Start of one column pair.                                         */
  296. LL(11):
  /* alpha1 = alpha * y[j], alpha2 = alpha * y[j+1]; Y advances by     */
  /* INCY after each load.                                             */
  297. LFD alpha1, 0 * SIZE(Y)
  298. add Y, Y, INCY
  299. LFD alpha2, 0 * SIZE(Y)
  300. add Y, Y, INCY
  301. FMUL alpha1, alpha, alpha1
  302. FMUL alpha2, alpha, alpha2
  /* AO1 / AO2 point at the two columns; A moves on by two columns.    */
  303. mr AO1, A
  304. add AO2, A, LDA
  305. add A, AO2, LDA
  /* Restart x from its first element.                                 */
  306. mr X1, XX
  /* CTR = M / 16 blocks of sixteen rows; none -> row tails at LL(15). */
  307. srawi. r0, M, 4
  308. mtspr CTR, r0
  309. ble LL(15)
  /* Preload rows 0..7 of column AO1, of x and of column AO2 for the   */
  /* pipelined loop below.                                             */
  310. LFD a1, 0 * SIZE(AO1)
  311. LFD a2, 1 * SIZE(AO1)
  312. LFD a3, 2 * SIZE(AO1)
  313. LFD a4, 3 * SIZE(AO1)
  314. LFD a5, 4 * SIZE(AO1)
  315. LFD a6, 5 * SIZE(AO1)
  316. LFD a7, 6 * SIZE(AO1)
  317. LFD a8, 7 * SIZE(AO1)
  318. LFD y01, 0 * SIZE(X1)
  319. LFD y02, 1 * SIZE(X1)
  320. LFD y03, 2 * SIZE(X1)
  321. LFD y04, 3 * SIZE(X1)
  322. LFD y05, 4 * SIZE(X1)
  323. LFD y06, 5 * SIZE(X1)
  324. LFD y07, 6 * SIZE(X1)
  325. LFD y08, 7 * SIZE(X1)
  326. LFD a9, 0 * SIZE(AO2)
  327. LFD a10, 1 * SIZE(AO2)
  328. LFD a11, 2 * SIZE(AO2)
  329. LFD a12, 3 * SIZE(AO2)
  330. LFD a13, 4 * SIZE(AO2)
  331. LFD a14, 5 * SIZE(AO2)
  332. LFD a15, 6 * SIZE(AO2)
  333. LFD a16, 7 * SIZE(AO2)
  /* Decrement CTR; with a single 16-row block go straight to the      */
  /* drain code at LL(13).                                             */
  334. bdz LL(13)
  335. .align 4
  /* Steady-state loop for a column pair: one pass updates 16 rows of  */
  /* both columns.                                                     */
  /* On entry rows 0..7 of AO1 (a1..a8), of AO2 (a9..a16) and of x     */
  /* (y01..y08) are already in registers. On exit the pointers have    */
  /* advanced by 16 elements and the next eight rows are preloaded in  */
  /* the same registers.                                               */
  /* FMADD is a macro defined outside this file; presumably the        */
  /* PowerPC form d = a * c + b, so "FMADD a1, alpha1, y01, a1" is     */
  /* a1 += alpha1 * y01.                                               */
  336. LL(12):
  /* Rows 0..7 of column AO1; loads of rows 8..15 are interleaved with */
  /* the stores.                                                       */
  337. FMADD a1, alpha1, y01, a1
  338. FMADD a2, alpha1, y02, a2
  339. FMADD a3, alpha1, y03, a3
  340. FMADD a4, alpha1, y04, a4
  341. FMADD a5, alpha1, y05, a5
  342. FMADD a6, alpha1, y06, a6
  343. FMADD a7, alpha1, y07, a7
  344. FMADD a8, alpha1, y08, a8
  345. STFD a1, 0 * SIZE(AO1)
  346. STFD a2, 1 * SIZE(AO1)
  347. STFD a3, 2 * SIZE(AO1)
  348. STFD a4, 3 * SIZE(AO1)
  349. LFD a1, 8 * SIZE(AO1)
  350. LFD a2, 9 * SIZE(AO1)
  351. LFD a3, 10 * SIZE(AO1)
  352. LFD a4, 11 * SIZE(AO1)
  353. STFD a5, 4 * SIZE(AO1)
  354. STFD a6, 5 * SIZE(AO1)
  355. STFD a7, 6 * SIZE(AO1)
  356. STFD a8, 7 * SIZE(AO1)
  357. LFD a5, 12 * SIZE(AO1)
  358. LFD a6, 13 * SIZE(AO1)
  359. LFD a7, 14 * SIZE(AO1)
  360. LFD a8, 15 * SIZE(AO1)
  /* Rows 0..7 of column AO2. Each x register is reloaded with rows    */
  /* 8..15 only after its last use for rows 0..7.                      */
  361. FMADD a9, alpha2, y01, a9
  362. FMADD a10, alpha2, y02, a10
  363. FMADD a11, alpha2, y03, a11
  364. FMADD a12, alpha2, y04, a12
  365. LFD y01, 8 * SIZE(X1)
  366. LFD y02, 9 * SIZE(X1)
  367. LFD y03, 10 * SIZE(X1)
  368. LFD y04, 11 * SIZE(X1)
  369. FMADD a13, alpha2, y05, a13
  370. FMADD a14, alpha2, y06, a14
  371. FMADD a15, alpha2, y07, a15
  372. FMADD a16, alpha2, y08, a16
  373. LFD y05, 12 * SIZE(X1)
  374. LFD y06, 13 * SIZE(X1)
  375. LFD y07, 14 * SIZE(X1)
  376. LFD y08, 15 * SIZE(X1)
  377. STFD a9, 0 * SIZE(AO2)
  378. STFD a10, 1 * SIZE(AO2)
  379. STFD a11, 2 * SIZE(AO2)
  380. STFD a12, 3 * SIZE(AO2)
  381. LFD a9, 8 * SIZE(AO2)
  382. LFD a10, 9 * SIZE(AO2)
  383. LFD a11, 10 * SIZE(AO2)
  384. LFD a12, 11 * SIZE(AO2)
  385. STFD a13, 4 * SIZE(AO2)
  386. STFD a14, 5 * SIZE(AO2)
  387. STFD a15, 6 * SIZE(AO2)
  388. STFD a16, 7 * SIZE(AO2)
  389. LFD a13, 12 * SIZE(AO2)
  390. LFD a14, 13 * SIZE(AO2)
  391. LFD a15, 14 * SIZE(AO2)
  392. LFD a16, 15 * SIZE(AO2)
  /* Rows 8..15 of column AO1; rows 16..23 are preloaded for the next  */
  /* pass (or for the drain code).                                     */
  393. FMADD a1, alpha1, y01, a1
  394. FMADD a2, alpha1, y02, a2
  395. FMADD a3, alpha1, y03, a3
  396. FMADD a4, alpha1, y04, a4
  397. FMADD a5, alpha1, y05, a5
  398. FMADD a6, alpha1, y06, a6
  399. FMADD a7, alpha1, y07, a7
  400. FMADD a8, alpha1, y08, a8
  401. STFD a1, 8 * SIZE(AO1)
  402. STFD a2, 9 * SIZE(AO1)
  403. STFD a3, 10 * SIZE(AO1)
  404. STFD a4, 11 * SIZE(AO1)
  405. LFD a1, 16 * SIZE(AO1)
  406. LFD a2, 17 * SIZE(AO1)
  407. LFD a3, 18 * SIZE(AO1)
  408. LFD a4, 19 * SIZE(AO1)
  409. STFD a5, 12 * SIZE(AO1)
  410. STFD a6, 13 * SIZE(AO1)
  411. STFD a7, 14 * SIZE(AO1)
  412. STFD a8, 15 * SIZE(AO1)
  413. LFD a5, 20 * SIZE(AO1)
  414. LFD a6, 21 * SIZE(AO1)
  415. LFD a7, 22 * SIZE(AO1)
  416. LFD a8, 23 * SIZE(AO1)
  /* Rows 8..15 of column AO2; x and AO2 rows 16..23 are preloaded.    */
  417. FMADD a9, alpha2, y01, a9
  418. FMADD a10, alpha2, y02, a10
  419. FMADD a11, alpha2, y03, a11
  420. FMADD a12, alpha2, y04, a12
  421. LFD y01, 16 * SIZE(X1)
  422. LFD y02, 17 * SIZE(X1)
  423. LFD y03, 18 * SIZE(X1)
  424. LFD y04, 19 * SIZE(X1)
  425. FMADD a13, alpha2, y05, a13
  426. FMADD a14, alpha2, y06, a14
  427. FMADD a15, alpha2, y07, a15
  428. FMADD a16, alpha2, y08, a16
  429. LFD y05, 20 * SIZE(X1)
  430. LFD y06, 21 * SIZE(X1)
  431. LFD y07, 22 * SIZE(X1)
  432. LFD y08, 23 * SIZE(X1)
  433. STFD a9, 8 * SIZE(AO2)
  434. STFD a10, 9 * SIZE(AO2)
  435. STFD a11, 10 * SIZE(AO2)
  436. STFD a12, 11 * SIZE(AO2)
  437. LFD a9, 16 * SIZE(AO2)
  438. LFD a10, 17 * SIZE(AO2)
  439. LFD a11, 18 * SIZE(AO2)
  440. LFD a12, 19 * SIZE(AO2)
  441. STFD a13, 12 * SIZE(AO2)
  442. STFD a14, 13 * SIZE(AO2)
  443. STFD a15, 14 * SIZE(AO2)
  444. STFD a16, 15 * SIZE(AO2)
  445. LFD a13, 20 * SIZE(AO2)
  446. LFD a14, 21 * SIZE(AO2)
  447. LFD a15, 22 * SIZE(AO2)
  448. LFD a16, 23 * SIZE(AO2)
  /* Step all three streams forward by the 16 rows just finished.      */
  449. addi AO1, AO1, 16 * SIZE
  450. addi AO2, AO2, 16 * SIZE
  451. addi X1, X1, 16 * SIZE
  /* Prefetch hints for the two columns and for the x stream. The x    */
  /* hint uses X1 and PREC, the stream pointer and prefetch distance   */
  /* that this file defines and initialises (Y1 and PREY are not       */
  /* defined anywhere in this file).                                   */
  452. DCBT(AO1, PREA)
  453. DCBT(AO2, PREA)
  454. DCBT(X1, PREC)
  455. bdnz+ LL(12)
  456. .align 4
  /* Drain of the pipelined loop: the last 16-row block of the column  */
  /* pair. Rows 0..7 of AO1, AO2 and x are already in registers; rows  */
  /* 8..15 are loaded here. Nothing is preloaded beyond row 15: every  */
  /* code path that follows reloads its operands before using them.    */
  /* FMADD is a macro defined outside this file; presumably            */
  /* d = a * c + b.                                                    */
  457. LL(13):
  /* Rows 0..7 of column AO1, with loads of rows 8..15 interleaved.    */
  458. FMADD a1, alpha1, y01, a1
  459. FMADD a2, alpha1, y02, a2
  460. FMADD a3, alpha1, y03, a3
  461. FMADD a4, alpha1, y04, a4
  462. FMADD a5, alpha1, y05, a5
  463. FMADD a6, alpha1, y06, a6
  464. FMADD a7, alpha1, y07, a7
  465. FMADD a8, alpha1, y08, a8
  466. STFD a1, 0 * SIZE(AO1)
  467. STFD a2, 1 * SIZE(AO1)
  468. STFD a3, 2 * SIZE(AO1)
  469. STFD a4, 3 * SIZE(AO1)
  470. LFD a1, 8 * SIZE(AO1)
  471. LFD a2, 9 * SIZE(AO1)
  472. LFD a3, 10 * SIZE(AO1)
  473. LFD a4, 11 * SIZE(AO1)
  474. STFD a5, 4 * SIZE(AO1)
  475. STFD a6, 5 * SIZE(AO1)
  476. STFD a7, 6 * SIZE(AO1)
  477. STFD a8, 7 * SIZE(AO1)
  478. LFD a5, 12 * SIZE(AO1)
  479. LFD a6, 13 * SIZE(AO1)
  480. LFD a7, 14 * SIZE(AO1)
  481. LFD a8, 15 * SIZE(AO1)
  /* Rows 0..7 of column AO2; x and AO2 rows 8..15 are loaded.         */
  482. FMADD a9, alpha2, y01, a9
  483. FMADD a10, alpha2, y02, a10
  484. FMADD a11, alpha2, y03, a11
  485. FMADD a12, alpha2, y04, a12
  486. LFD y01, 8 * SIZE(X1)
  487. LFD y02, 9 * SIZE(X1)
  488. LFD y03, 10 * SIZE(X1)
  489. LFD y04, 11 * SIZE(X1)
  490. FMADD a13, alpha2, y05, a13
  491. FMADD a14, alpha2, y06, a14
  492. FMADD a15, alpha2, y07, a15
  493. FMADD a16, alpha2, y08, a16
  494. LFD y05, 12 * SIZE(X1)
  495. LFD y06, 13 * SIZE(X1)
  496. LFD y07, 14 * SIZE(X1)
  497. LFD y08, 15 * SIZE(X1)
  498. STFD a9, 0 * SIZE(AO2)
  499. STFD a10, 1 * SIZE(AO2)
  500. STFD a11, 2 * SIZE(AO2)
  501. STFD a12, 3 * SIZE(AO2)
  502. LFD a9, 8 * SIZE(AO2)
  503. LFD a10, 9 * SIZE(AO2)
  504. LFD a11, 10 * SIZE(AO2)
  505. LFD a12, 11 * SIZE(AO2)
  506. STFD a13, 4 * SIZE(AO2)
  507. STFD a14, 5 * SIZE(AO2)
  508. STFD a15, 6 * SIZE(AO2)
  509. STFD a16, 7 * SIZE(AO2)
  510. LFD a13, 12 * SIZE(AO2)
  511. LFD a14, 13 * SIZE(AO2)
  512. LFD a15, 14 * SIZE(AO2)
  513. LFD a16, 15 * SIZE(AO2)
  /* Rows 8..15 of column AO1. No loads of rows 16..23 here: their     */
  /* values would never be consumed.                                   */
  514. FMADD a1, alpha1, y01, a1
  515. FMADD a2, alpha1, y02, a2
  516. FMADD a3, alpha1, y03, a3
  517. FMADD a4, alpha1, y04, a4
  518. FMADD a5, alpha1, y05, a5
  519. FMADD a6, alpha1, y06, a6
  520. FMADD a7, alpha1, y07, a7
  521. FMADD a8, alpha1, y08, a8
  522. STFD a1, 8 * SIZE(AO1)
  523. STFD a2, 9 * SIZE(AO1)
  524. STFD a3, 10 * SIZE(AO1)
  525. STFD a4, 11 * SIZE(AO1)
  530. STFD a5, 12 * SIZE(AO1)
  531. STFD a6, 13 * SIZE(AO1)
  532. STFD a7, 14 * SIZE(AO1)
  533. STFD a8, 15 * SIZE(AO1)
  /* Rows 8..15 of column AO2.                                         */
  538. FMADD a9, alpha2, y01, a9
  539. FMADD a10, alpha2, y02, a10
  540. FMADD a11, alpha2, y03, a11
  541. FMADD a12, alpha2, y04, a12
  542. FMADD a13, alpha2, y05, a13
  543. FMADD a14, alpha2, y06, a14
  544. FMADD a15, alpha2, y07, a15
  545. FMADD a16, alpha2, y08, a16
  546. STFD a9, 8 * SIZE(AO2)
  547. STFD a10, 9 * SIZE(AO2)
  548. STFD a11, 10 * SIZE(AO2)
  549. STFD a12, 11 * SIZE(AO2)
  550. STFD a13, 12 * SIZE(AO2)
  551. STFD a14, 13 * SIZE(AO2)
  552. STFD a15, 14 * SIZE(AO2)
  553. STFD a16, 15 * SIZE(AO2)
  /* Leave the pointers just past the 16 rows handled, ready for the   */
  /* row tails.                                                        */
  554. addi AO1, AO1, 16 * SIZE
  555. addi AO2, AO2, 16 * SIZE
  556. addi X1, X1, 16 * SIZE
  557. .align 4
  /* Row tails for a column pair: the remaining M mod 16 rows are      */
  /* handled in chunks of 8, 4, 2 and 1, selected by the bits of M.    */
  /* andi. yields a non-negative value, so each "ble" below branches   */
  /* exactly when the tested bits are zero.                            */
  558. LL(15):
  559. andi. r0, M, 15
  560. ble LL(19)
  /* 8-row chunk, taken when bit 3 of M is set.                        */
  561. andi. r0, M, 8
  562. ble LL(16)
  563. LFD y01, 0 * SIZE(X1)
  564. LFD y02, 1 * SIZE(X1)
  565. LFD y03, 2 * SIZE(X1)
  566. LFD y04, 3 * SIZE(X1)
  567. LFD y05, 4 * SIZE(X1)
  568. LFD y06, 5 * SIZE(X1)
  569. LFD y07, 6 * SIZE(X1)
  570. LFD y08, 7 * SIZE(X1)
  571. LFD a1, 0 * SIZE(AO1)
  572. LFD a2, 1 * SIZE(AO1)
  573. LFD a3, 2 * SIZE(AO1)
  574. LFD a4, 3 * SIZE(AO1)
  575. LFD a5, 4 * SIZE(AO1)
  576. LFD a6, 5 * SIZE(AO1)
  577. LFD a7, 6 * SIZE(AO1)
  578. LFD a8, 7 * SIZE(AO1)
  579. LFD a9, 0 * SIZE(AO2)
  580. LFD a10, 1 * SIZE(AO2)
  581. LFD a11, 2 * SIZE(AO2)
  582. LFD a12, 3 * SIZE(AO2)
  583. LFD a13, 4 * SIZE(AO2)
  584. LFD a14, 5 * SIZE(AO2)
  585. LFD a15, 6 * SIZE(AO2)
  586. LFD a16, 7 * SIZE(AO2)
  /* Column AO1 updated with alpha1 (FMADD: presumably d = a * c + b). */
  587. FMADD a1, alpha1, y01, a1
  588. FMADD a2, alpha1, y02, a2
  589. FMADD a3, alpha1, y03, a3
  590. FMADD a4, alpha1, y04, a4
  591. STFD a1, 0 * SIZE(AO1)
  592. STFD a2, 1 * SIZE(AO1)
  593. STFD a3, 2 * SIZE(AO1)
  594. STFD a4, 3 * SIZE(AO1)
  595. FMADD a5, alpha1, y05, a5
  596. FMADD a6, alpha1, y06, a6
  597. FMADD a7, alpha1, y07, a7
  598. FMADD a8, alpha1, y08, a8
  599. STFD a5, 4 * SIZE(AO1)
  600. STFD a6, 5 * SIZE(AO1)
  601. STFD a7, 6 * SIZE(AO1)
  602. STFD a8, 7 * SIZE(AO1)
  /* Column AO2 updated with alpha2.                                   */
  603. FMADD a9, alpha2, y01, a9
  604. FMADD a10, alpha2, y02, a10
  605. FMADD a11, alpha2, y03, a11
  606. FMADD a12, alpha2, y04, a12
  607. STFD a9, 0 * SIZE(AO2)
  608. STFD a10, 1 * SIZE(AO2)
  609. STFD a11, 2 * SIZE(AO2)
  610. STFD a12, 3 * SIZE(AO2)
  611. FMADD a13, alpha2, y05, a13
  612. FMADD a14, alpha2, y06, a14
  613. FMADD a15, alpha2, y07, a15
  614. FMADD a16, alpha2, y08, a16
  615. STFD a13, 4 * SIZE(AO2)
  616. STFD a14, 5 * SIZE(AO2)
  617. STFD a15, 6 * SIZE(AO2)
  618. STFD a16, 7 * SIZE(AO2)
  619. addi AO1, AO1, 8 * SIZE
  620. addi AO2, AO2, 8 * SIZE
  621. addi X1, X1, 8 * SIZE
  622. .align 4
  /* 4-row chunk of the column pair, taken when bit 2 of M is set.     */
  /* a1..a4 hold column AO1 and a5..a8 hold column AO2 here.           */
  623. LL(16):
  624. andi. r0, M, 4
  625. ble LL(17)
  626. LFD a1, 0 * SIZE(AO1)
  627. LFD a2, 1 * SIZE(AO1)
  628. LFD a3, 2 * SIZE(AO1)
  629. LFD a4, 3 * SIZE(AO1)
  630. LFD y01, 0 * SIZE(X1)
  631. LFD y02, 1 * SIZE(X1)
  632. LFD y03, 2 * SIZE(X1)
  633. LFD y04, 3 * SIZE(X1)
  634. LFD a5, 0 * SIZE(AO2)
  635. LFD a6, 1 * SIZE(AO2)
  636. LFD a7, 2 * SIZE(AO2)
  637. LFD a8, 3 * SIZE(AO2)
  638. FMADD a1, alpha1, y01, a1
  639. FMADD a2, alpha1, y02, a2
  640. FMADD a3, alpha1, y03, a3
  641. FMADD a4, alpha1, y04, a4
  642. STFD a1, 0 * SIZE(AO1)
  643. STFD a2, 1 * SIZE(AO1)
  644. STFD a3, 2 * SIZE(AO1)
  645. STFD a4, 3 * SIZE(AO1)
  646. FMADD a5, alpha2, y01, a5
  647. FMADD a6, alpha2, y02, a6
  648. FMADD a7, alpha2, y03, a7
  649. FMADD a8, alpha2, y04, a8
  650. STFD a5, 0 * SIZE(AO2)
  651. STFD a6, 1 * SIZE(AO2)
  652. STFD a7, 2 * SIZE(AO2)
  653. STFD a8, 3 * SIZE(AO2)
  654. addi AO1, AO1, 4 * SIZE
  655. addi AO2, AO2, 4 * SIZE
  656. addi X1, X1, 4 * SIZE
  657. .align 4
  /* 2-row chunk, taken when bit 1 of M is set.                        */
  /* a1, a2 hold column AO1 and a3, a4 hold column AO2 here.           */
  658. LL(17):
  659. andi. r0, M, 2
  660. ble LL(18)
  661. LFD a1, 0 * SIZE(AO1)
  662. LFD a2, 1 * SIZE(AO1)
  663. LFD a3, 0 * SIZE(AO2)
  664. LFD a4, 1 * SIZE(AO2)
  665. LFD y01, 0 * SIZE(X1)
  666. LFD y02, 1 * SIZE(X1)
  667. FMADD a1, alpha1, y01, a1
  668. FMADD a2, alpha1, y02, a2
  669. FMADD a3, alpha2, y01, a3
  670. FMADD a4, alpha2, y02, a4
  671. STFD a1, 0 * SIZE(AO1)
  672. STFD a2, 1 * SIZE(AO1)
  673. STFD a3, 0 * SIZE(AO2)
  674. STFD a4, 1 * SIZE(AO2)
  675. addi AO1, AO1, 2 * SIZE
  676. addi AO2, AO2, 2 * SIZE
  677. addi X1, X1, 2 * SIZE
  678. .align 4
  /* Final single row, taken when bit 0 of M is set.                   */
  /* a1 holds the AO1 element and a2 the AO2 element.                  */
  679. LL(18):
  680. andi. r0, M, 1
  681. ble LL(19)
  682. LFD y01, 0 * SIZE(X1)
  683. LFD a1, 0 * SIZE(AO1)
  684. LFD a2, 0 * SIZE(AO2)
  685. FMADD a1, alpha1, y01, a1
  686. FMADD a2, alpha2, y01, a2
  687. STFD a1, 0 * SIZE(AO1)
  688. STFD a2, 0 * SIZE(AO2)
  689. .align 4
  /* End of one column pair: loop back while pairs remain.             */
  690. LL(19):
  691. addi J, J, -1
  692. cmpi cr0, 0, J, 0
  693. bgt LL(11)
  694. .align 4
  /* Single-column phase: runs only when N is odd, for the one column  */
  /* left after the column pairs.                                      */
  695. LL(20):
  696. andi. J, N, 1
  697. ble LL(999)
  698. .align 4
  699. LL(21):
  /* alpha1 = alpha * y[j] for the last column.                        */
  700. LFD alpha1, 0 * SIZE(Y)
  701. FMUL alpha1, alpha, alpha1
  /* AO1 points at the column; x restarts from its first element.      */
  702. mr AO1, A
  703. mr X1, XX
  /* CTR = M / 16 blocks of sixteen rows; none -> row tails at LL(25). */
  704. srawi. r0, M, 4
  705. mtspr CTR, r0
  706. ble LL(25)
  /* Preload rows 0..7 of the column and of x for the pipelined loop.  */
  707. LFD a1, 0 * SIZE(AO1)
  708. LFD a2, 1 * SIZE(AO1)
  709. LFD a3, 2 * SIZE(AO1)
  710. LFD a4, 3 * SIZE(AO1)
  711. LFD a5, 4 * SIZE(AO1)
  712. LFD a6, 5 * SIZE(AO1)
  713. LFD a7, 6 * SIZE(AO1)
  714. LFD a8, 7 * SIZE(AO1)
  715. LFD y01, 0 * SIZE(X1)
  716. LFD y02, 1 * SIZE(X1)
  717. LFD y03, 2 * SIZE(X1)
  718. LFD y04, 3 * SIZE(X1)
  719. LFD y05, 4 * SIZE(X1)
  720. LFD y06, 5 * SIZE(X1)
  721. LFD y07, 6 * SIZE(X1)
  722. LFD y08, 7 * SIZE(X1)
  /* Decrement CTR; with a single 16-row block go straight to the      */
  /* drain code at LL(23).                                             */
  723. bdz LL(23)
  724. .align 4
  /* Steady-state loop for the single remaining column: one pass       */
  /* updates 16 rows.                                                  */
  /* On entry rows 0..7 of the column (a1..a8) and of x (y01..y08) are */
  /* in registers. On exit the pointers have advanced by 16 elements   */
  /* and the next eight rows are preloaded in the same registers.      */
  /* FMADD is a macro defined outside this file; presumably            */
  /* d = a * c + b.                                                    */
  725. LL(22):
  /* Rows 0..7, with loads of rows 8..15 interleaved with the stores.  */
  726. FMADD a1, alpha1, y01, a1
  727. FMADD a2, alpha1, y02, a2
  728. FMADD a3, alpha1, y03, a3
  729. FMADD a4, alpha1, y04, a4
  730. FMADD a5, alpha1, y05, a5
  731. FMADD a6, alpha1, y06, a6
  732. FMADD a7, alpha1, y07, a7
  733. FMADD a8, alpha1, y08, a8
  734. STFD a1, 0 * SIZE(AO1)
  735. STFD a2, 1 * SIZE(AO1)
  736. STFD a3, 2 * SIZE(AO1)
  737. STFD a4, 3 * SIZE(AO1)
  738. LFD a1, 8 * SIZE(AO1)
  739. LFD a2, 9 * SIZE(AO1)
  740. LFD a3, 10 * SIZE(AO1)
  741. LFD a4, 11 * SIZE(AO1)
  742. STFD a5, 4 * SIZE(AO1)
  743. STFD a6, 5 * SIZE(AO1)
  744. STFD a7, 6 * SIZE(AO1)
  745. STFD a8, 7 * SIZE(AO1)
  746. LFD a5, 12 * SIZE(AO1)
  747. LFD a6, 13 * SIZE(AO1)
  748. LFD a7, 14 * SIZE(AO1)
  749. LFD a8, 15 * SIZE(AO1)
  750. LFD y01, 8 * SIZE(X1)
  751. LFD y02, 9 * SIZE(X1)
  752. LFD y03, 10 * SIZE(X1)
  753. LFD y04, 11 * SIZE(X1)
  754. LFD y05, 12 * SIZE(X1)
  755. LFD y06, 13 * SIZE(X1)
  756. LFD y07, 14 * SIZE(X1)
  757. LFD y08, 15 * SIZE(X1)
  /* Rows 8..15; rows 16..23 of the column and of x are preloaded for  */
  /* the next pass (or for the drain code).                            */
  758. FMADD a1, alpha1, y01, a1
  759. FMADD a2, alpha1, y02, a2
  760. FMADD a3, alpha1, y03, a3
  761. FMADD a4, alpha1, y04, a4
  762. FMADD a5, alpha1, y05, a5
  763. FMADD a6, alpha1, y06, a6
  764. FMADD a7, alpha1, y07, a7
  765. FMADD a8, alpha1, y08, a8
  766. STFD a1, 8 * SIZE(AO1)
  767. STFD a2, 9 * SIZE(AO1)
  768. STFD a3, 10 * SIZE(AO1)
  769. STFD a4, 11 * SIZE(AO1)
  770. LFD a1, 16 * SIZE(AO1)
  771. LFD a2, 17 * SIZE(AO1)
  772. LFD a3, 18 * SIZE(AO1)
  773. LFD a4, 19 * SIZE(AO1)
  774. STFD a5, 12 * SIZE(AO1)
  775. STFD a6, 13 * SIZE(AO1)
  776. STFD a7, 14 * SIZE(AO1)
  777. STFD a8, 15 * SIZE(AO1)
  778. LFD a5, 20 * SIZE(AO1)
  779. LFD a6, 21 * SIZE(AO1)
  780. LFD a7, 22 * SIZE(AO1)
  781. LFD a8, 23 * SIZE(AO1)
  782. LFD y01, 16 * SIZE(X1)
  783. LFD y02, 17 * SIZE(X1)
  784. LFD y03, 18 * SIZE(X1)
  785. LFD y04, 19 * SIZE(X1)
  786. LFD y05, 20 * SIZE(X1)
  787. LFD y06, 21 * SIZE(X1)
  788. LFD y07, 22 * SIZE(X1)
  789. LFD y08, 23 * SIZE(X1)
  /* Step both streams forward by the 16 rows just finished.           */
  790. addi AO1, AO1, 16 * SIZE
  791. addi X1, X1, 16 * SIZE
  /* Prefetch hints for the column and for the x stream. The x hint    */
  /* uses X1 and PREC, the stream pointer and prefetch distance that   */
  /* this file defines and initialises (Y1 and PREY are not defined    */
  /* anywhere in this file).                                           */
  792. DCBT(AO1, PREA)
  793. DCBT(X1, PREC)
  794. bdnz+ LL(22)
  795. .align 4
  /* Drain of the single-column loop: the last 16-row block. Rows 0..7 */
  /* of the column and of x are already in registers; rows 8..15 are   */
  /* loaded here. Nothing is read beyond row 15: this is the last full */
  /* block of the column, and the row tails that follow reload their   */
  /* own operands.                                                     */
  /* FMADD is a macro defined outside this file; presumably            */
  /* d = a * c + b.                                                    */
  796. LL(23):
  /* Rows 0..7, with loads of rows 8..15 interleaved with the stores.  */
  797. FMADD a1, alpha1, y01, a1
  798. FMADD a2, alpha1, y02, a2
  799. FMADD a3, alpha1, y03, a3
  800. FMADD a4, alpha1, y04, a4
  801. FMADD a5, alpha1, y05, a5
  802. FMADD a6, alpha1, y06, a6
  803. FMADD a7, alpha1, y07, a7
  804. FMADD a8, alpha1, y08, a8
  805. STFD a1, 0 * SIZE(AO1)
  806. STFD a2, 1 * SIZE(AO1)
  807. STFD a3, 2 * SIZE(AO1)
  808. STFD a4, 3 * SIZE(AO1)
  809. LFD a1, 8 * SIZE(AO1)
  810. LFD a2, 9 * SIZE(AO1)
  811. LFD a3, 10 * SIZE(AO1)
  812. LFD a4, 11 * SIZE(AO1)
  813. STFD a5, 4 * SIZE(AO1)
  814. STFD a6, 5 * SIZE(AO1)
  815. STFD a7, 6 * SIZE(AO1)
  816. STFD a8, 7 * SIZE(AO1)
  817. LFD a5, 12 * SIZE(AO1)
  818. LFD a6, 13 * SIZE(AO1)
  819. LFD a7, 14 * SIZE(AO1)
  820. LFD a8, 15 * SIZE(AO1)
  821. LFD y01, 8 * SIZE(X1)
  822. LFD y02, 9 * SIZE(X1)
  823. LFD y03, 10 * SIZE(X1)
  824. LFD y04, 11 * SIZE(X1)
  825. LFD y05, 12 * SIZE(X1)
  826. LFD y06, 13 * SIZE(X1)
  827. LFD y07, 14 * SIZE(X1)
  828. LFD y08, 15 * SIZE(X1)
  /* Rows 8..15. No loads of rows 16..23 here: their values would      */
  /* never be consumed, and they can lie past the rows this routine    */
  /* was asked to touch.                                               */
  829. FMADD a1, alpha1, y01, a1
  830. FMADD a2, alpha1, y02, a2
  831. FMADD a3, alpha1, y03, a3
  832. FMADD a4, alpha1, y04, a4
  833. FMADD a5, alpha1, y05, a5
  834. FMADD a6, alpha1, y06, a6
  835. FMADD a7, alpha1, y07, a7
  836. FMADD a8, alpha1, y08, a8
  837. STFD a1, 8 * SIZE(AO1)
  838. STFD a2, 9 * SIZE(AO1)
  839. STFD a3, 10 * SIZE(AO1)
  840. STFD a4, 11 * SIZE(AO1)
  845. STFD a5, 12 * SIZE(AO1)
  846. STFD a6, 13 * SIZE(AO1)
  847. STFD a7, 14 * SIZE(AO1)
  848. STFD a8, 15 * SIZE(AO1)
  /* Leave the pointers just past the 16 rows handled, ready for the   */
  /* row tails.                                                        */
  853. addi AO1, AO1, 16 * SIZE
  854. addi X1, X1, 16 * SIZE
  855. .align 4
  /* Row tails for the single column: the remaining M mod 16 rows are  */
  /* handled in chunks of 8, 4, 2 and 1, selected by the bits of M.    */
  /* andi. yields a non-negative value, so each "ble" below branches   */
  /* exactly when the tested bits are zero.                            */
  856. LL(25):
  857. andi. r0, M, 15
  858. ble LL(999)
  /* 8-row chunk, taken when bit 3 of M is set.                        */
  859. andi. r0, M, 8
  860. ble LL(26)
  861. LFD y01, 0 * SIZE(X1)
  862. LFD y02, 1 * SIZE(X1)
  863. LFD y03, 2 * SIZE(X1)
  864. LFD y04, 3 * SIZE(X1)
  865. LFD y05, 4 * SIZE(X1)
  866. LFD y06, 5 * SIZE(X1)
  867. LFD y07, 6 * SIZE(X1)
  868. LFD y08, 7 * SIZE(X1)
  869. LFD a1, 0 * SIZE(AO1)
  870. LFD a2, 1 * SIZE(AO1)
  871. LFD a3, 2 * SIZE(AO1)
  872. LFD a4, 3 * SIZE(AO1)
  873. LFD a5, 4 * SIZE(AO1)
  874. LFD a6, 5 * SIZE(AO1)
  875. LFD a7, 6 * SIZE(AO1)
  876. LFD a8, 7 * SIZE(AO1)
  877. FMADD a1, alpha1, y01, a1
  878. FMADD a2, alpha1, y02, a2
  879. FMADD a3, alpha1, y03, a3
  880. FMADD a4, alpha1, y04, a4
  881. STFD a1, 0 * SIZE(AO1)
  882. STFD a2, 1 * SIZE(AO1)
  883. STFD a3, 2 * SIZE(AO1)
  884. STFD a4, 3 * SIZE(AO1)
  885. FMADD a5, alpha1, y05, a5
  886. FMADD a6, alpha1, y06, a6
  887. FMADD a7, alpha1, y07, a7
  888. FMADD a8, alpha1, y08, a8
  889. STFD a5, 4 * SIZE(AO1)
  890. STFD a6, 5 * SIZE(AO1)
  891. STFD a7, 6 * SIZE(AO1)
  892. STFD a8, 7 * SIZE(AO1)
  893. addi AO1, AO1, 8 * SIZE
  894. addi X1, X1, 8 * SIZE
  895. .align 4
  /* 4-row chunk, taken when bit 2 of M is set.                        */
  896. LL(26):
  897. andi. r0, M, 4
  898. ble LL(27)
  899. LFD a1, 0 * SIZE(AO1)
  900. LFD a2, 1 * SIZE(AO1)
  901. LFD a3, 2 * SIZE(AO1)
  902. LFD a4, 3 * SIZE(AO1)
  903. LFD y01, 0 * SIZE(X1)
  904. LFD y02, 1 * SIZE(X1)
  905. LFD y03, 2 * SIZE(X1)
  906. LFD y04, 3 * SIZE(X1)
  907. FMADD a1, alpha1, y01, a1
  908. FMADD a2, alpha1, y02, a2
  909. FMADD a3, alpha1, y03, a3
  910. FMADD a4, alpha1, y04, a4
  911. STFD a1, 0 * SIZE(AO1)
  912. STFD a2, 1 * SIZE(AO1)
  913. STFD a3, 2 * SIZE(AO1)
  914. STFD a4, 3 * SIZE(AO1)
  915. addi AO1, AO1, 4 * SIZE
  916. addi X1, X1, 4 * SIZE
  917. .align 4
  /* 2-row chunk, taken when bit 1 of M is set.                        */
  918. LL(27):
  919. andi. r0, M, 2
  920. ble LL(28)
  921. LFD a1, 0 * SIZE(AO1)
  922. LFD a2, 1 * SIZE(AO1)
  923. LFD y01, 0 * SIZE(X1)
  924. LFD y02, 1 * SIZE(X1)
  925. FMADD a1, alpha1, y01, a1
  926. FMADD a2, alpha1, y02, a2
  927. STFD a1, 0 * SIZE(AO1)
  928. STFD a2, 1 * SIZE(AO1)
  929. addi AO1, AO1, 2 * SIZE
  930. addi X1, X1, 2 * SIZE
  931. .align 4
  /* Final single row, taken when bit 0 of M is set.                   */
  932. LL(28):
  933. andi. r0, M, 1
  934. ble LL(999)
  935. LFD y01, 0 * SIZE(X1)
  936. LFD a1, 0 * SIZE(AO1)
  937. FMADD a1, alpha1, y01, a1
  938. STFD a1, 0 * SIZE(AO1)
  939. .align 4
  /* Common exit: set r3 to 0, restore every register saved by the     */
  /* prologue from the same frame offsets, release the frame, return.  */
  940. LL(999):
  941. li r3, 0
  /* Restore the floating-point registers f14..f31.                    */
  942. lfd f14, 0(SP)
  943. lfd f15, 8(SP)
  944. lfd f16, 16(SP)
  945. lfd f17, 24(SP)
  946. lfd f18, 32(SP)
  947. lfd f19, 40(SP)
  948. lfd f20, 48(SP)
  949. lfd f21, 56(SP)
  950. lfd f22, 64(SP)
  951. lfd f23, 72(SP)
  952. lfd f24, 80(SP)
  953. lfd f25, 88(SP)
  954. lfd f26, 96(SP)
  955. lfd f27, 104(SP)
  956. lfd f28, 112(SP)
  957. lfd f29, 120(SP)
  958. lfd f30, 128(SP)
  959. lfd f31, 136(SP)
  /* Restore the general-purpose registers r14..r27.                   */
  960. #ifdef __64BIT__
  961. ld r14, 144(SP)
  962. ld r15, 152(SP)
  963. ld r16, 160(SP)
  964. ld r17, 168(SP)
  965. ld r18, 176(SP)
  966. ld r19, 184(SP)
  967. ld r20, 192(SP)
  968. ld r21, 200(SP)
  969. ld r22, 208(SP)
  970. ld r23, 216(SP)
  971. ld r24, 224(SP)
  972. ld r25, 232(SP)
  973. ld r26, 240(SP)
  974. ld r27, 248(SP)
  975. #else
  976. lwz r14, 144(SP)
  977. lwz r15, 148(SP)
  978. lwz r16, 152(SP)
  979. lwz r17, 156(SP)
  980. lwz r18, 160(SP)
  981. lwz r19, 164(SP)
  982. lwz r20, 168(SP)
  983. lwz r21, 172(SP)
  984. lwz r22, 176(SP)
  985. lwz r23, 180(SP)
  986. lwz r24, 184(SP)
  987. lwz r25, 188(SP)
  988. lwz r26, 192(SP)
  989. lwz r27, 196(SP)
  990. #endif
  /* Release the frame allocated in the prologue and return.           */
  991. addi SP, SP, STACKSIZE
  992. blr
  993. EPILOGUE
  /* Closes the "#ifndef NEEDPARAM" that guards the routine body.      */
  994. #endif