You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LN.S 62 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by word size -- lwz (32-bit word) */
  /* when __64BIT__ is not defined, ld (64-bit doubleword) when it is. */
  /* LOAD is not referenced in the part of the file reviewed here. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. STACKSIZE is the number of bytes the prologue */
  /* subtracts from SP. The prologue stores f14-f31 at 0..143(SP) and */
  /* r31-r18 starting at 144(SP) (up to 255 with std in 64-bit mode, up */
  /* to 199 with stw in 32-bit mode); ALPHA and FZERO name SP-relative */
  /* slots above that save area. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP) /* not referenced in the part of the file reviewed here */
  48. #define FZERO 304(SP) /* prologue stores r0 (= 0) here with stw; lfs reloads it as the zero used to clear accumulators */
  49. #else /* !__64BIT__ */
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP) /* not referenced in the part of the file reviewed here */
  52. #define FZERO 232(SP) /* same role as the 64-bit FZERO slot */
  53. #endif /* __64BIT__ */
  /* Symbolic names for the registers holding the routine's arguments. */
  /* M, N and K are the same on every platform; the prologue branches to */
  /* LL(999) if any of them is <= 0. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* A, B and C are used as base addresses (the kernel derives AO/BO/CO1.. */
  /* from them), LDC is scaled by BASE_SHIFT and used as the byte distance */
  /* between CO1..CO4, and OFFSET seeds KK. Which register each one uses */
  /* depends on platform and word size, as selected below. */
  /* NOTE(review): if neither platform group below is defined, these five */
  /* names are not defined by this file -- confirm whether common.h covers */
  /* that case. */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else /* __64BIT__ */
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6 /* loaded by the prologue from FRAMESLOT(0) + STACKSIZE(SP), not taken from r6 as passed */
  70. #endif
  71. #endif /* linux || __FreeBSD__ */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7 /* loaded by the prologue from FRAMESLOT(0) + STACKSIZE(SP) */
  78. #define OFFSET r6 /* loaded by the prologue from FRAMESLOT(1) + STACKSIZE(SP) */
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6 /* loaded by the prologue from FRAMESLOT(0) + STACKSIZE(SP) (ld in 64-bit mode, lwz otherwise) */
  85. #endif
  86. #endif /* _AIX || __APPLE__ */
  /* Working registers r18-r31. The prologue saves all of them to the */
  /* stack frame before the kernel uses them. Roles below are as seen in */
  /* the kernel body. */
  87. #define AORIG r18 /* LN/RT paths: copy of A from which AO is recomputed for each tile */
  88. #define TEMP r19 /* scratch (byte offsets, K - KK remainders) */
  89. #define KK r20 /* running count derived from OFFSET; adjusted after each tile in the LN/LT paths */
  90. #define I r21 /* counter/flag derived from M (M & 1, M & 2, M >> 2) */
  91. #define J r22 /* counter derived from N (N >> 2) */
  92. #define AO r23 /* current read pointer into A */
  93. #define BO r24 /* current read pointer into B */
  94. #define CO1 r25 /* output pointer: C */
  95. #define CO2 r26 /* output pointer: CO1 + LDC */
  96. #define CO3 r27 /* output pointer: CO2 + LDC */
  97. #define CO4 r28 /* output pointer: CO3 + LDC */
  98. #define PREA r29 /* offset operand for DCBT(AO, PREA) */
  99. #define PREB r30 /* offset operand for DCBT(BO, PREB) */
  100. #define PREC r31 /* offset operand for dcbt COn, PREC; -4*SIZE under LN, else 4*SIZE, unless PREFETCHTEST */
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. #ifndef PREFETCHTEST
  201. #ifdef LN
  202. li PREC, -4 * SIZE
  203. #else
  204. li PREC, 4 * SIZE
  205. #endif
  206. #else
  207. #if defined(linux) || defined(__FreeBSD__)
  208. #ifndef __64BIT__
  209. mr PREA, r10
  210. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  211. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  212. #else
  213. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  214. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  215. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  216. #endif
  217. #endif
  218. #if defined(_AIX) || defined(__APPLE__)
  219. #ifdef __64BIT__
  220. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  221. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  222. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  223. #else
  224. #ifdef DOUBLE
  225. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  226. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  227. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  228. #else
  229. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  230. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  231. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  232. #endif
  233. #endif
  234. #endif
  235. #endif
  236. #ifndef PREFETCHTEST
  237. #ifdef PPC970
  238. #ifdef ALLOC_HUGETLB
  239. li PREA, (16 * 5 * SIZE | 1)
  240. li PREB, (16 * 5 * SIZE | 3)
  241. #else
  242. li PREA, (16 * 14 * SIZE | 1)
  243. li PREB, (16 * 8 * SIZE | 3)
  244. #endif
  245. #endif
  246. #ifdef POWER4
  247. #ifdef ALLOC_HUGETLB
  248. li PREA, (16 * 1 * SIZE + 16)
  249. li PREB, (16 * 1 * SIZE + 16)
  250. #else
  251. li PREA, (16 * 2 * SIZE + 16)
  252. li PREB, (16 * 2 * SIZE + 16)
  253. #endif
  254. #endif
  255. #ifdef POWER5
  256. #ifdef ALLOC_HUGETLB
  257. li PREA, (16 * 7 * SIZE | 1)
  258. li PREB, (16 * 7 * SIZE | 3)
  259. #else
  260. li PREA, (16 * 12 * SIZE | 1)
  261. li PREB, (16 * 6 * SIZE | 3)
  262. #endif
  263. #endif
  264. #endif
  265. lfs f0, FZERO
  266. srawi. J, N, 2
  267. ble LL(40)
  268. .align 4
  269. LL(10):
  270. #ifdef RT
  271. slwi r0, K, 2 + BASE_SHIFT
  272. sub B, B, r0
  273. slwi r0, LDC, 2
  274. sub C, C, r0
  275. #endif
  276. mr CO1, C
  277. add CO2, C, LDC
  278. add CO3, CO2, LDC
  279. add CO4, CO3, LDC
  280. #ifdef LN
  281. add KK, M, OFFSET
  282. #endif
  283. #ifdef LT
  284. mr KK, OFFSET
  285. #endif
  286. fmr f1, f0
  287. fmr f2, f0
  288. fmr f3, f0
  289. fmr f4, f0
  290. fmr f5, f0
  291. fmr f6, f0
  292. fmr f7, f0
  293. fmr f8, f0
  294. fmr f9, f0
  295. fmr f10, f0
  296. fmr f11, f0
  297. fmr f12, f0
  298. fmr f13, f0
  299. fmr f14, f0
  300. fmr f15, f0
  301. #if defined(LN) || defined(RT)
  302. mr AORIG, A
  303. #else
  304. mr AO, A
  305. #endif
  306. #ifndef RT
  307. add C, CO4, LDC
  308. #endif
  309. LL(30):
  310. andi. I, M, 1
  311. ble LL(20)
  312. #if defined(LT) || defined(RN)
  313. LFD f16, 0 * SIZE(AO)
  314. LFD f17, 1 * SIZE(AO)
  315. LFD f18, 2 * SIZE(AO)
  316. LFD f19, 3 * SIZE(AO)
  317. LFD f20, 0 * SIZE(B)
  318. LFD f21, 1 * SIZE(B)
  319. LFD f22, 2 * SIZE(B)
  320. LFD f23, 3 * SIZE(B)
  321. LFD f24, 4 * SIZE(B)
  322. LFD f25, 5 * SIZE(B)
  323. LFD f26, 6 * SIZE(B)
  324. LFD f27, 7 * SIZE(B)
  325. srawi. r0, KK, 2
  326. mtspr CTR, r0
  327. mr BO, B
  328. #else
  329. #ifdef LN
  330. slwi r0, K, BASE_SHIFT
  331. sub AORIG, AORIG, r0
  332. #endif
  333. slwi r0, KK, 0 + BASE_SHIFT
  334. slwi TEMP, KK, 2 + BASE_SHIFT
  335. add AO, AORIG, r0
  336. add BO, B, TEMP
  337. sub TEMP, K, KK
  338. LFD f16, 0 * SIZE(AO)
  339. LFD f17, 1 * SIZE(AO)
  340. LFD f18, 2 * SIZE(AO)
  341. LFD f19, 3 * SIZE(AO)
  342. LFD f20, 0 * SIZE(BO)
  343. LFD f21, 1 * SIZE(BO)
  344. LFD f22, 2 * SIZE(BO)
  345. LFD f23, 3 * SIZE(BO)
  346. LFD f24, 4 * SIZE(BO)
  347. LFD f25, 5 * SIZE(BO)
  348. LFD f26, 6 * SIZE(BO)
  349. LFD f27, 7 * SIZE(BO)
  350. srawi. r0, TEMP, 2
  351. mtspr CTR, r0
  352. #endif
  353. ble LL(35)
  354. .align 5
  355. LL(32):
  356. FMADD f0, f16, f20, f0
  357. FMADD f4, f16, f21, f4
  358. FMADD f8, f16, f22, f8
  359. FMADD f12, f16, f23, f12
  360. LFD f20, 8 * SIZE(BO)
  361. LFD f21, 9 * SIZE(BO)
  362. LFD f22, 10 * SIZE(BO)
  363. LFD f23, 11 * SIZE(BO)
  364. FMADD f1, f17, f24, f1
  365. FMADD f5, f17, f25, f5
  366. FMADD f9, f17, f26, f9
  367. FMADD f13, f17, f27, f13
  368. LFD f24, 12 * SIZE(BO)
  369. LFD f25, 13 * SIZE(BO)
  370. LFD f26, 14 * SIZE(BO)
  371. LFD f27, 15 * SIZE(BO)
  372. FMADD f0, f18, f20, f0
  373. FMADD f4, f18, f21, f4
  374. FMADD f8, f18, f22, f8
  375. FMADD f12, f18, f23, f12
  376. LFD f20, 16 * SIZE(BO)
  377. LFD f21, 17 * SIZE(BO)
  378. LFD f22, 18 * SIZE(BO)
  379. LFD f23, 19 * SIZE(BO)
  380. FMADD f1, f19, f24, f1
  381. FMADD f5, f19, f25, f5
  382. FMADD f9, f19, f26, f9
  383. FMADD f13, f19, f27, f13
  384. LFD f16, 4 * SIZE(AO)
  385. LFD f17, 5 * SIZE(AO)
  386. LFD f18, 6 * SIZE(AO)
  387. LFD f19, 7 * SIZE(AO)
  388. LFD f24, 20 * SIZE(BO)
  389. LFD f25, 21 * SIZE(BO)
  390. LFD f26, 22 * SIZE(BO)
  391. LFD f27, 23 * SIZE(BO)
  392. addi AO, AO, 4 * SIZE
  393. addi BO, BO, 16 * SIZE
  394. DCBT(BO, PREB)
  395. bdnz LL(32)
  396. fadd f0, f1, f0
  397. fadd f4, f5, f4
  398. fadd f8, f9, f8
  399. fadd f12, f13, f12
  400. .align 4
  401. LL(35):
  402. #if defined(LT) || defined(RN)
  403. andi. r0, KK, 3
  404. #else
  405. andi. r0, TEMP, 3
  406. #endif
  407. mtspr CTR, r0
  408. ble+ LL(38)
  409. .align 4
  410. LL(36):
  411. FMADD f0, f16, f20, f0
  412. FMADD f4, f16, f21, f4
  413. FMADD f8, f16, f22, f8
  414. FMADD f12, f16, f23, f12
  415. LFD f16, 1 * SIZE(AO)
  416. LFD f20, 4 * SIZE(BO)
  417. LFD f21, 5 * SIZE(BO)
  418. LFD f22, 6 * SIZE(BO)
  419. LFD f23, 7 * SIZE(BO)
  420. addi BO, BO, 4 * SIZE
  421. addi AO, AO, 1 * SIZE
  422. bdnz LL(36)
  423. .align 4
  424. LL(38):
  425. #if defined(LN) || defined(RT)
  426. #ifdef LN
  427. subi r0, KK, 1
  428. #else
  429. subi r0, KK, 4
  430. #endif
  431. slwi TEMP, r0, 0 + BASE_SHIFT
  432. slwi r0, r0, 2 + BASE_SHIFT
  433. add AO, AORIG, TEMP
  434. add BO, B, r0
  435. #endif
  436. #if defined(LN) || defined(LT)
  437. LFD f16, 0 * SIZE(BO)
  438. LFD f17, 1 * SIZE(BO)
  439. LFD f18, 2 * SIZE(BO)
  440. LFD f19, 3 * SIZE(BO)
  441. FSUB f0, f16, f0
  442. FSUB f4, f17, f4
  443. FSUB f8, f18, f8
  444. FSUB f12, f19, f12
  445. #else
  446. LFD f16, 0 * SIZE(AO)
  447. LFD f20, 1 * SIZE(AO)
  448. LFD f24, 2 * SIZE(AO)
  449. LFD f28, 3 * SIZE(AO)
  450. FSUB f0, f16, f0
  451. FSUB f4, f20, f4
  452. FSUB f8, f24, f8
  453. FSUB f12, f28, f12
  454. #endif
  455. #ifdef LN
  456. LFD f21, 0 * SIZE(AO)
  457. FMUL f0, f21, f0
  458. FMUL f4, f21, f4
  459. FMUL f8, f21, f8
  460. FMUL f12, f21, f12
  461. #endif
  462. #ifdef LT
  463. LFD f16, 0 * SIZE(AO)
  464. FMUL f0, f16, f0
  465. FMUL f4, f16, f4
  466. FMUL f8, f16, f8
  467. FMUL f12, f16, f12
  468. #endif
  469. #ifdef RN
  470. LFD f16, 0 * SIZE(BO)
  471. LFD f17, 1 * SIZE(BO)
  472. LFD f18, 2 * SIZE(BO)
  473. LFD f19, 3 * SIZE(BO)
  474. FMUL f0, f16, f0
  475. FNMSUB f4, f17, f0, f4
  476. FNMSUB f8, f18, f0, f8
  477. FNMSUB f12, f19, f0, f12
  478. LFD f16, 5 * SIZE(BO)
  479. LFD f17, 6 * SIZE(BO)
  480. LFD f18, 7 * SIZE(BO)
  481. LFD f19, 10 * SIZE(BO)
  482. LFD f20, 11 * SIZE(BO)
  483. LFD f21, 15 * SIZE(BO)
  484. FMUL f4, f16, f4
  485. FNMSUB f8, f17, f4, f8
  486. FNMSUB f12, f18, f4, f12
  487. FMUL f8, f19, f8
  488. FNMSUB f12, f20, f8, f12
  489. FMUL f12, f21, f12
  490. #endif
  491. #ifdef RT
  492. LFD f16, 15 * SIZE(BO)
  493. LFD f17, 14 * SIZE(BO)
  494. LFD f18, 13 * SIZE(BO)
  495. LFD f19, 12 * SIZE(BO)
  496. FMUL f12, f16, f12
  497. FNMSUB f8, f17, f12, f8
  498. FNMSUB f4, f18, f12, f4
  499. FNMSUB f0, f19, f12, f0
  500. LFD f16, 10 * SIZE(BO)
  501. LFD f17, 9 * SIZE(BO)
  502. LFD f18, 8 * SIZE(BO)
  503. LFD f19, 5 * SIZE(BO)
  504. FMUL f8, f16, f8
  505. LFD f20, 4 * SIZE(BO)
  506. LFD f21, 0 * SIZE(BO)
  507. FNMSUB f4, f17, f8, f4
  508. FNMSUB f0, f18, f8, f0
  509. FMUL f4, f19, f4
  510. FNMSUB f0, f20, f4, f0
  511. FMUL f0, f21, f0
  512. #endif
  513. #ifdef LN
  514. subi CO1, CO1, 1 * SIZE
  515. subi CO2, CO2, 1 * SIZE
  516. subi CO3, CO3, 1 * SIZE
  517. subi CO4, CO4, 1 * SIZE
  518. #endif
  519. #if defined(LN) || defined(LT)
  520. STFD f0, 0 * SIZE(BO)
  521. STFD f4, 1 * SIZE(BO)
  522. STFD f8, 2 * SIZE(BO)
  523. STFD f12, 3 * SIZE(BO)
  524. #else
  525. STFD f0, 0 * SIZE(AO)
  526. STFD f4, 1 * SIZE(AO)
  527. STFD f8, 2 * SIZE(AO)
  528. STFD f12, 3 * SIZE(AO)
  529. #endif
  530. STFD f0, 0 * SIZE(CO1)
  531. STFD f4, 0 * SIZE(CO2)
  532. STFD f8, 0 * SIZE(CO3)
  533. STFD f12, 0 * SIZE(CO4)
  534. lfs f0, FZERO
  535. fmr f1, f0
  536. fmr f4, f0
  537. fmr f5, f0
  538. fmr f8, f0
  539. fmr f9, f0
  540. fmr f12, f0
  541. fmr f13, f0
  542. #ifndef LN
  543. addi CO1, CO1, 1 * SIZE
  544. addi CO2, CO2, 1 * SIZE
  545. addi CO3, CO3, 1 * SIZE
  546. addi CO4, CO4, 1 * SIZE
  547. #endif
  548. #ifdef RT
  549. slwi r0, K, 0 + BASE_SHIFT
  550. add AORIG, AORIG, r0
  551. #endif
  552. #if defined(LT) || defined(RN)
  553. sub TEMP, K, KK
  554. slwi r0, TEMP, 0 + BASE_SHIFT
  555. slwi TEMP, TEMP, 2 + BASE_SHIFT
  556. add AO, AO, r0
  557. add BO, BO, TEMP
  558. #endif
  559. #ifdef LN
  560. subi KK, KK, 1
  561. #endif
  562. #ifdef LT
  563. addi KK, KK, 1
  564. #endif
  565. .align 4
  566. LL(20):
  567. andi. I, M, 2
  568. ble LL(09)
  569. #if defined(LT) || defined(RN)
  570. LFD f16, 0 * SIZE(AO)
  571. LFD f17, 1 * SIZE(AO)
  572. LFD f18, 2 * SIZE(AO)
  573. LFD f19, 3 * SIZE(AO)
  574. LFD f20, 0 * SIZE(B)
  575. LFD f21, 1 * SIZE(B)
  576. LFD f22, 2 * SIZE(B)
  577. LFD f23, 3 * SIZE(B)
  578. LFD f24, 4 * SIZE(B)
  579. LFD f25, 5 * SIZE(B)
  580. LFD f26, 6 * SIZE(B)
  581. LFD f27, 7 * SIZE(B)
  582. srawi. r0, KK, 2
  583. mtspr CTR, r0
  584. mr BO, B
  585. #else
  586. #ifdef LN
  587. slwi r0, K, 1 + BASE_SHIFT
  588. sub AORIG, AORIG, r0
  589. #endif
  590. slwi r0, KK, 1 + BASE_SHIFT
  591. slwi TEMP, KK, 2 + BASE_SHIFT
  592. add AO, AORIG, r0
  593. add BO, B, TEMP
  594. sub TEMP, K, KK
  595. LFD f16, 0 * SIZE(AO)
  596. LFD f17, 1 * SIZE(AO)
  597. LFD f18, 2 * SIZE(AO)
  598. LFD f19, 3 * SIZE(AO)
  599. LFD f20, 0 * SIZE(BO)
  600. LFD f21, 1 * SIZE(BO)
  601. LFD f22, 2 * SIZE(BO)
  602. LFD f23, 3 * SIZE(BO)
  603. LFD f24, 4 * SIZE(BO)
  604. LFD f25, 5 * SIZE(BO)
  605. LFD f26, 6 * SIZE(BO)
  606. LFD f27, 7 * SIZE(BO)
  607. srawi. r0, TEMP, 2
  608. mtspr CTR, r0
  609. #endif
  610. ble LL(25)
  611. .align 5
  612. LL(22):
  613. FMADD f0, f16, f20, f0
  614. FMADD f1, f17, f20, f1
  615. FMADD f4, f16, f21, f4
  616. FMADD f5, f17, f21, f5
  617. FMADD f8, f16, f22, f8
  618. FMADD f9, f17, f22, f9
  619. FMADD f12, f16, f23, f12
  620. FMADD f13, f17, f23, f13
  621. LFD f20, 8 * SIZE(BO)
  622. LFD f21, 9 * SIZE(BO)
  623. LFD f22, 10 * SIZE(BO)
  624. LFD f23, 11 * SIZE(BO)
  625. FMADD f2, f18, f24, f2
  626. FMADD f3, f19, f24, f3
  627. FMADD f6, f18, f25, f6
  628. FMADD f7, f19, f25, f7
  629. FMADD f10, f18, f26, f10
  630. FMADD f11, f19, f26, f11
  631. FMADD f14, f18, f27, f14
  632. FMADD f15, f19, f27, f15
  633. LFD f16, 4 * SIZE(AO)
  634. LFD f17, 5 * SIZE(AO)
  635. LFD f18, 6 * SIZE(AO)
  636. LFD f19, 7 * SIZE(AO)
  637. FMADD f0, f16, f20, f0
  638. FMADD f1, f17, f20, f1
  639. FMADD f4, f16, f21, f4
  640. FMADD f5, f17, f21, f5
  641. LFD f24, 12 * SIZE(BO)
  642. LFD f25, 13 * SIZE(BO)
  643. LFD f26, 14 * SIZE(BO)
  644. LFD f27, 15 * SIZE(BO)
  645. FMADD f8, f16, f22, f8
  646. FMADD f9, f17, f22, f9
  647. FMADD f12, f16, f23, f12
  648. FMADD f13, f17, f23, f13
  649. LFD f20, 16 * SIZE(BO)
  650. LFD f21, 17 * SIZE(BO)
  651. LFD f22, 18 * SIZE(BO)
  652. LFD f23, 19 * SIZE(BO)
  653. FMADD f2, f18, f24, f2
  654. FMADD f3, f19, f24, f3
  655. FMADD f6, f18, f25, f6
  656. FMADD f7, f19, f25, f7
  657. FMADD f10, f18, f26, f10
  658. FMADD f11, f19, f26, f11
  659. FMADD f14, f18, f27, f14
  660. FMADD f15, f19, f27, f15
  661. LFD f16, 8 * SIZE(AO)
  662. LFD f17, 9 * SIZE(AO)
  663. LFD f18, 10 * SIZE(AO)
  664. LFD f19, 11 * SIZE(AO)
  665. LFD f24, 20 * SIZE(BO)
  666. LFD f25, 21 * SIZE(BO)
  667. LFD f26, 22 * SIZE(BO)
  668. LFD f27, 23 * SIZE(BO)
  669. addi AO, AO, 8 * SIZE
  670. addi BO, BO, 16 * SIZE
  671. DCBT(BO, PREB)
  672. bdnz LL(22)
  673. fadd f0, f2, f0
  674. fadd f1, f3, f1
  675. fadd f4, f6, f4
  676. fadd f5, f7, f5
  677. fadd f8, f10, f8
  678. fadd f9, f11, f9
  679. fadd f12, f14, f12
  680. fadd f13, f15, f13
  681. .align 4
  682. LL(25):
  683. #if defined(LT) || defined(RN)
  684. andi. r0, KK, 3
  685. #else
  686. andi. r0, TEMP, 3
  687. #endif
  688. mtspr CTR, r0
  689. ble+ LL(28)
  690. .align 4
  691. LL(26):
  692. FMADD f0, f16, f20, f0
  693. FMADD f1, f17, f20, f1
  694. FMADD f4, f16, f21, f4
  695. FMADD f5, f17, f21, f5
  696. FMADD f8, f16, f22, f8
  697. FMADD f9, f17, f22, f9
  698. FMADD f12, f16, f23, f12
  699. FMADD f13, f17, f23, f13
  700. LFD f16, 2 * SIZE(AO)
  701. LFD f17, 3 * SIZE(AO)
  702. LFD f20, 4 * SIZE(BO)
  703. LFD f21, 5 * SIZE(BO)
  704. LFD f22, 6 * SIZE(BO)
  705. LFD f23, 7 * SIZE(BO)
  706. addi BO, BO, 4 * SIZE
  707. addi AO, AO, 2 * SIZE
  708. bdnz LL(26)
  709. .align 4
  710. LL(28):
  711. #if defined(LN) || defined(RT)
  712. #ifdef LN
  713. subi r0, KK, 2
  714. #else
  715. subi r0, KK, 4
  716. #endif
  717. slwi TEMP, r0, 1 + BASE_SHIFT
  718. slwi r0, r0, 2 + BASE_SHIFT
  719. add AO, AORIG, TEMP
  720. add BO, B, r0
  721. #endif
  722. #if defined(LN) || defined(LT)
  723. LFD f16, 0 * SIZE(BO)
  724. LFD f17, 1 * SIZE(BO)
  725. LFD f18, 2 * SIZE(BO)
  726. LFD f19, 3 * SIZE(BO)
  727. LFD f20, 4 * SIZE(BO)
  728. LFD f21, 5 * SIZE(BO)
  729. LFD f22, 6 * SIZE(BO)
  730. LFD f23, 7 * SIZE(BO)
  731. FSUB f0, f16, f0
  732. FSUB f4, f17, f4
  733. FSUB f8, f18, f8
  734. FSUB f12, f19, f12
  735. FSUB f1, f20, f1
  736. FSUB f5, f21, f5
  737. FSUB f9, f22, f9
  738. FSUB f13, f23, f13
  739. #else
  740. LFD f16, 0 * SIZE(AO)
  741. LFD f17, 1 * SIZE(AO)
  742. LFD f20, 2 * SIZE(AO)
  743. LFD f21, 3 * SIZE(AO)
  744. LFD f24, 4 * SIZE(AO)
  745. LFD f25, 5 * SIZE(AO)
  746. LFD f28, 6 * SIZE(AO)
  747. LFD f29, 7 * SIZE(AO)
  748. FSUB f0, f16, f0
  749. FSUB f1, f17, f1
  750. FSUB f4, f20, f4
  751. FSUB f5, f21, f5
  752. FSUB f8, f24, f8
  753. FSUB f9, f25, f9
  754. FSUB f12, f28, f12
  755. FSUB f13, f29, f13
  756. #endif
  757. #ifdef LN
  758. LFD f19, 3 * SIZE(AO)
  759. LFD f20, 2 * SIZE(AO)
  760. LFD f21, 0 * SIZE(AO)
  761. FMUL f1, f19, f1
  762. FMUL f5, f19, f5
  763. FMUL f9, f19, f9
  764. FMUL f13, f19, f13
  765. FNMSUB f0, f20, f1, f0
  766. FNMSUB f4, f20, f5, f4
  767. FNMSUB f8, f20, f9, f8
  768. FNMSUB f12, f20, f13, f12
  769. FMUL f0, f21, f0
  770. FMUL f4, f21, f4
  771. FMUL f8, f21, f8
  772. FMUL f12, f21, f12
  773. #endif
  774. #ifdef LT
  775. LFD f16, 0 * SIZE(AO)
  776. LFD f17, 1 * SIZE(AO)
  777. FMUL f0, f16, f0
  778. FMUL f4, f16, f4
  779. FMUL f8, f16, f8
  780. FMUL f12, f16, f12
  781. FNMSUB f1, f17, f0, f1
  782. FNMSUB f5, f17, f4, f5
  783. FNMSUB f9, f17, f8, f9
  784. FNMSUB f13, f17, f12, f13
  785. LFD f17, 3 * SIZE(AO)
  786. FMUL f1, f17, f1
  787. FMUL f5, f17, f5
  788. FMUL f9, f17, f9
  789. FMUL f13, f17, f13
  790. #endif
  791. #ifdef RN
  792. LFD f16, 0 * SIZE(BO)
  793. LFD f17, 1 * SIZE(BO)
  794. LFD f18, 2 * SIZE(BO)
  795. LFD f19, 3 * SIZE(BO)
  796. FMUL f0, f16, f0
  797. FMUL f1, f16, f1
  798. FNMSUB f4, f17, f0, f4
  799. FNMSUB f5, f17, f1, f5
  800. FNMSUB f8, f18, f0, f8
  801. FNMSUB f9, f18, f1, f9
  802. FNMSUB f12, f19, f0, f12
  803. FNMSUB f13, f19, f1, f13
  804. LFD f16, 5 * SIZE(BO)
  805. LFD f17, 6 * SIZE(BO)
  806. LFD f18, 7 * SIZE(BO)
  807. LFD f19, 10 * SIZE(BO)
  808. LFD f20, 11 * SIZE(BO)
  809. LFD f21, 15 * SIZE(BO)
  810. FMUL f4, f16, f4
  811. FMUL f5, f16, f5
  812. FNMSUB f8, f17, f4, f8
  813. FNMSUB f9, f17, f5, f9
  814. FNMSUB f12, f18, f4, f12
  815. FNMSUB f13, f18, f5, f13
  816. FMUL f8, f19, f8
  817. FMUL f9, f19, f9
  818. FNMSUB f12, f20, f8, f12
  819. FNMSUB f13, f20, f9, f13
  820. FMUL f12, f21, f12
  821. FMUL f13, f21, f13
  822. #endif
  823. #ifdef RT
  824. LFD f16, 15 * SIZE(BO)
  825. LFD f17, 14 * SIZE(BO)
  826. LFD f18, 13 * SIZE(BO)
  827. LFD f19, 12 * SIZE(BO)
  828. FMUL f12, f16, f12
  829. FMUL f13, f16, f13
  830. FNMSUB f8, f17, f12, f8
  831. FNMSUB f9, f17, f13, f9
  832. FNMSUB f4, f18, f12, f4
  833. FNMSUB f5, f18, f13, f5
  834. FNMSUB f0, f19, f12, f0
  835. FNMSUB f1, f19, f13, f1
  836. LFD f16, 10 * SIZE(BO)
  837. LFD f17, 9 * SIZE(BO)
  838. LFD f18, 8 * SIZE(BO)
  839. LFD f19, 5 * SIZE(BO)
  840. LFD f20, 4 * SIZE(BO)
  841. LFD f21, 0 * SIZE(BO)
  842. FMUL f8, f16, f8
  843. FMUL f9, f16, f9
  844. FNMSUB f4, f17, f8, f4
  845. FNMSUB f5, f17, f9, f5
  846. FNMSUB f0, f18, f8, f0
  847. FNMSUB f1, f18, f9, f1
  848. FMUL f4, f19, f4
  849. FMUL f5, f19, f5
  850. FNMSUB f0, f20, f4, f0
  851. FNMSUB f1, f20, f5, f1
  852. FMUL f0, f21, f0
  853. FMUL f1, f21, f1
  854. #endif
  855. #ifdef LN
  856. subi CO1, CO1, 2 * SIZE
  857. subi CO2, CO2, 2 * SIZE
  858. subi CO3, CO3, 2 * SIZE
  859. subi CO4, CO4, 2 * SIZE
  860. #endif
  861. #if defined(LN) || defined(LT)
  862. STFD f0, 0 * SIZE(BO)
  863. STFD f4, 1 * SIZE(BO)
  864. STFD f8, 2 * SIZE(BO)
  865. STFD f12, 3 * SIZE(BO)
  866. STFD f1, 4 * SIZE(BO)
  867. STFD f5, 5 * SIZE(BO)
  868. STFD f9, 6 * SIZE(BO)
  869. STFD f13, 7 * SIZE(BO)
  870. #else
  871. STFD f0, 0 * SIZE(AO)
  872. STFD f1, 1 * SIZE(AO)
  873. STFD f4, 2 * SIZE(AO)
  874. STFD f5, 3 * SIZE(AO)
  875. STFD f8, 4 * SIZE(AO)
  876. STFD f9, 5 * SIZE(AO)
  877. STFD f12, 6 * SIZE(AO)
  878. STFD f13, 7 * SIZE(AO)
  879. #endif
  880. STFD f0, 0 * SIZE(CO1)
  881. STFD f1, 1 * SIZE(CO1)
  882. STFD f4, 0 * SIZE(CO2)
  883. STFD f5, 1 * SIZE(CO2)
  884. STFD f8, 0 * SIZE(CO3)
  885. STFD f9, 1 * SIZE(CO3)
  886. STFD f12, 0 * SIZE(CO4)
  887. STFD f13, 1 * SIZE(CO4)
  888. lfs f0, FZERO
  889. fmr f1, f0
  890. fmr f2, f0
  891. fmr f3, f0
  892. fmr f4, f0
  893. fmr f5, f0
  894. fmr f6, f0
  895. fmr f7, f0
  896. fmr f8, f0
  897. fmr f9, f0
  898. fmr f10, f0
  899. fmr f11, f0
  900. fmr f12, f0
  901. fmr f13, f0
  902. fmr f14, f0
  903. fmr f15, f0
  904. #ifndef LN
  905. addi CO1, CO1, 2 * SIZE
  906. addi CO2, CO2, 2 * SIZE
  907. addi CO3, CO3, 2 * SIZE
  908. addi CO4, CO4, 2 * SIZE
  909. #endif
  910. #ifdef RT
  911. slwi r0, K, 1 + BASE_SHIFT
  912. add AORIG, AORIG, r0
  913. #endif
  914. #if defined(LT) || defined(RN)
  915. sub TEMP, K, KK
  916. slwi r0, TEMP, 1 + BASE_SHIFT
  917. slwi TEMP, TEMP, 2 + BASE_SHIFT
  918. add AO, AO, r0
  919. add BO, BO, TEMP
  920. #endif
  921. #ifdef LN
  922. subi KK, KK, 2
  923. #endif
  924. #ifdef LT
  925. addi KK, KK, 2
  926. #endif
  927. .align 4
  928. LL(09):
  929. srawi. I, M, 2
  930. ble LL(39)
  931. .align 4
  932. LL(11):
  933. #if defined(LT) || defined(RN)
  934. LFD f16, 0 * SIZE(AO)
  935. LFD f17, 1 * SIZE(AO)
  936. LFD f18, 2 * SIZE(AO)
  937. LFD f19, 3 * SIZE(AO)
  938. LFD f20, 0 * SIZE(B)
  939. LFD f21, 1 * SIZE(B)
  940. LFD f22, 2 * SIZE(B)
  941. LFD f23, 3 * SIZE(B)
  942. dcbt CO1, PREC
  943. dcbt CO2, PREC
  944. dcbt CO3, PREC
  945. dcbt CO4, PREC
  946. srawi. r0, KK, 2
  947. mtspr CTR, r0
  948. mr BO, B
  949. #else
  950. #ifdef LN
  951. slwi r0, K, 2 + BASE_SHIFT
  952. sub AORIG, AORIG, r0
  953. #endif
  954. slwi TEMP, KK, 2 + BASE_SHIFT
  955. add AO, AORIG, TEMP
  956. add BO, B, TEMP
  957. sub TEMP, K, KK
  958. LFD f16, 0 * SIZE(AO)
  959. LFD f17, 1 * SIZE(AO)
  960. LFD f18, 2 * SIZE(AO)
  961. LFD f19, 3 * SIZE(AO)
  962. LFD f20, 0 * SIZE(BO)
  963. LFD f21, 1 * SIZE(BO)
  964. LFD f22, 2 * SIZE(BO)
  965. LFD f23, 3 * SIZE(BO)
  966. dcbt CO1, PREC
  967. dcbt CO2, PREC
  968. dcbt CO3, PREC
  969. dcbt CO4, PREC
  970. srawi. r0, TEMP, 2
  971. mtspr CTR, r0
  972. #endif
  973. ble LL(15)
  974. .align 4
  975. LL(12):
  976. FMADD f0, f16, f20, f0
  977. FMADD f5, f17, f21, f5
  978. FMADD f10, f18, f22, f10
  979. FMADD f15, f19, f23, f15
  980. LFD f28, 4 * SIZE(BO)
  981. LFD f29, 5 * SIZE(BO)
  982. LFD f30, 6 * SIZE(BO)
  983. LFD f31, 7 * SIZE(BO)
  984. FMADD f1, f17, f20, f1
  985. FMADD f2, f18, f20, f2
  986. FMADD f3, f19, f20, f3
  987. FMADD f4, f16, f21, f4
  988. LFD f24, 4 * SIZE(AO)
  989. LFD f25, 5 * SIZE(AO)
  990. LFD f26, 6 * SIZE(AO)
  991. LFD f27, 7 * SIZE(AO)
  992. FMADD f6, f18, f21, f6
  993. FMADD f7, f19, f21, f7
  994. FMADD f8, f16, f22, f8
  995. FMADD f9, f17, f22, f9
  996. FMADD f11, f19, f22, f11
  997. FMADD f12, f16, f23, f12
  998. FMADD f13, f17, f23, f13
  999. FMADD f14, f18, f23, f14
  1000. LFD f20, 8 * SIZE(BO)
  1001. LFD f21, 9 * SIZE(BO)
  1002. LFD f22, 10 * SIZE(BO)
  1003. LFD f23, 11 * SIZE(BO)
  1004. FMADD f0, f24, f28, f0
  1005. FMADD f5, f25, f29, f5
  1006. FMADD f10, f26, f30, f10
  1007. FMADD f15, f27, f31, f15
  1008. LFD f16, 8 * SIZE(AO)
  1009. LFD f17, 9 * SIZE(AO)
  1010. LFD f18, 10 * SIZE(AO)
  1011. LFD f19, 11 * SIZE(AO)
  1012. FMADD f1, f25, f28, f1
  1013. FMADD f2, f26, f28, f2
  1014. FMADD f3, f27, f28, f3
  1015. FMADD f4, f24, f29, f4
  1016. FMADD f6, f26, f29, f6
  1017. FMADD f7, f27, f29, f7
  1018. FMADD f8, f24, f30, f8
  1019. FMADD f9, f25, f30, f9
  1020. FMADD f11, f27, f30, f11
  1021. FMADD f12, f24, f31, f12
  1022. FMADD f13, f25, f31, f13
  1023. FMADD f14, f26, f31, f14
  1024. LFD f28, 12 * SIZE(BO)
  1025. LFD f29, 13 * SIZE(BO)
  1026. LFD f30, 14 * SIZE(BO)
  1027. LFD f31, 15 * SIZE(BO)
  1028. FMADD f0, f16, f20, f0
  1029. FMADD f5, f17, f21, f5
  1030. FMADD f10, f18, f22, f10
  1031. FMADD f15, f19, f23, f15
  1032. LFD f24, 12 * SIZE(AO)
  1033. LFD f25, 13 * SIZE(AO)
  1034. LFD f26, 14 * SIZE(AO)
  1035. LFD f27, 15 * SIZE(AO)
  1036. FMADD f1, f17, f20, f1
  1037. FMADD f2, f18, f20, f2
  1038. FMADD f3, f19, f20, f3
  1039. FMADD f4, f16, f21, f4
  1040. FMADD f6, f18, f21, f6
  1041. FMADD f7, f19, f21, f7
  1042. FMADD f8, f16, f22, f8
  1043. FMADD f9, f17, f22, f9
  1044. FMADD f11, f19, f22, f11
  1045. FMADD f12, f16, f23, f12
  1046. FMADD f13, f17, f23, f13
  1047. FMADD f14, f18, f23, f14
  1048. LFD f20, 16 * SIZE(BO)
  1049. LFD f21, 17 * SIZE(BO)
  1050. LFD f22, 18 * SIZE(BO)
  1051. LFD f23, 19 * SIZE(BO)
  1052. FMADD f0, f24, f28, f0
  1053. FMADD f5, f25, f29, f5
  1054. FMADD f10, f26, f30, f10
  1055. FMADD f15, f27, f31, f15
  1056. LFD f16, 16 * SIZE(AO)
  1057. LFD f17, 17 * SIZE(AO)
  1058. LFD f18, 18 * SIZE(AO)
  1059. LFD f19, 19 * SIZE(AO)
  1060. FMADD f1, f25, f28, f1
  1061. FMADD f2, f26, f28, f2
  1062. FMADD f3, f27, f28, f3
  1063. FMADD f4, f24, f29, f4
  1064. FMADD f6, f26, f29, f6
  1065. FMADD f7, f27, f29, f7
  1066. FMADD f8, f24, f30, f8
  1067. FMADD f9, f25, f30, f9
  1068. FMADD f11, f27, f30, f11
  1069. FMADD f12, f24, f31, f12
  1070. FMADD f13, f25, f31, f13
  1071. FMADD f14, f26, f31, f14
  1072. addi AO, AO, 16 * SIZE
  1073. addi BO, BO, 16 * SIZE
  1074. #ifdef PPC970
  1075. #ifndef ALLOC_HUGETLB
  1076. DCBT(AO, PREA)
  1077. #endif
  1078. DCBT(BO, PREB)
  1079. #endif
  1080. #ifdef POWER4
  1081. #ifndef ALLOC_HUGETLB
  1082. DCBT(AO, PREA)
  1083. #endif
  1084. DCBT(BO, PREB)
  1085. #endif
  1086. #ifdef POWER5
  1087. DCBT(AO, PREA)
  1088. DCBT(BO, PREB)
  1089. #endif
  1090. bdnz LL(12)
  1091. .align 4
  1092. LL(15):
  1093. #if defined(LT) || defined(RN)
  1094. andi. r0, KK, 3
  1095. #else
  1096. andi. r0, TEMP, 3
  1097. #endif
  1098. mtspr CTR, r0
  1099. ble+ LL(18)
  1100. .align 4
  1101. LL(16):
  1102. FMADD f0, f16, f20, f0
  1103. FMADD f5, f17, f21, f5
  1104. FMADD f10, f18, f22, f10
  1105. FMADD f15, f19, f23, f15
  1106. FMADD f1, f17, f20, f1
  1107. FMADD f2, f18, f20, f2
  1108. FMADD f3, f19, f20, f3
  1109. FMADD f4, f16, f21, f4
  1110. FMADD f6, f18, f21, f6
  1111. FMADD f7, f19, f21, f7
  1112. FMADD f8, f16, f22, f8
  1113. FMADD f9, f17, f22, f9
  1114. FMADD f11, f19, f22, f11
  1115. FMADD f12, f16, f23, f12
  1116. FMADD f13, f17, f23, f13
  1117. FMADD f14, f18, f23, f14
  1118. LFD f16, 4 * SIZE(AO)
  1119. LFD f17, 5 * SIZE(AO)
  1120. LFD f18, 6 * SIZE(AO)
  1121. LFD f19, 7 * SIZE(AO)
  1122. LFD f20, 4 * SIZE(BO)
  1123. LFD f21, 5 * SIZE(BO)
  1124. LFD f22, 6 * SIZE(BO)
  1125. LFD f23, 7 * SIZE(BO)
  1126. addi BO, BO, 4 * SIZE
  1127. addi AO, AO, 4 * SIZE
  1128. bdnz LL(16)
  1129. .align 4
  /* LL(18): finish one 4x4 tile.
     On entry f0-f15 hold the products accumulated by the loops above.
     Register layout (see the stores to CO1-CO4 below): f(4*j + i) is
     element i of column j, i.e. f0-f3 -> CO1, f4-f7 -> CO2, f8-f11 -> CO3,
     f12-f15 -> CO4.
     Steps: (1) LN/RT rewind AO/BO to the tile, (2) subtract the accumulators
     from the packed right-hand side, (3) run the 4x4 triangular solve chosen
     by LN/LT/RN/RT, (4) write the result to the packed buffer and to C,
     (5) clear the accumulators and advance pointers/counters. */
  1130. LL(18):
  /* LN/RT: AO = AORIG + (KK - 4) * 4 elements, BO = B + (KK - 4) * 4 elements. */
  1131. #if defined(LN) || defined(RT)
  1132. subi r0, KK, 4
  1133. slwi r0, r0, 2 + BASE_SHIFT
  1134. add AO, AORIG, r0
  1135. add BO, B, r0
  1136. #endif
  /* LN/LT: right-hand side comes from BO, stored with the four column values
     of one row adjacent (BO[4*i + j] pairs with f(4*j + i)). */
  1137. #if defined(LN) || defined(LT)
  1138. LFD f16, 0 * SIZE(BO)
  1139. LFD f17, 1 * SIZE(BO)
  1140. LFD f18, 2 * SIZE(BO)
  1141. LFD f19, 3 * SIZE(BO)
  1142. LFD f20, 4 * SIZE(BO)
  1143. LFD f21, 5 * SIZE(BO)
  1144. LFD f22, 6 * SIZE(BO)
  1145. LFD f23, 7 * SIZE(BO)
  1146. LFD f24, 8 * SIZE(BO)
  1147. LFD f25, 9 * SIZE(BO)
  1148. LFD f26, 10 * SIZE(BO)
  1149. LFD f27, 11 * SIZE(BO)
  1150. LFD f28, 12 * SIZE(BO)
  1151. LFD f29, 13 * SIZE(BO)
  1152. LFD f30, 14 * SIZE(BO)
  1153. LFD f31, 15 * SIZE(BO)
  1154. FSUB f0, f16, f0
  1155. FSUB f4, f17, f4
  1156. FSUB f8, f18, f8
  1157. FSUB f12, f19, f12
  1158. FSUB f1, f20, f1
  1159. FSUB f5, f21, f5
  1160. FSUB f9, f22, f9
  1161. FSUB f13, f23, f13
  1162. FSUB f2, f24, f2
  1163. FSUB f6, f25, f6
  1164. FSUB f10, f26, f10
  1165. FSUB f14, f27, f14
  1166. FSUB f3, f28, f3
  1167. FSUB f7, f29, f7
  1168. FSUB f11, f30, f11
  1169. FSUB f15, f31, f15
  /* RN/RT: right-hand side comes from AO in the same order as the
     accumulators (AO[n] pairs with f(n)). */
  1170. #else
  1171. LFD f16, 0 * SIZE(AO)
  1172. LFD f17, 1 * SIZE(AO)
  1173. LFD f18, 2 * SIZE(AO)
  1174. LFD f19, 3 * SIZE(AO)
  1175. LFD f20, 4 * SIZE(AO)
  1176. LFD f21, 5 * SIZE(AO)
  1177. LFD f22, 6 * SIZE(AO)
  1178. LFD f23, 7 * SIZE(AO)
  1179. LFD f24, 8 * SIZE(AO)
  1180. LFD f25, 9 * SIZE(AO)
  1181. LFD f26, 10 * SIZE(AO)
  1182. LFD f27, 11 * SIZE(AO)
  1183. LFD f28, 12 * SIZE(AO)
  1184. LFD f29, 13 * SIZE(AO)
  1185. LFD f30, 14 * SIZE(AO)
  1186. LFD f31, 15 * SIZE(AO)
  1187. FSUB f0, f16, f0
  1188. FSUB f1, f17, f1
  1189. FSUB f2, f18, f2
  1190. FSUB f3, f19, f3
  1191. FSUB f4, f20, f4
  1192. FSUB f5, f21, f5
  1193. FSUB f6, f22, f6
  1194. FSUB f7, f23, f7
  1195. FSUB f8, f24, f8
  1196. FSUB f9, f25, f9
  1197. FSUB f10, f26, f10
  1198. FSUB f11, f27, f11
  1199. FSUB f12, f28, f12
  1200. FSUB f13, f29, f13
  1201. FSUB f14, f30, f14
  1202. FSUB f15, f31, f15
  1203. #endif
  /* LN: solve with the 4x4 block at AO working from row 3 up to row 0
     (coefficients AO[15], AO[14..12], AO[10..8], AO[5..4], AO[0]).
     The diagonal entries are applied with FMUL, not a divide.
     NOTE(review): presumably the packed diagonal already holds reciprocals
     and FNMSUB expands to "dst = dst - a * b" (PowerPC fnmsub) - confirm
     against the macro definitions and the packing routine. */
  1204. #ifdef LN
  1205. LFD f16, 15 * SIZE(AO)
  1206. LFD f17, 14 * SIZE(AO)
  1207. LFD f18, 13 * SIZE(AO)
  1208. LFD f19, 12 * SIZE(AO)
  1209. FMUL f3, f16, f3
  1210. FMUL f7, f16, f7
  1211. FMUL f11, f16, f11
  1212. FMUL f15, f16, f15
  1213. FNMSUB f2, f17, f3, f2
  1214. FNMSUB f6, f17, f7, f6
  1215. FNMSUB f10, f17, f11, f10
  1216. FNMSUB f14, f17, f15, f14
  1217. FNMSUB f1, f18, f3, f1
  1218. FNMSUB f5, f18, f7, f5
  1219. FNMSUB f9, f18, f11, f9
  1220. FNMSUB f13, f18, f15, f13
  1221. FNMSUB f0, f19, f3, f0
  1222. FNMSUB f4, f19, f7, f4
  1223. FNMSUB f8, f19, f11, f8
  1224. FNMSUB f12, f19, f15, f12
  1225. LFD f16, 10 * SIZE(AO)
  1226. LFD f17, 9 * SIZE(AO)
  1227. LFD f18, 8 * SIZE(AO)
  1228. LFD f19, 5 * SIZE(AO)
  1229. FMUL f2, f16, f2
  1230. FMUL f6, f16, f6
  1231. FMUL f10, f16, f10
  1232. FMUL f14, f16, f14
  1233. LFD f20, 4 * SIZE(AO)
  1234. LFD f21, 0 * SIZE(AO)
  1235. FNMSUB f1, f17, f2, f1
  1236. FNMSUB f5, f17, f6, f5
  1237. FNMSUB f9, f17, f10, f9
  1238. FNMSUB f13, f17, f14, f13
  1239. FNMSUB f0, f18, f2, f0
  1240. FNMSUB f4, f18, f6, f4
  1241. FNMSUB f8, f18, f10, f8
  1242. FNMSUB f12, f18, f14, f12
  1243. FMUL f1, f19, f1
  1244. FMUL f5, f19, f5
  1245. FMUL f9, f19, f9
  1246. FMUL f13, f19, f13
  1247. FNMSUB f0, f20, f1, f0
  1248. FNMSUB f4, f20, f5, f4
  1249. FNMSUB f8, f20, f9, f8
  1250. FNMSUB f12, f20, f13, f12
  1251. FMUL f0, f21, f0
  1252. FMUL f4, f21, f4
  1253. FMUL f8, f21, f8
  1254. FMUL f12, f21, f12
  1255. #endif
  /* LT: same solve in the opposite direction, row 0 down to row 3
     (coefficients AO[0..3], AO[5..7], AO[10..11], AO[15]). */
  1256. #ifdef LT
  1257. LFD f16, 0 * SIZE(AO)
  1258. LFD f17, 1 * SIZE(AO)
  1259. LFD f18, 2 * SIZE(AO)
  1260. LFD f19, 3 * SIZE(AO)
  1261. FMUL f0, f16, f0
  1262. FMUL f4, f16, f4
  1263. FMUL f8, f16, f8
  1264. FMUL f12, f16, f12
  1265. FNMSUB f1, f17, f0, f1
  1266. FNMSUB f5, f17, f4, f5
  1267. FNMSUB f9, f17, f8, f9
  1268. FNMSUB f13, f17, f12, f13
  1269. FNMSUB f2, f18, f0, f2
  1270. FNMSUB f6, f18, f4, f6
  1271. FNMSUB f10, f18, f8, f10
  1272. FNMSUB f14, f18, f12, f14
  1273. FNMSUB f3, f19, f0, f3
  1274. FNMSUB f7, f19, f4, f7
  1275. FNMSUB f11, f19, f8, f11
  1276. FNMSUB f15, f19, f12, f15
  1277. LFD f16, 5 * SIZE(AO)
  1278. LFD f17, 6 * SIZE(AO)
  1279. LFD f18, 7 * SIZE(AO)
  1280. LFD f19, 10 * SIZE(AO)
  1281. FMUL f1, f16, f1
  1282. FMUL f5, f16, f5
  1283. FMUL f9, f16, f9
  1284. FMUL f13, f16, f13
  1285. LFD f20, 11 * SIZE(AO)
  1286. LFD f21, 15 * SIZE(AO)
  1287. FNMSUB f2, f17, f1, f2
  1288. FNMSUB f6, f17, f5, f6
  1289. FNMSUB f10, f17, f9, f10
  1290. FNMSUB f14, f17, f13, f14
  1291. FNMSUB f3, f18, f1, f3
  1292. FNMSUB f7, f18, f5, f7
  1293. FNMSUB f11, f18, f9, f11
  1294. FNMSUB f15, f18, f13, f15
  1295. FMUL f2, f19, f2
  1296. FMUL f6, f19, f6
  1297. FMUL f10, f19, f10
  1298. FMUL f14, f19, f14
  1299. FNMSUB f3, f20, f2, f3
  1300. FNMSUB f7, f20, f6, f7
  1301. FNMSUB f11, f20, f10, f11
  1302. FNMSUB f15, f20, f14, f15
  1303. FMUL f3, f21, f3
  1304. FMUL f7, f21, f7
  1305. FMUL f11, f21, f11
  1306. FMUL f15, f21, f15
  1307. #endif
  /* RN: solve across columns, column 0 (f0-f3) first, then 1, 2, 3, using
     the 4x4 block at BO (coefficients BO[0..3], BO[5..7], BO[10..11], BO[15]). */
  1308. #ifdef RN
  1309. LFD f16, 0 * SIZE(BO)
  1310. LFD f17, 1 * SIZE(BO)
  1311. LFD f18, 2 * SIZE(BO)
  1312. LFD f19, 3 * SIZE(BO)
  1313. FMUL f0, f16, f0
  1314. FMUL f1, f16, f1
  1315. FMUL f2, f16, f2
  1316. FMUL f3, f16, f3
  1317. FNMSUB f4, f17, f0, f4
  1318. FNMSUB f5, f17, f1, f5
  1319. FNMSUB f6, f17, f2, f6
  1320. FNMSUB f7, f17, f3, f7
  1321. FNMSUB f8, f18, f0, f8
  1322. FNMSUB f9, f18, f1, f9
  1323. FNMSUB f10, f18, f2, f10
  1324. FNMSUB f11, f18, f3, f11
  1325. FNMSUB f12, f19, f0, f12
  1326. FNMSUB f13, f19, f1, f13
  1327. FNMSUB f14, f19, f2, f14
  1328. FNMSUB f15, f19, f3, f15
  1329. LFD f16, 5 * SIZE(BO)
  1330. LFD f17, 6 * SIZE(BO)
  1331. LFD f18, 7 * SIZE(BO)
  1332. LFD f19, 10 * SIZE(BO)
  1333. FMUL f4, f16, f4
  1334. FMUL f5, f16, f5
  1335. FMUL f6, f16, f6
  1336. FMUL f7, f16, f7
  1337. LFD f20, 11 * SIZE(BO)
  1338. LFD f21, 15 * SIZE(BO)
  1339. FNMSUB f8, f17, f4, f8
  1340. FNMSUB f9, f17, f5, f9
  1341. FNMSUB f10, f17, f6, f10
  1342. FNMSUB f11, f17, f7, f11
  1343. FNMSUB f12, f18, f4, f12
  1344. FNMSUB f13, f18, f5, f13
  1345. FNMSUB f14, f18, f6, f14
  1346. FNMSUB f15, f18, f7, f15
  1347. FMUL f8, f19, f8
  1348. FMUL f9, f19, f9
  1349. FMUL f10, f19, f10
  1350. FMUL f11, f19, f11
  1351. FNMSUB f12, f20, f8, f12
  1352. FNMSUB f13, f20, f9, f13
  1353. FNMSUB f14, f20, f10, f14
  1354. FNMSUB f15, f20, f11, f15
  1355. FMUL f12, f21, f12
  1356. FMUL f13, f21, f13
  1357. FMUL f14, f21, f14
  1358. FMUL f15, f21, f15
  1359. #endif
  /* RT: solve across columns in reverse, column 3 (f12-f15) first down to
     column 0 (coefficients BO[15], BO[14..12], BO[10..8], BO[5..4], BO[0]). */
  1360. #ifdef RT
  1361. LFD f16, 15 * SIZE(BO)
  1362. LFD f17, 14 * SIZE(BO)
  1363. LFD f18, 13 * SIZE(BO)
  1364. LFD f19, 12 * SIZE(BO)
  1365. FMUL f12, f16, f12
  1366. FMUL f13, f16, f13
  1367. FMUL f14, f16, f14
  1368. FMUL f15, f16, f15
  1369. FNMSUB f8, f17, f12, f8
  1370. FNMSUB f9, f17, f13, f9
  1371. FNMSUB f10, f17, f14, f10
  1372. FNMSUB f11, f17, f15, f11
  1373. FNMSUB f4, f18, f12, f4
  1374. FNMSUB f5, f18, f13, f5
  1375. FNMSUB f6, f18, f14, f6
  1376. FNMSUB f7, f18, f15, f7
  1377. FNMSUB f0, f19, f12, f0
  1378. FNMSUB f1, f19, f13, f1
  1379. FNMSUB f2, f19, f14, f2
  1380. FNMSUB f3, f19, f15, f3
  1381. LFD f16, 10 * SIZE(BO)
  1382. LFD f17, 9 * SIZE(BO)
  1383. LFD f18, 8 * SIZE(BO)
  1384. LFD f19, 5 * SIZE(BO)
  1385. FMUL f8, f16, f8
  1386. FMUL f9, f16, f9
  1387. FMUL f10, f16, f10
  1388. FMUL f11, f16, f11
  1389. LFD f20, 4 * SIZE(BO)
  1390. LFD f21, 0 * SIZE(BO)
  1391. FNMSUB f4, f17, f8, f4
  1392. FNMSUB f5, f17, f9, f5
  1393. FNMSUB f6, f17, f10, f6
  1394. FNMSUB f7, f17, f11, f7
  1395. FNMSUB f0, f18, f8, f0
  1396. FNMSUB f1, f18, f9, f1
  1397. FNMSUB f2, f18, f10, f2
  1398. FNMSUB f3, f18, f11, f3
  1399. FMUL f4, f19, f4
  1400. FMUL f5, f19, f5
  1401. FMUL f6, f19, f6
  1402. FMUL f7, f19, f7
  1403. FNMSUB f0, f20, f4, f0
  1404. FNMSUB f1, f20, f5, f1
  1405. FNMSUB f2, f20, f6, f2
  1406. FNMSUB f3, f20, f7, f3
  1407. FMUL f0, f21, f0
  1408. FMUL f1, f21, f1
  1409. FMUL f2, f21, f2
  1410. FMUL f3, f21, f3
  1411. #endif
  /* LN walks C backwards: step the four column pointers back by one tile
     before storing. */
  1412. #ifdef LN
  1413. subi CO1, CO1, 4 * SIZE
  1414. subi CO2, CO2, 4 * SIZE
  1415. subi CO3, CO3, 4 * SIZE
  1416. subi CO4, CO4, 4 * SIZE
  1417. #endif
  /* Write the solved tile back into the packed buffer it was read from
     (BO for LN/LT, AO for RN/RT), using the same element order as the loads. */
  1418. #if defined(LN) || defined(LT)
  1419. STFD f0, 0 * SIZE(BO)
  1420. STFD f4, 1 * SIZE(BO)
  1421. STFD f8, 2 * SIZE(BO)
  1422. STFD f12, 3 * SIZE(BO)
  1423. STFD f1, 4 * SIZE(BO)
  1424. STFD f5, 5 * SIZE(BO)
  1425. STFD f9, 6 * SIZE(BO)
  1426. STFD f13, 7 * SIZE(BO)
  1427. STFD f2, 8 * SIZE(BO)
  1428. STFD f6, 9 * SIZE(BO)
  1429. STFD f10, 10 * SIZE(BO)
  1430. STFD f14, 11 * SIZE(BO)
  1431. STFD f3, 12 * SIZE(BO)
  1432. STFD f7, 13 * SIZE(BO)
  1433. STFD f11, 14 * SIZE(BO)
  1434. STFD f15, 15 * SIZE(BO)
  1435. #else
  1436. STFD f0, 0 * SIZE(AO)
  1437. STFD f1, 1 * SIZE(AO)
  1438. STFD f2, 2 * SIZE(AO)
  1439. STFD f3, 3 * SIZE(AO)
  1440. STFD f4, 4 * SIZE(AO)
  1441. STFD f5, 5 * SIZE(AO)
  1442. STFD f6, 6 * SIZE(AO)
  1443. STFD f7, 7 * SIZE(AO)
  1444. STFD f8, 8 * SIZE(AO)
  1445. STFD f9, 9 * SIZE(AO)
  1446. STFD f10, 10 * SIZE(AO)
  1447. STFD f11, 11 * SIZE(AO)
  1448. STFD f12, 12 * SIZE(AO)
  1449. STFD f13, 13 * SIZE(AO)
  1450. STFD f14, 14 * SIZE(AO)
  1451. STFD f15, 15 * SIZE(AO)
  1452. #endif
  /* Store the tile to C: four consecutive elements in each of CO1-CO4. */
  1453. STFD f0, 0 * SIZE(CO1)
  1454. STFD f1, 1 * SIZE(CO1)
  1455. STFD f2, 2 * SIZE(CO1)
  1456. STFD f3, 3 * SIZE(CO1)
  1457. STFD f4, 0 * SIZE(CO2)
  1458. STFD f5, 1 * SIZE(CO2)
  1459. STFD f6, 2 * SIZE(CO2)
  1460. STFD f7, 3 * SIZE(CO2)
  1461. STFD f8, 0 * SIZE(CO3)
  1462. STFD f9, 1 * SIZE(CO3)
  1463. STFD f10, 2 * SIZE(CO3)
  1464. STFD f11, 3 * SIZE(CO3)
  1465. STFD f12, 0 * SIZE(CO4)
  1466. STFD f13, 1 * SIZE(CO4)
  1467. STFD f14, 2 * SIZE(CO4)
  1468. STFD f15, 3 * SIZE(CO4)
  /* Reload f0 from FZERO and copy it into f1-f15 so the next tile starts
     from cleared accumulators. */
  1469. lfs f0, FZERO
  1470. fmr f1, f0
  1471. fmr f2, f0
  1472. fmr f3, f0
  1473. fmr f4, f0
  1474. fmr f5, f0
  1475. fmr f6, f0
  1476. fmr f7, f0
  1477. fmr f8, f0
  1478. fmr f9, f0
  1479. fmr f10, f0
  1480. fmr f11, f0
  1481. fmr f12, f0
  1482. fmr f13, f0
  1483. fmr f14, f0
  1484. fmr f15, f0
  /* Non-LN variants walk C forwards. */
  1485. #ifndef LN
  1486. addi CO1, CO1, 4 * SIZE
  1487. addi CO2, CO2, 4 * SIZE
  1488. addi CO3, CO3, 4 * SIZE
  1489. addi CO4, CO4, 4 * SIZE
  1490. #endif
  /* RT: AORIG += K * 4 elements. */
  1491. #ifdef RT
  1492. slwi r0, K, 2 + BASE_SHIFT
  1493. add AORIG, AORIG, r0
  1494. #endif
  /* LT/RN: advance AO and BO by (K - KK) * 4 elements each. */
  1495. #if defined(LT) || defined(RN)
  1496. sub TEMP, K, KK
  1497. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1498. add AO, AO, TEMP
  1499. add BO, BO, TEMP
  1500. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  1501. #ifdef LT
  1502. addi KK, KK, 4
  1503. #endif
  1504. #ifdef LN
  1505. subi KK, KK, 4
  1506. #endif
  /* Decrement the tile counter I and loop back to LL(11) while it is > 0. */
  1507. addic. I, I, -1
  1508. bgt+ LL(11)
  1509. .align 4
  /* LL(39): end of one 4-column panel. Moves B to the next panel, steps KK
     by 4 for the right-side variants, and loops to LL(10) while J > 0. */
  1510. LL(39):
  /* LN: B += K * 4 elements. */
  1511. #ifdef LN
  1512. slwi r0, K, 2 + BASE_SHIFT
  1513. add B, B, r0
  1514. #endif
  /* LT/RN: BO already points past the panel just consumed. */
  1515. #if defined(LT) || defined(RN)
  1516. mr B, BO
  1517. #endif
  1518. #ifdef RN
  1519. addi KK, KK, 4
  1520. #endif
  1521. #ifdef RT
  1522. subi KK, KK, 4
  1523. #endif
  /* addic. sets CR0 for the bgt below; the lfs in between reloads f0 from
     FZERO and does not touch CR0. */
  1524. addic. J, J, -1
  1525. lfs f0, FZERO
  1526. bgt LL(10)
  1527. .align 4
  /* LL(40): set up the 2-column panel, taken only when bit 1 of N is set
     (otherwise branch to LL(70)). Initialises CO1/CO2, KK, the accumulators
     f1-f7 and the A pointer for this panel. */
  1528. LL(40):
  1529. andi. J, N, 2
  1530. ble LL(70)
  /* RT: step B back by K * 2 elements and C back by 2 * LDC. */
  1531. #ifdef RT
  1532. slwi r0, K, 1 + BASE_SHIFT
  1533. sub B, B, r0
  1534. slwi r0, LDC, 1
  1535. sub C, C, r0
  1536. #endif
  /* CO1/CO2 address the two columns of C handled by this panel. */
  1537. mr CO1, C
  1538. add CO2, C, LDC
  1539. #ifdef LN
  1540. add KK, M, OFFSET
  1541. #endif
  1542. #ifdef LT
  1543. mr KK, OFFSET
  1544. #endif
  /* Copy f0 into f1-f7; the tiles below accumulate into f0-f7.
     NOTE(review): relies on f0 already holding the FZERO value on entry. */
  1545. fmr f1, f0
  1546. fmr f2, f0
  1547. fmr f3, f0
  1548. fmr f4, f0
  1549. fmr f5, f0
  1550. fmr f6, f0
  1551. fmr f7, f0
  /* LN/RT keep the panel base in AORIG and derive AO per tile; LT/RN walk AO
     directly. */
  1552. #if defined(LN) || defined(RT)
  1553. mr AORIG, A
  1554. #else
  1555. mr AO, A
  1556. #endif
  /* Non-RT variants advance C past the two columns now. */
  1557. #ifndef RT
  1558. add C, CO2, LDC
  1559. #endif
  /* LL(60): 1x2 tile, taken only when bit 0 of M is set (otherwise LL(50)).
     Preloads four A values (f16-f19) and eight B values (f20-f27) and sets
     CTR to the number of 4-step unrolled iterations for LL(62). */
  1560. LL(60):
  1561. andi. I, M, 1
  1562. ble LL(50)
  /* LT/RN: the product runs over KK steps starting at AO and B. */
  1563. #if defined(LT) || defined(RN)
  1564. LFD f16, 0 * SIZE(AO)
  1565. LFD f17, 1 * SIZE(AO)
  1566. LFD f18, 2 * SIZE(AO)
  1567. LFD f19, 3 * SIZE(AO)
  1568. LFD f20, 0 * SIZE(B)
  1569. LFD f21, 1 * SIZE(B)
  1570. LFD f22, 2 * SIZE(B)
  1571. LFD f23, 3 * SIZE(B)
  1572. LFD f24, 4 * SIZE(B)
  1573. LFD f25, 5 * SIZE(B)
  1574. LFD f26, 6 * SIZE(B)
  1575. LFD f27, 7 * SIZE(B)
  1576. srawi. r0, KK, 2
  1577. mtspr CTR, r0
  1578. mr BO, B
  1579. #else
  /* LN/RT: (LN only) step AORIG back by K * 1 elements, then start AO at
     AORIG + KK * 1 elements and BO at B + KK * 2 elements; the product runs
     over TEMP = K - KK steps. */
  1580. #ifdef LN
  1581. slwi r0, K, BASE_SHIFT
  1582. sub AORIG, AORIG, r0
  1583. #endif
  1584. slwi r0, KK, 0 + BASE_SHIFT
  1585. slwi TEMP, KK, 1 + BASE_SHIFT
  1586. add AO, AORIG, r0
  1587. add BO, B, TEMP
  1588. sub TEMP, K, KK
  1589. LFD f16, 0 * SIZE(AO)
  1590. LFD f17, 1 * SIZE(AO)
  1591. LFD f18, 2 * SIZE(AO)
  1592. LFD f19, 3 * SIZE(AO)
  1593. LFD f20, 0 * SIZE(BO)
  1594. LFD f21, 1 * SIZE(BO)
  1595. LFD f22, 2 * SIZE(BO)
  1596. LFD f23, 3 * SIZE(BO)
  1597. LFD f24, 4 * SIZE(BO)
  1598. LFD f25, 5 * SIZE(BO)
  1599. LFD f26, 6 * SIZE(BO)
  1600. LFD f27, 7 * SIZE(BO)
  1601. srawi. r0, TEMP, 2
  1602. mtspr CTR, r0
  1603. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the
     iteration count is not positive. */
  1604. ble LL(65)
  1605. .align 5
  /* LL(62): 1x2 inner loop, four steps per iteration.
     Steps 0 and 2 (A in f16/f18) accumulate into f0/f1; steps 1 and 3
     (A in f17/f19) accumulate into f2/f3. The two pairs are summed at LL(68).
     Loads for the next iteration are interleaved with the FMADDs; AO then
     advances by 4 elements and BO by 8. */
  1606. LL(62):
  1607. FMADD f0, f16, f20, f0
  1608. FMADD f1, f16, f21, f1
  1609. FMADD f2, f17, f22, f2
  1610. FMADD f3, f17, f23, f3
  1611. LFD f20, 8 * SIZE(BO)
  1612. LFD f21, 9 * SIZE(BO)
  1613. LFD f22, 10 * SIZE(BO)
  1614. LFD f23, 11 * SIZE(BO)
  1615. FMADD f0, f18, f24, f0
  1616. FMADD f1, f18, f25, f1
  1617. FMADD f2, f19, f26, f2
  1618. FMADD f3, f19, f27, f3
  1619. LFD f16, 4 * SIZE(AO)
  1620. LFD f17, 5 * SIZE(AO)
  1621. LFD f18, 6 * SIZE(AO)
  1622. LFD f19, 7 * SIZE(AO)
  1623. LFD f24, 12 * SIZE(BO)
  1624. LFD f25, 13 * SIZE(BO)
  1625. LFD f26, 14 * SIZE(BO)
  1626. LFD f27, 15 * SIZE(BO)
  1627. addi AO, AO, 4 * SIZE
  1628. addi BO, BO, 8 * SIZE
  /* bdnz: decrement CTR and repeat while it is non-zero. */
  1629. bdnz LL(62)
  1630. .align 4
  /* LL(65): set CTR to the 0-3 leftover steps (count & 3) for the 1x2 tile;
     go straight to LL(68) when there are none. */
  1631. LL(65):
  1632. #if defined(LT) || defined(RN)
  1633. andi. r0, KK, 3
  1634. #else
  1635. andi. r0, TEMP, 3
  1636. #endif
  1637. mtspr CTR, r0
  1638. ble+ LL(68)
  1639. .align 4
  /* LL(66): 1x2 remainder loop, one step per iteration. Uses the A value in
     f16 and the B pair in f20/f21, reloads them for the next step, and
     advances AO by 1 element and BO by 2. */
  1640. LL(66):
  1641. FMADD f0, f16, f20, f0
  1642. FMADD f1, f16, f21, f1
  1643. LFD f16, 1 * SIZE(AO)
  1644. LFD f20, 2 * SIZE(BO)
  1645. LFD f21, 3 * SIZE(BO)
  1646. addi BO, BO, 2 * SIZE
  1647. addi AO, AO, 1 * SIZE
  1648. bdnz LL(66)
  1649. .align 4
  /* LL(68): finish the 1x2 tile.
     Sums the two partial accumulator pairs (f2 into f0, f3 into f1),
     subtracts the result from the packed right-hand side, applies the
     LN/LT/RN/RT solve, stores to the packed buffer and to C, then clears
     the accumulators and advances pointers/counters.
     After the FADDs f0 is the CO1 element and f1 the CO2 element. */
  1650. LL(68):
  1651. FADD f0, f2, f0
  1652. FADD f1, f3, f1
  /* LN/RT: AO = AORIG + r0 * 1 elements, BO = B + r0 * 2 elements, where
     r0 = KK - 1 for LN and KK - 2 for RT. */
  1653. #if defined(LN) || defined(RT)
  1654. #ifdef LN
  1655. subi r0, KK, 1
  1656. #else
  1657. subi r0, KK, 2
  1658. #endif
  1659. slwi TEMP, r0, 0 + BASE_SHIFT
  1660. slwi r0, r0, 1 + BASE_SHIFT
  1661. add AO, AORIG, TEMP
  1662. add BO, B, r0
  1663. #endif
  /* Residual: right-hand side (BO for LN/LT, AO for RN/RT) minus the
     accumulated product. */
  1664. #if defined(LN) || defined(LT)
  1665. LFD f16, 0 * SIZE(BO)
  1666. LFD f17, 1 * SIZE(BO)
  1667. FSUB f0, f16, f0
  1668. FSUB f1, f17, f1
  1669. #else
  1670. LFD f16, 0 * SIZE(AO)
  1671. LFD f20, 1 * SIZE(AO)
  1672. FSUB f0, f16, f0
  1673. FSUB f1, f20, f1
  1674. #endif
  /* LN/LT: a 1x1 block, so the solve is a single scale by AO[0].
     NOTE(review): the diagonal is applied with FMUL; presumably the packed
     diagonal holds reciprocals - confirm against the packing routine. */
  1675. #ifdef LN
  1676. LFD f21, 0 * SIZE(AO)
  1677. FMUL f0, f21, f0
  1678. FMUL f1, f21, f1
  1679. #endif
  1680. #ifdef LT
  1681. LFD f16, 0 * SIZE(AO)
  1682. FMUL f0, f16, f0
  1683. FMUL f1, f16, f1
  1684. #endif
  /* RN: 2x2 solve across the two columns, column 0 first
     (coefficients BO[0], BO[1], BO[3]). */
  1685. #ifdef RN
  1686. LFD f16, 0 * SIZE(BO)
  1687. LFD f17, 1 * SIZE(BO)
  1688. LFD f18, 3 * SIZE(BO)
  1689. FMUL f0, f16, f0
  1690. FNMSUB f1, f17, f0, f1
  1691. FMUL f1, f18, f1
  1692. #endif
  /* RT: 2x2 solve in reverse, column 1 first
     (coefficients BO[3], BO[2], BO[0]). */
  1693. #ifdef RT
  1694. LFD f19, 3 * SIZE(BO)
  1695. LFD f20, 2 * SIZE(BO)
  1696. LFD f21, 0 * SIZE(BO)
  1697. FMUL f1, f19, f1
  1698. FNMSUB f0, f20, f1, f0
  1699. FMUL f0, f21, f0
  1700. #endif
  /* LN walks C backwards: step back one element before storing. */
  1701. #ifdef LN
  1702. subi CO1, CO1, 1 * SIZE
  1703. subi CO2, CO2, 1 * SIZE
  1704. #endif
  /* Write the solved pair back to the packed buffer it was read from. */
  1705. #if defined(LN) || defined(LT)
  1706. STFD f0, 0 * SIZE(BO)
  1707. STFD f1, 1 * SIZE(BO)
  1708. #else
  1709. STFD f0, 0 * SIZE(AO)
  1710. STFD f1, 1 * SIZE(AO)
  1711. #endif
  /* Store to C: one element in each of the two columns. */
  1712. STFD f0, 0 * SIZE(CO1)
  1713. STFD f1, 0 * SIZE(CO2)
  /* Clear every accumulator this tile wrote: f0-f3.
     The LL(62) loop leaves partial sums in f2/f3, and the 2x2 and 4x2 tiles
     that follow accumulate into f2/f3 without clearing them first, so stale
     values here would corrupt the next tile. f4-f7 are never written by
     the 1x2 tile and still hold the copies of f0 made at panel entry, so
     they need no reset. */
  1714. lfs f0, FZERO
  1715. fmr f1, f0
  1716. fmr f2, f0
  1717. fmr f3, f0
  /* Non-LN variants walk C forwards. */
  1718. #ifndef LN
  1719. addi CO1, CO1, 1 * SIZE
  1720. addi CO2, CO2, 1 * SIZE
  1721. #endif
  /* RT: AORIG += K * 1 elements. */
  1722. #ifdef RT
  1723. slwi r0, K, 0 + BASE_SHIFT
  1724. add AORIG, AORIG, r0
  1725. #endif
  /* LT/RN: advance AO by (K - KK) * 1 elements and BO by (K - KK) * 2. */
  1726. #if defined(LT) || defined(RN)
  1727. sub TEMP, K, KK
  1728. slwi r0, TEMP, 0 + BASE_SHIFT
  1729. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1730. add AO, AO, r0
  1731. add BO, BO, TEMP
  1732. #endif
  /* KK moves by the tile height (1): down for LN, up for LT. */
  1733. #ifdef LN
  1734. subi KK, KK, 1
  1735. #endif
  1736. #ifdef LT
  1737. addi KK, KK, 1
  1738. #endif
  1739. .align 4
  /* LL(50): 2x2 tile, taken only when bit 1 of M is set (otherwise LL(41)).
     Preloads four A values (f16-f19) and eight B values (f20-f27) and sets
     CTR to the number of 4-step unrolled iterations for LL(52). */
  1740. LL(50):
  1741. andi. I, M, 2
  1742. ble LL(41)
  /* LT/RN: the product runs over KK steps starting at AO and B. */
  1743. #if defined(LT) || defined(RN)
  1744. LFD f16, 0 * SIZE(AO)
  1745. LFD f17, 1 * SIZE(AO)
  1746. LFD f18, 2 * SIZE(AO)
  1747. LFD f19, 3 * SIZE(AO)
  1748. LFD f20, 0 * SIZE(B)
  1749. LFD f21, 1 * SIZE(B)
  1750. LFD f22, 2 * SIZE(B)
  1751. LFD f23, 3 * SIZE(B)
  1752. LFD f24, 4 * SIZE(B)
  1753. LFD f25, 5 * SIZE(B)
  1754. LFD f26, 6 * SIZE(B)
  1755. LFD f27, 7 * SIZE(B)
  1756. srawi. r0, KK, 2
  1757. mtspr CTR, r0
  1758. mr BO, B
  1759. #else
  /* LN/RT: (LN only) step AORIG back by K * 2 elements, then start AO at
     AORIG + KK * 2 elements and BO at B + KK * 2 elements; the product runs
     over TEMP = K - KK steps. */
  1760. #ifdef LN
  1761. slwi r0, K, 1 + BASE_SHIFT
  1762. sub AORIG, AORIG, r0
  1763. #endif
  1764. slwi r0, KK, 1 + BASE_SHIFT
  1765. slwi TEMP, KK, 1 + BASE_SHIFT
  1766. add AO, AORIG, r0
  1767. add BO, B, TEMP
  1768. sub TEMP, K, KK
  1769. LFD f16, 0 * SIZE(AO)
  1770. LFD f17, 1 * SIZE(AO)
  1771. LFD f18, 2 * SIZE(AO)
  1772. LFD f19, 3 * SIZE(AO)
  1773. LFD f20, 0 * SIZE(BO)
  1774. LFD f21, 1 * SIZE(BO)
  1775. LFD f22, 2 * SIZE(BO)
  1776. LFD f23, 3 * SIZE(BO)
  1777. LFD f24, 4 * SIZE(BO)
  1778. LFD f25, 5 * SIZE(BO)
  1779. LFD f26, 6 * SIZE(BO)
  1780. LFD f27, 7 * SIZE(BO)
  1781. srawi. r0, TEMP, 2
  1782. mtspr CTR, r0
  1783. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the
     iteration count is not positive. */
  1784. ble LL(55)
  1785. .align 5
  /* LL(52): 2x2 inner loop, four steps per iteration.
     Even steps (A in f16/f17) accumulate into f0-f3; odd steps (A in
     f18/f19) accumulate into f4-f7. The two sets are summed at LL(58).
     AO and BO both advance by 8 elements per iteration. */
  1786. LL(52):
  1787. FMADD f0, f16, f20, f0
  1788. FMADD f1, f17, f20, f1
  1789. FMADD f2, f16, f21, f2
  1790. FMADD f3, f17, f21, f3
  1791. FMADD f4, f18, f22, f4
  1792. FMADD f5, f19, f22, f5
  1793. FMADD f6, f18, f23, f6
  1794. FMADD f7, f19, f23, f7
  1795. LFD f16, 4 * SIZE(AO)
  1796. LFD f17, 5 * SIZE(AO)
  1797. LFD f18, 6 * SIZE(AO)
  1798. LFD f19, 7 * SIZE(AO)
  1799. LFD f20, 8 * SIZE(BO)
  1800. LFD f21, 9 * SIZE(BO)
  1801. LFD f22, 10 * SIZE(BO)
  1802. LFD f23, 11 * SIZE(BO)
  1803. FMADD f0, f16, f24, f0
  1804. FMADD f1, f17, f24, f1
  1805. FMADD f2, f16, f25, f2
  1806. FMADD f3, f17, f25, f3
  1807. FMADD f4, f18, f26, f4
  1808. FMADD f5, f19, f26, f5
  1809. FMADD f6, f18, f27, f6
  1810. FMADD f7, f19, f27, f7
  1811. LFD f16, 8 * SIZE(AO)
  1812. LFD f17, 9 * SIZE(AO)
  1813. LFD f18, 10 * SIZE(AO)
  1814. LFD f19, 11 * SIZE(AO)
  1815. LFD f24, 12 * SIZE(BO)
  1816. LFD f25, 13 * SIZE(BO)
  1817. LFD f26, 14 * SIZE(BO)
  1818. LFD f27, 15 * SIZE(BO)
  1819. addi AO, AO, 8 * SIZE
  1820. addi BO, BO, 8 * SIZE
  /* Cache-touch hint on the B stream, PREB bytes ahead of BO
     (NOTE(review): DCBT and PREB are defined outside this view). */
  1821. DCBT(BO, PREB)
  1822. bdnz LL(52)
  1823. .align 4
  /* LL(55): set CTR to the 0-3 leftover steps (count & 3) for the 2x2 tile;
     go straight to LL(58) when there are none. */
  1824. LL(55):
  1825. #if defined(LT) || defined(RN)
  1826. andi. r0, KK, 3
  1827. #else
  1828. andi. r0, TEMP, 3
  1829. #endif
  1830. mtspr CTR, r0
  1831. ble+ LL(58)
  1832. .align 4
  /* LL(56): 2x2 remainder loop, one step per iteration. Accumulates the
     outer product of the A pair (f16/f17) and the B pair (f20/f21) into
     f0-f3, then advances AO and BO by 2 elements each. */
  1833. LL(56):
  1834. FMADD f0, f16, f20, f0
  1835. FMADD f1, f17, f20, f1
  1836. FMADD f2, f16, f21, f2
  1837. FMADD f3, f17, f21, f3
  1838. LFD f16, 2 * SIZE(AO)
  1839. LFD f17, 3 * SIZE(AO)
  1840. LFD f20, 2 * SIZE(BO)
  1841. LFD f21, 3 * SIZE(BO)
  1842. addi BO, BO, 2 * SIZE
  1843. addi AO, AO, 2 * SIZE
  1844. bdnz LL(56)
  1845. .align 4
  /* LL(58): finish the 2x2 tile.
     Sums the odd-step accumulators f4-f7 into f0-f3, subtracts the result
     from the packed right-hand side, applies the LN/LT/RN/RT solve, stores
     to the packed buffer and to C, then clears f0-f7 and advances.
     After the FADDs: f0/f1 are the two CO1 elements, f2/f3 the two CO2
     elements (see the stores to C below). */
  1846. LL(58):
  1847. FADD f0, f4, f0
  1848. FADD f1, f5, f1
  1849. FADD f2, f6, f2
  1850. FADD f3, f7, f3
  /* LN/RT: AO = AORIG + (KK - 2) * 2 elements, BO = B + (KK - 2) * 2 elements.
     Both arms of the inner #ifdef are identical because the tile is 2x2. */
  1851. #if defined(LN) || defined(RT)
  1852. #ifdef LN
  1853. subi r0, KK, 2
  1854. #else
  1855. subi r0, KK, 2
  1856. #endif
  1857. slwi TEMP, r0, 1 + BASE_SHIFT
  1858. slwi r0, r0, 1 + BASE_SHIFT
  1859. add AO, AORIG, TEMP
  1860. add BO, B, r0
  1861. #endif
  /* Residual. LN/LT read BO, where the two column values of one row are
     adjacent (BO[0], BO[1] pair with f0, f2); RN/RT read AO in accumulator
     order. */
  1862. #if defined(LN) || defined(LT)
  1863. LFD f16, 0 * SIZE(BO)
  1864. LFD f17, 1 * SIZE(BO)
  1865. LFD f20, 2 * SIZE(BO)
  1866. LFD f21, 3 * SIZE(BO)
  1867. FSUB f0, f16, f0
  1868. FSUB f2, f17, f2
  1869. FSUB f1, f20, f1
  1870. FSUB f3, f21, f3
  1871. #else
  1872. LFD f16, 0 * SIZE(AO)
  1873. LFD f17, 1 * SIZE(AO)
  1874. LFD f20, 2 * SIZE(AO)
  1875. LFD f21, 3 * SIZE(AO)
  1876. FSUB f0, f16, f0
  1877. FSUB f1, f17, f1
  1878. FSUB f2, f20, f2
  1879. FSUB f3, f21, f3
  1880. #endif
  /* LN: 2x2 solve from row 1 up to row 0 (coefficients AO[3], AO[2], AO[0]).
     NOTE(review): diagonals are applied with FMUL; presumably the packed
     diagonal holds reciprocals - confirm against the packing routine. */
  1881. #ifdef LN
  1882. LFD f19, 3 * SIZE(AO)
  1883. LFD f20, 2 * SIZE(AO)
  1884. LFD f21, 0 * SIZE(AO)
  1885. FMUL f1, f19, f1
  1886. FMUL f3, f19, f3
  1887. FNMSUB f0, f20, f1, f0
  1888. FNMSUB f2, f20, f3, f2
  1889. FMUL f0, f21, f0
  1890. FMUL f2, f21, f2
  1891. #endif
  /* LT: 2x2 solve from row 0 down to row 1 (coefficients AO[0], AO[1], AO[3]). */
  1892. #ifdef LT
  1893. LFD f16, 0 * SIZE(AO)
  1894. LFD f17, 1 * SIZE(AO)
  1895. FMUL f0, f16, f0
  1896. FMUL f2, f16, f2
  1897. FNMSUB f1, f17, f0, f1
  1898. FNMSUB f3, f17, f2, f3
  1899. LFD f17, 3 * SIZE(AO)
  1900. FMUL f1, f17, f1
  1901. FMUL f3, f17, f3
  1902. #endif
  /* RN: 2x2 solve across columns, column 0 first
     (coefficients BO[0], BO[1], BO[3]). */
  1903. #ifdef RN
  1904. LFD f16, 0 * SIZE(BO)
  1905. LFD f17, 1 * SIZE(BO)
  1906. LFD f18, 3 * SIZE(BO)
  1907. FMUL f0, f16, f0
  1908. FMUL f1, f16, f1
  1909. FNMSUB f2, f17, f0, f2
  1910. FNMSUB f3, f17, f1, f3
  1911. FMUL f2, f18, f2
  1912. FMUL f3, f18, f3
  1913. #endif
  /* RT: 2x2 solve across columns in reverse, column 1 first
     (coefficients BO[3], BO[2], BO[0]). */
  1914. #ifdef RT
  1915. LFD f19, 3 * SIZE(BO)
  1916. LFD f20, 2 * SIZE(BO)
  1917. LFD f21, 0 * SIZE(BO)
  1918. FMUL f2, f19, f2
  1919. FMUL f3, f19, f3
  1920. FNMSUB f0, f20, f2, f0
  1921. FNMSUB f1, f20, f3, f1
  1922. FMUL f0, f21, f0
  1923. FMUL f1, f21, f1
  1924. #endif
  /* LN walks C backwards: step back two elements before storing. */
  1925. #ifdef LN
  1926. subi CO1, CO1, 2 * SIZE
  1927. subi CO2, CO2, 2 * SIZE
  1928. #endif
  /* Write the solved tile back to the packed buffer in the order it was read. */
  1929. #if defined(LN) || defined(LT)
  1930. STFD f0, 0 * SIZE(BO)
  1931. STFD f2, 1 * SIZE(BO)
  1932. STFD f1, 2 * SIZE(BO)
  1933. STFD f3, 3 * SIZE(BO)
  1934. #else
  1935. STFD f0, 0 * SIZE(AO)
  1936. STFD f1, 1 * SIZE(AO)
  1937. STFD f2, 2 * SIZE(AO)
  1938. STFD f3, 3 * SIZE(AO)
  1939. #endif
  /* Store to C: two elements in each of the two columns. */
  1940. STFD f0, 0 * SIZE(CO1)
  1941. STFD f1, 1 * SIZE(CO1)
  1942. STFD f2, 0 * SIZE(CO2)
  1943. STFD f3, 1 * SIZE(CO2)
  /* Clear all eight accumulators for the next tile. */
  1944. lfs f0, FZERO
  1945. fmr f1, f0
  1946. fmr f2, f0
  1947. fmr f3, f0
  1948. fmr f4, f0
  1949. fmr f5, f0
  1950. fmr f6, f0
  1951. fmr f7, f0
  /* Non-LN variants walk C forwards. */
  1952. #ifndef LN
  1953. addi CO1, CO1, 2 * SIZE
  1954. addi CO2, CO2, 2 * SIZE
  1955. #endif
  /* RT: AORIG += K * 2 elements. */
  1956. #ifdef RT
  1957. slwi r0, K, 1 + BASE_SHIFT
  1958. add AORIG, AORIG, r0
  1959. #endif
  /* LT/RN: advance AO and BO by (K - KK) * 2 elements each. */
  1960. #if defined(LT) || defined(RN)
  1961. sub TEMP, K, KK
  1962. slwi r0, TEMP, 1 + BASE_SHIFT
  1963. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1964. add AO, AO, r0
  1965. add BO, BO, TEMP
  1966. #endif
  /* KK moves by the tile height (2): down for LN, up for LT. */
  1967. #ifdef LN
  1968. subi KK, KK, 2
  1969. #endif
  1970. #ifdef LT
  1971. addi KK, KK, 2
  1972. #endif
  1973. .align 4
  /* LL(41): I = M >> 2 is the number of 4x2 tiles in this panel; skip to
     LL(69) when there are none. */
  1974. LL(41):
  1975. srawi. I, M, 2
  1976. ble LL(69)
  1977. .align 4
  /* LL(42): top of the 4x2 tile loop. Preloads four A values (f16-f19) and
     four B values (f20-f23), issues cache-touch hints on the two C columns,
     and sets CTR to the number of 4-step unrolled iterations for LL(43). */
  1978. LL(42):
  /* LT/RN: the product runs over KK steps starting at AO and B. */
  1979. #if defined(LT) || defined(RN)
  1980. LFD f16, 0 * SIZE(AO)
  1981. LFD f17, 1 * SIZE(AO)
  1982. LFD f18, 2 * SIZE(AO)
  1983. LFD f19, 3 * SIZE(AO)
  1984. LFD f20, 0 * SIZE(B)
  1985. LFD f21, 1 * SIZE(B)
  1986. LFD f22, 2 * SIZE(B)
  1987. LFD f23, 3 * SIZE(B)
  1988. dcbt CO1, PREC
  1989. dcbt CO2, PREC
  1990. srawi. r0, KK, 2
  1991. mtspr CTR, r0
  1992. mr BO, B
  1993. #else
  /* LN/RT: (LN only) step AORIG back by K * 4 elements, then start AO at
     AORIG + KK * 4 elements and BO at B + KK * 2 elements; the product runs
     over TEMP = K - KK steps. */
  1994. #ifdef LN
  1995. slwi r0, K, 2 + BASE_SHIFT
  1996. sub AORIG, AORIG, r0
  1997. #endif
  1998. slwi r0, KK, 2 + BASE_SHIFT
  1999. slwi TEMP, KK, 1 + BASE_SHIFT
  2000. add AO, AORIG, r0
  2001. add BO, B, TEMP
  2002. sub TEMP, K, KK
  2003. LFD f16, 0 * SIZE(AO)
  2004. LFD f17, 1 * SIZE(AO)
  2005. LFD f18, 2 * SIZE(AO)
  2006. LFD f19, 3 * SIZE(AO)
  2007. LFD f20, 0 * SIZE(BO)
  2008. LFD f21, 1 * SIZE(BO)
  2009. LFD f22, 2 * SIZE(BO)
  2010. LFD f23, 3 * SIZE(BO)
  2011. dcbt CO1, PREC
  2012. dcbt CO2, PREC
  2013. srawi. r0, TEMP, 2
  2014. mtspr CTR, r0
  2015. #endif
  /* CR0 still reflects the srawi. result (dcbt/mtspr/mr do not change it). */
  2016. ble LL(45)
  2017. .align 5
  /* LL(43): 4x2 inner loop, four steps per iteration.
     Each step multiplies four A values (f16-f19) by one B pair: f0-f3
     accumulate the first column, f4-f7 the second. B pairs alternate between
     f20/f21 and f22/f23. AO advances by 16 elements and BO by 8 per
     iteration. */
  2018. LL(43):
  /* step 0 */
  2019. FMADD f0, f16, f20, f0
  2020. FMADD f1, f17, f20, f1
  2021. FMADD f2, f18, f20, f2
  2022. FMADD f3, f19, f20, f3
  2023. FMADD f4, f16, f21, f4
  2024. FMADD f5, f17, f21, f5
  2025. FMADD f6, f18, f21, f6
  2026. FMADD f7, f19, f21, f7
  2027. LFD f16, 4 * SIZE(AO)
  2028. LFD f17, 5 * SIZE(AO)
  2029. LFD f18, 6 * SIZE(AO)
  2030. LFD f19, 7 * SIZE(AO)
  /* step 1 */
  2031. FMADD f0, f16, f22, f0
  2032. FMADD f1, f17, f22, f1
  2033. FMADD f2, f18, f22, f2
  2034. FMADD f3, f19, f22, f3
  2035. FMADD f4, f16, f23, f4
  2036. FMADD f5, f17, f23, f5
  2037. FMADD f6, f18, f23, f6
  2038. FMADD f7, f19, f23, f7
  2039. LFD f16, 8 * SIZE(AO)
  2040. LFD f17, 9 * SIZE(AO)
  2041. LFD f18, 10 * SIZE(AO)
  2042. LFD f19, 11 * SIZE(AO)
  2043. LFD f20, 4 * SIZE(BO)
  2044. LFD f21, 5 * SIZE(BO)
  2045. LFD f22, 6 * SIZE(BO)
  2046. LFD f23, 7 * SIZE(BO)
  /* step 2 */
  2047. FMADD f0, f16, f20, f0
  2048. FMADD f1, f17, f20, f1
  2049. FMADD f2, f18, f20, f2
  2050. FMADD f3, f19, f20, f3
  2051. FMADD f4, f16, f21, f4
  2052. FMADD f5, f17, f21, f5
  2053. FMADD f6, f18, f21, f6
  2054. FMADD f7, f19, f21, f7
  2055. LFD f16, 12 * SIZE(AO)
  2056. LFD f17, 13 * SIZE(AO)
  2057. LFD f18, 14 * SIZE(AO)
  2058. LFD f19, 15 * SIZE(AO)
  /* step 3 */
  2059. FMADD f0, f16, f22, f0
  2060. FMADD f1, f17, f22, f1
  2061. FMADD f2, f18, f22, f2
  2062. FMADD f3, f19, f22, f3
  2063. FMADD f4, f16, f23, f4
  2064. FMADD f5, f17, f23, f5
  2065. FMADD f6, f18, f23, f6
  2066. FMADD f7, f19, f23, f7
  /* Preload the operands for the next iteration (or for LL(46)). */
  2067. LFD f16, 16 * SIZE(AO)
  2068. LFD f17, 17 * SIZE(AO)
  2069. LFD f18, 18 * SIZE(AO)
  2070. LFD f19, 19 * SIZE(AO)
  2071. LFD f20, 8 * SIZE(BO)
  2072. LFD f21, 9 * SIZE(BO)
  2073. LFD f22, 10 * SIZE(BO)
  2074. LFD f23, 11 * SIZE(BO)
  2075. addi AO, AO, 16 * SIZE
  2076. addi BO, BO, 8 * SIZE
  2077. DCBT(BO, PREB)
  2078. bdnz LL(43)
  2079. .align 4
  /* LL(45): set CTR to the 0-3 leftover steps (count & 3) for the 4x2 tile;
     go straight to LL(48) when there are none. */
  2080. LL(45):
  2081. #if defined(LT) || defined(RN)
  2082. andi. r0, KK, 3
  2083. #else
  2084. andi. r0, TEMP, 3
  2085. #endif
  2086. mtspr CTR, r0
  2087. ble+ LL(48)
  2088. .align 4
  /* LL(46): 4x2 remainder loop, one step per iteration. Accumulates four A
     values times one B pair into f0-f7, then advances AO by 4 elements and
     BO by 2. */
  2089. LL(46):
  2090. FMADD f0, f16, f20, f0
  2091. FMADD f1, f17, f20, f1
  2092. FMADD f2, f18, f20, f2
  2093. FMADD f3, f19, f20, f3
  2094. FMADD f4, f16, f21, f4
  2095. FMADD f5, f17, f21, f5
  2096. FMADD f6, f18, f21, f6
  2097. FMADD f7, f19, f21, f7
  2098. LFD f16, 4 * SIZE(AO)
  2099. LFD f17, 5 * SIZE(AO)
  2100. LFD f18, 6 * SIZE(AO)
  2101. LFD f19, 7 * SIZE(AO)
  2102. LFD f20, 2 * SIZE(BO)
  2103. LFD f21, 3 * SIZE(BO)
  2104. addi BO, BO, 2 * SIZE
  2105. addi AO, AO, 4 * SIZE
  2106. bdnz LL(46)
  2107. .align 4
  /* LL(48): finish one 4x2 tile.
     f0-f3 are the four CO1 elements and f4-f7 the four CO2 elements (see
     the stores to C below). Subtracts the accumulators from the packed
     right-hand side, applies the LN/LT/RN/RT solve, stores to the packed
     buffer and to C, clears f0-f7, advances, and loops to LL(42) while
     I > 0. */
  2108. LL(48):
  /* LN/RT: AO = AORIG + r0 * 4 elements, BO = B + r0 * 2 elements, where
     r0 = KK - 4 for LN and KK - 2 for RT. */
  2109. #if defined(LN) || defined(RT)
  2110. #ifdef LN
  2111. subi r0, KK, 4
  2112. #else
  2113. subi r0, KK, 2
  2114. #endif
  2115. slwi TEMP, r0, 2 + BASE_SHIFT
  2116. slwi r0, r0, 1 + BASE_SHIFT
  2117. add AO, AORIG, TEMP
  2118. add BO, B, r0
  2119. #endif
  /* Residual. LN/LT read BO, where the two column values of one row are
     adjacent (BO[2*i], BO[2*i + 1] pair with f(i), f(4 + i)); RN/RT read AO
     in accumulator order. */
  2120. #if defined(LN) || defined(LT)
  2121. LFD f16, 0 * SIZE(BO)
  2122. LFD f17, 1 * SIZE(BO)
  2123. LFD f20, 2 * SIZE(BO)
  2124. LFD f21, 3 * SIZE(BO)
  2125. LFD f24, 4 * SIZE(BO)
  2126. LFD f25, 5 * SIZE(BO)
  2127. LFD f28, 6 * SIZE(BO)
  2128. LFD f29, 7 * SIZE(BO)
  2129. FSUB f0, f16, f0
  2130. FSUB f4, f17, f4
  2131. FSUB f1, f20, f1
  2132. FSUB f5, f21, f5
  2133. FSUB f2, f24, f2
  2134. FSUB f6, f25, f6
  2135. FSUB f3, f28, f3
  2136. FSUB f7, f29, f7
  2137. #else
  2138. LFD f16, 0 * SIZE(AO)
  2139. LFD f17, 1 * SIZE(AO)
  2140. LFD f18, 2 * SIZE(AO)
  2141. LFD f19, 3 * SIZE(AO)
  2142. LFD f20, 4 * SIZE(AO)
  2143. LFD f21, 5 * SIZE(AO)
  2144. LFD f22, 6 * SIZE(AO)
  2145. LFD f23, 7 * SIZE(AO)
  2146. FSUB f0, f16, f0
  2147. FSUB f1, f17, f1
  2148. FSUB f2, f18, f2
  2149. FSUB f3, f19, f3
  2150. FSUB f4, f20, f4
  2151. FSUB f5, f21, f5
  2152. FSUB f6, f22, f6
  2153. FSUB f7, f23, f7
  2154. #endif
  /* LN: 4x4 solve from row 3 up to row 0 (coefficients AO[15], AO[14..12],
     AO[10..8], AO[5..4], AO[0]), applied to both columns.
     NOTE(review): diagonals are applied with FMUL; presumably the packed
     diagonal holds reciprocals - confirm against the packing routine. */
  2155. #ifdef LN
  2156. LFD f16, 15 * SIZE(AO)
  2157. LFD f17, 14 * SIZE(AO)
  2158. LFD f18, 13 * SIZE(AO)
  2159. LFD f19, 12 * SIZE(AO)
  2160. FMUL f3, f16, f3
  2161. FMUL f7, f16, f7
  2162. FNMSUB f2, f17, f3, f2
  2163. FNMSUB f6, f17, f7, f6
  2164. FNMSUB f1, f18, f3, f1
  2165. FNMSUB f5, f18, f7, f5
  2166. FNMSUB f0, f19, f3, f0
  2167. FNMSUB f4, f19, f7, f4
  2168. LFD f16, 10 * SIZE(AO)
  2169. LFD f17, 9 * SIZE(AO)
  2170. LFD f18, 8 * SIZE(AO)
  2171. LFD f19, 5 * SIZE(AO)
  2172. LFD f20, 4 * SIZE(AO)
  2173. LFD f21, 0 * SIZE(AO)
  2174. FMUL f2, f16, f2
  2175. FMUL f6, f16, f6
  2176. FNMSUB f1, f17, f2, f1
  2177. FNMSUB f5, f17, f6, f5
  2178. FNMSUB f0, f18, f2, f0
  2179. FNMSUB f4, f18, f6, f4
  2180. FMUL f1, f19, f1
  2181. FMUL f5, f19, f5
  2182. FNMSUB f0, f20, f1, f0
  2183. FNMSUB f4, f20, f5, f4
  2184. FMUL f0, f21, f0
  2185. FMUL f4, f21, f4
  2186. #endif
  /* LT: 4x4 solve from row 0 down to row 3 (coefficients AO[0..3],
     AO[5..7], AO[10..11], AO[15]), applied to both columns. */
  2187. #ifdef LT
  2188. LFD f16, 0 * SIZE(AO)
  2189. LFD f17, 1 * SIZE(AO)
  2190. LFD f18, 2 * SIZE(AO)
  2191. LFD f19, 3 * SIZE(AO)
  2192. FMUL f0, f16, f0
  2193. FMUL f4, f16, f4
  2194. FNMSUB f1, f17, f0, f1
  2195. FNMSUB f5, f17, f4, f5
  2196. FNMSUB f2, f18, f0, f2
  2197. FNMSUB f6, f18, f4, f6
  2198. FNMSUB f3, f19, f0, f3
  2199. FNMSUB f7, f19, f4, f7
  2200. LFD f17, 5 * SIZE(AO)
  2201. LFD f18, 6 * SIZE(AO)
  2202. LFD f19, 7 * SIZE(AO)
  2203. FMUL f1, f17, f1
  2204. FMUL f5, f17, f5
  2205. FNMSUB f2, f18, f1, f2
  2206. FNMSUB f6, f18, f5, f6
  2207. FNMSUB f3, f19, f1, f3
  2208. FNMSUB f7, f19, f5, f7
  2209. LFD f18, 10 * SIZE(AO)
  2210. LFD f19, 11 * SIZE(AO)
  2211. FMUL f2, f18, f2
  2212. FMUL f6, f18, f6
  2213. FNMSUB f3, f19, f2, f3
  2214. FNMSUB f7, f19, f6, f7
  2215. LFD f19, 15 * SIZE(AO)
  2216. FMUL f3, f19, f3
  2217. FMUL f7, f19, f7
  2218. #endif
  /* RN: 2x2 solve across columns, column 0 (f0-f3) first
     (coefficients BO[0], BO[1], BO[3]). */
  2219. #ifdef RN
  2220. LFD f16, 0 * SIZE(BO)
  2221. LFD f17, 1 * SIZE(BO)
  2222. LFD f18, 3 * SIZE(BO)
  2223. FMUL f0, f16, f0
  2224. FMUL f1, f16, f1
  2225. FMUL f2, f16, f2
  2226. FMUL f3, f16, f3
  2227. FNMSUB f4, f17, f0, f4
  2228. FNMSUB f5, f17, f1, f5
  2229. FNMSUB f6, f17, f2, f6
  2230. FNMSUB f7, f17, f3, f7
  2231. FMUL f4, f18, f4
  2232. FMUL f5, f18, f5
  2233. FMUL f6, f18, f6
  2234. FMUL f7, f18, f7
  2235. #endif
  /* RT: 2x2 solve across columns in reverse, column 1 (f4-f7) first
     (coefficients BO[3], BO[2], BO[0]). */
  2236. #ifdef RT
  2237. LFD f19, 3 * SIZE(BO)
  2238. LFD f20, 2 * SIZE(BO)
  2239. LFD f21, 0 * SIZE(BO)
  2240. FMUL f4, f19, f4
  2241. FMUL f5, f19, f5
  2242. FMUL f6, f19, f6
  2243. FMUL f7, f19, f7
  2244. FNMSUB f0, f20, f4, f0
  2245. FNMSUB f1, f20, f5, f1
  2246. FNMSUB f2, f20, f6, f2
  2247. FNMSUB f3, f20, f7, f3
  2248. FMUL f0, f21, f0
  2249. FMUL f1, f21, f1
  2250. FMUL f2, f21, f2
  2251. FMUL f3, f21, f3
  2252. #endif
  /* LN walks C backwards: step back four elements before storing. */
  2253. #ifdef LN
  2254. subi CO1, CO1, 4 * SIZE
  2255. subi CO2, CO2, 4 * SIZE
  2256. #endif
  /* Write the solved tile back to the packed buffer in the order it was read. */
  2257. #if defined(LN) || defined(LT)
  2258. STFD f0, 0 * SIZE(BO)
  2259. STFD f4, 1 * SIZE(BO)
  2260. STFD f1, 2 * SIZE(BO)
  2261. STFD f5, 3 * SIZE(BO)
  2262. STFD f2, 4 * SIZE(BO)
  2263. STFD f6, 5 * SIZE(BO)
  2264. STFD f3, 6 * SIZE(BO)
  2265. STFD f7, 7 * SIZE(BO)
  2266. #else
  2267. STFD f0, 0 * SIZE(AO)
  2268. STFD f1, 1 * SIZE(AO)
  2269. STFD f2, 2 * SIZE(AO)
  2270. STFD f3, 3 * SIZE(AO)
  2271. STFD f4, 4 * SIZE(AO)
  2272. STFD f5, 5 * SIZE(AO)
  2273. STFD f6, 6 * SIZE(AO)
  2274. STFD f7, 7 * SIZE(AO)
  2275. #endif
  /* Store to C: four elements in each of the two columns. */
  2276. STFD f0, 0 * SIZE(CO1)
  2277. STFD f1, 1 * SIZE(CO1)
  2278. STFD f2, 2 * SIZE(CO1)
  2279. STFD f3, 3 * SIZE(CO1)
  2280. STFD f4, 0 * SIZE(CO2)
  2281. STFD f5, 1 * SIZE(CO2)
  2282. STFD f6, 2 * SIZE(CO2)
  2283. STFD f7, 3 * SIZE(CO2)
  /* Clear all eight accumulators for the next tile. */
  2284. lfs f0, FZERO
  2285. fmr f1, f0
  2286. fmr f2, f0
  2287. fmr f3, f0
  2288. fmr f4, f0
  2289. fmr f5, f0
  2290. fmr f6, f0
  2291. fmr f7, f0
  /* Non-LN variants walk C forwards. */
  2292. #ifndef LN
  2293. addi CO1, CO1, 4 * SIZE
  2294. addi CO2, CO2, 4 * SIZE
  2295. #endif
  /* RT: AORIG += K * 4 elements. */
  2296. #ifdef RT
  2297. slwi r0, K, 2 + BASE_SHIFT
  2298. add AORIG, AORIG, r0
  2299. #endif
  /* LT/RN: advance AO by (K - KK) * 4 elements and BO by (K - KK) * 2. */
  2300. #if defined(LT) || defined(RN)
  2301. sub TEMP, K, KK
  2302. slwi r0, TEMP, 2 + BASE_SHIFT
  2303. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2304. add AO, AO, r0
  2305. add BO, BO, TEMP
  2306. #endif
  /* KK moves by the tile height (4): down for LN, up for LT. */
  2307. #ifdef LN
  2308. subi KK, KK, 4
  2309. #endif
  2310. #ifdef LT
  2311. addi KK, KK, 4
  2312. #endif
  /* Decrement the tile counter I and loop back to LL(42) while it is > 0. */
  2313. addic. I, I, -1
  2314. bgt+ LL(42)
  2315. .align 4
  /* LL(69): end of the 2-column panel. Moves B past the panel, steps KK by 2
     for the right-side variants, and reloads f0 from FZERO for the next
     panel. */
  2316. LL(69):
  /* LN: B += K * 2 elements. */
  2317. #ifdef LN
  2318. slwi r0, K, 1 + BASE_SHIFT
  2319. add B, B, r0
  2320. #endif
  /* LT/RN: BO already points past the panel just consumed. */
  2321. #if defined(LT) || defined(RN)
  2322. mr B, BO
  2323. #endif
  2324. #ifdef RN
  2325. addi KK, KK, 2
  2326. #endif
  2327. #ifdef RT
  2328. subi KK, KK, 2
  2329. #endif
  2330. lfs f0, FZERO
  2331. .align 4
  /* LL(70): set up the 1-column panel, taken only when bit 0 of N is set
     (otherwise branch to LL(999)). Initialises CO1, KK, the accumulators
     f1-f3 and the A pointer for this panel. */
  2332. LL(70):
  2333. andi. J, N, 1
  2334. ble LL(999)
  /* RT: step B back by K * 1 elements and C back by one column (LDC). */
  2335. #ifdef RT
  2336. slwi r0, K, 0 + BASE_SHIFT
  2337. sub B, B, r0
  2338. sub C, C, LDC
  2339. #endif
  2340. mr CO1, C
  2341. #ifdef LN
  2342. add KK, M, OFFSET
  2343. #endif
  2344. #ifdef LT
  2345. mr KK, OFFSET
  2346. #endif
  /* Copy f0 into f1-f3; the 1x1 tile below accumulates into f0-f3.
     NOTE(review): relies on f0 already holding the FZERO value on entry. */
  2347. fmr f1, f0
  2348. fmr f2, f0
  2349. fmr f3, f0
  /* LN/RT keep the panel base in AORIG and derive AO per tile; LT/RN walk AO
     directly. */
  2350. #if defined(LN) || defined(RT)
  2351. mr AORIG, A
  2352. #else
  2353. mr AO, A
  2354. #endif
  /* Non-RT variants advance C past the column now. */
  2355. #ifndef RT
  2356. add C, CO1, LDC
  2357. #endif
  2358. .align 4
  /* LL(90): 1x1 tile, taken only when bit 0 of M is set (otherwise LL(80)).
     Preloads four A values (f16-f19) and four B values (f20-f23) and sets
     CTR to count >> 3, the number of 8-step unrolled iterations for LL(92). */
  2359. LL(90):
  2360. andi. I, M, 1
  2361. ble LL(80)
  /* LT/RN: the product runs over KK steps starting at AO and B. */
  2362. #if defined(LT) || defined(RN)
  2363. LFD f16, 0 * SIZE(AO)
  2364. LFD f17, 1 * SIZE(AO)
  2365. LFD f18, 2 * SIZE(AO)
  2366. LFD f19, 3 * SIZE(AO)
  2367. LFD f20, 0 * SIZE(B)
  2368. LFD f21, 1 * SIZE(B)
  2369. LFD f22, 2 * SIZE(B)
  2370. LFD f23, 3 * SIZE(B)
  2371. srawi. r0, KK, 3
  2372. mtspr CTR, r0
  2373. mr BO, B
  2374. #else
  /* LN/RT: (LN only) step AORIG back by K * 1 elements, then start AO at
     AORIG + KK elements and BO at B + KK elements; the product runs over
     TEMP = K - KK steps. */
  2375. #ifdef LN
  2376. slwi r0, K, BASE_SHIFT
  2377. sub AORIG, AORIG, r0
  2378. #endif
  2379. slwi r0, KK, 0 + BASE_SHIFT
  2380. slwi TEMP, KK, 0 + BASE_SHIFT
  2381. add AO, AORIG, r0
  2382. add BO, B, TEMP
  2383. sub TEMP, K, KK
  2384. LFD f16, 0 * SIZE(AO)
  2385. LFD f17, 1 * SIZE(AO)
  2386. LFD f18, 2 * SIZE(AO)
  2387. LFD f19, 3 * SIZE(AO)
  2388. LFD f20, 0 * SIZE(BO)
  2389. LFD f21, 1 * SIZE(BO)
  2390. LFD f22, 2 * SIZE(BO)
  2391. LFD f23, 3 * SIZE(BO)
  2392. srawi. r0, TEMP, 3
  2393. mtspr CTR, r0
  2394. #endif
  /* CR0 still reflects the srawi. result: skip the unrolled loop when the
     iteration count is not positive. */
  2395. ble LL(95)
  2396. .align 5
  2397. LL(92):
  2398. FMADD f0, f16, f20, f0
  2399. FMADD f1, f17, f21, f1
  2400. FMADD f2, f18, f22, f2
  2401. FMADD f3, f19, f23, f3
  2402. LFD f16, 4 * SIZE(AO)
  2403. LFD f17, 5 * SIZE(AO)
  2404. LFD f18, 6 * SIZE(AO)
  2405. LFD f19, 7 * SIZE(AO)
  2406. LFD f20, 4 * SIZE(BO)
  2407. LFD f21, 5 * SIZE(BO)
  2408. LFD f22, 6 * SIZE(BO)
  2409. LFD f23, 7 * SIZE(BO)
  2410. FMADD f0, f16, f20, f0
  2411. FMADD f1, f17, f21, f1
  2412. FMADD f2, f18, f22, f2
  2413. FMADD f3, f19, f23, f3
  2414. LFD f16, 8 * SIZE(AO)
  2415. LFD f17, 9 * SIZE(AO)
  2416. LFD f18, 10 * SIZE(AO)
  2417. LFD f19, 11 * SIZE(AO)
  2418. LFD f20, 8 * SIZE(BO)
  2419. LFD f21, 9 * SIZE(BO)
  2420. LFD f22, 10 * SIZE(BO)
  2421. LFD f23, 11 * SIZE(BO)
  2422. addi AO, AO, 8 * SIZE
  2423. addi BO, BO, 8 * SIZE
  2424. bdnz LL(92)
  2425. .align 4
  2426. LL(95):
  2427. #if defined(LT) || defined(RN)
  2428. andi. r0, KK, 7
  2429. #else
  2430. andi. r0, TEMP, 7
  2431. #endif
  2432. mtspr CTR, r0
  2433. ble+ LL(98)
  2434. .align 4
  2435. LL(96):
  2436. FMADD f0, f16, f20, f0
  2437. LFD f16, 1 * SIZE(AO)
  2438. LFD f20, 1 * SIZE(BO)
  2439. addi BO, BO, 1 * SIZE
  2440. addi AO, AO, 1 * SIZE
  2441. bdnz LL(96)
  2442. .align 4
  2443. LL(98):
  2444. FADD f0, f1, f0
  2445. FADD f2, f3, f2
  2446. FADD f0, f2, f0
  2447. #if defined(LN) || defined(RT)
  2448. #ifdef LN
  2449. subi r0, KK, 1
  2450. #else
  2451. subi r0, KK, 1
  2452. #endif
  2453. slwi TEMP, r0, 0 + BASE_SHIFT
  2454. slwi r0, r0, 0 + BASE_SHIFT
  2455. add AO, AORIG, TEMP
  2456. add BO, B, r0
  2457. #endif
  2458. #if defined(LN) || defined(LT)
  2459. LFD f16, 0 * SIZE(BO)
  2460. FSUB f0, f16, f0
  2461. #else
  2462. LFD f16, 0 * SIZE(AO)
  2463. FSUB f0, f16, f0
  2464. #endif
  2465. #ifdef LN
  2466. LFD f21, 0 * SIZE(AO)
  2467. FMUL f0, f21, f0
  2468. #endif
  2469. #ifdef LT
  2470. LFD f16, 0 * SIZE(AO)
  2471. FMUL f0, f16, f0
  2472. #endif
  2473. #ifdef RN
  2474. LFD f16, 0 * SIZE(BO)
  2475. FMUL f0, f16, f0
  2476. #endif
  2477. #ifdef RT
  2478. LFD f21, 0 * SIZE(BO)
  2479. FMUL f0, f21, f0
  2480. #endif
  2481. #ifdef LN
  2482. subi CO1, CO1, 1 * SIZE
  2483. #endif
  2484. #if defined(LN) || defined(LT)
  2485. STFD f0, 0 * SIZE(BO)
  2486. #else
  2487. STFD f0, 0 * SIZE(AO)
  2488. #endif
  2489. STFD f0, 0 * SIZE(CO1)
  2490. lfs f0, FZERO
  2491. fmr f1, f0
  2492. fmr f2, f0
  2493. fmr f3, f0
  2494. #ifndef LN
  2495. addi CO1, CO1, 1 * SIZE
  2496. #endif
  2497. #ifdef RT
  2498. slwi r0, K, 0 + BASE_SHIFT
  2499. add AORIG, AORIG, r0
  2500. #endif
  2501. #if defined(LT) || defined(RN)
  2502. sub TEMP, K, KK
  2503. slwi r0, TEMP, 0 + BASE_SHIFT
  2504. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2505. add AO, AO, r0
  2506. add BO, BO, TEMP
  2507. #endif
  2508. #ifdef LN
  2509. subi KK, KK, 1
  2510. #endif
  2511. #ifdef LT
  2512. addi KK, KK, 1
  2513. #endif
  2514. .align 4
  /* LL(80): two-row tile of the single-column case, executed when M & 2 is */
  /* set. A is consumed two values per step and B one value per step. */
  /* f0/f2 accumulate the first row and f1/f3 the second. The pairs are */
  /* summed at LL(88) before the 2x2 triangular update. */
  /* NOTE(review): LFD, STFD, FMADD, FNMSUB, FADD, FSUB, FMUL, DCBT, SIZE and */
  /* BASE_SHIFT are macros defined outside this view. The comments assume */
  /* FMADD d,a,c,b = a*c+b, FNMSUB d,a,c,b = b-a*c and FSUB d,a,b = a-b. */
  2515. LL(80):
  /* I = M & 2. A zero result skips this tile. */
  2516. andi. I, M, 2
  2517. ble LL(71)
  2518. #if defined(LT) || defined(RN)
  /* LT/RN: preload operands, set CTR = KK >> 2 (four steps per pass) and */
  /* start BO at B. */
  2519. LFD f16, 0 * SIZE(AO)
  2520. LFD f17, 1 * SIZE(AO)
  2521. LFD f18, 2 * SIZE(AO)
  2522. LFD f19, 3 * SIZE(AO)
  2523. LFD f20, 0 * SIZE(B)
  2524. LFD f21, 1 * SIZE(B)
  2525. LFD f22, 2 * SIZE(B)
  2526. LFD f23, 3 * SIZE(B)
  2527. srawi. r0, KK, 2
  2528. mtspr CTR, r0
  2529. mr BO, B
  2530. #else
  2531. #ifdef LN
  /* LN: move AORIG back by K << (1 + BASE_SHIFT) bytes before using it. */
  2532. slwi r0, K, 1 + BASE_SHIFT
  2533. sub AORIG, AORIG, r0
  2534. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), */
  /* and TEMP = K - KK is the number of steps to accumulate. */
  2535. slwi r0, KK, 1 + BASE_SHIFT
  2536. slwi TEMP, KK, 0 + BASE_SHIFT
  2537. add AO, AORIG, r0
  2538. add BO, B, TEMP
  2539. sub TEMP, K, KK
  2540. LFD f16, 0 * SIZE(AO)
  2541. LFD f17, 1 * SIZE(AO)
  2542. LFD f18, 2 * SIZE(AO)
  2543. LFD f19, 3 * SIZE(AO)
  2544. LFD f20, 0 * SIZE(BO)
  2545. LFD f21, 1 * SIZE(BO)
  2546. LFD f22, 2 * SIZE(BO)
  2547. LFD f23, 3 * SIZE(BO)
  2548. srawi. r0, TEMP, 2
  2549. mtspr CTR, r0
  2550. #endif
  /* CR0 still holds the srawi. result. Skip the unrolled loop if the */
  /* count is not positive. */
  2551. ble LL(85)
  2552. .align 5
  /* LL(82): unrolled by four steps. Each B value (f20..f23) multiplies one */
  /* pair of A values. AO advances 8 * SIZE and BO 4 * SIZE per pass. */
  2553. LL(82):
  2554. FMADD f0, f16, f20, f0
  2555. FMADD f1, f17, f20, f1
  2556. FMADD f2, f18, f21, f2
  2557. FMADD f3, f19, f21, f3
  2558. LFD f16, 4 * SIZE(AO)
  2559. LFD f17, 5 * SIZE(AO)
  2560. LFD f18, 6 * SIZE(AO)
  2561. LFD f19, 7 * SIZE(AO)
  2562. FMADD f0, f16, f22, f0
  2563. FMADD f1, f17, f22, f1
  2564. FMADD f2, f18, f23, f2
  2565. FMADD f3, f19, f23, f3
  /* Preload the operands of the next pass before the pointers advance. */
  2566. LFD f16, 8 * SIZE(AO)
  2567. LFD f17, 9 * SIZE(AO)
  2568. LFD f18, 10 * SIZE(AO)
  2569. LFD f19, 11 * SIZE(AO)
  2570. LFD f20, 4 * SIZE(BO)
  2571. LFD f21, 5 * SIZE(BO)
  2572. LFD f22, 6 * SIZE(BO)
  2573. LFD f23, 7 * SIZE(BO)
  2574. addi AO, AO, 8 * SIZE
  2575. addi BO, BO, 4 * SIZE
  /* Presumably a cache prefetch hint at BO + PREB (macro defined elsewhere). */
  2576. DCBT(BO, PREB)
  2577. bdnz LL(82)
  2578. .align 4
  /* LL(85): remainder count = low two bits of the same length. */
  2579. LL(85):
  2580. #if defined(LT) || defined(RN)
  2581. andi. r0, KK, 3
  2582. #else
  2583. andi. r0, TEMP, 3
  2584. #endif
  2585. mtspr CTR, r0
  /* No remainder: go straight to the reduction. */
  2586. ble+ LL(88)
  2587. .align 4
  /* LL(86): one step per pass - two A values times one B value. */
  2588. LL(86):
  2589. FMADD f0, f16, f20, f0
  2590. FMADD f1, f17, f20, f1
  2591. LFD f16, 2 * SIZE(AO)
  2592. LFD f17, 3 * SIZE(AO)
  2593. LFD f20, 1 * SIZE(BO)
  2594. addi BO, BO, 1 * SIZE
  2595. addi AO, AO, 2 * SIZE
  2596. bdnz LL(86)
  2597. .align 4
  /* LL(88): merge the partial sums so that f0 and f1 hold the two row totals. */
  2598. LL(88):
  2599. FADD f0, f2, f0
  2600. FADD f1, f3, f1
  2601. #if defined(LN) || defined(RT)
  /* LN steps back by the tile height (2), RT by the tile width (1). */
  2602. #ifdef LN
  2603. subi r0, KK, 2
  2604. #else
  2605. subi r0, KK, 1
  2606. #endif
  /* AO = AORIG + (r0 << (1 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT). */
  2607. slwi TEMP, r0, 1 + BASE_SHIFT
  2608. slwi r0, r0, 0 + BASE_SHIFT
  2609. add AO, AORIG, TEMP
  2610. add BO, B, r0
  2611. #endif
  /* f0, f1 = stored values - accumulated sums. The stored values are read */
  /* from BO for LN/LT and from AO for RN/RT. */
  2612. #if defined(LN) || defined(LT)
  2613. LFD f16, 0 * SIZE(BO)
  2614. LFD f20, 1 * SIZE(BO)
  2615. FSUB f0, f16, f0
  2616. FSUB f1, f20, f1
  2617. #else
  2618. LFD f16, 0 * SIZE(AO)
  2619. LFD f17, 1 * SIZE(AO)
  2620. FSUB f0, f16, f0
  2621. FSUB f1, f17, f1
  2622. #endif
  /* NOTE(review): the coefficients at AO[0] and AO[3] (and BO[0] for RN/RT) */
  /* are applied with a multiply, so they are presumably stored pre-inverted */
  /* - confirm against the packing routine. */
  2623. #ifdef LN
  /* LN: backward order - f1 first, then eliminate it from f0. */
  2624. LFD f19, 3 * SIZE(AO)
  2625. LFD f20, 2 * SIZE(AO)
  2626. LFD f21, 0 * SIZE(AO)
  2627. FMUL f1, f19, f1
  2628. FNMSUB f0, f20, f1, f0
  2629. FMUL f0, f21, f0
  2630. #endif
  2631. #ifdef LT
  /* LT: forward order - f0 first, then eliminate it from f1. */
  2632. LFD f16, 0 * SIZE(AO)
  2633. LFD f17, 1 * SIZE(AO)
  2634. FMUL f0, f16, f0
  2635. FNMSUB f1, f17, f0, f1
  2636. LFD f17, 3 * SIZE(AO)
  2637. FMUL f1, f17, f1
  2638. #endif
  2639. #ifdef RN
  /* RN: both rows are scaled by the single coefficient BO[0]. */
  2640. LFD f16, 0 * SIZE(BO)
  2641. FMUL f0, f16, f0
  2642. FMUL f1, f16, f1
  2643. #endif
  2644. #ifdef RT
  /* RT: both rows are scaled by the single coefficient BO[0]. */
  2645. LFD f21, 0 * SIZE(BO)
  2646. FMUL f0, f21, f0
  2647. FMUL f1, f21, f1
  2648. #endif
  2649. #ifdef LN
  /* LN: CO1 is moved back two elements before the store. */
  2650. subi CO1, CO1, 2 * SIZE
  2651. #endif
  /* Write the results back into the packed buffer they were read from ... */
  2652. #if defined(LN) || defined(LT)
  2653. STFD f0, 0 * SIZE(BO)
  2654. STFD f1, 1 * SIZE(BO)
  2655. #else
  2656. STFD f0, 0 * SIZE(AO)
  2657. STFD f1, 1 * SIZE(AO)
  2658. #endif
  /* ... and into the output at CO1. */
  2659. STFD f0, 0 * SIZE(CO1)
  2660. STFD f1, 1 * SIZE(CO1)
  /* Reset all four accumulators from FZERO for the next tile. */
  2661. lfs f0, FZERO
  2662. fmr f1, f0
  2663. fmr f2, f0
  2664. fmr f3, f0
  2665. #ifndef LN
  /* Every mode except LN moves CO1 forward two elements after the store. */
  2666. addi CO1, CO1, 2 * SIZE
  2667. #endif
  2668. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  2669. slwi r0, K, 1 + BASE_SHIFT
  2670. add AORIG, AORIG, r0
  2671. #endif
  2672. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed (two values per */
  /* step in AO, one per step in BO). */
  2673. sub TEMP, K, KK
  2674. slwi r0, TEMP, 1 + BASE_SHIFT
  2675. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2676. add AO, AO, r0
  2677. add BO, BO, TEMP
  2678. #endif
  /* KK moves by the tile height (2): down for LN, up for LT. */
  2679. #ifdef LN
  2680. subi KK, KK, 2
  2681. #endif
  2682. #ifdef LT
  2683. addi KK, KK, 2
  2684. #endif
  2685. .align 4
  /* LL(71): four-row tiles of the single-column case. I = M >> 2 is the */
  /* number of tiles, and LL(72) is executed once per tile. A is consumed */
  /* four values per step and B one value per step. f0..f3 accumulate the */
  /* four rows directly, so no reduction is needed before the 4x4 update. */
  /* NOTE(review): LFD, STFD, FMADD, FNMSUB, FSUB, FMUL, DCBT, SIZE and */
  /* BASE_SHIFT are macros defined outside this view. The comments assume */
  /* FMADD d,a,c,b = a*c+b, FNMSUB d,a,c,b = b-a*c and FSUB d,a,b = a-b. */
  2686. LL(71):
  /* No full four-row tile: go to the epilogue. */
  2687. srawi. I, M, 2
  2688. ble LL(999)
  2689. .align 4
  2690. LL(72):
  2691. #if defined(LT) || defined(RN)
  /* LT/RN: preload operands, touch the output line at CO1 + PREC, set */
  /* CTR = KK >> 2 (four steps per pass) and start BO at B. */
  2692. LFD f16, 0 * SIZE(AO)
  2693. LFD f17, 1 * SIZE(AO)
  2694. LFD f18, 2 * SIZE(AO)
  2695. LFD f19, 3 * SIZE(AO)
  2696. LFD f20, 0 * SIZE(B)
  2697. LFD f21, 1 * SIZE(B)
  2698. LFD f22, 2 * SIZE(B)
  2699. LFD f23, 3 * SIZE(B)
  2700. dcbt CO1, PREC
  2701. srawi. r0, KK, 2
  2702. mtspr CTR, r0
  2703. mr BO, B
  2704. #else
  2705. #ifdef LN
  /* LN: move AORIG back by K << (2 + BASE_SHIFT) bytes before using it. */
  2706. slwi r0, K, 2 + BASE_SHIFT
  2707. sub AORIG, AORIG, r0
  2708. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), */
  /* and TEMP = K - KK is the number of steps to accumulate. */
  2709. slwi r0, KK, 2 + BASE_SHIFT
  2710. slwi TEMP, KK, 0 + BASE_SHIFT
  2711. add AO, AORIG, r0
  2712. add BO, B, TEMP
  2713. sub TEMP, K, KK
  2714. LFD f16, 0 * SIZE(AO)
  2715. LFD f17, 1 * SIZE(AO)
  2716. LFD f18, 2 * SIZE(AO)
  2717. LFD f19, 3 * SIZE(AO)
  2718. LFD f20, 0 * SIZE(BO)
  2719. LFD f21, 1 * SIZE(BO)
  2720. LFD f22, 2 * SIZE(BO)
  2721. LFD f23, 3 * SIZE(BO)
  2722. dcbt CO1, PREC
  2723. srawi. r0, TEMP, 2
  2724. mtspr CTR, r0
  2725. #endif
  /* CR0 still holds the srawi. result. Skip the unrolled loop if the */
  /* count is not positive. */
  2726. ble LL(75)
  2727. .align 5
  /* LL(73): unrolled by four steps. Each step multiplies four A values by */
  /* one B value (f20, f21, f22, f23 in turn). AO advances 16 * SIZE and BO */
  /* 4 * SIZE per pass. */
  2728. LL(73):
  2729. FMADD f0, f16, f20, f0
  2730. FMADD f1, f17, f20, f1
  2731. FMADD f2, f18, f20, f2
  2732. FMADD f3, f19, f20, f3
  2733. LFD f16, 4 * SIZE(AO)
  2734. LFD f17, 5 * SIZE(AO)
  2735. LFD f18, 6 * SIZE(AO)
  2736. LFD f19, 7 * SIZE(AO)
  2737. FMADD f0, f16, f21, f0
  2738. FMADD f1, f17, f21, f1
  2739. FMADD f2, f18, f21, f2
  2740. FMADD f3, f19, f21, f3
  2741. LFD f16, 8 * SIZE(AO)
  2742. LFD f17, 9 * SIZE(AO)
  2743. LFD f18, 10 * SIZE(AO)
  2744. LFD f19, 11 * SIZE(AO)
  2745. FMADD f0, f16, f22, f0
  2746. FMADD f1, f17, f22, f1
  2747. FMADD f2, f18, f22, f2
  2748. FMADD f3, f19, f22, f3
  2749. LFD f16, 12 * SIZE(AO)
  2750. LFD f17, 13 * SIZE(AO)
  2751. LFD f18, 14 * SIZE(AO)
  2752. LFD f19, 15 * SIZE(AO)
  2753. FMADD f0, f16, f23, f0
  2754. FMADD f1, f17, f23, f1
  2755. FMADD f2, f18, f23, f2
  2756. FMADD f3, f19, f23, f3
  /* Preload the operands of the next pass before the pointers advance. */
  2757. LFD f16, 16 * SIZE(AO)
  2758. LFD f17, 17 * SIZE(AO)
  2759. LFD f18, 18 * SIZE(AO)
  2760. LFD f19, 19 * SIZE(AO)
  2761. LFD f20, 4 * SIZE(BO)
  2762. LFD f21, 5 * SIZE(BO)
  2763. LFD f22, 6 * SIZE(BO)
  2764. LFD f23, 7 * SIZE(BO)
  2765. addi AO, AO, 16 * SIZE
  2766. addi BO, BO, 4 * SIZE
  /* Presumably a cache prefetch hint at BO + PREB (macro defined elsewhere). */
  2767. DCBT(BO, PREB)
  2768. bdnz LL(73)
  2769. .align 4
  /* LL(75): remainder count = low two bits of the same length. */
  2770. LL(75):
  2771. #if defined(LT) || defined(RN)
  2772. andi. r0, KK, 3
  2773. #else
  2774. andi. r0, TEMP, 3
  2775. #endif
  2776. mtspr CTR, r0
  /* No remainder: go straight to the triangular update. */
  2777. ble+ LL(78)
  2778. .align 4
  /* LL(76): one step per pass - four A values times one B value. */
  2779. LL(76):
  2780. FMADD f0, f16, f20, f0
  2781. FMADD f1, f17, f20, f1
  2782. FMADD f2, f18, f20, f2
  2783. FMADD f3, f19, f20, f3
  2784. LFD f16, 4 * SIZE(AO)
  2785. LFD f17, 5 * SIZE(AO)
  2786. LFD f18, 6 * SIZE(AO)
  2787. LFD f19, 7 * SIZE(AO)
  2788. LFD f20, 1 * SIZE(BO)
  2789. addi BO, BO, 1 * SIZE
  2790. addi AO, AO, 4 * SIZE
  2791. bdnz LL(76)
  2792. .align 4
  /* LL(78): subtract the accumulated sums from the stored values and apply */
  /* the triangular coefficients. */
  2793. LL(78):
  2794. #if defined(LN) || defined(RT)
  /* LN steps back by the tile height (4), RT by the tile width (1). */
  2795. #ifdef LN
  2796. subi r0, KK, 4
  2797. #else
  2798. subi r0, KK, 1
  2799. #endif
  /* AO = AORIG + (r0 << (2 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT). */
  2800. slwi TEMP, r0, 2 + BASE_SHIFT
  2801. slwi r0, r0, 0 + BASE_SHIFT
  2802. add AO, AORIG, TEMP
  2803. add BO, B, r0
  2804. #endif
  /* f0..f3 = stored values - accumulated sums. The stored values are read */
  /* from BO for LN/LT and from AO for RN/RT. */
  2805. #if defined(LN) || defined(LT)
  2806. LFD f16, 0 * SIZE(BO)
  2807. LFD f20, 1 * SIZE(BO)
  2808. LFD f24, 2 * SIZE(BO)
  2809. LFD f28, 3 * SIZE(BO)
  2810. FSUB f0, f16, f0
  2811. FSUB f1, f20, f1
  2812. FSUB f2, f24, f2
  2813. FSUB f3, f28, f3
  2814. #else
  2815. LFD f16, 0 * SIZE(AO)
  2816. LFD f17, 1 * SIZE(AO)
  2817. LFD f18, 2 * SIZE(AO)
  2818. LFD f19, 3 * SIZE(AO)
  2819. FSUB f0, f16, f0
  2820. FSUB f1, f17, f1
  2821. FSUB f2, f18, f2
  2822. FSUB f3, f19, f3
  2823. #endif
  /* NOTE(review): the coefficients at AO offsets 0, 5, 10 and 15 (the */
  /* diagonal positions of a 4x4 array) and BO[0] for RN/RT are applied with */
  /* a multiply, so they are presumably stored pre-inverted - confirm against */
  /* the packing routine. */
  2824. #ifdef LN
  /* LN: backward order. Scale f3 by AO[15], then eliminate it from f2, f1 */
  /* and f0 using AO[14], AO[13] and AO[12]. */
  2825. LFD f16, 15 * SIZE(AO)
  2826. LFD f17, 14 * SIZE(AO)
  2827. LFD f18, 13 * SIZE(AO)
  2828. LFD f19, 12 * SIZE(AO)
  2829. FMUL f3, f16, f3
  2830. FNMSUB f2, f17, f3, f2
  2831. FNMSUB f1, f18, f3, f1
  2832. FNMSUB f0, f19, f3, f0
  /* Then f2 with AO[10], 9, 8, f1 with AO[5], 4, and finally f0 with AO[0]. */
  2833. LFD f16, 10 * SIZE(AO)
  2834. LFD f17, 9 * SIZE(AO)
  2835. LFD f18, 8 * SIZE(AO)
  2836. LFD f19, 5 * SIZE(AO)
  2837. LFD f20, 4 * SIZE(AO)
  2838. LFD f21, 0 * SIZE(AO)
  2839. FMUL f2, f16, f2
  2840. FNMSUB f1, f17, f2, f1
  2841. FNMSUB f0, f18, f2, f0
  2842. FMUL f1, f19, f1
  2843. FNMSUB f0, f20, f1, f0
  2844. FMUL f0, f21, f0
  2845. #endif
  2846. #ifdef LT
  /* LT: forward order. Scale f0 by AO[0], then eliminate it from f1, f2 */
  /* and f3 using AO[1], AO[2] and AO[3]. */
  2847. LFD f16, 0 * SIZE(AO)
  2848. LFD f17, 1 * SIZE(AO)
  2849. LFD f18, 2 * SIZE(AO)
  2850. LFD f19, 3 * SIZE(AO)
  2851. FMUL f0, f16, f0
  2852. FNMSUB f1, f17, f0, f1
  2853. FNMSUB f2, f18, f0, f2
  2854. FNMSUB f3, f19, f0, f3
  /* Then f1 with AO[5], 6, 7. */
  2855. LFD f17, 5 * SIZE(AO)
  2856. LFD f18, 6 * SIZE(AO)
  2857. LFD f19, 7 * SIZE(AO)
  2858. FMUL f1, f17, f1
  2859. FNMSUB f2, f18, f1, f2
  2860. FNMSUB f3, f19, f1, f3
  /* Then f2 with AO[10], 11, and finally f3 with AO[15]. */
  2861. LFD f18, 10 * SIZE(AO)
  2862. LFD f19, 11 * SIZE(AO)
  2863. FMUL f2, f18, f2
  2864. FNMSUB f3, f19, f2, f3
  2865. LFD f19, 15 * SIZE(AO)
  2866. FMUL f3, f19, f3
  2867. #endif
  2868. #ifdef RN
  /* RN: all four rows are scaled by the single coefficient BO[0]. */
  2869. LFD f16, 0 * SIZE(BO)
  2870. FMUL f0, f16, f0
  2871. FMUL f1, f16, f1
  2872. FMUL f2, f16, f2
  2873. FMUL f3, f16, f3
  2874. #endif
  2875. #ifdef RT
  /* RT: all four rows are scaled by the single coefficient BO[0]. */
  2876. LFD f21, 0 * SIZE(BO)
  2877. FMUL f0, f21, f0
  2878. FMUL f1, f21, f1
  2879. FMUL f2, f21, f2
  2880. FMUL f3, f21, f3
  2881. #endif
  2882. #ifdef LN
  /* LN: CO1 is moved back four elements before the store. */
  2883. subi CO1, CO1, 4 * SIZE
  2884. #endif
  /* Write the results back into the packed buffer they were read from ... */
  2885. #if defined(LN) || defined(LT)
  2886. STFD f0, 0 * SIZE(BO)
  2887. STFD f1, 1 * SIZE(BO)
  2888. STFD f2, 2 * SIZE(BO)
  2889. STFD f3, 3 * SIZE(BO)
  2890. #else
  2891. STFD f0, 0 * SIZE(AO)
  2892. STFD f1, 1 * SIZE(AO)
  2893. STFD f2, 2 * SIZE(AO)
  2894. STFD f3, 3 * SIZE(AO)
  2895. #endif
  /* ... and into the output at CO1. */
  2896. STFD f0, 0 * SIZE(CO1)
  2897. STFD f1, 1 * SIZE(CO1)
  2898. STFD f2, 2 * SIZE(CO1)
  2899. STFD f3, 3 * SIZE(CO1)
  /* Reset all four accumulators from FZERO for the next tile. */
  2900. lfs f0, FZERO
  2901. fmr f1, f0
  2902. fmr f2, f0
  2903. fmr f3, f0
  2904. #ifndef LN
  /* Every mode except LN moves CO1 forward four elements after the store. */
  2905. addi CO1, CO1, 4 * SIZE
  2906. #endif
  2907. #ifdef RT
  /* RT: advance AORIG by K << (2 + BASE_SHIFT) bytes. */
  2908. slwi r0, K, 2 + BASE_SHIFT
  2909. add AORIG, AORIG, r0
  2910. #endif
  2911. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not consumed (four values per */
  /* step in AO, one per step in BO). */
  2912. sub TEMP, K, KK
  2913. slwi r0, TEMP, 2 + BASE_SHIFT
  2914. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2915. add AO, AO, r0
  2916. add BO, BO, TEMP
  2917. #endif
  /* KK moves by the tile height (4): down for LN, up for LT. */
  2918. #ifdef LN
  2919. subi KK, KK, 4
  2920. #endif
  2921. #ifdef LT
  2922. addi KK, KK, 4
  2923. #endif
  /* Next four-row tile while I stays positive. */
  2924. addic. I, I, -1
  2925. bgt+ LL(72)
  2926. .align 4
  /* LL(999): common exit. Sets the return value to zero, reloads the saved */
  /* floating-point and integer registers from the stack frame, releases the */
  /* frame and returns. */
  2927. LL(999):
  /* addi with a literal 0 as the base operand loads the immediate: r3 = 0. */
  2928. addi r3, 0, 0
  /* Reload f14..f31 from SP + 0 .. SP + 136, eight bytes apart. */
  2929. lfd f14, 0(SP)
  2930. lfd f15, 8(SP)
  2931. lfd f16, 16(SP)
  2932. lfd f17, 24(SP)
  2933. lfd f18, 32(SP)
  2934. lfd f19, 40(SP)
  2935. lfd f20, 48(SP)
  2936. lfd f21, 56(SP)
  2937. lfd f22, 64(SP)
  2938. lfd f23, 72(SP)
  2939. lfd f24, 80(SP)
  2940. lfd f25, 88(SP)
  2941. lfd f26, 96(SP)
  2942. lfd f27, 104(SP)
  2943. lfd f28, 112(SP)
  2944. lfd f29, 120(SP)
  2945. lfd f30, 128(SP)
  2946. lfd f31, 136(SP)
  2947. #ifdef __64BIT__
  /* 64-bit build: reload r31 down to r18 as doublewords starting at SP + 144. */
  2948. ld r31, 144(SP)
  2949. ld r30, 152(SP)
  2950. ld r29, 160(SP)
  2951. ld r28, 168(SP)
  2952. ld r27, 176(SP)
  2953. ld r26, 184(SP)
  2954. ld r25, 192(SP)
  2955. ld r24, 200(SP)
  2956. ld r23, 208(SP)
  2957. ld r22, 216(SP)
  2958. ld r21, 224(SP)
  2959. ld r20, 232(SP)
  2960. ld r19, 240(SP)
  2961. ld r18, 248(SP)
  2962. #else
  /* 32-bit build: reload r31 down to r18 as words starting at SP + 144. */
  2963. lwz r31, 144(SP)
  2964. lwz r30, 148(SP)
  2965. lwz r29, 152(SP)
  2966. lwz r28, 156(SP)
  2967. lwz r27, 160(SP)
  2968. lwz r26, 164(SP)
  2969. lwz r25, 168(SP)
  2970. lwz r24, 172(SP)
  2971. lwz r23, 176(SP)
  2972. lwz r22, 180(SP)
  2973. lwz r21, 184(SP)
  2974. lwz r20, 188(SP)
  2975. lwz r19, 192(SP)
  2976. lwz r18, 196(SP)
  2977. #endif
  /* Release the stack frame and return to the caller. */
  2978. addi SP, SP, STACKSIZE
  2979. blr
  /* EPILOGUE is a macro defined outside this view. */
  2980. EPILOGUE
  2981. #endif