You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT.S 63 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: word-size-dependent integer load mnemonic.               */
  /* Expands to lwz (32-bit load) unless __64BIT__ is defined, in    */
  /* which case it expands to ld (64-bit load).                      */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout.                                             */
  /* STACKSIZE is the number of bytes the prologue subtracts from SP */
  /* (addi SP, SP, -STACKSIZE).  The prologue stores f14-f31 at      */
  /* 0..136(SP) and r18-r31 from 144(SP) upward; ALPHA and FZERO are */
  /* SP-relative slots placed above that save area.                  */
  /* FZERO: the prologue stores a zero word here (stw r0, FZERO) and */
  /* the kernel reloads it with lfs to reset the FP accumulators.    */
  /* ALPHA: not referenced in the portion of the file reviewed here. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Incoming argument registers.                                    */
  /* M, N and K always arrive in r3-r5.  Each is compared against 0  */
  /* at entry and the kernel branches to LL(999) if any is <= 0.     */
  /* M feeds the I block counters (M >> 2, M & 2, M & 1), N selects  */
  /* the column blocks via J (N & 1, N & 2), and K sizes the inner   */
  /* FMADD loops together with KK.                                   */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* A and B are the operand panels read through AO/BO; C is the     */
  /* output written through CO1/CO2; LDC is scaled by BASE_SHIFT in  */
  /* the prologue and used as the byte step between CO1 and CO2.     */
  /* NOTE(review): the register assignment shifts between ABIs,      */
  /* presumably because of how the scalar alpha argument is passed   */
  /* under each one -- confirm against the target ABI documents.     */
  /* linux / FreeBSD:                                                */
  /*   32-bit: all five values are taken directly from r6-r10.       */
  /*   64-bit: A..LDC come from r7-r10; OFFSET is loaded by the      */
  /*           prologue from FRAMESLOT(0) + STACKSIZE(SP) into r6.   */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple:                                                    */
  /*   32-bit DOUBLE: A..C come from r8-r10; the prologue loads LDC  */
  /*           from FRAMESLOT(0) and OFFSET from FRAMESLOT(1) of the */
  /*           caller's frame into r7 and r6.                        */
  /*   otherwise: A..LDC come from r7-r10; the prologue loads OFFSET */
  /*           from FRAMESLOT(0) of the caller's frame into r6.      */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers.  All of r18-r31 are stored to the stack by   */
  /* the prologue (from 144(SP) upward) before they are used.        */
  /* AORIG : base of the current A panel in the LN/RT paths; AO is   */
  /*         recomputed from it for each block.                      */
  /* TEMP  : scratch (K - KK trip counts and byte offsets).          */
  /* KK    : running offset counter derived from OFFSET; how it is   */
  /*         initialised and stepped depends on LN/LT/RN/RT.         */
  /* I, J  : block counters / masks derived from M and N.            */
  /* AO, BO: running pointers into A and B inside the FMADD loops.   */
  /* CO1,CO2: output pointers into C (CO2 = CO1 + LDC).              */
  /* CO3,CO4: not used in the portion of the file reviewed here;     */
  /*         presumably further C column pointers -- TODO confirm.   */
  /* PREA, PREB, PREC: prefetch offsets.  PREC is the dcbt offset    */
  /*         applied to CO1/CO2 and PREB the DCBT offset applied to  */
  /*         BO; PREA is set in the prologue but not used in the     */
  /*         portion reviewed here.                                  */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define PREA r29
  99. #define PREB r30
  100. #define PREC r31
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. #ifndef PREFETCHTEST
  201. #if defined(TRSMKERNEL) && defined(LN)
  202. /* Direction is special */
  203. #ifdef PPC970
  204. li PREC, -4 * SIZE
  205. #endif
  206. #ifdef POWER4
  207. li PREC, -4 * SIZE
  208. #endif
  209. #ifdef POWER5
  210. li PREC, -4 * SIZE
  211. #endif
  212. #else
  213. /* Normal prefetch */
  214. #ifdef PPC970
  215. li PREC, 4 * SIZE
  216. #endif
  217. #ifdef POWER4
  218. li PREC, 4 * SIZE /* is 12 best? */
  219. #endif
  220. #ifdef POWER5
  221. li PREC, 3 * SIZE
  222. #endif
  223. #endif
  224. #else
  225. #if defined(linux) || defined(__FreeBSD__)
  226. #ifndef __64BIT__
  227. mr PREA, r10
  228. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  229. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  230. #else
  231. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  232. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  233. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  234. #endif
  235. #endif
  236. #if defined(_AIX) || defined(__APPLE__)
  237. #ifdef __64BIT__
  238. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  239. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  240. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  241. #else
  242. #ifdef DOUBLE
  243. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  244. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  245. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  246. #else
  247. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  248. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  249. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  250. #endif
  251. #endif
  252. #endif
  253. #endif
  254. #ifndef PREFETCHTEST
  255. #ifdef PPC970
  256. #ifdef ALLOC_HUGETLB
  257. li PREA, (16 * 5 * SIZE | 1)
  258. li PREB, (16 * 5 * SIZE | 3)
  259. #else
  260. li PREA, (16 * 14 * SIZE | 1)
  261. li PREB, (16 * 8 * SIZE | 3)
  262. #endif
  263. #endif
  264. #ifdef POWER4
  265. #ifdef ALLOC_HUGETLB
  266. li PREA, (16 * 1 * SIZE + 16)
  267. li PREB, (16 * 1 * SIZE + 16)
  268. #else
  269. li PREA, (16 * 2 * SIZE + 16)
  270. li PREB, (16 * 2 * SIZE + 16)
  271. #endif
  272. #endif
  273. #ifdef POWER5
  274. #ifdef ALLOC_HUGETLB
  275. li PREA, (16 * 7 * SIZE | 1)
  276. li PREB, (16 * 7 * SIZE | 3)
  277. #else
  278. li PREA, (16 * 12 * SIZE | 1)
  279. li PREB, (16 * 6 * SIZE | 3)
  280. #endif
  281. #endif
  282. #endif
  283. lfs f0, FZERO
  284. LL(70):
  285. andi. J, N, 1
  286. ble LL(40)
  287. #ifdef RT
  288. slwi r0, K, 0 + BASE_SHIFT
  289. sub B, B, r0
  290. sub C, C, LDC
  291. #endif
  292. mr CO1, C
  293. #ifdef LN
  294. add KK, M, OFFSET
  295. #endif
  296. #ifdef LT
  297. mr KK, OFFSET
  298. #endif
  299. fmr f1, f0
  300. fmr f2, f0
  301. fmr f3, f0
  302. srawi. I, M, 2
  303. #if defined(LN) || defined(RT)
  304. mr AORIG, A
  305. #else
  306. mr AO, A
  307. #endif
  308. #ifndef RT
  309. add C, CO1, LDC
  310. #endif
  311. ble LL(80)
  312. .align 4
  313. LL(71):
  314. #if defined(LT) || defined(RN)
  315. LFD f16, 0 * SIZE(AO)
  316. LFD f17, 1 * SIZE(AO)
  317. LFD f18, 2 * SIZE(AO)
  318. LFD f19, 3 * SIZE(AO)
  319. LFD f20, 0 * SIZE(B)
  320. LFD f21, 1 * SIZE(B)
  321. LFD f22, 2 * SIZE(B)
  322. LFD f23, 3 * SIZE(B)
  323. dcbt CO1, PREC
  324. srawi. r0, KK, 2
  325. mtspr CTR, r0
  326. mr BO, B
  327. #else
  328. #ifdef LN
  329. slwi r0, K, 2 + BASE_SHIFT
  330. sub AORIG, AORIG, r0
  331. #endif
  332. slwi r0, KK, 2 + BASE_SHIFT
  333. slwi TEMP, KK, 0 + BASE_SHIFT
  334. add AO, AORIG, r0
  335. add BO, B, TEMP
  336. sub TEMP, K, KK
  337. LFD f16, 0 * SIZE(AO)
  338. LFD f17, 1 * SIZE(AO)
  339. LFD f18, 2 * SIZE(AO)
  340. LFD f19, 3 * SIZE(AO)
  341. LFD f20, 0 * SIZE(BO)
  342. LFD f21, 1 * SIZE(BO)
  343. LFD f22, 2 * SIZE(BO)
  344. LFD f23, 3 * SIZE(BO)
  345. dcbt CO1, PREC
  346. srawi. r0, TEMP, 2
  347. mtspr CTR, r0
  348. #endif
  349. ble LL(75)
  350. .align 5
  351. LL(72):
  352. FMADD f0, f16, f20, f0
  353. FMADD f1, f17, f20, f1
  354. FMADD f2, f18, f20, f2
  355. FMADD f3, f19, f20, f3
  356. LFD f16, 4 * SIZE(AO)
  357. LFD f17, 5 * SIZE(AO)
  358. LFD f18, 6 * SIZE(AO)
  359. LFD f19, 7 * SIZE(AO)
  360. FMADD f0, f16, f21, f0
  361. FMADD f1, f17, f21, f1
  362. FMADD f2, f18, f21, f2
  363. FMADD f3, f19, f21, f3
  364. LFD f16, 8 * SIZE(AO)
  365. LFD f17, 9 * SIZE(AO)
  366. LFD f18, 10 * SIZE(AO)
  367. LFD f19, 11 * SIZE(AO)
  368. FMADD f0, f16, f22, f0
  369. FMADD f1, f17, f22, f1
  370. FMADD f2, f18, f22, f2
  371. FMADD f3, f19, f22, f3
  372. LFD f16, 12 * SIZE(AO)
  373. LFD f17, 13 * SIZE(AO)
  374. LFD f18, 14 * SIZE(AO)
  375. LFD f19, 15 * SIZE(AO)
  376. FMADD f0, f16, f23, f0
  377. FMADD f1, f17, f23, f1
  378. FMADD f2, f18, f23, f2
  379. FMADD f3, f19, f23, f3
  380. LFD f16, 16 * SIZE(AO)
  381. LFD f17, 17 * SIZE(AO)
  382. LFD f18, 18 * SIZE(AO)
  383. LFD f19, 19 * SIZE(AO)
  384. LFD f20, 4 * SIZE(BO)
  385. LFD f21, 5 * SIZE(BO)
  386. LFD f22, 6 * SIZE(BO)
  387. LFD f23, 7 * SIZE(BO)
  388. addi AO, AO, 16 * SIZE
  389. addi BO, BO, 4 * SIZE
  390. DCBT(BO, PREB)
  391. bdnz LL(72)
  392. .align 4
  393. LL(75):
  394. #if defined(LT) || defined(RN)
  395. andi. r0, KK, 3
  396. #else
  397. andi. r0, TEMP, 3
  398. #endif
  399. mtspr CTR, r0
  400. ble+ LL(78)
  401. .align 4
  402. LL(76):
  403. FMADD f0, f16, f20, f0
  404. FMADD f1, f17, f20, f1
  405. FMADD f2, f18, f20, f2
  406. FMADD f3, f19, f20, f3
  407. LFD f16, 4 * SIZE(AO)
  408. LFD f17, 5 * SIZE(AO)
  409. LFD f18, 6 * SIZE(AO)
  410. LFD f19, 7 * SIZE(AO)
  411. LFD f20, 1 * SIZE(BO)
  412. addi BO, BO, 1 * SIZE
  413. addi AO, AO, 4 * SIZE
  414. bdnz LL(76)
  415. .align 4
  416. LL(78):
  417. #if defined(LN) || defined(RT)
  418. #ifdef LN
  419. subi r0, KK, 4
  420. #else
  421. subi r0, KK, 1
  422. #endif
  423. slwi TEMP, r0, 2 + BASE_SHIFT
  424. slwi r0, r0, 0 + BASE_SHIFT
  425. add AO, AORIG, TEMP
  426. add BO, B, r0
  427. #endif
  428. #if defined(LN) || defined(LT)
  429. LFD f16, 0 * SIZE(BO)
  430. LFD f20, 1 * SIZE(BO)
  431. LFD f24, 2 * SIZE(BO)
  432. LFD f28, 3 * SIZE(BO)
  433. FSUB f0, f16, f0
  434. FSUB f1, f20, f1
  435. FSUB f2, f24, f2
  436. FSUB f3, f28, f3
  437. #else
  438. LFD f16, 0 * SIZE(AO)
  439. LFD f17, 1 * SIZE(AO)
  440. LFD f18, 2 * SIZE(AO)
  441. LFD f19, 3 * SIZE(AO)
  442. FSUB f0, f16, f0
  443. FSUB f1, f17, f1
  444. FSUB f2, f18, f2
  445. FSUB f3, f19, f3
  446. #endif
  447. #ifdef LN
  448. LFD f16, 15 * SIZE(AO)
  449. LFD f17, 14 * SIZE(AO)
  450. LFD f18, 13 * SIZE(AO)
  451. LFD f19, 12 * SIZE(AO)
  452. FMUL f3, f16, f3
  453. FNMSUB f2, f17, f3, f2
  454. FNMSUB f1, f18, f3, f1
  455. FNMSUB f0, f19, f3, f0
  456. LFD f16, 10 * SIZE(AO)
  457. LFD f17, 9 * SIZE(AO)
  458. LFD f18, 8 * SIZE(AO)
  459. LFD f19, 5 * SIZE(AO)
  460. LFD f20, 4 * SIZE(AO)
  461. LFD f21, 0 * SIZE(AO)
  462. FMUL f2, f16, f2
  463. FNMSUB f1, f17, f2, f1
  464. FNMSUB f0, f18, f2, f0
  465. FMUL f1, f19, f1
  466. FNMSUB f0, f20, f1, f0
  467. FMUL f0, f21, f0
  468. #endif
  469. #ifdef LT
  470. LFD f16, 0 * SIZE(AO)
  471. LFD f17, 1 * SIZE(AO)
  472. LFD f18, 2 * SIZE(AO)
  473. LFD f19, 3 * SIZE(AO)
  474. FMUL f0, f16, f0
  475. FNMSUB f1, f17, f0, f1
  476. FNMSUB f2, f18, f0, f2
  477. FNMSUB f3, f19, f0, f3
  478. LFD f17, 5 * SIZE(AO)
  479. LFD f18, 6 * SIZE(AO)
  480. LFD f19, 7 * SIZE(AO)
  481. FMUL f1, f17, f1
  482. FNMSUB f2, f18, f1, f2
  483. FNMSUB f3, f19, f1, f3
  484. LFD f18, 10 * SIZE(AO)
  485. LFD f19, 11 * SIZE(AO)
  486. FMUL f2, f18, f2
  487. FNMSUB f3, f19, f2, f3
  488. LFD f19, 15 * SIZE(AO)
  489. FMUL f3, f19, f3
  490. #endif
  491. #ifdef RN
  492. LFD f16, 0 * SIZE(BO)
  493. FMUL f0, f16, f0
  494. FMUL f1, f16, f1
  495. FMUL f2, f16, f2
  496. FMUL f3, f16, f3
  497. #endif
  498. #ifdef RT
  499. LFD f21, 0 * SIZE(BO)
  500. FMUL f0, f21, f0
  501. FMUL f1, f21, f1
  502. FMUL f2, f21, f2
  503. FMUL f3, f21, f3
  504. #endif
  505. #ifdef LN
  506. subi CO1, CO1, 4 * SIZE
  507. #endif
  508. #if defined(LN) || defined(LT)
  509. STFD f0, 0 * SIZE(BO)
  510. STFD f1, 1 * SIZE(BO)
  511. STFD f2, 2 * SIZE(BO)
  512. STFD f3, 3 * SIZE(BO)
  513. #else
  514. STFD f0, 0 * SIZE(AO)
  515. STFD f1, 1 * SIZE(AO)
  516. STFD f2, 2 * SIZE(AO)
  517. STFD f3, 3 * SIZE(AO)
  518. #endif
  519. STFD f0, 0 * SIZE(CO1)
  520. STFD f1, 1 * SIZE(CO1)
  521. STFD f2, 2 * SIZE(CO1)
  522. STFD f3, 3 * SIZE(CO1)
  523. lfs f0, FZERO
  524. fmr f1, f0
  525. fmr f2, f0
  526. fmr f3, f0
  527. #ifndef LN
  528. addi CO1, CO1, 4 * SIZE
  529. #endif
  530. #ifdef RT
  531. slwi r0, K, 2 + BASE_SHIFT
  532. add AORIG, AORIG, r0
  533. #endif
  534. #if defined(LT) || defined(RN)
  535. sub TEMP, K, KK
  536. slwi r0, TEMP, 2 + BASE_SHIFT
  537. slwi TEMP, TEMP, 0 + BASE_SHIFT
  538. add AO, AO, r0
  539. add BO, BO, TEMP
  540. #endif
  541. #ifdef LN
  542. subi KK, KK, 4
  543. #endif
  544. #ifdef LT
  545. addi KK, KK, 4
  546. #endif
  547. addic. I, I, -1
  548. bgt+ LL(71)
  549. .align 4
  550. LL(80):
  551. andi. I, M, 2
  552. ble LL(90)
  553. #if defined(LT) || defined(RN)
  554. LFD f16, 0 * SIZE(AO)
  555. LFD f17, 1 * SIZE(AO)
  556. LFD f18, 2 * SIZE(AO)
  557. LFD f19, 3 * SIZE(AO)
  558. LFD f20, 0 * SIZE(B)
  559. LFD f21, 1 * SIZE(B)
  560. LFD f22, 2 * SIZE(B)
  561. LFD f23, 3 * SIZE(B)
  562. srawi. r0, KK, 2
  563. mtspr CTR, r0
  564. mr BO, B
  565. #else
  566. #ifdef LN
  567. slwi r0, K, 1 + BASE_SHIFT
  568. sub AORIG, AORIG, r0
  569. #endif
  570. slwi r0, KK, 1 + BASE_SHIFT
  571. slwi TEMP, KK, 0 + BASE_SHIFT
  572. add AO, AORIG, r0
  573. add BO, B, TEMP
  574. sub TEMP, K, KK
  575. LFD f16, 0 * SIZE(AO)
  576. LFD f17, 1 * SIZE(AO)
  577. LFD f18, 2 * SIZE(AO)
  578. LFD f19, 3 * SIZE(AO)
  579. LFD f20, 0 * SIZE(BO)
  580. LFD f21, 1 * SIZE(BO)
  581. LFD f22, 2 * SIZE(BO)
  582. LFD f23, 3 * SIZE(BO)
  583. srawi. r0, TEMP, 2
  584. mtspr CTR, r0
  585. #endif
  586. ble LL(85)
  587. .align 5
  588. LL(82):
  589. FMADD f0, f16, f20, f0
  590. FMADD f1, f17, f20, f1
  591. FMADD f2, f18, f21, f2
  592. FMADD f3, f19, f21, f3
  593. LFD f16, 4 * SIZE(AO)
  594. LFD f17, 5 * SIZE(AO)
  595. LFD f18, 6 * SIZE(AO)
  596. LFD f19, 7 * SIZE(AO)
  597. FMADD f0, f16, f22, f0
  598. FMADD f1, f17, f22, f1
  599. FMADD f2, f18, f23, f2
  600. FMADD f3, f19, f23, f3
  601. LFD f16, 8 * SIZE(AO)
  602. LFD f17, 9 * SIZE(AO)
  603. LFD f18, 10 * SIZE(AO)
  604. LFD f19, 11 * SIZE(AO)
  605. LFD f20, 4 * SIZE(BO)
  606. LFD f21, 5 * SIZE(BO)
  607. LFD f22, 6 * SIZE(BO)
  608. LFD f23, 7 * SIZE(BO)
  609. addi AO, AO, 8 * SIZE
  610. addi BO, BO, 4 * SIZE
  611. DCBT(BO, PREB)
  612. bdnz LL(82)
  613. .align 4
  614. LL(85):
  615. #if defined(LT) || defined(RN)
  616. andi. r0, KK, 3
  617. #else
  618. andi. r0, TEMP, 3
  619. #endif
  620. mtspr CTR, r0
  621. ble+ LL(88)
  622. .align 4
  623. LL(86):
  624. FMADD f0, f16, f20, f0
  625. FMADD f1, f17, f20, f1
  626. LFD f16, 2 * SIZE(AO)
  627. LFD f17, 3 * SIZE(AO)
  628. LFD f20, 1 * SIZE(BO)
  629. addi BO, BO, 1 * SIZE
  630. addi AO, AO, 2 * SIZE
  631. bdnz LL(86)
  632. .align 4
  633. LL(88):
  634. FADD f0, f2, f0
  635. FADD f1, f3, f1
  636. #if defined(LN) || defined(RT)
  637. #ifdef LN
  638. subi r0, KK, 2
  639. #else
  640. subi r0, KK, 1
  641. #endif
  642. slwi TEMP, r0, 1 + BASE_SHIFT
  643. slwi r0, r0, 0 + BASE_SHIFT
  644. add AO, AORIG, TEMP
  645. add BO, B, r0
  646. #endif
  647. #if defined(LN) || defined(LT)
  648. LFD f16, 0 * SIZE(BO)
  649. LFD f20, 1 * SIZE(BO)
  650. FSUB f0, f16, f0
  651. FSUB f1, f20, f1
  652. #else
  653. LFD f16, 0 * SIZE(AO)
  654. LFD f17, 1 * SIZE(AO)
  655. FSUB f0, f16, f0
  656. FSUB f1, f17, f1
  657. #endif
  658. #ifdef LN
  659. LFD f19, 3 * SIZE(AO)
  660. LFD f20, 2 * SIZE(AO)
  661. LFD f21, 0 * SIZE(AO)
  662. FMUL f1, f19, f1
  663. FNMSUB f0, f20, f1, f0
  664. FMUL f0, f21, f0
  665. #endif
  666. #ifdef LT
  667. LFD f16, 0 * SIZE(AO)
  668. LFD f17, 1 * SIZE(AO)
  669. FMUL f0, f16, f0
  670. FNMSUB f1, f17, f0, f1
  671. LFD f17, 3 * SIZE(AO)
  672. FMUL f1, f17, f1
  673. #endif
  674. #ifdef RN
  675. LFD f16, 0 * SIZE(BO)
  676. FMUL f0, f16, f0
  677. FMUL f1, f16, f1
  678. #endif
  679. #ifdef RT
  680. LFD f21, 0 * SIZE(BO)
  681. FMUL f0, f21, f0
  682. FMUL f1, f21, f1
  683. #endif
  684. #ifdef LN
  685. subi CO1, CO1, 2 * SIZE
  686. #endif
  687. #if defined(LN) || defined(LT)
  688. STFD f0, 0 * SIZE(BO)
  689. STFD f1, 1 * SIZE(BO)
  690. #else
  691. STFD f0, 0 * SIZE(AO)
  692. STFD f1, 1 * SIZE(AO)
  693. #endif
  694. STFD f0, 0 * SIZE(CO1)
  695. STFD f1, 1 * SIZE(CO1)
  696. lfs f0, FZERO
  697. fmr f1, f0
  698. fmr f2, f0
  699. fmr f3, f0
  700. #ifndef LN
  701. addi CO1, CO1, 2 * SIZE
  702. #endif
  703. #ifdef RT
  704. slwi r0, K, 1 + BASE_SHIFT
  705. add AORIG, AORIG, r0
  706. #endif
  707. #if defined(LT) || defined(RN)
  708. sub TEMP, K, KK
  709. slwi r0, TEMP, 1 + BASE_SHIFT
  710. slwi TEMP, TEMP, 0 + BASE_SHIFT
  711. add AO, AO, r0
  712. add BO, BO, TEMP
  713. #endif
  714. #ifdef LN
  715. subi KK, KK, 2
  716. #endif
  717. #ifdef LT
  718. addi KK, KK, 2
  719. #endif
  720. .align 4
  721. LL(90):
  722. andi. I, M, 1
  723. ble LL(99)
  724. #if defined(LT) || defined(RN)
  725. LFD f16, 0 * SIZE(AO)
  726. LFD f17, 1 * SIZE(AO)
  727. LFD f18, 2 * SIZE(AO)
  728. LFD f19, 3 * SIZE(AO)
  729. LFD f20, 0 * SIZE(B)
  730. LFD f21, 1 * SIZE(B)
  731. LFD f22, 2 * SIZE(B)
  732. LFD f23, 3 * SIZE(B)
  733. srawi. r0, KK, 3
  734. mtspr CTR, r0
  735. mr BO, B
  736. #else
  737. #ifdef LN
  738. slwi r0, K, BASE_SHIFT
  739. sub AORIG, AORIG, r0
  740. #endif
  741. slwi r0, KK, 0 + BASE_SHIFT
  742. slwi TEMP, KK, 0 + BASE_SHIFT
  743. add AO, AORIG, r0
  744. add BO, B, TEMP
  745. sub TEMP, K, KK
  746. LFD f16, 0 * SIZE(AO)
  747. LFD f17, 1 * SIZE(AO)
  748. LFD f18, 2 * SIZE(AO)
  749. LFD f19, 3 * SIZE(AO)
  750. LFD f20, 0 * SIZE(BO)
  751. LFD f21, 1 * SIZE(BO)
  752. LFD f22, 2 * SIZE(BO)
  753. LFD f23, 3 * SIZE(BO)
  754. srawi. r0, TEMP, 3
  755. mtspr CTR, r0
  756. #endif
  757. ble LL(95)
  758. .align 5
  759. LL(92):
  760. FMADD f0, f16, f20, f0
  761. FMADD f1, f17, f21, f1
  762. FMADD f2, f18, f22, f2
  763. FMADD f3, f19, f23, f3
  764. LFD f16, 4 * SIZE(AO)
  765. LFD f17, 5 * SIZE(AO)
  766. LFD f18, 6 * SIZE(AO)
  767. LFD f19, 7 * SIZE(AO)
  768. LFD f20, 4 * SIZE(BO)
  769. LFD f21, 5 * SIZE(BO)
  770. LFD f22, 6 * SIZE(BO)
  771. LFD f23, 7 * SIZE(BO)
  772. FMADD f0, f16, f20, f0
  773. FMADD f1, f17, f21, f1
  774. FMADD f2, f18, f22, f2
  775. FMADD f3, f19, f23, f3
  776. LFD f16, 8 * SIZE(AO)
  777. LFD f17, 9 * SIZE(AO)
  778. LFD f18, 10 * SIZE(AO)
  779. LFD f19, 11 * SIZE(AO)
  780. LFD f20, 8 * SIZE(BO)
  781. LFD f21, 9 * SIZE(BO)
  782. LFD f22, 10 * SIZE(BO)
  783. LFD f23, 11 * SIZE(BO)
  784. addi AO, AO, 8 * SIZE
  785. addi BO, BO, 8 * SIZE
  786. bdnz LL(92)
  787. .align 4
  788. LL(95):
  789. #if defined(LT) || defined(RN)
  790. andi. r0, KK, 7
  791. #else
  792. andi. r0, TEMP, 7
  793. #endif
  794. mtspr CTR, r0
  795. ble+ LL(98)
  796. .align 4
  797. LL(96):
  798. FMADD f0, f16, f20, f0
  799. LFD f16, 1 * SIZE(AO)
  800. LFD f20, 1 * SIZE(BO)
  801. addi BO, BO, 1 * SIZE
  802. addi AO, AO, 1 * SIZE
  803. bdnz LL(96)
  804. .align 4
  805. LL(98):
  806. FADD f0, f1, f0
  807. FADD f2, f3, f2
  808. FADD f0, f2, f0
  809. #if defined(LN) || defined(RT)
  810. #ifdef LN
  811. subi r0, KK, 1
  812. #else
  813. subi r0, KK, 1
  814. #endif
  815. slwi TEMP, r0, 0 + BASE_SHIFT
  816. slwi r0, r0, 0 + BASE_SHIFT
  817. add AO, AORIG, TEMP
  818. add BO, B, r0
  819. #endif
  820. #if defined(LN) || defined(LT)
  821. LFD f16, 0 * SIZE(BO)
  822. FSUB f0, f16, f0
  823. #else
  824. LFD f16, 0 * SIZE(AO)
  825. FSUB f0, f16, f0
  826. #endif
  827. #ifdef LN
  828. LFD f21, 0 * SIZE(AO)
  829. FMUL f0, f21, f0
  830. #endif
  831. #ifdef LT
  832. LFD f16, 0 * SIZE(AO)
  833. FMUL f0, f16, f0
  834. #endif
  835. #ifdef RN
  836. LFD f16, 0 * SIZE(BO)
  837. FMUL f0, f16, f0
  838. #endif
  839. #ifdef RT
  840. LFD f21, 0 * SIZE(BO)
  841. FMUL f0, f21, f0
  842. #endif
  843. #ifdef LN
  844. subi CO1, CO1, 1 * SIZE
  845. #endif
  846. #if defined(LN) || defined(LT)
  847. STFD f0, 0 * SIZE(BO)
  848. #else
  849. STFD f0, 0 * SIZE(AO)
  850. #endif
  851. STFD f0, 0 * SIZE(CO1)
  852. lfs f0, FZERO
  853. #ifndef LN
  854. addi CO1, CO1, 1 * SIZE
  855. #endif
  856. #ifdef RT
  857. slwi r0, K, 0 + BASE_SHIFT
  858. add AORIG, AORIG, r0
  859. #endif
  860. #if defined(LT) || defined(RN)
  861. sub TEMP, K, KK
  862. slwi r0, TEMP, 0 + BASE_SHIFT
  863. slwi TEMP, TEMP, 0 + BASE_SHIFT
  864. add AO, AO, r0
  865. add BO, BO, TEMP
  866. #endif
  867. #ifdef LN
  868. subi KK, KK, 1
  869. #endif
  870. #ifdef LT
  871. addi KK, KK, 1
  872. #endif
  873. .align 4
  874. LL(99):
  875. #ifdef LN
  876. slwi r0, K, 0 + BASE_SHIFT
  877. add B, B, r0
  878. #endif
  879. #if defined(LT) || defined(RN)
  880. mr B, BO
  881. #endif
  882. #ifdef RN
  883. addi KK, KK, 1
  884. #endif
  885. #ifdef RT
  886. subi KK, KK, 1
  887. #endif
  888. .align 4
  889. LL(40):
  890. andi. J, N, 2
  891. ble LL(09)
  892. #ifdef RT
  893. slwi r0, K, 1 + BASE_SHIFT
  894. sub B, B, r0
  895. slwi r0, LDC, 1
  896. sub C, C, r0
  897. #endif
  898. mr CO1, C
  899. add CO2, C, LDC
  900. #ifdef LN
  901. add KK, M, OFFSET
  902. #endif
  903. #ifdef LT
  904. mr KK, OFFSET
  905. #endif
  906. fmr f1, f0
  907. fmr f2, f0
  908. fmr f3, f0
  909. fmr f4, f0
  910. fmr f5, f0
  911. fmr f6, f0
  912. fmr f7, f0
  913. srawi. I, M, 2
  914. #if defined(LN) || defined(RT)
  915. mr AORIG, A
  916. #else
  917. mr AO, A
  918. #endif
  919. #ifndef RT
  920. add C, CO2, LDC
  921. #endif
  922. ble LL(50)
  923. .align 4
  924. LL(41):
  925. #if defined(LT) || defined(RN)
  926. LFD f16, 0 * SIZE(AO)
  927. LFD f17, 1 * SIZE(AO)
  928. LFD f18, 2 * SIZE(AO)
  929. LFD f19, 3 * SIZE(AO)
  930. LFD f20, 0 * SIZE(B)
  931. LFD f21, 1 * SIZE(B)
  932. LFD f22, 2 * SIZE(B)
  933. LFD f23, 3 * SIZE(B)
  934. dcbt CO1, PREC
  935. dcbt CO2, PREC
  936. srawi. r0, KK, 2
  937. mtspr CTR, r0
  938. mr BO, B
  939. #else
  940. #ifdef LN
  941. slwi r0, K, 2 + BASE_SHIFT
  942. sub AORIG, AORIG, r0
  943. #endif
  944. slwi r0, KK, 2 + BASE_SHIFT
  945. slwi TEMP, KK, 1 + BASE_SHIFT
  946. add AO, AORIG, r0
  947. add BO, B, TEMP
  948. sub TEMP, K, KK
  949. LFD f16, 0 * SIZE(AO)
  950. LFD f17, 1 * SIZE(AO)
  951. LFD f18, 2 * SIZE(AO)
  952. LFD f19, 3 * SIZE(AO)
  953. LFD f20, 0 * SIZE(BO)
  954. LFD f21, 1 * SIZE(BO)
  955. LFD f22, 2 * SIZE(BO)
  956. LFD f23, 3 * SIZE(BO)
  957. dcbt CO1, PREC
  958. dcbt CO2, PREC
  959. srawi. r0, TEMP, 2
  960. mtspr CTR, r0
  961. #endif
  962. ble LL(45)
  963. .align 5
  964. LL(42):
  965. FMADD f0, f16, f20, f0
  966. FMADD f1, f17, f20, f1
  967. FMADD f2, f18, f20, f2
  968. FMADD f3, f19, f20, f3
  969. FMADD f4, f16, f21, f4
  970. FMADD f5, f17, f21, f5
  971. FMADD f6, f18, f21, f6
  972. FMADD f7, f19, f21, f7
  973. LFD f16, 4 * SIZE(AO)
  974. LFD f17, 5 * SIZE(AO)
  975. LFD f18, 6 * SIZE(AO)
  976. LFD f19, 7 * SIZE(AO)
  977. FMADD f0, f16, f22, f0
  978. FMADD f1, f17, f22, f1
  979. FMADD f2, f18, f22, f2
  980. FMADD f3, f19, f22, f3
  981. FMADD f4, f16, f23, f4
  982. FMADD f5, f17, f23, f5
  983. FMADD f6, f18, f23, f6
  984. FMADD f7, f19, f23, f7
  985. LFD f16, 8 * SIZE(AO)
  986. LFD f17, 9 * SIZE(AO)
  987. LFD f18, 10 * SIZE(AO)
  988. LFD f19, 11 * SIZE(AO)
  989. LFD f20, 4 * SIZE(BO)
  990. LFD f21, 5 * SIZE(BO)
  991. LFD f22, 6 * SIZE(BO)
  992. LFD f23, 7 * SIZE(BO)
  993. FMADD f0, f16, f20, f0
  994. FMADD f1, f17, f20, f1
  995. FMADD f2, f18, f20, f2
  996. FMADD f3, f19, f20, f3
  997. FMADD f4, f16, f21, f4
  998. FMADD f5, f17, f21, f5
  999. FMADD f6, f18, f21, f6
  1000. FMADD f7, f19, f21, f7
  1001. LFD f16, 12 * SIZE(AO)
  1002. LFD f17, 13 * SIZE(AO)
  1003. LFD f18, 14 * SIZE(AO)
  1004. LFD f19, 15 * SIZE(AO)
  1005. FMADD f0, f16, f22, f0
  1006. FMADD f1, f17, f22, f1
  1007. FMADD f2, f18, f22, f2
  1008. FMADD f3, f19, f22, f3
  1009. FMADD f4, f16, f23, f4
  1010. FMADD f5, f17, f23, f5
  1011. FMADD f6, f18, f23, f6
  1012. FMADD f7, f19, f23, f7
  1013. LFD f16, 16 * SIZE(AO)
  1014. LFD f17, 17 * SIZE(AO)
  1015. LFD f18, 18 * SIZE(AO)
  1016. LFD f19, 19 * SIZE(AO)
  1017. LFD f20, 8 * SIZE(BO)
  1018. LFD f21, 9 * SIZE(BO)
  1019. LFD f22, 10 * SIZE(BO)
  1020. LFD f23, 11 * SIZE(BO)
  1021. addi AO, AO, 16 * SIZE
  1022. addi BO, BO, 8 * SIZE
  1023. DCBT(BO, PREB)
  1024. bdnz LL(42)
  1025. .align 4
  1026. LL(45):
  1027. #if defined(LT) || defined(RN)
  1028. andi. r0, KK, 3
  1029. #else
  1030. andi. r0, TEMP, 3
  1031. #endif
  1032. mtspr CTR, r0
  1033. ble+ LL(48)
  1034. .align 4
  1035. LL(46):
  1036. FMADD f0, f16, f20, f0
  1037. FMADD f1, f17, f20, f1
  1038. FMADD f2, f18, f20, f2
  1039. FMADD f3, f19, f20, f3
  1040. FMADD f4, f16, f21, f4
  1041. FMADD f5, f17, f21, f5
  1042. FMADD f6, f18, f21, f6
  1043. FMADD f7, f19, f21, f7
  1044. LFD f16, 4 * SIZE(AO)
  1045. LFD f17, 5 * SIZE(AO)
  1046. LFD f18, 6 * SIZE(AO)
  1047. LFD f19, 7 * SIZE(AO)
  1048. LFD f20, 2 * SIZE(BO)
  1049. LFD f21, 3 * SIZE(BO)
  1050. addi BO, BO, 2 * SIZE
  1051. addi AO, AO, 4 * SIZE
  1052. bdnz LL(46)
  1053. .align 4
  1054. LL(48):
  1055. #if defined(LN) || defined(RT)
  1056. #ifdef LN
  1057. subi r0, KK, 4
  1058. #else
  1059. subi r0, KK, 2
  1060. #endif
  1061. slwi TEMP, r0, 2 + BASE_SHIFT
  1062. slwi r0, r0, 1 + BASE_SHIFT
  1063. add AO, AORIG, TEMP
  1064. add BO, B, r0
  1065. #endif
  1066. #if defined(LN) || defined(LT)
  1067. LFD f16, 0 * SIZE(BO)
  1068. LFD f17, 1 * SIZE(BO)
  1069. LFD f20, 2 * SIZE(BO)
  1070. LFD f21, 3 * SIZE(BO)
  1071. LFD f24, 4 * SIZE(BO)
  1072. LFD f25, 5 * SIZE(BO)
  1073. LFD f28, 6 * SIZE(BO)
  1074. LFD f29, 7 * SIZE(BO)
  1075. FSUB f0, f16, f0
  1076. FSUB f4, f17, f4
  1077. FSUB f1, f20, f1
  1078. FSUB f5, f21, f5
  1079. FSUB f2, f24, f2
  1080. FSUB f6, f25, f6
  1081. FSUB f3, f28, f3
  1082. FSUB f7, f29, f7
  1083. #else
  1084. LFD f16, 0 * SIZE(AO)
  1085. LFD f17, 1 * SIZE(AO)
  1086. LFD f18, 2 * SIZE(AO)
  1087. LFD f19, 3 * SIZE(AO)
  1088. LFD f20, 4 * SIZE(AO)
  1089. LFD f21, 5 * SIZE(AO)
  1090. LFD f22, 6 * SIZE(AO)
  1091. LFD f23, 7 * SIZE(AO)
  1092. FSUB f0, f16, f0
  1093. FSUB f1, f17, f1
  1094. FSUB f2, f18, f2
  1095. FSUB f3, f19, f3
  1096. FSUB f4, f20, f4
  1097. FSUB f5, f21, f5
  1098. FSUB f6, f22, f6
  1099. FSUB f7, f23, f7
  1100. #endif
  1101. #ifdef LN
  1102. LFD f16, 15 * SIZE(AO)
  1103. LFD f17, 14 * SIZE(AO)
  1104. LFD f18, 13 * SIZE(AO)
  1105. LFD f19, 12 * SIZE(AO)
  1106. FMUL f3, f16, f3
  1107. FMUL f7, f16, f7
  1108. FNMSUB f2, f17, f3, f2
  1109. FNMSUB f6, f17, f7, f6
  1110. FNMSUB f1, f18, f3, f1
  1111. FNMSUB f5, f18, f7, f5
  1112. FNMSUB f0, f19, f3, f0
  1113. FNMSUB f4, f19, f7, f4
  1114. LFD f16, 10 * SIZE(AO)
  1115. LFD f17, 9 * SIZE(AO)
  1116. LFD f18, 8 * SIZE(AO)
  1117. LFD f19, 5 * SIZE(AO)
  1118. LFD f20, 4 * SIZE(AO)
  1119. LFD f21, 0 * SIZE(AO)
  1120. FMUL f2, f16, f2
  1121. FMUL f6, f16, f6
  1122. FNMSUB f1, f17, f2, f1
  1123. FNMSUB f5, f17, f6, f5
  1124. FNMSUB f0, f18, f2, f0
  1125. FNMSUB f4, f18, f6, f4
  1126. FMUL f1, f19, f1
  1127. FMUL f5, f19, f5
  1128. FNMSUB f0, f20, f1, f0
  1129. FNMSUB f4, f20, f5, f4
  1130. FMUL f0, f21, f0
  1131. FMUL f4, f21, f4
  1132. #endif
  1133. #ifdef LT
  1134. LFD f16, 0 * SIZE(AO)
  1135. LFD f17, 1 * SIZE(AO)
  1136. LFD f18, 2 * SIZE(AO)
  1137. LFD f19, 3 * SIZE(AO)
  1138. FMUL f0, f16, f0
  1139. FMUL f4, f16, f4
  1140. FNMSUB f1, f17, f0, f1
  1141. FNMSUB f5, f17, f4, f5
  1142. FNMSUB f2, f18, f0, f2
  1143. FNMSUB f6, f18, f4, f6
  1144. FNMSUB f3, f19, f0, f3
  1145. FNMSUB f7, f19, f4, f7
  1146. LFD f17, 5 * SIZE(AO)
  1147. LFD f18, 6 * SIZE(AO)
  1148. LFD f19, 7 * SIZE(AO)
  1149. FMUL f1, f17, f1
  1150. FMUL f5, f17, f5
  1151. FNMSUB f2, f18, f1, f2
  1152. FNMSUB f6, f18, f5, f6
  1153. FNMSUB f3, f19, f1, f3
  1154. FNMSUB f7, f19, f5, f7
  1155. LFD f18, 10 * SIZE(AO)
  1156. LFD f19, 11 * SIZE(AO)
  1157. FMUL f2, f18, f2
  1158. FMUL f6, f18, f6
  1159. FNMSUB f3, f19, f2, f3
  1160. FNMSUB f7, f19, f6, f7
  1161. LFD f19, 15 * SIZE(AO)
  1162. FMUL f3, f19, f3
  1163. FMUL f7, f19, f7
  1164. #endif
  1165. #ifdef RN
  1166. LFD f16, 0 * SIZE(BO)
  1167. LFD f17, 1 * SIZE(BO)
  1168. LFD f18, 3 * SIZE(BO)
  1169. FMUL f0, f16, f0
  1170. FMUL f1, f16, f1
  1171. FMUL f2, f16, f2
  1172. FMUL f3, f16, f3
  1173. FNMSUB f4, f17, f0, f4
  1174. FNMSUB f5, f17, f1, f5
  1175. FNMSUB f6, f17, f2, f6
  1176. FNMSUB f7, f17, f3, f7
  1177. FMUL f4, f18, f4
  1178. FMUL f5, f18, f5
  1179. FMUL f6, f18, f6
  1180. FMUL f7, f18, f7
  1181. #endif
  1182. #ifdef RT
  1183. LFD f19, 3 * SIZE(BO)
  1184. LFD f20, 2 * SIZE(BO)
  1185. LFD f21, 0 * SIZE(BO)
  1186. FMUL f4, f19, f4
  1187. FMUL f5, f19, f5
  1188. FMUL f6, f19, f6
  1189. FMUL f7, f19, f7
  1190. FNMSUB f0, f20, f4, f0
  1191. FNMSUB f1, f20, f5, f1
  1192. FNMSUB f2, f20, f6, f2
  1193. FNMSUB f3, f20, f7, f3
  1194. FMUL f0, f21, f0
  1195. FMUL f1, f21, f1
  1196. FMUL f2, f21, f2
  1197. FMUL f3, f21, f3
  1198. #endif
  1199. #ifdef LN
  1200. subi CO1, CO1, 4 * SIZE
  1201. subi CO2, CO2, 4 * SIZE
  1202. #endif
  1203. #if defined(LN) || defined(LT)
  1204. STFD f0, 0 * SIZE(BO)
  1205. STFD f4, 1 * SIZE(BO)
  1206. STFD f1, 2 * SIZE(BO)
  1207. STFD f5, 3 * SIZE(BO)
  1208. STFD f2, 4 * SIZE(BO)
  1209. STFD f6, 5 * SIZE(BO)
  1210. STFD f3, 6 * SIZE(BO)
  1211. STFD f7, 7 * SIZE(BO)
  1212. #else
  1213. STFD f0, 0 * SIZE(AO)
  1214. STFD f1, 1 * SIZE(AO)
  1215. STFD f2, 2 * SIZE(AO)
  1216. STFD f3, 3 * SIZE(AO)
  1217. STFD f4, 4 * SIZE(AO)
  1218. STFD f5, 5 * SIZE(AO)
  1219. STFD f6, 6 * SIZE(AO)
  1220. STFD f7, 7 * SIZE(AO)
  1221. #endif
  1222. STFD f0, 0 * SIZE(CO1)
  1223. STFD f1, 1 * SIZE(CO1)
  1224. STFD f2, 2 * SIZE(CO1)
  1225. STFD f3, 3 * SIZE(CO1)
  1226. STFD f4, 0 * SIZE(CO2)
  1227. STFD f5, 1 * SIZE(CO2)
  1228. STFD f6, 2 * SIZE(CO2)
  1229. STFD f7, 3 * SIZE(CO2)
  1230. lfs f0, FZERO
  1231. fmr f1, f0
  1232. fmr f2, f0
  1233. fmr f3, f0
  1234. fmr f4, f0
  1235. fmr f5, f0
  1236. fmr f6, f0
  1237. fmr f7, f0
  1238. #ifndef LN
  1239. addi CO1, CO1, 4 * SIZE
  1240. addi CO2, CO2, 4 * SIZE
  1241. #endif
  1242. #ifdef RT
  1243. slwi r0, K, 2 + BASE_SHIFT
  1244. add AORIG, AORIG, r0
  1245. #endif
  1246. #if defined(LT) || defined(RN)
  1247. sub TEMP, K, KK
  1248. slwi r0, TEMP, 2 + BASE_SHIFT
  1249. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1250. add AO, AO, r0
  1251. add BO, BO, TEMP
  1252. #endif
  1253. #ifdef LN
  1254. subi KK, KK, 4
  1255. #endif
  1256. #ifdef LT
  1257. addi KK, KK, 4
  1258. #endif
  1259. addic. I, I, -1
  1260. bgt+ LL(41)
  1261. .align 4
  /*
   * LL(50): set-up for the tile handled when bit 1 of M is set.
   * The matching store code in LL(58) writes two values to CO1 and two
   * to CO2, i.e. a 2 x 2 tile of C.
   *
   * LT / RN : preload f16-f19 from AO and f20-f27 from B, loop count is
   *           KK >> 2, BO starts at B.
   * LN / RT : (LN only) step AORIG back by K << (1 + BASE_SHIFT) bytes,
   *           then AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *                BO = B     + (KK << (1 + BASE_SHIFT)),
   *           TEMP = K - KK and the loop count is TEMP >> 2.
   *
   * LFD / SIZE / BASE_SHIFT are macros defined outside this chunk;
   * LFD is presumably a floating-point load of one element - confirm.
   */
  1262. LL(50):
  1263. andi. I, M, 2
  /* andi. yields 0 or 2, so "ble" is taken exactly when the bit is clear. */
  1264. ble LL(60)
  1265. #if defined(LT) || defined(RN)
  1266. LFD f16, 0 * SIZE(AO)
  1267. LFD f17, 1 * SIZE(AO)
  1268. LFD f18, 2 * SIZE(AO)
  1269. LFD f19, 3 * SIZE(AO)
  1270. LFD f20, 0 * SIZE(B)
  1271. LFD f21, 1 * SIZE(B)
  1272. LFD f22, 2 * SIZE(B)
  1273. LFD f23, 3 * SIZE(B)
  1274. LFD f24, 4 * SIZE(B)
  1275. LFD f25, 5 * SIZE(B)
  1276. LFD f26, 6 * SIZE(B)
  1277. LFD f27, 7 * SIZE(B)
  /* Record form: sets CR0 from KK >> 2 for the "ble LL(55)" below. */
  1278. srawi. r0, KK, 2
  1279. mtspr CTR, r0
  1280. mr BO, B
  1281. #else
  1282. #ifdef LN
  1283. slwi r0, K, 1 + BASE_SHIFT
  1284. sub AORIG, AORIG, r0
  1285. #endif
  /* r0 and TEMP receive the same byte offset here (same shift on KK). */
  1286. slwi r0, KK, 1 + BASE_SHIFT
  1287. slwi TEMP, KK, 1 + BASE_SHIFT
  1288. add AO, AORIG, r0
  1289. add BO, B, TEMP
  1290. sub TEMP, K, KK
  1291. LFD f16, 0 * SIZE(AO)
  1292. LFD f17, 1 * SIZE(AO)
  1293. LFD f18, 2 * SIZE(AO)
  1294. LFD f19, 3 * SIZE(AO)
  1295. LFD f20, 0 * SIZE(BO)
  1296. LFD f21, 1 * SIZE(BO)
  1297. LFD f22, 2 * SIZE(BO)
  1298. LFD f23, 3 * SIZE(BO)
  1299. LFD f24, 4 * SIZE(BO)
  1300. LFD f25, 5 * SIZE(BO)
  1301. LFD f26, 6 * SIZE(BO)
  1302. LFD f27, 7 * SIZE(BO)
  /* Record form: sets CR0 from TEMP >> 2 for the "ble LL(55)" below. */
  1303. srawi. r0, TEMP, 2
  1304. mtspr CTR, r0
  1305. #endif
  /* Skip the 4x-unrolled loop when the shifted count is <= 0. */
  1306. ble LL(55)
  1307. .align 5
  /*
   * LL(52): 4x-unrolled multiply-accumulate loop for the 2 x 2 tile,
   * executed CTR times (bdnz).
   *
   * Each pass issues 16 FMADDs (4 k-steps x 4 products) and advances AO
   * and BO by 8 * SIZE each.  Two accumulator sets are used alternately,
   * f0-f3 and f4-f7; they are summed in LL(58).
   * Loads are issued ahead of their use: the values fetched at offsets
   * 8.. are consumed by the next pass (or by LL(56)) after the pointer
   * bump.
   *
   * FMADD / DCBT / PREB are macros defined outside this chunk; FMADD
   * d, a, b, c is presumably d = a * b + c - confirm.
   */
  1308. LL(52):
  1309. FMADD f0, f16, f20, f0
  1310. FMADD f1, f17, f20, f1
  1311. FMADD f2, f16, f21, f2
  1312. FMADD f3, f17, f21, f3
  1313. FMADD f4, f18, f22, f4
  1314. FMADD f5, f19, f22, f5
  1315. FMADD f6, f18, f23, f6
  1316. FMADD f7, f19, f23, f7
  1317. LFD f16, 4 * SIZE(AO)
  1318. LFD f17, 5 * SIZE(AO)
  1319. LFD f18, 6 * SIZE(AO)
  1320. LFD f19, 7 * SIZE(AO)
  1321. LFD f20, 8 * SIZE(BO)
  1322. LFD f21, 9 * SIZE(BO)
  1323. LFD f22, 10 * SIZE(BO)
  1324. LFD f23, 11 * SIZE(BO)
  /* Second half: A values just loaded (offsets 4-7) with f24-f27. */
  1325. FMADD f0, f16, f24, f0
  1326. FMADD f1, f17, f24, f1
  1327. FMADD f2, f16, f25, f2
  1328. FMADD f3, f17, f25, f3
  1329. FMADD f4, f18, f26, f4
  1330. FMADD f5, f19, f26, f5
  1331. FMADD f6, f18, f27, f6
  1332. FMADD f7, f19, f27, f7
  1333. LFD f16, 8 * SIZE(AO)
  1334. LFD f17, 9 * SIZE(AO)
  1335. LFD f18, 10 * SIZE(AO)
  1336. LFD f19, 11 * SIZE(AO)
  1337. LFD f24, 12 * SIZE(BO)
  1338. LFD f25, 13 * SIZE(BO)
  1339. LFD f26, 14 * SIZE(BO)
  1340. LFD f27, 15 * SIZE(BO)
  1341. addi AO, AO, 8 * SIZE
  1342. addi BO, BO, 8 * SIZE
  1343. DCBT(BO, PREB)
  1344. bdnz LL(52)
  1345. .align 4
  /*
   * LL(55) / LL(56): remainder of the k loop for the 2 x 2 tile.
   * The count is the low two bits of KK (LT / RN) or of TEMP (LN / RT);
   * when it is zero the loop is skipped.  Each pass of LL(56) does one
   * k-step (4 FMADDs into f0-f3) and advances AO and BO by 2 * SIZE.
   */
  1346. LL(55):
  1347. #if defined(LT) || defined(RN)
  1348. andi. r0, KK, 3
  1349. #else
  1350. andi. r0, TEMP, 3
  1351. #endif
  1352. mtspr CTR, r0
  /* CR0 still comes from the andi. above. */
  1353. ble+ LL(58)
  1354. .align 4
  1355. LL(56):
  1356. FMADD f0, f16, f20, f0
  1357. FMADD f1, f17, f20, f1
  1358. FMADD f2, f16, f21, f2
  1359. FMADD f3, f17, f21, f3
  /* Load the operands for the next k-step before bumping the pointers. */
  1360. LFD f16, 2 * SIZE(AO)
  1361. LFD f17, 3 * SIZE(AO)
  1362. LFD f20, 2 * SIZE(BO)
  1363. LFD f21, 3 * SIZE(BO)
  1364. addi BO, BO, 2 * SIZE
  1365. addi AO, AO, 2 * SIZE
  1366. bdnz LL(56)
  1367. .align 4
  /*
   * LL(58): finish the 2 x 2 tile.
   *
   *  1. Fold the second accumulator set (f4-f7) into f0-f3.
   *  2. LN / RT: point AO / BO at offset (KK - 2) << (1 + BASE_SHIFT)
   *     from AORIG / B.
   *  3. Replace each accumulator by (stored value - accumulator); the
   *     stored values come from BO for LN / LT and from AO for RN / RT.
   *  4. Apply the 2 x 2 triangular substitution for the selected
   *     variant, using coefficients at offsets 0..3 of AO (LN / LT) or
   *     BO (RN / RT).
   *  5. Write the result back to BO (LN / LT) or AO (RN / RT) and to
   *     CO1 / CO2, clear f0-f7, and update the pointers and KK.
   *
   * Register layout after step 1, as implied by the FMADD operands in
   * LL(52)/LL(56): f0, f1 are the products with f20 and f2, f3 the
   * products with f21; f0/f2 use f16 and f1/f3 use f17.
   *
   * The coefficients at offsets 0 and 3 are applied with FMUL, not a
   * divide - presumably they are stored pre-inverted by the packing
   * code; confirm against the caller.
   * FNMSUB d, a, b, c is presumably d = c - a * b - macro defined
   * outside this chunk.
   */
  1368. LL(58):
  1369. FADD f0, f4, f0
  1370. FADD f1, f5, f1
  1371. FADD f2, f6, f2
  1372. FADD f3, f7, f3
  1373. #if defined(LN) || defined(RT)
  /* Both arms of this #ifdef are identical (KK - 2). */
  1374. #ifdef LN
  1375. subi r0, KK, 2
  1376. #else
  1377. subi r0, KK, 2
  1378. #endif
  1379. slwi TEMP, r0, 1 + BASE_SHIFT
  1380. slwi r0, r0, 1 + BASE_SHIFT
  1381. add AO, AORIG, TEMP
  1382. add BO, B, r0
  1383. #endif
  1384. #if defined(LN) || defined(LT)
  /* BO order is f0, f2, f1, f3 - same order as the stores to BO below. */
  1385. LFD f16, 0 * SIZE(BO)
  1386. LFD f17, 1 * SIZE(BO)
  1387. LFD f20, 2 * SIZE(BO)
  1388. LFD f21, 3 * SIZE(BO)
  1389. FSUB f0, f16, f0
  1390. FSUB f2, f17, f2
  1391. FSUB f1, f20, f1
  1392. FSUB f3, f21, f3
  1393. #else
  /* AO order is f0, f1, f2, f3 - same order as the stores to AO below. */
  1394. LFD f16, 0 * SIZE(AO)
  1395. LFD f17, 1 * SIZE(AO)
  1396. LFD f20, 2 * SIZE(AO)
  1397. LFD f21, 3 * SIZE(AO)
  1398. FSUB f0, f16, f0
  1399. FSUB f1, f17, f1
  1400. FSUB f2, f20, f2
  1401. FSUB f3, f21, f3
  1402. #endif
  1403. #ifdef LN
  /* Scale f1/f3 by AO[3], eliminate into f0/f2 with AO[2], scale by AO[0]. */
  1404. LFD f19, 3 * SIZE(AO)
  1405. LFD f20, 2 * SIZE(AO)
  1406. LFD f21, 0 * SIZE(AO)
  1407. FMUL f1, f19, f1
  1408. FMUL f3, f19, f3
  1409. FNMSUB f0, f20, f1, f0
  1410. FNMSUB f2, f20, f3, f2
  1411. FMUL f0, f21, f0
  1412. FMUL f2, f21, f2
  1413. #endif
  1414. #ifdef LT
  /* Scale f0/f2 by AO[0], eliminate into f1/f3 with AO[1], scale by AO[3]. */
  1415. LFD f16, 0 * SIZE(AO)
  1416. LFD f17, 1 * SIZE(AO)
  1417. FMUL f0, f16, f0
  1418. FMUL f2, f16, f2
  1419. FNMSUB f1, f17, f0, f1
  1420. FNMSUB f3, f17, f2, f3
  1421. LFD f17, 3 * SIZE(AO)
  1422. FMUL f1, f17, f1
  1423. FMUL f3, f17, f3
  1424. #endif
  1425. #ifdef RN
  /* Scale f0/f1 by BO[0], eliminate into f2/f3 with BO[1], scale by BO[3]. */
  1426. LFD f16, 0 * SIZE(BO)
  1427. LFD f17, 1 * SIZE(BO)
  1428. LFD f18, 3 * SIZE(BO)
  1429. FMUL f0, f16, f0
  1430. FMUL f1, f16, f1
  1431. FNMSUB f2, f17, f0, f2
  1432. FNMSUB f3, f17, f1, f3
  1433. FMUL f2, f18, f2
  1434. FMUL f3, f18, f3
  1435. #endif
  1436. #ifdef RT
  /* Scale f2/f3 by BO[3], eliminate into f0/f1 with BO[2], scale by BO[0]. */
  1437. LFD f19, 3 * SIZE(BO)
  1438. LFD f20, 2 * SIZE(BO)
  1439. LFD f21, 0 * SIZE(BO)
  1440. FMUL f2, f19, f2
  1441. FMUL f3, f19, f3
  1442. FNMSUB f0, f20, f2, f0
  1443. FNMSUB f1, f20, f3, f1
  1444. FMUL f0, f21, f0
  1445. FMUL f1, f21, f1
  1446. #endif
  1447. #ifdef LN
  /* LN pre-decrements the C pointers before storing (and does not
     advance them afterwards, see the #ifndef LN below). */
  1448. subi CO1, CO1, 2 * SIZE
  1449. subi CO2, CO2, 2 * SIZE
  1450. #endif
  1451. #if defined(LN) || defined(LT)
  1452. STFD f0, 0 * SIZE(BO)
  1453. STFD f2, 1 * SIZE(BO)
  1454. STFD f1, 2 * SIZE(BO)
  1455. STFD f3, 3 * SIZE(BO)
  1456. #else
  1457. STFD f0, 0 * SIZE(AO)
  1458. STFD f1, 1 * SIZE(AO)
  1459. STFD f2, 2 * SIZE(AO)
  1460. STFD f3, 3 * SIZE(AO)
  1461. #endif
  1462. STFD f0, 0 * SIZE(CO1)
  1463. STFD f1, 1 * SIZE(CO1)
  1464. STFD f2, 0 * SIZE(CO2)
  1465. STFD f3, 1 * SIZE(CO2)
  /* Clear all eight accumulators used by this tile. */
  1466. lfs f0, FZERO
  1467. fmr f1, f0
  1468. fmr f2, f0
  1469. fmr f3, f0
  1470. fmr f4, f0
  1471. fmr f5, f0
  1472. fmr f6, f0
  1473. fmr f7, f0
  1474. #ifndef LN
  1475. addi CO1, CO1, 2 * SIZE
  1476. addi CO2, CO2, 2 * SIZE
  1477. #endif
  1478. #ifdef RT
  1479. slwi r0, K, 1 + BASE_SHIFT
  1480. add AORIG, AORIG, r0
  1481. #endif
  1482. #if defined(LT) || defined(RN)
  /* Advance AO and BO by (K - KK) << (1 + BASE_SHIFT) bytes each. */
  1483. sub TEMP, K, KK
  1484. slwi r0, TEMP, 1 + BASE_SHIFT
  1485. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1486. add AO, AO, r0
  1487. add BO, BO, TEMP
  1488. #endif
  1489. #ifdef LN
  1490. subi KK, KK, 2
  1491. #endif
  1492. #ifdef LT
  1493. addi KK, KK, 2
  1494. #endif
  1495. .align 4
  /*
   * LL(60): set-up for the tile handled when bit 0 of M is set.
   * The matching store code in LL(68) writes one value to CO1 and one
   * to CO2, i.e. a 1 x 2 tile of C.
   *
   * LT / RN : preload f16-f19 from AO and f20-f27 from B, loop count is
   *           KK >> 2, BO starts at B.
   * LN / RT : (LN only) step AORIG back by K << BASE_SHIFT bytes, then
   *           AO = AORIG + (KK << (0 + BASE_SHIFT)),
   *           BO = B     + (KK << (1 + BASE_SHIFT)),
   *           TEMP = K - KK and the loop count is TEMP >> 2.
   */
  1496. LL(60):
  1497. andi. I, M, 1
  /* andi. yields 0 or 1, so "ble" is taken exactly when the bit is clear. */
  1498. ble LL(69)
  1499. #if defined(LT) || defined(RN)
  1500. LFD f16, 0 * SIZE(AO)
  1501. LFD f17, 1 * SIZE(AO)
  1502. LFD f18, 2 * SIZE(AO)
  1503. LFD f19, 3 * SIZE(AO)
  1504. LFD f20, 0 * SIZE(B)
  1505. LFD f21, 1 * SIZE(B)
  1506. LFD f22, 2 * SIZE(B)
  1507. LFD f23, 3 * SIZE(B)
  1508. LFD f24, 4 * SIZE(B)
  1509. LFD f25, 5 * SIZE(B)
  1510. LFD f26, 6 * SIZE(B)
  1511. LFD f27, 7 * SIZE(B)
  /* Record form: sets CR0 from KK >> 2 for the "ble LL(65)" below. */
  1512. srawi. r0, KK, 2
  1513. mtspr CTR, r0
  1514. mr BO, B
  1515. #else
  1516. #ifdef LN
  1517. slwi r0, K, BASE_SHIFT
  1518. sub AORIG, AORIG, r0
  1519. #endif
  /* A advances one element per k-step, B two: hence the different shifts. */
  1520. slwi r0, KK, 0 + BASE_SHIFT
  1521. slwi TEMP, KK, 1 + BASE_SHIFT
  1522. add AO, AORIG, r0
  1523. add BO, B, TEMP
  1524. sub TEMP, K, KK
  1525. LFD f16, 0 * SIZE(AO)
  1526. LFD f17, 1 * SIZE(AO)
  1527. LFD f18, 2 * SIZE(AO)
  1528. LFD f19, 3 * SIZE(AO)
  1529. LFD f20, 0 * SIZE(BO)
  1530. LFD f21, 1 * SIZE(BO)
  1531. LFD f22, 2 * SIZE(BO)
  1532. LFD f23, 3 * SIZE(BO)
  1533. LFD f24, 4 * SIZE(BO)
  1534. LFD f25, 5 * SIZE(BO)
  1535. LFD f26, 6 * SIZE(BO)
  1536. LFD f27, 7 * SIZE(BO)
  /* Record form: sets CR0 from TEMP >> 2 for the "ble LL(65)" below. */
  1537. srawi. r0, TEMP, 2
  1538. mtspr CTR, r0
  1539. #endif
  /* Skip the 4x-unrolled loop when the shifted count is <= 0. */
  1540. ble LL(65)
  1541. .align 5
  /*
   * LL(62): 4x-unrolled multiply-accumulate loop for the 1 x 2 tile,
   * executed CTR times (bdnz).
   *
   * Each pass issues 8 FMADDs (4 k-steps x 2 products) and advances AO
   * by 4 * SIZE and BO by 8 * SIZE.  Even k-steps accumulate into
   * f0/f1 and odd k-steps into f2/f3; LL(68) sums the two sets.
   * Loads at offsets beyond the current pass feed the next pass (or
   * LL(66)) after the pointer bump.
   */
  1542. LL(62):
  1543. FMADD f0, f16, f20, f0
  1544. FMADD f1, f16, f21, f1
  1545. FMADD f2, f17, f22, f2
  1546. FMADD f3, f17, f23, f3
  1547. LFD f20, 8 * SIZE(BO)
  1548. LFD f21, 9 * SIZE(BO)
  1549. LFD f22, 10 * SIZE(BO)
  1550. LFD f23, 11 * SIZE(BO)
  1551. FMADD f0, f18, f24, f0
  1552. FMADD f1, f18, f25, f1
  1553. FMADD f2, f19, f26, f2
  1554. FMADD f3, f19, f27, f3
  1555. LFD f16, 4 * SIZE(AO)
  1556. LFD f17, 5 * SIZE(AO)
  1557. LFD f18, 6 * SIZE(AO)
  1558. LFD f19, 7 * SIZE(AO)
  1559. LFD f24, 12 * SIZE(BO)
  1560. LFD f25, 13 * SIZE(BO)
  1561. LFD f26, 14 * SIZE(BO)
  1562. LFD f27, 15 * SIZE(BO)
  1563. addi AO, AO, 4 * SIZE
  1564. addi BO, BO, 8 * SIZE
  1565. bdnz LL(62)
  1566. .align 4
  /*
   * LL(65) / LL(66): remainder of the k loop for the 1 x 2 tile.
   * The count is the low two bits of KK (LT / RN) or of TEMP (LN / RT);
   * when it is zero the loop is skipped.  Each pass of LL(66) does one
   * k-step (2 FMADDs into f0/f1), advancing AO by 1 * SIZE and BO by
   * 2 * SIZE.
   */
  1567. LL(65):
  1568. #if defined(LT) || defined(RN)
  1569. andi. r0, KK, 3
  1570. #else
  1571. andi. r0, TEMP, 3
  1572. #endif
  1573. mtspr CTR, r0
  /* CR0 still comes from the andi. above. */
  1574. ble+ LL(68)
  1575. .align 4
  1576. LL(66):
  1577. FMADD f0, f16, f20, f0
  1578. FMADD f1, f16, f21, f1
  /* Load the operands for the next k-step before bumping the pointers. */
  1579. LFD f16, 1 * SIZE(AO)
  1580. LFD f20, 2 * SIZE(BO)
  1581. LFD f21, 3 * SIZE(BO)
  1582. addi BO, BO, 2 * SIZE
  1583. addi AO, AO, 1 * SIZE
  1584. bdnz LL(66)
  1585. .align 4
  /*
   * LL(68): finish the 1 x 2 tile.
   *
   *  1. Fold the second accumulator pair (f2, f3) into f0, f1.
   *  2. LN / RT: with r0 = KK - 1 (LN) or KK - 2 (RT), set
   *     AO = AORIG + (r0 << (0 + BASE_SHIFT)) and
   *     BO = B     + (r0 << (1 + BASE_SHIFT)).
   *  3. Replace each accumulator by (stored value - accumulator); the
   *     stored values come from BO for LN / LT and from AO for RN / RT.
   *  4. Apply the substitution: a single scale by AO[0] for LN / LT, or
   *     the 2 x 2 triangular sequence on BO[0..3] for RN / RT.
   *  5. Write the result back to BO or AO and to CO1 / CO2, then update
   *     the pointers and KK.
   *
   * The scaling coefficients are applied with FMUL, not a divide -
   * presumably they are stored pre-inverted; confirm against the caller.
   */
  1586. LL(68):
  1587. FADD f0, f2, f0
  1588. FADD f1, f3, f1
  1589. #if defined(LN) || defined(RT)
  1590. #ifdef LN
  1591. subi r0, KK, 1
  1592. #else
  1593. subi r0, KK, 2
  1594. #endif
  1595. slwi TEMP, r0, 0 + BASE_SHIFT
  1596. slwi r0, r0, 1 + BASE_SHIFT
  1597. add AO, AORIG, TEMP
  1598. add BO, B, r0
  1599. #endif
  1600. #if defined(LN) || defined(LT)
  1601. LFD f16, 0 * SIZE(BO)
  1602. LFD f17, 1 * SIZE(BO)
  1603. FSUB f0, f16, f0
  1604. FSUB f1, f17, f1
  1605. #else
  1606. LFD f16, 0 * SIZE(AO)
  1607. LFD f20, 1 * SIZE(AO)
  1608. FSUB f0, f16, f0
  1609. FSUB f1, f20, f1
  1610. #endif
  1611. #ifdef LN
  1612. LFD f21, 0 * SIZE(AO)
  1613. FMUL f0, f21, f0
  1614. FMUL f1, f21, f1
  1615. #endif
  1616. #ifdef LT
  1617. LFD f16, 0 * SIZE(AO)
  1618. FMUL f0, f16, f0
  1619. FMUL f1, f16, f1
  1620. #endif
  1621. #ifdef RN
  /* Scale f0 by BO[0], eliminate into f1 with BO[1], scale f1 by BO[3]. */
  1622. LFD f16, 0 * SIZE(BO)
  1623. LFD f17, 1 * SIZE(BO)
  1624. LFD f18, 3 * SIZE(BO)
  1625. FMUL f0, f16, f0
  1626. FNMSUB f1, f17, f0, f1
  1627. FMUL f1, f18, f1
  1628. #endif
  1629. #ifdef RT
  /* Scale f1 by BO[3], eliminate into f0 with BO[2], scale f0 by BO[0]. */
  1630. LFD f19, 3 * SIZE(BO)
  1631. LFD f20, 2 * SIZE(BO)
  1632. LFD f21, 0 * SIZE(BO)
  1633. FMUL f1, f19, f1
  1634. FNMSUB f0, f20, f1, f0
  1635. FMUL f0, f21, f0
  1636. #endif
  1637. #ifdef LN
  1638. subi CO1, CO1, 1 * SIZE
  1639. subi CO2, CO2, 1 * SIZE
  1640. #endif
  1641. #if defined(LN) || defined(LT)
  1642. STFD f0, 0 * SIZE(BO)
  1643. STFD f1, 1 * SIZE(BO)
  1644. #else
  1645. STFD f0, 0 * SIZE(AO)
  1646. STFD f1, 1 * SIZE(AO)
  1647. #endif
  1648. STFD f0, 0 * SIZE(CO1)
  1649. STFD f1, 0 * SIZE(CO2)
  /*
   * NOTE(review): f0, f1, f4 and f5 are cleared here, but this tile
   * accumulated into f2 / f3 (see LL(62)), which keep their last
   * values.  On the paths visible in this chunk that is harmless:
   * control falls through LL(69) and LL(09) to LL(10), which copies
   * f0 into f1-f15 before they are used again.  Confirm no other
   * path reads f2 / f3 first.
   */
  1650. lfs f0, FZERO
  1651. fmr f1, f0
  1652. fmr f4, f0
  1653. fmr f5, f0
  1654. #ifndef LN
  1655. addi CO1, CO1, 1 * SIZE
  1656. addi CO2, CO2, 1 * SIZE
  1657. #endif
  1658. #ifdef RT
  1659. slwi r0, K, 0 + BASE_SHIFT
  1660. add AORIG, AORIG, r0
  1661. #endif
  1662. #if defined(LT) || defined(RN)
  /* Advance AO by (K - KK) elements and BO by twice that. */
  1663. sub TEMP, K, KK
  1664. slwi r0, TEMP, 0 + BASE_SHIFT
  1665. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1666. add AO, AO, r0
  1667. add BO, BO, TEMP
  1668. #endif
  1669. #ifdef LN
  1670. subi KK, KK, 1
  1671. #endif
  1672. #ifdef LT
  1673. addi KK, KK, 1
  1674. #endif
  1675. .align 4
  /*
   * LL(69): bookkeeping after the M-remainder tiles above (also the
   * target of "ble LL(69)" in LL(60)).
   *   LN      : B += K << (1 + BASE_SHIFT)
   *   LT / RN : B  = BO
   *   RN      : KK += 2
   *   RT      : KK -= 2
   * f0 is then reloaded from FZERO; LL(10) uses f0 as the source when
   * clearing f1-f15.
   */
  1676. LL(69):
  1677. #ifdef LN
  1678. slwi r0, K, 1 + BASE_SHIFT
  1679. add B, B, r0
  1680. #endif
  1681. #if defined(LT) || defined(RN)
  1682. mr B, BO
  1683. #endif
  1684. #ifdef RN
  1685. addi KK, KK, 2
  1686. #endif
  1687. #ifdef RT
  1688. subi KK, KK, 2
  1689. #endif
  1690. lfs f0, FZERO
  1691. .align 4
  /*
   * LL(09): entry test for the code that works on four C columns at a
   * time (CO1-CO4, see LL(10)).  J = N >> 2; when that is <= 0 control
   * goes straight to LL(999), which is outside this chunk.
   */
  1692. LL(09):
  1693. srawi. J, N, 2
  1694. ble LL(999)
  1695. .align 4
  /*
   * LL(10): per-panel set-up for four columns of C.
   * (Presumably the head of a loop over J whose back-edge lies outside
   * this chunk - confirm.)
   *
   *   RT       : B -= K << (2 + BASE_SHIFT); C -= LDC << 2
   *   CO1..CO4 : C, C + LDC, C + 2*LDC, C + 3*LDC
   *   LN       : KK = M + OFFSET
   *   LT       : KK = OFFSET
   *   f1-f15   : cleared by copying f0 (f0 is expected to hold zero on
   *              entry; LL(69) reloads it from FZERO before falling
   *              through to here)
   *   I        : M >> 2
   *   LN / RT  : AORIG = A;  otherwise AO = A
   *   not RT   : C = CO4 + LDC
   */
  1696. LL(10):
  1697. #ifdef RT
  1698. slwi r0, K, 2 + BASE_SHIFT
  1699. sub B, B, r0
  1700. slwi r0, LDC, 2
  1701. sub C, C, r0
  1702. #endif
  1703. mr CO1, C
  1704. add CO2, C, LDC
  1705. add CO3, CO2, LDC
  1706. add CO4, CO3, LDC
  1707. #ifdef LN
  1708. add KK, M, OFFSET
  1709. #endif
  1710. #ifdef LT
  1711. mr KK, OFFSET
  1712. #endif
  1713. fmr f1, f0
  1714. fmr f2, f0
  1715. fmr f3, f0
  1716. fmr f4, f0
  1717. fmr f5, f0
  1718. fmr f6, f0
  1719. fmr f7, f0
  1720. fmr f8, f0
  1721. fmr f9, f0
  1722. fmr f10, f0
  1723. fmr f11, f0
  1724. fmr f12, f0
  1725. fmr f13, f0
  1726. fmr f14, f0
  1727. fmr f15, f0
  /* Record form: CR0 from M >> 2 is consumed by "ble LL(20)" below; the
     mr / add in between are non-record forms. */
  1728. srawi. I, M, 2
  1729. #if defined(LN) || defined(RT)
  1730. mr AORIG, A
  1731. #else
  1732. mr AO, A
  1733. #endif
  1734. #ifndef RT
  1735. add C, CO4, LDC
  1736. #endif
  1737. ble LL(20)
  1738. .align 4
  /*
   * LL(11): set-up for one tile that spans four values in each of
   * CO1..CO4 (4 x 4); re-entered from the "bgt+ LL(11)" at the end of
   * LL(18) while I > 0.
   *
   * LT / RN : preload f16-f19 from AO and f20-f23 from B, loop count is
   *           KK >> 2, BO starts at B.
   * LN / RT : (LN only) step AORIG back by K << (2 + BASE_SHIFT) bytes,
   *           then AO = AORIG + TEMP and BO = B + TEMP with
   *           TEMP = KK << (2 + BASE_SHIFT); afterwards TEMP = K - KK
   *           and the loop count is TEMP >> 2.
   * Both paths issue dcbt on CO1..CO4 with index PREC (cache-touch
   * hints for the C tile).
   */
  1739. LL(11):
  1740. #if defined(LT) || defined(RN)
  1741. LFD f16, 0 * SIZE(AO)
  1742. LFD f17, 1 * SIZE(AO)
  1743. LFD f18, 2 * SIZE(AO)
  1744. LFD f19, 3 * SIZE(AO)
  1745. LFD f20, 0 * SIZE(B)
  1746. LFD f21, 1 * SIZE(B)
  1747. LFD f22, 2 * SIZE(B)
  1748. LFD f23, 3 * SIZE(B)
  1749. dcbt CO1, PREC
  1750. dcbt CO2, PREC
  1751. dcbt CO3, PREC
  1752. dcbt CO4, PREC
  /* Record form: sets CR0 from KK >> 2 for the "ble LL(15)" below. */
  1753. srawi. r0, KK, 2
  1754. mtspr CTR, r0
  1755. mr BO, B
  1756. #else
  1757. #ifdef LN
  1758. slwi r0, K, 2 + BASE_SHIFT
  1759. sub AORIG, AORIG, r0
  1760. #endif
  /* Same byte offset for A and B: both advance 4 elements per k-step. */
  1761. slwi TEMP, KK, 2 + BASE_SHIFT
  1762. add AO, AORIG, TEMP
  1763. add BO, B, TEMP
  1764. sub TEMP, K, KK
  1765. LFD f16, 0 * SIZE(AO)
  1766. LFD f17, 1 * SIZE(AO)
  1767. LFD f18, 2 * SIZE(AO)
  1768. LFD f19, 3 * SIZE(AO)
  1769. LFD f20, 0 * SIZE(BO)
  1770. LFD f21, 1 * SIZE(BO)
  1771. LFD f22, 2 * SIZE(BO)
  1772. LFD f23, 3 * SIZE(BO)
  1773. dcbt CO1, PREC
  1774. dcbt CO2, PREC
  1775. dcbt CO3, PREC
  1776. dcbt CO4, PREC
  /* Record form: sets CR0 from TEMP >> 2 for the "ble LL(15)" below. */
  1777. srawi. r0, TEMP, 2
  1778. mtspr CTR, r0
  1779. #endif
  /* Skip the 4x-unrolled loop when the shifted count is <= 0. */
  1780. ble LL(15)
  1781. .align 4
  /*
   * LL(12): 4x-unrolled multiply-accumulate loop for the 4 x 4 tile,
   * executed CTR times (bdnz).
   *
   * Each pass issues 64 FMADDs (4 k-steps x 16 products) and advances
   * AO and BO by 16 * SIZE each.  Operands alternate between two
   * register sets: A in f16-f19 / f24-f27 and B in f20-f23 / f28-f31,
   * so one set is being loaded while the other is consumed.
   *
   * Accumulator layout (from the FMADD operands below): with A values
   * a0..a3 and B values b0..b3 of one k-step,
   *     f0-f3 += a0..a3 * b0      f4-f7   += a0..a3 * b1
   *     f8-f11 += a0..a3 * b2     f12-f15 += a0..a3 * b3
   *
   * The trailing DCBT hints are compiled in only for PPC970, POWER4
   * and POWER5 builds.
   */
  1782. LL(12):
  /* k-step 0: operands preloaded in f16-f19 / f20-f23. */
  1783. FMADD f0, f16, f20, f0
  1784. FMADD f5, f17, f21, f5
  1785. FMADD f10, f18, f22, f10
  1786. FMADD f15, f19, f23, f15
  1787. LFD f28, 4 * SIZE(BO)
  1788. LFD f29, 5 * SIZE(BO)
  1789. LFD f30, 6 * SIZE(BO)
  1790. LFD f31, 7 * SIZE(BO)
  1791. FMADD f1, f17, f20, f1
  1792. FMADD f2, f18, f20, f2
  1793. FMADD f3, f19, f20, f3
  1794. FMADD f4, f16, f21, f4
  1795. LFD f24, 4 * SIZE(AO)
  1796. LFD f25, 5 * SIZE(AO)
  1797. LFD f26, 6 * SIZE(AO)
  1798. LFD f27, 7 * SIZE(AO)
  1799. FMADD f6, f18, f21, f6
  1800. FMADD f7, f19, f21, f7
  1801. FMADD f8, f16, f22, f8
  1802. FMADD f9, f17, f22, f9
  1803. FMADD f11, f19, f22, f11
  1804. FMADD f12, f16, f23, f12
  1805. FMADD f13, f17, f23, f13
  1806. FMADD f14, f18, f23, f14
  1807. LFD f20, 8 * SIZE(BO)
  1808. LFD f21, 9 * SIZE(BO)
  1809. LFD f22, 10 * SIZE(BO)
  1810. LFD f23, 11 * SIZE(BO)
  /* k-step 1: operands in f24-f27 / f28-f31. */
  1811. FMADD f0, f24, f28, f0
  1812. FMADD f5, f25, f29, f5
  1813. FMADD f10, f26, f30, f10
  1814. FMADD f15, f27, f31, f15
  1815. LFD f16, 8 * SIZE(AO)
  1816. LFD f17, 9 * SIZE(AO)
  1817. LFD f18, 10 * SIZE(AO)
  1818. LFD f19, 11 * SIZE(AO)
  1819. FMADD f1, f25, f28, f1
  1820. FMADD f2, f26, f28, f2
  1821. FMADD f3, f27, f28, f3
  1822. FMADD f4, f24, f29, f4
  1823. FMADD f6, f26, f29, f6
  1824. FMADD f7, f27, f29, f7
  1825. FMADD f8, f24, f30, f8
  1826. FMADD f9, f25, f30, f9
  1827. FMADD f11, f27, f30, f11
  1828. FMADD f12, f24, f31, f12
  1829. FMADD f13, f25, f31, f13
  1830. FMADD f14, f26, f31, f14
  1831. LFD f28, 12 * SIZE(BO)
  1832. LFD f29, 13 * SIZE(BO)
  1833. LFD f30, 14 * SIZE(BO)
  1834. LFD f31, 15 * SIZE(BO)
  /* k-step 2: operands in f16-f19 / f20-f23 (offsets 8-11). */
  1835. FMADD f0, f16, f20, f0
  1836. FMADD f5, f17, f21, f5
  1837. FMADD f10, f18, f22, f10
  1838. FMADD f15, f19, f23, f15
  1839. LFD f24, 12 * SIZE(AO)
  1840. LFD f25, 13 * SIZE(AO)
  1841. LFD f26, 14 * SIZE(AO)
  1842. LFD f27, 15 * SIZE(AO)
  1843. FMADD f1, f17, f20, f1
  1844. FMADD f2, f18, f20, f2
  1845. FMADD f3, f19, f20, f3
  1846. FMADD f4, f16, f21, f4
  1847. FMADD f6, f18, f21, f6
  1848. FMADD f7, f19, f21, f7
  1849. FMADD f8, f16, f22, f8
  1850. FMADD f9, f17, f22, f9
  1851. FMADD f11, f19, f22, f11
  1852. FMADD f12, f16, f23, f12
  1853. FMADD f13, f17, f23, f13
  1854. FMADD f14, f18, f23, f14
  /* Offsets 16-19 become offsets 0-3 after the pointer bump below. */
  1855. LFD f20, 16 * SIZE(BO)
  1856. LFD f21, 17 * SIZE(BO)
  1857. LFD f22, 18 * SIZE(BO)
  1858. LFD f23, 19 * SIZE(BO)
  /* k-step 3: operands in f24-f27 / f28-f31 (offsets 12-15). */
  1859. FMADD f0, f24, f28, f0
  1860. FMADD f5, f25, f29, f5
  1861. FMADD f10, f26, f30, f10
  1862. FMADD f15, f27, f31, f15
  1863. LFD f16, 16 * SIZE(AO)
  1864. LFD f17, 17 * SIZE(AO)
  1865. LFD f18, 18 * SIZE(AO)
  1866. LFD f19, 19 * SIZE(AO)
  1867. FMADD f1, f25, f28, f1
  1868. FMADD f2, f26, f28, f2
  1869. FMADD f3, f27, f28, f3
  1870. FMADD f4, f24, f29, f4
  1871. FMADD f6, f26, f29, f6
  1872. FMADD f7, f27, f29, f7
  1873. FMADD f8, f24, f30, f8
  1874. FMADD f9, f25, f30, f9
  1875. FMADD f11, f27, f30, f11
  1876. FMADD f12, f24, f31, f12
  1877. FMADD f13, f25, f31, f13
  1878. FMADD f14, f26, f31, f14
  1879. addi AO, AO, 16 * SIZE
  1880. addi BO, BO, 16 * SIZE
  1881. #ifdef PPC970
  1882. #ifndef ALLOC_HUGETLB
  1883. DCBT(AO, PREA)
  1884. #endif
  1885. DCBT(BO, PREB)
  1886. #endif
  1887. #ifdef POWER4
  1888. #ifndef ALLOC_HUGETLB
  1889. DCBT(AO, PREA)
  1890. #endif
  1891. DCBT(BO, PREB)
  1892. #endif
  1893. #ifdef POWER5
  1894. DCBT(AO, PREA)
  1895. DCBT(BO, PREB)
  1896. #endif
  1897. bdnz LL(12)
  1898. .align 4
  /*
   * LL(15) / LL(16): remainder of the k loop for the 4 x 4 tile.
   * The count is the low two bits of KK (LT / RN) or of TEMP (LN / RT);
   * when it is zero the loop is skipped.  Each pass of LL(16) does one
   * k-step (16 FMADDs, same accumulator layout as LL(12)) and advances
   * AO and BO by 4 * SIZE.
   */
  1899. LL(15):
  1900. #if defined(LT) || defined(RN)
  1901. andi. r0, KK, 3
  1902. #else
  1903. andi. r0, TEMP, 3
  1904. #endif
  1905. mtspr CTR, r0
  /* CR0 still comes from the andi. above. */
  1906. ble+ LL(18)
  1907. .align 4
  1908. LL(16):
  1909. FMADD f0, f16, f20, f0
  1910. FMADD f5, f17, f21, f5
  1911. FMADD f10, f18, f22, f10
  1912. FMADD f15, f19, f23, f15
  1913. FMADD f1, f17, f20, f1
  1914. FMADD f2, f18, f20, f2
  1915. FMADD f3, f19, f20, f3
  1916. FMADD f4, f16, f21, f4
  1917. FMADD f6, f18, f21, f6
  1918. FMADD f7, f19, f21, f7
  1919. FMADD f8, f16, f22, f8
  1920. FMADD f9, f17, f22, f9
  1921. FMADD f11, f19, f22, f11
  1922. FMADD f12, f16, f23, f12
  1923. FMADD f13, f17, f23, f13
  1924. FMADD f14, f18, f23, f14
  /* Load the operands for the next k-step before bumping the pointers. */
  1925. LFD f16, 4 * SIZE(AO)
  1926. LFD f17, 5 * SIZE(AO)
  1927. LFD f18, 6 * SIZE(AO)
  1928. LFD f19, 7 * SIZE(AO)
  1929. LFD f20, 4 * SIZE(BO)
  1930. LFD f21, 5 * SIZE(BO)
  1931. LFD f22, 6 * SIZE(BO)
  1932. LFD f23, 7 * SIZE(BO)
  1933. addi BO, BO, 4 * SIZE
  1934. addi AO, AO, 4 * SIZE
  1935. bdnz LL(16)
  1936. .align 4
  /*
   * LL(18): finish the 4 x 4 tile.
   *
   *  1. LN / RT: point AO / BO at offset (KK - 4) << (2 + BASE_SHIFT)
   *     from AORIG / B.
   *  2. Replace each accumulator by (stored value - accumulator); the 16
   *     stored values come from BO for LN / LT and from AO for RN / RT.
   *  3. Apply the 4 x 4 triangular substitution for the selected
   *     variant, reading a 4 x 4 coefficient block at offsets 0..15 of
   *     AO (LN / LT) or BO (RN / RT).
   *  4. Write the result back to BO (LN / LT) or AO (RN / RT) and to
   *     CO1..CO4, clear f0-f15, update the pointers and KK, and loop
   *     back to LL(11) while --I > 0.
   *
   * Accumulator layout on entry (see LL(12)): register f(4*j + i) holds
   * the value stored at i * SIZE(CO<j+1>).
   *
   * Coefficients at offsets 0, 5, 10 and 15 are applied with FMUL, not
   * a divide - presumably they are stored pre-inverted by the packing
   * code; confirm against the caller.
   * FNMSUB d, a, b, c is presumably d = c - a * b - macro defined
   * outside this chunk.
   */
  1937. LL(18):
  1938. #if defined(LN) || defined(RT)
  1939. subi r0, KK, 4
  1940. slwi r0, r0, 2 + BASE_SHIFT
  1941. add AO, AORIG, r0
  1942. add BO, B, r0
  1943. #endif
  1944. #if defined(LN) || defined(LT)
  /* BO order: f0, f4, f8, f12, f1, f5, ... (matches the stores to BO below). */
  1945. LFD f16, 0 * SIZE(BO)
  1946. LFD f17, 1 * SIZE(BO)
  1947. LFD f18, 2 * SIZE(BO)
  1948. LFD f19, 3 * SIZE(BO)
  1949. LFD f20, 4 * SIZE(BO)
  1950. LFD f21, 5 * SIZE(BO)
  1951. LFD f22, 6 * SIZE(BO)
  1952. LFD f23, 7 * SIZE(BO)
  1953. LFD f24, 8 * SIZE(BO)
  1954. LFD f25, 9 * SIZE(BO)
  1955. LFD f26, 10 * SIZE(BO)
  1956. LFD f27, 11 * SIZE(BO)
  1957. LFD f28, 12 * SIZE(BO)
  1958. LFD f29, 13 * SIZE(BO)
  1959. LFD f30, 14 * SIZE(BO)
  1960. LFD f31, 15 * SIZE(BO)
  1961. FSUB f0, f16, f0
  1962. FSUB f4, f17, f4
  1963. FSUB f8, f18, f8
  1964. FSUB f12, f19, f12
  1965. FSUB f1, f20, f1
  1966. FSUB f5, f21, f5
  1967. FSUB f9, f22, f9
  1968. FSUB f13, f23, f13
  1969. FSUB f2, f24, f2
  1970. FSUB f6, f25, f6
  1971. FSUB f10, f26, f10
  1972. FSUB f14, f27, f14
  1973. FSUB f3, f28, f3
  1974. FSUB f7, f29, f7
  1975. FSUB f11, f30, f11
  1976. FSUB f15, f31, f15
  1977. #else
  /* AO order: f0, f1, ..., f15 (matches the stores to AO below). */
  1978. LFD f16, 0 * SIZE(AO)
  1979. LFD f17, 1 * SIZE(AO)
  1980. LFD f18, 2 * SIZE(AO)
  1981. LFD f19, 3 * SIZE(AO)
  1982. LFD f20, 4 * SIZE(AO)
  1983. LFD f21, 5 * SIZE(AO)
  1984. LFD f22, 6 * SIZE(AO)
  1985. LFD f23, 7 * SIZE(AO)
  1986. LFD f24, 8 * SIZE(AO)
  1987. LFD f25, 9 * SIZE(AO)
  1988. LFD f26, 10 * SIZE(AO)
  1989. LFD f27, 11 * SIZE(AO)
  1990. LFD f28, 12 * SIZE(AO)
  1991. LFD f29, 13 * SIZE(AO)
  1992. LFD f30, 14 * SIZE(AO)
  1993. LFD f31, 15 * SIZE(AO)
  1994. FSUB f0, f16, f0
  1995. FSUB f1, f17, f1
  1996. FSUB f2, f18, f2
  1997. FSUB f3, f19, f3
  1998. FSUB f4, f20, f4
  1999. FSUB f5, f21, f5
  2000. FSUB f6, f22, f6
  2001. FSUB f7, f23, f7
  2002. FSUB f8, f24, f8
  2003. FSUB f9, f25, f9
  2004. FSUB f10, f26, f10
  2005. FSUB f11, f27, f11
  2006. FSUB f12, f28, f12
  2007. FSUB f13, f29, f13
  2008. FSUB f14, f30, f14
  2009. FSUB f15, f31, f15
  2010. #endif
  2011. #ifdef LN
  /* LN: solve from the last row upwards (rows 3, 2, 1, 0), using
     AO[15], AO[14..12], AO[10], AO[9..8], AO[5], AO[4], AO[0]. */
  2012. LFD f16, 15 * SIZE(AO)
  2013. LFD f17, 14 * SIZE(AO)
  2014. LFD f18, 13 * SIZE(AO)
  2015. LFD f19, 12 * SIZE(AO)
  2016. FMUL f3, f16, f3
  2017. FMUL f7, f16, f7
  2018. FMUL f11, f16, f11
  2019. FMUL f15, f16, f15
  2020. FNMSUB f2, f17, f3, f2
  2021. FNMSUB f6, f17, f7, f6
  2022. FNMSUB f10, f17, f11, f10
  2023. FNMSUB f14, f17, f15, f14
  2024. FNMSUB f1, f18, f3, f1
  2025. FNMSUB f5, f18, f7, f5
  2026. FNMSUB f9, f18, f11, f9
  2027. FNMSUB f13, f18, f15, f13
  2028. FNMSUB f0, f19, f3, f0
  2029. FNMSUB f4, f19, f7, f4
  2030. FNMSUB f8, f19, f11, f8
  2031. FNMSUB f12, f19, f15, f12
  2032. LFD f16, 10 * SIZE(AO)
  2033. LFD f17, 9 * SIZE(AO)
  2034. LFD f18, 8 * SIZE(AO)
  2035. LFD f19, 5 * SIZE(AO)
  2036. FMUL f2, f16, f2
  2037. FMUL f6, f16, f6
  2038. FMUL f10, f16, f10
  2039. FMUL f14, f16, f14
  2040. LFD f20, 4 * SIZE(AO)
  2041. LFD f21, 0 * SIZE(AO)
  2042. FNMSUB f1, f17, f2, f1
  2043. FNMSUB f5, f17, f6, f5
  2044. FNMSUB f9, f17, f10, f9
  2045. FNMSUB f13, f17, f14, f13
  2046. FNMSUB f0, f18, f2, f0
  2047. FNMSUB f4, f18, f6, f4
  2048. FNMSUB f8, f18, f10, f8
  2049. FNMSUB f12, f18, f14, f12
  2050. FMUL f1, f19, f1
  2051. FMUL f5, f19, f5
  2052. FMUL f9, f19, f9
  2053. FMUL f13, f19, f13
  2054. FNMSUB f0, f20, f1, f0
  2055. FNMSUB f4, f20, f5, f4
  2056. FNMSUB f8, f20, f9, f8
  2057. FNMSUB f12, f20, f13, f12
  2058. FMUL f0, f21, f0
  2059. FMUL f4, f21, f4
  2060. FMUL f8, f21, f8
  2061. FMUL f12, f21, f12
  2062. #endif
  2063. #ifdef LT
  /* LT: solve from the first row downwards (rows 0, 1, 2, 3), using
     AO[0], AO[1..3], AO[5], AO[6..7], AO[10], AO[11], AO[15]. */
  2064. LFD f16, 0 * SIZE(AO)
  2065. LFD f17, 1 * SIZE(AO)
  2066. LFD f18, 2 * SIZE(AO)
  2067. LFD f19, 3 * SIZE(AO)
  2068. FMUL f0, f16, f0
  2069. FMUL f4, f16, f4
  2070. FMUL f8, f16, f8
  2071. FMUL f12, f16, f12
  2072. FNMSUB f1, f17, f0, f1
  2073. FNMSUB f5, f17, f4, f5
  2074. FNMSUB f9, f17, f8, f9
  2075. FNMSUB f13, f17, f12, f13
  2076. FNMSUB f2, f18, f0, f2
  2077. FNMSUB f6, f18, f4, f6
  2078. FNMSUB f10, f18, f8, f10
  2079. FNMSUB f14, f18, f12, f14
  2080. FNMSUB f3, f19, f0, f3
  2081. FNMSUB f7, f19, f4, f7
  2082. FNMSUB f11, f19, f8, f11
  2083. FNMSUB f15, f19, f12, f15
  2084. LFD f16, 5 * SIZE(AO)
  2085. LFD f17, 6 * SIZE(AO)
  2086. LFD f18, 7 * SIZE(AO)
  2087. LFD f19, 10 * SIZE(AO)
  2088. FMUL f1, f16, f1
  2089. FMUL f5, f16, f5
  2090. FMUL f9, f16, f9
  2091. FMUL f13, f16, f13
  2092. LFD f20, 11 * SIZE(AO)
  2093. LFD f21, 15 * SIZE(AO)
  2094. FNMSUB f2, f17, f1, f2
  2095. FNMSUB f6, f17, f5, f6
  2096. FNMSUB f10, f17, f9, f10
  2097. FNMSUB f14, f17, f13, f14
  2098. FNMSUB f3, f18, f1, f3
  2099. FNMSUB f7, f18, f5, f7
  2100. FNMSUB f11, f18, f9, f11
  2101. FNMSUB f15, f18, f13, f15
  2102. FMUL f2, f19, f2
  2103. FMUL f6, f19, f6
  2104. FMUL f10, f19, f10
  2105. FMUL f14, f19, f14
  2106. FNMSUB f3, f20, f2, f3
  2107. FNMSUB f7, f20, f6, f7
  2108. FNMSUB f11, f20, f10, f11
  2109. FNMSUB f15, f20, f14, f15
  2110. FMUL f3, f21, f3
  2111. FMUL f7, f21, f7
  2112. FMUL f11, f21, f11
  2113. FMUL f15, f21, f15
  2114. #endif
  2115. #ifdef RN
  /* RN: solve column groups in order f0-f3, f4-f7, f8-f11, f12-f15,
     using BO[0], BO[1..3], BO[5], BO[6..7], BO[10], BO[11], BO[15]. */
  2116. LFD f16, 0 * SIZE(BO)
  2117. LFD f17, 1 * SIZE(BO)
  2118. LFD f18, 2 * SIZE(BO)
  2119. LFD f19, 3 * SIZE(BO)
  2120. FMUL f0, f16, f0
  2121. FMUL f1, f16, f1
  2122. FMUL f2, f16, f2
  2123. FMUL f3, f16, f3
  2124. FNMSUB f4, f17, f0, f4
  2125. FNMSUB f5, f17, f1, f5
  2126. FNMSUB f6, f17, f2, f6
  2127. FNMSUB f7, f17, f3, f7
  2128. FNMSUB f8, f18, f0, f8
  2129. FNMSUB f9, f18, f1, f9
  2130. FNMSUB f10, f18, f2, f10
  2131. FNMSUB f11, f18, f3, f11
  2132. FNMSUB f12, f19, f0, f12
  2133. FNMSUB f13, f19, f1, f13
  2134. FNMSUB f14, f19, f2, f14
  2135. FNMSUB f15, f19, f3, f15
  2136. LFD f16, 5 * SIZE(BO)
  2137. LFD f17, 6 * SIZE(BO)
  2138. LFD f18, 7 * SIZE(BO)
  2139. LFD f19, 10 * SIZE(BO)
  2140. FMUL f4, f16, f4
  2141. FMUL f5, f16, f5
  2142. FMUL f6, f16, f6
  2143. FMUL f7, f16, f7
  2144. LFD f20, 11 * SIZE(BO)
  2145. LFD f21, 15 * SIZE(BO)
  2146. FNMSUB f8, f17, f4, f8
  2147. FNMSUB f9, f17, f5, f9
  2148. FNMSUB f10, f17, f6, f10
  2149. FNMSUB f11, f17, f7, f11
  2150. FNMSUB f12, f18, f4, f12
  2151. FNMSUB f13, f18, f5, f13
  2152. FNMSUB f14, f18, f6, f14
  2153. FNMSUB f15, f18, f7, f15
  2154. FMUL f8, f19, f8
  2155. FMUL f9, f19, f9
  2156. FMUL f10, f19, f10
  2157. FMUL f11, f19, f11
  2158. FNMSUB f12, f20, f8, f12
  2159. FNMSUB f13, f20, f9, f13
  2160. FNMSUB f14, f20, f10, f14
  2161. FNMSUB f15, f20, f11, f15
  2162. FMUL f12, f21, f12
  2163. FMUL f13, f21, f13
  2164. FMUL f14, f21, f14
  2165. FMUL f15, f21, f15
  2166. #endif
  2167. #ifdef RT
  /* RT: solve column groups in reverse order f12-f15, f8-f11, f4-f7,
     f0-f3, using BO[15], BO[14..12], BO[10], BO[9..8], BO[5], BO[4],
     BO[0]. */
  2168. LFD f16, 15 * SIZE(BO)
  2169. LFD f17, 14 * SIZE(BO)
  2170. LFD f18, 13 * SIZE(BO)
  2171. LFD f19, 12 * SIZE(BO)
  2172. FMUL f12, f16, f12
  2173. FMUL f13, f16, f13
  2174. FMUL f14, f16, f14
  2175. FMUL f15, f16, f15
  2176. FNMSUB f8, f17, f12, f8
  2177. FNMSUB f9, f17, f13, f9
  2178. FNMSUB f10, f17, f14, f10
  2179. FNMSUB f11, f17, f15, f11
  2180. FNMSUB f4, f18, f12, f4
  2181. FNMSUB f5, f18, f13, f5
  2182. FNMSUB f6, f18, f14, f6
  2183. FNMSUB f7, f18, f15, f7
  2184. FNMSUB f0, f19, f12, f0
  2185. FNMSUB f1, f19, f13, f1
  2186. FNMSUB f2, f19, f14, f2
  2187. FNMSUB f3, f19, f15, f3
  2188. LFD f16, 10 * SIZE(BO)
  2189. LFD f17, 9 * SIZE(BO)
  2190. LFD f18, 8 * SIZE(BO)
  2191. LFD f19, 5 * SIZE(BO)
  2192. FMUL f8, f16, f8
  2193. FMUL f9, f16, f9
  2194. FMUL f10, f16, f10
  2195. FMUL f11, f16, f11
  2196. LFD f20, 4 * SIZE(BO)
  2197. LFD f21, 0 * SIZE(BO)
  2198. FNMSUB f4, f17, f8, f4
  2199. FNMSUB f5, f17, f9, f5
  2200. FNMSUB f6, f17, f10, f6
  2201. FNMSUB f7, f17, f11, f7
  2202. FNMSUB f0, f18, f8, f0
  2203. FNMSUB f1, f18, f9, f1
  2204. FNMSUB f2, f18, f10, f2
  2205. FNMSUB f3, f18, f11, f3
  2206. FMUL f4, f19, f4
  2207. FMUL f5, f19, f5
  2208. FMUL f6, f19, f6
  2209. FMUL f7, f19, f7
  2210. FNMSUB f0, f20, f4, f0
  2211. FNMSUB f1, f20, f5, f1
  2212. FNMSUB f2, f20, f6, f2
  2213. FNMSUB f3, f20, f7, f3
  2214. FMUL f0, f21, f0
  2215. FMUL f1, f21, f1
  2216. FMUL f2, f21, f2
  2217. FMUL f3, f21, f3
  2218. #endif
  2219. #ifdef LN
  /* LN pre-decrements the C pointers before storing (and does not
     advance them afterwards, see the #ifndef LN below). */
  2220. subi CO1, CO1, 4 * SIZE
  2221. subi CO2, CO2, 4 * SIZE
  2222. subi CO3, CO3, 4 * SIZE
  2223. subi CO4, CO4, 4 * SIZE
  2224. #endif
  2225. #if defined(LN) || defined(LT)
  2226. STFD f0, 0 * SIZE(BO)
  2227. STFD f4, 1 * SIZE(BO)
  2228. STFD f8, 2 * SIZE(BO)
  2229. STFD f12, 3 * SIZE(BO)
  2230. STFD f1, 4 * SIZE(BO)
  2231. STFD f5, 5 * SIZE(BO)
  2232. STFD f9, 6 * SIZE(BO)
  2233. STFD f13, 7 * SIZE(BO)
  2234. STFD f2, 8 * SIZE(BO)
  2235. STFD f6, 9 * SIZE(BO)
  2236. STFD f10, 10 * SIZE(BO)
  2237. STFD f14, 11 * SIZE(BO)
  2238. STFD f3, 12 * SIZE(BO)
  2239. STFD f7, 13 * SIZE(BO)
  2240. STFD f11, 14 * SIZE(BO)
  2241. STFD f15, 15 * SIZE(BO)
  2242. #else
  2243. STFD f0, 0 * SIZE(AO)
  2244. STFD f1, 1 * SIZE(AO)
  2245. STFD f2, 2 * SIZE(AO)
  2246. STFD f3, 3 * SIZE(AO)
  2247. STFD f4, 4 * SIZE(AO)
  2248. STFD f5, 5 * SIZE(AO)
  2249. STFD f6, 6 * SIZE(AO)
  2250. STFD f7, 7 * SIZE(AO)
  2251. STFD f8, 8 * SIZE(AO)
  2252. STFD f9, 9 * SIZE(AO)
  2253. STFD f10, 10 * SIZE(AO)
  2254. STFD f11, 11 * SIZE(AO)
  2255. STFD f12, 12 * SIZE(AO)
  2256. STFD f13, 13 * SIZE(AO)
  2257. STFD f14, 14 * SIZE(AO)
  2258. STFD f15, 15 * SIZE(AO)
  2259. #endif
  2260. STFD f0, 0 * SIZE(CO1)
  2261. STFD f1, 1 * SIZE(CO1)
  2262. STFD f2, 2 * SIZE(CO1)
  2263. STFD f3, 3 * SIZE(CO1)
  2264. STFD f4, 0 * SIZE(CO2)
  2265. STFD f5, 1 * SIZE(CO2)
  2266. STFD f6, 2 * SIZE(CO2)
  2267. STFD f7, 3 * SIZE(CO2)
  2268. STFD f8, 0 * SIZE(CO3)
  2269. STFD f9, 1 * SIZE(CO3)
  2270. STFD f10, 2 * SIZE(CO3)
  2271. STFD f11, 3 * SIZE(CO3)
  2272. STFD f12, 0 * SIZE(CO4)
  2273. STFD f13, 1 * SIZE(CO4)
  2274. STFD f14, 2 * SIZE(CO4)
  2275. STFD f15, 3 * SIZE(CO4)
  /* Clear all sixteen accumulators for the next tile. */
  2276. lfs f0, FZERO
  2277. fmr f1, f0
  2278. fmr f2, f0
  2279. fmr f3, f0
  2280. fmr f4, f0
  2281. fmr f5, f0
  2282. fmr f6, f0
  2283. fmr f7, f0
  2284. fmr f8, f0
  2285. fmr f9, f0
  2286. fmr f10, f0
  2287. fmr f11, f0
  2288. fmr f12, f0
  2289. fmr f13, f0
  2290. fmr f14, f0
  2291. fmr f15, f0
  2292. #ifndef LN
  2293. addi CO1, CO1, 4 * SIZE
  2294. addi CO2, CO2, 4 * SIZE
  2295. addi CO3, CO3, 4 * SIZE
  2296. addi CO4, CO4, 4 * SIZE
  2297. #endif
  2298. #ifdef RT
  2299. slwi r0, K, 2 + BASE_SHIFT
  2300. add AORIG, AORIG, r0
  2301. #endif
  2302. #if defined(LT) || defined(RN)
  /* Advance AO and BO by (K - KK) << (2 + BASE_SHIFT) bytes each. */
  2303. sub TEMP, K, KK
  2304. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2305. add AO, AO, TEMP
  2306. add BO, BO, TEMP
  2307. #endif
  2308. #ifdef LT
  2309. addi KK, KK, 4
  2310. #endif
  2311. #ifdef LN
  2312. subi KK, KK, 4
  2313. #endif
  /* Next 4-row tile while the decremented I is still positive. */
  2314. addic. I, I, -1
  2315. bgt+ LL(11)
  2316. .align 4
  2317. LL(20):
  2318. andi. I, M, 2
  2319. ble LL(30)
  2320. #if defined(LT) || defined(RN)
  2321. LFD f16, 0 * SIZE(AO)
  2322. LFD f17, 1 * SIZE(AO)
  2323. LFD f18, 2 * SIZE(AO)
  2324. LFD f19, 3 * SIZE(AO)
  2325. LFD f20, 0 * SIZE(B)
  2326. LFD f21, 1 * SIZE(B)
  2327. LFD f22, 2 * SIZE(B)
  2328. LFD f23, 3 * SIZE(B)
  2329. LFD f24, 4 * SIZE(B)
  2330. LFD f25, 5 * SIZE(B)
  2331. LFD f26, 6 * SIZE(B)
  2332. LFD f27, 7 * SIZE(B)
  2333. srawi. r0, KK, 2
  2334. mtspr CTR, r0
  2335. mr BO, B
  2336. #else
  2337. #ifdef LN
  2338. slwi r0, K, 1 + BASE_SHIFT
  2339. sub AORIG, AORIG, r0
  2340. #endif
  2341. slwi r0, KK, 1 + BASE_SHIFT
  2342. slwi TEMP, KK, 2 + BASE_SHIFT
  2343. add AO, AORIG, r0
  2344. add BO, B, TEMP
  2345. sub TEMP, K, KK
  2346. LFD f16, 0 * SIZE(AO)
  2347. LFD f17, 1 * SIZE(AO)
  2348. LFD f18, 2 * SIZE(AO)
  2349. LFD f19, 3 * SIZE(AO)
  2350. LFD f20, 0 * SIZE(BO)
  2351. LFD f21, 1 * SIZE(BO)
  2352. LFD f22, 2 * SIZE(BO)
  2353. LFD f23, 3 * SIZE(BO)
  2354. LFD f24, 4 * SIZE(BO)
  2355. LFD f25, 5 * SIZE(BO)
  2356. LFD f26, 6 * SIZE(BO)
  2357. LFD f27, 7 * SIZE(BO)
  2358. srawi. r0, TEMP, 2
  2359. mtspr CTR, r0
  2360. #endif
  2361. ble LL(25)
  2362. .align 5
        /* LL(22): unrolled accumulation loop for the 2-row by 4-column tile. */
        /* Each pass consumes four k-steps (AO advances 8 * SIZE, BO advances */
        /* 16 * SIZE) and the loop runs CTR times.  Operands are loaded ahead */
        /* of their use, so on entry f16-f19 (A) and f20-f27 (B) must already */
        /* hold the first two k-steps.                                        */
        /* Even k-steps accumulate into f0/f1/f4/f5/f8/f9/f12/f13, odd        */
        /* k-steps into f2/f3/f6/f7/f10/f11/f14/f15; the two sets are summed  */
        /* after the loop.                                                    */
  2363. LL(22):
        /* k-step 0: A values f16/f17 times B values f20-f23. */
  2364. FMADD f0, f16, f20, f0
  2365. FMADD f1, f17, f20, f1
  2366. FMADD f4, f16, f21, f4
  2367. FMADD f5, f17, f21, f5
  2368. FMADD f8, f16, f22, f8
  2369. FMADD f9, f17, f22, f9
  2370. FMADD f12, f16, f23, f12
  2371. FMADD f13, f17, f23, f13
        /* B values for k-step 2. */
  2372. LFD f20, 8 * SIZE(BO)
  2373. LFD f21, 9 * SIZE(BO)
  2374. LFD f22, 10 * SIZE(BO)
  2375. LFD f23, 11 * SIZE(BO)
        /* k-step 1: A values f18/f19 times B values f24-f27. */
  2376. FMADD f2, f18, f24, f2
  2377. FMADD f3, f19, f24, f3
  2378. FMADD f6, f18, f25, f6
  2379. FMADD f7, f19, f25, f7
  2380. FMADD f10, f18, f26, f10
  2381. FMADD f11, f19, f26, f11
  2382. FMADD f14, f18, f27, f14
  2383. FMADD f15, f19, f27, f15
        /* A values for k-steps 2 and 3. */
  2384. LFD f16, 4 * SIZE(AO)
  2385. LFD f17, 5 * SIZE(AO)
  2386. LFD f18, 6 * SIZE(AO)
  2387. LFD f19, 7 * SIZE(AO)
        /* k-step 2, interleaved with the B loads for k-step 3. */
  2388. FMADD f0, f16, f20, f0
  2389. FMADD f1, f17, f20, f1
  2390. FMADD f4, f16, f21, f4
  2391. FMADD f5, f17, f21, f5
  2392. LFD f24, 12 * SIZE(BO)
  2393. LFD f25, 13 * SIZE(BO)
  2394. LFD f26, 14 * SIZE(BO)
  2395. LFD f27, 15 * SIZE(BO)
  2396. FMADD f8, f16, f22, f8
  2397. FMADD f9, f17, f22, f9
  2398. FMADD f12, f16, f23, f12
  2399. FMADD f13, f17, f23, f13
        /* B values for k-step 0 of the next pass (offsets are relative to */
        /* the not-yet-advanced BO).                                        */
  2400. LFD f20, 16 * SIZE(BO)
  2401. LFD f21, 17 * SIZE(BO)
  2402. LFD f22, 18 * SIZE(BO)
  2403. LFD f23, 19 * SIZE(BO)
        /* k-step 3. */
  2404. FMADD f2, f18, f24, f2
  2405. FMADD f3, f19, f24, f3
  2406. FMADD f6, f18, f25, f6
  2407. FMADD f7, f19, f25, f7
  2408. FMADD f10, f18, f26, f10
  2409. FMADD f11, f19, f26, f11
  2410. FMADD f14, f18, f27, f14
  2411. FMADD f15, f19, f27, f15
        /* Remaining preloads for the next pass; after the final pass these */
        /* registers feed the remainder loop LL(26).                         */
  2412. LFD f16, 8 * SIZE(AO)
  2413. LFD f17, 9 * SIZE(AO)
  2414. LFD f18, 10 * SIZE(AO)
  2415. LFD f19, 11 * SIZE(AO)
  2416. LFD f24, 20 * SIZE(BO)
  2417. LFD f25, 21 * SIZE(BO)
  2418. LFD f26, 22 * SIZE(BO)
  2419. LFD f27, 23 * SIZE(BO)
        /* Advance by four k-steps: 4 * 2 A values and 4 * 4 B values. */
  2420. addi AO, AO, 8 * SIZE
  2421. addi BO, BO, 16 * SIZE
        /* Cache-touch hint PREB bytes ahead in B (DCBT and PREB are */
        /* defined outside this chunk).                               */
  2422. DCBT(BO, PREB)
  2423. bdnz LL(22)
        /* Fold the odd-step partial sums into the primary accumulators. */
  2424. fadd f0, f2, f0
  2425. fadd f1, f3, f1
  2426. fadd f4, f6, f4
  2427. fadd f5, f7, f5
  2428. fadd f8, f10, f8
  2429. fadd f9, f11, f9
  2430. fadd f12, f14, f12
  2431. fadd f13, f15, f13
  2432. .align 4
        /* LL(25): prepare the leftover k-steps of the 2x4 tile.            */
        /* CTR = (k-step count) & 3, where the count is KK for LT/RN and    */
        /* TEMP (set to K - KK during the tile setup) otherwise.            */
  2433. LL(25):
  2434. #if defined(LT) || defined(RN)
  2435. andi. r0, KK, 3
  2436. #else
  2437. andi. r0, TEMP, 3
  2438. #endif
        /* mtspr leaves CR0 untouched, so ble+ still tests the andi. result. */
  2439. mtspr CTR, r0
        /* Nothing left over: go to the solve stage.  The '+' suffix is a */
        /* static hint that the branch is expected to be taken.           */
  2440. ble+ LL(28)
  2441. .align 4
        /* LL(26): remainder loop for the 2x4 tile, one k-step per pass.    */
        /* Uses the A values in f16/f17 and B values in f20-f23 that were   */
        /* loaded ahead of time, then fetches the next step's operands and  */
        /* advances AO by 2 and BO by 4 values.  Runs CTR times.            */
  2442. LL(26):
  2443. FMADD f0, f16, f20, f0
  2444. FMADD f1, f17, f20, f1
  2445. FMADD f4, f16, f21, f4
  2446. FMADD f5, f17, f21, f5
  2447. FMADD f8, f16, f22, f8
  2448. FMADD f9, f17, f22, f9
  2449. FMADD f12, f16, f23, f12
  2450. FMADD f13, f17, f23, f13
        /* Operands for the next k-step (offsets relative to the */
        /* not-yet-advanced pointers).                            */
  2451. LFD f16, 2 * SIZE(AO)
  2452. LFD f17, 3 * SIZE(AO)
  2453. LFD f20, 4 * SIZE(BO)
  2454. LFD f21, 5 * SIZE(BO)
  2455. LFD f22, 6 * SIZE(BO)
  2456. LFD f23, 7 * SIZE(BO)
  2457. addi BO, BO, 4 * SIZE
  2458. addi AO, AO, 2 * SIZE
  2459. bdnz LL(26)
  2460. .align 4
        /* LL(28): finish the 2-row by 4-column tile.                          */
        /*   1. Re-aim AO/BO at the tile's slot in the packed panels (LN/RT).  */
        /*   2. Subtract the accumulated products from the stored values.      */
        /*   3. Apply the small triangular solve for the active variant        */
        /*      (LN, LT, RN or RT).                                            */
        /*   4. Write the result to the packed panel and to CO1..CO4.          */
        /*   5. Reset f0-f15 and update CO*, AORIG/AO/BO and KK.               */
        /* Diagonal coefficients are applied with FMUL, not a divide, so they  */
        /* are presumably stored pre-inverted by the packing code - confirm.   */
        /* FNMSUB d, a, c, b is assumed to give b - a * c (PowerPC fnmsub);    */
        /* the macro itself is defined outside this chunk.                     */
  2461. LL(28):
  2462. #if defined(LN) || defined(RT)
        /* r0 = KK - 2 (LN, tile height) or KK - 4 (RT, tile width). */
  2463. #ifdef LN
  2464. subi r0, KK, 2
  2465. #else
  2466. subi r0, KK, 4
  2467. #endif
        /* AO = AORIG + r0 * 2 values, BO = B + r0 * 4 values. */
  2468. slwi TEMP, r0, 1 + BASE_SHIFT
  2469. slwi r0, r0, 2 + BASE_SHIFT
  2470. add AO, AORIG, TEMP
  2471. add BO, B, r0
  2472. #endif
  2473. #if defined(LN) || defined(LT)
        /* Left-side variants: the eight stored values come from BO, laid  */
        /* out as four values for the first row, then four for the second. */
        /* Each accumulator becomes (stored value - accumulated sum).      */
  2474. LFD f16, 0 * SIZE(BO)
  2475. LFD f17, 1 * SIZE(BO)
  2476. LFD f18, 2 * SIZE(BO)
  2477. LFD f19, 3 * SIZE(BO)
  2478. LFD f20, 4 * SIZE(BO)
  2479. LFD f21, 5 * SIZE(BO)
  2480. LFD f22, 6 * SIZE(BO)
  2481. LFD f23, 7 * SIZE(BO)
  2482. FSUB f0, f16, f0
  2483. FSUB f4, f17, f4
  2484. FSUB f8, f18, f8
  2485. FSUB f12, f19, f12
  2486. FSUB f1, f20, f1
  2487. FSUB f5, f21, f5
  2488. FSUB f9, f22, f9
  2489. FSUB f13, f23, f13
  2490. #else
        /* Right-side variants: the stored values come from AO, laid out */
        /* as consecutive pairs (f0/f1, f4/f5, f8/f9, f12/f13).          */
  2491. LFD f16, 0 * SIZE(AO)
  2492. LFD f17, 1 * SIZE(AO)
  2493. LFD f20, 2 * SIZE(AO)
  2494. LFD f21, 3 * SIZE(AO)
  2495. LFD f24, 4 * SIZE(AO)
  2496. LFD f25, 5 * SIZE(AO)
  2497. LFD f28, 6 * SIZE(AO)
  2498. LFD f29, 7 * SIZE(AO)
  2499. FSUB f0, f16, f0
  2500. FSUB f1, f17, f1
  2501. FSUB f4, f20, f4
  2502. FSUB f5, f21, f5
  2503. FSUB f8, f24, f8
  2504. FSUB f9, f25, f9
  2505. FSUB f12, f28, f12
  2506. FSUB f13, f29, f13
  2507. #endif
  2508. #ifdef LN
        /* LN: 2x2 solve working from the second row back to the first. */
        /* Coefficients used: AO[3], AO[2], AO[0].                       */
  2509. LFD f19, 3 * SIZE(AO)
  2510. LFD f20, 2 * SIZE(AO)
  2511. LFD f21, 0 * SIZE(AO)
        /* Second row scaled by AO[3]. */
  2512. FMUL f1, f19, f1
  2513. FMUL f5, f19, f5
  2514. FMUL f9, f19, f9
  2515. FMUL f13, f19, f13
        /* First row -= AO[2] * second row. */
  2516. FNMSUB f0, f20, f1, f0
  2517. FNMSUB f4, f20, f5, f4
  2518. FNMSUB f8, f20, f9, f8
  2519. FNMSUB f12, f20, f13, f12
        /* First row scaled by AO[0]. */
  2520. FMUL f0, f21, f0
  2521. FMUL f4, f21, f4
  2522. FMUL f8, f21, f8
  2523. FMUL f12, f21, f12
  2524. #endif
  2525. #ifdef LT
        /* LT: 2x2 solve working from the first row forward to the second. */
        /* Coefficients used: AO[0], AO[1], AO[3].                          */
  2526. LFD f16, 0 * SIZE(AO)
  2527. LFD f17, 1 * SIZE(AO)
        /* First row scaled by AO[0]. */
  2528. FMUL f0, f16, f0
  2529. FMUL f4, f16, f4
  2530. FMUL f8, f16, f8
  2531. FMUL f12, f16, f12
        /* Second row -= AO[1] * first row. */
  2532. FNMSUB f1, f17, f0, f1
  2533. FNMSUB f5, f17, f4, f5
  2534. FNMSUB f9, f17, f8, f9
  2535. FNMSUB f13, f17, f12, f13
        /* Second row scaled by AO[3]. */
  2536. LFD f17, 3 * SIZE(AO)
  2537. FMUL f1, f17, f1
  2538. FMUL f5, f17, f5
  2539. FMUL f9, f17, f9
  2540. FMUL f13, f17, f13
  2541. #endif
  2542. #ifdef RN
        /* RN: 4x4 solve over the register pairs in ascending order        */
        /* (f0/f1, f4/f5, f8/f9, f12/f13), reading the 4x4 coefficient     */
        /* block in BO row by row: [0..3], [5..7], [10..11], [15].         */
  2543. LFD f16, 0 * SIZE(BO)
  2544. LFD f17, 1 * SIZE(BO)
  2545. LFD f18, 2 * SIZE(BO)
  2546. LFD f19, 3 * SIZE(BO)
        /* Scale pair 0, then eliminate it from pairs 1-3. */
  2547. FMUL f0, f16, f0
  2548. FMUL f1, f16, f1
  2549. FNMSUB f4, f17, f0, f4
  2550. FNMSUB f5, f17, f1, f5
  2551. FNMSUB f8, f18, f0, f8
  2552. FNMSUB f9, f18, f1, f9
  2553. FNMSUB f12, f19, f0, f12
  2554. FNMSUB f13, f19, f1, f13
  2555. LFD f16, 5 * SIZE(BO)
  2556. LFD f17, 6 * SIZE(BO)
  2557. LFD f18, 7 * SIZE(BO)
  2558. LFD f19, 10 * SIZE(BO)
  2559. LFD f20, 11 * SIZE(BO)
  2560. LFD f21, 15 * SIZE(BO)
        /* Scale pair 1, then eliminate it from pairs 2-3. */
  2561. FMUL f4, f16, f4
  2562. FMUL f5, f16, f5
  2563. FNMSUB f8, f17, f4, f8
  2564. FNMSUB f9, f17, f5, f9
  2565. FNMSUB f12, f18, f4, f12
  2566. FNMSUB f13, f18, f5, f13
        /* Scale pair 2, eliminate it from pair 3, then scale pair 3. */
  2567. FMUL f8, f19, f8
  2568. FMUL f9, f19, f9
  2569. FNMSUB f12, f20, f8, f12
  2570. FNMSUB f13, f20, f9, f13
  2571. FMUL f12, f21, f12
  2572. FMUL f13, f21, f13
  2573. #endif
  2574. #ifdef RT
        /* RT: 4x4 solve over the register pairs in descending order       */
        /* (f12/f13, f8/f9, f4/f5, f0/f1), reading the coefficient block   */
        /* in BO from the end: [15..12], [10..8], [5..4], [0].             */
  2575. LFD f16, 15 * SIZE(BO)
  2576. LFD f17, 14 * SIZE(BO)
  2577. LFD f18, 13 * SIZE(BO)
  2578. LFD f19, 12 * SIZE(BO)
        /* Scale pair 3, then eliminate it from pairs 2-0. */
  2579. FMUL f12, f16, f12
  2580. FMUL f13, f16, f13
  2581. FNMSUB f8, f17, f12, f8
  2582. FNMSUB f9, f17, f13, f9
  2583. FNMSUB f4, f18, f12, f4
  2584. FNMSUB f5, f18, f13, f5
  2585. FNMSUB f0, f19, f12, f0
  2586. FNMSUB f1, f19, f13, f1
  2587. LFD f16, 10 * SIZE(BO)
  2588. LFD f17, 9 * SIZE(BO)
  2589. LFD f18, 8 * SIZE(BO)
  2590. LFD f19, 5 * SIZE(BO)
  2591. LFD f20, 4 * SIZE(BO)
  2592. LFD f21, 0 * SIZE(BO)
        /* Scale pair 2, then eliminate it from pairs 1-0. */
  2593. FMUL f8, f16, f8
  2594. FMUL f9, f16, f9
  2595. FNMSUB f4, f17, f8, f4
  2596. FNMSUB f5, f17, f9, f5
  2597. FNMSUB f0, f18, f8, f0
  2598. FNMSUB f1, f18, f9, f1
        /* Scale pair 1, eliminate it from pair 0, then scale pair 0. */
  2599. FMUL f4, f19, f4
  2600. FMUL f5, f19, f5
  2601. FNMSUB f0, f20, f4, f0
  2602. FNMSUB f1, f20, f5, f1
  2603. FMUL f0, f21, f0
  2604. FMUL f1, f21, f1
  2605. #endif
  2606. #ifdef LN
        /* LN steps the output pointers back by two values before storing. */
  2607. subi CO1, CO1, 2 * SIZE
  2608. subi CO2, CO2, 2 * SIZE
  2609. subi CO3, CO3, 2 * SIZE
  2610. subi CO4, CO4, 2 * SIZE
  2611. #endif
        /* Write the solved values back into the packed panel, using the */
        /* same layout they were read with above.                        */
  2612. #if defined(LN) || defined(LT)
  2613. STFD f0, 0 * SIZE(BO)
  2614. STFD f4, 1 * SIZE(BO)
  2615. STFD f8, 2 * SIZE(BO)
  2616. STFD f12, 3 * SIZE(BO)
  2617. STFD f1, 4 * SIZE(BO)
  2618. STFD f5, 5 * SIZE(BO)
  2619. STFD f9, 6 * SIZE(BO)
  2620. STFD f13, 7 * SIZE(BO)
  2621. #else
  2622. STFD f0, 0 * SIZE(AO)
  2623. STFD f1, 1 * SIZE(AO)
  2624. STFD f4, 2 * SIZE(AO)
  2625. STFD f5, 3 * SIZE(AO)
  2626. STFD f8, 4 * SIZE(AO)
  2627. STFD f9, 5 * SIZE(AO)
  2628. STFD f12, 6 * SIZE(AO)
  2629. STFD f13, 7 * SIZE(AO)
  2630. #endif
        /* Store the tile through CO1..CO4, two values each (presumably */
        /* four columns of the output matrix - confirm).                */
  2631. STFD f0, 0 * SIZE(CO1)
  2632. STFD f1, 1 * SIZE(CO1)
  2633. STFD f4, 0 * SIZE(CO2)
  2634. STFD f5, 1 * SIZE(CO2)
  2635. STFD f8, 0 * SIZE(CO3)
  2636. STFD f9, 1 * SIZE(CO3)
  2637. STFD f12, 0 * SIZE(CO4)
  2638. STFD f13, 1 * SIZE(CO4)
        /* Reload f0 from FZERO and copy it into f1-f15 so the next tile */
        /* starts from cleared accumulators (FZERO is defined elsewhere; */
        /* presumably a memory slot holding 0.0).                        */
  2639. lfs f0, FZERO
  2640. fmr f1, f0
  2641. fmr f2, f0
  2642. fmr f3, f0
  2643. fmr f4, f0
  2644. fmr f5, f0
  2645. fmr f6, f0
  2646. fmr f7, f0
  2647. fmr f8, f0
  2648. fmr f9, f0
  2649. fmr f10, f0
  2650. fmr f11, f0
  2651. fmr f12, f0
  2652. fmr f13, f0
  2653. fmr f14, f0
  2654. fmr f15, f0
  2655. #ifndef LN
        /* All variants except LN advance the output pointers past the tile. */
  2656. addi CO1, CO1, 2 * SIZE
  2657. addi CO2, CO2, 2 * SIZE
  2658. addi CO3, CO3, 2 * SIZE
  2659. addi CO4, CO4, 2 * SIZE
  2660. #endif
  2661. #ifdef RT
        /* RT: AORIG += K * 2 values. */
  2662. slwi r0, K, 1 + BASE_SHIFT
  2663. add AORIG, AORIG, r0
  2664. #endif
  2665. #if defined(LT) || defined(RN)
        /* LT/RN: skip the remaining K - KK k-steps in both panels. */
  2666. sub TEMP, K, KK
  2667. slwi r0, TEMP, 1 + BASE_SHIFT
  2668. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2669. add AO, AO, r0
  2670. add BO, BO, TEMP
  2671. #endif
        /* KK moves by the tile height (2): down for LN, up for LT. */
  2672. #ifdef LN
  2673. subi KK, KK, 2
  2674. #endif
  2675. #ifdef LT
  2676. addi KK, KK, 2
  2677. #endif
  2678. .align 4
        /* LL(30): 1-row by 4-column tile, taken only when M is odd.        */
        /* Sets up AO/BO, preloads four A values (f16-f19, one per k-step)  */
        /* and eight B values (f20-f27, two k-steps), and loads CTR with    */
        /* the number of 4-way unrolled passes.                             */
  2679. LL(30):
  2680. andi. I, M, 1
        /* M even: no single-row tile, go to the end of this panel. */
  2681. ble LL(39)
  2682. #if defined(LT) || defined(RN)
        /* LT/RN: A is read from the current AO and B from the start of the */
        /* panel; the loop covers KK k-steps.                               */
  2683. LFD f16, 0 * SIZE(AO)
  2684. LFD f17, 1 * SIZE(AO)
  2685. LFD f18, 2 * SIZE(AO)
  2686. LFD f19, 3 * SIZE(AO)
  2687. LFD f20, 0 * SIZE(B)
  2688. LFD f21, 1 * SIZE(B)
  2689. LFD f22, 2 * SIZE(B)
  2690. LFD f23, 3 * SIZE(B)
  2691. LFD f24, 4 * SIZE(B)
  2692. LFD f25, 5 * SIZE(B)
  2693. LFD f26, 6 * SIZE(B)
  2694. LFD f27, 7 * SIZE(B)
        /* CTR = KK / 4; the record form sets CR0 for the ble below. */
  2695. srawi. r0, KK, 2
  2696. mtspr CTR, r0
  2697. mr BO, B
  2698. #else
  2699. #ifdef LN
        /* LN: AORIG -= K values (one value per k-step for a 1-row tile). */
  2700. slwi r0, K, BASE_SHIFT
  2701. sub AORIG, AORIG, r0
  2702. #endif
        /* AO = AORIG + KK values, BO = B + KK * 4 values. */
  2703. slwi r0, KK, 0 + BASE_SHIFT
  2704. slwi TEMP, KK, 2 + BASE_SHIFT
  2705. add AO, AORIG, r0
  2706. add BO, B, TEMP
        /* TEMP = K - KK: k-steps to accumulate; reused at LL(35). */
  2707. sub TEMP, K, KK
  2708. LFD f16, 0 * SIZE(AO)
  2709. LFD f17, 1 * SIZE(AO)
  2710. LFD f18, 2 * SIZE(AO)
  2711. LFD f19, 3 * SIZE(AO)
  2712. LFD f20, 0 * SIZE(BO)
  2713. LFD f21, 1 * SIZE(BO)
  2714. LFD f22, 2 * SIZE(BO)
  2715. LFD f23, 3 * SIZE(BO)
  2716. LFD f24, 4 * SIZE(BO)
  2717. LFD f25, 5 * SIZE(BO)
  2718. LFD f26, 6 * SIZE(BO)
  2719. LFD f27, 7 * SIZE(BO)
        /* CTR = TEMP / 4; the record form sets CR0 for the ble below. */
  2720. srawi. r0, TEMP, 2
  2721. mtspr CTR, r0
  2722. #endif
        /* No full group of four k-steps: skip to the remainder setup. */
  2723. ble LL(35)
  2724. .align 5
        /* LL(32): unrolled accumulation loop for the 1-row by 4-column     */
        /* tile.  Each pass consumes four k-steps (AO advances 4 * SIZE,    */
        /* BO advances 16 * SIZE) and the loop runs CTR times.              */
        /* k-steps 0 and 2 accumulate into f0/f4/f8/f12, k-steps 1 and 3    */
        /* into f1/f5/f9/f13; the two sets are summed after the loop.       */
        /* On entry f16-f19 hold four A values and f20-f27 the B values of  */
        /* the first two k-steps.                                           */
  2725. LL(32):
        /* k-step 0: A value f16. */
  2726. FMADD f0, f16, f20, f0
  2727. FMADD f4, f16, f21, f4
  2728. FMADD f8, f16, f22, f8
  2729. FMADD f12, f16, f23, f12
  2730. LFD f20, 8 * SIZE(BO)
  2731. LFD f21, 9 * SIZE(BO)
  2732. LFD f22, 10 * SIZE(BO)
  2733. LFD f23, 11 * SIZE(BO)
        /* k-step 1: A value f17. */
  2734. FMADD f1, f17, f24, f1
  2735. FMADD f5, f17, f25, f5
  2736. FMADD f9, f17, f26, f9
  2737. FMADD f13, f17, f27, f13
  2738. LFD f24, 12 * SIZE(BO)
  2739. LFD f25, 13 * SIZE(BO)
  2740. LFD f26, 14 * SIZE(BO)
  2741. LFD f27, 15 * SIZE(BO)
        /* k-step 2: A value f18. */
  2742. FMADD f0, f18, f20, f0
  2743. FMADD f4, f18, f21, f4
  2744. FMADD f8, f18, f22, f8
  2745. FMADD f12, f18, f23, f12
        /* B values for k-step 0 of the next pass. */
  2746. LFD f20, 16 * SIZE(BO)
  2747. LFD f21, 17 * SIZE(BO)
  2748. LFD f22, 18 * SIZE(BO)
  2749. LFD f23, 19 * SIZE(BO)
        /* k-step 3: A value f19. */
  2750. FMADD f1, f19, f24, f1
  2751. FMADD f5, f19, f25, f5
  2752. FMADD f9, f19, f26, f9
  2753. FMADD f13, f19, f27, f13
        /* Remaining preloads for the next pass; after the final pass these */
        /* registers feed the remainder loop LL(36).                         */
  2754. LFD f16, 4 * SIZE(AO)
  2755. LFD f17, 5 * SIZE(AO)
  2756. LFD f18, 6 * SIZE(AO)
  2757. LFD f19, 7 * SIZE(AO)
  2758. LFD f24, 20 * SIZE(BO)
  2759. LFD f25, 21 * SIZE(BO)
  2760. LFD f26, 22 * SIZE(BO)
  2761. LFD f27, 23 * SIZE(BO)
        /* Advance by four k-steps: 4 A values and 4 * 4 B values. */
  2762. addi AO, AO, 4 * SIZE
  2763. addi BO, BO, 16 * SIZE
        /* Cache-touch hint PREB bytes ahead in B (DCBT and PREB are */
        /* defined outside this chunk).                               */
  2764. DCBT(BO, PREB)
  2765. bdnz LL(32)
        /* Fold the second set of partial sums into the primary accumulators. */
  2766. fadd f0, f1, f0
  2767. fadd f4, f5, f4
  2768. fadd f8, f9, f8
  2769. fadd f12, f13, f12
  2770. .align 4
        /* LL(35): prepare the leftover k-steps of the 1x4 tile.            */
        /* CTR = (k-step count) & 3, where the count is KK for LT/RN and    */
        /* TEMP (set to K - KK during the tile setup) otherwise.            */
  2771. LL(35):
  2772. #if defined(LT) || defined(RN)
  2773. andi. r0, KK, 3
  2774. #else
  2775. andi. r0, TEMP, 3
  2776. #endif
        /* mtspr leaves CR0 untouched, so ble+ still tests the andi. result. */
  2777. mtspr CTR, r0
        /* Nothing left over: go to the solve stage.  The '+' suffix is a */
        /* static hint that the branch is expected to be taken.           */
  2778. ble+ LL(38)
  2779. .align 4
        /* LL(36): remainder loop for the 1x4 tile, one k-step per pass.    */
        /* Multiplies the A value in f16 by the B values in f20-f23, then   */
        /* fetches the next step's operands and advances AO by 1 and BO by  */
        /* 4 values.  Runs CTR times.                                       */
  2780. LL(36):
  2781. FMADD f0, f16, f20, f0
  2782. FMADD f4, f16, f21, f4
  2783. FMADD f8, f16, f22, f8
  2784. FMADD f12, f16, f23, f12
        /* Operands for the next k-step (offsets relative to the */
        /* not-yet-advanced pointers).                            */
  2785. LFD f16, 1 * SIZE(AO)
  2786. LFD f20, 4 * SIZE(BO)
  2787. LFD f21, 5 * SIZE(BO)
  2788. LFD f22, 6 * SIZE(BO)
  2789. LFD f23, 7 * SIZE(BO)
  2790. addi BO, BO, 4 * SIZE
  2791. addi AO, AO, 1 * SIZE
  2792. bdnz LL(36)
  2793. .align 4
        /* LL(38): finish the 1-row by 4-column tile.                          */
        /*   1. Re-aim AO/BO at the tile's slot in the packed panels (LN/RT).  */
        /*   2. Subtract the accumulated products from the stored values.      */
        /*   3. Apply the solve for the active variant: a single scale for     */
        /*      LN/LT, a 4x4 triangular solve for RN/RT.                       */
        /*   4. Write the result to the packed panel and to CO1..CO4.          */
        /*   5. Reset the accumulators and update CO*, AORIG/AO/BO and KK.     */
        /* Diagonal coefficients are applied with FMUL, not a divide, so they  */
        /* are presumably stored pre-inverted by the packing code - confirm.   */
        /* FNMSUB d, a, c, b is assumed to give b - a * c (PowerPC fnmsub);    */
        /* the macro itself is defined outside this chunk.                     */
  2794. LL(38):
  2795. #if defined(LN) || defined(RT)
        /* r0 = KK - 1 (LN, tile height) or KK - 4 (RT, tile width). */
  2796. #ifdef LN
  2797. subi r0, KK, 1
  2798. #else
  2799. subi r0, KK, 4
  2800. #endif
        /* AO = AORIG + r0 values, BO = B + r0 * 4 values. */
  2801. slwi TEMP, r0, 0 + BASE_SHIFT
  2802. slwi r0, r0, 2 + BASE_SHIFT
  2803. add AO, AORIG, TEMP
  2804. add BO, B, r0
  2805. #endif
  2806. #if defined(LN) || defined(LT)
        /* Left-side variants: the four stored values come from BO.  Each */
        /* accumulator becomes (stored value - accumulated sum).          */
  2807. LFD f16, 0 * SIZE(BO)
  2808. LFD f17, 1 * SIZE(BO)
  2809. LFD f18, 2 * SIZE(BO)
  2810. LFD f19, 3 * SIZE(BO)
  2811. FSUB f0, f16, f0
  2812. FSUB f4, f17, f4
  2813. FSUB f8, f18, f8
  2814. FSUB f12, f19, f12
  2815. #else
        /* Right-side variants: the four stored values come from AO. */
  2816. LFD f16, 0 * SIZE(AO)
  2817. LFD f20, 1 * SIZE(AO)
  2818. LFD f24, 2 * SIZE(AO)
  2819. LFD f28, 3 * SIZE(AO)
  2820. FSUB f0, f16, f0
  2821. FSUB f4, f20, f4
  2822. FSUB f8, f24, f8
  2823. FSUB f12, f28, f12
  2824. #endif
  2825. #ifdef LN
        /* LN: a 1x1 block, so the solve is one scale by AO[0]. */
  2826. LFD f21, 0 * SIZE(AO)
  2827. FMUL f0, f21, f0
  2828. FMUL f4, f21, f4
  2829. FMUL f8, f21, f8
  2830. FMUL f12, f21, f12
  2831. #endif
  2832. #ifdef LT
        /* LT: a 1x1 block, so the solve is one scale by AO[0]. */
  2833. LFD f16, 0 * SIZE(AO)
  2834. FMUL f0, f16, f0
  2835. FMUL f4, f16, f4
  2836. FMUL f8, f16, f8
  2837. FMUL f12, f16, f12
  2838. #endif
  2839. #ifdef RN
        /* RN: 4x4 solve in ascending order (f0, f4, f8, f12), reading the */
        /* coefficient block in BO row by row: [0..3], [5..7], [10..11],   */
        /* [15].                                                           */
  2840. LFD f16, 0 * SIZE(BO)
  2841. LFD f17, 1 * SIZE(BO)
  2842. LFD f18, 2 * SIZE(BO)
  2843. LFD f19, 3 * SIZE(BO)
        /* Scale f0, then eliminate it from f4, f8 and f12. */
  2844. FMUL f0, f16, f0
  2845. FNMSUB f4, f17, f0, f4
  2846. FNMSUB f8, f18, f0, f8
  2847. FNMSUB f12, f19, f0, f12
  2848. LFD f16, 5 * SIZE(BO)
  2849. LFD f17, 6 * SIZE(BO)
  2850. LFD f18, 7 * SIZE(BO)
  2851. LFD f19, 10 * SIZE(BO)
  2852. LFD f20, 11 * SIZE(BO)
  2853. LFD f21, 15 * SIZE(BO)
        /* Scale f4 and eliminate it from f8 and f12; then f8; then f12. */
  2854. FMUL f4, f16, f4
  2855. FNMSUB f8, f17, f4, f8
  2856. FNMSUB f12, f18, f4, f12
  2857. FMUL f8, f19, f8
  2858. FNMSUB f12, f20, f8, f12
  2859. FMUL f12, f21, f12
  2860. #endif
  2861. #ifdef RT
        /* RT: 4x4 solve in descending order (f12, f8, f4, f0), reading    */
        /* the coefficient block in BO from the end: [15..12], [10..8],    */
        /* [5..4], [0].                                                    */
  2862. LFD f16, 15 * SIZE(BO)
  2863. LFD f17, 14 * SIZE(BO)
  2864. LFD f18, 13 * SIZE(BO)
  2865. LFD f19, 12 * SIZE(BO)
        /* Scale f12, then eliminate it from f8, f4 and f0. */
  2866. FMUL f12, f16, f12
  2867. FNMSUB f8, f17, f12, f8
  2868. FNMSUB f4, f18, f12, f4
  2869. FNMSUB f0, f19, f12, f0
  2870. LFD f16, 10 * SIZE(BO)
  2871. LFD f17, 9 * SIZE(BO)
  2872. LFD f18, 8 * SIZE(BO)
  2873. LFD f19, 5 * SIZE(BO)
        /* Scale f8 (coefficient loads for the later steps are interleaved). */
  2874. FMUL f8, f16, f8
  2875. LFD f20, 4 * SIZE(BO)
  2876. LFD f21, 0 * SIZE(BO)
        /* Eliminate f8 from f4 and f0; scale f4; eliminate it from f0; */
        /* scale f0.                                                     */
  2877. FNMSUB f4, f17, f8, f4
  2878. FNMSUB f0, f18, f8, f0
  2879. FMUL f4, f19, f4
  2880. FNMSUB f0, f20, f4, f0
  2881. FMUL f0, f21, f0
  2882. #endif
  2883. #ifdef LN
        /* LN steps the output pointers back by one value before storing. */
  2884. subi CO1, CO1, 1 * SIZE
  2885. subi CO2, CO2, 1 * SIZE
  2886. subi CO3, CO3, 1 * SIZE
  2887. subi CO4, CO4, 1 * SIZE
  2888. #endif
        /* Write the solved values back into the packed panel they were */
        /* read from above.                                             */
  2889. #if defined(LN) || defined(LT)
  2890. STFD f0, 0 * SIZE(BO)
  2891. STFD f4, 1 * SIZE(BO)
  2892. STFD f8, 2 * SIZE(BO)
  2893. STFD f12, 3 * SIZE(BO)
  2894. #else
  2895. STFD f0, 0 * SIZE(AO)
  2896. STFD f4, 1 * SIZE(AO)
  2897. STFD f8, 2 * SIZE(AO)
  2898. STFD f12, 3 * SIZE(AO)
  2899. #endif
        /* Store one value through each of CO1..CO4. */
  2900. STFD f0, 0 * SIZE(CO1)
  2901. STFD f4, 0 * SIZE(CO2)
  2902. STFD f8, 0 * SIZE(CO3)
  2903. STFD f12, 0 * SIZE(CO4)
        /* Reload f0 from FZERO and copy it into the accumulators this tile */
        /* used (FZERO is defined elsewhere; presumably holds 0.0).         */
  2904. lfs f0, FZERO
  2905. fmr f1, f0
  2906. fmr f4, f0
  2907. fmr f5, f0
  2908. fmr f8, f0
  2909. fmr f9, f0
  2910. fmr f12, f0
  2911. fmr f13, f0
  2912. #ifndef LN
        /* All variants except LN advance the output pointers past the tile. */
  2913. addi CO1, CO1, 1 * SIZE
  2914. addi CO2, CO2, 1 * SIZE
  2915. addi CO3, CO3, 1 * SIZE
  2916. addi CO4, CO4, 1 * SIZE
  2917. #endif
  2918. #ifdef RT
        /* RT: AORIG += K values. */
  2919. slwi r0, K, 0 + BASE_SHIFT
  2920. add AORIG, AORIG, r0
  2921. #endif
  2922. #if defined(LT) || defined(RN)
        /* LT/RN: skip the remaining K - KK k-steps in both panels. */
  2923. sub TEMP, K, KK
  2924. slwi r0, TEMP, 0 + BASE_SHIFT
  2925. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2926. add AO, AO, r0
  2927. add BO, BO, TEMP
  2928. #endif
        /* KK moves by the tile height (1): down for LN, up for LT. */
  2929. #ifdef LN
  2930. subi KK, KK, 1
  2931. #endif
  2932. #ifdef LT
  2933. addi KK, KK, 1
  2934. #endif
  2935. .align 4
        /* LL(39): end of one pass over a 4-wide panel.  Updates B and KK   */
        /* for the active variant, decrements the panel counter J and loops */
        /* back to LL(10) (defined before this chunk) while J > 0.          */
  2936. LL(39):
  2937. #ifdef LN
        /* LN: B += K * 4 values. */
  2938. slwi r0, K, 2 + BASE_SHIFT
  2939. add B, B, r0
  2940. #endif
  2941. #if defined(LT) || defined(RN)
        /* LT/RN: B continues from where BO ended. */
  2942. mr B, BO
  2943. #endif
        /* KK moves by the panel width (4): up for RN, down for RT. */
  2944. #ifdef RN
  2945. addi KK, KK, 4
  2946. #endif
  2947. #ifdef RT
  2948. subi KK, KK, 4
  2949. #endif
        /* addic. sets CR0 for the bgt; the lfs in between does not alter it. */
  2950. addic. J, J, -1
  2951. lfs f0, FZERO
  2952. bgt LL(10)
  2953. .align 4
        /* LL(999): common exit.  Sets the return value to 0, reloads       */
        /* f14-f31 and r18-r31 from the stack frame (presumably saved there */
        /* by the prologue, which is outside this chunk), releases the      */
        /* frame and returns.                                               */
  2954. LL(999):
        /* r3 = 0; with 0 as the base operand, addi loads the immediate. */
  2955. addi r3, 0, 0
        /* Floating-point registers: 8-byte slots at SP + 0 .. SP + 136. */
  2956. lfd f14, 0(SP)
  2957. lfd f15, 8(SP)
  2958. lfd f16, 16(SP)
  2959. lfd f17, 24(SP)
  2960. lfd f18, 32(SP)
  2961. lfd f19, 40(SP)
  2962. lfd f20, 48(SP)
  2963. lfd f21, 56(SP)
  2964. lfd f22, 64(SP)
  2965. lfd f23, 72(SP)
  2966. lfd f24, 80(SP)
  2967. lfd f25, 88(SP)
  2968. lfd f26, 96(SP)
  2969. lfd f27, 104(SP)
  2970. lfd f28, 112(SP)
  2971. lfd f29, 120(SP)
  2972. lfd f30, 128(SP)
  2973. lfd f31, 136(SP)
  2974. #ifdef __64BIT__
        /* 64-bit build: general registers in 8-byte slots from SP + 144. */
  2975. ld r31, 144(SP)
  2976. ld r30, 152(SP)
  2977. ld r29, 160(SP)
  2978. ld r28, 168(SP)
  2979. ld r27, 176(SP)
  2980. ld r26, 184(SP)
  2981. ld r25, 192(SP)
  2982. ld r24, 200(SP)
  2983. ld r23, 208(SP)
  2984. ld r22, 216(SP)
  2985. ld r21, 224(SP)
  2986. ld r20, 232(SP)
  2987. ld r19, 240(SP)
  2988. ld r18, 248(SP)
  2989. #else
        /* 32-bit build: general registers in 4-byte slots from SP + 144. */
  2990. lwz r31, 144(SP)
  2991. lwz r30, 148(SP)
  2992. lwz r29, 152(SP)
  2993. lwz r28, 156(SP)
  2994. lwz r27, 160(SP)
  2995. lwz r26, 164(SP)
  2996. lwz r25, 168(SP)
  2997. lwz r24, 172(SP)
  2998. lwz r23, 176(SP)
  2999. lwz r22, 180(SP)
  3000. lwz r21, 184(SP)
  3001. lwz r20, 188(SP)
  3002. lwz r19, 192(SP)
  3003. lwz r18, 196(SP)
  3004. #endif
        /* Release the STACKSIZE-byte frame and return to the caller. */
  3005. addi SP, SP, STACKSIZE
  3006. blr
        /* EPILOGUE is a macro defined outside this chunk; the #endif below */
        /* closes a conditional that opens before this chunk.               */
  3007. EPILOGUE
  3008. #endif