You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_cell_RT.S 62 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by word size -- lwz when __64BIT__ */
  /* is not defined, ld when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout. STACKSIZE is the byte count the prologue subtracts */
  /* from SP. ALPHA and FZERO are SP-relative scratch slots placed above the */
  /* register save area (FPRs at 0..136, GPRs from 144 upward). */
  /* FZERO: the prologue stores an integer 0 here and the kernel reloads it */
  /* with lfs to obtain a floating-point zero for clearing accumulators. */
  /* ALPHA: no use is visible in this part of the file -- TODO confirm whether */
  /* this kernel needs the slot at all. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Incoming argument registers. M, N and K are always r3, r4 and r5; the */
  /* prologue branches to LL(999) when any of them is <= 0. The remaining */
  /* arguments (A, B, C, LDC, OFFSET) land in different registers depending */
  /* on the OS and word size, as selected below. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* Linux / FreeBSD. In the 64-bit case OFFSET is only a register name: the */
  /* prologue loads its value from the caller's frame (FRAMESLOT(0)). */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple. For 32-bit DOUBLE builds both LDC and OFFSET are loaded */
  /* from the caller's frame by the prologue (FRAMESLOT(0) and FRAMESLOT(1)); */
  /* in every other AIX/Apple configuration only OFFSET is loaded from the */
  /* frame (FRAMESLOT(0)). */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers r18-r31. The prologue saves all of them to the stack */
  /* frame before they are used. Roles as used in the visible kernel code: */
  /* AORIG - base pointer into A for the LN / RT variants (AO is derived */
  /* from it plus a KK-scaled offset) */
  /* TEMP - scratch for address arithmetic and remaining-iteration counts */
  /* KK - running offset counter, initialised from OFFSET and stepped */
  /* per tile according to the LN / LT / RN / RT variant */
  /* I, J - loop counters (I counts row tiles of M; J holds the N & 1 / */
  /* N & 2 tests in the code shown) */
  /* AO, BO - running pointers into the A and B panels */
  /* CO1..CO4 - output pointers into C; only CO1 and CO2 are used in the */
  /* part of the file visible here */
  /* PREA, PREB, PREC - prefetch offsets; PREB and PREC are used as dcbt / */
  /* DCBT index operands, PREA is set but not used in the part shown */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define PREA r29
  99. #define PREB r30
  100. #define PREC r31
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. #ifndef PREFETCHTEST
  201. li PREC, -4 * SIZE
  202. #else
  203. #if defined(linux) || defined(__FreeBSD__)
  204. #ifndef __64BIT__
  205. mr PREA, r10
  206. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  207. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  208. #else
  209. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  210. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  211. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  212. #endif
  213. #endif
  214. #if defined(_AIX) || defined(__APPLE__)
  215. #ifdef __64BIT__
  216. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  217. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  218. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  219. #else
  220. #ifdef DOUBLE
  221. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  222. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  223. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  224. #else
  225. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  226. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  227. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  228. #endif
  229. #endif
  230. #endif
  231. #endif
  232. #ifndef PREFETCHTEST
  233. #ifdef PPC970
  234. #ifdef ALLOC_HUGETLB
  235. li PREA, (16 * 5 * SIZE | 1)
  236. li PREB, (16 * 5 * SIZE | 3)
  237. #else
  238. li PREA, (16 * 14 * SIZE | 1)
  239. li PREB, (16 * 8 * SIZE | 3)
  240. #endif
  241. #endif
  242. #ifdef POWER4
  243. #ifdef ALLOC_HUGETLB
  244. li PREA, (16 * 1 * SIZE + 16)
  245. li PREB, (16 * 1 * SIZE + 16)
  246. #else
  247. li PREA, (16 * 2 * SIZE + 16)
  248. li PREB, (16 * 2 * SIZE + 16)
  249. #endif
  250. #endif
  251. #ifdef POWER5
  252. #ifdef ALLOC_HUGETLB
  253. li PREA, (16 * 7 * SIZE | 1)
  254. li PREB, (16 * 7 * SIZE | 3)
  255. #else
  256. li PREA, (16 * 12 * SIZE | 1)
  257. li PREB, (16 * 6 * SIZE | 3)
  258. #endif
  259. #endif
  260. #ifdef CELL
  261. li PREA, (16 * 12 * SIZE)
  262. li PREB, (16 * 12 * SIZE)
  263. #endif
  264. #endif
  265. lfs f0, FZERO
  266. LL(70):
  267. andi. J, N, 1
  268. ble LL(40)
  269. #ifdef RT
  270. slwi r0, K, 0 + BASE_SHIFT
  271. sub B, B, r0
  272. sub C, C, LDC
  273. #endif
  274. mr CO1, C
  275. #ifdef LN
  276. add KK, M, OFFSET
  277. #endif
  278. #ifdef LT
  279. mr KK, OFFSET
  280. #endif
  281. fmr f1, f0
  282. fmr f2, f0
  283. fmr f3, f0
  284. srawi. I, M, 2
  285. #if defined(LN) || defined(RT)
  286. mr AORIG, A
  287. #else
  288. mr AO, A
  289. #endif
  290. #ifndef RT
  291. add C, CO1, LDC
  292. #endif
  293. ble LL(80)
  294. .align 4
  295. LL(71):
  296. #if defined(LT) || defined(RN)
  297. LFD f16, 0 * SIZE(AO)
  298. LFD f17, 1 * SIZE(AO)
  299. LFD f18, 2 * SIZE(AO)
  300. LFD f19, 3 * SIZE(AO)
  301. LFD f20, 0 * SIZE(B)
  302. LFD f21, 1 * SIZE(B)
  303. LFD f22, 2 * SIZE(B)
  304. LFD f23, 3 * SIZE(B)
  305. dcbt CO1, PREC
  306. srawi. r0, KK, 2
  307. mtspr CTR, r0
  308. mr BO, B
  309. #else
  310. #ifdef LN
  311. slwi r0, K, 2 + BASE_SHIFT
  312. sub AORIG, AORIG, r0
  313. #endif
  314. slwi r0, KK, 2 + BASE_SHIFT
  315. slwi TEMP, KK, 0 + BASE_SHIFT
  316. add AO, AORIG, r0
  317. add BO, B, TEMP
  318. sub TEMP, K, KK
  319. LFD f16, 0 * SIZE(AO)
  320. LFD f17, 1 * SIZE(AO)
  321. LFD f18, 2 * SIZE(AO)
  322. LFD f19, 3 * SIZE(AO)
  323. LFD f20, 0 * SIZE(BO)
  324. LFD f21, 1 * SIZE(BO)
  325. LFD f22, 2 * SIZE(BO)
  326. LFD f23, 3 * SIZE(BO)
  327. dcbt CO1, PREC
  328. srawi. r0, TEMP, 2
  329. mtspr CTR, r0
  330. #endif
  331. ble LL(75)
  332. .align 5
  333. LL(72):
  334. FMADD f0, f16, f20, f0
  335. FMADD f1, f17, f20, f1
  336. FMADD f2, f18, f20, f2
  337. FMADD f3, f19, f20, f3
  338. LFD f16, 4 * SIZE(AO)
  339. LFD f17, 5 * SIZE(AO)
  340. LFD f18, 6 * SIZE(AO)
  341. LFD f19, 7 * SIZE(AO)
  342. FMADD f0, f16, f21, f0
  343. FMADD f1, f17, f21, f1
  344. FMADD f2, f18, f21, f2
  345. FMADD f3, f19, f21, f3
  346. LFD f16, 8 * SIZE(AO)
  347. LFD f17, 9 * SIZE(AO)
  348. LFD f18, 10 * SIZE(AO)
  349. LFD f19, 11 * SIZE(AO)
  350. FMADD f0, f16, f22, f0
  351. FMADD f1, f17, f22, f1
  352. FMADD f2, f18, f22, f2
  353. FMADD f3, f19, f22, f3
  354. LFD f16, 12 * SIZE(AO)
  355. LFD f17, 13 * SIZE(AO)
  356. LFD f18, 14 * SIZE(AO)
  357. LFD f19, 15 * SIZE(AO)
  358. FMADD f0, f16, f23, f0
  359. FMADD f1, f17, f23, f1
  360. FMADD f2, f18, f23, f2
  361. FMADD f3, f19, f23, f3
  362. LFD f16, 16 * SIZE(AO)
  363. LFD f17, 17 * SIZE(AO)
  364. LFD f18, 18 * SIZE(AO)
  365. LFD f19, 19 * SIZE(AO)
  366. LFD f20, 4 * SIZE(BO)
  367. LFD f21, 5 * SIZE(BO)
  368. LFD f22, 6 * SIZE(BO)
  369. LFD f23, 7 * SIZE(BO)
  370. addi AO, AO, 16 * SIZE
  371. addi BO, BO, 4 * SIZE
  372. DCBT(BO, PREB)
  373. bdnz LL(72)
  374. .align 4
  375. LL(75):
  376. #if defined(LT) || defined(RN)
  377. andi. r0, KK, 3
  378. #else
  379. andi. r0, TEMP, 3
  380. #endif
  381. mtspr CTR, r0
  382. ble+ LL(78)
  383. .align 4
  384. LL(76):
  385. FMADD f0, f16, f20, f0
  386. FMADD f1, f17, f20, f1
  387. FMADD f2, f18, f20, f2
  388. FMADD f3, f19, f20, f3
  389. LFD f16, 4 * SIZE(AO)
  390. LFD f17, 5 * SIZE(AO)
  391. LFD f18, 6 * SIZE(AO)
  392. LFD f19, 7 * SIZE(AO)
  393. LFD f20, 1 * SIZE(BO)
  394. addi BO, BO, 1 * SIZE
  395. addi AO, AO, 4 * SIZE
  396. bdnz LL(76)
  397. .align 4
  398. LL(78):
  399. #if defined(LN) || defined(RT)
  400. #ifdef LN
  401. subi r0, KK, 4
  402. #else
  403. subi r0, KK, 1
  404. #endif
  405. slwi TEMP, r0, 2 + BASE_SHIFT
  406. slwi r0, r0, 0 + BASE_SHIFT
  407. add AO, AORIG, TEMP
  408. add BO, B, r0
  409. #endif
  410. #if defined(LN) || defined(LT)
  411. LFD f16, 0 * SIZE(BO)
  412. LFD f20, 1 * SIZE(BO)
  413. LFD f24, 2 * SIZE(BO)
  414. LFD f28, 3 * SIZE(BO)
  415. FSUB f0, f16, f0
  416. FSUB f1, f20, f1
  417. FSUB f2, f24, f2
  418. FSUB f3, f28, f3
  419. #else
  420. LFD f16, 0 * SIZE(AO)
  421. LFD f17, 1 * SIZE(AO)
  422. LFD f18, 2 * SIZE(AO)
  423. LFD f19, 3 * SIZE(AO)
  424. FSUB f0, f16, f0
  425. FSUB f1, f17, f1
  426. FSUB f2, f18, f2
  427. FSUB f3, f19, f3
  428. #endif
  429. #ifdef LN
  430. LFD f16, 15 * SIZE(AO)
  431. LFD f17, 14 * SIZE(AO)
  432. LFD f18, 13 * SIZE(AO)
  433. LFD f19, 12 * SIZE(AO)
  434. FMUL f3, f16, f3
  435. FNMSUB f2, f17, f3, f2
  436. FNMSUB f1, f18, f3, f1
  437. FNMSUB f0, f19, f3, f0
  438. LFD f16, 10 * SIZE(AO)
  439. LFD f17, 9 * SIZE(AO)
  440. LFD f18, 8 * SIZE(AO)
  441. LFD f19, 5 * SIZE(AO)
  442. LFD f20, 4 * SIZE(AO)
  443. LFD f21, 0 * SIZE(AO)
  444. FMUL f2, f16, f2
  445. FNMSUB f1, f17, f2, f1
  446. FNMSUB f0, f18, f2, f0
  447. FMUL f1, f19, f1
  448. FNMSUB f0, f20, f1, f0
  449. FMUL f0, f21, f0
  450. #endif
  451. #ifdef LT
  452. LFD f16, 0 * SIZE(AO)
  453. LFD f17, 1 * SIZE(AO)
  454. LFD f18, 2 * SIZE(AO)
  455. LFD f19, 3 * SIZE(AO)
  456. FMUL f0, f16, f0
  457. FNMSUB f1, f17, f0, f1
  458. FNMSUB f2, f18, f0, f2
  459. FNMSUB f3, f19, f0, f3
  460. LFD f17, 5 * SIZE(AO)
  461. LFD f18, 6 * SIZE(AO)
  462. LFD f19, 7 * SIZE(AO)
  463. FMUL f1, f17, f1
  464. FNMSUB f2, f18, f1, f2
  465. FNMSUB f3, f19, f1, f3
  466. LFD f18, 10 * SIZE(AO)
  467. LFD f19, 11 * SIZE(AO)
  468. FMUL f2, f18, f2
  469. FNMSUB f3, f19, f2, f3
  470. LFD f19, 15 * SIZE(AO)
  471. FMUL f3, f19, f3
  472. #endif
  473. #ifdef RN
  474. LFD f16, 0 * SIZE(BO)
  475. FMUL f0, f16, f0
  476. FMUL f1, f16, f1
  477. FMUL f2, f16, f2
  478. FMUL f3, f16, f3
  479. #endif
  480. #ifdef RT
  481. LFD f21, 0 * SIZE(BO)
  482. FMUL f0, f21, f0
  483. FMUL f1, f21, f1
  484. FMUL f2, f21, f2
  485. FMUL f3, f21, f3
  486. #endif
  487. #ifdef LN
  488. subi CO1, CO1, 4 * SIZE
  489. #endif
  490. #if defined(LN) || defined(LT)
  491. STFD f0, 0 * SIZE(BO)
  492. STFD f1, 1 * SIZE(BO)
  493. STFD f2, 2 * SIZE(BO)
  494. STFD f3, 3 * SIZE(BO)
  495. #else
  496. STFD f0, 0 * SIZE(AO)
  497. STFD f1, 1 * SIZE(AO)
  498. STFD f2, 2 * SIZE(AO)
  499. STFD f3, 3 * SIZE(AO)
  500. #endif
  501. STFD f0, 0 * SIZE(CO1)
  502. STFD f1, 1 * SIZE(CO1)
  503. STFD f2, 2 * SIZE(CO1)
  504. STFD f3, 3 * SIZE(CO1)
  505. lfs f0, FZERO
  506. fmr f1, f0
  507. fmr f2, f0
  508. fmr f3, f0
  509. #ifndef LN
  510. addi CO1, CO1, 4 * SIZE
  511. #endif
  512. #ifdef RT
  513. slwi r0, K, 2 + BASE_SHIFT
  514. add AORIG, AORIG, r0
  515. #endif
  516. #if defined(LT) || defined(RN)
  517. sub TEMP, K, KK
  518. slwi r0, TEMP, 2 + BASE_SHIFT
  519. slwi TEMP, TEMP, 0 + BASE_SHIFT
  520. add AO, AO, r0
  521. add BO, BO, TEMP
  522. #endif
  523. #ifdef LN
  524. subi KK, KK, 4
  525. #endif
  526. #ifdef LT
  527. addi KK, KK, 4
  528. #endif
  529. addic. I, I, -1
  530. bgt+ LL(71)
  531. .align 4
  532. LL(80):
  533. andi. I, M, 2
  534. ble LL(90)
  535. #if defined(LT) || defined(RN)
  536. LFD f16, 0 * SIZE(AO)
  537. LFD f17, 1 * SIZE(AO)
  538. LFD f18, 2 * SIZE(AO)
  539. LFD f19, 3 * SIZE(AO)
  540. LFD f20, 0 * SIZE(B)
  541. LFD f21, 1 * SIZE(B)
  542. LFD f22, 2 * SIZE(B)
  543. LFD f23, 3 * SIZE(B)
  544. srawi. r0, KK, 2
  545. mtspr CTR, r0
  546. mr BO, B
  547. #else
  548. #ifdef LN
  549. slwi r0, K, 1 + BASE_SHIFT
  550. sub AORIG, AORIG, r0
  551. #endif
  552. slwi r0, KK, 1 + BASE_SHIFT
  553. slwi TEMP, KK, 0 + BASE_SHIFT
  554. add AO, AORIG, r0
  555. add BO, B, TEMP
  556. sub TEMP, K, KK
  557. LFD f16, 0 * SIZE(AO)
  558. LFD f17, 1 * SIZE(AO)
  559. LFD f18, 2 * SIZE(AO)
  560. LFD f19, 3 * SIZE(AO)
  561. LFD f20, 0 * SIZE(BO)
  562. LFD f21, 1 * SIZE(BO)
  563. LFD f22, 2 * SIZE(BO)
  564. LFD f23, 3 * SIZE(BO)
  565. srawi. r0, TEMP, 2
  566. mtspr CTR, r0
  567. #endif
  568. ble LL(85)
  569. .align 5
  570. LL(82):
  571. FMADD f0, f16, f20, f0
  572. FMADD f1, f17, f20, f1
  573. FMADD f2, f18, f21, f2
  574. FMADD f3, f19, f21, f3
  575. LFD f16, 4 * SIZE(AO)
  576. LFD f17, 5 * SIZE(AO)
  577. LFD f18, 6 * SIZE(AO)
  578. LFD f19, 7 * SIZE(AO)
  579. FMADD f0, f16, f22, f0
  580. FMADD f1, f17, f22, f1
  581. FMADD f2, f18, f23, f2
  582. FMADD f3, f19, f23, f3
  583. LFD f16, 8 * SIZE(AO)
  584. LFD f17, 9 * SIZE(AO)
  585. LFD f18, 10 * SIZE(AO)
  586. LFD f19, 11 * SIZE(AO)
  587. LFD f20, 4 * SIZE(BO)
  588. LFD f21, 5 * SIZE(BO)
  589. LFD f22, 6 * SIZE(BO)
  590. LFD f23, 7 * SIZE(BO)
  591. addi AO, AO, 8 * SIZE
  592. addi BO, BO, 4 * SIZE
  593. DCBT(BO, PREB)
  594. bdnz LL(82)
  595. .align 4
  596. LL(85):
  597. #if defined(LT) || defined(RN)
  598. andi. r0, KK, 3
  599. #else
  600. andi. r0, TEMP, 3
  601. #endif
  602. mtspr CTR, r0
  603. ble+ LL(88)
  604. .align 4
  605. LL(86):
  606. FMADD f0, f16, f20, f0
  607. FMADD f1, f17, f20, f1
  608. LFD f16, 2 * SIZE(AO)
  609. LFD f17, 3 * SIZE(AO)
  610. LFD f20, 1 * SIZE(BO)
  611. addi BO, BO, 1 * SIZE
  612. addi AO, AO, 2 * SIZE
  613. bdnz LL(86)
  614. .align 4
  615. LL(88):
  616. FADD f0, f2, f0
  617. FADD f1, f3, f1
  618. #if defined(LN) || defined(RT)
  619. #ifdef LN
  620. subi r0, KK, 2
  621. #else
  622. subi r0, KK, 1
  623. #endif
  624. slwi TEMP, r0, 1 + BASE_SHIFT
  625. slwi r0, r0, 0 + BASE_SHIFT
  626. add AO, AORIG, TEMP
  627. add BO, B, r0
  628. #endif
  629. #if defined(LN) || defined(LT)
  630. LFD f16, 0 * SIZE(BO)
  631. LFD f20, 1 * SIZE(BO)
  632. FSUB f0, f16, f0
  633. FSUB f1, f20, f1
  634. #else
  635. LFD f16, 0 * SIZE(AO)
  636. LFD f17, 1 * SIZE(AO)
  637. FSUB f0, f16, f0
  638. FSUB f1, f17, f1
  639. #endif
  640. #ifdef LN
  641. LFD f19, 3 * SIZE(AO)
  642. LFD f20, 2 * SIZE(AO)
  643. LFD f21, 0 * SIZE(AO)
  644. FMUL f1, f19, f1
  645. FNMSUB f0, f20, f1, f0
  646. FMUL f0, f21, f0
  647. #endif
  648. #ifdef LT
  649. LFD f16, 0 * SIZE(AO)
  650. LFD f17, 1 * SIZE(AO)
  651. FMUL f0, f16, f0
  652. FNMSUB f1, f17, f0, f1
  653. LFD f17, 3 * SIZE(AO)
  654. FMUL f1, f17, f1
  655. #endif
  656. #ifdef RN
  657. LFD f16, 0 * SIZE(BO)
  658. FMUL f0, f16, f0
  659. FMUL f1, f16, f1
  660. #endif
  661. #ifdef RT
  662. LFD f21, 0 * SIZE(BO)
  663. FMUL f0, f21, f0
  664. FMUL f1, f21, f1
  665. #endif
  666. #ifdef LN
  667. subi CO1, CO1, 2 * SIZE
  668. #endif
  669. #if defined(LN) || defined(LT)
  670. STFD f0, 0 * SIZE(BO)
  671. STFD f1, 1 * SIZE(BO)
  672. #else
  673. STFD f0, 0 * SIZE(AO)
  674. STFD f1, 1 * SIZE(AO)
  675. #endif
  676. STFD f0, 0 * SIZE(CO1)
  677. STFD f1, 1 * SIZE(CO1)
  678. lfs f0, FZERO
  679. fmr f1, f0
  680. fmr f2, f0
  681. fmr f3, f0
  682. #ifndef LN
  683. addi CO1, CO1, 2 * SIZE
  684. #endif
  685. #ifdef RT
  686. slwi r0, K, 1 + BASE_SHIFT
  687. add AORIG, AORIG, r0
  688. #endif
  689. #if defined(LT) || defined(RN)
  690. sub TEMP, K, KK
  691. slwi r0, TEMP, 1 + BASE_SHIFT
  692. slwi TEMP, TEMP, 0 + BASE_SHIFT
  693. add AO, AO, r0
  694. add BO, BO, TEMP
  695. #endif
  696. #ifdef LN
  697. subi KK, KK, 2
  698. #endif
  699. #ifdef LT
  700. addi KK, KK, 2
  701. #endif
  702. .align 4
  703. LL(90):
  704. andi. I, M, 1
  705. ble LL(99)
  706. #if defined(LT) || defined(RN)
  707. LFD f16, 0 * SIZE(AO)
  708. LFD f17, 1 * SIZE(AO)
  709. LFD f18, 2 * SIZE(AO)
  710. LFD f19, 3 * SIZE(AO)
  711. LFD f20, 0 * SIZE(B)
  712. LFD f21, 1 * SIZE(B)
  713. LFD f22, 2 * SIZE(B)
  714. LFD f23, 3 * SIZE(B)
  715. srawi. r0, KK, 3
  716. mtspr CTR, r0
  717. mr BO, B
  718. #else
  719. #ifdef LN
  720. slwi r0, K, BASE_SHIFT
  721. sub AORIG, AORIG, r0
  722. #endif
  723. slwi r0, KK, 0 + BASE_SHIFT
  724. slwi TEMP, KK, 0 + BASE_SHIFT
  725. add AO, AORIG, r0
  726. add BO, B, TEMP
  727. sub TEMP, K, KK
  728. LFD f16, 0 * SIZE(AO)
  729. LFD f17, 1 * SIZE(AO)
  730. LFD f18, 2 * SIZE(AO)
  731. LFD f19, 3 * SIZE(AO)
  732. LFD f20, 0 * SIZE(BO)
  733. LFD f21, 1 * SIZE(BO)
  734. LFD f22, 2 * SIZE(BO)
  735. LFD f23, 3 * SIZE(BO)
  736. srawi. r0, TEMP, 3
  737. mtspr CTR, r0
  738. #endif
  739. ble LL(95)
  740. .align 5
  741. LL(92):
  742. FMADD f0, f16, f20, f0
  743. FMADD f1, f17, f21, f1
  744. FMADD f2, f18, f22, f2
  745. FMADD f3, f19, f23, f3
  746. LFD f16, 4 * SIZE(AO)
  747. LFD f17, 5 * SIZE(AO)
  748. LFD f18, 6 * SIZE(AO)
  749. LFD f19, 7 * SIZE(AO)
  750. LFD f20, 4 * SIZE(BO)
  751. LFD f21, 5 * SIZE(BO)
  752. LFD f22, 6 * SIZE(BO)
  753. LFD f23, 7 * SIZE(BO)
  754. FMADD f0, f16, f20, f0
  755. FMADD f1, f17, f21, f1
  756. FMADD f2, f18, f22, f2
  757. FMADD f3, f19, f23, f3
  758. LFD f16, 8 * SIZE(AO)
  759. LFD f17, 9 * SIZE(AO)
  760. LFD f18, 10 * SIZE(AO)
  761. LFD f19, 11 * SIZE(AO)
  762. LFD f20, 8 * SIZE(BO)
  763. LFD f21, 9 * SIZE(BO)
  764. LFD f22, 10 * SIZE(BO)
  765. LFD f23, 11 * SIZE(BO)
  766. addi AO, AO, 8 * SIZE
  767. addi BO, BO, 8 * SIZE
  768. bdnz LL(92)
  769. .align 4
  770. LL(95):
  771. #if defined(LT) || defined(RN)
  772. andi. r0, KK, 7
  773. #else
  774. andi. r0, TEMP, 7
  775. #endif
  776. mtspr CTR, r0
  777. ble+ LL(98)
  778. .align 4
  779. LL(96):
  780. FMADD f0, f16, f20, f0
  781. LFD f16, 1 * SIZE(AO)
  782. LFD f20, 1 * SIZE(BO)
  783. addi BO, BO, 1 * SIZE
  784. addi AO, AO, 1 * SIZE
  785. bdnz LL(96)
  786. .align 4
  787. LL(98):
  788. FADD f0, f1, f0
  789. FADD f2, f3, f2
  790. FADD f0, f2, f0
  791. #if defined(LN) || defined(RT)
  792. #ifdef LN
  793. subi r0, KK, 1
  794. #else
  795. subi r0, KK, 1
  796. #endif
  797. slwi TEMP, r0, 0 + BASE_SHIFT
  798. slwi r0, r0, 0 + BASE_SHIFT
  799. add AO, AORIG, TEMP
  800. add BO, B, r0
  801. #endif
  802. #if defined(LN) || defined(LT)
  803. LFD f16, 0 * SIZE(BO)
  804. FSUB f0, f16, f0
  805. #else
  806. LFD f16, 0 * SIZE(AO)
  807. FSUB f0, f16, f0
  808. #endif
  809. #ifdef LN
  810. LFD f21, 0 * SIZE(AO)
  811. FMUL f0, f21, f0
  812. #endif
  813. #ifdef LT
  814. LFD f16, 0 * SIZE(AO)
  815. FMUL f0, f16, f0
  816. #endif
  817. #ifdef RN
  818. LFD f16, 0 * SIZE(BO)
  819. FMUL f0, f16, f0
  820. #endif
  821. #ifdef RT
  822. LFD f21, 0 * SIZE(BO)
  823. FMUL f0, f21, f0
  824. #endif
  825. #ifdef LN
  826. subi CO1, CO1, 1 * SIZE
  827. #endif
  828. #if defined(LN) || defined(LT)
  829. STFD f0, 0 * SIZE(BO)
  830. #else
  831. STFD f0, 0 * SIZE(AO)
  832. #endif
  833. STFD f0, 0 * SIZE(CO1)
  834. lfs f0, FZERO
  835. #ifndef LN
  836. addi CO1, CO1, 1 * SIZE
  837. #endif
  838. #ifdef RT
  839. slwi r0, K, 0 + BASE_SHIFT
  840. add AORIG, AORIG, r0
  841. #endif
  842. #if defined(LT) || defined(RN)
  843. sub TEMP, K, KK
  844. slwi r0, TEMP, 0 + BASE_SHIFT
  845. slwi TEMP, TEMP, 0 + BASE_SHIFT
  846. add AO, AO, r0
  847. add BO, BO, TEMP
  848. #endif
  849. #ifdef LN
  850. subi KK, KK, 1
  851. #endif
  852. #ifdef LT
  853. addi KK, KK, 1
  854. #endif
  855. .align 4
  856. LL(99):
  857. #ifdef LN
  858. slwi r0, K, 0 + BASE_SHIFT
  859. add B, B, r0
  860. #endif
  861. #if defined(LT) || defined(RN)
  862. mr B, BO
  863. #endif
  864. #ifdef RN
  865. addi KK, KK, 1
  866. #endif
  867. #ifdef RT
  868. subi KK, KK, 1
  869. #endif
  870. .align 4
  871. LL(40):
  872. andi. J, N, 2
  873. ble LL(09)
  874. #ifdef RT
  875. slwi r0, K, 1 + BASE_SHIFT
  876. sub B, B, r0
  877. slwi r0, LDC, 1
  878. sub C, C, r0
  879. #endif
  880. mr CO1, C
  881. add CO2, C, LDC
  882. #ifdef LN
  883. add KK, M, OFFSET
  884. #endif
  885. #ifdef LT
  886. mr KK, OFFSET
  887. #endif
  888. fmr f1, f0
  889. fmr f2, f0
  890. fmr f3, f0
  891. fmr f4, f0
  892. fmr f5, f0
  893. fmr f6, f0
  894. fmr f7, f0
  895. srawi. I, M, 2
  896. #if defined(LN) || defined(RT)
  897. mr AORIG, A
  898. #else
  899. mr AO, A
  900. #endif
  901. #ifndef RT
  902. add C, CO2, LDC
  903. #endif
  904. ble LL(50)
  905. .align 4
  906. LL(41):
  907. #if defined(LT) || defined(RN)
  908. LFD f16, 0 * SIZE(AO)
  909. LFD f17, 1 * SIZE(AO)
  910. LFD f18, 2 * SIZE(AO)
  911. LFD f19, 3 * SIZE(AO)
  912. LFD f20, 0 * SIZE(B)
  913. LFD f21, 1 * SIZE(B)
  914. LFD f22, 2 * SIZE(B)
  915. LFD f23, 3 * SIZE(B)
  916. dcbt CO1, PREC
  917. dcbt CO2, PREC
  918. srawi. r0, KK, 2
  919. mtspr CTR, r0
  920. mr BO, B
  921. #else
  922. #ifdef LN
  923. slwi r0, K, 2 + BASE_SHIFT
  924. sub AORIG, AORIG, r0
  925. #endif
  926. slwi r0, KK, 2 + BASE_SHIFT
  927. slwi TEMP, KK, 1 + BASE_SHIFT
  928. add AO, AORIG, r0
  929. add BO, B, TEMP
  930. sub TEMP, K, KK
  931. LFD f16, 0 * SIZE(AO)
  932. LFD f17, 1 * SIZE(AO)
  933. LFD f18, 2 * SIZE(AO)
  934. LFD f19, 3 * SIZE(AO)
  935. LFD f20, 0 * SIZE(BO)
  936. LFD f21, 1 * SIZE(BO)
  937. LFD f22, 2 * SIZE(BO)
  938. LFD f23, 3 * SIZE(BO)
  939. dcbt CO1, PREC
  940. dcbt CO2, PREC
  941. srawi. r0, TEMP, 2
  942. mtspr CTR, r0
  943. #endif
  944. ble LL(45)
  945. .align 5
  946. LL(42):
  947. FMADD f0, f16, f20, f0
  948. FMADD f1, f17, f20, f1
  949. FMADD f2, f18, f20, f2
  950. FMADD f3, f19, f20, f3
  951. FMADD f4, f16, f21, f4
  952. FMADD f5, f17, f21, f5
  953. FMADD f6, f18, f21, f6
  954. FMADD f7, f19, f21, f7
  955. LFD f16, 4 * SIZE(AO)
  956. LFD f17, 5 * SIZE(AO)
  957. LFD f18, 6 * SIZE(AO)
  958. LFD f19, 7 * SIZE(AO)
  959. FMADD f0, f16, f22, f0
  960. FMADD f1, f17, f22, f1
  961. FMADD f2, f18, f22, f2
  962. FMADD f3, f19, f22, f3
  963. FMADD f4, f16, f23, f4
  964. FMADD f5, f17, f23, f5
  965. FMADD f6, f18, f23, f6
  966. FMADD f7, f19, f23, f7
  967. LFD f16, 8 * SIZE(AO)
  968. LFD f17, 9 * SIZE(AO)
  969. LFD f18, 10 * SIZE(AO)
  970. LFD f19, 11 * SIZE(AO)
  971. LFD f20, 4 * SIZE(BO)
  972. LFD f21, 5 * SIZE(BO)
  973. LFD f22, 6 * SIZE(BO)
  974. LFD f23, 7 * SIZE(BO)
  975. FMADD f0, f16, f20, f0
  976. FMADD f1, f17, f20, f1
  977. FMADD f2, f18, f20, f2
  978. FMADD f3, f19, f20, f3
  979. FMADD f4, f16, f21, f4
  980. FMADD f5, f17, f21, f5
  981. FMADD f6, f18, f21, f6
  982. FMADD f7, f19, f21, f7
  983. LFD f16, 12 * SIZE(AO)
  984. LFD f17, 13 * SIZE(AO)
  985. LFD f18, 14 * SIZE(AO)
  986. LFD f19, 15 * SIZE(AO)
  987. FMADD f0, f16, f22, f0
  988. FMADD f1, f17, f22, f1
  989. FMADD f2, f18, f22, f2
  990. FMADD f3, f19, f22, f3
  991. FMADD f4, f16, f23, f4
  992. FMADD f5, f17, f23, f5
  993. FMADD f6, f18, f23, f6
  994. FMADD f7, f19, f23, f7
  995. LFD f16, 16 * SIZE(AO)
  996. LFD f17, 17 * SIZE(AO)
  997. LFD f18, 18 * SIZE(AO)
  998. LFD f19, 19 * SIZE(AO)
  999. LFD f20, 8 * SIZE(BO)
  1000. LFD f21, 9 * SIZE(BO)
  1001. LFD f22, 10 * SIZE(BO)
  1002. LFD f23, 11 * SIZE(BO)
  1003. addi AO, AO, 16 * SIZE
  1004. addi BO, BO, 8 * SIZE
  1005. DCBT(BO, PREB)
  1006. bdnz LL(42)
  1007. .align 4
  1008. LL(45):
  1009. #if defined(LT) || defined(RN)
  1010. andi. r0, KK, 3
  1011. #else
  1012. andi. r0, TEMP, 3
  1013. #endif
  1014. mtspr CTR, r0
  1015. ble+ LL(48)
  1016. .align 4
  1017. LL(46):
  1018. FMADD f0, f16, f20, f0
  1019. FMADD f1, f17, f20, f1
  1020. FMADD f2, f18, f20, f2
  1021. FMADD f3, f19, f20, f3
  1022. FMADD f4, f16, f21, f4
  1023. FMADD f5, f17, f21, f5
  1024. FMADD f6, f18, f21, f6
  1025. FMADD f7, f19, f21, f7
  1026. LFD f16, 4 * SIZE(AO)
  1027. LFD f17, 5 * SIZE(AO)
  1028. LFD f18, 6 * SIZE(AO)
  1029. LFD f19, 7 * SIZE(AO)
  1030. LFD f20, 2 * SIZE(BO)
  1031. LFD f21, 3 * SIZE(BO)
  1032. addi BO, BO, 2 * SIZE
  1033. addi AO, AO, 4 * SIZE
  1034. bdnz LL(46)
  1035. .align 4
  1036. LL(48):
  1037. #if defined(LN) || defined(RT)
  1038. #ifdef LN
  1039. subi r0, KK, 4
  1040. #else
  1041. subi r0, KK, 2
  1042. #endif
  1043. slwi TEMP, r0, 2 + BASE_SHIFT
  1044. slwi r0, r0, 1 + BASE_SHIFT
  1045. add AO, AORIG, TEMP
  1046. add BO, B, r0
  1047. #endif
  1048. #if defined(LN) || defined(LT)
  1049. LFD f16, 0 * SIZE(BO)
  1050. LFD f17, 1 * SIZE(BO)
  1051. LFD f20, 2 * SIZE(BO)
  1052. LFD f21, 3 * SIZE(BO)
  1053. LFD f24, 4 * SIZE(BO)
  1054. LFD f25, 5 * SIZE(BO)
  1055. LFD f28, 6 * SIZE(BO)
  1056. LFD f29, 7 * SIZE(BO)
  1057. FSUB f0, f16, f0
  1058. FSUB f4, f17, f4
  1059. FSUB f1, f20, f1
  1060. FSUB f5, f21, f5
  1061. FSUB f2, f24, f2
  1062. FSUB f6, f25, f6
  1063. FSUB f3, f28, f3
  1064. FSUB f7, f29, f7
  1065. #else
  1066. LFD f16, 0 * SIZE(AO)
  1067. LFD f17, 1 * SIZE(AO)
  1068. LFD f18, 2 * SIZE(AO)
  1069. LFD f19, 3 * SIZE(AO)
  1070. LFD f20, 4 * SIZE(AO)
  1071. LFD f21, 5 * SIZE(AO)
  1072. LFD f22, 6 * SIZE(AO)
  1073. LFD f23, 7 * SIZE(AO)
  1074. FSUB f0, f16, f0
  1075. FSUB f1, f17, f1
  1076. FSUB f2, f18, f2
  1077. FSUB f3, f19, f3
  1078. FSUB f4, f20, f4
  1079. FSUB f5, f21, f5
  1080. FSUB f6, f22, f6
  1081. FSUB f7, f23, f7
  1082. #endif
  1083. #ifdef LN
  1084. LFD f16, 15 * SIZE(AO)
  1085. LFD f17, 14 * SIZE(AO)
  1086. LFD f18, 13 * SIZE(AO)
  1087. LFD f19, 12 * SIZE(AO)
  1088. FMUL f3, f16, f3
  1089. FMUL f7, f16, f7
  1090. FNMSUB f2, f17, f3, f2
  1091. FNMSUB f6, f17, f7, f6
  1092. FNMSUB f1, f18, f3, f1
  1093. FNMSUB f5, f18, f7, f5
  1094. FNMSUB f0, f19, f3, f0
  1095. FNMSUB f4, f19, f7, f4
  1096. LFD f16, 10 * SIZE(AO)
  1097. LFD f17, 9 * SIZE(AO)
  1098. LFD f18, 8 * SIZE(AO)
  1099. LFD f19, 5 * SIZE(AO)
  1100. LFD f20, 4 * SIZE(AO)
  1101. LFD f21, 0 * SIZE(AO)
  1102. FMUL f2, f16, f2
  1103. FMUL f6, f16, f6
  1104. FNMSUB f1, f17, f2, f1
  1105. FNMSUB f5, f17, f6, f5
  1106. FNMSUB f0, f18, f2, f0
  1107. FNMSUB f4, f18, f6, f4
  1108. FMUL f1, f19, f1
  1109. FMUL f5, f19, f5
  1110. FNMSUB f0, f20, f1, f0
  1111. FNMSUB f4, f20, f5, f4
  1112. FMUL f0, f21, f0
  1113. FMUL f4, f21, f4
  1114. #endif
  1115. #ifdef LT
  1116. LFD f16, 0 * SIZE(AO)
  1117. LFD f17, 1 * SIZE(AO)
  1118. LFD f18, 2 * SIZE(AO)
  1119. LFD f19, 3 * SIZE(AO)
  1120. FMUL f0, f16, f0
  1121. FMUL f4, f16, f4
  1122. FNMSUB f1, f17, f0, f1
  1123. FNMSUB f5, f17, f4, f5
  1124. FNMSUB f2, f18, f0, f2
  1125. FNMSUB f6, f18, f4, f6
  1126. FNMSUB f3, f19, f0, f3
  1127. FNMSUB f7, f19, f4, f7
  1128. LFD f17, 5 * SIZE(AO)
  1129. LFD f18, 6 * SIZE(AO)
  1130. LFD f19, 7 * SIZE(AO)
  1131. FMUL f1, f17, f1
  1132. FMUL f5, f17, f5
  1133. FNMSUB f2, f18, f1, f2
  1134. FNMSUB f6, f18, f5, f6
  1135. FNMSUB f3, f19, f1, f3
  1136. FNMSUB f7, f19, f5, f7
  1137. LFD f18, 10 * SIZE(AO)
  1138. LFD f19, 11 * SIZE(AO)
  1139. FMUL f2, f18, f2
  1140. FMUL f6, f18, f6
  1141. FNMSUB f3, f19, f2, f3
  1142. FNMSUB f7, f19, f6, f7
  1143. LFD f19, 15 * SIZE(AO)
  1144. FMUL f3, f19, f3
  1145. FMUL f7, f19, f7
  1146. #endif
  1147. #ifdef RN
  1148. LFD f16, 0 * SIZE(BO)
  1149. LFD f17, 1 * SIZE(BO)
  1150. LFD f18, 3 * SIZE(BO)
  1151. FMUL f0, f16, f0
  1152. FMUL f1, f16, f1
  1153. FMUL f2, f16, f2
  1154. FMUL f3, f16, f3
  1155. FNMSUB f4, f17, f0, f4
  1156. FNMSUB f5, f17, f1, f5
  1157. FNMSUB f6, f17, f2, f6
  1158. FNMSUB f7, f17, f3, f7
  1159. FMUL f4, f18, f4
  1160. FMUL f5, f18, f5
  1161. FMUL f6, f18, f6
  1162. FMUL f7, f18, f7
  1163. #endif
  1164. #ifdef RT
  1165. LFD f19, 3 * SIZE(BO)
  1166. LFD f20, 2 * SIZE(BO)
  1167. LFD f21, 0 * SIZE(BO)
  1168. FMUL f4, f19, f4
  1169. FMUL f5, f19, f5
  1170. FMUL f6, f19, f6
  1171. FMUL f7, f19, f7
  1172. FNMSUB f0, f20, f4, f0
  1173. FNMSUB f1, f20, f5, f1
  1174. FNMSUB f2, f20, f6, f2
  1175. FNMSUB f3, f20, f7, f3
  1176. FMUL f0, f21, f0
  1177. FMUL f1, f21, f1
  1178. FMUL f2, f21, f2
  1179. FMUL f3, f21, f3
  1180. #endif
  1181. #ifdef LN
  1182. subi CO1, CO1, 4 * SIZE
  1183. subi CO2, CO2, 4 * SIZE
  1184. #endif
  1185. #if defined(LN) || defined(LT)
  1186. STFD f0, 0 * SIZE(BO)
  1187. STFD f4, 1 * SIZE(BO)
  1188. STFD f1, 2 * SIZE(BO)
  1189. STFD f5, 3 * SIZE(BO)
  1190. STFD f2, 4 * SIZE(BO)
  1191. STFD f6, 5 * SIZE(BO)
  1192. STFD f3, 6 * SIZE(BO)
  1193. STFD f7, 7 * SIZE(BO)
  1194. #else
  1195. STFD f0, 0 * SIZE(AO)
  1196. STFD f1, 1 * SIZE(AO)
  1197. STFD f2, 2 * SIZE(AO)
  1198. STFD f3, 3 * SIZE(AO)
  1199. STFD f4, 4 * SIZE(AO)
  1200. STFD f5, 5 * SIZE(AO)
  1201. STFD f6, 6 * SIZE(AO)
  1202. STFD f7, 7 * SIZE(AO)
  1203. #endif
  1204. STFD f0, 0 * SIZE(CO1)
  1205. STFD f1, 1 * SIZE(CO1)
  1206. STFD f2, 2 * SIZE(CO1)
  1207. STFD f3, 3 * SIZE(CO1)
  1208. STFD f4, 0 * SIZE(CO2)
  1209. STFD f5, 1 * SIZE(CO2)
  1210. STFD f6, 2 * SIZE(CO2)
  1211. STFD f7, 3 * SIZE(CO2)
  1212. lfs f0, FZERO
  1213. fmr f1, f0
  1214. fmr f2, f0
  1215. fmr f3, f0
  1216. fmr f4, f0
  1217. fmr f5, f0
  1218. fmr f6, f0
  1219. fmr f7, f0
  1220. #ifndef LN
  1221. addi CO1, CO1, 4 * SIZE
  1222. addi CO2, CO2, 4 * SIZE
  1223. #endif
  1224. #ifdef RT
  1225. slwi r0, K, 2 + BASE_SHIFT
  1226. add AORIG, AORIG, r0
  1227. #endif
  1228. #if defined(LT) || defined(RN)
  1229. sub TEMP, K, KK
  1230. slwi r0, TEMP, 2 + BASE_SHIFT
  1231. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1232. add AO, AO, r0
  1233. add BO, BO, TEMP
  1234. #endif
  1235. #ifdef LN
  1236. subi KK, KK, 4
  1237. #endif
  1238. #ifdef LT
  1239. addi KK, KK, 4
  1240. #endif
  1241. addic. I, I, -1
  1242. bgt+ LL(41)
  1243. .align 4
  /* LL(50): set-up for the 2-wide tile of the two-column panel (results go to CO1 and CO2 at LL(58)). */
  /* Skipped entirely when bit 1 of M is clear (andi. gives zero, so ble is taken). */
  /* Preloads f16-f19 from AO and f20-f27 from B/BO, then loads CTR with the number of */
  /* 4-step passes for LL(52). LFD is a macro defined outside this view. */
  1244. LL(50):
  1245. andi. I, M, 2
  1246. ble LL(60)
  1247. #if defined(LT) || defined(RN)
  /* LT/RN: the product runs over the first KK steps, reading B from its start. */
  1248. LFD f16, 0 * SIZE(AO)
  1249. LFD f17, 1 * SIZE(AO)
  1250. LFD f18, 2 * SIZE(AO)
  1251. LFD f19, 3 * SIZE(AO)
  1252. LFD f20, 0 * SIZE(B)
  1253. LFD f21, 1 * SIZE(B)
  1254. LFD f22, 2 * SIZE(B)
  1255. LFD f23, 3 * SIZE(B)
  1256. LFD f24, 4 * SIZE(B)
  1257. LFD f25, 5 * SIZE(B)
  1258. LFD f26, 6 * SIZE(B)
  1259. LFD f27, 7 * SIZE(B)
  /* r0 = KK >> 2; the CR0 result of srawi. is what the "ble LL(55)" below tests. */
  1260. srawi. r0, KK, 2
  1261. mtspr CTR, r0
  1262. mr BO, B
  1263. #else
  1264. #ifdef LN
  /* LN: move AORIG back by K << (1 + BASE_SHIFT) before addressing this tile. */
  1265. slwi r0, K, 1 + BASE_SHIFT
  1266. sub AORIG, AORIG, r0
  1267. #endif
  /* LN/RT: start at offset KK in both operands and run over the remaining K - KK steps. */
  1268. slwi r0, KK, 1 + BASE_SHIFT
  1269. slwi TEMP, KK, 1 + BASE_SHIFT
  1270. add AO, AORIG, r0
  1271. add BO, B, TEMP
  1272. sub TEMP, K, KK
  1273. LFD f16, 0 * SIZE(AO)
  1274. LFD f17, 1 * SIZE(AO)
  1275. LFD f18, 2 * SIZE(AO)
  1276. LFD f19, 3 * SIZE(AO)
  1277. LFD f20, 0 * SIZE(BO)
  1278. LFD f21, 1 * SIZE(BO)
  1279. LFD f22, 2 * SIZE(BO)
  1280. LFD f23, 3 * SIZE(BO)
  1281. LFD f24, 4 * SIZE(BO)
  1282. LFD f25, 5 * SIZE(BO)
  1283. LFD f26, 6 * SIZE(BO)
  1284. LFD f27, 7 * SIZE(BO)
  /* r0 = (K - KK) >> 2; CR0 from srawi. feeds the branch below. */
  1285. srawi. r0, TEMP, 2
  1286. mtspr CTR, r0
  1287. #endif
  /* No full 4-step pass to run: go straight to the remainder handling. */
  1288. ble LL(55)
  1289. .align 5
  /* LL(52): main accumulation loop of the 2x2 tile; one pass covers four steps */
  /* (AO and BO both advance by 8 * SIZE). Two accumulator sets are used: f0-f3 and f4-f7; */
  /* they are added together at LL(58). Loads for the next pass are issued before the pointers move, */
  /* so the registers are already primed when the loop repeats or falls through to LL(55). */
  1290. LL(52):
  /* step 0 -> f0-f3, step 1 -> f4-f7 */
  1291. FMADD f0, f16, f20, f0
  1292. FMADD f1, f17, f20, f1
  1293. FMADD f2, f16, f21, f2
  1294. FMADD f3, f17, f21, f3
  1295. FMADD f4, f18, f22, f4
  1296. FMADD f5, f19, f22, f5
  1297. FMADD f6, f18, f23, f6
  1298. FMADD f7, f19, f23, f7
  1299. LFD f16, 4 * SIZE(AO)
  1300. LFD f17, 5 * SIZE(AO)
  1301. LFD f18, 6 * SIZE(AO)
  1302. LFD f19, 7 * SIZE(AO)
  /* f20-f23 are refilled with the values the next pass (or the tail loop) starts from. */
  1303. LFD f20, 8 * SIZE(BO)
  1304. LFD f21, 9 * SIZE(BO)
  1305. LFD f22, 10 * SIZE(BO)
  1306. LFD f23, 11 * SIZE(BO)
  /* step 2 -> f0-f3, step 3 -> f4-f7, using the f24-f27 values loaded earlier */
  1307. FMADD f0, f16, f24, f0
  1308. FMADD f1, f17, f24, f1
  1309. FMADD f2, f16, f25, f2
  1310. FMADD f3, f17, f25, f3
  1311. FMADD f4, f18, f26, f4
  1312. FMADD f5, f19, f26, f5
  1313. FMADD f6, f18, f27, f6
  1314. FMADD f7, f19, f27, f7
  1315. LFD f16, 8 * SIZE(AO)
  1316. LFD f17, 9 * SIZE(AO)
  1317. LFD f18, 10 * SIZE(AO)
  1318. LFD f19, 11 * SIZE(AO)
  1319. LFD f24, 12 * SIZE(BO)
  1320. LFD f25, 13 * SIZE(BO)
  1321. LFD f26, 14 * SIZE(BO)
  1322. LFD f27, 15 * SIZE(BO)
  1323. addi AO, AO, 8 * SIZE
  1324. addi BO, BO, 8 * SIZE
  /* DCBT is a macro defined outside this view; presumably a data-cache prefetch hint. */
  1325. DCBT(BO, PREB)
  /* Decrement CTR and repeat while it is non-zero. */
  1326. bdnz LL(52)
  1327. .align 4
  /* LL(55): remainder of the 2x2 tile: the low two bits of the step count (0..3) go into CTR. */
  /* When the remainder is zero the loop at LL(56) is skipped. */
  1328. LL(55):
  1329. #if defined(LT) || defined(RN)
  1330. andi. r0, KK, 3
  1331. #else
  /* TEMP still holds K - KK from the set-up at LL(50). */
  1332. andi. r0, TEMP, 3
  1333. #endif
  1334. mtspr CTR, r0
  1335. ble+ LL(58)
  1336. .align 4
  /* LL(56): one step per pass, accumulating into f0-f3 only; AO and BO each advance 2 * SIZE. */
  1337. LL(56):
  1338. FMADD f0, f16, f20, f0
  1339. FMADD f1, f17, f20, f1
  1340. FMADD f2, f16, f21, f2
  1341. FMADD f3, f17, f21, f3
  1342. LFD f16, 2 * SIZE(AO)
  1343. LFD f17, 3 * SIZE(AO)
  1344. LFD f20, 2 * SIZE(BO)
  1345. LFD f21, 3 * SIZE(BO)
  1346. addi BO, BO, 2 * SIZE
  1347. addi AO, AO, 2 * SIZE
  1348. bdnz LL(56)
  1349. .align 4
  /* LL(58): finish the 2x2 tile: merge the two accumulator sets, subtract the product from the */
  /* stored right-hand side, apply the 2x2 triangular solve for the active variant (LN/LT/RN/RT), */
  /* write the result back to the packed buffer and to C (CO1/CO2), then advance pointers and KK. */
  /* FADD/FSUB/FMUL/FNMSUB/LFD/STFD are macros defined outside this view; the notes below assume */
  /* FNMSUB d, a, c, b computes b - a * c as the PowerPC fnmsub instruction does. */
  1350. LL(58):
  1351. FADD f0, f4, f0
  1352. FADD f1, f5, f1
  1353. FADD f2, f6, f2
  1354. FADD f3, f7, f3
  1355. #if defined(LN) || defined(RT)
  /* Both arms subtract 2 here: the tile is 2 wide in either direction. */
  1356. #ifdef LN
  1357. subi r0, KK, 2
  1358. #else
  1359. subi r0, KK, 2
  1360. #endif
  /* Re-point AO/BO at offset (KK - 2) inside the packed operands. */
  1361. slwi TEMP, r0, 1 + BASE_SHIFT
  1362. slwi r0, r0, 1 + BASE_SHIFT
  1363. add AO, AORIG, TEMP
  1364. add BO, B, r0
  1365. #endif
  1366. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side is read from BO in the order f0, f2, f1, f3. */
  1367. LFD f16, 0 * SIZE(BO)
  1368. LFD f17, 1 * SIZE(BO)
  1369. LFD f20, 2 * SIZE(BO)
  1370. LFD f21, 3 * SIZE(BO)
  1371. FSUB f0, f16, f0
  1372. FSUB f2, f17, f2
  1373. FSUB f1, f20, f1
  1374. FSUB f3, f21, f3
  1375. #else
  /* RN/RT: right-hand side is read from AO in the order f0, f1, f2, f3. */
  1376. LFD f16, 0 * SIZE(AO)
  1377. LFD f17, 1 * SIZE(AO)
  1378. LFD f20, 2 * SIZE(AO)
  1379. LFD f21, 3 * SIZE(AO)
  1380. FSUB f0, f16, f0
  1381. FSUB f1, f17, f1
  1382. FSUB f2, f20, f2
  1383. FSUB f3, f21, f3
  1384. #endif
  1385. #ifdef LN
  /* LN: solve second row first using AO[3], eliminate with AO[2], then scale by AO[0]. */
  /* Diagonal entries are applied with FMUL, which suggests they are stored pre-inverted - TODO confirm. */
  1386. LFD f19, 3 * SIZE(AO)
  1387. LFD f20, 2 * SIZE(AO)
  1388. LFD f21, 0 * SIZE(AO)
  1389. FMUL f1, f19, f1
  1390. FMUL f3, f19, f3
  1391. FNMSUB f0, f20, f1, f0
  1392. FNMSUB f2, f20, f3, f2
  1393. FMUL f0, f21, f0
  1394. FMUL f2, f21, f2
  1395. #endif
  1396. #ifdef LT
  /* LT: forward order - scale by AO[0], eliminate with AO[1], scale by AO[3]. */
  1397. LFD f16, 0 * SIZE(AO)
  1398. LFD f17, 1 * SIZE(AO)
  1399. FMUL f0, f16, f0
  1400. FMUL f2, f16, f2
  1401. FNMSUB f1, f17, f0, f1
  1402. FNMSUB f3, f17, f2, f3
  1403. LFD f17, 3 * SIZE(AO)
  1404. FMUL f1, f17, f1
  1405. FMUL f3, f17, f3
  1406. #endif
  1407. #ifdef RN
  /* RN: forward order over columns using BO[0], BO[1], BO[3]. */
  1408. LFD f16, 0 * SIZE(BO)
  1409. LFD f17, 1 * SIZE(BO)
  1410. LFD f18, 3 * SIZE(BO)
  1411. FMUL f0, f16, f0
  1412. FMUL f1, f16, f1
  1413. FNMSUB f2, f17, f0, f2
  1414. FNMSUB f3, f17, f1, f3
  1415. FMUL f2, f18, f2
  1416. FMUL f3, f18, f3
  1417. #endif
  1418. #ifdef RT
  /* RT: backward order over columns using BO[3], BO[2], BO[0]. */
  1419. LFD f19, 3 * SIZE(BO)
  1420. LFD f20, 2 * SIZE(BO)
  1421. LFD f21, 0 * SIZE(BO)
  1422. FMUL f2, f19, f2
  1423. FMUL f3, f19, f3
  1424. FNMSUB f0, f20, f2, f0
  1425. FNMSUB f1, f20, f3, f1
  1426. FMUL f0, f21, f0
  1427. FMUL f1, f21, f1
  1428. #endif
  1429. #ifdef LN
  /* LN walks C backwards, so the output pointers are moved before the store. */
  1430. subi CO1, CO1, 2 * SIZE
  1431. subi CO2, CO2, 2 * SIZE
  1432. #endif
  /* Write the solved values back into the packed buffer using the same layout they were read with. */
  1433. #if defined(LN) || defined(LT)
  1434. STFD f0, 0 * SIZE(BO)
  1435. STFD f2, 1 * SIZE(BO)
  1436. STFD f1, 2 * SIZE(BO)
  1437. STFD f3, 3 * SIZE(BO)
  1438. #else
  1439. STFD f0, 0 * SIZE(AO)
  1440. STFD f1, 1 * SIZE(AO)
  1441. STFD f2, 2 * SIZE(AO)
  1442. STFD f3, 3 * SIZE(AO)
  1443. #endif
  /* Store the tile to C: f0,f1 to CO1 and f2,f3 to CO2. */
  1444. STFD f0, 0 * SIZE(CO1)
  1445. STFD f1, 1 * SIZE(CO1)
  1446. STFD f2, 0 * SIZE(CO2)
  1447. STFD f3, 1 * SIZE(CO2)
  /* Reset accumulators f0-f7 from FZERO for the next tile. */
  1448. lfs f0, FZERO
  1449. fmr f1, f0
  1450. fmr f2, f0
  1451. fmr f3, f0
  1452. fmr f4, f0
  1453. fmr f5, f0
  1454. fmr f6, f0
  1455. fmr f7, f0
  1456. #ifndef LN
  1457. addi CO1, CO1, 2 * SIZE
  1458. addi CO2, CO2, 2 * SIZE
  1459. #endif
  1460. #ifdef RT
  1461. slwi r0, K, 1 + BASE_SHIFT
  1462. add AORIG, AORIG, r0
  1463. #endif
  1464. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps in both packed operands. */
  1465. sub TEMP, K, KK
  1466. slwi r0, TEMP, 1 + BASE_SHIFT
  1467. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1468. add AO, AO, r0
  1469. add BO, BO, TEMP
  1470. #endif
  1471. #ifdef LN
  1472. subi KK, KK, 2
  1473. #endif
  1474. #ifdef LT
  1475. addi KK, KK, 2
  1476. #endif
  1477. .align 4
  /* LL(60): set-up for the 1-wide tile of the two-column panel (one value each to CO1 and CO2 at LL(68)). */
  /* Skipped when bit 0 of M is clear. Preloads f16-f19 from AO and f20-f27 from B/BO and puts the */
  /* number of 4-step passes for LL(62) into CTR. */
  1478. LL(60):
  1479. andi. I, M, 1
  1480. ble LL(69)
  1481. #if defined(LT) || defined(RN)
  1482. LFD f16, 0 * SIZE(AO)
  1483. LFD f17, 1 * SIZE(AO)
  1484. LFD f18, 2 * SIZE(AO)
  1485. LFD f19, 3 * SIZE(AO)
  1486. LFD f20, 0 * SIZE(B)
  1487. LFD f21, 1 * SIZE(B)
  1488. LFD f22, 2 * SIZE(B)
  1489. LFD f23, 3 * SIZE(B)
  1490. LFD f24, 4 * SIZE(B)
  1491. LFD f25, 5 * SIZE(B)
  1492. LFD f26, 6 * SIZE(B)
  1493. LFD f27, 7 * SIZE(B)
  /* CR0 from srawi. is consumed by "ble LL(65)" below. */
  1494. srawi. r0, KK, 2
  1495. mtspr CTR, r0
  1496. mr BO, B
  1497. #else
  1498. #ifdef LN
  1499. slwi r0, K, BASE_SHIFT
  1500. sub AORIG, AORIG, r0
  1501. #endif
  /* AO is scaled for one value per step, BO for two values per step. */
  1502. slwi r0, KK, 0 + BASE_SHIFT
  1503. slwi TEMP, KK, 1 + BASE_SHIFT
  1504. add AO, AORIG, r0
  1505. add BO, B, TEMP
  1506. sub TEMP, K, KK
  1507. LFD f16, 0 * SIZE(AO)
  1508. LFD f17, 1 * SIZE(AO)
  1509. LFD f18, 2 * SIZE(AO)
  1510. LFD f19, 3 * SIZE(AO)
  1511. LFD f20, 0 * SIZE(BO)
  1512. LFD f21, 1 * SIZE(BO)
  1513. LFD f22, 2 * SIZE(BO)
  1514. LFD f23, 3 * SIZE(BO)
  1515. LFD f24, 4 * SIZE(BO)
  1516. LFD f25, 5 * SIZE(BO)
  1517. LFD f26, 6 * SIZE(BO)
  1518. LFD f27, 7 * SIZE(BO)
  1519. srawi. r0, TEMP, 2
  1520. mtspr CTR, r0
  1521. #endif
  1522. ble LL(65)
  1523. .align 5
  /* LL(62): main accumulation loop of the 1x2 tile; one pass covers four steps */
  /* (AO advances 4 * SIZE, BO advances 8 * SIZE). Steps 0 and 2 accumulate into f0/f1, */
  /* steps 1 and 3 into f2/f3; the two pairs are summed at LL(68). */
  1524. LL(62):
  1525. FMADD f0, f16, f20, f0
  1526. FMADD f1, f16, f21, f1
  1527. FMADD f2, f17, f22, f2
  1528. FMADD f3, f17, f23, f3
  1529. LFD f20, 8 * SIZE(BO)
  1530. LFD f21, 9 * SIZE(BO)
  1531. LFD f22, 10 * SIZE(BO)
  1532. LFD f23, 11 * SIZE(BO)
  1533. FMADD f0, f18, f24, f0
  1534. FMADD f1, f18, f25, f1
  1535. FMADD f2, f19, f26, f2
  1536. FMADD f3, f19, f27, f3
  /* Reload operands for the next pass before the pointers are bumped. */
  1537. LFD f16, 4 * SIZE(AO)
  1538. LFD f17, 5 * SIZE(AO)
  1539. LFD f18, 6 * SIZE(AO)
  1540. LFD f19, 7 * SIZE(AO)
  1541. LFD f24, 12 * SIZE(BO)
  1542. LFD f25, 13 * SIZE(BO)
  1543. LFD f26, 14 * SIZE(BO)
  1544. LFD f27, 15 * SIZE(BO)
  1545. addi AO, AO, 4 * SIZE
  1546. addi BO, BO, 8 * SIZE
  1547. bdnz LL(62)
  1548. .align 4
  /* LL(65): remainder of the 1x2 tile: CTR = step count & 3; skip LL(66) when that is zero. */
  1549. LL(65):
  1550. #if defined(LT) || defined(RN)
  1551. andi. r0, KK, 3
  1552. #else
  /* TEMP still holds K - KK from the set-up at LL(60). */
  1553. andi. r0, TEMP, 3
  1554. #endif
  1555. mtspr CTR, r0
  1556. ble+ LL(68)
  1557. .align 4
  /* LL(66): one step per pass into f0/f1; AO advances 1 * SIZE, BO advances 2 * SIZE. */
  1558. LL(66):
  1559. FMADD f0, f16, f20, f0
  1560. FMADD f1, f16, f21, f1
  1561. LFD f16, 1 * SIZE(AO)
  1562. LFD f20, 2 * SIZE(BO)
  1563. LFD f21, 3 * SIZE(BO)
  1564. addi BO, BO, 2 * SIZE
  1565. addi AO, AO, 1 * SIZE
  1566. bdnz LL(66)
  1567. .align 4
  /* LL(68): finish the 1x2 tile: merge accumulators, subtract from the stored right-hand side, */
  /* apply the solve for the active variant, write back to the packed buffer and to C, then advance. */
  /* The notes assume FNMSUB d, a, c, b computes b - a * c (PowerPC fnmsub); the macro is defined outside this view. */
  1568. LL(68):
  1569. FADD f0, f2, f0
  1570. FADD f1, f3, f1
  1571. #if defined(LN) || defined(RT)
  /* LN steps back one (tile height), RT steps back two (panel width). */
  1572. #ifdef LN
  1573. subi r0, KK, 1
  1574. #else
  1575. subi r0, KK, 2
  1576. #endif
  1577. slwi TEMP, r0, 0 + BASE_SHIFT
  1578. slwi r0, r0, 1 + BASE_SHIFT
  1579. add AO, AORIG, TEMP
  1580. add BO, B, r0
  1581. #endif
  1582. #if defined(LN) || defined(LT)
  1583. LFD f16, 0 * SIZE(BO)
  1584. LFD f17, 1 * SIZE(BO)
  1585. FSUB f0, f16, f0
  1586. FSUB f1, f17, f1
  1587. #else
  1588. LFD f16, 0 * SIZE(AO)
  1589. LFD f20, 1 * SIZE(AO)
  1590. FSUB f0, f16, f0
  1591. FSUB f1, f20, f1
  1592. #endif
  1593. #ifdef LN
  /* LN/LT: a 1x1 triangular factor, so only a scale by AO[0] is needed. */
  1594. LFD f21, 0 * SIZE(AO)
  1595. FMUL f0, f21, f0
  1596. FMUL f1, f21, f1
  1597. #endif
  1598. #ifdef LT
  1599. LFD f16, 0 * SIZE(AO)
  1600. FMUL f0, f16, f0
  1601. FMUL f1, f16, f1
  1602. #endif
  1603. #ifdef RN
  /* RN: forward order over the two columns using BO[0], BO[1], BO[3]. */
  1604. LFD f16, 0 * SIZE(BO)
  1605. LFD f17, 1 * SIZE(BO)
  1606. LFD f18, 3 * SIZE(BO)
  1607. FMUL f0, f16, f0
  1608. FNMSUB f1, f17, f0, f1
  1609. FMUL f1, f18, f1
  1610. #endif
  1611. #ifdef RT
  /* RT: backward order over the two columns using BO[3], BO[2], BO[0]. */
  1612. LFD f19, 3 * SIZE(BO)
  1613. LFD f20, 2 * SIZE(BO)
  1614. LFD f21, 0 * SIZE(BO)
  1615. FMUL f1, f19, f1
  1616. FNMSUB f0, f20, f1, f0
  1617. FMUL f0, f21, f0
  1618. #endif
  1619. #ifdef LN
  1620. subi CO1, CO1, 1 * SIZE
  1621. subi CO2, CO2, 1 * SIZE
  1622. #endif
  1623. #if defined(LN) || defined(LT)
  1624. STFD f0, 0 * SIZE(BO)
  1625. STFD f1, 1 * SIZE(BO)
  1626. #else
  1627. STFD f0, 0 * SIZE(AO)
  1628. STFD f1, 1 * SIZE(AO)
  1629. #endif
  1630. STFD f0, 0 * SIZE(CO1)
  1631. STFD f1, 0 * SIZE(CO2)
  /* Only f0, f1, f4, f5 are reset here; f2/f3 (used as accumulators above) are not. */
  /* Within this view the next user, LL(10), re-initialises f1-f15 from f0 before accumulating. */
  1632. lfs f0, FZERO
  1633. fmr f1, f0
  1634. fmr f4, f0
  1635. fmr f5, f0
  1636. #ifndef LN
  1637. addi CO1, CO1, 1 * SIZE
  1638. addi CO2, CO2, 1 * SIZE
  1639. #endif
  1640. #ifdef RT
  1641. slwi r0, K, 0 + BASE_SHIFT
  1642. add AORIG, AORIG, r0
  1643. #endif
  1644. #if defined(LT) || defined(RN)
  1645. sub TEMP, K, KK
  1646. slwi r0, TEMP, 0 + BASE_SHIFT
  1647. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1648. add AO, AO, r0
  1649. add BO, BO, TEMP
  1650. #endif
  1651. #ifdef LN
  1652. subi KK, KK, 1
  1653. #endif
  1654. #ifdef LT
  1655. addi KK, KK, 1
  1656. #endif
  1657. .align 4
  /* LL(69): end of the two-column panel: advance B past the panel (LN: by K << (1 + BASE_SHIFT); */
  /* LT/RN: to wherever BO ended up), adjust KK by the panel width for RN/RT, and reload f0 from FZERO. */
  1658. LL(69):
  1659. #ifdef LN
  1660. slwi r0, K, 1 + BASE_SHIFT
  1661. add B, B, r0
  1662. #endif
  1663. #if defined(LT) || defined(RN)
  1664. mr B, BO
  1665. #endif
  1666. #ifdef RN
  1667. addi KK, KK, 2
  1668. #endif
  1669. #ifdef RT
  1670. subi KK, KK, 2
  1671. #endif
  1672. lfs f0, FZERO
  1673. .align 4
  /* LL(09): J = N >> 2 (number of four-column panels); leave via LL(999) when it is not positive. */
  1674. LL(09):
  1675. srawi. J, N, 2
  1676. ble LL(999)
  1677. .align 4
  /* LL(10): per-panel set-up for a four-column panel: position B and C (RT walks them backwards), */
  /* derive the four column pointers CO1..CO4, initialise KK, clear accumulators f1-f15 by copying f0, */
  /* and compute I = M >> 2, the number of 4-wide tiles. */
  1678. LL(10):
  1679. #ifdef RT
  /* RT: step B back by K << (2 + BASE_SHIFT) and C back by LDC << 2 before use. */
  1680. slwi r0, K, 2 + BASE_SHIFT
  1681. sub B, B, r0
  1682. slwi r0, LDC, 2
  1683. sub C, C, r0
  1684. #endif
  1685. mr CO1, C
  1686. add CO2, C, LDC
  1687. add CO3, CO2, LDC
  1688. add CO4, CO3, LDC
  1689. #ifdef LN
  1690. add KK, M, OFFSET
  1691. #endif
  1692. #ifdef LT
  1693. mr KK, OFFSET
  1694. #endif
  /* f0 is expected to hold the FZERO value here (it is loaded at LL(69) and at the end of each tile). */
  1695. fmr f1, f0
  1696. fmr f2, f0
  1697. fmr f3, f0
  1698. fmr f4, f0
  1699. fmr f5, f0
  1700. fmr f6, f0
  1701. fmr f7, f0
  1702. fmr f8, f0
  1703. fmr f9, f0
  1704. fmr f10, f0
  1705. fmr f11, f0
  1706. fmr f12, f0
  1707. fmr f13, f0
  1708. fmr f14, f0
  1709. fmr f15, f0
  /* CR0 set here is tested by "ble LL(20)" below; the mr/add in between do not modify CR0. */
  1710. srawi. I, M, 2
  1711. #if defined(LN) || defined(RT)
  1712. mr AORIG, A
  1713. #else
  1714. mr AO, A
  1715. #endif
  1716. #ifndef RT
  /* Non-RT variants walk C forwards: point C at the column after CO4 for the next panel. */
  1717. add C, CO4, LDC
  1718. #endif
  1719. ble LL(20)
  1720. .align 4
  /* LL(11): set-up for one 4x4 tile of the four-column panel. */
  /* Preloads the operands the first pass of LL(12) consumes: f16-f19 / f20-f23 for its first step and */
  /* f24-f26 / f28-f30 for its second step (f27 and f31 are loaded inside LL(12) itself). */
  /* Also touches the four C columns with dcbt and loads CTR with the number of 4-step passes. */
  /* LFD is a macro defined outside this view. */
  1721. LL(11):
  1722. #if defined(LT) || defined(RN)
  1723. LFD f16, 0 * SIZE(AO)
  1724. LFD f17, 1 * SIZE(AO)
  1725. LFD f18, 2 * SIZE(AO)
  1726. LFD f19, 3 * SIZE(AO)
  1727. LFD f20, 0 * SIZE(B)
  1728. LFD f21, 1 * SIZE(B)
  1729. LFD f22, 2 * SIZE(B)
  1730. LFD f23, 3 * SIZE(B)
  /* Fix: LL(12) reads f24-f26 and f28-f30 in its second step before it reloads them, so they */
  /* must be primed here exactly as the LN/RT branch below does; without these loads the first */
  /* pass would use stale register contents left over from the previous tile. */
  /* B is used as the base because BO is only set by the "mr BO, B" further down. */
  LFD f24, 4 * SIZE(AO)
  LFD f25, 5 * SIZE(AO)
  LFD f26, 6 * SIZE(AO)
  LFD f28, 4 * SIZE(B)
  LFD f29, 5 * SIZE(B)
  LFD f30, 6 * SIZE(B)
  1731. dcbt CO1, PREC
  1732. dcbt CO2, PREC
  1733. dcbt CO3, PREC
  1734. dcbt CO4, PREC
  /* r0 = KK >> 2; CR0 from srawi. is what "ble LL(15)" below tests. */
  1735. srawi. r0, KK, 2
  1736. mtspr CTR, r0
  1737. mr BO, B
  1738. #else
  1739. #ifdef LN
  /* LN: move AORIG back by K << (2 + BASE_SHIFT) before addressing this tile. */
  1740. slwi r0, K, 2 + BASE_SHIFT
  1741. sub AORIG, AORIG, r0
  1742. #endif
  /* LN/RT: start at offset KK in both operands (same scale for both, the tile is 4x4) */
  /* and run over the remaining K - KK steps. */
  1743. slwi TEMP, KK, 2 + BASE_SHIFT
  1744. add AO, AORIG, TEMP
  1745. add BO, B, TEMP
  1746. sub TEMP, K, KK
  1747. LFD f16, 0 * SIZE(AO)
  1748. LFD f17, 1 * SIZE(AO)
  1749. LFD f18, 2 * SIZE(AO)
  1750. LFD f19, 3 * SIZE(AO)
  1751. LFD f20, 0 * SIZE(BO)
  1752. LFD f21, 1 * SIZE(BO)
  1753. LFD f22, 2 * SIZE(BO)
  1754. LFD f23, 3 * SIZE(BO)
  1755. LFD f24, 4 * SIZE(AO)
  1756. LFD f25, 5 * SIZE(AO)
  1757. LFD f26, 6 * SIZE(AO)
  1758. LFD f28, 4 * SIZE(BO)
  1759. LFD f29, 5 * SIZE(BO)
  1760. LFD f30, 6 * SIZE(BO)
  1761. dcbt CO1, PREC
  1762. dcbt CO2, PREC
  1763. dcbt CO3, PREC
  1764. dcbt CO4, PREC
  /* r0 = (K - KK) >> 2; CR0 from srawi. feeds the branch below. */
  1765. srawi. r0, TEMP, 2
  1766. mtspr CTR, r0
  1767. #endif
  /* No full 4-step pass to run: go straight to the remainder handling. */
  1768. ble LL(15)
  1769. .align 4
  /* NOP1/NOP2 are register-to-self moves with no architectural effect; they are interleaved with */
  /* the FMADDs below, presumably as issue-slot padding for the target core - TODO confirm. */
  1770. #define NOP1 mr r18, r18
  1771. #define NOP2 mr r19, r19
  /* LL(12): main accumulation loop of the 4x4 tile. One pass covers four steps (AO and BO both */
  /* advance 16 * SIZE) and updates all sixteen accumulators f0-f15 per step. */
  /* Steps alternate between two operand sets: f16-f19 with f20-f23, and f24-f27 with f28-f31. */
  /* Each FMADD is paired with a load, prefetch or NOP; loads refill a register right after its last */
  /* use in the current step. On entry f16-f23, f24-f26 and f28-f30 must already be loaded; */
  /* f27 and f31 are loaded inside the loop. */
  1772. LL(12):
  /* step 0: f16-f19 x f20-f23 */
  1773. FMADD f0, f16, f20, f0
  1774. dcbt AO, PREA
  1775. FMADD f4, f16, f21, f4
  1776. dcbt BO, PREB
  1777. FMADD f8, f16, f22, f8
  1778. LFD f31, 7 * SIZE(BO)
  1779. FMADD f12, f16, f23, f12
  1780. LFD f27, 7 * SIZE(AO)
  1781. FMADD f1, f17, f20, f1
  1782. LFD f16, 8 * SIZE(AO)
  1783. FMADD f5, f17, f21, f5
  1784. NOP2
  1785. FMADD f9, f17, f22, f9
  1786. NOP1
  1787. FMADD f13, f17, f23, f13
  1788. LFD f17, 9 * SIZE(AO)
  1789. FMADD f2, f18, f20, f2
  1790. NOP1
  1791. FMADD f6, f18, f21, f6
  1792. NOP2
  1793. FMADD f10, f18, f22, f10
  1794. NOP1
  1795. FMADD f14, f18, f23, f14
  1796. LFD f18, 10 * SIZE(AO)
  1797. FMADD f3, f19, f20, f3
  1798. LFD f20, 8 * SIZE(BO)
  1799. FMADD f7, f19, f21, f7
  1800. LFD f21, 9 * SIZE(BO)
  1801. FMADD f11, f19, f22, f11
  1802. LFD f22, 10 * SIZE(BO)
  1803. FMADD f15, f19, f23, f15
  1804. LFD f19, 11 * SIZE(AO)
  /* step 1: f24-f27 x f28-f31 */
  1805. FMADD f0, f24, f28, f0
  1806. LFD f23, 11 * SIZE(BO)
  1807. FMADD f4, f24, f29, f4
  1808. NOP2
  1809. FMADD f8, f24, f30, f8
  1810. NOP1
  1811. FMADD f12, f24, f31, f12
  1812. LFD f24, 12 * SIZE(AO)
  1813. FMADD f1, f25, f28, f1
  1814. NOP1
  1815. FMADD f5, f25, f29, f5
  1816. NOP2
  1817. FMADD f9, f25, f30, f9
  1818. NOP1
  1819. FMADD f13, f25, f31, f13
  1820. LFD f25, 13 * SIZE(AO)
  1821. FMADD f2, f26, f28, f2
  1822. NOP1
  1823. FMADD f6, f26, f29, f6
  1824. NOP2
  1825. FMADD f10, f26, f30, f10
  1826. NOP1
  1827. FMADD f14, f26, f31, f14
  1828. LFD f26, 14 * SIZE(AO)
  1829. FMADD f3, f27, f28, f3
  1830. LFD f28, 12 * SIZE(BO)
  1831. FMADD f7, f27, f29, f7
  1832. LFD f29, 13 * SIZE(BO)
  1833. FMADD f11, f27, f30, f11
  1834. LFD f30, 14 * SIZE(BO)
  1835. FMADD f15, f27, f31, f15
  1836. LFD f27, 15 * SIZE(AO)
  /* step 2: f16-f19 x f20-f23 (values loaded from offsets 8..11) */
  1837. FMADD f0, f16, f20, f0
  1838. LFD f31, 15 * SIZE(BO)
  1839. FMADD f4, f16, f21, f4
  1840. NOP2
  1841. FMADD f8, f16, f22, f8
  1842. NOP1
  1843. FMADD f12, f16, f23, f12
  1844. LFD f16, 16 * SIZE(AO)
  1845. FMADD f1, f17, f20, f1
  1846. NOP1
  1847. FMADD f5, f17, f21, f5
  1848. NOP2
  1849. FMADD f9, f17, f22, f9
  1850. NOP1
  1851. FMADD f13, f17, f23, f13
  1852. LFD f17, 17 * SIZE(AO)
  1853. FMADD f2, f18, f20, f2
  1854. NOP1
  1855. FMADD f6, f18, f21, f6
  1856. NOP2
  1857. FMADD f10, f18, f22, f10
  1858. NOP1
  1859. FMADD f14, f18, f23, f14
  1860. LFD f18, 18 * SIZE(AO)
  1861. FMADD f3, f19, f20, f3
  1862. LFD f20, 16 * SIZE(BO)
  1863. FMADD f7, f19, f21, f7
  1864. LFD f21, 17 * SIZE(BO)
  1865. FMADD f11, f19, f22, f11
  1866. LFD f22, 18 * SIZE(BO)
  1867. FMADD f15, f19, f23, f15
  1868. LFD f19, 19 * SIZE(AO)
  /* step 3: f24-f27 x f28-f31 (values loaded from offsets 12..15) */
  1869. FMADD f0, f24, f28, f0
  1870. LFD f23, 19 * SIZE(BO)
  1871. FMADD f4, f24, f29, f4
  1872. NOP2
  1873. FMADD f8, f24, f30, f8
  1874. NOP1
  1875. FMADD f12, f24, f31, f12
  1876. LFD f24, 20 * SIZE(AO)
  1877. FMADD f1, f25, f28, f1
  1878. NOP1
  1879. FMADD f5, f25, f29, f5
  1880. NOP2
  1881. FMADD f9, f25, f30, f9
  1882. NOP1
  1883. FMADD f13, f25, f31, f13
  1884. LFD f25, 21 * SIZE(AO)
  1885. FMADD f2, f26, f28, f2
  1886. NOP1
  1887. FMADD f6, f26, f29, f6
  1888. NOP2
  1889. FMADD f10, f26, f30, f10
  1890. NOP1
  1891. FMADD f14, f26, f31, f14
  1892. LFD f26, 22 * SIZE(AO)
  1893. FMADD f3, f27, f28, f3
  1894. LFD f28, 20 * SIZE(BO)
  1895. FMADD f7, f27, f29, f7
  1896. LFD f29, 21 * SIZE(BO)
  1897. FMADD f11, f27, f30, f11
  1898. LFD f30, 22 * SIZE(BO)
  1899. FMADD f15, f27, f31, f15
  /* After the bump, the registers loaded from offsets 16..22 correspond to offsets 0..6 of the next pass. */
  1900. addi AO, AO, 16 * SIZE
  1901. addi BO, BO, 16 * SIZE
  1902. bdnz LL(12)
  1903. .align 4
  /* LL(15): remainder of the 4x4 tile: CTR = step count & 3; skip LL(16) when that is zero. */
  1904. LL(15):
  1905. #if defined(LT) || defined(RN)
  1906. andi. r0, KK, 3
  1907. #else
  /* TEMP still holds K - KK from the set-up at LL(11). */
  1908. andi. r0, TEMP, 3
  1909. #endif
  1910. mtspr CTR, r0
  1911. ble+ LL(18)
  1912. .align 4
  /* LL(16): one step per pass: all sixteen products of f16-f19 with f20-f23 are accumulated, */
  /* then the next four values of each operand are loaded and both pointers advance 4 * SIZE. */
  1913. LL(16):
  1914. FMADD f0, f16, f20, f0
  1915. FMADD f5, f17, f21, f5
  1916. FMADD f10, f18, f22, f10
  1917. FMADD f15, f19, f23, f15
  1918. FMADD f1, f17, f20, f1
  1919. FMADD f2, f18, f20, f2
  1920. FMADD f3, f19, f20, f3
  1921. FMADD f4, f16, f21, f4
  1922. FMADD f6, f18, f21, f6
  1923. FMADD f7, f19, f21, f7
  1924. FMADD f8, f16, f22, f8
  1925. FMADD f9, f17, f22, f9
  1926. FMADD f11, f19, f22, f11
  1927. FMADD f12, f16, f23, f12
  1928. FMADD f13, f17, f23, f13
  1929. FMADD f14, f18, f23, f14
  1930. LFD f16, 4 * SIZE(AO)
  1931. LFD f17, 5 * SIZE(AO)
  1932. LFD f18, 6 * SIZE(AO)
  1933. LFD f19, 7 * SIZE(AO)
  1934. LFD f20, 4 * SIZE(BO)
  1935. LFD f21, 5 * SIZE(BO)
  1936. LFD f22, 6 * SIZE(BO)
  1937. LFD f23, 7 * SIZE(BO)
  1938. addi BO, BO, 4 * SIZE
  1939. addi AO, AO, 4 * SIZE
  1940. bdnz LL(16)
  1941. .align 4
  /* LL(18): finish the 4x4 tile: subtract the accumulated product from the stored right-hand side, */
  /* run the 4x4 triangular solve for the active variant (LN/LT/RN/RT), write the result back into */
  /* the packed buffer and into C (CO1..CO4), clear the accumulators, advance pointers and KK, */
  /* and loop back to LL(11) while tiles remain. */
  /* FSUB/FMUL/FNMSUB/LFD/STFD are macros defined outside this view; the notes assume */
  /* FNMSUB d, a, c, b computes b - a * c as the PowerPC fnmsub instruction does. */
  /* Diagonal entries are applied with FMUL, which suggests they are stored pre-inverted - TODO confirm. */
  1942. LL(18):
  1943. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at offset (KK - 4) inside the packed operands. */
  1944. subi r0, KK, 4
  1945. slwi r0, r0, 2 + BASE_SHIFT
  1946. add AO, AORIG, r0
  1947. add BO, B, r0
  1948. #endif
  1949. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side read from BO; consecutive entries map to f0, f4, f8, f12, f1, f5, ... */
  1950. LFD f16, 0 * SIZE(BO)
  1951. LFD f17, 1 * SIZE(BO)
  1952. LFD f18, 2 * SIZE(BO)
  1953. LFD f19, 3 * SIZE(BO)
  1954. LFD f20, 4 * SIZE(BO)
  1955. LFD f21, 5 * SIZE(BO)
  1956. LFD f22, 6 * SIZE(BO)
  1957. LFD f23, 7 * SIZE(BO)
  1958. LFD f24, 8 * SIZE(BO)
  1959. LFD f25, 9 * SIZE(BO)
  1960. LFD f26, 10 * SIZE(BO)
  1961. LFD f27, 11 * SIZE(BO)
  1962. LFD f28, 12 * SIZE(BO)
  1963. LFD f29, 13 * SIZE(BO)
  1964. LFD f30, 14 * SIZE(BO)
  1965. LFD f31, 15 * SIZE(BO)
  1966. FSUB f0, f16, f0
  1967. FSUB f4, f17, f4
  1968. FSUB f8, f18, f8
  1969. FSUB f12, f19, f12
  1970. FSUB f1, f20, f1
  1971. FSUB f5, f21, f5
  1972. FSUB f9, f22, f9
  1973. FSUB f13, f23, f13
  1974. FSUB f2, f24, f2
  1975. FSUB f6, f25, f6
  1976. FSUB f10, f26, f10
  1977. FSUB f14, f27, f14
  1978. FSUB f3, f28, f3
  1979. FSUB f7, f29, f7
  1980. FSUB f11, f30, f11
  1981. FSUB f15, f31, f15
  1982. #else
  /* RN/RT: right-hand side read from AO; consecutive entries map to f0, f1, ..., f15. */
  1983. LFD f16, 0 * SIZE(AO)
  1984. LFD f17, 1 * SIZE(AO)
  1985. LFD f18, 2 * SIZE(AO)
  1986. LFD f19, 3 * SIZE(AO)
  1987. LFD f20, 4 * SIZE(AO)
  1988. LFD f21, 5 * SIZE(AO)
  1989. LFD f22, 6 * SIZE(AO)
  1990. LFD f23, 7 * SIZE(AO)
  1991. LFD f24, 8 * SIZE(AO)
  1992. LFD f25, 9 * SIZE(AO)
  1993. LFD f26, 10 * SIZE(AO)
  1994. LFD f27, 11 * SIZE(AO)
  1995. LFD f28, 12 * SIZE(AO)
  1996. LFD f29, 13 * SIZE(AO)
  1997. LFD f30, 14 * SIZE(AO)
  1998. LFD f31, 15 * SIZE(AO)
  1999. FSUB f0, f16, f0
  2000. FSUB f1, f17, f1
  2001. FSUB f2, f18, f2
  2002. FSUB f3, f19, f3
  2003. FSUB f4, f20, f4
  2004. FSUB f5, f21, f5
  2005. FSUB f6, f22, f6
  2006. FSUB f7, f23, f7
  2007. FSUB f8, f24, f8
  2008. FSUB f9, f25, f9
  2009. FSUB f10, f26, f10
  2010. FSUB f11, f27, f11
  2011. FSUB f12, f28, f12
  2012. FSUB f13, f29, f13
  2013. FSUB f14, f30, f14
  2014. FSUB f15, f31, f15
  2015. #endif
  2016. #ifdef LN
  /* LN: backward substitution over rows 3, 2, 1, 0 using AO[15..12], AO[10..8], AO[5..4], AO[0]. */
  2017. LFD f16, 15 * SIZE(AO)
  2018. LFD f17, 14 * SIZE(AO)
  2019. LFD f18, 13 * SIZE(AO)
  2020. LFD f19, 12 * SIZE(AO)
  2021. FMUL f3, f16, f3
  2022. FMUL f7, f16, f7
  2023. FMUL f11, f16, f11
  2024. FMUL f15, f16, f15
  2025. FNMSUB f2, f17, f3, f2
  2026. FNMSUB f6, f17, f7, f6
  2027. FNMSUB f10, f17, f11, f10
  2028. FNMSUB f14, f17, f15, f14
  2029. FNMSUB f1, f18, f3, f1
  2030. FNMSUB f5, f18, f7, f5
  2031. FNMSUB f9, f18, f11, f9
  2032. FNMSUB f13, f18, f15, f13
  2033. FNMSUB f0, f19, f3, f0
  2034. FNMSUB f4, f19, f7, f4
  2035. FNMSUB f8, f19, f11, f8
  2036. FNMSUB f12, f19, f15, f12
  2037. LFD f16, 10 * SIZE(AO)
  2038. LFD f17, 9 * SIZE(AO)
  2039. LFD f18, 8 * SIZE(AO)
  2040. LFD f19, 5 * SIZE(AO)
  2041. FMUL f2, f16, f2
  2042. FMUL f6, f16, f6
  2043. FMUL f10, f16, f10
  2044. FMUL f14, f16, f14
  2045. LFD f20, 4 * SIZE(AO)
  2046. LFD f21, 0 * SIZE(AO)
  2047. FNMSUB f1, f17, f2, f1
  2048. FNMSUB f5, f17, f6, f5
  2049. FNMSUB f9, f17, f10, f9
  2050. FNMSUB f13, f17, f14, f13
  2051. FNMSUB f0, f18, f2, f0
  2052. FNMSUB f4, f18, f6, f4
  2053. FNMSUB f8, f18, f10, f8
  2054. FNMSUB f12, f18, f14, f12
  2055. FMUL f1, f19, f1
  2056. FMUL f5, f19, f5
  2057. FMUL f9, f19, f9
  2058. FMUL f13, f19, f13
  2059. FNMSUB f0, f20, f1, f0
  2060. FNMSUB f4, f20, f5, f4
  2061. FNMSUB f8, f20, f9, f8
  2062. FNMSUB f12, f20, f13, f12
  2063. FMUL f0, f21, f0
  2064. FMUL f4, f21, f4
  2065. FMUL f8, f21, f8
  2066. FMUL f12, f21, f12
  2067. #endif
  2068. #ifdef LT
  /* LT: forward substitution over rows 0, 1, 2, 3 using AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  2069. LFD f16, 0 * SIZE(AO)
  2070. LFD f17, 1 * SIZE(AO)
  2071. LFD f18, 2 * SIZE(AO)
  2072. LFD f19, 3 * SIZE(AO)
  2073. FMUL f0, f16, f0
  2074. FMUL f4, f16, f4
  2075. FMUL f8, f16, f8
  2076. FMUL f12, f16, f12
  2077. FNMSUB f1, f17, f0, f1
  2078. FNMSUB f5, f17, f4, f5
  2079. FNMSUB f9, f17, f8, f9
  2080. FNMSUB f13, f17, f12, f13
  2081. FNMSUB f2, f18, f0, f2
  2082. FNMSUB f6, f18, f4, f6
  2083. FNMSUB f10, f18, f8, f10
  2084. FNMSUB f14, f18, f12, f14
  2085. FNMSUB f3, f19, f0, f3
  2086. FNMSUB f7, f19, f4, f7
  2087. FNMSUB f11, f19, f8, f11
  2088. FNMSUB f15, f19, f12, f15
  2089. LFD f16, 5 * SIZE(AO)
  2090. LFD f17, 6 * SIZE(AO)
  2091. LFD f18, 7 * SIZE(AO)
  2092. LFD f19, 10 * SIZE(AO)
  2093. FMUL f1, f16, f1
  2094. FMUL f5, f16, f5
  2095. FMUL f9, f16, f9
  2096. FMUL f13, f16, f13
  2097. LFD f20, 11 * SIZE(AO)
  2098. LFD f21, 15 * SIZE(AO)
  2099. FNMSUB f2, f17, f1, f2
  2100. FNMSUB f6, f17, f5, f6
  2101. FNMSUB f10, f17, f9, f10
  2102. FNMSUB f14, f17, f13, f14
  2103. FNMSUB f3, f18, f1, f3
  2104. FNMSUB f7, f18, f5, f7
  2105. FNMSUB f11, f18, f9, f11
  2106. FNMSUB f15, f18, f13, f15
  2107. FMUL f2, f19, f2
  2108. FMUL f6, f19, f6
  2109. FMUL f10, f19, f10
  2110. FMUL f14, f19, f14
  2111. FNMSUB f3, f20, f2, f3
  2112. FNMSUB f7, f20, f6, f7
  2113. FNMSUB f11, f20, f10, f11
  2114. FNMSUB f15, f20, f14, f15
  2115. FMUL f3, f21, f3
  2116. FMUL f7, f21, f7
  2117. FMUL f11, f21, f11
  2118. FMUL f15, f21, f15
  2119. #endif
  2120. #ifdef RN
  /* RN: forward substitution over columns 0..3 (register groups f0-f3, f4-f7, f8-f11, f12-f15) */
  /* using BO[0..3], BO[5..7], BO[10..11], BO[15]. */
  2121. LFD f16, 0 * SIZE(BO)
  2122. LFD f17, 1 * SIZE(BO)
  2123. LFD f18, 2 * SIZE(BO)
  2124. LFD f19, 3 * SIZE(BO)
  2125. FMUL f0, f16, f0
  2126. FMUL f1, f16, f1
  2127. FMUL f2, f16, f2
  2128. FMUL f3, f16, f3
  2129. FNMSUB f4, f17, f0, f4
  2130. FNMSUB f5, f17, f1, f5
  2131. FNMSUB f6, f17, f2, f6
  2132. FNMSUB f7, f17, f3, f7
  2133. FNMSUB f8, f18, f0, f8
  2134. FNMSUB f9, f18, f1, f9
  2135. FNMSUB f10, f18, f2, f10
  2136. FNMSUB f11, f18, f3, f11
  2137. FNMSUB f12, f19, f0, f12
  2138. FNMSUB f13, f19, f1, f13
  2139. FNMSUB f14, f19, f2, f14
  2140. FNMSUB f15, f19, f3, f15
  2141. LFD f16, 5 * SIZE(BO)
  2142. LFD f17, 6 * SIZE(BO)
  2143. LFD f18, 7 * SIZE(BO)
  2144. LFD f19, 10 * SIZE(BO)
  2145. FMUL f4, f16, f4
  2146. FMUL f5, f16, f5
  2147. FMUL f6, f16, f6
  2148. FMUL f7, f16, f7
  2149. LFD f20, 11 * SIZE(BO)
  2150. LFD f21, 15 * SIZE(BO)
  2151. FNMSUB f8, f17, f4, f8
  2152. FNMSUB f9, f17, f5, f9
  2153. FNMSUB f10, f17, f6, f10
  2154. FNMSUB f11, f17, f7, f11
  2155. FNMSUB f12, f18, f4, f12
  2156. FNMSUB f13, f18, f5, f13
  2157. FNMSUB f14, f18, f6, f14
  2158. FNMSUB f15, f18, f7, f15
  2159. FMUL f8, f19, f8
  2160. FMUL f9, f19, f9
  2161. FMUL f10, f19, f10
  2162. FMUL f11, f19, f11
  2163. FNMSUB f12, f20, f8, f12
  2164. FNMSUB f13, f20, f9, f13
  2165. FNMSUB f14, f20, f10, f14
  2166. FNMSUB f15, f20, f11, f15
  2167. FMUL f12, f21, f12
  2168. FMUL f13, f21, f13
  2169. FMUL f14, f21, f14
  2170. FMUL f15, f21, f15
  2171. #endif
  2172. #ifdef RT
  /* RT: backward substitution over columns 3..0 using BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  2173. LFD f16, 15 * SIZE(BO)
  2174. LFD f17, 14 * SIZE(BO)
  2175. LFD f18, 13 * SIZE(BO)
  2176. LFD f19, 12 * SIZE(BO)
  2177. FMUL f12, f16, f12
  2178. FMUL f13, f16, f13
  2179. FMUL f14, f16, f14
  2180. FMUL f15, f16, f15
  2181. FNMSUB f8, f17, f12, f8
  2182. FNMSUB f9, f17, f13, f9
  2183. FNMSUB f10, f17, f14, f10
  2184. FNMSUB f11, f17, f15, f11
  2185. FNMSUB f4, f18, f12, f4
  2186. FNMSUB f5, f18, f13, f5
  2187. FNMSUB f6, f18, f14, f6
  2188. FNMSUB f7, f18, f15, f7
  2189. FNMSUB f0, f19, f12, f0
  2190. FNMSUB f1, f19, f13, f1
  2191. FNMSUB f2, f19, f14, f2
  2192. FNMSUB f3, f19, f15, f3
  2193. LFD f16, 10 * SIZE(BO)
  2194. LFD f17, 9 * SIZE(BO)
  2195. LFD f18, 8 * SIZE(BO)
  2196. LFD f19, 5 * SIZE(BO)
  2197. FMUL f8, f16, f8
  2198. FMUL f9, f16, f9
  2199. FMUL f10, f16, f10
  2200. FMUL f11, f16, f11
  2201. LFD f20, 4 * SIZE(BO)
  2202. LFD f21, 0 * SIZE(BO)
  2203. FNMSUB f4, f17, f8, f4
  2204. FNMSUB f5, f17, f9, f5
  2205. FNMSUB f6, f17, f10, f6
  2206. FNMSUB f7, f17, f11, f7
  2207. FNMSUB f0, f18, f8, f0
  2208. FNMSUB f1, f18, f9, f1
  2209. FNMSUB f2, f18, f10, f2
  2210. FNMSUB f3, f18, f11, f3
  2211. FMUL f4, f19, f4
  2212. FMUL f5, f19, f5
  2213. FMUL f6, f19, f6
  2214. FMUL f7, f19, f7
  2215. FNMSUB f0, f20, f4, f0
  2216. FNMSUB f1, f20, f5, f1
  2217. FNMSUB f2, f20, f6, f2
  2218. FNMSUB f3, f20, f7, f3
  2219. FMUL f0, f21, f0
  2220. FMUL f1, f21, f1
  2221. FMUL f2, f21, f2
  2222. FMUL f3, f21, f3
  2223. #endif
  2224. #ifdef LN
  /* LN walks C backwards, so the output pointers are moved before the store. */
  2225. subi CO1, CO1, 4 * SIZE
  2226. subi CO2, CO2, 4 * SIZE
  2227. subi CO3, CO3, 4 * SIZE
  2228. subi CO4, CO4, 4 * SIZE
  2229. #endif
  /* Write the solved tile back into the packed buffer, using the same layout it was read with. */
  2230. #if defined(LN) || defined(LT)
  2231. STFD f0, 0 * SIZE(BO)
  2232. STFD f4, 1 * SIZE(BO)
  2233. STFD f8, 2 * SIZE(BO)
  2234. STFD f12, 3 * SIZE(BO)
  2235. STFD f1, 4 * SIZE(BO)
  2236. STFD f5, 5 * SIZE(BO)
  2237. STFD f9, 6 * SIZE(BO)
  2238. STFD f13, 7 * SIZE(BO)
  2239. STFD f2, 8 * SIZE(BO)
  2240. STFD f6, 9 * SIZE(BO)
  2241. STFD f10, 10 * SIZE(BO)
  2242. STFD f14, 11 * SIZE(BO)
  2243. STFD f3, 12 * SIZE(BO)
  2244. STFD f7, 13 * SIZE(BO)
  2245. STFD f11, 14 * SIZE(BO)
  2246. STFD f15, 15 * SIZE(BO)
  2247. #else
  2248. STFD f0, 0 * SIZE(AO)
  2249. STFD f1, 1 * SIZE(AO)
  2250. STFD f2, 2 * SIZE(AO)
  2251. STFD f3, 3 * SIZE(AO)
  2252. STFD f4, 4 * SIZE(AO)
  2253. STFD f5, 5 * SIZE(AO)
  2254. STFD f6, 6 * SIZE(AO)
  2255. STFD f7, 7 * SIZE(AO)
  2256. STFD f8, 8 * SIZE(AO)
  2257. STFD f9, 9 * SIZE(AO)
  2258. STFD f10, 10 * SIZE(AO)
  2259. STFD f11, 11 * SIZE(AO)
  2260. STFD f12, 12 * SIZE(AO)
  2261. STFD f13, 13 * SIZE(AO)
  2262. STFD f14, 14 * SIZE(AO)
  2263. STFD f15, 15 * SIZE(AO)
  2264. #endif
  /* Store the tile to C: f0-f3 to CO1, f4-f7 to CO2, f8-f11 to CO3, f12-f15 to CO4. */
  2265. STFD f0, 0 * SIZE(CO1)
  2266. STFD f1, 1 * SIZE(CO1)
  2267. STFD f2, 2 * SIZE(CO1)
  2268. STFD f3, 3 * SIZE(CO1)
  2269. STFD f4, 0 * SIZE(CO2)
  2270. STFD f5, 1 * SIZE(CO2)
  2271. STFD f6, 2 * SIZE(CO2)
  2272. STFD f7, 3 * SIZE(CO2)
  2273. STFD f8, 0 * SIZE(CO3)
  2274. STFD f9, 1 * SIZE(CO3)
  2275. STFD f10, 2 * SIZE(CO3)
  2276. STFD f11, 3 * SIZE(CO3)
  2277. STFD f12, 0 * SIZE(CO4)
  2278. STFD f13, 1 * SIZE(CO4)
  2279. STFD f14, 2 * SIZE(CO4)
  2280. STFD f15, 3 * SIZE(CO4)
  /* Reset all sixteen accumulators from FZERO for the next tile. */
  2281. lfs f0, FZERO
  2282. fmr f1, f0
  2283. fmr f2, f0
  2284. fmr f3, f0
  2285. fmr f4, f0
  2286. fmr f5, f0
  2287. fmr f6, f0
  2288. fmr f7, f0
  2289. fmr f8, f0
  2290. fmr f9, f0
  2291. fmr f10, f0
  2292. fmr f11, f0
  2293. fmr f12, f0
  2294. fmr f13, f0
  2295. fmr f14, f0
  2296. fmr f15, f0
  2297. #ifndef LN
  2298. addi CO1, CO1, 4 * SIZE
  2299. addi CO2, CO2, 4 * SIZE
  2300. addi CO3, CO3, 4 * SIZE
  2301. addi CO4, CO4, 4 * SIZE
  2302. #endif
  2303. #ifdef RT
  2304. slwi r0, K, 2 + BASE_SHIFT
  2305. add AORIG, AORIG, r0
  2306. #endif
  2307. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps in both packed operands. */
  2308. sub TEMP, K, KK
  2309. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2310. add AO, AO, TEMP
  2311. add BO, BO, TEMP
  2312. #endif
  2313. #ifdef LT
  2314. addi KK, KK, 4
  2315. #endif
  2316. #ifdef LN
  2317. subi KK, KK, 4
  2318. #endif
  /* One 4-wide tile done: decrement the tile counter and repeat while it is positive. */
  2319. addic. I, I, -1
  2320. bgt+ LL(11)
  2321. .align 4
  /* LL(20): set-up for the 2-wide tile of the four-column panel; skipped when bit 1 of M is clear. */
  /* Preloads f16-f19 from AO and f20-f27 from B/BO and puts the number of 4-step passes for LL(22) into CTR. */
  2322. LL(20):
  2323. andi. I, M, 2
  2324. ble LL(30)
  2325. #if defined(LT) || defined(RN)
  2326. LFD f16, 0 * SIZE(AO)
  2327. LFD f17, 1 * SIZE(AO)
  2328. LFD f18, 2 * SIZE(AO)
  2329. LFD f19, 3 * SIZE(AO)
  2330. LFD f20, 0 * SIZE(B)
  2331. LFD f21, 1 * SIZE(B)
  2332. LFD f22, 2 * SIZE(B)
  2333. LFD f23, 3 * SIZE(B)
  2334. LFD f24, 4 * SIZE(B)
  2335. LFD f25, 5 * SIZE(B)
  2336. LFD f26, 6 * SIZE(B)
  2337. LFD f27, 7 * SIZE(B)
  /* CR0 from srawi. is consumed by "ble LL(25)" below. */
  2338. srawi. r0, KK, 2
  2339. mtspr CTR, r0
  2340. mr BO, B
  2341. #else
  2342. #ifdef LN
  2343. slwi r0, K, 1 + BASE_SHIFT
  2344. sub AORIG, AORIG, r0
  2345. #endif
  /* AO is scaled for two values per step, BO for four values per step. */
  2346. slwi r0, KK, 1 + BASE_SHIFT
  2347. slwi TEMP, KK, 2 + BASE_SHIFT
  2348. add AO, AORIG, r0
  2349. add BO, B, TEMP
  2350. sub TEMP, K, KK
  2351. LFD f16, 0 * SIZE(AO)
  2352. LFD f17, 1 * SIZE(AO)
  2353. LFD f18, 2 * SIZE(AO)
  2354. LFD f19, 3 * SIZE(AO)
  2355. LFD f20, 0 * SIZE(BO)
  2356. LFD f21, 1 * SIZE(BO)
  2357. LFD f22, 2 * SIZE(BO)
  2358. LFD f23, 3 * SIZE(BO)
  2359. LFD f24, 4 * SIZE(BO)
  2360. LFD f25, 5 * SIZE(BO)
  2361. LFD f26, 6 * SIZE(BO)
  2362. LFD f27, 7 * SIZE(BO)
  2363. srawi. r0, TEMP, 2
  2364. mtspr CTR, r0
  2365. #endif
  2366. ble LL(25)
  2367. .align 5
  /* LL(22): main accumulation loop of the 2x4 tile; one pass covers four steps */
  /* (AO advances 8 * SIZE, BO advances 16 * SIZE). Even steps accumulate into f0,f1,f4,f5,f8,f9,f12,f13; */
  /* odd steps into f2,f3,f6,f7,f10,f11,f14,f15. The fadd sequence after the loop folds the odd-step */
  /* set into the even-step set; it is bypassed when the loop is skipped, in which case the odd-step */
  /* registers have not been touched by this tile. */
  2368. LL(22):
  /* step 0 */
  2369. FMADD f0, f16, f20, f0
  2370. FMADD f1, f17, f20, f1
  2371. FMADD f4, f16, f21, f4
  2372. FMADD f5, f17, f21, f5
  2373. FMADD f8, f16, f22, f8
  2374. FMADD f9, f17, f22, f9
  2375. FMADD f12, f16, f23, f12
  2376. FMADD f13, f17, f23, f13
  2377. LFD f20, 8 * SIZE(BO)
  2378. LFD f21, 9 * SIZE(BO)
  2379. LFD f22, 10 * SIZE(BO)
  2380. LFD f23, 11 * SIZE(BO)
  /* step 1 */
  2381. FMADD f2, f18, f24, f2
  2382. FMADD f3, f19, f24, f3
  2383. FMADD f6, f18, f25, f6
  2384. FMADD f7, f19, f25, f7
  2385. FMADD f10, f18, f26, f10
  2386. FMADD f11, f19, f26, f11
  2387. FMADD f14, f18, f27, f14
  2388. FMADD f15, f19, f27, f15
  2389. LFD f16, 4 * SIZE(AO)
  2390. LFD f17, 5 * SIZE(AO)
  2391. LFD f18, 6 * SIZE(AO)
  2392. LFD f19, 7 * SIZE(AO)
  /* step 2 (interleaved with the loads for step 3 and for the next pass) */
  2393. FMADD f0, f16, f20, f0
  2394. FMADD f1, f17, f20, f1
  2395. FMADD f4, f16, f21, f4
  2396. FMADD f5, f17, f21, f5
  2397. LFD f24, 12 * SIZE(BO)
  2398. LFD f25, 13 * SIZE(BO)
  2399. LFD f26, 14 * SIZE(BO)
  2400. LFD f27, 15 * SIZE(BO)
  2401. FMADD f8, f16, f22, f8
  2402. FMADD f9, f17, f22, f9
  2403. FMADD f12, f16, f23, f12
  2404. FMADD f13, f17, f23, f13
  2405. LFD f20, 16 * SIZE(BO)
  2406. LFD f21, 17 * SIZE(BO)
  2407. LFD f22, 18 * SIZE(BO)
  2408. LFD f23, 19 * SIZE(BO)
  /* step 3 */
  2409. FMADD f2, f18, f24, f2
  2410. FMADD f3, f19, f24, f3
  2411. FMADD f6, f18, f25, f6
  2412. FMADD f7, f19, f25, f7
  2413. FMADD f10, f18, f26, f10
  2414. FMADD f11, f19, f26, f11
  2415. FMADD f14, f18, f27, f14
  2416. FMADD f15, f19, f27, f15
  2417. LFD f16, 8 * SIZE(AO)
  2418. LFD f17, 9 * SIZE(AO)
  2419. LFD f18, 10 * SIZE(AO)
  2420. LFD f19, 11 * SIZE(AO)
  2421. LFD f24, 20 * SIZE(BO)
  2422. LFD f25, 21 * SIZE(BO)
  2423. LFD f26, 22 * SIZE(BO)
  2424. LFD f27, 23 * SIZE(BO)
  2425. addi AO, AO, 8 * SIZE
  2426. addi BO, BO, 16 * SIZE
  /* DCBT is a macro defined outside this view; presumably a data-cache prefetch hint. */
  2427. DCBT(BO, PREB)
  2428. bdnz LL(22)
  /* Fold the odd-step accumulators into the even-step ones. */
  2429. fadd f0, f2, f0
  2430. fadd f1, f3, f1
  2431. fadd f4, f6, f4
  2432. fadd f5, f7, f5
  2433. fadd f8, f10, f8
  2434. fadd f9, f11, f9
  2435. fadd f12, f14, f12
  2436. fadd f13, f15, f13
  2437. .align 4
  /* LL(25): leftover k steps for the 2-wide tile. The count is the low two
   * bits of KK (LT/RN) or of TEMP (other variants), i.e. the steps that a
   * loop unrolled by four cannot cover. */
  2438. LL(25):
  2439. #if defined(LT) || defined(RN)
  2440. andi. r0, KK, 3
  2441. #else
  2442. andi. r0, TEMP, 3
  2443. #endif
  2444. mtspr CTR, r0
  /* andi. set CR0 and mtspr does not change it: skip the loop when the
   * remainder is zero. The '+' suffix hints the branch as likely taken. */
  2445. ble+ LL(28)
  2446. .align 4
  /* LL(26): one k step per pass -- f16,f17 (AO) times f20-f23 (BO) into the
   * primary accumulators, then load the operands of the next step and
   * advance AO by 2*SIZE and BO by 4*SIZE. */
  2447. LL(26):
  2448. FMADD f0, f16, f20, f0
  2449. FMADD f1, f17, f20, f1
  2450. FMADD f4, f16, f21, f4
  2451. FMADD f5, f17, f21, f5
  2452. FMADD f8, f16, f22, f8
  2453. FMADD f9, f17, f22, f9
  2454. FMADD f12, f16, f23, f12
  2455. FMADD f13, f17, f23, f13
  2456. LFD f16, 2 * SIZE(AO)
  2457. LFD f17, 3 * SIZE(AO)
  2458. LFD f20, 4 * SIZE(BO)
  2459. LFD f21, 5 * SIZE(BO)
  2460. LFD f22, 6 * SIZE(BO)
  2461. LFD f23, 7 * SIZE(BO)
  2462. addi BO, BO, 4 * SIZE
  2463. addi AO, AO, 2 * SIZE
  2464. bdnz LL(26)
  2465. .align 4
  /* LL(28): finish the 2-wide tile. The accumulated products in
   * f0,f1,f4,f5,f8,f9,f12,f13 are combined with the values already stored
   * in the packed buffer, the variant-specific (LN/LT/RN/RT) substitution
   * is applied, and the results are written back to the packed buffer and
   * to CO1-CO4.
   * NOTE(review): FSUB/FMUL/FNMSUB are macros defined outside this view;
   * the comments below assume they map to the PowerPC fsub (d = a - b),
   * fmul and fnmsub (d = b - a*c for "d, a, c, b") -- TODO confirm. */
  2466. LL(28):
  /* LN/RT: point AO and BO at index KK-2 (LN) or KK-4 (RT) relative to
   * AORIG and B. AO is scaled by 2 elements per index, BO by 4. */
  2467. #if defined(LN) || defined(RT)
  2468. #ifdef LN
  2469. subi r0, KK, 2
  2470. #else
  2471. subi r0, KK, 4
  2472. #endif
  2473. slwi TEMP, r0, 1 + BASE_SHIFT
  2474. slwi r0, r0, 2 + BASE_SHIFT
  2475. add AO, AORIG, TEMP
  2476. add BO, B, r0
  2477. #endif
  /* LN/LT: the eight stored values come from BO, laid out as
   * f0,f4,f8,f12 followed by f1,f5,f9,f13. Each accumulator becomes
   * (stored value) - (accumulated product). */
  2478. #if defined(LN) || defined(LT)
  2479. LFD f16, 0 * SIZE(BO)
  2480. LFD f17, 1 * SIZE(BO)
  2481. LFD f18, 2 * SIZE(BO)
  2482. LFD f19, 3 * SIZE(BO)
  2483. LFD f20, 4 * SIZE(BO)
  2484. LFD f21, 5 * SIZE(BO)
  2485. LFD f22, 6 * SIZE(BO)
  2486. LFD f23, 7 * SIZE(BO)
  2487. FSUB f0, f16, f0
  2488. FSUB f4, f17, f4
  2489. FSUB f8, f18, f8
  2490. FSUB f12, f19, f12
  2491. FSUB f1, f20, f1
  2492. FSUB f5, f21, f5
  2493. FSUB f9, f22, f9
  2494. FSUB f13, f23, f13
  2495. #else
  /* RN/RT: the eight stored values come from AO, laid out as
   * f0,f1, f4,f5, f8,f9, f12,f13. */
  2496. LFD f16, 0 * SIZE(AO)
  2497. LFD f17, 1 * SIZE(AO)
  2498. LFD f20, 2 * SIZE(AO)
  2499. LFD f21, 3 * SIZE(AO)
  2500. LFD f24, 4 * SIZE(AO)
  2501. LFD f25, 5 * SIZE(AO)
  2502. LFD f28, 6 * SIZE(AO)
  2503. LFD f29, 7 * SIZE(AO)
  2504. FSUB f0, f16, f0
  2505. FSUB f1, f17, f1
  2506. FSUB f4, f20, f4
  2507. FSUB f5, f21, f5
  2508. FSUB f8, f24, f8
  2509. FSUB f9, f25, f9
  2510. FSUB f12, f28, f12
  2511. FSUB f13, f29, f13
  2512. #endif
  /* LN: 2x2 substitution using AO offsets 3, 2 and 0, processed from the
   * second unknown (f1,f5,f9,f13) back to the first (f0,f4,f8,f12).
   * The entries at offsets 3 and 0 are multiplied in rather than divided
   * by, so they are presumably stored pre-inverted by the packing code
   * -- TODO confirm. The same applies to the FMUL factors in the LT, RN
   * and RT sections below. */
  2513. #ifdef LN
  2514. LFD f19, 3 * SIZE(AO)
  2515. LFD f20, 2 * SIZE(AO)
  2516. LFD f21, 0 * SIZE(AO)
  2517. FMUL f1, f19, f1
  2518. FMUL f5, f19, f5
  2519. FMUL f9, f19, f9
  2520. FMUL f13, f19, f13
  2521. FNMSUB f0, f20, f1, f0
  2522. FNMSUB f4, f20, f5, f4
  2523. FNMSUB f8, f20, f9, f8
  2524. FNMSUB f12, f20, f13, f12
  2525. FMUL f0, f21, f0
  2526. FMUL f4, f21, f4
  2527. FMUL f8, f21, f8
  2528. FMUL f12, f21, f12
  2529. #endif
  /* LT: 2x2 substitution using AO offsets 0, 1 and 3, processed from the
   * first unknown (f0,f4,f8,f12) forward to the second (f1,f5,f9,f13). */
  2530. #ifdef LT
  2531. LFD f16, 0 * SIZE(AO)
  2532. LFD f17, 1 * SIZE(AO)
  2533. FMUL f0, f16, f0
  2534. FMUL f4, f16, f4
  2535. FMUL f8, f16, f8
  2536. FMUL f12, f16, f12
  2537. FNMSUB f1, f17, f0, f1
  2538. FNMSUB f5, f17, f4, f5
  2539. FNMSUB f9, f17, f8, f9
  2540. FNMSUB f13, f17, f12, f13
  2541. LFD f17, 3 * SIZE(AO)
  2542. FMUL f1, f17, f1
  2543. FMUL f5, f17, f5
  2544. FMUL f9, f17, f9
  2545. FMUL f13, f17, f13
  2546. #endif
  /* RN: 4x4 substitution using BO offsets 0-3, 5-7, 10-11 and 15 (offsets
   * 4*i+j with j >= i). The pairs are resolved in the order
   * (f0,f1) -> (f4,f5) -> (f8,f9) -> (f12,f13); each resolved pair is
   * eliminated from the pairs that follow it. */
  2547. #ifdef RN
  2548. LFD f16, 0 * SIZE(BO)
  2549. LFD f17, 1 * SIZE(BO)
  2550. LFD f18, 2 * SIZE(BO)
  2551. LFD f19, 3 * SIZE(BO)
  2552. FMUL f0, f16, f0
  2553. FMUL f1, f16, f1
  2554. FNMSUB f4, f17, f0, f4
  2555. FNMSUB f5, f17, f1, f5
  2556. FNMSUB f8, f18, f0, f8
  2557. FNMSUB f9, f18, f1, f9
  2558. FNMSUB f12, f19, f0, f12
  2559. FNMSUB f13, f19, f1, f13
  2560. LFD f16, 5 * SIZE(BO)
  2561. LFD f17, 6 * SIZE(BO)
  2562. LFD f18, 7 * SIZE(BO)
  2563. LFD f19, 10 * SIZE(BO)
  2564. LFD f20, 11 * SIZE(BO)
  2565. LFD f21, 15 * SIZE(BO)
  2566. FMUL f4, f16, f4
  2567. FMUL f5, f16, f5
  2568. FNMSUB f8, f17, f4, f8
  2569. FNMSUB f9, f17, f5, f9
  2570. FNMSUB f12, f18, f4, f12
  2571. FNMSUB f13, f18, f5, f13
  2572. FMUL f8, f19, f8
  2573. FMUL f9, f19, f9
  2574. FNMSUB f12, f20, f8, f12
  2575. FNMSUB f13, f20, f9, f13
  2576. FMUL f12, f21, f12
  2577. FMUL f13, f21, f13
  2578. #endif
  /* RT: 4x4 substitution using BO offsets 15-12, 10-8, 5-4 and 0 (offsets
   * 4*i+j with j <= i). The pairs are resolved in the reverse order
   * (f12,f13) -> (f8,f9) -> (f4,f5) -> (f0,f1). */
  2579. #ifdef RT
  2580. LFD f16, 15 * SIZE(BO)
  2581. LFD f17, 14 * SIZE(BO)
  2582. LFD f18, 13 * SIZE(BO)
  2583. LFD f19, 12 * SIZE(BO)
  2584. FMUL f12, f16, f12
  2585. FMUL f13, f16, f13
  2586. FNMSUB f8, f17, f12, f8
  2587. FNMSUB f9, f17, f13, f9
  2588. FNMSUB f4, f18, f12, f4
  2589. FNMSUB f5, f18, f13, f5
  2590. FNMSUB f0, f19, f12, f0
  2591. FNMSUB f1, f19, f13, f1
  2592. LFD f16, 10 * SIZE(BO)
  2593. LFD f17, 9 * SIZE(BO)
  2594. LFD f18, 8 * SIZE(BO)
  2595. LFD f19, 5 * SIZE(BO)
  2596. LFD f20, 4 * SIZE(BO)
  2597. LFD f21, 0 * SIZE(BO)
  2598. FMUL f8, f16, f8
  2599. FMUL f9, f16, f9
  2600. FNMSUB f4, f17, f8, f4
  2601. FNMSUB f5, f17, f9, f5
  2602. FNMSUB f0, f18, f8, f0
  2603. FNMSUB f1, f18, f9, f1
  2604. FMUL f4, f19, f4
  2605. FMUL f5, f19, f5
  2606. FNMSUB f0, f20, f4, f0
  2607. FNMSUB f1, f20, f5, f1
  2608. FMUL f0, f21, f0
  2609. FMUL f1, f21, f1
  2610. #endif
  /* LN: step the four output pointers back by two elements before the
   * stores below (LN never advances them afterwards, see the #ifndef LN
   * section further down). */
  2611. #ifdef LN
  2612. subi CO1, CO1, 2 * SIZE
  2613. subi CO2, CO2, 2 * SIZE
  2614. subi CO3, CO3, 2 * SIZE
  2615. subi CO4, CO4, 2 * SIZE
  2616. #endif
  /* Write the results back into the packed buffer, using the same layout
   * they were loaded with above (BO for LN/LT, AO for RN/RT). */
  2617. #if defined(LN) || defined(LT)
  2618. STFD f0, 0 * SIZE(BO)
  2619. STFD f4, 1 * SIZE(BO)
  2620. STFD f8, 2 * SIZE(BO)
  2621. STFD f12, 3 * SIZE(BO)
  2622. STFD f1, 4 * SIZE(BO)
  2623. STFD f5, 5 * SIZE(BO)
  2624. STFD f9, 6 * SIZE(BO)
  2625. STFD f13, 7 * SIZE(BO)
  2626. #else
  2627. STFD f0, 0 * SIZE(AO)
  2628. STFD f1, 1 * SIZE(AO)
  2629. STFD f4, 2 * SIZE(AO)
  2630. STFD f5, 3 * SIZE(AO)
  2631. STFD f8, 4 * SIZE(AO)
  2632. STFD f9, 5 * SIZE(AO)
  2633. STFD f12, 6 * SIZE(AO)
  2634. STFD f13, 7 * SIZE(AO)
  2635. #endif
  /* Store the same results to the output: two consecutive elements through
   * each of CO1, CO2, CO3 and CO4. */
  2636. STFD f0, 0 * SIZE(CO1)
  2637. STFD f1, 1 * SIZE(CO1)
  2638. STFD f4, 0 * SIZE(CO2)
  2639. STFD f5, 1 * SIZE(CO2)
  2640. STFD f8, 0 * SIZE(CO3)
  2641. STFD f9, 1 * SIZE(CO3)
  2642. STFD f12, 0 * SIZE(CO4)
  2643. STFD f13, 1 * SIZE(CO4)
  /* Reset all sixteen accumulators f0-f15 from the FZERO location (defined
   * outside this view; by its name a stored 0.0) for the next tile. */
  2644. lfs f0, FZERO
  2645. fmr f1, f0
  2646. fmr f2, f0
  2647. fmr f3, f0
  2648. fmr f4, f0
  2649. fmr f5, f0
  2650. fmr f6, f0
  2651. fmr f7, f0
  2652. fmr f8, f0
  2653. fmr f9, f0
  2654. fmr f10, f0
  2655. fmr f11, f0
  2656. fmr f12, f0
  2657. fmr f13, f0
  2658. fmr f14, f0
  2659. fmr f15, f0
  /* All variants except LN move the output pointers forward past the two
   * elements just written. */
  2660. #ifndef LN
  2661. addi CO1, CO1, 2 * SIZE
  2662. addi CO2, CO2, 2 * SIZE
  2663. addi CO3, CO3, 2 * SIZE
  2664. addi CO4, CO4, 2 * SIZE
  2665. #endif
  /* RT: advance AORIG by K steps of 2 elements each. */
  2666. #ifdef RT
  2667. slwi r0, K, 1 + BASE_SHIFT
  2668. add AORIG, AORIG, r0
  2669. #endif
  /* LT/RN: skip the remaining K-KK steps in both streams
   * (2 elements per step in AO, 4 per step in BO). */
  2670. #if defined(LT) || defined(RN)
  2671. sub TEMP, K, KK
  2672. slwi r0, TEMP, 1 + BASE_SHIFT
  2673. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2674. add AO, AO, r0
  2675. add BO, BO, TEMP
  2676. #endif
  /* The left-side variants move KK by the tile width: -2 for LN, +2 for LT. */
  2677. #ifdef LN
  2678. subi KK, KK, 2
  2679. #endif
  2680. #ifdef LT
  2681. addi KK, KK, 2
  2682. #endif
  2683. .align 4
  /* LL(30): 1-wide tile, executed only when bit 0 of M is set. andi.
   * leaves 0 or 1 in I and sets CR0, so the ble skips to LL(39) when the
   * bit is clear. This section sets up AO/BO, preloads the first operands
   * and loads CTR with the k count divided by four. */
  2684. LL(30):
  2685. andi. I, M, 1
  2686. ble LL(39)
  /* LT/RN: AO is used as it stands and BO restarts at B; the k count is
   * KK. Preload four AO values (f16-f19) and eight BO values (f20-f27). */
  2687. #if defined(LT) || defined(RN)
  2688. LFD f16, 0 * SIZE(AO)
  2689. LFD f17, 1 * SIZE(AO)
  2690. LFD f18, 2 * SIZE(AO)
  2691. LFD f19, 3 * SIZE(AO)
  2692. LFD f20, 0 * SIZE(B)
  2693. LFD f21, 1 * SIZE(B)
  2694. LFD f22, 2 * SIZE(B)
  2695. LFD f23, 3 * SIZE(B)
  2696. LFD f24, 4 * SIZE(B)
  2697. LFD f25, 5 * SIZE(B)
  2698. LFD f26, 6 * SIZE(B)
  2699. LFD f27, 7 * SIZE(B)
  2700. srawi. r0, KK, 2
  2701. mtspr CTR, r0
  2702. mr BO, B
  2703. #else
  /* LN only: move AORIG back by K steps of one element each. */
  2704. #ifdef LN
  2705. slwi r0, K, BASE_SHIFT
  2706. sub AORIG, AORIG, r0
  2707. #endif
  /* LN/RT: start both streams at index KK (1 element per step in AO, 4 in
   * BO); the k count is TEMP = K - KK. */
  2708. slwi r0, KK, 0 + BASE_SHIFT
  2709. slwi TEMP, KK, 2 + BASE_SHIFT
  2710. add AO, AORIG, r0
  2711. add BO, B, TEMP
  2712. sub TEMP, K, KK
  2713. LFD f16, 0 * SIZE(AO)
  2714. LFD f17, 1 * SIZE(AO)
  2715. LFD f18, 2 * SIZE(AO)
  2716. LFD f19, 3 * SIZE(AO)
  2717. LFD f20, 0 * SIZE(BO)
  2718. LFD f21, 1 * SIZE(BO)
  2719. LFD f22, 2 * SIZE(BO)
  2720. LFD f23, 3 * SIZE(BO)
  2721. LFD f24, 4 * SIZE(BO)
  2722. LFD f25, 5 * SIZE(BO)
  2723. LFD f26, 6 * SIZE(BO)
  2724. LFD f27, 7 * SIZE(BO)
  2725. srawi. r0, TEMP, 2
  2726. mtspr CTR, r0
  2727. #endif
  /* CR0 still holds the srawi. result (mtspr and mr do not touch it):
   * bypass the unrolled loop when the quotient is not positive. */
  2728. ble LL(35)
  2729. .align 5
  /* LL(32): unrolled multiply-accumulate loop for the 1-wide tile; bdnz
   * repeats it CTR times. Each k step multiplies one AO value by four BO
   * values. One pass covers four steps (f16, f17, f18, f19 in turn), so
   * AO advances by 4*SIZE and BO by 16*SIZE per pass.
   * Steps 1 and 3 accumulate into f0,f4,f8,f12; steps 2 and 4 into
   * f1,f5,f9,f13. */
  2730. LL(32):
  /* k step 1: f16 times f20-f23, then refill f20-f23 for step 3. */
  2731. FMADD f0, f16, f20, f0
  2732. FMADD f4, f16, f21, f4
  2733. FMADD f8, f16, f22, f8
  2734. FMADD f12, f16, f23, f12
  2735. LFD f20, 8 * SIZE(BO)
  2736. LFD f21, 9 * SIZE(BO)
  2737. LFD f22, 10 * SIZE(BO)
  2738. LFD f23, 11 * SIZE(BO)
  /* k step 2: f17 times f24-f27, then refill f24-f27 for step 4. */
  2739. FMADD f1, f17, f24, f1
  2740. FMADD f5, f17, f25, f5
  2741. FMADD f9, f17, f26, f9
  2742. FMADD f13, f17, f27, f13
  2743. LFD f24, 12 * SIZE(BO)
  2744. LFD f25, 13 * SIZE(BO)
  2745. LFD f26, 14 * SIZE(BO)
  2746. LFD f27, 15 * SIZE(BO)
  /* k step 3: f18 times f20-f23, then preload f20-f23 for the next pass. */
  2747. FMADD f0, f18, f20, f0
  2748. FMADD f4, f18, f21, f4
  2749. FMADD f8, f18, f22, f8
  2750. FMADD f12, f18, f23, f12
  2751. LFD f20, 16 * SIZE(BO)
  2752. LFD f21, 17 * SIZE(BO)
  2753. LFD f22, 18 * SIZE(BO)
  2754. LFD f23, 19 * SIZE(BO)
  /* k step 4: f19 times f24-f27. */
  2755. FMADD f1, f19, f24, f1
  2756. FMADD f5, f19, f25, f5
  2757. FMADD f9, f19, f26, f9
  2758. FMADD f13, f19, f27, f13
  /* Preload the next four AO values and the remaining BO values so the
   * next pass (or the remainder loop) starts with operands in registers. */
  2759. LFD f16, 4 * SIZE(AO)
  2760. LFD f17, 5 * SIZE(AO)
  2761. LFD f18, 6 * SIZE(AO)
  2762. LFD f19, 7 * SIZE(AO)
  2763. LFD f24, 20 * SIZE(BO)
  2764. LFD f25, 21 * SIZE(BO)
  2765. LFD f26, 22 * SIZE(BO)
  2766. LFD f27, 23 * SIZE(BO)
  2767. addi AO, AO, 4 * SIZE
  2768. addi BO, BO, 16 * SIZE
  /* DCBT is a macro defined outside this view; presumably a cache-touch
   * prefetch hint for the address BO + PREB -- TODO confirm. */
  2769. DCBT(BO, PREB)
  2770. bdnz LL(32)
  /* Fold the odd-step accumulators into the even-step ones; only
   * f0,f4,f8,f12 are used from here on. */
  2771. fadd f0, f1, f0
  2772. fadd f4, f5, f4
  2773. fadd f8, f9, f8
  2774. fadd f12, f13, f12
  2775. .align 4
  /* LL(35): leftover k steps for the 1-wide tile. The count is the low two
   * bits of KK (LT/RN) or of TEMP (other variants), i.e. the steps that the
   * loop unrolled by four did not cover. */
  2776. LL(35):
  2777. #if defined(LT) || defined(RN)
  2778. andi. r0, KK, 3
  2779. #else
  2780. andi. r0, TEMP, 3
  2781. #endif
  2782. mtspr CTR, r0
  /* Skip the loop when the remainder is zero ('+' hints likely taken). */
  2783. ble+ LL(38)
  2784. .align 4
  /* LL(36): one k step per pass -- f16 (AO) times f20-f23 (BO) into
   * f0,f4,f8,f12, then load the next operands and advance BO by 4*SIZE and
   * AO by 1*SIZE. */
  2785. LL(36):
  2786. FMADD f0, f16, f20, f0
  2787. FMADD f4, f16, f21, f4
  2788. FMADD f8, f16, f22, f8
  2789. FMADD f12, f16, f23, f12
  2790. LFD f16, 1 * SIZE(AO)
  2791. LFD f20, 4 * SIZE(BO)
  2792. LFD f21, 5 * SIZE(BO)
  2793. LFD f22, 6 * SIZE(BO)
  2794. LFD f23, 7 * SIZE(BO)
  2795. addi BO, BO, 4 * SIZE
  2796. addi AO, AO, 1 * SIZE
  2797. bdnz LL(36)
  2798. .align 4
  /* LL(38): finish the 1-wide tile. The accumulated products in
   * f0,f4,f8,f12 are combined with the four values already stored in the
   * packed buffer, the variant-specific (LN/LT/RN/RT) substitution is
   * applied, and the results are written back to the packed buffer and to
   * CO1-CO4.
   * NOTE(review): FSUB/FMUL/FNMSUB are macros defined outside this view;
   * the comments below assume they map to the PowerPC fsub (d = a - b),
   * fmul and fnmsub (d = b - a*c for "d, a, c, b") -- TODO confirm. */
  2799. LL(38):
  /* LN/RT: point AO and BO at index KK-1 (LN) or KK-4 (RT) relative to
   * AORIG and B. AO is scaled by 1 element per index, BO by 4. */
  2800. #if defined(LN) || defined(RT)
  2801. #ifdef LN
  2802. subi r0, KK, 1
  2803. #else
  2804. subi r0, KK, 4
  2805. #endif
  2806. slwi TEMP, r0, 0 + BASE_SHIFT
  2807. slwi r0, r0, 2 + BASE_SHIFT
  2808. add AO, AORIG, TEMP
  2809. add BO, B, r0
  2810. #endif
  /* Load the four stored values (from BO for LN/LT, from AO for RN/RT) and
   * replace each accumulator with (stored value) - (accumulated product). */
  2811. #if defined(LN) || defined(LT)
  2812. LFD f16, 0 * SIZE(BO)
  2813. LFD f17, 1 * SIZE(BO)
  2814. LFD f18, 2 * SIZE(BO)
  2815. LFD f19, 3 * SIZE(BO)
  2816. FSUB f0, f16, f0
  2817. FSUB f4, f17, f4
  2818. FSUB f8, f18, f8
  2819. FSUB f12, f19, f12
  2820. #else
  2821. LFD f16, 0 * SIZE(AO)
  2822. LFD f20, 1 * SIZE(AO)
  2823. LFD f24, 2 * SIZE(AO)
  2824. LFD f28, 3 * SIZE(AO)
  2825. FSUB f0, f16, f0
  2826. FSUB f4, f20, f4
  2827. FSUB f8, f24, f8
  2828. FSUB f12, f28, f12
  2829. #endif
  /* LN and LT: with a single unknown per column the substitution reduces
   * to one multiply by the value at AO offset 0. It is multiplied in
   * rather than divided by, so it is presumably stored pre-inverted by the
   * packing code -- TODO confirm (likewise for the FMUL factors in the RN
   * and RT sections below). */
  2830. #ifdef LN
  2831. LFD f21, 0 * SIZE(AO)
  2832. FMUL f0, f21, f0
  2833. FMUL f4, f21, f4
  2834. FMUL f8, f21, f8
  2835. FMUL f12, f21, f12
  2836. #endif
  2837. #ifdef LT
  2838. LFD f16, 0 * SIZE(AO)
  2839. FMUL f0, f16, f0
  2840. FMUL f4, f16, f4
  2841. FMUL f8, f16, f8
  2842. FMUL f12, f16, f12
  2843. #endif
  /* RN: 4x4 substitution using BO offsets 0-3, 5-7, 10-11 and 15 (offsets
   * 4*i+j with j >= i), resolving f0 -> f4 -> f8 -> f12 and eliminating
   * each resolved value from the ones that follow. */
  2844. #ifdef RN
  2845. LFD f16, 0 * SIZE(BO)
  2846. LFD f17, 1 * SIZE(BO)
  2847. LFD f18, 2 * SIZE(BO)
  2848. LFD f19, 3 * SIZE(BO)
  2849. FMUL f0, f16, f0
  2850. FNMSUB f4, f17, f0, f4
  2851. FNMSUB f8, f18, f0, f8
  2852. FNMSUB f12, f19, f0, f12
  2853. LFD f16, 5 * SIZE(BO)
  2854. LFD f17, 6 * SIZE(BO)
  2855. LFD f18, 7 * SIZE(BO)
  2856. LFD f19, 10 * SIZE(BO)
  2857. LFD f20, 11 * SIZE(BO)
  2858. LFD f21, 15 * SIZE(BO)
  2859. FMUL f4, f16, f4
  2860. FNMSUB f8, f17, f4, f8
  2861. FNMSUB f12, f18, f4, f12
  2862. FMUL f8, f19, f8
  2863. FNMSUB f12, f20, f8, f12
  2864. FMUL f12, f21, f12
  2865. #endif
  /* RT: 4x4 substitution using BO offsets 15-12, 10-8, 5-4 and 0 (offsets
   * 4*i+j with j <= i), resolving in the reverse order
   * f12 -> f8 -> f4 -> f0. */
  2866. #ifdef RT
  2867. LFD f16, 15 * SIZE(BO)
  2868. LFD f17, 14 * SIZE(BO)
  2869. LFD f18, 13 * SIZE(BO)
  2870. LFD f19, 12 * SIZE(BO)
  2871. FMUL f12, f16, f12
  2872. FNMSUB f8, f17, f12, f8
  2873. FNMSUB f4, f18, f12, f4
  2874. FNMSUB f0, f19, f12, f0
  2875. LFD f16, 10 * SIZE(BO)
  2876. LFD f17, 9 * SIZE(BO)
  2877. LFD f18, 8 * SIZE(BO)
  2878. LFD f19, 5 * SIZE(BO)
  2879. FMUL f8, f16, f8
  2880. LFD f20, 4 * SIZE(BO)
  2881. LFD f21, 0 * SIZE(BO)
  2882. FNMSUB f4, f17, f8, f4
  2883. FNMSUB f0, f18, f8, f0
  2884. FMUL f4, f19, f4
  2885. FNMSUB f0, f20, f4, f0
  2886. FMUL f0, f21, f0
  2887. #endif
  /* LN: step the four output pointers back by one element before the
   * stores below (LN never advances them afterwards). */
  2888. #ifdef LN
  2889. subi CO1, CO1, 1 * SIZE
  2890. subi CO2, CO2, 1 * SIZE
  2891. subi CO3, CO3, 1 * SIZE
  2892. subi CO4, CO4, 1 * SIZE
  2893. #endif
  /* Write the results back into the packed buffer they were loaded from
   * (BO for LN/LT, AO for RN/RT). */
  2894. #if defined(LN) || defined(LT)
  2895. STFD f0, 0 * SIZE(BO)
  2896. STFD f4, 1 * SIZE(BO)
  2897. STFD f8, 2 * SIZE(BO)
  2898. STFD f12, 3 * SIZE(BO)
  2899. #else
  2900. STFD f0, 0 * SIZE(AO)
  2901. STFD f4, 1 * SIZE(AO)
  2902. STFD f8, 2 * SIZE(AO)
  2903. STFD f12, 3 * SIZE(AO)
  2904. #endif
  /* Store the same results to the output: one element through each of
   * CO1, CO2, CO3 and CO4. */
  2905. STFD f0, 0 * SIZE(CO1)
  2906. STFD f4, 0 * SIZE(CO2)
  2907. STFD f8, 0 * SIZE(CO3)
  2908. STFD f12, 0 * SIZE(CO4)
  /* Reset the eight accumulators this tile used (f0,f1,f4,f5,f8,f9,f12,f13)
   * from the FZERO location (defined outside this view; by its name a
   * stored 0.0). */
  2909. lfs f0, FZERO
  2910. fmr f1, f0
  2911. fmr f4, f0
  2912. fmr f5, f0
  2913. fmr f8, f0
  2914. fmr f9, f0
  2915. fmr f12, f0
  2916. fmr f13, f0
  /* All variants except LN move the output pointers forward past the
   * element just written. */
  2917. #ifndef LN
  2918. addi CO1, CO1, 1 * SIZE
  2919. addi CO2, CO2, 1 * SIZE
  2920. addi CO3, CO3, 1 * SIZE
  2921. addi CO4, CO4, 1 * SIZE
  2922. #endif
  /* RT: advance AORIG by K steps of one element each. */
  2923. #ifdef RT
  2924. slwi r0, K, 0 + BASE_SHIFT
  2925. add AORIG, AORIG, r0
  2926. #endif
  /* LT/RN: skip the remaining K-KK steps in both streams
   * (1 element per step in AO, 4 per step in BO). */
  2927. #if defined(LT) || defined(RN)
  2928. sub TEMP, K, KK
  2929. slwi r0, TEMP, 0 + BASE_SHIFT
  2930. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2931. add AO, AO, r0
  2932. add BO, BO, TEMP
  2933. #endif
  /* The left-side variants move KK by the tile width: -1 for LN, +1 for LT. */
  2934. #ifdef LN
  2935. subi KK, KK, 1
  2936. #endif
  2937. #ifdef LT
  2938. addi KK, KK, 1
  2939. #endif
  2940. .align 4
  /* LL(39): bookkeeping at the end of one J-loop iteration (the loop head
   * LL(10) is outside this view). Updates B and KK for the active variant,
   * decrements J and loops while J is still positive. */
  2941. LL(39):
  /* LN: advance B by K steps of 4 elements each. */
  2942. #ifdef LN
  2943. slwi r0, K, 2 + BASE_SHIFT
  2944. add B, B, r0
  2945. #endif
  /* LT/RN: BO has already been walked to the end of this panel; carry that
   * position over into B. */
  2946. #if defined(LT) || defined(RN)
  2947. mr B, BO
  2948. #endif
  /* The right-side variants move KK by four: +4 for RN, -4 for RT. */
  2949. #ifdef RN
  2950. addi KK, KK, 4
  2951. #endif
  2952. #ifdef RT
  2953. subi KK, KK, 4
  2954. #endif
  /* addic. sets CR0 from the decremented J. The lfs in between reloads f0
   * from FZERO for the next iteration and does not change CR0, so the bgt
   * still tests J. */
  2955. addic. J, J, -1
  2956. lfs f0, FZERO
  2957. bgt LL(10)
  2958. .align 4
  /* LL(999): common exit. Sets r3 to zero, reloads f14-f31 and r18-r31
   * from the stack frame, releases the frame and returns.
   * The matching saves are in the prologue, outside this view; presumably
   * they use the same offsets -- TODO confirm when editing either side. */
  2959. LL(999):
  /* addi with 0 as the base operand adds to the literal zero, not to the
   * contents of r0, so this loads r3 (the integer return-value register of
   * the PowerPC ABIs) with 0. */
  2960. addi r3, 0, 0
  /* Floating-point registers f14-f31: eighteen 8-byte slots at
   * 0(SP) .. 136(SP). */
  2961. lfd f14, 0(SP)
  2962. lfd f15, 8(SP)
  2963. lfd f16, 16(SP)
  2964. lfd f17, 24(SP)
  2965. lfd f18, 32(SP)
  2966. lfd f19, 40(SP)
  2967. lfd f20, 48(SP)
  2968. lfd f21, 56(SP)
  2969. lfd f22, 64(SP)
  2970. lfd f23, 72(SP)
  2971. lfd f24, 80(SP)
  2972. lfd f25, 88(SP)
  2973. lfd f26, 96(SP)
  2974. lfd f27, 104(SP)
  2975. lfd f28, 112(SP)
  2976. lfd f29, 120(SP)
  2977. lfd f30, 128(SP)
  2978. lfd f31, 136(SP)
  /* General-purpose registers r31 down to r18, starting at 144(SP):
   * 8-byte slots read with ld when __64BIT__ is defined, 4-byte slots read
   * with lwz otherwise. */
  2979. #ifdef __64BIT__
  2980. ld r31, 144(SP)
  2981. ld r30, 152(SP)
  2982. ld r29, 160(SP)
  2983. ld r28, 168(SP)
  2984. ld r27, 176(SP)
  2985. ld r26, 184(SP)
  2986. ld r25, 192(SP)
  2987. ld r24, 200(SP)
  2988. ld r23, 208(SP)
  2989. ld r22, 216(SP)
  2990. ld r21, 224(SP)
  2991. ld r20, 232(SP)
  2992. ld r19, 240(SP)
  2993. ld r18, 248(SP)
  2994. #else
  2995. lwz r31, 144(SP)
  2996. lwz r30, 148(SP)
  2997. lwz r29, 152(SP)
  2998. lwz r28, 156(SP)
  2999. lwz r27, 160(SP)
  3000. lwz r26, 164(SP)
  3001. lwz r25, 168(SP)
  3002. lwz r24, 172(SP)
  3003. lwz r23, 176(SP)
  3004. lwz r22, 180(SP)
  3005. lwz r21, 184(SP)
  3006. lwz r20, 188(SP)
  3007. lwz r19, 192(SP)
  3008. lwz r18, 196(SP)
  3009. #endif
  /* Release the STACKSIZE-byte frame and return to the caller. EPILOGUE is
   * a macro defined outside this view. */
  3010. addi SP, SP, STACKSIZE
  3011. blr
  3012. EPILOGUE
  3013. #endif