You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_cell_LT.S 63 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD names the integer load mnemonic for the build width: lwz (load */
  /* word and zero) when __64BIT__ is not defined, ld (load doubleword) */
  /* when it is. No use of LOAD appears in the part of the file reviewed */
  /* here. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame layout, selected by __64BIT__. */
  /* STACKSIZE is the byte count the prologue subtracts from SP. */
  /* ALPHA and FZERO are SP-relative slots that lie inside that frame. */
  /* The prologue stores an integer zero (r0) to FZERO; the code later */
  /* reloads it with lfs into f0 and copies f0 into f1..f15 to clear the */
  /* floating-point accumulators. */
  /* ALPHA is not referenced in the part of the file reviewed here. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Argument register map. */
  /* M, N and K are the three size arguments; the routine branches to */
  /* LL(999) when any of them is <= 0. */
  /* A, B and C are the base addresses from which the AO, BO and CO1..CO4 */
  /* pointers are derived. LDC is shifted left by BASE_SHIFT in the */
  /* prologue and then added to step from one C column pointer to the */
  /* next. OFFSET is the value from which KK is initialised. */
  /* The register assigned to each name depends on the OS macros and on */
  /* __64BIT__ / DOUBLE; in some configurations the prologue loads the */
  /* value from FRAMESLOT(n) + STACKSIZE(SP) into the named register */
  /* instead of finding it there on entry (noted per branch below). */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  /* No stack loads of these five names are visible for this branch. */
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  /* OFFSET is loaded from FRAMESLOT(0) by the prologue in this branch. */
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  /* LDC is loaded from FRAMESLOT(0) and OFFSET from FRAMESLOT(1) by the */
  /* prologue in this branch. */
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  /* OFFSET is loaded from FRAMESLOT(0) by the prologue in this branch. */
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers r18..r31. The prologue stores r18..r31 into the */
  /* frame before any of them is written. */
  /* NOTE(review): with CELL defined and PREFETCHTEST undefined, the only */
  /* visible assignment to PREC is in the TRSMKERNEL && LN branch; the */
  /* "normal prefetch" branch has no CELL case. Confirm PREC is meant to */
  /* be left unset for the other CELL builds. */
  87. #define AORIG r18 /* LN/RT builds: copy of A from which AO is recomputed per tile */
  88. #define TEMP r19 /* scratch: byte offsets and the K - KK count */
  89. #define KK r20 /* position counter derived from OFFSET (plus M for LN, N for RT) */
  90. #define I r21 /* M-direction counter: M >> 2 for the 4-wide loop, then M & 2 */
  91. #define J r22 /* set to N >> 2 before LL(10) */
  92. #define AO r23 /* current read/write pointer into A */
  93. #define BO r24 /* current read/write pointer into B */
  94. #define CO1 r25 /* first C column pointer, copied from C */
  95. #define CO2 r26 /* CO1 + LDC */
  96. #define CO3 r27 /* CO2 + LDC */
  97. #define CO4 r28 /* CO3 + LDC */
  98. #define PREA r29 /* index operand of the dcbt issued on AO */
  99. #define PREB r30 /* index operand of the dcbt issued on BO */
  100. #define PREC r31 /* index operand of the dcbt/dcbtst issued on CO1..CO4 */
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. #ifndef PREFETCHTEST
  201. #if defined(TRSMKERNEL) && defined(LN)
  202. /* Direction is special */
  203. #ifdef PPC970
  204. li PREC, -4 * SIZE
  205. #endif
  206. #ifdef POWER4
  207. li PREC, -4 * SIZE
  208. #endif
  209. #ifdef POWER5
  210. li PREC, -4 * SIZE
  211. #endif
  212. #ifdef CELL
  213. li PREC, -4 * SIZE
  214. #endif
  215. #else
  216. /* Normal prefetch */
  217. #ifdef PPC970
  218. li PREC, 4 * SIZE
  219. #endif
  220. #ifdef POWER4
  221. li PREC, 4 * SIZE /* is 12 best? */
  222. #endif
  223. #ifdef POWER5
  224. li PREC, 3 * SIZE
  225. #endif
  226. #endif
  227. #else
  228. #if defined(linux) || defined(__FreeBSD__)
  229. #ifndef __64BIT__
  230. mr PREA, r10
  231. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  232. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  233. #else
  234. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  235. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  236. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  237. #endif
  238. #endif
  239. #if defined(_AIX) || defined(__APPLE__)
  240. #ifdef __64BIT__
  241. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  242. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  243. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  244. #else
  245. #ifdef DOUBLE
  246. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  247. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  248. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  249. #else
  250. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  251. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  252. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  253. #endif
  254. #endif
  255. #endif
  256. #endif
  257. #ifndef PREFETCHTEST
  258. #ifdef PPC970
  259. #ifdef ALLOC_HUGETLB
  260. li PREA, (16 * 5 * SIZE | 1)
  261. li PREB, (16 * 5 * SIZE | 3)
  262. #else
  263. li PREA, (16 * 14 * SIZE | 1)
  264. li PREB, (16 * 8 * SIZE | 3)
  265. #endif
  266. #endif
  267. #ifdef POWER4
  268. #ifdef ALLOC_HUGETLB
  269. li PREA, (16 * 1 * SIZE + 16)
  270. li PREB, (16 * 1 * SIZE + 16)
  271. #else
  272. li PREA, (16 * 2 * SIZE + 16)
  273. li PREB, (16 * 2 * SIZE + 16)
  274. #endif
  275. #endif
  276. #ifdef POWER5
  277. #ifdef ALLOC_HUGETLB
  278. li PREA, (16 * 7 * SIZE | 1)
  279. li PREB, (16 * 7 * SIZE | 3)
  280. #else
  281. li PREA, (16 * 12 * SIZE | 1)
  282. li PREB, (16 * 6 * SIZE | 3)
  283. #endif
  284. #endif
  285. #ifdef CELL
  286. li PREA, (16 * 12 * SIZE)
  287. li PREB, (16 * 12 * SIZE)
  288. #endif
  289. #endif
  290. lfs f0, FZERO
  291. srawi. J, N, 2
  292. ble LL(40)
  293. .align 4
  294. LL(10):
  295. #ifdef RT
  296. slwi r0, K, 2 + BASE_SHIFT
  297. sub B, B, r0
  298. slwi r0, LDC, 2
  299. sub C, C, r0
  300. #endif
  301. mr CO1, C
  302. add CO2, C, LDC
  303. add CO3, CO2, LDC
  304. add CO4, CO3, LDC
  305. #ifdef LN
  306. add KK, M, OFFSET
  307. #endif
  308. #ifdef LT
  309. mr KK, OFFSET
  310. #endif
  311. fmr f1, f0
  312. fmr f2, f0
  313. fmr f3, f0
  314. fmr f4, f0
  315. fmr f5, f0
  316. fmr f6, f0
  317. fmr f7, f0
  318. fmr f8, f0
  319. fmr f9, f0
  320. fmr f10, f0
  321. fmr f11, f0
  322. fmr f12, f0
  323. fmr f13, f0
  324. fmr f14, f0
  325. fmr f15, f0
  326. srawi. I, M, 2
  327. #if defined(LN) || defined(RT)
  328. mr AORIG, A
  329. #else
  330. mr AO, A
  331. #endif
  332. #ifndef RT
  333. add C, CO4, LDC
  334. #endif
  335. ble LL(20)
  336. .align 4
  337. LL(11):
  338. #if defined(LT) || defined(RN)
  339. LFD f16, 0 * SIZE(AO)
  340. LFD f17, 1 * SIZE(AO)
  341. LFD f18, 2 * SIZE(AO)
  342. LFD f19, 3 * SIZE(AO)
  343. LFD f20, 0 * SIZE(B)
  344. LFD f21, 1 * SIZE(B)
  345. LFD f22, 2 * SIZE(B)
  346. LFD f23, 3 * SIZE(B)
  347. LFD f24, 4 * SIZE(AO)
  348. LFD f25, 5 * SIZE(AO)
  349. LFD f26, 6 * SIZE(AO)
  350. LFD f28, 4 * SIZE(B)
  351. LFD f29, 5 * SIZE(B)
  352. LFD f30, 6 * SIZE(B)
  353. dcbtst CO1, PREC
  354. dcbtst CO2, PREC
  355. dcbtst CO3, PREC
  356. dcbtst CO4, PREC
  357. srawi. r0, KK, 2
  358. mtspr CTR, r0
  359. mr BO, B
  360. #else
  361. #ifdef LN
  362. slwi r0, K, 2 + BASE_SHIFT
  363. sub AORIG, AORIG, r0
  364. #endif
  365. slwi TEMP, KK, 2 + BASE_SHIFT
  366. add AO, AORIG, TEMP
  367. add BO, B, TEMP
  368. sub TEMP, K, KK
  369. LFD f16, 0 * SIZE(AO)
  370. LFD f17, 1 * SIZE(AO)
  371. LFD f18, 2 * SIZE(AO)
  372. LFD f19, 3 * SIZE(AO)
  373. LFD f20, 0 * SIZE(BO)
  374. LFD f21, 1 * SIZE(BO)
  375. LFD f22, 2 * SIZE(BO)
  376. LFD f23, 3 * SIZE(BO)
  377. dcbt CO1, PREC
  378. dcbt CO2, PREC
  379. dcbt CO3, PREC
  380. dcbt CO4, PREC
  381. srawi. r0, TEMP, 2
  382. mtspr CTR, r0
  383. #endif
  384. ble LL(15)
  385. .align 4
  386. #define NOP1 mr r18, r18
  387. #define NOP2 mr r19, r19
  388. LL(12):
  389. FMADD f0, f16, f20, f0
  390. dcbt AO, PREA
  391. FMADD f4, f16, f21, f4
  392. dcbt BO, PREB
  393. FMADD f8, f16, f22, f8
  394. LFD f31, 7 * SIZE(BO)
  395. FMADD f12, f16, f23, f12
  396. LFD f27, 7 * SIZE(AO)
  397. FMADD f1, f17, f20, f1
  398. LFD f16, 8 * SIZE(AO)
  399. FMADD f5, f17, f21, f5
  400. NOP2
  401. FMADD f9, f17, f22, f9
  402. NOP1
  403. FMADD f13, f17, f23, f13
  404. LFD f17, 9 * SIZE(AO)
  405. FMADD f2, f18, f20, f2
  406. NOP1
  407. FMADD f6, f18, f21, f6
  408. NOP2
  409. FMADD f10, f18, f22, f10
  410. NOP1
  411. FMADD f14, f18, f23, f14
  412. LFD f18, 10 * SIZE(AO)
  413. FMADD f3, f19, f20, f3
  414. LFD f20, 8 * SIZE(BO)
  415. FMADD f7, f19, f21, f7
  416. LFD f21, 9 * SIZE(BO)
  417. FMADD f11, f19, f22, f11
  418. LFD f22, 10 * SIZE(BO)
  419. FMADD f15, f19, f23, f15
  420. LFD f19, 11 * SIZE(AO)
  421. FMADD f0, f24, f28, f0
  422. LFD f23, 11 * SIZE(BO)
  423. FMADD f4, f24, f29, f4
  424. NOP2
  425. FMADD f8, f24, f30, f8
  426. NOP1
  427. FMADD f12, f24, f31, f12
  428. LFD f24, 12 * SIZE(AO)
  429. FMADD f1, f25, f28, f1
  430. NOP1
  431. FMADD f5, f25, f29, f5
  432. NOP2
  433. FMADD f9, f25, f30, f9
  434. NOP1
  435. FMADD f13, f25, f31, f13
  436. LFD f25, 13 * SIZE(AO)
  437. FMADD f2, f26, f28, f2
  438. NOP1
  439. FMADD f6, f26, f29, f6
  440. NOP2
  441. FMADD f10, f26, f30, f10
  442. NOP1
  443. FMADD f14, f26, f31, f14
  444. LFD f26, 14 * SIZE(AO)
  445. FMADD f3, f27, f28, f3
  446. LFD f28, 12 * SIZE(BO)
  447. FMADD f7, f27, f29, f7
  448. LFD f29, 13 * SIZE(BO)
  449. FMADD f11, f27, f30, f11
  450. LFD f30, 14 * SIZE(BO)
  451. FMADD f15, f27, f31, f15
  452. LFD f27, 15 * SIZE(AO)
  453. FMADD f0, f16, f20, f0
  454. LFD f31, 15 * SIZE(BO)
  455. FMADD f4, f16, f21, f4
  456. NOP2
  457. FMADD f8, f16, f22, f8
  458. NOP1
  459. FMADD f12, f16, f23, f12
  460. LFD f16, 16 * SIZE(AO)
  461. FMADD f1, f17, f20, f1
  462. NOP1
  463. FMADD f5, f17, f21, f5
  464. NOP2
  465. FMADD f9, f17, f22, f9
  466. NOP1
  467. FMADD f13, f17, f23, f13
  468. LFD f17, 17 * SIZE(AO)
  469. FMADD f2, f18, f20, f2
  470. NOP1
  471. FMADD f6, f18, f21, f6
  472. NOP2
  473. FMADD f10, f18, f22, f10
  474. NOP1
  475. FMADD f14, f18, f23, f14
  476. LFD f18, 18 * SIZE(AO)
  477. FMADD f3, f19, f20, f3
  478. LFD f20, 16 * SIZE(BO)
  479. FMADD f7, f19, f21, f7
  480. LFD f21, 17 * SIZE(BO)
  481. FMADD f11, f19, f22, f11
  482. LFD f22, 18 * SIZE(BO)
  483. FMADD f15, f19, f23, f15
  484. LFD f19, 19 * SIZE(AO)
  485. FMADD f0, f24, f28, f0
  486. LFD f23, 19 * SIZE(BO)
  487. FMADD f4, f24, f29, f4
  488. NOP2
  489. FMADD f8, f24, f30, f8
  490. NOP1
  491. FMADD f12, f24, f31, f12
  492. LFD f24, 20 * SIZE(AO)
  493. FMADD f1, f25, f28, f1
  494. NOP1
  495. FMADD f5, f25, f29, f5
  496. NOP2
  497. FMADD f9, f25, f30, f9
  498. NOP1
  499. FMADD f13, f25, f31, f13
  500. LFD f25, 21 * SIZE(AO)
  501. FMADD f2, f26, f28, f2
  502. NOP1
  503. FMADD f6, f26, f29, f6
  504. NOP2
  505. FMADD f10, f26, f30, f10
  506. NOP1
  507. FMADD f14, f26, f31, f14
  508. LFD f26, 22 * SIZE(AO)
  509. FMADD f3, f27, f28, f3
  510. LFD f28, 20 * SIZE(BO)
  511. FMADD f7, f27, f29, f7
  512. LFD f29, 21 * SIZE(BO)
  513. FMADD f11, f27, f30, f11
  514. LFD f30, 22 * SIZE(BO)
  515. FMADD f15, f27, f31, f15
  516. addi AO, AO, 16 * SIZE
  517. addi BO, BO, 16 * SIZE
  518. bdnz LL(12)
  519. .align 4
  520. LL(15):
  521. #if defined(LT) || defined(RN)
  522. andi. r0, KK, 3
  523. #else
  524. andi. r0, TEMP, 3
  525. #endif
  526. mtspr CTR, r0
  527. ble+ LL(18)
  528. .align 4
  529. LL(16):
  530. FMADD f0, f16, f20, f0
  531. FMADD f5, f17, f21, f5
  532. FMADD f10, f18, f22, f10
  533. FMADD f15, f19, f23, f15
  534. FMADD f1, f17, f20, f1
  535. FMADD f2, f18, f20, f2
  536. FMADD f3, f19, f20, f3
  537. FMADD f4, f16, f21, f4
  538. FMADD f6, f18, f21, f6
  539. FMADD f7, f19, f21, f7
  540. FMADD f8, f16, f22, f8
  541. FMADD f9, f17, f22, f9
  542. FMADD f11, f19, f22, f11
  543. FMADD f12, f16, f23, f12
  544. FMADD f13, f17, f23, f13
  545. FMADD f14, f18, f23, f14
  546. LFD f16, 4 * SIZE(AO)
  547. LFD f17, 5 * SIZE(AO)
  548. LFD f18, 6 * SIZE(AO)
  549. LFD f19, 7 * SIZE(AO)
  550. LFD f20, 4 * SIZE(BO)
  551. LFD f21, 5 * SIZE(BO)
  552. LFD f22, 6 * SIZE(BO)
  553. LFD f23, 7 * SIZE(BO)
  554. addi BO, BO, 4 * SIZE
  555. addi AO, AO, 4 * SIZE
  556. bdnz LL(16)
  557. .align 4
  558. LL(18):
  559. #if defined(LN) || defined(RT)
  560. subi r0, KK, 4
  561. slwi r0, r0, 2 + BASE_SHIFT
  562. add AO, AORIG, r0
  563. add BO, B, r0
  564. #endif
  565. #if defined(LN) || defined(LT)
  566. LFD f16, 0 * SIZE(BO)
  567. LFD f17, 1 * SIZE(BO)
  568. LFD f18, 2 * SIZE(BO)
  569. LFD f19, 3 * SIZE(BO)
  570. LFD f20, 4 * SIZE(BO)
  571. LFD f21, 5 * SIZE(BO)
  572. LFD f22, 6 * SIZE(BO)
  573. LFD f23, 7 * SIZE(BO)
  574. LFD f24, 8 * SIZE(BO)
  575. LFD f25, 9 * SIZE(BO)
  576. LFD f26, 10 * SIZE(BO)
  577. LFD f27, 11 * SIZE(BO)
  578. LFD f28, 12 * SIZE(BO)
  579. LFD f29, 13 * SIZE(BO)
  580. LFD f30, 14 * SIZE(BO)
  581. LFD f31, 15 * SIZE(BO)
  582. FSUB f0, f16, f0
  583. FSUB f4, f17, f4
  584. FSUB f8, f18, f8
  585. FSUB f12, f19, f12
  586. FSUB f1, f20, f1
  587. FSUB f5, f21, f5
  588. FSUB f9, f22, f9
  589. FSUB f13, f23, f13
  590. FSUB f2, f24, f2
  591. FSUB f6, f25, f6
  592. FSUB f10, f26, f10
  593. FSUB f14, f27, f14
  594. FSUB f3, f28, f3
  595. FSUB f7, f29, f7
  596. FSUB f11, f30, f11
  597. FSUB f15, f31, f15
  598. #else
  599. LFD f16, 0 * SIZE(AO)
  600. LFD f17, 1 * SIZE(AO)
  601. LFD f18, 2 * SIZE(AO)
  602. LFD f19, 3 * SIZE(AO)
  603. LFD f20, 4 * SIZE(AO)
  604. LFD f21, 5 * SIZE(AO)
  605. LFD f22, 6 * SIZE(AO)
  606. LFD f23, 7 * SIZE(AO)
  607. LFD f24, 8 * SIZE(AO)
  608. LFD f25, 9 * SIZE(AO)
  609. LFD f26, 10 * SIZE(AO)
  610. LFD f27, 11 * SIZE(AO)
  611. LFD f28, 12 * SIZE(AO)
  612. LFD f29, 13 * SIZE(AO)
  613. LFD f30, 14 * SIZE(AO)
  614. LFD f31, 15 * SIZE(AO)
  615. FSUB f0, f16, f0
  616. FSUB f1, f17, f1
  617. FSUB f2, f18, f2
  618. FSUB f3, f19, f3
  619. FSUB f4, f20, f4
  620. FSUB f5, f21, f5
  621. FSUB f6, f22, f6
  622. FSUB f7, f23, f7
  623. FSUB f8, f24, f8
  624. FSUB f9, f25, f9
  625. FSUB f10, f26, f10
  626. FSUB f11, f27, f11
  627. FSUB f12, f28, f12
  628. FSUB f13, f29, f13
  629. FSUB f14, f30, f14
  630. FSUB f15, f31, f15
  631. #endif
  632. #ifdef LN
  633. LFD f16, 15 * SIZE(AO)
  634. LFD f17, 14 * SIZE(AO)
  635. LFD f18, 13 * SIZE(AO)
  636. LFD f19, 12 * SIZE(AO)
  637. FMUL f3, f16, f3
  638. FMUL f7, f16, f7
  639. FMUL f11, f16, f11
  640. FMUL f15, f16, f15
  641. FNMSUB f2, f17, f3, f2
  642. FNMSUB f6, f17, f7, f6
  643. FNMSUB f10, f17, f11, f10
  644. FNMSUB f14, f17, f15, f14
  645. FNMSUB f1, f18, f3, f1
  646. FNMSUB f5, f18, f7, f5
  647. FNMSUB f9, f18, f11, f9
  648. FNMSUB f13, f18, f15, f13
  649. FNMSUB f0, f19, f3, f0
  650. FNMSUB f4, f19, f7, f4
  651. FNMSUB f8, f19, f11, f8
  652. FNMSUB f12, f19, f15, f12
  653. LFD f16, 10 * SIZE(AO)
  654. LFD f17, 9 * SIZE(AO)
  655. LFD f18, 8 * SIZE(AO)
  656. LFD f19, 5 * SIZE(AO)
  657. FMUL f2, f16, f2
  658. FMUL f6, f16, f6
  659. FMUL f10, f16, f10
  660. FMUL f14, f16, f14
  661. LFD f20, 4 * SIZE(AO)
  662. LFD f21, 0 * SIZE(AO)
  663. FNMSUB f1, f17, f2, f1
  664. FNMSUB f5, f17, f6, f5
  665. FNMSUB f9, f17, f10, f9
  666. FNMSUB f13, f17, f14, f13
  667. FNMSUB f0, f18, f2, f0
  668. FNMSUB f4, f18, f6, f4
  669. FNMSUB f8, f18, f10, f8
  670. FNMSUB f12, f18, f14, f12
  671. FMUL f1, f19, f1
  672. FMUL f5, f19, f5
  673. FMUL f9, f19, f9
  674. FMUL f13, f19, f13
  675. FNMSUB f0, f20, f1, f0
  676. FNMSUB f4, f20, f5, f4
  677. FNMSUB f8, f20, f9, f8
  678. FNMSUB f12, f20, f13, f12
  679. FMUL f0, f21, f0
  680. FMUL f4, f21, f4
  681. FMUL f8, f21, f8
  682. FMUL f12, f21, f12
  683. #endif
  684. #ifdef LT
  685. LFD f16, 0 * SIZE(AO)
  686. LFD f17, 1 * SIZE(AO)
  687. LFD f18, 2 * SIZE(AO)
  688. LFD f19, 3 * SIZE(AO)
  689. FMUL f0, f16, f0
  690. FMUL f4, f16, f4
  691. FMUL f8, f16, f8
  692. FMUL f12, f16, f12
  693. FNMSUB f1, f17, f0, f1
  694. FNMSUB f5, f17, f4, f5
  695. FNMSUB f9, f17, f8, f9
  696. FNMSUB f13, f17, f12, f13
  697. FNMSUB f2, f18, f0, f2
  698. FNMSUB f6, f18, f4, f6
  699. FNMSUB f10, f18, f8, f10
  700. FNMSUB f14, f18, f12, f14
  701. FNMSUB f3, f19, f0, f3
  702. FNMSUB f7, f19, f4, f7
  703. FNMSUB f11, f19, f8, f11
  704. FNMSUB f15, f19, f12, f15
  705. LFD f16, 5 * SIZE(AO)
  706. LFD f17, 6 * SIZE(AO)
  707. LFD f18, 7 * SIZE(AO)
  708. LFD f19, 10 * SIZE(AO)
  709. FMUL f1, f16, f1
  710. FMUL f5, f16, f5
  711. FMUL f9, f16, f9
  712. FMUL f13, f16, f13
  713. LFD f20, 11 * SIZE(AO)
  714. LFD f21, 15 * SIZE(AO)
  715. FNMSUB f2, f17, f1, f2
  716. FNMSUB f6, f17, f5, f6
  717. FNMSUB f10, f17, f9, f10
  718. FNMSUB f14, f17, f13, f14
  719. FNMSUB f3, f18, f1, f3
  720. FNMSUB f7, f18, f5, f7
  721. FNMSUB f11, f18, f9, f11
  722. FNMSUB f15, f18, f13, f15
  723. FMUL f2, f19, f2
  724. FMUL f6, f19, f6
  725. FMUL f10, f19, f10
  726. FMUL f14, f19, f14
  727. FNMSUB f3, f20, f2, f3
  728. FNMSUB f7, f20, f6, f7
  729. FNMSUB f11, f20, f10, f11
  730. FNMSUB f15, f20, f14, f15
  731. FMUL f3, f21, f3
  732. FMUL f7, f21, f7
  733. FMUL f11, f21, f11
  734. FMUL f15, f21, f15
  735. #endif
  736. #ifdef RN
  737. LFD f16, 0 * SIZE(BO)
  738. LFD f17, 1 * SIZE(BO)
  739. LFD f18, 2 * SIZE(BO)
  740. LFD f19, 3 * SIZE(BO)
  741. FMUL f0, f16, f0
  742. FMUL f1, f16, f1
  743. FMUL f2, f16, f2
  744. FMUL f3, f16, f3
  745. FNMSUB f4, f17, f0, f4
  746. FNMSUB f5, f17, f1, f5
  747. FNMSUB f6, f17, f2, f6
  748. FNMSUB f7, f17, f3, f7
  749. FNMSUB f8, f18, f0, f8
  750. FNMSUB f9, f18, f1, f9
  751. FNMSUB f10, f18, f2, f10
  752. FNMSUB f11, f18, f3, f11
  753. FNMSUB f12, f19, f0, f12
  754. FNMSUB f13, f19, f1, f13
  755. FNMSUB f14, f19, f2, f14
  756. FNMSUB f15, f19, f3, f15
  757. LFD f16, 5 * SIZE(BO)
  758. LFD f17, 6 * SIZE(BO)
  759. LFD f18, 7 * SIZE(BO)
  760. LFD f19, 10 * SIZE(BO)
  761. FMUL f4, f16, f4
  762. FMUL f5, f16, f5
  763. FMUL f6, f16, f6
  764. FMUL f7, f16, f7
  765. LFD f20, 11 * SIZE(BO)
  766. LFD f21, 15 * SIZE(BO)
  767. FNMSUB f8, f17, f4, f8
  768. FNMSUB f9, f17, f5, f9
  769. FNMSUB f10, f17, f6, f10
  770. FNMSUB f11, f17, f7, f11
  771. FNMSUB f12, f18, f4, f12
  772. FNMSUB f13, f18, f5, f13
  773. FNMSUB f14, f18, f6, f14
  774. FNMSUB f15, f18, f7, f15
  775. FMUL f8, f19, f8
  776. FMUL f9, f19, f9
  777. FMUL f10, f19, f10
  778. FMUL f11, f19, f11
  779. FNMSUB f12, f20, f8, f12
  780. FNMSUB f13, f20, f9, f13
  781. FNMSUB f14, f20, f10, f14
  782. FNMSUB f15, f20, f11, f15
  783. FMUL f12, f21, f12
  784. FMUL f13, f21, f13
  785. FMUL f14, f21, f14
  786. FMUL f15, f21, f15
  787. #endif
  788. #ifdef RT
  789. LFD f16, 15 * SIZE(BO)
  790. LFD f17, 14 * SIZE(BO)
  791. LFD f18, 13 * SIZE(BO)
  792. LFD f19, 12 * SIZE(BO)
  793. FMUL f12, f16, f12
  794. FMUL f13, f16, f13
  795. FMUL f14, f16, f14
  796. FMUL f15, f16, f15
  797. FNMSUB f8, f17, f12, f8
  798. FNMSUB f9, f17, f13, f9
  799. FNMSUB f10, f17, f14, f10
  800. FNMSUB f11, f17, f15, f11
  801. FNMSUB f4, f18, f12, f4
  802. FNMSUB f5, f18, f13, f5
  803. FNMSUB f6, f18, f14, f6
  804. FNMSUB f7, f18, f15, f7
  805. FNMSUB f0, f19, f12, f0
  806. FNMSUB f1, f19, f13, f1
  807. FNMSUB f2, f19, f14, f2
  808. FNMSUB f3, f19, f15, f3
  809. LFD f16, 10 * SIZE(BO)
  810. LFD f17, 9 * SIZE(BO)
  811. LFD f18, 8 * SIZE(BO)
  812. LFD f19, 5 * SIZE(BO)
  813. FMUL f8, f16, f8
  814. FMUL f9, f16, f9
  815. FMUL f10, f16, f10
  816. FMUL f11, f16, f11
  817. LFD f20, 4 * SIZE(BO)
  818. LFD f21, 0 * SIZE(BO)
  819. FNMSUB f4, f17, f8, f4
  820. FNMSUB f5, f17, f9, f5
  821. FNMSUB f6, f17, f10, f6
  822. FNMSUB f7, f17, f11, f7
  823. FNMSUB f0, f18, f8, f0
  824. FNMSUB f1, f18, f9, f1
  825. FNMSUB f2, f18, f10, f2
  826. FNMSUB f3, f18, f11, f3
  827. FMUL f4, f19, f4
  828. FMUL f5, f19, f5
  829. FMUL f6, f19, f6
  830. FMUL f7, f19, f7
  831. FNMSUB f0, f20, f4, f0
  832. FNMSUB f1, f20, f5, f1
  833. FNMSUB f2, f20, f6, f2
  834. FNMSUB f3, f20, f7, f3
  835. FMUL f0, f21, f0
  836. FMUL f1, f21, f1
  837. FMUL f2, f21, f2
  838. FMUL f3, f21, f3
  839. #endif
  840. #ifdef LN
  841. subi CO1, CO1, 4 * SIZE
  842. subi CO2, CO2, 4 * SIZE
  843. subi CO3, CO3, 4 * SIZE
  844. subi CO4, CO4, 4 * SIZE
  845. #endif
  846. #if defined(LN) || defined(LT)
  847. STFD f0, 0 * SIZE(BO)
  848. STFD f4, 1 * SIZE(BO)
  849. STFD f8, 2 * SIZE(BO)
  850. STFD f12, 3 * SIZE(BO)
  851. STFD f1, 4 * SIZE(BO)
  852. STFD f5, 5 * SIZE(BO)
  853. STFD f9, 6 * SIZE(BO)
  854. STFD f13, 7 * SIZE(BO)
  855. STFD f2, 8 * SIZE(BO)
  856. STFD f6, 9 * SIZE(BO)
  857. STFD f10, 10 * SIZE(BO)
  858. STFD f14, 11 * SIZE(BO)
  859. STFD f3, 12 * SIZE(BO)
  860. STFD f7, 13 * SIZE(BO)
  861. STFD f11, 14 * SIZE(BO)
  862. STFD f15, 15 * SIZE(BO)
  863. #else
  864. STFD f0, 0 * SIZE(AO)
  865. STFD f1, 1 * SIZE(AO)
  866. STFD f2, 2 * SIZE(AO)
  867. STFD f3, 3 * SIZE(AO)
  868. STFD f4, 4 * SIZE(AO)
  869. STFD f5, 5 * SIZE(AO)
  870. STFD f6, 6 * SIZE(AO)
  871. STFD f7, 7 * SIZE(AO)
  872. STFD f8, 8 * SIZE(AO)
  873. STFD f9, 9 * SIZE(AO)
  874. STFD f10, 10 * SIZE(AO)
  875. STFD f11, 11 * SIZE(AO)
  876. STFD f12, 12 * SIZE(AO)
  877. STFD f13, 13 * SIZE(AO)
  878. STFD f14, 14 * SIZE(AO)
  879. STFD f15, 15 * SIZE(AO)
  880. #endif
  881. STFD f0, 0 * SIZE(CO1)
  882. STFD f1, 1 * SIZE(CO1)
  883. STFD f2, 2 * SIZE(CO1)
  884. STFD f3, 3 * SIZE(CO1)
  885. STFD f4, 0 * SIZE(CO2)
  886. STFD f5, 1 * SIZE(CO2)
  887. STFD f6, 2 * SIZE(CO2)
  888. STFD f7, 3 * SIZE(CO2)
  889. STFD f8, 0 * SIZE(CO3)
  890. STFD f9, 1 * SIZE(CO3)
  891. STFD f10, 2 * SIZE(CO3)
  892. STFD f11, 3 * SIZE(CO3)
  893. STFD f12, 0 * SIZE(CO4)
  894. STFD f13, 1 * SIZE(CO4)
  895. STFD f14, 2 * SIZE(CO4)
  896. STFD f15, 3 * SIZE(CO4)
  897. lfs f0, FZERO
  898. fmr f1, f0
  899. fmr f2, f0
  900. fmr f3, f0
  901. fmr f4, f0
  902. fmr f5, f0
  903. fmr f6, f0
  904. fmr f7, f0
  905. fmr f8, f0
  906. fmr f9, f0
  907. fmr f10, f0
  908. fmr f11, f0
  909. fmr f12, f0
  910. fmr f13, f0
  911. fmr f14, f0
  912. fmr f15, f0
  913. #ifndef LN
  914. addi CO1, CO1, 4 * SIZE
  915. addi CO2, CO2, 4 * SIZE
  916. addi CO3, CO3, 4 * SIZE
  917. addi CO4, CO4, 4 * SIZE
  918. #endif
  919. #ifdef RT
  920. slwi r0, K, 2 + BASE_SHIFT
  921. add AORIG, AORIG, r0
  922. #endif
  923. #if defined(LT) || defined(RN)
  924. sub TEMP, K, KK
  925. slwi TEMP, TEMP, 2 + BASE_SHIFT
  926. add AO, AO, TEMP
  927. add BO, BO, TEMP
  928. #endif
  929. #ifdef LT
  930. addi KK, KK, 4
  931. #endif
  932. #ifdef LN
  933. subi KK, KK, 4
  934. #endif
  935. addic. I, I, -1
  936. bgt+ LL(11)
  937. .align 4
  938. LL(20):
  939. andi. I, M, 2
  940. ble LL(30)
  941. #if defined(LT) || defined(RN)
  942. LFD f16, 0 * SIZE(AO)
  943. LFD f17, 1 * SIZE(AO)
  944. LFD f18, 2 * SIZE(AO)
  945. LFD f19, 3 * SIZE(AO)
  946. LFD f20, 0 * SIZE(B)
  947. LFD f21, 1 * SIZE(B)
  948. LFD f22, 2 * SIZE(B)
  949. LFD f23, 3 * SIZE(B)
  950. LFD f24, 4 * SIZE(B)
  951. LFD f25, 5 * SIZE(B)
  952. LFD f26, 6 * SIZE(B)
  953. LFD f27, 7 * SIZE(B)
  954. srawi. r0, KK, 2
  955. mtspr CTR, r0
  956. mr BO, B
  957. #else
  958. #ifdef LN
  959. slwi r0, K, 1 + BASE_SHIFT
  960. sub AORIG, AORIG, r0
  961. #endif
  962. slwi r0, KK, 1 + BASE_SHIFT
  963. slwi TEMP, KK, 2 + BASE_SHIFT
  964. add AO, AORIG, r0
  965. add BO, B, TEMP
  966. sub TEMP, K, KK
  967. LFD f16, 0 * SIZE(AO)
  968. LFD f17, 1 * SIZE(AO)
  969. LFD f18, 2 * SIZE(AO)
  970. LFD f19, 3 * SIZE(AO)
  971. LFD f20, 0 * SIZE(BO)
  972. LFD f21, 1 * SIZE(BO)
  973. LFD f22, 2 * SIZE(BO)
  974. LFD f23, 3 * SIZE(BO)
  975. LFD f24, 4 * SIZE(BO)
  976. LFD f25, 5 * SIZE(BO)
  977. LFD f26, 6 * SIZE(BO)
  978. LFD f27, 7 * SIZE(BO)
  979. srawi. r0, TEMP, 2
  980. mtspr CTR, r0
  981. #endif
  982. ble LL(25)
  983. .align 5
  984. LL(22):
  985. FMADD f0, f16, f20, f0
  986. FMADD f1, f17, f20, f1
  987. FMADD f4, f16, f21, f4
  988. FMADD f5, f17, f21, f5
  989. FMADD f8, f16, f22, f8
  990. FMADD f9, f17, f22, f9
  991. FMADD f12, f16, f23, f12
  992. FMADD f13, f17, f23, f13
  993. LFD f20, 8 * SIZE(BO)
  994. LFD f21, 9 * SIZE(BO)
  995. LFD f22, 10 * SIZE(BO)
  996. LFD f23, 11 * SIZE(BO)
  997. FMADD f2, f18, f24, f2
  998. FMADD f3, f19, f24, f3
  999. FMADD f6, f18, f25, f6
  1000. FMADD f7, f19, f25, f7
  1001. FMADD f10, f18, f26, f10
  1002. FMADD f11, f19, f26, f11
  1003. FMADD f14, f18, f27, f14
  1004. FMADD f15, f19, f27, f15
  1005. LFD f16, 4 * SIZE(AO)
  1006. LFD f17, 5 * SIZE(AO)
  1007. LFD f18, 6 * SIZE(AO)
  1008. LFD f19, 7 * SIZE(AO)
  1009. FMADD f0, f16, f20, f0
  1010. FMADD f1, f17, f20, f1
  1011. FMADD f4, f16, f21, f4
  1012. FMADD f5, f17, f21, f5
  1013. LFD f24, 12 * SIZE(BO)
  1014. LFD f25, 13 * SIZE(BO)
  1015. LFD f26, 14 * SIZE(BO)
  1016. LFD f27, 15 * SIZE(BO)
  1017. FMADD f8, f16, f22, f8
  1018. FMADD f9, f17, f22, f9
  1019. FMADD f12, f16, f23, f12
  1020. FMADD f13, f17, f23, f13
  1021. LFD f20, 16 * SIZE(BO)
  1022. LFD f21, 17 * SIZE(BO)
  1023. LFD f22, 18 * SIZE(BO)
  1024. LFD f23, 19 * SIZE(BO)
  1025. FMADD f2, f18, f24, f2
  1026. FMADD f3, f19, f24, f3
  1027. FMADD f6, f18, f25, f6
  1028. FMADD f7, f19, f25, f7
  1029. FMADD f10, f18, f26, f10
  1030. FMADD f11, f19, f26, f11
  1031. FMADD f14, f18, f27, f14
  1032. FMADD f15, f19, f27, f15
  1033. LFD f16, 8 * SIZE(AO)
  1034. LFD f17, 9 * SIZE(AO)
  1035. LFD f18, 10 * SIZE(AO)
  1036. LFD f19, 11 * SIZE(AO)
  1037. LFD f24, 20 * SIZE(BO)
  1038. LFD f25, 21 * SIZE(BO)
  1039. LFD f26, 22 * SIZE(BO)
  1040. LFD f27, 23 * SIZE(BO)
  1041. addi AO, AO, 8 * SIZE
  1042. addi BO, BO, 16 * SIZE
  1043. DCBT(BO, PREB)
  1044. bdnz LL(22)
  1045. fadd f0, f2, f0
  1046. fadd f1, f3, f1
  1047. fadd f4, f6, f4
  1048. fadd f5, f7, f5
  1049. fadd f8, f10, f8
  1050. fadd f9, f11, f9
  1051. fadd f12, f14, f12
  1052. fadd f13, f15, f13
  1053. .align 4
  1054. LL(25):
  1055. #if defined(LT) || defined(RN)
  1056. andi. r0, KK, 3
  1057. #else
  1058. andi. r0, TEMP, 3
  1059. #endif
  1060. mtspr CTR, r0
  1061. ble+ LL(28)
  1062. .align 4
  1063. LL(26):
  1064. FMADD f0, f16, f20, f0
  1065. FMADD f1, f17, f20, f1
  1066. FMADD f4, f16, f21, f4
  1067. FMADD f5, f17, f21, f5
  1068. FMADD f8, f16, f22, f8
  1069. FMADD f9, f17, f22, f9
  1070. FMADD f12, f16, f23, f12
  1071. FMADD f13, f17, f23, f13
  1072. LFD f16, 2 * SIZE(AO)
  1073. LFD f17, 3 * SIZE(AO)
  1074. LFD f20, 4 * SIZE(BO)
  1075. LFD f21, 5 * SIZE(BO)
  1076. LFD f22, 6 * SIZE(BO)
  1077. LFD f23, 7 * SIZE(BO)
  1078. addi BO, BO, 4 * SIZE
  1079. addi AO, AO, 2 * SIZE
  1080. bdnz LL(26)
  1081. .align 4
  1082. LL(28):
  1083. #if defined(LN) || defined(RT)
  1084. #ifdef LN
  1085. subi r0, KK, 2
  1086. #else
  1087. subi r0, KK, 4
  1088. #endif
  1089. slwi TEMP, r0, 1 + BASE_SHIFT
  1090. slwi r0, r0, 2 + BASE_SHIFT
  1091. add AO, AORIG, TEMP
  1092. add BO, B, r0
  1093. #endif
  1094. #if defined(LN) || defined(LT)
  1095. LFD f16, 0 * SIZE(BO)
  1096. LFD f17, 1 * SIZE(BO)
  1097. LFD f18, 2 * SIZE(BO)
  1098. LFD f19, 3 * SIZE(BO)
  1099. LFD f20, 4 * SIZE(BO)
  1100. LFD f21, 5 * SIZE(BO)
  1101. LFD f22, 6 * SIZE(BO)
  1102. LFD f23, 7 * SIZE(BO)
  1103. FSUB f0, f16, f0
  1104. FSUB f4, f17, f4
  1105. FSUB f8, f18, f8
  1106. FSUB f12, f19, f12
  1107. FSUB f1, f20, f1
  1108. FSUB f5, f21, f5
  1109. FSUB f9, f22, f9
  1110. FSUB f13, f23, f13
  1111. #else
  1112. LFD f16, 0 * SIZE(AO)
  1113. LFD f17, 1 * SIZE(AO)
  1114. LFD f20, 2 * SIZE(AO)
  1115. LFD f21, 3 * SIZE(AO)
  1116. LFD f24, 4 * SIZE(AO)
  1117. LFD f25, 5 * SIZE(AO)
  1118. LFD f28, 6 * SIZE(AO)
  1119. LFD f29, 7 * SIZE(AO)
  1120. FSUB f0, f16, f0
  1121. FSUB f1, f17, f1
  1122. FSUB f4, f20, f4
  1123. FSUB f5, f21, f5
  1124. FSUB f8, f24, f8
  1125. FSUB f9, f25, f9
  1126. FSUB f12, f28, f12
  1127. FSUB f13, f29, f13
  1128. #endif
  1129. #ifdef LN
  1130. LFD f19, 3 * SIZE(AO)
  1131. LFD f20, 2 * SIZE(AO)
  1132. LFD f21, 0 * SIZE(AO)
  1133. FMUL f1, f19, f1
  1134. FMUL f5, f19, f5
  1135. FMUL f9, f19, f9
  1136. FMUL f13, f19, f13
  1137. FNMSUB f0, f20, f1, f0
  1138. FNMSUB f4, f20, f5, f4
  1139. FNMSUB f8, f20, f9, f8
  1140. FNMSUB f12, f20, f13, f12
  1141. FMUL f0, f21, f0
  1142. FMUL f4, f21, f4
  1143. FMUL f8, f21, f8
  1144. FMUL f12, f21, f12
  1145. #endif
  1146. #ifdef LT
  1147. LFD f16, 0 * SIZE(AO)
  1148. LFD f17, 1 * SIZE(AO)
  1149. FMUL f0, f16, f0
  1150. FMUL f4, f16, f4
  1151. FMUL f8, f16, f8
  1152. FMUL f12, f16, f12
  1153. FNMSUB f1, f17, f0, f1
  1154. FNMSUB f5, f17, f4, f5
  1155. FNMSUB f9, f17, f8, f9
  1156. FNMSUB f13, f17, f12, f13
  1157. LFD f17, 3 * SIZE(AO)
  1158. FMUL f1, f17, f1
  1159. FMUL f5, f17, f5
  1160. FMUL f9, f17, f9
  1161. FMUL f13, f17, f13
  1162. #endif
  1163. #ifdef RN
  1164. LFD f16, 0 * SIZE(BO)
  1165. LFD f17, 1 * SIZE(BO)
  1166. LFD f18, 2 * SIZE(BO)
  1167. LFD f19, 3 * SIZE(BO)
  1168. FMUL f0, f16, f0
  1169. FMUL f1, f16, f1
  1170. FNMSUB f4, f17, f0, f4
  1171. FNMSUB f5, f17, f1, f5
  1172. FNMSUB f8, f18, f0, f8
  1173. FNMSUB f9, f18, f1, f9
  1174. FNMSUB f12, f19, f0, f12
  1175. FNMSUB f13, f19, f1, f13
  1176. LFD f16, 5 * SIZE(BO)
  1177. LFD f17, 6 * SIZE(BO)
  1178. LFD f18, 7 * SIZE(BO)
  1179. LFD f19, 10 * SIZE(BO)
  1180. LFD f20, 11 * SIZE(BO)
  1181. LFD f21, 15 * SIZE(BO)
  1182. FMUL f4, f16, f4
  1183. FMUL f5, f16, f5
  1184. FNMSUB f8, f17, f4, f8
  1185. FNMSUB f9, f17, f5, f9
  1186. FNMSUB f12, f18, f4, f12
  1187. FNMSUB f13, f18, f5, f13
  1188. FMUL f8, f19, f8
  1189. FMUL f9, f19, f9
  1190. FNMSUB f12, f20, f8, f12
  1191. FNMSUB f13, f20, f9, f13
  1192. FMUL f12, f21, f12
  1193. FMUL f13, f21, f13
  1194. #endif
  1195. #ifdef RT
  1196. LFD f16, 15 * SIZE(BO)
  1197. LFD f17, 14 * SIZE(BO)
  1198. LFD f18, 13 * SIZE(BO)
  1199. LFD f19, 12 * SIZE(BO)
  1200. FMUL f12, f16, f12
  1201. FMUL f13, f16, f13
  1202. FNMSUB f8, f17, f12, f8
  1203. FNMSUB f9, f17, f13, f9
  1204. FNMSUB f4, f18, f12, f4
  1205. FNMSUB f5, f18, f13, f5
  1206. FNMSUB f0, f19, f12, f0
  1207. FNMSUB f1, f19, f13, f1
  1208. LFD f16, 10 * SIZE(BO)
  1209. LFD f17, 9 * SIZE(BO)
  1210. LFD f18, 8 * SIZE(BO)
  1211. LFD f19, 5 * SIZE(BO)
  1212. LFD f20, 4 * SIZE(BO)
  1213. LFD f21, 0 * SIZE(BO)
  1214. FMUL f8, f16, f8
  1215. FMUL f9, f16, f9
  1216. FNMSUB f4, f17, f8, f4
  1217. FNMSUB f5, f17, f9, f5
  1218. FNMSUB f0, f18, f8, f0
  1219. FNMSUB f1, f18, f9, f1
  1220. FMUL f4, f19, f4
  1221. FMUL f5, f19, f5
  1222. FNMSUB f0, f20, f4, f0
  1223. FNMSUB f1, f20, f5, f1
  1224. FMUL f0, f21, f0
  1225. FMUL f1, f21, f1
  1226. #endif
  1227. #ifdef LN
  1228. subi CO1, CO1, 2 * SIZE
  1229. subi CO2, CO2, 2 * SIZE
  1230. subi CO3, CO3, 2 * SIZE
  1231. subi CO4, CO4, 2 * SIZE
  1232. #endif
  1233. #if defined(LN) || defined(LT)
  1234. STFD f0, 0 * SIZE(BO)
  1235. STFD f4, 1 * SIZE(BO)
  1236. STFD f8, 2 * SIZE(BO)
  1237. STFD f12, 3 * SIZE(BO)
  1238. STFD f1, 4 * SIZE(BO)
  1239. STFD f5, 5 * SIZE(BO)
  1240. STFD f9, 6 * SIZE(BO)
  1241. STFD f13, 7 * SIZE(BO)
  1242. #else
  1243. STFD f0, 0 * SIZE(AO)
  1244. STFD f1, 1 * SIZE(AO)
  1245. STFD f4, 2 * SIZE(AO)
  1246. STFD f5, 3 * SIZE(AO)
  1247. STFD f8, 4 * SIZE(AO)
  1248. STFD f9, 5 * SIZE(AO)
  1249. STFD f12, 6 * SIZE(AO)
  1250. STFD f13, 7 * SIZE(AO)
  1251. #endif
  1252. STFD f0, 0 * SIZE(CO1)
  1253. STFD f1, 1 * SIZE(CO1)
  1254. STFD f4, 0 * SIZE(CO2)
  1255. STFD f5, 1 * SIZE(CO2)
  1256. STFD f8, 0 * SIZE(CO3)
  1257. STFD f9, 1 * SIZE(CO3)
  1258. STFD f12, 0 * SIZE(CO4)
  1259. STFD f13, 1 * SIZE(CO4)
  1260. lfs f0, FZERO
  1261. fmr f1, f0
  1262. fmr f2, f0
  1263. fmr f3, f0
  1264. fmr f4, f0
  1265. fmr f5, f0
  1266. fmr f6, f0
  1267. fmr f7, f0
  1268. fmr f8, f0
  1269. fmr f9, f0
  1270. fmr f10, f0
  1271. fmr f11, f0
  1272. fmr f12, f0
  1273. fmr f13, f0
  1274. fmr f14, f0
  1275. fmr f15, f0
  1276. #ifndef LN
  1277. addi CO1, CO1, 2 * SIZE
  1278. addi CO2, CO2, 2 * SIZE
  1279. addi CO3, CO3, 2 * SIZE
  1280. addi CO4, CO4, 2 * SIZE
  1281. #endif
  1282. #ifdef RT
  1283. slwi r0, K, 1 + BASE_SHIFT
  1284. add AORIG, AORIG, r0
  1285. #endif
  1286. #if defined(LT) || defined(RN)
  1287. sub TEMP, K, KK
  1288. slwi r0, TEMP, 1 + BASE_SHIFT
  1289. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1290. add AO, AO, r0
  1291. add BO, BO, TEMP
  1292. #endif
  1293. #ifdef LN
  1294. subi KK, KK, 2
  1295. #endif
  1296. #ifdef LT
  1297. addi KK, KK, 2
  1298. #endif
  /* LL(30): runs when bit 0 of M is set (otherwise branches to LL(39)). */
  /* Preloads four values from AO into f16-f19 and eight values from B/BO */
  /* into f20-f27, then sets CTR to the number of 4-step passes of LL(32). */
  1299. .align 4
  1300. LL(30):
  1301. andi. I, M, 1
  1302. ble LL(39)
  1303. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts at B and the pass count is KK >> 2. */
  1304. LFD f16, 0 * SIZE(AO)
  1305. LFD f17, 1 * SIZE(AO)
  1306. LFD f18, 2 * SIZE(AO)
  1307. LFD f19, 3 * SIZE(AO)
  1308. LFD f20, 0 * SIZE(B)
  1309. LFD f21, 1 * SIZE(B)
  1310. LFD f22, 2 * SIZE(B)
  1311. LFD f23, 3 * SIZE(B)
  1312. LFD f24, 4 * SIZE(B)
  1313. LFD f25, 5 * SIZE(B)
  1314. LFD f26, 6 * SIZE(B)
  1315. LFD f27, 7 * SIZE(B)
  1316. srawi. r0, KK, 2
  1317. mtspr CTR, r0
  1318. mr BO, B
  1319. #else
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << (2 + BASE_SHIFT)), */
  /* TEMP = K - KK, and the pass count is TEMP >> 2. */
  /* LN first moves AORIG back by K << BASE_SHIFT. */
  1320. #ifdef LN
  1321. slwi r0, K, BASE_SHIFT
  1322. sub AORIG, AORIG, r0
  1323. #endif
  1324. slwi r0, KK, 0 + BASE_SHIFT
  1325. slwi TEMP, KK, 2 + BASE_SHIFT
  1326. add AO, AORIG, r0
  1327. add BO, B, TEMP
  1328. sub TEMP, K, KK
  1329. LFD f16, 0 * SIZE(AO)
  1330. LFD f17, 1 * SIZE(AO)
  1331. LFD f18, 2 * SIZE(AO)
  1332. LFD f19, 3 * SIZE(AO)
  1333. LFD f20, 0 * SIZE(BO)
  1334. LFD f21, 1 * SIZE(BO)
  1335. LFD f22, 2 * SIZE(BO)
  1336. LFD f23, 3 * SIZE(BO)
  1337. LFD f24, 4 * SIZE(BO)
  1338. LFD f25, 5 * SIZE(BO)
  1339. LFD f26, 6 * SIZE(BO)
  1340. LFD f27, 7 * SIZE(BO)
  1341. srawi. r0, TEMP, 2
  1342. mtspr CTR, r0
  1343. #endif
  /* Uses the condition recorded by srawi. above: skip the unrolled loop */
  /* when the pass count is not positive. */
  1344. ble LL(35)
  /* LL(32): unrolled accumulation loop, four steps per pass, counted by CTR. */
  /* Steps using f16 and f18 accumulate into f0/f4/f8/f12; steps using f17 */
  /* and f19 accumulate into f1/f5/f9/f13. Loads for later steps are placed */
  /* between the multiply-add groups. Each pass advances AO by 4 * SIZE and */
  /* BO by 16 * SIZE. */
  /* (Assumes FMADD d,a,c,b computes a*c+b as PowerPC fmadd does; the macro */
  /* is defined outside this view.) */
  1345. .align 5
  1346. LL(32):
  1347. FMADD f0, f16, f20, f0
  1348. FMADD f4, f16, f21, f4
  1349. FMADD f8, f16, f22, f8
  1350. FMADD f12, f16, f23, f12
  1351. LFD f20, 8 * SIZE(BO)
  1352. LFD f21, 9 * SIZE(BO)
  1353. LFD f22, 10 * SIZE(BO)
  1354. LFD f23, 11 * SIZE(BO)
  1355. FMADD f1, f17, f24, f1
  1356. FMADD f5, f17, f25, f5
  1357. FMADD f9, f17, f26, f9
  1358. FMADD f13, f17, f27, f13
  1359. LFD f24, 12 * SIZE(BO)
  1360. LFD f25, 13 * SIZE(BO)
  1361. LFD f26, 14 * SIZE(BO)
  1362. LFD f27, 15 * SIZE(BO)
  1363. FMADD f0, f18, f20, f0
  1364. FMADD f4, f18, f21, f4
  1365. FMADD f8, f18, f22, f8
  1366. FMADD f12, f18, f23, f12
  1367. LFD f20, 16 * SIZE(BO)
  1368. LFD f21, 17 * SIZE(BO)
  1369. LFD f22, 18 * SIZE(BO)
  1370. LFD f23, 19 * SIZE(BO)
  1371. FMADD f1, f19, f24, f1
  1372. FMADD f5, f19, f25, f5
  1373. FMADD f9, f19, f26, f9
  1374. FMADD f13, f19, f27, f13
  /* Preload the operands for the next pass (offsets are relative to the */
  /* not-yet-advanced AO/BO). */
  1375. LFD f16, 4 * SIZE(AO)
  1376. LFD f17, 5 * SIZE(AO)
  1377. LFD f18, 6 * SIZE(AO)
  1378. LFD f19, 7 * SIZE(AO)
  1379. LFD f24, 20 * SIZE(BO)
  1380. LFD f25, 21 * SIZE(BO)
  1381. LFD f26, 22 * SIZE(BO)
  1382. LFD f27, 23 * SIZE(BO)
  1383. addi AO, AO, 4 * SIZE
  1384. addi BO, BO, 16 * SIZE
  /* DCBT is a macro defined outside this view; presumably a cache-touch */
  /* hint for BO + PREB - confirm. */
  1385. DCBT(BO, PREB)
  1386. bdnz LL(32)
  /* Fold the second set of partial sums into f0/f4/f8/f12. This is only */
  /* reached after the loop has run; the skip branch before the loop */
  /* targets LL(35), which lies after these four adds. */
  1387. fadd f0, f1, f0
  1388. fadd f4, f5, f4
  1389. fadd f8, f9, f8
  1390. fadd f12, f13, f12
  /* LL(35): set CTR to the leftover step count (low two bits of KK for */
  /* LT/RN, of TEMP otherwise) and branch to LL(38) when it is zero. */
  1391. .align 4
  1392. LL(35):
  1393. #if defined(LT) || defined(RN)
  1394. andi. r0, KK, 3
  1395. #else
  1396. andi. r0, TEMP, 3
  1397. #endif
  1398. mtspr CTR, r0
  1399. ble+ LL(38)
  /* LL(36): remainder loop, one step per pass. Accumulates f16 times */
  /* f20..f23 into f0/f4/f8/f12, loads the next operands, and advances */
  /* AO by 1 * SIZE and BO by 4 * SIZE. */
  1400. .align 4
  1401. LL(36):
  1402. FMADD f0, f16, f20, f0
  1403. FMADD f4, f16, f21, f4
  1404. FMADD f8, f16, f22, f8
  1405. FMADD f12, f16, f23, f12
  1406. LFD f16, 1 * SIZE(AO)
  1407. LFD f20, 4 * SIZE(BO)
  1408. LFD f21, 5 * SIZE(BO)
  1409. LFD f22, 6 * SIZE(BO)
  1410. LFD f23, 7 * SIZE(BO)
  1411. addi BO, BO, 4 * SIZE
  1412. addi AO, AO, 1 * SIZE
  1413. bdnz LL(36)
  /* LL(38): finish the four accumulated values f0/f4/f8/f12. */
  /* Sequence: (1) LN/RT repoint AO/BO, (2) replace each accumulator with */
  /* (stored value - accumulator), (3) apply the variant-specific */
  /* substitution using coefficients from AO (LN/LT) or BO (RN/RT), */
  /* (4) write the results back through BO or AO and to CO1..CO4, */
  /* (5) clear the accumulators and advance pointers and KK. */
  /* NOTE(review): coefficients on the diagonal positions are applied with */
  /* FMUL, which suggests they are stored pre-inverted - confirm against */
  /* the packing routine. */
  1414. .align 4
  1415. LL(38):
  1416. #if defined(LN) || defined(RT)
  /* r0 = KK - 1 (LN) or KK - 4 (RT); AO = AORIG + (r0 << BASE_SHIFT), */
  /* BO = B + (r0 << (2 + BASE_SHIFT)). */
  1417. #ifdef LN
  1418. subi r0, KK, 1
  1419. #else
  1420. subi r0, KK, 4
  1421. #endif
  1422. slwi TEMP, r0, 0 + BASE_SHIFT
  1423. slwi r0, r0, 2 + BASE_SHIFT
  1424. add AO, AORIG, TEMP
  1425. add BO, B, r0
  1426. #endif
  /* Assumes FSUB d,a,b computes a - b as PowerPC fsub does (macro defined */
  /* outside this view): accumulator := loaded value - accumulator. */
  1427. #if defined(LN) || defined(LT)
  1428. LFD f16, 0 * SIZE(BO)
  1429. LFD f17, 1 * SIZE(BO)
  1430. LFD f18, 2 * SIZE(BO)
  1431. LFD f19, 3 * SIZE(BO)
  1432. FSUB f0, f16, f0
  1433. FSUB f4, f17, f4
  1434. FSUB f8, f18, f8
  1435. FSUB f12, f19, f12
  1436. #else
  1437. LFD f16, 0 * SIZE(AO)
  1438. LFD f20, 1 * SIZE(AO)
  1439. LFD f24, 2 * SIZE(AO)
  1440. LFD f28, 3 * SIZE(AO)
  1441. FSUB f0, f16, f0
  1442. FSUB f4, f20, f4
  1443. FSUB f8, f24, f8
  1444. FSUB f12, f28, f12
  1445. #endif
  /* LN and LT: scale all four values by the single coefficient at AO[0]. */
  1446. #ifdef LN
  1447. LFD f21, 0 * SIZE(AO)
  1448. FMUL f0, f21, f0
  1449. FMUL f4, f21, f4
  1450. FMUL f8, f21, f8
  1451. FMUL f12, f21, f12
  1452. #endif
  1453. #ifdef LT
  1454. LFD f16, 0 * SIZE(AO)
  1455. FMUL f0, f16, f0
  1456. FMUL f4, f16, f4
  1457. FMUL f8, f16, f8
  1458. FMUL f12, f16, f12
  1459. #endif
  /* RN: forward sweep f0 -> f4 -> f8 -> f12 using BO offsets 0-3, 5-7, */
  /* 10-11 and 15. Each finished value is scaled, then its multiples are */
  /* removed from the remaining ones (assuming FNMSUB d,a,c,b computes */
  /* b - a*c as PowerPC fnmsub does). */
  1460. #ifdef RN
  1461. LFD f16, 0 * SIZE(BO)
  1462. LFD f17, 1 * SIZE(BO)
  1463. LFD f18, 2 * SIZE(BO)
  1464. LFD f19, 3 * SIZE(BO)
  1465. FMUL f0, f16, f0
  1466. FNMSUB f4, f17, f0, f4
  1467. FNMSUB f8, f18, f0, f8
  1468. FNMSUB f12, f19, f0, f12
  1469. LFD f16, 5 * SIZE(BO)
  1470. LFD f17, 6 * SIZE(BO)
  1471. LFD f18, 7 * SIZE(BO)
  1472. LFD f19, 10 * SIZE(BO)
  1473. LFD f20, 11 * SIZE(BO)
  1474. LFD f21, 15 * SIZE(BO)
  1475. FMUL f4, f16, f4
  1476. FNMSUB f8, f17, f4, f8
  1477. FNMSUB f12, f18, f4, f12
  1478. FMUL f8, f19, f8
  1479. FNMSUB f12, f20, f8, f12
  1480. FMUL f12, f21, f12
  1481. #endif
  /* RT: the same sweep in the opposite order, f12 -> f8 -> f4 -> f0, */
  /* using BO offsets 15-12, 10-8, 5-4 and 0. */
  1482. #ifdef RT
  1483. LFD f16, 15 * SIZE(BO)
  1484. LFD f17, 14 * SIZE(BO)
  1485. LFD f18, 13 * SIZE(BO)
  1486. LFD f19, 12 * SIZE(BO)
  1487. FMUL f12, f16, f12
  1488. FNMSUB f8, f17, f12, f8
  1489. FNMSUB f4, f18, f12, f4
  1490. FNMSUB f0, f19, f12, f0
  1491. LFD f16, 10 * SIZE(BO)
  1492. LFD f17, 9 * SIZE(BO)
  1493. LFD f18, 8 * SIZE(BO)
  1494. LFD f19, 5 * SIZE(BO)
  1495. FMUL f8, f16, f8
  1496. LFD f20, 4 * SIZE(BO)
  1497. LFD f21, 0 * SIZE(BO)
  1498. FNMSUB f4, f17, f8, f4
  1499. FNMSUB f0, f18, f8, f0
  1500. FMUL f4, f19, f4
  1501. FNMSUB f0, f20, f4, f0
  1502. FMUL f0, f21, f0
  1503. #endif
  /* LN steps the four C pointers back by one element before storing; */
  /* the other variants advance them after storing (see #ifndef LN below). */
  1504. #ifdef LN
  1505. subi CO1, CO1, 1 * SIZE
  1506. subi CO2, CO2, 1 * SIZE
  1507. subi CO3, CO3, 1 * SIZE
  1508. subi CO4, CO4, 1 * SIZE
  1509. #endif
  /* Write the results back over the values that were loaded above. */
  1510. #if defined(LN) || defined(LT)
  1511. STFD f0, 0 * SIZE(BO)
  1512. STFD f4, 1 * SIZE(BO)
  1513. STFD f8, 2 * SIZE(BO)
  1514. STFD f12, 3 * SIZE(BO)
  1515. #else
  1516. STFD f0, 0 * SIZE(AO)
  1517. STFD f4, 1 * SIZE(AO)
  1518. STFD f8, 2 * SIZE(AO)
  1519. STFD f12, 3 * SIZE(AO)
  1520. #endif
  /* One result goes to each of the four C pointers. */
  1521. STFD f0, 0 * SIZE(CO1)
  1522. STFD f4, 0 * SIZE(CO2)
  1523. STFD f8, 0 * SIZE(CO3)
  1524. STFD f12, 0 * SIZE(CO4)
  /* Reload f0 from FZERO and copy it into the other accumulators used */
  /* by LL(32)/LL(36). */
  1525. lfs f0, FZERO
  1526. fmr f1, f0
  1527. fmr f4, f0
  1528. fmr f5, f0
  1529. fmr f8, f0
  1530. fmr f9, f0
  1531. fmr f12, f0
  1532. fmr f13, f0
  1533. #ifndef LN
  1534. addi CO1, CO1, 1 * SIZE
  1535. addi CO2, CO2, 1 * SIZE
  1536. addi CO3, CO3, 1 * SIZE
  1537. addi CO4, CO4, 1 * SIZE
  1538. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  1539. #ifdef RT
  1540. slwi r0, K, 0 + BASE_SHIFT
  1541. add AORIG, AORIG, r0
  1542. #endif
  /* LT/RN: advance AO by (K - KK) << BASE_SHIFT and BO by */
  /* (K - KK) << (2 + BASE_SHIFT). */
  1543. #if defined(LT) || defined(RN)
  1544. sub TEMP, K, KK
  1545. slwi r0, TEMP, 0 + BASE_SHIFT
  1546. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1547. add AO, AO, r0
  1548. add BO, BO, TEMP
  1549. #endif
  /* KK moves by 1: down for LN, up for LT. */
  1550. #ifdef LN
  1551. subi KK, KK, 1
  1552. #endif
  1553. #ifdef LT
  1554. addi KK, KK, 1
  1555. #endif
  /* LL(39): end of one J iteration. Updates B (LN: B += K << (2 + BASE_SHIFT); */
  /* LT/RN: B = BO), moves KK by 4 for RN (up) or RT (down), decrements J, */
  /* reloads f0 from FZERO, and loops back to LL(10) while J is positive. */
  1556. .align 4
  1557. LL(39):
  1558. #ifdef LN
  1559. slwi r0, K, 2 + BASE_SHIFT
  1560. add B, B, r0
  1561. #endif
  1562. #if defined(LT) || defined(RN)
  1563. mr B, BO
  1564. #endif
  1565. #ifdef RN
  1566. addi KK, KK, 4
  1567. #endif
  1568. #ifdef RT
  1569. subi KK, KK, 4
  1570. #endif
  /* The branch below tests the condition recorded by addic.; the lfs in */
  /* between is a plain load. */
  1571. addic. J, J, -1
  1572. lfs f0, FZERO
  1573. bgt LL(10)
  /* LL(40): runs when bit 1 of N is set (otherwise branches to LL(70)). */
  /* Sets CO1 = C and CO2 = C + LDC, initialises KK for LN/LT, copies f0 */
  /* into f1-f7, and computes I = M >> 2 as the trip count for LL(41). */
  /* f0 is expected to hold zero on entry: it is loaded from FZERO just */
  /* before the loop-back branch in LL(39); any other entry path is outside */
  /* this view. */
  1574. .align 4
  1575. LL(40):
  1576. andi. J, N, 2
  1577. ble LL(70)
  /* RT: move B back by K << (1 + BASE_SHIFT) and C back by 2 * LDC first. */
  1578. #ifdef RT
  1579. slwi r0, K, 1 + BASE_SHIFT
  1580. sub B, B, r0
  1581. slwi r0, LDC, 1
  1582. sub C, C, r0
  1583. #endif
  1584. mr CO1, C
  1585. add CO2, C, LDC
  /* KK starts at M + OFFSET for LN and at OFFSET for LT. */
  1586. #ifdef LN
  1587. add KK, M, OFFSET
  1588. #endif
  1589. #ifdef LT
  1590. mr KK, OFFSET
  1591. #endif
  1592. fmr f1, f0
  1593. fmr f2, f0
  1594. fmr f3, f0
  1595. fmr f4, f0
  1596. fmr f5, f0
  1597. fmr f6, f0
  1598. fmr f7, f0
  /* The condition recorded here is consumed by "ble LL(50)" below. */
  1599. srawi. I, M, 2
  1600. #if defined(LN) || defined(RT)
  1601. mr AORIG, A
  1602. #else
  1603. mr AO, A
  1604. #endif
  /* Non-RT variants set C = CO2 + LDC. */
  1605. #ifndef RT
  1606. add C, CO2, LDC
  1607. #endif
  1608. ble LL(50)
  /* LL(41): top of the loop over I (I = M >> 2, decremented at the end of */
  /* LL(48)). Preloads f16-f19 from AO and f20-f23 from B/BO, issues dcbt */
  /* on CO1 and CO2 with PREC, and sets CTR to the number of 4-step passes */
  /* of LL(42). */
  1609. .align 4
  1610. LL(41):
  1611. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts at B and the pass count is KK >> 2. */
  1612. LFD f16, 0 * SIZE(AO)
  1613. LFD f17, 1 * SIZE(AO)
  1614. LFD f18, 2 * SIZE(AO)
  1615. LFD f19, 3 * SIZE(AO)
  1616. LFD f20, 0 * SIZE(B)
  1617. LFD f21, 1 * SIZE(B)
  1618. LFD f22, 2 * SIZE(B)
  1619. LFD f23, 3 * SIZE(B)
  1620. dcbt CO1, PREC
  1621. dcbt CO2, PREC
  1622. srawi. r0, KK, 2
  1623. mtspr CTR, r0
  1624. mr BO, B
  1625. #else
  /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), */
  /* BO = B + (KK << (1 + BASE_SHIFT)), TEMP = K - KK, pass count TEMP >> 2. */
  /* LN first moves AORIG back by K << (2 + BASE_SHIFT). */
  1626. #ifdef LN
  1627. slwi r0, K, 2 + BASE_SHIFT
  1628. sub AORIG, AORIG, r0
  1629. #endif
  1630. slwi r0, KK, 2 + BASE_SHIFT
  1631. slwi TEMP, KK, 1 + BASE_SHIFT
  1632. add AO, AORIG, r0
  1633. add BO, B, TEMP
  1634. sub TEMP, K, KK
  1635. LFD f16, 0 * SIZE(AO)
  1636. LFD f17, 1 * SIZE(AO)
  1637. LFD f18, 2 * SIZE(AO)
  1638. LFD f19, 3 * SIZE(AO)
  1639. LFD f20, 0 * SIZE(BO)
  1640. LFD f21, 1 * SIZE(BO)
  1641. LFD f22, 2 * SIZE(BO)
  1642. LFD f23, 3 * SIZE(BO)
  1643. dcbt CO1, PREC
  1644. dcbt CO2, PREC
  1645. srawi. r0, TEMP, 2
  1646. mtspr CTR, r0
  1647. #endif
  /* Uses the condition recorded by srawi. above: skip the unrolled loop */
  /* when the pass count is not positive. */
  1648. ble LL(45)
  /* LL(42): unrolled accumulation loop, four steps per pass, counted by CTR. */
  /* In every step f16-f19 (four AO values) are multiplied by two BO values: */
  /* the products with the first go into f0-f3, with the second into f4-f7. */
  /* Each pass advances AO by 16 * SIZE and BO by 8 * SIZE. */
  /* (Assumes FMADD d,a,c,b computes a*c+b as PowerPC fmadd does.) */
  1649. .align 5
  1650. LL(42):
  1651. FMADD f0, f16, f20, f0
  1652. FMADD f1, f17, f20, f1
  1653. FMADD f2, f18, f20, f2
  1654. FMADD f3, f19, f20, f3
  1655. FMADD f4, f16, f21, f4
  1656. FMADD f5, f17, f21, f5
  1657. FMADD f6, f18, f21, f6
  1658. FMADD f7, f19, f21, f7
  1659. LFD f16, 4 * SIZE(AO)
  1660. LFD f17, 5 * SIZE(AO)
  1661. LFD f18, 6 * SIZE(AO)
  1662. LFD f19, 7 * SIZE(AO)
  1663. FMADD f0, f16, f22, f0
  1664. FMADD f1, f17, f22, f1
  1665. FMADD f2, f18, f22, f2
  1666. FMADD f3, f19, f22, f3
  1667. FMADD f4, f16, f23, f4
  1668. FMADD f5, f17, f23, f5
  1669. FMADD f6, f18, f23, f6
  1670. FMADD f7, f19, f23, f7
  1671. LFD f16, 8 * SIZE(AO)
  1672. LFD f17, 9 * SIZE(AO)
  1673. LFD f18, 10 * SIZE(AO)
  1674. LFD f19, 11 * SIZE(AO)
  1675. LFD f20, 4 * SIZE(BO)
  1676. LFD f21, 5 * SIZE(BO)
  1677. LFD f22, 6 * SIZE(BO)
  1678. LFD f23, 7 * SIZE(BO)
  1679. FMADD f0, f16, f20, f0
  1680. FMADD f1, f17, f20, f1
  1681. FMADD f2, f18, f20, f2
  1682. FMADD f3, f19, f20, f3
  1683. FMADD f4, f16, f21, f4
  1684. FMADD f5, f17, f21, f5
  1685. FMADD f6, f18, f21, f6
  1686. FMADD f7, f19, f21, f7
  1687. LFD f16, 12 * SIZE(AO)
  1688. LFD f17, 13 * SIZE(AO)
  1689. LFD f18, 14 * SIZE(AO)
  1690. LFD f19, 15 * SIZE(AO)
  1691. FMADD f0, f16, f22, f0
  1692. FMADD f1, f17, f22, f1
  1693. FMADD f2, f18, f22, f2
  1694. FMADD f3, f19, f22, f3
  1695. FMADD f4, f16, f23, f4
  1696. FMADD f5, f17, f23, f5
  1697. FMADD f6, f18, f23, f6
  1698. FMADD f7, f19, f23, f7
  /* Preload the operands for the next pass (offsets are relative to the */
  /* not-yet-advanced AO/BO). */
  1699. LFD f16, 16 * SIZE(AO)
  1700. LFD f17, 17 * SIZE(AO)
  1701. LFD f18, 18 * SIZE(AO)
  1702. LFD f19, 19 * SIZE(AO)
  1703. LFD f20, 8 * SIZE(BO)
  1704. LFD f21, 9 * SIZE(BO)
  1705. LFD f22, 10 * SIZE(BO)
  1706. LFD f23, 11 * SIZE(BO)
  1707. addi AO, AO, 16 * SIZE
  1708. addi BO, BO, 8 * SIZE
  /* DCBT is a macro defined outside this view; presumably a cache-touch */
  /* hint for BO + PREB - confirm. */
  1709. DCBT(BO, PREB)
  1710. bdnz LL(42)
  /* LL(45): set CTR to the leftover step count (low two bits of KK for */
  /* LT/RN, of TEMP otherwise) and branch to LL(48) when it is zero. */
  1711. .align 4
  1712. LL(45):
  1713. #if defined(LT) || defined(RN)
  1714. andi. r0, KK, 3
  1715. #else
  1716. andi. r0, TEMP, 3
  1717. #endif
  1718. mtspr CTR, r0
  1719. ble+ LL(48)
  /* LL(46): remainder loop, one step per pass. Accumulates f16-f19 times */
  /* f20 into f0-f3 and times f21 into f4-f7, loads the next operands, and */
  /* advances AO by 4 * SIZE and BO by 2 * SIZE. */
  1720. .align 4
  1721. LL(46):
  1722. FMADD f0, f16, f20, f0
  1723. FMADD f1, f17, f20, f1
  1724. FMADD f2, f18, f20, f2
  1725. FMADD f3, f19, f20, f3
  1726. FMADD f4, f16, f21, f4
  1727. FMADD f5, f17, f21, f5
  1728. FMADD f6, f18, f21, f6
  1729. FMADD f7, f19, f21, f7
  1730. LFD f16, 4 * SIZE(AO)
  1731. LFD f17, 5 * SIZE(AO)
  1732. LFD f18, 6 * SIZE(AO)
  1733. LFD f19, 7 * SIZE(AO)
  1734. LFD f20, 2 * SIZE(BO)
  1735. LFD f21, 3 * SIZE(BO)
  1736. addi BO, BO, 2 * SIZE
  1737. addi AO, AO, 4 * SIZE
  1738. bdnz LL(46)
  /* LL(48): finish the eight accumulated values f0-f7 (f0-f3 are stored to */
  /* CO1, f4-f7 to CO2). */
  /* Sequence: (1) LN/RT repoint AO/BO, (2) replace each accumulator with */
  /* (stored value - accumulator), (3) apply the variant-specific */
  /* substitution using coefficients from AO (LN/LT) or BO (RN/RT), */
  /* (4) write the results back through BO or AO and to CO1/CO2, */
  /* (5) clear the accumulators, advance pointers and KK, and loop to LL(41) */
  /* while I is positive. */
  /* NOTE(review): coefficients on the diagonal positions are applied with */
  /* FMUL, which suggests they are stored pre-inverted - confirm against */
  /* the packing routine. */
  1739. .align 4
  1740. LL(48):
  1741. #if defined(LN) || defined(RT)
  /* r0 = KK - 4 (LN) or KK - 2 (RT); AO = AORIG + (r0 << (2 + BASE_SHIFT)), */
  /* BO = B + (r0 << (1 + BASE_SHIFT)). */
  1742. #ifdef LN
  1743. subi r0, KK, 4
  1744. #else
  1745. subi r0, KK, 2
  1746. #endif
  1747. slwi TEMP, r0, 2 + BASE_SHIFT
  1748. slwi r0, r0, 1 + BASE_SHIFT
  1749. add AO, AORIG, TEMP
  1750. add BO, B, r0
  1751. #endif
  /* Assumes FSUB d,a,b computes a - b as PowerPC fsub does (macro defined */
  /* outside this view). The BO layout interleaves the two groups */
  /* (f0,f4,f1,f5,...); the AO layout keeps them contiguous (f0-f3, f4-f7). */
  1752. #if defined(LN) || defined(LT)
  1753. LFD f16, 0 * SIZE(BO)
  1754. LFD f17, 1 * SIZE(BO)
  1755. LFD f20, 2 * SIZE(BO)
  1756. LFD f21, 3 * SIZE(BO)
  1757. LFD f24, 4 * SIZE(BO)
  1758. LFD f25, 5 * SIZE(BO)
  1759. LFD f28, 6 * SIZE(BO)
  1760. LFD f29, 7 * SIZE(BO)
  1761. FSUB f0, f16, f0
  1762. FSUB f4, f17, f4
  1763. FSUB f1, f20, f1
  1764. FSUB f5, f21, f5
  1765. FSUB f2, f24, f2
  1766. FSUB f6, f25, f6
  1767. FSUB f3, f28, f3
  1768. FSUB f7, f29, f7
  1769. #else
  1770. LFD f16, 0 * SIZE(AO)
  1771. LFD f17, 1 * SIZE(AO)
  1772. LFD f18, 2 * SIZE(AO)
  1773. LFD f19, 3 * SIZE(AO)
  1774. LFD f20, 4 * SIZE(AO)
  1775. LFD f21, 5 * SIZE(AO)
  1776. LFD f22, 6 * SIZE(AO)
  1777. LFD f23, 7 * SIZE(AO)
  1778. FSUB f0, f16, f0
  1779. FSUB f1, f17, f1
  1780. FSUB f2, f18, f2
  1781. FSUB f3, f19, f3
  1782. FSUB f4, f20, f4
  1783. FSUB f5, f21, f5
  1784. FSUB f6, f22, f6
  1785. FSUB f7, f23, f7
  1786. #endif
  /* LN: sweep from the last value to the first (f3/f7 -> f2/f6 -> f1/f5 -> */
  /* f0/f4) using AO offsets 15-12, 10-8, 5-4 and 0. Each finished pair is */
  /* scaled, then its multiples are removed from the remaining pairs */
  /* (assuming FNMSUB d,a,c,b computes b - a*c as PowerPC fnmsub does). */
  1787. #ifdef LN
  1788. LFD f16, 15 * SIZE(AO)
  1789. LFD f17, 14 * SIZE(AO)
  1790. LFD f18, 13 * SIZE(AO)
  1791. LFD f19, 12 * SIZE(AO)
  1792. FMUL f3, f16, f3
  1793. FMUL f7, f16, f7
  1794. FNMSUB f2, f17, f3, f2
  1795. FNMSUB f6, f17, f7, f6
  1796. FNMSUB f1, f18, f3, f1
  1797. FNMSUB f5, f18, f7, f5
  1798. FNMSUB f0, f19, f3, f0
  1799. FNMSUB f4, f19, f7, f4
  1800. LFD f16, 10 * SIZE(AO)
  1801. LFD f17, 9 * SIZE(AO)
  1802. LFD f18, 8 * SIZE(AO)
  1803. LFD f19, 5 * SIZE(AO)
  1804. LFD f20, 4 * SIZE(AO)
  1805. LFD f21, 0 * SIZE(AO)
  1806. FMUL f2, f16, f2
  1807. FMUL f6, f16, f6
  1808. FNMSUB f1, f17, f2, f1
  1809. FNMSUB f5, f17, f6, f5
  1810. FNMSUB f0, f18, f2, f0
  1811. FNMSUB f4, f18, f6, f4
  1812. FMUL f1, f19, f1
  1813. FMUL f5, f19, f5
  1814. FNMSUB f0, f20, f1, f0
  1815. FNMSUB f4, f20, f5, f4
  1816. FMUL f0, f21, f0
  1817. FMUL f4, f21, f4
  1818. #endif
  /* LT: sweep from the first value to the last (f0/f4 -> f1/f5 -> f2/f6 -> */
  /* f3/f7) using AO offsets 0-3, 5-7, 10-11 and 15. */
  1819. #ifdef LT
  1820. LFD f16, 0 * SIZE(AO)
  1821. LFD f17, 1 * SIZE(AO)
  1822. LFD f18, 2 * SIZE(AO)
  1823. LFD f19, 3 * SIZE(AO)
  1824. FMUL f0, f16, f0
  1825. FMUL f4, f16, f4
  1826. FNMSUB f1, f17, f0, f1
  1827. FNMSUB f5, f17, f4, f5
  1828. FNMSUB f2, f18, f0, f2
  1829. FNMSUB f6, f18, f4, f6
  1830. FNMSUB f3, f19, f0, f3
  1831. FNMSUB f7, f19, f4, f7
  1832. LFD f17, 5 * SIZE(AO)
  1833. LFD f18, 6 * SIZE(AO)
  1834. LFD f19, 7 * SIZE(AO)
  1835. FMUL f1, f17, f1
  1836. FMUL f5, f17, f5
  1837. FNMSUB f2, f18, f1, f2
  1838. FNMSUB f6, f18, f5, f6
  1839. FNMSUB f3, f19, f1, f3
  1840. FNMSUB f7, f19, f5, f7
  1841. LFD f18, 10 * SIZE(AO)
  1842. LFD f19, 11 * SIZE(AO)
  1843. FMUL f2, f18, f2
  1844. FMUL f6, f18, f6
  1845. FNMSUB f3, f19, f2, f3
  1846. FNMSUB f7, f19, f6, f7
  1847. LFD f19, 15 * SIZE(AO)
  1848. FMUL f3, f19, f3
  1849. FMUL f7, f19, f7
  1850. #endif
  /* RN: finish f0-f3 first (scale by BO[0]), remove BO[1] times them from */
  /* f4-f7, then scale f4-f7 by BO[3]. */
  1851. #ifdef RN
  1852. LFD f16, 0 * SIZE(BO)
  1853. LFD f17, 1 * SIZE(BO)
  1854. LFD f18, 3 * SIZE(BO)
  1855. FMUL f0, f16, f0
  1856. FMUL f1, f16, f1
  1857. FMUL f2, f16, f2
  1858. FMUL f3, f16, f3
  1859. FNMSUB f4, f17, f0, f4
  1860. FNMSUB f5, f17, f1, f5
  1861. FNMSUB f6, f17, f2, f6
  1862. FNMSUB f7, f17, f3, f7
  1863. FMUL f4, f18, f4
  1864. FMUL f5, f18, f5
  1865. FMUL f6, f18, f6
  1866. FMUL f7, f18, f7
  1867. #endif
  /* RT: finish f4-f7 first (scale by BO[3]), remove BO[2] times them from */
  /* f0-f3, then scale f0-f3 by BO[0]. */
  1868. #ifdef RT
  1869. LFD f19, 3 * SIZE(BO)
  1870. LFD f20, 2 * SIZE(BO)
  1871. LFD f21, 0 * SIZE(BO)
  1872. FMUL f4, f19, f4
  1873. FMUL f5, f19, f5
  1874. FMUL f6, f19, f6
  1875. FMUL f7, f19, f7
  1876. FNMSUB f0, f20, f4, f0
  1877. FNMSUB f1, f20, f5, f1
  1878. FNMSUB f2, f20, f6, f2
  1879. FNMSUB f3, f20, f7, f3
  1880. FMUL f0, f21, f0
  1881. FMUL f1, f21, f1
  1882. FMUL f2, f21, f2
  1883. FMUL f3, f21, f3
  1884. #endif
  /* LN steps the C pointers back by four elements before storing; the */
  /* other variants advance them after storing (see #ifndef LN below). */
  1885. #ifdef LN
  1886. subi CO1, CO1, 4 * SIZE
  1887. subi CO2, CO2, 4 * SIZE
  1888. #endif
  /* Write the results back in the same layout they were loaded from. */
  1889. #if defined(LN) || defined(LT)
  1890. STFD f0, 0 * SIZE(BO)
  1891. STFD f4, 1 * SIZE(BO)
  1892. STFD f1, 2 * SIZE(BO)
  1893. STFD f5, 3 * SIZE(BO)
  1894. STFD f2, 4 * SIZE(BO)
  1895. STFD f6, 5 * SIZE(BO)
  1896. STFD f3, 6 * SIZE(BO)
  1897. STFD f7, 7 * SIZE(BO)
  1898. #else
  1899. STFD f0, 0 * SIZE(AO)
  1900. STFD f1, 1 * SIZE(AO)
  1901. STFD f2, 2 * SIZE(AO)
  1902. STFD f3, 3 * SIZE(AO)
  1903. STFD f4, 4 * SIZE(AO)
  1904. STFD f5, 5 * SIZE(AO)
  1905. STFD f6, 6 * SIZE(AO)
  1906. STFD f7, 7 * SIZE(AO)
  1907. #endif
  1908. STFD f0, 0 * SIZE(CO1)
  1909. STFD f1, 1 * SIZE(CO1)
  1910. STFD f2, 2 * SIZE(CO1)
  1911. STFD f3, 3 * SIZE(CO1)
  1912. STFD f4, 0 * SIZE(CO2)
  1913. STFD f5, 1 * SIZE(CO2)
  1914. STFD f6, 2 * SIZE(CO2)
  1915. STFD f7, 3 * SIZE(CO2)
  /* Reload f0 from FZERO and copy it into f1-f7 for the next pass. */
  1916. lfs f0, FZERO
  1917. fmr f1, f0
  1918. fmr f2, f0
  1919. fmr f3, f0
  1920. fmr f4, f0
  1921. fmr f5, f0
  1922. fmr f6, f0
  1923. fmr f7, f0
  1924. #ifndef LN
  1925. addi CO1, CO1, 4 * SIZE
  1926. addi CO2, CO2, 4 * SIZE
  1927. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  1928. #ifdef RT
  1929. slwi r0, K, 2 + BASE_SHIFT
  1930. add AORIG, AORIG, r0
  1931. #endif
  /* LT/RN: advance AO by (K - KK) << (2 + BASE_SHIFT) and BO by */
  /* (K - KK) << (1 + BASE_SHIFT). */
  1932. #if defined(LT) || defined(RN)
  1933. sub TEMP, K, KK
  1934. slwi r0, TEMP, 2 + BASE_SHIFT
  1935. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1936. add AO, AO, r0
  1937. add BO, BO, TEMP
  1938. #endif
  /* KK moves by 4: down for LN, up for LT. */
  1939. #ifdef LN
  1940. subi KK, KK, 4
  1941. #endif
  1942. #ifdef LT
  1943. addi KK, KK, 4
  1944. #endif
  1945. addic. I, I, -1
  1946. bgt+ LL(41)
  /* LL(50): runs when bit 1 of M is set (otherwise branches to LL(60)). */
  /* Preloads four values from AO into f16-f19 and eight values from B/BO */
  /* into f20-f27, then sets CTR to the number of 4-step passes of LL(52). */
  1947. .align 4
  1948. LL(50):
  1949. andi. I, M, 2
  1950. ble LL(60)
  1951. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts at B and the pass count is KK >> 2. */
  1952. LFD f16, 0 * SIZE(AO)
  1953. LFD f17, 1 * SIZE(AO)
  1954. LFD f18, 2 * SIZE(AO)
  1955. LFD f19, 3 * SIZE(AO)
  1956. LFD f20, 0 * SIZE(B)
  1957. LFD f21, 1 * SIZE(B)
  1958. LFD f22, 2 * SIZE(B)
  1959. LFD f23, 3 * SIZE(B)
  1960. LFD f24, 4 * SIZE(B)
  1961. LFD f25, 5 * SIZE(B)
  1962. LFD f26, 6 * SIZE(B)
  1963. LFD f27, 7 * SIZE(B)
  1964. srawi. r0, KK, 2
  1965. mtspr CTR, r0
  1966. mr BO, B
  1967. #else
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), */
  /* BO = B + (KK << (1 + BASE_SHIFT)), TEMP = K - KK, pass count TEMP >> 2. */
  /* LN first moves AORIG back by K << (1 + BASE_SHIFT). */
  1968. #ifdef LN
  1969. slwi r0, K, 1 + BASE_SHIFT
  1970. sub AORIG, AORIG, r0
  1971. #endif
  1972. slwi r0, KK, 1 + BASE_SHIFT
  1973. slwi TEMP, KK, 1 + BASE_SHIFT
  1974. add AO, AORIG, r0
  1975. add BO, B, TEMP
  1976. sub TEMP, K, KK
  1977. LFD f16, 0 * SIZE(AO)
  1978. LFD f17, 1 * SIZE(AO)
  1979. LFD f18, 2 * SIZE(AO)
  1980. LFD f19, 3 * SIZE(AO)
  1981. LFD f20, 0 * SIZE(BO)
  1982. LFD f21, 1 * SIZE(BO)
  1983. LFD f22, 2 * SIZE(BO)
  1984. LFD f23, 3 * SIZE(BO)
  1985. LFD f24, 4 * SIZE(BO)
  1986. LFD f25, 5 * SIZE(BO)
  1987. LFD f26, 6 * SIZE(BO)
  1988. LFD f27, 7 * SIZE(BO)
  1989. srawi. r0, TEMP, 2
  1990. mtspr CTR, r0
  1991. #endif
  /* Uses the condition recorded by srawi. above: skip the unrolled loop */
  /* when the pass count is not positive. */
  1992. ble LL(55)
  /* LL(52): unrolled accumulation loop, four steps per pass, counted by CTR. */
  /* Steps using f16/f17 accumulate into f0-f3 and steps using f18/f19 into */
  /* f4-f7; the two sets are added together at LL(58). Each pass advances */
  /* AO by 8 * SIZE and BO by 8 * SIZE. */
  /* (Assumes FMADD d,a,c,b computes a*c+b as PowerPC fmadd does.) */
  1993. .align 5
  1994. LL(52):
  1995. FMADD f0, f16, f20, f0
  1996. FMADD f1, f17, f20, f1
  1997. FMADD f2, f16, f21, f2
  1998. FMADD f3, f17, f21, f3
  1999. FMADD f4, f18, f22, f4
  2000. FMADD f5, f19, f22, f5
  2001. FMADD f6, f18, f23, f6
  2002. FMADD f7, f19, f23, f7
  2003. LFD f16, 4 * SIZE(AO)
  2004. LFD f17, 5 * SIZE(AO)
  2005. LFD f18, 6 * SIZE(AO)
  2006. LFD f19, 7 * SIZE(AO)
  2007. LFD f20, 8 * SIZE(BO)
  2008. LFD f21, 9 * SIZE(BO)
  2009. LFD f22, 10 * SIZE(BO)
  2010. LFD f23, 11 * SIZE(BO)
  2011. FMADD f0, f16, f24, f0
  2012. FMADD f1, f17, f24, f1
  2013. FMADD f2, f16, f25, f2
  2014. FMADD f3, f17, f25, f3
  2015. FMADD f4, f18, f26, f4
  2016. FMADD f5, f19, f26, f5
  2017. FMADD f6, f18, f27, f6
  2018. FMADD f7, f19, f27, f7
  /* Preload the operands for the next pass (offsets are relative to the */
  /* not-yet-advanced AO/BO). */
  2019. LFD f16, 8 * SIZE(AO)
  2020. LFD f17, 9 * SIZE(AO)
  2021. LFD f18, 10 * SIZE(AO)
  2022. LFD f19, 11 * SIZE(AO)
  2023. LFD f24, 12 * SIZE(BO)
  2024. LFD f25, 13 * SIZE(BO)
  2025. LFD f26, 14 * SIZE(BO)
  2026. LFD f27, 15 * SIZE(BO)
  2027. addi AO, AO, 8 * SIZE
  2028. addi BO, BO, 8 * SIZE
  /* DCBT is a macro defined outside this view; presumably a cache-touch */
  /* hint for BO + PREB - confirm. */
  2029. DCBT(BO, PREB)
  2030. bdnz LL(52)
  /* LL(55): set CTR to the leftover step count (low two bits of KK for */
  /* LT/RN, of TEMP otherwise) and branch to LL(58) when it is zero. */
  2031. .align 4
  2032. LL(55):
  2033. #if defined(LT) || defined(RN)
  2034. andi. r0, KK, 3
  2035. #else
  2036. andi. r0, TEMP, 3
  2037. #endif
  2038. mtspr CTR, r0
  2039. ble+ LL(58)
  /* LL(56): remainder loop, one step per pass. Accumulates f16/f17 times */
  /* f20 into f0/f1 and times f21 into f2/f3, loads the next operands, and */
  /* advances AO and BO by 2 * SIZE each. */
  2040. .align 4
  2041. LL(56):
  2042. FMADD f0, f16, f20, f0
  2043. FMADD f1, f17, f20, f1
  2044. FMADD f2, f16, f21, f2
  2045. FMADD f3, f17, f21, f3
  2046. LFD f16, 2 * SIZE(AO)
  2047. LFD f17, 3 * SIZE(AO)
  2048. LFD f20, 2 * SIZE(BO)
  2049. LFD f21, 3 * SIZE(BO)
  2050. addi BO, BO, 2 * SIZE
  2051. addi AO, AO, 2 * SIZE
  2052. bdnz LL(56)
  /* LL(58): finish the four values f0-f3 (f0/f1 are stored to CO1, f2/f3 */
  /* to CO2). First folds the second partial-sum set f4-f7 into f0-f3, then: */
  /* (1) LN/RT repoint AO/BO, (2) replace each accumulator with */
  /* (stored value - accumulator), (3) apply the variant-specific */
  /* substitution, (4) write the results back through BO or AO and to */
  /* CO1/CO2, (5) clear the accumulators and advance pointers and KK. */
  /* NOTE(review): coefficients on the diagonal positions are applied with */
  /* FMUL, which suggests they are stored pre-inverted - confirm against */
  /* the packing routine. */
  2053. .align 4
  2054. LL(58):
  2055. FADD f0, f4, f0
  2056. FADD f1, f5, f1
  2057. FADD f2, f6, f2
  2058. FADD f3, f7, f3
  2059. #if defined(LN) || defined(RT)
  /* Both branches of the inner #ifdef compute r0 = KK - 2; AO and BO are */
  /* then both offset by r0 << (1 + BASE_SHIFT). */
  2060. #ifdef LN
  2061. subi r0, KK, 2
  2062. #else
  2063. subi r0, KK, 2
  2064. #endif
  2065. slwi TEMP, r0, 1 + BASE_SHIFT
  2066. slwi r0, r0, 1 + BASE_SHIFT
  2067. add AO, AORIG, TEMP
  2068. add BO, B, r0
  2069. #endif
  /* Assumes FSUB d,a,b computes a - b as PowerPC fsub does (macro defined */
  /* outside this view). The BO layout is f0,f2,f1,f3; the AO layout is */
  /* f0,f1,f2,f3. */
  2070. #if defined(LN) || defined(LT)
  2071. LFD f16, 0 * SIZE(BO)
  2072. LFD f17, 1 * SIZE(BO)
  2073. LFD f20, 2 * SIZE(BO)
  2074. LFD f21, 3 * SIZE(BO)
  2075. FSUB f0, f16, f0
  2076. FSUB f2, f17, f2
  2077. FSUB f1, f20, f1
  2078. FSUB f3, f21, f3
  2079. #else
  2080. LFD f16, 0 * SIZE(AO)
  2081. LFD f17, 1 * SIZE(AO)
  2082. LFD f20, 2 * SIZE(AO)
  2083. LFD f21, 3 * SIZE(AO)
  2084. FSUB f0, f16, f0
  2085. FSUB f1, f17, f1
  2086. FSUB f2, f20, f2
  2087. FSUB f3, f21, f3
  2088. #endif
  /* LN: finish f1/f3 (scale by AO[3]), remove AO[2] times them from f0/f2, */
  /* then scale f0/f2 by AO[0] (assuming FNMSUB d,a,c,b computes b - a*c). */
  2089. #ifdef LN
  2090. LFD f19, 3 * SIZE(AO)
  2091. LFD f20, 2 * SIZE(AO)
  2092. LFD f21, 0 * SIZE(AO)
  2093. FMUL f1, f19, f1
  2094. FMUL f3, f19, f3
  2095. FNMSUB f0, f20, f1, f0
  2096. FNMSUB f2, f20, f3, f2
  2097. FMUL f0, f21, f0
  2098. FMUL f2, f21, f2
  2099. #endif
  /* LT: finish f0/f2 (scale by AO[0]), remove AO[1] times them from f1/f3, */
  /* then scale f1/f3 by AO[3]. */
  2100. #ifdef LT
  2101. LFD f16, 0 * SIZE(AO)
  2102. LFD f17, 1 * SIZE(AO)
  2103. FMUL f0, f16, f0
  2104. FMUL f2, f16, f2
  2105. FNMSUB f1, f17, f0, f1
  2106. FNMSUB f3, f17, f2, f3
  2107. LFD f17, 3 * SIZE(AO)
  2108. FMUL f1, f17, f1
  2109. FMUL f3, f17, f3
  2110. #endif
  /* RN: finish f0/f1 (scale by BO[0]), remove BO[1] times them from f2/f3, */
  /* then scale f2/f3 by BO[3]. */
  2111. #ifdef RN
  2112. LFD f16, 0 * SIZE(BO)
  2113. LFD f17, 1 * SIZE(BO)
  2114. LFD f18, 3 * SIZE(BO)
  2115. FMUL f0, f16, f0
  2116. FMUL f1, f16, f1
  2117. FNMSUB f2, f17, f0, f2
  2118. FNMSUB f3, f17, f1, f3
  2119. FMUL f2, f18, f2
  2120. FMUL f3, f18, f3
  2121. #endif
  /* RT: finish f2/f3 (scale by BO[3]), remove BO[2] times them from f0/f1, */
  /* then scale f0/f1 by BO[0]. */
  2122. #ifdef RT
  2123. LFD f19, 3 * SIZE(BO)
  2124. LFD f20, 2 * SIZE(BO)
  2125. LFD f21, 0 * SIZE(BO)
  2126. FMUL f2, f19, f2
  2127. FMUL f3, f19, f3
  2128. FNMSUB f0, f20, f2, f0
  2129. FNMSUB f1, f20, f3, f1
  2130. FMUL f0, f21, f0
  2131. FMUL f1, f21, f1
  2132. #endif
  /* LN steps the C pointers back by two elements before storing; the */
  /* other variants advance them after storing (see #ifndef LN below). */
  2133. #ifdef LN
  2134. subi CO1, CO1, 2 * SIZE
  2135. subi CO2, CO2, 2 * SIZE
  2136. #endif
  /* Write the results back in the same layout they were loaded from. */
  2137. #if defined(LN) || defined(LT)
  2138. STFD f0, 0 * SIZE(BO)
  2139. STFD f2, 1 * SIZE(BO)
  2140. STFD f1, 2 * SIZE(BO)
  2141. STFD f3, 3 * SIZE(BO)
  2142. #else
  2143. STFD f0, 0 * SIZE(AO)
  2144. STFD f1, 1 * SIZE(AO)
  2145. STFD f2, 2 * SIZE(AO)
  2146. STFD f3, 3 * SIZE(AO)
  2147. #endif
  2148. STFD f0, 0 * SIZE(CO1)
  2149. STFD f1, 1 * SIZE(CO1)
  2150. STFD f2, 0 * SIZE(CO2)
  2151. STFD f3, 1 * SIZE(CO2)
  /* Reload f0 from FZERO and copy it into f1-f7. */
  2152. lfs f0, FZERO
  2153. fmr f1, f0
  2154. fmr f2, f0
  2155. fmr f3, f0
  2156. fmr f4, f0
  2157. fmr f5, f0
  2158. fmr f6, f0
  2159. fmr f7, f0
  2160. #ifndef LN
  2161. addi CO1, CO1, 2 * SIZE
  2162. addi CO2, CO2, 2 * SIZE
  2163. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2164. #ifdef RT
  2165. slwi r0, K, 1 + BASE_SHIFT
  2166. add AORIG, AORIG, r0
  2167. #endif
  /* LT/RN: advance both AO and BO by (K - KK) << (1 + BASE_SHIFT). */
  2168. #if defined(LT) || defined(RN)
  2169. sub TEMP, K, KK
  2170. slwi r0, TEMP, 1 + BASE_SHIFT
  2171. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2172. add AO, AO, r0
  2173. add BO, BO, TEMP
  2174. #endif
  /* KK moves by 2: down for LN, up for LT. */
  2175. #ifdef LN
  2176. subi KK, KK, 2
  2177. #endif
  2178. #ifdef LT
  2179. addi KK, KK, 2
  2180. #endif
  /* LL(60): runs when bit 0 of M is set (otherwise branches to LL(69)). */
  /* Preloads four values from AO into f16-f19 and eight values from B/BO */
  /* into f20-f27, then sets CTR to the number of 4-step passes of LL(62). */
  2181. .align 4
  2182. LL(60):
  2183. andi. I, M, 1
  2184. ble LL(69)
  2185. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts at B and the pass count is KK >> 2. */
  2186. LFD f16, 0 * SIZE(AO)
  2187. LFD f17, 1 * SIZE(AO)
  2188. LFD f18, 2 * SIZE(AO)
  2189. LFD f19, 3 * SIZE(AO)
  2190. LFD f20, 0 * SIZE(B)
  2191. LFD f21, 1 * SIZE(B)
  2192. LFD f22, 2 * SIZE(B)
  2193. LFD f23, 3 * SIZE(B)
  2194. LFD f24, 4 * SIZE(B)
  2195. LFD f25, 5 * SIZE(B)
  2196. LFD f26, 6 * SIZE(B)
  2197. LFD f27, 7 * SIZE(B)
  2198. srawi. r0, KK, 2
  2199. mtspr CTR, r0
  2200. mr BO, B
  2201. #else
  /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), */
  /* BO = B + (KK << (1 + BASE_SHIFT)), TEMP = K - KK, pass count TEMP >> 2. */
  /* LN first moves AORIG back by K << BASE_SHIFT. */
  2202. #ifdef LN
  2203. slwi r0, K, BASE_SHIFT
  2204. sub AORIG, AORIG, r0
  2205. #endif
  2206. slwi r0, KK, 0 + BASE_SHIFT
  2207. slwi TEMP, KK, 1 + BASE_SHIFT
  2208. add AO, AORIG, r0
  2209. add BO, B, TEMP
  2210. sub TEMP, K, KK
  2211. LFD f16, 0 * SIZE(AO)
  2212. LFD f17, 1 * SIZE(AO)
  2213. LFD f18, 2 * SIZE(AO)
  2214. LFD f19, 3 * SIZE(AO)
  2215. LFD f20, 0 * SIZE(BO)
  2216. LFD f21, 1 * SIZE(BO)
  2217. LFD f22, 2 * SIZE(BO)
  2218. LFD f23, 3 * SIZE(BO)
  2219. LFD f24, 4 * SIZE(BO)
  2220. LFD f25, 5 * SIZE(BO)
  2221. LFD f26, 6 * SIZE(BO)
  2222. LFD f27, 7 * SIZE(BO)
  2223. srawi. r0, TEMP, 2
  2224. mtspr CTR, r0
  2225. #endif
  /* Uses the condition recorded by srawi. above: skip the unrolled loop */
  /* when the pass count is not positive. */
  2226. ble LL(65)
  /* LL(62): unrolled accumulation loop, four steps per pass, counted by CTR. */
  /* Steps using f16 and f18 accumulate into f0/f1; steps using f17 and f19 */
  /* accumulate into f2/f3; the two sets are added together at LL(68). */
  /* Each pass advances AO by 4 * SIZE and BO by 8 * SIZE. */
  /* (Assumes FMADD d,a,c,b computes a*c+b as PowerPC fmadd does.) */
  2227. .align 5
  2228. LL(62):
  2229. FMADD f0, f16, f20, f0
  2230. FMADD f1, f16, f21, f1
  2231. FMADD f2, f17, f22, f2
  2232. FMADD f3, f17, f23, f3
  2233. LFD f20, 8 * SIZE(BO)
  2234. LFD f21, 9 * SIZE(BO)
  2235. LFD f22, 10 * SIZE(BO)
  2236. LFD f23, 11 * SIZE(BO)
  2237. FMADD f0, f18, f24, f0
  2238. FMADD f1, f18, f25, f1
  2239. FMADD f2, f19, f26, f2
  2240. FMADD f3, f19, f27, f3
  /* Preload the operands for the next pass (offsets are relative to the */
  /* not-yet-advanced AO/BO). */
  2241. LFD f16, 4 * SIZE(AO)
  2242. LFD f17, 5 * SIZE(AO)
  2243. LFD f18, 6 * SIZE(AO)
  2244. LFD f19, 7 * SIZE(AO)
  2245. LFD f24, 12 * SIZE(BO)
  2246. LFD f25, 13 * SIZE(BO)
  2247. LFD f26, 14 * SIZE(BO)
  2248. LFD f27, 15 * SIZE(BO)
  2249. addi AO, AO, 4 * SIZE
  2250. addi BO, BO, 8 * SIZE
  2251. bdnz LL(62)
  /* LL(65): set CTR to the leftover step count (low two bits of KK for */
  /* LT/RN, of TEMP otherwise) and branch to LL(68) when it is zero. */
  2252. .align 4
  2253. LL(65):
  2254. #if defined(LT) || defined(RN)
  2255. andi. r0, KK, 3
  2256. #else
  2257. andi. r0, TEMP, 3
  2258. #endif
  2259. mtspr CTR, r0
  2260. ble+ LL(68)
  /* LL(66): remainder loop, one step per pass. Accumulates f16 times f20 */
  /* into f0 and f16 times f21 into f1, loads the next operands, and */
  /* advances AO by 1 * SIZE and BO by 2 * SIZE. */
  2261. .align 4
  2262. LL(66):
  2263. FMADD f0, f16, f20, f0
  2264. FMADD f1, f16, f21, f1
  2265. LFD f16, 1 * SIZE(AO)
  2266. LFD f20, 2 * SIZE(BO)
  2267. LFD f21, 3 * SIZE(BO)
  2268. addi BO, BO, 2 * SIZE
  2269. addi AO, AO, 1 * SIZE
  2270. bdnz LL(66)
  /* LL(68): finish the two values f0 (stored to CO1) and f1 (stored to CO2). */
  /* First folds the second partial-sum set f2/f3 into f0/f1, then: */
  /* (1) LN/RT repoint AO/BO, (2) replace each accumulator with */
  /* (stored value - accumulator), (3) apply the variant-specific */
  /* substitution, (4) write the results back through BO or AO and to */
  /* CO1/CO2, (5) reset accumulators and advance pointers and KK. */
  /* NOTE(review): coefficients on the diagonal positions are applied with */
  /* FMUL, which suggests they are stored pre-inverted - confirm against */
  /* the packing routine. */
  2271. .align 4
  2272. LL(68):
  2273. FADD f0, f2, f0
  2274. FADD f1, f3, f1
  2275. #if defined(LN) || defined(RT)
  /* r0 = KK - 1 (LN) or KK - 2 (RT); AO = AORIG + (r0 << BASE_SHIFT), */
  /* BO = B + (r0 << (1 + BASE_SHIFT)). */
  2276. #ifdef LN
  2277. subi r0, KK, 1
  2278. #else
  2279. subi r0, KK, 2
  2280. #endif
  2281. slwi TEMP, r0, 0 + BASE_SHIFT
  2282. slwi r0, r0, 1 + BASE_SHIFT
  2283. add AO, AORIG, TEMP
  2284. add BO, B, r0
  2285. #endif
  /* Assumes FSUB d,a,b computes a - b as PowerPC fsub does (macro defined */
  /* outside this view). */
  2286. #if defined(LN) || defined(LT)
  2287. LFD f16, 0 * SIZE(BO)
  2288. LFD f17, 1 * SIZE(BO)
  2289. FSUB f0, f16, f0
  2290. FSUB f1, f17, f1
  2291. #else
  2292. LFD f16, 0 * SIZE(AO)
  2293. LFD f20, 1 * SIZE(AO)
  2294. FSUB f0, f16, f0
  2295. FSUB f1, f20, f1
  2296. #endif
  /* LN and LT: scale both values by the single coefficient at AO[0]. */
  2297. #ifdef LN
  2298. LFD f21, 0 * SIZE(AO)
  2299. FMUL f0, f21, f0
  2300. FMUL f1, f21, f1
  2301. #endif
  2302. #ifdef LT
  2303. LFD f16, 0 * SIZE(AO)
  2304. FMUL f0, f16, f0
  2305. FMUL f1, f16, f1
  2306. #endif
  /* RN: scale f0 by BO[0], remove BO[1] * f0 from f1, scale f1 by BO[3] */
  /* (assuming FNMSUB d,a,c,b computes b - a*c as PowerPC fnmsub does). */
  2307. #ifdef RN
  2308. LFD f16, 0 * SIZE(BO)
  2309. LFD f17, 1 * SIZE(BO)
  2310. LFD f18, 3 * SIZE(BO)
  2311. FMUL f0, f16, f0
  2312. FNMSUB f1, f17, f0, f1
  2313. FMUL f1, f18, f1
  2314. #endif
  /* RT: scale f1 by BO[3], remove BO[2] * f1 from f0, scale f0 by BO[0]. */
  2315. #ifdef RT
  2316. LFD f19, 3 * SIZE(BO)
  2317. LFD f20, 2 * SIZE(BO)
  2318. LFD f21, 0 * SIZE(BO)
  2319. FMUL f1, f19, f1
  2320. FNMSUB f0, f20, f1, f0
  2321. FMUL f0, f21, f0
  2322. #endif
  /* LN steps the C pointers back by one element before storing; the */
  /* other variants advance them after storing (see #ifndef LN below). */
  2323. #ifdef LN
  2324. subi CO1, CO1, 1 * SIZE
  2325. subi CO2, CO2, 1 * SIZE
  2326. #endif
  2327. #if defined(LN) || defined(LT)
  2328. STFD f0, 0 * SIZE(BO)
  2329. STFD f1, 1 * SIZE(BO)
  2330. #else
  2331. STFD f0, 0 * SIZE(AO)
  2332. STFD f1, 1 * SIZE(AO)
  2333. #endif
  2334. STFD f0, 0 * SIZE(CO1)
  2335. STFD f1, 0 * SIZE(CO2)
  /* NOTE(review): only f0, f1, f4 and f5 are reset here, although LL(62) */
  /* accumulates into f2/f3. Within this view that is harmless: execution */
  /* falls through to LL(69) and LL(70), and LL(70) copies zero into f1-f3 */
  /* before they are used again. */
  2336. lfs f0, FZERO
  2337. fmr f1, f0
  2338. fmr f4, f0
  2339. fmr f5, f0
  2340. #ifndef LN
  2341. addi CO1, CO1, 1 * SIZE
  2342. addi CO2, CO2, 1 * SIZE
  2343. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  2344. #ifdef RT
  2345. slwi r0, K, 0 + BASE_SHIFT
  2346. add AORIG, AORIG, r0
  2347. #endif
  /* LT/RN: advance AO by (K - KK) << BASE_SHIFT and BO by */
  /* (K - KK) << (1 + BASE_SHIFT). */
  2348. #if defined(LT) || defined(RN)
  2349. sub TEMP, K, KK
  2350. slwi r0, TEMP, 0 + BASE_SHIFT
  2351. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2352. add AO, AO, r0
  2353. add BO, BO, TEMP
  2354. #endif
  /* KK moves by 1: down for LN, up for LT. */
  2355. #ifdef LN
  2356. subi KK, KK, 1
  2357. #endif
  2358. #ifdef LT
  2359. addi KK, KK, 1
  2360. #endif
  /* LL(69): end of the section entered at LL(40). Updates B */
  /* (LN: B += K << (1 + BASE_SHIFT); LT/RN: B = BO), moves KK by 2 for RN */
  /* (up) or RT (down), and reloads f0 from FZERO before falling into LL(70). */
  2361. .align 4
  2362. LL(69):
  2363. #ifdef LN
  2364. slwi r0, K, 1 + BASE_SHIFT
  2365. add B, B, r0
  2366. #endif
  2367. #if defined(LT) || defined(RN)
  2368. mr B, BO
  2369. #endif
  2370. #ifdef RN
  2371. addi KK, KK, 2
  2372. #endif
  2373. #ifdef RT
  2374. subi KK, KK, 2
  2375. #endif
  2376. lfs f0, FZERO
  /* LL(70): runs when bit 0 of N is set (otherwise branches to LL(999)). */
  /* Sets CO1 = C, initialises KK for LN/LT, copies f0 into f1-f3, and */
  /* computes I = M >> 2 as the trip count for LL(71). */
  /* f0 is expected to hold zero on entry: it is loaded from FZERO at the */
  /* end of LL(69) and before the loop-back branch in LL(39); any other */
  /* entry path is outside this view. */
  2377. .align 4
  2378. LL(70):
  2379. andi. J, N, 1
  2380. ble LL(999)
  /* RT: move B back by K << BASE_SHIFT and C back by LDC first. */
  2381. #ifdef RT
  2382. slwi r0, K, 0 + BASE_SHIFT
  2383. sub B, B, r0
  2384. sub C, C, LDC
  2385. #endif
  2386. mr CO1, C
  /* KK starts at M + OFFSET for LN and at OFFSET for LT. */
  2387. #ifdef LN
  2388. add KK, M, OFFSET
  2389. #endif
  2390. #ifdef LT
  2391. mr KK, OFFSET
  2392. #endif
  2393. fmr f1, f0
  2394. fmr f2, f0
  2395. fmr f3, f0
  /* The condition recorded here is consumed by "ble LL(80)" below. */
  2396. srawi. I, M, 2
  2397. #if defined(LN) || defined(RT)
  2398. mr AORIG, A
  2399. #else
  2400. mr AO, A
  2401. #endif
  /* Non-RT variants set C = CO1 + LDC. */
  2402. #ifndef RT
  2403. add C, CO1, LDC
  2404. #endif
  2405. ble LL(80)
  2406. .align 4
  /* LL(71): top of the 4-row tile loop for the single column. Positions
     AO/BO for this tile, preloads four A values into f16..f19 and four B
     values into f20..f23, and loads CTR with the number of 4-step groups. */
  2407. LL(71):
  2408. #if defined(LT) || defined(RN)
  2409. LFD f16, 0 * SIZE(AO)
  2410. LFD f17, 1 * SIZE(AO)
  2411. LFD f18, 2 * SIZE(AO)
  2412. LFD f19, 3 * SIZE(AO)
  2413. LFD f20, 0 * SIZE(B)
  2414. LFD f21, 1 * SIZE(B)
  2415. LFD f22, 2 * SIZE(B)
  2416. LFD f23, 3 * SIZE(B)
  2417. dcbt CO1, PREC /* cache-touch hint at CO1 + PREC */
  2418. srawi. r0, KK, 2 /* r0 = KK >> 2; CR0 feeds the ble after the #endif */
  2419. mtspr CTR, r0
  2420. mr BO, B /* start BO at B for this tile */
  2421. #else
  2422. #ifdef LN
  2423. slwi r0, K, 2 + BASE_SHIFT
  2424. sub AORIG, AORIG, r0 /* AORIG -= K << (2 + BASE_SHIFT) */
  2425. #endif
  2426. slwi r0, KK, 2 + BASE_SHIFT
  2427. slwi TEMP, KK, 0 + BASE_SHIFT
  2428. add AO, AORIG, r0 /* AO = AORIG + (KK << (2 + BASE_SHIFT)) */
  2429. add BO, B, TEMP /* BO = B + (KK << BASE_SHIFT) */
  2430. sub TEMP, K, KK /* TEMP = K - KK: step count for the LN/RT variants */
  2431. LFD f16, 0 * SIZE(AO)
  2432. LFD f17, 1 * SIZE(AO)
  2433. LFD f18, 2 * SIZE(AO)
  2434. LFD f19, 3 * SIZE(AO)
  2435. LFD f20, 0 * SIZE(BO)
  2436. LFD f21, 1 * SIZE(BO)
  2437. LFD f22, 2 * SIZE(BO)
  2438. LFD f23, 3 * SIZE(BO)
  2439. dcbt CO1, PREC /* cache-touch hint at CO1 + PREC */
  2440. srawi. r0, TEMP, 2 /* r0 = TEMP >> 2; CR0 feeds the ble after the #endif */
  2441. mtspr CTR, r0
  2442. #endif
  2443. ble LL(75) /* no full group of 4 steps: go to the remainder handling */
  2444. .align 5
  /* LL(72): inner loop of the 4-row tile, four steps per iteration.
     Each step adds (f16..f19) * (one B value) into f0..f3; f20..f23 hold the
     four B values of the current group. The A and B values for the next
     iteration are loaded before AO/BO are advanced.
     FMADD/LFD are macros defined outside this chunk (presumably the
     precision-specific fused multiply-add and load - TODO confirm). */
  2445. LL(72):
  2446. FMADD f0, f16, f20, f0 /* step 0: f0..f3 += f16..f19 * f20 */
  2447. FMADD f1, f17, f20, f1
  2448. FMADD f2, f18, f20, f2
  2449. FMADD f3, f19, f20, f3
  2450. LFD f16, 4 * SIZE(AO)
  2451. LFD f17, 5 * SIZE(AO)
  2452. LFD f18, 6 * SIZE(AO)
  2453. LFD f19, 7 * SIZE(AO)
  2454. FMADD f0, f16, f21, f0 /* step 1: uses f21 */
  2455. FMADD f1, f17, f21, f1
  2456. FMADD f2, f18, f21, f2
  2457. FMADD f3, f19, f21, f3
  2458. LFD f16, 8 * SIZE(AO)
  2459. LFD f17, 9 * SIZE(AO)
  2460. LFD f18, 10 * SIZE(AO)
  2461. LFD f19, 11 * SIZE(AO)
  2462. FMADD f0, f16, f22, f0 /* step 2: uses f22 */
  2463. FMADD f1, f17, f22, f1
  2464. FMADD f2, f18, f22, f2
  2465. FMADD f3, f19, f22, f3
  2466. LFD f16, 12 * SIZE(AO)
  2467. LFD f17, 13 * SIZE(AO)
  2468. LFD f18, 14 * SIZE(AO)
  2469. LFD f19, 15 * SIZE(AO)
  2470. FMADD f0, f16, f23, f0 /* step 3: uses f23 */
  2471. FMADD f1, f17, f23, f1
  2472. FMADD f2, f18, f23, f2
  2473. FMADD f3, f19, f23, f3
  2474. LFD f16, 16 * SIZE(AO) /* preload A and B for the next iteration */
  2475. LFD f17, 17 * SIZE(AO)
  2476. LFD f18, 18 * SIZE(AO)
  2477. LFD f19, 19 * SIZE(AO)
  2478. LFD f20, 4 * SIZE(BO)
  2479. LFD f21, 5 * SIZE(BO)
  2480. LFD f22, 6 * SIZE(BO)
  2481. LFD f23, 7 * SIZE(BO)
  2482. addi AO, AO, 16 * SIZE /* 4 steps * 4 A values consumed */
  2483. addi BO, BO, 4 * SIZE /* 4 steps * 1 B value consumed */
  2484. DCBT(BO, PREB) /* macro defined outside this chunk; presumably a prefetch hint */
  2485. bdnz LL(72) /* decrement CTR, loop while it is non-zero */
  2486. .align 4
  /* LL(75): load CTR with the 0..3 steps left over after the 4-step groups
     of the 4-row tile. */
  2487. LL(75):
  2488. #if defined(LT) || defined(RN)
  2489. andi. r0, KK, 3 /* remainder of the KK-based step count */
  2490. #else
  2491. andi. r0, TEMP, 3 /* remainder of the (K - KK)-based step count */
  2492. #endif
  2493. mtspr CTR, r0
  2494. ble+ LL(78) /* nothing left: skip the remainder loop (branch hinted with '+') */
  2495. .align 4
  /* LL(76): remainder loop of the 4-row tile, one step per iteration:
     f0..f3 += f16..f19 * f20, then load the next four A values and one B value. */
  2496. LL(76):
  2497. FMADD f0, f16, f20, f0
  2498. FMADD f1, f17, f20, f1
  2499. FMADD f2, f18, f20, f2
  2500. FMADD f3, f19, f20, f3
  2501. LFD f16, 4 * SIZE(AO)
  2502. LFD f17, 5 * SIZE(AO)
  2503. LFD f18, 6 * SIZE(AO)
  2504. LFD f19, 7 * SIZE(AO)
  2505. LFD f20, 1 * SIZE(BO)
  2506. addi BO, BO, 1 * SIZE
  2507. addi AO, AO, 4 * SIZE
  2508. bdnz LL(76) /* decrement CTR, loop while it is non-zero */
  2509. .align 4
  /* LL(78): finish one 4-row tile of the single column.
     1. For LN/RT, recompute AO/BO from AORIG/B and KK.
     2. Subtract the accumulated products f0..f3 from four stored values
        (read from BO for LN/LT, from AO for RN/RT).
     3. Apply the variant-specific solve: a 4x4 substitution against values
        at AO (LN/LT) or a single scale factor from BO (RN/RT). Diagonal
        terms are applied with FMUL, which suggests they are stored already
        inverted - TODO confirm against the packing code.
     4. Store the results back to the packed buffer and to CO1, clear the
        accumulators, advance pointers and KK, and loop while I > 0. */
  2510. LL(78):
  2511. #if defined(LN) || defined(RT)
  2512. #ifdef LN
  2513. subi r0, KK, 4
  2514. #else
  2515. subi r0, KK, 1
  2516. #endif
  2517. slwi TEMP, r0, 2 + BASE_SHIFT
  2518. slwi r0, r0, 0 + BASE_SHIFT
  2519. add AO, AORIG, TEMP /* AO = AORIG + (r0 << (2 + BASE_SHIFT)) */
  2520. add BO, B, r0 /* BO = B + (r0 << BASE_SHIFT) */
  2521. #endif
  2522. #if defined(LN) || defined(LT)
  2523. LFD f16, 0 * SIZE(BO)
  2524. LFD f20, 1 * SIZE(BO)
  2525. LFD f24, 2 * SIZE(BO)
  2526. LFD f28, 3 * SIZE(BO)
  2527. FSUB f0, f16, f0 /* f0..f3 = stored value - accumulated product */
  2528. FSUB f1, f20, f1
  2529. FSUB f2, f24, f2
  2530. FSUB f3, f28, f3
  2531. #else
  2532. LFD f16, 0 * SIZE(AO)
  2533. LFD f17, 1 * SIZE(AO)
  2534. LFD f18, 2 * SIZE(AO)
  2535. LFD f19, 3 * SIZE(AO)
  2536. FSUB f0, f16, f0 /* f0..f3 = stored value - accumulated product */
  2537. FSUB f1, f17, f1
  2538. FSUB f2, f18, f2
  2539. FSUB f3, f19, f3
  2540. #endif
  2541. #ifdef LN
  2542. LFD f16, 15 * SIZE(AO) /* LN: work from f3 back to f0, using AO[15..12] first */
  2543. LFD f17, 14 * SIZE(AO)
  2544. LFD f18, 13 * SIZE(AO)
  2545. LFD f19, 12 * SIZE(AO)
  2546. FMUL f3, f16, f3 /* f3 *= AO[15] */
  2547. FNMSUB f2, f17, f3, f2 /* f2 -= AO[14] * f3 */
  2548. FNMSUB f1, f18, f3, f1 /* f1 -= AO[13] * f3 */
  2549. FNMSUB f0, f19, f3, f0 /* f0 -= AO[12] * f3 */
  2550. LFD f16, 10 * SIZE(AO)
  2551. LFD f17, 9 * SIZE(AO)
  2552. LFD f18, 8 * SIZE(AO)
  2553. LFD f19, 5 * SIZE(AO)
  2554. LFD f20, 4 * SIZE(AO)
  2555. LFD f21, 0 * SIZE(AO)
  2556. FMUL f2, f16, f2 /* f2 *= AO[10] */
  2557. FNMSUB f1, f17, f2, f1 /* f1 -= AO[9] * f2 */
  2558. FNMSUB f0, f18, f2, f0 /* f0 -= AO[8] * f2 */
  2559. FMUL f1, f19, f1 /* f1 *= AO[5] */
  2560. FNMSUB f0, f20, f1, f0 /* f0 -= AO[4] * f1 */
  2561. FMUL f0, f21, f0 /* f0 *= AO[0] */
  2562. #endif
  2563. #ifdef LT
  2564. LFD f16, 0 * SIZE(AO) /* LT: work from f0 forward to f3, using AO[0..3] first */
  2565. LFD f17, 1 * SIZE(AO)
  2566. LFD f18, 2 * SIZE(AO)
  2567. LFD f19, 3 * SIZE(AO)
  2568. FMUL f0, f16, f0 /* f0 *= AO[0] */
  2569. FNMSUB f1, f17, f0, f1 /* f1 -= AO[1] * f0 */
  2570. FNMSUB f2, f18, f0, f2 /* f2 -= AO[2] * f0 */
  2571. FNMSUB f3, f19, f0, f3 /* f3 -= AO[3] * f0 */
  2572. LFD f17, 5 * SIZE(AO)
  2573. LFD f18, 6 * SIZE(AO)
  2574. LFD f19, 7 * SIZE(AO)
  2575. FMUL f1, f17, f1 /* f1 *= AO[5] */
  2576. FNMSUB f2, f18, f1, f2 /* f2 -= AO[6] * f1 */
  2577. FNMSUB f3, f19, f1, f3 /* f3 -= AO[7] * f1 */
  2578. LFD f18, 10 * SIZE(AO)
  2579. LFD f19, 11 * SIZE(AO)
  2580. FMUL f2, f18, f2 /* f2 *= AO[10] */
  2581. FNMSUB f3, f19, f2, f3 /* f3 -= AO[11] * f2 */
  2582. LFD f19, 15 * SIZE(AO)
  2583. FMUL f3, f19, f3 /* f3 *= AO[15] */
  2584. #endif
  2585. #ifdef RN
  2586. LFD f16, 0 * SIZE(BO) /* RN: one scale factor from BO[0] for all four values */
  2587. FMUL f0, f16, f0
  2588. FMUL f1, f16, f1
  2589. FMUL f2, f16, f2
  2590. FMUL f3, f16, f3
  2591. #endif
  2592. #ifdef RT
  2593. LFD f21, 0 * SIZE(BO) /* RT: one scale factor from BO[0] for all four values */
  2594. FMUL f0, f21, f0
  2595. FMUL f1, f21, f1
  2596. FMUL f2, f21, f2
  2597. FMUL f3, f21, f3
  2598. #endif
  2599. #ifdef LN
  2600. subi CO1, CO1, 4 * SIZE /* LN steps CO1 back before storing */
  2601. #endif
  2602. #if defined(LN) || defined(LT)
  2603. STFD f0, 0 * SIZE(BO) /* write the results back over the values read from BO */
  2604. STFD f1, 1 * SIZE(BO)
  2605. STFD f2, 2 * SIZE(BO)
  2606. STFD f3, 3 * SIZE(BO)
  2607. #else
  2608. STFD f0, 0 * SIZE(AO) /* write the results back over the values read from AO */
  2609. STFD f1, 1 * SIZE(AO)
  2610. STFD f2, 2 * SIZE(AO)
  2611. STFD f3, 3 * SIZE(AO)
  2612. #endif
  2613. STFD f0, 0 * SIZE(CO1) /* and store them to the output at CO1 */
  2614. STFD f1, 1 * SIZE(CO1)
  2615. STFD f2, 2 * SIZE(CO1)
  2616. STFD f3, 3 * SIZE(CO1)
  2617. lfs f0, FZERO /* reset accumulators f0..f3 for the next tile */
  2618. fmr f1, f0
  2619. fmr f2, f0
  2620. fmr f3, f0
  2621. #ifndef LN
  2622. addi CO1, CO1, 4 * SIZE /* all variants except LN step CO1 forward after storing */
  2623. #endif
  2624. #ifdef RT
  2625. slwi r0, K, 2 + BASE_SHIFT
  2626. add AORIG, AORIG, r0 /* AORIG += K << (2 + BASE_SHIFT) */
  2627. #endif
  2628. #if defined(LT) || defined(RN)
  2629. sub TEMP, K, KK
  2630. slwi r0, TEMP, 2 + BASE_SHIFT
  2631. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2632. add AO, AO, r0 /* AO += (K - KK) << (2 + BASE_SHIFT) */
  2633. add BO, BO, TEMP /* BO += (K - KK) << BASE_SHIFT */
  2634. #endif
  2635. #ifdef LN
  2636. subi KK, KK, 4
  2637. #endif
  2638. #ifdef LT
  2639. addi KK, KK, 4
  2640. #endif
  2641. addic. I, I, -1 /* one 4-row tile done */
  2642. bgt+ LL(71) /* more 4-row tiles remain */
  2643. .align 4
  /* LL(80): set-up for a 2-row tile of the single column, taken when
     (M & 2) is non-zero. Positions AO/BO, preloads f16..f19 from A and
     f20..f23 from B, and loads CTR with the number of 4-step groups. */
  2644. LL(80):
  2645. andi. I, M, 2
  2646. ble LL(90) /* bit 1 of M clear: no 2-row tile */
  2647. #if defined(LT) || defined(RN)
  2648. LFD f16, 0 * SIZE(AO)
  2649. LFD f17, 1 * SIZE(AO)
  2650. LFD f18, 2 * SIZE(AO)
  2651. LFD f19, 3 * SIZE(AO)
  2652. LFD f20, 0 * SIZE(B)
  2653. LFD f21, 1 * SIZE(B)
  2654. LFD f22, 2 * SIZE(B)
  2655. LFD f23, 3 * SIZE(B)
  2656. srawi. r0, KK, 2 /* r0 = KK >> 2; CR0 feeds the ble after the #endif */
  2657. mtspr CTR, r0
  2658. mr BO, B
  2659. #else
  2660. #ifdef LN
  2661. slwi r0, K, 1 + BASE_SHIFT
  2662. sub AORIG, AORIG, r0 /* AORIG -= K << (1 + BASE_SHIFT) */
  2663. #endif
  2664. slwi r0, KK, 1 + BASE_SHIFT
  2665. slwi TEMP, KK, 0 + BASE_SHIFT
  2666. add AO, AORIG, r0 /* AO = AORIG + (KK << (1 + BASE_SHIFT)) */
  2667. add BO, B, TEMP /* BO = B + (KK << BASE_SHIFT) */
  2668. sub TEMP, K, KK /* TEMP = K - KK: step count for the LN/RT variants */
  2669. LFD f16, 0 * SIZE(AO)
  2670. LFD f17, 1 * SIZE(AO)
  2671. LFD f18, 2 * SIZE(AO)
  2672. LFD f19, 3 * SIZE(AO)
  2673. LFD f20, 0 * SIZE(BO)
  2674. LFD f21, 1 * SIZE(BO)
  2675. LFD f22, 2 * SIZE(BO)
  2676. LFD f23, 3 * SIZE(BO)
  2677. srawi. r0, TEMP, 2 /* r0 = TEMP >> 2; CR0 feeds the ble after the #endif */
  2678. mtspr CTR, r0
  2679. #endif
  2680. ble LL(85) /* no full group of 4 steps: go to the remainder handling */
  2681. .align 5
  /* LL(82): inner loop of the 2-row tile, four steps per iteration.
     Even steps accumulate into f0/f1 and odd steps into f2/f3; the two
     pairs are added together at LL(88). Values for the next iteration are
     loaded before AO/BO are advanced. */
  2682. LL(82):
  2683. FMADD f0, f16, f20, f0 /* step 0 -> f0/f1 */
  2684. FMADD f1, f17, f20, f1
  2685. FMADD f2, f18, f21, f2 /* step 1 -> f2/f3 */
  2686. FMADD f3, f19, f21, f3
  2687. LFD f16, 4 * SIZE(AO)
  2688. LFD f17, 5 * SIZE(AO)
  2689. LFD f18, 6 * SIZE(AO)
  2690. LFD f19, 7 * SIZE(AO)
  2691. FMADD f0, f16, f22, f0 /* step 2 -> f0/f1 */
  2692. FMADD f1, f17, f22, f1
  2693. FMADD f2, f18, f23, f2 /* step 3 -> f2/f3 */
  2694. FMADD f3, f19, f23, f3
  2695. LFD f16, 8 * SIZE(AO) /* preload A and B for the next iteration */
  2696. LFD f17, 9 * SIZE(AO)
  2697. LFD f18, 10 * SIZE(AO)
  2698. LFD f19, 11 * SIZE(AO)
  2699. LFD f20, 4 * SIZE(BO)
  2700. LFD f21, 5 * SIZE(BO)
  2701. LFD f22, 6 * SIZE(BO)
  2702. LFD f23, 7 * SIZE(BO)
  2703. addi AO, AO, 8 * SIZE /* 4 steps * 2 A values consumed */
  2704. addi BO, BO, 4 * SIZE /* 4 steps * 1 B value consumed */
  2705. DCBT(BO, PREB) /* macro defined outside this chunk; presumably a prefetch hint */
  2706. bdnz LL(82) /* decrement CTR, loop while it is non-zero */
  2707. .align 4
  /* LL(85): load CTR with the 0..3 steps left over after the 4-step groups
     of the 2-row tile. */
  2708. LL(85):
  2709. #if defined(LT) || defined(RN)
  2710. andi. r0, KK, 3 /* remainder of the KK-based step count */
  2711. #else
  2712. andi. r0, TEMP, 3 /* remainder of the (K - KK)-based step count */
  2713. #endif
  2714. mtspr CTR, r0
  2715. ble+ LL(88) /* nothing left: skip the remainder loop (branch hinted with '+') */
  2716. .align 4
  /* LL(86): remainder loop of the 2-row tile, one step per iteration:
     f0/f1 += f16/f17 * f20, then load the next two A values and one B value. */
  2717. LL(86):
  2718. FMADD f0, f16, f20, f0
  2719. FMADD f1, f17, f20, f1
  2720. LFD f16, 2 * SIZE(AO)
  2721. LFD f17, 3 * SIZE(AO)
  2722. LFD f20, 1 * SIZE(BO)
  2723. addi BO, BO, 1 * SIZE
  2724. addi AO, AO, 2 * SIZE
  2725. bdnz LL(86) /* decrement CTR, loop while it is non-zero */
  2726. .align 4
  /* LL(88): finish the 2-row tile of the single column.
     Folds the odd-step partial sums (f2/f3) into f0/f1, subtracts them from
     the stored values, applies the variant-specific solve (2x2 substitution
     against AO for LN/LT, a single scale factor from BO for RN/RT), stores
     the results to the packed buffer and to CO1, then clears the
     accumulators and advances pointers and KK. Diagonal terms are applied
     with FMUL, which suggests they are stored already inverted - TODO confirm. */
  2727. LL(88):
  2728. FADD f0, f2, f0 /* combine the two partial sums per row */
  2729. FADD f1, f3, f1
  2730. #if defined(LN) || defined(RT)
  2731. #ifdef LN
  2732. subi r0, KK, 2
  2733. #else
  2734. subi r0, KK, 1
  2735. #endif
  2736. slwi TEMP, r0, 1 + BASE_SHIFT
  2737. slwi r0, r0, 0 + BASE_SHIFT
  2738. add AO, AORIG, TEMP /* AO = AORIG + (r0 << (1 + BASE_SHIFT)) */
  2739. add BO, B, r0 /* BO = B + (r0 << BASE_SHIFT) */
  2740. #endif
  2741. #if defined(LN) || defined(LT)
  2742. LFD f16, 0 * SIZE(BO)
  2743. LFD f20, 1 * SIZE(BO)
  2744. FSUB f0, f16, f0 /* f0/f1 = stored value - accumulated product */
  2745. FSUB f1, f20, f1
  2746. #else
  2747. LFD f16, 0 * SIZE(AO)
  2748. LFD f17, 1 * SIZE(AO)
  2749. FSUB f0, f16, f0 /* f0/f1 = stored value - accumulated product */
  2750. FSUB f1, f17, f1
  2751. #endif
  2752. #ifdef LN
  2753. LFD f19, 3 * SIZE(AO)
  2754. LFD f20, 2 * SIZE(AO)
  2755. LFD f21, 0 * SIZE(AO)
  2756. FMUL f1, f19, f1 /* f1 *= AO[3] */
  2757. FNMSUB f0, f20, f1, f0 /* f0 -= AO[2] * f1 */
  2758. FMUL f0, f21, f0 /* f0 *= AO[0] */
  2759. #endif
  2760. #ifdef LT
  2761. LFD f16, 0 * SIZE(AO)
  2762. LFD f17, 1 * SIZE(AO)
  2763. FMUL f0, f16, f0 /* f0 *= AO[0] */
  2764. FNMSUB f1, f17, f0, f1 /* f1 -= AO[1] * f0 */
  2765. LFD f17, 3 * SIZE(AO)
  2766. FMUL f1, f17, f1 /* f1 *= AO[3] */
  2767. #endif
  2768. #ifdef RN
  2769. LFD f16, 0 * SIZE(BO) /* RN: one scale factor from BO[0] */
  2770. FMUL f0, f16, f0
  2771. FMUL f1, f16, f1
  2772. #endif
  2773. #ifdef RT
  2774. LFD f21, 0 * SIZE(BO) /* RT: one scale factor from BO[0] */
  2775. FMUL f0, f21, f0
  2776. FMUL f1, f21, f1
  2777. #endif
  2778. #ifdef LN
  2779. subi CO1, CO1, 2 * SIZE /* LN steps CO1 back before storing */
  2780. #endif
  2781. #if defined(LN) || defined(LT)
  2782. STFD f0, 0 * SIZE(BO) /* write the results back over the values read from BO */
  2783. STFD f1, 1 * SIZE(BO)
  2784. #else
  2785. STFD f0, 0 * SIZE(AO) /* write the results back over the values read from AO */
  2786. STFD f1, 1 * SIZE(AO)
  2787. #endif
  2788. STFD f0, 0 * SIZE(CO1) /* and store them to the output at CO1 */
  2789. STFD f1, 1 * SIZE(CO1)
  2790. lfs f0, FZERO /* reset accumulators f0..f3 for the next tile */
  2791. fmr f1, f0
  2792. fmr f2, f0
  2793. fmr f3, f0
  2794. #ifndef LN
  2795. addi CO1, CO1, 2 * SIZE /* all variants except LN step CO1 forward after storing */
  2796. #endif
  2797. #ifdef RT
  2798. slwi r0, K, 1 + BASE_SHIFT
  2799. add AORIG, AORIG, r0 /* AORIG += K << (1 + BASE_SHIFT) */
  2800. #endif
  2801. #if defined(LT) || defined(RN)
  2802. sub TEMP, K, KK
  2803. slwi r0, TEMP, 1 + BASE_SHIFT
  2804. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2805. add AO, AO, r0 /* AO += (K - KK) << (1 + BASE_SHIFT) */
  2806. add BO, BO, TEMP /* BO += (K - KK) << BASE_SHIFT */
  2807. #endif
  2808. #ifdef LN
  2809. subi KK, KK, 2
  2810. #endif
  2811. #ifdef LT
  2812. addi KK, KK, 2
  2813. #endif
  2814. .align 4
  /* LL(90): set-up for the final single-row tile of the single column,
     taken when M is odd. Positions AO/BO, preloads four consecutive values
     from each of A and B (four steps' worth for a 1x1 tile), and loads CTR
     with the number of 8-step groups. */
  2815. LL(90):
  2816. andi. I, M, 1
  2817. ble LL(999) /* M even: no 1-row tile, go to the exit code */
  2818. #if defined(LT) || defined(RN)
  2819. LFD f16, 0 * SIZE(AO)
  2820. LFD f17, 1 * SIZE(AO)
  2821. LFD f18, 2 * SIZE(AO)
  2822. LFD f19, 3 * SIZE(AO)
  2823. LFD f20, 0 * SIZE(B)
  2824. LFD f21, 1 * SIZE(B)
  2825. LFD f22, 2 * SIZE(B)
  2826. LFD f23, 3 * SIZE(B)
  2827. srawi. r0, KK, 3 /* r0 = KK >> 3; CR0 feeds the ble after the #endif */
  2828. mtspr CTR, r0
  2829. mr BO, B
  2830. #else
  2831. #ifdef LN
  2832. slwi r0, K, BASE_SHIFT
  2833. sub AORIG, AORIG, r0 /* AORIG -= K << BASE_SHIFT */
  2834. #endif
  2835. slwi r0, KK, 0 + BASE_SHIFT
  2836. slwi TEMP, KK, 0 + BASE_SHIFT
  2837. add AO, AORIG, r0 /* AO = AORIG + (KK << BASE_SHIFT) */
  2838. add BO, B, TEMP /* BO = B + (KK << BASE_SHIFT) */
  2839. sub TEMP, K, KK /* TEMP = K - KK: step count for the LN/RT variants */
  2840. LFD f16, 0 * SIZE(AO)
  2841. LFD f17, 1 * SIZE(AO)
  2842. LFD f18, 2 * SIZE(AO)
  2843. LFD f19, 3 * SIZE(AO)
  2844. LFD f20, 0 * SIZE(BO)
  2845. LFD f21, 1 * SIZE(BO)
  2846. LFD f22, 2 * SIZE(BO)
  2847. LFD f23, 3 * SIZE(BO)
  2848. srawi. r0, TEMP, 3 /* r0 = TEMP >> 3; CR0 feeds the ble after the #endif */
  2849. mtspr CTR, r0
  2850. #endif
  2851. ble LL(95) /* no full group of 8 steps: go to the remainder handling */
  2852. .align 5
  /* LL(92): inner loop of the 1-row tile, eight steps per iteration.
     Four independent partial sums are kept in f0..f3 (one per step within a
     group of four); they are added together at LL(98). Values for the next
     iteration are loaded before AO/BO are advanced. */
  2853. LL(92):
  2854. FMADD f0, f16, f20, f0 /* steps 0..3, one per accumulator */
  2855. FMADD f1, f17, f21, f1
  2856. FMADD f2, f18, f22, f2
  2857. FMADD f3, f19, f23, f3
  2858. LFD f16, 4 * SIZE(AO)
  2859. LFD f17, 5 * SIZE(AO)
  2860. LFD f18, 6 * SIZE(AO)
  2861. LFD f19, 7 * SIZE(AO)
  2862. LFD f20, 4 * SIZE(BO)
  2863. LFD f21, 5 * SIZE(BO)
  2864. LFD f22, 6 * SIZE(BO)
  2865. LFD f23, 7 * SIZE(BO)
  2866. FMADD f0, f16, f20, f0 /* steps 4..7 */
  2867. FMADD f1, f17, f21, f1
  2868. FMADD f2, f18, f22, f2
  2869. FMADD f3, f19, f23, f3
  2870. LFD f16, 8 * SIZE(AO) /* preload A and B for the next iteration */
  2871. LFD f17, 9 * SIZE(AO)
  2872. LFD f18, 10 * SIZE(AO)
  2873. LFD f19, 11 * SIZE(AO)
  2874. LFD f20, 8 * SIZE(BO)
  2875. LFD f21, 9 * SIZE(BO)
  2876. LFD f22, 10 * SIZE(BO)
  2877. LFD f23, 11 * SIZE(BO)
  2878. addi AO, AO, 8 * SIZE /* 8 steps * 1 A value consumed */
  2879. addi BO, BO, 8 * SIZE /* 8 steps * 1 B value consumed */
  2880. bdnz LL(92) /* decrement CTR, loop while it is non-zero */
  2881. .align 4
  /* LL(95): load CTR with the 0..7 steps left over after the 8-step groups
     of the 1-row tile. */
  2882. LL(95):
  2883. #if defined(LT) || defined(RN)
  2884. andi. r0, KK, 7 /* remainder of the KK-based step count */
  2885. #else
  2886. andi. r0, TEMP, 7 /* remainder of the (K - KK)-based step count */
  2887. #endif
  2888. mtspr CTR, r0
  2889. ble+ LL(98) /* nothing left: skip the remainder loop (branch hinted with '+') */
  2890. .align 4
  /* LL(96): remainder loop of the 1-row tile, one step per iteration:
     f0 += f16 * f20, then load the next A value and the next B value. */
  2891. LL(96):
  2892. FMADD f0, f16, f20, f0
  2893. LFD f16, 1 * SIZE(AO)
  2894. LFD f20, 1 * SIZE(BO)
  2895. addi BO, BO, 1 * SIZE
  2896. addi AO, AO, 1 * SIZE
  2897. bdnz LL(96) /* decrement CTR, loop while it is non-zero */
  2898. .align 4
  /* LL(98): finish the 1-row tile of the single column.
     Sums the four partial accumulators into f0, subtracts that from the
     stored value, multiplies by one factor (from AO for LN/LT, from BO for
     RN/RT; FMUL suggests it is stored already inverted - TODO confirm),
     stores the result to the packed buffer and to CO1, and advances pointers
     and KK. Execution then falls through into LL(999). */
  2899. LL(98):
  2900. FADD f0, f1, f0 /* f0 = f0 + f1 + f2 + f3 */
  2901. FADD f2, f3, f2
  2902. FADD f0, f2, f0
  2903. #if defined(LN) || defined(RT)
  2904. #ifdef LN
  2905. subi r0, KK, 1
  2906. #else
  2907. subi r0, KK, 1 /* same as the LN branch: both use KK - 1 for the 1x1 tile */
  2908. #endif
  2909. slwi TEMP, r0, 0 + BASE_SHIFT
  2910. slwi r0, r0, 0 + BASE_SHIFT
  2911. add AO, AORIG, TEMP /* AO = AORIG + (r0 << BASE_SHIFT) */
  2912. add BO, B, r0 /* BO = B + (r0 << BASE_SHIFT) */
  2913. #endif
  2914. #if defined(LN) || defined(LT)
  2915. LFD f16, 0 * SIZE(BO)
  2916. FSUB f0, f16, f0 /* f0 = stored value - accumulated product */
  2917. #else
  2918. LFD f16, 0 * SIZE(AO)
  2919. FSUB f0, f16, f0 /* f0 = stored value - accumulated product */
  2920. #endif
  2921. #ifdef LN
  2922. LFD f21, 0 * SIZE(AO)
  2923. FMUL f0, f21, f0 /* f0 *= AO[0] */
  2924. #endif
  2925. #ifdef LT
  2926. LFD f16, 0 * SIZE(AO)
  2927. FMUL f0, f16, f0 /* f0 *= AO[0] */
  2928. #endif
  2929. #ifdef RN
  2930. LFD f16, 0 * SIZE(BO)
  2931. FMUL f0, f16, f0 /* f0 *= BO[0] */
  2932. #endif
  2933. #ifdef RT
  2934. LFD f21, 0 * SIZE(BO)
  2935. FMUL f0, f21, f0 /* f0 *= BO[0] */
  2936. #endif
  2937. #ifdef LN
  2938. subi CO1, CO1, 1 * SIZE /* LN steps CO1 back before storing */
  2939. #endif
  2940. #if defined(LN) || defined(LT)
  2941. STFD f0, 0 * SIZE(BO) /* write the result back over the value read from BO */
  2942. #else
  2943. STFD f0, 0 * SIZE(AO) /* write the result back over the value read from AO */
  2944. #endif
  2945. STFD f0, 0 * SIZE(CO1) /* and store it to the output at CO1 */
  2946. #ifndef LN
  2947. addi CO1, CO1, 1 * SIZE
  2948. #endif
  2949. #ifdef RT
  2950. slwi r0, K, 0 + BASE_SHIFT
  2951. add AORIG, AORIG, r0 /* AORIG += K << BASE_SHIFT */
  2952. #endif
  2953. #if defined(LT) || defined(RN)
  2954. sub TEMP, K, KK
  2955. slwi r0, TEMP, 0 + BASE_SHIFT
  2956. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2957. add AO, AO, r0 /* AO += (K - KK) << BASE_SHIFT */
  2958. add BO, BO, TEMP /* BO += (K - KK) << BASE_SHIFT */
  2959. #endif
  2960. #ifdef LN
  2961. subi KK, KK, 1
  2962. #endif
  2963. #ifdef LT
  2964. addi KK, KK, 1
  2965. #endif
  2966. .align 4
  /* LL(999): common exit. Sets r3 to 0, reloads f14..f31 and r18..r31 from
     the stack frame (presumably saved there by the prologue, which is
     outside this chunk), releases STACKSIZE bytes of stack and returns. */
  2967. LL(999):
  2968. addi r3, 0, 0 /* r3 = 0 */
  2969. lfd f14, 0(SP) /* floating-point registers occupy offsets 0..136 */
  2970. lfd f15, 8(SP)
  2971. lfd f16, 16(SP)
  2972. lfd f17, 24(SP)
  2973. lfd f18, 32(SP)
  2974. lfd f19, 40(SP)
  2975. lfd f20, 48(SP)
  2976. lfd f21, 56(SP)
  2977. lfd f22, 64(SP)
  2978. lfd f23, 72(SP)
  2979. lfd f24, 80(SP)
  2980. lfd f25, 88(SP)
  2981. lfd f26, 96(SP)
  2982. lfd f27, 104(SP)
  2983. lfd f28, 112(SP)
  2984. lfd f29, 120(SP)
  2985. lfd f30, 128(SP)
  2986. lfd f31, 136(SP)
  2987. #ifdef __64BIT__
  2988. ld r31, 144(SP) /* 64-bit build: 8-byte slots starting at offset 144 */
  2989. ld r30, 152(SP)
  2990. ld r29, 160(SP)
  2991. ld r28, 168(SP)
  2992. ld r27, 176(SP)
  2993. ld r26, 184(SP)
  2994. ld r25, 192(SP)
  2995. ld r24, 200(SP)
  2996. ld r23, 208(SP)
  2997. ld r22, 216(SP)
  2998. ld r21, 224(SP)
  2999. ld r20, 232(SP)
  3000. ld r19, 240(SP)
  3001. ld r18, 248(SP)
  3002. #else
  3003. lwz r31, 144(SP) /* 32-bit build: 4-byte slots starting at offset 144 */
  3004. lwz r30, 148(SP)
  3005. lwz r29, 152(SP)
  3006. lwz r28, 156(SP)
  3007. lwz r27, 160(SP)
  3008. lwz r26, 164(SP)
  3009. lwz r25, 168(SP)
  3010. lwz r24, 172(SP)
  3011. lwz r23, 176(SP)
  3012. lwz r22, 180(SP)
  3013. lwz r21, 184(SP)
  3014. lwz r20, 188(SP)
  3015. lwz r19, 192(SP)
  3016. lwz r18, 196(SP)
  3017. #endif
  3018. addi SP, SP, STACKSIZE /* pop the frame */
  3019. blr /* return to the caller */
  3020. EPILOGUE
  3021. #endif