You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_power6_LN.S 63 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ------------------------------------------------------------------ */
  /* Build configuration, stack-frame layout and register aliases used   */
  /* by the kernel below.                                                */
  /* ------------------------------------------------------------------ */
  /* ASSEMBLER is defined before common.h is pulled in; presumably the   */
  /* header keys off it to expose only assembler-safe definitions --     */
  /* TODO confirm against common.h.                                      */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic matching the ABI word size              */
  /* (lwz for 32-bit builds, ld for 64-bit builds).                      */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* STACKSIZE is the number of bytes the prologue subtracts from SP.    */
  /* ALPHA and FZERO are SP-relative slots inside that frame. The        */
  /* prologue stores an integer zero to FZERO and the kernel reloads it  */
  /* with lfs to obtain floating-point 0.0. ALPHA is not referenced in   */
  /* the visible part of this file.                                      */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Incoming arguments. M, N and K are read by the prologue without     */
  /* being loaded first, so they arrive in r3-r5 on every configuration. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* The registers holding A, B, C, LDC and OFFSET depend on the         */
  /* platform, the ABI width and (on AIX/Apple 32-bit) on DOUBLE. In     */
  /* several configurations the prologue reloads LDC and/or OFFSET from  */
  /* the caller's frame (offsets relative to STACKSIZE(SP)), so the      */
  /* alias then only names the destination register.                     */
  /* NOTE(review): if neither `linux` nor `_AIX`/`__APPLE__` is defined, */
  /* A, B, C, LDC and OFFSET are left undefined -- there is no #else or  */
  /* #error fallback here.                                               */
  57. #ifdef linux
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Work registers. r18-r31 are all saved to the frame by the prologue; */
  /* r30 is saved as well but has no alias in this list.                 */
  /*   AORIG   - copy of A taken at the top of each column panel         */
  /*             (LN/RT variants)                                        */
  /*   TEMP    - scratch for address and trip-count arithmetic           */
  /*   KK      - running offset counter derived from OFFSET              */
  /*   I, J    - row-block / column-panel counters derived from M and N  */
  /*   AO, BO  - running pointers into A and B inside the inner loops    */
  /*   CO1-CO4 - four output pointers: C, C+LDC, C+2*LDC, C+3*LDC        */
  /*   PREA    - byte offset (16 * 3 * SIZE) used as the index operand   */
  /*             of dcbt/dcbtst on AO and BO                             */
  /*   PREC    - byte offset (-4 * SIZE) used as the index operand of    */
  /*             dcbtst on CO1-CO4                                       */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define PREA r29
  99. #define PREC r31
  100. #ifndef NEEDPARAM
  101. PROLOGUE
  102. PROFCODE
  103. addi SP, SP, -STACKSIZE
  104. li r0, 0
  105. stfd f14, 0(SP)
  106. stfd f15, 8(SP)
  107. stfd f16, 16(SP)
  108. stfd f17, 24(SP)
  109. stfd f18, 32(SP)
  110. stfd f19, 40(SP)
  111. stfd f20, 48(SP)
  112. stfd f21, 56(SP)
  113. stfd f22, 64(SP)
  114. stfd f23, 72(SP)
  115. stfd f24, 80(SP)
  116. stfd f25, 88(SP)
  117. stfd f26, 96(SP)
  118. stfd f27, 104(SP)
  119. stfd f28, 112(SP)
  120. stfd f29, 120(SP)
  121. stfd f30, 128(SP)
  122. stfd f31, 136(SP)
  123. #ifdef __64BIT__
  124. std r31, 144(SP)
  125. std r30, 152(SP)
  126. std r29, 160(SP)
  127. std r28, 168(SP)
  128. std r27, 176(SP)
  129. std r26, 184(SP)
  130. std r25, 192(SP)
  131. std r24, 200(SP)
  132. std r23, 208(SP)
  133. std r22, 216(SP)
  134. std r21, 224(SP)
  135. std r20, 232(SP)
  136. std r19, 240(SP)
  137. std r18, 248(SP)
  138. #else
  139. stw r31, 144(SP)
  140. stw r30, 148(SP)
  141. stw r29, 152(SP)
  142. stw r28, 156(SP)
  143. stw r27, 160(SP)
  144. stw r26, 164(SP)
  145. stw r25, 168(SP)
  146. stw r24, 172(SP)
  147. stw r23, 176(SP)
  148. stw r22, 180(SP)
  149. stw r21, 184(SP)
  150. stw r20, 188(SP)
  151. stw r19, 192(SP)
  152. stw r18, 196(SP)
  153. #endif
  154. stw r0, FZERO
  155. #if defined(_AIX) || defined(__APPLE__)
  156. #if !defined(__64BIT__) && defined(DOUBLE)
  157. lwz LDC, 56 + STACKSIZE(SP)
  158. #endif
  159. #endif
  160. slwi LDC, LDC, BASE_SHIFT
  161. #if defined(linux) && defined(__64BIT__)
  162. ld OFFSET, 112 + STACKSIZE(SP)
  163. #endif
  164. #if defined(_AIX) || defined(__APPLE__)
  165. #ifdef __64BIT__
  166. ld OFFSET, 112 + STACKSIZE(SP)
  167. #else
  168. #ifdef DOUBLE
  169. lwz OFFSET, 60 + STACKSIZE(SP)
  170. #else
  171. lwz OFFSET, 56 + STACKSIZE(SP)
  172. #endif
  173. #endif
  174. #endif
  175. #ifdef LN
  176. mullw r0, M, K
  177. slwi r0, r0, BASE_SHIFT
  178. add A, A, r0
  179. slwi r0, M, BASE_SHIFT
  180. add C, C, r0
  181. #endif
  182. #ifdef RN
  183. neg KK, OFFSET
  184. #endif
  185. #ifdef RT
  186. mullw r0, N, K
  187. slwi r0, r0, BASE_SHIFT
  188. add B, B, r0
  189. mullw r0, N, LDC
  190. add C, C, r0
  191. sub KK, N, OFFSET
  192. #endif
  193. cmpwi cr0, M, 0
  194. ble LL(999)
  195. cmpwi cr0, N, 0
  196. ble LL(999)
  197. cmpwi cr0, K, 0
  198. ble LL(999)
  199. li PREA, (16 * 3 * SIZE)
  200. li PREC, -4 * SIZE
  201. lfs f0, FZERO
  202. srawi. J, N, 2
  203. ble LL(40)
  204. .align 4
  205. LL(10):
  206. #ifdef RT
  207. slwi r0, K, 2 + BASE_SHIFT
  208. sub B, B, r0
  209. slwi r0, LDC, 2
  210. sub C, C, r0
  211. #endif
  212. mr CO1, C
  213. add CO2, C, LDC
  214. add CO3, CO2, LDC
  215. add CO4, CO3, LDC
  216. #ifdef LN
  217. add KK, M, OFFSET
  218. #endif
  219. #ifdef LT
  220. mr KK, OFFSET
  221. #endif
  222. fmr f1, f0
  223. fmr f2, f0
  224. fmr f3, f0
  225. fmr f4, f0
  226. fmr f5, f0
  227. fmr f6, f0
  228. fmr f7, f0
  229. fmr f8, f0
  230. fmr f9, f0
  231. fmr f10, f0
  232. fmr f11, f0
  233. fmr f12, f0
  234. fmr f13, f0
  235. fmr f14, f0
  236. fmr f15, f0
  237. #if defined(LN) || defined(RT)
  238. mr AORIG, A
  239. #else
  240. mr AO, A
  241. #endif
  242. #ifndef RT
  243. add C, CO4, LDC
  244. #endif
  245. LL(30):
  246. andi. I, M, 1
  247. ble LL(20)
  248. #if defined(LT) || defined(RN)
  249. LFD f16, 0 * SIZE(AO)
  250. LFD f17, 1 * SIZE(AO)
  251. LFD f18, 2 * SIZE(AO)
  252. LFD f19, 3 * SIZE(AO)
  253. LFD f20, 0 * SIZE(B)
  254. LFD f21, 1 * SIZE(B)
  255. LFD f22, 2 * SIZE(B)
  256. LFD f23, 3 * SIZE(B)
  257. LFD f24, 4 * SIZE(B)
  258. LFD f25, 5 * SIZE(B)
  259. LFD f26, 6 * SIZE(B)
  260. LFD f27, 7 * SIZE(B)
  261. srawi. r0, KK, 2
  262. mtspr CTR, r0
  263. mr BO, B
  264. #else
  265. #ifdef LN
  266. slwi r0, K, BASE_SHIFT
  267. sub AORIG, AORIG, r0
  268. #endif
  269. slwi r0, KK, 0 + BASE_SHIFT
  270. slwi TEMP, KK, 2 + BASE_SHIFT
  271. add AO, AORIG, r0
  272. add BO, B, TEMP
  273. sub TEMP, K, KK
  274. LFD f16, 0 * SIZE(AO)
  275. LFD f17, 1 * SIZE(AO)
  276. LFD f18, 2 * SIZE(AO)
  277. LFD f19, 3 * SIZE(AO)
  278. LFD f20, 0 * SIZE(BO)
  279. LFD f21, 1 * SIZE(BO)
  280. LFD f22, 2 * SIZE(BO)
  281. LFD f23, 3 * SIZE(BO)
  282. LFD f24, 4 * SIZE(BO)
  283. LFD f25, 5 * SIZE(BO)
  284. LFD f26, 6 * SIZE(BO)
  285. LFD f27, 7 * SIZE(BO)
  286. srawi. r0, TEMP, 2
  287. mtspr CTR, r0
  288. #endif
  289. ble LL(35)
  290. .align 5
  291. LL(32):
  292. FMADD f0, f16, f20, f0
  293. FMADD f4, f16, f21, f4
  294. FMADD f8, f16, f22, f8
  295. FMADD f12, f16, f23, f12
  296. LFD f20, 8 * SIZE(BO)
  297. LFD f21, 9 * SIZE(BO)
  298. LFD f22, 10 * SIZE(BO)
  299. LFD f23, 11 * SIZE(BO)
  300. FMADD f1, f17, f24, f1
  301. FMADD f5, f17, f25, f5
  302. FMADD f9, f17, f26, f9
  303. FMADD f13, f17, f27, f13
  304. LFD f24, 12 * SIZE(BO)
  305. LFD f25, 13 * SIZE(BO)
  306. LFD f26, 14 * SIZE(BO)
  307. LFD f27, 15 * SIZE(BO)
  308. FMADD f0, f18, f20, f0
  309. FMADD f4, f18, f21, f4
  310. FMADD f8, f18, f22, f8
  311. FMADD f12, f18, f23, f12
  312. LFD f20, 16 * SIZE(BO)
  313. LFD f21, 17 * SIZE(BO)
  314. LFD f22, 18 * SIZE(BO)
  315. LFD f23, 19 * SIZE(BO)
  316. FMADD f1, f19, f24, f1
  317. FMADD f5, f19, f25, f5
  318. FMADD f9, f19, f26, f9
  319. FMADD f13, f19, f27, f13
  320. LFD f16, 4 * SIZE(AO)
  321. LFD f17, 5 * SIZE(AO)
  322. LFD f18, 6 * SIZE(AO)
  323. LFD f19, 7 * SIZE(AO)
  324. LFD f24, 20 * SIZE(BO)
  325. LFD f25, 21 * SIZE(BO)
  326. LFD f26, 22 * SIZE(BO)
  327. LFD f27, 23 * SIZE(BO)
  328. addi AO, AO, 4 * SIZE
  329. addi BO, BO, 16 * SIZE
  330. dcbtst AO, PREA
  331. bdnz LL(32)
  332. fadd f0, f1, f0
  333. fadd f4, f5, f4
  334. fadd f8, f9, f8
  335. fadd f12, f13, f12
  336. .align 4
  337. LL(35):
  338. #if defined(LT) || defined(RN)
  339. andi. r0, KK, 3
  340. #else
  341. andi. r0, TEMP, 3
  342. #endif
  343. mtspr CTR, r0
  344. ble+ LL(38)
  345. .align 4
  346. LL(36):
  347. FMADD f0, f16, f20, f0
  348. FMADD f4, f16, f21, f4
  349. FMADD f8, f16, f22, f8
  350. FMADD f12, f16, f23, f12
  351. LFD f16, 1 * SIZE(AO)
  352. LFD f20, 4 * SIZE(BO)
  353. LFD f21, 5 * SIZE(BO)
  354. LFD f22, 6 * SIZE(BO)
  355. LFD f23, 7 * SIZE(BO)
  356. addi BO, BO, 4 * SIZE
  357. addi AO, AO, 1 * SIZE
  358. bdnz LL(36)
  359. .align 4
  360. LL(38):
  361. #if defined(LN) || defined(RT)
  362. #ifdef LN
  363. subi r0, KK, 1
  364. #else
  365. subi r0, KK, 4
  366. #endif
  367. slwi TEMP, r0, 0 + BASE_SHIFT
  368. slwi r0, r0, 2 + BASE_SHIFT
  369. add AO, AORIG, TEMP
  370. add BO, B, r0
  371. #endif
  372. #if defined(LN) || defined(LT)
  373. LFD f16, 0 * SIZE(BO)
  374. LFD f17, 1 * SIZE(BO)
  375. LFD f18, 2 * SIZE(BO)
  376. LFD f19, 3 * SIZE(BO)
  377. FSUB f0, f16, f0
  378. FSUB f4, f17, f4
  379. FSUB f8, f18, f8
  380. FSUB f12, f19, f12
  381. #else
  382. LFD f16, 0 * SIZE(AO)
  383. LFD f20, 1 * SIZE(AO)
  384. LFD f24, 2 * SIZE(AO)
  385. LFD f28, 3 * SIZE(AO)
  386. FSUB f0, f16, f0
  387. FSUB f4, f20, f4
  388. FSUB f8, f24, f8
  389. FSUB f12, f28, f12
  390. #endif
  391. #ifdef LN
  392. LFD f21, 0 * SIZE(AO)
  393. FMUL f0, f21, f0
  394. FMUL f4, f21, f4
  395. FMUL f8, f21, f8
  396. FMUL f12, f21, f12
  397. #endif
  398. #ifdef LT
  399. LFD f16, 0 * SIZE(AO)
  400. FMUL f0, f16, f0
  401. FMUL f4, f16, f4
  402. FMUL f8, f16, f8
  403. FMUL f12, f16, f12
  404. #endif
  405. #ifdef RN
  406. LFD f16, 0 * SIZE(BO)
  407. LFD f17, 1 * SIZE(BO)
  408. LFD f18, 2 * SIZE(BO)
  409. LFD f19, 3 * SIZE(BO)
  410. FMUL f0, f16, f0
  411. FNMSUB f4, f17, f0, f4
  412. FNMSUB f8, f18, f0, f8
  413. FNMSUB f12, f19, f0, f12
  414. LFD f16, 5 * SIZE(BO)
  415. LFD f17, 6 * SIZE(BO)
  416. LFD f18, 7 * SIZE(BO)
  417. LFD f19, 10 * SIZE(BO)
  418. LFD f20, 11 * SIZE(BO)
  419. LFD f21, 15 * SIZE(BO)
  420. FMUL f4, f16, f4
  421. FNMSUB f8, f17, f4, f8
  422. FNMSUB f12, f18, f4, f12
  423. FMUL f8, f19, f8
  424. FNMSUB f12, f20, f8, f12
  425. FMUL f12, f21, f12
  426. #endif
  427. #ifdef RT
  428. LFD f16, 15 * SIZE(BO)
  429. LFD f17, 14 * SIZE(BO)
  430. LFD f18, 13 * SIZE(BO)
  431. LFD f19, 12 * SIZE(BO)
  432. FMUL f12, f16, f12
  433. FNMSUB f8, f17, f12, f8
  434. FNMSUB f4, f18, f12, f4
  435. FNMSUB f0, f19, f12, f0
  436. LFD f16, 10 * SIZE(BO)
  437. LFD f17, 9 * SIZE(BO)
  438. LFD f18, 8 * SIZE(BO)
  439. LFD f19, 5 * SIZE(BO)
  440. FMUL f8, f16, f8
  441. LFD f20, 4 * SIZE(BO)
  442. LFD f21, 0 * SIZE(BO)
  443. FNMSUB f4, f17, f8, f4
  444. FNMSUB f0, f18, f8, f0
  445. FMUL f4, f19, f4
  446. FNMSUB f0, f20, f4, f0
  447. FMUL f0, f21, f0
  448. #endif
  449. #ifdef LN
  450. subi CO1, CO1, 1 * SIZE
  451. subi CO2, CO2, 1 * SIZE
  452. subi CO3, CO3, 1 * SIZE
  453. subi CO4, CO4, 1 * SIZE
  454. #endif
  455. #if defined(LN) || defined(LT)
  456. STFD f0, 0 * SIZE(BO)
  457. STFD f4, 1 * SIZE(BO)
  458. STFD f8, 2 * SIZE(BO)
  459. STFD f12, 3 * SIZE(BO)
  460. #else
  461. STFD f0, 0 * SIZE(AO)
  462. STFD f4, 1 * SIZE(AO)
  463. STFD f8, 2 * SIZE(AO)
  464. STFD f12, 3 * SIZE(AO)
  465. #endif
  466. STFD f0, 0 * SIZE(CO1)
  467. STFD f4, 0 * SIZE(CO2)
  468. STFD f8, 0 * SIZE(CO3)
  469. STFD f12, 0 * SIZE(CO4)
  470. lfs f0, FZERO
  471. fmr f1, f0
  472. fmr f4, f0
  473. fmr f5, f0
  474. fmr f8, f0
  475. fmr f9, f0
  476. fmr f12, f0
  477. fmr f13, f0
  478. #ifndef LN
  479. addi CO1, CO1, 1 * SIZE
  480. addi CO2, CO2, 1 * SIZE
  481. addi CO3, CO3, 1 * SIZE
  482. addi CO4, CO4, 1 * SIZE
  483. #endif
  484. #ifdef RT
  485. slwi r0, K, 0 + BASE_SHIFT
  486. add AORIG, AORIG, r0
  487. #endif
  488. #if defined(LT) || defined(RN)
  489. sub TEMP, K, KK
  490. slwi r0, TEMP, 0 + BASE_SHIFT
  491. slwi TEMP, TEMP, 2 + BASE_SHIFT
  492. add AO, AO, r0
  493. add BO, BO, TEMP
  494. #endif
  495. #ifdef LN
  496. subi KK, KK, 1
  497. #endif
  498. #ifdef LT
  499. addi KK, KK, 1
  500. #endif
  501. .align 4
  502. LL(20):
  503. andi. I, M, 2
  504. ble LL(09)
  505. #if defined(LT) || defined(RN)
  506. LFD f16, 0 * SIZE(AO)
  507. LFD f17, 1 * SIZE(AO)
  508. LFD f18, 2 * SIZE(AO)
  509. LFD f19, 3 * SIZE(AO)
  510. LFD f20, 0 * SIZE(B)
  511. LFD f21, 1 * SIZE(B)
  512. LFD f22, 2 * SIZE(B)
  513. LFD f23, 3 * SIZE(B)
  514. LFD f24, 4 * SIZE(B)
  515. LFD f25, 5 * SIZE(B)
  516. LFD f26, 6 * SIZE(B)
  517. LFD f27, 7 * SIZE(B)
  518. srawi. r0, KK, 2
  519. mtspr CTR, r0
  520. mr BO, B
  521. #else
  522. #ifdef LN
  523. slwi r0, K, 1 + BASE_SHIFT
  524. sub AORIG, AORIG, r0
  525. #endif
  526. slwi r0, KK, 1 + BASE_SHIFT
  527. slwi TEMP, KK, 2 + BASE_SHIFT
  528. add AO, AORIG, r0
  529. add BO, B, TEMP
  530. sub TEMP, K, KK
  531. LFD f16, 0 * SIZE(AO)
  532. LFD f17, 1 * SIZE(AO)
  533. LFD f18, 2 * SIZE(AO)
  534. LFD f19, 3 * SIZE(AO)
  535. LFD f20, 0 * SIZE(BO)
  536. LFD f21, 1 * SIZE(BO)
  537. LFD f22, 2 * SIZE(BO)
  538. LFD f23, 3 * SIZE(BO)
  539. LFD f24, 4 * SIZE(BO)
  540. LFD f25, 5 * SIZE(BO)
  541. LFD f26, 6 * SIZE(BO)
  542. LFD f27, 7 * SIZE(BO)
  543. srawi. r0, TEMP, 2
  544. mtspr CTR, r0
  545. #endif
  546. ble LL(25)
  547. .align 5
  548. LL(22):
  549. FMADD f0, f16, f20, f0
  550. FMADD f1, f17, f20, f1
  551. FMADD f4, f16, f21, f4
  552. FMADD f5, f17, f21, f5
  553. FMADD f8, f16, f22, f8
  554. FMADD f9, f17, f22, f9
  555. FMADD f12, f16, f23, f12
  556. FMADD f13, f17, f23, f13
  557. LFD f20, 8 * SIZE(BO)
  558. LFD f21, 9 * SIZE(BO)
  559. LFD f22, 10 * SIZE(BO)
  560. LFD f23, 11 * SIZE(BO)
  561. FMADD f2, f18, f24, f2
  562. FMADD f3, f19, f24, f3
  563. FMADD f6, f18, f25, f6
  564. FMADD f7, f19, f25, f7
  565. FMADD f10, f18, f26, f10
  566. FMADD f11, f19, f26, f11
  567. FMADD f14, f18, f27, f14
  568. FMADD f15, f19, f27, f15
  569. LFD f16, 4 * SIZE(AO)
  570. LFD f17, 5 * SIZE(AO)
  571. LFD f18, 6 * SIZE(AO)
  572. LFD f19, 7 * SIZE(AO)
  573. FMADD f0, f16, f20, f0
  574. FMADD f1, f17, f20, f1
  575. FMADD f4, f16, f21, f4
  576. FMADD f5, f17, f21, f5
  577. LFD f24, 12 * SIZE(BO)
  578. LFD f25, 13 * SIZE(BO)
  579. LFD f26, 14 * SIZE(BO)
  580. LFD f27, 15 * SIZE(BO)
  581. FMADD f8, f16, f22, f8
  582. FMADD f9, f17, f22, f9
  583. FMADD f12, f16, f23, f12
  584. FMADD f13, f17, f23, f13
  585. LFD f20, 16 * SIZE(BO)
  586. LFD f21, 17 * SIZE(BO)
  587. LFD f22, 18 * SIZE(BO)
  588. LFD f23, 19 * SIZE(BO)
  589. FMADD f2, f18, f24, f2
  590. FMADD f3, f19, f24, f3
  591. FMADD f6, f18, f25, f6
  592. FMADD f7, f19, f25, f7
  593. FMADD f10, f18, f26, f10
  594. FMADD f11, f19, f26, f11
  595. FMADD f14, f18, f27, f14
  596. FMADD f15, f19, f27, f15
  597. LFD f16, 8 * SIZE(AO)
  598. LFD f17, 9 * SIZE(AO)
  599. LFD f18, 10 * SIZE(AO)
  600. LFD f19, 11 * SIZE(AO)
  601. LFD f24, 20 * SIZE(BO)
  602. LFD f25, 21 * SIZE(BO)
  603. LFD f26, 22 * SIZE(BO)
  604. LFD f27, 23 * SIZE(BO)
  605. addi AO, AO, 8 * SIZE
  606. addi BO, BO, 16 * SIZE
  607. dcbtst AO, PREA
  608. bdnz LL(22)
  609. fadd f0, f2, f0
  610. fadd f1, f3, f1
  611. fadd f4, f6, f4
  612. fadd f5, f7, f5
  613. fadd f8, f10, f8
  614. fadd f9, f11, f9
  615. fadd f12, f14, f12
  616. fadd f13, f15, f13
  617. .align 4
  618. LL(25):
  619. #if defined(LT) || defined(RN)
  620. andi. r0, KK, 3
  621. #else
  622. andi. r0, TEMP, 3
  623. #endif
  624. mtspr CTR, r0
  625. ble+ LL(28)
  626. .align 4
  627. LL(26):
  628. FMADD f0, f16, f20, f0
  629. FMADD f1, f17, f20, f1
  630. FMADD f4, f16, f21, f4
  631. FMADD f5, f17, f21, f5
  632. FMADD f8, f16, f22, f8
  633. FMADD f9, f17, f22, f9
  634. FMADD f12, f16, f23, f12
  635. FMADD f13, f17, f23, f13
  636. LFD f16, 2 * SIZE(AO)
  637. LFD f17, 3 * SIZE(AO)
  638. LFD f20, 4 * SIZE(BO)
  639. LFD f21, 5 * SIZE(BO)
  640. LFD f22, 6 * SIZE(BO)
  641. LFD f23, 7 * SIZE(BO)
  642. addi BO, BO, 4 * SIZE
  643. addi AO, AO, 2 * SIZE
  644. bdnz LL(26)
  645. .align 4
  646. LL(28):
  647. #if defined(LN) || defined(RT)
  648. #ifdef LN
  649. subi r0, KK, 2
  650. #else
  651. subi r0, KK, 4
  652. #endif
  653. slwi TEMP, r0, 1 + BASE_SHIFT
  654. slwi r0, r0, 2 + BASE_SHIFT
  655. add AO, AORIG, TEMP
  656. add BO, B, r0
  657. #endif
  658. #if defined(LN) || defined(LT)
  659. LFD f16, 0 * SIZE(BO)
  660. LFD f17, 1 * SIZE(BO)
  661. LFD f18, 2 * SIZE(BO)
  662. LFD f19, 3 * SIZE(BO)
  663. LFD f20, 4 * SIZE(BO)
  664. LFD f21, 5 * SIZE(BO)
  665. LFD f22, 6 * SIZE(BO)
  666. LFD f23, 7 * SIZE(BO)
  667. FSUB f0, f16, f0
  668. FSUB f4, f17, f4
  669. FSUB f8, f18, f8
  670. FSUB f12, f19, f12
  671. FSUB f1, f20, f1
  672. FSUB f5, f21, f5
  673. FSUB f9, f22, f9
  674. FSUB f13, f23, f13
  675. #else
  676. LFD f16, 0 * SIZE(AO)
  677. LFD f17, 1 * SIZE(AO)
  678. LFD f20, 2 * SIZE(AO)
  679. LFD f21, 3 * SIZE(AO)
  680. LFD f24, 4 * SIZE(AO)
  681. LFD f25, 5 * SIZE(AO)
  682. LFD f28, 6 * SIZE(AO)
  683. LFD f29, 7 * SIZE(AO)
  684. FSUB f0, f16, f0
  685. FSUB f1, f17, f1
  686. FSUB f4, f20, f4
  687. FSUB f5, f21, f5
  688. FSUB f8, f24, f8
  689. FSUB f9, f25, f9
  690. FSUB f12, f28, f12
  691. FSUB f13, f29, f13
  692. #endif
  693. #ifdef LN
  694. LFD f19, 3 * SIZE(AO)
  695. LFD f20, 2 * SIZE(AO)
  696. LFD f21, 0 * SIZE(AO)
  697. FMUL f1, f19, f1
  698. FMUL f5, f19, f5
  699. FMUL f9, f19, f9
  700. FMUL f13, f19, f13
  701. FNMSUB f0, f20, f1, f0
  702. FNMSUB f4, f20, f5, f4
  703. FNMSUB f8, f20, f9, f8
  704. FNMSUB f12, f20, f13, f12
  705. FMUL f0, f21, f0
  706. FMUL f4, f21, f4
  707. FMUL f8, f21, f8
  708. FMUL f12, f21, f12
  709. #endif
  710. #ifdef LT
  711. LFD f16, 0 * SIZE(AO)
  712. LFD f17, 1 * SIZE(AO)
  713. FMUL f0, f16, f0
  714. FMUL f4, f16, f4
  715. FMUL f8, f16, f8
  716. FMUL f12, f16, f12
  717. FNMSUB f1, f17, f0, f1
  718. FNMSUB f5, f17, f4, f5
  719. FNMSUB f9, f17, f8, f9
  720. FNMSUB f13, f17, f12, f13
  721. LFD f17, 3 * SIZE(AO)
  722. FMUL f1, f17, f1
  723. FMUL f5, f17, f5
  724. FMUL f9, f17, f9
  725. FMUL f13, f17, f13
  726. #endif
  727. #ifdef RN
  728. LFD f16, 0 * SIZE(BO)
  729. LFD f17, 1 * SIZE(BO)
  730. LFD f18, 2 * SIZE(BO)
  731. LFD f19, 3 * SIZE(BO)
  732. FMUL f0, f16, f0
  733. FMUL f1, f16, f1
  734. FNMSUB f4, f17, f0, f4
  735. FNMSUB f5, f17, f1, f5
  736. FNMSUB f8, f18, f0, f8
  737. FNMSUB f9, f18, f1, f9
  738. FNMSUB f12, f19, f0, f12
  739. FNMSUB f13, f19, f1, f13
  740. LFD f16, 5 * SIZE(BO)
  741. LFD f17, 6 * SIZE(BO)
  742. LFD f18, 7 * SIZE(BO)
  743. LFD f19, 10 * SIZE(BO)
  744. LFD f20, 11 * SIZE(BO)
  745. LFD f21, 15 * SIZE(BO)
  746. FMUL f4, f16, f4
  747. FMUL f5, f16, f5
  748. FNMSUB f8, f17, f4, f8
  749. FNMSUB f9, f17, f5, f9
  750. FNMSUB f12, f18, f4, f12
  751. FNMSUB f13, f18, f5, f13
  752. FMUL f8, f19, f8
  753. FMUL f9, f19, f9
  754. FNMSUB f12, f20, f8, f12
  755. FNMSUB f13, f20, f9, f13
  756. FMUL f12, f21, f12
  757. FMUL f13, f21, f13
  758. #endif
  759. #ifdef RT
  760. LFD f16, 15 * SIZE(BO)
  761. LFD f17, 14 * SIZE(BO)
  762. LFD f18, 13 * SIZE(BO)
  763. LFD f19, 12 * SIZE(BO)
  764. FMUL f12, f16, f12
  765. FMUL f13, f16, f13
  766. FNMSUB f8, f17, f12, f8
  767. FNMSUB f9, f17, f13, f9
  768. FNMSUB f4, f18, f12, f4
  769. FNMSUB f5, f18, f13, f5
  770. FNMSUB f0, f19, f12, f0
  771. FNMSUB f1, f19, f13, f1
  772. LFD f16, 10 * SIZE(BO)
  773. LFD f17, 9 * SIZE(BO)
  774. LFD f18, 8 * SIZE(BO)
  775. LFD f19, 5 * SIZE(BO)
  776. LFD f20, 4 * SIZE(BO)
  777. LFD f21, 0 * SIZE(BO)
  778. FMUL f8, f16, f8
  779. FMUL f9, f16, f9
  780. FNMSUB f4, f17, f8, f4
  781. FNMSUB f5, f17, f9, f5
  782. FNMSUB f0, f18, f8, f0
  783. FNMSUB f1, f18, f9, f1
  784. FMUL f4, f19, f4
  785. FMUL f5, f19, f5
  786. FNMSUB f0, f20, f4, f0
  787. FNMSUB f1, f20, f5, f1
  788. FMUL f0, f21, f0
  789. FMUL f1, f21, f1
  790. #endif
  791. #ifdef LN
  792. subi CO1, CO1, 2 * SIZE
  793. subi CO2, CO2, 2 * SIZE
  794. subi CO3, CO3, 2 * SIZE
  795. subi CO4, CO4, 2 * SIZE
  796. #endif
  797. #if defined(LN) || defined(LT)
  798. STFD f0, 0 * SIZE(BO)
  799. STFD f4, 1 * SIZE(BO)
  800. STFD f8, 2 * SIZE(BO)
  801. STFD f12, 3 * SIZE(BO)
  802. STFD f1, 4 * SIZE(BO)
  803. STFD f5, 5 * SIZE(BO)
  804. STFD f9, 6 * SIZE(BO)
  805. STFD f13, 7 * SIZE(BO)
  806. #else
  807. STFD f0, 0 * SIZE(AO)
  808. STFD f1, 1 * SIZE(AO)
  809. STFD f4, 2 * SIZE(AO)
  810. STFD f5, 3 * SIZE(AO)
  811. STFD f8, 4 * SIZE(AO)
  812. STFD f9, 5 * SIZE(AO)
  813. STFD f12, 6 * SIZE(AO)
  814. STFD f13, 7 * SIZE(AO)
  815. #endif
  816. STFD f0, 0 * SIZE(CO1)
  817. STFD f1, 1 * SIZE(CO1)
  818. STFD f4, 0 * SIZE(CO2)
  819. STFD f5, 1 * SIZE(CO2)
  820. STFD f8, 0 * SIZE(CO3)
  821. STFD f9, 1 * SIZE(CO3)
  822. STFD f12, 0 * SIZE(CO4)
  823. STFD f13, 1 * SIZE(CO4)
  824. lfs f0, FZERO
  825. fmr f1, f0
  826. fmr f2, f0
  827. fmr f3, f0
  828. fmr f4, f0
  829. fmr f5, f0
  830. fmr f6, f0
  831. fmr f7, f0
  832. fmr f8, f0
  833. fmr f9, f0
  834. fmr f10, f0
  835. fmr f11, f0
  836. fmr f12, f0
  837. fmr f13, f0
  838. fmr f14, f0
  839. fmr f15, f0
  840. #ifndef LN
  841. addi CO1, CO1, 2 * SIZE
  842. addi CO2, CO2, 2 * SIZE
  843. addi CO3, CO3, 2 * SIZE
  844. addi CO4, CO4, 2 * SIZE
  845. #endif
  846. #ifdef RT
  847. slwi r0, K, 1 + BASE_SHIFT
  848. add AORIG, AORIG, r0
  849. #endif
  850. #if defined(LT) || defined(RN)
  851. sub TEMP, K, KK
  852. slwi r0, TEMP, 1 + BASE_SHIFT
  853. slwi TEMP, TEMP, 2 + BASE_SHIFT
  854. add AO, AO, r0
  855. add BO, BO, TEMP
  856. #endif
  857. #ifdef LN
  858. subi KK, KK, 2
  859. #endif
  860. #ifdef LT
  861. addi KK, KK, 2
  862. #endif
  863. .align 4
  864. LL(09):
  865. srawi. I, M, 2
  866. ble LL(39)
  867. .align 4
  868. LL(11):
  869. #if defined(LT) || defined(RN)
  870. LFD f16, 0 * SIZE(AO)
  871. LFD f17, 1 * SIZE(AO)
  872. LFD f18, 2 * SIZE(AO)
  873. LFD f19, 3 * SIZE(AO)
  874. LFD f20, 0 * SIZE(B)
  875. LFD f21, 1 * SIZE(B)
  876. LFD f22, 2 * SIZE(B)
  877. LFD f23, 3 * SIZE(B)
  878. dcbtst CO1, PREC
  879. dcbtst CO2, PREC
  880. dcbtst CO3, PREC
  881. dcbtst CO4, PREC
  882. srawi. r0, KK, 3
  883. mtspr CTR, r0
  884. mr BO, B
  885. #else
  886. #ifdef LN
  887. slwi r0, K, 2 + BASE_SHIFT
  888. sub AORIG, AORIG, r0
  889. #endif
  890. slwi TEMP, KK, 2 + BASE_SHIFT
  891. add AO, AORIG, TEMP
  892. add BO, B, TEMP
  893. sub TEMP, K, KK
  894. LFD f16, 0 * SIZE(AO)
  895. LFD f17, 1 * SIZE(AO)
  896. LFD f18, 2 * SIZE(AO)
  897. LFD f19, 3 * SIZE(AO)
  898. LFD f20, 0 * SIZE(BO)
  899. LFD f21, 1 * SIZE(BO)
  900. LFD f22, 2 * SIZE(BO)
  901. LFD f23, 3 * SIZE(BO)
  902. dcbtst CO1, PREC
  903. dcbtst CO2, PREC
  904. dcbtst CO3, PREC
  905. dcbtst CO4, PREC
  906. srawi. r0, TEMP, 3
  907. mtspr CTR, r0
  908. #endif
  909. ble LL(15)
  910. .align 4
  911. LL(12):
  912. dcbt AO, PREA
  913. dcbtst BO, PREA
  914. FMADD f0, f16, f20, f0
  915. FMADD f4, f16, f21, f4
  916. FMADD f8, f16, f22, f8
  917. FMADD f12, f16, f23, f12
  918. LFD f24, 4 * SIZE(AO)
  919. LFD f28, 4 * SIZE(BO)
  920. LFD f25, 5 * SIZE(AO)
  921. LFD f29, 5 * SIZE(BO)
  922. FMADD f1, f17, f20, f1
  923. FMADD f5, f17, f21, f5
  924. FMADD f9, f17, f22, f9
  925. FMADD f13, f17, f23, f13
  926. FMADD f2, f18, f20, f2
  927. FMADD f6, f18, f21, f6
  928. FMADD f10, f18, f22, f10
  929. FMADD f14, f18, f23, f14
  930. LFD f26, 6 * SIZE(AO)
  931. LFD f30, 6 * SIZE(BO)
  932. LFD f27, 7 * SIZE(AO)
  933. LFD f31, 7 * SIZE(BO)
  934. FMADD f3, f19, f20, f3
  935. FMADD f7, f19, f21, f7
  936. FMADD f11, f19, f22, f11
  937. FMADD f15, f19, f23, f15
  938. FMADD f0, f24, f28, f0
  939. FMADD f4, f24, f29, f4
  940. FMADD f8, f24, f30, f8
  941. FMADD f12, f24, f31, f12
  942. LFD f16, 8 * SIZE(AO)
  943. LFD f20, 8 * SIZE(BO)
  944. LFD f17, 9 * SIZE(AO)
  945. LFD f21, 9 * SIZE(BO)
  946. FMADD f1, f25, f28, f1
  947. FMADD f5, f25, f29, f5
  948. FMADD f9, f25, f30, f9
  949. FMADD f13, f25, f31, f13
  950. FMADD f2, f26, f28, f2
  951. FMADD f6, f26, f29, f6
  952. FMADD f10, f26, f30, f10
  953. FMADD f14, f26, f31, f14
  954. LFD f18, 10 * SIZE(AO)
  955. LFD f22, 10 * SIZE(BO)
  956. LFD f19, 11 * SIZE(AO)
  957. LFD f23, 11 * SIZE(BO)
  958. FMADD f3, f27, f28, f3
  959. FMADD f7, f27, f29, f7
  960. FMADD f11, f27, f30, f11
  961. FMADD f15, f27, f31, f15
  962. FMADD f0, f16, f20, f0
  963. FMADD f4, f16, f21, f4
  964. FMADD f8, f16, f22, f8
  965. FMADD f12, f16, f23, f12
  966. LFD f24, 12 * SIZE(AO)
  967. LFD f28, 12 * SIZE(BO)
  968. LFD f25, 13 * SIZE(AO)
  969. LFD f29, 13 * SIZE(BO)
  970. FMADD f1, f17, f20, f1
  971. FMADD f5, f17, f21, f5
  972. FMADD f9, f17, f22, f9
  973. FMADD f13, f17, f23, f13
  974. FMADD f2, f18, f20, f2
  975. FMADD f6, f18, f21, f6
  976. FMADD f10, f18, f22, f10
  977. FMADD f14, f18, f23, f14
  978. LFD f26, 14 * SIZE(AO)
  979. LFD f30, 14 * SIZE(BO)
  980. LFD f27, 15 * SIZE(AO)
  981. LFD f31, 15 * SIZE(BO)
  982. FMADD f3, f19, f20, f3
  983. FMADD f7, f19, f21, f7
  984. FMADD f11, f19, f22, f11
  985. FMADD f15, f19, f23, f15
  986. FMADD f0, f24, f28, f0
  987. FMADD f4, f24, f29, f4
  988. FMADD f8, f24, f30, f8
  989. FMADD f12, f24, f31, f12
  990. LFD f16, 16 * SIZE(AO)
  991. LFD f20, 16 * SIZE(BO)
  992. LFD f17, 17 * SIZE(AO)
  993. LFD f21, 17 * SIZE(BO)
  994. FMADD f1, f25, f28, f1
  995. FMADD f5, f25, f29, f5
  996. FMADD f9, f25, f30, f9
  997. FMADD f13, f25, f31, f13
  998. FMADD f2, f26, f28, f2
  999. FMADD f6, f26, f29, f6
  1000. FMADD f10, f26, f30, f10
  1001. FMADD f14, f26, f31, f14
  1002. LFD f18, 18 * SIZE(AO)
  1003. LFD f22, 18 * SIZE(BO)
  1004. LFD f19, 19 * SIZE(AO)
  1005. LFD f23, 19 * SIZE(BO)
  1006. FMADD f3, f27, f28, f3
  1007. FMADD f7, f27, f29, f7
  1008. FMADD f11, f27, f30, f11
  1009. FMADD f15, f27, f31, f15
  1010. FMADD f0, f16, f20, f0
  1011. FMADD f4, f16, f21, f4
  1012. FMADD f8, f16, f22, f8
  1013. FMADD f12, f16, f23, f12
  1014. LFD f24, 20 * SIZE(AO)
  1015. LFD f28, 20 * SIZE(BO)
  1016. LFD f25, 21 * SIZE(AO)
  1017. LFD f29, 21 * SIZE(BO)
  1018. FMADD f1, f17, f20, f1
  1019. FMADD f5, f17, f21, f5
  1020. FMADD f9, f17, f22, f9
  1021. FMADD f13, f17, f23, f13
  1022. FMADD f2, f18, f20, f2
  1023. FMADD f6, f18, f21, f6
  1024. FMADD f10, f18, f22, f10
  1025. FMADD f14, f18, f23, f14
  1026. LFD f26, 22 * SIZE(AO)
  1027. LFD f30, 22 * SIZE(BO)
  1028. LFD f27, 23 * SIZE(AO)
  1029. LFD f31, 23 * SIZE(BO)
  1030. FMADD f3, f19, f20, f3
  1031. FMADD f7, f19, f21, f7
  1032. FMADD f11, f19, f22, f11
  1033. FMADD f15, f19, f23, f15
  1034. FMADD f0, f24, f28, f0
  1035. FMADD f4, f24, f29, f4
  1036. FMADD f8, f24, f30, f8
  1037. FMADD f12, f24, f31, f12
  1038. LFD f16, 24 * SIZE(AO)
  1039. LFD f20, 24 * SIZE(BO)
  1040. LFD f17, 25 * SIZE(AO)
  1041. LFD f21, 25 * SIZE(BO)
  1042. FMADD f1, f25, f28, f1
  1043. FMADD f5, f25, f29, f5
  1044. FMADD f9, f25, f30, f9
  1045. FMADD f13, f25, f31, f13
  1046. FMADD f2, f26, f28, f2
  1047. FMADD f6, f26, f29, f6
  1048. FMADD f10, f26, f30, f10
  1049. FMADD f14, f26, f31, f14
  1050. LFD f18, 26 * SIZE(AO)
  1051. LFD f22, 26 * SIZE(BO)
  1052. LFD f19, 27 * SIZE(AO)
  1053. LFD f23, 27 * SIZE(BO)
  1054. FMADD f3, f27, f28, f3
  1055. FMADD f7, f27, f29, f7
  1056. FMADD f11, f27, f30, f11
  1057. FMADD f15, f27, f31, f15
  1058. FMADD f0, f16, f20, f0
  1059. FMADD f4, f16, f21, f4
  1060. FMADD f8, f16, f22, f8
  1061. FMADD f12, f16, f23, f12
  1062. LFD f24, 28 * SIZE(AO)
  1063. LFD f28, 28 * SIZE(BO)
  1064. LFD f25, 29 * SIZE(AO)
  1065. LFD f29, 29 * SIZE(BO)
  1066. FMADD f1, f17, f20, f1
  1067. FMADD f5, f17, f21, f5
  1068. FMADD f9, f17, f22, f9
  1069. FMADD f13, f17, f23, f13
  1070. FMADD f2, f18, f20, f2
  1071. FMADD f6, f18, f21, f6
  1072. FMADD f10, f18, f22, f10
  1073. FMADD f14, f18, f23, f14
  1074. LFD f26, 30 * SIZE(AO)
  1075. LFD f30, 30 * SIZE(BO)
  1076. LFD f27, 31 * SIZE(AO)
  1077. LFD f31, 31 * SIZE(BO)
  1078. FMADD f3, f19, f20, f3
  1079. FMADD f7, f19, f21, f7
  1080. FMADD f11, f19, f22, f11
  1081. FMADD f15, f19, f23, f15
  1082. FMADD f0, f24, f28, f0
  1083. FMADD f4, f24, f29, f4
  1084. FMADD f8, f24, f30, f8
  1085. FMADD f12, f24, f31, f12
  1086. LFD f16, 32 * SIZE(AO)
  1087. LFD f20, 32 * SIZE(BO)
  1088. LFD f17, 33 * SIZE(AO)
  1089. LFD f21, 33 * SIZE(BO)
  1090. FMADD f1, f25, f28, f1
  1091. FMADD f5, f25, f29, f5
  1092. FMADD f9, f25, f30, f9
  1093. FMADD f13, f25, f31, f13
  1094. FMADD f2, f26, f28, f2
  1095. FMADD f6, f26, f29, f6
  1096. FMADD f10, f26, f30, f10
  1097. FMADD f14, f26, f31, f14
  1098. LFD f18, 34 * SIZE(AO)
  1099. LFD f22, 34 * SIZE(BO)
  1100. LFD f19, 35 * SIZE(AO)
  1101. LFD f23, 35 * SIZE(BO)
  1102. addi AO, AO, 32 * SIZE
  1103. addi BO, BO, 32 * SIZE
  1104. FMADD f3, f27, f28, f3
  1105. FMADD f7, f27, f29, f7
  1106. FMADD f11, f27, f30, f11
  1107. FMADD f15, f27, f31, f15
  1108. bdnz LL(12)
  1109. .align 4
  1110. LL(15):
  1111. #if defined(LT) || defined(RN)
  1112. andi. r0, KK, 7
  1113. #else
  1114. andi. r0, TEMP, 7
  1115. #endif
  1116. mtspr CTR, r0
  1117. ble+ LL(18)
  1118. .align 4
  1119. LL(16):
  1120. FMADD f0, f16, f20, f0
  1121. FMADD f4, f16, f21, f4
  1122. FMADD f8, f16, f22, f8
  1123. FMADD f12, f16, f23, f12
  1124. FMADD f1, f17, f20, f1
  1125. FMADD f5, f17, f21, f5
  1126. FMADD f9, f17, f22, f9
  1127. FMADD f13, f17, f23, f13
  1128. FMADD f2, f18, f20, f2
  1129. FMADD f6, f18, f21, f6
  1130. FMADD f10, f18, f22, f10
  1131. FMADD f14, f18, f23, f14
  1132. FMADD f3, f19, f20, f3
  1133. FMADD f7, f19, f21, f7
  1134. FMADD f11, f19, f22, f11
  1135. FMADD f15, f19, f23, f15
  1136. LFD f16, 4 * SIZE(AO)
  1137. LFD f17, 5 * SIZE(AO)
  1138. LFD f18, 6 * SIZE(AO)
  1139. LFD f19, 7 * SIZE(AO)
  1140. LFD f20, 4 * SIZE(BO)
  1141. LFD f21, 5 * SIZE(BO)
  1142. LFD f22, 6 * SIZE(BO)
  1143. LFD f23, 7 * SIZE(BO)
  1144. addi BO, BO, 4 * SIZE
  1145. addi AO, AO, 4 * SIZE
  1146. bdnz LL(16)
  1147. .align 4
  /* LL(18): finish one 4x4 tile.
     f0-f15 hold the accumulated products, f(i + 4*j) for A index i and
     B index j.  The code below (1) positions AO/BO on the diagonal block for
     LN/RT, (2) subtracts the accumulators from the packed right-hand side,
     (3) runs the 4x4 triangular substitution for the selected variant,
     (4) stores the result to the packed buffer and to C, and (5) updates the
     pointers and KK for the next tile. */
  1148. LL(18):
  1149. #if defined(LN) || defined(RT)
  /* AO = AORIG + (KK - 4) * 4 elements, BO = B + (KK - 4) * 4 elements
     (BASE_SHIFT is presumably log2 of the element size). */
  1150. subi r0, KK, 4
  1151. slwi r0, r0, 2 + BASE_SHIFT
  1152. add AO, AORIG, r0
  1153. add BO, B, r0
  1154. #endif
  1155. #if defined(LN) || defined(LT)
  /* Right-hand side comes from BO; BO[4*i + j] pairs with f(i + 4*j). */
  1156. LFD f16, 0 * SIZE(BO)
  1157. LFD f17, 1 * SIZE(BO)
  1158. LFD f18, 2 * SIZE(BO)
  1159. LFD f19, 3 * SIZE(BO)
  1160. LFD f20, 4 * SIZE(BO)
  1161. LFD f21, 5 * SIZE(BO)
  1162. LFD f22, 6 * SIZE(BO)
  1163. LFD f23, 7 * SIZE(BO)
  1164. LFD f24, 8 * SIZE(BO)
  1165. LFD f25, 9 * SIZE(BO)
  1166. LFD f26, 10 * SIZE(BO)
  1167. LFD f27, 11 * SIZE(BO)
  1168. LFD f28, 12 * SIZE(BO)
  1169. LFD f29, 13 * SIZE(BO)
  1170. LFD f30, 14 * SIZE(BO)
  1171. LFD f31, 15 * SIZE(BO)
  /* f = loaded value - accumulated product. */
  1172. FSUB f0, f16, f0
  1173. FSUB f4, f17, f4
  1174. FSUB f8, f18, f8
  1175. FSUB f12, f19, f12
  1176. FSUB f1, f20, f1
  1177. FSUB f5, f21, f5
  1178. FSUB f9, f22, f9
  1179. FSUB f13, f23, f13
  1180. FSUB f2, f24, f2
  1181. FSUB f6, f25, f6
  1182. FSUB f10, f26, f10
  1183. FSUB f14, f27, f14
  1184. FSUB f3, f28, f3
  1185. FSUB f7, f29, f7
  1186. FSUB f11, f30, f11
  1187. FSUB f15, f31, f15
  1188. #else
  /* Right-hand side comes from AO; AO[k] pairs with f(k). */
  1189. LFD f16, 0 * SIZE(AO)
  1190. LFD f17, 1 * SIZE(AO)
  1191. LFD f18, 2 * SIZE(AO)
  1192. LFD f19, 3 * SIZE(AO)
  1193. LFD f20, 4 * SIZE(AO)
  1194. LFD f21, 5 * SIZE(AO)
  1195. LFD f22, 6 * SIZE(AO)
  1196. LFD f23, 7 * SIZE(AO)
  1197. LFD f24, 8 * SIZE(AO)
  1198. LFD f25, 9 * SIZE(AO)
  1199. LFD f26, 10 * SIZE(AO)
  1200. LFD f27, 11 * SIZE(AO)
  1201. LFD f28, 12 * SIZE(AO)
  1202. LFD f29, 13 * SIZE(AO)
  1203. LFD f30, 14 * SIZE(AO)
  1204. LFD f31, 15 * SIZE(AO)
  1205. FSUB f0, f16, f0
  1206. FSUB f1, f17, f1
  1207. FSUB f2, f18, f2
  1208. FSUB f3, f19, f3
  1209. FSUB f4, f20, f4
  1210. FSUB f5, f21, f5
  1211. FSUB f6, f22, f6
  1212. FSUB f7, f23, f7
  1213. FSUB f8, f24, f8
  1214. FSUB f9, f25, f9
  1215. FSUB f10, f26, f10
  1216. FSUB f11, f27, f11
  1217. FSUB f12, f28, f12
  1218. FSUB f13, f29, f13
  1219. FSUB f14, f30, f14
  1220. FSUB f15, f31, f15
  1221. #endif
  1222. #ifdef LN
  /* LN: substitution from the last A index to the first, using AO[15],
     AO[14..12], AO[10..8], AO[5..4] and AO[0].  The diagonal entries are
     applied with FMUL, so they are presumably stored already inverted by the
     packing code - confirm there.  FNMSUB is presumably c - a*b. */
  1223. LFD f16, 15 * SIZE(AO)
  1224. LFD f17, 14 * SIZE(AO)
  1225. LFD f18, 13 * SIZE(AO)
  1226. LFD f19, 12 * SIZE(AO)
  1227. FMUL f3, f16, f3
  1228. FMUL f7, f16, f7
  1229. FMUL f11, f16, f11
  1230. FMUL f15, f16, f15
  1231. FNMSUB f2, f17, f3, f2
  1232. FNMSUB f6, f17, f7, f6
  1233. FNMSUB f10, f17, f11, f10
  1234. FNMSUB f14, f17, f15, f14
  1235. FNMSUB f1, f18, f3, f1
  1236. FNMSUB f5, f18, f7, f5
  1237. FNMSUB f9, f18, f11, f9
  1238. FNMSUB f13, f18, f15, f13
  1239. FNMSUB f0, f19, f3, f0
  1240. FNMSUB f4, f19, f7, f4
  1241. FNMSUB f8, f19, f11, f8
  1242. FNMSUB f12, f19, f15, f12
  1243. LFD f16, 10 * SIZE(AO)
  1244. LFD f17, 9 * SIZE(AO)
  1245. LFD f18, 8 * SIZE(AO)
  1246. LFD f19, 5 * SIZE(AO)
  1247. FMUL f2, f16, f2
  1248. FMUL f6, f16, f6
  1249. FMUL f10, f16, f10
  1250. FMUL f14, f16, f14
  1251. LFD f20, 4 * SIZE(AO)
  1252. LFD f21, 0 * SIZE(AO)
  1253. FNMSUB f1, f17, f2, f1
  1254. FNMSUB f5, f17, f6, f5
  1255. FNMSUB f9, f17, f10, f9
  1256. FNMSUB f13, f17, f14, f13
  1257. FNMSUB f0, f18, f2, f0
  1258. FNMSUB f4, f18, f6, f4
  1259. FNMSUB f8, f18, f10, f8
  1260. FNMSUB f12, f18, f14, f12
  1261. FMUL f1, f19, f1
  1262. FMUL f5, f19, f5
  1263. FMUL f9, f19, f9
  1264. FMUL f13, f19, f13
  1265. FNMSUB f0, f20, f1, f0
  1266. FNMSUB f4, f20, f5, f4
  1267. FNMSUB f8, f20, f9, f8
  1268. FNMSUB f12, f20, f13, f12
  1269. FMUL f0, f21, f0
  1270. FMUL f4, f21, f4
  1271. FMUL f8, f21, f8
  1272. FMUL f12, f21, f12
  1273. #endif
  1274. #ifdef LT
  /* LT: substitution from the first A index to the last, using AO[0..3],
     AO[5..7], AO[10..11] and AO[15]. */
  1275. LFD f16, 0 * SIZE(AO)
  1276. LFD f17, 1 * SIZE(AO)
  1277. LFD f18, 2 * SIZE(AO)
  1278. LFD f19, 3 * SIZE(AO)
  1279. FMUL f0, f16, f0
  1280. FMUL f4, f16, f4
  1281. FMUL f8, f16, f8
  1282. FMUL f12, f16, f12
  1283. FNMSUB f1, f17, f0, f1
  1284. FNMSUB f5, f17, f4, f5
  1285. FNMSUB f9, f17, f8, f9
  1286. FNMSUB f13, f17, f12, f13
  1287. FNMSUB f2, f18, f0, f2
  1288. FNMSUB f6, f18, f4, f6
  1289. FNMSUB f10, f18, f8, f10
  1290. FNMSUB f14, f18, f12, f14
  1291. FNMSUB f3, f19, f0, f3
  1292. FNMSUB f7, f19, f4, f7
  1293. FNMSUB f11, f19, f8, f11
  1294. FNMSUB f15, f19, f12, f15
  1295. LFD f16, 5 * SIZE(AO)
  1296. LFD f17, 6 * SIZE(AO)
  1297. LFD f18, 7 * SIZE(AO)
  1298. LFD f19, 10 * SIZE(AO)
  1299. FMUL f1, f16, f1
  1300. FMUL f5, f16, f5
  1301. FMUL f9, f16, f9
  1302. FMUL f13, f16, f13
  1303. LFD f20, 11 * SIZE(AO)
  1304. LFD f21, 15 * SIZE(AO)
  1305. FNMSUB f2, f17, f1, f2
  1306. FNMSUB f6, f17, f5, f6
  1307. FNMSUB f10, f17, f9, f10
  1308. FNMSUB f14, f17, f13, f14
  1309. FNMSUB f3, f18, f1, f3
  1310. FNMSUB f7, f18, f5, f7
  1311. FNMSUB f11, f18, f9, f11
  1312. FNMSUB f15, f18, f13, f15
  1313. FMUL f2, f19, f2
  1314. FMUL f6, f19, f6
  1315. FMUL f10, f19, f10
  1316. FMUL f14, f19, f14
  1317. FNMSUB f3, f20, f2, f3
  1318. FNMSUB f7, f20, f6, f7
  1319. FNMSUB f11, f20, f10, f11
  1320. FNMSUB f15, f20, f14, f15
  1321. FMUL f3, f21, f3
  1322. FMUL f7, f21, f7
  1323. FMUL f11, f21, f11
  1324. FMUL f15, f21, f15
  1325. #endif
  1326. #ifdef RN
  /* RN: substitution from the first B index to the last, using BO[0..3],
     BO[5..7], BO[10..11] and BO[15]; each step updates a group of four
     accumulators (f0-f3, f4-f7, f8-f11, f12-f15). */
  1327. LFD f16, 0 * SIZE(BO)
  1328. LFD f17, 1 * SIZE(BO)
  1329. LFD f18, 2 * SIZE(BO)
  1330. LFD f19, 3 * SIZE(BO)
  1331. FMUL f0, f16, f0
  1332. FMUL f1, f16, f1
  1333. FMUL f2, f16, f2
  1334. FMUL f3, f16, f3
  1335. FNMSUB f4, f17, f0, f4
  1336. FNMSUB f5, f17, f1, f5
  1337. FNMSUB f6, f17, f2, f6
  1338. FNMSUB f7, f17, f3, f7
  1339. FNMSUB f8, f18, f0, f8
  1340. FNMSUB f9, f18, f1, f9
  1341. FNMSUB f10, f18, f2, f10
  1342. FNMSUB f11, f18, f3, f11
  1343. FNMSUB f12, f19, f0, f12
  1344. FNMSUB f13, f19, f1, f13
  1345. FNMSUB f14, f19, f2, f14
  1346. FNMSUB f15, f19, f3, f15
  1347. LFD f16, 5 * SIZE(BO)
  1348. LFD f17, 6 * SIZE(BO)
  1349. LFD f18, 7 * SIZE(BO)
  1350. LFD f19, 10 * SIZE(BO)
  1351. FMUL f4, f16, f4
  1352. FMUL f5, f16, f5
  1353. FMUL f6, f16, f6
  1354. FMUL f7, f16, f7
  1355. LFD f20, 11 * SIZE(BO)
  1356. LFD f21, 15 * SIZE(BO)
  1357. FNMSUB f8, f17, f4, f8
  1358. FNMSUB f9, f17, f5, f9
  1359. FNMSUB f10, f17, f6, f10
  1360. FNMSUB f11, f17, f7, f11
  1361. FNMSUB f12, f18, f4, f12
  1362. FNMSUB f13, f18, f5, f13
  1363. FNMSUB f14, f18, f6, f14
  1364. FNMSUB f15, f18, f7, f15
  1365. FMUL f8, f19, f8
  1366. FMUL f9, f19, f9
  1367. FMUL f10, f19, f10
  1368. FMUL f11, f19, f11
  1369. FNMSUB f12, f20, f8, f12
  1370. FNMSUB f13, f20, f9, f13
  1371. FNMSUB f14, f20, f10, f14
  1372. FNMSUB f15, f20, f11, f15
  1373. FMUL f12, f21, f12
  1374. FMUL f13, f21, f13
  1375. FMUL f14, f21, f14
  1376. FMUL f15, f21, f15
  1377. #endif
  1378. #ifdef RT
  /* RT: substitution from the last B index to the first, using BO[15],
     BO[14..12], BO[10..8], BO[5..4] and BO[0]. */
  1379. LFD f16, 15 * SIZE(BO)
  1380. LFD f17, 14 * SIZE(BO)
  1381. LFD f18, 13 * SIZE(BO)
  1382. LFD f19, 12 * SIZE(BO)
  1383. FMUL f12, f16, f12
  1384. FMUL f13, f16, f13
  1385. FMUL f14, f16, f14
  1386. FMUL f15, f16, f15
  1387. FNMSUB f8, f17, f12, f8
  1388. FNMSUB f9, f17, f13, f9
  1389. FNMSUB f10, f17, f14, f10
  1390. FNMSUB f11, f17, f15, f11
  1391. FNMSUB f4, f18, f12, f4
  1392. FNMSUB f5, f18, f13, f5
  1393. FNMSUB f6, f18, f14, f6
  1394. FNMSUB f7, f18, f15, f7
  1395. FNMSUB f0, f19, f12, f0
  1396. FNMSUB f1, f19, f13, f1
  1397. FNMSUB f2, f19, f14, f2
  1398. FNMSUB f3, f19, f15, f3
  1399. LFD f16, 10 * SIZE(BO)
  1400. LFD f17, 9 * SIZE(BO)
  1401. LFD f18, 8 * SIZE(BO)
  1402. LFD f19, 5 * SIZE(BO)
  1403. FMUL f8, f16, f8
  1404. FMUL f9, f16, f9
  1405. FMUL f10, f16, f10
  1406. FMUL f11, f16, f11
  1407. LFD f20, 4 * SIZE(BO)
  1408. LFD f21, 0 * SIZE(BO)
  1409. FNMSUB f4, f17, f8, f4
  1410. FNMSUB f5, f17, f9, f5
  1411. FNMSUB f6, f17, f10, f6
  1412. FNMSUB f7, f17, f11, f7
  1413. FNMSUB f0, f18, f8, f0
  1414. FNMSUB f1, f18, f9, f1
  1415. FNMSUB f2, f18, f10, f2
  1416. FNMSUB f3, f18, f11, f3
  1417. FMUL f4, f19, f4
  1418. FMUL f5, f19, f5
  1419. FMUL f6, f19, f6
  1420. FMUL f7, f19, f7
  1421. FNMSUB f0, f20, f4, f0
  1422. FNMSUB f1, f20, f5, f1
  1423. FNMSUB f2, f20, f6, f2
  1424. FNMSUB f3, f20, f7, f3
  1425. FMUL f0, f21, f0
  1426. FMUL f1, f21, f1
  1427. FMUL f2, f21, f2
  1428. FMUL f3, f21, f3
  1429. #endif
  1430. #ifdef LN
  /* LN walks C backwards: step the four column pointers back by one tile
     width before storing. */
  1431. subi CO1, CO1, 4 * SIZE
  1432. subi CO2, CO2, 4 * SIZE
  1433. subi CO3, CO3, 4 * SIZE
  1434. subi CO4, CO4, 4 * SIZE
  1435. #endif
  /* Write the solved values back to the packed buffer they were read from,
     using the same index mapping as the loads above. */
  1436. #if defined(LN) || defined(LT)
  1437. STFD f0, 0 * SIZE(BO)
  1438. STFD f4, 1 * SIZE(BO)
  1439. STFD f8, 2 * SIZE(BO)
  1440. STFD f12, 3 * SIZE(BO)
  1441. STFD f1, 4 * SIZE(BO)
  1442. STFD f5, 5 * SIZE(BO)
  1443. STFD f9, 6 * SIZE(BO)
  1444. STFD f13, 7 * SIZE(BO)
  1445. STFD f2, 8 * SIZE(BO)
  1446. STFD f6, 9 * SIZE(BO)
  1447. STFD f10, 10 * SIZE(BO)
  1448. STFD f14, 11 * SIZE(BO)
  1449. STFD f3, 12 * SIZE(BO)
  1450. STFD f7, 13 * SIZE(BO)
  1451. STFD f11, 14 * SIZE(BO)
  1452. STFD f15, 15 * SIZE(BO)
  1453. #else
  1454. STFD f0, 0 * SIZE(AO)
  1455. STFD f1, 1 * SIZE(AO)
  1456. STFD f2, 2 * SIZE(AO)
  1457. STFD f3, 3 * SIZE(AO)
  1458. STFD f4, 4 * SIZE(AO)
  1459. STFD f5, 5 * SIZE(AO)
  1460. STFD f6, 6 * SIZE(AO)
  1461. STFD f7, 7 * SIZE(AO)
  1462. STFD f8, 8 * SIZE(AO)
  1463. STFD f9, 9 * SIZE(AO)
  1464. STFD f10, 10 * SIZE(AO)
  1465. STFD f11, 11 * SIZE(AO)
  1466. STFD f12, 12 * SIZE(AO)
  1467. STFD f13, 13 * SIZE(AO)
  1468. STFD f14, 14 * SIZE(AO)
  1469. STFD f15, 15 * SIZE(AO)
  1470. #endif
  /* Store the tile to C: f0-f3 via CO1, f4-f7 via CO2, f8-f11 via CO3,
     f12-f15 via CO4. */
  1471. STFD f0, 0 * SIZE(CO1)
  1472. STFD f1, 1 * SIZE(CO1)
  1473. STFD f2, 2 * SIZE(CO1)
  1474. STFD f3, 3 * SIZE(CO1)
  1475. STFD f4, 0 * SIZE(CO2)
  1476. STFD f5, 1 * SIZE(CO2)
  1477. STFD f6, 2 * SIZE(CO2)
  1478. STFD f7, 3 * SIZE(CO2)
  1479. STFD f8, 0 * SIZE(CO3)
  1480. STFD f9, 1 * SIZE(CO3)
  1481. STFD f10, 2 * SIZE(CO3)
  1482. STFD f11, 3 * SIZE(CO3)
  1483. STFD f12, 0 * SIZE(CO4)
  1484. STFD f13, 1 * SIZE(CO4)
  1485. STFD f14, 2 * SIZE(CO4)
  1486. STFD f15, 3 * SIZE(CO4)
  /* Re-initialise all sixteen accumulators from FZERO (defined outside this
     chunk; by its name a stored 0.0) for the next tile. */
  1487. lfs f0, FZERO
  1488. fmr f1, f0
  1489. fmr f2, f0
  1490. fmr f3, f0
  1491. fmr f4, f0
  1492. fmr f5, f0
  1493. fmr f6, f0
  1494. fmr f7, f0
  1495. fmr f8, f0
  1496. fmr f9, f0
  1497. fmr f10, f0
  1498. fmr f11, f0
  1499. fmr f12, f0
  1500. fmr f13, f0
  1501. fmr f14, f0
  1502. fmr f15, f0
  1503. #ifndef LN
  /* All variants except LN advance the C pointers past the stored tile. */
  1504. addi CO1, CO1, 4 * SIZE
  1505. addi CO2, CO2, 4 * SIZE
  1506. addi CO3, CO3, 4 * SIZE
  1507. addi CO4, CO4, 4 * SIZE
  1508. #endif
  1509. #ifdef RT
  /* AORIG += K * 4 elements. */
  1510. slwi r0, K, 2 + BASE_SHIFT
  1511. add AORIG, AORIG, r0
  1512. #endif
  1513. #if defined(LT) || defined(RN)
  /* Skip the (K - KK) steps not consumed by this tile in both buffers. */
  1514. sub TEMP, K, KK
  1515. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1516. add AO, AO, TEMP
  1517. add BO, BO, TEMP
  1518. #endif
  1519. #ifdef LT
  1520. addi KK, KK, 4
  1521. #endif
  1522. #ifdef LN
  1523. subi KK, KK, 4
  1524. #endif
  /* Next 4-wide tile while the counter I stays positive. */
  1525. addic. I, I, -1
  1526. bgt+ LL(11)
  1527. .align 4
  /* LL(39): end of one 4-wide B panel - move B and KK to the next panel and
     loop on J. */
  1528. LL(39):
  1529. #ifdef LN
  /* B += K * 4 elements. */
  1530. slwi r0, K, 2 + BASE_SHIFT
  1531. add B, B, r0
  1532. #endif
  1533. #if defined(LT) || defined(RN)
  /* BO already points just past the data consumed for this panel. */
  1534. mr B, BO
  1535. #endif
  1536. #ifdef RN
  1537. addi KK, KK, 4
  1538. #endif
  1539. #ifdef RT
  1540. subi KK, KK, 4
  1541. #endif
  1542. addic. J, J, -1
  /* Reload f0 from FZERO; lfs leaves CR0 alone, so bgt still tests the
     addic. result. */
  1543. lfs f0, FZERO
  1544. bgt LL(10)
  1545. .align 4
  /* LL(40): handle a 2-wide panel when bit 1 of N is set; otherwise continue
     at LL(70). */
  1546. LL(40):
  1547. andi. J, N, 2
  1548. ble LL(70)
  1549. #ifdef RT
  /* RT steps backwards: B -= K * 2 elements, C -= 2 * LDC. */
  1550. slwi r0, K, 1 + BASE_SHIFT
  1551. sub B, B, r0
  1552. slwi r0, LDC, 1
  1553. sub C, C, r0
  1554. #endif
  /* CO1/CO2 address the two C columns of this panel. */
  1555. mr CO1, C
  1556. add CO2, C, LDC
  1557. #ifdef LN
  1558. add KK, M, OFFSET
  1559. #endif
  1560. #ifdef LT
  1561. mr KK, OFFSET
  1562. #endif
  /* Copy f0 into accumulators f1-f7 (f0 was last loaded from FZERO). */
  1563. fmr f1, f0
  1564. fmr f2, f0
  1565. fmr f3, f0
  1566. fmr f4, f0
  1567. fmr f5, f0
  1568. fmr f6, f0
  1569. fmr f7, f0
  1570. #if defined(LN) || defined(RT)
  1571. mr AORIG, A
  1572. #else
  1573. mr AO, A
  1574. #endif
  1575. #ifndef RT
  /* All variants except RT advance C past the two columns now. */
  1576. add C, CO2, LDC
  1577. #endif
  /* LL(60): 1x2 tile (taken when bit 0 of M is set; otherwise go to LL(50)).
     Sets AO/BO, preloads four A values (f16-f19) and eight B values
     (f20-f27), and puts the number of 4-step groups in CTR. */
  1578. LL(60):
  1579. andi. I, M, 1
  1580. ble LL(50)
  1581. #if defined(LT) || defined(RN)
  1582. LFD f16, 0 * SIZE(AO)
  1583. LFD f17, 1 * SIZE(AO)
  1584. LFD f18, 2 * SIZE(AO)
  1585. LFD f19, 3 * SIZE(AO)
  1586. LFD f20, 0 * SIZE(B)
  1587. LFD f21, 1 * SIZE(B)
  1588. LFD f22, 2 * SIZE(B)
  1589. LFD f23, 3 * SIZE(B)
  1590. LFD f24, 4 * SIZE(B)
  1591. LFD f25, 5 * SIZE(B)
  1592. LFD f26, 6 * SIZE(B)
  1593. LFD f27, 7 * SIZE(B)
  /* CTR = KK / 4; the CR0 result of srawi. is tested by the ble below. */
  1594. srawi. r0, KK, 2
  1595. mtspr CTR, r0
  1596. mr BO, B
  1597. #else
  1598. #ifdef LN
  /* AORIG -= K * 1 element. */
  1599. slwi r0, K, BASE_SHIFT
  1600. sub AORIG, AORIG, r0
  1601. #endif
  /* AO = AORIG + KK * 1 element, BO = B + KK * 2 elements,
     TEMP = K - KK = number of steps to accumulate. */
  1602. slwi r0, KK, 0 + BASE_SHIFT
  1603. slwi TEMP, KK, 1 + BASE_SHIFT
  1604. add AO, AORIG, r0
  1605. add BO, B, TEMP
  1606. sub TEMP, K, KK
  1607. LFD f16, 0 * SIZE(AO)
  1608. LFD f17, 1 * SIZE(AO)
  1609. LFD f18, 2 * SIZE(AO)
  1610. LFD f19, 3 * SIZE(AO)
  1611. LFD f20, 0 * SIZE(BO)
  1612. LFD f21, 1 * SIZE(BO)
  1613. LFD f22, 2 * SIZE(BO)
  1614. LFD f23, 3 * SIZE(BO)
  1615. LFD f24, 4 * SIZE(BO)
  1616. LFD f25, 5 * SIZE(BO)
  1617. LFD f26, 6 * SIZE(BO)
  1618. LFD f27, 7 * SIZE(BO)
  1619. srawi. r0, TEMP, 2
  1620. mtspr CTR, r0
  1621. #endif
  /* No full group of four steps: go straight to the remainder. */
  1622. ble LL(65)
  1623. .align 5
  /* LL(62): main K loop of the 1x2 tile, four k steps per pass.
     Steps 0 and 2 accumulate into f0/f1, steps 1 and 3 into f2/f3, so
     f2/f3 hold partial sums that must be folded into f0/f1 afterwards. */
  1624. LL(62):
  1625. FMADD f0, f16, f20, f0
  1626. FMADD f1, f16, f21, f1
  1627. FMADD f2, f17, f22, f2
  1628. FMADD f3, f17, f23, f3
  /* Operands for the next pass are loaded as soon as the current ones have
     been consumed. */
  1629. LFD f20, 8 * SIZE(BO)
  1630. LFD f21, 9 * SIZE(BO)
  1631. LFD f22, 10 * SIZE(BO)
  1632. LFD f23, 11 * SIZE(BO)
  1633. FMADD f0, f18, f24, f0
  1634. FMADD f1, f18, f25, f1
  1635. FMADD f2, f19, f26, f2
  1636. FMADD f3, f19, f27, f3
  1637. LFD f16, 4 * SIZE(AO)
  1638. LFD f17, 5 * SIZE(AO)
  1639. LFD f18, 6 * SIZE(AO)
  1640. LFD f19, 7 * SIZE(AO)
  1641. LFD f24, 12 * SIZE(BO)
  1642. LFD f25, 13 * SIZE(BO)
  1643. LFD f26, 14 * SIZE(BO)
  1644. LFD f27, 15 * SIZE(BO)
  /* Four steps consumed: 4 A elements and 8 B elements. */
  1645. addi AO, AO, 4 * SIZE
  1646. addi BO, BO, 8 * SIZE
  1647. bdnz LL(62)
  1648. .align 4
  /* LL(65)/LL(66): K-remainder for the 1x2 tile; the count is the low two
     bits of KK (LT/RN) or TEMP (LN/RT). */
  1649. LL(65):
  1650. #if defined(LT) || defined(RN)
  1651. andi. r0, KK, 3
  1652. #else
  1653. andi. r0, TEMP, 3
  1654. #endif
  1655. mtspr CTR, r0
  /* Nothing left: skip the loop. */
  1656. ble+ LL(68)
  1657. .align 4
  /* One k step per pass: one A value (f16) times two B values (f20, f21). */
  1658. LL(66):
  1659. FMADD f0, f16, f20, f0
  1660. FMADD f1, f16, f21, f1
  1661. LFD f16, 1 * SIZE(AO)
  1662. LFD f20, 2 * SIZE(BO)
  1663. LFD f21, 3 * SIZE(BO)
  1664. addi BO, BO, 2 * SIZE
  1665. addi AO, AO, 1 * SIZE
  1666. bdnz LL(66)
  1667. .align 4
  /* LL(68): finish the 1x2 tile (one A index, two B indices).
     Combines the partial sums, subtracts them from the packed right-hand
     side, applies the 1x1 (LN/LT) or 2x2 (RN/RT) triangular step, stores the
     result to the packed buffer and to C, and updates pointers and KK. */
  1668. LL(68):
  /* LL(62) spreads the sum over f0/f1 and f2/f3; fold them together. */
  1669. FADD f0, f2, f0
  1670. FADD f1, f3, f1
  1671. #if defined(LN) || defined(RT)
  /* Step back onto the diagonal block: by one step for LN (tile height 1),
     by two for RT (panel width 2). */
  1672. #ifdef LN
  1673. subi r0, KK, 1
  1674. #else
  1675. subi r0, KK, 2
  1676. #endif
  /* AO = AORIG + r0 * 1 element, BO = B + r0 * 2 elements. */
  1677. slwi TEMP, r0, 0 + BASE_SHIFT
  1678. slwi r0, r0, 1 + BASE_SHIFT
  1679. add AO, AORIG, TEMP
  1680. add BO, B, r0
  1681. #endif
  /* f = right-hand side - accumulated product. */
  1682. #if defined(LN) || defined(LT)
  1683. LFD f16, 0 * SIZE(BO)
  1684. LFD f17, 1 * SIZE(BO)
  1685. FSUB f0, f16, f0
  1686. FSUB f1, f17, f1
  1687. #else
  1688. LFD f16, 0 * SIZE(AO)
  1689. LFD f20, 1 * SIZE(AO)
  1690. FSUB f0, f16, f0
  1691. FSUB f1, f20, f1
  1692. #endif
  /* LN/LT: single diagonal entry AO[0], applied with FMUL (so presumably
     stored already inverted - confirm in the packing code). */
  1693. #ifdef LN
  1694. LFD f21, 0 * SIZE(AO)
  1695. FMUL f0, f21, f0
  1696. FMUL f1, f21, f1
  1697. #endif
  1698. #ifdef LT
  1699. LFD f16, 0 * SIZE(AO)
  1700. FMUL f0, f16, f0
  1701. FMUL f1, f16, f1
  1702. #endif
  /* RN: 2x2 substitution using BO[0], BO[1], BO[3] (first index first). */
  1703. #ifdef RN
  1704. LFD f16, 0 * SIZE(BO)
  1705. LFD f17, 1 * SIZE(BO)
  1706. LFD f18, 3 * SIZE(BO)
  1707. FMUL f0, f16, f0
  1708. FNMSUB f1, f17, f0, f1
  1709. FMUL f1, f18, f1
  1710. #endif
  /* RT: 2x2 substitution using BO[3], BO[2], BO[0] (last index first). */
  1711. #ifdef RT
  1712. LFD f19, 3 * SIZE(BO)
  1713. LFD f20, 2 * SIZE(BO)
  1714. LFD f21, 0 * SIZE(BO)
  1715. FMUL f1, f19, f1
  1716. FNMSUB f0, f20, f1, f0
  1717. FMUL f0, f21, f0
  1718. #endif
  1719. #ifdef LN
  /* LN walks C backwards: step back one element before storing. */
  1720. subi CO1, CO1, 1 * SIZE
  1721. subi CO2, CO2, 1 * SIZE
  1722. #endif
  /* Write the solution back to the packed buffer it was read from ... */
  1723. #if defined(LN) || defined(LT)
  1724. STFD f0, 0 * SIZE(BO)
  1725. STFD f1, 1 * SIZE(BO)
  1726. #else
  1727. STFD f0, 0 * SIZE(AO)
  1728. STFD f1, 1 * SIZE(AO)
  1729. #endif
  /* ... and to C, one element in each of the two columns. */
  1730. STFD f0, 0 * SIZE(CO1)
  1731. STFD f1, 0 * SIZE(CO2)
  /* Re-initialise the accumulators for the next tile.  f2/f3 must be
     cleared as well: LL(62) accumulates into them, and the 2x2 tile that
     follows (LL(52)/LL(58)) uses f2/f3 as primary accumulators, so stale
     partial sums left here would be added into its result. */
  1732. lfs f0, FZERO
  1733. fmr f1, f0
  fmr f2, f0
  fmr f3, f0
  1734. fmr f4, f0
  1735. fmr f5, f0
  1736. #ifndef LN
  1737. addi CO1, CO1, 1 * SIZE
  1738. addi CO2, CO2, 1 * SIZE
  1739. #endif
  1740. #ifdef RT
  /* AORIG += K * 1 element. */
  1741. slwi r0, K, 0 + BASE_SHIFT
  1742. add AORIG, AORIG, r0
  1743. #endif
  1744. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) steps: 1 element each in A, 2 in B. */
  1745. sub TEMP, K, KK
  1746. slwi r0, TEMP, 0 + BASE_SHIFT
  1747. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1748. add AO, AO, r0
  1749. add BO, BO, TEMP
  1750. #endif
  1751. #ifdef LN
  1752. subi KK, KK, 1
  1753. #endif
  1754. #ifdef LT
  1755. addi KK, KK, 1
  1756. #endif
  1757. .align 4
  /* LL(50): 2x2 tile (taken when bit 1 of M is set; otherwise go to LL(41)).
     Sets AO/BO, preloads four A values (f16-f19) and eight B values
     (f20-f27), and puts the number of 4-step groups in CTR. */
  1758. LL(50):
  1759. andi. I, M, 2
  1760. ble LL(41)
  1761. #if defined(LT) || defined(RN)
  1762. LFD f16, 0 * SIZE(AO)
  1763. LFD f17, 1 * SIZE(AO)
  1764. LFD f18, 2 * SIZE(AO)
  1765. LFD f19, 3 * SIZE(AO)
  1766. LFD f20, 0 * SIZE(B)
  1767. LFD f21, 1 * SIZE(B)
  1768. LFD f22, 2 * SIZE(B)
  1769. LFD f23, 3 * SIZE(B)
  1770. LFD f24, 4 * SIZE(B)
  1771. LFD f25, 5 * SIZE(B)
  1772. LFD f26, 6 * SIZE(B)
  1773. LFD f27, 7 * SIZE(B)
  /* CTR = KK / 4; the CR0 result of srawi. is tested by the ble below. */
  1774. srawi. r0, KK, 2
  1775. mtspr CTR, r0
  1776. mr BO, B
  1777. #else
  1778. #ifdef LN
  /* AORIG -= K * 2 elements. */
  1779. slwi r0, K, 1 + BASE_SHIFT
  1780. sub AORIG, AORIG, r0
  1781. #endif
  /* AO = AORIG + KK * 2 elements, BO = B + KK * 2 elements,
     TEMP = K - KK = number of steps to accumulate. */
  1782. slwi r0, KK, 1 + BASE_SHIFT
  1783. slwi TEMP, KK, 1 + BASE_SHIFT
  1784. add AO, AORIG, r0
  1785. add BO, B, TEMP
  1786. sub TEMP, K, KK
  1787. LFD f16, 0 * SIZE(AO)
  1788. LFD f17, 1 * SIZE(AO)
  1789. LFD f18, 2 * SIZE(AO)
  1790. LFD f19, 3 * SIZE(AO)
  1791. LFD f20, 0 * SIZE(BO)
  1792. LFD f21, 1 * SIZE(BO)
  1793. LFD f22, 2 * SIZE(BO)
  1794. LFD f23, 3 * SIZE(BO)
  1795. LFD f24, 4 * SIZE(BO)
  1796. LFD f25, 5 * SIZE(BO)
  1797. LFD f26, 6 * SIZE(BO)
  1798. LFD f27, 7 * SIZE(BO)
  1799. srawi. r0, TEMP, 2
  1800. mtspr CTR, r0
  1801. #endif
  /* No full group of four steps: go straight to the remainder. */
  1802. ble LL(55)
  1803. .align 5
  /* LL(52): main K loop of the 2x2 tile, four k steps per pass.
     Even steps accumulate into f0-f3, odd steps into f4-f7; within each
     group the order is (a0*b0, a1*b0, a0*b1, a1*b1).  The two groups are
     summed after the loop. */
  1804. LL(52):
  /* Steps 0 and 1. */
  1805. FMADD f0, f16, f20, f0
  1806. FMADD f1, f17, f20, f1
  1807. FMADD f2, f16, f21, f2
  1808. FMADD f3, f17, f21, f3
  1809. FMADD f4, f18, f22, f4
  1810. FMADD f5, f19, f22, f5
  1811. FMADD f6, f18, f23, f6
  1812. FMADD f7, f19, f23, f7
  1813. LFD f16, 4 * SIZE(AO)
  1814. LFD f17, 5 * SIZE(AO)
  1815. LFD f18, 6 * SIZE(AO)
  1816. LFD f19, 7 * SIZE(AO)
  1817. LFD f20, 8 * SIZE(BO)
  1818. LFD f21, 9 * SIZE(BO)
  1819. LFD f22, 10 * SIZE(BO)
  1820. LFD f23, 11 * SIZE(BO)
  /* Steps 2 and 3 (B values f24-f27 were loaded earlier). */
  1821. FMADD f0, f16, f24, f0
  1822. FMADD f1, f17, f24, f1
  1823. FMADD f2, f16, f25, f2
  1824. FMADD f3, f17, f25, f3
  1825. FMADD f4, f18, f26, f4
  1826. FMADD f5, f19, f26, f5
  1827. FMADD f6, f18, f27, f6
  1828. FMADD f7, f19, f27, f7
  /* Preload operands for the next pass and advance by 8 elements each. */
  1829. LFD f16, 8 * SIZE(AO)
  1830. LFD f17, 9 * SIZE(AO)
  1831. LFD f18, 10 * SIZE(AO)
  1832. LFD f19, 11 * SIZE(AO)
  1833. LFD f24, 12 * SIZE(BO)
  1834. LFD f25, 13 * SIZE(BO)
  1835. LFD f26, 14 * SIZE(BO)
  1836. LFD f27, 15 * SIZE(BO)
  1837. addi AO, AO, 8 * SIZE
  1838. addi BO, BO, 8 * SIZE
  /* Cache-touch hint at AO + PREA (PREA is defined outside this chunk). */
  1839. dcbtst AO, PREA
  1840. bdnz LL(52)
  1841. .align 4
  /* LL(55)/LL(56): K-remainder for the 2x2 tile; the count is the low two
     bits of KK (LT/RN) or TEMP (LN/RT). */
  1842. LL(55):
  1843. #if defined(LT) || defined(RN)
  1844. andi. r0, KK, 3
  1845. #else
  1846. andi. r0, TEMP, 3
  1847. #endif
  1848. mtspr CTR, r0
  /* Nothing left: skip the loop. */
  1849. ble+ LL(58)
  1850. .align 4
  /* One k step per pass: two A values (f16, f17) times two B values
     (f20, f21) accumulated into f0-f3. */
  1851. LL(56):
  1852. FMADD f0, f16, f20, f0
  1853. FMADD f1, f17, f20, f1
  1854. FMADD f2, f16, f21, f2
  1855. FMADD f3, f17, f21, f3
  1856. LFD f16, 2 * SIZE(AO)
  1857. LFD f17, 3 * SIZE(AO)
  1858. LFD f20, 2 * SIZE(BO)
  1859. LFD f21, 3 * SIZE(BO)
  1860. addi BO, BO, 2 * SIZE
  1861. addi AO, AO, 2 * SIZE
  1862. bdnz LL(56)
  1863. .align 4
  /* LL(58): finish the 2x2 tile.
     Combines the two accumulator groups, subtracts them from the packed
     right-hand side, runs the 2x2 triangular substitution for the selected
     variant, stores to the packed buffer and to C, and updates pointers/KK.
     After the FADDs: f0 = (a0,b0), f1 = (a1,b0), f2 = (a0,b1), f3 = (a1,b1). */
  1864. LL(58):
  1865. FADD f0, f4, f0
  1866. FADD f1, f5, f1
  1867. FADD f2, f6, f2
  1868. FADD f3, f7, f3
  1869. #if defined(LN) || defined(RT)
  /* Both branches subtract 2 (tile height and panel width are both 2). */
  1870. #ifdef LN
  1871. subi r0, KK, 2
  1872. #else
  1873. subi r0, KK, 2
  1874. #endif
  /* AO = AORIG + r0 * 2 elements, BO = B + r0 * 2 elements. */
  1875. slwi TEMP, r0, 1 + BASE_SHIFT
  1876. slwi r0, r0, 1 + BASE_SHIFT
  1877. add AO, AORIG, TEMP
  1878. add BO, B, r0
  1879. #endif
  1880. #if defined(LN) || defined(LT)
  /* Right-hand side from BO: BO[0..3] pair with f0, f2, f1, f3. */
  1881. LFD f16, 0 * SIZE(BO)
  1882. LFD f17, 1 * SIZE(BO)
  1883. LFD f20, 2 * SIZE(BO)
  1884. LFD f21, 3 * SIZE(BO)
  1885. FSUB f0, f16, f0
  1886. FSUB f2, f17, f2
  1887. FSUB f1, f20, f1
  1888. FSUB f3, f21, f3
  1889. #else
  /* Right-hand side from AO: AO[0..3] pair with f0..f3. */
  1890. LFD f16, 0 * SIZE(AO)
  1891. LFD f17, 1 * SIZE(AO)
  1892. LFD f20, 2 * SIZE(AO)
  1893. LFD f21, 3 * SIZE(AO)
  1894. FSUB f0, f16, f0
  1895. FSUB f1, f17, f1
  1896. FSUB f2, f20, f2
  1897. FSUB f3, f21, f3
  1898. #endif
  /* Diagonal entries are applied with FMUL in every variant below, so they
     are presumably stored already inverted - confirm in the packing code. */
  1899. #ifdef LN
  /* LN: uses AO[3], AO[2], AO[0]; second A index is solved first. */
  1900. LFD f19, 3 * SIZE(AO)
  1901. LFD f20, 2 * SIZE(AO)
  1902. LFD f21, 0 * SIZE(AO)
  1903. FMUL f1, f19, f1
  1904. FMUL f3, f19, f3
  1905. FNMSUB f0, f20, f1, f0
  1906. FNMSUB f2, f20, f3, f2
  1907. FMUL f0, f21, f0
  1908. FMUL f2, f21, f2
  1909. #endif
  1910. #ifdef LT
  /* LT: uses AO[0], AO[1], AO[3]; first A index is solved first. */
  1911. LFD f16, 0 * SIZE(AO)
  1912. LFD f17, 1 * SIZE(AO)
  1913. FMUL f0, f16, f0
  1914. FMUL f2, f16, f2
  1915. FNMSUB f1, f17, f0, f1
  1916. FNMSUB f3, f17, f2, f3
  1917. LFD f17, 3 * SIZE(AO)
  1918. FMUL f1, f17, f1
  1919. FMUL f3, f17, f3
  1920. #endif
  1921. #ifdef RN
  /* RN: uses BO[0], BO[1], BO[3]; first B index is solved first. */
  1922. LFD f16, 0 * SIZE(BO)
  1923. LFD f17, 1 * SIZE(BO)
  1924. LFD f18, 3 * SIZE(BO)
  1925. FMUL f0, f16, f0
  1926. FMUL f1, f16, f1
  1927. FNMSUB f2, f17, f0, f2
  1928. FNMSUB f3, f17, f1, f3
  1929. FMUL f2, f18, f2
  1930. FMUL f3, f18, f3
  1931. #endif
  1932. #ifdef RT
  /* RT: uses BO[3], BO[2], BO[0]; second B index is solved first. */
  1933. LFD f19, 3 * SIZE(BO)
  1934. LFD f20, 2 * SIZE(BO)
  1935. LFD f21, 0 * SIZE(BO)
  1936. FMUL f2, f19, f2
  1937. FMUL f3, f19, f3
  1938. FNMSUB f0, f20, f2, f0
  1939. FNMSUB f1, f20, f3, f1
  1940. FMUL f0, f21, f0
  1941. FMUL f1, f21, f1
  1942. #endif
  1943. #ifdef LN
  /* LN walks C backwards: step back two elements before storing. */
  1944. subi CO1, CO1, 2 * SIZE
  1945. subi CO2, CO2, 2 * SIZE
  1946. #endif
  /* Write the solution back to the packed buffer it was read from. */
  1947. #if defined(LN) || defined(LT)
  1948. STFD f0, 0 * SIZE(BO)
  1949. STFD f2, 1 * SIZE(BO)
  1950. STFD f1, 2 * SIZE(BO)
  1951. STFD f3, 3 * SIZE(BO)
  1952. #else
  1953. STFD f0, 0 * SIZE(AO)
  1954. STFD f1, 1 * SIZE(AO)
  1955. STFD f2, 2 * SIZE(AO)
  1956. STFD f3, 3 * SIZE(AO)
  1957. #endif
  /* Store to C: f0/f1 via CO1, f2/f3 via CO2. */
  1958. STFD f0, 0 * SIZE(CO1)
  1959. STFD f1, 1 * SIZE(CO1)
  1960. STFD f2, 0 * SIZE(CO2)
  1961. STFD f3, 1 * SIZE(CO2)
  /* Re-initialise accumulators f0-f7 from FZERO for the next tile. */
  1962. lfs f0, FZERO
  1963. fmr f1, f0
  1964. fmr f2, f0
  1965. fmr f3, f0
  1966. fmr f4, f0
  1967. fmr f5, f0
  1968. fmr f6, f0
  1969. fmr f7, f0
  1970. #ifndef LN
  1971. addi CO1, CO1, 2 * SIZE
  1972. addi CO2, CO2, 2 * SIZE
  1973. #endif
  1974. #ifdef RT
  /* AORIG += K * 2 elements. */
  1975. slwi r0, K, 1 + BASE_SHIFT
  1976. add AORIG, AORIG, r0
  1977. #endif
  1978. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) steps: 2 elements each in A and B. */
  1979. sub TEMP, K, KK
  1980. slwi r0, TEMP, 1 + BASE_SHIFT
  1981. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1982. add AO, AO, r0
  1983. add BO, BO, TEMP
  1984. #endif
  1985. #ifdef LN
  1986. subi KK, KK, 2
  1987. #endif
  1988. #ifdef LT
  1989. addi KK, KK, 2
  1990. #endif
  1991. .align 4
  /* LL(41): I = M / 4 = number of 4x2 tiles; none -> LL(69). */
  1992. LL(41):
  1993. srawi. I, M, 2
  1994. ble LL(69)
  1995. .align 4
  /* LL(42): per-tile setup for a 4x2 tile.  Sets AO/BO, preloads four A
     values (f16-f19) and four B values (f20-f23), touches the two C columns,
     and puts the number of 4-step groups in CTR. */
  1996. LL(42):
  1997. #if defined(LT) || defined(RN)
  1998. LFD f16, 0 * SIZE(AO)
  1999. LFD f17, 1 * SIZE(AO)
  2000. LFD f18, 2 * SIZE(AO)
  2001. LFD f19, 3 * SIZE(AO)
  2002. LFD f20, 0 * SIZE(B)
  2003. LFD f21, 1 * SIZE(B)
  2004. LFD f22, 2 * SIZE(B)
  2005. LFD f23, 3 * SIZE(B)
  /* Cache-touch hints for the C destinations (PREC is defined elsewhere). */
  2006. dcbt CO1, PREC
  2007. dcbt CO2, PREC
  /* CTR = KK / 4; the CR0 result of srawi. is tested by the ble below. */
  2008. srawi. r0, KK, 2
  2009. mtspr CTR, r0
  2010. mr BO, B
  2011. #else
  2012. #ifdef LN
  /* AORIG -= K * 4 elements. */
  2013. slwi r0, K, 2 + BASE_SHIFT
  2014. sub AORIG, AORIG, r0
  2015. #endif
  /* AO = AORIG + KK * 4 elements, BO = B + KK * 2 elements,
     TEMP = K - KK = number of steps to accumulate. */
  2016. slwi r0, KK, 2 + BASE_SHIFT
  2017. slwi TEMP, KK, 1 + BASE_SHIFT
  2018. add AO, AORIG, r0
  2019. add BO, B, TEMP
  2020. sub TEMP, K, KK
  2021. LFD f16, 0 * SIZE(AO)
  2022. LFD f17, 1 * SIZE(AO)
  2023. LFD f18, 2 * SIZE(AO)
  2024. LFD f19, 3 * SIZE(AO)
  2025. LFD f20, 0 * SIZE(BO)
  2026. LFD f21, 1 * SIZE(BO)
  2027. LFD f22, 2 * SIZE(BO)
  2028. LFD f23, 3 * SIZE(BO)
  2029. dcbt CO1, PREC
  2030. dcbt CO2, PREC
  2031. srawi. r0, TEMP, 2
  2032. mtspr CTR, r0
  2033. #endif
  /* No full group of four steps: go straight to the remainder. */
  2034. ble LL(45)
  2035. .align 5
  /* LL(43): main K loop of the 4x2 tile, four k steps per pass.
     f0-f3 accumulate A values 0-3 times the first B value of a step,
     f4-f7 the same A values times the second B value. */
  2036. LL(43):
  /* Step 0: B values f20/f21. */
  2037. FMADD f0, f16, f20, f0
  2038. FMADD f1, f17, f20, f1
  2039. FMADD f2, f18, f20, f2
  2040. FMADD f3, f19, f20, f3
  2041. FMADD f4, f16, f21, f4
  2042. FMADD f5, f17, f21, f5
  2043. FMADD f6, f18, f21, f6
  2044. FMADD f7, f19, f21, f7
  2045. LFD f16, 4 * SIZE(AO)
  2046. LFD f17, 5 * SIZE(AO)
  2047. LFD f18, 6 * SIZE(AO)
  2048. LFD f19, 7 * SIZE(AO)
  /* Step 1: B values f22/f23. */
  2049. FMADD f0, f16, f22, f0
  2050. FMADD f1, f17, f22, f1
  2051. FMADD f2, f18, f22, f2
  2052. FMADD f3, f19, f22, f3
  2053. FMADD f4, f16, f23, f4
  2054. FMADD f5, f17, f23, f5
  2055. FMADD f6, f18, f23, f6
  2056. FMADD f7, f19, f23, f7
  2057. LFD f16, 8 * SIZE(AO)
  2058. LFD f17, 9 * SIZE(AO)
  2059. LFD f18, 10 * SIZE(AO)
  2060. LFD f19, 11 * SIZE(AO)
  2061. LFD f20, 4 * SIZE(BO)
  2062. LFD f21, 5 * SIZE(BO)
  2063. LFD f22, 6 * SIZE(BO)
  2064. LFD f23, 7 * SIZE(BO)
  /* Step 2. */
  2065. FMADD f0, f16, f20, f0
  2066. FMADD f1, f17, f20, f1
  2067. FMADD f2, f18, f20, f2
  2068. FMADD f3, f19, f20, f3
  2069. FMADD f4, f16, f21, f4
  2070. FMADD f5, f17, f21, f5
  2071. FMADD f6, f18, f21, f6
  2072. FMADD f7, f19, f21, f7
  2073. LFD f16, 12 * SIZE(AO)
  2074. LFD f17, 13 * SIZE(AO)
  2075. LFD f18, 14 * SIZE(AO)
  2076. LFD f19, 15 * SIZE(AO)
  /* Step 3. */
  2077. FMADD f0, f16, f22, f0
  2078. FMADD f1, f17, f22, f1
  2079. FMADD f2, f18, f22, f2
  2080. FMADD f3, f19, f22, f3
  2081. FMADD f4, f16, f23, f4
  2082. FMADD f5, f17, f23, f5
  2083. FMADD f6, f18, f23, f6
  2084. FMADD f7, f19, f23, f7
  /* Preload operands for the next pass, then advance: 16 A elements and
     8 B elements per pass. */
  2085. LFD f16, 16 * SIZE(AO)
  2086. LFD f17, 17 * SIZE(AO)
  2087. LFD f18, 18 * SIZE(AO)
  2088. LFD f19, 19 * SIZE(AO)
  2089. LFD f20, 8 * SIZE(BO)
  2090. LFD f21, 9 * SIZE(BO)
  2091. LFD f22, 10 * SIZE(BO)
  2092. LFD f23, 11 * SIZE(BO)
  2093. addi AO, AO, 16 * SIZE
  2094. addi BO, BO, 8 * SIZE
  /* Cache-touch hint at AO + PREA (PREA is defined outside this chunk). */
  2095. dcbtst AO, PREA
  2096. bdnz LL(43)
  2097. .align 4
  /* LL(45)/LL(46): K-remainder for the 4x2 tile; the count is the low two
     bits of KK (LT/RN) or TEMP (LN/RT). */
  2098. LL(45):
  2099. #if defined(LT) || defined(RN)
  2100. andi. r0, KK, 3
  2101. #else
  2102. andi. r0, TEMP, 3
  2103. #endif
  2104. mtspr CTR, r0
  /* Nothing left: skip the loop. */
  2105. ble+ LL(48)
  2106. .align 4
  /* One k step per pass: four A values (f16-f19) times two B values
     (f20, f21) accumulated into f0-f7. */
  2107. LL(46):
  2108. FMADD f0, f16, f20, f0
  2109. FMADD f1, f17, f20, f1
  2110. FMADD f2, f18, f20, f2
  2111. FMADD f3, f19, f20, f3
  2112. FMADD f4, f16, f21, f4
  2113. FMADD f5, f17, f21, f5
  2114. FMADD f6, f18, f21, f6
  2115. FMADD f7, f19, f21, f7
  2116. LFD f16, 4 * SIZE(AO)
  2117. LFD f17, 5 * SIZE(AO)
  2118. LFD f18, 6 * SIZE(AO)
  2119. LFD f19, 7 * SIZE(AO)
  2120. LFD f20, 2 * SIZE(BO)
  2121. LFD f21, 3 * SIZE(BO)
  2122. addi BO, BO, 2 * SIZE
  2123. addi AO, AO, 4 * SIZE
  2124. bdnz LL(46)
  2125. .align 4
  /* LL(48): finish one 4x2 tile.
     f0-f3 hold A indices 0-3 for the first B index, f4-f7 for the second.
     Subtracts the accumulators from the packed right-hand side, runs the
     4x4 (LN/LT) or 2x2 (RN/RT) triangular substitution, stores to the packed
     buffer and to C, updates pointers/KK and loops on I. */
  2126. LL(48):
  2127. #if defined(LN) || defined(RT)
  /* Step back onto the diagonal block: 4 steps for LN (tile height),
     2 for RT (panel width). */
  2128. #ifdef LN
  2129. subi r0, KK, 4
  2130. #else
  2131. subi r0, KK, 2
  2132. #endif
  /* AO = AORIG + r0 * 4 elements, BO = B + r0 * 2 elements. */
  2133. slwi TEMP, r0, 2 + BASE_SHIFT
  2134. slwi r0, r0, 1 + BASE_SHIFT
  2135. add AO, AORIG, TEMP
  2136. add BO, B, r0
  2137. #endif
  2138. #if defined(LN) || defined(LT)
  /* Right-hand side from BO: BO[2*i + j] pairs with f(i + 4*j). */
  2139. LFD f16, 0 * SIZE(BO)
  2140. LFD f17, 1 * SIZE(BO)
  2141. LFD f20, 2 * SIZE(BO)
  2142. LFD f21, 3 * SIZE(BO)
  2143. LFD f24, 4 * SIZE(BO)
  2144. LFD f25, 5 * SIZE(BO)
  2145. LFD f28, 6 * SIZE(BO)
  2146. LFD f29, 7 * SIZE(BO)
  2147. FSUB f0, f16, f0
  2148. FSUB f4, f17, f4
  2149. FSUB f1, f20, f1
  2150. FSUB f5, f21, f5
  2151. FSUB f2, f24, f2
  2152. FSUB f6, f25, f6
  2153. FSUB f3, f28, f3
  2154. FSUB f7, f29, f7
  2155. #else
  /* Right-hand side from AO: AO[k] pairs with f(k). */
  2156. LFD f16, 0 * SIZE(AO)
  2157. LFD f17, 1 * SIZE(AO)
  2158. LFD f18, 2 * SIZE(AO)
  2159. LFD f19, 3 * SIZE(AO)
  2160. LFD f20, 4 * SIZE(AO)
  2161. LFD f21, 5 * SIZE(AO)
  2162. LFD f22, 6 * SIZE(AO)
  2163. LFD f23, 7 * SIZE(AO)
  2164. FSUB f0, f16, f0
  2165. FSUB f1, f17, f1
  2166. FSUB f2, f18, f2
  2167. FSUB f3, f19, f3
  2168. FSUB f4, f20, f4
  2169. FSUB f5, f21, f5
  2170. FSUB f6, f22, f6
  2171. FSUB f7, f23, f7
  2172. #endif
  /* Diagonal entries are applied with FMUL in every variant below, so they
     are presumably stored already inverted - confirm in the packing code. */
  2173. #ifdef LN
  /* LN: last A index first, using AO[15], AO[14..12], AO[10..8],
     AO[5..4], AO[0]. */
  2174. LFD f16, 15 * SIZE(AO)
  2175. LFD f17, 14 * SIZE(AO)
  2176. LFD f18, 13 * SIZE(AO)
  2177. LFD f19, 12 * SIZE(AO)
  2178. FMUL f3, f16, f3
  2179. FMUL f7, f16, f7
  2180. FNMSUB f2, f17, f3, f2
  2181. FNMSUB f6, f17, f7, f6
  2182. FNMSUB f1, f18, f3, f1
  2183. FNMSUB f5, f18, f7, f5
  2184. FNMSUB f0, f19, f3, f0
  2185. FNMSUB f4, f19, f7, f4
  2186. LFD f16, 10 * SIZE(AO)
  2187. LFD f17, 9 * SIZE(AO)
  2188. LFD f18, 8 * SIZE(AO)
  2189. LFD f19, 5 * SIZE(AO)
  2190. LFD f20, 4 * SIZE(AO)
  2191. LFD f21, 0 * SIZE(AO)
  2192. FMUL f2, f16, f2
  2193. FMUL f6, f16, f6
  2194. FNMSUB f1, f17, f2, f1
  2195. FNMSUB f5, f17, f6, f5
  2196. FNMSUB f0, f18, f2, f0
  2197. FNMSUB f4, f18, f6, f4
  2198. FMUL f1, f19, f1
  2199. FMUL f5, f19, f5
  2200. FNMSUB f0, f20, f1, f0
  2201. FNMSUB f4, f20, f5, f4
  2202. FMUL f0, f21, f0
  2203. FMUL f4, f21, f4
  2204. #endif
  2205. #ifdef LT
  /* LT: first A index first, using AO[0..3], AO[5..7], AO[10..11],
     AO[15]. */
  2206. LFD f16, 0 * SIZE(AO)
  2207. LFD f17, 1 * SIZE(AO)
  2208. LFD f18, 2 * SIZE(AO)
  2209. LFD f19, 3 * SIZE(AO)
  2210. FMUL f0, f16, f0
  2211. FMUL f4, f16, f4
  2212. FNMSUB f1, f17, f0, f1
  2213. FNMSUB f5, f17, f4, f5
  2214. FNMSUB f2, f18, f0, f2
  2215. FNMSUB f6, f18, f4, f6
  2216. FNMSUB f3, f19, f0, f3
  2217. FNMSUB f7, f19, f4, f7
  2218. LFD f17, 5 * SIZE(AO)
  2219. LFD f18, 6 * SIZE(AO)
  2220. LFD f19, 7 * SIZE(AO)
  2221. FMUL f1, f17, f1
  2222. FMUL f5, f17, f5
  2223. FNMSUB f2, f18, f1, f2
  2224. FNMSUB f6, f18, f5, f6
  2225. FNMSUB f3, f19, f1, f3
  2226. FNMSUB f7, f19, f5, f7
  2227. LFD f18, 10 * SIZE(AO)
  2228. LFD f19, 11 * SIZE(AO)
  2229. FMUL f2, f18, f2
  2230. FMUL f6, f18, f6
  2231. FNMSUB f3, f19, f2, f3
  2232. FNMSUB f7, f19, f6, f7
  2233. LFD f19, 15 * SIZE(AO)
  2234. FMUL f3, f19, f3
  2235. FMUL f7, f19, f7
  2236. #endif
  2237. #ifdef RN
  /* RN: first B index first, using BO[0], BO[1], BO[3]. */
  2238. LFD f16, 0 * SIZE(BO)
  2239. LFD f17, 1 * SIZE(BO)
  2240. LFD f18, 3 * SIZE(BO)
  2241. FMUL f0, f16, f0
  2242. FMUL f1, f16, f1
  2243. FMUL f2, f16, f2
  2244. FMUL f3, f16, f3
  2245. FNMSUB f4, f17, f0, f4
  2246. FNMSUB f5, f17, f1, f5
  2247. FNMSUB f6, f17, f2, f6
  2248. FNMSUB f7, f17, f3, f7
  2249. FMUL f4, f18, f4
  2250. FMUL f5, f18, f5
  2251. FMUL f6, f18, f6
  2252. FMUL f7, f18, f7
  2253. #endif
  2254. #ifdef RT
  /* RT: second B index first, using BO[3], BO[2], BO[0]. */
  2255. LFD f19, 3 * SIZE(BO)
  2256. LFD f20, 2 * SIZE(BO)
  2257. LFD f21, 0 * SIZE(BO)
  2258. FMUL f4, f19, f4
  2259. FMUL f5, f19, f5
  2260. FMUL f6, f19, f6
  2261. FMUL f7, f19, f7
  2262. FNMSUB f0, f20, f4, f0
  2263. FNMSUB f1, f20, f5, f1
  2264. FNMSUB f2, f20, f6, f2
  2265. FNMSUB f3, f20, f7, f3
  2266. FMUL f0, f21, f0
  2267. FMUL f1, f21, f1
  2268. FMUL f2, f21, f2
  2269. FMUL f3, f21, f3
  2270. #endif
  2271. #ifdef LN
  /* LN walks C backwards: step back four elements before storing. */
  2272. subi CO1, CO1, 4 * SIZE
  2273. subi CO2, CO2, 4 * SIZE
  2274. #endif
  /* Write the solution back to the packed buffer it was read from, with
     the same index mapping as the loads above. */
  2275. #if defined(LN) || defined(LT)
  2276. STFD f0, 0 * SIZE(BO)
  2277. STFD f4, 1 * SIZE(BO)
  2278. STFD f1, 2 * SIZE(BO)
  2279. STFD f5, 3 * SIZE(BO)
  2280. STFD f2, 4 * SIZE(BO)
  2281. STFD f6, 5 * SIZE(BO)
  2282. STFD f3, 6 * SIZE(BO)
  2283. STFD f7, 7 * SIZE(BO)
  2284. #else
  2285. STFD f0, 0 * SIZE(AO)
  2286. STFD f1, 1 * SIZE(AO)
  2287. STFD f2, 2 * SIZE(AO)
  2288. STFD f3, 3 * SIZE(AO)
  2289. STFD f4, 4 * SIZE(AO)
  2290. STFD f5, 5 * SIZE(AO)
  2291. STFD f6, 6 * SIZE(AO)
  2292. STFD f7, 7 * SIZE(AO)
  2293. #endif
  /* Store to C: f0-f3 via CO1, f4-f7 via CO2. */
  2294. STFD f0, 0 * SIZE(CO1)
  2295. STFD f1, 1 * SIZE(CO1)
  2296. STFD f2, 2 * SIZE(CO1)
  2297. STFD f3, 3 * SIZE(CO1)
  2298. STFD f4, 0 * SIZE(CO2)
  2299. STFD f5, 1 * SIZE(CO2)
  2300. STFD f6, 2 * SIZE(CO2)
  2301. STFD f7, 3 * SIZE(CO2)
  /* Re-initialise accumulators f0-f7 from FZERO for the next tile. */
  2302. lfs f0, FZERO
  2303. fmr f1, f0
  2304. fmr f2, f0
  2305. fmr f3, f0
  2306. fmr f4, f0
  2307. fmr f5, f0
  2308. fmr f6, f0
  2309. fmr f7, f0
  2310. #ifndef LN
  2311. addi CO1, CO1, 4 * SIZE
  2312. addi CO2, CO2, 4 * SIZE
  2313. #endif
  2314. #ifdef RT
  /* AORIG += K * 4 elements. */
  2315. slwi r0, K, 2 + BASE_SHIFT
  2316. add AORIG, AORIG, r0
  2317. #endif
  2318. #if defined(LT) || defined(RN)
  /* Skip the remaining (K - KK) steps: 4 elements each in A, 2 in B. */
  2319. sub TEMP, K, KK
  2320. slwi r0, TEMP, 2 + BASE_SHIFT
  2321. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2322. add AO, AO, r0
  2323. add BO, BO, TEMP
  2324. #endif
  2325. #ifdef LN
  2326. subi KK, KK, 4
  2327. #endif
  2328. #ifdef LT
  2329. addi KK, KK, 4
  2330. #endif
  /* Next 4x2 tile while the counter I stays positive. */
  2331. addic. I, I, -1
  2332. bgt+ LL(42)
  2333. .align 4
  /* LL(69): end of the 2-wide B panel - move B and KK past it. */
  2334. LL(69):
  2335. #ifdef LN
  /* B += K * 2 elements. */
  2336. slwi r0, K, 1 + BASE_SHIFT
  2337. add B, B, r0
  2338. #endif
  2339. #if defined(LT) || defined(RN)
  /* BO already points just past the data consumed for this panel. */
  2340. mr B, BO
  2341. #endif
  2342. #ifdef RN
  2343. addi KK, KK, 2
  2344. #endif
  2345. #ifdef RT
  2346. subi KK, KK, 2
  2347. #endif
  /* Reload f0 from FZERO before the next panel copies it into its
     accumulators. */
  2348. lfs f0, FZERO
  2349. .align 4
  /* LL(70): handle a final 1-wide panel when bit 0 of N is set; otherwise
     branch to LL(999). */
  2350. LL(70):
  2351. andi. J, N, 1
  2352. ble LL(999)
  2353. #ifdef RT
  /* RT steps backwards: B -= K * 1 element, C -= LDC. */
  2354. slwi r0, K, 0 + BASE_SHIFT
  2355. sub B, B, r0
  2356. sub C, C, LDC
  2357. #endif
  /* CO1 addresses the single C column of this panel. */
  2358. mr CO1, C
  2359. #ifdef LN
  2360. add KK, M, OFFSET
  2361. #endif
  2362. #ifdef LT
  2363. mr KK, OFFSET
  2364. #endif
  /* Copy f0 into accumulators f1-f3 (f0 was last loaded from FZERO). */
  2365. fmr f1, f0
  2366. fmr f2, f0
  2367. fmr f3, f0
  2368. #if defined(LN) || defined(RT)
  2369. mr AORIG, A
  2370. #else
  2371. mr AO, A
  2372. #endif
  2373. #ifndef RT
  /* All variants except RT advance C past this column now. */
  2374. add C, CO1, LDC
  2375. #endif
  2376. .align 4
  /* LL(90): part taken when M is odd (M & 1 set); otherwise control skips to LL(80). */
  /* Accumulates the sum of AO[k] * BO[k] in f0-f3, folds it into f0, applies the */
  /* variant-specific (LN/LT/RN/RT) subtract-and-scale step, and stores one value to CO1. */
  2377. LL(90):
  2378. andi. I, M, 1
  2379. ble LL(80)
  /* LT / RN: the count is KK and BO restarts at B; CTR = KK >> 3. */
  2380. #if defined(LT) || defined(RN)
  2381. LFD f16, 0 * SIZE(AO)
  2382. LFD f17, 1 * SIZE(AO)
  2383. LFD f18, 2 * SIZE(AO)
  2384. LFD f19, 3 * SIZE(AO)
  2385. LFD f20, 0 * SIZE(B)
  2386. LFD f21, 1 * SIZE(B)
  2387. LFD f22, 2 * SIZE(B)
  2388. LFD f23, 3 * SIZE(B)
  2389. srawi. r0, KK, 3
  2390. mtspr CTR, r0
  2391. mr BO, B
  2392. #else
  /* LN only: AORIG -= K << BASE_SHIFT. */
  2393. #ifdef LN
  2394. slwi r0, K, BASE_SHIFT
  2395. sub AORIG, AORIG, r0
  2396. #endif
  /* LN / RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT), */
  /* and the count is TEMP = K - KK; CTR = TEMP >> 3. */
  2397. slwi r0, KK, 0 + BASE_SHIFT
  2398. slwi TEMP, KK, 0 + BASE_SHIFT
  2399. add AO, AORIG, r0
  2400. add BO, B, TEMP
  2401. sub TEMP, K, KK
  2402. LFD f16, 0 * SIZE(AO)
  2403. LFD f17, 1 * SIZE(AO)
  2404. LFD f18, 2 * SIZE(AO)
  2405. LFD f19, 3 * SIZE(AO)
  2406. LFD f20, 0 * SIZE(BO)
  2407. LFD f21, 1 * SIZE(BO)
  2408. LFD f22, 2 * SIZE(BO)
  2409. LFD f23, 3 * SIZE(BO)
  2410. srawi. r0, TEMP, 3
  2411. mtspr CTR, r0
  2412. #endif
  /* The record form of srawi. set the condition: skip the unrolled loop when count >> 3 is not positive. */
  2413. ble LL(95)
  2414. .align 5
  /* LL(92): unrolled loop; each iteration consumes 8 elements from AO and 8 from BO */
  /* as two groups of four multiply-adds spread over f0-f3. The loads at offsets 8-11 */
  /* prepare the values used by the next iteration (or by the remainder loop). */
  2415. LL(92):
  2416. FMADD f0, f16, f20, f0
  2417. FMADD f1, f17, f21, f1
  2418. FMADD f2, f18, f22, f2
  2419. FMADD f3, f19, f23, f3
  2420. LFD f16, 4 * SIZE(AO)
  2421. LFD f17, 5 * SIZE(AO)
  2422. LFD f18, 6 * SIZE(AO)
  2423. LFD f19, 7 * SIZE(AO)
  2424. LFD f20, 4 * SIZE(BO)
  2425. LFD f21, 5 * SIZE(BO)
  2426. LFD f22, 6 * SIZE(BO)
  2427. LFD f23, 7 * SIZE(BO)
  2428. FMADD f0, f16, f20, f0
  2429. FMADD f1, f17, f21, f1
  2430. FMADD f2, f18, f22, f2
  2431. FMADD f3, f19, f23, f3
  2432. LFD f16, 8 * SIZE(AO)
  2433. LFD f17, 9 * SIZE(AO)
  2434. LFD f18, 10 * SIZE(AO)
  2435. LFD f19, 11 * SIZE(AO)
  2436. LFD f20, 8 * SIZE(BO)
  2437. LFD f21, 9 * SIZE(BO)
  2438. LFD f22, 10 * SIZE(BO)
  2439. LFD f23, 11 * SIZE(BO)
  2440. addi AO, AO, 8 * SIZE
  2441. addi BO, BO, 8 * SIZE
  2442. bdnz LL(92)
  2443. .align 4
  /* LL(95): remainder count = (KK or TEMP) & 7; skip to LL(98) when it is zero. */
  2444. LL(95):
  2445. #if defined(LT) || defined(RN)
  2446. andi. r0, KK, 7
  2447. #else
  2448. andi. r0, TEMP, 7
  2449. #endif
  2450. mtspr CTR, r0
  2451. ble+ LL(98)
  2452. .align 4
  /* LL(96): remainder loop, one AO element and one BO element per iteration, accumulated in f0. */
  2453. LL(96):
  2454. FMADD f0, f16, f20, f0
  2455. LFD f16, 1 * SIZE(AO)
  2456. LFD f20, 1 * SIZE(BO)
  2457. addi BO, BO, 1 * SIZE
  2458. addi AO, AO, 1 * SIZE
  2459. bdnz LL(96)
  2460. .align 4
  /* LL(98): fold the four partial sums into f0. */
  2461. LL(98):
  2462. FADD f0, f1, f0
  2463. FADD f2, f3, f2
  2464. FADD f0, f2, f0
  /* LN / RT: r0 = KK - 1 (both preprocessor branches are identical here), then */
  /* AO = AORIG + (r0 << BASE_SHIFT) and BO = B + (r0 << BASE_SHIFT). */
  2465. #if defined(LN) || defined(RT)
  2466. #ifdef LN
  2467. subi r0, KK, 1
  2468. #else
  2469. subi r0, KK, 1
  2470. #endif
  2471. slwi TEMP, r0, 0 + BASE_SHIFT
  2472. slwi r0, r0, 0 + BASE_SHIFT
  2473. add AO, AORIG, TEMP
  2474. add BO, B, r0
  2475. #endif
  /* f0 = (value loaded from BO for LN/LT, from AO for RN/RT) - f0. */
  2476. #if defined(LN) || defined(LT)
  2477. LFD f16, 0 * SIZE(BO)
  2478. FSUB f0, f16, f0
  2479. #else
  2480. LFD f16, 0 * SIZE(AO)
  2481. FSUB f0, f16, f0
  2482. #endif
  /* Scale f0 by one element of AO (LN, LT) or of BO (RN, RT). */
  /* NOTE(review): multiplying instead of dividing suggests the diagonal is stored pre-inverted -- confirm against the packing code. */
  2483. #ifdef LN
  2484. LFD f21, 0 * SIZE(AO)
  2485. FMUL f0, f21, f0
  2486. #endif
  2487. #ifdef LT
  2488. LFD f16, 0 * SIZE(AO)
  2489. FMUL f0, f16, f0
  2490. #endif
  2491. #ifdef RN
  2492. LFD f16, 0 * SIZE(BO)
  2493. FMUL f0, f16, f0
  2494. #endif
  2495. #ifdef RT
  2496. LFD f21, 0 * SIZE(BO)
  2497. FMUL f0, f21, f0
  2498. #endif
  /* LN: CO1 is stepped back by one element before the store. */
  2499. #ifdef LN
  2500. subi CO1, CO1, 1 * SIZE
  2501. #endif
  /* Write the result back to the buffer it was subtracted from (BO or AO) and to CO1. */
  2502. #if defined(LN) || defined(LT)
  2503. STFD f0, 0 * SIZE(BO)
  2504. #else
  2505. STFD f0, 0 * SIZE(AO)
  2506. #endif
  2507. STFD f0, 0 * SIZE(CO1)
  /* Reset the accumulators f0-f3 from FZERO for the next part. */
  2508. lfs f0, FZERO
  2509. fmr f1, f0
  2510. fmr f2, f0
  2511. fmr f3, f0
  /* Every variant except LN: CO1 advances by one element after the store. */
  2512. #ifndef LN
  2513. addi CO1, CO1, 1 * SIZE
  2514. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  2515. #ifdef RT
  2516. slwi r0, K, 0 + BASE_SHIFT
  2517. add AORIG, AORIG, r0
  2518. #endif
  /* LT / RN: advance AO and BO by (K - KK) << BASE_SHIFT each. */
  2519. #if defined(LT) || defined(RN)
  2520. sub TEMP, K, KK
  2521. slwi r0, TEMP, 0 + BASE_SHIFT
  2522. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2523. add AO, AO, r0
  2524. add BO, BO, TEMP
  2525. #endif
  /* KK bookkeeping: LN subtracts 1, LT adds 1. */
  2526. #ifdef LN
  2527. subi KK, KK, 1
  2528. #endif
  2529. #ifdef LT
  2530. addi KK, KK, 1
  2531. #endif
  2532. .align 4
  /* LL(80): part taken when bit 1 of M is set (M & 2); otherwise control skips to LL(71). */
  /* Produces two results (f0, f1), written to CO1[0] and CO1[1]. */
  2533. LL(80):
  2534. andi. I, M, 2
  2535. ble LL(71)
  /* LT / RN: the count is KK and BO restarts at B; CTR = KK >> 2. */
  2536. #if defined(LT) || defined(RN)
  2537. LFD f16, 0 * SIZE(AO)
  2538. LFD f17, 1 * SIZE(AO)
  2539. LFD f18, 2 * SIZE(AO)
  2540. LFD f19, 3 * SIZE(AO)
  2541. LFD f20, 0 * SIZE(B)
  2542. LFD f21, 1 * SIZE(B)
  2543. LFD f22, 2 * SIZE(B)
  2544. LFD f23, 3 * SIZE(B)
  2545. srawi. r0, KK, 2
  2546. mtspr CTR, r0
  2547. mr BO, B
  2548. #else
  /* LN only: AORIG -= K << (1 + BASE_SHIFT). */
  2549. #ifdef LN
  2550. slwi r0, K, 1 + BASE_SHIFT
  2551. sub AORIG, AORIG, r0
  2552. #endif
  /* LN / RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), */
  /* and the count is TEMP = K - KK; CTR = TEMP >> 2. */
  2553. slwi r0, KK, 1 + BASE_SHIFT
  2554. slwi TEMP, KK, 0 + BASE_SHIFT
  2555. add AO, AORIG, r0
  2556. add BO, B, TEMP
  2557. sub TEMP, K, KK
  2558. LFD f16, 0 * SIZE(AO)
  2559. LFD f17, 1 * SIZE(AO)
  2560. LFD f18, 2 * SIZE(AO)
  2561. LFD f19, 3 * SIZE(AO)
  2562. LFD f20, 0 * SIZE(BO)
  2563. LFD f21, 1 * SIZE(BO)
  2564. LFD f22, 2 * SIZE(BO)
  2565. LFD f23, 3 * SIZE(BO)
  2566. srawi. r0, TEMP, 2
  2567. mtspr CTR, r0
  2568. #endif
  /* Skip the unrolled loop when count >> 2 is not positive. */
  2569. ble LL(85)
  2570. .align 5
  /* LL(82): unrolled loop; each iteration consumes 8 elements from AO and 4 from BO. */
  /* f0/f1 take the products with f20 and f22, f2/f3 those with f21 and f23; */
  /* the two pairs are added together at LL(88). */
  2571. LL(82):
  2572. FMADD f0, f16, f20, f0
  2573. FMADD f1, f17, f20, f1
  2574. FMADD f2, f18, f21, f2
  2575. FMADD f3, f19, f21, f3
  2576. LFD f16, 4 * SIZE(AO)
  2577. LFD f17, 5 * SIZE(AO)
  2578. LFD f18, 6 * SIZE(AO)
  2579. LFD f19, 7 * SIZE(AO)
  2580. FMADD f0, f16, f22, f0
  2581. FMADD f1, f17, f22, f1
  2582. FMADD f2, f18, f23, f2
  2583. FMADD f3, f19, f23, f3
  2584. LFD f16, 8 * SIZE(AO)
  2585. LFD f17, 9 * SIZE(AO)
  2586. LFD f18, 10 * SIZE(AO)
  2587. LFD f19, 11 * SIZE(AO)
  2588. LFD f20, 4 * SIZE(BO)
  2589. LFD f21, 5 * SIZE(BO)
  2590. LFD f22, 6 * SIZE(BO)
  2591. LFD f23, 7 * SIZE(BO)
  2592. addi AO, AO, 8 * SIZE
  2593. addi BO, BO, 4 * SIZE
  /* dcbtst: cache touch-for-store hint for the address AO + PREA. */
  2594. dcbtst AO, PREA
  2595. bdnz LL(82)
  2596. .align 4
  /* LL(85): remainder count = (KK or TEMP) & 3; skip to LL(88) when it is zero. */
  2597. LL(85):
  2598. #if defined(LT) || defined(RN)
  2599. andi. r0, KK, 3
  2600. #else
  2601. andi. r0, TEMP, 3
  2602. #endif
  2603. mtspr CTR, r0
  2604. ble+ LL(88)
  2605. .align 4
  /* LL(86): remainder loop, two AO elements and one BO element per iteration. */
  2606. LL(86):
  2607. FMADD f0, f16, f20, f0
  2608. FMADD f1, f17, f20, f1
  2609. LFD f16, 2 * SIZE(AO)
  2610. LFD f17, 3 * SIZE(AO)
  2611. LFD f20, 1 * SIZE(BO)
  2612. addi BO, BO, 1 * SIZE
  2613. addi AO, AO, 2 * SIZE
  2614. bdnz LL(86)
  2615. .align 4
  /* LL(88): merge the partial sums: f0 += f2, f1 += f3. */
  2616. LL(88):
  2617. FADD f0, f2, f0
  2618. FADD f1, f3, f1
  /* LN uses r0 = KK - 2, RT uses r0 = KK - 1; then */
  /* AO = AORIG + (r0 << (1 + BASE_SHIFT)) and BO = B + (r0 << BASE_SHIFT). */
  2619. #if defined(LN) || defined(RT)
  2620. #ifdef LN
  2621. subi r0, KK, 2
  2622. #else
  2623. subi r0, KK, 1
  2624. #endif
  2625. slwi TEMP, r0, 1 + BASE_SHIFT
  2626. slwi r0, r0, 0 + BASE_SHIFT
  2627. add AO, AORIG, TEMP
  2628. add BO, B, r0
  2629. #endif
  /* f0/f1 = (pair loaded from BO for LN/LT, from AO for RN/RT) - accumulated sums. */
  2630. #if defined(LN) || defined(LT)
  2631. LFD f16, 0 * SIZE(BO)
  2632. LFD f20, 1 * SIZE(BO)
  2633. FSUB f0, f16, f0
  2634. FSUB f1, f20, f1
  2635. #else
  2636. LFD f16, 0 * SIZE(AO)
  2637. LFD f17, 1 * SIZE(AO)
  2638. FSUB f0, f16, f0
  2639. FSUB f1, f17, f1
  2640. #endif
  /* LN: f1 *= AO[3]; f0 -= AO[2] * f1; f0 *= AO[0] (assuming FNMSUB maps to PowerPC fnmsub). */
  /* NOTE(review): the multiplies suggest pre-inverted diagonal entries -- confirm against the packing code. */
  2641. #ifdef LN
  2642. LFD f19, 3 * SIZE(AO)
  2643. LFD f20, 2 * SIZE(AO)
  2644. LFD f21, 0 * SIZE(AO)
  2645. FMUL f1, f19, f1
  2646. FNMSUB f0, f20, f1, f0
  2647. FMUL f0, f21, f0
  2648. #endif
  /* LT: f0 *= AO[0]; f1 -= AO[1] * f0; f1 *= AO[3]. */
  2649. #ifdef LT
  2650. LFD f16, 0 * SIZE(AO)
  2651. LFD f17, 1 * SIZE(AO)
  2652. FMUL f0, f16, f0
  2653. FNMSUB f1, f17, f0, f1
  2654. LFD f17, 3 * SIZE(AO)
  2655. FMUL f1, f17, f1
  2656. #endif
  /* RN / RT: both results are scaled by BO[0]. */
  2657. #ifdef RN
  2658. LFD f16, 0 * SIZE(BO)
  2659. FMUL f0, f16, f0
  2660. FMUL f1, f16, f1
  2661. #endif
  2662. #ifdef RT
  2663. LFD f21, 0 * SIZE(BO)
  2664. FMUL f0, f21, f0
  2665. FMUL f1, f21, f1
  2666. #endif
  /* LN: CO1 is stepped back by two elements before the stores. */
  2667. #ifdef LN
  2668. subi CO1, CO1, 2 * SIZE
  2669. #endif
  /* Write the results back to the buffer they were subtracted from (BO or AO) and to CO1. */
  2670. #if defined(LN) || defined(LT)
  2671. STFD f0, 0 * SIZE(BO)
  2672. STFD f1, 1 * SIZE(BO)
  2673. #else
  2674. STFD f0, 0 * SIZE(AO)
  2675. STFD f1, 1 * SIZE(AO)
  2676. #endif
  2677. STFD f0, 0 * SIZE(CO1)
  2678. STFD f1, 1 * SIZE(CO1)
  /* Reset the accumulators f0-f3 from FZERO for the next part. */
  2679. lfs f0, FZERO
  2680. fmr f1, f0
  2681. fmr f2, f0
  2682. fmr f3, f0
  /* Every variant except LN: CO1 advances by two elements after the stores. */
  2683. #ifndef LN
  2684. addi CO1, CO1, 2 * SIZE
  2685. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2686. #ifdef RT
  2687. slwi r0, K, 1 + BASE_SHIFT
  2688. add AORIG, AORIG, r0
  2689. #endif
  /* LT / RN: AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
  2690. #if defined(LT) || defined(RN)
  2691. sub TEMP, K, KK
  2692. slwi r0, TEMP, 1 + BASE_SHIFT
  2693. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2694. add AO, AO, r0
  2695. add BO, BO, TEMP
  2696. #endif
  /* KK bookkeeping: LN subtracts 2, LT adds 2. */
  2697. #ifdef LN
  2698. subi KK, KK, 2
  2699. #endif
  2700. #ifdef LT
  2701. addi KK, KK, 2
  2702. #endif
  2703. .align 4
  /* LL(71): I = M >> 2; when that is not positive control goes to LL(999). */
  /* The body starting at LL(72) repeats I times (addic. / bgt+ at the bottom) and */
  /* produces four results (f0-f3) per pass, written to CO1[0..3]. */
  2704. LL(71):
  2705. srawi. I, M, 2
  2706. ble LL(999)
  2707. .align 4
  2708. LL(72):
  /* LT / RN: the count is KK and BO restarts at B; CTR = KK >> 2. */
  2709. #if defined(LT) || defined(RN)
  2710. LFD f16, 0 * SIZE(AO)
  2711. LFD f17, 1 * SIZE(AO)
  2712. LFD f18, 2 * SIZE(AO)
  2713. LFD f19, 3 * SIZE(AO)
  2714. LFD f20, 0 * SIZE(B)
  2715. LFD f21, 1 * SIZE(B)
  2716. LFD f22, 2 * SIZE(B)
  2717. LFD f23, 3 * SIZE(B)
  /* dcbt: cache touch hint for the address CO1 + PREC. */
  2718. dcbt CO1, PREC
  2719. srawi. r0, KK, 2
  2720. mtspr CTR, r0
  2721. mr BO, B
  2722. #else
  /* LN only: AORIG -= K << (2 + BASE_SHIFT). */
  2723. #ifdef LN
  2724. slwi r0, K, 2 + BASE_SHIFT
  2725. sub AORIG, AORIG, r0
  2726. #endif
  /* LN / RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), */
  /* and the count is TEMP = K - KK; CTR = TEMP >> 2. */
  2727. slwi r0, KK, 2 + BASE_SHIFT
  2728. slwi TEMP, KK, 0 + BASE_SHIFT
  2729. add AO, AORIG, r0
  2730. add BO, B, TEMP
  2731. sub TEMP, K, KK
  2732. LFD f16, 0 * SIZE(AO)
  2733. LFD f17, 1 * SIZE(AO)
  2734. LFD f18, 2 * SIZE(AO)
  2735. LFD f19, 3 * SIZE(AO)
  2736. LFD f20, 0 * SIZE(BO)
  2737. LFD f21, 1 * SIZE(BO)
  2738. LFD f22, 2 * SIZE(BO)
  2739. LFD f23, 3 * SIZE(BO)
  2740. dcbt CO1, PREC
  2741. srawi. r0, TEMP, 2
  2742. mtspr CTR, r0
  2743. #endif
  /* Skip the unrolled loop when count >> 2 is not positive. */
  2744. ble LL(75)
  2745. .align 5
  /* LL(73): unrolled loop; each iteration consumes 16 elements from AO and 4 from BO. */
  /* In every group f0-f3 accumulate four consecutive AO elements times one shared */
  /* BO element (f20, then f21, f22, f23). */
  2746. LL(73):
  2747. FMADD f0, f16, f20, f0
  2748. FMADD f1, f17, f20, f1
  2749. FMADD f2, f18, f20, f2
  2750. FMADD f3, f19, f20, f3
  2751. LFD f16, 4 * SIZE(AO)
  2752. LFD f17, 5 * SIZE(AO)
  2753. LFD f18, 6 * SIZE(AO)
  2754. LFD f19, 7 * SIZE(AO)
  2755. FMADD f0, f16, f21, f0
  2756. FMADD f1, f17, f21, f1
  2757. FMADD f2, f18, f21, f2
  2758. FMADD f3, f19, f21, f3
  2759. LFD f16, 8 * SIZE(AO)
  2760. LFD f17, 9 * SIZE(AO)
  2761. LFD f18, 10 * SIZE(AO)
  2762. LFD f19, 11 * SIZE(AO)
  2763. FMADD f0, f16, f22, f0
  2764. FMADD f1, f17, f22, f1
  2765. FMADD f2, f18, f22, f2
  2766. FMADD f3, f19, f22, f3
  2767. LFD f16, 12 * SIZE(AO)
  2768. LFD f17, 13 * SIZE(AO)
  2769. LFD f18, 14 * SIZE(AO)
  2770. LFD f19, 15 * SIZE(AO)
  2771. FMADD f0, f16, f23, f0
  2772. FMADD f1, f17, f23, f1
  2773. FMADD f2, f18, f23, f2
  2774. FMADD f3, f19, f23, f3
  2775. LFD f16, 16 * SIZE(AO)
  2776. LFD f17, 17 * SIZE(AO)
  2777. LFD f18, 18 * SIZE(AO)
  2778. LFD f19, 19 * SIZE(AO)
  2779. LFD f20, 4 * SIZE(BO)
  2780. LFD f21, 5 * SIZE(BO)
  2781. LFD f22, 6 * SIZE(BO)
  2782. LFD f23, 7 * SIZE(BO)
  2783. addi AO, AO, 16 * SIZE
  2784. addi BO, BO, 4 * SIZE
  /* dcbtst: cache touch-for-store hint for the address AO + PREA. */
  2785. dcbtst AO, PREA
  2786. bdnz LL(73)
  2787. .align 4
  /* LL(75): remainder count = (KK or TEMP) & 3; skip to LL(78) when it is zero. */
  2788. LL(75):
  2789. #if defined(LT) || defined(RN)
  2790. andi. r0, KK, 3
  2791. #else
  2792. andi. r0, TEMP, 3
  2793. #endif
  2794. mtspr CTR, r0
  2795. ble+ LL(78)
  2796. .align 4
  /* LL(76): remainder loop, four AO elements and one BO element per iteration. */
  2797. LL(76):
  2798. FMADD f0, f16, f20, f0
  2799. FMADD f1, f17, f20, f1
  2800. FMADD f2, f18, f20, f2
  2801. FMADD f3, f19, f20, f3
  2802. LFD f16, 4 * SIZE(AO)
  2803. LFD f17, 5 * SIZE(AO)
  2804. LFD f18, 6 * SIZE(AO)
  2805. LFD f19, 7 * SIZE(AO)
  2806. LFD f20, 1 * SIZE(BO)
  2807. addi BO, BO, 1 * SIZE
  2808. addi AO, AO, 4 * SIZE
  2809. bdnz LL(76)
  2810. .align 4
  /* LL(78): solve step. LN uses r0 = KK - 4, RT uses r0 = KK - 1; then */
  /* AO = AORIG + (r0 << (2 + BASE_SHIFT)) and BO = B + (r0 << BASE_SHIFT). */
  2811. LL(78):
  2812. #if defined(LN) || defined(RT)
  2813. #ifdef LN
  2814. subi r0, KK, 4
  2815. #else
  2816. subi r0, KK, 1
  2817. #endif
  2818. slwi TEMP, r0, 2 + BASE_SHIFT
  2819. slwi r0, r0, 0 + BASE_SHIFT
  2820. add AO, AORIG, TEMP
  2821. add BO, B, r0
  2822. #endif
  /* f0-f3 = (four values loaded from BO for LN/LT, from AO for RN/RT) - accumulated sums. */
  2823. #if defined(LN) || defined(LT)
  2824. LFD f16, 0 * SIZE(BO)
  2825. LFD f20, 1 * SIZE(BO)
  2826. LFD f24, 2 * SIZE(BO)
  2827. LFD f28, 3 * SIZE(BO)
  2828. FSUB f0, f16, f0
  2829. FSUB f1, f20, f1
  2830. FSUB f2, f24, f2
  2831. FSUB f3, f28, f3
  2832. #else
  2833. LFD f16, 0 * SIZE(AO)
  2834. LFD f17, 1 * SIZE(AO)
  2835. LFD f18, 2 * SIZE(AO)
  2836. LFD f19, 3 * SIZE(AO)
  2837. FSUB f0, f16, f0
  2838. FSUB f1, f17, f1
  2839. FSUB f2, f18, f2
  2840. FSUB f3, f19, f3
  2841. #endif
  /* LN: substitution from f3 down to f0, using AO[15], AO[14..12], AO[10], AO[9..8], AO[5], AO[4], AO[0]. */
  /* Each result is scaled by an AO element and then eliminated from the remaining ones with FNMSUB. */
  /* NOTE(review): the multiplies suggest pre-inverted diagonal entries -- confirm against the packing code. */
  2842. #ifdef LN
  2843. LFD f16, 15 * SIZE(AO)
  2844. LFD f17, 14 * SIZE(AO)
  2845. LFD f18, 13 * SIZE(AO)
  2846. LFD f19, 12 * SIZE(AO)
  2847. FMUL f3, f16, f3
  2848. FNMSUB f2, f17, f3, f2
  2849. FNMSUB f1, f18, f3, f1
  2850. FNMSUB f0, f19, f3, f0
  2851. LFD f16, 10 * SIZE(AO)
  2852. LFD f17, 9 * SIZE(AO)
  2853. LFD f18, 8 * SIZE(AO)
  2854. LFD f19, 5 * SIZE(AO)
  2855. LFD f20, 4 * SIZE(AO)
  2856. LFD f21, 0 * SIZE(AO)
  2857. FMUL f2, f16, f2
  2858. FNMSUB f1, f17, f2, f1
  2859. FNMSUB f0, f18, f2, f0
  2860. FMUL f1, f19, f1
  2861. FNMSUB f0, f20, f1, f0
  2862. FMUL f0, f21, f0
  2863. #endif
  /* LT: substitution from f0 up to f3, using AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  2864. #ifdef LT
  2865. LFD f16, 0 * SIZE(AO)
  2866. LFD f17, 1 * SIZE(AO)
  2867. LFD f18, 2 * SIZE(AO)
  2868. LFD f19, 3 * SIZE(AO)
  2869. FMUL f0, f16, f0
  2870. FNMSUB f1, f17, f0, f1
  2871. FNMSUB f2, f18, f0, f2
  2872. FNMSUB f3, f19, f0, f3
  2873. LFD f17, 5 * SIZE(AO)
  2874. LFD f18, 6 * SIZE(AO)
  2875. LFD f19, 7 * SIZE(AO)
  2876. FMUL f1, f17, f1
  2877. FNMSUB f2, f18, f1, f2
  2878. FNMSUB f3, f19, f1, f3
  2879. LFD f18, 10 * SIZE(AO)
  2880. LFD f19, 11 * SIZE(AO)
  2881. FMUL f2, f18, f2
  2882. FNMSUB f3, f19, f2, f3
  2883. LFD f19, 15 * SIZE(AO)
  2884. FMUL f3, f19, f3
  2885. #endif
  /* RN / RT: all four results are scaled by BO[0]. */
  2886. #ifdef RN
  2887. LFD f16, 0 * SIZE(BO)
  2888. FMUL f0, f16, f0
  2889. FMUL f1, f16, f1
  2890. FMUL f2, f16, f2
  2891. FMUL f3, f16, f3
  2892. #endif
  2893. #ifdef RT
  2894. LFD f21, 0 * SIZE(BO)
  2895. FMUL f0, f21, f0
  2896. FMUL f1, f21, f1
  2897. FMUL f2, f21, f2
  2898. FMUL f3, f21, f3
  2899. #endif
  /* LN: CO1 is stepped back by four elements before the stores. */
  2900. #ifdef LN
  2901. subi CO1, CO1, 4 * SIZE
  2902. #endif
  /* Write the results back to the buffer they were subtracted from (BO or AO) and to CO1. */
  2903. #if defined(LN) || defined(LT)
  2904. STFD f0, 0 * SIZE(BO)
  2905. STFD f1, 1 * SIZE(BO)
  2906. STFD f2, 2 * SIZE(BO)
  2907. STFD f3, 3 * SIZE(BO)
  2908. #else
  2909. STFD f0, 0 * SIZE(AO)
  2910. STFD f1, 1 * SIZE(AO)
  2911. STFD f2, 2 * SIZE(AO)
  2912. STFD f3, 3 * SIZE(AO)
  2913. #endif
  2914. STFD f0, 0 * SIZE(CO1)
  2915. STFD f1, 1 * SIZE(CO1)
  2916. STFD f2, 2 * SIZE(CO1)
  2917. STFD f3, 3 * SIZE(CO1)
  /* Reset the accumulators f0-f3 from FZERO for the next pass. */
  2918. lfs f0, FZERO
  2919. fmr f1, f0
  2920. fmr f2, f0
  2921. fmr f3, f0
  /* Every variant except LN: CO1 advances by four elements after the stores. */
  2922. #ifndef LN
  2923. addi CO1, CO1, 4 * SIZE
  2924. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  2925. #ifdef RT
  2926. slwi r0, K, 2 + BASE_SHIFT
  2927. add AORIG, AORIG, r0
  2928. #endif
  /* LT / RN: AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT. */
  2929. #if defined(LT) || defined(RN)
  2930. sub TEMP, K, KK
  2931. slwi r0, TEMP, 2 + BASE_SHIFT
  2932. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2933. add AO, AO, r0
  2934. add BO, BO, TEMP
  2935. #endif
  /* KK bookkeeping: LN subtracts 4, LT adds 4. */
  2936. #ifdef LN
  2937. subi KK, KK, 4
  2938. #endif
  2939. #ifdef LT
  2940. addi KK, KK, 4
  2941. #endif
  /* Decrement I and repeat from LL(72) while it stays positive. */
  2942. addic. I, I, -1
  2943. bgt+ LL(72)
  2944. .align 4
  /* LL(999): common exit. Sets r3 to 0, reloads f14-f31 and r18-r31 from the frame at SP, */
  /* releases STACKSIZE bytes of stack and returns with blr. */
  /* NOTE(review): the matching saves are in the prologue, which is outside this excerpt -- */
  /* the offsets below must mirror it. */
  2945. LL(999):
  /* addi with a zero base operand yields the immediate itself, so r3 = 0. */
  2946. addi r3, 0, 0
  /* Floating-point registers f14-f31: 8-byte slots at SP + 0 .. SP + 136. */
  2947. lfd f14, 0(SP)
  2948. lfd f15, 8(SP)
  2949. lfd f16, 16(SP)
  2950. lfd f17, 24(SP)
  2951. lfd f18, 32(SP)
  2952. lfd f19, 40(SP)
  2953. lfd f20, 48(SP)
  2954. lfd f21, 56(SP)
  2955. lfd f22, 64(SP)
  2956. lfd f23, 72(SP)
  2957. lfd f24, 80(SP)
  2958. lfd f25, 88(SP)
  2959. lfd f26, 96(SP)
  2960. lfd f27, 104(SP)
  2961. lfd f28, 112(SP)
  2962. lfd f29, 120(SP)
  2963. lfd f30, 128(SP)
  2964. lfd f31, 136(SP)
  /* Integer registers r31 down to r18, starting at SP + 144: */
  /* 8-byte slots loaded with ld when __64BIT__ is defined, 4-byte slots with lwz otherwise. */
  2965. #ifdef __64BIT__
  2966. ld r31, 144(SP)
  2967. ld r30, 152(SP)
  2968. ld r29, 160(SP)
  2969. ld r28, 168(SP)
  2970. ld r27, 176(SP)
  2971. ld r26, 184(SP)
  2972. ld r25, 192(SP)
  2973. ld r24, 200(SP)
  2974. ld r23, 208(SP)
  2975. ld r22, 216(SP)
  2976. ld r21, 224(SP)
  2977. ld r20, 232(SP)
  2978. ld r19, 240(SP)
  2979. ld r18, 248(SP)
  2980. #else
  2981. lwz r31, 144(SP)
  2982. lwz r30, 148(SP)
  2983. lwz r29, 152(SP)
  2984. lwz r28, 156(SP)
  2985. lwz r27, 160(SP)
  2986. lwz r26, 164(SP)
  2987. lwz r25, 168(SP)
  2988. lwz r24, 172(SP)
  2989. lwz r23, 176(SP)
  2990. lwz r22, 180(SP)
  2991. lwz r21, 184(SP)
  2992. lwz r20, 188(SP)
  2993. lwz r19, 192(SP)
  2994. lwz r18, 196(SP)
  2995. #endif
  /* Pop the frame and return to the caller. */
  2996. addi SP, SP, STACKSIZE
  2997. blr
  2998. EPILOGUE
  2999. #endif