You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_power6_LT.S 63 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects that header's assembler-side definitions -- confirm in common.h */
  39. #include "common.h"
  40. #ifndef __64BIT__ /* LOAD: word-sized load mnemonic -- lwz without __64BIT__, ld with it */
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  45. #ifdef __64BIT__ /* frame layout: STACKSIZE is subtracted from SP on entry; ALPHA and FZERO are SP-relative slots in that frame */
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP) /* the prologue stores a zero word here; lfs reads it back to clear the accumulators */
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP) /* same role as the 64-bit slot, at the 32-bit frame offset */
  53. #endif
  54. #define M r3 /* M, N and K are each compared with 0 after the prologue; the routine branches to LL(999) unless all are > 0 */
  55. #define N r4 /* presumably the BLAS problem dimensions together with M and K -- confirm against the caller */
  56. #define K r5
  57. #if defined(linux) || defined(__FreeBSD__) /* registers in which A, B, C, LDC and OFFSET are expected on Linux / FreeBSD */
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6 /* on this path the prologue loads OFFSET from FRAMESLOT(0) + STACKSIZE(SP) */
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__) /* same aliases for AIX / Apple; OFFSET is always loaded from the stack in the prologue */
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7 /* loaded by the prologue from FRAMESLOT(0) + STACKSIZE(SP) */
  78. #define OFFSET r6 /* loaded by the prologue from FRAMESLOT(1) + STACKSIZE(SP) */
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6 /* loaded by the prologue from FRAMESLOT(0) + STACKSIZE(SP): ld on 64-bit, lwz on 32-bit */
  85. #endif
  86. #endif
  87. #define AORIG r18 /* r18-r31 aliases: the prologue saves every one of these registers in the frame before they are used */
  88. #define TEMP r19 /* scratch for offset and remaining-count arithmetic */
  89. #define BB r20
  90. #define KK r21 /* running offset counter derived from OFFSET; how it is initialised and stepped depends on LN / LT / RN / RT */
  91. #define I r22 /* row-block counter, set to M >> 2 for the 4-wide blocks */
  92. #define J r23 /* column-block counter, set to N >> 2 */
  93. #define AO r24 /* pointer advanced through A inside the multiply loops */
  94. #define BO r25 /* pointer advanced through B inside the multiply loops */
  95. #define CO1 r26 /* CO1..CO4 = C, C + LDC, C + 2*LDC, C + 3*LDC for the current column block */
  96. #define CO2 r27
  97. #define CO3 r28
  98. #define CO4 r29
  99. #define PREA r30 /* byte offset 16 * 3 * SIZE used as the index operand of dcbt / dcbtst on AO and BO */
  100. #define PREC r31 /* byte offset 4 * SIZE used as the index operand of dcbtst on CO1..CO4 */
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. std r18, 248(SP)
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. stw r20, 188(SP)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. stw r0, FZERO
  156. #if defined(_AIX) || defined(__APPLE__)
  157. #if !defined(__64BIT__) && defined(DOUBLE)
  158. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  159. #endif
  160. #endif
  161. slwi LDC, LDC, BASE_SHIFT
  162. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  163. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #if defined(_AIX) || defined(__APPLE__)
  166. #ifdef __64BIT__
  167. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  168. #else
  169. #ifdef DOUBLE
  170. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  171. #else
  172. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  173. #endif
  174. #endif
  175. #endif
  176. #ifdef LN
  177. mullw r0, M, K
  178. slwi r0, r0, BASE_SHIFT
  179. add A, A, r0
  180. slwi r0, M, BASE_SHIFT
  181. add C, C, r0
  182. #endif
  183. #ifdef RN
  184. neg KK, OFFSET
  185. #endif
  186. #ifdef RT
  187. mullw r0, N, K
  188. slwi r0, r0, BASE_SHIFT
  189. add B, B, r0
  190. mullw r0, N, LDC
  191. add C, C, r0
  192. sub KK, N, OFFSET
  193. #endif
  194. cmpwi cr0, M, 0
  195. ble LL(999)
  196. cmpwi cr0, N, 0
  197. ble LL(999)
  198. cmpwi cr0, K, 0
  199. ble LL(999)
  200. li PREA, (16 * 3 * SIZE)
  201. li PREC, 4 * SIZE
  202. lfs f0, FZERO
  203. srawi. J, N, 2
  204. ble LL(40)
  205. .align 4
  206. LL(10):
  207. #ifdef RT
  208. slwi r0, K, 2 + BASE_SHIFT
  209. sub B, B, r0
  210. slwi r0, LDC, 2
  211. sub C, C, r0
  212. #endif
  213. mr CO1, C
  214. add CO2, C, LDC
  215. add CO3, CO2, LDC
  216. add CO4, CO3, LDC
  217. #ifdef LN
  218. add KK, M, OFFSET
  219. #endif
  220. #ifdef LT
  221. mr KK, OFFSET
  222. #endif
  223. fmr f1, f0
  224. fmr f2, f0
  225. fmr f3, f0
  226. fmr f4, f0
  227. fmr f5, f0
  228. fmr f6, f0
  229. fmr f7, f0
  230. fmr f8, f0
  231. fmr f9, f0
  232. fmr f10, f0
  233. fmr f11, f0
  234. fmr f12, f0
  235. fmr f13, f0
  236. fmr f14, f0
  237. fmr f15, f0
  238. srawi. I, M, 2
  239. #if defined(LN) || defined(RT)
  240. mr AORIG, A
  241. #else
  242. mr AO, A
  243. #endif
  244. #ifndef RT
  245. add C, CO4, LDC
  246. #endif
  247. ble LL(20)
  248. .align 4
  249. LL(11):
  250. #if defined(LT) || defined(RN)
  251. LFD f16, 0 * SIZE(AO)
  252. LFD f17, 1 * SIZE(AO)
  253. LFD f18, 2 * SIZE(AO)
  254. LFD f19, 3 * SIZE(AO)
  255. LFD f20, 0 * SIZE(B)
  256. LFD f21, 1 * SIZE(B)
  257. LFD f22, 2 * SIZE(B)
  258. LFD f23, 3 * SIZE(B)
  259. dcbtst CO1, PREC
  260. dcbtst CO2, PREC
  261. dcbtst CO3, PREC
  262. dcbtst CO4, PREC
  263. srawi. r0, KK, 3
  264. mtspr CTR, r0
  265. mr BO, B
  266. #else
  267. #ifdef LN
  268. slwi r0, K, 2 + BASE_SHIFT
  269. sub AORIG, AORIG, r0
  270. #endif
  271. slwi TEMP, KK, 2 + BASE_SHIFT
  272. add AO, AORIG, TEMP
  273. add BO, B, TEMP
  274. sub TEMP, K, KK
  275. LFD f16, 0 * SIZE(AO)
  276. LFD f17, 1 * SIZE(AO)
  277. LFD f18, 2 * SIZE(AO)
  278. LFD f19, 3 * SIZE(AO)
  279. LFD f20, 0 * SIZE(BO)
  280. LFD f21, 1 * SIZE(BO)
  281. LFD f22, 2 * SIZE(BO)
  282. LFD f23, 3 * SIZE(BO)
  283. dcbtst CO1, PREC
  284. dcbtst CO2, PREC
  285. dcbtst CO3, PREC
  286. dcbtst CO4, PREC
  287. srawi. r0, TEMP, 3
  288. mtspr CTR, r0
  289. #endif
  290. ble LL(15)
  291. .align 4
  292. LL(12):
  293. dcbt AO, PREA
  294. dcbtst BO, PREA
  295. FMADD f0, f16, f20, f0
  296. FMADD f4, f16, f21, f4
  297. FMADD f8, f16, f22, f8
  298. FMADD f12, f16, f23, f12
  299. LFD f24, 4 * SIZE(AO)
  300. LFD f28, 4 * SIZE(BO)
  301. LFD f25, 5 * SIZE(AO)
  302. LFD f29, 5 * SIZE(BO)
  303. FMADD f1, f17, f20, f1
  304. FMADD f5, f17, f21, f5
  305. FMADD f9, f17, f22, f9
  306. FMADD f13, f17, f23, f13
  307. FMADD f2, f18, f20, f2
  308. FMADD f6, f18, f21, f6
  309. FMADD f10, f18, f22, f10
  310. FMADD f14, f18, f23, f14
  311. LFD f26, 6 * SIZE(AO)
  312. LFD f30, 6 * SIZE(BO)
  313. LFD f27, 7 * SIZE(AO)
  314. LFD f31, 7 * SIZE(BO)
  315. FMADD f3, f19, f20, f3
  316. FMADD f7, f19, f21, f7
  317. FMADD f11, f19, f22, f11
  318. FMADD f15, f19, f23, f15
  319. FMADD f0, f24, f28, f0
  320. FMADD f4, f24, f29, f4
  321. FMADD f8, f24, f30, f8
  322. FMADD f12, f24, f31, f12
  323. LFD f16, 8 * SIZE(AO)
  324. LFD f20, 8 * SIZE(BO)
  325. LFD f17, 9 * SIZE(AO)
  326. LFD f21, 9 * SIZE(BO)
  327. FMADD f1, f25, f28, f1
  328. FMADD f5, f25, f29, f5
  329. FMADD f9, f25, f30, f9
  330. FMADD f13, f25, f31, f13
  331. FMADD f2, f26, f28, f2
  332. FMADD f6, f26, f29, f6
  333. FMADD f10, f26, f30, f10
  334. FMADD f14, f26, f31, f14
  335. LFD f18, 10 * SIZE(AO)
  336. LFD f22, 10 * SIZE(BO)
  337. LFD f19, 11 * SIZE(AO)
  338. LFD f23, 11 * SIZE(BO)
  339. FMADD f3, f27, f28, f3
  340. FMADD f7, f27, f29, f7
  341. FMADD f11, f27, f30, f11
  342. FMADD f15, f27, f31, f15
  343. FMADD f0, f16, f20, f0
  344. FMADD f4, f16, f21, f4
  345. FMADD f8, f16, f22, f8
  346. FMADD f12, f16, f23, f12
  347. LFD f24, 12 * SIZE(AO)
  348. LFD f28, 12 * SIZE(BO)
  349. LFD f25, 13 * SIZE(AO)
  350. LFD f29, 13 * SIZE(BO)
  351. FMADD f1, f17, f20, f1
  352. FMADD f5, f17, f21, f5
  353. FMADD f9, f17, f22, f9
  354. FMADD f13, f17, f23, f13
  355. FMADD f2, f18, f20, f2
  356. FMADD f6, f18, f21, f6
  357. FMADD f10, f18, f22, f10
  358. FMADD f14, f18, f23, f14
  359. LFD f26, 14 * SIZE(AO)
  360. LFD f30, 14 * SIZE(BO)
  361. LFD f27, 15 * SIZE(AO)
  362. LFD f31, 15 * SIZE(BO)
  363. FMADD f3, f19, f20, f3
  364. FMADD f7, f19, f21, f7
  365. FMADD f11, f19, f22, f11
  366. FMADD f15, f19, f23, f15
  367. FMADD f0, f24, f28, f0
  368. FMADD f4, f24, f29, f4
  369. FMADD f8, f24, f30, f8
  370. FMADD f12, f24, f31, f12
  371. LFD f16, 16 * SIZE(AO)
  372. LFD f20, 16 * SIZE(BO)
  373. LFD f17, 17 * SIZE(AO)
  374. LFD f21, 17 * SIZE(BO)
  375. FMADD f1, f25, f28, f1
  376. FMADD f5, f25, f29, f5
  377. FMADD f9, f25, f30, f9
  378. FMADD f13, f25, f31, f13
  379. FMADD f2, f26, f28, f2
  380. FMADD f6, f26, f29, f6
  381. FMADD f10, f26, f30, f10
  382. FMADD f14, f26, f31, f14
  383. LFD f18, 18 * SIZE(AO)
  384. LFD f22, 18 * SIZE(BO)
  385. LFD f19, 19 * SIZE(AO)
  386. LFD f23, 19 * SIZE(BO)
  387. FMADD f3, f27, f28, f3
  388. FMADD f7, f27, f29, f7
  389. FMADD f11, f27, f30, f11
  390. FMADD f15, f27, f31, f15
  391. FMADD f0, f16, f20, f0
  392. FMADD f4, f16, f21, f4
  393. FMADD f8, f16, f22, f8
  394. FMADD f12, f16, f23, f12
  395. LFD f24, 20 * SIZE(AO)
  396. LFD f28, 20 * SIZE(BO)
  397. LFD f25, 21 * SIZE(AO)
  398. LFD f29, 21 * SIZE(BO)
  399. FMADD f1, f17, f20, f1
  400. FMADD f5, f17, f21, f5
  401. FMADD f9, f17, f22, f9
  402. FMADD f13, f17, f23, f13
  403. FMADD f2, f18, f20, f2
  404. FMADD f6, f18, f21, f6
  405. FMADD f10, f18, f22, f10
  406. FMADD f14, f18, f23, f14
  407. LFD f26, 22 * SIZE(AO)
  408. LFD f30, 22 * SIZE(BO)
  409. LFD f27, 23 * SIZE(AO)
  410. LFD f31, 23 * SIZE(BO)
  411. FMADD f3, f19, f20, f3
  412. FMADD f7, f19, f21, f7
  413. FMADD f11, f19, f22, f11
  414. FMADD f15, f19, f23, f15
  415. FMADD f0, f24, f28, f0
  416. FMADD f4, f24, f29, f4
  417. FMADD f8, f24, f30, f8
  418. FMADD f12, f24, f31, f12
  419. LFD f16, 24 * SIZE(AO)
  420. LFD f20, 24 * SIZE(BO)
  421. LFD f17, 25 * SIZE(AO)
  422. LFD f21, 25 * SIZE(BO)
  423. FMADD f1, f25, f28, f1
  424. FMADD f5, f25, f29, f5
  425. FMADD f9, f25, f30, f9
  426. FMADD f13, f25, f31, f13
  427. FMADD f2, f26, f28, f2
  428. FMADD f6, f26, f29, f6
  429. FMADD f10, f26, f30, f10
  430. FMADD f14, f26, f31, f14
  431. LFD f18, 26 * SIZE(AO)
  432. LFD f22, 26 * SIZE(BO)
  433. LFD f19, 27 * SIZE(AO)
  434. LFD f23, 27 * SIZE(BO)
  435. FMADD f3, f27, f28, f3
  436. FMADD f7, f27, f29, f7
  437. FMADD f11, f27, f30, f11
  438. FMADD f15, f27, f31, f15
  439. FMADD f0, f16, f20, f0
  440. FMADD f4, f16, f21, f4
  441. FMADD f8, f16, f22, f8
  442. FMADD f12, f16, f23, f12
  443. LFD f24, 28 * SIZE(AO)
  444. LFD f28, 28 * SIZE(BO)
  445. LFD f25, 29 * SIZE(AO)
  446. LFD f29, 29 * SIZE(BO)
  447. FMADD f1, f17, f20, f1
  448. FMADD f5, f17, f21, f5
  449. FMADD f9, f17, f22, f9
  450. FMADD f13, f17, f23, f13
  451. FMADD f2, f18, f20, f2
  452. FMADD f6, f18, f21, f6
  453. FMADD f10, f18, f22, f10
  454. FMADD f14, f18, f23, f14
  455. LFD f26, 30 * SIZE(AO)
  456. LFD f30, 30 * SIZE(BO)
  457. LFD f27, 31 * SIZE(AO)
  458. LFD f31, 31 * SIZE(BO)
  459. FMADD f3, f19, f20, f3
  460. FMADD f7, f19, f21, f7
  461. FMADD f11, f19, f22, f11
  462. FMADD f15, f19, f23, f15
  463. FMADD f0, f24, f28, f0
  464. FMADD f4, f24, f29, f4
  465. FMADD f8, f24, f30, f8
  466. FMADD f12, f24, f31, f12
  467. LFD f16, 32 * SIZE(AO)
  468. LFD f20, 32 * SIZE(BO)
  469. LFD f17, 33 * SIZE(AO)
  470. LFD f21, 33 * SIZE(BO)
  471. FMADD f1, f25, f28, f1
  472. FMADD f5, f25, f29, f5
  473. FMADD f9, f25, f30, f9
  474. FMADD f13, f25, f31, f13
  475. FMADD f2, f26, f28, f2
  476. FMADD f6, f26, f29, f6
  477. FMADD f10, f26, f30, f10
  478. FMADD f14, f26, f31, f14
  479. LFD f18, 34 * SIZE(AO)
  480. LFD f22, 34 * SIZE(BO)
  481. LFD f19, 35 * SIZE(AO)
  482. LFD f23, 35 * SIZE(BO)
  483. addi AO, AO, 32 * SIZE
  484. addi BO, BO, 32 * SIZE
  485. FMADD f3, f27, f28, f3
  486. FMADD f7, f27, f29, f7
  487. FMADD f11, f27, f30, f11
  488. FMADD f15, f27, f31, f15
  489. bdnz LL(12)
  490. .align 4
  491. LL(15):
  492. #if defined(LT) || defined(RN)
  493. andi. r0, KK, 7
  494. #else
  495. andi. r0, TEMP, 7
  496. #endif
  497. mtspr CTR, r0
  498. ble+ LL(18)
  499. .align 4
  500. LL(16):
  501. FMADD f0, f16, f20, f0
  502. FMADD f4, f16, f21, f4
  503. FMADD f8, f16, f22, f8
  504. FMADD f12, f16, f23, f12
  505. FMADD f1, f17, f20, f1
  506. FMADD f5, f17, f21, f5
  507. FMADD f9, f17, f22, f9
  508. FMADD f13, f17, f23, f13
  509. FMADD f2, f18, f20, f2
  510. FMADD f6, f18, f21, f6
  511. FMADD f10, f18, f22, f10
  512. FMADD f14, f18, f23, f14
  513. FMADD f3, f19, f20, f3
  514. FMADD f7, f19, f21, f7
  515. FMADD f11, f19, f22, f11
  516. FMADD f15, f19, f23, f15
  517. LFD f16, 4 * SIZE(AO)
  518. LFD f17, 5 * SIZE(AO)
  519. LFD f18, 6 * SIZE(AO)
  520. LFD f19, 7 * SIZE(AO)
  521. LFD f20, 4 * SIZE(BO)
  522. LFD f21, 5 * SIZE(BO)
  523. LFD f22, 6 * SIZE(BO)
  524. LFD f23, 7 * SIZE(BO)
  525. addi BO, BO, 4 * SIZE
  526. addi AO, AO, 4 * SIZE
  527. bdnz LL(16)
  528. .align 4
  529. LL(18):
  530. #if defined(LN) || defined(RT)
  531. subi r0, KK, 4
  532. slwi r0, r0, 2 + BASE_SHIFT
  533. add AO, AORIG, r0
  534. add BO, B, r0
  535. #endif
  536. #if defined(LN) || defined(LT)
  537. LFD f16, 0 * SIZE(BO)
  538. LFD f17, 1 * SIZE(BO)
  539. LFD f18, 2 * SIZE(BO)
  540. LFD f19, 3 * SIZE(BO)
  541. LFD f20, 4 * SIZE(BO)
  542. LFD f21, 5 * SIZE(BO)
  543. LFD f22, 6 * SIZE(BO)
  544. LFD f23, 7 * SIZE(BO)
  545. LFD f24, 8 * SIZE(BO)
  546. LFD f25, 9 * SIZE(BO)
  547. LFD f26, 10 * SIZE(BO)
  548. LFD f27, 11 * SIZE(BO)
  549. LFD f28, 12 * SIZE(BO)
  550. LFD f29, 13 * SIZE(BO)
  551. LFD f30, 14 * SIZE(BO)
  552. LFD f31, 15 * SIZE(BO)
  553. FSUB f0, f16, f0
  554. FSUB f4, f17, f4
  555. FSUB f8, f18, f8
  556. FSUB f12, f19, f12
  557. FSUB f1, f20, f1
  558. FSUB f5, f21, f5
  559. FSUB f9, f22, f9
  560. FSUB f13, f23, f13
  561. FSUB f2, f24, f2
  562. FSUB f6, f25, f6
  563. FSUB f10, f26, f10
  564. FSUB f14, f27, f14
  565. FSUB f3, f28, f3
  566. FSUB f7, f29, f7
  567. FSUB f11, f30, f11
  568. FSUB f15, f31, f15
  569. #else
  570. LFD f16, 0 * SIZE(AO)
  571. LFD f17, 1 * SIZE(AO)
  572. LFD f18, 2 * SIZE(AO)
  573. LFD f19, 3 * SIZE(AO)
  574. LFD f20, 4 * SIZE(AO)
  575. LFD f21, 5 * SIZE(AO)
  576. LFD f22, 6 * SIZE(AO)
  577. LFD f23, 7 * SIZE(AO)
  578. LFD f24, 8 * SIZE(AO)
  579. LFD f25, 9 * SIZE(AO)
  580. LFD f26, 10 * SIZE(AO)
  581. LFD f27, 11 * SIZE(AO)
  582. LFD f28, 12 * SIZE(AO)
  583. LFD f29, 13 * SIZE(AO)
  584. LFD f30, 14 * SIZE(AO)
  585. LFD f31, 15 * SIZE(AO)
  586. FSUB f0, f16, f0
  587. FSUB f1, f17, f1
  588. FSUB f2, f18, f2
  589. FSUB f3, f19, f3
  590. FSUB f4, f20, f4
  591. FSUB f5, f21, f5
  592. FSUB f6, f22, f6
  593. FSUB f7, f23, f7
  594. FSUB f8, f24, f8
  595. FSUB f9, f25, f9
  596. FSUB f10, f26, f10
  597. FSUB f11, f27, f11
  598. FSUB f12, f28, f12
  599. FSUB f13, f29, f13
  600. FSUB f14, f30, f14
  601. FSUB f15, f31, f15
  602. #endif
  603. #ifdef LN
  604. LFD f16, 15 * SIZE(AO)
  605. LFD f17, 14 * SIZE(AO)
  606. LFD f18, 13 * SIZE(AO)
  607. LFD f19, 12 * SIZE(AO)
  608. FMUL f3, f16, f3
  609. FMUL f7, f16, f7
  610. FMUL f11, f16, f11
  611. FMUL f15, f16, f15
  612. FNMSUB f2, f17, f3, f2
  613. FNMSUB f6, f17, f7, f6
  614. FNMSUB f10, f17, f11, f10
  615. FNMSUB f14, f17, f15, f14
  616. FNMSUB f1, f18, f3, f1
  617. FNMSUB f5, f18, f7, f5
  618. FNMSUB f9, f18, f11, f9
  619. FNMSUB f13, f18, f15, f13
  620. FNMSUB f0, f19, f3, f0
  621. FNMSUB f4, f19, f7, f4
  622. FNMSUB f8, f19, f11, f8
  623. FNMSUB f12, f19, f15, f12
  624. LFD f16, 10 * SIZE(AO)
  625. LFD f17, 9 * SIZE(AO)
  626. LFD f18, 8 * SIZE(AO)
  627. LFD f19, 5 * SIZE(AO)
  628. FMUL f2, f16, f2
  629. FMUL f6, f16, f6
  630. FMUL f10, f16, f10
  631. FMUL f14, f16, f14
  632. LFD f20, 4 * SIZE(AO)
  633. LFD f21, 0 * SIZE(AO)
  634. FNMSUB f1, f17, f2, f1
  635. FNMSUB f5, f17, f6, f5
  636. FNMSUB f9, f17, f10, f9
  637. FNMSUB f13, f17, f14, f13
  638. FNMSUB f0, f18, f2, f0
  639. FNMSUB f4, f18, f6, f4
  640. FNMSUB f8, f18, f10, f8
  641. FNMSUB f12, f18, f14, f12
  642. FMUL f1, f19, f1
  643. FMUL f5, f19, f5
  644. FMUL f9, f19, f9
  645. FMUL f13, f19, f13
  646. FNMSUB f0, f20, f1, f0
  647. FNMSUB f4, f20, f5, f4
  648. FNMSUB f8, f20, f9, f8
  649. FNMSUB f12, f20, f13, f12
  650. FMUL f0, f21, f0
  651. FMUL f4, f21, f4
  652. FMUL f8, f21, f8
  653. FMUL f12, f21, f12
  654. #endif
  655. #ifdef LT
  656. LFD f16, 0 * SIZE(AO)
  657. LFD f17, 1 * SIZE(AO)
  658. LFD f18, 2 * SIZE(AO)
  659. LFD f19, 3 * SIZE(AO)
  660. FMUL f0, f16, f0
  661. FMUL f4, f16, f4
  662. FMUL f8, f16, f8
  663. FMUL f12, f16, f12
  664. FNMSUB f1, f17, f0, f1
  665. FNMSUB f5, f17, f4, f5
  666. FNMSUB f9, f17, f8, f9
  667. FNMSUB f13, f17, f12, f13
  668. FNMSUB f2, f18, f0, f2
  669. FNMSUB f6, f18, f4, f6
  670. FNMSUB f10, f18, f8, f10
  671. FNMSUB f14, f18, f12, f14
  672. FNMSUB f3, f19, f0, f3
  673. FNMSUB f7, f19, f4, f7
  674. FNMSUB f11, f19, f8, f11
  675. FNMSUB f15, f19, f12, f15
  676. LFD f16, 5 * SIZE(AO)
  677. LFD f17, 6 * SIZE(AO)
  678. LFD f18, 7 * SIZE(AO)
  679. LFD f19, 10 * SIZE(AO)
  680. FMUL f1, f16, f1
  681. FMUL f5, f16, f5
  682. FMUL f9, f16, f9
  683. FMUL f13, f16, f13
  684. LFD f20, 11 * SIZE(AO)
  685. LFD f21, 15 * SIZE(AO)
  686. FNMSUB f2, f17, f1, f2
  687. FNMSUB f6, f17, f5, f6
  688. FNMSUB f10, f17, f9, f10
  689. FNMSUB f14, f17, f13, f14
  690. FNMSUB f3, f18, f1, f3
  691. FNMSUB f7, f18, f5, f7
  692. FNMSUB f11, f18, f9, f11
  693. FNMSUB f15, f18, f13, f15
  694. FMUL f2, f19, f2
  695. FMUL f6, f19, f6
  696. FMUL f10, f19, f10
  697. FMUL f14, f19, f14
  698. FNMSUB f3, f20, f2, f3
  699. FNMSUB f7, f20, f6, f7
  700. FNMSUB f11, f20, f10, f11
  701. FNMSUB f15, f20, f14, f15
  702. FMUL f3, f21, f3
  703. FMUL f7, f21, f7
  704. FMUL f11, f21, f11
  705. FMUL f15, f21, f15
  706. #endif
  707. #ifdef RN
  708. LFD f16, 0 * SIZE(BO)
  709. LFD f17, 1 * SIZE(BO)
  710. LFD f18, 2 * SIZE(BO)
  711. LFD f19, 3 * SIZE(BO)
  712. FMUL f0, f16, f0
  713. FMUL f1, f16, f1
  714. FMUL f2, f16, f2
  715. FMUL f3, f16, f3
  716. FNMSUB f4, f17, f0, f4
  717. FNMSUB f5, f17, f1, f5
  718. FNMSUB f6, f17, f2, f6
  719. FNMSUB f7, f17, f3, f7
  720. FNMSUB f8, f18, f0, f8
  721. FNMSUB f9, f18, f1, f9
  722. FNMSUB f10, f18, f2, f10
  723. FNMSUB f11, f18, f3, f11
  724. FNMSUB f12, f19, f0, f12
  725. FNMSUB f13, f19, f1, f13
  726. FNMSUB f14, f19, f2, f14
  727. FNMSUB f15, f19, f3, f15
  728. LFD f16, 5 * SIZE(BO)
  729. LFD f17, 6 * SIZE(BO)
  730. LFD f18, 7 * SIZE(BO)
  731. LFD f19, 10 * SIZE(BO)
  732. FMUL f4, f16, f4
  733. FMUL f5, f16, f5
  734. FMUL f6, f16, f6
  735. FMUL f7, f16, f7
  736. LFD f20, 11 * SIZE(BO)
  737. LFD f21, 15 * SIZE(BO)
  738. FNMSUB f8, f17, f4, f8
  739. FNMSUB f9, f17, f5, f9
  740. FNMSUB f10, f17, f6, f10
  741. FNMSUB f11, f17, f7, f11
  742. FNMSUB f12, f18, f4, f12
  743. FNMSUB f13, f18, f5, f13
  744. FNMSUB f14, f18, f6, f14
  745. FNMSUB f15, f18, f7, f15
  746. FMUL f8, f19, f8
  747. FMUL f9, f19, f9
  748. FMUL f10, f19, f10
  749. FMUL f11, f19, f11
  750. FNMSUB f12, f20, f8, f12
  751. FNMSUB f13, f20, f9, f13
  752. FNMSUB f14, f20, f10, f14
  753. FNMSUB f15, f20, f11, f15
  754. FMUL f12, f21, f12
  755. FMUL f13, f21, f13
  756. FMUL f14, f21, f14
  757. FMUL f15, f21, f15
  758. #endif
  759. #ifdef RT
  760. LFD f16, 15 * SIZE(BO)
  761. LFD f17, 14 * SIZE(BO)
  762. LFD f18, 13 * SIZE(BO)
  763. LFD f19, 12 * SIZE(BO)
  764. FMUL f12, f16, f12
  765. FMUL f13, f16, f13
  766. FMUL f14, f16, f14
  767. FMUL f15, f16, f15
  768. FNMSUB f8, f17, f12, f8
  769. FNMSUB f9, f17, f13, f9
  770. FNMSUB f10, f17, f14, f10
  771. FNMSUB f11, f17, f15, f11
  772. FNMSUB f4, f18, f12, f4
  773. FNMSUB f5, f18, f13, f5
  774. FNMSUB f6, f18, f14, f6
  775. FNMSUB f7, f18, f15, f7
  776. FNMSUB f0, f19, f12, f0
  777. FNMSUB f1, f19, f13, f1
  778. FNMSUB f2, f19, f14, f2
  779. FNMSUB f3, f19, f15, f3
  780. LFD f16, 10 * SIZE(BO)
  781. LFD f17, 9 * SIZE(BO)
  782. LFD f18, 8 * SIZE(BO)
  783. LFD f19, 5 * SIZE(BO)
  784. FMUL f8, f16, f8
  785. FMUL f9, f16, f9
  786. FMUL f10, f16, f10
  787. FMUL f11, f16, f11
  788. LFD f20, 4 * SIZE(BO)
  789. LFD f21, 0 * SIZE(BO)
  790. FNMSUB f4, f17, f8, f4
  791. FNMSUB f5, f17, f9, f5
  792. FNMSUB f6, f17, f10, f6
  793. FNMSUB f7, f17, f11, f7
  794. FNMSUB f0, f18, f8, f0
  795. FNMSUB f1, f18, f9, f1
  796. FNMSUB f2, f18, f10, f2
  797. FNMSUB f3, f18, f11, f3
  798. FMUL f4, f19, f4
  799. FMUL f5, f19, f5
  800. FMUL f6, f19, f6
  801. FMUL f7, f19, f7
  802. FNMSUB f0, f20, f4, f0
  803. FNMSUB f1, f20, f5, f1
  804. FNMSUB f2, f20, f6, f2
  805. FNMSUB f3, f20, f7, f3
  806. FMUL f0, f21, f0
  807. FMUL f1, f21, f1
  808. FMUL f2, f21, f2
  809. FMUL f3, f21, f3
  810. #endif
  811. #ifdef LN
  812. subi CO1, CO1, 4 * SIZE
  813. subi CO2, CO2, 4 * SIZE
  814. subi CO3, CO3, 4 * SIZE
  815. subi CO4, CO4, 4 * SIZE
  816. #endif
  817. #if defined(LN) || defined(LT)
  818. STFD f0, 0 * SIZE(BO)
  819. STFD f4, 1 * SIZE(BO)
  820. STFD f8, 2 * SIZE(BO)
  821. STFD f12, 3 * SIZE(BO)
  822. STFD f1, 4 * SIZE(BO)
  823. STFD f5, 5 * SIZE(BO)
  824. STFD f9, 6 * SIZE(BO)
  825. STFD f13, 7 * SIZE(BO)
  826. STFD f2, 8 * SIZE(BO)
  827. STFD f6, 9 * SIZE(BO)
  828. STFD f10, 10 * SIZE(BO)
  829. STFD f14, 11 * SIZE(BO)
  830. STFD f3, 12 * SIZE(BO)
  831. STFD f7, 13 * SIZE(BO)
  832. STFD f11, 14 * SIZE(BO)
  833. STFD f15, 15 * SIZE(BO)
  834. #else
  835. STFD f0, 0 * SIZE(AO)
  836. STFD f1, 1 * SIZE(AO)
  837. STFD f2, 2 * SIZE(AO)
  838. STFD f3, 3 * SIZE(AO)
  839. STFD f4, 4 * SIZE(AO)
  840. STFD f5, 5 * SIZE(AO)
  841. STFD f6, 6 * SIZE(AO)
  842. STFD f7, 7 * SIZE(AO)
  843. STFD f8, 8 * SIZE(AO)
  844. STFD f9, 9 * SIZE(AO)
  845. STFD f10, 10 * SIZE(AO)
  846. STFD f11, 11 * SIZE(AO)
  847. STFD f12, 12 * SIZE(AO)
  848. STFD f13, 13 * SIZE(AO)
  849. STFD f14, 14 * SIZE(AO)
  850. STFD f15, 15 * SIZE(AO)
  851. #endif
  852. STFD f0, 0 * SIZE(CO1)
  853. STFD f1, 1 * SIZE(CO1)
  854. STFD f2, 2 * SIZE(CO1)
  855. STFD f3, 3 * SIZE(CO1)
  856. STFD f4, 0 * SIZE(CO2)
  857. STFD f5, 1 * SIZE(CO2)
  858. STFD f6, 2 * SIZE(CO2)
  859. STFD f7, 3 * SIZE(CO2)
  860. STFD f8, 0 * SIZE(CO3)
  861. STFD f9, 1 * SIZE(CO3)
  862. STFD f10, 2 * SIZE(CO3)
  863. STFD f11, 3 * SIZE(CO3)
  864. STFD f12, 0 * SIZE(CO4)
  865. STFD f13, 1 * SIZE(CO4)
  866. STFD f14, 2 * SIZE(CO4)
  867. STFD f15, 3 * SIZE(CO4)
  868. lfs f0, FZERO
  869. fmr f1, f0
  870. fmr f2, f0
  871. fmr f3, f0
  872. fmr f4, f0
  873. fmr f5, f0
  874. fmr f6, f0
  875. fmr f7, f0
  876. fmr f8, f0
  877. fmr f9, f0
  878. fmr f10, f0
  879. fmr f11, f0
  880. fmr f12, f0
  881. fmr f13, f0
  882. fmr f14, f0
  883. fmr f15, f0
  884. #ifndef LN
  885. addi CO1, CO1, 4 * SIZE
  886. addi CO2, CO2, 4 * SIZE
  887. addi CO3, CO3, 4 * SIZE
  888. addi CO4, CO4, 4 * SIZE
  889. #endif
  890. #ifdef RT
  891. slwi r0, K, 2 + BASE_SHIFT
  892. add AORIG, AORIG, r0
  893. #endif
  894. #if defined(LT) || defined(RN)
  895. sub TEMP, K, KK
  896. slwi TEMP, TEMP, 2 + BASE_SHIFT
  897. add AO, AO, TEMP
  898. add BO, BO, TEMP
  899. #endif
  900. #ifdef LT
  901. addi KK, KK, 4
  902. #endif
  903. #ifdef LN
  904. subi KK, KK, 4
  905. #endif
  906. addic. I, I, -1
  907. bgt+ LL(11)
  908. .align 4
  909. LL(20):
  910. andi. I, M, 2
  911. ble LL(30)
  912. #if defined(LT) || defined(RN)
  913. LFD f16, 0 * SIZE(AO)
  914. LFD f17, 1 * SIZE(AO)
  915. LFD f18, 2 * SIZE(AO)
  916. LFD f19, 3 * SIZE(AO)
  917. LFD f20, 0 * SIZE(B)
  918. LFD f21, 1 * SIZE(B)
  919. LFD f22, 2 * SIZE(B)
  920. LFD f23, 3 * SIZE(B)
  921. LFD f24, 4 * SIZE(B)
  922. LFD f25, 5 * SIZE(B)
  923. LFD f26, 6 * SIZE(B)
  924. LFD f27, 7 * SIZE(B)
  925. srawi. r0, KK, 2
  926. mtspr CTR, r0
  927. mr BO, B
  928. #else
  929. #ifdef LN
  930. slwi r0, K, 1 + BASE_SHIFT
  931. sub AORIG, AORIG, r0
  932. #endif
  933. slwi r0, KK, 1 + BASE_SHIFT
  934. slwi TEMP, KK, 2 + BASE_SHIFT
  935. add AO, AORIG, r0
  936. add BO, B, TEMP
  937. sub TEMP, K, KK
  938. LFD f16, 0 * SIZE(AO)
  939. LFD f17, 1 * SIZE(AO)
  940. LFD f18, 2 * SIZE(AO)
  941. LFD f19, 3 * SIZE(AO)
  942. LFD f20, 0 * SIZE(BO)
  943. LFD f21, 1 * SIZE(BO)
  944. LFD f22, 2 * SIZE(BO)
  945. LFD f23, 3 * SIZE(BO)
  946. LFD f24, 4 * SIZE(BO)
  947. LFD f25, 5 * SIZE(BO)
  948. LFD f26, 6 * SIZE(BO)
  949. LFD f27, 7 * SIZE(BO)
  950. srawi. r0, TEMP, 2
  951. mtspr CTR, r0
  952. #endif
  953. ble LL(25)
  954. .align 5
  955. LL(22):
  956. FMADD f0, f16, f20, f0
  957. FMADD f1, f17, f20, f1
  958. FMADD f4, f16, f21, f4
  959. FMADD f5, f17, f21, f5
  960. FMADD f8, f16, f22, f8
  961. FMADD f9, f17, f22, f9
  962. FMADD f12, f16, f23, f12
  963. FMADD f13, f17, f23, f13
  964. LFD f20, 8 * SIZE(BO)
  965. LFD f21, 9 * SIZE(BO)
  966. LFD f22, 10 * SIZE(BO)
  967. LFD f23, 11 * SIZE(BO)
  968. FMADD f2, f18, f24, f2
  969. FMADD f3, f19, f24, f3
  970. FMADD f6, f18, f25, f6
  971. FMADD f7, f19, f25, f7
  972. FMADD f10, f18, f26, f10
  973. FMADD f11, f19, f26, f11
  974. FMADD f14, f18, f27, f14
  975. FMADD f15, f19, f27, f15
  976. LFD f16, 4 * SIZE(AO)
  977. LFD f17, 5 * SIZE(AO)
  978. LFD f18, 6 * SIZE(AO)
  979. LFD f19, 7 * SIZE(AO)
  980. FMADD f0, f16, f20, f0
  981. FMADD f1, f17, f20, f1
  982. FMADD f4, f16, f21, f4
  983. FMADD f5, f17, f21, f5
  984. LFD f24, 12 * SIZE(BO)
  985. LFD f25, 13 * SIZE(BO)
  986. LFD f26, 14 * SIZE(BO)
  987. LFD f27, 15 * SIZE(BO)
  988. FMADD f8, f16, f22, f8
  989. FMADD f9, f17, f22, f9
  990. FMADD f12, f16, f23, f12
  991. FMADD f13, f17, f23, f13
  992. LFD f20, 16 * SIZE(BO)
  993. LFD f21, 17 * SIZE(BO)
  994. LFD f22, 18 * SIZE(BO)
  995. LFD f23, 19 * SIZE(BO)
  996. FMADD f2, f18, f24, f2
  997. FMADD f3, f19, f24, f3
  998. FMADD f6, f18, f25, f6
  999. FMADD f7, f19, f25, f7
  1000. FMADD f10, f18, f26, f10
  1001. FMADD f11, f19, f26, f11
  1002. FMADD f14, f18, f27, f14
  1003. FMADD f15, f19, f27, f15
  1004. LFD f16, 8 * SIZE(AO)
  1005. LFD f17, 9 * SIZE(AO)
  1006. LFD f18, 10 * SIZE(AO)
  1007. LFD f19, 11 * SIZE(AO)
  1008. LFD f24, 20 * SIZE(BO)
  1009. LFD f25, 21 * SIZE(BO)
  1010. LFD f26, 22 * SIZE(BO)
  1011. LFD f27, 23 * SIZE(BO)
  1012. addi AO, AO, 8 * SIZE
  1013. addi BO, BO, 16 * SIZE
  1014. dcbtst AO, PREA
  1015. bdnz LL(22)
  1016. fadd f0, f2, f0
  1017. fadd f1, f3, f1
  1018. fadd f4, f6, f4
  1019. fadd f5, f7, f5
  1020. fadd f8, f10, f8
  1021. fadd f9, f11, f9
  1022. fadd f12, f14, f12
  1023. fadd f13, f15, f13
  1024. .align 4
  1025. LL(25):
  1026. #if defined(LT) || defined(RN)
  1027. andi. r0, KK, 3
  1028. #else
  1029. andi. r0, TEMP, 3
  1030. #endif
  1031. mtspr CTR, r0
  1032. ble+ LL(28)
  1033. .align 4
  1034. LL(26):
  1035. FMADD f0, f16, f20, f0
  1036. FMADD f1, f17, f20, f1
  1037. FMADD f4, f16, f21, f4
  1038. FMADD f5, f17, f21, f5
  1039. FMADD f8, f16, f22, f8
  1040. FMADD f9, f17, f22, f9
  1041. FMADD f12, f16, f23, f12
  1042. FMADD f13, f17, f23, f13
  1043. LFD f16, 2 * SIZE(AO)
  1044. LFD f17, 3 * SIZE(AO)
  1045. LFD f20, 4 * SIZE(BO)
  1046. LFD f21, 5 * SIZE(BO)
  1047. LFD f22, 6 * SIZE(BO)
  1048. LFD f23, 7 * SIZE(BO)
  1049. addi BO, BO, 4 * SIZE
  1050. addi AO, AO, 2 * SIZE
  1051. bdnz LL(26)
  1052. .align 4
  1053. LL(28):
  1054. #if defined(LN) || defined(RT)
  1055. #ifdef LN
  1056. subi r0, KK, 2
  1057. #else
  1058. subi r0, KK, 4
  1059. #endif
  1060. slwi TEMP, r0, 1 + BASE_SHIFT
  1061. slwi r0, r0, 2 + BASE_SHIFT
  1062. add AO, AORIG, TEMP
  1063. add BO, B, r0
  1064. #endif
  1065. #if defined(LN) || defined(LT)
  1066. LFD f16, 0 * SIZE(BO)
  1067. LFD f17, 1 * SIZE(BO)
  1068. LFD f18, 2 * SIZE(BO)
  1069. LFD f19, 3 * SIZE(BO)
  1070. LFD f20, 4 * SIZE(BO)
  1071. LFD f21, 5 * SIZE(BO)
  1072. LFD f22, 6 * SIZE(BO)
  1073. LFD f23, 7 * SIZE(BO)
  1074. FSUB f0, f16, f0
  1075. FSUB f4, f17, f4
  1076. FSUB f8, f18, f8
  1077. FSUB f12, f19, f12
  1078. FSUB f1, f20, f1
  1079. FSUB f5, f21, f5
  1080. FSUB f9, f22, f9
  1081. FSUB f13, f23, f13
  1082. #else
  1083. LFD f16, 0 * SIZE(AO)
  1084. LFD f17, 1 * SIZE(AO)
  1085. LFD f20, 2 * SIZE(AO)
  1086. LFD f21, 3 * SIZE(AO)
  1087. LFD f24, 4 * SIZE(AO)
  1088. LFD f25, 5 * SIZE(AO)
  1089. LFD f28, 6 * SIZE(AO)
  1090. LFD f29, 7 * SIZE(AO)
  1091. FSUB f0, f16, f0
  1092. FSUB f1, f17, f1
  1093. FSUB f4, f20, f4
  1094. FSUB f5, f21, f5
  1095. FSUB f8, f24, f8
  1096. FSUB f9, f25, f9
  1097. FSUB f12, f28, f12
  1098. FSUB f13, f29, f13
  1099. #endif
  1100. #ifdef LN
  1101. LFD f19, 3 * SIZE(AO)
  1102. LFD f20, 2 * SIZE(AO)
  1103. LFD f21, 0 * SIZE(AO)
  1104. FMUL f1, f19, f1
  1105. FMUL f5, f19, f5
  1106. FMUL f9, f19, f9
  1107. FMUL f13, f19, f13
  1108. FNMSUB f0, f20, f1, f0
  1109. FNMSUB f4, f20, f5, f4
  1110. FNMSUB f8, f20, f9, f8
  1111. FNMSUB f12, f20, f13, f12
  1112. FMUL f0, f21, f0
  1113. FMUL f4, f21, f4
  1114. FMUL f8, f21, f8
  1115. FMUL f12, f21, f12
  1116. #endif
  1117. #ifdef LT
  1118. LFD f16, 0 * SIZE(AO)
  1119. LFD f17, 1 * SIZE(AO)
  1120. FMUL f0, f16, f0
  1121. FMUL f4, f16, f4
  1122. FMUL f8, f16, f8
  1123. FMUL f12, f16, f12
  1124. FNMSUB f1, f17, f0, f1
  1125. FNMSUB f5, f17, f4, f5
  1126. FNMSUB f9, f17, f8, f9
  1127. FNMSUB f13, f17, f12, f13
  1128. LFD f17, 3 * SIZE(AO)
  1129. FMUL f1, f17, f1
  1130. FMUL f5, f17, f5
  1131. FMUL f9, f17, f9
  1132. FMUL f13, f17, f13
  1133. #endif
  1134. #ifdef RN
  1135. LFD f16, 0 * SIZE(BO)
  1136. LFD f17, 1 * SIZE(BO)
  1137. LFD f18, 2 * SIZE(BO)
  1138. LFD f19, 3 * SIZE(BO)
  1139. FMUL f0, f16, f0
  1140. FMUL f1, f16, f1
  1141. FNMSUB f4, f17, f0, f4
  1142. FNMSUB f5, f17, f1, f5
  1143. FNMSUB f8, f18, f0, f8
  1144. FNMSUB f9, f18, f1, f9
  1145. FNMSUB f12, f19, f0, f12
  1146. FNMSUB f13, f19, f1, f13
  1147. LFD f16, 5 * SIZE(BO)
  1148. LFD f17, 6 * SIZE(BO)
  1149. LFD f18, 7 * SIZE(BO)
  1150. LFD f19, 10 * SIZE(BO)
  1151. LFD f20, 11 * SIZE(BO)
  1152. LFD f21, 15 * SIZE(BO)
  1153. FMUL f4, f16, f4
  1154. FMUL f5, f16, f5
  1155. FNMSUB f8, f17, f4, f8
  1156. FNMSUB f9, f17, f5, f9
  1157. FNMSUB f12, f18, f4, f12
  1158. FNMSUB f13, f18, f5, f13
  1159. FMUL f8, f19, f8
  1160. FMUL f9, f19, f9
  1161. FNMSUB f12, f20, f8, f12
  1162. FNMSUB f13, f20, f9, f13
  1163. FMUL f12, f21, f12
  1164. FMUL f13, f21, f13
  1165. #endif
  1166. #ifdef RT
  1167. LFD f16, 15 * SIZE(BO)
  1168. LFD f17, 14 * SIZE(BO)
  1169. LFD f18, 13 * SIZE(BO)
  1170. LFD f19, 12 * SIZE(BO)
  1171. FMUL f12, f16, f12
  1172. FMUL f13, f16, f13
  1173. FNMSUB f8, f17, f12, f8
  1174. FNMSUB f9, f17, f13, f9
  1175. FNMSUB f4, f18, f12, f4
  1176. FNMSUB f5, f18, f13, f5
  1177. FNMSUB f0, f19, f12, f0
  1178. FNMSUB f1, f19, f13, f1
  1179. LFD f16, 10 * SIZE(BO)
  1180. LFD f17, 9 * SIZE(BO)
  1181. LFD f18, 8 * SIZE(BO)
  1182. LFD f19, 5 * SIZE(BO)
  1183. LFD f20, 4 * SIZE(BO)
  1184. LFD f21, 0 * SIZE(BO)
  1185. FMUL f8, f16, f8
  1186. FMUL f9, f16, f9
  1187. FNMSUB f4, f17, f8, f4
  1188. FNMSUB f5, f17, f9, f5
  1189. FNMSUB f0, f18, f8, f0
  1190. FNMSUB f1, f18, f9, f1
  1191. FMUL f4, f19, f4
  1192. FMUL f5, f19, f5
  1193. FNMSUB f0, f20, f4, f0
  1194. FNMSUB f1, f20, f5, f1
  1195. FMUL f0, f21, f0
  1196. FMUL f1, f21, f1
  1197. #endif
  1198. #ifdef LN
  1199. subi CO1, CO1, 2 * SIZE
  1200. subi CO2, CO2, 2 * SIZE
  1201. subi CO3, CO3, 2 * SIZE
  1202. subi CO4, CO4, 2 * SIZE
  1203. #endif
  1204. #if defined(LN) || defined(LT)
  1205. STFD f0, 0 * SIZE(BO)
  1206. STFD f4, 1 * SIZE(BO)
  1207. STFD f8, 2 * SIZE(BO)
  1208. STFD f12, 3 * SIZE(BO)
  1209. STFD f1, 4 * SIZE(BO)
  1210. STFD f5, 5 * SIZE(BO)
  1211. STFD f9, 6 * SIZE(BO)
  1212. STFD f13, 7 * SIZE(BO)
  1213. #else
  1214. STFD f0, 0 * SIZE(AO)
  1215. STFD f1, 1 * SIZE(AO)
  1216. STFD f4, 2 * SIZE(AO)
  1217. STFD f5, 3 * SIZE(AO)
  1218. STFD f8, 4 * SIZE(AO)
  1219. STFD f9, 5 * SIZE(AO)
  1220. STFD f12, 6 * SIZE(AO)
  1221. STFD f13, 7 * SIZE(AO)
  1222. #endif
  1223. STFD f0, 0 * SIZE(CO1)
  1224. STFD f1, 1 * SIZE(CO1)
  1225. STFD f4, 0 * SIZE(CO2)
  1226. STFD f5, 1 * SIZE(CO2)
  1227. STFD f8, 0 * SIZE(CO3)
  1228. STFD f9, 1 * SIZE(CO3)
  1229. STFD f12, 0 * SIZE(CO4)
  1230. STFD f13, 1 * SIZE(CO4)
  1231. lfs f0, FZERO
  1232. fmr f1, f0
  1233. fmr f2, f0
  1234. fmr f3, f0
  1235. fmr f4, f0
  1236. fmr f5, f0
  1237. fmr f6, f0
  1238. fmr f7, f0
  1239. fmr f8, f0
  1240. fmr f9, f0
  1241. fmr f10, f0
  1242. fmr f11, f0
  1243. fmr f12, f0
  1244. fmr f13, f0
  1245. fmr f14, f0
  1246. fmr f15, f0
  1247. #ifndef LN
  1248. addi CO1, CO1, 2 * SIZE
  1249. addi CO2, CO2, 2 * SIZE
  1250. addi CO3, CO3, 2 * SIZE
  1251. addi CO4, CO4, 2 * SIZE
  1252. #endif
  1253. #ifdef RT
  1254. slwi r0, K, 1 + BASE_SHIFT
  1255. add AORIG, AORIG, r0
  1256. #endif
  1257. #if defined(LT) || defined(RN)
  1258. sub TEMP, K, KK
  1259. slwi r0, TEMP, 1 + BASE_SHIFT
  1260. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1261. add AO, AO, r0
  1262. add BO, BO, TEMP
  1263. #endif
  1264. #ifdef LN
  1265. subi KK, KK, 2
  1266. #endif
  1267. #ifdef LT
  1268. addi KK, KK, 2
  1269. #endif
  /* LL(30): four-column panel (CO1..CO4), single leftover row.
   * Entered when bit 0 of M is set; otherwise control skips to LL(39).
   * Preloads A values into f16-f19 and B values into f20-f27, and puts
   * the number of 4-step unrolled iterations for LL(32) into CTR. */
  1270. .align 4
  1271. LL(30):
  1272. andi. I, M, 1
  1273. ble LL(39)
  1274. #if defined(LT) || defined(RN)
  /* LT/RN: operands start at the current AO and at the top of B;
   * the multiply runs over KK steps. */
  1275. LFD f16, 0 * SIZE(AO)
  1276. LFD f17, 1 * SIZE(AO)
  1277. LFD f18, 2 * SIZE(AO)
  1278. LFD f19, 3 * SIZE(AO)
  1279. LFD f20, 0 * SIZE(B)
  1280. LFD f21, 1 * SIZE(B)
  1281. LFD f22, 2 * SIZE(B)
  1282. LFD f23, 3 * SIZE(B)
  1283. LFD f24, 4 * SIZE(B)
  1284. LFD f25, 5 * SIZE(B)
  1285. LFD f26, 6 * SIZE(B)
  1286. LFD f27, 7 * SIZE(B)
  /* srawi. records the sign/zero of KK >> 2 in cr0 for the ble below. */
  1287. srawi. r0, KK, 2
  1288. mtspr CTR, r0
  1289. mr BO, B
  1290. #else
  1291. #ifdef LN
  /* LN: move AORIG back by K << BASE_SHIFT bytes (K elements, assuming
   * BASE_SHIFT is log2 of the element size - defined outside this chunk). */
  1292. slwi r0, K, BASE_SHIFT
  1293. sub AORIG, AORIG, r0
  1294. #endif
  /* LN/RT: start KK steps into A (1 value per step) and B (4 values per
   * step); the multiply runs over the remaining K - KK steps. */
  1295. slwi r0, KK, 0 + BASE_SHIFT
  1296. slwi TEMP, KK, 2 + BASE_SHIFT
  1297. add AO, AORIG, r0
  1298. add BO, B, TEMP
  1299. sub TEMP, K, KK
  1300. LFD f16, 0 * SIZE(AO)
  1301. LFD f17, 1 * SIZE(AO)
  1302. LFD f18, 2 * SIZE(AO)
  1303. LFD f19, 3 * SIZE(AO)
  1304. LFD f20, 0 * SIZE(BO)
  1305. LFD f21, 1 * SIZE(BO)
  1306. LFD f22, 2 * SIZE(BO)
  1307. LFD f23, 3 * SIZE(BO)
  1308. LFD f24, 4 * SIZE(BO)
  1309. LFD f25, 5 * SIZE(BO)
  1310. LFD f26, 6 * SIZE(BO)
  1311. LFD f27, 7 * SIZE(BO)
  1312. srawi. r0, TEMP, 2
  1313. mtspr CTR, r0
  1314. #endif
  /* No full group of 4 steps: go straight to the remainder handling. */
  1315. ble LL(35)
  /* LL(32): multiply loop for the single-row, four-column case, four
   * k steps per iteration.  f16-f19 hold four consecutive A values;
   * each step uses four B values.  Steps 0 and 2 accumulate into
   * f0/f4/f8/f12, steps 1 and 3 into f1/f5/f9/f13; the two sets are
   * summed once the loop ends.  Loads are interleaved with the FMADDs
   * and also fetch the operands of the next iteration. */
  1316. .align 5
  1317. LL(32):
  /* step 0 */
  1318. FMADD f0, f16, f20, f0
  1319. FMADD f4, f16, f21, f4
  1320. FMADD f8, f16, f22, f8
  1321. FMADD f12, f16, f23, f12
  1322. LFD f20, 8 * SIZE(BO)
  1323. LFD f21, 9 * SIZE(BO)
  1324. LFD f22, 10 * SIZE(BO)
  1325. LFD f23, 11 * SIZE(BO)
  /* step 1 */
  1326. FMADD f1, f17, f24, f1
  1327. FMADD f5, f17, f25, f5
  1328. FMADD f9, f17, f26, f9
  1329. FMADD f13, f17, f27, f13
  1330. LFD f24, 12 * SIZE(BO)
  1331. LFD f25, 13 * SIZE(BO)
  1332. LFD f26, 14 * SIZE(BO)
  1333. LFD f27, 15 * SIZE(BO)
  /* step 2 */
  1334. FMADD f0, f18, f20, f0
  1335. FMADD f4, f18, f21, f4
  1336. FMADD f8, f18, f22, f8
  1337. FMADD f12, f18, f23, f12
  /* B values for step 0 of the next iteration */
  1338. LFD f20, 16 * SIZE(BO)
  1339. LFD f21, 17 * SIZE(BO)
  1340. LFD f22, 18 * SIZE(BO)
  1341. LFD f23, 19 * SIZE(BO)
  /* step 3 */
  1342. FMADD f1, f19, f24, f1
  1343. FMADD f5, f19, f25, f5
  1344. FMADD f9, f19, f26, f9
  1345. FMADD f13, f19, f27, f13
  /* A values and step-1 B values for the next iteration */
  1346. LFD f16, 4 * SIZE(AO)
  1347. LFD f17, 5 * SIZE(AO)
  1348. LFD f18, 6 * SIZE(AO)
  1349. LFD f19, 7 * SIZE(AO)
  1350. LFD f24, 20 * SIZE(BO)
  1351. LFD f25, 21 * SIZE(BO)
  1352. LFD f26, 22 * SIZE(BO)
  1353. LFD f27, 23 * SIZE(BO)
  /* Advance by 4 steps: 4 A values, 16 B values. */
  1354. addi AO, AO, 4 * SIZE
  1355. addi BO, BO, 16 * SIZE
  /* Cache-touch hint for the address AO + PREA. */
  1356. dcbtst AO, PREA
  1357. bdnz LL(32)
  /* Fold the odd-step partial sums into the even-step accumulators. */
  1358. fadd f0, f1, f0
  1359. fadd f4, f5, f4
  1360. fadd f8, f9, f8
  1361. fadd f12, f13, f12
  /* LL(35): remainder of the multiply for the single-row, four-column
   * case.  CTR = (step count) & 3, where the step count is KK for LT/RN
   * and TEMP (= K - KK) otherwise; skips to LL(38) when it is zero. */
  1362. .align 4
  1363. LL(35):
  1364. #if defined(LT) || defined(RN)
  1365. andi. r0, KK, 3
  1366. #else
  1367. andi. r0, TEMP, 3
  1368. #endif
  1369. mtspr CTR, r0
  1370. ble+ LL(38)
  1371. .align 4
  /* LL(36): one k step per iteration, accumulating into f0/f4/f8/f12.
   * f16 and f20-f23 already hold the operands of the current step. */
  1372. LL(36):
  1373. FMADD f0, f16, f20, f0
  1374. FMADD f4, f16, f21, f4
  1375. FMADD f8, f16, f22, f8
  1376. FMADD f12, f16, f23, f12
  /* Fetch the operands of the next step, then advance 1 A / 4 B values. */
  1377. LFD f16, 1 * SIZE(AO)
  1378. LFD f20, 4 * SIZE(BO)
  1379. LFD f21, 5 * SIZE(BO)
  1380. LFD f22, 6 * SIZE(BO)
  1381. LFD f23, 7 * SIZE(BO)
  1382. addi BO, BO, 4 * SIZE
  1383. addi AO, AO, 1 * SIZE
  1384. bdnz LL(36)
  /* LL(38): solve and store for the single-row, four-column case.
   * f0/f4/f8/f12 hold the accumulated products for columns 1..4.
   * Notes on macros defined outside this chunk (to be confirmed there):
   *   FSUB d, a, b      is read here as d = a - b
   *   FNMSUB d, a, c, b is read here as d = b - a * c
   * Diagonal entries are applied with FMUL, so the packed triangular
   * data presumably stores them already inverted - confirm against the
   * packing routine. */
  1385. .align 4
  1386. LL(38):
  1387. #if defined(LN) || defined(RT)
  /* Point AO/BO at the block being solved: index KK - 1 (LN) or KK - 4
   * (RT), scaled by 1 value per index for A and 4 per index for B. */
  1388. #ifdef LN
  1389. subi r0, KK, 1
  1390. #else
  1391. subi r0, KK, 4
  1392. #endif
  1393. slwi TEMP, r0, 0 + BASE_SHIFT
  1394. slwi r0, r0, 2 + BASE_SHIFT
  1395. add AO, AORIG, TEMP
  1396. add BO, B, r0
  1397. #endif
  1398. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO; subtract the accumulated sums. */
  1399. LFD f16, 0 * SIZE(BO)
  1400. LFD f17, 1 * SIZE(BO)
  1401. LFD f18, 2 * SIZE(BO)
  1402. LFD f19, 3 * SIZE(BO)
  1403. FSUB f0, f16, f0
  1404. FSUB f4, f17, f4
  1405. FSUB f8, f18, f8
  1406. FSUB f12, f19, f12
  1407. #else
  /* RN/RT: right-hand side comes from AO. */
  1408. LFD f16, 0 * SIZE(AO)
  1409. LFD f20, 1 * SIZE(AO)
  1410. LFD f24, 2 * SIZE(AO)
  1411. LFD f28, 3 * SIZE(AO)
  1412. FSUB f0, f16, f0
  1413. FSUB f4, f20, f4
  1414. FSUB f8, f24, f8
  1415. FSUB f12, f28, f12
  1416. #endif
  1417. #ifdef LN
  /* Single-entry triangular block: scale all four columns by A[0]. */
  1418. LFD f21, 0 * SIZE(AO)
  1419. FMUL f0, f21, f0
  1420. FMUL f4, f21, f4
  1421. FMUL f8, f21, f8
  1422. FMUL f12, f21, f12
  1423. #endif
  1424. #ifdef LT
  /* Single-entry triangular block: scale all four columns by A[0]. */
  1425. LFD f16, 0 * SIZE(AO)
  1426. FMUL f0, f16, f0
  1427. FMUL f4, f16, f4
  1428. FMUL f8, f16, f8
  1429. FMUL f12, f16, f12
  1430. #endif
  1431. #ifdef RN
  /* Forward substitution across the four columns, using the B entries
   * at offsets 0-3, 5-7, 10-11 and 15. */
  1432. LFD f16, 0 * SIZE(BO)
  1433. LFD f17, 1 * SIZE(BO)
  1434. LFD f18, 2 * SIZE(BO)
  1435. LFD f19, 3 * SIZE(BO)
  1436. FMUL f0, f16, f0
  1437. FNMSUB f4, f17, f0, f4
  1438. FNMSUB f8, f18, f0, f8
  1439. FNMSUB f12, f19, f0, f12
  1440. LFD f16, 5 * SIZE(BO)
  1441. LFD f17, 6 * SIZE(BO)
  1442. LFD f18, 7 * SIZE(BO)
  1443. LFD f19, 10 * SIZE(BO)
  1444. LFD f20, 11 * SIZE(BO)
  1445. LFD f21, 15 * SIZE(BO)
  1446. FMUL f4, f16, f4
  1447. FNMSUB f8, f17, f4, f8
  1448. FNMSUB f12, f18, f4, f12
  1449. FMUL f8, f19, f8
  1450. FNMSUB f12, f20, f8, f12
  1451. FMUL f12, f21, f12
  1452. #endif
  1453. #ifdef RT
  /* Backward substitution from the last column to the first, using the
   * B entries at offsets 15-12, 10-8, 5-4 and 0. */
  1454. LFD f16, 15 * SIZE(BO)
  1455. LFD f17, 14 * SIZE(BO)
  1456. LFD f18, 13 * SIZE(BO)
  1457. LFD f19, 12 * SIZE(BO)
  1458. FMUL f12, f16, f12
  1459. FNMSUB f8, f17, f12, f8
  1460. FNMSUB f4, f18, f12, f4
  1461. FNMSUB f0, f19, f12, f0
  1462. LFD f16, 10 * SIZE(BO)
  1463. LFD f17, 9 * SIZE(BO)
  1464. LFD f18, 8 * SIZE(BO)
  1465. LFD f19, 5 * SIZE(BO)
  1466. FMUL f8, f16, f8
  1467. LFD f20, 4 * SIZE(BO)
  1468. LFD f21, 0 * SIZE(BO)
  1469. FNMSUB f4, f17, f8, f4
  1470. FNMSUB f0, f18, f8, f0
  1471. FMUL f4, f19, f4
  1472. FNMSUB f0, f20, f4, f0
  1473. FMUL f0, f21, f0
  1474. #endif
  1475. #ifdef LN
  /* LN: step the C pointers back one element before storing. */
  1476. subi CO1, CO1, 1 * SIZE
  1477. subi CO2, CO2, 1 * SIZE
  1478. subi CO3, CO3, 1 * SIZE
  1479. subi CO4, CO4, 1 * SIZE
  1480. #endif
  /* Write the solution back over the right-hand side (BO for LN/LT,
   * AO for RN/RT) ... */
  1481. #if defined(LN) || defined(LT)
  1482. STFD f0, 0 * SIZE(BO)
  1483. STFD f4, 1 * SIZE(BO)
  1484. STFD f8, 2 * SIZE(BO)
  1485. STFD f12, 3 * SIZE(BO)
  1486. #else
  1487. STFD f0, 0 * SIZE(AO)
  1488. STFD f4, 1 * SIZE(AO)
  1489. STFD f8, 2 * SIZE(AO)
  1490. STFD f12, 3 * SIZE(AO)
  1491. #endif
  /* ... and into the four C columns. */
  1492. STFD f0, 0 * SIZE(CO1)
  1493. STFD f4, 0 * SIZE(CO2)
  1494. STFD f8, 0 * SIZE(CO3)
  1495. STFD f12, 0 * SIZE(CO4)
  /* Reload f0 from FZERO and copy it into the accumulators used above. */
  1496. lfs f0, FZERO
  1497. fmr f1, f0
  1498. fmr f4, f0
  1499. fmr f5, f0
  1500. fmr f8, f0
  1501. fmr f9, f0
  1502. fmr f12, f0
  1503. fmr f13, f0
  1504. #ifndef LN
  1505. addi CO1, CO1, 1 * SIZE
  1506. addi CO2, CO2, 1 * SIZE
  1507. addi CO3, CO3, 1 * SIZE
  1508. addi CO4, CO4, 1 * SIZE
  1509. #endif
  1510. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  1511. slwi r0, K, 0 + BASE_SHIFT
  1512. add AORIG, AORIG, r0
  1513. #endif
  1514. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not multiplied
   * (1 A value and 4 B values per step). */
  1515. sub TEMP, K, KK
  1516. slwi r0, TEMP, 0 + BASE_SHIFT
  1517. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1518. add AO, AO, r0
  1519. add BO, BO, TEMP
  1520. #endif
  /* KK moves by the one row just processed. */
  1521. #ifdef LN
  1522. subi KK, KK, 1
  1523. #endif
  1524. #ifdef LT
  1525. addi KK, KK, 1
  1526. #endif
  /* LL(39): end of one four-column panel.  Updates B and KK for the
   * next panel, decrements J and loops back to LL(10) while J > 0. */
  1527. .align 4
  1528. LL(39):
  1529. #ifdef LN
  /* LN: advance B by K << (2 + BASE_SHIFT) bytes. */
  1530. slwi r0, K, 2 + BASE_SHIFT
  1531. add B, B, r0
  1532. #endif
  1533. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been walked to the end of this panel. */
  1534. mr B, BO
  1535. #endif
  1536. #ifdef RN
  1537. addi KK, KK, 4
  1538. #endif
  1539. #ifdef RT
  1540. subi KK, KK, 4
  1541. #endif
  /* addic. sets cr0 for the bgt; the lfs in between does not touch cr0. */
  1542. addic. J, J, -1
  1543. lfs f0, FZERO
  1544. bgt LL(10)
  /* LL(40): two-column panel (CO1, CO2), entered when bit 1 of N is set;
   * otherwise control skips to LL(70).  Sets up the C column pointers,
   * KK, cleared accumulators f1-f7 and the four-row tile count I. */
  1545. .align 4
  1546. LL(40):
  1547. andi. J, N, 2
  1548. ble LL(70)
  1549. #ifdef RT
  /* RT: step B back by K << (1 + BASE_SHIFT) bytes and C back by
   * 2 * LDC before using them. */
  1550. slwi r0, K, 1 + BASE_SHIFT
  1551. sub B, B, r0
  1552. slwi r0, LDC, 1
  1553. sub C, C, r0
  1554. #endif
  1555. mr CO1, C
  1556. add CO2, C, LDC
  1557. #ifdef LN
  1558. add KK, M, OFFSET
  1559. #endif
  1560. #ifdef LT
  1561. mr KK, OFFSET
  1562. #endif
  /* Copy f0 into the remaining accumulators (f0 is loaded from FZERO
   * on the paths visible in this chunk). */
  1563. fmr f1, f0
  1564. fmr f2, f0
  1565. fmr f3, f0
  1566. fmr f4, f0
  1567. fmr f5, f0
  1568. fmr f6, f0
  1569. fmr f7, f0
  /* I = M >> 2; cr0 from this srawi. feeds the ble LL(50) below
   * (the mr/add in between leave cr0 alone). */
  1570. srawi. I, M, 2
  1571. #if defined(LN) || defined(RT)
  1572. mr AORIG, A
  1573. #else
  1574. mr AO, A
  1575. #endif
  1576. #ifndef RT
  /* Non-RT: C moves forward past the two columns of this panel. */
  1577. add C, CO2, LDC
  1578. #endif
  1579. ble LL(50)
  /* LL(41): per-tile setup for four rows by two columns.  Preloads four
   * A values (f16-f19) and four B values (f20-f23), issues cache-touch
   * hints for both C columns and loads CTR with the number of 4-step
   * unrolled iterations for LL(42). */
  1580. .align 4
  1581. LL(41):
  1582. #if defined(LT) || defined(RN)
  /* LT/RN: multiply over KK steps from the current AO and the top of B. */
  1583. LFD f16, 0 * SIZE(AO)
  1584. LFD f17, 1 * SIZE(AO)
  1585. LFD f18, 2 * SIZE(AO)
  1586. LFD f19, 3 * SIZE(AO)
  1587. LFD f20, 0 * SIZE(B)
  1588. LFD f21, 1 * SIZE(B)
  1589. LFD f22, 2 * SIZE(B)
  1590. LFD f23, 3 * SIZE(B)
  1591. dcbt CO1, PREC
  1592. dcbt CO2, PREC
  1593. srawi. r0, KK, 2
  1594. mtspr CTR, r0
  1595. mr BO, B
  1596. #else
  1597. #ifdef LN
  /* LN: move AORIG back by K << (2 + BASE_SHIFT) bytes. */
  1598. slwi r0, K, 2 + BASE_SHIFT
  1599. sub AORIG, AORIG, r0
  1600. #endif
  /* LN/RT: start KK steps into A (4 values per step) and B (2 values
   * per step); multiply over the remaining K - KK steps. */
  1601. slwi r0, KK, 2 + BASE_SHIFT
  1602. slwi TEMP, KK, 1 + BASE_SHIFT
  1603. add AO, AORIG, r0
  1604. add BO, B, TEMP
  1605. sub TEMP, K, KK
  1606. LFD f16, 0 * SIZE(AO)
  1607. LFD f17, 1 * SIZE(AO)
  1608. LFD f18, 2 * SIZE(AO)
  1609. LFD f19, 3 * SIZE(AO)
  1610. LFD f20, 0 * SIZE(BO)
  1611. LFD f21, 1 * SIZE(BO)
  1612. LFD f22, 2 * SIZE(BO)
  1613. LFD f23, 3 * SIZE(BO)
  1614. dcbt CO1, PREC
  1615. dcbt CO2, PREC
  1616. srawi. r0, TEMP, 2
  1617. mtspr CTR, r0
  1618. #endif
  /* No full group of 4 steps: go to the remainder handling. */
  1619. ble LL(45)
  /* LL(42): multiply loop for four rows by two columns, four k steps
   * per iteration.  f0-f3 accumulate the four rows of the first column,
   * f4-f7 the four rows of the second.  f20/f21 and f22/f23 each hold
   * the two B values of one step; f16-f19 hold the four A values of the
   * step being processed. */
  1620. .align 5
  1621. LL(42):
  /* step 0 */
  1622. FMADD f0, f16, f20, f0
  1623. FMADD f1, f17, f20, f1
  1624. FMADD f2, f18, f20, f2
  1625. FMADD f3, f19, f20, f3
  1626. FMADD f4, f16, f21, f4
  1627. FMADD f5, f17, f21, f5
  1628. FMADD f6, f18, f21, f6
  1629. FMADD f7, f19, f21, f7
  1630. LFD f16, 4 * SIZE(AO)
  1631. LFD f17, 5 * SIZE(AO)
  1632. LFD f18, 6 * SIZE(AO)
  1633. LFD f19, 7 * SIZE(AO)
  /* step 1 */
  1634. FMADD f0, f16, f22, f0
  1635. FMADD f1, f17, f22, f1
  1636. FMADD f2, f18, f22, f2
  1637. FMADD f3, f19, f22, f3
  1638. FMADD f4, f16, f23, f4
  1639. FMADD f5, f17, f23, f5
  1640. FMADD f6, f18, f23, f6
  1641. FMADD f7, f19, f23, f7
  1642. LFD f16, 8 * SIZE(AO)
  1643. LFD f17, 9 * SIZE(AO)
  1644. LFD f18, 10 * SIZE(AO)
  1645. LFD f19, 11 * SIZE(AO)
  1646. LFD f20, 4 * SIZE(BO)
  1647. LFD f21, 5 * SIZE(BO)
  1648. LFD f22, 6 * SIZE(BO)
  1649. LFD f23, 7 * SIZE(BO)
  /* step 2 */
  1650. FMADD f0, f16, f20, f0
  1651. FMADD f1, f17, f20, f1
  1652. FMADD f2, f18, f20, f2
  1653. FMADD f3, f19, f20, f3
  1654. FMADD f4, f16, f21, f4
  1655. FMADD f5, f17, f21, f5
  1656. FMADD f6, f18, f21, f6
  1657. FMADD f7, f19, f21, f7
  1658. LFD f16, 12 * SIZE(AO)
  1659. LFD f17, 13 * SIZE(AO)
  1660. LFD f18, 14 * SIZE(AO)
  1661. LFD f19, 15 * SIZE(AO)
  /* step 3 */
  1662. FMADD f0, f16, f22, f0
  1663. FMADD f1, f17, f22, f1
  1664. FMADD f2, f18, f22, f2
  1665. FMADD f3, f19, f22, f3
  1666. FMADD f4, f16, f23, f4
  1667. FMADD f5, f17, f23, f5
  1668. FMADD f6, f18, f23, f6
  1669. FMADD f7, f19, f23, f7
  /* Operands for step 0 (A) and steps 0-1 (B) of the next iteration. */
  1670. LFD f16, 16 * SIZE(AO)
  1671. LFD f17, 17 * SIZE(AO)
  1672. LFD f18, 18 * SIZE(AO)
  1673. LFD f19, 19 * SIZE(AO)
  1674. LFD f20, 8 * SIZE(BO)
  1675. LFD f21, 9 * SIZE(BO)
  1676. LFD f22, 10 * SIZE(BO)
  1677. LFD f23, 11 * SIZE(BO)
  /* Advance by 4 steps: 16 A values, 8 B values. */
  1678. addi AO, AO, 16 * SIZE
  1679. addi BO, BO, 8 * SIZE
  /* Cache-touch hint for the address AO + PREA. */
  1680. dcbtst AO, PREA
  1681. bdnz LL(42)
  /* LL(45): remainder of the multiply for four rows by two columns.
   * CTR = (step count) & 3, where the step count is KK for LT/RN and
   * TEMP (= K - KK) otherwise; skips to LL(48) when it is zero. */
  1682. .align 4
  1683. LL(45):
  1684. #if defined(LT) || defined(RN)
  1685. andi. r0, KK, 3
  1686. #else
  1687. andi. r0, TEMP, 3
  1688. #endif
  1689. mtspr CTR, r0
  1690. ble+ LL(48)
  1691. .align 4
  /* LL(46): one k step per iteration; f16-f19 and f20/f21 already hold
   * the operands of the current step. */
  1692. LL(46):
  1693. FMADD f0, f16, f20, f0
  1694. FMADD f1, f17, f20, f1
  1695. FMADD f2, f18, f20, f2
  1696. FMADD f3, f19, f20, f3
  1697. FMADD f4, f16, f21, f4
  1698. FMADD f5, f17, f21, f5
  1699. FMADD f6, f18, f21, f6
  1700. FMADD f7, f19, f21, f7
  /* Fetch the operands of the next step, then advance 4 A / 2 B values. */
  1701. LFD f16, 4 * SIZE(AO)
  1702. LFD f17, 5 * SIZE(AO)
  1703. LFD f18, 6 * SIZE(AO)
  1704. LFD f19, 7 * SIZE(AO)
  1705. LFD f20, 2 * SIZE(BO)
  1706. LFD f21, 3 * SIZE(BO)
  1707. addi BO, BO, 2 * SIZE
  1708. addi AO, AO, 4 * SIZE
  1709. bdnz LL(46)
  /* LL(48): solve and store for four rows by two columns, then loop
   * back to LL(41) for the next tile.  f0-f3 hold the accumulated
   * products for rows 0-3 of the first column, f4-f7 for the second.
   * Notes on macros defined outside this chunk (to be confirmed there):
   *   FSUB d, a, b      is read here as d = a - b
   *   FNMSUB d, a, c, b is read here as d = b - a * c
   * Diagonal entries are applied with FMUL, so the packed triangular
   * data presumably stores them already inverted - confirm against the
   * packing routine. */
  1710. .align 4
  1711. LL(48):
  1712. #if defined(LN) || defined(RT)
  /* Point AO/BO at the block being solved: index KK - 4 (LN) or KK - 2
   * (RT), scaled by 4 values per index for A and 2 per index for B. */
  1713. #ifdef LN
  1714. subi r0, KK, 4
  1715. #else
  1716. subi r0, KK, 2
  1717. #endif
  1718. slwi TEMP, r0, 2 + BASE_SHIFT
  1719. slwi r0, r0, 1 + BASE_SHIFT
  1720. add AO, AORIG, TEMP
  1721. add BO, B, r0
  1722. #endif
  1723. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side in BO is interleaved by row
   * (row r: first column at 2r, second column at 2r + 1). */
  1724. LFD f16, 0 * SIZE(BO)
  1725. LFD f17, 1 * SIZE(BO)
  1726. LFD f20, 2 * SIZE(BO)
  1727. LFD f21, 3 * SIZE(BO)
  1728. LFD f24, 4 * SIZE(BO)
  1729. LFD f25, 5 * SIZE(BO)
  1730. LFD f28, 6 * SIZE(BO)
  1731. LFD f29, 7 * SIZE(BO)
  1732. FSUB f0, f16, f0
  1733. FSUB f4, f17, f4
  1734. FSUB f1, f20, f1
  1735. FSUB f5, f21, f5
  1736. FSUB f2, f24, f2
  1737. FSUB f6, f25, f6
  1738. FSUB f3, f28, f3
  1739. FSUB f7, f29, f7
  1740. #else
  /* RN/RT: right-hand side in AO holds the four rows of the first
   * column followed by the four rows of the second. */
  1741. LFD f16, 0 * SIZE(AO)
  1742. LFD f17, 1 * SIZE(AO)
  1743. LFD f18, 2 * SIZE(AO)
  1744. LFD f19, 3 * SIZE(AO)
  1745. LFD f20, 4 * SIZE(AO)
  1746. LFD f21, 5 * SIZE(AO)
  1747. LFD f22, 6 * SIZE(AO)
  1748. LFD f23, 7 * SIZE(AO)
  1749. FSUB f0, f16, f0
  1750. FSUB f1, f17, f1
  1751. FSUB f2, f18, f2
  1752. FSUB f3, f19, f3
  1753. FSUB f4, f20, f4
  1754. FSUB f5, f21, f5
  1755. FSUB f6, f22, f6
  1756. FSUB f7, f23, f7
  1757. #endif
  1758. #ifdef LN
  /* Backward substitution over the four rows (row 3 first), using the
   * A entries at offsets 15-12, 10-8, 5-4 and 0. */
  1759. LFD f16, 15 * SIZE(AO)
  1760. LFD f17, 14 * SIZE(AO)
  1761. LFD f18, 13 * SIZE(AO)
  1762. LFD f19, 12 * SIZE(AO)
  1763. FMUL f3, f16, f3
  1764. FMUL f7, f16, f7
  1765. FNMSUB f2, f17, f3, f2
  1766. FNMSUB f6, f17, f7, f6
  1767. FNMSUB f1, f18, f3, f1
  1768. FNMSUB f5, f18, f7, f5
  1769. FNMSUB f0, f19, f3, f0
  1770. FNMSUB f4, f19, f7, f4
  1771. LFD f16, 10 * SIZE(AO)
  1772. LFD f17, 9 * SIZE(AO)
  1773. LFD f18, 8 * SIZE(AO)
  1774. LFD f19, 5 * SIZE(AO)
  1775. LFD f20, 4 * SIZE(AO)
  1776. LFD f21, 0 * SIZE(AO)
  1777. FMUL f2, f16, f2
  1778. FMUL f6, f16, f6
  1779. FNMSUB f1, f17, f2, f1
  1780. FNMSUB f5, f17, f6, f5
  1781. FNMSUB f0, f18, f2, f0
  1782. FNMSUB f4, f18, f6, f4
  1783. FMUL f1, f19, f1
  1784. FMUL f5, f19, f5
  1785. FNMSUB f0, f20, f1, f0
  1786. FNMSUB f4, f20, f5, f4
  1787. FMUL f0, f21, f0
  1788. FMUL f4, f21, f4
  1789. #endif
  1790. #ifdef LT
  /* Forward substitution over the four rows (row 0 first), using the
   * A entries at offsets 0-3, 5-7, 10-11 and 15. */
  1791. LFD f16, 0 * SIZE(AO)
  1792. LFD f17, 1 * SIZE(AO)
  1793. LFD f18, 2 * SIZE(AO)
  1794. LFD f19, 3 * SIZE(AO)
  1795. FMUL f0, f16, f0
  1796. FMUL f4, f16, f4
  1797. FNMSUB f1, f17, f0, f1
  1798. FNMSUB f5, f17, f4, f5
  1799. FNMSUB f2, f18, f0, f2
  1800. FNMSUB f6, f18, f4, f6
  1801. FNMSUB f3, f19, f0, f3
  1802. FNMSUB f7, f19, f4, f7
  1803. LFD f17, 5 * SIZE(AO)
  1804. LFD f18, 6 * SIZE(AO)
  1805. LFD f19, 7 * SIZE(AO)
  1806. FMUL f1, f17, f1
  1807. FMUL f5, f17, f5
  1808. FNMSUB f2, f18, f1, f2
  1809. FNMSUB f6, f18, f5, f6
  1810. FNMSUB f3, f19, f1, f3
  1811. FNMSUB f7, f19, f5, f7
  1812. LFD f18, 10 * SIZE(AO)
  1813. LFD f19, 11 * SIZE(AO)
  1814. FMUL f2, f18, f2
  1815. FMUL f6, f18, f6
  1816. FNMSUB f3, f19, f2, f3
  1817. FNMSUB f7, f19, f6, f7
  1818. LFD f19, 15 * SIZE(AO)
  1819. FMUL f3, f19, f3
  1820. FMUL f7, f19, f7
  1821. #endif
  1822. #ifdef RN
  /* Two-column forward substitution using B entries 0, 1 and 3. */
  1823. LFD f16, 0 * SIZE(BO)
  1824. LFD f17, 1 * SIZE(BO)
  1825. LFD f18, 3 * SIZE(BO)
  1826. FMUL f0, f16, f0
  1827. FMUL f1, f16, f1
  1828. FMUL f2, f16, f2
  1829. FMUL f3, f16, f3
  1830. FNMSUB f4, f17, f0, f4
  1831. FNMSUB f5, f17, f1, f5
  1832. FNMSUB f6, f17, f2, f6
  1833. FNMSUB f7, f17, f3, f7
  1834. FMUL f4, f18, f4
  1835. FMUL f5, f18, f5
  1836. FMUL f6, f18, f6
  1837. FMUL f7, f18, f7
  1838. #endif
  1839. #ifdef RT
  /* Two-column backward substitution using B entries 3, 2 and 0. */
  1840. LFD f19, 3 * SIZE(BO)
  1841. LFD f20, 2 * SIZE(BO)
  1842. LFD f21, 0 * SIZE(BO)
  1843. FMUL f4, f19, f4
  1844. FMUL f5, f19, f5
  1845. FMUL f6, f19, f6
  1846. FMUL f7, f19, f7
  1847. FNMSUB f0, f20, f4, f0
  1848. FNMSUB f1, f20, f5, f1
  1849. FNMSUB f2, f20, f6, f2
  1850. FNMSUB f3, f20, f7, f3
  1851. FMUL f0, f21, f0
  1852. FMUL f1, f21, f1
  1853. FMUL f2, f21, f2
  1854. FMUL f3, f21, f3
  1855. #endif
  1856. #ifdef LN
  /* LN: step the C pointers back four elements before storing. */
  1857. subi CO1, CO1, 4 * SIZE
  1858. subi CO2, CO2, 4 * SIZE
  1859. #endif
  /* Write the solution back over the right-hand side, in the same
   * layout it was read with ... */
  1860. #if defined(LN) || defined(LT)
  1861. STFD f0, 0 * SIZE(BO)
  1862. STFD f4, 1 * SIZE(BO)
  1863. STFD f1, 2 * SIZE(BO)
  1864. STFD f5, 3 * SIZE(BO)
  1865. STFD f2, 4 * SIZE(BO)
  1866. STFD f6, 5 * SIZE(BO)
  1867. STFD f3, 6 * SIZE(BO)
  1868. STFD f7, 7 * SIZE(BO)
  1869. #else
  1870. STFD f0, 0 * SIZE(AO)
  1871. STFD f1, 1 * SIZE(AO)
  1872. STFD f2, 2 * SIZE(AO)
  1873. STFD f3, 3 * SIZE(AO)
  1874. STFD f4, 4 * SIZE(AO)
  1875. STFD f5, 5 * SIZE(AO)
  1876. STFD f6, 6 * SIZE(AO)
  1877. STFD f7, 7 * SIZE(AO)
  1878. #endif
  /* ... and into the two C columns. */
  1879. STFD f0, 0 * SIZE(CO1)
  1880. STFD f1, 1 * SIZE(CO1)
  1881. STFD f2, 2 * SIZE(CO1)
  1882. STFD f3, 3 * SIZE(CO1)
  1883. STFD f4, 0 * SIZE(CO2)
  1884. STFD f5, 1 * SIZE(CO2)
  1885. STFD f6, 2 * SIZE(CO2)
  1886. STFD f7, 3 * SIZE(CO2)
  /* Reload f0 from FZERO and clear the accumulators for the next tile. */
  1887. lfs f0, FZERO
  1888. fmr f1, f0
  1889. fmr f2, f0
  1890. fmr f3, f0
  1891. fmr f4, f0
  1892. fmr f5, f0
  1893. fmr f6, f0
  1894. fmr f7, f0
  1895. #ifndef LN
  1896. addi CO1, CO1, 4 * SIZE
  1897. addi CO2, CO2, 4 * SIZE
  1898. #endif
  1899. #ifdef RT
  /* RT: advance AORIG by K << (2 + BASE_SHIFT) bytes. */
  1900. slwi r0, K, 2 + BASE_SHIFT
  1901. add AORIG, AORIG, r0
  1902. #endif
  1903. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not multiplied
   * (4 A values and 2 B values per step). */
  1904. sub TEMP, K, KK
  1905. slwi r0, TEMP, 2 + BASE_SHIFT
  1906. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1907. add AO, AO, r0
  1908. add BO, BO, TEMP
  1909. #endif
  /* KK moves by the four rows just processed. */
  1910. #ifdef LN
  1911. subi KK, KK, 4
  1912. #endif
  1913. #ifdef LT
  1914. addi KK, KK, 4
  1915. #endif
  /* Next four-row tile, while any remain. */
  1916. addic. I, I, -1
  1917. bgt+ LL(41)
  /* LL(50): two-column panel, two leftover rows.  Entered when bit 1 of
   * M is set; otherwise control skips to LL(60).  Preloads A values
   * into f16-f19 (two steps) and B values into f20-f27 (four steps),
   * and loads CTR with the 4-step iteration count for LL(52). */
  1918. .align 4
  1919. LL(50):
  1920. andi. I, M, 2
  1921. ble LL(60)
  1922. #if defined(LT) || defined(RN)
  /* LT/RN: multiply over KK steps from the current AO and the top of B. */
  1923. LFD f16, 0 * SIZE(AO)
  1924. LFD f17, 1 * SIZE(AO)
  1925. LFD f18, 2 * SIZE(AO)
  1926. LFD f19, 3 * SIZE(AO)
  1927. LFD f20, 0 * SIZE(B)
  1928. LFD f21, 1 * SIZE(B)
  1929. LFD f22, 2 * SIZE(B)
  1930. LFD f23, 3 * SIZE(B)
  1931. LFD f24, 4 * SIZE(B)
  1932. LFD f25, 5 * SIZE(B)
  1933. LFD f26, 6 * SIZE(B)
  1934. LFD f27, 7 * SIZE(B)
  1935. srawi. r0, KK, 2
  1936. mtspr CTR, r0
  1937. mr BO, B
  1938. #else
  1939. #ifdef LN
  /* LN: move AORIG back by K << (1 + BASE_SHIFT) bytes. */
  1940. slwi r0, K, 1 + BASE_SHIFT
  1941. sub AORIG, AORIG, r0
  1942. #endif
  /* LN/RT: start KK steps into A and B (2 values per step in each);
   * multiply over the remaining K - KK steps. */
  1943. slwi r0, KK, 1 + BASE_SHIFT
  1944. slwi TEMP, KK, 1 + BASE_SHIFT
  1945. add AO, AORIG, r0
  1946. add BO, B, TEMP
  1947. sub TEMP, K, KK
  1948. LFD f16, 0 * SIZE(AO)
  1949. LFD f17, 1 * SIZE(AO)
  1950. LFD f18, 2 * SIZE(AO)
  1951. LFD f19, 3 * SIZE(AO)
  1952. LFD f20, 0 * SIZE(BO)
  1953. LFD f21, 1 * SIZE(BO)
  1954. LFD f22, 2 * SIZE(BO)
  1955. LFD f23, 3 * SIZE(BO)
  1956. LFD f24, 4 * SIZE(BO)
  1957. LFD f25, 5 * SIZE(BO)
  1958. LFD f26, 6 * SIZE(BO)
  1959. LFD f27, 7 * SIZE(BO)
  1960. srawi. r0, TEMP, 2
  1961. mtspr CTR, r0
  1962. #endif
  /* No full group of 4 steps: go to the remainder handling. */
  1963. ble LL(55)
  /* LL(52): multiply loop for two rows by two columns, four k steps per
   * iteration.  Steps 0 and 2 accumulate into f0-f3, steps 1 and 3 into
   * f4-f7 (summed later at LL(58)).  Within each set the registers are
   * (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1). */
  1964. .align 5
  1965. LL(52):
  /* step 0 (f16/f17 x f20/f21) and step 1 (f18/f19 x f22/f23) */
  1966. FMADD f0, f16, f20, f0
  1967. FMADD f1, f17, f20, f1
  1968. FMADD f2, f16, f21, f2
  1969. FMADD f3, f17, f21, f3
  1970. FMADD f4, f18, f22, f4
  1971. FMADD f5, f19, f22, f5
  1972. FMADD f6, f18, f23, f6
  1973. FMADD f7, f19, f23, f7
  /* A values for steps 2-3; B values for steps 0-1 of the next iteration. */
  1974. LFD f16, 4 * SIZE(AO)
  1975. LFD f17, 5 * SIZE(AO)
  1976. LFD f18, 6 * SIZE(AO)
  1977. LFD f19, 7 * SIZE(AO)
  1978. LFD f20, 8 * SIZE(BO)
  1979. LFD f21, 9 * SIZE(BO)
  1980. LFD f22, 10 * SIZE(BO)
  1981. LFD f23, 11 * SIZE(BO)
  /* step 2 (f16/f17 x f24/f25) and step 3 (f18/f19 x f26/f27) */
  1982. FMADD f0, f16, f24, f0
  1983. FMADD f1, f17, f24, f1
  1984. FMADD f2, f16, f25, f2
  1985. FMADD f3, f17, f25, f3
  1986. FMADD f4, f18, f26, f4
  1987. FMADD f5, f19, f26, f5
  1988. FMADD f6, f18, f27, f6
  1989. FMADD f7, f19, f27, f7
  /* Operands for the next iteration. */
  1990. LFD f16, 8 * SIZE(AO)
  1991. LFD f17, 9 * SIZE(AO)
  1992. LFD f18, 10 * SIZE(AO)
  1993. LFD f19, 11 * SIZE(AO)
  1994. LFD f24, 12 * SIZE(BO)
  1995. LFD f25, 13 * SIZE(BO)
  1996. LFD f26, 14 * SIZE(BO)
  1997. LFD f27, 15 * SIZE(BO)
  /* Advance by 4 steps: 8 A values, 8 B values. */
  1998. addi AO, AO, 8 * SIZE
  1999. addi BO, BO, 8 * SIZE
  /* Cache-touch hint for the address AO + PREA. */
  2000. dcbtst AO, PREA
  2001. bdnz LL(52)
  /* LL(55): remainder of the multiply for two rows by two columns.
   * CTR = (step count) & 3, where the step count is KK for LT/RN and
   * TEMP (= K - KK) otherwise; skips to LL(58) when it is zero. */
  2002. .align 4
  2003. LL(55):
  2004. #if defined(LT) || defined(RN)
  2005. andi. r0, KK, 3
  2006. #else
  2007. andi. r0, TEMP, 3
  2008. #endif
  2009. mtspr CTR, r0
  2010. ble+ LL(58)
  2011. .align 4
  /* LL(56): one k step per iteration into f0-f3; f16/f17 and f20/f21
   * already hold the operands of the current step. */
  2012. LL(56):
  2013. FMADD f0, f16, f20, f0
  2014. FMADD f1, f17, f20, f1
  2015. FMADD f2, f16, f21, f2
  2016. FMADD f3, f17, f21, f3
  /* Fetch the operands of the next step, then advance 2 A / 2 B values. */
  2017. LFD f16, 2 * SIZE(AO)
  2018. LFD f17, 3 * SIZE(AO)
  2019. LFD f20, 2 * SIZE(BO)
  2020. LFD f21, 3 * SIZE(BO)
  2021. addi BO, BO, 2 * SIZE
  2022. addi AO, AO, 2 * SIZE
  2023. bdnz LL(56)
  /* LL(58): solve and store for two rows by two columns.
   * After the fold below: f0 = (row 0, col 0), f1 = (row 1, col 0),
   * f2 = (row 0, col 1), f3 = (row 1, col 1).
   * Notes on macros defined outside this chunk (to be confirmed there):
   *   FSUB d, a, b      is read here as d = a - b
   *   FNMSUB d, a, c, b is read here as d = b - a * c
   * Diagonal entries are applied with FMUL, so the packed triangular
   * data presumably stores them already inverted - confirm against the
   * packing routine. */
  2024. .align 4
  2025. LL(58):
  /* Fold the odd-step partial sums (f4-f7) into f0-f3. */
  2026. FADD f0, f4, f0
  2027. FADD f1, f5, f1
  2028. FADD f2, f6, f2
  2029. FADD f3, f7, f3
  2030. #if defined(LN) || defined(RT)
  /* Point AO/BO at the block being solved (index KK - 2).  Both arms of
   * the conditional are identical because this tile is 2 wide in both
   * directions. */
  2031. #ifdef LN
  2032. subi r0, KK, 2
  2033. #else
  2034. subi r0, KK, 2
  2035. #endif
  2036. slwi TEMP, r0, 1 + BASE_SHIFT
  2037. slwi r0, r0, 1 + BASE_SHIFT
  2038. add AO, AORIG, TEMP
  2039. add BO, B, r0
  2040. #endif
  2041. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side in BO is interleaved by row. */
  2042. LFD f16, 0 * SIZE(BO)
  2043. LFD f17, 1 * SIZE(BO)
  2044. LFD f20, 2 * SIZE(BO)
  2045. LFD f21, 3 * SIZE(BO)
  2046. FSUB f0, f16, f0
  2047. FSUB f2, f17, f2
  2048. FSUB f1, f20, f1
  2049. FSUB f3, f21, f3
  2050. #else
  /* RN/RT: right-hand side in AO holds column 0 then column 1. */
  2051. LFD f16, 0 * SIZE(AO)
  2052. LFD f17, 1 * SIZE(AO)
  2053. LFD f20, 2 * SIZE(AO)
  2054. LFD f21, 3 * SIZE(AO)
  2055. FSUB f0, f16, f0
  2056. FSUB f1, f17, f1
  2057. FSUB f2, f20, f2
  2058. FSUB f3, f21, f3
  2059. #endif
  2060. #ifdef LN
  /* Two-row backward substitution using A entries 3, 2 and 0. */
  2061. LFD f19, 3 * SIZE(AO)
  2062. LFD f20, 2 * SIZE(AO)
  2063. LFD f21, 0 * SIZE(AO)
  2064. FMUL f1, f19, f1
  2065. FMUL f3, f19, f3
  2066. FNMSUB f0, f20, f1, f0
  2067. FNMSUB f2, f20, f3, f2
  2068. FMUL f0, f21, f0
  2069. FMUL f2, f21, f2
  2070. #endif
  2071. #ifdef LT
  /* Two-row forward substitution using A entries 0, 1 and 3. */
  2072. LFD f16, 0 * SIZE(AO)
  2073. LFD f17, 1 * SIZE(AO)
  2074. FMUL f0, f16, f0
  2075. FMUL f2, f16, f2
  2076. FNMSUB f1, f17, f0, f1
  2077. FNMSUB f3, f17, f2, f3
  2078. LFD f17, 3 * SIZE(AO)
  2079. FMUL f1, f17, f1
  2080. FMUL f3, f17, f3
  2081. #endif
  2082. #ifdef RN
  /* Two-column forward substitution using B entries 0, 1 and 3. */
  2083. LFD f16, 0 * SIZE(BO)
  2084. LFD f17, 1 * SIZE(BO)
  2085. LFD f18, 3 * SIZE(BO)
  2086. FMUL f0, f16, f0
  2087. FMUL f1, f16, f1
  2088. FNMSUB f2, f17, f0, f2
  2089. FNMSUB f3, f17, f1, f3
  2090. FMUL f2, f18, f2
  2091. FMUL f3, f18, f3
  2092. #endif
  2093. #ifdef RT
  /* Two-column backward substitution using B entries 3, 2 and 0. */
  2094. LFD f19, 3 * SIZE(BO)
  2095. LFD f20, 2 * SIZE(BO)
  2096. LFD f21, 0 * SIZE(BO)
  2097. FMUL f2, f19, f2
  2098. FMUL f3, f19, f3
  2099. FNMSUB f0, f20, f2, f0
  2100. FNMSUB f1, f20, f3, f1
  2101. FMUL f0, f21, f0
  2102. FMUL f1, f21, f1
  2103. #endif
  2104. #ifdef LN
  /* LN: step the C pointers back two elements before storing. */
  2105. subi CO1, CO1, 2 * SIZE
  2106. subi CO2, CO2, 2 * SIZE
  2107. #endif
  /* Write the solution back over the right-hand side, in the same
   * layout it was read with ... */
  2108. #if defined(LN) || defined(LT)
  2109. STFD f0, 0 * SIZE(BO)
  2110. STFD f2, 1 * SIZE(BO)
  2111. STFD f1, 2 * SIZE(BO)
  2112. STFD f3, 3 * SIZE(BO)
  2113. #else
  2114. STFD f0, 0 * SIZE(AO)
  2115. STFD f1, 1 * SIZE(AO)
  2116. STFD f2, 2 * SIZE(AO)
  2117. STFD f3, 3 * SIZE(AO)
  2118. #endif
  /* ... and into the two C columns. */
  2119. STFD f0, 0 * SIZE(CO1)
  2120. STFD f1, 1 * SIZE(CO1)
  2121. STFD f2, 0 * SIZE(CO2)
  2122. STFD f3, 1 * SIZE(CO2)
  /* Reload f0 from FZERO and clear f1-f7. */
  2123. lfs f0, FZERO
  2124. fmr f1, f0
  2125. fmr f2, f0
  2126. fmr f3, f0
  2127. fmr f4, f0
  2128. fmr f5, f0
  2129. fmr f6, f0
  2130. fmr f7, f0
  2131. #ifndef LN
  2132. addi CO1, CO1, 2 * SIZE
  2133. addi CO2, CO2, 2 * SIZE
  2134. #endif
  2135. #ifdef RT
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) bytes. */
  2136. slwi r0, K, 1 + BASE_SHIFT
  2137. add AORIG, AORIG, r0
  2138. #endif
  2139. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not multiplied
   * (2 A values and 2 B values per step). */
  2140. sub TEMP, K, KK
  2141. slwi r0, TEMP, 1 + BASE_SHIFT
  2142. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2143. add AO, AO, r0
  2144. add BO, BO, TEMP
  2145. #endif
  /* KK moves by the two rows just processed. */
  2146. #ifdef LN
  2147. subi KK, KK, 2
  2148. #endif
  2149. #ifdef LT
  2150. addi KK, KK, 2
  2151. #endif
  /* LL(60): two-column panel, single leftover row.  Entered when bit 0
   * of M is set; otherwise control skips to LL(69).  Preloads A values
   * into f16-f19 (four steps) and B values into f20-f27 (four steps),
   * and loads CTR with the 4-step iteration count for LL(62). */
  2152. .align 4
  2153. LL(60):
  2154. andi. I, M, 1
  2155. ble LL(69)
  2156. #if defined(LT) || defined(RN)
  /* LT/RN: multiply over KK steps from the current AO and the top of B. */
  2157. LFD f16, 0 * SIZE(AO)
  2158. LFD f17, 1 * SIZE(AO)
  2159. LFD f18, 2 * SIZE(AO)
  2160. LFD f19, 3 * SIZE(AO)
  2161. LFD f20, 0 * SIZE(B)
  2162. LFD f21, 1 * SIZE(B)
  2163. LFD f22, 2 * SIZE(B)
  2164. LFD f23, 3 * SIZE(B)
  2165. LFD f24, 4 * SIZE(B)
  2166. LFD f25, 5 * SIZE(B)
  2167. LFD f26, 6 * SIZE(B)
  2168. LFD f27, 7 * SIZE(B)
  2169. srawi. r0, KK, 2
  2170. mtspr CTR, r0
  2171. mr BO, B
  2172. #else
  2173. #ifdef LN
  /* LN: move AORIG back by K << BASE_SHIFT bytes. */
  2174. slwi r0, K, BASE_SHIFT
  2175. sub AORIG, AORIG, r0
  2176. #endif
  /* LN/RT: start KK steps into A (1 value per step) and B (2 values per
   * step); multiply over the remaining K - KK steps. */
  2177. slwi r0, KK, 0 + BASE_SHIFT
  2178. slwi TEMP, KK, 1 + BASE_SHIFT
  2179. add AO, AORIG, r0
  2180. add BO, B, TEMP
  2181. sub TEMP, K, KK
  2182. LFD f16, 0 * SIZE(AO)
  2183. LFD f17, 1 * SIZE(AO)
  2184. LFD f18, 2 * SIZE(AO)
  2185. LFD f19, 3 * SIZE(AO)
  2186. LFD f20, 0 * SIZE(BO)
  2187. LFD f21, 1 * SIZE(BO)
  2188. LFD f22, 2 * SIZE(BO)
  2189. LFD f23, 3 * SIZE(BO)
  2190. LFD f24, 4 * SIZE(BO)
  2191. LFD f25, 5 * SIZE(BO)
  2192. LFD f26, 6 * SIZE(BO)
  2193. LFD f27, 7 * SIZE(BO)
  2194. srawi. r0, TEMP, 2
  2195. mtspr CTR, r0
  2196. #endif
  /* No full group of 4 steps: go to the remainder handling. */
  2197. ble LL(65)
  /* LL(62): multiply loop for one row by two columns, four k steps per
   * iteration.  Steps 0 and 2 accumulate into f0 (col 0) / f1 (col 1),
   * steps 1 and 3 into f2 / f3; the pairs are summed later at LL(68). */
  2198. .align 5
  2199. LL(62):
  /* steps 0 (f16) and 1 (f17) */
  2200. FMADD f0, f16, f20, f0
  2201. FMADD f1, f16, f21, f1
  2202. FMADD f2, f17, f22, f2
  2203. FMADD f3, f17, f23, f3
  /* B values for steps 0-1 of the next iteration */
  2204. LFD f20, 8 * SIZE(BO)
  2205. LFD f21, 9 * SIZE(BO)
  2206. LFD f22, 10 * SIZE(BO)
  2207. LFD f23, 11 * SIZE(BO)
  /* steps 2 (f18) and 3 (f19) */
  2208. FMADD f0, f18, f24, f0
  2209. FMADD f1, f18, f25, f1
  2210. FMADD f2, f19, f26, f2
  2211. FMADD f3, f19, f27, f3
  /* A values and step 2-3 B values for the next iteration */
  2212. LFD f16, 4 * SIZE(AO)
  2213. LFD f17, 5 * SIZE(AO)
  2214. LFD f18, 6 * SIZE(AO)
  2215. LFD f19, 7 * SIZE(AO)
  2216. LFD f24, 12 * SIZE(BO)
  2217. LFD f25, 13 * SIZE(BO)
  2218. LFD f26, 14 * SIZE(BO)
  2219. LFD f27, 15 * SIZE(BO)
  /* Advance by 4 steps: 4 A values, 8 B values. */
  2220. addi AO, AO, 4 * SIZE
  2221. addi BO, BO, 8 * SIZE
  2222. bdnz LL(62)
  /* LL(65): remainder of the multiply for one row by two columns.
   * CTR = (step count) & 3, where the step count is KK for LT/RN and
   * TEMP (= K - KK) otherwise; skips to LL(68) when it is zero. */
  2223. .align 4
  2224. LL(65):
  2225. #if defined(LT) || defined(RN)
  2226. andi. r0, KK, 3
  2227. #else
  2228. andi. r0, TEMP, 3
  2229. #endif
  2230. mtspr CTR, r0
  2231. ble+ LL(68)
  2232. .align 4
  /* LL(66): one k step per iteration into f0/f1; f16 and f20/f21 already
   * hold the operands of the current step. */
  2233. LL(66):
  2234. FMADD f0, f16, f20, f0
  2235. FMADD f1, f16, f21, f1
  /* Fetch the operands of the next step, then advance 1 A / 2 B values. */
  2236. LFD f16, 1 * SIZE(AO)
  2237. LFD f20, 2 * SIZE(BO)
  2238. LFD f21, 3 * SIZE(BO)
  2239. addi BO, BO, 2 * SIZE
  2240. addi AO, AO, 1 * SIZE
  2241. bdnz LL(66)
  /* LL(68): solve and store for one row by two columns.
   * After the fold below f0 is column 0 and f1 is column 1.
   * Notes on macros defined outside this chunk (to be confirmed there):
   *   FSUB d, a, b      is read here as d = a - b
   *   FNMSUB d, a, c, b is read here as d = b - a * c
   * Diagonal entries are applied with FMUL, so the packed triangular
   * data presumably stores them already inverted - confirm against the
   * packing routine. */
  2242. .align 4
  2243. LL(68):
  /* Fold the odd-step partial sums (f2/f3) into f0/f1. */
  2244. FADD f0, f2, f0
  2245. FADD f1, f3, f1
  2246. #if defined(LN) || defined(RT)
  /* Point AO/BO at the block being solved: index KK - 1 (LN) or KK - 2
   * (RT), scaled by 1 value per index for A and 2 per index for B. */
  2247. #ifdef LN
  2248. subi r0, KK, 1
  2249. #else
  2250. subi r0, KK, 2
  2251. #endif
  2252. slwi TEMP, r0, 0 + BASE_SHIFT
  2253. slwi r0, r0, 1 + BASE_SHIFT
  2254. add AO, AORIG, TEMP
  2255. add BO, B, r0
  2256. #endif
  2257. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BO. */
  2258. LFD f16, 0 * SIZE(BO)
  2259. LFD f17, 1 * SIZE(BO)
  2260. FSUB f0, f16, f0
  2261. FSUB f1, f17, f1
  2262. #else
  /* RN/RT: right-hand side comes from AO. */
  2263. LFD f16, 0 * SIZE(AO)
  2264. LFD f20, 1 * SIZE(AO)
  2265. FSUB f0, f16, f0
  2266. FSUB f1, f20, f1
  2267. #endif
  2268. #ifdef LN
  /* Single-entry triangular block: scale both columns by A[0]. */
  2269. LFD f21, 0 * SIZE(AO)
  2270. FMUL f0, f21, f0
  2271. FMUL f1, f21, f1
  2272. #endif
  2273. #ifdef LT
  /* Single-entry triangular block: scale both columns by A[0]. */
  2274. LFD f16, 0 * SIZE(AO)
  2275. FMUL f0, f16, f0
  2276. FMUL f1, f16, f1
  2277. #endif
  2278. #ifdef RN
  /* Two-column forward substitution using B entries 0, 1 and 3. */
  2279. LFD f16, 0 * SIZE(BO)
  2280. LFD f17, 1 * SIZE(BO)
  2281. LFD f18, 3 * SIZE(BO)
  2282. FMUL f0, f16, f0
  2283. FNMSUB f1, f17, f0, f1
  2284. FMUL f1, f18, f1
  2285. #endif
  2286. #ifdef RT
  /* Two-column backward substitution using B entries 3, 2 and 0. */
  2287. LFD f19, 3 * SIZE(BO)
  2288. LFD f20, 2 * SIZE(BO)
  2289. LFD f21, 0 * SIZE(BO)
  2290. FMUL f1, f19, f1
  2291. FNMSUB f0, f20, f1, f0
  2292. FMUL f0, f21, f0
  2293. #endif
  2294. #ifdef LN
  /* LN: step the C pointers back one element before storing. */
  2295. subi CO1, CO1, 1 * SIZE
  2296. subi CO2, CO2, 1 * SIZE
  2297. #endif
  /* Write the solution back over the right-hand side ... */
  2298. #if defined(LN) || defined(LT)
  2299. STFD f0, 0 * SIZE(BO)
  2300. STFD f1, 1 * SIZE(BO)
  2301. #else
  2302. STFD f0, 0 * SIZE(AO)
  2303. STFD f1, 1 * SIZE(AO)
  2304. #endif
  /* ... and into the two C columns. */
  2305. STFD f0, 0 * SIZE(CO1)
  2306. STFD f1, 0 * SIZE(CO2)
  /* Reload f0 from FZERO and clear f1, f4, f5.  f2/f3 (used by LL(62))
   * are not cleared here; the LL(70) setup that follows LL(69) copies
   * f0 into f1-f3 before the next panel uses them. */
  2307. lfs f0, FZERO
  2308. fmr f1, f0
  2309. fmr f4, f0
  2310. fmr f5, f0
  2311. #ifndef LN
  2312. addi CO1, CO1, 1 * SIZE
  2313. addi CO2, CO2, 1 * SIZE
  2314. #endif
  2315. #ifdef RT
  /* RT: advance AORIG by K << BASE_SHIFT bytes. */
  2316. slwi r0, K, 0 + BASE_SHIFT
  2317. add AORIG, AORIG, r0
  2318. #endif
  2319. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps that were not multiplied
   * (1 A value and 2 B values per step). */
  2320. sub TEMP, K, KK
  2321. slwi r0, TEMP, 0 + BASE_SHIFT
  2322. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2323. add AO, AO, r0
  2324. add BO, BO, TEMP
  2325. #endif
  /* KK moves by the one row just processed. */
  2326. #ifdef LN
  2327. subi KK, KK, 1
  2328. #endif
  2329. #ifdef LT
  2330. addi KK, KK, 1
  2331. #endif
  /* LL(69): end of the two-column panel.  Updates B and KK, reloads f0
   * from FZERO and falls through to LL(70). */
  2332. .align 4
  2333. LL(69):
  2334. #ifdef LN
  /* LN: advance B by K << (1 + BASE_SHIFT) bytes. */
  2335. slwi r0, K, 1 + BASE_SHIFT
  2336. add B, B, r0
  2337. #endif
  2338. #if defined(LT) || defined(RN)
  /* LT/RN: BO has already been walked to the end of this panel. */
  2339. mr B, BO
  2340. #endif
  2341. #ifdef RN
  2342. addi KK, KK, 2
  2343. #endif
  2344. #ifdef RT
  2345. subi KK, KK, 2
  2346. #endif
  2347. lfs f0, FZERO
  /* LL(70): single-column panel (CO1 only), entered when bit 0 of N is
   * set; otherwise control goes to LL(999).  Sets up CO1, KK, cleared
   * accumulators f1-f3 and the four-row tile count I. */
  2348. .align 4
  2349. LL(70):
  2350. andi. J, N, 1
  2351. ble LL(999)
  2352. #ifdef RT
  /* RT: step B back by K << BASE_SHIFT bytes and C back by LDC. */
  2353. slwi r0, K, 0 + BASE_SHIFT
  2354. sub B, B, r0
  2355. sub C, C, LDC
  2356. #endif
  2357. mr CO1, C
  2358. #ifdef LN
  2359. add KK, M, OFFSET
  2360. #endif
  2361. #ifdef LT
  2362. mr KK, OFFSET
  2363. #endif
  /* Copy f0 into f1-f3 (f0 is loaded from FZERO on the paths visible
   * in this chunk). */
  2364. fmr f1, f0
  2365. fmr f2, f0
  2366. fmr f3, f0
  /* I = M >> 2; cr0 from this srawi. feeds the ble LL(80) below
   * (the mr/add in between leave cr0 alone). */
  2367. srawi. I, M, 2
  2368. #if defined(LN) || defined(RT)
  2369. mr AORIG, A
  2370. #else
  2371. mr AO, A
  2372. #endif
  2373. #ifndef RT
  /* Non-RT: C moves forward past the single column of this panel. */
  2374. add C, CO1, LDC
  2375. #endif
  2376. ble LL(80)
  /*
   * LL(71)..LL(78): loop body executed I = M >> 2 times.
   * Each trip accumulates products of four AO values with one BO value
   * per step into f0-f3, then corrects, solves and stores four results
   * through CO1.
   * NOTE(review): the descriptions of FMADD / FNMSUB / FSUB / FMUL below
   * assume these macros expand to the PowerPC instructions of the same
   * name (or their single-precision forms) -- confirm in the headers.
   */
  2377. .align 4
  2378. LL(71):
  2379. #if defined(LT) || defined(RN)
  /* LT/RN: preload four AO and four B values, CTR = KK >> 2, BO = B. */
  2380. LFD f16, 0 * SIZE(AO)
  2381. LFD f17, 1 * SIZE(AO)
  2382. LFD f18, 2 * SIZE(AO)
  2383. LFD f19, 3 * SIZE(AO)
  2384. LFD f20, 0 * SIZE(B)
  2385. LFD f21, 1 * SIZE(B)
  2386. LFD f22, 2 * SIZE(B)
  2387. LFD f23, 3 * SIZE(B)
  /* Cache-touch hint for the output at CO1 + PREC. */
  2388. dcbt CO1, PREC
  2389. srawi. r0, KK, 2
  2390. mtspr CTR, r0
  2391. mr BO, B
  2392. #else
  /* LN only: step AORIG back by K << (2 + BASE_SHIFT) first. */
  2393. #ifdef LN
  2394. slwi r0, K, 2 + BASE_SHIFT
  2395. sub AORIG, AORIG, r0
  2396. #endif
  /*
   * LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),
   *        BO = B     + (KK << BASE_SHIFT),
   *        TEMP = K - KK is the number of accumulation steps.
   */
  2397. slwi r0, KK, 2 + BASE_SHIFT
  2398. slwi TEMP, KK, 0 + BASE_SHIFT
  2399. add AO, AORIG, r0
  2400. add BO, B, TEMP
  2401. sub TEMP, K, KK
  2402. LFD f16, 0 * SIZE(AO)
  2403. LFD f17, 1 * SIZE(AO)
  2404. LFD f18, 2 * SIZE(AO)
  2405. LFD f19, 3 * SIZE(AO)
  2406. LFD f20, 0 * SIZE(BO)
  2407. LFD f21, 1 * SIZE(BO)
  2408. LFD f22, 2 * SIZE(BO)
  2409. LFD f23, 3 * SIZE(BO)
  2410. dcbt CO1, PREC
  2411. srawi. r0, TEMP, 2
  2412. mtspr CTR, r0
  2413. #endif
  /* CR0 comes from the srawi. above: no full group of four steps. */
  2414. ble LL(75)
  /*
   * LL(72): main loop, four steps per trip. Step j uses f16-f19 (four
   * AO values) and one of f20..f23 (one BO value); AO moves on by 16
   * elements and BO by 4. Values for the next trip are loaded at the
   * bottom of the loop before the pointers are bumped.
   */
  2415. .align 5
  2416. LL(72):
  2417. FMADD f0, f16, f20, f0
  2418. FMADD f1, f17, f20, f1
  2419. FMADD f2, f18, f20, f2
  2420. FMADD f3, f19, f20, f3
  2421. LFD f16, 4 * SIZE(AO)
  2422. LFD f17, 5 * SIZE(AO)
  2423. LFD f18, 6 * SIZE(AO)
  2424. LFD f19, 7 * SIZE(AO)
  2425. FMADD f0, f16, f21, f0
  2426. FMADD f1, f17, f21, f1
  2427. FMADD f2, f18, f21, f2
  2428. FMADD f3, f19, f21, f3
  2429. LFD f16, 8 * SIZE(AO)
  2430. LFD f17, 9 * SIZE(AO)
  2431. LFD f18, 10 * SIZE(AO)
  2432. LFD f19, 11 * SIZE(AO)
  2433. FMADD f0, f16, f22, f0
  2434. FMADD f1, f17, f22, f1
  2435. FMADD f2, f18, f22, f2
  2436. FMADD f3, f19, f22, f3
  2437. LFD f16, 12 * SIZE(AO)
  2438. LFD f17, 13 * SIZE(AO)
  2439. LFD f18, 14 * SIZE(AO)
  2440. LFD f19, 15 * SIZE(AO)
  2441. FMADD f0, f16, f23, f0
  2442. FMADD f1, f17, f23, f1
  2443. FMADD f2, f18, f23, f2
  2444. FMADD f3, f19, f23, f3
  2445. LFD f16, 16 * SIZE(AO)
  2446. LFD f17, 17 * SIZE(AO)
  2447. LFD f18, 18 * SIZE(AO)
  2448. LFD f19, 19 * SIZE(AO)
  2449. LFD f20, 4 * SIZE(BO)
  2450. LFD f21, 5 * SIZE(BO)
  2451. LFD f22, 6 * SIZE(BO)
  2452. LFD f23, 7 * SIZE(BO)
  2453. addi AO, AO, 16 * SIZE
  2454. addi BO, BO, 4 * SIZE
  2455. dcbtst AO, PREA
  2456. bdnz LL(72)
  /* LL(75): remaining steps = (KK or TEMP) & 3; none left -> LL(78). */
  2457. .align 4
  2458. LL(75):
  2459. #if defined(LT) || defined(RN)
  2460. andi. r0, KK, 3
  2461. #else
  2462. andi. r0, TEMP, 3
  2463. #endif
  2464. mtspr CTR, r0
  2465. ble+ LL(78)
  /* LL(76): tail loop, one step per trip (AO += 4, BO += 1 elements). */
  2466. .align 4
  2467. LL(76):
  2468. FMADD f0, f16, f20, f0
  2469. FMADD f1, f17, f20, f1
  2470. FMADD f2, f18, f20, f2
  2471. FMADD f3, f19, f20, f3
  2472. LFD f16, 4 * SIZE(AO)
  2473. LFD f17, 5 * SIZE(AO)
  2474. LFD f18, 6 * SIZE(AO)
  2475. LFD f19, 7 * SIZE(AO)
  2476. LFD f20, 1 * SIZE(BO)
  2477. addi BO, BO, 1 * SIZE
  2478. addi AO, AO, 4 * SIZE
  2479. bdnz LL(76)
  /*
   * LL(78): solve step.
   * LN/RT first re-point AO/BO: r0 = KK - 4 (LN) or KK - 1 (RT),
   * AO = AORIG + (r0 << (2 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT).
   */
  2480. .align 4
  2481. LL(78):
  2482. #if defined(LN) || defined(RT)
  2483. #ifdef LN
  2484. subi r0, KK, 4
  2485. #else
  2486. subi r0, KK, 1
  2487. #endif
  2488. slwi TEMP, r0, 2 + BASE_SHIFT
  2489. slwi r0, r0, 0 + BASE_SHIFT
  2490. add AO, AORIG, TEMP
  2491. add BO, B, r0
  2492. #endif
  /*
   * f0-f3 become (stored value - accumulated sum). The stored values
   * are read from BO for LN/LT and from AO for RN/RT.
   */
  2493. #if defined(LN) || defined(LT)
  2494. LFD f16, 0 * SIZE(BO)
  2495. LFD f20, 1 * SIZE(BO)
  2496. LFD f24, 2 * SIZE(BO)
  2497. LFD f28, 3 * SIZE(BO)
  2498. FSUB f0, f16, f0
  2499. FSUB f1, f20, f1
  2500. FSUB f2, f24, f2
  2501. FSUB f3, f28, f3
  2502. #else
  2503. LFD f16, 0 * SIZE(AO)
  2504. LFD f17, 1 * SIZE(AO)
  2505. LFD f18, 2 * SIZE(AO)
  2506. LFD f19, 3 * SIZE(AO)
  2507. FSUB f0, f16, f0
  2508. FSUB f1, f17, f1
  2509. FSUB f2, f18, f2
  2510. FSUB f3, f19, f3
  2511. #endif
  /*
   * LN: substitution running from f3 down to f0, reading the 16-element
   * block at AO from offset 15 back to offset 0.
   * The diagonal offsets (15, 10, 5, 0) are applied with FMUL, so
   * presumably they are stored already inverted -- TODO confirm against
   * the packing routine.
   */
  2512. #ifdef LN
  2513. LFD f16, 15 * SIZE(AO)
  2514. LFD f17, 14 * SIZE(AO)
  2515. LFD f18, 13 * SIZE(AO)
  2516. LFD f19, 12 * SIZE(AO)
  2517. FMUL f3, f16, f3
  2518. FNMSUB f2, f17, f3, f2
  2519. FNMSUB f1, f18, f3, f1
  2520. FNMSUB f0, f19, f3, f0
  2521. LFD f16, 10 * SIZE(AO)
  2522. LFD f17, 9 * SIZE(AO)
  2523. LFD f18, 8 * SIZE(AO)
  2524. LFD f19, 5 * SIZE(AO)
  2525. LFD f20, 4 * SIZE(AO)
  2526. LFD f21, 0 * SIZE(AO)
  2527. FMUL f2, f16, f2
  2528. FNMSUB f1, f17, f2, f1
  2529. FNMSUB f0, f18, f2, f0
  2530. FMUL f1, f19, f1
  2531. FNMSUB f0, f20, f1, f0
  2532. FMUL f0, f21, f0
  2533. #endif
  /*
   * LT: substitution running from f0 up to f3, reading offsets
   * 0-3, 5-7, 10-11 and 15 of the block at AO.
   */
  2534. #ifdef LT
  2535. LFD f16, 0 * SIZE(AO)
  2536. LFD f17, 1 * SIZE(AO)
  2537. LFD f18, 2 * SIZE(AO)
  2538. LFD f19, 3 * SIZE(AO)
  2539. FMUL f0, f16, f0
  2540. FNMSUB f1, f17, f0, f1
  2541. FNMSUB f2, f18, f0, f2
  2542. FNMSUB f3, f19, f0, f3
  2543. LFD f17, 5 * SIZE(AO)
  2544. LFD f18, 6 * SIZE(AO)
  2545. LFD f19, 7 * SIZE(AO)
  2546. FMUL f1, f17, f1
  2547. FNMSUB f2, f18, f1, f2
  2548. FNMSUB f3, f19, f1, f3
  2549. LFD f18, 10 * SIZE(AO)
  2550. LFD f19, 11 * SIZE(AO)
  2551. FMUL f2, f18, f2
  2552. FNMSUB f3, f19, f2, f3
  2553. LFD f19, 15 * SIZE(AO)
  2554. FMUL f3, f19, f3
  2555. #endif
  /* RN / RT: all four values are scaled by the single value at BO[0]. */
  2556. #ifdef RN
  2557. LFD f16, 0 * SIZE(BO)
  2558. FMUL f0, f16, f0
  2559. FMUL f1, f16, f1
  2560. FMUL f2, f16, f2
  2561. FMUL f3, f16, f3
  2562. #endif
  2563. #ifdef RT
  2564. LFD f21, 0 * SIZE(BO)
  2565. FMUL f0, f21, f0
  2566. FMUL f1, f21, f1
  2567. FMUL f2, f21, f2
  2568. FMUL f3, f21, f3
  2569. #endif
  /* LN moves CO1 back by four elements before storing. */
  2570. #ifdef LN
  2571. subi CO1, CO1, 4 * SIZE
  2572. #endif
  /* Write the results back to BO (LN/LT) or AO (RN/RT) ... */
  2573. #if defined(LN) || defined(LT)
  2574. STFD f0, 0 * SIZE(BO)
  2575. STFD f1, 1 * SIZE(BO)
  2576. STFD f2, 2 * SIZE(BO)
  2577. STFD f3, 3 * SIZE(BO)
  2578. #else
  2579. STFD f0, 0 * SIZE(AO)
  2580. STFD f1, 1 * SIZE(AO)
  2581. STFD f2, 2 * SIZE(AO)
  2582. STFD f3, 3 * SIZE(AO)
  2583. #endif
  /* ... and to the output at CO1[0..3]. */
  2584. STFD f0, 0 * SIZE(CO1)
  2585. STFD f1, 1 * SIZE(CO1)
  2586. STFD f2, 2 * SIZE(CO1)
  2587. STFD f3, 3 * SIZE(CO1)
  /* Reset the accumulators f0-f3 from FZERO for the next trip. */
  2588. lfs f0, FZERO
  2589. fmr f1, f0
  2590. fmr f2, f0
  2591. fmr f3, f0
  /* All variants except LN advance CO1 by four elements. */
  2592. #ifndef LN
  2593. addi CO1, CO1, 4 * SIZE
  2594. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  2595. #ifdef RT
  2596. slwi r0, K, 2 + BASE_SHIFT
  2597. add AORIG, AORIG, r0
  2598. #endif
  /*
   * LT/RN: with TEMP = K - KK, advance AO by TEMP << (2 + BASE_SHIFT)
   * and BO by TEMP << BASE_SHIFT.
   */
  2599. #if defined(LT) || defined(RN)
  2600. sub TEMP, K, KK
  2601. slwi r0, TEMP, 2 + BASE_SHIFT
  2602. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2603. add AO, AO, r0
  2604. add BO, BO, TEMP
  2605. #endif
  /* KK moves by four: down for LN, up for LT. */
  2606. #ifdef LN
  2607. subi KK, KK, 4
  2608. #endif
  2609. #ifdef LT
  2610. addi KK, KK, 4
  2611. #endif
  /* Decrement I and repeat while it is still positive. */
  2612. addic. I, I, -1
  2613. bgt+ LL(71)
  /*
   * LL(80)..LL(88): section executed when (M & 2) is non-zero.
   * Two AO values are combined with one BO value per step; the section
   * produces two results stored through CO1.
   * NOTE(review): FMADD / FNMSUB / FSUB / FMUL / FADD are assumed to
   * expand to the PowerPC instructions of the same name -- confirm.
   */
  2614. .align 4
  2615. LL(80):
  2616. andi. I, M, 2
  2617. ble LL(90)
  2618. #if defined(LT) || defined(RN)
  /* LT/RN: preload, CTR = KK >> 2, BO = B. */
  2619. LFD f16, 0 * SIZE(AO)
  2620. LFD f17, 1 * SIZE(AO)
  2621. LFD f18, 2 * SIZE(AO)
  2622. LFD f19, 3 * SIZE(AO)
  2623. LFD f20, 0 * SIZE(B)
  2624. LFD f21, 1 * SIZE(B)
  2625. LFD f22, 2 * SIZE(B)
  2626. LFD f23, 3 * SIZE(B)
  2627. srawi. r0, KK, 2
  2628. mtspr CTR, r0
  2629. mr BO, B
  2630. #else
  /* LN only: step AORIG back by K << (1 + BASE_SHIFT) first. */
  2631. #ifdef LN
  2632. slwi r0, K, 1 + BASE_SHIFT
  2633. sub AORIG, AORIG, r0
  2634. #endif
  /*
   * LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *        BO = B     + (KK << BASE_SHIFT),
   *        TEMP = K - KK, CTR = TEMP >> 2.
   */
  2635. slwi r0, KK, 1 + BASE_SHIFT
  2636. slwi TEMP, KK, 0 + BASE_SHIFT
  2637. add AO, AORIG, r0
  2638. add BO, B, TEMP
  2639. sub TEMP, K, KK
  2640. LFD f16, 0 * SIZE(AO)
  2641. LFD f17, 1 * SIZE(AO)
  2642. LFD f18, 2 * SIZE(AO)
  2643. LFD f19, 3 * SIZE(AO)
  2644. LFD f20, 0 * SIZE(BO)
  2645. LFD f21, 1 * SIZE(BO)
  2646. LFD f22, 2 * SIZE(BO)
  2647. LFD f23, 3 * SIZE(BO)
  2648. srawi. r0, TEMP, 2
  2649. mtspr CTR, r0
  2650. #endif
  2651. ble LL(85)
  /*
   * LL(82): main loop, four steps per trip (AO += 8, BO += 4 elements).
   * Steps using f20/f22 accumulate into f0/f1, steps using f21/f23 into
   * f2/f3; the two pairs are summed at LL(88).
   */
  2652. .align 5
  2653. LL(82):
  2654. FMADD f0, f16, f20, f0
  2655. FMADD f1, f17, f20, f1
  2656. FMADD f2, f18, f21, f2
  2657. FMADD f3, f19, f21, f3
  2658. LFD f16, 4 * SIZE(AO)
  2659. LFD f17, 5 * SIZE(AO)
  2660. LFD f18, 6 * SIZE(AO)
  2661. LFD f19, 7 * SIZE(AO)
  2662. FMADD f0, f16, f22, f0
  2663. FMADD f1, f17, f22, f1
  2664. FMADD f2, f18, f23, f2
  2665. FMADD f3, f19, f23, f3
  2666. LFD f16, 8 * SIZE(AO)
  2667. LFD f17, 9 * SIZE(AO)
  2668. LFD f18, 10 * SIZE(AO)
  2669. LFD f19, 11 * SIZE(AO)
  2670. LFD f20, 4 * SIZE(BO)
  2671. LFD f21, 5 * SIZE(BO)
  2672. LFD f22, 6 * SIZE(BO)
  2673. LFD f23, 7 * SIZE(BO)
  2674. addi AO, AO, 8 * SIZE
  2675. addi BO, BO, 4 * SIZE
  2676. dcbtst AO, PREA
  2677. bdnz LL(82)
  /* LL(85): remaining steps = (KK or TEMP) & 3; none left -> LL(88). */
  2678. .align 4
  2679. LL(85):
  2680. #if defined(LT) || defined(RN)
  2681. andi. r0, KK, 3
  2682. #else
  2683. andi. r0, TEMP, 3
  2684. #endif
  2685. mtspr CTR, r0
  2686. ble+ LL(88)
  /* LL(86): tail loop, one step per trip (AO += 2, BO += 1 elements). */
  2687. .align 4
  2688. LL(86):
  2689. FMADD f0, f16, f20, f0
  2690. FMADD f1, f17, f20, f1
  2691. LFD f16, 2 * SIZE(AO)
  2692. LFD f17, 3 * SIZE(AO)
  2693. LFD f20, 1 * SIZE(BO)
  2694. addi BO, BO, 1 * SIZE
  2695. addi AO, AO, 2 * SIZE
  2696. bdnz LL(86)
  /* LL(88): fold the second accumulator pair into f0/f1, then solve. */
  2697. .align 4
  2698. LL(88):
  2699. FADD f0, f2, f0
  2700. FADD f1, f3, f1
  /*
   * LN/RT re-point AO/BO: r0 = KK - 2 (LN) or KK - 1 (RT),
   * AO = AORIG + (r0 << (1 + BASE_SHIFT)), BO = B + (r0 << BASE_SHIFT).
   */
  2701. #if defined(LN) || defined(RT)
  2702. #ifdef LN
  2703. subi r0, KK, 2
  2704. #else
  2705. subi r0, KK, 1
  2706. #endif
  2707. slwi TEMP, r0, 1 + BASE_SHIFT
  2708. slwi r0, r0, 0 + BASE_SHIFT
  2709. add AO, AORIG, TEMP
  2710. add BO, B, r0
  2711. #endif
  /* f0/f1 = stored value (from BO for LN/LT, AO for RN/RT) - sum. */
  2712. #if defined(LN) || defined(LT)
  2713. LFD f16, 0 * SIZE(BO)
  2714. LFD f20, 1 * SIZE(BO)
  2715. FSUB f0, f16, f0
  2716. FSUB f1, f20, f1
  2717. #else
  2718. LFD f16, 0 * SIZE(AO)
  2719. LFD f17, 1 * SIZE(AO)
  2720. FSUB f0, f16, f0
  2721. FSUB f1, f17, f1
  2722. #endif
  /*
   * LN: solve f1 first (offset 3), eliminate with offset 2, then f0
   * (offset 0). Diagonal entries are applied with FMUL, so presumably
   * they are stored already inverted -- TODO confirm.
   */
  2723. #ifdef LN
  2724. LFD f19, 3 * SIZE(AO)
  2725. LFD f20, 2 * SIZE(AO)
  2726. LFD f21, 0 * SIZE(AO)
  2727. FMUL f1, f19, f1
  2728. FNMSUB f0, f20, f1, f0
  2729. FMUL f0, f21, f0
  2730. #endif
  /* LT: solve f0 first (offset 0), eliminate with offset 1, then f1. */
  2731. #ifdef LT
  2732. LFD f16, 0 * SIZE(AO)
  2733. LFD f17, 1 * SIZE(AO)
  2734. FMUL f0, f16, f0
  2735. FNMSUB f1, f17, f0, f1
  2736. LFD f17, 3 * SIZE(AO)
  2737. FMUL f1, f17, f1
  2738. #endif
  /* RN / RT: both values are scaled by the single value at BO[0]. */
  2739. #ifdef RN
  2740. LFD f16, 0 * SIZE(BO)
  2741. FMUL f0, f16, f0
  2742. FMUL f1, f16, f1
  2743. #endif
  2744. #ifdef RT
  2745. LFD f21, 0 * SIZE(BO)
  2746. FMUL f0, f21, f0
  2747. FMUL f1, f21, f1
  2748. #endif
  /* LN moves CO1 back by two elements before storing. */
  2749. #ifdef LN
  2750. subi CO1, CO1, 2 * SIZE
  2751. #endif
  /* Write back to BO (LN/LT) or AO (RN/RT), and to CO1[0..1]. */
  2752. #if defined(LN) || defined(LT)
  2753. STFD f0, 0 * SIZE(BO)
  2754. STFD f1, 1 * SIZE(BO)
  2755. #else
  2756. STFD f0, 0 * SIZE(AO)
  2757. STFD f1, 1 * SIZE(AO)
  2758. #endif
  2759. STFD f0, 0 * SIZE(CO1)
  2760. STFD f1, 1 * SIZE(CO1)
  /* Reset the accumulators f0-f3 from FZERO. */
  2761. lfs f0, FZERO
  2762. fmr f1, f0
  2763. fmr f2, f0
  2764. fmr f3, f0
  2765. #ifndef LN
  2766. addi CO1, CO1, 2 * SIZE
  2767. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  2768. #ifdef RT
  2769. slwi r0, K, 1 + BASE_SHIFT
  2770. add AORIG, AORIG, r0
  2771. #endif
  /*
   * LT/RN: with TEMP = K - KK, advance AO by TEMP << (1 + BASE_SHIFT)
   * and BO by TEMP << BASE_SHIFT.
   */
  2772. #if defined(LT) || defined(RN)
  2773. sub TEMP, K, KK
  2774. slwi r0, TEMP, 1 + BASE_SHIFT
  2775. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2776. add AO, AO, r0
  2777. add BO, BO, TEMP
  2778. #endif
  /* KK moves by two: down for LN, up for LT. */
  2779. #ifdef LN
  2780. subi KK, KK, 2
  2781. #endif
  2782. #ifdef LT
  2783. addi KK, KK, 2
  2784. #endif
  /*
   * LL(90)..LL(98): section executed when (M & 1) is non-zero; otherwise
   * control goes straight to the exit at LL(999).
   * One AO value is combined with one BO value per step and a single
   * result is stored through CO1.
   * NOTE(review): FMADD / FSUB / FMUL / FADD are assumed to expand to
   * the PowerPC instructions of the same name -- confirm.
   */
  2785. .align 4
  2786. LL(90):
  2787. andi. I, M, 1
  2788. ble LL(999)
  2789. #if defined(LT) || defined(RN)
  /* LT/RN: preload, CTR = KK >> 3, BO = B. */
  2790. LFD f16, 0 * SIZE(AO)
  2791. LFD f17, 1 * SIZE(AO)
  2792. LFD f18, 2 * SIZE(AO)
  2793. LFD f19, 3 * SIZE(AO)
  2794. LFD f20, 0 * SIZE(B)
  2795. LFD f21, 1 * SIZE(B)
  2796. LFD f22, 2 * SIZE(B)
  2797. LFD f23, 3 * SIZE(B)
  2798. srawi. r0, KK, 3
  2799. mtspr CTR, r0
  2800. mr BO, B
  2801. #else
  /* LN only: step AORIG back by K << BASE_SHIFT first. */
  2802. #ifdef LN
  2803. slwi r0, K, BASE_SHIFT
  2804. sub AORIG, AORIG, r0
  2805. #endif
  /*
   * LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT),
   *        TEMP = K - KK, CTR = TEMP >> 3.
   */
  2806. slwi r0, KK, 0 + BASE_SHIFT
  2807. slwi TEMP, KK, 0 + BASE_SHIFT
  2808. add AO, AORIG, r0
  2809. add BO, B, TEMP
  2810. sub TEMP, K, KK
  2811. LFD f16, 0 * SIZE(AO)
  2812. LFD f17, 1 * SIZE(AO)
  2813. LFD f18, 2 * SIZE(AO)
  2814. LFD f19, 3 * SIZE(AO)
  2815. LFD f20, 0 * SIZE(BO)
  2816. LFD f21, 1 * SIZE(BO)
  2817. LFD f22, 2 * SIZE(BO)
  2818. LFD f23, 3 * SIZE(BO)
  2819. srawi. r0, TEMP, 3
  2820. mtspr CTR, r0
  2821. #endif
  2822. ble LL(95)
  /*
   * LL(92): main loop, eight steps per trip (AO += 8, BO += 8 elements).
   * The products are spread over four partial sums f0-f3, which are
   * added together at LL(98).
   */
  2823. .align 5
  2824. LL(92):
  2825. FMADD f0, f16, f20, f0
  2826. FMADD f1, f17, f21, f1
  2827. FMADD f2, f18, f22, f2
  2828. FMADD f3, f19, f23, f3
  2829. LFD f16, 4 * SIZE(AO)
  2830. LFD f17, 5 * SIZE(AO)
  2831. LFD f18, 6 * SIZE(AO)
  2832. LFD f19, 7 * SIZE(AO)
  2833. LFD f20, 4 * SIZE(BO)
  2834. LFD f21, 5 * SIZE(BO)
  2835. LFD f22, 6 * SIZE(BO)
  2836. LFD f23, 7 * SIZE(BO)
  2837. FMADD f0, f16, f20, f0
  2838. FMADD f1, f17, f21, f1
  2839. FMADD f2, f18, f22, f2
  2840. FMADD f3, f19, f23, f3
  2841. LFD f16, 8 * SIZE(AO)
  2842. LFD f17, 9 * SIZE(AO)
  2843. LFD f18, 10 * SIZE(AO)
  2844. LFD f19, 11 * SIZE(AO)
  2845. LFD f20, 8 * SIZE(BO)
  2846. LFD f21, 9 * SIZE(BO)
  2847. LFD f22, 10 * SIZE(BO)
  2848. LFD f23, 11 * SIZE(BO)
  2849. addi AO, AO, 8 * SIZE
  2850. addi BO, BO, 8 * SIZE
  2851. bdnz LL(92)
  /* LL(95): remaining steps = (KK or TEMP) & 7; none left -> LL(98). */
  2852. .align 4
  2853. LL(95):
  2854. #if defined(LT) || defined(RN)
  2855. andi. r0, KK, 7
  2856. #else
  2857. andi. r0, TEMP, 7
  2858. #endif
  2859. mtspr CTR, r0
  2860. ble+ LL(98)
  /* LL(96): tail loop, one step per trip (AO += 1, BO += 1 elements). */
  2861. .align 4
  2862. LL(96):
  2863. FMADD f0, f16, f20, f0
  2864. LFD f16, 1 * SIZE(AO)
  2865. LFD f20, 1 * SIZE(BO)
  2866. addi BO, BO, 1 * SIZE
  2867. addi AO, AO, 1 * SIZE
  2868. bdnz LL(96)
  /* LL(98): reduce the four partial sums into f0, then solve. */
  2869. .align 4
  2870. LL(98):
  2871. FADD f0, f1, f0
  2872. FADD f2, f3, f2
  2873. FADD f0, f2, f0
  /*
   * LN/RT re-point AO/BO using r0 = KK - 1. Both arms of the inner
   * #ifdef compute the same value.
   */
  2874. #if defined(LN) || defined(RT)
  2875. #ifdef LN
  2876. subi r0, KK, 1
  2877. #else
  2878. subi r0, KK, 1
  2879. #endif
  2880. slwi TEMP, r0, 0 + BASE_SHIFT
  2881. slwi r0, r0, 0 + BASE_SHIFT
  2882. add AO, AORIG, TEMP
  2883. add BO, B, r0
  2884. #endif
  /* f0 = stored value (from BO for LN/LT, AO for RN/RT) - sum. */
  2885. #if defined(LN) || defined(LT)
  2886. LFD f16, 0 * SIZE(BO)
  2887. FSUB f0, f16, f0
  2888. #else
  2889. LFD f16, 0 * SIZE(AO)
  2890. FSUB f0, f16, f0
  2891. #endif
  /*
   * Scale f0 by one coefficient: AO[0] for LN/LT, BO[0] for RN/RT.
   * It is applied with FMUL, so presumably it is stored already
   * inverted -- TODO confirm.
   */
  2892. #ifdef LN
  2893. LFD f21, 0 * SIZE(AO)
  2894. FMUL f0, f21, f0
  2895. #endif
  2896. #ifdef LT
  2897. LFD f16, 0 * SIZE(AO)
  2898. FMUL f0, f16, f0
  2899. #endif
  2900. #ifdef RN
  2901. LFD f16, 0 * SIZE(BO)
  2902. FMUL f0, f16, f0
  2903. #endif
  2904. #ifdef RT
  2905. LFD f21, 0 * SIZE(BO)
  2906. FMUL f0, f21, f0
  2907. #endif
  /* LN moves CO1 back by one element before storing. */
  2908. #ifdef LN
  2909. subi CO1, CO1, 1 * SIZE
  2910. #endif
  /* Write back to BO (LN/LT) or AO (RN/RT), and to CO1[0]. */
  2911. #if defined(LN) || defined(LT)
  2912. STFD f0, 0 * SIZE(BO)
  2913. #else
  2914. STFD f0, 0 * SIZE(AO)
  2915. #endif
  2916. STFD f0, 0 * SIZE(CO1)
  2917. #ifndef LN
  2918. addi CO1, CO1, 1 * SIZE
  2919. #endif
  /* RT: AORIG += K << BASE_SHIFT. */
  2920. #ifdef RT
  2921. slwi r0, K, 0 + BASE_SHIFT
  2922. add AORIG, AORIG, r0
  2923. #endif
  /* LT/RN: advance AO and BO by (K - KK) << BASE_SHIFT each. */
  2924. #if defined(LT) || defined(RN)
  2925. sub TEMP, K, KK
  2926. slwi r0, TEMP, 0 + BASE_SHIFT
  2927. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2928. add AO, AO, r0
  2929. add BO, BO, TEMP
  2930. #endif
  /* KK moves by one: down for LN, up for LT. */
  2931. #ifdef LN
  2932. subi KK, KK, 1
  2933. #endif
  2934. #ifdef LT
  2935. addi KK, KK, 1
  2936. #endif
  /*
   * LL(999): common exit.
   * Sets r3 to 0 (r3 carries the integer return value under the PowerPC
   * ABIs), reloads f14-f31 and r18-r31 from the frame at SP, releases
   * STACKSIZE bytes of stack and returns.
   */
  2937. .align 4
  2938. LL(999):
  /* With rA = 0, addi uses the literal 0, so this is r3 = 0. */
  2939. addi r3, 0, 0
  /* Floating-point registers: 8-byte slots at SP + 0 .. SP + 136. */
  2940. lfd f14, 0(SP)
  2941. lfd f15, 8(SP)
  2942. lfd f16, 16(SP)
  2943. lfd f17, 24(SP)
  2944. lfd f18, 32(SP)
  2945. lfd f19, 40(SP)
  2946. lfd f20, 48(SP)
  2947. lfd f21, 56(SP)
  2948. lfd f22, 64(SP)
  2949. lfd f23, 72(SP)
  2950. lfd f24, 80(SP)
  2951. lfd f25, 88(SP)
  2952. lfd f26, 96(SP)
  2953. lfd f27, 104(SP)
  2954. lfd f28, 112(SP)
  2955. lfd f29, 120(SP)
  2956. lfd f30, 128(SP)
  2957. lfd f31, 136(SP)
  /*
   * General-purpose registers r31 down to r18 start at SP + 144:
   * 8-byte slots (ld) when __64BIT__ is defined, 4-byte slots (lwz)
   * otherwise.
   */
  2958. #ifdef __64BIT__
  2959. ld r31, 144(SP)
  2960. ld r30, 152(SP)
  2961. ld r29, 160(SP)
  2962. ld r28, 168(SP)
  2963. ld r27, 176(SP)
  2964. ld r26, 184(SP)
  2965. ld r25, 192(SP)
  2966. ld r24, 200(SP)
  2967. ld r23, 208(SP)
  2968. ld r22, 216(SP)
  2969. ld r21, 224(SP)
  2970. ld r20, 232(SP)
  2971. ld r19, 240(SP)
  2972. ld r18, 248(SP)
  2973. #else
  2974. lwz r31, 144(SP)
  2975. lwz r30, 148(SP)
  2976. lwz r29, 152(SP)
  2977. lwz r28, 156(SP)
  2978. lwz r27, 160(SP)
  2979. lwz r26, 164(SP)
  2980. lwz r25, 168(SP)
  2981. lwz r24, 172(SP)
  2982. lwz r23, 176(SP)
  2983. lwz r22, 180(SP)
  2984. lwz r21, 184(SP)
  2985. lwz r20, 188(SP)
  2986. lwz r19, 192(SP)
  2987. lwz r18, 196(SP)
  2988. #endif
  /* Release the frame and return to the caller. */
  2989. addi SP, SP, STACKSIZE
  2990. blr
  /* EPILOGUE is a macro defined outside this section. */
  2991. EPILOGUE
  2992. #endif