You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_hummer_LN.S 98 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined ahead of the common.h include; presumably selects its assembler-side definitions -- confirm in common.h */
  39. #include "common.h" /* PROLOGUE, PROFCODE, SIZE, BASE_SHIFT and the LFPDUX/STFPDUX-style macros are not defined in this file; presumably they come from here */
  40. #define ALPHA 0 /* byte offset from SP: the entry code leaves f1 at 0(SP) (presumably alpha; not referenced in this part of the file) */
  41. #define FZERO 8 /* byte offset from SP of the two zero words pushed at entry; read back with lfpsx to clear f0 */
  42. #define M r3 /* M dimension: must be > 0; M & 1, M & 2, M & 4 and M >> 3 select the 1-, 2-, 4-wide and .L11 passes */
  43. #define N r4 /* N dimension: must be > 0; N >> 2 is the count of 4-column panels */
  44. #define K r5 /* K (depth) dimension: must be > 0; scales the A and B panel strides */
  45. #if defined(linux) || defined(__FreeBSD__) /* the next five aliases exist only on these platforms; no other mapping is visible in this part of the file */
  46. #define A r6 /* base pointer of the A operand */
  47. #define B r7 /* base pointer of the B operand */
  48. #define C r8 /* base pointer of the output matrix */
  49. #define LDC r9 /* distance between C columns; converted from elements to bytes at entry */
  50. #define OFFSET r10 /* offset argument; seeds KK according to the LN/LT/RN/RT variant */
  51. #endif
  52. #define TEMP r11 /* scratch: K - KK and byte offsets derived from it */
  53. #define AORIG r12 /* LN/RT only: base of the current A panel, from which AO is recomputed */
  54. #define KK r14 /* running K offset: trip counts use KK (LT/RN) or K - KK (LN/RT) */
  55. #define INCM1 r15 /* constant -1 * SIZE */
  56. #define INCM4 r16 /* constant -4 * SIZE */
  57. #define INCM2 r17 /* constant -2 * SIZE */
  58. #define INC2 r19 /* constant 2 * SIZE */
  59. #define INC r20 /* constant 1 * SIZE */
  60. #define INC4 r21 /* constant 4 * SIZE */
  61. #define I r22 /* receives M & 1, M & 2, M & 4 and M >> 3 in turn */
  62. #define J r23 /* receives N >> 2 */
  63. #define AO r24 /* A stream pointer; kept 4 * SIZE below the next element because loads index it with INC4 */
  64. #define BO r25 /* B stream pointer; same -4 * SIZE bias as AO */
  65. #define AO2 r26 /* second A stream pointer, set to AO + 2 * SIZE */
  66. #define BO2 r27 /* second B stream pointer, set to BO + 2 * SIZE */
  67. #define CO1 r28 /* output pointer for the first column of the current panel (CO1 = C) */
  68. #define CO2 r29 /* CO1 + LDC */
  69. #define CO3 r30 /* CO2 + LDC */
  70. #define CO4 r31 /* CO3 + LDC */
  71. #ifndef NEEDPARAM
  72. #define A1 f16 /* A1..A10 and B1..B6 name f16..f31 as operand registers for the multiply loops; accumulators are addressed as plain f0..f13 */
  73. #define A2 f17 /* NOTE: the solve code also uses raw f16..f25, f28 and f29 as temporaries, overwriting these aliases between loops */
  74. #define A3 f18
  75. #define A4 f19
  76. #define A5 f20 /* the A/B prefix is not strict: A5..A10 are loaded from the B stream in several loops */
  77. #define A6 f21
  78. #define A7 f22
  79. #define A8 f23
  80. #define A9 f24
  81. #define A10 f25
  82. #define B1 f26
  83. #define B2 f27
  84. #define B3 f28
  85. #define B4 f29
  86. #define B5 f30
  87. #define B6 f31
  88. #define AP B6 /* second name for f31; not referenced in this part of the file */
  89. PROLOGUE /* entry macro, not defined in this file (presumably from common.h) */
  90. PROFCODE /* profiling macro, also defined elsewhere -- presumably empty unless profiling is enabled */
  91. li r0, -16 /* r0 = -16: index operand for the paired-FP saves below */
  92. stfpdux f14, SP, r0 /* save f14..f31, one 16-byte slot each; stfpdux presumably stores both halves at SP + r0 and updates SP -- confirm against the Blue Gene double-FPU ISA */
  93. stfpdux f15, SP, r0
  94. stfpdux f16, SP, r0
  95. stfpdux f17, SP, r0
  96. stfpdux f18, SP, r0
  97. stfpdux f19, SP, r0
  98. stfpdux f20, SP, r0
  99. stfpdux f21, SP, r0
  100. stfpdux f22, SP, r0
  101. stfpdux f23, SP, r0
  102. stfpdux f24, SP, r0
  103. stfpdux f25, SP, r0
  104. stfpdux f26, SP, r0
  105. stfpdux f27, SP, r0
  106. stfpdux f28, SP, r0
  107. stfpdux f29, SP, r0
  108. stfpdux f30, SP, r0
  109. stfpdux f31, SP, r0
  110. stwu r31, -4(SP) /* push r31 down to r14, 4 bytes each; every stwu also moves SP down */
  111. stwu r30, -4(SP)
  112. stwu r29, -4(SP)
  113. stwu r28, -4(SP)
  114. stwu r27, -4(SP)
  115. stwu r26, -4(SP)
  116. stwu r25, -4(SP)
  117. stwu r24, -4(SP)
  118. stwu r23, -4(SP)
  119. stwu r22, -4(SP)
  120. stwu r21, -4(SP)
  121. stwu r20, -4(SP)
  122. stwu r19, -4(SP)
  123. stwu r18, -4(SP) /* r18 has no alias among the defines above but is saved with the rest */
  124. stwu r17, -4(SP)
  125. stwu r16, -4(SP)
  126. stwu r15, -4(SP)
  127. stwu r14, -4(SP) # dummy
  128. li r0, 0 /* r0 = 0 for the zero constant pushed next */
  129. stwu r0, -4(SP) /* push two zero words: 8 bytes of zero bits that lfpsx later reads to clear an accumulator */
  130. stwu r0, -4(SP)
  131. stfdu f1, -8(SP) /* push f1 as a double; SP now points at it, so f1 is at offset ALPHA (0) and the zero words at offset FZERO (8) */
  132. slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT: element count to byte stride (BASE_SHIFT is defined outside this file) */
  133. cmpwi cr0, M, 0 /* leave through .L999 (beyond this excerpt) if any of M, N, K is <= 0 */
  134. ble .L999
  135. cmpwi cr0, N, 0
  136. ble .L999
  137. cmpwi cr0, K, 0
  138. ble .L999
  139. li INC, 1 * SIZE /* preload the +/-1, 2 and 4 element byte strides used as index registers by the loads and stores */
  140. li INC2, 2 * SIZE
  141. li INC4, 4 * SIZE
  142. li INCM1, -1 * SIZE
  143. li INCM2, -2 * SIZE
  144. li INCM4, -4 * SIZE
  145. addi C, C, - 1 * SIZE /* bias C one element low; the tile code stores through CO1..CO4 with an INC index, which cancels the bias */
  146. #ifdef LN
  147. mullw r0, M, K /* LN: advance A by M * K elements */
  148. slwi r0, r0, BASE_SHIFT
  149. add A, A, r0
  150. slwi r0, M, BASE_SHIFT /* LN: advance C by M elements; the LN tile code then steps CO1..CO4 backwards */
  151. add C, C, r0
  152. #endif
  153. #ifdef RN
  154. neg KK, OFFSET /* RN: KK = -OFFSET */
  155. #endif
  156. #ifdef RT
  157. mullw r0, N, K /* RT: advance B by N * K elements */
  158. slwi r0, r0, BASE_SHIFT
  159. add B, B, r0
  160. mullw r0, N, LDC /* LDC is already a byte stride here, so no further shift is needed */
  161. add C, C, r0 /* RT: advance C by N columns */
  162. sub KK, N, OFFSET /* RT: KK = N - OFFSET */
  163. #endif
  164. srawi. J, N, 2 /* J = N >> 2, the number of 4-column panels; the dot form sets cr0 */
  165. ble .L50 /* no full panel: skip to .L50, which lies beyond this excerpt */
  166. .align 4
  167. .L10:
  168. #ifdef RT
  169. slwi r0, K, 2 + BASE_SHIFT
  170. sub B, B, r0
  171. slwi r0, LDC, 2
  172. sub C, C, r0
  173. #endif
  174. mr CO1, C
  175. add CO2, C, LDC
  176. add CO3, CO2, LDC
  177. add CO4, CO3, LDC
  178. #ifdef LN
  179. add KK, M, OFFSET
  180. #endif
  181. #ifdef LT
  182. mr KK, OFFSET
  183. #endif
  184. #if defined(LN) || defined(RT)
  185. addi AORIG, A, -4 * SIZE
  186. #else
  187. addi AO, A, -4 * SIZE
  188. #endif
  189. #ifndef RT
  190. add C, CO4, LDC
  191. #endif
  192. li r0, FZERO
  193. lfpsx f0, SP, r0
  194. andi. I, M, 1
  195. beq .L20
  196. #if defined(LT) || defined(RN)
  197. addi AO2, AO, 2 * SIZE
  198. fpmr f1, f0
  199. addi BO, B, - 4 * SIZE
  200. fpmr f2, f0
  201. addi BO2, B, - 2 * SIZE
  202. fpmr f3, f0
  203. srawi. r0, KK, 3
  204. mtspr CTR, r0
  205. ble .L44
  206. #else
  207. #ifdef LN
  208. slwi r0, K, 0 + BASE_SHIFT
  209. sub AORIG, AORIG, r0
  210. #endif
  211. slwi r0 , KK, 0 + BASE_SHIFT
  212. slwi TEMP, KK, 2 + BASE_SHIFT
  213. add AO, AORIG, r0
  214. add BO, B, TEMP
  215. sub TEMP, K, KK
  216. addi AO2, AO, 2 * SIZE
  217. fpmr f1, f0
  218. addi BO, BO, - 4 * SIZE
  219. fpmr f2, f0
  220. addi BO2, BO, 2 * SIZE
  221. fpmr f3, f0
  222. srawi. r0, TEMP, 3
  223. mtspr CTR, r0
  224. ble .L44
  225. #endif
  226. LFPDUX A1, AO, INC4
  227. LFPDUX B1, BO, INC4
  228. LFPDUX B2, BO2, INC4
  229. LFPDUX A2, AO2, INC4
  230. LFPDUX B3, BO, INC4
  231. LFPDUX B4, BO2, INC4
  232. LFPDUX A3, AO, INC4
  233. LFPDUX A5, BO, INC4
  234. LFPDUX A6, BO2, INC4
  235. LFPDUX A4, AO2, INC4
  236. LFPDUX A7, BO, INC4
  237. LFPDUX A8, BO2, INC4
  238. bdz- .L43
  239. .align 4
  240. .L42:
  241. fxcpmadd f0, A1, B1, f0
  242. LFPDUX B1, BO, INC4
  243. fxcpmadd f1, A1, B2, f1
  244. LFPDUX B2, BO2, INC4
  245. fxcsmadd f2, A1, B3, f2
  246. LFPDUX B3, BO, INC4
  247. fxcsmadd f3, A1, B4, f3
  248. LFPDUX B4, BO2, INC4
  249. LFPDUX A1, AO, INC4
  250. fxcpmadd f0, A2, A5, f0
  251. LFPDUX A5, BO, INC4
  252. fxcpmadd f1, A2, A6, f1
  253. LFPDUX A6, BO2, INC4
  254. fxcsmadd f2, A2, A7, f2
  255. LFPDUX A7, BO, INC4
  256. fxcsmadd f3, A2, A8, f3
  257. LFPDUX A8, BO2, INC4
  258. LFPDUX A2, AO2, INC4
  259. fxcpmadd f0, A3, B1, f0
  260. LFPDUX B1, BO, INC4
  261. fxcpmadd f1, A3, B2, f1
  262. LFPDUX B2, BO2, INC4
  263. fxcsmadd f2, A3, B3, f2
  264. LFPDUX B3, BO, INC4
  265. fxcsmadd f3, A3, B4, f3
  266. LFPDUX B4, BO2, INC4
  267. LFPDUX A3, AO, INC4
  268. fxcpmadd f0, A4, A5, f0
  269. LFPDUX A5, BO, INC4
  270. fxcpmadd f1, A4, A6, f1
  271. LFPDUX A6, BO2, INC4
  272. fxcsmadd f2, A4, A7, f2
  273. LFPDUX A7, BO, INC4
  274. fxcsmadd f3, A4, A8, f3
  275. LFPDUX A8, BO2, INC4
  276. LFPDUX A4, AO2, INC4
  277. bdnz+ .L42
  278. .align 4
  279. .L43:
  280. fxcpmadd f0, A1, B1, f0
  281. LFPDUX B1, BO, INC4
  282. fxcpmadd f1, A1, B2, f1
  283. LFPDUX B2, BO2, INC4
  284. fxcsmadd f2, A1, B3, f2
  285. LFPDUX B3, BO, INC4
  286. fxcsmadd f3, A1, B4, f3
  287. LFPDUX B4, BO2, INC4
  288. fxcpmadd f0, A2, A5, f0
  289. LFPDUX A5, BO, INC4
  290. fxcpmadd f1, A2, A6, f1
  291. LFPDUX A6, BO2, INC4
  292. fxcsmadd f2, A2, A7, f2
  293. LFPDUX A7, BO, INC4
  294. fxcsmadd f3, A2, A8, f3
  295. LFPDUX A8, BO2, INC4
  296. fxcpmadd f0, A3, B1, f0
  297. fxcpmadd f1, A3, B2, f1
  298. fxcsmadd f2, A3, B3, f2
  299. fxcsmadd f3, A3, B4, f3
  300. fxcpmadd f0, A4, A5, f0
  301. fxcpmadd f1, A4, A6, f1
  302. fxcsmadd f2, A4, A7, f2
  303. fxcsmadd f3, A4, A8, f3
  304. .align 4
  305. .L44:
  306. #if defined(LT) || defined(RN)
  307. andi. r0, KK, 7
  308. mtspr CTR, r0
  309. ble+ .L48
  310. #else
  311. andi. r0, TEMP, 7
  312. mtspr CTR, r0
  313. ble+ .L48
  314. #endif
  315. LFDX A1, AO, INC4
  316. LFPDUX B1, BO, INC4
  317. LFPDUX B2, BO2, INC4
  318. add AO, AO, INC
  319. bdz- .L47
  320. .align 4
  321. .L46:
  322. fxcpmadd f0, A1, B1, f0
  323. LFPDUX B1, BO, INC4
  324. fxcpmadd f1, A1, B2, f1
  325. LFDX A1, AO, INC4
  326. LFPDUX B2, BO2, INC4
  327. add AO, AO, INC
  328. bdnz+ .L46
  329. .align 4
  330. .L47:
  331. fxcpmadd f0, A1, B1, f0
  332. fxcpmadd f1, A1, B2, f1
  333. addi AO2, AO, 2 * SIZE
  334. .align 4
  335. .L48:
  336. fpadd f0, f0, f2
  337. fpadd f1, f1, f3
  338. #if defined(LN) || defined(RT)
  339. #ifdef LN
  340. subi r0, KK, 1
  341. #else
  342. subi r0, KK, 4
  343. #endif
  344. slwi TEMP, r0, 0 + BASE_SHIFT
  345. slwi r0, r0, 2 + BASE_SHIFT
  346. add AO, AORIG, TEMP
  347. add BO, B, r0
  348. addi AO2, AO, 2 * SIZE
  349. addi BO, BO, - 4 * SIZE
  350. addi BO2, BO, 2 * SIZE
  351. #endif
  352. #if defined(LN) || defined(LT)
  353. LFPDX f16, BO, INC4
  354. LFPDX f17, BO2, INC4
  355. fpsub f0, f16, f0
  356. fpsub f1, f17, f1
  357. #else
  358. LFPDX f16, AO, INC4
  359. LFPDX f17, AO2, INC4
  360. fpsub f0, f16, f0
  361. fpsub f1, f17, f1
  362. #endif
  363. #if defined(LN) || defined(LT)
  364. LFPDX A1, AO, INC4
  365. fxpmul f0, A1, f0
  366. fxpmul f1, A1, f1
  367. #endif
  368. #ifdef RN
  369. LFD A1, (4 + 0) * SIZE(BO)
  370. LFD A2, (4 + 1) * SIZE(BO)
  371. LFD A3, (4 + 2) * SIZE(BO)
  372. LFD A4, (4 + 3) * SIZE(BO)
  373. LFD A5, (4 + 5) * SIZE(BO)
  374. LFD A6, (4 + 6) * SIZE(BO)
  375. LFD A7, (4 + 7) * SIZE(BO)
  376. LFD A8, (4 + 10) * SIZE(BO)
  377. LFD A9, (4 + 11) * SIZE(BO)
  378. LFD A10, (4 + 15) * SIZE(BO)
  379. fsmtp f2, f0
  380. fsmtp f3, f1
  381. fmul f0, A1, f0
  382. fnmsub f2, A2, f0, f2
  383. fnmsub f1, A3, f0, f1
  384. fnmsub f3, A4, f0, f3
  385. fmul f2, A5, f2
  386. fnmsub f1, A6, f2, f1
  387. fnmsub f3, A7, f2, f3
  388. fmul f1, A8, f1
  389. fnmsub f3, A9, f1, f3
  390. fmul f3, A10, f3
  391. fsmfp f0, f2
  392. fsmfp f1, f3
  393. #endif
  394. #ifdef RT
  395. LFD A1, (4 + 15) * SIZE(BO)
  396. LFD A2, (4 + 14) * SIZE(BO)
  397. LFD A3, (4 + 13) * SIZE(BO)
  398. LFD A4, (4 + 12) * SIZE(BO)
  399. LFD A5, (4 + 10) * SIZE(BO)
  400. LFD A6, (4 + 9) * SIZE(BO)
  401. LFD A7, (4 + 8) * SIZE(BO)
  402. LFD A8, (4 + 5) * SIZE(BO)
  403. LFD A9, (4 + 4) * SIZE(BO)
  404. LFD A10, (4 + 0) * SIZE(BO)
  405. fsmtp f2, f0
  406. fsmtp f3, f1
  407. fmul f3, A1, f3
  408. fnmsub f1, A2, f3, f1
  409. fnmsub f2, A3, f3, f2
  410. fnmsub f0, A4, f3, f0
  411. fmul f1, A5, f1
  412. fnmsub f2, A6, f1, f2
  413. fnmsub f0, A7, f1, f0
  414. fmul f2, A8, f2
  415. fnmsub f0, A9, f2, f0
  416. fmul f0, A10, f0
  417. fsmfp f0, f2
  418. fsmfp f1, f3
  419. #endif
  420. #if defined(LN) || defined(LT)
  421. STFPDX f0, BO, INC4
  422. STFPDX f1, BO2, INC4
  423. #else
  424. STFPDX f0, AO, INC4
  425. STFPDX f1, AO2, INC4
  426. #endif
  427. #ifdef LN
  428. subi CO1, CO1, 1 * SIZE
  429. subi CO2, CO2, 1 * SIZE
  430. subi CO3, CO3, 1 * SIZE
  431. subi CO4, CO4, 1 * SIZE
  432. #endif
  433. STFDX f0, CO1, INC
  434. STFSDX f0, CO2, INC
  435. STFDX f1, CO3, INC
  436. STFSDX f1, CO4, INC
  437. #ifdef RT
  438. slwi r0, K, 0 + BASE_SHIFT
  439. add AORIG, AORIG, r0
  440. #endif
  441. #if defined(LT) || defined(RN)
  442. sub TEMP, K, KK
  443. slwi r0, TEMP, 0 + BASE_SHIFT
  444. slwi TEMP, TEMP, 2 + BASE_SHIFT
  445. add AO, AO, r0
  446. add BO, BO, TEMP
  447. #endif
  448. #ifdef LT
  449. addi KK, KK, 1
  450. #endif
  451. #ifdef LN
  452. subi KK, KK, 1
  453. #endif
  454. li r0, FZERO
  455. lfpsx f0, SP, r0
  456. .align 4
  457. .L20:
  458. andi. I, M, 2
  459. beq .L30
  460. #if defined(LT) || defined(RN)
  461. addi AO2, AO, 2 * SIZE
  462. fpmr f4, f0
  463. addi BO, B, - 4 * SIZE
  464. fpmr f8, f0
  465. addi BO2, B, - 2 * SIZE
  466. fpmr f12, f0
  467. srawi. r0, KK, 2
  468. mtspr CTR, r0
  469. ble .L34
  470. #else
  471. #ifdef LN
  472. slwi r0, K, 1 + BASE_SHIFT
  473. sub AORIG, AORIG, r0
  474. #endif
  475. slwi r0 , KK, 1 + BASE_SHIFT
  476. slwi TEMP, KK, 2 + BASE_SHIFT
  477. add AO, AORIG, r0
  478. add BO, B, TEMP
  479. sub TEMP, K, KK
  480. addi AO2, AO, 2 * SIZE
  481. fpmr f4, f0
  482. addi BO, BO, - 4 * SIZE
  483. fpmr f8, f0
  484. addi BO2, BO, 2 * SIZE
  485. fpmr f12, f0
  486. srawi. r0, TEMP, 2
  487. mtspr CTR, r0
  488. ble .L34
  489. #endif
  490. LFPDUX A1, AO, INC4
  491. LFPDUX B1, BO, INC4
  492. LFPDUX B2, BO2, INC4
  493. LFPDUX A2, AO2, INC4
  494. LFPDUX B3, BO, INC4
  495. LFPDUX B4, BO2, INC4
  496. LFPDUX A3, AO, INC4
  497. LFPDUX A5, BO, INC4
  498. LFPDUX A6, BO2, INC4
  499. LFPDUX A4, AO2, INC4
  500. LFPDUX A7, BO, INC4
  501. LFPDUX A8, BO2, INC4
  502. bdz- .L33
  503. .align 4
  504. .L32:
  505. fxcpmadd f0, B1, A1, f0
  506. fxcsmadd f4, B1, A1, f4
  507. LFPDUX B1, BO, INC4
  508. fxcpmadd f8, B2, A1, f8
  509. fxcsmadd f12, B2, A1, f12
  510. LFPDUX B2, BO2, INC4
  511. LFPDUX A1, AO, INC4
  512. fxcpmadd f0, B3, A2, f0
  513. fxcsmadd f4, B3, A2, f4
  514. LFPDUX B3, BO, INC4
  515. fxcpmadd f8, B4, A2, f8
  516. fxcsmadd f12, B4, A2, f12
  517. LFPDUX B4, BO2, INC4
  518. LFPDUX A2, AO2, INC4
  519. fxcpmadd f0, A5, A3, f0
  520. fxcsmadd f4, A5, A3, f4
  521. LFPDUX A5, BO, INC4
  522. fxcpmadd f8, A6, A3, f8
  523. fxcsmadd f12, A6, A3, f12
  524. LFPDUX A6, BO2, INC4
  525. LFPDUX A3, AO, INC4
  526. fxcpmadd f0, A7, A4, f0
  527. fxcsmadd f4, A7, A4, f4
  528. LFPDUX A7, BO, INC4
  529. fxcpmadd f8, A8, A4, f8
  530. fxcsmadd f12, A8, A4, f12
  531. LFPDUX A8, BO2, INC4
  532. LFPDUX A4, AO2, INC4
  533. bdnz+ .L32
  534. .align 4
  535. .L33:
  536. fxcpmadd f0, B1, A1, f0
  537. fxcsmadd f4, B1, A1, f4
  538. fxcpmadd f8, B2, A1, f8
  539. fxcsmadd f12, B2, A1, f12
  540. fxcpmadd f0, B3, A2, f0
  541. fxcsmadd f4, B3, A2, f4
  542. fxcpmadd f8, B4, A2, f8
  543. fxcsmadd f12, B4, A2, f12
  544. fxcpmadd f0, A5, A3, f0
  545. fxcsmadd f4, A5, A3, f4
  546. fxcpmadd f8, A6, A3, f8
  547. fxcsmadd f12, A6, A3, f12
  548. fxcpmadd f0, A7, A4, f0
  549. fxcsmadd f4, A7, A4, f4
  550. fxcpmadd f8, A8, A4, f8
  551. fxcsmadd f12, A8, A4, f12
  552. .align 4
  553. .L34:
  554. #if defined(LT) || defined(RN)
  555. andi. r0, KK, 3
  556. mtspr CTR, r0
  557. ble+ .L38
  558. #else
  559. andi. r0, TEMP, 3
  560. mtspr CTR, r0
  561. ble+ .L38
  562. #endif
  563. LFPDX A1, AO, INC4
  564. LFPDUX B1, BO, INC4
  565. LFPDUX B2, BO2, INC4
  566. add AO, AO, INC2
  567. bdz- .L37
  568. .align 4
  569. .L36:
  570. fxcpmadd f0, B1, A1, f0
  571. fxcsmadd f4, B1, A1, f4
  572. LFPDUX B1, BO, INC4
  573. fxcpmadd f8, B2, A1, f8
  574. fxcsmadd f12, B2, A1, f12
  575. LFPDX A1, AO, INC4
  576. LFPDUX B2, BO2, INC4
  577. add AO, AO, INC2
  578. bdnz+ .L36
  579. .align 4
  580. .L37:
  581. fxcpmadd f0, B1, A1, f0
  582. fxcsmadd f4, B1, A1, f4
  583. fxcpmadd f8, B2, A1, f8
  584. fxcsmadd f12, B2, A1, f12
  585. .align 4
  586. .L38:
  587. #if defined(LN) || defined(RT)
  588. #ifdef LN
  589. subi r0, KK, 2
  590. #else
  591. subi r0, KK, 4
  592. #endif
  593. slwi TEMP, r0, 1 + BASE_SHIFT
  594. slwi r0, r0, 2 + BASE_SHIFT
  595. add AO, AORIG, TEMP
  596. add BO, B, r0
  597. addi AO2, AO, 2 * SIZE
  598. addi BO, BO, - 4 * SIZE
  599. addi BO2, BO, 2 * SIZE
  600. #endif
  601. #if defined(LN) || defined(LT)
  602. fpmr f24, f0
  603. fpmr f28, f8
  604. fsmfp f0, f4
  605. fsmfp f8, f12
  606. fsmtp f4, f24
  607. fsmtp f12, f28
  608. LFPDUX f16, BO, INC4
  609. LFPDUX f17, BO2, INC4
  610. LFPDUX f18, BO, INC4
  611. LFPDUX f19, BO2, INC4
  612. subi BO, BO, 8 * SIZE
  613. subi BO2, BO2, 8 * SIZE
  614. fpsub f0, f16, f0
  615. fpsub f8, f17, f8
  616. fpsub f4, f18, f4
  617. fpsub f12, f19, f12
  618. #else
  619. LFPDUX f16, AO, INC4
  620. LFPDUX f17, AO2, INC4
  621. LFPDUX f18, AO, INC4
  622. LFPDUX f19, AO2, INC4
  623. subi AO, AO, 8 * SIZE
  624. subi AO2, AO2, 8 * SIZE
  625. fpsub f0, f16, f0
  626. fpsub f4, f17, f4
  627. fpsub f8, f18, f8
  628. fpsub f12, f19, f12
  629. #endif
  630. #ifdef LN
  631. addi AO, AO, 8 * SIZE
  632. addi AO2, AO2, 8 * SIZE
  633. LFPDUX A1, AO2, INCM4
  634. LFPDUX A2, AO, INCM4
  635. addi AO, AO, -4 * SIZE
  636. addi AO2, AO2, -4 * SIZE
  637. fxsmul f4, A1, f4
  638. fxsmul f12, A1, f12
  639. fxcpnmsub f0, A1, f4, f0
  640. fxcpnmsub f8, A1, f12, f8
  641. fxpmul f0, A2, f0
  642. fxpmul f8, A2, f8
  643. #endif
  644. #ifdef LT
  645. LFPDUX A1, AO, INC4
  646. LFPDUX A2, AO2, INC4
  647. subi AO, AO, 4 * SIZE
  648. subi AO2, AO2, 4 * SIZE
  649. fxpmul f0, A1, f0
  650. fxpmul f8, A1, f8
  651. fxcsnmsub f4, A1, f0, f4
  652. fxcsnmsub f12, A1, f8, f12
  653. fxsmul f4, A2, f4
  654. fxsmul f12, A2, f12
  655. #endif
  656. #ifdef RN
  657. LFPDUX A1, BO, INC4
  658. LFPDUX A2, BO2, INC4
  659. LFPDUX A3, BO, INC4
  660. LFPDUX A4, BO2, INC4
  661. add BO, BO, INC4
  662. LFPDUX A5, BO2, INC4
  663. add BO, BO, INC4
  664. LFPDUX A6, BO2, INC4
  665. subi BO, BO, 16 * SIZE
  666. subi BO2, BO2, 16 * SIZE
  667. fxpmul f0, A1, f0
  668. fxcsnmsub f4, A1, f0, f4
  669. fxcpnmsub f8, A2, f0, f8
  670. fxcsnmsub f12, A2, f0, f12
  671. fxsmul f4, A3, f4
  672. fxcpnmsub f8, A4, f4, f8
  673. fxcsnmsub f12, A4, f4, f12
  674. fxpmul f8, A5, f8
  675. fxcsnmsub f12, A5, f8, f12
  676. fxsmul f12, A6, f12
  677. #endif
  678. #ifdef RT
  679. addi BO, BO, 20 * SIZE
  680. addi BO2, BO2, 20 * SIZE
  681. LFPDUX A1, BO2, INCM4
  682. LFPDUX A2, BO, INCM4
  683. LFPDUX A3, BO2, INCM4
  684. LFPDUX A4, BO, INCM4
  685. add BO2, BO2, INCM4
  686. LFPDUX A5, BO, INCM4
  687. add BO2, BO2, INCM4
  688. LFPDUX A6, BO, INCM4
  689. subi BO, BO, 4 * SIZE
  690. subi BO2, BO2, 4 * SIZE
  691. fxsmul f12, A1, f12
  692. fxcpnmsub f8, A1, f12, f8
  693. fxcsnmsub f4, A2, f12, f4
  694. fxcpnmsub f0, A2, f12, f0
  695. fxpmul f8, A3, f8
  696. fxcsnmsub f4, A4, f8, f4
  697. fxcpnmsub f0, A4, f8, f0
  698. fxsmul f4, A5, f4
  699. fxcpnmsub f0, A5, f4, f0
  700. fxpmul f0, A6, f0
  701. #endif
  702. #ifdef LN
  703. subi CO1, CO1, 2 * SIZE
  704. subi CO2, CO2, 2 * SIZE
  705. subi CO3, CO3, 2 * SIZE
  706. subi CO4, CO4, 2 * SIZE
  707. #endif
  708. #if defined(LN) || defined(LT)
  709. STFPDUX f0, BO, INC4
  710. STFPDUX f8, BO2, INC4
  711. STFPDUX f4, BO, INC4
  712. STFPDUX f12, BO2, INC4
  713. subi BO, BO, 8 * SIZE
  714. subi BO2, BO2, 8 * SIZE
  715. STFDUX f0, CO1, INC
  716. STFDUX f4, CO1, INC
  717. STFSDUX f0, CO2, INC
  718. STFSDUX f4, CO2, INC
  719. STFDUX f8, CO3, INC
  720. STFDUX f12, CO3, INC
  721. STFSDUX f8, CO4, INC
  722. STFSDUX f12, CO4, INC
  723. #else
  724. STFPDUX f0, AO, INC4
  725. STFPDUX f4, AO2, INC4
  726. STFPDUX f8, AO, INC4
  727. STFPDUX f12, AO2, INC4
  728. subi AO, AO, 8 * SIZE
  729. subi AO2, AO2, 8 * SIZE
  730. STFDUX f0, CO1, INC
  731. STFSDUX f0, CO1, INC
  732. STFDUX f4, CO2, INC
  733. STFSDUX f4, CO2, INC
  734. STFDUX f8, CO3, INC
  735. STFSDUX f8, CO3, INC
  736. STFDUX f12, CO4, INC
  737. STFSDUX f12, CO4, INC
  738. #endif
  739. #ifdef LN
  740. subi CO1, CO1, 2 * SIZE
  741. subi CO2, CO2, 2 * SIZE
  742. subi CO3, CO3, 2 * SIZE
  743. subi CO4, CO4, 2 * SIZE
  744. #endif
  745. #ifdef RT
  746. slwi r0, K, 1 + BASE_SHIFT
  747. add AORIG, AORIG, r0
  748. #endif
  749. #if defined(LT) || defined(RN)
  750. sub TEMP, K, KK
  751. slwi r0, TEMP, 1 + BASE_SHIFT
  752. slwi TEMP, TEMP, 2 + BASE_SHIFT
  753. add AO, AO, r0
  754. add BO, BO, TEMP
  755. #endif
  756. #ifdef LT
  757. addi KK, KK, 2
  758. #endif
  759. #ifdef LN
  760. subi KK, KK, 2
  761. #endif
  762. li r0, FZERO
  763. lfpsx f0, SP, r0
  764. .align 4
  765. .L30:
  766. andi. I, M, 4
  767. beq .L40
  768. #if defined(LT) || defined(RN)
  769. addi AO2, AO, 2 * SIZE
  770. fpmr f4, f0
  771. addi BO, B, - 4 * SIZE
  772. fpmr f8, f0
  773. addi BO2, B, - 2 * SIZE
  774. fpmr f12, f0
  775. srawi. r0, KK, 2
  776. fpmr f1, f0
  777. fpmr f5, f0
  778. fpmr f9, f0
  779. mtspr CTR, r0
  780. fpmr f13, f0
  781. ble .L24
  782. #else
  783. #ifdef LN
  784. slwi r0, K, 2 + BASE_SHIFT
  785. sub AORIG, AORIG, r0
  786. #endif
  787. slwi r0 , KK, 2 + BASE_SHIFT
  788. slwi TEMP, KK, 2 + BASE_SHIFT
  789. add AO, AORIG, r0
  790. add BO, B, TEMP
  791. sub TEMP, K, KK
  792. addi AO2, AO, 2 * SIZE
  793. fpmr f4, f0
  794. addi BO, BO, - 4 * SIZE
  795. fpmr f8, f0
  796. addi BO2, BO, 2 * SIZE
  797. fpmr f12, f0
  798. srawi. r0, TEMP, 2
  799. fpmr f1, f0
  800. fpmr f5, f0
  801. fpmr f9, f0
  802. mtspr CTR, r0
  803. fpmr f13, f0
  804. ble .L24
  805. #endif
  806. LFPDUX A1, AO, INC4
  807. LFPDUX B1, BO, INC4
  808. LFPDUX A2, AO2, INC4
  809. LFPDUX B2, BO2, INC4
  810. LFPDUX A3, AO, INC4
  811. LFPDUX B3, BO, INC4
  812. LFPDUX A4, AO2, INC4
  813. LFPDUX B4, BO2, INC4
  814. LFPDUX A5, AO, INC4
  815. LFPDUX B5, BO, INC4
  816. LFPDUX A6, AO2, INC4
  817. LFPDUX B6, BO2, INC4
  818. LFPDUX A7, AO, INC4
  819. LFPDUX A9, BO, INC4
  820. LFPDUX A10, BO2, INC4
  821. bdz- .L23
  822. .align 4
  823. .L22:
  824. fxcpmadd f0, B1, A1, f0
  825. nop
  826. fxcsmadd f4, B1, A1, f4
  827. LFPDUX A8, AO2, INC4
  828. fxcpmadd f8, B2, A1, f8
  829. nop
  830. fxcsmadd f12, B2, A1, f12
  831. LFPDUX A1, AO, INC4
  832. fxcpmadd f1, B1, A2, f1
  833. nop
  834. fxcsmadd f5, B1, A2, f5
  835. LFPDUX B1, BO, INC4
  836. fxcpmadd f9, B2, A2, f9
  837. nop
  838. fxcsmadd f13, B2, A2, f13
  839. LFPDUX B2, BO2, INC4
  840. fxcpmadd f0, B3, A3, f0
  841. nop
  842. fxcsmadd f4, B3, A3, f4
  843. LFPDUX A2, AO2, INC4
  844. fxcpmadd f8, B4, A3, f8
  845. nop
  846. fxcsmadd f12, B4, A3, f12
  847. LFPDUX A3, AO, INC4
  848. fxcpmadd f1, B3, A4, f1
  849. nop
  850. fxcsmadd f5, B3, A4, f5
  851. LFPDUX B3, BO, INC4
  852. fxcpmadd f9, B4, A4, f9
  853. nop
  854. fxcsmadd f13, B4, A4, f13
  855. LFPDUX B4, BO2, INC4
  856. fxcpmadd f0, B5, A5, f0
  857. nop
  858. fxcsmadd f4, B5, A5, f4
  859. LFPDUX A4, AO2, INC4
  860. fxcpmadd f8, B6, A5, f8
  861. nop
  862. fxcsmadd f12, B6, A5, f12
  863. LFPDUX A5, AO, INC4
  864. fxcpmadd f1, B5, A6, f1
  865. nop
  866. fxcsmadd f5, B5, A6, f5
  867. LFPDUX B5, BO, INC4
  868. fxcpmadd f9, B6, A6, f9
  869. nop
  870. fxcsmadd f13, B6, A6, f13
  871. LFPDUX B6, BO2, INC4
  872. fxcpmadd f0, A9, A7, f0
  873. nop
  874. fxcsmadd f4, A9, A7, f4
  875. LFPDUX A6, AO2, INC4
  876. fxcpmadd f8, A10, A7, f8
  877. nop
  878. fxcsmadd f12, A10, A7, f12
  879. LFPDUX A7, AO, INC4
  880. fxcpmadd f1, A9, A8, f1
  881. nop
  882. fxcsmadd f5, A9, A8, f5
  883. LFPDUX A9, BO, INC4
  884. fxcpmadd f9, A10, A8, f9
  885. nop
  886. fxcsmadd f13, A10, A8, f13
  887. LFPDUX A10, BO2, INC4
  888. bdnz+ .L22
  889. .align 4
  890. .L23:
  891. fxcpmadd f0, B1, A1, f0
  892. fxcsmadd f4, B1, A1, f4
  893. LFPDUX A8, AO2, INC4
  894. fxcpmadd f8, B2, A1, f8
  895. fxcsmadd f12, B2, A1, f12
  896. fxcpmadd f1, B1, A2, f1
  897. fxcsmadd f5, B1, A2, f5
  898. fxcpmadd f9, B2, A2, f9
  899. fxcsmadd f13, B2, A2, f13
  900. fxcpmadd f0, B3, A3, f0
  901. fxcsmadd f4, B3, A3, f4
  902. fxcpmadd f8, B4, A3, f8
  903. fxcsmadd f12, B4, A3, f12
  904. fxcpmadd f1, B3, A4, f1
  905. fxcsmadd f5, B3, A4, f5
  906. fxcpmadd f9, B4, A4, f9
  907. fxcsmadd f13, B4, A4, f13
  908. fxcpmadd f0, B5, A5, f0
  909. fxcsmadd f4, B5, A5, f4
  910. fxcpmadd f8, B6, A5, f8
  911. fxcsmadd f12, B6, A5, f12
  912. fxcpmadd f1, B5, A6, f1
  913. fxcsmadd f5, B5, A6, f5
  914. fxcpmadd f9, B6, A6, f9
  915. fxcsmadd f13, B6, A6, f13
  916. fxcpmadd f0, A9, A7, f0
  917. fxcsmadd f4, A9, A7, f4
  918. fxcpmadd f8, A10, A7, f8
  919. fxcsmadd f12, A10, A7, f12
  920. fxcpmadd f1, A9, A8, f1
  921. fxcsmadd f5, A9, A8, f5
  922. fxcpmadd f9, A10, A8, f9
  923. fxcsmadd f13, A10, A8, f13
  924. .align 4
  925. .L24:
  926. #if defined(LT) || defined(RN)
  927. andi. r0, KK, 3
  928. mtspr CTR, r0
  929. ble+ .L28
  930. #else
  931. andi. r0, TEMP, 3
  932. mtspr CTR, r0
  933. ble+ .L28
  934. #endif
  935. LFPDUX A1, AO, INC4
  936. LFPDUX A2, AO2, INC4
  937. LFPDUX B1, BO, INC4
  938. LFPDUX B2, BO2, INC4
  939. bdz- .L27
  940. .align 4
  941. .L26:
  942. fxcpmadd f0, B1, A1, f0
  943. fxcsmadd f4, B1, A1, f4
  944. fxcpmadd f8, B2, A1, f8
  945. fxcsmadd f12, B2, A1, f12
  946. LFPDUX A1, AO, INC4
  947. fxcpmadd f1, B1, A2, f1
  948. fxcsmadd f5, B1, A2, f5
  949. LFPDUX B1, BO, INC4
  950. fxcpmadd f9, B2, A2, f9
  951. fxcsmadd f13, B2, A2, f13
  952. LFPDUX A2, AO2, INC4
  953. LFPDUX B2, BO2, INC4
  954. bdnz+ .L26
  955. .align 4
  956. .L27:
  957. fxcpmadd f0, B1, A1, f0
  958. fxcsmadd f4, B1, A1, f4
  959. fxcpmadd f8, B2, A1, f8
  960. fxcsmadd f12, B2, A1, f12
  961. fxcpmadd f1, B1, A2, f1
  962. fxcsmadd f5, B1, A2, f5
  963. fxcpmadd f9, B2, A2, f9
  964. fxcsmadd f13, B2, A2, f13
  965. .align 4
  966. .L28:
  967. #if defined(LN) || defined(RT)
  968. #ifdef LN
  969. subi r0, KK, 4
  970. #else
  971. subi r0, KK, 4
  972. #endif
  973. slwi TEMP, r0, 2 + BASE_SHIFT
  974. slwi r0, r0, 2 + BASE_SHIFT
  975. add AO, AORIG, TEMP
  976. add BO, B, r0
  977. addi AO2, AO, 2 * SIZE
  978. addi BO, BO, - 4 * SIZE
  979. addi BO2, BO, 2 * SIZE
  980. #endif
  981. #if defined(LN) || defined(LT)
  982. fpmr f24, f0
  983. fpmr f25, f1
  984. fpmr f28, f8
  985. fpmr f29, f9
  986. fsmfp f0, f4
  987. fsmfp f1, f5
  988. fsmfp f8, f12
  989. fsmfp f9, f13
  990. fsmtp f4, f24
  991. fsmtp f5, f25
  992. fsmtp f12, f28
  993. fsmtp f13, f29
  994. LFPDUX f16, BO, INC4
  995. LFPDUX f17, BO2, INC4
  996. LFPDUX f18, BO, INC4
  997. LFPDUX f19, BO2, INC4
  998. LFPDUX f20, BO, INC4
  999. LFPDUX f21, BO2, INC4
  1000. LFPDUX f22, BO, INC4
  1001. LFPDUX f23, BO2, INC4
  1002. subi BO, BO, 16 * SIZE
  1003. subi BO2, BO2, 16 * SIZE
  1004. fpsub f0, f16, f0
  1005. fpsub f8, f17, f8
  1006. fpsub f4, f18, f4
  1007. fpsub f12, f19, f12
  1008. fpsub f1, f20, f1
  1009. fpsub f9, f21, f9
  1010. fpsub f5, f22, f5
  1011. fpsub f13, f23, f13
  1012. #else
  1013. LFPDUX f16, AO, INC4
  1014. LFPDUX f17, AO2, INC4
  1015. LFPDUX f18, AO, INC4
  1016. LFPDUX f19, AO2, INC4
  1017. LFPDUX f20, AO, INC4
  1018. LFPDUX f21, AO2, INC4
  1019. LFPDUX f22, AO, INC4
  1020. LFPDUX f23, AO2, INC4
  1021. subi AO, AO, 16 * SIZE
  1022. subi AO2, AO2, 16 * SIZE
  1023. fpsub f0, f16, f0
  1024. fpsub f1, f17, f1
  1025. fpsub f4, f18, f4
  1026. fpsub f5, f19, f5
  1027. fpsub f8, f20, f8
  1028. fpsub f9, f21, f9
  1029. fpsub f12, f22, f12
  1030. fpsub f13, f23, f13
  1031. #endif
  1032. #ifdef LN
  1033. addi AO, AO, 20 * SIZE
  1034. addi AO2, AO2, 20 * SIZE
  1035. LFPDUX A1, AO2, INCM4
  1036. LFPDUX A2, AO, INCM4
  1037. LFPDUX A3, AO2, INCM4
  1038. LFPDUX A4, AO, INCM4
  1039. add AO2, AO2, INCM4
  1040. LFPDUX A5, AO, INCM4
  1041. add AO2, AO2, INCM4
  1042. LFPDUX A6, AO, INCM4
  1043. addi AO, AO, -4 * SIZE
  1044. addi AO2, AO2, -4 * SIZE
  1045. fxsmul f5, A1, f5
  1046. fxsmul f13, A1, f13
  1047. fxcpnmsub f1, A1, f5, f1
  1048. fxcpnmsub f9, A1, f13, f9
  1049. fxcsnmsub f4, A2, f5, f4
  1050. fxcsnmsub f12, A2, f13, f12
  1051. fxcpnmsub f0, A2, f5, f0
  1052. fxcpnmsub f8, A2, f13, f8
  1053. fxpmul f1, A3, f1
  1054. fxpmul f9, A3, f9
  1055. fxcsnmsub f4, A4, f1, f4
  1056. fxcsnmsub f12, A4, f9, f12
  1057. fxcpnmsub f0, A4, f1, f0
  1058. fxcpnmsub f8, A4, f9, f8
  1059. fxsmul f4, A5, f4
  1060. fxsmul f12, A5, f12
  1061. fxcpnmsub f0, A5, f4, f0
  1062. fxcpnmsub f8, A5, f12, f8
  1063. fxpmul f0, A6, f0
  1064. fxpmul f8, A6, f8
  1065. #endif
  1066. #ifdef LT
  1067. LFPDUX A1, AO, INC4
  1068. LFPDUX A2, AO2, INC4
  1069. LFPDUX A3, AO, INC4
  1070. LFPDUX A4, AO2, INC4
  1071. add AO, AO, INC4
  1072. LFPDUX A5, AO2, INC4
  1073. add AO, AO, INC4
  1074. LFPDUX A6, AO2, INC4
  1075. subi AO, AO, 16 * SIZE
  1076. subi AO2, AO2, 16 * SIZE
  1077. fxpmul f0, A1, f0
  1078. fxpmul f8, A1, f8
  1079. fxcsnmsub f4, A1, f0, f4
  1080. fxcsnmsub f12, A1, f8, f12
  1081. fxcpnmsub f1, A2, f0, f1
  1082. fxcpnmsub f9, A2, f8, f9
  1083. fxcsnmsub f5, A2, f0, f5
  1084. fxcsnmsub f13, A2, f8, f13
  1085. fxsmul f4, A3, f4
  1086. fxsmul f12, A3, f12
  1087. fxcpnmsub f1, A4, f4, f1
  1088. fxcpnmsub f9, A4, f12, f9
  1089. fxcsnmsub f5, A4, f4, f5
  1090. fxcsnmsub f13, A4, f12, f13
  1091. fxpmul f1, A5, f1
  1092. fxpmul f9, A5, f9
  1093. fxcsnmsub f5, A5, f1, f5
  1094. fxcsnmsub f13, A5, f9, f13
  1095. fxsmul f5, A6, f5
  1096. fxsmul f13, A6, f13
  1097. #endif
  1098. #ifdef RN
  1099. LFPDUX A1, BO, INC4
  1100. LFPDUX A2, BO2, INC4
  1101. LFPDUX A3, BO, INC4
  1102. LFPDUX A4, BO2, INC4
  1103. add BO, BO, INC4
  1104. LFPDUX A5, BO2, INC4
  1105. add BO, BO, INC4
  1106. LFPDUX A6, BO2, INC4
  1107. subi BO, BO, 16 * SIZE
  1108. subi BO2, BO2, 16 * SIZE
  1109. fxpmul f0, A1, f0
  1110. fxpmul f1, A1, f1
  1111. fxcsnmsub f4, A1, f0, f4
  1112. fxcsnmsub f5, A1, f1, f5
  1113. fxcpnmsub f8, A2, f0, f8
  1114. fxcpnmsub f9, A2, f1, f9
  1115. fxcsnmsub f12, A2, f0, f12
  1116. fxcsnmsub f13, A2, f1, f13
  1117. fxsmul f4, A3, f4
  1118. fxsmul f5, A3, f5
  1119. fxcpnmsub f8, A4, f4, f8
  1120. fxcpnmsub f9, A4, f5, f9
  1121. fxcsnmsub f12, A4, f4, f12
  1122. fxcsnmsub f13, A4, f5, f13
  1123. fxpmul f8, A5, f8
  1124. fxpmul f9, A5, f9
  1125. fxcsnmsub f12, A5, f8, f12
  1126. fxcsnmsub f13, A5, f9, f13
  1127. fxsmul f12, A6, f12
  1128. fxsmul f13, A6, f13
  1129. #endif
  1130. #ifdef RT
  1131. addi BO, BO, 20 * SIZE
  1132. addi BO2, BO2, 20 * SIZE
  1133. LFPDUX A1, BO2, INCM4
  1134. LFPDUX A2, BO, INCM4
  1135. LFPDUX A3, BO2, INCM4
  1136. LFPDUX A4, BO, INCM4
  1137. add BO2, BO2, INCM4
  1138. LFPDUX A5, BO, INCM4
  1139. add BO2, BO2, INCM4
  1140. LFPDUX A6, BO, INCM4
  1141. subi BO, BO, 4 * SIZE
  1142. subi BO2, BO2, 4 * SIZE
  1143. fxsmul f12, A1, f12
  1144. fxsmul f13, A1, f13
  1145. fxcpnmsub f8, A1, f12, f8
  1146. fxcpnmsub f9, A1, f13, f9
  1147. fxcsnmsub f4, A2, f12, f4
  1148. fxcsnmsub f5, A2, f13, f5
  1149. fxcpnmsub f0, A2, f12, f0
  1150. fxcpnmsub f1, A2, f13, f1
  1151. fxpmul f8, A3, f8
  1152. fxpmul f9, A3, f9
  1153. fxcsnmsub f4, A4, f8, f4
  1154. fxcsnmsub f5, A4, f9, f5
  1155. fxcpnmsub f0, A4, f8, f0
  1156. fxcpnmsub f1, A4, f9, f1
  1157. fxsmul f4, A5, f4
  1158. fxsmul f5, A5, f5
  1159. fxcpnmsub f0, A5, f4, f0
  1160. fxcpnmsub f1, A5, f5, f1
  1161. fxpmul f0, A6, f0
  1162. fxpmul f1, A6, f1
  1163. #endif
  1164. #ifdef LN
  1165. subi CO1, CO1, 4 * SIZE
  1166. subi CO2, CO2, 4 * SIZE
  1167. subi CO3, CO3, 4 * SIZE
  1168. subi CO4, CO4, 4 * SIZE
  1169. #endif
  1170. #if defined(LN) || defined(LT)
  1171. STFPDUX f0, BO, INC4
  1172. STFPDUX f8, BO2, INC4
  1173. STFPDUX f4, BO, INC4
  1174. STFPDUX f12, BO2, INC4
  1175. STFPDUX f1, BO, INC4
  1176. STFPDUX f9, BO2, INC4
  1177. STFPDUX f5, BO, INC4
  1178. STFPDUX f13, BO2, INC4
  1179. subi BO, BO, 16 * SIZE
  1180. subi BO2, BO2, 16 * SIZE
  1181. STFDUX f0, CO1, INC
  1182. STFDUX f4, CO1, INC
  1183. STFDUX f1, CO1, INC
  1184. STFDUX f5, CO1, INC
  1185. STFSDUX f0, CO2, INC
  1186. STFSDUX f4, CO2, INC
  1187. STFSDUX f1, CO2, INC
  1188. STFSDUX f5, CO2, INC
  1189. STFDUX f8, CO3, INC
  1190. STFDUX f12, CO3, INC
  1191. STFDUX f9, CO3, INC
  1192. STFDUX f13, CO3, INC
  1193. STFSDUX f8, CO4, INC
  1194. STFSDUX f12, CO4, INC
  1195. STFSDUX f9, CO4, INC
  1196. STFSDUX f13, CO4, INC
  1197. #else
  1198. STFPDUX f0, AO, INC4
  1199. STFPDUX f1, AO2, INC4
  1200. STFPDUX f4, AO, INC4
  1201. STFPDUX f5, AO2, INC4
  1202. STFPDUX f8, AO, INC4
  1203. STFPDUX f9, AO2, INC4
  1204. STFPDUX f12, AO, INC4
  1205. STFPDUX f13, AO2, INC4
  1206. subi AO, AO, 16 * SIZE
  1207. subi AO2, AO2, 16 * SIZE
  1208. STFDUX f0, CO1, INC
  1209. STFSDUX f0, CO1, INC
  1210. STFDUX f1, CO1, INC
  1211. STFSDUX f1, CO1, INC
  1212. STFDUX f4, CO2, INC
  1213. STFSDUX f4, CO2, INC
  1214. STFDUX f5, CO2, INC
  1215. STFSDUX f5, CO2, INC
  1216. STFDUX f8, CO3, INC
  1217. STFSDUX f8, CO3, INC
  1218. STFDUX f9, CO3, INC
  1219. STFSDUX f9, CO3, INC
  1220. STFDUX f12, CO4, INC
  1221. STFSDUX f12, CO4, INC
  1222. STFDUX f13, CO4, INC
  1223. STFSDUX f13, CO4, INC
  1224. #endif
  1225. #ifdef LN
  1226. subi CO1, CO1, 4 * SIZE
  1227. subi CO2, CO2, 4 * SIZE
  1228. subi CO3, CO3, 4 * SIZE
  1229. subi CO4, CO4, 4 * SIZE
  1230. #endif
  1231. #ifdef RT
  1232. slwi r0, K, 2 + BASE_SHIFT
  1233. add AORIG, AORIG, r0
  1234. #endif
  1235. #if defined(LT) || defined(RN)
  1236. sub TEMP, K, KK
  1237. slwi r0, TEMP, 2 + BASE_SHIFT
  1238. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1239. add AO, AO, r0
  1240. add BO, BO, TEMP
  1241. #endif
  1242. #ifdef LT
  1243. addi KK, KK, 4
  1244. #endif
  1245. #ifdef LN
  1246. subi KK, KK, 4
  1247. #endif
  1248. li r0, FZERO
  1249. lfpsx f0, SP, r0
  /* .L40 / .L11: loop over 8-row tiles of the current four-column panel.
     I = M >> 3 is the number of full tiles; when it is not positive the
     code branches to .L49. */
  1250. .align 4
  1251. .L40:
  1252. srawi. I, M, 3
  1253. ble .L49
  1254. .align 4
  /* .L11: prologue for one 8x4 tile.  Sets up the A/B read pointers,
     copies f0 into the other fifteen accumulators f1-f15 and loads CTR
     with the number of 4-step passes of the k-loop.  f0 itself is
     reloaded from the FZERO stack slot just before every entry to this
     label (presumably 0.0 -- confirm against the frame setup). */
  1255. .L11:
  1256. #if defined(LT) || defined(RN)
  /* LT/RN: A continues from the running AO pointer, B restarts at the
     panel base.  BO/BO2 and AO/AO2 are two cursors 2*SIZE apart; they are
     biased low because every access below is an update-form load with
     stride INC4 (presumably 4*SIZE, set outside this excerpt). */
  1257. addi AO2, AO, 2 * SIZE
  1258. fpmr f4, f0
  1259. addi BO, B, - 4 * SIZE
  1260. fpmr f8, f0
  1261. addi BO2, B, - 2 * SIZE
  1262. fpmr f12, f0
  1263. fpmr f5, f0
  1264. fpmr f9, f0
  1265. fpmr f13, f0
  1266. fpmr f2, f0
  1267. fpmr f6, f0
  1268. fpmr f10, f0
  1269. fpmr f14, f0
  1270. fpmr f3, f0
  1271. fpmr f7, f0
  1272. fpmr f11, f0
  1273. fpmr f15, f0
  1274. nop
  /* CTR = KK >> 2; a non-positive count skips the unrolled loop. */
  1275. srawi. r0, KK, 2
  1276. fpmr f1, f0
  1277. mtspr CTR, r0
  1278. ble .L14
  1279. #else
  1280. #ifdef LN
  /* LN walks A backwards: step AORIG back by K << (3 + BASE_SHIFT). */
  1281. slwi r0, K, 3 + BASE_SHIFT
  1282. sub AORIG, AORIG, r0
  1283. #endif
  /* LN/RT: start KK steps into both buffers (offset KK << (3 + BASE_SHIFT)
     in A, KK << (2 + BASE_SHIFT) in B) and run over the remaining
     TEMP = K - KK steps. */
  1284. slwi r0 , KK, 3 + BASE_SHIFT
  1285. slwi TEMP, KK, 2 + BASE_SHIFT
  1286. add AO, AORIG, r0
  1287. add BO, B, TEMP
  1288. sub TEMP, K, KK
  1289. addi AO2, AO, 2 * SIZE
  1290. fpmr f4, f0
  1291. addi BO, BO, - 4 * SIZE
  1292. fpmr f8, f0
  1293. addi BO2, BO, 2 * SIZE
  1294. fpmr f12, f0
  1295. fpmr f5, f0
  1296. fpmr f9, f0
  1297. fpmr f13, f0
  1298. fpmr f2, f0
  1299. fpmr f6, f0
  1300. fpmr f10, f0
  1301. fpmr f14, f0
  1302. fpmr f3, f0
  1303. fpmr f7, f0
  1304. fpmr f11, f0
  1305. fpmr f15, f0
  1306. nop
  /* CTR = TEMP >> 2; a non-positive count skips the unrolled loop. */
  1307. srawi. r0, TEMP, 2
  1308. fpmr f1, f0
  1309. mtspr CTR, r0
  1310. ble .L14
  1311. #endif
  /* Pipeline preload for .L12/.L13: A1, A3, A5-A9 come from AO, B1, B3, B5
     from BO, and the first AO2/BO2 elements go to A2/B2.  Each load is
     paired with an fpmr that repeats an accumulator copy already done
     above, so those copies do not change any value. */
  1312. LFPDUX A1, AO, INC4
  1313. fpmr f5, f0
  1314. LFPDUX A3, AO, INC4
  1315. fpmr f9, f0
  1316. LFPDUX B1, BO, INC4
  1317. fpmr f13, f0
  1318. LFPDUX A5, AO, INC4
  1319. fpmr f2, f0
  1320. LFPDUX A6, AO, INC4
  1321. fpmr f6, f0
  1322. LFPDUX B3, BO, INC4
  1323. fpmr f10, f0
  1324. LFPDUX A7, AO, INC4
  1325. fpmr f14, f0
  1326. LFPDUX A8, AO, INC4
  1327. fpmr f3, f0
  1328. LFPDUX B5, BO, INC4
  1329. fpmr f7, f0
  1330. LFPDUX A9, AO, INC4
  1331. fpmr f11, f0
  1332. LFPDUX A2, AO2, INC4
  1333. fpmr f15, f0
  1334. LFPDUX B2, BO2, INC4
  /* Decrement CTR; if this was the only pass go straight to the drain. */
  1335. bdz- .L13
  /* .L12: steady-state k-loop of the 8x4 tile, four k-steps per pass
     (the "## n ##" sections).  Every section issues sixteen paired
     multiply-adds, one into each accumulator f0-f15:
       f0/f4/f8/f12   use the AO element   A1, A5, A7, A9  (sections 1-4)
       f1/f5/f9/f13   use the AO2 element  A2
       f2/f6/f10/f14  use the AO element   A3, A6, A8, A10
       f3/f7/f11/f15  use the AO2 element  A4
     The B operand pair is (B1,B2), (B3,B4), (B5,B2), (B6,B4) for sections
     1-4; fxcpmadd / fxcsmadd select the primary / secondary half of that
     B register as the multiplier (Double Hummer cross multiply-add).
     Loads for later sections and for the next pass are interleaved with
     the arithmetic.  A2, A4, B2 and B4 are recycled within a pass, each
     being reloaded only after its last use, so the instruction order
     must be preserved. */
  1336. .align 4
  1337. .L12:
  1338. ## 1 ##
  1339. fxcpmadd f0, B1, A1, f0
  1340. nop
  1341. fxcsmadd f4, B1, A1, f4
  1342. nop
  1343. fxcpmadd f8, B2, A1, f8
  1344. LFPDUX B4, BO2, INC4
  1345. fxcsmadd f12, B2, A1, f12
  1346. LFPDUX B6, BO, INC4
  1347. fxcpmadd f1, B1, A2, f1
  1348. nop
  1349. fxcsmadd f5, B1, A2, f5
  1350. LFPDUX A4, AO2, INC4
  1351. fxcpmadd f9, B2, A2, f9
  1352. LFPDUX A10, AO, INC4
  1353. fxcsmadd f13, B2, A2, f13
  1354. nop
  1355. fxcpmadd f2, B1, A3, f2
  1356. nop
  1357. fxcsmadd f6, B1, A3, f6
  1358. nop
  1359. fxcpmadd f10, B2, A3, f10
  1360. nop
  1361. fxcsmadd f14, B2, A3, f14
  1362. nop
  1363. fxcpmadd f3, B1, A4, f3
  1364. nop
  1365. fxcsmadd f7, B1, A4, f7
  1366. LFPDUX A2, AO2, INC4
  1367. fxcpmadd f11, B2, A4, f11
  /* A1 is free from here on: refill it for section 1 of the next pass. */
  1368. LFPDUX A1, AO, INC4
  1369. fxcsmadd f15, B2, A4, f15
  1370. nop
  1371. ## 2 ##
  1372. fxcpmadd f0, B3, A5, f0
  1373. nop
  1374. fxcsmadd f4, B3, A5, f4
  1375. nop
  1376. fxcpmadd f8, B4, A5, f8
  1377. LFPDUX B2, BO2, INC4
  1378. fxcsmadd f12, B4, A5, f12
  1379. LFPDUX B1, BO, INC4
  1380. fxcpmadd f1, B3, A2, f1
  1381. nop
  1382. fxcsmadd f5, B3, A2, f5
  1383. LFPDUX A4, AO2, INC4
  1384. fxcpmadd f9, B4, A2, f9
  1385. LFPDUX A3, AO, INC4
  1386. fxcsmadd f13, B4, A2, f13
  1387. nop
  1388. fxcpmadd f2, B3, A6, f2
  1389. nop
  1390. fxcsmadd f6, B3, A6, f6
  1391. nop
  1392. fxcpmadd f10, B4, A6, f10
  1393. nop
  1394. fxcsmadd f14, B4, A6, f14
  1395. nop
  1396. fxcpmadd f3, B3, A4, f3
  1397. nop
  1398. fxcsmadd f7, B3, A4, f7
  1399. LFPDUX A2, AO2, INC4
  1400. fxcpmadd f11, B4, A4, f11
  1401. LFPDUX A5, AO, INC4
  1402. fxcsmadd f15, B4, A4, f15
  1403. nop
  1404. ## 3 ##
  1405. fxcpmadd f0, B5, A7, f0
  1406. nop
  1407. fxcsmadd f4, B5, A7, f4
  1408. nop
  1409. fxcpmadd f8, B2, A7, f8
  1410. LFPDUX B4, BO2, INC4
  1411. fxcsmadd f12, B2, A7, f12
  1412. LFPDUX B3, BO, INC4
  1413. fxcpmadd f1, B5, A2, f1
  1414. nop
  1415. fxcsmadd f5, B5, A2, f5
  1416. LFPDUX A4, AO2, INC4
  1417. fxcpmadd f9, B2, A2, f9
  1418. LFPDUX A6, AO, INC4
  1419. fxcsmadd f13, B2, A2, f13
  1420. nop
  1421. fxcpmadd f2, B5, A8, f2
  1422. nop
  1423. fxcsmadd f6, B5, A8, f6
  1424. nop
  1425. fxcpmadd f10, B2, A8, f10
  1426. nop
  1427. fxcsmadd f14, B2, A8, f14
  1428. nop
  1429. fxcpmadd f3, B5, A4, f3
  1430. nop
  1431. fxcsmadd f7, B5, A4, f7
  1432. LFPDUX A2, AO2, INC4
  1433. fxcpmadd f11, B2, A4, f11
  1434. LFPDUX A7, AO, INC4
  1435. fxcsmadd f15, B2, A4, f15
  1436. nop
  1437. ## 4 ##
  1438. fxcpmadd f0, B6, A9, f0
  1439. nop
  1440. fxcsmadd f4, B6, A9, f4
  1441. nop
  1442. fxcpmadd f8, B4, A9, f8
  1443. LFPDUX B2, BO2, INC4
  1444. fxcsmadd f12, B4, A9, f12
  1445. LFPDUX B5, BO, INC4
  1446. fxcpmadd f1, B6, A2, f1
  1447. nop
  1448. fxcsmadd f5, B6, A2, f5
  1449. LFPDUX A4, AO2, INC4
  1450. fxcpmadd f9, B4, A2, f9
  1451. LFPDUX A8, AO, INC4
  1452. fxcsmadd f13, B4, A2, f13
  1453. nop
  1454. fxcpmadd f2, B6, A10, f2
  1455. nop
  1456. fxcsmadd f6, B6, A10, f6
  1457. nop
  1458. fxcpmadd f10, B4, A10, f10
  1459. nop
  1460. fxcsmadd f14, B4, A10, f14
  1461. nop
  1462. fxcpmadd f3, B6, A4, f3
  1463. LFPDUX A2, AO2, INC4
  1464. fxcsmadd f7, B6, A4, f7
  1465. LFPDUX A9, AO, INC4
  1466. fxcpmadd f11, B4, A4, f11
  1467. nop
  1468. fxcsmadd f15, B4, A4, f15
  /* Decrement CTR and repeat while it is non-zero; the last pass falls
     through into .L13. */
  1469. bdnz+ .L12
  /* .L13: final pass of the unrolled k-loop (pipeline drain).  It performs
     the same 64 multiply-adds as one pass of .L12 on the operands that
     were already preloaded, but only issues the loads its own remaining
     sections still consume (B4, B6, A10 and the AO2 stream A2/A4); the
     loads that .L12 uses to prefetch the next pass are replaced by nop or
     left out, so AO/BO stop right after the last element that was used. */
  1470. .align 4
  1471. .L13:
  1472. ## 1 ##
  1473. fxcpmadd f0, B1, A1, f0
  1474. nop
  1475. fxcsmadd f4, B1, A1, f4
  1476. nop
  1477. fxcpmadd f8, B2, A1, f8
  1478. LFPDUX B4, BO2, INC4
  1479. fxcsmadd f12, B2, A1, f12
  1480. LFPDUX B6, BO, INC4
  1481. fxcpmadd f1, B1, A2, f1
  1482. nop
  1483. fxcsmadd f5, B1, A2, f5
  1484. LFPDUX A4, AO2, INC4
  1485. fxcpmadd f9, B2, A2, f9
  1486. LFPDUX A10, AO, INC4
  1487. fxcsmadd f13, B2, A2, f13
  1488. nop
  1489. fxcpmadd f2, B1, A3, f2
  1490. nop
  1491. fxcsmadd f6, B1, A3, f6
  1492. nop
  1493. fxcpmadd f10, B2, A3, f10
  1494. nop
  1495. fxcsmadd f14, B2, A3, f14
  1496. nop
  1497. fxcpmadd f3, B1, A4, f3
  1498. nop
  1499. fxcsmadd f7, B1, A4, f7
  1500. LFPDUX A2, AO2, INC4
  1501. fxcpmadd f11, B2, A4, f11
  1502. nop
  1503. fxcsmadd f15, B2, A4, f15
  1504. nop
  1505. ## 2 ##
  1506. fxcpmadd f0, B3, A5, f0
  1507. nop
  1508. fxcsmadd f4, B3, A5, f4
  1509. nop
  1510. fxcpmadd f8, B4, A5, f8
  1511. LFPDUX B2, BO2, INC4
  1512. fxcsmadd f12, B4, A5, f12
  1513. nop
  1514. fxcpmadd f1, B3, A2, f1
  1515. nop
  1516. fxcsmadd f5, B3, A2, f5
  1517. LFPDUX A4, AO2, INC4
  1518. fxcpmadd f9, B4, A2, f9
  1519. nop
  1520. fxcsmadd f13, B4, A2, f13
  1521. nop
  1522. fxcpmadd f2, B3, A6, f2
  1523. nop
  1524. fxcsmadd f6, B3, A6, f6
  1525. nop
  1526. fxcpmadd f10, B4, A6, f10
  1527. nop
  1528. fxcsmadd f14, B4, A6, f14
  1529. nop
  1530. fxcpmadd f3, B3, A4, f3
  1531. nop
  1532. fxcsmadd f7, B3, A4, f7
  1533. LFPDUX A2, AO2, INC4
  1534. fxcpmadd f11, B4, A4, f11
  1535. nop
  1536. fxcsmadd f15, B4, A4, f15
  1537. nop
  1538. ## 3 ##
  1539. fxcpmadd f0, B5, A7, f0
  1540. nop
  1541. fxcsmadd f4, B5, A7, f4
  1542. nop
  1543. fxcpmadd f8, B2, A7, f8
  1544. LFPDUX B4, BO2, INC4
  1545. fxcsmadd f12, B2, A7, f12
  1546. nop
  1547. fxcpmadd f1, B5, A2, f1
  1548. nop
  1549. fxcsmadd f5, B5, A2, f5
  1550. LFPDUX A4, AO2, INC4
  1551. fxcpmadd f9, B2, A2, f9
  1552. nop
  /* The next two multiply-adds are back to back, without the filler nop
     used everywhere else in this section. */
  1553. fxcsmadd f13, B2, A2, f13
  1554. fxcpmadd f2, B5, A8, f2
  1555. nop
  1556. fxcsmadd f6, B5, A8, f6
  1557. nop
  1558. fxcpmadd f10, B2, A8, f10
  1559. nop
  1560. fxcsmadd f14, B2, A8, f14
  1561. nop
  1562. fxcpmadd f3, B5, A4, f3
  1563. nop
  1564. fxcsmadd f7, B5, A4, f7
  1565. LFPDUX A2, AO2, INC4
  1566. fxcpmadd f11, B2, A4, f11
  1567. nop
  1568. fxcsmadd f15, B2, A4, f15
  1569. nop
  1570. ## 4 ##
  1571. fxcpmadd f0, B6, A9, f0
  1572. nop
  1573. fxcsmadd f4, B6, A9, f4
  1574. nop
  1575. fxcpmadd f8, B4, A9, f8
  1576. nop
  1577. fxcsmadd f12, B4, A9, f12
  1578. nop
  1579. fxcpmadd f1, B6, A2, f1
  1580. nop
  1581. fxcsmadd f5, B6, A2, f5
  /* Last load of the unrolled loop: the AO2 element for f3/f7/f11/f15. */
  1582. LFPDUX A4, AO2, INC4
  1583. fxcpmadd f9, B4, A2, f9
  1584. nop
  1585. fxcsmadd f13, B4, A2, f13
  1586. nop
  1587. fxcpmadd f2, B6, A10, f2
  1588. nop
  1589. fxcsmadd f6, B6, A10, f6
  1590. nop
  1591. fxcpmadd f10, B4, A10, f10
  1592. nop
  1593. fxcsmadd f14, B4, A10, f14
  1594. nop
  1595. fxcpmadd f3, B6, A4, f3
  1596. nop
  1597. fxcsmadd f7, B6, A4, f7
  1598. nop
  1599. fxcpmadd f11, B4, A4, f11
  1600. nop
  1601. fxcsmadd f15, B4, A4, f15
  1602. nop
  .align 4
  .L14:
  /* Leftover k-steps of the 8x4 tile.  CTR = (trip count) & 3, where the
     trip count is KK for LT/RN and TEMP for LN/RT.  Only the source
     register differs between the variants, so the CTR load and the
     branch are shared.  A zero remainder skips straight to .L18. */
  #if defined(LT) || defined(RN)
  andi. r0, KK, 3
  #else
  andi. r0, TEMP, 3
  #endif
  mtspr CTR, r0
  ble+ .L18
  /* .L15 / .L16 / .L17: k-remainder loop of the 8x4 tile, one k-step per
     iteration (CTR was loaded at .L14).  Per step the AO stream supplies
     two elements and the AO2 stream two elements; only the registers A2
     (AO) and A4 (AO2) are used for them, so each is reloaded in the
     middle of the step after its first use.  A10 (from BO) and B4 (from
     BO2) hold the B operands. */
  1614. .align 4
  1615. .L15:
  /* Preload the first AO/AO2/BO/BO2 elements of the first leftover step. */
  1616. LFPDUX A2, AO, INC4
  1617. LFPDUX A4, AO2, INC4
  1618. LFPDUX A10, BO, INC4
  1619. LFPDUX B4, BO2, INC4
  /* Only one leftover step: skip the loop body and finish in .L17. */
  1620. bdz- .L17
  1621. .align 4
  1622. .L16:
  1623. fxcpmadd f0, A10, A2, f0
  1624. fxcsmadd f4, A10, A2, f4
  1625. fxcpmadd f8, B4, A2, f8
  1626. fxcsmadd f12, B4, A2, f12
  /* A2 is done for f0/f4/f8/f12: fetch the second AO element (f2 group). */
  1627. LFPDUX A2, AO, INC4
  1628. fxcpmadd f1, A10, A4, f1
  1629. fxcsmadd f5, A10, A4, f5
  1630. fxcpmadd f9, B4, A4, f9
  1631. fxcsmadd f13, B4, A4, f13
  /* Likewise the second AO2 element (f3 group). */
  1632. LFPDUX A4, AO2, INC4
  1633. fxcpmadd f2, A10, A2, f2
  1634. fxcsmadd f6, A10, A2, f6
  1635. fxcpmadd f10, B4, A2, f10
  1636. fxcsmadd f14, B4, A2, f14
  /* From here the loads belong to the next k-step. */
  1637. LFPDUX A2, AO, INC4
  1638. fxcpmadd f3, A10, A4, f3
  1639. fxcsmadd f7, A10, A4, f7
  1640. LFPDUX A10, BO, INC4
  1641. fxcpmadd f11, B4, A4, f11
  1642. fxcsmadd f15, B4, A4, f15
  1643. LFPDUX A4, AO2, INC4
  1644. LFPDUX B4, BO2, INC4
  1645. bdnz+ .L16
  1646. .align 4
  /* .L17: last leftover step -- same arithmetic as .L16 without the loads
     for a following step. */
  1647. .L17:
  1648. fxcpmadd f0, A10, A2, f0
  1649. fxcsmadd f4, A10, A2, f4
  1650. fxcpmadd f8, B4, A2, f8
  1651. fxcsmadd f12, B4, A2, f12
  1652. LFPDUX A2, AO, INC4
  1653. fxcpmadd f1, A10, A4, f1
  1654. fxcsmadd f5, A10, A4, f5
  1655. fxcpmadd f9, B4, A4, f9
  1656. fxcsmadd f13, B4, A4, f13
  1657. LFPDUX A4, AO2, INC4
  1658. fxcpmadd f2, A10, A2, f2
  1659. fxcsmadd f6, A10, A2, f6
  1660. fxcpmadd f10, B4, A2, f10
  1661. fxcsmadd f14, B4, A2, f14
  1662. fxcpmadd f3, A10, A4, f3
  1663. fxcsmadd f7, A10, A4, f7
  1664. fxcpmadd f11, B4, A4, f11
  1665. fxcsmadd f15, B4, A4, f15
  /* .L18: the sixteen accumulators f0-f15 now hold the accumulated
     products for the 8x4 tile.  This phase re-points the buffers at the
     block to be solved and replaces each accumulator by
     (packed right-hand side) - (accumulated product). */
  1666. .align 4
  1667. .L18:
  1668. #if defined(LN) || defined(RT)
  /* LN/RT: rebuild AO/BO from AORIG/B with the index KK - 8 (LN) or
     KK - 4 (RT), scaled by 8 elements per step in A and 4 in B, then
     re-create the AO2/BO2 companion cursors and the update-form bias. */
  1669. #ifdef LN
  1670. subi r0, KK, 8
  1671. #else
  1672. subi r0, KK, 4
  1673. #endif
  1674. slwi TEMP, r0, 3 + BASE_SHIFT
  1675. slwi r0, r0, 2 + BASE_SHIFT
  1676. add AO, AORIG, TEMP
  1677. add BO, B, r0
  1678. addi AO2, AO, 2 * SIZE
  1679. addi BO, BO, - 4 * SIZE
  1680. addi BO2, BO, 2 * SIZE
  1681. #endif
  1682. #if defined(LN) || defined(LT)
  /* LN/LT: the right-hand side is read from the packed B buffer.  Before
     subtracting, the accumulators are shuffled pairwise with fsmfp/fsmtp
     (Double Hummer moves between primary and secondary halves; together
     they look like a 2x2 transpose of each pair such as f0/f4 -- verify
     against the ISA reference).  f24-f31 first serve as scratch copies
     for that shuffle and are then reused as load targets; each one is
     reloaded only after the fsmtp that reads it, so the interleaving
     below must not be reordered. */
  1683. fpmr f24, f0
  1684. LFPDUX f16, BO, INC4
  1685. fpmr f25, f1
  1686. nop
  1687. fpmr f26, f2
  1688. LFPDUX f17, BO2, INC4
  1689. fpmr f27, f3
  1690. nop
  1691. fpmr f28, f8
  1692. LFPDUX f18, BO, INC4
  1693. fpmr f29, f9
  1694. nop
  1695. fpmr f30, f10
  1696. LFPDUX f19, BO2, INC4
  1697. fpmr f31, f11
  1698. nop
  1699. fsmfp f0, f4
  1700. LFPDUX f20, BO, INC4
  1701. fsmfp f1, f5
  1702. nop
  1703. fsmfp f2, f6
  1704. LFPDUX f21, BO2, INC4
  1705. fsmfp f3, f7
  1706. nop
  1707. fsmfp f8, f12
  1708. LFPDUX f22, BO, INC4
  1709. fsmfp f9, f13
  1710. nop
  1711. fsmfp f10, f14
  1712. LFPDUX f23, BO2, INC4
  1713. fsmfp f11, f15
  1714. nop
  1715. fsmtp f4, f24
  1716. LFPDUX f24, BO, INC4
  1717. fsmtp f5, f25
  1718. nop
  1719. fsmtp f6, f26
  1720. LFPDUX f25, BO2, INC4
  1721. fsmtp f7, f27
  1722. nop
  1723. fsmtp f12, f28
  1724. LFPDUX f26, BO, INC4
  1725. fsmtp f13, f29
  1726. nop
  1727. fsmtp f14, f30
  1728. LFPDUX f27, BO2, INC4
  1729. fsmtp f15, f31
  1730. nop
  /* Subtract: f0,f8,f4,f12,f1,f9,f5,f13,f2,f10,f6,f14,f3,f11,f7,f15 are
     paired with f16..f31 in load order (BO and BO2 alternate). */
  1731. fpsub f0, f16, f0
  1732. LFPDUX f28, BO, INC4
  1733. fpsub f8, f17, f8
  1734. nop
  1735. fpsub f4, f18, f4
  1736. LFPDUX f29, BO2, INC4
  1737. fpsub f12, f19, f12
  1738. nop
  1739. fpsub f1, f20, f1
  1740. LFPDUX f30, BO, INC4
  1741. fpsub f9, f21, f9
  /* Eight update-form loads were issued on each cursor; step both back by
     32 * SIZE so the solved values can later be stored to the same place. */
  1742. subi BO, BO, 32 * SIZE
  1743. fpsub f5, f22, f5
  1744. LFPDUX f31, BO2, INC4
  1745. fpsub f13, f23, f13
  1746. subi BO2, BO2, 32 * SIZE
  1747. fpsub f2, f24, f2
  1748. fpsub f10, f25, f10
  1749. fpsub f6, f26, f6
  1750. fpsub f14, f27, f14
  1751. fpsub f3, f28, f3
  1752. fpsub f11, f29, f11
  1753. fpsub f7, f30, f7
  1754. fpsub f15, f31, f15
  1755. #else
  /* RN/RT: the right-hand side is read from the packed A buffer, no
     shuffle is needed; f0..f15 pair with f16..f31 in register order.
     AO/AO2 are stepped back by 32 * SIZE after their eight loads each. */
  1756. LFPDUX f16, AO, INC4
  1757. LFPDUX f17, AO2, INC4
  1758. LFPDUX f18, AO, INC4
  1759. LFPDUX f19, AO2, INC4
  1760. LFPDUX f20, AO, INC4
  1761. LFPDUX f21, AO2, INC4
  1762. LFPDUX f22, AO, INC4
  1763. LFPDUX f23, AO2, INC4
  1764. fpsub f0, f16, f0
  1765. LFPDUX f24, AO, INC4
  1766. fpsub f1, f17, f1
  1767. LFPDUX f25, AO2, INC4
  1768. fpsub f2, f18, f2
  1769. LFPDUX f26, AO, INC4
  1770. fpsub f3, f19, f3
  1771. LFPDUX f27, AO2, INC4
  1772. fpsub f4, f20, f4
  1773. LFPDUX f28, AO, INC4
  1774. fpsub f5, f21, f5
  1775. LFPDUX f29, AO2, INC4
  1776. fpsub f6, f22, f6
  1777. LFPDUX f30, AO, INC4
  1778. fpsub f7, f23, f7
  1779. LFPDUX f31, AO2, INC4
  1780. fpsub f8, f24, f8
  1781. subi AO, AO, 32 * SIZE
  1782. fpsub f9, f25, f9
  1783. subi AO2, AO2, 32 * SIZE
  1784. fpsub f10, f26, f10
  1785. fpsub f11, f27, f11
  1786. fpsub f12, f28, f12
  1787. fpsub f13, f29, f13
  1788. fpsub f14, f30, f14
  1789. fpsub f15, f31, f15
  1790. #endif
  /* LN solve for the 8x4 tile.  The 8x8 triangular block of packed A is
     read from its end towards its start: AO/AO2 are first advanced by
     68 * SIZE and then walked with INCM4 (presumably -4 * SIZE; set
     outside this excerpt), bare "add ..., INCM4" lines skipping elements
     that are not loaded.  Register pairs are resolved in the order
     f7/f15, f3/f11, f6/f14, f2/f10, f5/f13, f1/f9, f4/f12, f0/f8: each
     pair is scaled by its diagonal factor (fxsmul / fxpmul use the
     secondary / primary half of the A register) and then eliminated
     from all pairs still pending (fxcsnmsub / fxcpnmsub).  The diagonal
     is applied by multiplication, so packed A presumably stores inverted
     diagonal entries -- confirm against the packing routine. */
  1791. #ifdef LN
  1792. addi AO, AO, 68 * SIZE
  1793. addi AO2, AO2, 68 * SIZE
  1794. LFPDUX A1, AO2, INCM4
  1795. LFPDUX A2, AO, INCM4
  1796. LFPDUX A3, AO2, INCM4
  1797. LFPDUX A4, AO, INCM4
  1798. LFPDUX A5, AO2, INCM4
  1799. LFPDUX A6, AO, INCM4
  1800. LFPDUX A7, AO2, INCM4
  1801. LFPDUX A8, AO, INCM4
  /* Resolve f7/f15, eliminate it everywhere, then resolve f3/f11. */
  1802. fxsmul f7, A1, f7
  1803. fxsmul f15, A1, f15
  1804. fxcpnmsub f3, A1, f7, f3
  1805. fxcpnmsub f11, A1, f15, f11
  1806. fxcsnmsub f6, A2, f7, f6
  1807. fxcsnmsub f14, A2, f15, f14
  1808. fxcpnmsub f2, A2, f7, f2
  1809. fxcpnmsub f10, A2, f15, f10
  1810. fxcsnmsub f5, A3, f7, f5
  1811. fxcsnmsub f13, A3, f15, f13
  1812. fxcpnmsub f1, A3, f7, f1
  1813. fxcpnmsub f9, A3, f15, f9
  1814. fxcsnmsub f4, A4, f7, f4
  1815. fxcsnmsub f12, A4, f15, f12
  1816. fxcpnmsub f0, A4, f7, f0
  1817. fxcpnmsub f8, A4, f15, f8
  1818. fxpmul f3, A5, f3
  1819. fxpmul f11, A5, f11
  1820. fxcsnmsub f6, A6, f3, f6
  1821. fxcsnmsub f14, A6, f11, f14
  1822. fxcpnmsub f2, A6, f3, f2
  1823. fxcpnmsub f10, A6, f11, f10
  1824. fxcsnmsub f5, A7, f3, f5
  1825. fxcsnmsub f13, A7, f11, f13
  1826. fxcpnmsub f1, A7, f3, f1
  1827. fxcpnmsub f9, A7, f11, f9
  1828. fxcsnmsub f4, A8, f3, f4
  1829. fxcsnmsub f12, A8, f11, f12
  1830. fxcpnmsub f0, A8, f3, f0
  1831. fxcpnmsub f8, A8, f11, f8
  /* Second batch of A factors: resolve f6/f14, f2/f10 and f5/f13. */
  1832. add AO2, AO2, INCM4
  1833. LFPDUX A1, AO, INCM4
  1834. LFPDUX A2, AO2, INCM4
  1835. LFPDUX A3, AO, INCM4
  1836. add AO2, AO2, INCM4
  1837. LFPDUX A4, AO, INCM4
  1838. LFPDUX A5, AO2, INCM4
  1839. LFPDUX A6, AO, INCM4
  1840. add AO2, AO2, INCM4
  1841. add AO, AO, INCM4
  1842. LFPDUX A7, AO2, INCM4
  1843. LFPDUX A8, AO, INCM4
  1844. fxsmul f6, A1, f6
  1845. fxsmul f14, A1, f14
  1846. fxcpnmsub f2, A1, f6, f2
  1847. fxcpnmsub f10, A1, f14, f10
  1848. fxcsnmsub f5, A2, f6, f5
  1849. fxcsnmsub f13, A2, f14, f13
  1850. fxcpnmsub f1, A2, f6, f1
  1851. fxcpnmsub f9, A2, f14, f9
  1852. fxcsnmsub f4, A3, f6, f4
  1853. fxcsnmsub f12, A3, f14, f12
  1854. fxcpnmsub f0, A3, f6, f0
  1855. fxcpnmsub f8, A3, f14, f8
  1856. fxpmul f2, A4, f2
  1857. fxpmul f10, A4, f10
  1858. fxcsnmsub f5, A5, f2, f5
  1859. fxcsnmsub f13, A5, f10, f13
  1860. fxcpnmsub f1, A5, f2, f1
  1861. fxcpnmsub f9, A5, f10, f9
  1862. fxcsnmsub f4, A6, f2, f4
  1863. fxcsnmsub f12, A6, f10, f12
  1864. fxcpnmsub f0, A6, f2, f0
  1865. fxcpnmsub f8, A6, f10, f8
  1866. fxsmul f5, A7, f5
  1867. fxsmul f13, A7, f13
  1868. fxcpnmsub f1, A7, f5, f1
  1869. fxcpnmsub f9, A7, f13, f9
  1870. fxcsnmsub f4, A8, f5, f4
  1871. fxcsnmsub f12, A8, f13, f12
  1872. fxcpnmsub f0, A8, f5, f0
  1873. fxcpnmsub f8, A8, f13, f8
  /* Last batch: resolve f1/f9, f4/f12 and f0/f8.  The pointer adjustments
     mixed into the loads bring AO/AO2 back to the values they had on
     entry to this section (given INCM4 = -4 * SIZE). */
  1874. add AO2, AO2, INCM4
  1875. add AO, AO, INCM4
  1876. LFPDUX A1, AO2, INCM4
  1877. LFPDUX A2, AO, INCM4
  1878. subi AO2, AO2, 8 * SIZE
  1879. add AO, AO, INCM4
  1880. LFPDUX A3, AO, INCM4
  1881. subi AO2, AO2, 8 * SIZE
  1882. add AO, AO, INCM4
  1883. LFPDUX A4, AO, INCM4
  1884. addi AO, AO, -4 * SIZE
  1885. addi AO2, AO2, -4 * SIZE
  1886. fxpmul f1, A1, f1
  1887. fxpmul f9, A1, f9
  1888. fxcsnmsub f4, A2, f1, f4
  1889. fxcsnmsub f12, A2, f9, f12
  1890. fxcpnmsub f0, A2, f1, f0
  1891. fxcpnmsub f8, A2, f9, f8
  1892. fxsmul f4, A3, f4
  1893. fxsmul f12, A3, f12
  1894. fxcpnmsub f0, A3, f4, f0
  1895. fxcpnmsub f8, A3, f12, f8
  1896. fxpmul f0, A4, f0
  1897. fxpmul f8, A4, f8
  1898. #endif
  /* LT solve for the 8x4 tile.  Mirror image of the LN case: packed A is
     read forwards with INC4 from the current AO/AO2 position, and the
     register pairs are resolved in the order f0/f8, f4/f12, f1/f9,
     f5/f13, f2/f10, f6/f14, f3/f11, f7/f15.  Each pair is scaled by its
     diagonal factor (fxpmul / fxsmul: primary / secondary half of the A
     register) and then eliminated from every pair still pending
     (fxcpnmsub / fxcsnmsub).  Bare "add"/"addi" lines skip elements of
     the block that are not loaded. */
  1899. #ifdef LT
  1900. LFPDUX A1, AO, INC4
  1901. LFPDUX A2, AO2, INC4
  1902. LFPDUX A3, AO, INC4
  1903. LFPDUX A4, AO2, INC4
  1904. LFPDUX A5, AO, INC4
  1905. LFPDUX A6, AO2, INC4
  1906. LFPDUX A7, AO, INC4
  1907. LFPDUX A8, AO2, INC4
  /* Resolve f0/f8, eliminate it everywhere, then resolve f4/f12. */
  1908. fxpmul f0, A1, f0
  1909. fxpmul f8, A1, f8
  1910. fxcsnmsub f4, A1, f0, f4
  1911. fxcsnmsub f12, A1, f8, f12
  1912. fxcpnmsub f1, A2, f0, f1
  1913. fxcpnmsub f9, A2, f8, f9
  1914. fxcsnmsub f5, A2, f0, f5
  1915. fxcsnmsub f13, A2, f8, f13
  1916. fxcpnmsub f2, A3, f0, f2
  1917. fxcpnmsub f10, A3, f8, f10
  1918. fxcsnmsub f6, A3, f0, f6
  1919. fxcsnmsub f14, A3, f8, f14
  1920. fxcpnmsub f3, A4, f0, f3
  1921. fxcpnmsub f11, A4, f8, f11
  1922. fxcsnmsub f7, A4, f0, f7
  1923. fxcsnmsub f15, A4, f8, f15
  1924. fxsmul f4, A5, f4
  1925. fxsmul f12, A5, f12
  1926. fxcpnmsub f1, A6, f4, f1
  1927. fxcpnmsub f9, A6, f12, f9
  1928. fxcsnmsub f5, A6, f4, f5
  1929. fxcsnmsub f13, A6, f12, f13
  1930. fxcpnmsub f2, A7, f4, f2
  1931. fxcpnmsub f10, A7, f12, f10
  1932. fxcsnmsub f6, A7, f4, f6
  1933. fxcsnmsub f14, A7, f12, f14
  1934. fxcpnmsub f3, A8, f4, f3
  1935. fxcpnmsub f11, A8, f12, f11
  1936. fxcsnmsub f7, A8, f4, f7
  1937. fxcsnmsub f15, A8, f12, f15
  /* Second batch of A factors: resolve f1/f9, f5/f13 and f2/f10. */
  1938. add AO, AO, INC4
  1939. LFPDUX A1, AO2, INC4
  1940. LFPDUX A2, AO, INC4
  1941. LFPDUX A3, AO2, INC4
  1942. add AO, AO, INC4
  1943. LFPDUX A4, AO2, INC4
  1944. LFPDUX A5, AO, INC4
  1945. LFPDUX A6, AO2, INC4
  1946. add AO, AO, INC4
  1947. add AO2, AO2, INC4
  1948. LFPDUX A7, AO, INC4
  1949. LFPDUX A8, AO2, INC4
  1950. fxpmul f1, A1, f1
  1951. fxpmul f9, A1, f9
  1952. fxcsnmsub f5, A1, f1, f5
  1953. fxcsnmsub f13, A1, f9, f13
  1954. fxcpnmsub f2, A2, f1, f2
  1955. fxcpnmsub f10, A2, f9, f10
  1956. fxcsnmsub f6, A2, f1, f6
  1957. fxcsnmsub f14, A2, f9, f14
  1958. fxcpnmsub f3, A3, f1, f3
  1959. fxcpnmsub f11, A3, f9, f11
  1960. fxcsnmsub f7, A3, f1, f7
  1961. fxcsnmsub f15, A3, f9, f15
  1962. fxsmul f5, A4, f5
  1963. fxsmul f13, A4, f13
  1964. fxcpnmsub f2, A5, f5, f2
  1965. fxcpnmsub f10, A5, f13, f10
  1966. fxcsnmsub f6, A5, f5, f6
  1967. fxcsnmsub f14, A5, f13, f14
  1968. fxcpnmsub f3, A6, f5, f3
  1969. fxcpnmsub f11, A6, f13, f11
  1970. fxcsnmsub f7, A6, f5, f7
  1971. fxcsnmsub f15, A6, f13, f15
  1972. fxpmul f2, A7, f2
  1973. fxpmul f10, A7, f10
  1974. fxcsnmsub f6, A7, f2, f6
  1975. fxcsnmsub f14, A7, f10, f14
  1976. fxcpnmsub f3, A8, f2, f3
  1977. fxcpnmsub f11, A8, f10, f11
  1978. fxcsnmsub f7, A8, f2, f7
  1979. fxcsnmsub f15, A8, f10, f15
  /* Last batch: resolve f6/f14, f3/f11 and f7/f15.  By then both cursors
     have advanced 64 * SIZE in total (given INC4 = 4 * SIZE), which the
     two subi lines undo. */
  1980. add AO, AO, INC4
  1981. add AO2, AO2, INC4
  1982. LFPDUX A1, AO, INC4
  1983. LFPDUX A2, AO2, INC4
  1984. addi AO, AO, 8 * SIZE
  1985. addi AO2, AO2, 4 * SIZE
  1986. LFPDUX A3, AO2, INC4
  1987. addi AO, AO, 8 * SIZE
  1988. addi AO2, AO2, 4 * SIZE
  1989. LFPDUX A4, AO2, INC4
  1990. subi AO, AO, 64 * SIZE
  1991. subi AO2, AO2, 64 * SIZE
  1992. fxsmul f6, A1, f6
  1993. fxsmul f14, A1, f14
  1994. fxcpnmsub f3, A2, f6, f3
  1995. fxcpnmsub f11, A2, f14, f11
  1996. fxcsnmsub f7, A2, f6, f7
  1997. fxcsnmsub f15, A2, f14, f15
  1998. fxpmul f3, A3, f3
  1999. fxpmul f11, A3, f11
  2000. fxcsnmsub f7, A3, f3, f7
  2001. fxcsnmsub f15, A3, f11, f15
  2002. fxsmul f7, A4, f7
  2003. fxsmul f15, A4, f15
  2004. #endif
  /* RN solve for the 8x4 tile.  Six factor registers A1-A6 are loaded from
     the packed B block (BO/BO2, forwards; bare "add" lines skip elements
     that are not loaded) and both cursors are then stepped back by
     16 * SIZE.  The accumulators are treated as four groups of four
     registers, resolved in the order f0-f3, f4-f7, f8-f11, f12-f15: a
     group is scaled by its diagonal factor (fxpmul / fxsmul) and then
     eliminated from the later groups (fxcsnmsub / fxcpnmsub). */
  2005. #ifdef RN
  2006. LFPDUX A1, BO, INC4
  2007. LFPDUX A2, BO2, INC4
  2008. LFPDUX A3, BO, INC4
  2009. LFPDUX A4, BO2, INC4
  2010. add BO, BO, INC4
  2011. LFPDUX A5, BO2, INC4
  2012. add BO, BO, INC4
  2013. LFPDUX A6, BO2, INC4
  2014. subi BO, BO, 16 * SIZE
  2015. subi BO2, BO2, 16 * SIZE
  /* Group f0-f3: scale, then eliminate from the three later groups. */
  2016. fxpmul f0, A1, f0
  2017. fxpmul f1, A1, f1
  2018. fxpmul f2, A1, f2
  2019. fxpmul f3, A1, f3
  2020. fxcsnmsub f4, A1, f0, f4
  2021. fxcsnmsub f5, A1, f1, f5
  2022. fxcsnmsub f6, A1, f2, f6
  2023. fxcsnmsub f7, A1, f3, f7
  2024. fxcpnmsub f8, A2, f0, f8
  2025. fxcpnmsub f9, A2, f1, f9
  2026. fxcpnmsub f10, A2, f2, f10
  2027. fxcpnmsub f11, A2, f3, f11
  2028. fxcsnmsub f12, A2, f0, f12
  2029. fxcsnmsub f13, A2, f1, f13
  2030. fxcsnmsub f14, A2, f2, f14
  2031. fxcsnmsub f15, A2, f3, f15
  /* Group f4-f7. */
  2032. fxsmul f4, A3, f4
  2033. fxsmul f5, A3, f5
  2034. fxsmul f6, A3, f6
  2035. fxsmul f7, A3, f7
  2036. fxcpnmsub f8, A4, f4, f8
  2037. fxcpnmsub f9, A4, f5, f9
  2038. fxcpnmsub f10, A4, f6, f10
  2039. fxcpnmsub f11, A4, f7, f11
  2040. fxcsnmsub f12, A4, f4, f12
  2041. fxcsnmsub f13, A4, f5, f13
  2042. fxcsnmsub f14, A4, f6, f14
  2043. fxcsnmsub f15, A4, f7, f15
  /* Group f8-f11. */
  2044. fxpmul f8, A5, f8
  2045. fxpmul f9, A5, f9
  2046. fxpmul f10, A5, f10
  2047. fxpmul f11, A5, f11
  2048. fxcsnmsub f12, A5, f8, f12
  2049. fxcsnmsub f13, A5, f9, f13
  2050. fxcsnmsub f14, A5, f10, f14
  2051. fxcsnmsub f15, A5, f11, f15
  /* Group f12-f15: only the diagonal scaling remains. */
  2052. fxsmul f12, A6, f12
  2053. fxsmul f13, A6, f13
  2054. fxsmul f14, A6, f14
  2055. fxsmul f15, A6, f15
  2056. #endif
  /* RT solve for the 8x4 tile.  Same four register groups as the RN case
     but resolved in the opposite order (f12-f15, f8-f11, f4-f7, f0-f3).
     The packed B block is read from its end: BO/BO2 are advanced by
     20 * SIZE, walked backwards with INCM4 (presumably -4 * SIZE) and
     finally stepped back another 4 * SIZE, which returns them to their
     entry values under that assumption. */
  2057. #ifdef RT
  2058. addi BO, BO, 20 * SIZE
  2059. addi BO2, BO2, 20 * SIZE
  2060. LFPDUX A1, BO2, INCM4
  2061. LFPDUX A2, BO, INCM4
  2062. LFPDUX A3, BO2, INCM4
  2063. LFPDUX A4, BO, INCM4
  2064. add BO2, BO2, INCM4
  2065. LFPDUX A5, BO, INCM4
  2066. add BO2, BO2, INCM4
  2067. LFPDUX A6, BO, INCM4
  2068. subi BO, BO, 4 * SIZE
  2069. subi BO2, BO2, 4 * SIZE
  /* Group f12-f15: scale, then eliminate from the three earlier groups. */
  2070. fxsmul f12, A1, f12
  2071. fxsmul f13, A1, f13
  2072. fxsmul f14, A1, f14
  2073. fxsmul f15, A1, f15
  2074. fxcpnmsub f8, A1, f12, f8
  2075. fxcpnmsub f9, A1, f13, f9
  2076. fxcpnmsub f10, A1, f14, f10
  2077. fxcpnmsub f11, A1, f15, f11
  2078. fxcsnmsub f4, A2, f12, f4
  2079. fxcsnmsub f5, A2, f13, f5
  2080. fxcsnmsub f6, A2, f14, f6
  2081. fxcsnmsub f7, A2, f15, f7
  2082. fxcpnmsub f0, A2, f12, f0
  2083. fxcpnmsub f1, A2, f13, f1
  2084. fxcpnmsub f2, A2, f14, f2
  2085. fxcpnmsub f3, A2, f15, f3
  /* Group f8-f11. */
  2086. fxpmul f8, A3, f8
  2087. fxpmul f9, A3, f9
  2088. fxpmul f10, A3, f10
  2089. fxpmul f11, A3, f11
  2090. fxcsnmsub f4, A4, f8, f4
  2091. fxcsnmsub f5, A4, f9, f5
  2092. fxcsnmsub f6, A4, f10, f6
  2093. fxcsnmsub f7, A4, f11, f7
  2094. fxcpnmsub f0, A4, f8, f0
  2095. fxcpnmsub f1, A4, f9, f1
  2096. fxcpnmsub f2, A4, f10, f2
  2097. fxcpnmsub f3, A4, f11, f3
  /* Group f4-f7. */
  2098. fxsmul f4, A5, f4
  2099. fxsmul f5, A5, f5
  2100. fxsmul f6, A5, f6
  2101. fxsmul f7, A5, f7
  2102. fxcpnmsub f0, A5, f4, f0
  2103. fxcpnmsub f1, A5, f5, f1
  2104. fxcpnmsub f2, A5, f6, f2
  2105. fxcpnmsub f3, A5, f7, f3
  /* Group f0-f3: only the diagonal scaling remains. */
  2106. fxpmul f0, A6, f0
  2107. fxpmul f1, A6, f1
  2108. fxpmul f2, A6, f2
  2109. fxpmul f3, A6, f3
  2110. #endif
  /* Write-back of the solved 8x4 tile: once into the packed buffer the
     right-hand side was read from (so later tiles see the solved values)
     and once into C through CO1-CO4.  All stores are update-form, so
     their order defines the memory layout. */
  2111. #ifdef LN
  /* LN moves through C backwards: step the four column cursors back by
     8 * SIZE before storing. */
  2112. subi CO1, CO1, 8 * SIZE
  2113. subi CO2, CO2, 8 * SIZE
  2114. subi CO3, CO3, 8 * SIZE
  2115. subi CO4, CO4, 8 * SIZE
  2116. #endif
  2117. #if defined(LN) || defined(LT)
  /* LN/LT: store the register pairs to BO/BO2 in the same interleaved
     order in which the right-hand side was loaded at .L18, then rewind
     both cursors by 32 * SIZE. */
  2118. STFPDUX f0, BO, INC4
  2119. STFPDUX f8, BO2, INC4
  2120. STFPDUX f4, BO, INC4
  2121. STFPDUX f12, BO2, INC4
  2122. STFPDUX f1, BO, INC4
  2123. STFPDUX f9, BO2, INC4
  2124. STFPDUX f5, BO, INC4
  2125. STFPDUX f13, BO2, INC4
  2126. STFPDUX f2, BO, INC4
  2127. STFPDUX f10, BO2, INC4
  2128. STFPDUX f6, BO, INC4
  2129. STFPDUX f14, BO2, INC4
  2130. STFPDUX f3, BO, INC4
  2131. STFPDUX f11, BO2, INC4
  2132. STFPDUX f7, BO, INC4
  2133. STFPDUX f15, BO2, INC4
  2134. subi BO, BO, 32 * SIZE
  2135. subi BO2, BO2, 32 * SIZE
  /* C columns: STFDUX writes one half of f0,f4,f1,f5,f2,f6,f3,f7 to CO1
     and STFSDUX the other half to CO2; f8,f12,f9,f13,f10,f14,f11,f15 feed
     CO3/CO4 the same way.  (STFDUX = primary half, STFSDUX = secondary
     half per the Double Hummer ISA; INC is presumably one element.) */
  2136. STFDUX f0, CO1, INC
  2137. STFDUX f4, CO1, INC
  2138. STFDUX f1, CO1, INC
  2139. STFDUX f5, CO1, INC
  2140. STFDUX f2, CO1, INC
  2141. STFDUX f6, CO1, INC
  2142. STFDUX f3, CO1, INC
  2143. STFDUX f7, CO1, INC
  2144. STFSDUX f0, CO2, INC
  2145. STFSDUX f4, CO2, INC
  2146. STFSDUX f1, CO2, INC
  2147. STFSDUX f5, CO2, INC
  2148. STFSDUX f2, CO2, INC
  2149. STFSDUX f6, CO2, INC
  2150. STFSDUX f3, CO2, INC
  2151. STFSDUX f7, CO2, INC
  2152. STFDUX f8, CO3, INC
  2153. STFDUX f12, CO3, INC
  2154. STFDUX f9, CO3, INC
  2155. STFDUX f13, CO3, INC
  2156. STFDUX f10, CO3, INC
  2157. STFDUX f14, CO3, INC
  2158. STFDUX f11, CO3, INC
  2159. STFDUX f15, CO3, INC
  2160. STFSDUX f8, CO4, INC
  2161. STFSDUX f12, CO4, INC
  2162. STFSDUX f9, CO4, INC
  2163. STFSDUX f13, CO4, INC
  2164. STFSDUX f10, CO4, INC
  2165. STFSDUX f14, CO4, INC
  2166. STFSDUX f11, CO4, INC
  2167. STFSDUX f15, CO4, INC
  2168. #else
  /* RN/RT: store f0..f15 in register order to AO/AO2 (alternating), then
     rewind both cursors by 32 * SIZE. */
  2169. STFPDUX f0, AO, INC4
  2170. STFPDUX f1, AO2, INC4
  2171. STFPDUX f2, AO, INC4
  2172. STFPDUX f3, AO2, INC4
  2173. STFPDUX f4, AO, INC4
  2174. STFPDUX f5, AO2, INC4
  2175. STFPDUX f6, AO, INC4
  2176. STFPDUX f7, AO2, INC4
  2177. STFPDUX f8, AO, INC4
  2178. STFPDUX f9, AO2, INC4
  2179. STFPDUX f10, AO, INC4
  2180. STFPDUX f11, AO2, INC4
  2181. STFPDUX f12, AO, INC4
  2182. STFPDUX f13, AO2, INC4
  2183. STFPDUX f14, AO, INC4
  2184. STFPDUX f15, AO2, INC4
  2185. subi AO, AO, 32 * SIZE
  2186. subi AO2, AO2, 32 * SIZE
  /* C columns: here both halves of a register go to the same column, so
     f0-f3 fill CO1, f4-f7 CO2, f8-f11 CO3 and f12-f15 CO4. */
  2187. STFDUX f0, CO1, INC
  2188. STFSDUX f0, CO1, INC
  2189. STFDUX f1, CO1, INC
  2190. STFSDUX f1, CO1, INC
  2191. STFDUX f2, CO1, INC
  2192. STFSDUX f2, CO1, INC
  2193. STFDUX f3, CO1, INC
  2194. STFSDUX f3, CO1, INC
  2195. STFDUX f4, CO2, INC
  2196. STFSDUX f4, CO2, INC
  2197. STFDUX f5, CO2, INC
  2198. STFSDUX f5, CO2, INC
  2199. STFDUX f6, CO2, INC
  2200. STFSDUX f6, CO2, INC
  2201. STFDUX f7, CO2, INC
  2202. STFSDUX f7, CO2, INC
  2203. STFDUX f8, CO3, INC
  2204. STFSDUX f8, CO3, INC
  2205. STFDUX f9, CO3, INC
  2206. STFSDUX f9, CO3, INC
  2207. STFDUX f10, CO3, INC
  2208. STFSDUX f10, CO3, INC
  2209. STFDUX f11, CO3, INC
  2210. STFSDUX f11, CO3, INC
  2211. STFDUX f12, CO4, INC
  2212. STFSDUX f12, CO4, INC
  2213. STFDUX f13, CO4, INC
  2214. STFSDUX f13, CO4, INC
  2215. STFDUX f14, CO4, INC
  2216. STFSDUX f14, CO4, INC
  2217. STFDUX f15, CO4, INC
  2218. STFSDUX f15, CO4, INC
  2219. #endif
  /* Epilogue of one 8x4 tile: move the C, A and B cursors and KK to the
     next tile, then loop back to .L11 while tiles remain. */
  2220. #ifdef LN
  /* The eight update-form stores advanced each C cursor; undo that so the
     cursors again sit in front of the tile just written (LN proceeds
     towards lower addresses). */
  2221. subi CO1, CO1, 8 * SIZE
  2222. subi CO2, CO2, 8 * SIZE
  2223. subi CO3, CO3, 8 * SIZE
  2224. subi CO4, CO4, 8 * SIZE
  2225. #endif
  2226. #ifdef RT
  /* RT: advance AORIG by K << (3 + BASE_SHIFT), one 8-row slab of packed A. */
  2227. slwi r0, K, 3 + BASE_SHIFT
  2228. add AORIG, AORIG, r0
  2229. #endif
  2230. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps of this slab that were not consumed:
     (K - KK) << (3 + BASE_SHIFT) in A, (K - KK) << (2 + BASE_SHIFT) in B. */
  2231. sub TEMP, K, KK
  2232. slwi r0, TEMP, 3 + BASE_SHIFT
  2233. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2234. add AO, AO, r0
  2235. add BO, BO, TEMP
  2236. #endif
  2237. #ifdef LT
  2238. addi KK, KK, 8
  2239. #endif
  2240. #ifdef LN
  2241. subi KK, KK, 8
  2242. #endif
  /* I -= 1 and record the result in CR0.  The two instructions that follow
     reload f0 from the FZERO stack slot for the next prologue and do not
     write CR0, so bgt+ still tests the new value of I. */
  2243. addic. I, I, -1
  2244. li r0, FZERO
  2245. lfpsx f0, SP, r0
  2246. bgt+ .L11
  .align 4
  .L49:
  /* End of one four-column panel: step KK (RN forwards, RT backwards) and
     the packed-B base pointer, then run the next panel while J > 0.
     The KK adjustments do not read or write B, BO, K or r0, so they are
     done first; the two B updates keep their original relative order. */
  #ifdef RN
  addi KK, KK, 4
  #endif
  #ifdef RT
  addi KK, KK, -4
  #endif
  #ifdef LN
  /* LN: B advances by K << (2 + BASE_SHIFT). */
  slwi r0, K, 2 + BASE_SHIFT
  add B, B, r0
  #endif
  #if defined(LT) || defined(RN)
  /* LT/RN: BO already points 4 * SIZE below the next panel. */
  addi B, BO, 4 * SIZE
  #endif
  addic. J, J, -1
  bgt+ .L10
  /* .L50: two-column panel, entered when bit 1 of N is set (otherwise the
     code continues at .L90).  Sets up the C column cursors, KK and the A
     base, then handles a single leftover row (M & 1) as a 1x2 tile;
     without one, control goes to .L60. */
  2264. .align 4
  2265. .L50:
  2266. andi. J, N, 2
  2267. beq .L90
  2268. #ifdef RT
  /* RT walks the panels backwards: move B back by K << (1 + BASE_SHIFT)
     and C back by two columns (LDC << 1). */
  2269. slwi r0, K, 1 + BASE_SHIFT
  2270. sub B, B, r0
  2271. slwi r0, LDC, 1
  2272. sub C, C, r0
  2273. #endif
  2274. mr CO1, C
  2275. add CO2, C, LDC
  2276. #ifdef LN
  2277. add KK, M, OFFSET
  2278. #endif
  2279. #ifdef LT
  2280. mr KK, OFFSET
  2281. #endif
  /* The A cursor is biased by -2 * SIZE because A is read with update-form
     loads of stride INC2 (presumably 2 * SIZE) in the loops below. */
  2282. #if defined(LN) || defined(RT)
  2283. addi AORIG, A, -2 * SIZE
  2284. #else
  2285. addi AO, A, -2 * SIZE
  2286. #endif
  2287. #ifndef RT
  /* All variants except RT advance C past the two columns just claimed. */
  2288. add C, CO2, LDC
  2289. #endif
  /* f0 is loaded from the FZERO stack slot (presumably 0.0). */
  2290. li r0, FZERO
  2291. lfpsx f0, SP, r0
  2292. andi. I, M, 1
  2293. beq .L60
  /* 1x2 tile prologue: f0-f3 are four partial sums (f1-f3 copied from f0),
     and CTR counts passes of eight k-steps. */
  2294. #if defined(LT) || defined(RN)
  2295. addi BO, B, - 2 * SIZE
  2296. fpmr f1, f0
  2297. fpmr f2, f0
  2298. fpmr f3, f0
  2299. srawi. r0, KK, 3
  2300. mtspr CTR, r0
  2301. ble .L84
  2302. #else
  2303. #ifdef LN
  /* LN: step AORIG back by K << (0 + BASE_SHIFT), one row of packed A. */
  2304. slwi r0, K, 0 + BASE_SHIFT
  2305. sub AORIG, AORIG, r0
  2306. #endif
  /* LN/RT: start KK steps into both buffers (one element per step in A,
     two in B) and iterate over TEMP = K - KK steps. */
  2307. slwi r0 , KK, 0 + BASE_SHIFT
  2308. slwi TEMP, KK, 1 + BASE_SHIFT
  2309. add AO, AORIG, r0
  2310. add BO, B, TEMP
  2311. sub TEMP, K, KK
  2312. addi BO, BO, - 2 * SIZE
  2313. fpmr f1, f0
  2314. fpmr f2, f0
  2315. fpmr f3, f0
  2316. srawi. r0, TEMP, 3
  2317. mtspr CTR, r0
  2318. ble .L84
  2319. #endif
  /* Preload for .L82/.L83: four A registers (two k-steps each) and the
     first four B registers (one k-step each). */
  2320. LFPDUX B1, BO, INC2
  2321. LFPDUX A1, AO, INC2
  2322. LFPDUX A2, AO, INC2
  2323. LFPDUX B2, BO, INC2
  2324. LFPDUX A3, AO, INC2
  2325. LFPDUX A4, AO, INC2
  2326. LFPDUX B3, BO, INC2
  2327. LFPDUX B4, BO, INC2
  /* Decrement CTR; a single pass goes straight to the drain at .L83. */
  2328. bdz- .L83
  /* .L82: k-loop of the 1x2 tile, eight k-steps per pass.  Each A register
     carries two consecutive k-steps of the single row, each B register
     the two column entries of one k-step.  fxcpmadd multiplies by the
     primary half of the A register and fxcsmadd by its secondary half
     (Double Hummer cross multiply-add), accumulating into the four
     partial sums f0-f3 that are added together at .L88.  B1-B4 are
     reloaded immediately after use because eight B registers are
     consumed per pass; A1-A4 are reloaded for the next pass. */
  2329. .align 4
  2330. .L82:
  2331. fxcpmadd f0, A1, B1, f0
  2332. LFPDUX B1, BO, INC2
  2333. fxcsmadd f1, A1, B2, f1
  2334. LFPDUX B2, BO, INC2
  2335. LFPDUX A1, AO, INC2
  2336. fxcpmadd f2, A2, B3, f2
  2337. LFPDUX B3, BO, INC2
  2338. fxcsmadd f3, A2, B4, f3
  2339. LFPDUX B4, BO, INC2
  2340. LFPDUX A2, AO, INC2
  2341. fxcpmadd f0, A3, B1, f0
  2342. LFPDUX B1, BO, INC2
  2343. fxcsmadd f1, A3, B2, f1
  2344. LFPDUX B2, BO, INC2
  2345. LFPDUX A3, AO, INC2
  2346. fxcpmadd f2, A4, B3, f2
  2347. LFPDUX B3, BO, INC2
  2348. fxcsmadd f3, A4, B4, f3
  2349. LFPDUX B4, BO, INC2
  2350. LFPDUX A4, AO, INC2
  2351. bdnz+ .L82
  2352. .align 4
  /* .L83: final pass -- the same eight multiply-adds, loading only the four
     B registers still needed by its second half and no further A data. */
  2353. .L83:
  2354. fxcpmadd f0, A1, B1, f0
  2355. LFPDUX B1, BO, INC2
  2356. fxcsmadd f1, A1, B2, f1
  2357. LFPDUX B2, BO, INC2
  2358. fxcpmadd f2, A2, B3, f2
  2359. LFPDUX B3, BO, INC2
  2360. fxcsmadd f3, A2, B4, f3
  2361. LFPDUX B4, BO, INC2
  2362. fxcpmadd f0, A3, B1, f0
  2363. fxcsmadd f1, A3, B2, f1
  2364. fxcpmadd f2, A4, B3, f2
  2365. fxcsmadd f3, A4, B4, f3
  .align 4
  .L84:
  /* Leftover k-steps of the 1x2 tile: CTR = (trip count) & 7, the trip
     count being KK for LT/RN and TEMP for LN/RT.  The CTR load and the
     early exit to .L88 are common to both variants. */
  #if defined(LT) || defined(RN)
  andi. r0, KK, 7
  #else
  andi. r0, TEMP, 7
  #endif
  mtspr CTR, r0
  ble+ .L88
  /* Preload one A element (scalar load at AO + INC2, then AO += INC) and
     one B pair.  The B load does not involve AO, so the pointer bump is
     done right after the A load. */
  LFDX A1, AO, INC2
  add AO, AO, INC
  LFPDUX B1, BO, INC2
  bdz- .L87
  .align 4
  .L86:
  /* One k-step per iteration; operands for the next step are fetched
     after the multiply-add has consumed the current ones. */
  fxcpmadd f0, A1, B1, f0
  LFDX A1, AO, INC2
  add AO, AO, INC
  LFPDUX B1, BO, INC2
  bdnz+ .L86
  .align 4
  .L87:
  /* Final leftover step: nothing more to load. */
  fxcpmadd f0, A1, B1, f0
  2391. .align 4
  /* .L88: finish the tile accumulated by the .L82-.L87 loops: reduce the
     partial sums, subtract them from the stored right-hand side, apply the
     variant-specific (LN/LT/RN/RT) scaling, and write the result back to
     the packed buffer and to C (CO1/CO2). */
  2392. .L88:
  /* Fold the four accumulators f0..f3 into f0. */
  2393. fpadd f0, f0, f1
  2394. fpadd f2, f2, f3
  2395. fpadd f0, f0, f2
  2396. #if defined(LN) || defined(RT)
  /* LN/RT: re-derive AO and BO from AORIG and B using KK-1 (LN) or KK-2 (RT);
     AO is scaled by 1 element per step, BO by 2 elements per step. */
  2397. #ifdef LN
  2398. subi r0, KK, 1
  2399. #else
  2400. subi r0, KK, 2
  2401. #endif
  2402. slwi TEMP, r0, 0 + BASE_SHIFT
  2403. slwi r0, r0, 1 + BASE_SHIFT
  2404. add AO, AORIG, TEMP
  2405. add BO, B, r0
  2406. addi BO, BO, - 2 * SIZE
  2407. #endif
  /* f0 = (stored value) - (accumulated sum). The stored value comes from BO
     for LN/LT and from AO for RN/RT. LFPDX is a macro defined outside this
     chunk -- presumably a paired load without pointer update. */
  2408. #if defined(LN) || defined(LT)
  2409. LFPDX f16, BO, INC2
  2410. fpsub f0, f16, f0
  2411. #else
  2412. LFPDX f16, AO, INC2
  2413. fpsub f0, f16, f0
  2414. #endif
  /* LN and LT: scale f0 by the value loaded from AO. */
  2415. #ifdef LN
  2416. LFPDX A1, AO, INC2
  2417. fxpmul f0, A1, f0
  2418. #endif
  2419. #ifdef LT
  2420. LFPDX A1, AO, INC2
  2421. fxpmul f0, A1, f0
  2422. #endif
  /* RN: scalar solve on the two halves of f0 using entries 0, 1 and 3 of the
     block at BO + 2*SIZE; the secondary half is moved to f1 first and merged
     back into f0 afterwards. */
  2423. #ifdef RN
  2424. LFD A1, (2 + 0) * SIZE(BO)
  2425. LFD A2, (2 + 1) * SIZE(BO)
  2426. LFD A3, (2 + 3) * SIZE(BO)
  2427. fsmtp f1, f0
  2428. fmul f0, A1, f0
  2429. fnmsub f1, A2, f0, f1
  2430. fmul f1, A3, f1
  2431. fsmfp f0, f1
  2432. #endif
  /* RT: same idea in the opposite order, using entries 3, 2 and 0. */
  2433. #ifdef RT
  2434. LFD A1, (2 + 3) * SIZE(BO)
  2435. LFD A2, (2 + 2) * SIZE(BO)
  2436. LFD A3, (2 + 0) * SIZE(BO)
  2437. fsmtp f1, f0
  2438. fmul f1, A1, f1
  2439. fnmsub f0, A2, f1, f0
  2440. fmul f0, A3, f0
  2441. fsmfp f0, f1
  2442. #endif
  /* LN steps the C pointers back by one element before and after the store. */
  2443. #ifdef LN
  2444. subi CO1, CO1, 1 * SIZE
  2445. subi CO2, CO2, 1 * SIZE
  2446. #endif
  /* Write the result to the packed buffer (BO or AO) and to C. */
  2447. #if defined(LN) || defined(LT)
  2448. STFPDX f0, BO, INC2
  2449. STFDUX f0, CO1, INC
  2450. STFSDUX f0, CO2, INC
  2451. #else
  2452. STFPDX f0, AO, INC2
  2453. STFDUX f0, CO1, INC
  2454. STFDUX f1, CO2, INC
  2455. #endif
  2456. #ifdef LN
  2457. subi CO1, CO1, 1 * SIZE
  2458. subi CO2, CO2, 1 * SIZE
  2459. #endif
  /* RT: advance AORIG by K elements (this tile's share of A). */
  2460. #ifdef RT
  2461. slwi r0, K, 0 + BASE_SHIFT
  2462. add AORIG, AORIG, r0
  2463. #endif
  /* LT/RN: skip the remaining K-KK steps in both AO and BO. */
  2464. #if defined(LT) || defined(RN)
  2465. sub TEMP, K, KK
  2466. slwi r0, TEMP, 0 + BASE_SHIFT
  2467. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2468. add AO, AO, r0
  2469. add BO, BO, TEMP
  2470. #endif
  2471. #ifdef LT
  2472. addi KK, KK, 1
  2473. #endif
  2474. #ifdef LN
  2475. subi KK, KK, 1
  2476. #endif
  /* Reload f0 from the stack slot at offset FZERO -- presumably a stored
     zero, since the next tile seeds f1..f3 from f0 with fpmr. */
  2477. li r0, FZERO
  2478. lfpsx f0, SP, r0
  2479. .align 4
  /* .L60: tile taken when bit 1 of M is set (andi. I, M, 2); otherwise
     control skips to .L70. This part sets up AO/BO and the loop count, then
     accumulates A*B products into f0..f3 (loops .L72-.L77). */
  2480. .L60:
  2481. andi. I, M, 2
  2482. beq .L70
  2483. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts 2*SIZE below B; f1..f3 are seeded from f0; the unrolled
     loop runs KK >> 3 times (skipped when that is <= 0). */
  2484. addi BO, B, - 2 * SIZE
  2485. fpmr f1, f0
  2486. fpmr f2, f0
  2487. fpmr f3, f0
  2488. srawi. r0, KK, 3
  2489. mtspr CTR, r0
  2490. ble .L74
  2491. #else
  /* LN/RT: LN first moves AORIG back by K << (1 + BASE_SHIFT) bytes. AO and
     BO are then offset by KK steps and the loop runs (K - KK) >> 3 times;
     TEMP keeps K - KK for the remainder loop at .L74. */
  2492. #ifdef LN
  2493. slwi r0, K, 1 + BASE_SHIFT
  2494. sub AORIG, AORIG, r0
  2495. #endif
  2496. slwi r0 , KK, 1 + BASE_SHIFT
  2497. slwi TEMP, KK, 1 + BASE_SHIFT
  2498. add AO, AORIG, r0
  2499. add BO, B, TEMP
  2500. sub TEMP, K, KK
  2501. addi BO, BO, - 2 * SIZE
  2502. fpmr f1, f0
  2503. fpmr f2, f0
  2504. fpmr f3, f0
  2505. srawi. r0, TEMP, 3
  2506. mtspr CTR, r0
  2507. ble .L74
  2508. #endif
  /* Preload eight operands from AO (A1..A8) and eight from BO (B1..B6, then
     A9 and A10, which hold B-side data here). */
  2509. LFPDUX A1, AO, INC2
  2510. LFPDUX B1, BO, INC2
  2511. LFPDUX A2, AO, INC2
  2512. LFPDUX B2, BO, INC2
  2513. LFPDUX A3, AO, INC2
  2514. LFPDUX B3, BO, INC2
  2515. LFPDUX A4, AO, INC2
  2516. LFPDUX B4, BO, INC2
  2517. LFPDUX A5, AO, INC2
  2518. LFPDUX B5, BO, INC2
  2519. LFPDUX A6, AO, INC2
  2520. LFPDUX B6, BO, INC2
  2521. LFPDUX A7, AO, INC2
  2522. LFPDUX A9, BO, INC2
  2523. LFPDUX A8, AO, INC2
  2524. LFPDUX A10, BO, INC2
  /* Single iteration: go straight to the drain code at .L73. */
  2525. bdz- .L73
  2526. .align 4
  2527. .L72:
  /* Main loop, eight multiply-add groups per CTR iteration. Each group feeds
     f0/f1 or f2/f3 alternately and immediately reloads the operands it just
     consumed for the next iteration. */
  2528. fxcpmadd f0, B1, A1, f0
  2529. fxcsmadd f1, B1, A1, f1
  2530. LFPDUX A1, AO, INC2
  2531. LFPDUX B1, BO, INC2
  2532. fxcpmadd f2, B2, A2, f2
  2533. fxcsmadd f3, B2, A2, f3
  2534. LFPDUX A2, AO, INC2
  2535. LFPDUX B2, BO, INC2
  2536. fxcpmadd f0, B3, A3, f0
  2537. fxcsmadd f1, B3, A3, f1
  2538. LFPDUX A3, AO, INC2
  2539. LFPDUX B3, BO, INC2
  2540. fxcpmadd f2, B4, A4, f2
  2541. fxcsmadd f3, B4, A4, f3
  2542. LFPDUX A4, AO, INC2
  2543. LFPDUX B4, BO, INC2
  2544. fxcpmadd f0, B5, A5, f0
  2545. fxcsmadd f1, B5, A5, f1
  2546. LFPDUX A5, AO, INC2
  2547. LFPDUX B5, BO, INC2
  2548. fxcpmadd f2, B6, A6, f2
  2549. fxcsmadd f3, B6, A6, f3
  2550. LFPDUX A6, AO, INC2
  2551. LFPDUX B6, BO, INC2
  2552. fxcpmadd f0, A9, A7, f0
  2553. fxcsmadd f1, A9, A7, f1
  2554. LFPDUX A7, AO, INC2
  2555. LFPDUX A9, BO, INC2
  2556. fxcpmadd f2, A10, A8, f2
  2557. fxcsmadd f3, A10, A8, f3
  2558. LFPDUX A8, AO, INC2
  2559. LFPDUX A10, BO, INC2
  2560. bdnz+ .L72
  2561. .align 4
  2562. .L73:
  /* Drain: consume the operands already loaded, with no further loads. */
  2563. fxcpmadd f0, B1, A1, f0
  2564. fxcsmadd f1, B1, A1, f1
  2565. fxcpmadd f2, B2, A2, f2
  2566. fxcsmadd f3, B2, A2, f3
  2567. fxcpmadd f0, B3, A3, f0
  2568. fxcsmadd f1, B3, A3, f1
  2569. fxcpmadd f2, B4, A4, f2
  2570. fxcsmadd f3, B4, A4, f3
  2571. fxcpmadd f0, B5, A5, f0
  2572. fxcsmadd f1, B5, A5, f1
  2573. fxcpmadd f2, B6, A6, f2
  2574. fxcsmadd f3, B6, A6, f3
  2575. fxcpmadd f0, A9, A7, f0
  2576. fxcsmadd f1, A9, A7, f1
  2577. fxcpmadd f2, A10, A8, f2
  2578. fxcsmadd f3, A10, A8, f3
  2579. .align 4
  2580. .L74:
  /* Remainder: low three bits of the step count (KK for LT/RN, TEMP = K - KK
     otherwise); nothing to do when zero. */
  2581. #if defined(LT) || defined(RN)
  2582. andi. r0, KK, 7
  2583. mtspr CTR, r0
  2584. ble+ .L78
  2585. #else
  2586. andi. r0, TEMP, 7
  2587. mtspr CTR, r0
  2588. ble+ .L78
  2589. #endif
  2590. LFPDUX A1, AO, INC2
  2591. LFPDUX B1, BO, INC2
  2592. bdz- .L77
  2593. .align 4
  2594. .L76:
  /* One multiply-add group per iteration, accumulating into f0/f1 only. */
  2595. fxcpmadd f0, B1, A1, f0
  2596. fxcsmadd f1, B1, A1, f1
  2597. LFPDUX A1, AO, INC2
  2598. LFPDUX B1, BO, INC2
  2599. bdnz+ .L76
  2600. .align 4
  2601. .L77:
  /* Final remainder step using the operands already loaded. */
  2602. fxcpmadd f0, B1, A1, f0
  2603. fxcsmadd f1, B1, A1, f1
  2604. .align 4
  /* .L78: finish the tile accumulated by .L72-.L77: merge the accumulator
     pairs, subtract from the stored right-hand side, run the small
     triangular solve for the active variant, and store to the packed buffer
     and to C. */
  2605. .L78:
  2606. fpadd f0, f0, f2
  2607. fpadd f1, f1, f3
  2608. #if defined(LN) || defined(RT)
  /* Both variants step back by 2 here (the two preprocessor arms are
     identical); AO and BO are re-derived from AORIG and B. */
  2609. #ifdef LN
  2610. subi r0, KK, 2
  2611. #else
  2612. subi r0, KK, 2
  2613. #endif
  2614. slwi TEMP, r0, 1 + BASE_SHIFT
  2615. slwi r0, r0, 1 + BASE_SHIFT
  2616. add AO, AORIG, TEMP
  2617. add BO, B, r0
  2618. addi BO, BO, - 2 * SIZE
  2619. #endif
  2620. #if defined(LN) || defined(LT)
  /* LN/LT: exchange halves between f0 and f1 (via the f24 copy) so the
     register layout matches the data stored at BO, then subtract. BO is
     restored after the two update-form loads. */
  2621. fpmr f24, f0
  2622. fsmfp f0, f1
  2623. fsmtp f1, f24
  2624. LFPDUX f16, BO, INC2
  2625. LFPDUX f17, BO, INC2
  2626. subi BO, BO, 4 * SIZE
  2627. fpsub f0, f16, f0
  2628. fpsub f1, f17, f1
  2629. #else
  /* RN/RT: the right-hand side is read from AO in the accumulator layout. */
  2630. LFPDUX f16, AO, INC2
  2631. LFPDUX f17, AO, INC2
  2632. subi AO, AO, 4 * SIZE
  2633. fpsub f0, f16, f0
  2634. fpsub f1, f17, f1
  2635. #endif
  /* LN: solve starting from f1, then eliminate into f0. */
  2636. #ifdef LN
  2637. LFPDUX A1, AO, INC2
  2638. LFPDUX A2, AO, INC2
  2639. addi AO, AO, -4 * SIZE
  2640. fxsmul f1, A2, f1
  2641. fxcpnmsub f0, A2, f1, f0
  2642. fxpmul f0, A1, f0
  2643. #endif
  /* LT: solve starting from f0, then eliminate into f1. */
  2644. #ifdef LT
  2645. LFPDUX A1, AO, INC2
  2646. LFPDUX A2, AO, INC2
  2647. addi AO, AO, -4 * SIZE
  2648. fxpmul f0, A1, f0
  2649. fxcsnmsub f1, A1, f0, f1
  2650. fxsmul f1, A2, f1
  2651. #endif
  /* RN: same sequence as LT but with coefficients read from BO. */
  2652. #ifdef RN
  2653. LFPDUX A1, BO, INC2
  2654. LFPDUX A2, BO, INC2
  2655. subi BO, BO, 4 * SIZE
  2656. fxpmul f0, A1, f0
  2657. fxcsnmsub f1, A1, f0, f1
  2658. fxsmul f1, A2, f1
  2659. #endif
  /* RT: same sequence as LN but with coefficients read from BO (note the
     first load goes to A2, the second to A1). */
  2660. #ifdef RT
  2661. LFPDUX A2, BO, INC2
  2662. LFPDUX A1, BO, INC2
  2663. subi BO, BO, 4 * SIZE
  2664. fxsmul f1, A1, f1
  2665. fxcpnmsub f0, A1, f1, f0
  2666. fxpmul f0, A2, f0
  2667. #endif
  /* LN steps the C pointers back by two elements before and after the store. */
  2668. #ifdef LN
  2669. subi CO1, CO1, 2 * SIZE
  2670. subi CO2, CO2, 2 * SIZE
  2671. #endif
  2672. #if defined(LN) || defined(LT)
  /* LN/LT store order: f0 then f1 go to CO1 (STFDUX) and again to CO2
     (STFSDUX). */
  2673. STFPDUX f0, BO, INC2
  2674. STFPDUX f1, BO, INC2
  2675. subi BO, BO, 4 * SIZE
  2676. STFDUX f0, CO1, INC
  2677. STFDUX f1, CO1, INC
  2678. STFSDUX f0, CO2, INC
  2679. STFSDUX f1, CO2, INC
  2680. #else
  /* RN/RT store order: both halves of f0 go to CO1, both halves of f1 to
     CO2. */
  2681. STFPDUX f0, AO, INC2
  2682. STFPDUX f1, AO, INC2
  2683. subi AO, AO, 4 * SIZE
  2684. STFDUX f0, CO1, INC
  2685. STFSDUX f0, CO1, INC
  2686. STFDUX f1, CO2, INC
  2687. STFSDUX f1, CO2, INC
  2688. #endif
  2689. #ifdef LN
  2690. subi CO1, CO1, 2 * SIZE
  2691. subi CO2, CO2, 2 * SIZE
  2692. #endif
  2693. #ifdef RT
  2694. slwi r0, K, 1 + BASE_SHIFT
  2695. add AORIG, AORIG, r0
  2696. #endif
  /* LT/RN: skip the remaining K-KK steps in both AO and BO. */
  2697. #if defined(LT) || defined(RN)
  2698. sub TEMP, K, KK
  2699. slwi r0, TEMP, 1 + BASE_SHIFT
  2700. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2701. add AO, AO, r0
  2702. add BO, BO, TEMP
  2703. #endif
  2704. #ifdef LT
  2705. addi KK, KK, 2
  2706. #endif
  2707. #ifdef LN
  2708. subi KK, KK, 2
  2709. #endif
  /* Reload f0 from the stack slot at offset FZERO -- presumably a stored
     zero used to seed the next tile's accumulators. */
  2710. li r0, FZERO
  2711. lfpsx f0, SP, r0
  2712. .align 4
  /* .L70: tile taken when bit 2 of M is set (andi. I, M, 4); otherwise
     control skips to .L80. Sets up AO/BO and the loop count, then
     accumulates into f0..f3 (loops .L62-.L67). The main loop is unrolled
     four times, so the count is shifted by 2 rather than 3. */
  2713. .L70:
  2714. andi. I, M, 4
  2715. beq .L80
  2716. #if defined(LT) || defined(RN)
  2717. addi BO, B, - 2 * SIZE
  2718. fpmr f1, f0
  2719. fpmr f2, f0
  2720. fpmr f3, f0
  2721. srawi. r0, KK, 2
  2722. mtspr CTR, r0
  2723. ble .L64
  2724. #else
  /* LN/RT: AO is offset by KK << (2 + BASE_SHIFT), BO by KK << (1 +
     BASE_SHIFT); TEMP keeps K - KK for the remainder loop. */
  2725. #ifdef LN
  2726. slwi r0, K, 2 + BASE_SHIFT
  2727. sub AORIG, AORIG, r0
  2728. #endif
  2729. slwi r0 , KK, 2 + BASE_SHIFT
  2730. slwi TEMP, KK, 1 + BASE_SHIFT
  2731. add AO, AORIG, r0
  2732. add BO, B, TEMP
  2733. sub TEMP, K, KK
  2734. fpmr f1, f0
  2735. addi BO, BO, - 2 * SIZE
  2736. fpmr f2, f0
  2737. fpmr f3, f0
  2738. srawi. r0, TEMP, 2
  2739. mtspr CTR, r0
  2740. ble .L64
  2741. #endif
  /* Preload four operands from BO (B1..B4) and eight from AO (A1..A8): two
     AO loads per BO load. */
  2742. LFPDUX B1, BO, INC2
  2743. LFPDUX A1, AO, INC2
  2744. LFPDUX A2, AO, INC2
  2745. LFPDUX B2, BO, INC2
  2746. LFPDUX A3, AO, INC2
  2747. LFPDUX A4, AO, INC2
  2748. LFPDUX B3, BO, INC2
  2749. LFPDUX A5, AO, INC2
  2750. LFPDUX A6, AO, INC2
  2751. LFPDUX B4, BO, INC2
  2752. LFPDUX A7, AO, INC2
  2753. LFPDUX A8, AO, INC2
  2754. bdz- .L63
  2755. .align 4
  2756. .L62:
  /* Main loop, four groups per CTR iteration. Each group uses one B operand
     against two A operands: the first A feeds f0/f2, the second feeds
     f1/f3. Operands are reloaded as soon as they are consumed. */
  2757. fxcpmadd f0, B1, A1, f0
  2758. fxcsmadd f2, B1, A1, f2
  2759. LFPDUX A1, AO, INC2
  2760. fxcpmadd f1, B1, A2, f1
  2761. fxcsmadd f3, B1, A2, f3
  2762. LFPDUX A2, AO, INC2
  2763. LFPDUX B1, BO, INC2
  2764. fxcpmadd f0, B2, A3, f0
  2765. fxcsmadd f2, B2, A3, f2
  2766. LFPDUX A3, AO, INC2
  2767. fxcpmadd f1, B2, A4, f1
  2768. fxcsmadd f3, B2, A4, f3
  2769. LFPDUX A4, AO, INC2
  2770. LFPDUX B2, BO, INC2
  2771. fxcpmadd f0, B3, A5, f0
  2772. fxcsmadd f2, B3, A5, f2
  2773. LFPDUX A5, AO, INC2
  2774. fxcpmadd f1, B3, A6, f1
  2775. fxcsmadd f3, B3, A6, f3
  2776. LFPDUX A6, AO, INC2
  2777. LFPDUX B3, BO, INC2
  2778. fxcpmadd f0, B4, A7, f0
  2779. fxcsmadd f2, B4, A7, f2
  2780. LFPDUX A7, AO, INC2
  2781. fxcpmadd f1, B4, A8, f1
  2782. fxcsmadd f3, B4, A8, f3
  2783. LFPDUX A8, AO, INC2
  2784. LFPDUX B4, BO, INC2
  2785. bdnz+ .L62
  2786. .align 4
  2787. .L63:
  /* Drain: consume the operands already loaded, with no further loads. */
  2788. fxcpmadd f0, B1, A1, f0
  2789. fxcsmadd f2, B1, A1, f2
  2790. fxcpmadd f1, B1, A2, f1
  2791. fxcsmadd f3, B1, A2, f3
  2792. fxcpmadd f0, B2, A3, f0
  2793. fxcsmadd f2, B2, A3, f2
  2794. fxcpmadd f1, B2, A4, f1
  2795. fxcsmadd f3, B2, A4, f3
  2796. fxcpmadd f0, B3, A5, f0
  2797. fxcsmadd f2, B3, A5, f2
  2798. fxcpmadd f1, B3, A6, f1
  2799. fxcsmadd f3, B3, A6, f3
  2800. fxcpmadd f0, B4, A7, f0
  2801. fxcsmadd f2, B4, A7, f2
  2802. fxcpmadd f1, B4, A8, f1
  2803. fxcsmadd f3, B4, A8, f3
  2804. .align 4
  2805. .L64:
  /* Remainder: low two bits of the step count (KK for LT/RN, TEMP = K - KK
     otherwise); nothing to do when zero. */
  2806. #if defined(LT) || defined(RN)
  2807. andi. r0, KK, 3
  2808. mtspr CTR, r0
  2809. ble+ .L68
  2810. #else
  2811. andi. r0, TEMP, 3
  2812. mtspr CTR, r0
  2813. ble+ .L68
  2814. #endif
  2815. LFPDUX A1, AO, INC2
  2816. LFPDUX B1, BO, INC2
  2817. LFPDUX A2, AO, INC2
  2818. bdz- .L67
  2819. .align 4
  2820. .L66:
  /* One group (one B operand, two A operands) per iteration. */
  2821. fxcpmadd f0, B1, A1, f0
  2822. fxcsmadd f2, B1, A1, f2
  2823. LFPDUX A1, AO, INC2
  2824. fxcpmadd f1, B1, A2, f1
  2825. fxcsmadd f3, B1, A2, f3
  2826. LFPDUX B1, BO, INC2
  2827. LFPDUX A2, AO, INC2
  2828. bdnz+ .L66
  2829. .align 4
  2830. .L67:
  /* Final remainder step using the operands already loaded. */
  2831. fxcpmadd f0, B1, A1, f0
  2832. fxcsmadd f2, B1, A1, f2
  2833. fxcpmadd f1, B1, A2, f1
  2834. fxcsmadd f3, B1, A2, f3
  2835. .align 4
  /* .L68: finish the tile accumulated by .L62-.L67: subtract the sums in
     f0..f3 from the stored right-hand side, run the triangular solve for
     the active variant, and store to the packed buffer and to C. */
  2836. .L68:
  2837. #if defined(LN) || defined(RT)
  /* LN steps back by 4, RT by 2; AO and BO are re-derived from AORIG and B. */
  2838. #ifdef LN
  2839. subi r0, KK, 4
  2840. #else
  2841. subi r0, KK, 2
  2842. #endif
  2843. slwi TEMP, r0, 2 + BASE_SHIFT
  2844. slwi r0, r0, 1 + BASE_SHIFT
  2845. add AO, AORIG, TEMP
  2846. add BO, B, r0
  2847. addi BO, BO, - 2 * SIZE
  2848. #endif
  2849. #if defined(LN) || defined(LT)
  /* LN/LT: exchange halves between (f0,f2) and (f1,f3) via the f24/f25
     copies. After this the registers pair with the BO data in the order
     f0, f2, f1, f3, as the fpsub operands below show. */
  2850. fpmr f24, f0
  2851. fpmr f25, f1
  2852. fsmfp f0, f2
  2853. fsmfp f1, f3
  2854. fsmtp f2, f24
  2855. fsmtp f3, f25
  2856. LFPDUX f16, BO, INC2
  2857. LFPDUX f17, BO, INC2
  2858. LFPDUX f18, BO, INC2
  2859. LFPDUX f19, BO, INC2
  2860. subi BO, BO, 8 * SIZE
  2861. fpsub f0, f16, f0
  2862. fpsub f2, f17, f2
  2863. fpsub f1, f18, f1
  2864. fpsub f3, f19, f3
  2865. #else
  /* RN/RT: right-hand side is read from AO in plain f0..f3 order. */
  2866. LFPDUX f16, AO, INC2
  2867. LFPDUX f17, AO, INC2
  2868. LFPDUX f18, AO, INC2
  2869. LFPDUX f19, AO, INC2
  2870. subi AO, AO, 8 * SIZE
  2871. fpsub f0, f16, f0
  2872. fpsub f1, f17, f1
  2873. fpsub f2, f18, f2
  2874. fpsub f3, f19, f3
  2875. #endif
  /* LN: back substitution in the order f3, f1, f2, f0. Coefficients are
     read by walking AO downward with INCM2 from AO + 18*SIZE; the bare
     "add AO, AO, INCM2" lines skip entries that are not used. */
  2876. #ifdef LN
  2877. addi AO, AO, 18 * SIZE
  2878. LFPDUX A1, AO, INCM2
  2879. LFPDUX A2, AO, INCM2
  2880. LFPDUX A3, AO, INCM2
  2881. LFPDUX A4, AO, INCM2
  2882. add AO, AO, INCM2
  2883. LFPDUX A5, AO, INCM2
  2884. add AO, AO, INCM2
  2885. LFPDUX A6, AO, INCM2
  2886. subi AO, AO, 2 * SIZE
  2887. fxsmul f3, A1, f3
  2888. fxcpnmsub f1, A1, f3, f1
  2889. fxcsnmsub f2, A2, f3, f2
  2890. fxcpnmsub f0, A2, f3, f0
  2891. fxpmul f1, A3, f1
  2892. fxcsnmsub f2, A4, f1, f2
  2893. fxcpnmsub f0, A4, f1, f0
  2894. fxsmul f2, A5, f2
  2895. fxcpnmsub f0, A5, f2, f0
  2896. fxpmul f0, A6, f0
  2897. #endif
  /* LT: forward substitution in the order f0, f2, f1, f3, walking AO upward
     with INC2 and rewinding it by 16*SIZE afterwards. */
  2898. #ifdef LT
  2899. LFPDUX A1, AO, INC2
  2900. LFPDUX A2, AO, INC2
  2901. LFPDUX A3, AO, INC2
  2902. LFPDUX A4, AO, INC2
  2903. add AO, AO, INC2
  2904. LFPDUX A5, AO, INC2
  2905. add AO, AO, INC2
  2906. LFPDUX A6, AO, INC2
  2907. subi AO, AO, 16 * SIZE
  2908. fxpmul f0, A1, f0
  2909. fxcsnmsub f2, A1, f0, f2
  2910. fxcpnmsub f1, A2, f0, f1
  2911. fxcsnmsub f3, A2, f0, f3
  2912. fxsmul f2, A3, f2
  2913. fxcpnmsub f1, A4, f2, f1
  2914. fxcsnmsub f3, A4, f2, f3
  2915. fxpmul f1, A5, f1
  2916. fxcsnmsub f3, A5, f1, f3
  2917. fxsmul f3, A6, f3
  2918. #endif
  /* RN: solve f0/f1 first, eliminate into f2/f3, then scale f2/f3;
     coefficients come from BO. */
  2919. #ifdef RN
  2920. LFPDUX A1, BO, INC2
  2921. LFPDUX A2, BO, INC2
  2922. subi BO, BO, 4 * SIZE
  2923. fxpmul f0, A1, f0
  2924. fxpmul f1, A1, f1
  2925. fxcsnmsub f2, A1, f0, f2
  2926. fxcsnmsub f3, A1, f1, f3
  2927. fxsmul f2, A2, f2
  2928. fxsmul f3, A2, f3
  2929. #endif
  /* RT: solve f2/f3 first, eliminate into f0/f1, then scale f0/f1. */
  2930. #ifdef RT
  2931. LFPDUX A2, BO, INC2
  2932. LFPDUX A1, BO, INC2
  2933. subi BO, BO, 4 * SIZE
  2934. fxsmul f2, A1, f2
  2935. fxsmul f3, A1, f3
  2936. fxcpnmsub f0, A1, f2, f0
  2937. fxcpnmsub f1, A1, f3, f1
  2938. fxpmul f0, A2, f0
  2939. fxpmul f1, A2, f1
  2940. #endif
  /* LN steps the C pointers back by four elements before and after the
     store. */
  2941. #ifdef LN
  2942. subi CO1, CO1, 4 * SIZE
  2943. subi CO2, CO2, 4 * SIZE
  2944. #endif
  2945. #if defined(LN) || defined(LT)
  /* LN/LT store order f0, f2, f1, f3: STFDUX writes to CO1, STFSDUX to
     CO2. */
  2946. STFPDUX f0, BO, INC2
  2947. STFPDUX f2, BO, INC2
  2948. STFPDUX f1, BO, INC2
  2949. STFPDUX f3, BO, INC2
  2950. subi BO, BO, 8 * SIZE
  2951. STFDUX f0, CO1, INC
  2952. STFDUX f2, CO1, INC
  2953. STFDUX f1, CO1, INC
  2954. STFDUX f3, CO1, INC
  2955. STFSDUX f0, CO2, INC
  2956. STFSDUX f2, CO2, INC
  2957. STFSDUX f1, CO2, INC
  2958. STFSDUX f3, CO2, INC
  2959. #else
  /* RN/RT store order: both halves of f0 and f1 go to CO1, both halves of
     f2 and f3 to CO2. */
  2960. STFPDUX f0, AO, INC2
  2961. STFPDUX f1, AO, INC2
  2962. STFPDUX f2, AO, INC2
  2963. STFPDUX f3, AO, INC2
  2964. subi AO, AO, 8 * SIZE
  2965. STFDUX f0, CO1, INC
  2966. STFSDUX f0, CO1, INC
  2967. STFDUX f1, CO1, INC
  2968. STFSDUX f1, CO1, INC
  2969. STFDUX f2, CO2, INC
  2970. STFSDUX f2, CO2, INC
  2971. STFDUX f3, CO2, INC
  2972. STFSDUX f3, CO2, INC
  2973. #endif
  2974. #ifdef LN
  2975. subi CO1, CO1, 4 * SIZE
  2976. subi CO2, CO2, 4 * SIZE
  2977. #endif
  2978. #ifdef RT
  2979. slwi r0, K, 2 + BASE_SHIFT
  2980. add AORIG, AORIG, r0
  2981. #endif
  /* LT/RN: skip the remaining K-KK steps in both AO and BO. */
  2982. #if defined(LT) || defined(RN)
  2983. sub TEMP, K, KK
  2984. slwi r0, TEMP, 2 + BASE_SHIFT
  2985. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2986. add AO, AO, r0
  2987. add BO, BO, TEMP
  2988. #endif
  2989. #ifdef LT
  2990. addi KK, KK, 4
  2991. #endif
  2992. #ifdef LN
  2993. subi KK, KK, 4
  2994. #endif
  /* Reload f0 from the stack slot at offset FZERO -- presumably a stored
     zero used to seed the next tile's accumulators. */
  2995. li r0, FZERO
  2996. lfpsx f0, SP, r0
  2997. .align 4
  /* .L80/.L51: loop over the M >> 3 full-width tiles (I is the tile
     counter; skip to .L89 when there are none). Each pass through .L51 sets
     up AO/BO, seeds the eight accumulators f0..f7 from f0, and accumulates
     A*B products (loops .L52-.L57). */
  2998. .L80:
  2999. srawi. I, M, 3
  3000. ble .L89
  3001. .align 4
  3002. .L51:
  3003. #if defined(LT) || defined(RN)
  /* LT/RN: BO starts 2*SIZE below B; the unrolled loop runs KK >> 2 times.
     The integer setup is interleaved with the fpmr copies. */
  3004. fpmr f4, f0
  3005. addi BO, B, - 2 * SIZE
  3006. fpmr f1, f0
  3007. fpmr f5, f0
  3008. fpmr f2, f0
  3009. fpmr f6, f0
  3010. srawi. r0, KK, 2
  3011. fpmr f3, f0
  3012. mtspr CTR, r0
  3013. fpmr f7, f0
  3014. ble .L54
  3015. #else
  /* LN/RT: LN first moves AORIG back by K << (3 + BASE_SHIFT) bytes. AO is
     offset by KK << (3 + BASE_SHIFT), BO by KK << (1 + BASE_SHIFT), and the
     loop runs (K - KK) >> 2 times; TEMP keeps K - KK for .L54. */
  3016. #ifdef LN
  3017. slwi r0, K, 3 + BASE_SHIFT
  3018. sub AORIG, AORIG, r0
  3019. #endif
  3020. slwi r0 , KK, 3 + BASE_SHIFT
  3021. slwi TEMP, KK, 1 + BASE_SHIFT
  3022. add AO, AORIG, r0
  3023. add BO, B, TEMP
  3024. sub TEMP, K, KK
  3025. fpmr f4, f0
  3026. addi BO, BO, - 2 * SIZE
  3027. fpmr f1, f0
  3028. fpmr f5, f0
  3029. fpmr f2, f0
  3030. fpmr f6, f0
  3031. srawi. r0, TEMP, 2
  3032. fpmr f3, f0
  3033. mtspr CTR, r0
  3034. fpmr f7, f0
  3035. ble .L54
  3036. #endif
  /* Preload three operands from BO (B1..B3) and eight from AO (A1..A8); B4
     is loaded inside the loop body. */
  3037. LFPDUX B1, BO, INC2
  3038. LFPDUX A1, AO, INC2
  3039. LFPDUX A2, AO, INC2
  3040. LFPDUX B2, BO, INC2
  3041. LFPDUX A3, AO, INC2
  3042. LFPDUX A4, AO, INC2
  3043. LFPDUX B3, BO, INC2
  3044. LFPDUX A5, AO, INC2
  3045. LFPDUX A6, AO, INC2
  3046. LFPDUX A7, AO, INC2
  3047. LFPDUX A8, AO, INC2
  3048. bdz- .L53
  3049. .align 4
  3050. .L52:
  /* Main loop, four groups per CTR iteration. Each group multiplies one B
     operand against four A operands, updating f0..f3 (fxcpmadd) and f4..f7
     (fxcsmadd). Every multiply-add is followed by either a load or a nop --
     presumably to keep a fixed FPU/load-store issue pattern; TODO confirm
     against the target core's pipeline documentation. */
  3051. fxcpmadd f0, B1, A1, f0
  3052. LFPDUX B4, BO, INC2
  3053. fxcsmadd f4, B1, A1, f4
  3054. LFPDUX A1, AO, INC2
  3055. fxcpmadd f1, B1, A2, f1
  3056. nop
  3057. fxcsmadd f5, B1, A2, f5
  3058. LFPDUX A2, AO, INC2
  3059. fxcpmadd f2, B1, A3, f2
  3060. nop
  3061. fxcsmadd f6, B1, A3, f6
  3062. LFPDUX A3, AO, INC2
  3063. fxcpmadd f3, B1, A4, f3
  3064. nop
  3065. fxcsmadd f7, B1, A4, f7
  3066. LFPDUX A4, AO, INC2
  3067. fxcpmadd f0, B2, A5, f0
  3068. LFPDUX B1, BO, INC2
  3069. fxcsmadd f4, B2, A5, f4
  3070. LFPDUX A5, AO, INC2
  3071. fxcpmadd f1, B2, A6, f1
  3072. nop
  3073. fxcsmadd f5, B2, A6, f5
  3074. LFPDUX A6, AO, INC2
  3075. fxcpmadd f2, B2, A7, f2
  3076. nop
  3077. fxcsmadd f6, B2, A7, f6
  3078. LFPDUX A7, AO, INC2
  3079. fxcpmadd f3, B2, A8, f3
  3080. nop
  3081. fxcsmadd f7, B2, A8, f7
  3082. LFPDUX A8, AO, INC2
  3083. fxcpmadd f0, B3, A1, f0
  3084. LFPDUX B2, BO, INC2
  3085. fxcsmadd f4, B3, A1, f4
  3086. LFPDUX A1, AO, INC2
  3087. fxcpmadd f1, B3, A2, f1
  3088. nop
  3089. fxcsmadd f5, B3, A2, f5
  3090. LFPDUX A2, AO, INC2
  3091. fxcpmadd f2, B3, A3, f2
  3092. nop
  3093. fxcsmadd f6, B3, A3, f6
  3094. LFPDUX A3, AO, INC2
  3095. fxcpmadd f3, B3, A4, f3
  3096. nop
  3097. fxcsmadd f7, B3, A4, f7
  3098. LFPDUX A4, AO, INC2
  3099. fxcpmadd f0, B4, A5, f0
  3100. LFPDUX B3, BO, INC2
  3101. fxcsmadd f4, B4, A5, f4
  3102. LFPDUX A5, AO, INC2
  3103. fxcpmadd f1, B4, A6, f1
  3104. nop
  3105. fxcsmadd f5, B4, A6, f5
  3106. LFPDUX A6, AO, INC2
  3107. fxcpmadd f2, B4, A7, f2
  3108. nop
  3109. fxcsmadd f6, B4, A7, f6
  3110. LFPDUX A7, AO, INC2
  3111. fxcpmadd f3, B4, A8, f3
  3112. nop
  3113. fxcsmadd f7, B4, A8, f7
  3114. LFPDUX A8, AO, INC2
  3115. bdnz+ .L52
  3116. .align 4
  3117. .L53:
  /* Drain: the first two groups still load B4 and the second set of A1..A8;
     the last two groups (B3, B4) only consume what is already loaded. */
  3118. fxcpmadd f0, B1, A1, f0
  3119. LFPDUX B4, BO, INC2
  3120. fxcsmadd f4, B1, A1, f4
  3121. LFPDUX A1, AO, INC2
  3122. fxcpmadd f1, B1, A2, f1
  3123. nop
  3124. fxcsmadd f5, B1, A2, f5
  3125. LFPDUX A2, AO, INC2
  3126. fxcpmadd f2, B1, A3, f2
  3127. nop
  3128. fxcsmadd f6, B1, A3, f6
  3129. LFPDUX A3, AO, INC2
  3130. fxcpmadd f3, B1, A4, f3
  3131. nop
  3132. fxcsmadd f7, B1, A4, f7
  3133. LFPDUX A4, AO, INC2
  3134. fxcpmadd f0, B2, A5, f0
  3135. nop
  3136. fxcsmadd f4, B2, A5, f4
  3137. LFPDUX A5, AO, INC2
  3138. fxcpmadd f1, B2, A6, f1
  3139. nop
  3140. fxcsmadd f5, B2, A6, f5
  3141. LFPDUX A6, AO, INC2
  3142. fxcpmadd f2, B2, A7, f2
  3143. nop
  3144. fxcsmadd f6, B2, A7, f6
  3145. LFPDUX A7, AO, INC2
  3146. fxcpmadd f3, B2, A8, f3
  3147. nop
  3148. fxcsmadd f7, B2, A8, f7
  3149. LFPDUX A8, AO, INC2
  3150. fxcpmadd f0, B3, A1, f0
  3151. fxcsmadd f4, B3, A1, f4
  3152. fxcpmadd f1, B3, A2, f1
  3153. fxcsmadd f5, B3, A2, f5
  3154. fxcpmadd f2, B3, A3, f2
  3155. fxcsmadd f6, B3, A3, f6
  3156. fxcpmadd f3, B3, A4, f3
  3157. fxcsmadd f7, B3, A4, f7
  3158. fxcpmadd f0, B4, A5, f0
  3159. fxcsmadd f4, B4, A5, f4
  3160. fxcpmadd f1, B4, A6, f1
  3161. fxcsmadd f5, B4, A6, f5
  3162. fxcpmadd f2, B4, A7, f2
  3163. fxcsmadd f6, B4, A7, f6
  3164. fxcpmadd f3, B4, A8, f3
  3165. fxcsmadd f7, B4, A8, f7
  3166. .align 4
  3167. .L54:
  /* Remainder: low two bits of the step count (KK for LT/RN, TEMP = K - KK
     otherwise); nothing to do when zero. */
  3168. #if defined(LT) || defined(RN)
  3169. andi. r0, KK, 3
  3170. mtspr CTR, r0
  3171. ble+ .L58
  3172. #else
  3173. andi. r0, TEMP, 3
  3174. mtspr CTR, r0
  3175. ble+ .L58
  3176. #endif
  3177. LFPDUX A1, AO, INC2
  3178. LFPDUX B1, BO, INC2
  3179. LFPDUX A2, AO, INC2
  3180. LFPDUX A3, AO, INC2
  3181. LFPDUX A4, AO, INC2
  3182. bdz- .L57
  3183. .align 4
  3184. .L56:
  /* One group (one B operand, four A operands) per iteration. */
  3185. fxcpmadd f0, B1, A1, f0
  3186. fxcsmadd f4, B1, A1, f4
  3187. LFPDUX A1, AO, INC2
  3188. fxcpmadd f1, B1, A2, f1
  3189. fxcsmadd f5, B1, A2, f5
  3190. LFPDUX A2, AO, INC2
  3191. fxcpmadd f2, B1, A3, f2
  3192. fxcsmadd f6, B1, A3, f6
  3193. LFPDUX A3, AO, INC2
  3194. fxcpmadd f3, B1, A4, f3
  3195. fxcsmadd f7, B1, A4, f7
  3196. LFPDUX A4, AO, INC2
  3197. LFPDUX B1, BO, INC2
  3198. bdnz+ .L56
  3199. .align 4
  3200. .L57:
  /* Final remainder step using the operands already loaded. */
  3201. fxcpmadd f0, B1, A1, f0
  3202. fxcsmadd f4, B1, A1, f4
  3203. fxcpmadd f1, B1, A2, f1
  3204. fxcsmadd f5, B1, A2, f5
  3205. fxcpmadd f2, B1, A3, f2
  3206. fxcsmadd f6, B1, A3, f6
  3207. fxcpmadd f3, B1, A4, f3
  3208. fxcsmadd f7, B1, A4, f7
  3209. .align 4
  /* .L58: finish the full-width tile accumulated by .L52-.L57: subtract the
     sums in f0..f7 from the stored right-hand side, run the triangular
     solve for the active variant, store to the packed buffer and to C, then
     advance pointers/KK and loop back to .L51 while I > 0. */
  3210. .L58:
  3211. #if defined(LN) || defined(RT)
  /* LN steps back by 8, RT by 2; AO and BO are re-derived from AORIG and B. */
  3212. #ifdef LN
  3213. subi r0, KK, 8
  3214. #else
  3215. subi r0, KK, 2
  3216. #endif
  3217. slwi TEMP, r0, 3 + BASE_SHIFT
  3218. slwi r0, r0, 1 + BASE_SHIFT
  3219. add AO, AORIG, TEMP
  3220. add BO, B, r0
  3221. addi BO, BO, - 2 * SIZE
  3222. #endif
  3223. #if defined(LN) || defined(LT)
  /* LN/LT: exchange halves between f0..f3 and f4..f7 via the f24..f27
     copies. After this the registers pair with the BO data in the order
     f0, f4, f1, f5, f2, f6, f3, f7, as the fpsub operands below show. */
  3224. fpmr f24, f0
  3225. fpmr f25, f1
  3226. fpmr f26, f2
  3227. fpmr f27, f3
  3228. fsmfp f0, f4
  3229. fsmfp f1, f5
  3230. fsmfp f2, f6
  3231. fsmfp f3, f7
  3232. fsmtp f4, f24
  3233. fsmtp f5, f25
  3234. fsmtp f6, f26
  3235. fsmtp f7, f27
  3236. LFPDUX f16, BO, INC2
  3237. LFPDUX f17, BO, INC2
  3238. LFPDUX f18, BO, INC2
  3239. LFPDUX f19, BO, INC2
  3240. LFPDUX f20, BO, INC2
  3241. LFPDUX f21, BO, INC2
  3242. LFPDUX f22, BO, INC2
  3243. LFPDUX f23, BO, INC2
  3244. subi BO, BO, 16 * SIZE
  3245. fpsub f0, f16, f0
  3246. fpsub f4, f17, f4
  3247. fpsub f1, f18, f1
  3248. fpsub f5, f19, f5
  3249. fpsub f2, f20, f2
  3250. fpsub f6, f21, f6
  3251. fpsub f3, f22, f3
  3252. fpsub f7, f23, f7
  3253. #else
  /* RN/RT: right-hand side is read from AO in plain f0..f7 order. */
  3254. LFPDUX f16, AO, INC2
  3255. LFPDUX f17, AO, INC2
  3256. LFPDUX f18, AO, INC2
  3257. LFPDUX f19, AO, INC2
  3258. LFPDUX f20, AO, INC2
  3259. LFPDUX f21, AO, INC2
  3260. LFPDUX f22, AO, INC2
  3261. LFPDUX f23, AO, INC2
  3262. subi AO, AO, 16 * SIZE
  3263. fpsub f0, f16, f0
  3264. fpsub f1, f17, f1
  3265. fpsub f2, f18, f2
  3266. fpsub f3, f19, f3
  3267. fpsub f4, f20, f4
  3268. fpsub f5, f21, f5
  3269. fpsub f6, f22, f6
  3270. fpsub f7, f23, f7
  3271. #endif
  /* LN: back substitution in the order f7, f3, f6, f2, f5, f1, f4, f0.
     Coefficients are read in three batches by walking AO downward with
     INCM2 from AO + 66*SIZE; bare add/subi steps skip unused entries. */
  3272. #ifdef LN
  3273. addi AO, AO, 66 * SIZE
  3274. LFPDUX A1, AO, INCM2
  3275. LFPDUX A2, AO, INCM2
  3276. LFPDUX A3, AO, INCM2
  3277. LFPDUX A4, AO, INCM2
  3278. LFPDUX A5, AO, INCM2
  3279. LFPDUX A6, AO, INCM2
  3280. LFPDUX A7, AO, INCM2
  3281. LFPDUX A8, AO, INCM2
  /* Solve f7, eliminate it everywhere; then solve f3 and eliminate it. */
  3282. fxsmul f7, A1, f7
  3283. fxcpnmsub f3, A1, f7, f3
  3284. fxcsnmsub f6, A2, f7, f6
  3285. fxcpnmsub f2, A2, f7, f2
  3286. fxcsnmsub f5, A3, f7, f5
  3287. fxcpnmsub f1, A3, f7, f1
  3288. fxcsnmsub f4, A4, f7, f4
  3289. fxcpnmsub f0, A4, f7, f0
  3290. fxpmul f3, A5, f3
  3291. fxcsnmsub f6, A6, f3, f6
  3292. fxcpnmsub f2, A6, f3, f2
  3293. fxcsnmsub f5, A7, f3, f5
  3294. fxcpnmsub f1, A7, f3, f1
  3295. fxcsnmsub f4, A8, f3, f4
  3296. fxcpnmsub f0, A8, f3, f0
  3297. add AO, AO, INCM2
  3298. LFPDUX A1, AO, INCM2
  3299. LFPDUX A2, AO, INCM2
  3300. LFPDUX A3, AO, INCM2
  3301. add AO, AO, INCM2
  3302. LFPDUX A4, AO, INCM2
  3303. LFPDUX A5, AO, INCM2
  3304. LFPDUX A6, AO, INCM2
  3305. add AO, AO, INCM2
  3306. add AO, AO, INCM2
  3307. LFPDUX A7, AO, INCM2
  3308. LFPDUX A8, AO, INCM2
  /* Solve f6, then f2, then f5, eliminating each from the remaining
     registers. */
  3309. fxsmul f6, A1, f6
  3310. fxcpnmsub f2, A1, f6, f2
  3311. fxcsnmsub f5, A2, f6, f5
  3312. fxcpnmsub f1, A2, f6, f1
  3313. fxcsnmsub f4, A3, f6, f4
  3314. fxcpnmsub f0, A3, f6, f0
  3315. fxpmul f2, A4, f2
  3316. fxcsnmsub f5, A5, f2, f5
  3317. fxcpnmsub f1, A5, f2, f1
  3318. fxcsnmsub f4, A6, f2, f4
  3319. fxcpnmsub f0, A6, f2, f0
  3320. fxsmul f5, A7, f5
  3321. fxcpnmsub f1, A7, f5, f1
  3322. fxcsnmsub f4, A8, f5, f4
  3323. fxcpnmsub f0, A8, f5, f0
  3324. add AO, AO, INCM2
  3325. add AO, AO, INCM2
  3326. LFPDUX A1, AO, INCM2
  3327. LFPDUX A2, AO, INCM2
  3328. subi AO, AO, 6 * SIZE
  3329. LFPDUX A3, AO, INCM2
  3330. subi AO, AO, 6 * SIZE
  3331. LFPDUX A4, AO, INCM2
  3332. addi AO, AO, -2 * SIZE
  /* Solve f1, then f4, then f0. */
  3333. fxpmul f1, A1, f1
  3334. fxcsnmsub f4, A2, f1, f4
  3335. fxcpnmsub f0, A2, f1, f0
  3336. fxsmul f4, A3, f4
  3337. fxcpnmsub f0, A3, f4, f0
  3338. fxpmul f0, A4, f0
  3339. #endif
  /* LT: forward substitution in the order f0, f4, f1, f5, f2, f6, f3, f7,
     walking AO upward with INC2 in three batches and rewinding it by
     64*SIZE before the last batch is used. */
  3340. #ifdef LT
  3341. LFPDUX A1, AO, INC2
  3342. LFPDUX A2, AO, INC2
  3343. LFPDUX A3, AO, INC2
  3344. LFPDUX A4, AO, INC2
  3345. LFPDUX A5, AO, INC2
  3346. LFPDUX A6, AO, INC2
  3347. LFPDUX A7, AO, INC2
  3348. LFPDUX A8, AO, INC2
  /* Solve f0, eliminate it everywhere; then solve f4 and eliminate it. */
  3349. fxpmul f0, A1, f0
  3350. fxcsnmsub f4, A1, f0, f4
  3351. fxcpnmsub f1, A2, f0, f1
  3352. fxcsnmsub f5, A2, f0, f5
  3353. fxcpnmsub f2, A3, f0, f2
  3354. fxcsnmsub f6, A3, f0, f6
  3355. fxcpnmsub f3, A4, f0, f3
  3356. fxcsnmsub f7, A4, f0, f7
  3357. fxsmul f4, A5, f4
  3358. fxcpnmsub f1, A6, f4, f1
  3359. fxcsnmsub f5, A6, f4, f5
  3360. fxcpnmsub f2, A7, f4, f2
  3361. fxcsnmsub f6, A7, f4, f6
  3362. fxcpnmsub f3, A8, f4, f3
  3363. fxcsnmsub f7, A8, f4, f7
  3364. add AO, AO, INC2
  3365. LFPDUX A1, AO, INC2
  3366. LFPDUX A2, AO, INC2
  3367. LFPDUX A3, AO, INC2
  3368. add AO, AO, INC2
  3369. LFPDUX A4, AO, INC2
  3370. LFPDUX A5, AO, INC2
  3371. LFPDUX A6, AO, INC2
  3372. add AO, AO, INC2
  3373. add AO, AO, INC2
  3374. LFPDUX A7, AO, INC2
  3375. LFPDUX A8, AO, INC2
  /* Solve f1, then f5, then f2, eliminating each from the remaining
     registers. */
  3376. fxpmul f1, A1, f1
  3377. fxcsnmsub f5, A1, f1, f5
  3378. fxcpnmsub f2, A2, f1, f2
  3379. fxcsnmsub f6, A2, f1, f6
  3380. fxcpnmsub f3, A3, f1, f3
  3381. fxcsnmsub f7, A3, f1, f7
  3382. fxsmul f5, A4, f5
  3383. fxcpnmsub f2, A5, f5, f2
  3384. fxcsnmsub f6, A5, f5, f6
  3385. fxcpnmsub f3, A6, f5, f3
  3386. fxcsnmsub f7, A6, f5, f7
  3387. fxpmul f2, A7, f2
  3388. fxcsnmsub f6, A7, f2, f6
  3389. fxcpnmsub f3, A8, f2, f3
  3390. fxcsnmsub f7, A8, f2, f7
  3391. add AO, AO, INC2
  3392. add AO, AO, INC2
  3393. LFPDUX A1, AO, INC2
  3394. LFPDUX A2, AO, INC2
  3395. addi AO, AO, 6 * SIZE
  3396. LFPDUX A3, AO, INC2
  3397. addi AO, AO, 6 * SIZE
  3398. LFPDUX A4, AO, INC2
  3399. subi AO, AO, 64 * SIZE
  /* Solve f6, then f3, then f7. */
  3400. fxsmul f6, A1, f6
  3401. fxcpnmsub f3, A2, f6, f3
  3402. fxcsnmsub f7, A2, f6, f7
  3403. fxpmul f3, A3, f3
  3404. fxcsnmsub f7, A3, f3, f7
  3405. fxsmul f7, A4, f7
  3406. #endif
  /* RN: scale f0..f3, eliminate into f4..f7, then scale f4..f7;
     coefficients come from BO. */
  3407. #ifdef RN
  3408. LFPDUX A1, BO, INC2
  3409. LFPDUX A2, BO, INC2
  3410. subi BO, BO, 4 * SIZE
  3411. fxpmul f0, A1, f0
  3412. fxpmul f1, A1, f1
  3413. fxpmul f2, A1, f2
  3414. fxpmul f3, A1, f3
  3415. fxcsnmsub f4, A1, f0, f4
  3416. fxcsnmsub f5, A1, f1, f5
  3417. fxcsnmsub f6, A1, f2, f6
  3418. fxcsnmsub f7, A1, f3, f7
  3419. fxsmul f4, A2, f4
  3420. fxsmul f5, A2, f5
  3421. fxsmul f6, A2, f6
  3422. fxsmul f7, A2, f7
  3423. #endif
  /* RT: scale f4..f7, eliminate into f0..f3, then scale f0..f3. */
  3424. #ifdef RT
  3425. LFPDUX A2, BO, INC2
  3426. LFPDUX A1, BO, INC2
  3427. subi BO, BO, 4 * SIZE
  3428. fxsmul f4, A1, f4
  3429. fxsmul f5, A1, f5
  3430. fxsmul f6, A1, f6
  3431. fxsmul f7, A1, f7
  3432. fxcpnmsub f0, A1, f4, f0
  3433. fxcpnmsub f1, A1, f5, f1
  3434. fxcpnmsub f2, A1, f6, f2
  3435. fxcpnmsub f3, A1, f7, f3
  3436. fxpmul f0, A2, f0
  3437. fxpmul f1, A2, f1
  3438. fxpmul f2, A2, f2
  3439. fxpmul f3, A2, f3
  3440. #endif
  /* LN steps the C pointers back by eight elements before and after the
     store. */
  3441. #ifdef LN
  3442. subi CO1, CO1, 8 * SIZE
  3443. subi CO2, CO2, 8 * SIZE
  3444. #endif
  3445. #if defined(LN) || defined(LT)
  /* LN/LT store order f0, f4, f1, f5, f2, f6, f3, f7: STFDUX writes to CO1,
     STFSDUX to CO2. */
  3446. STFPDUX f0, BO, INC2
  3447. STFPDUX f4, BO, INC2
  3448. STFPDUX f1, BO, INC2
  3449. STFPDUX f5, BO, INC2
  3450. STFPDUX f2, BO, INC2
  3451. STFPDUX f6, BO, INC2
  3452. STFPDUX f3, BO, INC2
  3453. STFPDUX f7, BO, INC2
  3454. subi BO, BO, 16 * SIZE
  3455. STFDUX f0, CO1, INC
  3456. STFDUX f4, CO1, INC
  3457. STFDUX f1, CO1, INC
  3458. STFDUX f5, CO1, INC
  3459. STFDUX f2, CO1, INC
  3460. STFDUX f6, CO1, INC
  3461. STFDUX f3, CO1, INC
  3462. STFDUX f7, CO1, INC
  3463. STFSDUX f0, CO2, INC
  3464. STFSDUX f4, CO2, INC
  3465. STFSDUX f1, CO2, INC
  3466. STFSDUX f5, CO2, INC
  3467. STFSDUX f2, CO2, INC
  3468. STFSDUX f6, CO2, INC
  3469. STFSDUX f3, CO2, INC
  3470. STFSDUX f7, CO2, INC
  3471. #else
  /* RN/RT store order: both halves of f0..f3 go to CO1, both halves of
     f4..f7 to CO2. */
  3472. STFPDUX f0, AO, INC2
  3473. STFPDUX f1, AO, INC2
  3474. STFPDUX f2, AO, INC2
  3475. STFPDUX f3, AO, INC2
  3476. STFPDUX f4, AO, INC2
  3477. STFPDUX f5, AO, INC2
  3478. STFPDUX f6, AO, INC2
  3479. STFPDUX f7, AO, INC2
  3480. subi AO, AO, 16 * SIZE
  3481. STFDUX f0, CO1, INC
  3482. STFSDUX f0, CO1, INC
  3483. STFDUX f1, CO1, INC
  3484. STFSDUX f1, CO1, INC
  3485. STFDUX f2, CO1, INC
  3486. STFSDUX f2, CO1, INC
  3487. STFDUX f3, CO1, INC
  3488. STFSDUX f3, CO1, INC
  3489. STFDUX f4, CO2, INC
  3490. STFSDUX f4, CO2, INC
  3491. STFDUX f5, CO2, INC
  3492. STFSDUX f5, CO2, INC
  3493. STFDUX f6, CO2, INC
  3494. STFSDUX f6, CO2, INC
  3495. STFDUX f7, CO2, INC
  3496. STFSDUX f7, CO2, INC
  3497. #endif
  3498. #ifdef LN
  3499. subi CO1, CO1, 8 * SIZE
  3500. subi CO2, CO2, 8 * SIZE
  3501. #endif
  3502. #ifdef RT
  3503. slwi r0, K, 3 + BASE_SHIFT
  3504. add AORIG, AORIG, r0
  3505. #endif
  /* LT/RN: skip the remaining K-KK steps in both AO and BO. */
  3506. #if defined(LT) || defined(RN)
  3507. sub TEMP, K, KK
  3508. slwi r0, TEMP, 3 + BASE_SHIFT
  3509. slwi TEMP, TEMP, 1 + BASE_SHIFT
  3510. add AO, AO, r0
  3511. add BO, BO, TEMP
  3512. #endif
  3513. #ifdef LT
  3514. addi KK, KK, 8
  3515. #endif
  3516. #ifdef LN
  3517. subi KK, KK, 8
  3518. #endif
  /* Decrement the tile counter (sets CR0 for the bgt below), reload f0 from
     the FZERO stack slot, and loop while tiles remain. */
  3519. addic. I, I, -1
  3520. li r0, FZERO
  3521. lfpsx f0, SP, r0
  3522. bgt+ .L51
  3523. .align 4
  /* .L89: end of the two-column panel. Advance B past the panel and update
     KK for the right-side variants. */
  3524. .L89:
  /* LN: B moves forward by K << (1 + BASE_SHIFT) bytes. */
  3525. #ifdef LN
  3526. slwi r0, K, 1 + BASE_SHIFT
  3527. add B, B, r0
  3528. #endif
  /* LT/RN: B continues from where BO stopped (undoing the -2*SIZE bias BO
     is given at the start of each tile). */
  3529. #if defined(LT) || defined(RN)
  3530. addi B, BO, 2 * SIZE
  3531. #endif
  3532. #ifdef RN
  3533. addi KK, KK, 2
  3534. #endif
  3535. #ifdef RT
  3536. subi KK, KK, 2
  3537. #endif
  3538. .align 4
  3539. .L90:
  3540. andi. J, N, 1
  3541. beq .L999
  3542. #ifdef RT
  3543. slwi r0, K, 0 + BASE_SHIFT
  3544. sub B, B, r0
  3545. sub C, C, LDC
  3546. #endif
  3547. mr CO1, C
  3548. #ifdef LN
  3549. add KK, M, OFFSET
  3550. #endif
  3551. #ifdef LT
  3552. mr KK, OFFSET
  3553. #endif
  3554. #if defined(LN) || defined(RT)
  3555. addi AORIG, A, -2 * SIZE
  3556. #else
  3557. addi AO, A, -2 * SIZE
  3558. #endif
  3559. #ifndef RT
  3560. add C, CO1, LDC
  3561. #endif
  3562. li r0, FZERO
  3563. lfpsx f0, SP, r0
  3564. andi. I, M, 1
  3565. beq .L100
  3566. #if defined(LT) || defined(RN)
  3567. addi BO, B, - 2 * SIZE
  3568. fpmr f1, f0
  3569. fpmr f2, f0
  3570. fpmr f3, f0
  3571. srawi. r0, KK, 3
  3572. mtspr CTR, r0
  3573. ble .L124
  3574. #else
  3575. #ifdef LN
  3576. slwi r0, K, 0 + BASE_SHIFT
  3577. sub AORIG, AORIG, r0
  3578. #endif
  3579. slwi r0 , KK, 0 + BASE_SHIFT
  3580. slwi TEMP, KK, 0 + BASE_SHIFT
  3581. add AO, AORIG, r0
  3582. add BO, B, TEMP
  3583. sub TEMP, K, KK
  3584. addi BO, BO, - 2 * SIZE
  3585. fpmr f1, f0
  3586. fpmr f2, f0
  3587. fpmr f3, f0
  3588. srawi. r0, TEMP, 3
  3589. mtspr CTR, r0
  3590. ble .L124
  3591. #endif
  3592. LFPDUX A1, AO, INC2
  3593. LFPDUX B1, BO, INC2
  3594. LFPDUX A2, AO, INC2
  3595. LFPDUX B2, BO, INC2
  3596. LFPDUX A3, AO, INC2
  3597. LFPDUX B3, BO, INC2
  3598. LFPDUX A4, AO, INC2
  3599. LFPDUX B4, BO, INC2
  3600. bdz- .L123
  /* .L122: unrolled main loop of the single-row tile.  Each pass issues four
     fpmadd operations into the accumulators f0..f3 and reloads A1..A4 and
     B1..B4 for the next pass; bdnz repeats while the pre-loaded CTR is non-zero. */
  3601. .align 4
  3602. .L122:
  3603. fpmadd f0, A1, B1, f0
  3604. LFPDUX A1, AO, INC2
  3605. LFPDUX B1, BO, INC2
  3606. fpmadd f1, A2, B2, f1
  3607. LFPDUX A2, AO, INC2
  3608. LFPDUX B2, BO, INC2
  3609. fpmadd f2, A3, B3, f2
  3610. LFPDUX A3, AO, INC2
  3611. LFPDUX B3, BO, INC2
  3612. fpmadd f3, A4, B4, f3
  3613. LFPDUX A4, AO, INC2
  3614. LFPDUX B4, BO, INC2
  3615. bdnz+ .L122
  /* .L123: drain -- consume the operands already loaded, with no further loads. */
  3616. .align 4
  3617. .L123:
  3618. fpmadd f0, A1, B1, f0
  3619. fpmadd f1, A2, B2, f1
  3620. fpmadd f2, A3, B3, f2
  3621. fpmadd f3, A4, B4, f3
  /* .L124: remainder of the single-row tile: the low three bits of the trip
     count (KK for LT/RN, TEMP otherwise) are processed one step at a time.
     The andi. result is never negative, so "ble" here means "branch if zero". */
  3622. .align 4
  3623. .L124:
  3624. #if defined(LT) || defined(RN)
  3625. andi. r0, KK, 7
  3626. mtspr CTR, r0
  3627. ble+ .L128
  3628. #else
  3629. andi. r0, TEMP, 7
  3630. mtspr CTR, r0
  3631. ble+ .L128
  3632. #endif
  /* Load one A and one B operand at offset INC2 (no update form), then advance
     both pointers by INC explicitly. */
  3633. LFDX A1, AO, INC2
  3634. LFDX B1, BO, INC2
  3635. add AO, AO, INC
  3636. add BO, BO, INC
  3637. bdz- .L127
  /* .L126: scalar multiply-add per step, with the next operands loaded behind it. */
  3638. .align 4
  3639. .L126:
  3640. fmadd f0, A1, B1, f0
  3641. LFDX A1, AO, INC2
  3642. LFDX B1, BO, INC2
  3643. add AO, AO, INC
  3644. add BO, BO, INC
  3645. bdnz+ .L126
  /* .L127: final multiply-add for the operands already loaded. */
  3646. .align 4
  3647. .L127:
  3648. fmadd f0, A1, B1, f0
  /* .L128: finish the single-row tile: reduce the accumulators to one scalar,
     subtract it from the stored right-hand side, scale by one coefficient,
     write the result back to the packed buffer and to C, then update pointers. */
  3649. .align 4
  3650. .L128:
  /* Fold f1..f3 into f0, then add the secondary half of f0 to its primary
     half (fsmtp copies secondary -> primary of f1). */
  3651. fpadd f0, f0, f1
  3652. fpadd f2, f2, f3
  3653. fpadd f0, f0, f2
  3654. fsmtp f1, f0
  3655. fadd f0, f0, f1
  3656. #if defined(LN) || defined(RT)
  /* LN/RT: reposition AO and BO at element (KK - 1); both branches of the
     inner #ifdef are identical for this tile size. */
  3657. #ifdef LN
  3658. subi r0, KK, 1
  3659. #else
  3660. subi r0, KK, 1
  3661. #endif
  3662. slwi TEMP, r0, 0 + BASE_SHIFT
  3663. slwi r0, r0, 0 + BASE_SHIFT
  3664. add AO, AORIG, TEMP
  3665. add BO, B, r0
  3666. addi BO, BO, - 2 * SIZE
  3667. #endif
  /* f0 = stored value - accumulated sum (value read from BO for LN/LT,
     from AO for RN/RT). */
  3668. #if defined(LN) || defined(LT)
  3669. LFDX f16, BO, INC2
  3670. fsub f0, f16, f0
  3671. #else
  3672. LFDX f16, AO, INC2
  3673. fsub f0, f16, f0
  3674. #endif
  /* Scale by a single coefficient (from AO for LN/LT, from BO for RN/RT).
     A multiply is used, so the stored coefficient is presumably the inverted
     diagonal -- TODO confirm against the packing routine. */
  3675. #ifdef LN
  3676. LFD A1, (2 + 0) * SIZE(AO)
  3677. fmul f0, A1, f0
  3678. #endif
  3679. #ifdef LT
  3680. LFD A1, (2 + 0) * SIZE(AO)
  3681. fmul f0, A1, f0
  3682. #endif
  3683. #ifdef RN
  3684. LFDX A1, BO, INC2
  3685. fmul f0, A1, f0
  3686. #endif
  3687. #ifdef RT
  3688. LFDX A1, BO, INC2
  3689. fmul f0, A1, f0
  3690. #endif
  3691. #ifdef LN
  /* LN: step CO1 back by one element before the store ... */
  3692. subi CO1, CO1, 1 * SIZE
  3693. #endif
  /* Write the result back to the packed buffer and to C via CO1 (the STFDUX
     macro is defined outside this chunk; presumably store-with-update by INC). */
  3694. #if defined(LN) || defined(LT)
  3695. STFDX f0, BO, INC2
  3696. STFDUX f0, CO1, INC
  3697. #else
  3698. STFDX f0, AO, INC2
  3699. STFDUX f0, CO1, INC
  3700. #endif
  3701. #ifdef LN
  /* ... and again after it */
  3702. subi CO1, CO1, 1 * SIZE
  3703. #endif
  3704. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT */
  3705. slwi r0, K, 0 + BASE_SHIFT
  3706. add AORIG, AORIG, r0
  3707. #endif
  3708. #if defined(LT) || defined(RN)
  /* LT/RN: advance AO and BO by (K - KK) << BASE_SHIFT each */
  3709. sub TEMP, K, KK
  3710. slwi r0, TEMP, 0 + BASE_SHIFT
  3711. slwi TEMP, TEMP, 0 + BASE_SHIFT
  3712. add AO, AO, r0
  3713. add BO, BO, TEMP
  3714. #endif
  3715. #ifdef LT
  3716. addi KK, KK, 1
  3717. #endif
  3718. #ifdef LN
  3719. subi KK, KK, 1
  3720. #endif
  /* Reload f0 from SP + FZERO for the next tile. */
  3721. li r0, FZERO
  3722. lfpsx f0, SP, r0
  /* .L100: two-row tile, taken only when M & 2 is set (otherwise go to .L110).
     Sets up AO/BO and the trip count, copies f0 into f1..f3 and preloads
     operands for the unrolled loop at .L112. */
  3723. .align 4
  3724. .L100:
  3725. andi. I, M, 2
  3726. beq .L110
  3727. #if defined(LT) || defined(RN)
  /* LT/RN: BO = B - 2 * SIZE; CTR = KK >> 3; skip the loop if that is <= 0. */
  3728. addi BO, B, - 2 * SIZE
  3729. fpmr f1, f0
  3730. fpmr f2, f0
  3731. fpmr f3, f0
  3732. srawi. r0, KK, 3
  3733. mtspr CTR, r0
  3734. ble .L114
  3735. #else
  3736. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT) */
  3737. slwi r0, K, 1 + BASE_SHIFT
  3738. sub AORIG, AORIG, r0
  3739. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)),
            BO = B + (KK << BASE_SHIFT) - 2 * SIZE,
            TEMP = K - KK; CTR = TEMP >> 3. */
  3740. slwi r0 , KK, 1 + BASE_SHIFT
  3741. slwi TEMP, KK, 0 + BASE_SHIFT
  3742. add AO, AORIG, r0
  3743. add BO, B, TEMP
  3744. sub TEMP, K, KK
  3745. addi BO, BO, - 2 * SIZE
  3746. fpmr f1, f0
  3747. fpmr f2, f0
  3748. fpmr f3, f0
  3749. srawi. r0, TEMP, 3
  3750. mtspr CTR, r0
  3751. ble .L114
  3752. #endif
  /* Preload eight A operands and four B operands (two A loads per B load). */
  3753. LFPDUX A1, AO, INC2
  3754. LFPDUX A2, AO, INC2
  3755. LFPDUX B1, BO, INC2
  3756. LFPDUX A3, AO, INC2
  3757. LFPDUX A4, AO, INC2
  3758. LFPDUX B2, BO, INC2
  3759. LFPDUX A5, AO, INC2
  3760. LFPDUX A6, AO, INC2
  3761. LFPDUX B3, BO, INC2
  3762. LFPDUX A7, AO, INC2
  3763. LFPDUX A8, AO, INC2
  3764. LFPDUX B4, BO, INC2
  /* Decrement CTR; if zero, only the drain at .L113 runs. */
  3765. bdz- .L113
  /* .L112: unrolled main loop of the two-row tile.  Each B register feeds two
     multiply-adds: fxcpmadd uses its primary half and fxcsmadd its secondary
     half as the scalar (per the double-FPU ISA), each against one A pair.
     Operands are reloaded behind their last use; bdnz repeats on CTR. */
  3766. .align 4
  3767. .L112:
  3768. fxcpmadd f0, B1, A1, f0
  3769. LFPDUX A1, AO, INC2
  3770. fxcsmadd f1, B1, A2, f1
  3771. LFPDUX A2, AO, INC2
  3772. LFPDUX B1, BO, INC2
  3773. fxcpmadd f2, B2, A3, f2
  3774. LFPDUX A3, AO, INC2
  3775. fxcsmadd f3, B2, A4, f3
  3776. LFPDUX A4, AO, INC2
  3777. LFPDUX B2, BO, INC2
  3778. fxcpmadd f0, B3, A5, f0
  3779. LFPDUX A5, AO, INC2
  3780. fxcsmadd f1, B3, A6, f1
  3781. LFPDUX A6, AO, INC2
  3782. LFPDUX B3, BO, INC2
  3783. fxcpmadd f2, B4, A7, f2
  3784. LFPDUX A7, AO, INC2
  3785. fxcsmadd f3, B4, A8, f3
  3786. LFPDUX A8, AO, INC2
  3787. LFPDUX B4, BO, INC2
  3788. bdnz+ .L112
  /* .L113: drain -- the same eight multiply-adds without any further loads. */
  3789. .align 4
  3790. .L113:
  3791. fxcpmadd f0, B1, A1, f0
  3792. fxcsmadd f1, B1, A2, f1
  3793. fxcpmadd f2, B2, A3, f2
  3794. fxcsmadd f3, B2, A4, f3
  3795. fxcpmadd f0, B3, A5, f0
  3796. fxcsmadd f1, B3, A6, f1
  3797. fxcpmadd f2, B4, A7, f2
  3798. fxcsmadd f3, B4, A8, f3
  /* .L114: remainder of the two-row tile: the low three bits of the trip count
     (KK for LT/RN, TEMP otherwise), one step per pass.  The andi. result is
     never negative, so "ble" acts as "branch if zero". */
  3799. .align 4
  3800. .L114:
  3801. #if defined(LT) || defined(RN)
  3802. andi. r0, KK, 7
  3803. mtspr CTR, r0
  3804. ble+ .L118
  3805. #else
  3806. andi. r0, TEMP, 7
  3807. mtspr CTR, r0
  3808. ble+ .L118
  3809. #endif
  /* One A pair (update form) and one B scalar (non-update form, so BO is
     advanced by INC explicitly). */
  3810. LFPDUX A1, AO, INC2
  3811. LFDX B1, BO, INC2
  3812. add BO, BO, INC
  3813. bdz- .L117
  /* .L116: one multiply-add per step into f0, next operands loaded behind it. */
  3814. .align 4
  3815. .L116:
  3816. fxcpmadd f0, B1, A1, f0
  3817. LFPDUX A1, AO, INC2
  3818. LFDX B1, BO, INC2
  3819. add BO, BO, INC
  3820. bdnz+ .L116
  /* .L117: final multiply-add for the operands already loaded. */
  3821. .align 4
  3822. .L117:
  3823. fxcpmadd f0, B1, A1, f0
  /* .L118: finish the two-row tile: fold the four accumulators into f0,
     subtract from the stored right-hand side, apply the 2x2 triangular solve
     (LN/LT) or a single scale (RN/RT), store, and update pointers. */
  3824. .align 4
  3825. .L118:
  3826. fpadd f0, f0, f1
  3827. fpadd f2, f3, f2
  3828. fpadd f0, f0, f2
  3829. #if defined(LN) || defined(RT)
  /* LN/RT: reposition on element (KK - 2) for LN or (KK - 1) for RT;
     AO is scaled by 1 + BASE_SHIFT, BO by BASE_SHIFT. */
  3830. #ifdef LN
  3831. subi r0, KK, 2
  3832. #else
  3833. subi r0, KK, 1
  3834. #endif
  3835. slwi TEMP, r0, 1 + BASE_SHIFT
  3836. slwi r0, r0, 0 + BASE_SHIFT
  3837. add AO, AORIG, TEMP
  3838. add BO, B, r0
  3839. addi BO, BO, - 2 * SIZE
  3840. #endif
  /* f0 = stored pair - accumulated pair (from BO for LN/LT, AO for RN/RT). */
  3841. #if defined(LN) || defined(LT)
  3842. LFPDX f16, BO, INC2
  3843. fpsub f0, f16, f0
  3844. #else
  3845. LFPDX f16, AO, INC2
  3846. fpsub f0, f16, f0
  3847. #endif
  3848. #ifdef LN
  /* LN: solve from the last unknown upward.  f4 takes the secondary half of
     f0; coefficients come from offsets 3, 2, 0 of the 2x2 block at AO + 2*SIZE.
     fnmsub d, a, c, b computes b - a*c.  fsmfp moves f4 back into the
     secondary half of f0. */
  3849. fsmtp f4, f0
  3850. LFD A1, (2 + 3) * SIZE(AO)
  3851. LFD A2, (2 + 2) * SIZE(AO)
  3852. LFD A3, (2 + 0) * SIZE(AO)
  3853. fmul f4, A1, f4
  3854. fnmsub f0, A2, f4, f0
  3855. fmul f0, A3, f0
  3856. fsmfp f0, f4
  3857. #endif
  3858. #ifdef LT
  /* LT: solve from the first unknown downward, using offsets 0, 1, 3. */
  3859. fsmtp f4, f0
  3860. LFD A1, (2 + 0) * SIZE(AO)
  3861. LFD A2, (2 + 1) * SIZE(AO)
  3862. LFD A3, (2 + 3) * SIZE(AO)
  3863. fmul f0, A1, f0
  3864. fnmsub f4, A2, f0, f4
  3865. fmul f4, A3, f4
  3866. fsmfp f0, f4
  3867. #endif
  /* RN/RT: scale both halves by one coefficient loaded from BO. */
  3868. #ifdef RN
  3869. LFPDX A1, BO, INC2
  3870. fxpmul f0, A1, f0
  3871. #endif
  3872. #ifdef RT
  3873. LFPDX A1, BO, INC2
  3874. fxpmul f0, A1, f0
  3875. #endif
  3876. #ifdef LN
  /* LN: step CO1 back by two elements before the stores ... */
  3877. subi CO1, CO1, 2 * SIZE
  3878. #endif
  /* Write the pair back to the packed buffer, then both halves to C through
     CO1 (STFDUX / STFSDUX macros are defined outside this chunk; presumably
     the primary and secondary halves respectively, each advancing by INC). */
  3879. #if defined(LN) || defined(LT)
  3880. STFPDX f0, BO, INC2
  3881. STFDUX f0, CO1, INC
  3882. STFSDUX f0, CO1, INC
  3883. #else
  3884. STFPDX f0, AO, INC2
  3885. STFDUX f0, CO1, INC
  3886. STFSDUX f0, CO1, INC
  3887. #endif
  3888. #ifdef LN
  /* ... and again after them */
  3889. subi CO1, CO1, 2 * SIZE
  3890. #endif
  3891. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT) */
  3892. slwi r0, K, 1 + BASE_SHIFT
  3893. add AORIG, AORIG, r0
  3894. #endif
  3895. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) << (1 + BASE_SHIFT); BO += (K - KK) << BASE_SHIFT */
  3896. sub TEMP, K, KK
  3897. slwi r0, TEMP, 1 + BASE_SHIFT
  3898. slwi TEMP, TEMP, 0 + BASE_SHIFT
  3899. add AO, AO, r0
  3900. add BO, BO, TEMP
  3901. #endif
  3902. #ifdef LT
  3903. addi KK, KK, 2
  3904. #endif
  3905. #ifdef LN
  3906. subi KK, KK, 2
  3907. #endif
  /* Reload f0 from SP + FZERO for the next tile. */
  3908. li r0, FZERO
  3909. lfpsx f0, SP, r0
  /* .L110: four-row tile, taken only when M & 4 is set (otherwise go to .L120).
     Sets up AO/BO and the trip count, copies f0 into f1..f3 and preloads
     operands for the unrolled loop at .L102. */
  3910. .align 4
  3911. .L110:
  3912. andi. I, M, 4
  3913. beq .L120
  3914. #if defined(LT) || defined(RN)
  /* LT/RN: BO = B - 2 * SIZE; CTR = KK >> 3; skip the loop if that is <= 0. */
  3915. addi BO, B, - 2 * SIZE
  3916. fpmr f1, f0
  3917. fpmr f2, f0
  3918. fpmr f3, f0
  3919. srawi. r0, KK, 3
  3920. mtspr CTR, r0
  3921. ble .L104
  3922. #else
  3923. #ifdef LN
  /* LN: AORIG -= K << (2 + BASE_SHIFT) */
  3924. slwi r0, K, 2 + BASE_SHIFT
  3925. sub AORIG, AORIG, r0
  3926. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)),
            BO = B + (KK << BASE_SHIFT) - 2 * SIZE,
            TEMP = K - KK; CTR = TEMP >> 3. */
  3927. slwi r0 , KK, 2 + BASE_SHIFT
  3928. slwi TEMP, KK, 0 + BASE_SHIFT
  3929. add AO, AORIG, r0
  3930. add BO, B, TEMP
  3931. sub TEMP, K, KK
  3932. addi BO, BO, - 2 * SIZE
  3933. fpmr f1, f0
  3934. fpmr f2, f0
  3935. fpmr f3, f0
  3936. srawi. r0, TEMP, 3
  3937. mtspr CTR, r0
  3938. ble .L104
  3939. #endif
  /* Preload four B operands and eight A operands. */
  3940. LFPDUX B1, BO, INC2
  3941. LFPDUX A1, AO, INC2
  3942. LFPDUX A2, AO, INC2
  3943. LFPDUX A3, AO, INC2
  3944. LFPDUX A4, AO, INC2
  3945. LFPDUX B2, BO, INC2
  3946. LFPDUX A5, AO, INC2
  3947. LFPDUX A6, AO, INC2
  3948. LFPDUX A7, AO, INC2
  3949. LFPDUX A8, AO, INC2
  3950. LFPDUX B3, BO, INC2
  3951. LFPDUX B4, BO, INC2
  /* Decrement CTR; if zero, only the drain at .L103 runs. */
  3952. bdz- .L103
  /* .L102: unrolled main loop of the four-row tile.  Each B register drives
     four multiply-adds: its primary half (fxcpmadd) against two A pairs into
     f0/f1, then its secondary half (fxcsmadd) against two more A pairs into
     f2/f3.  A1..A8 are each reloaded twice per pass, B1..B4 once. */
  3953. .align 4
  3954. .L102:
  3955. fxcpmadd f0, B1, A1, f0
  3956. LFPDUX A1, AO, INC2
  3957. fxcpmadd f1, B1, A2, f1
  3958. LFPDUX A2, AO, INC2
  3959. fxcsmadd f2, B1, A3, f2
  3960. LFPDUX A3, AO, INC2
  3961. fxcsmadd f3, B1, A4, f3
  3962. LFPDUX A4, AO, INC2
  3963. LFPDUX B1, BO, INC2
  3964. fxcpmadd f0, B2, A5, f0
  3965. LFPDUX A5, AO, INC2
  3966. fxcpmadd f1, B2, A6, f1
  3967. LFPDUX A6, AO, INC2
  3968. fxcsmadd f2, B2, A7, f2
  3969. LFPDUX A7, AO, INC2
  3970. fxcsmadd f3, B2, A8, f3
  3971. LFPDUX A8, AO, INC2
  3972. LFPDUX B2, BO, INC2
  3973. fxcpmadd f0, B3, A1, f0
  3974. LFPDUX A1, AO, INC2
  3975. fxcpmadd f1, B3, A2, f1
  3976. LFPDUX A2, AO, INC2
  3977. fxcsmadd f2, B3, A3, f2
  3978. LFPDUX A3, AO, INC2
  3979. fxcsmadd f3, B3, A4, f3
  3980. LFPDUX A4, AO, INC2
  3981. LFPDUX B3, BO, INC2
  3982. fxcpmadd f0, B4, A5, f0
  3983. LFPDUX A5, AO, INC2
  3984. fxcpmadd f1, B4, A6, f1
  3985. LFPDUX A6, AO, INC2
  3986. fxcsmadd f2, B4, A7, f2
  3987. LFPDUX A7, AO, INC2
  3988. fxcsmadd f3, B4, A8, f3
  3989. LFPDUX A8, AO, INC2
  3990. LFPDUX B4, BO, INC2
  3991. bdnz+ .L102
  /* .L103: drain -- the first half still reloads A1..A8 (needed by the B3/B4
     products); the second half only computes.  No B reloads. */
  3992. .align 4
  3993. .L103:
  3994. fxcpmadd f0, B1, A1, f0
  3995. LFPDUX A1, AO, INC2
  3996. fxcpmadd f1, B1, A2, f1
  3997. LFPDUX A2, AO, INC2
  3998. fxcsmadd f2, B1, A3, f2
  3999. LFPDUX A3, AO, INC2
  4000. fxcsmadd f3, B1, A4, f3
  4001. LFPDUX A4, AO, INC2
  4002. fxcpmadd f0, B2, A5, f0
  4003. LFPDUX A5, AO, INC2
  4004. fxcpmadd f1, B2, A6, f1
  4005. LFPDUX A6, AO, INC2
  4006. fxcsmadd f2, B2, A7, f2
  4007. LFPDUX A7, AO, INC2
  4008. fxcsmadd f3, B2, A8, f3
  4009. LFPDUX A8, AO, INC2
  4010. fxcpmadd f0, B3, A1, f0
  4011. fxcpmadd f1, B3, A2, f1
  4012. fxcsmadd f2, B3, A3, f2
  4013. fxcsmadd f3, B3, A4, f3
  4014. fxcpmadd f0, B4, A5, f0
  4015. fxcpmadd f1, B4, A6, f1
  4016. fxcsmadd f2, B4, A7, f2
  4017. fxcsmadd f3, B4, A8, f3
  /* .L104: remainder of the four-row tile: the low three bits of the trip
     count (KK for LT/RN, TEMP otherwise), one step per pass.  The andi. result
     is never negative, so "ble" acts as "branch if zero". */
  4018. .align 4
  4019. .L104:
  4020. #if defined(LT) || defined(RN)
  4021. andi. r0, KK, 7
  4022. mtspr CTR, r0
  4023. ble+ .L108
  4024. #else
  4025. andi. r0, TEMP, 7
  4026. mtspr CTR, r0
  4027. ble+ .L108
  4028. #endif
  /* Two A pairs (update form) and one B scalar (non-update form, so BO is
     advanced by INC explicitly). */
  4029. LFPDUX A1, AO, INC2
  4030. LFDX B1, BO, INC2
  4031. LFPDUX A2, AO, INC2
  4032. add BO, BO, INC
  4033. bdz- .L107
  /* .L106: two multiply-adds per step (into f0 and f1), next operands loaded
     behind them. */
  4034. .align 4
  4035. .L106:
  4036. fxcpmadd f0, B1, A1, f0
  4037. LFPDUX A1, AO, INC2
  4038. fxcpmadd f1, B1, A2, f1
  4039. LFDX B1, BO, INC2
  4040. LFPDUX A2, AO, INC2
  4041. add BO, BO, INC
  4042. bdnz+ .L106
  /* .L107: final multiply-adds for the operands already loaded. */
  4043. .align 4
  4044. .L107:
  4045. fxcpmadd f0, B1, A1, f0
  4046. fxcpmadd f1, B1, A2, f1
  /* .L108: finish the four-row tile: fold f2/f3 into f0/f1, subtract from the
     stored right-hand side, apply the 4x4 triangular solve (LN/LT) or a single
     scale (RN/RT), store, and update pointers. */
  4047. .align 4
  4048. .L108:
  4049. fpadd f0, f0, f2
  4050. fpadd f1, f1, f3
  4051. #if defined(LN) || defined(RT)
  /* LN/RT: reposition on element (KK - 4) for LN or (KK - 1) for RT;
     AO is scaled by 2 + BASE_SHIFT, BO by BASE_SHIFT. */
  4052. #ifdef LN
  4053. subi r0, KK, 4
  4054. #else
  4055. subi r0, KK, 1
  4056. #endif
  4057. slwi TEMP, r0, 2 + BASE_SHIFT
  4058. slwi r0, r0, 0 + BASE_SHIFT
  4059. add AO, AORIG, TEMP
  4060. add BO, B, r0
  4061. addi BO, BO, - 2 * SIZE
  4062. #endif
  /* Load the two stored pairs, rewind the pointer by 4 * SIZE, and form
     f0/f1 = stored - accumulated. */
  4063. #if defined(LN) || defined(LT)
  4064. LFPDUX f16, BO, INC2
  4065. LFPDUX f17, BO, INC2
  4066. subi BO, BO, 4 * SIZE
  4067. fpsub f0, f16, f0
  4068. fpsub f1, f17, f1
  4069. #else
  4070. LFPDUX f16, AO, INC2
  4071. LFPDUX f17, AO, INC2
  4072. subi AO, AO, 4 * SIZE
  4073. fpsub f0, f16, f0
  4074. fpsub f1, f17, f1
  4075. #endif
  4076. #ifdef LN
  /* LN: solve from the last unknown upward.  The unknowns are held as
     f0, f4, f1, f5 (f4/f5 take the secondary halves of f0/f1 via fsmtp).
     Each stage scales one unknown by the entry at offset 15, 10, 5, 0 and
     eliminates it from the earlier ones (fnmsub d, a, c, b = b - a*c). */
  4077. fsmtp f4, f0
  4078. fsmtp f5, f1
  4079. LFD A1, (2 + 15) * SIZE(AO)
  4080. LFD A2, (2 + 14) * SIZE(AO)
  4081. LFD A3, (2 + 13) * SIZE(AO)
  4082. LFD A4, (2 + 12) * SIZE(AO)
  4083. fmul f5, A1, f5
  4084. fnmsub f1, A2, f5, f1
  4085. fnmsub f4, A3, f5, f4
  4086. fnmsub f0, A4, f5, f0
  4087. LFD A1, (2 + 10) * SIZE(AO)
  4088. LFD A2, (2 + 9) * SIZE(AO)
  4089. LFD A3, (2 + 8) * SIZE(AO)
  4090. fmul f1, A1, f1
  4091. fnmsub f4, A2, f1, f4
  4092. fnmsub f0, A3, f1, f0
  4093. LFD A1, (2 + 5) * SIZE(AO)
  4094. LFD A2, (2 + 4) * SIZE(AO)
  4095. fmul f4, A1, f4
  4096. fnmsub f0, A2, f4, f0
  4097. LFD A1, (2 + 0) * SIZE(AO)
  4098. fmul f0, A1, f0
  /* Re-pack: f4/f5 go back into the secondary halves of f0/f1. */
  4099. fsmfp f0, f4
  4100. fsmfp f1, f5
  4101. #endif
  4102. #ifdef LT
  /* LT: solve from the first unknown downward; scaling entries at offsets
     0, 5, 10, 15, each followed by elimination from the later unknowns. */
  4103. fsmtp f4, f0
  4104. fsmtp f5, f1
  4105. LFD A1, (2 + 0) * SIZE(AO)
  4106. LFD A2, (2 + 1) * SIZE(AO)
  4107. LFD A3, (2 + 2) * SIZE(AO)
  4108. LFD A4, (2 + 3) * SIZE(AO)
  4109. fmul f0, A1, f0
  4110. fnmsub f4, A2, f0, f4
  4111. fnmsub f1, A3, f0, f1
  4112. fnmsub f5, A4, f0, f5
  4113. LFD A1, (2 + 5) * SIZE(AO)
  4114. LFD A2, (2 + 6) * SIZE(AO)
  4115. LFD A3, (2 + 7) * SIZE(AO)
  4116. fmul f4, A1, f4
  4117. fnmsub f1, A2, f4, f1
  4118. fnmsub f5, A3, f4, f5
  4119. LFD A1, (2 + 10) * SIZE(AO)
  4120. LFD A2, (2 + 11) * SIZE(AO)
  4121. fmul f1, A1, f1
  4122. fnmsub f5, A2, f1, f5
  4123. LFD A1, (2 + 15) * SIZE(AO)
  4124. fmul f5, A1, f5
  4125. fsmfp f0, f4
  4126. fsmfp f1, f5
  4127. #endif
  /* RN/RT: scale both result pairs by one coefficient loaded from BO. */
  4128. #ifdef RN
  4129. LFPDX A1, BO, INC2
  4130. fxpmul f0, A1, f0
  4131. fxpmul f1, A1, f1
  4132. #endif
  4133. #ifdef RT
  4134. LFPDX A1, BO, INC2
  4135. fxpmul f0, A1, f0
  4136. fxpmul f1, A1, f1
  4137. #endif
  4138. #ifdef LN
  /* LN: step CO1 back by four elements before the stores ... */
  4139. subi CO1, CO1, 4 * SIZE
  4140. #endif
  /* Write both pairs back to the packed buffer (then rewind it by 4 * SIZE)
     and store the four results to C through CO1. */
  4141. #if defined(LN) || defined(LT)
  4142. STFPDUX f0, BO, INC2
  4143. STFPDUX f1, BO, INC2
  4144. subi BO, BO, 4 * SIZE
  4145. STFDUX f0, CO1, INC
  4146. STFSDUX f0, CO1, INC
  4147. STFDUX f1, CO1, INC
  4148. STFSDUX f1, CO1, INC
  4149. #else
  4150. STFPDUX f0, AO, INC2
  4151. STFPDUX f1, AO, INC2
  4152. subi AO, AO, 4 * SIZE
  4153. STFDUX f0, CO1, INC
  4154. STFSDUX f0, CO1, INC
  4155. STFDUX f1, CO1, INC
  4156. STFSDUX f1, CO1, INC
  4157. #endif
  4158. #ifdef LN
  /* ... and again after them */
  4159. subi CO1, CO1, 4 * SIZE
  4160. #endif
  4161. #ifdef RT
  /* RT: AORIG += K << (2 + BASE_SHIFT) */
  4162. slwi r0, K, 2 + BASE_SHIFT
  4163. add AORIG, AORIG, r0
  4164. #endif
  4165. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) << (2 + BASE_SHIFT); BO += (K - KK) << BASE_SHIFT */
  4166. sub TEMP, K, KK
  4167. slwi r0, TEMP, 2 + BASE_SHIFT
  4168. slwi TEMP, TEMP, 0 + BASE_SHIFT
  4169. add AO, AO, r0
  4170. add BO, BO, TEMP
  4171. #endif
  4172. #ifdef LT
  4173. addi KK, KK, 4
  4174. #endif
  4175. #ifdef LN
  4176. subi KK, KK, 4
  4177. #endif
  /* Reload f0 from SP + FZERO for the next tile. */
  4178. li r0, FZERO
  4179. lfpsx f0, SP, r0
  /* .L120: eight-row tiles.  I = M >> 3 is the number of tiles; skip to .L129
     when it is <= 0.  .L91 is the top of the per-tile loop (closed by the
     "bgt+ .L91" further down). */
  4180. .align 4
  4181. .L120:
  4182. srawi. I, M, 3
  4183. ble .L129
  4184. .align 4
  4185. .L91:
  4186. #if defined(LT) || defined(RN)
  /* LT/RN: BO = B - 2 * SIZE; f1..f3 copied from f0; CTR = KK >> 2 (this tile
     unrolls by 4 rather than 8); skip the loop if that is <= 0. */
  4187. fpmr f1, f0
  4188. addi BO, B, - 2 * SIZE
  4189. fpmr f2, f0
  4190. fpmr f3, f0
  4191. srawi. r0, KK, 2
  4192. mtspr CTR, r0
  4193. ble .L94
  4194. #else
  4195. #ifdef LN
  /* LN: AORIG -= K << (3 + BASE_SHIFT) */
  4196. slwi r0, K, 3 + BASE_SHIFT
  4197. sub AORIG, AORIG, r0
  4198. #endif
  /* LN/RT: AO = AORIG + (KK << (3 + BASE_SHIFT)),
            BO = B + (KK << BASE_SHIFT) - 2 * SIZE,
            TEMP = K - KK; CTR = TEMP >> 2. */
  4199. slwi r0 , KK, 3 + BASE_SHIFT
  4200. slwi TEMP, KK, 0 + BASE_SHIFT
  4201. add AO, AORIG, r0
  4202. add BO, B, TEMP
  4203. sub TEMP, K, KK
  4204. fpmr f1, f0
  4205. addi BO, BO, - 2 * SIZE
  4206. fpmr f2, f0
  4207. fpmr f3, f0
  4208. srawi. r0, TEMP, 2
  4209. mtspr CTR, r0
  4210. ble .L94
  4211. #endif
  /* Preload two B operands and eight A operands. */
  4212. LFPDUX B1, BO, INC2
  4213. LFPDUX A1, AO, INC2
  4214. LFPDUX A2, AO, INC2
  4215. LFPDUX A3, AO, INC2
  4216. LFPDUX A4, AO, INC2
  4217. LFPDUX B2, BO, INC2
  4218. LFPDUX A5, AO, INC2
  4219. LFPDUX A6, AO, INC2
  4220. LFPDUX A7, AO, INC2
  4221. LFPDUX A8, AO, INC2
  /* Decrement CTR; if zero, only the drain at .L93 runs. */
  4222. bdz- .L93
  /* .L92: unrolled main loop of the eight-row tile.  Each B register drives
     eight multiply-adds: its primary half (fxcpmadd) against A1..A4 and its
     secondary half (fxcsmadd) against A5..A8, all into f0..f3.  A1..A8 are
     each reloaded twice per pass, B1 and B2 once. */
  4223. .align 4
  4224. .L92:
  4225. fxcpmadd f0, B1, A1, f0
  4226. LFPDUX A1, AO, INC2
  4227. fxcpmadd f1, B1, A2, f1
  4228. LFPDUX A2, AO, INC2
  4229. fxcpmadd f2, B1, A3, f2
  4230. LFPDUX A3, AO, INC2
  4231. fxcpmadd f3, B1, A4, f3
  4232. LFPDUX A4, AO, INC2
  4233. fxcsmadd f0, B1, A5, f0
  4234. LFPDUX A5, AO, INC2
  4235. fxcsmadd f1, B1, A6, f1
  4236. LFPDUX A6, AO, INC2
  4237. fxcsmadd f2, B1, A7, f2
  4238. LFPDUX A7, AO, INC2
  4239. fxcsmadd f3, B1, A8, f3
  4240. LFPDUX A8, AO, INC2
  4241. LFPDUX B1, BO, INC2
  4242. fxcpmadd f0, B2, A1, f0
  4243. LFPDUX A1, AO, INC2
  4244. fxcpmadd f1, B2, A2, f1
  4245. LFPDUX A2, AO, INC2
  4246. fxcpmadd f2, B2, A3, f2
  4247. LFPDUX A3, AO, INC2
  4248. fxcpmadd f3, B2, A4, f3
  4249. LFPDUX A4, AO, INC2
  4250. fxcsmadd f0, B2, A5, f0
  4251. LFPDUX A5, AO, INC2
  4252. fxcsmadd f1, B2, A6, f1
  4253. LFPDUX A6, AO, INC2
  4254. fxcsmadd f2, B2, A7, f2
  4255. LFPDUX A7, AO, INC2
  4256. fxcsmadd f3, B2, A8, f3
  4257. LFPDUX A8, AO, INC2
  4258. LFPDUX B2, BO, INC2
  4259. bdnz+ .L92
  /* .L93: drain -- the B1 half still reloads A1..A8 (needed by the B2
     products); the B2 half only computes.  No B reloads. */
  4260. .align 4
  4261. .L93:
  4262. fxcpmadd f0, B1, A1, f0
  4263. LFPDUX A1, AO, INC2
  4264. fxcpmadd f1, B1, A2, f1
  4265. LFPDUX A2, AO, INC2
  4266. fxcpmadd f2, B1, A3, f2
  4267. LFPDUX A3, AO, INC2
  4268. fxcpmadd f3, B1, A4, f3
  4269. LFPDUX A4, AO, INC2
  4270. fxcsmadd f0, B1, A5, f0
  4271. LFPDUX A5, AO, INC2
  4272. fxcsmadd f1, B1, A6, f1
  4273. LFPDUX A6, AO, INC2
  4274. fxcsmadd f2, B1, A7, f2
  4275. LFPDUX A7, AO, INC2
  4276. fxcsmadd f3, B1, A8, f3
  4277. LFPDUX A8, AO, INC2
  4278. fxcpmadd f0, B2, A1, f0
  4279. fxcpmadd f1, B2, A2, f1
  4280. fxcpmadd f2, B2, A3, f2
  4281. fxcpmadd f3, B2, A4, f3
  4282. fxcsmadd f0, B2, A5, f0
  4283. fxcsmadd f1, B2, A6, f1
  4284. fxcsmadd f2, B2, A7, f2
  4285. fxcsmadd f3, B2, A8, f3
  /* .L94: remainder of the eight-row tile: the low two bits of the trip count
     (KK for LT/RN, TEMP otherwise), one step per pass.  The andi. result is
     never negative, so "ble" acts as "branch if zero". */
  4286. .align 4
  4287. .L94:
  4288. #if defined(LT) || defined(RN)
  4289. andi. r0, KK, 3
  4290. mtspr CTR, r0
  4291. ble+ .L98
  4292. #else
  4293. andi. r0, TEMP, 3
  4294. mtspr CTR, r0
  4295. ble+ .L98
  4296. #endif
  /* One B scalar (non-update form, so BO is advanced by INC explicitly) and
     four A pairs (update form). */
  4297. LFDX B1, BO, INC2
  4298. LFPDUX A1, AO, INC2
  4299. LFPDUX A2, AO, INC2
  4300. LFPDUX A3, AO, INC2
  4301. LFPDUX A4, AO, INC2
  4302. add BO, BO, INC
  4303. bdz- .L97
  /* .L96: four multiply-adds per step (into f0..f3), next operands loaded
     behind them. */
  4304. .align 4
  4305. .L96:
  4306. fxcpmadd f0, B1, A1, f0
  4307. LFPDUX A1, AO, INC2
  4308. fxcpmadd f1, B1, A2, f1
  4309. LFPDUX A2, AO, INC2
  4310. fxcpmadd f2, B1, A3, f2
  4311. LFPDUX A3, AO, INC2
  4312. fxcpmadd f3, B1, A4, f3
  4313. LFDX B1, BO, INC2
  4314. LFPDUX A4, AO, INC2
  4315. add BO, BO, INC
  4316. bdnz+ .L96
  /* .L97: final multiply-adds for the operands already loaded. */
  4317. .align 4
  4318. .L97:
  4319. fxcpmadd f0, B1, A1, f0
  4320. fxcpmadd f1, B1, A2, f1
  4321. fxcpmadd f2, B1, A3, f2
  4322. fxcpmadd f3, B1, A4, f3
  /* .L98: finish the eight-row tile: subtract the accumulators f0..f3 from the
     stored right-hand side, apply the 8x8 triangular solve (LN/LT) or a single
     scale (RN/RT), store the results, update pointers, and loop back to .L91
     while tiles remain. */
  4323. .align 4
  4324. .L98:
  4325. #if defined(LN) || defined(RT)
  /* LN/RT: reposition on element (KK - 8) for LN or (KK - 1) for RT;
     AO is scaled by 3 + BASE_SHIFT, BO by BASE_SHIFT. */
  4326. #ifdef LN
  4327. subi r0, KK, 8
  4328. #else
  4329. subi r0, KK, 1
  4330. #endif
  4331. slwi TEMP, r0, 3 + BASE_SHIFT
  4332. slwi r0, r0, 0 + BASE_SHIFT
  4333. add AO, AORIG, TEMP
  4334. add BO, B, r0
  4335. addi BO, BO, - 2 * SIZE
  4336. #endif
  /* Load the four stored pairs, rewind the pointer by 8 * SIZE, and form
     f0..f3 = stored - accumulated. */
  4337. #if defined(LN) || defined(LT)
  4338. LFPDUX f16, BO, INC2
  4339. LFPDUX f17, BO, INC2
  4340. LFPDUX f18, BO, INC2
  4341. LFPDUX f19, BO, INC2
  4342. subi BO, BO, 8 * SIZE
  4343. fpsub f0, f16, f0
  4344. fpsub f1, f17, f1
  4345. fpsub f2, f18, f2
  4346. fpsub f3, f19, f3
  4347. #else
  4348. LFPDUX f16, AO, INC2
  4349. LFPDUX f17, AO, INC2
  4350. LFPDUX f18, AO, INC2
  4351. LFPDUX f19, AO, INC2
  4352. subi AO, AO, 8 * SIZE
  4353. fpsub f0, f16, f0
  4354. fpsub f1, f17, f1
  4355. fpsub f2, f18, f2
  4356. fpsub f3, f19, f3
  4357. #endif
  4358. #ifdef LN
  /* LN: solve from the last unknown upward.  The eight unknowns are held, in
     order, as f0, f4, f1, f5, f2, f6, f3, f7 (f4..f7 take the secondary halves
     of f0..f3 via fsmtp).  Each stage scales one unknown by the entry at
     offset 63, 54, 45, 36, 27, 18, 9, 0 and eliminates it from all earlier
     unknowns (fnmsub d, a, c, b = b - a*c). */
  4359. fsmtp f4, f0
  4360. fsmtp f5, f1
  4361. fsmtp f6, f2
  4362. fsmtp f7, f3
  /* stage 8: unknown f7, entries 63..56 */
  4363. LFD A1, (2 + 63) * SIZE(AO)
  4364. LFD A2, (2 + 62) * SIZE(AO)
  4365. LFD A3, (2 + 61) * SIZE(AO)
  4366. LFD A4, (2 + 60) * SIZE(AO)
  4367. LFD A5, (2 + 59) * SIZE(AO)
  4368. LFD A6, (2 + 58) * SIZE(AO)
  4369. LFD A7, (2 + 57) * SIZE(AO)
  4370. LFD A8, (2 + 56) * SIZE(AO)
  4371. fmul f7, A1, f7
  4372. fnmsub f3, A2, f7, f3
  4373. fnmsub f6, A3, f7, f6
  4374. fnmsub f2, A4, f7, f2
  4375. fnmsub f5, A5, f7, f5
  4376. fnmsub f1, A6, f7, f1
  4377. fnmsub f4, A7, f7, f4
  4378. fnmsub f0, A8, f7, f0
  /* stage 7: unknown f3, entries 54..48 */
  4379. LFD A1, (2 + 54) * SIZE(AO)
  4380. LFD A2, (2 + 53) * SIZE(AO)
  4381. LFD A3, (2 + 52) * SIZE(AO)
  4382. LFD A4, (2 + 51) * SIZE(AO)
  4383. LFD A5, (2 + 50) * SIZE(AO)
  4384. LFD A6, (2 + 49) * SIZE(AO)
  4385. LFD A7, (2 + 48) * SIZE(AO)
  4386. fmul f3, A1, f3
  4387. fnmsub f6, A2, f3, f6
  4388. fnmsub f2, A3, f3, f2
  4389. fnmsub f5, A4, f3, f5
  4390. fnmsub f1, A5, f3, f1
  4391. fnmsub f4, A6, f3, f4
  4392. fnmsub f0, A7, f3, f0
  /* stage 6: unknown f6, entries 45..40 */
  4393. LFD A1, (2 + 45) * SIZE(AO)
  4394. LFD A2, (2 + 44) * SIZE(AO)
  4395. LFD A3, (2 + 43) * SIZE(AO)
  4396. LFD A4, (2 + 42) * SIZE(AO)
  4397. LFD A5, (2 + 41) * SIZE(AO)
  4398. LFD A6, (2 + 40) * SIZE(AO)
  4399. fmul f6, A1, f6
  4400. fnmsub f2, A2, f6, f2
  4401. fnmsub f5, A3, f6, f5
  4402. fnmsub f1, A4, f6, f1
  4403. fnmsub f4, A5, f6, f4
  4404. fnmsub f0, A6, f6, f0
  /* stage 5: unknown f2, entries 36..32 */
  4405. LFD A1, (2 + 36) * SIZE(AO)
  4406. LFD A2, (2 + 35) * SIZE(AO)
  4407. LFD A3, (2 + 34) * SIZE(AO)
  4408. LFD A4, (2 + 33) * SIZE(AO)
  4409. LFD A5, (2 + 32) * SIZE(AO)
  4410. fmul f2, A1, f2
  4411. fnmsub f5, A2, f2, f5
  4412. fnmsub f1, A3, f2, f1
  4413. fnmsub f4, A4, f2, f4
  4414. fnmsub f0, A5, f2, f0
  /* stage 4: unknown f5, entries 27..24 */
  4415. LFD A1, (2 + 27) * SIZE(AO)
  4416. LFD A2, (2 + 26) * SIZE(AO)
  4417. LFD A3, (2 + 25) * SIZE(AO)
  4418. LFD A4, (2 + 24) * SIZE(AO)
  4419. fmul f5, A1, f5
  4420. fnmsub f1, A2, f5, f1
  4421. fnmsub f4, A3, f5, f4
  4422. fnmsub f0, A4, f5, f0
  /* stage 3: unknown f1, entries 18..16 */
  4423. LFD A1, (2 + 18) * SIZE(AO)
  4424. LFD A2, (2 + 17) * SIZE(AO)
  4425. LFD A3, (2 + 16) * SIZE(AO)
  4426. fmul f1, A1, f1
  4427. fnmsub f4, A2, f1, f4
  4428. fnmsub f0, A3, f1, f0
  /* stage 2: unknown f4, entries 9..8 */
  4429. LFD A1, (2 + 9) * SIZE(AO)
  4430. LFD A2, (2 + 8) * SIZE(AO)
  4431. fmul f4, A1, f4
  4432. fnmsub f0, A2, f4, f0
  /* stage 1: unknown f0, entry 0 */
  4433. LFD A1, (2 + 0) * SIZE(AO)
  4434. fmul f0, A1, f0
  /* Re-pack: f4..f7 go back into the secondary halves of f0..f3. */
  4435. fsmfp f0, f4
  4436. fsmfp f1, f5
  4437. fsmfp f2, f6
  4438. fsmfp f3, f7
  4439. #endif
  4440. #ifdef LT
  /* LT: solve from the first unknown downward, same register order
     (f0, f4, f1, f5, f2, f6, f3, f7).  Scaling entries sit at offsets
     0, 9, 18, 27, 36, 45, 54, 63; each stage eliminates from the later
     unknowns. */
  4441. fsmtp f4, f0
  4442. fsmtp f5, f1
  4443. fsmtp f6, f2
  4444. fsmtp f7, f3
  /* stage 1: unknown f0, entries 0..7 */
  4445. LFD A1, (2 + 0) * SIZE(AO)
  4446. LFD A2, (2 + 1) * SIZE(AO)
  4447. LFD A3, (2 + 2) * SIZE(AO)
  4448. LFD A4, (2 + 3) * SIZE(AO)
  4449. LFD A5, (2 + 4) * SIZE(AO)
  4450. LFD A6, (2 + 5) * SIZE(AO)
  4451. LFD A7, (2 + 6) * SIZE(AO)
  4452. LFD A8, (2 + 7) * SIZE(AO)
  4453. fmul f0, A1, f0
  4454. fnmsub f4, A2, f0, f4
  4455. fnmsub f1, A3, f0, f1
  4456. fnmsub f5, A4, f0, f5
  4457. fnmsub f2, A5, f0, f2
  4458. fnmsub f6, A6, f0, f6
  4459. fnmsub f3, A7, f0, f3
  4460. fnmsub f7, A8, f0, f7
  /* stage 2: unknown f4, entries 9..15 */
  4461. LFD A1, (2 + 9) * SIZE(AO)
  4462. LFD A2, (2 + 10) * SIZE(AO)
  4463. LFD A3, (2 + 11) * SIZE(AO)
  4464. LFD A4, (2 + 12) * SIZE(AO)
  4465. LFD A5, (2 + 13) * SIZE(AO)
  4466. LFD A6, (2 + 14) * SIZE(AO)
  4467. LFD A7, (2 + 15) * SIZE(AO)
  4468. fmul f4, A1, f4
  4469. fnmsub f1, A2, f4, f1
  4470. fnmsub f5, A3, f4, f5
  4471. fnmsub f2, A4, f4, f2
  4472. fnmsub f6, A5, f4, f6
  4473. fnmsub f3, A6, f4, f3
  4474. fnmsub f7, A7, f4, f7
  /* stage 3: unknown f1, entries 18..23 */
  4475. LFD A1, (2 + 18) * SIZE(AO)
  4476. LFD A2, (2 + 19) * SIZE(AO)
  4477. LFD A3, (2 + 20) * SIZE(AO)
  4478. LFD A4, (2 + 21) * SIZE(AO)
  4479. LFD A5, (2 + 22) * SIZE(AO)
  4480. LFD A6, (2 + 23) * SIZE(AO)
  4481. fmul f1, A1, f1
  4482. fnmsub f5, A2, f1, f5
  4483. fnmsub f2, A3, f1, f2
  4484. fnmsub f6, A4, f1, f6
  4485. fnmsub f3, A5, f1, f3
  4486. fnmsub f7, A6, f1, f7
  /* stage 4: unknown f5, entries 27..31 */
  4487. LFD A1, (2 + 27) * SIZE(AO)
  4488. LFD A2, (2 + 28) * SIZE(AO)
  4489. LFD A3, (2 + 29) * SIZE(AO)
  4490. LFD A4, (2 + 30) * SIZE(AO)
  4491. LFD A5, (2 + 31) * SIZE(AO)
  4492. fmul f5, A1, f5
  4493. fnmsub f2, A2, f5, f2
  4494. fnmsub f6, A3, f5, f6
  4495. fnmsub f3, A4, f5, f3
  4496. fnmsub f7, A5, f5, f7
  /* stage 5: unknown f2, entries 36..39 */
  4497. LFD A1, (2 + 36) * SIZE(AO)
  4498. LFD A2, (2 + 37) * SIZE(AO)
  4499. LFD A3, (2 + 38) * SIZE(AO)
  4500. LFD A4, (2 + 39) * SIZE(AO)
  4501. fmul f2, A1, f2
  4502. fnmsub f6, A2, f2, f6
  4503. fnmsub f3, A3, f2, f3
  4504. fnmsub f7, A4, f2, f7
  /* stage 6: unknown f6, entries 45..47 */
  4505. LFD A1, (2 + 45) * SIZE(AO)
  4506. LFD A2, (2 + 46) * SIZE(AO)
  4507. LFD A3, (2 + 47) * SIZE(AO)
  4508. fmul f6, A1, f6
  4509. fnmsub f3, A2, f6, f3
  4510. fnmsub f7, A3, f6, f7
  /* stage 7: unknown f3, entries 54..55 */
  4511. LFD A1, (2 + 54) * SIZE(AO)
  4512. LFD A2, (2 + 55) * SIZE(AO)
  4513. fmul f3, A1, f3
  4514. fnmsub f7, A2, f3, f7
  /* stage 8: unknown f7, entry 63 */
  4515. LFD A1, (2 + 63) * SIZE(AO)
  4516. fmul f7, A1, f7
  4517. fsmfp f0, f4
  4518. fsmfp f1, f5
  4519. fsmfp f2, f6
  4520. fsmfp f3, f7
  4521. #endif
  /* RN/RT: scale all four result pairs by one coefficient loaded from BO. */
  4522. #ifdef RN
  4523. LFPDX A1, BO, INC2
  4524. fxpmul f0, A1, f0
  4525. fxpmul f1, A1, f1
  4526. fxpmul f2, A1, f2
  4527. fxpmul f3, A1, f3
  4528. #endif
  4529. #ifdef RT
  4530. LFPDX A1, BO, INC2
  4531. fxpmul f0, A1, f0
  4532. fxpmul f1, A1, f1
  4533. fxpmul f2, A1, f2
  4534. fxpmul f3, A1, f3
  4535. #endif
  4536. #ifdef LN
  /* LN: step CO1 back by eight elements before the stores ... */
  4537. subi CO1, CO1, 8 * SIZE
  4538. #endif
  /* Write the four pairs back to the packed buffer (then rewind it by
     8 * SIZE) and store the eight results to C through CO1. */
  4539. #if defined(LN) || defined(LT)
  4540. STFPDUX f0, BO, INC2
  4541. STFPDUX f1, BO, INC2
  4542. STFPDUX f2, BO, INC2
  4543. STFPDUX f3, BO, INC2
  4544. subi BO, BO, 8 * SIZE
  4545. STFDUX f0, CO1, INC
  4546. STFSDUX f0, CO1, INC
  4547. STFDUX f1, CO1, INC
  4548. STFSDUX f1, CO1, INC
  4549. STFDUX f2, CO1, INC
  4550. STFSDUX f2, CO1, INC
  4551. STFDUX f3, CO1, INC
  4552. STFSDUX f3, CO1, INC
  4553. #else
  4554. STFPDUX f0, AO, INC2
  4555. STFPDUX f1, AO, INC2
  4556. STFPDUX f2, AO, INC2
  4557. STFPDUX f3, AO, INC2
  4558. subi AO, AO, 8 * SIZE
  4559. STFDUX f0, CO1, INC
  4560. STFSDUX f0, CO1, INC
  4561. STFDUX f1, CO1, INC
  4562. STFSDUX f1, CO1, INC
  4563. STFDUX f2, CO1, INC
  4564. STFSDUX f2, CO1, INC
  4565. STFDUX f3, CO1, INC
  4566. STFSDUX f3, CO1, INC
  4567. #endif
  4568. #ifdef LN
  /* ... and again after them */
  4569. subi CO1, CO1, 8 * SIZE
  4570. #endif
  4571. #ifdef RT
  /* RT: AORIG += K << (3 + BASE_SHIFT) */
  4572. slwi r0, K, 3 + BASE_SHIFT
  4573. add AORIG, AORIG, r0
  4574. #endif
  4575. #if defined(LT) || defined(RN)
  /* LT/RN: AO += (K - KK) << (3 + BASE_SHIFT); BO += (K - KK) << BASE_SHIFT */
  4576. sub TEMP, K, KK
  4577. slwi r0, TEMP, 3 + BASE_SHIFT
  4578. slwi TEMP, TEMP, 0 + BASE_SHIFT
  4579. add AO, AO, r0
  4580. add BO, BO, TEMP
  4581. #endif
  4582. #ifdef LT
  4583. addi KK, KK, 8
  4584. #endif
  4585. #ifdef LN
  4586. subi KK, KK, 8
  4587. #endif
  /* Count down the tile counter I (setting CR0), reload f0 from SP + FZERO
     (li and lfpsx do not alter CR0), and loop while I > 0. */
  4588. addic. I, I, -1
  4589. li r0, FZERO
  4590. lfpsx f0, SP, r0
  4591. bgt+ .L91
  /* .L129: bookkeeping after the single-column pass: advance B and adjust KK
     by 1, then fall through to the exit at .L999. */
  4592. .align 4
  4593. .L129:
  4594. #ifdef LN
  /* LN: B += K << BASE_SHIFT */
  4595. slwi r0, K, 0 + BASE_SHIFT
  4596. add B, B, r0
  4597. #endif
  4598. #if defined(LT) || defined(RN)
  /* LT/RN: B continues from the running pointer BO, plus 2 * SIZE */
  4599. addi B, BO, 2 * SIZE
  4600. #endif
  4601. #ifdef RN
  4602. addi KK, KK, 1
  4603. #endif
  4604. #ifdef RT
  4605. subi KK, KK, 1
  4606. #endif
  /* .L999: common exit.  Restores r14..r31 and f31..f14 from the stack area at
     SP, releases it, and returns.  The matching saves are in the prologue,
     which lies outside this chunk -- the layout described here is read off the
     restore sequence only. */
  4607. .align 4
  4608. .L999:
  /* Bias SP by 12 so that each lwzu (load word from SP + 4, then SP += 4)
     walks the 18 saved integer registers in ascending order. */
  4609. addi SP, SP, 12
  4610. lwzu r14, 4(SP)
  4611. lwzu r15, 4(SP)
  4612. lwzu r16, 4(SP)
  4613. lwzu r17, 4(SP)
  4614. lwzu r18, 4(SP)
  4615. lwzu r19, 4(SP)
  4616. lwzu r20, 4(SP)
  4617. lwzu r21, 4(SP)
  4618. lwzu r22, 4(SP)
  4619. lwzu r23, 4(SP)
  4620. lwzu r24, 4(SP)
  4621. lwzu r25, 4(SP)
  4622. lwzu r26, 4(SP)
  4623. lwzu r27, 4(SP)
  4624. lwzu r28, 4(SP)
  4625. lwzu r29, 4(SP)
  4626. lwzu r30, 4(SP)
  4627. lwzu r31, 4(SP)
  /* Undo the 12-byte bias, then restore the floating-point register pairs
     with update-form paired loads stepping SP by 16 bytes each (r0 = 16). */
  4628. subi SP, SP, 12
  4629. li r0, 16
  4630. lfpdux f31, SP, r0
  4631. lfpdux f30, SP, r0
  4632. lfpdux f29, SP, r0
  4633. lfpdux f28, SP, r0
  4634. lfpdux f27, SP, r0
  4635. lfpdux f26, SP, r0
  4636. lfpdux f25, SP, r0
  4637. lfpdux f24, SP, r0
  4638. lfpdux f23, SP, r0
  4639. lfpdux f22, SP, r0
  4640. lfpdux f21, SP, r0
  4641. lfpdux f20, SP, r0
  4642. lfpdux f19, SP, r0
  4643. lfpdux f18, SP, r0
  4644. lfpdux f17, SP, r0
  4645. lfpdux f16, SP, r0
  4646. lfpdux f15, SP, r0
  4647. lfpdux f14, SP, r0
  /* Step past the last restored slot and return to the caller. */
  4648. addi SP, SP, 16
  4649. blr
  /* EPILOGUE is a macro defined outside this chunk (presumably it closes the
     function symbol opened by the prologue). */
  4650. EPILOGUE
  4650. EPILOGUE
  4651. #endif