You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_hummer_RT.S 98 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the stack scratch offsets and the integer registers
     used by this kernel. */
  /* Byte offsets from SP into the scratch area the prologue pushes last:
     it stores two zero words and then f1 (stfdu f1, -8(SP)), so f1 ends up
     at offset 0 and the zero words at offset 8. FZERO is used with lfpsx
     to load f0 before the accumulators are cleared by copying f0. ALPHA is
     not referenced in the visible part of the kernel. */
  40. #define ALPHA 0
  41. #define FZERO 8
  /* Problem dimensions. Each is tested at entry and the kernel branches to
     .L999 when it is <= 0. M is then walked as M >> 3 blocks followed by
     the M & 4, M & 2 and M & 1 remainders; N is tested as N & 1 and then
     N & 2 in the visible part; K bounds the inner product loops. */
  42. #define M r3
  43. #define N r4
  44. #define K r5
  /* Pointer/stride arguments. These aliases are only defined here for
     linux / FreeBSD builds. */
  45. #if defined(linux) || defined(__FreeBSD__)
  /* A and B are the bases from which AO/AORIG and BO are derived (with a
     -2 * SIZE bias). C is biased by -1 * SIZE at entry. LDC is scaled by
     BASE_SHIFT at entry, so it is a byte stride afterwards. OFFSET seeds KK
     (-OFFSET for RN, N - OFFSET for RT, M + OFFSET for LN, OFFSET for LT). */
  46. #define A r6
  47. #define B r7
  48. #define C r8
  49. #define LDC r9
  50. #define OFFSET r10
  51. #endif
  /* TEMP: scratch, e.g. K - KK or a scaled byte offset. */
  52. #define TEMP r11
  /* AORIG: A - 2 * SIZE base kept for the LN / RT variants, from which AO
     is recomputed per block. */
  53. #define AORIG r12
  /* KK: running offset count; it sets the inner loop length (KK or K - KK)
     and is advanced or decremented after each block. */
  54. #define KK r14
  /* Constant strides in bytes, loaded once at entry:
     INCM1 = -1 * SIZE, INCM4 = -4 * SIZE, INCM2 = -2 * SIZE,
     INC2  =  2 * SIZE, INC   =  1 * SIZE, INC4  =  4 * SIZE. */
  55. #define INCM1 r15
  56. #define INCM4 r16
  57. #define INCM2 r17
  58. #define INC2 r19
  59. #define INC r20
  60. #define INC4 r21
  /* I: block counter / remainder test over M.  J: result of the N & 1 and
     N & 2 tests. */
  61. #define I r22
  62. #define J r23
  /* AO / BO: running pointers used as the base of the indexed loads from
     the A and B panels (and of the stores that write the solved values
     back). AO2 / BO2 are not referenced in the visible part. */
  63. #define AO r24
  64. #define BO r25
  65. #define AO2 r26
  66. #define BO2 r27
  /* CO1 / CO2: output pointers into C (CO1 = C, CO2 = C + LDC). CO3 / CO4
     are not referenced in the visible part. */
  67. #define CO1 r28
  68. #define CO2 r29
  69. #define CO3 r30
  70. #define CO4 r31
  71. #ifndef NEEDPARAM
  /* Floating-point register aliases for the multiply operands.
     In the visible loops A1-A8 are loaded through AO and B1-B4 through BO.
     A9, A10, B5, B6 and AP are not referenced in the visible part.
     NOTE: the solve sections also name f16-f19 directly; those are the same
     registers as A1-A4, so the two uses must not overlap. */
  72. #define A1 f16
  73. #define A2 f17
  74. #define A3 f18
  75. #define A4 f19
  76. #define A5 f20
  77. #define A6 f21
  78. #define A7 f22
  79. #define A8 f23
  80. #define A9 f24
  81. #define A10 f25
  /* Operands loaded through BO. */
  82. #define B1 f26
  83. #define B2 f27
  84. #define B3 f28
  85. #define B4 f29
  86. #define B5 f30
  87. #define B6 f31
  /* AP shares f31 with B6. */
  88. #define AP B6
  89. PROLOGUE
  90. PROFCODE
  91. li r0, -16
  92. stfpdux f14, SP, r0
  93. stfpdux f15, SP, r0
  94. stfpdux f16, SP, r0
  95. stfpdux f17, SP, r0
  96. stfpdux f18, SP, r0
  97. stfpdux f19, SP, r0
  98. stfpdux f20, SP, r0
  99. stfpdux f21, SP, r0
  100. stfpdux f22, SP, r0
  101. stfpdux f23, SP, r0
  102. stfpdux f24, SP, r0
  103. stfpdux f25, SP, r0
  104. stfpdux f26, SP, r0
  105. stfpdux f27, SP, r0
  106. stfpdux f28, SP, r0
  107. stfpdux f29, SP, r0
  108. stfpdux f30, SP, r0
  109. stfpdux f31, SP, r0
  110. stwu r31, -4(SP)
  111. stwu r30, -4(SP)
  112. stwu r29, -4(SP)
  113. stwu r28, -4(SP)
  114. stwu r27, -4(SP)
  115. stwu r26, -4(SP)
  116. stwu r25, -4(SP)
  117. stwu r24, -4(SP)
  118. stwu r23, -4(SP)
  119. stwu r22, -4(SP)
  120. stwu r21, -4(SP)
  121. stwu r20, -4(SP)
  122. stwu r19, -4(SP)
  123. stwu r18, -4(SP)
  124. stwu r17, -4(SP)
  125. stwu r16, -4(SP)
  126. stwu r15, -4(SP)
  127. stwu r14, -4(SP) # dummy
  128. li r0, 0
  129. stwu r0, -4(SP)
  130. stwu r0, -4(SP)
  131. stfdu f1, -8(SP)
  132. slwi LDC, LDC, BASE_SHIFT
  133. cmpwi cr0, M, 0
  134. ble .L999
  135. cmpwi cr0, N, 0
  136. ble .L999
  137. cmpwi cr0, K, 0
  138. ble .L999
  139. li INC, 1 * SIZE
  140. li INC2, 2 * SIZE
  141. li INC4, 4 * SIZE
  142. li INCM1, -1 * SIZE
  143. li INCM2, -2 * SIZE
  144. li INCM4, -4 * SIZE
  145. addi C, C, - 1 * SIZE
  146. #ifdef LN
  147. mullw r0, M, K
  148. slwi r0, r0, BASE_SHIFT
  149. add A, A, r0
  150. slwi r0, M, BASE_SHIFT
  151. add C, C, r0
  152. #endif
  153. #ifdef RN
  154. neg KK, OFFSET
  155. #endif
  156. #ifdef RT
  157. mullw r0, N, K
  158. slwi r0, r0, BASE_SHIFT
  159. add B, B, r0
  160. mullw r0, N, LDC
  161. add C, C, r0
  162. sub KK, N, OFFSET
  163. #endif
  164. andi. J, N, 1
  165. beq .L50
  166. #ifdef RT
  167. slwi r0, K, 0 + BASE_SHIFT
  168. sub B, B, r0
  169. sub C, C, LDC
  170. #endif
  171. mr CO1, C
  172. #ifdef LN
  173. add KK, M, OFFSET
  174. #endif
  175. #ifdef LT
  176. mr KK, OFFSET
  177. #endif
  178. #if defined(LN) || defined(RT)
  179. addi AORIG, A, -2 * SIZE
  180. #else
  181. addi AO, A, -2 * SIZE
  182. #endif
  183. #ifndef RT
  184. add C, CO1, LDC
  185. #endif
  186. li r0, FZERO
  187. lfpsx f0, SP, r0
  188. srawi. I, M, 3
  189. ble .L100
  190. .align 4
  191. .L91:
  192. #if defined(LT) || defined(RN)
  193. fpmr f1, f0
  194. addi BO, B, - 2 * SIZE
  195. fpmr f2, f0
  196. fpmr f3, f0
  197. srawi. r0, KK, 2
  198. mtspr CTR, r0
  199. ble .L94
  200. #else
  201. #ifdef LN
  202. slwi r0, K, 3 + BASE_SHIFT
  203. sub AORIG, AORIG, r0
  204. #endif
  205. slwi r0 , KK, 3 + BASE_SHIFT
  206. slwi TEMP, KK, 0 + BASE_SHIFT
  207. add AO, AORIG, r0
  208. add BO, B, TEMP
  209. sub TEMP, K, KK
  210. fpmr f1, f0
  211. addi BO, BO, - 2 * SIZE
  212. fpmr f2, f0
  213. fpmr f3, f0
  214. srawi. r0, TEMP, 2
  215. mtspr CTR, r0
  216. ble .L94
  217. #endif
  218. LFPDUX B1, BO, INC2
  219. LFPDUX A1, AO, INC2
  220. LFPDUX A2, AO, INC2
  221. LFPDUX A3, AO, INC2
  222. LFPDUX A4, AO, INC2
  223. LFPDUX B2, BO, INC2
  224. LFPDUX A5, AO, INC2
  225. LFPDUX A6, AO, INC2
  226. LFPDUX A7, AO, INC2
  227. LFPDUX A8, AO, INC2
  228. bdz- .L93
  229. .align 4
  230. .L92:
  231. fxcpmadd f0, B1, A1, f0
  232. LFPDUX A1, AO, INC2
  233. fxcpmadd f1, B1, A2, f1
  234. LFPDUX A2, AO, INC2
  235. fxcpmadd f2, B1, A3, f2
  236. LFPDUX A3, AO, INC2
  237. fxcpmadd f3, B1, A4, f3
  238. LFPDUX A4, AO, INC2
  239. fxcsmadd f0, B1, A5, f0
  240. LFPDUX A5, AO, INC2
  241. fxcsmadd f1, B1, A6, f1
  242. LFPDUX A6, AO, INC2
  243. fxcsmadd f2, B1, A7, f2
  244. LFPDUX A7, AO, INC2
  245. fxcsmadd f3, B1, A8, f3
  246. LFPDUX A8, AO, INC2
  247. LFPDUX B1, BO, INC2
  248. fxcpmadd f0, B2, A1, f0
  249. LFPDUX A1, AO, INC2
  250. fxcpmadd f1, B2, A2, f1
  251. LFPDUX A2, AO, INC2
  252. fxcpmadd f2, B2, A3, f2
  253. LFPDUX A3, AO, INC2
  254. fxcpmadd f3, B2, A4, f3
  255. LFPDUX A4, AO, INC2
  256. fxcsmadd f0, B2, A5, f0
  257. LFPDUX A5, AO, INC2
  258. fxcsmadd f1, B2, A6, f1
  259. LFPDUX A6, AO, INC2
  260. fxcsmadd f2, B2, A7, f2
  261. LFPDUX A7, AO, INC2
  262. fxcsmadd f3, B2, A8, f3
  263. LFPDUX A8, AO, INC2
  264. LFPDUX B2, BO, INC2
  265. bdnz+ .L92
  266. .align 4
  267. .L93:
  268. fxcpmadd f0, B1, A1, f0
  269. LFPDUX A1, AO, INC2
  270. fxcpmadd f1, B1, A2, f1
  271. LFPDUX A2, AO, INC2
  272. fxcpmadd f2, B1, A3, f2
  273. LFPDUX A3, AO, INC2
  274. fxcpmadd f3, B1, A4, f3
  275. LFPDUX A4, AO, INC2
  276. fxcsmadd f0, B1, A5, f0
  277. LFPDUX A5, AO, INC2
  278. fxcsmadd f1, B1, A6, f1
  279. LFPDUX A6, AO, INC2
  280. fxcsmadd f2, B1, A7, f2
  281. LFPDUX A7, AO, INC2
  282. fxcsmadd f3, B1, A8, f3
  283. LFPDUX A8, AO, INC2
  284. fxcpmadd f0, B2, A1, f0
  285. fxcpmadd f1, B2, A2, f1
  286. fxcpmadd f2, B2, A3, f2
  287. fxcpmadd f3, B2, A4, f3
  288. fxcsmadd f0, B2, A5, f0
  289. fxcsmadd f1, B2, A6, f1
  290. fxcsmadd f2, B2, A7, f2
  291. fxcsmadd f3, B2, A8, f3
  292. .align 4
  293. .L94:
  294. #if defined(LT) || defined(RN)
  295. andi. r0, KK, 3
  296. mtspr CTR, r0
  297. ble+ .L98
  298. #else
  299. andi. r0, TEMP, 3
  300. mtspr CTR, r0
  301. ble+ .L98
  302. #endif
  303. LFDX B1, BO, INC2
  304. LFPDUX A1, AO, INC2
  305. LFPDUX A2, AO, INC2
  306. LFPDUX A3, AO, INC2
  307. LFPDUX A4, AO, INC2
  308. add BO, BO, INC
  309. bdz- .L97
  310. .align 4
  311. .L96:
  312. fxcpmadd f0, B1, A1, f0
  313. LFPDUX A1, AO, INC2
  314. fxcpmadd f1, B1, A2, f1
  315. LFPDUX A2, AO, INC2
  316. fxcpmadd f2, B1, A3, f2
  317. LFPDUX A3, AO, INC2
  318. fxcpmadd f3, B1, A4, f3
  319. LFDX B1, BO, INC2
  320. LFPDUX A4, AO, INC2
  321. add BO, BO, INC
  322. bdnz+ .L96
  323. .align 4
  324. .L97:
  325. fxcpmadd f0, B1, A1, f0
  326. fxcpmadd f1, B1, A2, f1
  327. fxcpmadd f2, B1, A3, f2
  328. fxcpmadd f3, B1, A4, f3
  329. .align 4
  330. .L98:
  331. #if defined(LN) || defined(RT)
  332. #ifdef LN
  333. subi r0, KK, 8
  334. #else
  335. subi r0, KK, 1
  336. #endif
  337. slwi TEMP, r0, 3 + BASE_SHIFT
  338. slwi r0, r0, 0 + BASE_SHIFT
  339. add AO, AORIG, TEMP
  340. add BO, B, r0
  341. addi BO, BO, - 2 * SIZE
  342. #endif
  343. #if defined(LN) || defined(LT)
  344. LFPDUX f16, BO, INC2
  345. LFPDUX f17, BO, INC2
  346. LFPDUX f18, BO, INC2
  347. LFPDUX f19, BO, INC2
  348. subi BO, BO, 8 * SIZE
  349. fpsub f0, f16, f0
  350. fpsub f1, f17, f1
  351. fpsub f2, f18, f2
  352. fpsub f3, f19, f3
  353. #else
  354. LFPDUX f16, AO, INC2
  355. LFPDUX f17, AO, INC2
  356. LFPDUX f18, AO, INC2
  357. LFPDUX f19, AO, INC2
  358. subi AO, AO, 8 * SIZE
  359. fpsub f0, f16, f0
  360. fpsub f1, f17, f1
  361. fpsub f2, f18, f2
  362. fpsub f3, f19, f3
  363. #endif
  364. #ifdef LN
  365. fsmtp f4, f0
  366. fsmtp f5, f1
  367. fsmtp f6, f2
  368. fsmtp f7, f3
  369. LFD A1, (2 + 63) * SIZE(AO)
  370. LFD A2, (2 + 62) * SIZE(AO)
  371. LFD A3, (2 + 61) * SIZE(AO)
  372. LFD A4, (2 + 60) * SIZE(AO)
  373. LFD A5, (2 + 59) * SIZE(AO)
  374. LFD A6, (2 + 58) * SIZE(AO)
  375. LFD A7, (2 + 57) * SIZE(AO)
  376. LFD A8, (2 + 56) * SIZE(AO)
  377. fmul f7, A1, f7
  378. fnmsub f3, A2, f7, f3
  379. fnmsub f6, A3, f7, f6
  380. fnmsub f2, A4, f7, f2
  381. fnmsub f5, A5, f7, f5
  382. fnmsub f1, A6, f7, f1
  383. fnmsub f4, A7, f7, f4
  384. fnmsub f0, A8, f7, f0
  385. LFD A1, (2 + 54) * SIZE(AO)
  386. LFD A2, (2 + 53) * SIZE(AO)
  387. LFD A3, (2 + 52) * SIZE(AO)
  388. LFD A4, (2 + 51) * SIZE(AO)
  389. LFD A5, (2 + 50) * SIZE(AO)
  390. LFD A6, (2 + 49) * SIZE(AO)
  391. LFD A7, (2 + 48) * SIZE(AO)
  392. fmul f3, A1, f3
  393. fnmsub f6, A2, f3, f6
  394. fnmsub f2, A3, f3, f2
  395. fnmsub f5, A4, f3, f5
  396. fnmsub f1, A5, f3, f1
  397. fnmsub f4, A6, f3, f4
  398. fnmsub f0, A7, f3, f0
  399. LFD A1, (2 + 45) * SIZE(AO)
  400. LFD A2, (2 + 44) * SIZE(AO)
  401. LFD A3, (2 + 43) * SIZE(AO)
  402. LFD A4, (2 + 42) * SIZE(AO)
  403. LFD A5, (2 + 41) * SIZE(AO)
  404. LFD A6, (2 + 40) * SIZE(AO)
  405. fmul f6, A1, f6
  406. fnmsub f2, A2, f6, f2
  407. fnmsub f5, A3, f6, f5
  408. fnmsub f1, A4, f6, f1
  409. fnmsub f4, A5, f6, f4
  410. fnmsub f0, A6, f6, f0
  411. LFD A1, (2 + 36) * SIZE(AO)
  412. LFD A2, (2 + 35) * SIZE(AO)
  413. LFD A3, (2 + 34) * SIZE(AO)
  414. LFD A4, (2 + 33) * SIZE(AO)
  415. LFD A5, (2 + 32) * SIZE(AO)
  416. fmul f2, A1, f2
  417. fnmsub f5, A2, f2, f5
  418. fnmsub f1, A3, f2, f1
  419. fnmsub f4, A4, f2, f4
  420. fnmsub f0, A5, f2, f0
  421. LFD A1, (2 + 27) * SIZE(AO)
  422. LFD A2, (2 + 26) * SIZE(AO)
  423. LFD A3, (2 + 25) * SIZE(AO)
  424. LFD A4, (2 + 24) * SIZE(AO)
  425. fmul f5, A1, f5
  426. fnmsub f1, A2, f5, f1
  427. fnmsub f4, A3, f5, f4
  428. fnmsub f0, A4, f5, f0
  429. LFD A1, (2 + 18) * SIZE(AO)
  430. LFD A2, (2 + 17) * SIZE(AO)
  431. LFD A3, (2 + 16) * SIZE(AO)
  432. fmul f1, A1, f1
  433. fnmsub f4, A2, f1, f4
  434. fnmsub f0, A3, f1, f0
  435. LFD A1, (2 + 9) * SIZE(AO)
  436. LFD A2, (2 + 8) * SIZE(AO)
  437. fmul f4, A1, f4
  438. fnmsub f0, A2, f4, f0
  439. LFD A1, (2 + 0) * SIZE(AO)
  440. fmul f0, A1, f0
  441. fsmfp f0, f4
  442. fsmfp f1, f5
  443. fsmfp f2, f6
  444. fsmfp f3, f7
  445. #endif
  446. #ifdef LT
  447. fsmtp f4, f0
  448. fsmtp f5, f1
  449. fsmtp f6, f2
  450. fsmtp f7, f3
  451. LFD A1, (2 + 0) * SIZE(AO)
  452. LFD A2, (2 + 1) * SIZE(AO)
  453. LFD A3, (2 + 2) * SIZE(AO)
  454. LFD A4, (2 + 3) * SIZE(AO)
  455. LFD A5, (2 + 4) * SIZE(AO)
  456. LFD A6, (2 + 5) * SIZE(AO)
  457. LFD A7, (2 + 6) * SIZE(AO)
  458. LFD A8, (2 + 7) * SIZE(AO)
  459. fmul f0, A1, f0
  460. fnmsub f4, A2, f0, f4
  461. fnmsub f1, A3, f0, f1
  462. fnmsub f5, A4, f0, f5
  463. fnmsub f2, A5, f0, f2
  464. fnmsub f6, A6, f0, f6
  465. fnmsub f3, A7, f0, f3
  466. fnmsub f7, A8, f0, f7
  467. LFD A1, (2 + 9) * SIZE(AO)
  468. LFD A2, (2 + 10) * SIZE(AO)
  469. LFD A3, (2 + 11) * SIZE(AO)
  470. LFD A4, (2 + 12) * SIZE(AO)
  471. LFD A5, (2 + 13) * SIZE(AO)
  472. LFD A6, (2 + 14) * SIZE(AO)
  473. LFD A7, (2 + 15) * SIZE(AO)
  474. fmul f4, A1, f4
  475. fnmsub f1, A2, f4, f1
  476. fnmsub f5, A3, f4, f5
  477. fnmsub f2, A4, f4, f2
  478. fnmsub f6, A5, f4, f6
  479. fnmsub f3, A6, f4, f3
  480. fnmsub f7, A7, f4, f7
  481. LFD A1, (2 + 18) * SIZE(AO)
  482. LFD A2, (2 + 19) * SIZE(AO)
  483. LFD A3, (2 + 20) * SIZE(AO)
  484. LFD A4, (2 + 21) * SIZE(AO)
  485. LFD A5, (2 + 22) * SIZE(AO)
  486. LFD A6, (2 + 23) * SIZE(AO)
  487. fmul f1, A1, f1
  488. fnmsub f5, A2, f1, f5
  489. fnmsub f2, A3, f1, f2
  490. fnmsub f6, A4, f1, f6
  491. fnmsub f3, A5, f1, f3
  492. fnmsub f7, A6, f1, f7
  493. LFD A1, (2 + 27) * SIZE(AO)
  494. LFD A2, (2 + 28) * SIZE(AO)
  495. LFD A3, (2 + 29) * SIZE(AO)
  496. LFD A4, (2 + 30) * SIZE(AO)
  497. LFD A5, (2 + 31) * SIZE(AO)
  498. fmul f5, A1, f5
  499. fnmsub f2, A2, f5, f2
  500. fnmsub f6, A3, f5, f6
  501. fnmsub f3, A4, f5, f3
  502. fnmsub f7, A5, f5, f7
  503. LFD A1, (2 + 36) * SIZE(AO)
  504. LFD A2, (2 + 37) * SIZE(AO)
  505. LFD A3, (2 + 38) * SIZE(AO)
  506. LFD A4, (2 + 39) * SIZE(AO)
  507. fmul f2, A1, f2
  508. fnmsub f6, A2, f2, f6
  509. fnmsub f3, A3, f2, f3
  510. fnmsub f7, A4, f2, f7
  511. LFD A1, (2 + 45) * SIZE(AO)
  512. LFD A2, (2 + 46) * SIZE(AO)
  513. LFD A3, (2 + 47) * SIZE(AO)
  514. fmul f6, A1, f6
  515. fnmsub f3, A2, f6, f3
  516. fnmsub f7, A3, f6, f7
  517. LFD A1, (2 + 54) * SIZE(AO)
  518. LFD A2, (2 + 55) * SIZE(AO)
  519. fmul f3, A1, f3
  520. fnmsub f7, A2, f3, f7
  521. LFD A1, (2 + 63) * SIZE(AO)
  522. fmul f7, A1, f7
  523. fsmfp f0, f4
  524. fsmfp f1, f5
  525. fsmfp f2, f6
  526. fsmfp f3, f7
  527. #endif
  528. #ifdef RN
  529. LFPDX A1, BO, INC2
  530. fxpmul f0, A1, f0
  531. fxpmul f1, A1, f1
  532. fxpmul f2, A1, f2
  533. fxpmul f3, A1, f3
  534. #endif
  535. #ifdef RT
  536. LFPDX A1, BO, INC2
  537. fxpmul f0, A1, f0
  538. fxpmul f1, A1, f1
  539. fxpmul f2, A1, f2
  540. fxpmul f3, A1, f3
  541. #endif
  542. #ifdef LN
  543. subi CO1, CO1, 8 * SIZE
  544. #endif
  545. #if defined(LN) || defined(LT)
  546. STFPDUX f0, BO, INC2
  547. STFPDUX f1, BO, INC2
  548. STFPDUX f2, BO, INC2
  549. STFPDUX f3, BO, INC2
  550. subi BO, BO, 8 * SIZE
  551. STFDUX f0, CO1, INC
  552. STFSDUX f0, CO1, INC
  553. STFDUX f1, CO1, INC
  554. STFSDUX f1, CO1, INC
  555. STFDUX f2, CO1, INC
  556. STFSDUX f2, CO1, INC
  557. STFDUX f3, CO1, INC
  558. STFSDUX f3, CO1, INC
  559. #else
  560. STFPDUX f0, AO, INC2
  561. STFPDUX f1, AO, INC2
  562. STFPDUX f2, AO, INC2
  563. STFPDUX f3, AO, INC2
  564. subi AO, AO, 8 * SIZE
  565. STFDUX f0, CO1, INC
  566. STFSDUX f0, CO1, INC
  567. STFDUX f1, CO1, INC
  568. STFSDUX f1, CO1, INC
  569. STFDUX f2, CO1, INC
  570. STFSDUX f2, CO1, INC
  571. STFDUX f3, CO1, INC
  572. STFSDUX f3, CO1, INC
  573. #endif
  574. #ifdef LN
  575. subi CO1, CO1, 8 * SIZE
  576. #endif
  577. #ifdef RT
  578. slwi r0, K, 3 + BASE_SHIFT
  579. add AORIG, AORIG, r0
  580. #endif
  581. #if defined(LT) || defined(RN)
  582. sub TEMP, K, KK
  583. slwi r0, TEMP, 3 + BASE_SHIFT
  584. slwi TEMP, TEMP, 0 + BASE_SHIFT
  585. add AO, AO, r0
  586. add BO, BO, TEMP
  587. #endif
  588. #ifdef LT
  589. addi KK, KK, 8
  590. #endif
  591. #ifdef LN
  592. subi KK, KK, 8
  593. #endif
  594. addic. I, I, -1
  595. li r0, FZERO
  596. lfpsx f0, SP, r0
  597. bgt+ .L91
  598. .align 4
  599. .L100:
  600. andi. I, M, 4
  601. beq .L110
  602. #if defined(LT) || defined(RN)
  603. addi BO, B, - 2 * SIZE
  604. fpmr f1, f0
  605. fpmr f2, f0
  606. fpmr f3, f0
  607. srawi. r0, KK, 3
  608. mtspr CTR, r0
  609. ble .L104
  610. #else
  611. #ifdef LN
  612. slwi r0, K, 2 + BASE_SHIFT
  613. sub AORIG, AORIG, r0
  614. #endif
  615. slwi r0 , KK, 2 + BASE_SHIFT
  616. slwi TEMP, KK, 0 + BASE_SHIFT
  617. add AO, AORIG, r0
  618. add BO, B, TEMP
  619. sub TEMP, K, KK
  620. addi BO, BO, - 2 * SIZE
  621. fpmr f1, f0
  622. fpmr f2, f0
  623. fpmr f3, f0
  624. srawi. r0, TEMP, 3
  625. mtspr CTR, r0
  626. ble .L104
  627. #endif
  628. LFPDUX B1, BO, INC2
  629. LFPDUX A1, AO, INC2
  630. LFPDUX A2, AO, INC2
  631. LFPDUX A3, AO, INC2
  632. LFPDUX A4, AO, INC2
  633. LFPDUX B2, BO, INC2
  634. LFPDUX A5, AO, INC2
  635. LFPDUX A6, AO, INC2
  636. LFPDUX A7, AO, INC2
  637. LFPDUX A8, AO, INC2
  638. LFPDUX B3, BO, INC2
  639. LFPDUX B4, BO, INC2
  640. bdz- .L103
  641. .align 4
  642. .L102:
  643. fxcpmadd f0, B1, A1, f0
  644. LFPDUX A1, AO, INC2
  645. fxcpmadd f1, B1, A2, f1
  646. LFPDUX A2, AO, INC2
  647. fxcsmadd f2, B1, A3, f2
  648. LFPDUX A3, AO, INC2
  649. fxcsmadd f3, B1, A4, f3
  650. LFPDUX A4, AO, INC2
  651. LFPDUX B1, BO, INC2
  652. fxcpmadd f0, B2, A5, f0
  653. LFPDUX A5, AO, INC2
  654. fxcpmadd f1, B2, A6, f1
  655. LFPDUX A6, AO, INC2
  656. fxcsmadd f2, B2, A7, f2
  657. LFPDUX A7, AO, INC2
  658. fxcsmadd f3, B2, A8, f3
  659. LFPDUX A8, AO, INC2
  660. LFPDUX B2, BO, INC2
  661. fxcpmadd f0, B3, A1, f0
  662. LFPDUX A1, AO, INC2
  663. fxcpmadd f1, B3, A2, f1
  664. LFPDUX A2, AO, INC2
  665. fxcsmadd f2, B3, A3, f2
  666. LFPDUX A3, AO, INC2
  667. fxcsmadd f3, B3, A4, f3
  668. LFPDUX A4, AO, INC2
  669. LFPDUX B3, BO, INC2
  670. fxcpmadd f0, B4, A5, f0
  671. LFPDUX A5, AO, INC2
  672. fxcpmadd f1, B4, A6, f1
  673. LFPDUX A6, AO, INC2
  674. fxcsmadd f2, B4, A7, f2
  675. LFPDUX A7, AO, INC2
  676. fxcsmadd f3, B4, A8, f3
  677. LFPDUX A8, AO, INC2
  678. LFPDUX B4, BO, INC2
  679. bdnz+ .L102
  680. .align 4
  681. .L103:
  682. fxcpmadd f0, B1, A1, f0
  683. LFPDUX A1, AO, INC2
  684. fxcpmadd f1, B1, A2, f1
  685. LFPDUX A2, AO, INC2
  686. fxcsmadd f2, B1, A3, f2
  687. LFPDUX A3, AO, INC2
  688. fxcsmadd f3, B1, A4, f3
  689. LFPDUX A4, AO, INC2
  690. fxcpmadd f0, B2, A5, f0
  691. LFPDUX A5, AO, INC2
  692. fxcpmadd f1, B2, A6, f1
  693. LFPDUX A6, AO, INC2
  694. fxcsmadd f2, B2, A7, f2
  695. LFPDUX A7, AO, INC2
  696. fxcsmadd f3, B2, A8, f3
  697. LFPDUX A8, AO, INC2
  698. fxcpmadd f0, B3, A1, f0
  699. fxcpmadd f1, B3, A2, f1
  700. fxcsmadd f2, B3, A3, f2
  701. fxcsmadd f3, B3, A4, f3
  702. fxcpmadd f0, B4, A5, f0
  703. fxcpmadd f1, B4, A6, f1
  704. fxcsmadd f2, B4, A7, f2
  705. fxcsmadd f3, B4, A8, f3
  706. .align 4
  707. .L104:
  708. #if defined(LT) || defined(RN)
  709. andi. r0, KK, 7
  710. mtspr CTR, r0
  711. ble+ .L108
  712. #else
  713. andi. r0, TEMP, 7
  714. mtspr CTR, r0
  715. ble+ .L108
  716. #endif
  717. LFPDUX A1, AO, INC2
  718. LFDX B1, BO, INC2
  719. LFPDUX A2, AO, INC2
  720. add BO, BO, INC
  721. bdz- .L107
  722. .align 4
  723. .L106:
  724. fxcpmadd f0, B1, A1, f0
  725. LFPDUX A1, AO, INC2
  726. fxcpmadd f1, B1, A2, f1
  727. LFDX B1, BO, INC2
  728. LFPDUX A2, AO, INC2
  729. add BO, BO, INC
  730. bdnz+ .L106
  731. .align 4
  732. .L107:
  733. fxcpmadd f0, B1, A1, f0
  734. fxcpmadd f1, B1, A2, f1
  735. .align 4
  736. .L108:
  737. fpadd f0, f0, f2
  738. fpadd f1, f1, f3
  739. #if defined(LN) || defined(RT)
  740. #ifdef LN
  741. subi r0, KK, 4
  742. #else
  743. subi r0, KK, 1
  744. #endif
  745. slwi TEMP, r0, 2 + BASE_SHIFT
  746. slwi r0, r0, 0 + BASE_SHIFT
  747. add AO, AORIG, TEMP
  748. add BO, B, r0
  749. addi BO, BO, - 2 * SIZE
  750. #endif
  751. #if defined(LN) || defined(LT)
  752. LFPDUX f16, BO, INC2
  753. LFPDUX f17, BO, INC2
  754. subi BO, BO, 4 * SIZE
  755. fpsub f0, f16, f0
  756. fpsub f1, f17, f1
  757. #else
  758. LFPDUX f16, AO, INC2
  759. LFPDUX f17, AO, INC2
  760. subi AO, AO, 4 * SIZE
  761. fpsub f0, f16, f0
  762. fpsub f1, f17, f1
  763. #endif
  764. #ifdef LN
  765. fsmtp f4, f0
  766. fsmtp f5, f1
  767. LFD A1, (2 + 15) * SIZE(AO)
  768. LFD A2, (2 + 14) * SIZE(AO)
  769. LFD A3, (2 + 13) * SIZE(AO)
  770. LFD A4, (2 + 12) * SIZE(AO)
  771. fmul f5, A1, f5
  772. fnmsub f1, A2, f5, f1
  773. fnmsub f4, A3, f5, f4
  774. fnmsub f0, A4, f5, f0
  775. LFD A1, (2 + 10) * SIZE(AO)
  776. LFD A2, (2 + 9) * SIZE(AO)
  777. LFD A3, (2 + 8) * SIZE(AO)
  778. fmul f1, A1, f1
  779. fnmsub f4, A2, f1, f4
  780. fnmsub f0, A3, f1, f0
  781. LFD A1, (2 + 5) * SIZE(AO)
  782. LFD A2, (2 + 4) * SIZE(AO)
  783. fmul f4, A1, f4
  784. fnmsub f0, A2, f4, f0
  785. LFD A1, (2 + 0) * SIZE(AO)
  786. fmul f0, A1, f0
  787. fsmfp f0, f4
  788. fsmfp f1, f5
  789. #endif
  790. #ifdef LT
  791. fsmtp f4, f0
  792. fsmtp f5, f1
  793. LFD A1, (2 + 0) * SIZE(AO)
  794. LFD A2, (2 + 1) * SIZE(AO)
  795. LFD A3, (2 + 2) * SIZE(AO)
  796. LFD A4, (2 + 3) * SIZE(AO)
  797. fmul f0, A1, f0
  798. fnmsub f4, A2, f0, f4
  799. fnmsub f1, A3, f0, f1
  800. fnmsub f5, A4, f0, f5
  801. LFD A1, (2 + 5) * SIZE(AO)
  802. LFD A2, (2 + 6) * SIZE(AO)
  803. LFD A3, (2 + 7) * SIZE(AO)
  804. fmul f4, A1, f4
  805. fnmsub f1, A2, f4, f1
  806. fnmsub f5, A3, f4, f5
  807. LFD A1, (2 + 10) * SIZE(AO)
  808. LFD A2, (2 + 11) * SIZE(AO)
  809. fmul f1, A1, f1
  810. fnmsub f5, A2, f1, f5
  811. LFD A1, (2 + 15) * SIZE(AO)
  812. fmul f5, A1, f5
  813. fsmfp f0, f4
  814. fsmfp f1, f5
  815. #endif
  816. #ifdef RN
  817. LFPDX A1, BO, INC2
  818. fxpmul f0, A1, f0
  819. fxpmul f1, A1, f1
  820. #endif
  821. #ifdef RT
  822. LFPDX A1, BO, INC2
  823. fxpmul f0, A1, f0
  824. fxpmul f1, A1, f1
  825. #endif
  826. #ifdef LN
  827. subi CO1, CO1, 4 * SIZE
  828. #endif
  829. #if defined(LN) || defined(LT)
  830. STFPDUX f0, BO, INC2
  831. STFPDUX f1, BO, INC2
  832. subi BO, BO, 4 * SIZE
  833. STFDUX f0, CO1, INC
  834. STFSDUX f0, CO1, INC
  835. STFDUX f1, CO1, INC
  836. STFSDUX f1, CO1, INC
  837. #else
  838. STFPDUX f0, AO, INC2
  839. STFPDUX f1, AO, INC2
  840. subi AO, AO, 4 * SIZE
  841. STFDUX f0, CO1, INC
  842. STFSDUX f0, CO1, INC
  843. STFDUX f1, CO1, INC
  844. STFSDUX f1, CO1, INC
  845. #endif
  846. #ifdef LN
  847. subi CO1, CO1, 4 * SIZE
  848. #endif
  849. #ifdef RT
  850. slwi r0, K, 2 + BASE_SHIFT
  851. add AORIG, AORIG, r0
  852. #endif
  853. #if defined(LT) || defined(RN)
  854. sub TEMP, K, KK
  855. slwi r0, TEMP, 2 + BASE_SHIFT
  856. slwi TEMP, TEMP, 0 + BASE_SHIFT
  857. add AO, AO, r0
  858. add BO, BO, TEMP
  859. #endif
  860. #ifdef LT
  861. addi KK, KK, 4
  862. #endif
  863. #ifdef LN
  864. subi KK, KK, 4
  865. #endif
  866. li r0, FZERO
  867. lfpsx f0, SP, r0
  868. .align 4
  869. .L110:
  870. andi. I, M, 2
  871. beq .L120
  872. #if defined(LT) || defined(RN)
  873. addi BO, B, - 2 * SIZE
  874. fpmr f1, f0
  875. fpmr f2, f0
  876. fpmr f3, f0
  877. srawi. r0, KK, 3
  878. mtspr CTR, r0
  879. ble .L114
  880. #else
  881. #ifdef LN
  882. slwi r0, K, 1 + BASE_SHIFT
  883. sub AORIG, AORIG, r0
  884. #endif
  885. slwi r0 , KK, 1 + BASE_SHIFT
  886. slwi TEMP, KK, 0 + BASE_SHIFT
  887. add AO, AORIG, r0
  888. add BO, B, TEMP
  889. sub TEMP, K, KK
  890. addi BO, BO, - 2 * SIZE
  891. fpmr f1, f0
  892. fpmr f2, f0
  893. fpmr f3, f0
  894. srawi. r0, TEMP, 3
  895. mtspr CTR, r0
  896. ble .L114
  897. #endif
  898. LFPDUX A1, AO, INC2
  899. LFPDUX A2, AO, INC2
  900. LFPDUX B1, BO, INC2
  901. LFPDUX A3, AO, INC2
  902. LFPDUX A4, AO, INC2
  903. LFPDUX B2, BO, INC2
  904. LFPDUX A5, AO, INC2
  905. LFPDUX A6, AO, INC2
  906. LFPDUX B3, BO, INC2
  907. LFPDUX A7, AO, INC2
  908. LFPDUX A8, AO, INC2
  909. LFPDUX B4, BO, INC2
  910. bdz- .L113
  911. .align 4
  912. .L112:
  913. fxcpmadd f0, B1, A1, f0
  914. LFPDUX A1, AO, INC2
  915. fxcsmadd f1, B1, A2, f1
  916. LFPDUX A2, AO, INC2
  917. LFPDUX B1, BO, INC2
  918. fxcpmadd f2, B2, A3, f2
  919. LFPDUX A3, AO, INC2
  920. fxcsmadd f3, B2, A4, f3
  921. LFPDUX A4, AO, INC2
  922. LFPDUX B2, BO, INC2
  923. fxcpmadd f0, B3, A5, f0
  924. LFPDUX A5, AO, INC2
  925. fxcsmadd f1, B3, A6, f1
  926. LFPDUX A6, AO, INC2
  927. LFPDUX B3, BO, INC2
  928. fxcpmadd f2, B4, A7, f2
  929. LFPDUX A7, AO, INC2
  930. fxcsmadd f3, B4, A8, f3
  931. LFPDUX A8, AO, INC2
  932. LFPDUX B4, BO, INC2
  933. bdnz+ .L112
  934. .align 4
  935. .L113:
  936. fxcpmadd f0, B1, A1, f0
  937. fxcsmadd f1, B1, A2, f1
  938. fxcpmadd f2, B2, A3, f2
  939. fxcsmadd f3, B2, A4, f3
  940. fxcpmadd f0, B3, A5, f0
  941. fxcsmadd f1, B3, A6, f1
  942. fxcpmadd f2, B4, A7, f2
  943. fxcsmadd f3, B4, A8, f3
  944. .align 4
  945. .L114:
  946. #if defined(LT) || defined(RN)
  947. andi. r0, KK, 7
  948. mtspr CTR, r0
  949. ble+ .L118
  950. #else
  951. andi. r0, TEMP, 7
  952. mtspr CTR, r0
  953. ble+ .L118
  954. #endif
  955. LFPDUX A1, AO, INC2
  956. LFDX B1, BO, INC2
  957. add BO, BO, INC
  958. bdz- .L117
  959. .align 4
  960. .L116:
  961. fxcpmadd f0, B1, A1, f0
  962. LFPDUX A1, AO, INC2
  963. LFDX B1, BO, INC2
  964. add BO, BO, INC
  965. bdnz+ .L116
  966. .align 4
  967. .L117:
  968. fxcpmadd f0, B1, A1, f0
  969. .align 4
  970. .L118:
  971. fpadd f0, f0, f1
  972. fpadd f2, f3, f2
  973. fpadd f0, f0, f2
  974. #if defined(LN) || defined(RT)
  975. #ifdef LN
  976. subi r0, KK, 2
  977. #else
  978. subi r0, KK, 1
  979. #endif
  980. slwi TEMP, r0, 1 + BASE_SHIFT
  981. slwi r0, r0, 0 + BASE_SHIFT
  982. add AO, AORIG, TEMP
  983. add BO, B, r0
  984. addi BO, BO, - 2 * SIZE
  985. #endif
  986. #if defined(LN) || defined(LT)
  987. LFPDX f16, BO, INC2
  988. fpsub f0, f16, f0
  989. #else
  990. LFPDX f16, AO, INC2
  991. fpsub f0, f16, f0
  992. #endif
  993. #ifdef LN
  994. fsmtp f4, f0
  995. LFD A1, (2 + 3) * SIZE(AO)
  996. LFD A2, (2 + 2) * SIZE(AO)
  997. LFD A3, (2 + 0) * SIZE(AO)
  998. fmul f4, A1, f4
  999. fnmsub f0, A2, f4, f0
  1000. fmul f0, A3, f0
  1001. fsmfp f0, f4
  1002. #endif
  1003. #ifdef LT
  1004. fsmtp f4, f0
  1005. LFD A1, (2 + 0) * SIZE(AO)
  1006. LFD A2, (2 + 1) * SIZE(AO)
  1007. LFD A3, (2 + 3) * SIZE(AO)
  1008. fmul f0, A1, f0
  1009. fnmsub f4, A2, f0, f4
  1010. fmul f4, A3, f4
  1011. fsmfp f0, f4
  1012. #endif
  1013. #ifdef RN
  1014. LFPDX A1, BO, INC2
  1015. fxpmul f0, A1, f0
  1016. #endif
  1017. #ifdef RT
  1018. LFPDX A1, BO, INC2
  1019. fxpmul f0, A1, f0
  1020. #endif
  1021. #ifdef LN
  1022. subi CO1, CO1, 2 * SIZE
  1023. #endif
  1024. #if defined(LN) || defined(LT)
  1025. STFPDX f0, BO, INC2
  1026. STFDUX f0, CO1, INC
  1027. STFSDUX f0, CO1, INC
  1028. #else
  1029. STFPDX f0, AO, INC2
  1030. STFDUX f0, CO1, INC
  1031. STFSDUX f0, CO1, INC
  1032. #endif
  1033. #ifdef LN
  1034. subi CO1, CO1, 2 * SIZE
  1035. #endif
  1036. #ifdef RT
  1037. slwi r0, K, 1 + BASE_SHIFT
  1038. add AORIG, AORIG, r0
  1039. #endif
  1040. #if defined(LT) || defined(RN)
  1041. sub TEMP, K, KK
  1042. slwi r0, TEMP, 1 + BASE_SHIFT
  1043. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1044. add AO, AO, r0
  1045. add BO, BO, TEMP
  1046. #endif
  1047. #ifdef LT
  1048. addi KK, KK, 2
  1049. #endif
  1050. #ifdef LN
  1051. subi KK, KK, 2
  1052. #endif
  1053. li r0, FZERO
  1054. lfpsx f0, SP, r0
  1055. .align 4
  1056. .L120:
  1057. andi. I, M, 1
  1058. beq .L129
  1059. #if defined(LT) || defined(RN)
  1060. addi BO, B, - 2 * SIZE
  1061. fpmr f1, f0
  1062. fpmr f2, f0
  1063. fpmr f3, f0
  1064. srawi. r0, KK, 3
  1065. mtspr CTR, r0
  1066. ble .L124
  1067. #else
  1068. #ifdef LN
  1069. slwi r0, K, 0 + BASE_SHIFT
  1070. sub AORIG, AORIG, r0
  1071. #endif
  1072. slwi r0 , KK, 0 + BASE_SHIFT
  1073. slwi TEMP, KK, 0 + BASE_SHIFT
  1074. add AO, AORIG, r0
  1075. add BO, B, TEMP
  1076. sub TEMP, K, KK
  1077. addi BO, BO, - 2 * SIZE
  1078. fpmr f1, f0
  1079. fpmr f2, f0
  1080. fpmr f3, f0
  1081. srawi. r0, TEMP, 3
  1082. mtspr CTR, r0
  1083. ble .L124
  1084. #endif
  1085. LFPDUX A1, AO, INC2
  1086. LFPDUX B1, BO, INC2
  1087. LFPDUX A2, AO, INC2
  1088. LFPDUX B2, BO, INC2
  1089. LFPDUX A3, AO, INC2
  1090. LFPDUX B3, BO, INC2
  1091. LFPDUX A4, AO, INC2
  1092. LFPDUX B4, BO, INC2
  1093. bdz- .L123
  1094. .align 4
  1095. .L122:
  1096. fpmadd f0, A1, B1, f0
  1097. LFPDUX A1, AO, INC2
  1098. LFPDUX B1, BO, INC2
  1099. fpmadd f1, A2, B2, f1
  1100. LFPDUX A2, AO, INC2
  1101. LFPDUX B2, BO, INC2
  1102. fpmadd f2, A3, B3, f2
  1103. LFPDUX A3, AO, INC2
  1104. LFPDUX B3, BO, INC2
  1105. fpmadd f3, A4, B4, f3
  1106. LFPDUX A4, AO, INC2
  1107. LFPDUX B4, BO, INC2
  1108. bdnz+ .L122
  1109. .align 4
  1110. .L123:
  1111. fpmadd f0, A1, B1, f0
  1112. fpmadd f1, A2, B2, f1
  1113. fpmadd f2, A3, B3, f2
  1114. fpmadd f3, A4, B4, f3
  1115. .align 4
  1116. .L124:
  1117. #if defined(LT) || defined(RN)
  1118. andi. r0, KK, 7
  1119. mtspr CTR, r0
  1120. ble+ .L128
  1121. #else
  1122. andi. r0, TEMP, 7
  1123. mtspr CTR, r0
  1124. ble+ .L128
  1125. #endif
  1126. LFDX A1, AO, INC2
  1127. LFDX B1, BO, INC2
  1128. add AO, AO, INC
  1129. add BO, BO, INC
  1130. bdz- .L127
  1131. .align 4
  1132. .L126:
  1133. fmadd f0, A1, B1, f0
  1134. LFDX A1, AO, INC2
  1135. LFDX B1, BO, INC2
  1136. add AO, AO, INC
  1137. add BO, BO, INC
  1138. bdnz+ .L126
  1139. .align 4
  1140. .L127:
  1141. fmadd f0, A1, B1, f0
  1142. .align 4
  1143. .L128:
  1144. fpadd f0, f0, f1
  1145. fpadd f2, f2, f3
  1146. fpadd f0, f0, f2
  1147. fsmtp f1, f0
  1148. fadd f0, f0, f1
  1149. #if defined(LN) || defined(RT)
  1150. #ifdef LN
  1151. subi r0, KK, 1
  1152. #else
  1153. subi r0, KK, 1
  1154. #endif
  1155. slwi TEMP, r0, 0 + BASE_SHIFT
  1156. slwi r0, r0, 0 + BASE_SHIFT
  1157. add AO, AORIG, TEMP
  1158. add BO, B, r0
  1159. addi BO, BO, - 2 * SIZE
  1160. #endif
  1161. #if defined(LN) || defined(LT)
  1162. LFDX f16, BO, INC2
  1163. fsub f0, f16, f0
  1164. #else
  1165. LFDX f16, AO, INC2
  1166. fsub f0, f16, f0
  1167. #endif
  1168. #ifdef LN
  1169. LFD A1, (2 + 0) * SIZE(AO)
  1170. fmul f0, A1, f0
  1171. #endif
  1172. #ifdef LT
  1173. LFD A1, (2 + 0) * SIZE(AO)
  1174. fmul f0, A1, f0
  1175. #endif
  1176. #ifdef RN
  1177. LFDX A1, BO, INC2
  1178. fmul f0, A1, f0
  1179. #endif
  1180. #ifdef RT
  1181. LFDX A1, BO, INC2
  1182. fmul f0, A1, f0
  1183. #endif
  1184. #ifdef LN
  1185. subi CO1, CO1, 1 * SIZE
  1186. #endif
  1187. #if defined(LN) || defined(LT)
  1188. STFDX f0, BO, INC2
  1189. STFDUX f0, CO1, INC
  1190. #else
  1191. STFDX f0, AO, INC2
  1192. STFDUX f0, CO1, INC
  1193. #endif
  1194. #ifdef LN
  1195. subi CO1, CO1, 1 * SIZE
  1196. #endif
  1197. #ifdef RT
  1198. slwi r0, K, 0 + BASE_SHIFT
  1199. add AORIG, AORIG, r0
  1200. #endif
  1201. #if defined(LT) || defined(RN)
  1202. sub TEMP, K, KK
  1203. slwi r0, TEMP, 0 + BASE_SHIFT
  1204. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1205. add AO, AO, r0
  1206. add BO, BO, TEMP
  1207. #endif
  1208. #ifdef LT
  1209. addi KK, KK, 1
  1210. #endif
  1211. #ifdef LN
  1212. subi KK, KK, 1
  1213. #endif
  /* .L129: end of this column pass -- update B and KK for the next pass. The shift of 0 + BASE_SHIFT means B moves by K elements, i.e. one column's worth. */
  1214. .align 4
  1215. .L129:
  /* LN: B += K elements. */
  1216. #ifdef LN
  1217. slwi r0, K, 0 + BASE_SHIFT
  1218. add B, B, r0
  1219. #endif
  /* LT/RN: B continues where BO stopped, with the -2 * SIZE bias removed. */
  1220. #if defined(LT) || defined(RN)
  1221. addi B, BO, 2 * SIZE
  1222. #endif
  /* Column-direction KK bookkeeping: RN counts up, RT counts down. */
  1223. #ifdef RN
  1224. addi KK, KK, 1
  1225. #endif
  1226. #ifdef RT
  1227. subi KK, KK, 1
  1228. #endif
  /* .L50: two-column pass, run only when bit 1 of N is set; otherwise skip to .L90. */
  1229. .align 4
  1230. .L50:
  1231. andi. J, N, 2
  1232. beq .L90
  /* RT: step B back by 2*K elements and C back by 2*LDC before working on this column pair. */
  1233. #ifdef RT
  1234. slwi r0, K, 1 + BASE_SHIFT
  1235. sub B, B, r0
  1236. slwi r0, LDC, 1
  1237. sub C, C, r0
  1238. #endif
  /* CO1 and CO2 address the two C columns of this pass. */
  1239. mr CO1, C
  1240. add CO2, C, LDC
  /* KK start value: M + OFFSET for LN, OFFSET for LT. */
  1241. #ifdef LN
  1242. add KK, M, OFFSET
  1243. #endif
  1244. #ifdef LT
  1245. mr KK, OFFSET
  1246. #endif
  /* Base pointer into A, biased by -2 * SIZE. NOTE(review): the bias presumably matches update-form loads that add INC2 before accessing memory -- confirm against the LFPDUX macro. */
  1247. #if defined(LN) || defined(RT)
  1248. addi AORIG, A, -2 * SIZE
  1249. #else
  1250. addi AO, A, -2 * SIZE
  1251. #endif
  /* Every variant except RT advances C past both columns now. */
  1252. #ifndef RT
  1253. add C, CO2, LDC
  1254. #endif
  /* Load the value at SP + FZERO into f0 (the other accumulators are initialised from f0 below), then I = M >> 3 eight-row tiles; if there are none, go to .L60. */
  1255. li r0, FZERO
  1256. lfpsx f0, SP, r0
  1257. srawi. I, M, 3
  1258. ble .L60
  /* .L51: set up one 8-row by 2-column tile. Copies f0 into f1..f7, positions AO/BO, puts the K-loop trip count (unrolled by 4) into CTR, and preloads the first operands. */
  1259. .align 4
  1260. .L51:
  /* LT/RN: BO starts at B and the loop covers KK steps; AO carries over from the previous tile. */
  1261. #if defined(LT) || defined(RN)
  1262. fpmr f4, f0
  1263. addi BO, B, - 2 * SIZE
  1264. fpmr f1, f0
  1265. fpmr f5, f0
  1266. fpmr f2, f0
  1267. fpmr f6, f0
  1268. srawi. r0, KK, 2
  1269. fpmr f3, f0
  1270. mtspr CTR, r0
  1271. fpmr f7, f0
  1272. ble .L54
  1273. #else
  /* LN/RT: AO = AORIG + 8*KK elements, BO = B + 2*KK elements, and the loop covers TEMP = K - KK steps. LN first steps AORIG back by 8*K elements. */
  1274. #ifdef LN
  1275. slwi r0, K, 3 + BASE_SHIFT
  1276. sub AORIG, AORIG, r0
  1277. #endif
  1278. slwi r0 , KK, 3 + BASE_SHIFT
  1279. slwi TEMP, KK, 1 + BASE_SHIFT
  1280. add AO, AORIG, r0
  1281. add BO, B, TEMP
  1282. sub TEMP, K, KK
  1283. fpmr f4, f0
  1284. addi BO, BO, - 2 * SIZE
  1285. fpmr f1, f0
  1286. fpmr f5, f0
  1287. fpmr f2, f0
  1288. fpmr f6, f0
  1289. srawi. r0, TEMP, 2
  1290. fpmr f3, f0
  1291. mtspr CTR, r0
  1292. fpmr f7, f0
  1293. ble .L54
  1294. #endif
  /* Preload three B pairs and eight A pairs (operands for the first two K steps plus B for the third). */
  1295. LFPDUX B1, BO, INC2
  1296. LFPDUX A1, AO, INC2
  1297. LFPDUX A2, AO, INC2
  1298. LFPDUX B2, BO, INC2
  1299. LFPDUX A3, AO, INC2
  1300. LFPDUX A4, AO, INC2
  1301. LFPDUX B3, BO, INC2
  1302. LFPDUX A5, AO, INC2
  1303. LFPDUX A6, AO, INC2
  1304. LFPDUX A7, AO, INC2
  1305. LFPDUX A8, AO, INC2
  /* With a trip count of exactly one, go straight to the drain block. */
  1306. bdz- .L53
  /* .L52: 8x2 main loop, four K steps per iteration. Each step multiplies one B pair against four A pairs: the primary B element accumulates into f0..f3 and the secondary into f4..f7. Loads for later steps are interleaved with the multiply-adds (16 A loads and 4 B loads per iteration). NOTE(review): the nops are presumably issue-slot padding. */
  1307. .align 4
  1308. .L52:
  /* K step 1: B1 against A1..A4. */
  1309. fxcpmadd f0, B1, A1, f0
  1310. LFPDUX B4, BO, INC2
  1311. fxcsmadd f4, B1, A1, f4
  1312. LFPDUX A1, AO, INC2
  1313. fxcpmadd f1, B1, A2, f1
  1314. nop
  1315. fxcsmadd f5, B1, A2, f5
  1316. LFPDUX A2, AO, INC2
  1317. fxcpmadd f2, B1, A3, f2
  1318. nop
  1319. fxcsmadd f6, B1, A3, f6
  1320. LFPDUX A3, AO, INC2
  1321. fxcpmadd f3, B1, A4, f3
  1322. nop
  1323. fxcsmadd f7, B1, A4, f7
  1324. LFPDUX A4, AO, INC2
  /* K step 2: B2 against A5..A8. */
  1325. fxcpmadd f0, B2, A5, f0
  1326. LFPDUX B1, BO, INC2
  1327. fxcsmadd f4, B2, A5, f4
  1328. LFPDUX A5, AO, INC2
  1329. fxcpmadd f1, B2, A6, f1
  1330. nop
  1331. fxcsmadd f5, B2, A6, f5
  1332. LFPDUX A6, AO, INC2
  1333. fxcpmadd f2, B2, A7, f2
  1334. nop
  1335. fxcsmadd f6, B2, A7, f6
  1336. LFPDUX A7, AO, INC2
  1337. fxcpmadd f3, B2, A8, f3
  1338. nop
  1339. fxcsmadd f7, B2, A8, f7
  1340. LFPDUX A8, AO, INC2
  /* K step 3: B3 against the reloaded A1..A4. */
  1341. fxcpmadd f0, B3, A1, f0
  1342. LFPDUX B2, BO, INC2
  1343. fxcsmadd f4, B3, A1, f4
  1344. LFPDUX A1, AO, INC2
  1345. fxcpmadd f1, B3, A2, f1
  1346. nop
  1347. fxcsmadd f5, B3, A2, f5
  1348. LFPDUX A2, AO, INC2
  1349. fxcpmadd f2, B3, A3, f2
  1350. nop
  1351. fxcsmadd f6, B3, A3, f6
  1352. LFPDUX A3, AO, INC2
  1353. fxcpmadd f3, B3, A4, f3
  1354. nop
  1355. fxcsmadd f7, B3, A4, f7
  1356. LFPDUX A4, AO, INC2
  /* K step 4: B4 against the reloaded A5..A8. */
  1357. fxcpmadd f0, B4, A5, f0
  1358. LFPDUX B3, BO, INC2
  1359. fxcsmadd f4, B4, A5, f4
  1360. LFPDUX A5, AO, INC2
  1361. fxcpmadd f1, B4, A6, f1
  1362. nop
  1363. fxcsmadd f5, B4, A6, f5
  1364. LFPDUX A6, AO, INC2
  1365. fxcpmadd f2, B4, A7, f2
  1366. nop
  1367. fxcsmadd f6, B4, A7, f6
  1368. LFPDUX A7, AO, INC2
  1369. fxcpmadd f3, B4, A8, f3
  1370. nop
  1371. fxcsmadd f7, B4, A8, f7
  1372. LFPDUX A8, AO, INC2
  1373. bdnz+ .L52
  /* .L53: drain block for the 8x2 loop. Performs the last four K steps and loads only what those steps still need (B4 and eight A pairs); the second half issues no loads at all. */
  1374. .align 4
  1375. .L53:
  1376. fxcpmadd f0, B1, A1, f0
  1377. LFPDUX B4, BO, INC2
  1378. fxcsmadd f4, B1, A1, f4
  1379. LFPDUX A1, AO, INC2
  1380. fxcpmadd f1, B1, A2, f1
  1381. nop
  1382. fxcsmadd f5, B1, A2, f5
  1383. LFPDUX A2, AO, INC2
  1384. fxcpmadd f2, B1, A3, f2
  1385. nop
  1386. fxcsmadd f6, B1, A3, f6
  1387. LFPDUX A3, AO, INC2
  1388. fxcpmadd f3, B1, A4, f3
  1389. nop
  1390. fxcsmadd f7, B1, A4, f7
  1391. LFPDUX A4, AO, INC2
  1392. fxcpmadd f0, B2, A5, f0
  1393. nop
  1394. fxcsmadd f4, B2, A5, f4
  1395. LFPDUX A5, AO, INC2
  1396. fxcpmadd f1, B2, A6, f1
  1397. nop
  1398. fxcsmadd f5, B2, A6, f5
  1399. LFPDUX A6, AO, INC2
  1400. fxcpmadd f2, B2, A7, f2
  1401. nop
  1402. fxcsmadd f6, B2, A7, f6
  1403. LFPDUX A7, AO, INC2
  1404. fxcpmadd f3, B2, A8, f3
  1405. nop
  1406. fxcsmadd f7, B2, A8, f7
  1407. LFPDUX A8, AO, INC2
  /* Last two K steps: every operand is already in registers. */
  1408. fxcpmadd f0, B3, A1, f0
  1409. fxcsmadd f4, B3, A1, f4
  1410. fxcpmadd f1, B3, A2, f1
  1411. fxcsmadd f5, B3, A2, f5
  1412. fxcpmadd f2, B3, A3, f2
  1413. fxcsmadd f6, B3, A3, f6
  1414. fxcpmadd f3, B3, A4, f3
  1415. fxcsmadd f7, B3, A4, f7
  1416. fxcpmadd f0, B4, A5, f0
  1417. fxcsmadd f4, B4, A5, f4
  1418. fxcpmadd f1, B4, A6, f1
  1419. fxcsmadd f5, B4, A6, f5
  1420. fxcpmadd f2, B4, A7, f2
  1421. fxcsmadd f6, B4, A7, f6
  1422. fxcpmadd f3, B4, A8, f3
  1423. fxcsmadd f7, B4, A8, f7
  /* .L54: K remainder for the 8x2 tile -- the low two bits of the step count (KK for LT/RN, TEMP for LN/RT), one K step per iteration. A zero remainder goes straight to .L58. */
  1424. .align 4
  1425. .L54:
  1426. #if defined(LT) || defined(RN)
  1427. andi. r0, KK, 3
  1428. mtspr CTR, r0
  1429. ble+ .L58
  1430. #else
  1431. andi. r0, TEMP, 3
  1432. mtspr CTR, r0
  1433. ble+ .L58
  1434. #endif
  /* Preload one K step: four A pairs and one B pair. */
  1435. LFPDUX A1, AO, INC2
  1436. LFPDUX B1, BO, INC2
  1437. LFPDUX A2, AO, INC2
  1438. LFPDUX A3, AO, INC2
  1439. LFPDUX A4, AO, INC2
  1440. bdz- .L57
  1441. .align 4
  /* .L56: accumulate one K step while loading the operands for the next one. */
  1442. .L56:
  1443. fxcpmadd f0, B1, A1, f0
  1444. fxcsmadd f4, B1, A1, f4
  1445. LFPDUX A1, AO, INC2
  1446. fxcpmadd f1, B1, A2, f1
  1447. fxcsmadd f5, B1, A2, f5
  1448. LFPDUX A2, AO, INC2
  1449. fxcpmadd f2, B1, A3, f2
  1450. fxcsmadd f6, B1, A3, f6
  1451. LFPDUX A3, AO, INC2
  1452. fxcpmadd f3, B1, A4, f3
  1453. fxcsmadd f7, B1, A4, f7
  1454. LFPDUX A4, AO, INC2
  1455. LFPDUX B1, BO, INC2
  1456. bdnz+ .L56
  1457. .align 4
  /* .L57: final K step, no further loads. */
  1458. .L57:
  1459. fxcpmadd f0, B1, A1, f0
  1460. fxcsmadd f4, B1, A1, f4
  1461. fxcpmadd f1, B1, A2, f1
  1462. fxcsmadd f5, B1, A2, f5
  1463. fxcpmadd f2, B1, A3, f2
  1464. fxcsmadd f6, B1, A3, f6
  1465. fxcpmadd f3, B1, A4, f3
  1466. fxcsmadd f7, B1, A4, f7
  /* .L58: solve and store the 8x2 tile. Subtracts the accumulated products from the stored values, applies the triangular solve for the selected variant, writes the result to the packed buffer and to C, then advances pointers and KK and loops back to .L51 while I > 0. */
  1467. .align 4
  1468. .L58:
  /* LN/RT: reposition AO/BO at element offset KK - 8 (LN) or KK - 2 (RT), scaled by 8 for A and by 2 for B. */
  1469. #if defined(LN) || defined(RT)
  1470. #ifdef LN
  1471. subi r0, KK, 8
  1472. #else
  1473. subi r0, KK, 2
  1474. #endif
  1475. slwi TEMP, r0, 3 + BASE_SHIFT
  1476. slwi r0, r0, 1 + BASE_SHIFT
  1477. add AO, AORIG, TEMP
  1478. add BO, B, r0
  1479. addi BO, BO, - 2 * SIZE
  1480. #endif
  /* LN/LT: regroup the accumulators so each register pairs the two column values of one row: fsmfp copies a primary half into a secondary half, fsmtp the reverse. Row order afterwards is f0, f4, f1, f5, f2, f6, f3, f7. Then subtract from the values stored at BO. */
  1481. #if defined(LN) || defined(LT)
  1482. fpmr f24, f0
  1483. fpmr f25, f1
  1484. fpmr f26, f2
  1485. fpmr f27, f3
  1486. fsmfp f0, f4
  1487. fsmfp f1, f5
  1488. fsmfp f2, f6
  1489. fsmfp f3, f7
  1490. fsmtp f4, f24
  1491. fsmtp f5, f25
  1492. fsmtp f6, f26
  1493. fsmtp f7, f27
  1494. LFPDUX f16, BO, INC2
  1495. LFPDUX f17, BO, INC2
  1496. LFPDUX f18, BO, INC2
  1497. LFPDUX f19, BO, INC2
  1498. LFPDUX f20, BO, INC2
  1499. LFPDUX f21, BO, INC2
  1500. LFPDUX f22, BO, INC2
  1501. LFPDUX f23, BO, INC2
  1502. subi BO, BO, 16 * SIZE
  1503. fpsub f0, f16, f0
  1504. fpsub f4, f17, f4
  1505. fpsub f1, f18, f1
  1506. fpsub f5, f19, f5
  1507. fpsub f2, f20, f2
  1508. fpsub f6, f21, f6
  1509. fpsub f3, f22, f3
  1510. fpsub f7, f23, f7
  1511. #else
  /* RN/RT: keep the column-major register layout (f0..f3 for the first column, f4..f7 for the second) and subtract from the values stored at AO. */
  1512. LFPDUX f16, AO, INC2
  1513. LFPDUX f17, AO, INC2
  1514. LFPDUX f18, AO, INC2
  1515. LFPDUX f19, AO, INC2
  1516. LFPDUX f20, AO, INC2
  1517. LFPDUX f21, AO, INC2
  1518. LFPDUX f22, AO, INC2
  1519. LFPDUX f23, AO, INC2
  1520. subi AO, AO, 16 * SIZE
  1521. fpsub f0, f16, f0
  1522. fpsub f1, f17, f1
  1523. fpsub f2, f18, f2
  1524. fpsub f3, f19, f3
  1525. fpsub f4, f20, f4
  1526. fpsub f5, f21, f5
  1527. fpsub f6, f22, f6
  1528. fpsub f7, f23, f7
  1529. #endif
  /* LN: substitution from the last row to the first (f7, f3, f6, f2, f5, f1, f4, f0). The 8x8 triangular block is read backwards with stride INCM2; the bare "add AO, AO, INCM2" lines skip entries the solve does not use. Each row is scaled by multiplication (presumably a pre-inverted diagonal -- confirm against the packing code). */
  1530. #ifdef LN
  1531. addi AO, AO, 66 * SIZE
  1532. LFPDUX A1, AO, INCM2
  1533. LFPDUX A2, AO, INCM2
  1534. LFPDUX A3, AO, INCM2
  1535. LFPDUX A4, AO, INCM2
  1536. LFPDUX A5, AO, INCM2
  1537. LFPDUX A6, AO, INCM2
  1538. LFPDUX A7, AO, INCM2
  1539. LFPDUX A8, AO, INCM2
  1540. fxsmul f7, A1, f7
  1541. fxcpnmsub f3, A1, f7, f3
  1542. fxcsnmsub f6, A2, f7, f6
  1543. fxcpnmsub f2, A2, f7, f2
  1544. fxcsnmsub f5, A3, f7, f5
  1545. fxcpnmsub f1, A3, f7, f1
  1546. fxcsnmsub f4, A4, f7, f4
  1547. fxcpnmsub f0, A4, f7, f0
  1548. fxpmul f3, A5, f3
  1549. fxcsnmsub f6, A6, f3, f6
  1550. fxcpnmsub f2, A6, f3, f2
  1551. fxcsnmsub f5, A7, f3, f5
  1552. fxcpnmsub f1, A7, f3, f1
  1553. fxcsnmsub f4, A8, f3, f4
  1554. fxcpnmsub f0, A8, f3, f0
  1555. add AO, AO, INCM2
  1556. LFPDUX A1, AO, INCM2
  1557. LFPDUX A2, AO, INCM2
  1558. LFPDUX A3, AO, INCM2
  1559. add AO, AO, INCM2
  1560. LFPDUX A4, AO, INCM2
  1561. LFPDUX A5, AO, INCM2
  1562. LFPDUX A6, AO, INCM2
  1563. add AO, AO, INCM2
  1564. add AO, AO, INCM2
  1565. LFPDUX A7, AO, INCM2
  1566. LFPDUX A8, AO, INCM2
  1567. fxsmul f6, A1, f6
  1568. fxcpnmsub f2, A1, f6, f2
  1569. fxcsnmsub f5, A2, f6, f5
  1570. fxcpnmsub f1, A2, f6, f1
  1571. fxcsnmsub f4, A3, f6, f4
  1572. fxcpnmsub f0, A3, f6, f0
  1573. fxpmul f2, A4, f2
  1574. fxcsnmsub f5, A5, f2, f5
  1575. fxcpnmsub f1, A5, f2, f1
  1576. fxcsnmsub f4, A6, f2, f4
  1577. fxcpnmsub f0, A6, f2, f0
  1578. fxsmul f5, A7, f5
  1579. fxcpnmsub f1, A7, f5, f1
  1580. fxcsnmsub f4, A8, f5, f4
  1581. fxcpnmsub f0, A8, f5, f0
  1582. add AO, AO, INCM2
  1583. add AO, AO, INCM2
  1584. LFPDUX A1, AO, INCM2
  1585. LFPDUX A2, AO, INCM2
  1586. subi AO, AO, 6 * SIZE
  1587. LFPDUX A3, AO, INCM2
  1588. subi AO, AO, 6 * SIZE
  1589. LFPDUX A4, AO, INCM2
  1590. addi AO, AO, -2 * SIZE
  1591. fxpmul f1, A1, f1
  1592. fxcsnmsub f4, A2, f1, f4
  1593. fxcpnmsub f0, A2, f1, f0
  1594. fxsmul f4, A3, f4
  1595. fxcpnmsub f0, A3, f4, f0
  1596. fxpmul f0, A4, f0
  1597. #endif
  /* LT: substitution from the first row to the last (f0, f4, f1, f5, f2, f6, f3, f7), reading the triangular block forwards with stride INC2 and skipping unused entries with bare adds. The closing "subi AO, AO, 64 * SIZE" rewinds AO. */
  1598. #ifdef LT
  1599. LFPDUX A1, AO, INC2
  1600. LFPDUX A2, AO, INC2
  1601. LFPDUX A3, AO, INC2
  1602. LFPDUX A4, AO, INC2
  1603. LFPDUX A5, AO, INC2
  1604. LFPDUX A6, AO, INC2
  1605. LFPDUX A7, AO, INC2
  1606. LFPDUX A8, AO, INC2
  1607. fxpmul f0, A1, f0
  1608. fxcsnmsub f4, A1, f0, f4
  1609. fxcpnmsub f1, A2, f0, f1
  1610. fxcsnmsub f5, A2, f0, f5
  1611. fxcpnmsub f2, A3, f0, f2
  1612. fxcsnmsub f6, A3, f0, f6
  1613. fxcpnmsub f3, A4, f0, f3
  1614. fxcsnmsub f7, A4, f0, f7
  1615. fxsmul f4, A5, f4
  1616. fxcpnmsub f1, A6, f4, f1
  1617. fxcsnmsub f5, A6, f4, f5
  1618. fxcpnmsub f2, A7, f4, f2
  1619. fxcsnmsub f6, A7, f4, f6
  1620. fxcpnmsub f3, A8, f4, f3
  1621. fxcsnmsub f7, A8, f4, f7
  1622. add AO, AO, INC2
  1623. LFPDUX A1, AO, INC2
  1624. LFPDUX A2, AO, INC2
  1625. LFPDUX A3, AO, INC2
  1626. add AO, AO, INC2
  1627. LFPDUX A4, AO, INC2
  1628. LFPDUX A5, AO, INC2
  1629. LFPDUX A6, AO, INC2
  1630. add AO, AO, INC2
  1631. add AO, AO, INC2
  1632. LFPDUX A7, AO, INC2
  1633. LFPDUX A8, AO, INC2
  1634. fxpmul f1, A1, f1
  1635. fxcsnmsub f5, A1, f1, f5
  1636. fxcpnmsub f2, A2, f1, f2
  1637. fxcsnmsub f6, A2, f1, f6
  1638. fxcpnmsub f3, A3, f1, f3
  1639. fxcsnmsub f7, A3, f1, f7
  1640. fxsmul f5, A4, f5
  1641. fxcpnmsub f2, A5, f5, f2
  1642. fxcsnmsub f6, A5, f5, f6
  1643. fxcpnmsub f3, A6, f5, f3
  1644. fxcsnmsub f7, A6, f5, f7
  1645. fxpmul f2, A7, f2
  1646. fxcsnmsub f6, A7, f2, f6
  1647. fxcpnmsub f3, A8, f2, f3
  1648. fxcsnmsub f7, A8, f2, f7
  1649. add AO, AO, INC2
  1650. add AO, AO, INC2
  1651. LFPDUX A1, AO, INC2
  1652. LFPDUX A2, AO, INC2
  1653. addi AO, AO, 6 * SIZE
  1654. LFPDUX A3, AO, INC2
  1655. addi AO, AO, 6 * SIZE
  1656. LFPDUX A4, AO, INC2
  1657. subi AO, AO, 64 * SIZE
  1658. fxsmul f6, A1, f6
  1659. fxcpnmsub f3, A2, f6, f3
  1660. fxcsnmsub f7, A2, f6, f7
  1661. fxpmul f3, A3, f3
  1662. fxcsnmsub f7, A3, f3, f7
  1663. fxsmul f7, A4, f7
  1664. #endif
  /* RN: 2x2 solve across the columns, read from BO. Scale the first column (f0..f3) by A1's primary, subtract A1's secondary times it from the second column (f4..f7), then scale the second column by A2's secondary. */
  1665. #ifdef RN
  1666. LFPDUX A1, BO, INC2
  1667. LFPDUX A2, BO, INC2
  1668. subi BO, BO, 4 * SIZE
  1669. fxpmul f0, A1, f0
  1670. fxpmul f1, A1, f1
  1671. fxpmul f2, A1, f2
  1672. fxpmul f3, A1, f3
  1673. fxcsnmsub f4, A1, f0, f4
  1674. fxcsnmsub f5, A1, f1, f5
  1675. fxcsnmsub f6, A1, f2, f6
  1676. fxcsnmsub f7, A1, f3, f7
  1677. fxsmul f4, A2, f4
  1678. fxsmul f5, A2, f5
  1679. fxsmul f6, A2, f6
  1680. fxsmul f7, A2, f7
  1681. #endif
  /* RT: the same 2x2 solve in the opposite order -- second column first, then the first column. Note the loads fill A2 before A1. */
  1682. #ifdef RT
  1683. LFPDUX A2, BO, INC2
  1684. LFPDUX A1, BO, INC2
  1685. subi BO, BO, 4 * SIZE
  1686. fxsmul f4, A1, f4
  1687. fxsmul f5, A1, f5
  1688. fxsmul f6, A1, f6
  1689. fxsmul f7, A1, f7
  1690. fxcpnmsub f0, A1, f4, f0
  1691. fxcpnmsub f1, A1, f5, f1
  1692. fxcpnmsub f2, A1, f6, f2
  1693. fxcpnmsub f3, A1, f7, f3
  1694. fxpmul f0, A2, f0
  1695. fxpmul f1, A2, f1
  1696. fxpmul f2, A2, f2
  1697. fxpmul f3, A2, f3
  1698. #endif
  /* LN steps CO1/CO2 back by 8 elements before the stores and again after them. */
  1699. #ifdef LN
  1700. subi CO1, CO1, 8 * SIZE
  1701. subi CO2, CO2, 8 * SIZE
  1702. #endif
  /* LN/LT: write the solved rows back through BO (then rewind BO), then store primary halves through CO1 and secondary halves through CO2. */
  1703. #if defined(LN) || defined(LT)
  1704. STFPDUX f0, BO, INC2
  1705. STFPDUX f4, BO, INC2
  1706. STFPDUX f1, BO, INC2
  1707. STFPDUX f5, BO, INC2
  1708. STFPDUX f2, BO, INC2
  1709. STFPDUX f6, BO, INC2
  1710. STFPDUX f3, BO, INC2
  1711. STFPDUX f7, BO, INC2
  1712. subi BO, BO, 16 * SIZE
  1713. STFDUX f0, CO1, INC
  1714. STFDUX f4, CO1, INC
  1715. STFDUX f1, CO1, INC
  1716. STFDUX f5, CO1, INC
  1717. STFDUX f2, CO1, INC
  1718. STFDUX f6, CO1, INC
  1719. STFDUX f3, CO1, INC
  1720. STFDUX f7, CO1, INC
  1721. STFSDUX f0, CO2, INC
  1722. STFSDUX f4, CO2, INC
  1723. STFSDUX f1, CO2, INC
  1724. STFSDUX f5, CO2, INC
  1725. STFSDUX f2, CO2, INC
  1726. STFSDUX f6, CO2, INC
  1727. STFSDUX f3, CO2, INC
  1728. STFSDUX f7, CO2, INC
  1729. #else
  /* RN/RT: write the solved columns back through AO (then rewind AO); f0..f3 go to CO1 and f4..f7 to CO2, both halves of each register in turn. */
  1730. STFPDUX f0, AO, INC2
  1731. STFPDUX f1, AO, INC2
  1732. STFPDUX f2, AO, INC2
  1733. STFPDUX f3, AO, INC2
  1734. STFPDUX f4, AO, INC2
  1735. STFPDUX f5, AO, INC2
  1736. STFPDUX f6, AO, INC2
  1737. STFPDUX f7, AO, INC2
  1738. subi AO, AO, 16 * SIZE
  1739. STFDUX f0, CO1, INC
  1740. STFSDUX f0, CO1, INC
  1741. STFDUX f1, CO1, INC
  1742. STFSDUX f1, CO1, INC
  1743. STFDUX f2, CO1, INC
  1744. STFSDUX f2, CO1, INC
  1745. STFDUX f3, CO1, INC
  1746. STFSDUX f3, CO1, INC
  1747. STFDUX f4, CO2, INC
  1748. STFSDUX f4, CO2, INC
  1749. STFDUX f5, CO2, INC
  1750. STFSDUX f5, CO2, INC
  1751. STFDUX f6, CO2, INC
  1752. STFSDUX f6, CO2, INC
  1753. STFDUX f7, CO2, INC
  1754. STFSDUX f7, CO2, INC
  1755. #endif
  1756. #ifdef LN
  1757. subi CO1, CO1, 8 * SIZE
  1758. subi CO2, CO2, 8 * SIZE
  1759. #endif
  /* RT: advance AORIG by 8*K elements to the next row tile. */
  1760. #ifdef RT
  1761. slwi r0, K, 3 + BASE_SHIFT
  1762. add AORIG, AORIG, r0
  1763. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  1764. #if defined(LT) || defined(RN)
  1765. sub TEMP, K, KK
  1766. slwi r0, TEMP, 3 + BASE_SHIFT
  1767. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1768. add AO, AO, r0
  1769. add BO, BO, TEMP
  1770. #endif
  1771. #ifdef LT
  1772. addi KK, KK, 8
  1773. #endif
  1774. #ifdef LN
  1775. subi KK, KK, 8
  1776. #endif
  /* Next 8-row tile: decrement I, reload f0 from SP + FZERO, and loop while I > 0. */
  1777. addic. I, I, -1
  1778. li r0, FZERO
  1779. lfpsx f0, SP, r0
  1780. bgt+ .L51
  /* .L60: 4-row by 2-column tile, run only when bit 2 of M is set (otherwise go to .L70). Copies f0 into f1..f3, positions AO/BO, sets CTR to the K-loop trip count (unrolled by 4) and preloads a full iteration of operands. */
  1781. .align 4
  1782. .L60:
  1783. andi. I, M, 4
  1784. beq .L70
  /* LT/RN: BO restarts at B and the loop covers KK steps. */
  1785. #if defined(LT) || defined(RN)
  1786. fpmr f1, f0
  1787. addi BO, B, - 2 * SIZE
  1788. fpmr f2, f0
  1789. fpmr f3, f0
  1790. srawi. r0, KK, 2
  1791. mtspr CTR, r0
  1792. ble .L64
  1793. #else
  /* LN/RT: AO = AORIG + 4*KK elements, BO = B + 2*KK elements, and the loop covers TEMP = K - KK steps. LN first steps AORIG back by 4*K elements. */
  1794. #ifdef LN
  1795. slwi r0, K, 2 + BASE_SHIFT
  1796. sub AORIG, AORIG, r0
  1797. #endif
  1798. slwi r0 , KK, 2 + BASE_SHIFT
  1799. slwi TEMP, KK, 1 + BASE_SHIFT
  1800. add AO, AORIG, r0
  1801. add BO, B, TEMP
  1802. sub TEMP, K, KK
  1803. fpmr f1, f0
  1804. addi BO, BO, - 2 * SIZE
  1805. fpmr f2, f0
  1806. fpmr f3, f0
  1807. srawi. r0, TEMP, 2
  1808. mtspr CTR, r0
  1809. ble .L64
  1810. #endif
  /* Preload four K steps: two A pairs and one B pair per step. */
  1811. LFPDUX B1, BO, INC2
  1812. LFPDUX A1, AO, INC2
  1813. LFPDUX A2, AO, INC2
  1814. LFPDUX B2, BO, INC2
  1815. LFPDUX A3, AO, INC2
  1816. LFPDUX A4, AO, INC2
  1817. LFPDUX B3, BO, INC2
  1818. LFPDUX A5, AO, INC2
  1819. LFPDUX A6, AO, INC2
  1820. LFPDUX B4, BO, INC2
  1821. LFPDUX A7, AO, INC2
  1822. LFPDUX A8, AO, INC2
  1823. bdz- .L63
  /* .L62: 4x2 main loop, four K steps per iteration. The primary element of each B pair accumulates into f0/f1 (first column) and the secondary into f2/f3 (second column); each operand is reloaded right after its last use. */
  1824. .align 4
  1825. .L62:
  1826. fxcpmadd f0, B1, A1, f0
  1827. fxcsmadd f2, B1, A1, f2
  1828. LFPDUX A1, AO, INC2
  1829. fxcpmadd f1, B1, A2, f1
  1830. fxcsmadd f3, B1, A2, f3
  1831. LFPDUX A2, AO, INC2
  1832. LFPDUX B1, BO, INC2
  1833. fxcpmadd f0, B2, A3, f0
  1834. fxcsmadd f2, B2, A3, f2
  1835. LFPDUX A3, AO, INC2
  1836. fxcpmadd f1, B2, A4, f1
  1837. fxcsmadd f3, B2, A4, f3
  1838. LFPDUX A4, AO, INC2
  1839. LFPDUX B2, BO, INC2
  1840. fxcpmadd f0, B3, A5, f0
  1841. fxcsmadd f2, B3, A5, f2
  1842. LFPDUX A5, AO, INC2
  1843. fxcpmadd f1, B3, A6, f1
  1844. fxcsmadd f3, B3, A6, f3
  1845. LFPDUX A6, AO, INC2
  1846. LFPDUX B3, BO, INC2
  1847. fxcpmadd f0, B4, A7, f0
  1848. fxcsmadd f2, B4, A7, f2
  1849. LFPDUX A7, AO, INC2
  1850. fxcpmadd f1, B4, A8, f1
  1851. fxcsmadd f3, B4, A8, f3
  1852. LFPDUX A8, AO, INC2
  1853. LFPDUX B4, BO, INC2
  1854. bdnz+ .L62
  /* .L63: drain block for the 4x2 loop -- the last four K steps using operands already in registers, with no further loads. */
  1855. .align 4
  1856. .L63:
  1857. fxcpmadd f0, B1, A1, f0
  1858. fxcsmadd f2, B1, A1, f2
  1859. fxcpmadd f1, B1, A2, f1
  1860. fxcsmadd f3, B1, A2, f3
  1861. fxcpmadd f0, B2, A3, f0
  1862. fxcsmadd f2, B2, A3, f2
  1863. fxcpmadd f1, B2, A4, f1
  1864. fxcsmadd f3, B2, A4, f3
  1865. fxcpmadd f0, B3, A5, f0
  1866. fxcsmadd f2, B3, A5, f2
  1867. fxcpmadd f1, B3, A6, f1
  1868. fxcsmadd f3, B3, A6, f3
  1869. fxcpmadd f0, B4, A7, f0
  1870. fxcsmadd f2, B4, A7, f2
  1871. fxcpmadd f1, B4, A8, f1
  1872. fxcsmadd f3, B4, A8, f3
  /* .L64: K remainder for the 4x2 tile -- the low two bits of the step count (KK for LT/RN, TEMP for LN/RT), one K step per iteration. A zero remainder goes straight to .L68. */
  1873. .align 4
  1874. .L64:
  1875. #if defined(LT) || defined(RN)
  1876. andi. r0, KK, 3
  1877. mtspr CTR, r0
  1878. ble+ .L68
  1879. #else
  1880. andi. r0, TEMP, 3
  1881. mtspr CTR, r0
  1882. ble+ .L68
  1883. #endif
  /* Preload one K step: two A pairs and one B pair. */
  1884. LFPDUX A1, AO, INC2
  1885. LFPDUX B1, BO, INC2
  1886. LFPDUX A2, AO, INC2
  1887. bdz- .L67
  1888. .align 4
  /* .L66: accumulate one K step while loading the operands for the next one. */
  1889. .L66:
  1890. fxcpmadd f0, B1, A1, f0
  1891. fxcsmadd f2, B1, A1, f2
  1892. LFPDUX A1, AO, INC2
  1893. fxcpmadd f1, B1, A2, f1
  1894. fxcsmadd f3, B1, A2, f3
  1895. LFPDUX B1, BO, INC2
  1896. LFPDUX A2, AO, INC2
  1897. bdnz+ .L66
  1898. .align 4
  /* .L67: final K step, no further loads. */
  1899. .L67:
  1900. fxcpmadd f0, B1, A1, f0
  1901. fxcsmadd f2, B1, A1, f2
  1902. fxcpmadd f1, B1, A2, f1
  1903. fxcsmadd f3, B1, A2, f3
  /* .L68: solve and store the 4x2 tile. Same sequence as the 8-row tile with half as many rows: subtract from the stored values, apply the variant-specific triangular solve, write back to the packed buffer and to C, then advance pointers and KK. */
  1904. .align 4
  1905. .L68:
  /* LN/RT: reposition AO/BO at element offset KK - 4 (LN) or KK - 2 (RT), scaled by 4 for A and by 2 for B. */
  1906. #if defined(LN) || defined(RT)
  1907. #ifdef LN
  1908. subi r0, KK, 4
  1909. #else
  1910. subi r0, KK, 2
  1911. #endif
  1912. slwi TEMP, r0, 2 + BASE_SHIFT
  1913. slwi r0, r0, 1 + BASE_SHIFT
  1914. add AO, AORIG, TEMP
  1915. add BO, B, r0
  1916. addi BO, BO, - 2 * SIZE
  1917. #endif
  /* LN/LT: regroup so each register pairs the two column values of one row (row order f0, f2, f1, f3), then subtract from the values stored at BO. */
  1918. #if defined(LN) || defined(LT)
  1919. fpmr f24, f0
  1920. fpmr f25, f1
  1921. fsmfp f0, f2
  1922. fsmfp f1, f3
  1923. fsmtp f2, f24
  1924. fsmtp f3, f25
  1925. LFPDUX f16, BO, INC2
  1926. LFPDUX f17, BO, INC2
  1927. LFPDUX f18, BO, INC2
  1928. LFPDUX f19, BO, INC2
  1929. subi BO, BO, 8 * SIZE
  1930. fpsub f0, f16, f0
  1931. fpsub f2, f17, f2
  1932. fpsub f1, f18, f1
  1933. fpsub f3, f19, f3
  1934. #else
  /* RN/RT: keep the column layout (f0/f1 first column, f2/f3 second) and subtract from the values stored at AO. */
  1935. LFPDUX f16, AO, INC2
  1936. LFPDUX f17, AO, INC2
  1937. LFPDUX f18, AO, INC2
  1938. LFPDUX f19, AO, INC2
  1939. subi AO, AO, 8 * SIZE
  1940. fpsub f0, f16, f0
  1941. fpsub f1, f17, f1
  1942. fpsub f2, f18, f2
  1943. fpsub f3, f19, f3
  1944. #endif
  /* LN: substitution from the last row to the first (f3, f1, f2, f0), reading the 4x4 triangular block backwards with stride INCM2. */
  1945. #ifdef LN
  1946. addi AO, AO, 18 * SIZE
  1947. LFPDUX A1, AO, INCM2
  1948. LFPDUX A2, AO, INCM2
  1949. LFPDUX A3, AO, INCM2
  1950. LFPDUX A4, AO, INCM2
  1951. add AO, AO, INCM2
  1952. LFPDUX A5, AO, INCM2
  1953. add AO, AO, INCM2
  1954. LFPDUX A6, AO, INCM2
  1955. subi AO, AO, 2 * SIZE
  1956. fxsmul f3, A1, f3
  1957. fxcpnmsub f1, A1, f3, f1
  1958. fxcsnmsub f2, A2, f3, f2
  1959. fxcpnmsub f0, A2, f3, f0
  1960. fxpmul f1, A3, f1
  1961. fxcsnmsub f2, A4, f1, f2
  1962. fxcpnmsub f0, A4, f1, f0
  1963. fxsmul f2, A5, f2
  1964. fxcpnmsub f0, A5, f2, f0
  1965. fxpmul f0, A6, f0
  1966. #endif
  /* LT: substitution from the first row to the last (f0, f2, f1, f3), reading forwards; AO is rewound by 16 elements afterwards. */
  1967. #ifdef LT
  1968. LFPDUX A1, AO, INC2
  1969. LFPDUX A2, AO, INC2
  1970. LFPDUX A3, AO, INC2
  1971. LFPDUX A4, AO, INC2
  1972. add AO, AO, INC2
  1973. LFPDUX A5, AO, INC2
  1974. add AO, AO, INC2
  1975. LFPDUX A6, AO, INC2
  1976. subi AO, AO, 16 * SIZE
  1977. fxpmul f0, A1, f0
  1978. fxcsnmsub f2, A1, f0, f2
  1979. fxcpnmsub f1, A2, f0, f1
  1980. fxcsnmsub f3, A2, f0, f3
  1981. fxsmul f2, A3, f2
  1982. fxcpnmsub f1, A4, f2, f1
  1983. fxcsnmsub f3, A4, f2, f3
  1984. fxpmul f1, A5, f1
  1985. fxcsnmsub f3, A5, f1, f3
  1986. fxsmul f3, A6, f3
  1987. #endif
  /* RN: 2x2 solve across the columns -- first column (f0/f1), then second (f2/f3). */
  1988. #ifdef RN
  1989. LFPDUX A1, BO, INC2
  1990. LFPDUX A2, BO, INC2
  1991. subi BO, BO, 4 * SIZE
  1992. fxpmul f0, A1, f0
  1993. fxpmul f1, A1, f1
  1994. fxcsnmsub f2, A1, f0, f2
  1995. fxcsnmsub f3, A1, f1, f3
  1996. fxsmul f2, A2, f2
  1997. fxsmul f3, A2, f3
  1998. #endif
  /* RT: 2x2 solve in the opposite order -- second column first. The loads fill A2 before A1. */
  1999. #ifdef RT
  2000. LFPDUX A2, BO, INC2
  2001. LFPDUX A1, BO, INC2
  2002. subi BO, BO, 4 * SIZE
  2003. fxsmul f2, A1, f2
  2004. fxsmul f3, A1, f3
  2005. fxcpnmsub f0, A1, f2, f0
  2006. fxcpnmsub f1, A1, f3, f1
  2007. fxpmul f0, A2, f0
  2008. fxpmul f1, A2, f1
  2009. #endif
  /* LN steps CO1/CO2 back by 4 elements before the stores and again after them. */
  2010. #ifdef LN
  2011. subi CO1, CO1, 4 * SIZE
  2012. subi CO2, CO2, 4 * SIZE
  2013. #endif
  /* LN/LT: write rows back through BO, then store primary halves through CO1 and secondary halves through CO2. */
  2014. #if defined(LN) || defined(LT)
  2015. STFPDUX f0, BO, INC2
  2016. STFPDUX f2, BO, INC2
  2017. STFPDUX f1, BO, INC2
  2018. STFPDUX f3, BO, INC2
  2019. subi BO, BO, 8 * SIZE
  2020. STFDUX f0, CO1, INC
  2021. STFDUX f2, CO1, INC
  2022. STFDUX f1, CO1, INC
  2023. STFDUX f3, CO1, INC
  2024. STFSDUX f0, CO2, INC
  2025. STFSDUX f2, CO2, INC
  2026. STFSDUX f1, CO2, INC
  2027. STFSDUX f3, CO2, INC
  2028. #else
  /* RN/RT: write columns back through AO; f0/f1 go to CO1 and f2/f3 to CO2. */
  2029. STFPDUX f0, AO, INC2
  2030. STFPDUX f1, AO, INC2
  2031. STFPDUX f2, AO, INC2
  2032. STFPDUX f3, AO, INC2
  2033. subi AO, AO, 8 * SIZE
  2034. STFDUX f0, CO1, INC
  2035. STFSDUX f0, CO1, INC
  2036. STFDUX f1, CO1, INC
  2037. STFSDUX f1, CO1, INC
  2038. STFDUX f2, CO2, INC
  2039. STFSDUX f2, CO2, INC
  2040. STFDUX f3, CO2, INC
  2041. STFSDUX f3, CO2, INC
  2042. #endif
  2043. #ifdef LN
  2044. subi CO1, CO1, 4 * SIZE
  2045. subi CO2, CO2, 4 * SIZE
  2046. #endif
  /* RT: advance AORIG by 4*K elements. */
  2047. #ifdef RT
  2048. slwi r0, K, 2 + BASE_SHIFT
  2049. add AORIG, AORIG, r0
  2050. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  2051. #if defined(LT) || defined(RN)
  2052. sub TEMP, K, KK
  2053. slwi r0, TEMP, 2 + BASE_SHIFT
  2054. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2055. add AO, AO, r0
  2056. add BO, BO, TEMP
  2057. #endif
  2058. #ifdef LT
  2059. addi KK, KK, 4
  2060. #endif
  2061. #ifdef LN
  2062. subi KK, KK, 4
  2063. #endif
  /* Reload f0 from SP + FZERO for the next tile. */
  2064. li r0, FZERO
  2065. lfpsx f0, SP, r0
  /* .L70: 2-row by 2-column tile, run only when bit 1 of M is set (otherwise go to .L80). The K loop here is unrolled by 8, so the trip count is the step count >> 3. A9 and A10 are used as two extra B-operand registers alongside B1..B6. */
  2066. .align 4
  2067. .L70:
  2068. andi. I, M, 2
  2069. beq .L80
  /* LT/RN: BO restarts at B and the loop covers KK steps. */
  2070. #if defined(LT) || defined(RN)
  2071. addi BO, B, - 2 * SIZE
  2072. fpmr f1, f0
  2073. fpmr f2, f0
  2074. fpmr f3, f0
  2075. srawi. r0, KK, 3
  2076. mtspr CTR, r0
  2077. ble .L74
  2078. #else
  /* LN/RT: AO = AORIG + 2*KK elements, BO = B + 2*KK elements, and the loop covers TEMP = K - KK steps. LN first steps AORIG back by 2*K elements. */
  2079. #ifdef LN
  2080. slwi r0, K, 1 + BASE_SHIFT
  2081. sub AORIG, AORIG, r0
  2082. #endif
  2083. slwi r0 , KK, 1 + BASE_SHIFT
  2084. slwi TEMP, KK, 1 + BASE_SHIFT
  2085. add AO, AORIG, r0
  2086. add BO, B, TEMP
  2087. sub TEMP, K, KK
  2088. addi BO, BO, - 2 * SIZE
  2089. fpmr f1, f0
  2090. fpmr f2, f0
  2091. fpmr f3, f0
  2092. srawi. r0, TEMP, 3
  2093. mtspr CTR, r0
  2094. ble .L74
  2095. #endif
  /* Preload eight K steps: one A pair and one B pair per step. */
  2096. LFPDUX A1, AO, INC2
  2097. LFPDUX B1, BO, INC2
  2098. LFPDUX A2, AO, INC2
  2099. LFPDUX B2, BO, INC2
  2100. LFPDUX A3, AO, INC2
  2101. LFPDUX B3, BO, INC2
  2102. LFPDUX A4, AO, INC2
  2103. LFPDUX B4, BO, INC2
  2104. LFPDUX A5, AO, INC2
  2105. LFPDUX B5, BO, INC2
  2106. LFPDUX A6, AO, INC2
  2107. LFPDUX B6, BO, INC2
  2108. LFPDUX A7, AO, INC2
  2109. LFPDUX A9, BO, INC2
  2110. LFPDUX A8, AO, INC2
  2111. LFPDUX A10, BO, INC2
  2112. bdz- .L73
  /* .L72: 2x2 main loop, eight K steps per iteration. Alternate K steps accumulate into f0/f1 and f2/f3; the two sets are summed at .L78. Each operand pair is reloaded right after use. */
  2113. .align 4
  2114. .L72:
  2115. fxcpmadd f0, B1, A1, f0
  2116. fxcsmadd f1, B1, A1, f1
  2117. LFPDUX A1, AO, INC2
  2118. LFPDUX B1, BO, INC2
  2119. fxcpmadd f2, B2, A2, f2
  2120. fxcsmadd f3, B2, A2, f3
  2121. LFPDUX A2, AO, INC2
  2122. LFPDUX B2, BO, INC2
  2123. fxcpmadd f0, B3, A3, f0
  2124. fxcsmadd f1, B3, A3, f1
  2125. LFPDUX A3, AO, INC2
  2126. LFPDUX B3, BO, INC2
  2127. fxcpmadd f2, B4, A4, f2
  2128. fxcsmadd f3, B4, A4, f3
  2129. LFPDUX A4, AO, INC2
  2130. LFPDUX B4, BO, INC2
  2131. fxcpmadd f0, B5, A5, f0
  2132. fxcsmadd f1, B5, A5, f1
  2133. LFPDUX A5, AO, INC2
  2134. LFPDUX B5, BO, INC2
  2135. fxcpmadd f2, B6, A6, f2
  2136. fxcsmadd f3, B6, A6, f3
  2137. LFPDUX A6, AO, INC2
  2138. LFPDUX B6, BO, INC2
  2139. fxcpmadd f0, A9, A7, f0
  2140. fxcsmadd f1, A9, A7, f1
  2141. LFPDUX A7, AO, INC2
  2142. LFPDUX A9, BO, INC2
  2143. fxcpmadd f2, A10, A8, f2
  2144. fxcsmadd f3, A10, A8, f3
  2145. LFPDUX A8, AO, INC2
  2146. LFPDUX A10, BO, INC2
  2147. bdnz+ .L72
  /* .L73: drain block for the 2x2 loop -- the last eight K steps using operands already in registers, with no further loads. */
  2148. .align 4
  2149. .L73:
  2150. fxcpmadd f0, B1, A1, f0
  2151. fxcsmadd f1, B1, A1, f1
  2152. fxcpmadd f2, B2, A2, f2
  2153. fxcsmadd f3, B2, A2, f3
  2154. fxcpmadd f0, B3, A3, f0
  2155. fxcsmadd f1, B3, A3, f1
  2156. fxcpmadd f2, B4, A4, f2
  2157. fxcsmadd f3, B4, A4, f3
  2158. fxcpmadd f0, B5, A5, f0
  2159. fxcsmadd f1, B5, A5, f1
  2160. fxcpmadd f2, B6, A6, f2
  2161. fxcsmadd f3, B6, A6, f3
  2162. fxcpmadd f0, A9, A7, f0
  2163. fxcsmadd f1, A9, A7, f1
  2164. fxcpmadd f2, A10, A8, f2
  2165. fxcsmadd f3, A10, A8, f3
  /* .L74: K remainder for the 2x2 tile -- the low three bits of the step count (KK for LT/RN, TEMP for LN/RT), one K step per iteration into f0/f1. A zero remainder goes straight to .L78. */
  2166. .align 4
  2167. .L74:
  2168. #if defined(LT) || defined(RN)
  2169. andi. r0, KK, 7
  2170. mtspr CTR, r0
  2171. ble+ .L78
  2172. #else
  2173. andi. r0, TEMP, 7
  2174. mtspr CTR, r0
  2175. ble+ .L78
  2176. #endif
  2177. LFPDUX A1, AO, INC2
  2178. LFPDUX B1, BO, INC2
  2179. bdz- .L77
  2180. .align 4
  /* .L76: accumulate one K step while loading the next A/B pair. */
  2181. .L76:
  2182. fxcpmadd f0, B1, A1, f0
  2183. fxcsmadd f1, B1, A1, f1
  2184. LFPDUX A1, AO, INC2
  2185. LFPDUX B1, BO, INC2
  2186. bdnz+ .L76
  2187. .align 4
  /* .L77: final K step, no further loads. */
  2188. .L77:
  2189. fxcpmadd f0, B1, A1, f0
  2190. fxcsmadd f1, B1, A1, f1
  /* .L78: solve and store the 2x2 tile. First merges the two accumulator sets, then follows the same subtract / solve / store / advance sequence as the larger tiles. */
  2191. .align 4
  2192. .L78:
  /* Combine the partial sums from alternate K steps: f0 += f2 and f1 += f3. */
  2193. fpadd f0, f0, f2
  2194. fpadd f1, f1, f3
  /* LN/RT: reposition AO/BO at element offset KK - 2, scaled by 2 for both buffers (the two preprocessor branches are identical for this tile). */
  2195. #if defined(LN) || defined(RT)
  2196. #ifdef LN
  2197. subi r0, KK, 2
  2198. #else
  2199. subi r0, KK, 2
  2200. #endif
  2201. slwi TEMP, r0, 1 + BASE_SHIFT
  2202. slwi r0, r0, 1 + BASE_SHIFT
  2203. add AO, AORIG, TEMP
  2204. add BO, B, r0
  2205. addi BO, BO, - 2 * SIZE
  2206. #endif
  /* LN/LT: swap the off-diagonal halves so f0 and f1 each hold one row, then subtract from the values stored at BO. */
  2207. #if defined(LN) || defined(LT)
  2208. fpmr f24, f0
  2209. fsmfp f0, f1
  2210. fsmtp f1, f24
  2211. LFPDUX f16, BO, INC2
  2212. LFPDUX f17, BO, INC2
  2213. subi BO, BO, 4 * SIZE
  2214. fpsub f0, f16, f0
  2215. fpsub f1, f17, f1
  2216. #else
  /* RN/RT: keep one column per register and subtract from the values stored at AO. */
  2217. LFPDUX f16, AO, INC2
  2218. LFPDUX f17, AO, INC2
  2219. subi AO, AO, 4 * SIZE
  2220. fpsub f0, f16, f0
  2221. fpsub f1, f17, f1
  2222. #endif
  /* LN: second row (f1) first, then the first row (f0). */
  2223. #ifdef LN
  2224. LFPDUX A1, AO, INC2
  2225. LFPDUX A2, AO, INC2
  2226. addi AO, AO, -4 * SIZE
  2227. fxsmul f1, A2, f1
  2228. fxcpnmsub f0, A2, f1, f0
  2229. fxpmul f0, A1, f0
  2230. #endif
  /* LT: first row (f0) first, then the second row (f1). */
  2231. #ifdef LT
  2232. LFPDUX A1, AO, INC2
  2233. LFPDUX A2, AO, INC2
  2234. addi AO, AO, -4 * SIZE
  2235. fxpmul f0, A1, f0
  2236. fxcsnmsub f1, A1, f0, f1
  2237. fxsmul f1, A2, f1
  2238. #endif
  /* RN: first column (f0) first, then the second column (f1); coefficients are read from BO. */
  2239. #ifdef RN
  2240. LFPDUX A1, BO, INC2
  2241. LFPDUX A2, BO, INC2
  2242. subi BO, BO, 4 * SIZE
  2243. fxpmul f0, A1, f0
  2244. fxcsnmsub f1, A1, f0, f1
  2245. fxsmul f1, A2, f1
  2246. #endif
  /* RT: second column (f1) first, then the first column (f0). The loads fill A2 before A1. */
  2247. #ifdef RT
  2248. LFPDUX A2, BO, INC2
  2249. LFPDUX A1, BO, INC2
  2250. subi BO, BO, 4 * SIZE
  2251. fxsmul f1, A1, f1
  2252. fxcpnmsub f0, A1, f1, f0
  2253. fxpmul f0, A2, f0
  2254. #endif
  /* LN steps CO1/CO2 back by 2 elements before the stores and again after them. */
  2255. #ifdef LN
  2256. subi CO1, CO1, 2 * SIZE
  2257. subi CO2, CO2, 2 * SIZE
  2258. #endif
  /* LN/LT: write rows back through BO; primary halves go to CO1 and secondary halves to CO2. */
  2259. #if defined(LN) || defined(LT)
  2260. STFPDUX f0, BO, INC2
  2261. STFPDUX f1, BO, INC2
  2262. subi BO, BO, 4 * SIZE
  2263. STFDUX f0, CO1, INC
  2264. STFDUX f1, CO1, INC
  2265. STFSDUX f0, CO2, INC
  2266. STFSDUX f1, CO2, INC
  2267. #else
  /* RN/RT: write columns back through AO; f0 goes to CO1 and f1 to CO2. */
  2268. STFPDUX f0, AO, INC2
  2269. STFPDUX f1, AO, INC2
  2270. subi AO, AO, 4 * SIZE
  2271. STFDUX f0, CO1, INC
  2272. STFSDUX f0, CO1, INC
  2273. STFDUX f1, CO2, INC
  2274. STFSDUX f1, CO2, INC
  2275. #endif
  2276. #ifdef LN
  2277. subi CO1, CO1, 2 * SIZE
  2278. subi CO2, CO2, 2 * SIZE
  2279. #endif
  /* RT: advance AORIG by 2*K elements. */
  2280. #ifdef RT
  2281. slwi r0, K, 1 + BASE_SHIFT
  2282. add AORIG, AORIG, r0
  2283. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  2284. #if defined(LT) || defined(RN)
  2285. sub TEMP, K, KK
  2286. slwi r0, TEMP, 1 + BASE_SHIFT
  2287. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2288. add AO, AO, r0
  2289. add BO, BO, TEMP
  2290. #endif
  2291. #ifdef LT
  2292. addi KK, KK, 2
  2293. #endif
  2294. #ifdef LN
  2295. subi KK, KK, 2
  2296. #endif
  /* Reload f0 from SP + FZERO for the next tile. */
  2297. li r0, FZERO
  2298. lfpsx f0, SP, r0
  /* .L80: single-row by 2-column tile, run only when bit 0 of M is set (otherwise go to .L89). The K loop is unrolled by 8: each A pair load supplies two consecutive K values of the row and each B pair supplies both columns for one K value. */
  2299. .align 4
  2300. .L80:
  2301. andi. I, M, 1
  2302. beq .L89
  /* LT/RN: BO restarts at B and the loop covers KK steps. */
  2303. #if defined(LT) || defined(RN)
  2304. addi BO, B, - 2 * SIZE
  2305. fpmr f1, f0
  2306. fpmr f2, f0
  2307. fpmr f3, f0
  2308. srawi. r0, KK, 3
  2309. mtspr CTR, r0
  2310. ble .L84
  2311. #else
  /* LN/RT: AO = AORIG + KK elements, BO = B + 2*KK elements, and the loop covers TEMP = K - KK steps. LN first steps AORIG back by K elements. */
  2312. #ifdef LN
  2313. slwi r0, K, 0 + BASE_SHIFT
  2314. sub AORIG, AORIG, r0
  2315. #endif
  2316. slwi r0 , KK, 0 + BASE_SHIFT
  2317. slwi TEMP, KK, 1 + BASE_SHIFT
  2318. add AO, AORIG, r0
  2319. add BO, B, TEMP
  2320. sub TEMP, K, KK
  2321. addi BO, BO, - 2 * SIZE
  2322. fpmr f1, f0
  2323. fpmr f2, f0
  2324. fpmr f3, f0
  2325. srawi. r0, TEMP, 3
  2326. mtspr CTR, r0
  2327. ble .L84
  2328. #endif
  /* Preload all four A pairs (eight K values) and the first four B pairs. */
  2329. LFPDUX B1, BO, INC2
  2330. LFPDUX A1, AO, INC2
  2331. LFPDUX A2, AO, INC2
  2332. LFPDUX B2, BO, INC2
  2333. LFPDUX A3, AO, INC2
  2334. LFPDUX A4, AO, INC2
  2335. LFPDUX B3, BO, INC2
  2336. LFPDUX B4, BO, INC2
  2337. bdz- .L83
  /* .L82: 1x2 main loop, eight K steps per iteration. The primary half of each A pair multiplies one B pair (fxcpmadd) and the secondary half multiplies the next (fxcsmadd). The sums are spread across f0..f3 and merged at .L88. */
  2338. .align 4
  2339. .L82:
  2340. fxcpmadd f0, A1, B1, f0
  2341. LFPDUX B1, BO, INC2
  2342. fxcsmadd f1, A1, B2, f1
  2343. LFPDUX B2, BO, INC2
  2344. LFPDUX A1, AO, INC2
  2345. fxcpmadd f2, A2, B3, f2
  2346. LFPDUX B3, BO, INC2
  2347. fxcsmadd f3, A2, B4, f3
  2348. LFPDUX B4, BO, INC2
  2349. LFPDUX A2, AO, INC2
  2350. fxcpmadd f0, A3, B1, f0
  2351. LFPDUX B1, BO, INC2
  2352. fxcsmadd f1, A3, B2, f1
  2353. LFPDUX B2, BO, INC2
  2354. LFPDUX A3, AO, INC2
  2355. fxcpmadd f2, A4, B3, f2
  2356. LFPDUX B3, BO, INC2
  2357. fxcsmadd f3, A4, B4, f3
  2358. LFPDUX B4, BO, INC2
  2359. LFPDUX A4, AO, INC2
  2360. bdnz+ .L82
  /* .L83: drain block for the 1x2 loop. Loads the four B pairs still needed for the second half of the final eight K steps; no further A loads. */
  2361. .align 4
  2362. .L83:
  2363. fxcpmadd f0, A1, B1, f0
  2364. LFPDUX B1, BO, INC2
  2365. fxcsmadd f1, A1, B2, f1
  2366. LFPDUX B2, BO, INC2
  2367. fxcpmadd f2, A2, B3, f2
  2368. LFPDUX B3, BO, INC2
  2369. fxcsmadd f3, A2, B4, f3
  2370. LFPDUX B4, BO, INC2
  2371. fxcpmadd f0, A3, B1, f0
  2372. fxcsmadd f1, A3, B2, f1
  2373. fxcpmadd f2, A4, B3, f2
  2374. fxcsmadd f3, A4, B4, f3
  /* .L84: K remainder for the 1x2 tile -- the low three bits of the step count (KK for LT/RN, TEMP for LN/RT), one K step per iteration into f0. A zero remainder goes straight to .L88. */
  2375. .align 4
  2376. .L84:
  2377. #if defined(LT) || defined(RN)
  2378. andi. r0, KK, 7
  2379. mtspr CTR, r0
  2380. ble+ .L88
  2381. #else
  2382. andi. r0, TEMP, 7
  2383. mtspr CTR, r0
  2384. ble+ .L88
  2385. #endif
  /* A is read with LFDX (no pointer update) and AO is then stepped by INC, while B is read as a pair with pointer update. NOTE(review): INC is presumably one element, since A contributes one value per K step here -- confirm against its definition. */
  2386. LFDX A1, AO, INC2
  2387. LFPDUX B1, BO, INC2
  2388. add AO, AO, INC
  2389. bdz- .L87
  2390. .align 4
  /* .L86: accumulate one K step while loading the next A value and B pair. */
  2391. .L86:
  2392. fxcpmadd f0, A1, B1, f0
  2393. LFDX A1, AO, INC2
  2394. LFPDUX B1, BO, INC2
  2395. add AO, AO, INC
  2396. bdnz+ .L86
  2397. .align 4
  /* .L87: final K step, no further loads. */
  2398. .L87:
  2399. fxcpmadd f0, A1, B1, f0
  /* .L88: solve and store the 1x2 tile. After the merge f0 holds the row's two column sums (primary = first column, secondary = second column). */
  2400. .align 4
  2401. .L88:
  /* Merge the four partial accumulators into f0. */
  2402. fpadd f0, f0, f1
  2403. fpadd f2, f2, f3
  2404. fpadd f0, f0, f2
  /* LN/RT: reposition AO/BO at element offset KK - 1 (LN) or KK - 2 (RT), scaled by 1 for A and by 2 for B. */
  2405. #if defined(LN) || defined(RT)
  2406. #ifdef LN
  2407. subi r0, KK, 1
  2408. #else
  2409. subi r0, KK, 2
  2410. #endif
  2411. slwi TEMP, r0, 0 + BASE_SHIFT
  2412. slwi r0, r0, 1 + BASE_SHIFT
  2413. add AO, AORIG, TEMP
  2414. add BO, B, r0
  2415. addi BO, BO, - 2 * SIZE
  2416. #endif
  /* f0 = stored pair - accumulated sums; LN/LT read the stored pair through BO, RN/RT through AO. */
  2417. #if defined(LN) || defined(LT)
  2418. LFPDX f16, BO, INC2
  2419. fpsub f0, f16, f0
  2420. #else
  2421. LFPDX f16, AO, INC2
  2422. fpsub f0, f16, f0
  2423. #endif
  /* LN/LT: scale both halves of f0 by the primary element loaded into A1 (a 1x1 triangular block). */
  2424. #ifdef LN
  2425. LFPDX A1, AO, INC2
  2426. fxpmul f0, A1, f0
  2427. #endif
  2428. #ifdef LT
  2429. LFPDX A1, AO, INC2
  2430. fxpmul f0, A1, f0
  2431. #endif
  /* RN: scalar 2x2 solve. Copy the second-column value into f1, scale the first column by A1, subtract A2 times it from the second, scale the second by A3, then pack f1 back into f0's secondary half. */
  2432. #ifdef RN
  2433. LFD A1, (2 + 0) * SIZE(BO)
  2434. LFD A2, (2 + 1) * SIZE(BO)
  2435. LFD A3, (2 + 3) * SIZE(BO)
  2436. fsmtp f1, f0
  2437. fmul f0, A1, f0
  2438. fnmsub f1, A2, f0, f1
  2439. fmul f1, A3, f1
  2440. fsmfp f0, f1
  2441. #endif
  /* RT: the same scalar solve in the opposite order -- second column first, using the entries at offsets 3, 2 and 0 of the 2x2 block. */
  2442. #ifdef RT
  2443. LFD A1, (2 + 3) * SIZE(BO)
  2444. LFD A2, (2 + 2) * SIZE(BO)
  2445. LFD A3, (2 + 0) * SIZE(BO)
  2446. fsmtp f1, f0
  2447. fmul f1, A1, f1
  2448. fnmsub f0, A2, f1, f0
  2449. fmul f0, A3, f0
  2450. fsmfp f0, f1
  2451. #endif
  /* LN steps CO1/CO2 back by one element before the stores and again after them. */
  2452. #ifdef LN
  2453. subi CO1, CO1, 1 * SIZE
  2454. subi CO2, CO2, 1 * SIZE
  2455. #endif
  /* Write the pair back to the packed buffer, then one value to each C column. RN/RT store the second column from f1's primary half rather than f0's secondary half. */
  2456. #if defined(LN) || defined(LT)
  2457. STFPDX f0, BO, INC2
  2458. STFDUX f0, CO1, INC
  2459. STFSDUX f0, CO2, INC
  2460. #else
  2461. STFPDX f0, AO, INC2
  2462. STFDUX f0, CO1, INC
  2463. STFDUX f1, CO2, INC
  2464. #endif
  2465. #ifdef LN
  2466. subi CO1, CO1, 1 * SIZE
  2467. subi CO2, CO2, 1 * SIZE
  2468. #endif
  /* RT: advance AORIG by K elements. */
  2469. #ifdef RT
  2470. slwi r0, K, 0 + BASE_SHIFT
  2471. add AORIG, AORIG, r0
  2472. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  2473. #if defined(LT) || defined(RN)
  2474. sub TEMP, K, KK
  2475. slwi r0, TEMP, 0 + BASE_SHIFT
  2476. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2477. add AO, AO, r0
  2478. add BO, BO, TEMP
  2479. #endif
  2480. #ifdef LT
  2481. addi KK, KK, 1
  2482. #endif
  2483. #ifdef LN
  2484. subi KK, KK, 1
  2485. #endif
  /* .L89: bookkeeping that closes the preceding column section.
     LN:    B += K << (1 + BASE_SHIFT)
     LT/RN: B  = BO + 2 * SIZE
     RN:    KK += 2
     RT:    KK -= 2 */
  2486. .align 4
  2487. .L89:
  2488. #ifdef LN
  2489. slwi r0, K, 1 + BASE_SHIFT
  2490. add B, B, r0
  2491. #endif
  2492. #if defined(LT) || defined(RN)
  2493. addi B, BO, 2 * SIZE
  2494. #endif
  2495. #ifdef RN
  2496. addi KK, KK, 2
  2497. #endif
  2498. #ifdef RT
  2499. subi KK, KK, 2
  2500. #endif
  /* .L90: J = N >> 2 (arithmetic shift).  If that is not positive the
     code branches to .L999. */
  2501. .align 4
  2502. .L90:
  2503. srawi. J, N, 2
  2504. ble .L999
  2505. .align 4
  /* .L10: set-up for one group of four columns of C. */
  2506. .L10:
  /* RT: step B back by K << (2 + BASE_SHIFT) and C back by LDC << 2
     before the column pointers are derived. */
  2507. #ifdef RT
  2508. slwi r0, K, 2 + BASE_SHIFT
  2509. sub B, B, r0
  2510. slwi r0, LDC, 2
  2511. sub C, C, r0
  2512. #endif
  /* CO1..CO4 = C, C + LDC, C + 2*LDC, C + 3*LDC. */
  2513. mr CO1, C
  2514. add CO2, C, LDC
  2515. add CO3, CO2, LDC
  2516. add CO4, CO3, LDC
  /* Initial KK: M + OFFSET for LN, OFFSET for LT (not touched here for
     RN/RT). */
  2517. #ifdef LN
  2518. add KK, M, OFFSET
  2519. #endif
  2520. #ifdef LT
  2521. mr KK, OFFSET
  2522. #endif
  /* The A base is kept biased by -4 * SIZE: in AORIG for LN/RT, directly
     in AO otherwise. */
  2523. #if defined(LN) || defined(RT)
  2524. addi AORIG, A, -4 * SIZE
  2525. #else
  2526. addi AO, A, -4 * SIZE
  2527. #endif
  /* Every variant except RT moves C past the four columns here. */
  2528. #ifndef RT
  2529. add C, CO4, LDC
  2530. #endif
  /* f0 is loaded from the stack slot at offset FZERO (presumably a
     stored zero, going by the name) and is used below as the source
     for clearing accumulators. */
  2531. li r0, FZERO
  2532. lfpsx f0, SP, r0
  /* I = M >> 3; when that is not positive, continue at .L20. */
  2533. srawi. I, M, 3
  2534. ble .L20
  /* .L11: top of the loop counted by I (I = M >> 3 at .L10).  Sets up
     the AO/AO2 and BO/BO2 pointer pairs, copies f0 into f1..f15, sets
     CTR to (count >> 2), and preloads the operands for the first pass
     of .L12.  If count >> 2 is not positive the code goes to .L14. */
  2535. .align 4
  2536. .L11:
  /* LT/RN: AO is already positioned; BO/BO2 are B - 4*SIZE and
     B - 2*SIZE.  The count is KK. */
  2537. #if defined(LT) || defined(RN)
  2538. addi AO2, AO, 2 * SIZE
  2539. fpmr f4, f0
  2540. addi BO, B, - 4 * SIZE
  2541. fpmr f8, f0
  2542. addi BO2, B, - 2 * SIZE
  2543. fpmr f12, f0
  2544. fpmr f5, f0
  2545. fpmr f9, f0
  2546. fpmr f13, f0
  2547. fpmr f2, f0
  2548. fpmr f6, f0
  2549. fpmr f10, f0
  2550. fpmr f14, f0
  2551. fpmr f3, f0
  2552. fpmr f7, f0
  2553. fpmr f11, f0
  2554. fpmr f15, f0
  2555. nop
  2556. srawi. r0, KK, 2
  2557. fpmr f1, f0
  2558. mtspr CTR, r0
  2559. ble .L14
  2560. #else
  /* LN/RT: (LN only) AORIG steps back by K << (3 + BASE_SHIFT); then
     AO = AORIG + (KK << (3 + BASE_SHIFT)), BO = B + (KK << (2 +
     BASE_SHIFT)) - 4*SIZE, and the count is TEMP = K - KK. */
  2561. #ifdef LN
  2562. slwi r0, K, 3 + BASE_SHIFT
  2563. sub AORIG, AORIG, r0
  2564. #endif
  2565. slwi r0 , KK, 3 + BASE_SHIFT
  2566. slwi TEMP, KK, 2 + BASE_SHIFT
  2567. add AO, AORIG, r0
  2568. add BO, B, TEMP
  2569. sub TEMP, K, KK
  2570. addi AO2, AO, 2 * SIZE
  2571. fpmr f4, f0
  2572. addi BO, BO, - 4 * SIZE
  2573. fpmr f8, f0
  2574. addi BO2, BO, 2 * SIZE
  2575. fpmr f12, f0
  2576. fpmr f5, f0
  2577. fpmr f9, f0
  2578. fpmr f13, f0
  2579. fpmr f2, f0
  2580. fpmr f6, f0
  2581. fpmr f10, f0
  2582. fpmr f14, f0
  2583. fpmr f3, f0
  2584. fpmr f7, f0
  2585. fpmr f11, f0
  2586. fpmr f15, f0
  2587. nop
  2588. srawi. r0, TEMP, 2
  2589. fpmr f1, f0
  2590. mtspr CTR, r0
  2591. ble .L14
  2592. #endif
  /* Preload for .L12.  Seven pairs come through AO (A1, A3, A5, A6, A7,
     A8, A9), one through AO2 (A2), three through BO (B1, B3, B5) and
     one through BO2 (B2).  The fpmr instructions interleaved here
     rewrite registers that both branches above have already copied
     from f0. */
  2593. LFPDUX A1, AO, INC4
  2594. fpmr f5, f0
  2595. LFPDUX A3, AO, INC4
  2596. fpmr f9, f0
  2597. LFPDUX B1, BO, INC4
  2598. fpmr f13, f0
  2599. LFPDUX A5, AO, INC4
  2600. fpmr f2, f0
  2601. LFPDUX A6, AO, INC4
  2602. fpmr f6, f0
  2603. LFPDUX B3, BO, INC4
  2604. fpmr f10, f0
  2605. LFPDUX A7, AO, INC4
  2606. fpmr f14, f0
  2607. LFPDUX A8, AO, INC4
  2608. fpmr f3, f0
  2609. LFPDUX B5, BO, INC4
  2610. fpmr f7, f0
  2611. LFPDUX A9, AO, INC4
  2612. fpmr f11, f0
  2613. LFPDUX A2, AO2, INC4
  2614. fpmr f15, f0
  2615. LFPDUX B2, BO2, INC4
  /* A single pass requested: skip the steady-state loop and run only
     the drain at .L13. */
  2616. bdz- .L13
  /* .L12: steady-state accumulation loop, repeated while CTR is
     non-zero (bdnz at the bottom).  The body is four groups, marked
     "## 1 ##" to "## 4 ##"; each group issues sixteen multiply-adds,
     one into each of f0..f15, alternating fxcpmadd and fxcsmadd.
     Most multiply-adds are followed by either a nop or a paired load
     that fetches operands for a later group or for the next pass. */
  2617. .align 4
  2618. .L12:
  2619. ## 1 ##
  2620. fxcpmadd f0, B1, A1, f0
  2621. nop
  2622. fxcsmadd f4, B1, A1, f4
  2623. nop
  2624. fxcpmadd f8, B2, A1, f8
  2625. LFPDUX B4, BO2, INC4
  2626. fxcsmadd f12, B2, A1, f12
  2627. LFPDUX B6, BO, INC4
  2628. fxcpmadd f1, B1, A2, f1
  2629. nop
  2630. fxcsmadd f5, B1, A2, f5
  2631. LFPDUX A4, AO2, INC4
  2632. fxcpmadd f9, B2, A2, f9
  2633. LFPDUX A10, AO, INC4
  2634. fxcsmadd f13, B2, A2, f13
  2635. nop
  2636. fxcpmadd f2, B1, A3, f2
  2637. nop
  2638. fxcsmadd f6, B1, A3, f6
  2639. nop
  2640. fxcpmadd f10, B2, A3, f10
  2641. nop
  2642. fxcsmadd f14, B2, A3, f14
  2643. nop
  2644. fxcpmadd f3, B1, A4, f3
  2645. nop
  2646. fxcsmadd f7, B1, A4, f7
  2647. LFPDUX A2, AO2, INC4
  2648. fxcpmadd f11, B2, A4, f11
  2649. LFPDUX A1, AO, INC4
  2650. fxcsmadd f15, B2, A4, f15
  2651. nop
  2652. ## 2 ##
  2653. fxcpmadd f0, B3, A5, f0
  2654. nop
  2655. fxcsmadd f4, B3, A5, f4
  2656. nop
  2657. fxcpmadd f8, B4, A5, f8
  2658. LFPDUX B2, BO2, INC4
  2659. fxcsmadd f12, B4, A5, f12
  2660. LFPDUX B1, BO, INC4
  2661. fxcpmadd f1, B3, A2, f1
  2662. nop
  2663. fxcsmadd f5, B3, A2, f5
  2664. LFPDUX A4, AO2, INC4
  2665. fxcpmadd f9, B4, A2, f9
  2666. LFPDUX A3, AO, INC4
  2667. fxcsmadd f13, B4, A2, f13
  2668. nop
  2669. fxcpmadd f2, B3, A6, f2
  2670. nop
  2671. fxcsmadd f6, B3, A6, f6
  2672. nop
  2673. fxcpmadd f10, B4, A6, f10
  2674. nop
  2675. fxcsmadd f14, B4, A6, f14
  2676. nop
  2677. fxcpmadd f3, B3, A4, f3
  2678. nop
  2679. fxcsmadd f7, B3, A4, f7
  2680. LFPDUX A2, AO2, INC4
  2681. fxcpmadd f11, B4, A4, f11
  2682. LFPDUX A5, AO, INC4
  2683. fxcsmadd f15, B4, A4, f15
  2684. nop
  2685. ## 3 ##
  2686. fxcpmadd f0, B5, A7, f0
  2687. nop
  2688. fxcsmadd f4, B5, A7, f4
  2689. nop
  2690. fxcpmadd f8, B2, A7, f8
  2691. LFPDUX B4, BO2, INC4
  2692. fxcsmadd f12, B2, A7, f12
  2693. LFPDUX B3, BO, INC4
  2694. fxcpmadd f1, B5, A2, f1
  2695. nop
  2696. fxcsmadd f5, B5, A2, f5
  2697. LFPDUX A4, AO2, INC4
  2698. fxcpmadd f9, B2, A2, f9
  2699. LFPDUX A6, AO, INC4
  2700. fxcsmadd f13, B2, A2, f13
  2701. nop
  2702. fxcpmadd f2, B5, A8, f2
  2703. nop
  2704. fxcsmadd f6, B5, A8, f6
  2705. nop
  2706. fxcpmadd f10, B2, A8, f10
  2707. nop
  2708. fxcsmadd f14, B2, A8, f14
  2709. nop
  2710. fxcpmadd f3, B5, A4, f3
  2711. nop
  2712. fxcsmadd f7, B5, A4, f7
  2713. LFPDUX A2, AO2, INC4
  2714. fxcpmadd f11, B2, A4, f11
  2715. LFPDUX A7, AO, INC4
  2716. fxcsmadd f15, B2, A4, f15
  2717. nop
  2718. ## 4 ##
  2719. fxcpmadd f0, B6, A9, f0
  2720. nop
  2721. fxcsmadd f4, B6, A9, f4
  2722. nop
  2723. fxcpmadd f8, B4, A9, f8
  2724. LFPDUX B2, BO2, INC4
  2725. fxcsmadd f12, B4, A9, f12
  2726. LFPDUX B5, BO, INC4
  2727. fxcpmadd f1, B6, A2, f1
  2728. nop
  2729. fxcsmadd f5, B6, A2, f5
  2730. LFPDUX A4, AO2, INC4
  2731. fxcpmadd f9, B4, A2, f9
  2732. LFPDUX A8, AO, INC4
  2733. fxcsmadd f13, B4, A2, f13
  2734. nop
  2735. fxcpmadd f2, B6, A10, f2
  2736. nop
  2737. fxcsmadd f6, B6, A10, f6
  2738. nop
  2739. fxcpmadd f10, B4, A10, f10
  2740. nop
  2741. fxcsmadd f14, B4, A10, f14
  2742. nop
  2743. fxcpmadd f3, B6, A4, f3
  2744. LFPDUX A2, AO2, INC4
  2745. fxcsmadd f7, B6, A4, f7
  2746. LFPDUX A9, AO, INC4
  2747. fxcpmadd f11, B4, A4, f11
  2748. nop
  2749. fxcsmadd f15, B4, A4, f15
  /* Decrement CTR and repeat while it is non-zero. */
  2750. bdnz+ .L12
  /* .L13: drain pass.  Reached from the bdz in .L11 or by falling out of
     .L12.  It has the same four-group shape and the same accumulator
     targets as .L12, but carries fewer loads: B1, B3 and B5 are never
     reloaded, AO is read only once (A10), and the final group issues a
     single load (A4).  The remaining loads fetch operands used later
     inside this block. */
  2751. .align 4
  2752. .L13:
  2753. ## 1 ##
  2754. fxcpmadd f0, B1, A1, f0
  2755. nop
  2756. fxcsmadd f4, B1, A1, f4
  2757. nop
  2758. fxcpmadd f8, B2, A1, f8
  2759. LFPDUX B4, BO2, INC4
  2760. fxcsmadd f12, B2, A1, f12
  2761. LFPDUX B6, BO, INC4
  2762. fxcpmadd f1, B1, A2, f1
  2763. nop
  2764. fxcsmadd f5, B1, A2, f5
  2765. LFPDUX A4, AO2, INC4
  2766. fxcpmadd f9, B2, A2, f9
  2767. LFPDUX A10, AO, INC4
  2768. fxcsmadd f13, B2, A2, f13
  2769. nop
  2770. fxcpmadd f2, B1, A3, f2
  2771. nop
  2772. fxcsmadd f6, B1, A3, f6
  2773. nop
  2774. fxcpmadd f10, B2, A3, f10
  2775. nop
  2776. fxcsmadd f14, B2, A3, f14
  2777. nop
  2778. fxcpmadd f3, B1, A4, f3
  2779. nop
  2780. fxcsmadd f7, B1, A4, f7
  2781. LFPDUX A2, AO2, INC4
  2782. fxcpmadd f11, B2, A4, f11
  2783. nop
  2784. fxcsmadd f15, B2, A4, f15
  2785. nop
  2786. ## 2 ##
  2787. fxcpmadd f0, B3, A5, f0
  2788. nop
  2789. fxcsmadd f4, B3, A5, f4
  2790. nop
  2791. fxcpmadd f8, B4, A5, f8
  2792. LFPDUX B2, BO2, INC4
  2793. fxcsmadd f12, B4, A5, f12
  2794. nop
  2795. fxcpmadd f1, B3, A2, f1
  2796. nop
  2797. fxcsmadd f5, B3, A2, f5
  2798. LFPDUX A4, AO2, INC4
  2799. fxcpmadd f9, B4, A2, f9
  2800. nop
  2801. fxcsmadd f13, B4, A2, f13
  2802. nop
  2803. fxcpmadd f2, B3, A6, f2
  2804. nop
  2805. fxcsmadd f6, B3, A6, f6
  2806. nop
  2807. fxcpmadd f10, B4, A6, f10
  2808. nop
  2809. fxcsmadd f14, B4, A6, f14
  2810. nop
  2811. fxcpmadd f3, B3, A4, f3
  2812. nop
  2813. fxcsmadd f7, B3, A4, f7
  2814. LFPDUX A2, AO2, INC4
  2815. fxcpmadd f11, B4, A4, f11
  2816. nop
  2817. fxcsmadd f15, B4, A4, f15
  2818. nop
  2819. ## 3 ##
  2820. fxcpmadd f0, B5, A7, f0
  2821. nop
  2822. fxcsmadd f4, B5, A7, f4
  2823. nop
  2824. fxcpmadd f8, B2, A7, f8
  2825. LFPDUX B4, BO2, INC4
  2826. fxcsmadd f12, B2, A7, f12
  2827. nop
  2828. fxcpmadd f1, B5, A2, f1
  2829. nop
  2830. fxcsmadd f5, B5, A2, f5
  2831. LFPDUX A4, AO2, INC4
  2832. fxcpmadd f9, B2, A2, f9
  2833. nop
  2834. fxcsmadd f13, B2, A2, f13
  /* NOTE(review): everywhere else in this label a multiply-add is
     followed by a nop or a load; the two multiply-adds around this
     point are back to back.  Results are unaffected, but confirm
     whether the missing filler is intentional. */
  2835. fxcpmadd f2, B5, A8, f2
  2836. nop
  2837. fxcsmadd f6, B5, A8, f6
  2838. nop
  2839. fxcpmadd f10, B2, A8, f10
  2840. nop
  2841. fxcsmadd f14, B2, A8, f14
  2842. nop
  2843. fxcpmadd f3, B5, A4, f3
  2844. nop
  2845. fxcsmadd f7, B5, A4, f7
  2846. LFPDUX A2, AO2, INC4
  2847. fxcpmadd f11, B2, A4, f11
  2848. nop
  2849. fxcsmadd f15, B2, A4, f15
  2850. nop
  2851. ## 4 ##
  2852. fxcpmadd f0, B6, A9, f0
  2853. nop
  2854. fxcsmadd f4, B6, A9, f4
  2855. nop
  2856. fxcpmadd f8, B4, A9, f8
  2857. nop
  2858. fxcsmadd f12, B4, A9, f12
  2859. nop
  2860. fxcpmadd f1, B6, A2, f1
  2861. nop
  2862. fxcsmadd f5, B6, A2, f5
  2863. LFPDUX A4, AO2, INC4
  2864. fxcpmadd f9, B4, A2, f9
  2865. nop
  2866. fxcsmadd f13, B4, A2, f13
  2867. nop
  2868. fxcpmadd f2, B6, A10, f2
  2869. nop
  2870. fxcsmadd f6, B6, A10, f6
  2871. nop
  2872. fxcpmadd f10, B4, A10, f10
  2873. nop
  2874. fxcsmadd f14, B4, A10, f14
  2875. nop
  2876. fxcpmadd f3, B6, A4, f3
  2877. nop
  2878. fxcsmadd f7, B6, A4, f7
  2879. nop
  2880. fxcpmadd f11, B4, A4, f11
  2881. nop
  2882. fxcsmadd f15, B4, A4, f15
  2883. nop
  /* .L14: leftover iterations after the four-group passes.
     CTR = count & 3 (count is KK for LT/RN, TEMP otherwise); the masked
     value is 0..3, so the ble is taken exactly when it is zero. */
  2884. .align 4
  2885. .L14:
  2886. #if defined(LT) || defined(RN)
  2887. andi. r0, KK, 3
  2888. mtspr CTR, r0
  2889. ble+ .L18
  2890. #else
  2891. andi. r0, TEMP, 3
  2892. mtspr CTR, r0
  2893. ble+ .L18
  2894. #endif
  2895. .align 4
  /* .L15: preload for .L16.  A2/A4 come through AO/AO2; the two pairs
     read through BO/BO2 land in A10 and B4 (A10 therefore holds a pair
     fetched via BO in this loop). */
  2896. .L15:
  2897. LFPDUX A2, AO, INC4
  2898. LFPDUX A4, AO2, INC4
  2899. LFPDUX A10, BO, INC4
  2900. LFPDUX B4, BO2, INC4
  /* Only one iteration: run just the tail at .L17. */
  2901. bdz- .L17
  /* .L16: one leftover iteration per pass.  Sixteen multiply-adds update
     f0..f15 using the pairs held in A10/B4 against four successive
     pairs read alternately through AO (into A2) and AO2 (into A4).
     The loads at the bottom fetch the operands for the next pass. */
  2902. .align 4
  2903. .L16:
  2904. fxcpmadd f0, A10, A2, f0
  2905. fxcsmadd f4, A10, A2, f4
  2906. fxcpmadd f8, B4, A2, f8
  2907. fxcsmadd f12, B4, A2, f12
  2908. LFPDUX A2, AO, INC4
  2909. fxcpmadd f1, A10, A4, f1
  2910. fxcsmadd f5, A10, A4, f5
  2911. fxcpmadd f9, B4, A4, f9
  2912. fxcsmadd f13, B4, A4, f13
  2913. LFPDUX A4, AO2, INC4
  2914. fxcpmadd f2, A10, A2, f2
  2915. fxcsmadd f6, A10, A2, f6
  2916. fxcpmadd f10, B4, A2, f10
  2917. fxcsmadd f14, B4, A2, f14
  2918. LFPDUX A2, AO, INC4
  2919. fxcpmadd f3, A10, A4, f3
  2920. fxcsmadd f7, A10, A4, f7
  2921. LFPDUX A10, BO, INC4
  2922. fxcpmadd f11, B4, A4, f11
  2923. fxcsmadd f15, B4, A4, f15
  2924. LFPDUX A4, AO2, INC4
  2925. LFPDUX B4, BO2, INC4
  2926. bdnz+ .L16
  2927. .align 4
  /* .L17: last leftover iteration.  Same sixteen multiply-adds, but only
     the two loads needed inside this block are issued. */
  2928. .L17:
  2929. fxcpmadd f0, A10, A2, f0
  2930. fxcsmadd f4, A10, A2, f4
  2931. fxcpmadd f8, B4, A2, f8
  2932. fxcsmadd f12, B4, A2, f12
  2933. LFPDUX A2, AO, INC4
  2934. fxcpmadd f1, A10, A4, f1
  2935. fxcsmadd f5, A10, A4, f5
  2936. fxcpmadd f9, B4, A4, f9
  2937. fxcsmadd f13, B4, A4, f13
  2938. LFPDUX A4, AO2, INC4
  2939. fxcpmadd f2, A10, A2, f2
  2940. fxcsmadd f6, A10, A2, f6
  2941. fxcpmadd f10, B4, A2, f10
  2942. fxcsmadd f14, B4, A2, f14
  2943. fxcpmadd f3, A10, A4, f3
  2944. fxcsmadd f7, A10, A4, f7
  2945. fxcpmadd f11, B4, A4, f11
  2946. fxcsmadd f15, B4, A4, f15
  /* .L18: accumulation for this tile is complete.  This part repositions
     the pointers (LN/RT only) and replaces each accumulator with
     (stored value - accumulator). */
  2947. .align 4
  2948. .L18:
  /* LN/RT: r0 = KK - 8 (LN) or KK - 4 (RT); then
       AO  = AORIG + (r0 << (3 + BASE_SHIFT)),  AO2 = AO + 2*SIZE
       BO  = B + (r0 << (2 + BASE_SHIFT)) - 4*SIZE,  BO2 = BO + 2*SIZE */
  2949. #if defined(LN) || defined(RT)
  2950. #ifdef LN
  2951. subi r0, KK, 8
  2952. #else
  2953. subi r0, KK, 4
  2954. #endif
  2955. slwi TEMP, r0, 3 + BASE_SHIFT
  2956. slwi r0, r0, 2 + BASE_SHIFT
  2957. add AO, AORIG, TEMP
  2958. add BO, B, r0
  2959. addi AO2, AO, 2 * SIZE
  2960. addi BO, BO, - 4 * SIZE
  2961. addi BO2, BO, 2 * SIZE
  2962. #endif
  /* LN/LT path.  f0-f3 and f8-f11 are first copied to f24-f31; the
     fsmfp/fsmtp sequence then exchanges elements between the register
     pairs (f0,f4) (f1,f5) (f2,f6) (f3,f7) (f8,f12) ... (f11,f15),
     using those copies as the source for the second half of each
     exchange.  Each of f24-f31 is reloaded only after the fsmtp that
     reads it.  Sixteen pairs are read alternately through BO and BO2
     into f16-f31 and subtracted in the order f0, f8, f4, f12, f1, f9,
     f5, f13, f2, f10, f6, f14, f3, f11, f7, f15.  BO and BO2 are then
     moved back by 32 * SIZE. */
  2963. #if defined(LN) || defined(LT)
  2964. fpmr f24, f0
  2965. LFPDUX f16, BO, INC4
  2966. fpmr f25, f1
  2967. nop
  2968. fpmr f26, f2
  2969. LFPDUX f17, BO2, INC4
  2970. fpmr f27, f3
  2971. nop
  2972. fpmr f28, f8
  2973. LFPDUX f18, BO, INC4
  2974. fpmr f29, f9
  2975. nop
  2976. fpmr f30, f10
  2977. LFPDUX f19, BO2, INC4
  2978. fpmr f31, f11
  2979. nop
  2980. fsmfp f0, f4
  2981. LFPDUX f20, BO, INC4
  2982. fsmfp f1, f5
  2983. nop
  2984. fsmfp f2, f6
  2985. LFPDUX f21, BO2, INC4
  2986. fsmfp f3, f7
  2987. nop
  2988. fsmfp f8, f12
  2989. LFPDUX f22, BO, INC4
  2990. fsmfp f9, f13
  2991. nop
  2992. fsmfp f10, f14
  2993. LFPDUX f23, BO2, INC4
  2994. fsmfp f11, f15
  2995. nop
  2996. fsmtp f4, f24
  2997. LFPDUX f24, BO, INC4
  2998. fsmtp f5, f25
  2999. nop
  3000. fsmtp f6, f26
  3001. LFPDUX f25, BO2, INC4
  3002. fsmtp f7, f27
  3003. nop
  3004. fsmtp f12, f28
  3005. LFPDUX f26, BO, INC4
  3006. fsmtp f13, f29
  3007. nop
  3008. fsmtp f14, f30
  3009. LFPDUX f27, BO2, INC4
  3010. fsmtp f15, f31
  3011. nop
  3012. fpsub f0, f16, f0
  3013. LFPDUX f28, BO, INC4
  3014. fpsub f8, f17, f8
  3015. nop
  3016. fpsub f4, f18, f4
  3017. LFPDUX f29, BO2, INC4
  3018. fpsub f12, f19, f12
  3019. nop
  3020. fpsub f1, f20, f1
  3021. LFPDUX f30, BO, INC4
  3022. fpsub f9, f21, f9
  3023. subi BO, BO, 32 * SIZE
  3024. fpsub f5, f22, f5
  3025. LFPDUX f31, BO2, INC4
  3026. fpsub f13, f23, f13
  3027. subi BO2, BO2, 32 * SIZE
  3028. fpsub f2, f24, f2
  3029. fpsub f10, f25, f10
  3030. fpsub f6, f26, f6
  3031. fpsub f14, f27, f14
  3032. fpsub f3, f28, f3
  3033. fpsub f11, f29, f11
  3034. fpsub f7, f30, f7
  3035. fpsub f15, f31, f15
  3036. #else
  /* RN/RT path.  No element exchange: sixteen pairs are read alternately
     through AO and AO2 into f16-f31 and subtracted from f0..f15 in
     register order; AO and AO2 are then moved back by 32 * SIZE. */
  3037. LFPDUX f16, AO, INC4
  3038. LFPDUX f17, AO2, INC4
  3039. LFPDUX f18, AO, INC4
  3040. LFPDUX f19, AO2, INC4
  3041. LFPDUX f20, AO, INC4
  3042. LFPDUX f21, AO2, INC4
  3043. LFPDUX f22, AO, INC4
  3044. LFPDUX f23, AO2, INC4
  3045. fpsub f0, f16, f0
  3046. LFPDUX f24, AO, INC4
  3047. fpsub f1, f17, f1
  3048. LFPDUX f25, AO2, INC4
  3049. fpsub f2, f18, f2
  3050. LFPDUX f26, AO, INC4
  3051. fpsub f3, f19, f3
  3052. LFPDUX f27, AO2, INC4
  3053. fpsub f4, f20, f4
  3054. LFPDUX f28, AO, INC4
  3055. fpsub f5, f21, f5
  3056. LFPDUX f29, AO2, INC4
  3057. fpsub f6, f22, f6
  3058. LFPDUX f30, AO, INC4
  3059. fpsub f7, f23, f7
  3060. LFPDUX f31, AO2, INC4
  3061. fpsub f8, f24, f8
  3062. subi AO, AO, 32 * SIZE
  3063. fpsub f9, f25, f9
  3064. subi AO2, AO2, 32 * SIZE
  3065. fpsub f10, f26, f10
  3066. fpsub f11, f27, f11
  3067. fpsub f12, f28, f12
  3068. fpsub f13, f29, f13
  3069. fpsub f14, f30, f14
  3070. fpsub f15, f31, f15
  3071. #endif
  /* LN solve.  Coefficient pairs are read through AO/AO2 walking with
     INCM4 (presumably a negative stride - confirm) from a starting
     offset of +68 * SIZE.  The eight value registers are finalised in
     the order f7, f3, f6, f2, f5, f1, f4, f0, and in lock-step f15,
     f11, f14, f10, f13, f9, f12, f8.  Each is scaled by one coefficient
     (fxsmul or fxpmul) and then eliminated from the registers still
     pending with fxcsnmsub / fxcpnmsub.  The scaling is a multiply, so
     the diagonal entries are presumably stored pre-inverted - TODO
     confirm against the packing code. */
  3072. #ifdef LN
  3073. addi AO, AO, 68 * SIZE
  3074. addi AO2, AO2, 68 * SIZE
  3075. LFPDUX A1, AO2, INCM4
  3076. LFPDUX A2, AO, INCM4
  3077. LFPDUX A3, AO2, INCM4
  3078. LFPDUX A4, AO, INCM4
  3079. LFPDUX A5, AO2, INCM4
  3080. LFPDUX A6, AO, INCM4
  3081. LFPDUX A7, AO2, INCM4
  3082. LFPDUX A8, AO, INCM4
  /* Finalise f7/f15, eliminate from f3, f6, f2, f5, f1, f4, f0. */
  3083. fxsmul f7, A1, f7
  3084. fxsmul f15, A1, f15
  3085. fxcpnmsub f3, A1, f7, f3
  3086. fxcpnmsub f11, A1, f15, f11
  3087. fxcsnmsub f6, A2, f7, f6
  3088. fxcsnmsub f14, A2, f15, f14
  3089. fxcpnmsub f2, A2, f7, f2
  3090. fxcpnmsub f10, A2, f15, f10
  3091. fxcsnmsub f5, A3, f7, f5
  3092. fxcsnmsub f13, A3, f15, f13
  3093. fxcpnmsub f1, A3, f7, f1
  3094. fxcpnmsub f9, A3, f15, f9
  3095. fxcsnmsub f4, A4, f7, f4
  3096. fxcsnmsub f12, A4, f15, f12
  3097. fxcpnmsub f0, A4, f7, f0
  3098. fxcpnmsub f8, A4, f15, f8
  /* Finalise f3/f11, eliminate from f6, f2, f5, f1, f4, f0. */
  3099. fxpmul f3, A5, f3
  3100. fxpmul f11, A5, f11
  3101. fxcsnmsub f6, A6, f3, f6
  3102. fxcsnmsub f14, A6, f11, f14
  3103. fxcpnmsub f2, A6, f3, f2
  3104. fxcpnmsub f10, A6, f11, f10
  3105. fxcsnmsub f5, A7, f3, f5
  3106. fxcsnmsub f13, A7, f11, f13
  3107. fxcpnmsub f1, A7, f3, f1
  3108. fxcpnmsub f9, A7, f11, f9
  3109. fxcsnmsub f4, A8, f3, f4
  3110. fxcsnmsub f12, A8, f11, f12
  3111. fxcpnmsub f0, A8, f3, f0
  3112. fxcpnmsub f8, A8, f11, f8
  /* Next batch of coefficients; the bare add instructions step a pointer
     without loading, skipping pairs that are not used. */
  3113. add AO2, AO2, INCM4
  3114. LFPDUX A1, AO, INCM4
  3115. LFPDUX A2, AO2, INCM4
  3116. LFPDUX A3, AO, INCM4
  3117. add AO2, AO2, INCM4
  3118. LFPDUX A4, AO, INCM4
  3119. LFPDUX A5, AO2, INCM4
  3120. LFPDUX A6, AO, INCM4
  3121. add AO2, AO2, INCM4
  3122. add AO, AO, INCM4
  3123. LFPDUX A7, AO2, INCM4
  3124. LFPDUX A8, AO, INCM4
  /* Finalise f6/f14, then f2/f10, then f5/f13. */
  3125. fxsmul f6, A1, f6
  3126. fxsmul f14, A1, f14
  3127. fxcpnmsub f2, A1, f6, f2
  3128. fxcpnmsub f10, A1, f14, f10
  3129. fxcsnmsub f5, A2, f6, f5
  3130. fxcsnmsub f13, A2, f14, f13
  3131. fxcpnmsub f1, A2, f6, f1
  3132. fxcpnmsub f9, A2, f14, f9
  3133. fxcsnmsub f4, A3, f6, f4
  3134. fxcsnmsub f12, A3, f14, f12
  3135. fxcpnmsub f0, A3, f6, f0
  3136. fxcpnmsub f8, A3, f14, f8
  3137. fxpmul f2, A4, f2
  3138. fxpmul f10, A4, f10
  3139. fxcsnmsub f5, A5, f2, f5
  3140. fxcsnmsub f13, A5, f10, f13
  3141. fxcpnmsub f1, A5, f2, f1
  3142. fxcpnmsub f9, A5, f10, f9
  3143. fxcsnmsub f4, A6, f2, f4
  3144. fxcsnmsub f12, A6, f10, f12
  3145. fxcpnmsub f0, A6, f2, f0
  3146. fxcpnmsub f8, A6, f10, f8
  3147. fxsmul f5, A7, f5
  3148. fxsmul f13, A7, f13
  3149. fxcpnmsub f1, A7, f5, f1
  3150. fxcpnmsub f9, A7, f13, f9
  3151. fxcsnmsub f4, A8, f5, f4
  3152. fxcsnmsub f12, A8, f13, f12
  3153. fxcpnmsub f0, A8, f5, f0
  3154. fxcpnmsub f8, A8, f13, f8
  /* Last coefficients.  The trailing addi instructions leave AO and AO2
     adjusted by a further -4 * SIZE after the final load. */
  3155. add AO2, AO2, INCM4
  3156. add AO, AO, INCM4
  3157. LFPDUX A1, AO2, INCM4
  3158. LFPDUX A2, AO, INCM4
  3159. subi AO2, AO2, 8 * SIZE
  3160. add AO, AO, INCM4
  3161. LFPDUX A3, AO, INCM4
  3162. subi AO2, AO2, 8 * SIZE
  3163. add AO, AO, INCM4
  3164. LFPDUX A4, AO, INCM4
  3165. addi AO, AO, -4 * SIZE
  3166. addi AO2, AO2, -4 * SIZE
  /* Finalise f1/f9, f4/f12 and finally f0/f8. */
  3167. fxpmul f1, A1, f1
  3168. fxpmul f9, A1, f9
  3169. fxcsnmsub f4, A2, f1, f4
  3170. fxcsnmsub f12, A2, f9, f12
  3171. fxcpnmsub f0, A2, f1, f0
  3172. fxcpnmsub f8, A2, f9, f8
  3173. fxsmul f4, A3, f4
  3174. fxsmul f12, A3, f12
  3175. fxcpnmsub f0, A3, f4, f0
  3176. fxcpnmsub f8, A3, f12, f8
  3177. fxpmul f0, A4, f0
  3178. fxpmul f8, A4, f8
  3179. #endif
  /* LT solve.  Mirror image of the LN path: coefficient pairs are read
     forwards through AO/AO2 with INC4, and the value registers are
     finalised in the order f0, f4, f1, f5, f2, f6, f3, f7 (with f8,
     f12, f9, f13, f10, f14, f11, f15 in lock-step).  Each is scaled
     (fxpmul / fxsmul) and then eliminated from the registers still
     pending.  The scaling is a multiply, so the diagonal entries are
     presumably stored pre-inverted - TODO confirm. */
  3180. #ifdef LT
  3181. LFPDUX A1, AO, INC4
  3182. LFPDUX A2, AO2, INC4
  3183. LFPDUX A3, AO, INC4
  3184. LFPDUX A4, AO2, INC4
  3185. LFPDUX A5, AO, INC4
  3186. LFPDUX A6, AO2, INC4
  3187. LFPDUX A7, AO, INC4
  3188. LFPDUX A8, AO2, INC4
  /* Finalise f0/f8, eliminate from f4, f1, f5, f2, f6, f3, f7. */
  3189. fxpmul f0, A1, f0
  3190. fxpmul f8, A1, f8
  3191. fxcsnmsub f4, A1, f0, f4
  3192. fxcsnmsub f12, A1, f8, f12
  3193. fxcpnmsub f1, A2, f0, f1
  3194. fxcpnmsub f9, A2, f8, f9
  3195. fxcsnmsub f5, A2, f0, f5
  3196. fxcsnmsub f13, A2, f8, f13
  3197. fxcpnmsub f2, A3, f0, f2
  3198. fxcpnmsub f10, A3, f8, f10
  3199. fxcsnmsub f6, A3, f0, f6
  3200. fxcsnmsub f14, A3, f8, f14
  3201. fxcpnmsub f3, A4, f0, f3
  3202. fxcpnmsub f11, A4, f8, f11
  3203. fxcsnmsub f7, A4, f0, f7
  3204. fxcsnmsub f15, A4, f8, f15
  /* Finalise f4/f12, eliminate from f1, f5, f2, f6, f3, f7. */
  3205. fxsmul f4, A5, f4
  3206. fxsmul f12, A5, f12
  3207. fxcpnmsub f1, A6, f4, f1
  3208. fxcpnmsub f9, A6, f12, f9
  3209. fxcsnmsub f5, A6, f4, f5
  3210. fxcsnmsub f13, A6, f12, f13
  3211. fxcpnmsub f2, A7, f4, f2
  3212. fxcpnmsub f10, A7, f12, f10
  3213. fxcsnmsub f6, A7, f4, f6
  3214. fxcsnmsub f14, A7, f12, f14
  3215. fxcpnmsub f3, A8, f4, f3
  3216. fxcpnmsub f11, A8, f12, f11
  3217. fxcsnmsub f7, A8, f4, f7
  3218. fxcsnmsub f15, A8, f12, f15
  /* Next batch of coefficients; the bare add instructions step a pointer
     without loading, skipping pairs that are not used. */
  3219. add AO, AO, INC4
  3220. LFPDUX A1, AO2, INC4
  3221. LFPDUX A2, AO, INC4
  3222. LFPDUX A3, AO2, INC4
  3223. add AO, AO, INC4
  3224. LFPDUX A4, AO2, INC4
  3225. LFPDUX A5, AO, INC4
  3226. LFPDUX A6, AO2, INC4
  3227. add AO, AO, INC4
  3228. add AO2, AO2, INC4
  3229. LFPDUX A7, AO, INC4
  3230. LFPDUX A8, AO2, INC4
  /* Finalise f1/f9, then f5/f13, then f2/f10. */
  3231. fxpmul f1, A1, f1
  3232. fxpmul f9, A1, f9
  3233. fxcsnmsub f5, A1, f1, f5
  3234. fxcsnmsub f13, A1, f9, f13
  3235. fxcpnmsub f2, A2, f1, f2
  3236. fxcpnmsub f10, A2, f9, f10
  3237. fxcsnmsub f6, A2, f1, f6
  3238. fxcsnmsub f14, A2, f9, f14
  3239. fxcpnmsub f3, A3, f1, f3
  3240. fxcpnmsub f11, A3, f9, f11
  3241. fxcsnmsub f7, A3, f1, f7
  3242. fxcsnmsub f15, A3, f9, f15
  3243. fxsmul f5, A4, f5
  3244. fxsmul f13, A4, f13
  3245. fxcpnmsub f2, A5, f5, f2
  3246. fxcpnmsub f10, A5, f13, f10
  3247. fxcsnmsub f6, A5, f5, f6
  3248. fxcsnmsub f14, A5, f13, f14
  3249. fxcpnmsub f3, A6, f5, f3
  3250. fxcpnmsub f11, A6, f13, f11
  3251. fxcsnmsub f7, A6, f5, f7
  3252. fxcsnmsub f15, A6, f13, f15
  3253. fxpmul f2, A7, f2
  3254. fxpmul f10, A7, f10
  3255. fxcsnmsub f6, A7, f2, f6
  3256. fxcsnmsub f14, A7, f10, f14
  3257. fxcpnmsub f3, A8, f2, f3
  3258. fxcpnmsub f11, A8, f10, f11
  3259. fxcsnmsub f7, A8, f2, f7
  3260. fxcsnmsub f15, A8, f10, f15
  /* Last coefficients.  After the final load AO and AO2 are both moved
     back by 64 * SIZE. */
  3261. add AO, AO, INC4
  3262. add AO2, AO2, INC4
  3263. LFPDUX A1, AO, INC4
  3264. LFPDUX A2, AO2, INC4
  3265. addi AO, AO, 8 * SIZE
  3266. addi AO2, AO2, 4 * SIZE
  3267. LFPDUX A3, AO2, INC4
  3268. addi AO, AO, 8 * SIZE
  3269. addi AO2, AO2, 4 * SIZE
  3270. LFPDUX A4, AO2, INC4
  3271. subi AO, AO, 64 * SIZE
  3272. subi AO2, AO2, 64 * SIZE
  /* Finalise f6/f14, f3/f11 and finally f7/f15. */
  3273. fxsmul f6, A1, f6
  3274. fxsmul f14, A1, f14
  3275. fxcpnmsub f3, A2, f6, f3
  3276. fxcpnmsub f11, A2, f14, f11
  3277. fxcsnmsub f7, A2, f6, f7
  3278. fxcsnmsub f15, A2, f14, f15
  3279. fxpmul f3, A3, f3
  3280. fxpmul f11, A3, f11
  3281. fxcsnmsub f7, A3, f3, f7
  3282. fxcsnmsub f15, A3, f11, f15
  3283. fxsmul f7, A4, f7
  3284. fxsmul f15, A4, f15
  3285. #endif
  /* RN solve.  Six coefficient pairs are read through BO/BO2 (the bare
     add instructions skip two pairs read via BO), after which both
     pointers are moved back by 16 * SIZE.  The value registers are
     processed in four groups, in the order f0-f3, f4-f7, f8-f11,
     f12-f15: each group is scaled (fxpmul / fxsmul) and then
     eliminated from the groups that follow with fxcsnmsub /
     fxcpnmsub.  Scaling is a multiply, so the diagonal entries are
     presumably stored pre-inverted - TODO confirm. */
  3286. #ifdef RN
  3287. LFPDUX A1, BO, INC4
  3288. LFPDUX A2, BO2, INC4
  3289. LFPDUX A3, BO, INC4
  3290. LFPDUX A4, BO2, INC4
  3291. add BO, BO, INC4
  3292. LFPDUX A5, BO2, INC4
  3293. add BO, BO, INC4
  3294. LFPDUX A6, BO2, INC4
  3295. subi BO, BO, 16 * SIZE
  3296. subi BO2, BO2, 16 * SIZE
  /* Group f0-f3: scale by A1, eliminate from f4-f15. */
  3297. fxpmul f0, A1, f0
  3298. fxpmul f1, A1, f1
  3299. fxpmul f2, A1, f2
  3300. fxpmul f3, A1, f3
  3301. fxcsnmsub f4, A1, f0, f4
  3302. fxcsnmsub f5, A1, f1, f5
  3303. fxcsnmsub f6, A1, f2, f6
  3304. fxcsnmsub f7, A1, f3, f7
  3305. fxcpnmsub f8, A2, f0, f8
  3306. fxcpnmsub f9, A2, f1, f9
  3307. fxcpnmsub f10, A2, f2, f10
  3308. fxcpnmsub f11, A2, f3, f11
  3309. fxcsnmsub f12, A2, f0, f12
  3310. fxcsnmsub f13, A2, f1, f13
  3311. fxcsnmsub f14, A2, f2, f14
  3312. fxcsnmsub f15, A2, f3, f15
  /* Group f4-f7: scale by A3, eliminate from f8-f15. */
  3313. fxsmul f4, A3, f4
  3314. fxsmul f5, A3, f5
  3315. fxsmul f6, A3, f6
  3316. fxsmul f7, A3, f7
  3317. fxcpnmsub f8, A4, f4, f8
  3318. fxcpnmsub f9, A4, f5, f9
  3319. fxcpnmsub f10, A4, f6, f10
  3320. fxcpnmsub f11, A4, f7, f11
  3321. fxcsnmsub f12, A4, f4, f12
  3322. fxcsnmsub f13, A4, f5, f13
  3323. fxcsnmsub f14, A4, f6, f14
  3324. fxcsnmsub f15, A4, f7, f15
  /* Group f8-f11: scale by A5, eliminate from f12-f15. */
  3325. fxpmul f8, A5, f8
  3326. fxpmul f9, A5, f9
  3327. fxpmul f10, A5, f10
  3328. fxpmul f11, A5, f11
  3329. fxcsnmsub f12, A5, f8, f12
  3330. fxcsnmsub f13, A5, f9, f13
  3331. fxcsnmsub f14, A5, f10, f14
  3332. fxcsnmsub f15, A5, f11, f15
  /* Group f12-f15: scale by A6. */
  3333. fxsmul f12, A6, f12
  3334. fxsmul f13, A6, f13
  3335. fxsmul f14, A6, f14
  3336. fxsmul f15, A6, f15
  3337. #endif
  /* RT solve.  Reverse of the RN path: BO/BO2 are first advanced by
     20 * SIZE and six coefficient pairs are read walking with INCM4
     (presumably a negative stride - confirm), the bare add
     instructions skipping two pairs read via BO2.  A final
     subi of 4 * SIZE is applied to both pointers.  The value registers
     are processed in the order f12-f15, f8-f11, f4-f7, f0-f3: each
     group is scaled, then eliminated from the groups still pending. */
  3338. #ifdef RT
  3339. addi BO, BO, 20 * SIZE
  3340. addi BO2, BO2, 20 * SIZE
  3341. LFPDUX A1, BO2, INCM4
  3342. LFPDUX A2, BO, INCM4
  3343. LFPDUX A3, BO2, INCM4
  3344. LFPDUX A4, BO, INCM4
  3345. add BO2, BO2, INCM4
  3346. LFPDUX A5, BO, INCM4
  3347. add BO2, BO2, INCM4
  3348. LFPDUX A6, BO, INCM4
  3349. subi BO, BO, 4 * SIZE
  3350. subi BO2, BO2, 4 * SIZE
  /* Group f12-f15: scale by A1, eliminate from f8-f11, f4-f7, f0-f3. */
  3351. fxsmul f12, A1, f12
  3352. fxsmul f13, A1, f13
  3353. fxsmul f14, A1, f14
  3354. fxsmul f15, A1, f15
  3355. fxcpnmsub f8, A1, f12, f8
  3356. fxcpnmsub f9, A1, f13, f9
  3357. fxcpnmsub f10, A1, f14, f10
  3358. fxcpnmsub f11, A1, f15, f11
  3359. fxcsnmsub f4, A2, f12, f4
  3360. fxcsnmsub f5, A2, f13, f5
  3361. fxcsnmsub f6, A2, f14, f6
  3362. fxcsnmsub f7, A2, f15, f7
  3363. fxcpnmsub f0, A2, f12, f0
  3364. fxcpnmsub f1, A2, f13, f1
  3365. fxcpnmsub f2, A2, f14, f2
  3366. fxcpnmsub f3, A2, f15, f3
  /* Group f8-f11: scale by A3, eliminate from f4-f7, f0-f3. */
  3367. fxpmul f8, A3, f8
  3368. fxpmul f9, A3, f9
  3369. fxpmul f10, A3, f10
  3370. fxpmul f11, A3, f11
  3371. fxcsnmsub f4, A4, f8, f4
  3372. fxcsnmsub f5, A4, f9, f5
  3373. fxcsnmsub f6, A4, f10, f6
  3374. fxcsnmsub f7, A4, f11, f7
  3375. fxcpnmsub f0, A4, f8, f0
  3376. fxcpnmsub f1, A4, f9, f1
  3377. fxcpnmsub f2, A4, f10, f2
  3378. fxcpnmsub f3, A4, f11, f3
  /* Group f4-f7: scale by A5, eliminate from f0-f3. */
  3379. fxsmul f4, A5, f4
  3380. fxsmul f5, A5, f5
  3381. fxsmul f6, A5, f6
  3382. fxsmul f7, A5, f7
  3383. fxcpnmsub f0, A5, f4, f0
  3384. fxcpnmsub f1, A5, f5, f1
  3385. fxcpnmsub f2, A5, f6, f2
  3386. fxcpnmsub f3, A5, f7, f3
  /* Group f0-f3: scale by A6. */
  3387. fxpmul f0, A6, f0
  3388. fxpmul f1, A6, f1
  3389. fxpmul f2, A6, f2
  3390. fxpmul f3, A6, f3
  3391. #endif
  /* Write-back and bookkeeping for the tile, then the loop close on I.
     LN: CO1..CO4 are stepped back by 8 * SIZE both before and after
     the stores. */
  3392. #ifdef LN
  3393. subi CO1, CO1, 8 * SIZE
  3394. subi CO2, CO2, 8 * SIZE
  3395. subi CO3, CO3, 8 * SIZE
  3396. subi CO4, CO4, 8 * SIZE
  3397. #endif
  /* LN/LT: sixteen pairs are stored alternately through BO and BO2 in
     the order f0, f8, f4, f12, f1, f9, f5, f13, f2, f10, f6, f14, f3,
     f11, f7, f15 (the same order the loads used at .L18), and both
     pointers are moved back by 32 * SIZE.  C is then written with
     STFDUX to CO1/CO3 and STFSDUX to CO2/CO4: CO1 and CO2 each take
     f0, f4, f1, f5, f2, f6, f3, f7; CO3 and CO4 each take f8, f12, f9,
     f13, f10, f14, f11, f15. */
  3398. #if defined(LN) || defined(LT)
  3399. STFPDUX f0, BO, INC4
  3400. STFPDUX f8, BO2, INC4
  3401. STFPDUX f4, BO, INC4
  3402. STFPDUX f12, BO2, INC4
  3403. STFPDUX f1, BO, INC4
  3404. STFPDUX f9, BO2, INC4
  3405. STFPDUX f5, BO, INC4
  3406. STFPDUX f13, BO2, INC4
  3407. STFPDUX f2, BO, INC4
  3408. STFPDUX f10, BO2, INC4
  3409. STFPDUX f6, BO, INC4
  3410. STFPDUX f14, BO2, INC4
  3411. STFPDUX f3, BO, INC4
  3412. STFPDUX f11, BO2, INC4
  3413. STFPDUX f7, BO, INC4
  3414. STFPDUX f15, BO2, INC4
  3415. subi BO, BO, 32 * SIZE
  3416. subi BO2, BO2, 32 * SIZE
  3417. STFDUX f0, CO1, INC
  3418. STFDUX f4, CO1, INC
  3419. STFDUX f1, CO1, INC
  3420. STFDUX f5, CO1, INC
  3421. STFDUX f2, CO1, INC
  3422. STFDUX f6, CO1, INC
  3423. STFDUX f3, CO1, INC
  3424. STFDUX f7, CO1, INC
  3425. STFSDUX f0, CO2, INC
  3426. STFSDUX f4, CO2, INC
  3427. STFSDUX f1, CO2, INC
  3428. STFSDUX f5, CO2, INC
  3429. STFSDUX f2, CO2, INC
  3430. STFSDUX f6, CO2, INC
  3431. STFSDUX f3, CO2, INC
  3432. STFSDUX f7, CO2, INC
  3433. STFDUX f8, CO3, INC
  3434. STFDUX f12, CO3, INC
  3435. STFDUX f9, CO3, INC
  3436. STFDUX f13, CO3, INC
  3437. STFDUX f10, CO3, INC
  3438. STFDUX f14, CO3, INC
  3439. STFDUX f11, CO3, INC
  3440. STFDUX f15, CO3, INC
  3441. STFSDUX f8, CO4, INC
  3442. STFSDUX f12, CO4, INC
  3443. STFSDUX f9, CO4, INC
  3444. STFSDUX f13, CO4, INC
  3445. STFSDUX f10, CO4, INC
  3446. STFSDUX f14, CO4, INC
  3447. STFSDUX f11, CO4, INC
  3448. STFSDUX f15, CO4, INC
  3449. #else
  /* RN/RT: sixteen pairs are stored alternately through AO and AO2 in
     register order f0..f15, and both pointers are moved back by
     32 * SIZE.  C is then written with an STFDUX / STFSDUX pair per
     register: f0-f3 to CO1, f4-f7 to CO2, f8-f11 to CO3, f12-f15 to
     CO4. */
  3450. STFPDUX f0, AO, INC4
  3451. STFPDUX f1, AO2, INC4
  3452. STFPDUX f2, AO, INC4
  3453. STFPDUX f3, AO2, INC4
  3454. STFPDUX f4, AO, INC4
  3455. STFPDUX f5, AO2, INC4
  3456. STFPDUX f6, AO, INC4
  3457. STFPDUX f7, AO2, INC4
  3458. STFPDUX f8, AO, INC4
  3459. STFPDUX f9, AO2, INC4
  3460. STFPDUX f10, AO, INC4
  3461. STFPDUX f11, AO2, INC4
  3462. STFPDUX f12, AO, INC4
  3463. STFPDUX f13, AO2, INC4
  3464. STFPDUX f14, AO, INC4
  3465. STFPDUX f15, AO2, INC4
  3466. subi AO, AO, 32 * SIZE
  3467. subi AO2, AO2, 32 * SIZE
  3468. STFDUX f0, CO1, INC
  3469. STFSDUX f0, CO1, INC
  3470. STFDUX f1, CO1, INC
  3471. STFSDUX f1, CO1, INC
  3472. STFDUX f2, CO1, INC
  3473. STFSDUX f2, CO1, INC
  3474. STFDUX f3, CO1, INC
  3475. STFSDUX f3, CO1, INC
  3476. STFDUX f4, CO2, INC
  3477. STFSDUX f4, CO2, INC
  3478. STFDUX f5, CO2, INC
  3479. STFSDUX f5, CO2, INC
  3480. STFDUX f6, CO2, INC
  3481. STFSDUX f6, CO2, INC
  3482. STFDUX f7, CO2, INC
  3483. STFSDUX f7, CO2, INC
  3484. STFDUX f8, CO3, INC
  3485. STFSDUX f8, CO3, INC
  3486. STFDUX f9, CO3, INC
  3487. STFSDUX f9, CO3, INC
  3488. STFDUX f10, CO3, INC
  3489. STFSDUX f10, CO3, INC
  3490. STFDUX f11, CO3, INC
  3491. STFSDUX f11, CO3, INC
  3492. STFDUX f12, CO4, INC
  3493. STFSDUX f12, CO4, INC
  3494. STFDUX f13, CO4, INC
  3495. STFSDUX f13, CO4, INC
  3496. STFDUX f14, CO4, INC
  3497. STFSDUX f14, CO4, INC
  3498. STFDUX f15, CO4, INC
  3499. STFSDUX f15, CO4, INC
  3500. #endif
  3501. #ifdef LN
  3502. subi CO1, CO1, 8 * SIZE
  3503. subi CO2, CO2, 8 * SIZE
  3504. subi CO3, CO3, 8 * SIZE
  3505. subi CO4, CO4, 8 * SIZE
  3506. #endif
  /* RT: AORIG += K << (3 + BASE_SHIFT). */
  3507. #ifdef RT
  3508. slwi r0, K, 3 + BASE_SHIFT
  3509. add AORIG, AORIG, r0
  3510. #endif
  /* LT/RN: advance AO and BO by the K - KK iterations the accumulation
     loop did not traverse (shift 3 for AO, shift 2 for BO). */
  3511. #if defined(LT) || defined(RN)
  3512. sub TEMP, K, KK
  3513. slwi r0, TEMP, 3 + BASE_SHIFT
  3514. slwi TEMP, TEMP, 2 + BASE_SHIFT
  3515. add AO, AO, r0
  3516. add BO, BO, TEMP
  3517. #endif
  /* KK moves by eight: up for LT, down for LN. */
  3518. #ifdef LT
  3519. addi KK, KK, 8
  3520. #endif
  3521. #ifdef LN
  3522. subi KK, KK, 8
  3523. #endif
  /* Decrement I, reload f0 from the FZERO stack slot for the next tile's
     accumulator clear, and loop while I is still positive.  The li and
     lfpsx between addic. and bgt do not write CR0. */
  3524. addic. I, I, -1
  3525. li r0, FZERO
  3526. lfpsx f0, SP, r0
  3527. bgt+ .L11
  /* .L20: narrower variant of the block at .L11, executed only when bit 2
     of M is set (I = M & 4; otherwise branch to .L30).  Only eight
     accumulators are used: f0, f4, f8, f12, f1, f5, f9, f13. */
  3528. .align 4
  3529. .L20:
  3530. andi. I, M, 4
  3531. beq .L30
  /* LT/RN: BO/BO2 = B - 4*SIZE / B - 2*SIZE; CTR = KK >> 2. */
  3532. #if defined(LT) || defined(RN)
  3533. addi AO2, AO, 2 * SIZE
  3534. fpmr f4, f0
  3535. addi BO, B, - 4 * SIZE
  3536. fpmr f8, f0
  3537. addi BO2, B, - 2 * SIZE
  3538. fpmr f12, f0
  3539. srawi. r0, KK, 2
  3540. fpmr f1, f0
  3541. fpmr f5, f0
  3542. fpmr f9, f0
  3543. mtspr CTR, r0
  3544. fpmr f13, f0
  3545. ble .L24
  3546. #else
  /* LN/RT: (LN only) AORIG steps back by K << (2 + BASE_SHIFT).  AO and
     BO are both offset by KK << (2 + BASE_SHIFT) from AORIG and B, BO
     is biased by -4*SIZE, and CTR = (K - KK) >> 2. */
  3547. #ifdef LN
  3548. slwi r0, K, 2 + BASE_SHIFT
  3549. sub AORIG, AORIG, r0
  3550. #endif
  3551. slwi r0 , KK, 2 + BASE_SHIFT
  3552. slwi TEMP, KK, 2 + BASE_SHIFT
  3553. add AO, AORIG, r0
  3554. add BO, B, TEMP
  3555. sub TEMP, K, KK
  3556. addi AO2, AO, 2 * SIZE
  3557. fpmr f4, f0
  3558. addi BO, BO, - 4 * SIZE
  3559. fpmr f8, f0
  3560. addi BO2, BO, 2 * SIZE
  3561. fpmr f12, f0
  3562. srawi. r0, TEMP, 2
  3563. fpmr f1, f0
  3564. fpmr f5, f0
  3565. fpmr f9, f0
  3566. mtspr CTR, r0
  3567. fpmr f13, f0
  3568. ble .L24
  3569. #endif
  /* Preload for .L22.  A1..A7 come through AO/AO2 and B1..B6 through
     BO/BO2; the last two pairs read via BO/BO2 land in A9 and A10.
     A8 is not loaded here; .L22 and .L23 each load it at their top. */
  3570. LFPDUX A1, AO, INC4
  3571. LFPDUX B1, BO, INC4
  3572. LFPDUX A2, AO2, INC4
  3573. LFPDUX B2, BO2, INC4
  3574. LFPDUX A3, AO, INC4
  3575. LFPDUX B3, BO, INC4
  3576. LFPDUX A4, AO2, INC4
  3577. LFPDUX B4, BO2, INC4
  3578. LFPDUX A5, AO, INC4
  3579. LFPDUX B5, BO, INC4
  3580. LFPDUX A6, AO2, INC4
  3581. LFPDUX B6, BO2, INC4
  3582. LFPDUX A7, AO, INC4
  3583. LFPDUX A9, BO, INC4
  3584. LFPDUX A10, BO2, INC4
  /* A single pass requested: run only the drain at .L23. */
  3585. bdz- .L23
  /* .L22: steady-state accumulation loop of the .L20 block.  Each pass
     issues 32 multiply-adds - four rounds of eight, one round per
     (A, B) operand set - into f0, f4, f8, f12, f1, f5, f9, f13.  Every
     second multiply-add is followed by a load that refills a register
     just consumed; A8 is fetched at the top of the pass and used in
     its last round. */
  3586. .align 4
  3587. .L22:
  3588. fxcpmadd f0, B1, A1, f0
  3589. nop
  3590. fxcsmadd f4, B1, A1, f4
  3591. LFPDUX A8, AO2, INC4
  3592. fxcpmadd f8, B2, A1, f8
  3593. nop
  3594. fxcsmadd f12, B2, A1, f12
  3595. LFPDUX A1, AO, INC4
  3596. fxcpmadd f1, B1, A2, f1
  3597. nop
  3598. fxcsmadd f5, B1, A2, f5
  3599. LFPDUX B1, BO, INC4
  3600. fxcpmadd f9, B2, A2, f9
  3601. nop
  3602. fxcsmadd f13, B2, A2, f13
  3603. LFPDUX B2, BO2, INC4
  3604. fxcpmadd f0, B3, A3, f0
  3605. nop
  3606. fxcsmadd f4, B3, A3, f4
  3607. LFPDUX A2, AO2, INC4
  3608. fxcpmadd f8, B4, A3, f8
  3609. nop
  3610. fxcsmadd f12, B4, A3, f12
  3611. LFPDUX A3, AO, INC4
  3612. fxcpmadd f1, B3, A4, f1
  3613. nop
  3614. fxcsmadd f5, B3, A4, f5
  3615. LFPDUX B3, BO, INC4
  3616. fxcpmadd f9, B4, A4, f9
  3617. nop
  3618. fxcsmadd f13, B4, A4, f13
  3619. LFPDUX B4, BO2, INC4
  3620. fxcpmadd f0, B5, A5, f0
  3621. nop
  3622. fxcsmadd f4, B5, A5, f4
  3623. LFPDUX A4, AO2, INC4
  3624. fxcpmadd f8, B6, A5, f8
  3625. nop
  3626. fxcsmadd f12, B6, A5, f12
  3627. LFPDUX A5, AO, INC4
  3628. fxcpmadd f1, B5, A6, f1
  3629. nop
  3630. fxcsmadd f5, B5, A6, f5
  3631. LFPDUX B5, BO, INC4
  3632. fxcpmadd f9, B6, A6, f9
  3633. nop
  3634. fxcsmadd f13, B6, A6, f13
  3635. LFPDUX B6, BO2, INC4
  3636. fxcpmadd f0, A9, A7, f0
  3637. nop
  3638. fxcsmadd f4, A9, A7, f4
  3639. LFPDUX A6, AO2, INC4
  3640. fxcpmadd f8, A10, A7, f8
  3641. nop
  3642. fxcsmadd f12, A10, A7, f12
  3643. LFPDUX A7, AO, INC4
  3644. fxcpmadd f1, A9, A8, f1
  3645. nop
  3646. fxcsmadd f5, A9, A8, f5
  3647. LFPDUX A9, BO, INC4
  3648. fxcpmadd f9, A10, A8, f9
  3649. nop
  3650. fxcsmadd f13, A10, A8, f13
  3651. LFPDUX A10, BO2, INC4
  /* Decrement CTR and repeat while it is non-zero. */
  3652. bdnz+ .L22
  3653. .align 4
  3654. .L23:
  3655. fxcpmadd f0, B1, A1, f0
  3656. fxcsmadd f4, B1, A1, f4
  3657. LFPDUX A8, AO2, INC4
  3658. fxcpmadd f8, B2, A1, f8
  3659. fxcsmadd f12, B2, A1, f12
  3660. fxcpmadd f1, B1, A2, f1
  3661. fxcsmadd f5, B1, A2, f5
  3662. fxcpmadd f9, B2, A2, f9
  3663. fxcsmadd f13, B2, A2, f13
  3664. fxcpmadd f0, B3, A3, f0
  3665. fxcsmadd f4, B3, A3, f4
  3666. fxcpmadd f8, B4, A3, f8
  3667. fxcsmadd f12, B4, A3, f12
  3668. fxcpmadd f1, B3, A4, f1
  3669. fxcsmadd f5, B3, A4, f5
  3670. fxcpmadd f9, B4, A4, f9
  3671. fxcsmadd f13, B4, A4, f13
  3672. fxcpmadd f0, B5, A5, f0
  3673. fxcsmadd f4, B5, A5, f4
  3674. fxcpmadd f8, B6, A5, f8
  3675. fxcsmadd f12, B6, A5, f12
  3676. fxcpmadd f1, B5, A6, f1
  3677. fxcsmadd f5, B5, A6, f5
  3678. fxcpmadd f9, B6, A6, f9
  3679. fxcsmadd f13, B6, A6, f13
  3680. fxcpmadd f0, A9, A7, f0
  3681. fxcsmadd f4, A9, A7, f4
  3682. fxcpmadd f8, A10, A7, f8
  3683. fxcsmadd f12, A10, A7, f12
  3684. fxcpmadd f1, A9, A8, f1
  3685. fxcsmadd f5, A9, A8, f5
  3686. fxcpmadd f9, A10, A8, f9
  3687. fxcsmadd f13, A10, A8, f13
  3688. .align 4
  3689. .L24:
  3690. #if defined(LT) || defined(RN)
  3691. andi. r0, KK, 3
  3692. mtspr CTR, r0
  3693. ble+ .L28
  3694. #else
  3695. andi. r0, TEMP, 3
  3696. mtspr CTR, r0
  3697. ble+ .L28
  3698. #endif
  3699. LFPDUX A1, AO, INC4
  3700. LFPDUX A2, AO2, INC4
  3701. LFPDUX B1, BO, INC4
  3702. LFPDUX B2, BO2, INC4
  3703. bdz- .L27
  3704. .align 4
  3705. .L26:
  3706. fxcpmadd f0, B1, A1, f0
  3707. fxcsmadd f4, B1, A1, f4
  3708. fxcpmadd f8, B2, A1, f8
  3709. fxcsmadd f12, B2, A1, f12
  3710. LFPDUX A1, AO, INC4
  3711. fxcpmadd f1, B1, A2, f1
  3712. fxcsmadd f5, B1, A2, f5
  3713. LFPDUX B1, BO, INC4
  3714. fxcpmadd f9, B2, A2, f9
  3715. fxcsmadd f13, B2, A2, f13
  3716. LFPDUX A2, AO2, INC4
  3717. LFPDUX B2, BO2, INC4
  3718. bdnz+ .L26
  3719. .align 4
  3720. .L27:
  3721. fxcpmadd f0, B1, A1, f0
  3722. fxcsmadd f4, B1, A1, f4
  3723. fxcpmadd f8, B2, A1, f8
  3724. fxcsmadd f12, B2, A1, f12
  3725. fxcpmadd f1, B1, A2, f1
  3726. fxcsmadd f5, B1, A2, f5
  3727. fxcpmadd f9, B2, A2, f9
  3728. fxcsmadd f13, B2, A2, f13
  3729. .align 4
  3730. .L28:
  3731. #if defined(LN) || defined(RT)
  3732. #ifdef LN
  3733. subi r0, KK, 4
  3734. #else
  3735. subi r0, KK, 4
  3736. #endif
  3737. slwi TEMP, r0, 2 + BASE_SHIFT
  3738. slwi r0, r0, 2 + BASE_SHIFT
  3739. add AO, AORIG, TEMP
  3740. add BO, B, r0
  3741. addi AO2, AO, 2 * SIZE
  3742. addi BO, BO, - 4 * SIZE
  3743. addi BO2, BO, 2 * SIZE
  3744. #endif
  3745. #if defined(LN) || defined(LT)
  3746. fpmr f24, f0
  3747. fpmr f25, f1
  3748. fpmr f28, f8
  3749. fpmr f29, f9
  3750. fsmfp f0, f4
  3751. fsmfp f1, f5
  3752. fsmfp f8, f12
  3753. fsmfp f9, f13
  3754. fsmtp f4, f24
  3755. fsmtp f5, f25
  3756. fsmtp f12, f28
  3757. fsmtp f13, f29
  3758. LFPDUX f16, BO, INC4
  3759. LFPDUX f17, BO2, INC4
  3760. LFPDUX f18, BO, INC4
  3761. LFPDUX f19, BO2, INC4
  3762. LFPDUX f20, BO, INC4
  3763. LFPDUX f21, BO2, INC4
  3764. LFPDUX f22, BO, INC4
  3765. LFPDUX f23, BO2, INC4
  3766. subi BO, BO, 16 * SIZE
  3767. subi BO2, BO2, 16 * SIZE
  3768. fpsub f0, f16, f0
  3769. fpsub f8, f17, f8
  3770. fpsub f4, f18, f4
  3771. fpsub f12, f19, f12
  3772. fpsub f1, f20, f1
  3773. fpsub f9, f21, f9
  3774. fpsub f5, f22, f5
  3775. fpsub f13, f23, f13
  3776. #else
  3777. LFPDUX f16, AO, INC4
  3778. LFPDUX f17, AO2, INC4
  3779. LFPDUX f18, AO, INC4
  3780. LFPDUX f19, AO2, INC4
  3781. LFPDUX f20, AO, INC4
  3782. LFPDUX f21, AO2, INC4
  3783. LFPDUX f22, AO, INC4
  3784. LFPDUX f23, AO2, INC4
  3785. subi AO, AO, 16 * SIZE
  3786. subi AO2, AO2, 16 * SIZE
  3787. fpsub f0, f16, f0
  3788. fpsub f1, f17, f1
  3789. fpsub f4, f18, f4
  3790. fpsub f5, f19, f5
  3791. fpsub f8, f20, f8
  3792. fpsub f9, f21, f9
  3793. fpsub f12, f22, f12
  3794. fpsub f13, f23, f13
  3795. #endif
  3796. #ifdef LN
  3797. addi AO, AO, 20 * SIZE
  3798. addi AO2, AO2, 20 * SIZE
  3799. LFPDUX A1, AO2, INCM4
  3800. LFPDUX A2, AO, INCM4
  3801. LFPDUX A3, AO2, INCM4
  3802. LFPDUX A4, AO, INCM4
  3803. add AO2, AO2, INCM4
  3804. LFPDUX A5, AO, INCM4
  3805. add AO2, AO2, INCM4
  3806. LFPDUX A6, AO, INCM4
  3807. addi AO, AO, -4 * SIZE
  3808. addi AO2, AO2, -4 * SIZE
  3809. fxsmul f5, A1, f5
  3810. fxsmul f13, A1, f13
  3811. fxcpnmsub f1, A1, f5, f1
  3812. fxcpnmsub f9, A1, f13, f9
  3813. fxcsnmsub f4, A2, f5, f4
  3814. fxcsnmsub f12, A2, f13, f12
  3815. fxcpnmsub f0, A2, f5, f0
  3816. fxcpnmsub f8, A2, f13, f8
  3817. fxpmul f1, A3, f1
  3818. fxpmul f9, A3, f9
  3819. fxcsnmsub f4, A4, f1, f4
  3820. fxcsnmsub f12, A4, f9, f12
  3821. fxcpnmsub f0, A4, f1, f0
  3822. fxcpnmsub f8, A4, f9, f8
  3823. fxsmul f4, A5, f4
  3824. fxsmul f12, A5, f12
  3825. fxcpnmsub f0, A5, f4, f0
  3826. fxcpnmsub f8, A5, f12, f8
  3827. fxpmul f0, A6, f0
  3828. fxpmul f8, A6, f8
  3829. #endif
  3830. #ifdef LT
  3831. LFPDUX A1, AO, INC4
  3832. LFPDUX A2, AO2, INC4
  3833. LFPDUX A3, AO, INC4
  3834. LFPDUX A4, AO2, INC4
  3835. add AO, AO, INC4
  3836. LFPDUX A5, AO2, INC4
  3837. add AO, AO, INC4
  3838. LFPDUX A6, AO2, INC4
  3839. subi AO, AO, 16 * SIZE
  3840. subi AO2, AO2, 16 * SIZE
  3841. fxpmul f0, A1, f0
  3842. fxpmul f8, A1, f8
  3843. fxcsnmsub f4, A1, f0, f4
  3844. fxcsnmsub f12, A1, f8, f12
  3845. fxcpnmsub f1, A2, f0, f1
  3846. fxcpnmsub f9, A2, f8, f9
  3847. fxcsnmsub f5, A2, f0, f5
  3848. fxcsnmsub f13, A2, f8, f13
  3849. fxsmul f4, A3, f4
  3850. fxsmul f12, A3, f12
  3851. fxcpnmsub f1, A4, f4, f1
  3852. fxcpnmsub f9, A4, f12, f9
  3853. fxcsnmsub f5, A4, f4, f5
  3854. fxcsnmsub f13, A4, f12, f13
  3855. fxpmul f1, A5, f1
  3856. fxpmul f9, A5, f9
  3857. fxcsnmsub f5, A5, f1, f5
  3858. fxcsnmsub f13, A5, f9, f13
  3859. fxsmul f5, A6, f5
  3860. fxsmul f13, A6, f13
  3861. #endif
  3862. #ifdef RN
  3863. LFPDUX A1, BO, INC4
  3864. LFPDUX A2, BO2, INC4
  3865. LFPDUX A3, BO, INC4
  3866. LFPDUX A4, BO2, INC4
  3867. add BO, BO, INC4
  3868. LFPDUX A5, BO2, INC4
  3869. add BO, BO, INC4
  3870. LFPDUX A6, BO2, INC4
  3871. subi BO, BO, 16 * SIZE
  3872. subi BO2, BO2, 16 * SIZE
  3873. fxpmul f0, A1, f0
  3874. fxpmul f1, A1, f1
  3875. fxcsnmsub f4, A1, f0, f4
  3876. fxcsnmsub f5, A1, f1, f5
  3877. fxcpnmsub f8, A2, f0, f8
  3878. fxcpnmsub f9, A2, f1, f9
  3879. fxcsnmsub f12, A2, f0, f12
  3880. fxcsnmsub f13, A2, f1, f13
  3881. fxsmul f4, A3, f4
  3882. fxsmul f5, A3, f5
  3883. fxcpnmsub f8, A4, f4, f8
  3884. fxcpnmsub f9, A4, f5, f9
  3885. fxcsnmsub f12, A4, f4, f12
  3886. fxcsnmsub f13, A4, f5, f13
  3887. fxpmul f8, A5, f8
  3888. fxpmul f9, A5, f9
  3889. fxcsnmsub f12, A5, f8, f12
  3890. fxcsnmsub f13, A5, f9, f13
  3891. fxsmul f12, A6, f12
  3892. fxsmul f13, A6, f13
  3893. #endif
  3894. #ifdef RT
  3895. addi BO, BO, 20 * SIZE
  3896. addi BO2, BO2, 20 * SIZE
  3897. LFPDUX A1, BO2, INCM4
  3898. LFPDUX A2, BO, INCM4
  3899. LFPDUX A3, BO2, INCM4
  3900. LFPDUX A4, BO, INCM4
  3901. add BO2, BO2, INCM4
  3902. LFPDUX A5, BO, INCM4
  3903. add BO2, BO2, INCM4
  3904. LFPDUX A6, BO, INCM4
  3905. subi BO, BO, 4 * SIZE
  3906. subi BO2, BO2, 4 * SIZE
  3907. fxsmul f12, A1, f12
  3908. fxsmul f13, A1, f13
  3909. fxcpnmsub f8, A1, f12, f8
  3910. fxcpnmsub f9, A1, f13, f9
  3911. fxcsnmsub f4, A2, f12, f4
  3912. fxcsnmsub f5, A2, f13, f5
  3913. fxcpnmsub f0, A2, f12, f0
  3914. fxcpnmsub f1, A2, f13, f1
  3915. fxpmul f8, A3, f8
  3916. fxpmul f9, A3, f9
  3917. fxcsnmsub f4, A4, f8, f4
  3918. fxcsnmsub f5, A4, f9, f5
  3919. fxcpnmsub f0, A4, f8, f0
  3920. fxcpnmsub f1, A4, f9, f1
  3921. fxsmul f4, A5, f4
  3922. fxsmul f5, A5, f5
  3923. fxcpnmsub f0, A5, f4, f0
  3924. fxcpnmsub f1, A5, f5, f1
  3925. fxpmul f0, A6, f0
  3926. fxpmul f1, A6, f1
  3927. #endif
  3928. #ifdef LN
  3929. subi CO1, CO1, 4 * SIZE
  3930. subi CO2, CO2, 4 * SIZE
  3931. subi CO3, CO3, 4 * SIZE
  3932. subi CO4, CO4, 4 * SIZE
  3933. #endif
  3934. #if defined(LN) || defined(LT)
  3935. STFPDUX f0, BO, INC4
  3936. STFPDUX f8, BO2, INC4
  3937. STFPDUX f4, BO, INC4
  3938. STFPDUX f12, BO2, INC4
  3939. STFPDUX f1, BO, INC4
  3940. STFPDUX f9, BO2, INC4
  3941. STFPDUX f5, BO, INC4
  3942. STFPDUX f13, BO2, INC4
  3943. subi BO, BO, 16 * SIZE
  3944. subi BO2, BO2, 16 * SIZE
  3945. STFDUX f0, CO1, INC
  3946. STFDUX f4, CO1, INC
  3947. STFDUX f1, CO1, INC
  3948. STFDUX f5, CO1, INC
  3949. STFSDUX f0, CO2, INC
  3950. STFSDUX f4, CO2, INC
  3951. STFSDUX f1, CO2, INC
  3952. STFSDUX f5, CO2, INC
  3953. STFDUX f8, CO3, INC
  3954. STFDUX f12, CO3, INC
  3955. STFDUX f9, CO3, INC
  3956. STFDUX f13, CO3, INC
  3957. STFSDUX f8, CO4, INC
  3958. STFSDUX f12, CO4, INC
  3959. STFSDUX f9, CO4, INC
  3960. STFSDUX f13, CO4, INC
  3961. #else
  3962. STFPDUX f0, AO, INC4
  3963. STFPDUX f1, AO2, INC4
  3964. STFPDUX f4, AO, INC4
  3965. STFPDUX f5, AO2, INC4
  3966. STFPDUX f8, AO, INC4
  3967. STFPDUX f9, AO2, INC4
  3968. STFPDUX f12, AO, INC4
  3969. STFPDUX f13, AO2, INC4
  3970. subi AO, AO, 16 * SIZE
  3971. subi AO2, AO2, 16 * SIZE
  3972. STFDUX f0, CO1, INC
  3973. STFSDUX f0, CO1, INC
  3974. STFDUX f1, CO1, INC
  3975. STFSDUX f1, CO1, INC
  3976. STFDUX f4, CO2, INC
  3977. STFSDUX f4, CO2, INC
  3978. STFDUX f5, CO2, INC
  3979. STFSDUX f5, CO2, INC
  3980. STFDUX f8, CO3, INC
  3981. STFSDUX f8, CO3, INC
  3982. STFDUX f9, CO3, INC
  3983. STFSDUX f9, CO3, INC
  3984. STFDUX f12, CO4, INC
  3985. STFSDUX f12, CO4, INC
  3986. STFDUX f13, CO4, INC
  3987. STFSDUX f13, CO4, INC
  3988. #endif
  3989. #ifdef LN
  3990. subi CO1, CO1, 4 * SIZE
  3991. subi CO2, CO2, 4 * SIZE
  3992. subi CO3, CO3, 4 * SIZE
  3993. subi CO4, CO4, 4 * SIZE
  3994. #endif
  3995. #ifdef RT
  3996. slwi r0, K, 2 + BASE_SHIFT
  3997. add AORIG, AORIG, r0
  3998. #endif
  3999. #if defined(LT) || defined(RN)
  4000. sub TEMP, K, KK
  4001. slwi r0, TEMP, 2 + BASE_SHIFT
  4002. slwi TEMP, TEMP, 2 + BASE_SHIFT
  4003. add AO, AO, r0
  4004. add BO, BO, TEMP
  4005. #endif
  4006. #ifdef LT
  4007. addi KK, KK, 4
  4008. #endif
  4009. #ifdef LN
  4010. subi KK, KK, 4
  4011. #endif
  4012. li r0, FZERO
  4013. lfpsx f0, SP, r0
  4014. .align 4
  4015. .L30:
        /* 2-wide strip: runs only when bit 1 of M is set, otherwise falls  */
        /* through to .L40.  Accumulators f4/f8/f12 are seeded by copying   */
        /* f0; f0 itself is reloaded from the FZERO stack slot at the end   */
        /* of this section (presumably a zero constant - confirm against    */
        /* the prologue).                                                   */
  4016. andi. I, M, 2
  4017. beq .L40
  4018. #if defined(LT) || defined(RN)
        /* LT/RN: B pointers are biased by -4*SIZE / -2*SIZE so that the    */
        /* update-form loads (stride INC4) hit the first elements; the      */
        /* unrolled trip count is KK >> 2.                                  */
  4019. addi AO2, AO, 2 * SIZE
  4020. fpmr f4, f0
  4021. addi BO, B, - 4 * SIZE
  4022. fpmr f8, f0
  4023. addi BO2, B, - 2 * SIZE
  4024. fpmr f12, f0
  4025. srawi. r0, KK, 2
  4026. mtspr CTR, r0
  4027. ble .L34
  4028. #else
        /* LN/RT: start from AORIG/B offset by KK (scaled by 2 elements for */
        /* A, 4 elements for B); the trip count is (K - KK) >> 2.  LN first */
        /* steps AORIG back by K * 2 elements.                              */
  4029. #ifdef LN
  4030. slwi r0, K, 1 + BASE_SHIFT
  4031. sub AORIG, AORIG, r0
  4032. #endif
  4033. slwi r0 , KK, 1 + BASE_SHIFT
  4034. slwi TEMP, KK, 2 + BASE_SHIFT
  4035. add AO, AORIG, r0
  4036. add BO, B, TEMP
  4037. sub TEMP, K, KK
  4038. addi AO2, AO, 2 * SIZE
  4039. fpmr f4, f0
  4040. addi BO, BO, - 4 * SIZE
  4041. fpmr f8, f0
  4042. addi BO2, BO, 2 * SIZE
  4043. fpmr f12, f0
  4044. srawi. r0, TEMP, 2
  4045. mtspr CTR, r0
  4046. ble .L34
  4047. #endif
        /* Preload the 12 operand registers consumed by the first trip of   */
        /* .L32 (4 from A via AO/AO2, 8 from B via BO/BO2).                 */
  4048. LFPDUX A1, AO, INC4
  4049. LFPDUX B1, BO, INC4
  4050. LFPDUX B2, BO2, INC4
  4051. LFPDUX A2, AO2, INC4
  4052. LFPDUX B3, BO, INC4
  4053. LFPDUX B4, BO2, INC4
  4054. LFPDUX A3, AO, INC4
  4055. LFPDUX A5, BO, INC4
  4056. LFPDUX A6, BO2, INC4
  4057. LFPDUX A4, AO2, INC4
  4058. LFPDUX A7, BO, INC4
  4059. LFPDUX A8, BO2, INC4
        /* Only one trip requested: go straight to the drain code.          */
  4060. bdz- .L33
  4061. .align 4
  4062. .L32:
        /* Unrolled main loop: 16 multiply-adds into f0/f4/f8/f12 per trip, */
        /* each operand register reloaded right after its last use so the   */
        /* next trip's data is already in flight.                           */
  4063. fxcpmadd f0, B1, A1, f0
  4064. fxcsmadd f4, B1, A1, f4
  4065. LFPDUX B1, BO, INC4
  4066. fxcpmadd f8, B2, A1, f8
  4067. fxcsmadd f12, B2, A1, f12
  4068. LFPDUX B2, BO2, INC4
  4069. LFPDUX A1, AO, INC4
  4070. fxcpmadd f0, B3, A2, f0
  4071. fxcsmadd f4, B3, A2, f4
  4072. LFPDUX B3, BO, INC4
  4073. fxcpmadd f8, B4, A2, f8
  4074. fxcsmadd f12, B4, A2, f12
  4075. LFPDUX B4, BO2, INC4
  4076. LFPDUX A2, AO2, INC4
  4077. fxcpmadd f0, A5, A3, f0
  4078. fxcsmadd f4, A5, A3, f4
  4079. LFPDUX A5, BO, INC4
  4080. fxcpmadd f8, A6, A3, f8
  4081. fxcsmadd f12, A6, A3, f12
  4082. LFPDUX A6, BO2, INC4
  4083. LFPDUX A3, AO, INC4
  4084. fxcpmadd f0, A7, A4, f0
  4085. fxcsmadd f4, A7, A4, f4
  4086. LFPDUX A7, BO, INC4
  4087. fxcpmadd f8, A8, A4, f8
  4088. fxcsmadd f12, A8, A4, f12
  4089. LFPDUX A8, BO2, INC4
  4090. LFPDUX A4, AO2, INC4
  4091. bdnz+ .L32
  4092. .align 4
  4093. .L33:
        /* Drain: same 16 multiply-adds as one .L32 trip, but with no       */
        /* further loads, so the pointers are not advanced past the data.   */
  4094. fxcpmadd f0, B1, A1, f0
  4095. fxcsmadd f4, B1, A1, f4
  4096. fxcpmadd f8, B2, A1, f8
  4097. fxcsmadd f12, B2, A1, f12
  4098. fxcpmadd f0, B3, A2, f0
  4099. fxcsmadd f4, B3, A2, f4
  4100. fxcpmadd f8, B4, A2, f8
  4101. fxcsmadd f12, B4, A2, f12
  4102. fxcpmadd f0, A5, A3, f0
  4103. fxcsmadd f4, A5, A3, f4
  4104. fxcpmadd f8, A6, A3, f8
  4105. fxcsmadd f12, A6, A3, f12
  4106. fxcpmadd f0, A7, A4, f0
  4107. fxcsmadd f4, A7, A4, f4
  4108. fxcpmadd f8, A8, A4, f8
  4109. fxcsmadd f12, A8, A4, f12
  4110. .align 4
  4111. .L34:
        /* Remainder: the low two bits of the same count used above (KK for */
        /* LT/RN, TEMP = K - KK otherwise); skip to .L38 when it is zero.   */
  4112. #if defined(LT) || defined(RN)
  4113. andi. r0, KK, 3
  4114. mtspr CTR, r0
  4115. ble+ .L38
  4116. #else
  4117. andi. r0, TEMP, 3
  4118. mtspr CTR, r0
  4119. ble+ .L38
  4120. #endif
        /* A is read with the non-updating LFPDX and AO is then advanced by */
        /* INC2 by hand (presumably two elements per step - confirm the     */
        /* INC2 setup outside this view); B still uses update-form loads.   */
  4121. LFPDX A1, AO, INC4
  4122. LFPDUX B1, BO, INC4
  4123. LFPDUX B2, BO2, INC4
  4124. add AO, AO, INC2
  4125. bdz- .L37
  4126. .align 4
  4127. .L36:
        /* Remainder loop: 4 multiply-adds per trip, then reload for next.  */
  4128. fxcpmadd f0, B1, A1, f0
  4129. fxcsmadd f4, B1, A1, f4
  4130. LFPDUX B1, BO, INC4
  4131. fxcpmadd f8, B2, A1, f8
  4132. fxcsmadd f12, B2, A1, f12
  4133. LFPDX A1, AO, INC4
  4134. LFPDUX B2, BO2, INC4
  4135. add AO, AO, INC2
  4136. bdnz+ .L36
  4137. .align 4
  4138. .L37:
        /* Final remainder step, no further loads.                          */
  4139. fxcpmadd f0, B1, A1, f0
  4140. fxcsmadd f4, B1, A1, f4
  4141. fxcpmadd f8, B2, A1, f8
  4142. fxcsmadd f12, B2, A1, f12
  4143. .align 4
  4144. .L38:
        /* LN/RT: re-derive AO/BO from AORIG/B using KK minus the tile      */
        /* extent handled here (2 for LN, 4 for RT), re-applying the        */
        /* -4*SIZE bias on BO.                                              */
  4145. #if defined(LN) || defined(RT)
  4146. #ifdef LN
  4147. subi r0, KK, 2
  4148. #else
  4149. subi r0, KK, 4
  4150. #endif
  4151. slwi TEMP, r0, 1 + BASE_SHIFT
  4152. slwi r0, r0, 2 + BASE_SHIFT
  4153. add AO, AORIG, TEMP
  4154. add BO, B, r0
  4155. addi AO2, AO, 2 * SIZE
  4156. addi BO, BO, - 4 * SIZE
  4157. addi BO2, BO, 2 * SIZE
  4158. #endif
  4159. #if defined(LN) || defined(LT)
        /* LN/LT: exchange halves between f0/f4 and between f8/f12 (via the */
        /* f24/f28 temporaries), then load four pairs from the B buffer,    */
        /* rewind BO/BO2 by 8*SIZE, and fpsub the accumulated products      */
        /* against the loaded values.                                       */
  4160. fpmr f24, f0
  4161. fpmr f28, f8
  4162. fsmfp f0, f4
  4163. fsmfp f8, f12
  4164. fsmtp f4, f24
  4165. fsmtp f12, f28
  4166. LFPDUX f16, BO, INC4
  4167. LFPDUX f17, BO2, INC4
  4168. LFPDUX f18, BO, INC4
  4169. LFPDUX f19, BO2, INC4
  4170. subi BO, BO, 8 * SIZE
  4171. subi BO2, BO2, 8 * SIZE
  4172. fpsub f0, f16, f0
  4173. fpsub f8, f17, f8
  4174. fpsub f4, f18, f4
  4175. fpsub f12, f19, f12
  4176. #else
        /* RN/RT: same subtraction, but the four pairs come from the A      */
        /* buffer (AO/AO2), which is rewound by 8*SIZE afterwards.          */
  4177. LFPDUX f16, AO, INC4
  4178. LFPDUX f17, AO2, INC4
  4179. LFPDUX f18, AO, INC4
  4180. LFPDUX f19, AO2, INC4
  4181. subi AO, AO, 8 * SIZE
  4182. subi AO2, AO2, 8 * SIZE
  4183. fpsub f0, f16, f0
  4184. fpsub f4, f17, f4
  4185. fpsub f8, f18, f8
  4186. fpsub f12, f19, f12
  4187. #endif
  4188. #ifdef LN
        /* LN solve: coefficients are read walking backwards (INCM4) from   */
        /* AO2 then AO; f4/f12 are scaled first, their contribution is      */
        /* removed from f0/f8, and f0/f8 are scaled last.                   */
  4189. addi AO, AO, 8 * SIZE
  4190. addi AO2, AO2, 8 * SIZE
  4191. LFPDUX A1, AO2, INCM4
  4192. LFPDUX A2, AO, INCM4
  4193. addi AO, AO, -4 * SIZE
  4194. addi AO2, AO2, -4 * SIZE
  4195. fxsmul f4, A1, f4
  4196. fxsmul f12, A1, f12
  4197. fxcpnmsub f0, A1, f4, f0
  4198. fxcpnmsub f8, A1, f12, f8
  4199. fxpmul f0, A2, f0
  4200. fxpmul f8, A2, f8
  4201. #endif
  4202. #ifdef LT
        /* LT solve: forward order - scale f0/f8, eliminate from f4/f12,    */
        /* then scale f4/f12.  AO/AO2 are restored after the two loads.     */
  4203. LFPDUX A1, AO, INC4
  4204. LFPDUX A2, AO2, INC4
  4205. subi AO, AO, 4 * SIZE
  4206. subi AO2, AO2, 4 * SIZE
  4207. fxpmul f0, A1, f0
  4208. fxpmul f8, A1, f8
  4209. fxcsnmsub f4, A1, f0, f4
  4210. fxcsnmsub f12, A1, f8, f12
  4211. fxsmul f4, A2, f4
  4212. fxsmul f12, A2, f12
  4213. #endif
  4214. #ifdef RN
        /* RN solve: six coefficient pairs are read from BO/BO2 (two BO     */
        /* slots are stepped over without loading), the pointers are        */
        /* rewound by 16*SIZE, and the registers are resolved in the order  */
        /* f0 -> f4 -> f8 -> f12.                                           */
  4215. LFPDUX A1, BO, INC4
  4216. LFPDUX A2, BO2, INC4
  4217. LFPDUX A3, BO, INC4
  4218. LFPDUX A4, BO2, INC4
  4219. add BO, BO, INC4
  4220. LFPDUX A5, BO2, INC4
  4221. add BO, BO, INC4
  4222. LFPDUX A6, BO2, INC4
  4223. subi BO, BO, 16 * SIZE
  4224. subi BO2, BO2, 16 * SIZE
  4225. fxpmul f0, A1, f0
  4226. fxcsnmsub f4, A1, f0, f4
  4227. fxcpnmsub f8, A2, f0, f8
  4228. fxcsnmsub f12, A2, f0, f12
  4229. fxsmul f4, A3, f4
  4230. fxcpnmsub f8, A4, f4, f8
  4231. fxcsnmsub f12, A4, f4, f12
  4232. fxpmul f8, A5, f8
  4233. fxcsnmsub f12, A5, f8, f12
  4234. fxsmul f12, A6, f12
  4235. #endif
  4236. #ifdef RT
        /* RT solve: mirror image of RN - coefficients are read backwards   */
        /* (INCM4) starting 20*SIZE past BO/BO2, and the registers are      */
        /* resolved in the order f12 -> f8 -> f4 -> f0.                     */
  4237. addi BO, BO, 20 * SIZE
  4238. addi BO2, BO2, 20 * SIZE
  4239. LFPDUX A1, BO2, INCM4
  4240. LFPDUX A2, BO, INCM4
  4241. LFPDUX A3, BO2, INCM4
  4242. LFPDUX A4, BO, INCM4
  4243. add BO2, BO2, INCM4
  4244. LFPDUX A5, BO, INCM4
  4245. add BO2, BO2, INCM4
  4246. LFPDUX A6, BO, INCM4
  4247. subi BO, BO, 4 * SIZE
  4248. subi BO2, BO2, 4 * SIZE
  4249. fxsmul f12, A1, f12
  4250. fxcpnmsub f8, A1, f12, f8
  4251. fxcsnmsub f4, A2, f12, f4
  4252. fxcpnmsub f0, A2, f12, f0
  4253. fxpmul f8, A3, f8
  4254. fxcsnmsub f4, A4, f8, f4
  4255. fxcpnmsub f0, A4, f8, f0
  4256. fxsmul f4, A5, f4
  4257. fxcpnmsub f0, A5, f4, f0
  4258. fxpmul f0, A6, f0
  4259. #endif
  4260. #ifdef LN
        /* LN: step the four C pointers back by 2 elements before the       */
        /* update-form stores below, and again afterwards.                  */
  4261. subi CO1, CO1, 2 * SIZE
  4262. subi CO2, CO2, 2 * SIZE
  4263. subi CO3, CO3, 2 * SIZE
  4264. subi CO4, CO4, 2 * SIZE
  4265. #endif
  4266. #if defined(LN) || defined(LT)
        /* LN/LT: write the solved values back into the B buffer (pointers  */
        /* rewound afterwards) and scatter them to CO1..CO4 with scalar     */
        /* stores (STFDUX / STFSDUX pick the two halves of each register).  */
  4267. STFPDUX f0, BO, INC4
  4268. STFPDUX f8, BO2, INC4
  4269. STFPDUX f4, BO, INC4
  4270. STFPDUX f12, BO2, INC4
  4271. subi BO, BO, 8 * SIZE
  4272. subi BO2, BO2, 8 * SIZE
  4273. STFDUX f0, CO1, INC
  4274. STFDUX f4, CO1, INC
  4275. STFSDUX f0, CO2, INC
  4276. STFSDUX f4, CO2, INC
  4277. STFDUX f8, CO3, INC
  4278. STFDUX f12, CO3, INC
  4279. STFSDUX f8, CO4, INC
  4280. STFSDUX f12, CO4, INC
  4281. #else
        /* RN/RT: write the solved values back into the A buffer instead;   */
        /* here each register supplies both elements for a single C pointer.*/
  4282. STFPDUX f0, AO, INC4
  4283. STFPDUX f4, AO2, INC4
  4284. STFPDUX f8, AO, INC4
  4285. STFPDUX f12, AO2, INC4
  4286. subi AO, AO, 8 * SIZE
  4287. subi AO2, AO2, 8 * SIZE
  4288. STFDUX f0, CO1, INC
  4289. STFSDUX f0, CO1, INC
  4290. STFDUX f4, CO2, INC
  4291. STFSDUX f4, CO2, INC
  4292. STFDUX f8, CO3, INC
  4293. STFSDUX f8, CO3, INC
  4294. STFDUX f12, CO4, INC
  4295. STFSDUX f12, CO4, INC
  4296. #endif
  4297. #ifdef LN
  4298. subi CO1, CO1, 2 * SIZE
  4299. subi CO2, CO2, 2 * SIZE
  4300. subi CO3, CO3, 2 * SIZE
  4301. subi CO4, CO4, 2 * SIZE
  4302. #endif
  4303. #ifdef RT
        /* RT: advance AORIG by K * 2 elements.                             */
  4304. slwi r0, K, 1 + BASE_SHIFT
  4305. add AORIG, AORIG, r0
  4306. #endif
  4307. #if defined(LT) || defined(RN)
        /* LT/RN: skip the remaining (K - KK) steps in both buffers         */
        /* (2 elements per step for A, 4 per step for B).                   */
  4308. sub TEMP, K, KK
  4309. slwi r0, TEMP, 1 + BASE_SHIFT
  4310. slwi TEMP, TEMP, 2 + BASE_SHIFT
  4311. add AO, AO, r0
  4312. add BO, BO, TEMP
  4313. #endif
  4314. #ifdef LT
  4315. addi KK, KK, 2
  4316. #endif
  4317. #ifdef LN
  4318. subi KK, KK, 2
  4319. #endif
        /* Reload f0 from the FZERO stack slot so the next section can seed */
        /* its accumulators from it.                                        */
  4320. li r0, FZERO
  4321. lfpsx f0, SP, r0
  4322. .align 4
  4323. .L40:
        /* 1-wide strip: runs only when bit 0 of M is set, otherwise jumps  */
        /* to .L49.  Four accumulators f0..f3 are used (f1..f3 seeded from  */
        /* f0) and folded down to f0/f1 at .L48.                            */
  4324. andi. I, M, 1
  4325. beq .L49
  4326. #if defined(LT) || defined(RN)
        /* LT/RN: bias the B pointers by -4*SIZE / -2*SIZE for the          */
        /* update-form loads; the unrolled trip count is KK >> 3.           */
  4327. addi AO2, AO, 2 * SIZE
  4328. fpmr f1, f0
  4329. addi BO, B, - 4 * SIZE
  4330. fpmr f2, f0
  4331. addi BO2, B, - 2 * SIZE
  4332. fpmr f3, f0
  4333. srawi. r0, KK, 3
  4334. mtspr CTR, r0
  4335. ble .L44
  4336. #else
        /* LN/RT: start from AORIG/B offset by KK (1 element per step for   */
        /* A, 4 for B); the trip count is (K - KK) >> 3.  LN first steps    */
        /* AORIG back by K elements.                                        */
  4337. #ifdef LN
  4338. slwi r0, K, 0 + BASE_SHIFT
  4339. sub AORIG, AORIG, r0
  4340. #endif
  4341. slwi r0 , KK, 0 + BASE_SHIFT
  4342. slwi TEMP, KK, 2 + BASE_SHIFT
  4343. add AO, AORIG, r0
  4344. add BO, B, TEMP
  4345. sub TEMP, K, KK
  4346. addi AO2, AO, 2 * SIZE
  4347. fpmr f1, f0
  4348. addi BO, BO, - 4 * SIZE
  4349. fpmr f2, f0
  4350. addi BO2, BO, 2 * SIZE
  4351. fpmr f3, f0
  4352. srawi. r0, TEMP, 3
  4353. mtspr CTR, r0
  4354. ble .L44
  4355. #endif
        /* Preload the operands for the first trip of .L42.                 */
  4356. LFPDUX A1, AO, INC4
  4357. LFPDUX B1, BO, INC4
  4358. LFPDUX B2, BO2, INC4
  4359. LFPDUX A2, AO2, INC4
  4360. LFPDUX B3, BO, INC4
  4361. LFPDUX B4, BO2, INC4
  4362. LFPDUX A3, AO, INC4
  4363. LFPDUX A5, BO, INC4
  4364. LFPDUX A6, BO2, INC4
  4365. LFPDUX A4, AO2, INC4
  4366. LFPDUX A7, BO, INC4
  4367. LFPDUX A8, BO2, INC4
  4368. bdz- .L43
  4369. .align 4
  4370. .L42:
        /* Unrolled main loop: each A register is used by two fxcpmadd and  */
        /* two fxcsmadd (i.e. both halves of the pair are consumed against  */
        /* different B registers); every operand is reloaded right after    */
        /* its use.  B1..B4 and A5..A8 are each loaded twice per trip.      */
  4371. fxcpmadd f0, A1, B1, f0
  4372. LFPDUX B1, BO, INC4
  4373. fxcpmadd f1, A1, B2, f1
  4374. LFPDUX B2, BO2, INC4
  4375. fxcsmadd f2, A1, B3, f2
  4376. LFPDUX B3, BO, INC4
  4377. fxcsmadd f3, A1, B4, f3
  4378. LFPDUX B4, BO2, INC4
  4379. LFPDUX A1, AO, INC4
  4380. fxcpmadd f0, A2, A5, f0
  4381. LFPDUX A5, BO, INC4
  4382. fxcpmadd f1, A2, A6, f1
  4383. LFPDUX A6, BO2, INC4
  4384. fxcsmadd f2, A2, A7, f2
  4385. LFPDUX A7, BO, INC4
  4386. fxcsmadd f3, A2, A8, f3
  4387. LFPDUX A8, BO2, INC4
  4388. LFPDUX A2, AO2, INC4
  4389. fxcpmadd f0, A3, B1, f0
  4390. LFPDUX B1, BO, INC4
  4391. fxcpmadd f1, A3, B2, f1
  4392. LFPDUX B2, BO2, INC4
  4393. fxcsmadd f2, A3, B3, f2
  4394. LFPDUX B3, BO, INC4
  4395. fxcsmadd f3, A3, B4, f3
  4396. LFPDUX B4, BO2, INC4
  4397. LFPDUX A3, AO, INC4
  4398. fxcpmadd f0, A4, A5, f0
  4399. LFPDUX A5, BO, INC4
  4400. fxcpmadd f1, A4, A6, f1
  4401. LFPDUX A6, BO2, INC4
  4402. fxcsmadd f2, A4, A7, f2
  4403. LFPDUX A7, BO, INC4
  4404. fxcsmadd f3, A4, A8, f3
  4405. LFPDUX A8, BO2, INC4
  4406. LFPDUX A4, AO2, INC4
  4407. bdnz+ .L42
  4408. .align 4
  4409. .L43:
        /* Drain: the first half still reloads B1..B4 / A5..A8 (needed by   */
        /* the A3/A4 products below); the second half performs no loads.    */
  4410. fxcpmadd f0, A1, B1, f0
  4411. LFPDUX B1, BO, INC4
  4412. fxcpmadd f1, A1, B2, f1
  4413. LFPDUX B2, BO2, INC4
  4414. fxcsmadd f2, A1, B3, f2
  4415. LFPDUX B3, BO, INC4
  4416. fxcsmadd f3, A1, B4, f3
  4417. LFPDUX B4, BO2, INC4
  4418. fxcpmadd f0, A2, A5, f0
  4419. LFPDUX A5, BO, INC4
  4420. fxcpmadd f1, A2, A6, f1
  4421. LFPDUX A6, BO2, INC4
  4422. fxcsmadd f2, A2, A7, f2
  4423. LFPDUX A7, BO, INC4
  4424. fxcsmadd f3, A2, A8, f3
  4425. LFPDUX A8, BO2, INC4
  4426. fxcpmadd f0, A3, B1, f0
  4427. fxcpmadd f1, A3, B2, f1
  4428. fxcsmadd f2, A3, B3, f2
  4429. fxcsmadd f3, A3, B4, f3
  4430. fxcpmadd f0, A4, A5, f0
  4431. fxcpmadd f1, A4, A6, f1
  4432. fxcsmadd f2, A4, A7, f2
  4433. fxcsmadd f3, A4, A8, f3
  4434. .align 4
  4435. .L44:
        /* Remainder: low three bits of the same count (KK or TEMP); skip   */
        /* to .L48 when zero.                                               */
  4436. #if defined(LT) || defined(RN)
  4437. andi. r0, KK, 7
  4438. mtspr CTR, r0
  4439. ble+ .L48
  4440. #else
  4441. andi. r0, TEMP, 7
  4442. mtspr CTR, r0
  4443. ble+ .L48
  4444. #endif
        /* A is read one scalar at a time with LFDX (no pointer update) and */
        /* AO is advanced by INC by hand.                                   */
  4445. LFDX A1, AO, INC4
  4446. LFPDUX B1, BO, INC4
  4447. LFPDUX B2, BO2, INC4
  4448. add AO, AO, INC
  4449. bdz- .L47
  4450. .align 4
  4451. .L46:
        /* Remainder loop: only f0/f1 are updated here.                     */
  4452. fxcpmadd f0, A1, B1, f0
  4453. LFPDUX B1, BO, INC4
  4454. fxcpmadd f1, A1, B2, f1
  4455. LFDX A1, AO, INC4
  4456. LFPDUX B2, BO2, INC4
  4457. add AO, AO, INC
  4458. bdnz+ .L46
  4459. .align 4
  4460. .L47:
        /* Last remainder step; AO2 is re-derived from AO because only AO   */
        /* was advanced in the remainder loop.                              */
  4461. fxcpmadd f0, A1, B1, f0
  4462. fxcpmadd f1, A1, B2, f1
  4463. addi AO2, AO, 2 * SIZE
  4464. .align 4
  4465. .L48:
        /* Fold the four partial accumulators down to f0 and f1.            */
  4466. fpadd f0, f0, f2
  4467. fpadd f1, f1, f3
  4468. #if defined(LN) || defined(RT)
        /* LN/RT: re-derive AO/BO from AORIG/B using KK - 1 (LN) or KK - 4  */
        /* (RT), re-applying the -4*SIZE bias on BO.                        */
  4469. #ifdef LN
  4470. subi r0, KK, 1
  4471. #else
  4472. subi r0, KK, 4
  4473. #endif
  4474. slwi TEMP, r0, 0 + BASE_SHIFT
  4475. slwi r0, r0, 2 + BASE_SHIFT
  4476. add AO, AORIG, TEMP
  4477. add BO, B, r0
  4478. addi AO2, AO, 2 * SIZE
  4479. addi BO, BO, - 4 * SIZE
  4480. addi BO2, BO, 2 * SIZE
  4481. #endif
        /* Load the two right-hand-side pairs (from the B buffer for LN/LT, */
        /* from the A buffer otherwise) and fpsub the accumulated products  */
        /* against them.  The non-updating LFPDX leaves pointers unchanged. */
  4482. #if defined(LN) || defined(LT)
  4483. LFPDX f16, BO, INC4
  4484. LFPDX f17, BO2, INC4
  4485. fpsub f0, f16, f0
  4486. fpsub f1, f17, f1
  4487. #else
  4488. LFPDX f16, AO, INC4
  4489. LFPDX f17, AO2, INC4
  4490. fpsub f0, f16, f0
  4491. fpsub f1, f17, f1
  4492. #endif
  4493. #if defined(LN) || defined(LT)
        /* LN/LT: a single coefficient from A scales both result pairs.     */
  4494. LFPDX A1, AO, INC4
  4495. fxpmul f0, A1, f0
  4496. fxpmul f1, A1, f1
  4497. #endif
  4498. #ifdef RN
        /* RN solve, done with scalar ops: ten coefficients are read at     */
        /* element offsets 0,1,2,3,5,6,7,10,11,15 (the upper-triangle       */
        /* positions of a row-major 4x4 array); the extra "4 +" presumably  */
        /* compensates for the -4*SIZE bias on BO.  fsmtp splits the        */
        /* secondary halves of f0/f1 into f2/f3, the chain resolves         */
        /* f0 -> f2 -> f1 -> f3, and fsmfp packs f2/f3 back.                */
  4499. LFD A1, (4 + 0) * SIZE(BO)
  4500. LFD A2, (4 + 1) * SIZE(BO)
  4501. LFD A3, (4 + 2) * SIZE(BO)
  4502. LFD A4, (4 + 3) * SIZE(BO)
  4503. LFD A5, (4 + 5) * SIZE(BO)
  4504. LFD A6, (4 + 6) * SIZE(BO)
  4505. LFD A7, (4 + 7) * SIZE(BO)
  4506. LFD A8, (4 + 10) * SIZE(BO)
  4507. LFD A9, (4 + 11) * SIZE(BO)
  4508. LFD A10, (4 + 15) * SIZE(BO)
  4509. fsmtp f2, f0
  4510. fsmtp f3, f1
  4511. fmul f0, A1, f0
  4512. fnmsub f2, A2, f0, f2
  4513. fnmsub f1, A3, f0, f1
  4514. fnmsub f3, A4, f0, f3
  4515. fmul f2, A5, f2
  4516. fnmsub f1, A6, f2, f1
  4517. fnmsub f3, A7, f2, f3
  4518. fmul f1, A8, f1
  4519. fnmsub f3, A9, f1, f3
  4520. fmul f3, A10, f3
  4521. fsmfp f0, f2
  4522. fsmfp f1, f3
  4523. #endif
  4524. #ifdef RT
        /* RT solve: mirror image of RN - offsets 15,14,13,12,10,9,8,5,4,0  */
        /* (lower-triangle positions, last row first) and the chain         */
        /* resolves f3 -> f1 -> f2 -> f0.                                   */
  4525. LFD A1, (4 + 15) * SIZE(BO)
  4526. LFD A2, (4 + 14) * SIZE(BO)
  4527. LFD A3, (4 + 13) * SIZE(BO)
  4528. LFD A4, (4 + 12) * SIZE(BO)
  4529. LFD A5, (4 + 10) * SIZE(BO)
  4530. LFD A6, (4 + 9) * SIZE(BO)
  4531. LFD A7, (4 + 8) * SIZE(BO)
  4532. LFD A8, (4 + 5) * SIZE(BO)
  4533. LFD A9, (4 + 4) * SIZE(BO)
  4534. LFD A10, (4 + 0) * SIZE(BO)
  4535. fsmtp f2, f0
  4536. fsmtp f3, f1
  4537. fmul f3, A1, f3
  4538. fnmsub f1, A2, f3, f1
  4539. fnmsub f2, A3, f3, f2
  4540. fnmsub f0, A4, f3, f0
  4541. fmul f1, A5, f1
  4542. fnmsub f2, A6, f1, f2
  4543. fnmsub f0, A7, f1, f0
  4544. fmul f2, A8, f2
  4545. fnmsub f0, A9, f2, f0
  4546. fmul f0, A10, f0
  4547. fsmfp f0, f2
  4548. fsmfp f1, f3
  4549. #endif
        /* Write the solved pairs back to the packed buffer they were read  */
        /* from (B for LN/LT, A otherwise); STFPDX does not move pointers.  */
  4550. #if defined(LN) || defined(LT)
  4551. STFPDX f0, BO, INC4
  4552. STFPDX f1, BO2, INC4
  4553. #else
  4554. STFPDX f0, AO, INC4
  4555. STFPDX f1, AO2, INC4
  4556. #endif
  4557. #ifdef LN
        /* LN: step the C pointers back by one element before the           */
        /* update-form stores, and again afterwards.                        */
  4558. subi CO1, CO1, 1 * SIZE
  4559. subi CO2, CO2, 1 * SIZE
  4560. subi CO3, CO3, 1 * SIZE
  4561. subi CO4, CO4, 1 * SIZE
  4562. #endif
        /* One element to each of the four C pointers: the two halves of    */
        /* f0 go to CO1/CO2 and the two halves of f1 to CO3/CO4.            */
  4563. STFDUX f0, CO1, INC
  4564. STFSDUX f0, CO2, INC
  4565. STFDUX f1, CO3, INC
  4566. STFSDUX f1, CO4, INC
  4567. #ifdef LN
  4568. subi CO1, CO1, 1 * SIZE
  4569. subi CO2, CO2, 1 * SIZE
  4570. subi CO3, CO3, 1 * SIZE
  4571. subi CO4, CO4, 1 * SIZE
  4572. #endif
  4573. #ifdef RT
        /* RT: advance AORIG by K elements.                                 */
  4574. slwi r0, K, 0 + BASE_SHIFT
  4575. add AORIG, AORIG, r0
  4576. #endif
  4577. #if defined(LT) || defined(RN)
        /* LT/RN: skip the remaining (K - KK) steps in both buffers         */
        /* (1 element per step for A, 4 per step for B).                    */
  4578. sub TEMP, K, KK
  4579. slwi r0, TEMP, 0 + BASE_SHIFT
  4580. slwi TEMP, TEMP, 2 + BASE_SHIFT
  4581. add AO, AO, r0
  4582. add BO, BO, TEMP
  4583. #endif
  4584. #ifdef LT
  4585. addi KK, KK, 1
  4586. #endif
  4587. #ifdef LN
  4588. subi KK, KK, 1
  4589. #endif
  4590. .align 4
  4591. .L49:
        /* End of one pass of the J loop: move B to the next panel, adjust  */
        /* KK for the right-side variants, then decrement J and branch back */
        /* to .L10 (defined outside this view) while J is still positive.   */
  4592. #ifdef LN
        /* LN: advance B by K * 4 elements.                                 */
  4593. slwi r0, K, 2 + BASE_SHIFT
  4594. add B, B, r0
  4595. #endif
  4596. #if defined(LT) || defined(RN)
        /* LT/RN: BO already points past the consumed panel; adding 4*SIZE  */
        /* undoes the -4*SIZE bias it carries.                              */
  4597. addi B, BO, 4 * SIZE
  4598. #endif
  4599. #ifdef RN
  4600. addi KK, KK, 4
  4601. #endif
  4602. #ifdef RT
  4603. subi KK, KK, 4
  4604. #endif
  4605. addic. J, J, -1
  4606. bgt+ .L10
  4607. .align 4
  4608. .L999:
        /* Function exit: restore the saved registers from the stack frame  */
        /* and return.  The layout must mirror the prologue, which is       */
        /* outside this view - NOTE(review): confirm the offsets against it.*/
        /*                                                                  */
        /* Integer registers r14..r31 are reloaded with lwzu, which         */
        /* advances SP by 4 per load; SP is pre-biased by +12 so the first  */
        /* load reads from the original SP + 16.                            */
  4609. addi SP, SP, 12
  4610. lwzu r14, 4(SP)
  4611. lwzu r15, 4(SP)
  4612. lwzu r16, 4(SP)
  4613. lwzu r17, 4(SP)
  4614. lwzu r18, 4(SP)
  4615. lwzu r19, 4(SP)
  4616. lwzu r20, 4(SP)
  4617. lwzu r21, 4(SP)
  4618. lwzu r22, 4(SP)
  4619. lwzu r23, 4(SP)
  4620. lwzu r24, 4(SP)
  4621. lwzu r25, 4(SP)
  4622. lwzu r26, 4(SP)
  4623. lwzu r27, 4(SP)
  4624. lwzu r28, 4(SP)
  4625. lwzu r29, 4(SP)
  4626. lwzu r30, 4(SP)
  4627. lwzu r31, 4(SP)
        /* Undo the +12 bias, then reload the paired FP registers f31 down  */
        /* to f14 with update-form loads using a 16-byte stride held in r0. */
  4628. subi SP, SP, 12
  4629. li r0, 16
  4630. lfpdux f31, SP, r0
  4631. lfpdux f30, SP, r0
  4632. lfpdux f29, SP, r0
  4633. lfpdux f28, SP, r0
  4634. lfpdux f27, SP, r0
  4635. lfpdux f26, SP, r0
  4636. lfpdux f25, SP, r0
  4637. lfpdux f24, SP, r0
  4638. lfpdux f23, SP, r0
  4639. lfpdux f22, SP, r0
  4640. lfpdux f21, SP, r0
  4641. lfpdux f20, SP, r0
  4642. lfpdux f19, SP, r0
  4643. lfpdux f18, SP, r0
  4644. lfpdux f17, SP, r0
  4645. lfpdux f16, SP, r0
  4646. lfpdux f15, SP, r0
  4647. lfpdux f14, SP, r0
        /* Final 16-byte step releases the rest of the frame before return. */
  4648. addi SP, SP, 16
  4649. blr
  4650. EPILOGUE
  4651. #endif