You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_hummer_RT.S 51 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build setup and integer-register aliases for the kernel that follows. */
  /* Notes on each alias describe how the visible part of the kernel uses it. */
  38. #define ASSEMBLER /* defined before common.h is included */
  39. #include "common.h"
  40. #undef ZERO /* drop any earlier ZERO so it can be re-pointed at r31 below */
  41. #define ALPHA 0 /* SP-relative byte offset; after the prologue SP+0 holds the saved f1. Not referenced in the visible code */
  42. #define FZERO 16 /* SP-relative byte offset of the zero words pushed by the prologue; read with lfpsx to reset f0 */
  43. #define M r3 /* early exit when <= 0; split into M>>2, M&2 and M&1 passes */
  44. #define N r4 /* early exit when <= 0; split into an N&1 pass and an N>>1 loop */
  45. #define K r5 /* early exit when <= 0; scaled by ZBASE_SHIFT to step through A and B */
  /* A, B, C, LDC and OFFSET are only mapped here for linux / FreeBSD builds. */
  46. #if defined(linux) || defined(__FreeBSD__)
  47. #define A r6 /* base pointer from which AO / AORIG are derived */
  48. #define B r7 /* base pointer from which BO / BO2 are derived */
  49. #define C r8 /* base pointer from which CO1 / CO2 are derived */
  50. #define LDC r9 /* shifted left by ZBASE_SHIFT on entry, then used as a byte stride on C */
  51. #define OFFSET r10 /* seeds KK (KK = -OFFSET, N - OFFSET, M + OFFSET or OFFSET depending on variant) */
  52. #endif
  53. #define TEMP r11 /* scratch: holds K - KK and scaled byte offsets */
  54. #define AORIG r12 /* LN / RT variants: base from which AO is recomputed for each tile */
  55. #define KK r14 /* running count; the LT/RN paths use it as the inner-loop trip count, other paths use K - KK */
  56. #define INCM1 r15 /* loaded with -1 * SIZE */
  57. #define INCM3 r16 /* loaded with -3 * SIZE */
  58. #define INCM5 r17 /* loaded with -5 * SIZE */
  59. #define INCM7 r18 /* loaded with -7 * SIZE */
  60. #define INC2 r19 /* loaded with 2 * SIZE; update stride for LFPDUX / STFPDUX */
  61. #define INC r20 /* loaded with 1 * SIZE; update stride for STFDUX / STFSDUX stores to C */
  62. #define INC4 r21 /* loaded with 4 * SIZE; update stride in the two-column loop */
  63. #define I r22 /* row-tile counter derived from M */
  64. #define J r23 /* column counter derived from N */
  65. #define AO r24 /* moving pointer into A (pre-biased by -2 or -4 * SIZE for update-form loads) */
  66. #define BO r25 /* moving pointer into B (pre-biased likewise) */
  67. #define AO2 r26 /* second A pointer, set to AO + 2 * SIZE in the two-column loop */
  68. #define BO2 r27 /* second B pointer, 2 * SIZE past BO in the two-column loop */
  69. #define CO1 r28 /* output pointer for the current column of C */
  70. #define CO2 r29 /* output pointer for the next column (C + LDC) */
  71. #define ZERO r31 /* replaces the definition removed by the #undef above */
  72. #ifndef NEEDPARAM
  /* Floating-point register aliases: A1..A10 receive operands loaded through AO, */
  /* B1..B6 receive operands loaded through BO (the single-row loop also borrows */
  /* A9 / A10 for BO loads). f0..f15 are left unnamed and used as accumulators. */
  73. #define A1 f16
  74. #define A2 f17
  75. #define A3 f18
  76. #define A4 f19
  77. #define A5 f20
  78. #define A6 f21
  79. #define A7 f22
  80. #define A8 f23
  81. #define A9 f24
  82. #define A10 f25
  83. #define B1 f26
  84. #define B2 f27
  85. #define B3 f28
  86. #define B4 f29
  87. #define B5 f30
  88. #define B6 f31
  89. #define AP B6 /* second name for f31; not referenced in the visible code */
  /* Accumulation mnemonics for the inner loops. The pair is chosen at build */
  /* time from CONJ and, when CONJ is set, from whether this is an LN/LT build. */
  90. #ifndef CONJ
  91. #define FXCPMADD fxcpmadd
  92. #define FXCSMADD fxcxnpma
  93. #else /* CONJ */
  94. #if defined(LN) || defined(LT)
  95. #define FXCPMADD fxcpnsma
  96. #define FXCSMADD fxcxma
  97. #else /* CONJ, not LN / LT */
  98. #define FXCPMADD fxcpmadd
  99. #define FXCSMADD fxcxnsma
  100. #endif
  101. #endif
  /* Mnemonics used by the solve step after each tile is accumulated. */
  /* With CONJ defined the two names swap their underlying instructions. */
  102. #ifndef CONJ
  103. #define FXCXNPMA fxcxnpma
  104. #define FXCXNSMA fxcxnsma
  105. #else /* CONJ: swapped */
  106. #define FXCXNPMA fxcxnsma
  107. #define FXCXNSMA fxcxnpma
  108. #endif
  109. PROLOGUE
  110. PROFCODE
  111. li r0, -16
  112. stfpdux f14, SP, r0
  113. stfpdux f15, SP, r0
  114. stfpdux f16, SP, r0
  115. stfpdux f17, SP, r0
  116. stfpdux f18, SP, r0
  117. stfpdux f19, SP, r0
  118. stfpdux f20, SP, r0
  119. stfpdux f21, SP, r0
  120. stfpdux f22, SP, r0
  121. stfpdux f23, SP, r0
  122. stfpdux f24, SP, r0
  123. stfpdux f25, SP, r0
  124. stfpdux f26, SP, r0
  125. stfpdux f27, SP, r0
  126. stfpdux f28, SP, r0
  127. stfpdux f29, SP, r0
  128. stfpdux f30, SP, r0
  129. stfpdux f31, SP, r0
  130. stwu r31, -4(SP)
  131. stwu r30, -4(SP)
  132. stwu r29, -4(SP)
  133. stwu r28, -4(SP)
  134. stwu r27, -4(SP)
  135. stwu r26, -4(SP)
  136. stwu r25, -4(SP)
  137. stwu r24, -4(SP)
  138. stwu r23, -4(SP)
  139. stwu r22, -4(SP)
  140. stwu r21, -4(SP)
  141. stwu r20, -4(SP)
  142. stwu r19, -4(SP)
  143. stwu r18, -4(SP)
  144. stwu r17, -4(SP)
  145. stwu r16, -4(SP)
  146. stwu r15, -4(SP)
  147. stwu r14, -4(SP)
  148. li r0, 0
  149. stwu r0, -4(SP)
  150. stwu r0, -4(SP)
  151. stfdu f2, -8(SP)
  152. stfdu f1, -8(SP)
  153. slwi LDC, LDC, ZBASE_SHIFT
  154. cmpwi cr0, M, 0
  155. ble .L999
  156. cmpwi cr0, N, 0
  157. ble .L999
  158. cmpwi cr0, K, 0
  159. ble .L999
  160. li INC, 1 * SIZE
  161. li INC2, 2 * SIZE
  162. li INC4, 4 * SIZE
  163. li INCM1, -1 * SIZE
  164. li INCM3, -3 * SIZE
  165. li INCM5, -5 * SIZE
  166. li INCM7, -7 * SIZE
  167. addi C, C, - 1 * SIZE
  168. #ifdef LN
  169. mullw r0, M, K
  170. slwi r0, r0, ZBASE_SHIFT
  171. add A, A, r0
  172. slwi r0, M, ZBASE_SHIFT
  173. add C, C, r0
  174. #endif
  175. #ifdef RN
  176. neg KK, OFFSET
  177. #endif
  178. #ifdef RT
  179. mullw r0, N, K
  180. slwi r0, r0, ZBASE_SHIFT
  181. add B, B, r0
  182. mullw r0, N, LDC
  183. add C, C, r0
  184. sub KK, N, OFFSET
  185. #endif
  186. andi. J, N, 1
  187. beq .L50
  188. #ifdef RT
  189. slwi r0, K, 0 + ZBASE_SHIFT
  190. sub B, B, r0
  191. sub C, C, LDC
  192. #endif
  193. mr CO1, C
  194. #ifdef LN
  195. add KK, M, OFFSET
  196. #endif
  197. #ifdef LT
  198. mr KK, OFFSET
  199. #endif
  200. #if defined(LN) || defined(RT)
  201. addi AORIG, A, -2 * SIZE
  202. #else
  203. addi AO, A, -2 * SIZE
  204. #endif
  205. #ifndef RT
  206. add C, CO2, LDC
  207. #endif
  208. li r0, FZERO
  209. lfpsx f0, SP, r0
  210. srawi. I, M, 2
  211. ble .L60
  212. .align 4
  213. .L51:
  214. #if defined(LT) || defined(RN)
  215. fpmr f4, f0
  216. addi BO, B, - 2 * SIZE
  217. fpmr f1, f0
  218. fpmr f5, f0
  219. fpmr f2, f0
  220. fpmr f6, f0
  221. fpmr f3, f0
  222. fpmr f7, f0
  223. srawi. r0, KK, 2
  224. mtspr CTR, r0
  225. ble .L54
  226. #else
  227. #ifdef LN
  228. slwi r0, K, 2 + ZBASE_SHIFT
  229. sub AORIG, AORIG, r0
  230. #endif
  231. slwi r0 , KK, 2 + ZBASE_SHIFT
  232. slwi TEMP, KK, 0 + ZBASE_SHIFT
  233. add AO, AORIG, r0
  234. add BO, B, TEMP
  235. sub TEMP, K, KK
  236. fpmr f4, f0
  237. addi BO, BO, - 2 * SIZE
  238. fpmr f1, f0
  239. fpmr f5, f0
  240. fpmr f2, f0
  241. fpmr f6, f0
  242. fpmr f3, f0
  243. fpmr f7, f0
  244. srawi. r0, TEMP, 2
  245. mtspr CTR, r0
  246. ble .L54
  247. #endif
  248. LFPDUX B1, BO, INC2
  249. LFPDUX A1, AO, INC2
  250. LFPDUX A2, AO, INC2
  251. LFPDUX B2, BO, INC2
  252. LFPDUX A3, AO, INC2
  253. LFPDUX A4, AO, INC2
  254. LFPDUX B3, BO, INC2
  255. LFPDUX A5, AO, INC2
  256. LFPDUX A6, AO, INC2
  257. LFPDUX A7, AO, INC2
  258. LFPDUX A8, AO, INC2
  259. bdz- .L53
  260. .align 4
  261. .L52:
  262. FXCPMADD f0, B1, A1, f0
  263. LFPDUX B4, BO, INC2
  264. FXCSMADD f4, B1, A1, f4
  265. LFPDUX A1, AO, INC2
  266. FXCPMADD f1, B1, A2, f1
  267. nop
  268. FXCSMADD f5, B1, A2, f5
  269. LFPDUX A2, AO, INC2
  270. FXCPMADD f2, B1, A3, f2
  271. nop
  272. FXCSMADD f6, B1, A3, f6
  273. LFPDUX A3, AO, INC2
  274. FXCPMADD f3, B1, A4, f3
  275. nop
  276. FXCSMADD f7, B1, A4, f7
  277. LFPDUX A4, AO, INC2
  278. FXCPMADD f0, B2, A5, f0
  279. LFPDUX B1, BO, INC2
  280. FXCSMADD f4, B2, A5, f4
  281. LFPDUX A5, AO, INC2
  282. FXCPMADD f1, B2, A6, f1
  283. nop
  284. FXCSMADD f5, B2, A6, f5
  285. LFPDUX A6, AO, INC2
  286. FXCPMADD f2, B2, A7, f2
  287. nop
  288. FXCSMADD f6, B2, A7, f6
  289. LFPDUX A7, AO, INC2
  290. FXCPMADD f3, B2, A8, f3
  291. nop
  292. FXCSMADD f7, B2, A8, f7
  293. LFPDUX A8, AO, INC2
  294. FXCPMADD f0, B3, A1, f0
  295. LFPDUX B2, BO, INC2
  296. FXCSMADD f4, B3, A1, f4
  297. LFPDUX A1, AO, INC2
  298. FXCPMADD f1, B3, A2, f1
  299. nop
  300. FXCSMADD f5, B3, A2, f5
  301. LFPDUX A2, AO, INC2
  302. FXCPMADD f2, B3, A3, f2
  303. nop
  304. FXCSMADD f6, B3, A3, f6
  305. LFPDUX A3, AO, INC2
  306. FXCPMADD f3, B3, A4, f3
  307. nop
  308. FXCSMADD f7, B3, A4, f7
  309. LFPDUX A4, AO, INC2
  310. FXCPMADD f0, B4, A5, f0
  311. LFPDUX B3, BO, INC2
  312. FXCSMADD f4, B4, A5, f4
  313. LFPDUX A5, AO, INC2
  314. FXCPMADD f1, B4, A6, f1
  315. nop
  316. FXCSMADD f5, B4, A6, f5
  317. LFPDUX A6, AO, INC2
  318. FXCPMADD f2, B4, A7, f2
  319. nop
  320. FXCSMADD f6, B4, A7, f6
  321. LFPDUX A7, AO, INC2
  322. FXCPMADD f3, B4, A8, f3
  323. nop
  324. FXCSMADD f7, B4, A8, f7
  325. LFPDUX A8, AO, INC2
  326. bdnz+ .L52
  327. .align 4
  328. .L53:
  329. FXCPMADD f0, B1, A1, f0
  330. LFPDUX B4, BO, INC2
  331. FXCSMADD f4, B1, A1, f4
  332. LFPDUX A1, AO, INC2
  333. FXCPMADD f1, B1, A2, f1
  334. nop
  335. FXCSMADD f5, B1, A2, f5
  336. LFPDUX A2, AO, INC2
  337. FXCPMADD f2, B1, A3, f2
  338. nop
  339. FXCSMADD f6, B1, A3, f6
  340. LFPDUX A3, AO, INC2
  341. FXCPMADD f3, B1, A4, f3
  342. nop
  343. FXCSMADD f7, B1, A4, f7
  344. LFPDUX A4, AO, INC2
  345. FXCPMADD f0, B2, A5, f0
  346. nop
  347. FXCSMADD f4, B2, A5, f4
  348. LFPDUX A5, AO, INC2
  349. FXCPMADD f1, B2, A6, f1
  350. nop
  351. FXCSMADD f5, B2, A6, f5
  352. LFPDUX A6, AO, INC2
  353. FXCPMADD f2, B2, A7, f2
  354. nop
  355. FXCSMADD f6, B2, A7, f6
  356. LFPDUX A7, AO, INC2
  357. FXCPMADD f3, B2, A8, f3
  358. nop
  359. FXCSMADD f7, B2, A8, f7
  360. LFPDUX A8, AO, INC2
  361. FXCPMADD f0, B3, A1, f0
  362. FXCSMADD f4, B3, A1, f4
  363. FXCPMADD f1, B3, A2, f1
  364. FXCSMADD f5, B3, A2, f5
  365. FXCPMADD f2, B3, A3, f2
  366. FXCSMADD f6, B3, A3, f6
  367. FXCPMADD f3, B3, A4, f3
  368. FXCSMADD f7, B3, A4, f7
  369. FXCPMADD f0, B4, A5, f0
  370. FXCSMADD f4, B4, A5, f4
  371. FXCPMADD f1, B4, A6, f1
  372. FXCSMADD f5, B4, A6, f5
  373. FXCPMADD f2, B4, A7, f2
  374. FXCSMADD f6, B4, A7, f6
  375. FXCPMADD f3, B4, A8, f3
  376. FXCSMADD f7, B4, A8, f7
  377. .align 4
  378. .L54:
  379. #if defined(LT) || defined(RN)
  380. andi. r0, KK, 3
  381. mtspr CTR, r0
  382. ble+ .L58
  383. #else
  384. andi. r0, TEMP, 3
  385. mtspr CTR, r0
  386. ble+ .L58
  387. #endif
  388. LFPDUX A1, AO, INC2
  389. LFPDUX B1, BO, INC2
  390. LFPDUX A2, AO, INC2
  391. LFPDUX A3, AO, INC2
  392. LFPDUX A4, AO, INC2
  393. bdz- .L57
  394. .align 4
  395. .L56:
  396. FXCPMADD f0, B1, A1, f0
  397. FXCSMADD f4, B1, A1, f4
  398. LFPDUX A1, AO, INC2
  399. FXCPMADD f1, B1, A2, f1
  400. FXCSMADD f5, B1, A2, f5
  401. LFPDUX A2, AO, INC2
  402. FXCPMADD f2, B1, A3, f2
  403. FXCSMADD f6, B1, A3, f6
  404. LFPDUX A3, AO, INC2
  405. FXCPMADD f3, B1, A4, f3
  406. FXCSMADD f7, B1, A4, f7
  407. LFPDUX A4, AO, INC2
  408. LFPDUX B1, BO, INC2
  409. bdnz+ .L56
  410. .align 4
  411. .L57:
  412. FXCPMADD f0, B1, A1, f0
  413. FXCSMADD f4, B1, A1, f4
  414. FXCPMADD f1, B1, A2, f1
  415. FXCSMADD f5, B1, A2, f5
  416. FXCPMADD f2, B1, A3, f2
  417. FXCSMADD f6, B1, A3, f6
  418. FXCPMADD f3, B1, A4, f3
  419. FXCSMADD f7, B1, A4, f7
  420. .align 4
  421. .L58:
  422. fpadd f0, f0, f4
  423. fpadd f1, f1, f5
  424. fpadd f2, f2, f6
  425. fpadd f3, f3, f7
  426. #if defined(LN) || defined(RT)
  427. #ifdef LN
  428. subi r0, KK, 4
  429. #else
  430. subi r0, KK, 1
  431. #endif
  432. slwi TEMP, r0, 2 + ZBASE_SHIFT
  433. slwi r0, r0, 0 + ZBASE_SHIFT
  434. add AO, AORIG, TEMP
  435. add BO, B, r0
  436. addi BO, BO, - 2 * SIZE
  437. #endif
  438. #if defined(LN) || defined(LT)
  439. LFPDUX f16, BO, INC2
  440. LFPDUX f17, BO, INC2
  441. LFPDUX f18, BO, INC2
  442. LFPDUX f19, BO, INC2
  443. subi BO, BO, 8 * SIZE
  444. #else
  445. LFPDUX f16, AO, INC2
  446. LFPDUX f17, AO, INC2
  447. LFPDUX f18, AO, INC2
  448. LFPDUX f19, AO, INC2
  449. subi AO, AO, 8 * SIZE
  450. #endif
  451. fpsub f0, f16, f0
  452. fpsub f1, f17, f1
  453. fpsub f2, f18, f2
  454. fpsub f3, f19, f3
  455. #ifdef LN
  456. LFPDUX A1, AO, INC2
  457. add AO, AO, INC2
  458. add AO, AO, INC2
  459. add AO, AO, INC2
  460. LFPDUX A2, AO, INC2
  461. LFPDUX A3, AO, INC2
  462. add AO, AO, INC2
  463. add AO, AO, INC2
  464. LFPDUX A4, AO, INC2
  465. LFPDUX A5, AO, INC2
  466. LFPDUX A6, AO, INC2
  467. add AO, AO, INC2
  468. LFPDUX A7, AO, INC2
  469. LFPDUX A8, AO, INC2
  470. LFPDUX A9, AO, INC2
  471. LFPDUX A10, AO, INC2
  472. subi AO, AO, 32 * SIZE
  473. fxpmul f4, A10, f3
  474. FXCXNPMA f3, A10, f3, f4
  475. fxcpnmsub f2, A9, f3, f2
  476. FXCXNSMA f2, A9, f3, f2
  477. fxcpnmsub f1, A8, f3, f1
  478. FXCXNSMA f1, A8, f3, f1
  479. fxcpnmsub f0, A7, f3, f0
  480. FXCXNSMA f0, A7, f3, f0
  481. fxpmul f4, A6, f2
  482. FXCXNPMA f2, A6, f2, f4
  483. fxcpnmsub f1, A5, f2, f1
  484. FXCXNSMA f1, A5, f2, f1
  485. fxcpnmsub f0, A4, f2, f0
  486. FXCXNSMA f0, A4, f2, f0
  487. fxpmul f4, A3, f1
  488. FXCXNPMA f1, A3, f1, f4
  489. fxcpnmsub f0, A2, f1, f0
  490. FXCXNSMA f0, A2, f1, f0
  491. fxpmul f4, A1, f0
  492. FXCXNPMA f0, A1, f0, f4
  493. #endif
  494. #ifdef LT
  495. LFPDUX A1, AO, INC2
  496. LFPDUX A2, AO, INC2
  497. LFPDUX A3, AO, INC2
  498. LFPDUX A4, AO, INC2
  499. add AO, AO, INC2
  500. LFPDUX A5, AO, INC2
  501. LFPDUX A6, AO, INC2
  502. LFPDUX A7, AO, INC2
  503. add AO, AO, INC2
  504. add AO, AO, INC2
  505. LFPDUX A8, AO, INC2
  506. LFPDUX A9, AO, INC2
  507. add AO, AO, INC2
  508. add AO, AO, INC2
  509. add AO, AO, INC2
  510. LFPDUX A10, AO, INC2
  511. subi AO, AO, 32 * SIZE
  512. fxpmul f4, A1, f0
  513. FXCXNPMA f0, A1, f0, f4
  514. fxcpnmsub f1, A2, f0, f1
  515. FXCXNSMA f1, A2, f0, f1
  516. fxcpnmsub f2, A3, f0, f2
  517. FXCXNSMA f2, A3, f0, f2
  518. fxcpnmsub f3, A4, f0, f3
  519. FXCXNSMA f3, A4, f0, f3
  520. fxpmul f6, A5, f1
  521. FXCXNPMA f1, A5, f1, f6
  522. fxcpnmsub f2, A6, f1, f2
  523. FXCXNSMA f2, A6, f1, f2
  524. fxcpnmsub f3, A7, f1, f3
  525. FXCXNSMA f3, A7, f1, f3
  526. fxpmul f4, A8, f2
  527. FXCXNPMA f2, A8, f2, f4
  528. fxcpnmsub f3, A9, f2, f3
  529. FXCXNSMA f3, A9, f2, f3
  530. fxpmul f6, A10, f3
  531. FXCXNPMA f3, A10, f3, f6
  532. #endif
  533. #ifdef RN
  534. LFPDX A1, BO, INC2
  535. fxpmul f4, A1, f0
  536. fxpmul f5, A1, f1
  537. fxpmul f6, A1, f2
  538. fxpmul f7, A1, f3
  539. FXCXNPMA f0, A1, f0, f4
  540. FXCXNPMA f1, A1, f1, f5
  541. FXCXNPMA f2, A1, f2, f6
  542. FXCXNPMA f3, A1, f3, f7
  543. #endif
  544. #ifdef RT
  545. LFPDX A1, BO, INC2
  546. fxpmul f4, A1, f0
  547. fxpmul f5, A1, f1
  548. fxpmul f6, A1, f2
  549. fxpmul f7, A1, f3
  550. FXCXNPMA f0, A1, f0, f4
  551. FXCXNPMA f1, A1, f1, f5
  552. FXCXNPMA f2, A1, f2, f6
  553. FXCXNPMA f3, A1, f3, f7
  554. #endif
  555. #ifdef LN
  556. subi CO1, CO1, 8 * SIZE
  557. #endif
  558. #if defined(LN) || defined(LT)
  559. STFPDUX f0, BO, INC2
  560. STFPDUX f1, BO, INC2
  561. STFPDUX f2, BO, INC2
  562. STFPDUX f3, BO, INC2
  563. subi BO, BO, 8 * SIZE
  564. #else
  565. STFPDUX f0, AO, INC2
  566. STFPDUX f1, AO, INC2
  567. STFPDUX f2, AO, INC2
  568. STFPDUX f3, AO, INC2
  569. subi AO, AO, 8 * SIZE
  570. #endif
  571. STFDUX f0, CO1, INC
  572. STFSDUX f0, CO1, INC
  573. STFDUX f1, CO1, INC
  574. STFSDUX f1, CO1, INC
  575. STFDUX f2, CO1, INC
  576. STFSDUX f2, CO1, INC
  577. STFDUX f3, CO1, INC
  578. STFSDUX f3, CO1, INC
  579. #ifdef LN
  580. subi CO1, CO1, 8 * SIZE
  581. #endif
  582. #ifdef RT
  583. slwi r0, K, 2 + ZBASE_SHIFT
  584. add AORIG, AORIG, r0
  585. #endif
  586. #if defined(LT) || defined(RN)
  587. sub TEMP, K, KK
  588. slwi r0, TEMP, 2 + ZBASE_SHIFT
  589. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  590. add AO, AO, r0
  591. add BO, BO, TEMP
  592. #endif
  593. #ifdef LT
  594. addi KK, KK, 4
  595. #endif
  596. #ifdef LN
  597. subi KK, KK, 4
  598. #endif
  599. addic. I, I, -1
  600. li r0, FZERO
  601. lfpsx f0, SP, r0
  602. bgt+ .L51
  603. .align 4
  604. .L60:
  605. andi. I, M, 2
  606. beq .L70
  607. #if defined(LT) || defined(RN)
  608. fpmr f1, f0
  609. addi BO, B, - 2 * SIZE
  610. fpmr f2, f0
  611. fpmr f3, f0
  612. srawi. r0, KK, 2
  613. mtspr CTR, r0
  614. ble .L64
  615. #else
  616. #ifdef LN
  617. slwi r0, K, 1 + ZBASE_SHIFT
  618. sub AORIG, AORIG, r0
  619. #endif
  620. slwi r0 , KK, 1 + ZBASE_SHIFT
  621. slwi TEMP, KK, 0 + ZBASE_SHIFT
  622. add AO, AORIG, r0
  623. add BO, B, TEMP
  624. sub TEMP, K, KK
  625. fpmr f1, f0
  626. addi BO, BO, - 2 * SIZE
  627. fpmr f2, f0
  628. fpmr f3, f0
  629. srawi. r0, TEMP, 2
  630. mtspr CTR, r0
  631. ble .L64
  632. #endif
  633. LFPDUX B1, BO, INC2
  634. LFPDUX A1, AO, INC2
  635. LFPDUX A2, AO, INC2
  636. LFPDUX B2, BO, INC2
  637. LFPDUX A3, AO, INC2
  638. LFPDUX A4, AO, INC2
  639. LFPDUX B3, BO, INC2
  640. LFPDUX A5, AO, INC2
  641. LFPDUX A6, AO, INC2
  642. LFPDUX B4, BO, INC2
  643. LFPDUX A7, AO, INC2
  644. LFPDUX A8, AO, INC2
  645. bdz- .L63
  646. .align 4
  647. .L62:
  648. FXCPMADD f0, B1, A1, f0
  649. FXCSMADD f2, B1, A1, f2
  650. LFPDUX A1, AO, INC2
  651. FXCPMADD f1, B1, A2, f1
  652. FXCSMADD f3, B1, A2, f3
  653. LFPDUX A2, AO, INC2
  654. LFPDUX B1, BO, INC2
  655. FXCPMADD f0, B2, A3, f0
  656. FXCSMADD f2, B2, A3, f2
  657. LFPDUX A3, AO, INC2
  658. FXCPMADD f1, B2, A4, f1
  659. FXCSMADD f3, B2, A4, f3
  660. LFPDUX A4, AO, INC2
  661. LFPDUX B2, BO, INC2
  662. FXCPMADD f0, B3, A5, f0
  663. FXCSMADD f2, B3, A5, f2
  664. LFPDUX A5, AO, INC2
  665. FXCPMADD f1, B3, A6, f1
  666. FXCSMADD f3, B3, A6, f3
  667. LFPDUX A6, AO, INC2
  668. LFPDUX B3, BO, INC2
  669. FXCPMADD f0, B4, A7, f0
  670. FXCSMADD f2, B4, A7, f2
  671. LFPDUX A7, AO, INC2
  672. FXCPMADD f1, B4, A8, f1
  673. FXCSMADD f3, B4, A8, f3
  674. LFPDUX A8, AO, INC2
  675. LFPDUX B4, BO, INC2
  676. bdnz+ .L62
  677. .align 4
  678. .L63:
  679. FXCPMADD f0, B1, A1, f0
  680. FXCSMADD f2, B1, A1, f2
  681. FXCPMADD f1, B1, A2, f1
  682. FXCSMADD f3, B1, A2, f3
  683. FXCPMADD f0, B2, A3, f0
  684. FXCSMADD f2, B2, A3, f2
  685. FXCPMADD f1, B2, A4, f1
  686. FXCSMADD f3, B2, A4, f3
  687. FXCPMADD f0, B3, A5, f0
  688. FXCSMADD f2, B3, A5, f2
  689. FXCPMADD f1, B3, A6, f1
  690. FXCSMADD f3, B3, A6, f3
  691. FXCPMADD f0, B4, A7, f0
  692. FXCSMADD f2, B4, A7, f2
  693. FXCPMADD f1, B4, A8, f1
  694. FXCSMADD f3, B4, A8, f3
  695. .align 4
  696. .L64:
  697. #if defined(LT) || defined(RN)
  698. andi. r0, KK, 3
  699. mtspr CTR, r0
  700. ble+ .L68
  701. #else
  702. andi. r0, TEMP, 3
  703. mtspr CTR, r0
  704. ble+ .L68
  705. #endif
  706. LFPDUX A1, AO, INC2
  707. LFPDUX B1, BO, INC2
  708. LFPDUX A2, AO, INC2
  709. bdz- .L67
  710. .align 4
  711. .L66:
  712. FXCPMADD f0, B1, A1, f0
  713. FXCSMADD f2, B1, A1, f2
  714. LFPDUX A1, AO, INC2
  715. FXCPMADD f1, B1, A2, f1
  716. FXCSMADD f3, B1, A2, f3
  717. LFPDUX B1, BO, INC2
  718. LFPDUX A2, AO, INC2
  719. bdnz+ .L66
  720. .align 4
  721. .L67:
  722. FXCPMADD f0, B1, A1, f0
  723. FXCSMADD f2, B1, A1, f2
  724. FXCPMADD f1, B1, A2, f1
  725. FXCSMADD f3, B1, A2, f3
  726. .align 4
  727. .L68:
  728. fpadd f0, f0, f2
  729. fpadd f1, f1, f3
  730. #if defined(LN) || defined(RT)
  731. #ifdef LN
  732. subi r0, KK, 2
  733. #else
  734. subi r0, KK, 1
  735. #endif
  736. slwi TEMP, r0, 1 + ZBASE_SHIFT
  737. slwi r0, r0, 0 + ZBASE_SHIFT
  738. add AO, AORIG, TEMP
  739. add BO, B, r0
  740. addi BO, BO, - 2 * SIZE
  741. #endif
  742. #if defined(LN) || defined(LT)
  743. LFPDUX f16, BO, INC2
  744. LFPDUX f17, BO, INC2
  745. subi BO, BO, 4 * SIZE
  746. #else
  747. LFPDUX f16, AO, INC2
  748. LFPDUX f17, AO, INC2
  749. subi AO, AO, 4 * SIZE
  750. #endif
  751. fpsub f0, f16, f0
  752. fpsub f1, f17, f1
  753. #ifdef LN
  754. LFPDUX A1, AO, INC2
  755. add AO, AO, INC2
  756. LFPDUX A2, AO, INC2
  757. LFPDUX A3, AO, INC2
  758. subi AO, AO, 8 * SIZE
  759. fxpmul f4, A3, f1
  760. FXCXNPMA f1, A3, f1, f4
  761. fxcpnmsub f0, A2, f1, f0
  762. FXCXNSMA f0, A2, f1, f0
  763. fxpmul f4, A1, f0
  764. FXCXNPMA f0, A1, f0, f4
  765. #endif
  766. #ifdef LT
  767. LFPDUX A1, AO, INC2
  768. LFPDUX A2, AO, INC2
  769. add AO, AO, INC2
  770. LFPDUX A3, AO, INC2
  771. subi AO, AO, 8 * SIZE
  772. fxpmul f4, A1, f0
  773. FXCXNPMA f0, A1, f0, f4
  774. fxcpnmsub f1, A2, f0, f1
  775. FXCXNSMA f1, A2, f0, f1
  776. fxpmul f6, A3, f1
  777. FXCXNPMA f1, A3, f1, f6
  778. #endif
  779. #ifdef RN
  780. LFPDX A1, BO, INC2
  781. fxpmul f4, A1, f0
  782. fxpmul f5, A1, f1
  783. FXCXNPMA f0, A1, f0, f4
  784. FXCXNPMA f1, A1, f1, f5
  785. #endif
  786. #ifdef RT
  787. LFPDX A1, BO, INC2
  788. fxpmul f4, A1, f0
  789. fxpmul f5, A1, f1
  790. FXCXNPMA f0, A1, f0, f4
  791. FXCXNPMA f1, A1, f1, f5
  792. #endif
  793. #ifdef LN
  794. subi CO1, CO1, 4 * SIZE
  795. #endif
  796. #if defined(LN) || defined(LT)
  797. STFPDUX f0, BO, INC2
  798. STFPDUX f1, BO, INC2
  799. subi BO, BO, 4 * SIZE
  800. #else
  801. STFPDUX f0, AO, INC2
  802. STFPDUX f1, AO, INC2
  803. subi AO, AO, 4 * SIZE
  804. #endif
  805. STFDUX f0, CO1, INC
  806. STFSDUX f0, CO1, INC
  807. STFDUX f1, CO1, INC
  808. STFSDUX f1, CO1, INC
  809. #ifdef LN
  810. subi CO1, CO1, 4 * SIZE
  811. #endif
  812. #ifdef RT
  813. slwi r0, K, 1 + ZBASE_SHIFT
  814. add AORIG, AORIG, r0
  815. #endif
  816. #if defined(LT) || defined(RN)
  817. sub TEMP, K, KK
  818. slwi r0, TEMP, 1 + ZBASE_SHIFT
  819. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  820. add AO, AO, r0
  821. add BO, BO, TEMP
  822. #endif
  823. #ifdef LT
  824. addi KK, KK, 2
  825. #endif
  826. #ifdef LN
  827. subi KK, KK, 2
  828. #endif
  829. li r0, FZERO
  830. lfpsx f0, SP, r0
  831. .align 4
  832. .L70:
  833. andi. I, M, 1
  834. beq .L89
  835. #if defined(LT) || defined(RN)
  836. addi BO, B, - 2 * SIZE
  837. fpmr f1, f0
  838. fpmr f2, f0
  839. fpmr f3, f0
  840. srawi. r0, KK, 3
  841. mtspr CTR, r0
  842. ble .L74
  843. #else
  844. #ifdef LN
  845. slwi r0, K, 0 + ZBASE_SHIFT
  846. sub AORIG, AORIG, r0
  847. #endif
  848. slwi TEMP, KK, 0 + ZBASE_SHIFT
  849. add AO, AORIG, TEMP
  850. add BO, B, TEMP
  851. sub TEMP, K, KK
  852. addi BO, BO, - 2 * SIZE
  853. fpmr f1, f0
  854. fpmr f2, f0
  855. fpmr f3, f0
  856. srawi. r0, TEMP, 3
  857. mtspr CTR, r0
  858. ble .L74
  859. #endif
  860. LFPDUX A1, AO, INC2
  861. LFPDUX B1, BO, INC2
  862. LFPDUX A2, AO, INC2
  863. LFPDUX B2, BO, INC2
  864. LFPDUX A3, AO, INC2
  865. LFPDUX B3, BO, INC2
  866. LFPDUX A4, AO, INC2
  867. LFPDUX B4, BO, INC2
  868. LFPDUX A5, AO, INC2
  869. LFPDUX B5, BO, INC2
  870. LFPDUX A6, AO, INC2
  871. LFPDUX B6, BO, INC2
  872. LFPDUX A7, AO, INC2
  873. LFPDUX A9, BO, INC2
  874. LFPDUX A8, AO, INC2
  875. LFPDUX A10, BO, INC2
  876. bdz- .L73
  877. .align 4
  878. .L72:
  879. FXCPMADD f0, B1, A1, f0
  880. FXCSMADD f1, B1, A1, f1
  881. LFPDUX A1, AO, INC2
  882. LFPDUX B1, BO, INC2
  883. FXCPMADD f2, B2, A2, f2
  884. FXCSMADD f3, B2, A2, f3
  885. LFPDUX A2, AO, INC2
  886. LFPDUX B2, BO, INC2
  887. FXCPMADD f0, B3, A3, f0
  888. FXCSMADD f1, B3, A3, f1
  889. LFPDUX A3, AO, INC2
  890. LFPDUX B3, BO, INC2
  891. FXCPMADD f2, B4, A4, f2
  892. FXCSMADD f3, B4, A4, f3
  893. LFPDUX A4, AO, INC2
  894. LFPDUX B4, BO, INC2
  895. FXCPMADD f0, B5, A5, f0
  896. FXCSMADD f1, B5, A5, f1
  897. LFPDUX A5, AO, INC2
  898. LFPDUX B5, BO, INC2
  899. FXCPMADD f2, B6, A6, f2
  900. FXCSMADD f3, B6, A6, f3
  901. LFPDUX A6, AO, INC2
  902. LFPDUX B6, BO, INC2
  903. FXCPMADD f0, A9, A7, f0
  904. FXCSMADD f1, A9, A7, f1
  905. LFPDUX A7, AO, INC2
  906. LFPDUX A9, BO, INC2
  907. FXCPMADD f2, A10, A8, f2
  908. FXCSMADD f3, A10, A8, f3
  909. LFPDUX A8, AO, INC2
  910. LFPDUX A10, BO, INC2
  911. bdnz+ .L72
  912. .align 4
  913. .L73:
  914. FXCPMADD f0, B1, A1, f0
  915. FXCSMADD f1, B1, A1, f1
  916. FXCPMADD f2, B2, A2, f2
  917. FXCSMADD f3, B2, A2, f3
  918. FXCPMADD f0, B3, A3, f0
  919. FXCSMADD f1, B3, A3, f1
  920. FXCPMADD f2, B4, A4, f2
  921. FXCSMADD f3, B4, A4, f3
  922. FXCPMADD f0, B5, A5, f0
  923. FXCSMADD f1, B5, A5, f1
  924. FXCPMADD f2, B6, A6, f2
  925. FXCSMADD f3, B6, A6, f3
  926. FXCPMADD f0, A9, A7, f0
  927. FXCSMADD f1, A9, A7, f1
  928. FXCPMADD f2, A10, A8, f2
  929. FXCSMADD f3, A10, A8, f3
  930. .align 4
  931. .L74:
  932. #if defined(LT) || defined(RN)
  933. andi. r0, KK, 7
  934. mtspr CTR, r0
  935. ble+ .L78
  936. #else
  937. andi. r0, TEMP, 7
  938. mtspr CTR, r0
  939. ble+ .L78
  940. #endif
  941. LFPDUX A1, AO, INC2
  942. LFPDUX B1, BO, INC2
  943. bdz- .L77
  944. .align 4
  945. .L76:
  946. FXCPMADD f0, B1, A1, f0
  947. FXCSMADD f1, B1, A1, f1
  948. LFPDUX A1, AO, INC2
  949. LFPDUX B1, BO, INC2
  950. bdnz+ .L76
  951. .align 4
  952. .L77:
  953. FXCPMADD f0, B1, A1, f0
  954. FXCSMADD f1, B1, A1, f1
  955. .align 4
  956. .L78:
  957. fpadd f0, f0, f2
  958. fpadd f1, f1, f3
  959. fpadd f0, f0, f1
  960. #if defined(LN) || defined(RT)
  961. #ifdef LN
  962. subi r0, KK, 1
  963. #else
  964. subi r0, KK, 1
  965. #endif
  966. slwi TEMP, r0, 0 + ZBASE_SHIFT
  967. add AO, AORIG, TEMP
  968. add BO, B, TEMP
  969. addi BO, BO, - 2 * SIZE
  970. #endif
  971. #if defined(LN) || defined(LT)
  972. LFPDX f16, BO, INC2
  973. #else
  974. LFPDX f16, AO, INC2
  975. #endif
  976. fpsub f0, f16, f0
  977. #ifdef LN
  978. LFPDX A1, AO, INC2
  979. fxpmul f4, A1, f0
  980. FXCXNPMA f0, A1, f0, f4
  981. #endif
  982. #ifdef LT
  983. LFPDX A1, AO, INC2
  984. fxpmul f4, A1, f0
  985. FXCXNPMA f0, A1, f0, f4
  986. #endif
  987. #ifdef RN
  988. LFPDX A1, BO, INC2
  989. fxpmul f4, A1, f0
  990. FXCXNPMA f0, A1, f0, f4
  991. #endif
  992. #ifdef RT
  993. LFPDX A1, BO, INC2
  994. fxpmul f4, A1, f0
  995. FXCXNPMA f0, A1, f0, f4
  996. #endif
  997. #ifdef LN
  998. subi CO1, CO1, 2 * SIZE
  999. #endif
  1000. #if defined(LN) || defined(LT)
  1001. STFPDX f0, BO, INC2
  1002. #else
  1003. STFPDX f0, AO, INC2
  1004. #endif
  1005. STFDUX f0, CO1, INC
  1006. STFSDUX f0, CO1, INC
  1007. #ifdef LN
  1008. subi CO1, CO1, 2 * SIZE
  1009. #endif
  1010. #ifdef RT
  1011. slwi r0, K, 0 + ZBASE_SHIFT
  1012. add AORIG, AORIG, r0
  1013. #endif
  1014. #if defined(LT) || defined(RN)
  1015. sub TEMP, K, KK
  1016. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1017. add AO, AO, TEMP
  1018. add BO, BO, TEMP
  1019. #endif
  1020. #ifdef LT
  1021. addi KK, KK, 1
  1022. #endif
  1023. #ifdef LN
  1024. subi KK, KK, 1
  1025. #endif
  1026. li r0, FZERO
  1027. lfpsx f0, SP, r0
  1028. .align 4
  1029. .L89:
  1030. #ifdef LN
  1031. slwi r0, K, 0 + ZBASE_SHIFT
  1032. add B, B, r0
  1033. #endif
  1034. #if defined(LT) || defined(RN)
  1035. addi B, BO, 2 * SIZE
  1036. #endif
  1037. #ifdef RN
  1038. addi KK, KK, 1
  1039. #endif
  1040. #ifdef RT
  1041. subi KK, KK, 1
  1042. #endif
  1043. .align 4
  1044. .L50:
  1045. srawi. J, N, 1
  1046. ble .L999
  1047. .align 4
  1048. .L10:
  1049. #ifdef RT
  1050. slwi r0, K, 1 + ZBASE_SHIFT
  1051. sub B, B, r0
  1052. slwi r0, LDC, 1
  1053. sub C, C, r0
  1054. #endif
  1055. mr CO1, C
  1056. add CO2, C, LDC
  1057. #ifdef LN
  1058. add KK, M, OFFSET
  1059. #endif
  1060. #ifdef LT
  1061. mr KK, OFFSET
  1062. #endif
  1063. #if defined(LN) || defined(RT)
  1064. addi AORIG, A, -4 * SIZE
  1065. #else
  1066. addi AO, A, -4 * SIZE
  1067. #endif
  1068. #ifndef RT
  1069. add C, CO2, LDC
  1070. #endif
  1071. li r0, FZERO
  1072. lfpsx f0, SP, r0
  1073. srawi. I, M, 2
  1074. ble .L20
  1075. .align 4
  1076. .L11:
  1077. #if defined(LT) || defined(RN)
  1078. addi AO2, AO, 2 * SIZE
  1079. fpmr f4, f0
  1080. addi BO, B, - 4 * SIZE
  1081. fpmr f8, f0
  1082. addi BO2, B, - 2 * SIZE
  1083. fpmr f12, f0
  1084. fpmr f5, f0
  1085. fpmr f9, f0
  1086. fpmr f13, f0
  1087. fpmr f2, f0
  1088. fpmr f6, f0
  1089. fpmr f10, f0
  1090. fpmr f14, f0
  1091. fpmr f3, f0
  1092. fpmr f7, f0
  1093. fpmr f11, f0
  1094. fpmr f15, f0
  1095. srawi. r0, KK, 2
  1096. fpmr f1, f0
  1097. mtspr CTR, r0
  1098. ble .L14
  1099. #else
  1100. #ifdef LN
  1101. slwi r0, K, 2 + ZBASE_SHIFT
  1102. sub AORIG, AORIG, r0
  1103. #endif
  1104. slwi r0 , KK, 2 + ZBASE_SHIFT
  1105. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1106. add AO, AORIG, r0
  1107. add BO, B, TEMP
  1108. sub TEMP, K, KK
  1109. fpmr f5, f0
  1110. fpmr f9, f0
  1111. fpmr f13, f0
  1112. fpmr f2, f0
  1113. fpmr f6, f0
  1114. fpmr f10, f0
  1115. fpmr f14, f0
  1116. fpmr f3, f0
  1117. fpmr f7, f0
  1118. fpmr f11, f0
  1119. fpmr f15, f0
  1120. addi AO2, AO, 2 * SIZE
  1121. fpmr f4, f0
  1122. addi BO, BO, - 4 * SIZE
  1123. fpmr f8, f0
  1124. addi BO2, BO, 2 * SIZE
  1125. fpmr f12, f0
  1126. srawi. r0, TEMP, 2
  1127. fpmr f1, f0
  1128. mtspr CTR, r0
  1129. ble .L14
  1130. #endif
  1131. LFPDUX A1, AO, INC4
  1132. fpmr f5, f0
  1133. LFPDUX A3, AO, INC4
  1134. fpmr f9, f0
  1135. LFPDUX B1, BO, INC4
  1136. fpmr f13, f0
  1137. LFPDUX A5, AO, INC4
  1138. fpmr f2, f0
  1139. LFPDUX A6, AO, INC4
  1140. fpmr f6, f0
  1141. LFPDUX B3, BO, INC4
  1142. fpmr f10, f0
  1143. LFPDUX A7, AO, INC4
  1144. fpmr f14, f0
  1145. LFPDUX A8, AO, INC4
  1146. fpmr f3, f0
  1147. LFPDUX B5, BO, INC4
  1148. fpmr f7, f0
  1149. LFPDUX A9, AO, INC4
  1150. fpmr f11, f0
  1151. LFPDUX A2, AO2, INC4
  1152. fpmr f15, f0
  1153. LFPDUX B2, BO2, INC4
  1154. bdz- .L13
  1155. .align 4
  1156. .L12:
  1157. ## 1 ##
  1158. FXCPMADD f0, B1, A1, f0
  1159. nop
  1160. FXCSMADD f4, B1, A1, f4
  1161. nop
  1162. FXCPMADD f8, B2, A1, f8
  1163. LFPDUX B4, BO2, INC4
  1164. FXCSMADD f12, B2, A1, f12
  1165. LFPDUX B6, BO, INC4
  1166. FXCPMADD f1, B1, A2, f1
  1167. nop
  1168. FXCSMADD f5, B1, A2, f5
  1169. LFPDUX A4, AO2, INC4
  1170. FXCPMADD f9, B2, A2, f9
  1171. LFPDUX A10, AO, INC4
  1172. FXCSMADD f13, B2, A2, f13
  1173. nop
  1174. FXCPMADD f2, B1, A3, f2
  1175. nop
  1176. FXCSMADD f6, B1, A3, f6
  1177. nop
  1178. FXCPMADD f10, B2, A3, f10
  1179. nop
  1180. FXCSMADD f14, B2, A3, f14
  1181. nop
  1182. FXCPMADD f3, B1, A4, f3
  1183. nop
  1184. FXCSMADD f7, B1, A4, f7
  1185. LFPDUX A2, AO2, INC4
  1186. FXCPMADD f11, B2, A4, f11
  1187. LFPDUX A1, AO, INC4
  1188. FXCSMADD f15, B2, A4, f15
  1189. nop
  1190. ## 2 ##
  1191. FXCPMADD f0, B3, A5, f0
  1192. nop
  1193. FXCSMADD f4, B3, A5, f4
  1194. nop
  1195. FXCPMADD f8, B4, A5, f8
  1196. LFPDUX B2, BO2, INC4
  1197. FXCSMADD f12, B4, A5, f12
  1198. LFPDUX B1, BO, INC4
  1199. FXCPMADD f1, B3, A2, f1
  1200. nop
  1201. FXCSMADD f5, B3, A2, f5
  1202. LFPDUX A4, AO2, INC4
  1203. FXCPMADD f9, B4, A2, f9
  1204. LFPDUX A3, AO, INC4
  1205. FXCSMADD f13, B4, A2, f13
  1206. nop
  1207. FXCPMADD f2, B3, A6, f2
  1208. nop
  1209. FXCSMADD f6, B3, A6, f6
  1210. nop
  1211. FXCPMADD f10, B4, A6, f10
  1212. nop
  1213. FXCSMADD f14, B4, A6, f14
  1214. nop
  1215. FXCPMADD f3, B3, A4, f3
  1216. nop
  1217. FXCSMADD f7, B3, A4, f7
  1218. LFPDUX A2, AO2, INC4
  1219. FXCPMADD f11, B4, A4, f11
  1220. LFPDUX A5, AO, INC4
  1221. FXCSMADD f15, B4, A4, f15
  1222. nop
  1223. ## 3 ##
  1224. FXCPMADD f0, B5, A7, f0
  1225. nop
  1226. FXCSMADD f4, B5, A7, f4
  1227. nop
  1228. FXCPMADD f8, B2, A7, f8
  1229. LFPDUX B4, BO2, INC4
  1230. FXCSMADD f12, B2, A7, f12
  1231. LFPDUX B3, BO, INC4
  1232. FXCPMADD f1, B5, A2, f1
  1233. nop
  1234. FXCSMADD f5, B5, A2, f5
  1235. LFPDUX A4, AO2, INC4
  1236. FXCPMADD f9, B2, A2, f9
  1237. LFPDUX A6, AO, INC4
  1238. FXCSMADD f13, B2, A2, f13
  1239. nop
  1240. FXCPMADD f2, B5, A8, f2
  1241. nop
  1242. FXCSMADD f6, B5, A8, f6
  1243. nop
  1244. FXCPMADD f10, B2, A8, f10
  1245. nop
  1246. FXCSMADD f14, B2, A8, f14
  1247. nop
  1248. FXCPMADD f3, B5, A4, f3
  1249. nop
  1250. FXCSMADD f7, B5, A4, f7
  1251. LFPDUX A2, AO2, INC4
  1252. FXCPMADD f11, B2, A4, f11
  1253. LFPDUX A7, AO, INC4
  1254. FXCSMADD f15, B2, A4, f15
  1255. nop
  1256. ## 4 ##
  1257. FXCPMADD f0, B6, A9, f0
  1258. nop
  1259. FXCSMADD f4, B6, A9, f4
  1260. nop
  1261. FXCPMADD f8, B4, A9, f8
  1262. LFPDUX B2, BO2, INC4
  1263. FXCSMADD f12, B4, A9, f12
  1264. LFPDUX B5, BO, INC4
  1265. FXCPMADD f1, B6, A2, f1
  1266. nop
  1267. FXCSMADD f5, B6, A2, f5
  1268. LFPDUX A4, AO2, INC4
  1269. FXCPMADD f9, B4, A2, f9
  1270. LFPDUX A8, AO, INC4
  1271. FXCSMADD f13, B4, A2, f13
  1272. nop
  1273. FXCPMADD f2, B6, A10, f2
  1274. nop
  1275. FXCSMADD f6, B6, A10, f6
  1276. nop
  1277. FXCPMADD f10, B4, A10, f10
  1278. nop
  1279. FXCSMADD f14, B4, A10, f14
  1280. nop
  1281. FXCPMADD f3, B6, A4, f3
  1282. LFPDUX A2, AO2, INC4
  1283. FXCSMADD f7, B6, A4, f7
  1284. LFPDUX A9, AO, INC4
  1285. FXCPMADD f11, B4, A4, f11
  1286. nop
  1287. FXCSMADD f15, B4, A4, f15
  1288. bdnz+ .L12
  1289. .align 4
  /* .L13: final pass of the 4-step unrolled K loop for the 4-row by 2-column tile. */
  /* Reached by fall-through once the bdnz at the end of .L12 finds CTR exhausted */
  /* (any other entry points lie outside this view). */
  /* Steps "## 1 ##" to "## 4 ##" accumulate into f0-f15; only the operands still */
  /* needed inside this pass are loaded, the remaining load slots hold nop. */
  /* FXCPMADD / FXCSMADD are macros defined outside this view. */
  1290. .L13:
  1291. ## 1 ##
  1292. FXCPMADD f0, B1, A1, f0
  1293. nop
  1294. FXCSMADD f4, B1, A1, f4
  1295. nop
  1296. FXCPMADD f8, B2, A1, f8
  1297. LFPDUX B4, BO2, INC4
  1298. FXCSMADD f12, B2, A1, f12
  1299. LFPDUX B6, BO, INC4
  1300. FXCPMADD f1, B1, A2, f1
  1301. nop
  1302. FXCSMADD f5, B1, A2, f5
  1303. LFPDUX A4, AO2, INC4
  1304. FXCPMADD f9, B2, A2, f9
  1305. LFPDUX A10, AO, INC4
  1306. FXCSMADD f13, B2, A2, f13
  1307. nop
  1308. FXCPMADD f2, B1, A3, f2
  1309. nop
  1310. FXCSMADD f6, B1, A3, f6
  1311. nop
  1312. FXCPMADD f10, B2, A3, f10
  1313. nop
  1314. FXCSMADD f14, B2, A3, f14
  1315. nop
  1316. FXCPMADD f3, B1, A4, f3
  1317. nop
  1318. FXCSMADD f7, B1, A4, f7
  1319. LFPDUX A2, AO2, INC4
  1320. FXCPMADD f11, B2, A4, f11
  1321. nop
  1322. FXCSMADD f15, B2, A4, f15
  1323. nop
  1324. ## 2 ##
  1325. FXCPMADD f0, B3, A5, f0
  1326. nop
  1327. FXCSMADD f4, B3, A5, f4
  1328. nop
  1329. FXCPMADD f8, B4, A5, f8
  1330. LFPDUX B2, BO2, INC4
  1331. FXCSMADD f12, B4, A5, f12
  1332. nop
  1333. FXCPMADD f1, B3, A2, f1
  1334. nop
  1335. FXCSMADD f5, B3, A2, f5
  1336. LFPDUX A4, AO2, INC4
  1337. FXCPMADD f9, B4, A2, f9
  1338. nop
  1339. FXCSMADD f13, B4, A2, f13
  1340. nop
  1341. FXCPMADD f2, B3, A6, f2
  1342. nop
  1343. FXCSMADD f6, B3, A6, f6
  1344. nop
  1345. FXCPMADD f10, B4, A6, f10
  1346. nop
  1347. FXCSMADD f14, B4, A6, f14
  1348. nop
  1349. FXCPMADD f3, B3, A4, f3
  1350. nop
  1351. FXCSMADD f7, B3, A4, f7
  1352. LFPDUX A2, AO2, INC4
  1353. FXCPMADD f11, B4, A4, f11
  1354. nop
  1355. FXCSMADD f15, B4, A4, f15
  1356. nop
  1357. ## 3 ##
  1358. FXCPMADD f0, B5, A7, f0
  1359. nop
  1360. FXCSMADD f4, B5, A7, f4
  1361. nop
  1362. FXCPMADD f8, B2, A7, f8
  1363. LFPDUX B4, BO2, INC4
  1364. FXCSMADD f12, B2, A7, f12
  1365. nop
  1366. FXCPMADD f1, B5, A2, f1
  1367. nop
  1368. FXCSMADD f5, B5, A2, f5
  1369. LFPDUX A4, AO2, INC4
  1370. FXCPMADD f9, B2, A2, f9
  1371. nop
  1372. FXCSMADD f13, B2, A2, f13
  1373. nop
  1374. FXCPMADD f2, B5, A8, f2
  1375. nop
  1376. FXCSMADD f6, B5, A8, f6
  1377. nop
  1378. FXCPMADD f10, B2, A8, f10
  1379. nop
  1380. FXCSMADD f14, B2, A8, f14
  1381. nop
  1382. FXCPMADD f3, B5, A4, f3
  1383. nop
  1384. FXCSMADD f7, B5, A4, f7
  1385. LFPDUX A2, AO2, INC4
  1386. FXCPMADD f11, B2, A4, f11
  1387. nop
  1388. FXCSMADD f15, B2, A4, f15
  1389. nop
  1390. ## 4 ##
  1391. FXCPMADD f0, B6, A9, f0
  1392. nop
  1393. FXCSMADD f4, B6, A9, f4
  1394. nop
  1395. FXCPMADD f8, B4, A9, f8
  1396. nop
  1397. FXCSMADD f12, B4, A9, f12
  1398. nop
  1399. FXCPMADD f1, B6, A2, f1
  1400. nop
  1401. FXCSMADD f5, B6, A2, f5
  1402. LFPDUX A4, AO2, INC4
  1403. FXCPMADD f9, B4, A2, f9
  1404. nop
  1405. FXCSMADD f13, B4, A2, f13
  1406. nop
  1407. FXCPMADD f2, B6, A10, f2
  1408. nop
  1409. FXCSMADD f6, B6, A10, f6
  1410. nop
  1411. FXCPMADD f10, B4, A10, f10
  1412. nop
  1413. FXCSMADD f14, B4, A10, f14
  1414. nop
  1415. FXCPMADD f3, B6, A4, f3
  1416. nop
  1417. FXCSMADD f7, B6, A4, f7
  1418. nop
  1419. FXCPMADD f11, B4, A4, f11
  1420. nop
  1421. FXCSMADD f15, B4, A4, f15
  1422. nop
  1423. .align 4
  /* .L14: K remainder for the 4-row by 2-column tile. */
  /* CTR receives the low two bits of the trip count (KK for LT/RN, TEMP otherwise). */
  /* The masked value is never negative, so "ble" is taken exactly when it is zero. */
  1424. .L14:
  1425. #if defined(LT) || defined(RN)
  1426. andi. r0, KK, 3
  1427. mtspr CTR, r0
  1428. ble+ .L18
  1429. #else
  1430. andi. r0, TEMP, 3
  1431. mtspr CTR, r0
  1432. ble+ .L18
  1433. #endif
  /* .L15: preload operands for the first step.  Note that A10 is filled from BO here, */
  /* i.e. it carries a B-side operand in this remainder loop. */
  1434. .L15:
  1435. LFPDUX A2, AO, INC4
  1436. LFPDUX A4, AO2, INC4
  1437. LFPDUX A10, BO, INC4
  1438. LFPDUX B4, BO2, INC4
  /* bdz decrements CTR; when a single step remains go directly to the tail at .L17. */
  1439. bdz- .L17
  1440. .align 4
  /* .L16: one K step per iteration.  A2 and A4 are each reloaded mid-step (first use */
  /* feeds f0/f1 and their partners, second use feeds f2/f3 and their partners), then */
  /* A2, A10, A4 and B4 are loaded again for the next iteration. */
  1441. .L16:
  1442. FXCPMADD f0, A10, A2, f0
  1443. FXCSMADD f4, A10, A2, f4
  1444. FXCPMADD f8, B4, A2, f8
  1445. FXCSMADD f12, B4, A2, f12
  1446. LFPDUX A2, AO, INC4
  1447. FXCPMADD f1, A10, A4, f1
  1448. FXCSMADD f5, A10, A4, f5
  1449. FXCPMADD f9, B4, A4, f9
  1450. FXCSMADD f13, B4, A4, f13
  1451. LFPDUX A4, AO2, INC4
  1452. FXCPMADD f2, A10, A2, f2
  1453. FXCSMADD f6, A10, A2, f6
  1454. FXCPMADD f10, B4, A2, f10
  1455. FXCSMADD f14, B4, A2, f14
  1456. LFPDUX A2, AO, INC4
  1457. FXCPMADD f3, A10, A4, f3
  1458. FXCSMADD f7, A10, A4, f7
  1459. LFPDUX A10, BO, INC4
  1460. FXCPMADD f11, B4, A4, f11
  1461. FXCSMADD f15, B4, A4, f15
  1462. LFPDUX A4, AO2, INC4
  1463. LFPDUX B4, BO2, INC4
  1464. bdnz+ .L16
  1465. .align 4
  /* .L17: tail step.  Same accumulation as one .L16 pass, but nothing is loaded for a */
  /* following step (only the mid-step reloads of A2 and A4 remain). */
  1466. .L17:
  1467. FXCPMADD f0, A10, A2, f0
  1468. FXCSMADD f4, A10, A2, f4
  1469. FXCPMADD f8, B4, A2, f8
  1470. FXCSMADD f12, B4, A2, f12
  1471. LFPDUX A2, AO, INC4
  1472. FXCPMADD f1, A10, A4, f1
  1473. FXCSMADD f5, A10, A4, f5
  1474. FXCPMADD f9, B4, A4, f9
  1475. FXCSMADD f13, B4, A4, f13
  1476. LFPDUX A4, AO2, INC4
  1477. FXCPMADD f2, A10, A2, f2
  1478. FXCSMADD f6, A10, A2, f6
  1479. FXCPMADD f10, B4, A2, f10
  1480. FXCSMADD f14, B4, A2, f14
  1481. FXCPMADD f3, A10, A4, f3
  1482. FXCSMADD f7, A10, A4, f7
  1483. FXCPMADD f11, B4, A4, f11
  1484. FXCSMADD f15, B4, A4, f15
  1485. .align 4
  /* .L18: finish the 4-row by 2-column tile's accumulation. */
  /* Each pair of partial sums is folded (f0+f4, f8+f12, ...), leaving the totals in */
  /* f0-f3 and f8-f11. */
  1486. .L18:
  1487. fpadd f0, f0, f4
  1488. fpadd f8, f8, f12
  1489. fpadd f1, f1, f5
  1490. fpadd f9, f9, f13
  1491. fpadd f2, f2, f6
  1492. fpadd f10, f10, f14
  1493. fpadd f3, f3, f7
  1494. fpadd f11, f11, f15
  /* LN/RT: recompute AO and BO from AORIG and B.  The index is KK-4 for LN and KK-2 */
  /* otherwise, shifted left by 2 + ZBASE_SHIFT for the A side and by 1 + ZBASE_SHIFT */
  /* for the B side.  BO is then biased by -4 * SIZE because the LFPDUX loads below */
  /* add INC4 to the pointer before accessing memory. */
  1495. #if defined(LN) || defined(RT)
  1496. #ifdef LN
  1497. subi r0, KK, 4
  1498. #else
  1499. subi r0, KK, 2
  1500. #endif
  1501. slwi TEMP, r0, 2 + ZBASE_SHIFT
  1502. slwi r0, r0, 1 + ZBASE_SHIFT
  1503. add AO, AORIG, TEMP
  1504. add BO, B, r0
  1505. addi AO2, AO, 2 * SIZE
  1506. addi BO, BO, - 4 * SIZE
  1507. addi BO2, BO, 2 * SIZE
  1508. #endif
  /* Load the eight right-hand-side values into f16-f23: from BO/BO2 for LN/LT, from */
  /* AO/AO2 otherwise.  The two variants use different register orders so that f16-f19 */
  /* pair with f0-f3 and f20-f23 pair with f8-f11 in the subtraction below. */
  /* The trailing subi pair undoes the four INC4 updates made on each pointer. */
  1509. #if defined(LN) || defined(LT)
  1510. LFPDUX f16, BO, INC4
  1511. LFPDUX f20, BO2, INC4
  1512. LFPDUX f17, BO, INC4
  1513. LFPDUX f21, BO2, INC4
  1514. LFPDUX f18, BO, INC4
  1515. LFPDUX f22, BO2, INC4
  1516. LFPDUX f19, BO, INC4
  1517. LFPDUX f23, BO2, INC4
  1518. subi BO, BO, 16 * SIZE
  1519. subi BO2, BO2, 16 * SIZE
  1520. #else
  1521. LFPDUX f16, AO, INC4
  1522. LFPDUX f17, AO2, INC4
  1523. LFPDUX f18, AO, INC4
  1524. LFPDUX f19, AO2, INC4
  1525. LFPDUX f20, AO, INC4
  1526. LFPDUX f21, AO2, INC4
  1527. LFPDUX f22, AO, INC4
  1528. LFPDUX f23, AO2, INC4
  1529. subi AO, AO, 16 * SIZE
  1530. subi AO2, AO2, 16 * SIZE
  1531. #endif
  /* Right-hand side minus accumulated products. */
  1532. fpsub f0, f16, f0
  1533. fpsub f1, f17, f1
  1534. fpsub f2, f18, f2
  1535. fpsub f3, f19, f3
  1536. fpsub f8, f20, f8
  1537. fpsub f9, f21, f9
  1538. fpsub f10, f22, f10
  1539. fpsub f11, f23, f11
  /* LN variant, 4-row tile: load ten coefficients A1-A10 from AO/AO2. */
  /* A bare "add ptr, ptr, INC4" steps a pointer without loading, so six of the */
  /* sixteen stepped positions are skipped (presumably the unused triangle - confirm */
  /* against the packing code).  Both pointers are rewound by 32 * SIZE afterwards. */
  1540. #ifdef LN
  1541. LFPDUX A1, AO, INC4
  1542. add AO2, AO2, INC4
  1543. add AO, AO, INC4
  1544. add AO2, AO2, INC4
  1545. LFPDUX A2, AO, INC4
  1546. LFPDUX A3, AO2, INC4
  1547. add AO, AO, INC4
  1548. add AO2, AO2, INC4
  1549. LFPDUX A4, AO, INC4
  1550. LFPDUX A5, AO2, INC4
  1551. LFPDUX A6, AO, INC4
  1552. add AO2, AO2, INC4
  1553. LFPDUX A7, AO, INC4
  1554. LFPDUX A8, AO2, INC4
  1555. LFPDUX A9, AO, INC4
  1556. LFPDUX A10, AO2, INC4
  1557. subi AO, AO, 32 * SIZE
  1558. subi AO2, AO2, 32 * SIZE
  /* Substitution runs from the last row (f3/f11) back to the first (f0/f8). */
  /* fxpmul + FXCXNPMA: multiply a row by one coefficient (FXCXNPMA is a macro from */
  /* outside this view; presumably it supplies the cross terms of a complex product, */
  /* and the coefficient is presumably a pre-inverted diagonal entry - TODO confirm). */
  /* fxcpnmsub + FXCXNSMA: subtract coefficient times an already solved row from a */
  /* row that is still pending. */
  /* Row 3: scale by A10, then eliminate from rows 2, 1, 0 with A9, A8, A7. */
  1559. fxpmul f4, A10, f3
  1560. fxpmul f5, A10, f11
  1561. FXCXNPMA f3, A10, f3, f4
  1562. FXCXNPMA f11, A10, f11, f5
  1563. fxcpnmsub f2, A9, f3, f2
  1564. fxcpnmsub f10, A9, f11, f10
  1565. FXCXNSMA f2, A9, f3, f2
  1566. FXCXNSMA f10, A9, f11, f10
  1567. fxcpnmsub f1, A8, f3, f1
  1568. fxcpnmsub f9, A8, f11, f9
  1569. FXCXNSMA f1, A8, f3, f1
  1570. FXCXNSMA f9, A8, f11, f9
  1571. fxcpnmsub f0, A7, f3, f0
  1572. fxcpnmsub f8, A7, f11, f8
  1573. FXCXNSMA f0, A7, f3, f0
  1574. FXCXNSMA f8, A7, f11, f8
  /* Row 2: scale by A6, then eliminate from rows 1 and 0 with A5 and A4. */
  1575. fxpmul f4, A6, f2
  1576. fxpmul f5, A6, f10
  1577. FXCXNPMA f2, A6, f2, f4
  1578. FXCXNPMA f10, A6, f10, f5
  1579. fxcpnmsub f1, A5, f2, f1
  1580. fxcpnmsub f9, A5, f10, f9
  1581. FXCXNSMA f1, A5, f2, f1
  1582. FXCXNSMA f9, A5, f10, f9
  1583. fxcpnmsub f0, A4, f2, f0
  1584. fxcpnmsub f8, A4, f10, f8
  1585. FXCXNSMA f0, A4, f2, f0
  1586. FXCXNSMA f8, A4, f10, f8
  /* Row 1: scale by A3, then eliminate from row 0 with A2. */
  1587. fxpmul f4, A3, f1
  1588. fxpmul f5, A3, f9
  1589. FXCXNPMA f1, A3, f1, f4
  1590. FXCXNPMA f9, A3, f9, f5
  1591. fxcpnmsub f0, A2, f1, f0
  1592. fxcpnmsub f8, A2, f9, f8
  1593. FXCXNSMA f0, A2, f1, f0
  1594. FXCXNSMA f8, A2, f9, f8
  /* Row 0: scale by A1. */
  1595. fxpmul f4, A1, f0
  1596. fxpmul f5, A1, f8
  1597. FXCXNPMA f0, A1, f0, f4
  1598. FXCXNPMA f8, A1, f8, f5
  1599. #endif
  /* LT variant, 4-row tile: load ten coefficients A1-A10 from AO/AO2. */
  /* A bare "add ptr, ptr, INC4" steps a pointer without loading, so six of the */
  /* sixteen stepped positions are skipped (presumably the unused triangle - confirm */
  /* against the packing code).  Both pointers are rewound by 32 * SIZE afterwards. */
  1600. #ifdef LT
  1601. LFPDUX A1, AO, INC4
  1602. LFPDUX A2, AO2, INC4
  1603. LFPDUX A3, AO, INC4
  1604. LFPDUX A4, AO2, INC4
  1605. add AO, AO, INC4
  1606. LFPDUX A5, AO2, INC4
  1607. LFPDUX A6, AO, INC4
  1608. LFPDUX A7, AO2, INC4
  1609. add AO, AO, INC4
  1610. add AO2, AO2, INC4
  1611. LFPDUX A8, AO, INC4
  1612. LFPDUX A9, AO2, INC4
  1613. add AO, AO, INC4
  1614. add AO2, AO2, INC4
  1615. add AO, AO, INC4
  1616. LFPDUX A10, AO2, INC4
  1617. subi AO, AO, 32 * SIZE
  1618. subi AO2, AO2, 32 * SIZE
  /* Substitution runs from the first row (f0/f8) forward to the last (f3/f11). */
  /* fxpmul + FXCXNPMA: multiply a row by one coefficient (FXCXNPMA is a macro from */
  /* outside this view; the coefficient is presumably a pre-inverted diagonal entry - */
  /* TODO confirm).  fxcpnmsub + FXCXNSMA: subtract coefficient times a solved row. */
  /* Row 0: scale by A1, then eliminate from rows 1, 2, 3 with A2, A3, A4. */
  1619. fxpmul f4, A1, f0
  1620. fxpmul f5, A1, f8
  1621. FXCXNPMA f0, A1, f0, f4
  1622. FXCXNPMA f8, A1, f8, f5
  1623. fxcpnmsub f1, A2, f0, f1
  1624. fxcpnmsub f9, A2, f8, f9
  1625. FXCXNSMA f1, A2, f0, f1
  1626. FXCXNSMA f9, A2, f8, f9
  1627. fxcpnmsub f2, A3, f0, f2
  1628. fxcpnmsub f10, A3, f8, f10
  1629. FXCXNSMA f2, A3, f0, f2
  1630. FXCXNSMA f10, A3, f8, f10
  1631. fxcpnmsub f3, A4, f0, f3
  1632. fxcpnmsub f11, A4, f8, f11
  1633. FXCXNSMA f3, A4, f0, f3
  1634. FXCXNSMA f11, A4, f8, f11
  /* Row 1: scale by A5, then eliminate from rows 2 and 3 with A6 and A7. */
  1635. fxpmul f6, A5, f1
  1636. fxpmul f7, A5, f9
  1637. FXCXNPMA f1, A5, f1, f6
  1638. FXCXNPMA f9, A5, f9, f7
  1639. fxcpnmsub f2, A6, f1, f2
  1640. fxcpnmsub f10, A6, f9, f10
  1641. FXCXNSMA f2, A6, f1, f2
  1642. FXCXNSMA f10, A6, f9, f10
  1643. fxcpnmsub f3, A7, f1, f3
  1644. fxcpnmsub f11, A7, f9, f11
  1645. FXCXNSMA f3, A7, f1, f3
  1646. FXCXNSMA f11, A7, f9, f11
  /* Row 2: scale by A8, then eliminate from row 3 with A9. */
  1647. fxpmul f4, A8, f2
  1648. fxpmul f5, A8, f10
  1649. FXCXNPMA f2, A8, f2, f4
  1650. FXCXNPMA f10, A8, f10, f5
  1651. fxcpnmsub f3, A9, f2, f3
  1652. fxcpnmsub f11, A9, f10, f11
  1653. FXCXNSMA f3, A9, f2, f3
  1654. FXCXNSMA f11, A9, f10, f11
  /* Row 3: scale by A10. */
  1655. fxpmul f6, A10, f3
  1656. fxpmul f7, A10, f11
  1657. FXCXNPMA f3, A10, f3, f6
  1658. FXCXNPMA f11, A10, f11, f7
  1659. #endif
  /* RN variant, 4-row tile: load three coefficients A1-A3 from BO/BO2; one BO position */
  /* is stepped over without loading.  Both pointers are rewound by 8 * SIZE. */
  1660. #ifdef RN
  1661. LFPDUX A1, BO, INC4
  1662. LFPDUX A2, BO2, INC4
  1663. add BO, BO, INC4
  1664. LFPDUX A3, BO2, INC4
  1665. subi BO, BO, 8 * SIZE
  1666. subi BO2, BO2, 8 * SIZE
  /* Column 0 (f0-f3): scale by A1.  FXCXNPMA is a macro from outside this view; the */
  /* coefficient is presumably a pre-inverted diagonal entry - TODO confirm. */
  1667. fxpmul f4, A1, f0
  1668. fxpmul f5, A1, f1
  1669. fxpmul f6, A1, f2
  1670. fxpmul f7, A1, f3
  1671. FXCXNPMA f0, A1, f0, f4
  1672. FXCXNPMA f1, A1, f1, f5
  1673. FXCXNPMA f2, A1, f2, f6
  1674. FXCXNPMA f3, A1, f3, f7
  /* Column 1 (f8-f11): subtract A2 times the solved column 0. */
  1675. fxcpnmsub f8, A2, f0, f8
  1676. fxcpnmsub f9, A2, f1, f9
  1677. fxcpnmsub f10, A2, f2, f10
  1678. fxcpnmsub f11, A2, f3, f11
  1679. FXCXNSMA f8, A2, f0, f8
  1680. FXCXNSMA f9, A2, f1, f9
  1681. FXCXNSMA f10, A2, f2, f10
  1682. FXCXNSMA f11, A2, f3, f11
  /* Column 1: scale by A3. */
  1683. fxpmul f4, A3, f8
  1684. fxpmul f5, A3, f9
  1685. fxpmul f6, A3, f10
  1686. fxpmul f7, A3, f11
  1687. FXCXNPMA f8, A3, f8, f4
  1688. FXCXNPMA f9, A3, f9, f5
  1689. FXCXNPMA f10, A3, f10, f6
  1690. FXCXNPMA f11, A3, f11, f7
  1691. #endif
  /* RT variant, 4-row tile: load three coefficients A1-A3 from BO/BO2; one BO2 position */
  /* is stepped over without loading.  Both pointers are rewound by 8 * SIZE. */
  1692. #ifdef RT
  1693. LFPDUX A1, BO, INC4
  1694. add BO2, BO2, INC4
  1695. LFPDUX A2, BO, INC4
  1696. LFPDUX A3, BO2, INC4
  1697. subi BO, BO, 8 * SIZE
  1698. subi BO2, BO2, 8 * SIZE
  /* Column 1 (f8-f11) first: scale by A3.  FXCXNPMA is a macro from outside this view; */
  /* the coefficient is presumably a pre-inverted diagonal entry - TODO confirm. */
  1699. fxpmul f4, A3, f8
  1700. fxpmul f5, A3, f9
  1701. fxpmul f6, A3, f10
  1702. fxpmul f7, A3, f11
  1703. FXCXNPMA f8, A3, f8, f4
  1704. FXCXNPMA f9, A3, f9, f5
  1705. FXCXNPMA f10, A3, f10, f6
  1706. FXCXNPMA f11, A3, f11, f7
  /* Column 0 (f0-f3): subtract A2 times the solved column 1. */
  1707. fxcpnmsub f0, A2, f8, f0
  1708. fxcpnmsub f1, A2, f9, f1
  1709. fxcpnmsub f2, A2, f10, f2
  1710. fxcpnmsub f3, A2, f11, f3
  1711. FXCXNSMA f0, A2, f8, f0
  1712. FXCXNSMA f1, A2, f9, f1
  1713. FXCXNSMA f2, A2, f10, f2
  1714. FXCXNSMA f3, A2, f11, f3
  /* Column 0: scale by A1. */
  1715. fxpmul f4, A1, f0
  1716. fxpmul f5, A1, f1
  1717. fxpmul f6, A1, f2
  1718. fxpmul f7, A1, f3
  1719. FXCXNPMA f0, A1, f0, f4
  1720. FXCXNPMA f1, A1, f1, f5
  1721. FXCXNPMA f2, A1, f2, f6
  1722. FXCXNPMA f3, A1, f3, f7
  1723. #endif
  /* Write-back for the 4-row by 2-column tile, then advance to the next tile. */
  /* LN: CO1/CO2 are moved back by 8 * SIZE before the stores. */
  1724. #ifdef LN
  1725. subi CO1, CO1, 8 * SIZE
  1726. subi CO2, CO2, 8 * SIZE
  1727. #endif
  /* Store the solved values back into the packed buffer: BO/BO2 for LN/LT, AO/AO2 */
  /* otherwise, in the same register order used when the right-hand side was loaded. */
  /* The subi pair undoes the four INC4 updates on each pointer. */
  1728. #if defined(LN) || defined(LT)
  1729. STFPDUX f0, BO, INC4
  1730. STFPDUX f8, BO2, INC4
  1731. STFPDUX f1, BO, INC4
  1732. STFPDUX f9, BO2, INC4
  1733. STFPDUX f2, BO, INC4
  1734. STFPDUX f10, BO2, INC4
  1735. STFPDUX f3, BO, INC4
  1736. STFPDUX f11, BO2, INC4
  1737. subi BO, BO, 16 * SIZE
  1738. subi BO2, BO2, 16 * SIZE
  1739. #else
  1740. STFPDUX f0, AO, INC4
  1741. STFPDUX f1, AO2, INC4
  1742. STFPDUX f2, AO, INC4
  1743. STFPDUX f3, AO2, INC4
  1744. STFPDUX f8, AO, INC4
  1745. STFPDUX f9, AO2, INC4
  1746. STFPDUX f10, AO, INC4
  1747. STFPDUX f11, AO2, INC4
  1748. subi AO, AO, 16 * SIZE
  1749. subi AO2, AO2, 16 * SIZE
  1750. #endif
  /* Store to C: for each register the primary half (STFDUX) then the secondary half */
  /* (STFSDUX); f0-f3 go through CO1 and f8-f11 through CO2, each store stepping the */
  /* pointer by INC. */
  1751. STFDUX f0, CO1, INC
  1752. STFSDUX f0, CO1, INC
  1753. STFDUX f1, CO1, INC
  1754. STFSDUX f1, CO1, INC
  1755. STFDUX f2, CO1, INC
  1756. STFSDUX f2, CO1, INC
  1757. STFDUX f3, CO1, INC
  1758. STFSDUX f3, CO1, INC
  1759. STFDUX f8, CO2, INC
  1760. STFSDUX f8, CO2, INC
  1761. STFDUX f9, CO2, INC
  1762. STFSDUX f9, CO2, INC
  1763. STFDUX f10, CO2, INC
  1764. STFSDUX f10, CO2, INC
  1765. STFDUX f11, CO2, INC
  1766. STFSDUX f11, CO2, INC
  /* LN: step CO1/CO2 back again past what was just written (assumes the eight INC */
  /* updates total 8 * SIZE - INC is defined outside this view). */
  1767. #ifdef LN
  1768. subi CO1, CO1, 8 * SIZE
  1769. subi CO2, CO2, 8 * SIZE
  1770. #endif
  /* RT: AORIG advances by K shifted left by 2 + ZBASE_SHIFT. */
  1771. #ifdef RT
  1772. slwi r0, K, 2 + ZBASE_SHIFT
  1773. add AORIG, AORIG, r0
  1774. #endif
  /* LT/RN: skip the remaining K - KK steps: AO by that count shifted left by */
  /* 2 + ZBASE_SHIFT, BO by the count shifted left by 1 + ZBASE_SHIFT. */
  1775. #if defined(LT) || defined(RN)
  1776. sub TEMP, K, KK
  1777. slwi r0, TEMP, 2 + ZBASE_SHIFT
  1778. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1779. add AO, AO, r0
  1780. add BO, BO, TEMP
  1781. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  1782. #ifdef LT
  1783. addi KK, KK, 4
  1784. #endif
  1785. #ifdef LN
  1786. subi KK, KK, 4
  1787. #endif
  /* Decrement the tile counter I.  f0 is reloaded from the stack slot at offset FZERO */
  /* (presumably a stored zero used to reset the accumulators).  li and lfpsx leave */
  /* CR0 untouched, so bgt still tests the addic. result. */
  1788. addic. I, I, -1
  1789. li r0, FZERO
  1790. lfpsx f0, SP, r0
  1791. bgt+ .L11
  1792. .align 4
  /* .L20: 2-row remainder of M.  Skipped (branch to .L30) when bit 1 of M is clear. */
  1793. .L20:
  1794. andi. I, M, 2
  1795. beq .L30
  /* LT/RN setup: AO2 = AO + 2 * SIZE; BO and BO2 start 4 * SIZE and 2 * SIZE below B */
  /* because LFPDUX adds INC4 before each access.  f4, f8, f12, f1, f5, f9, f13 are */
  /* copies of f0 (presumably zero).  CTR = KK shifted right by 2; when that is not */
  /* positive the unrolled loop is skipped (branch to .L24). */
  1796. #if defined(LT) || defined(RN)
  1797. addi AO2, AO, 2 * SIZE
  1798. fpmr f4, f0
  1799. addi BO, B, - 4 * SIZE
  1800. fpmr f8, f0
  1801. addi BO2, B, - 2 * SIZE
  1802. fpmr f12, f0
  1803. srawi. r0, KK, 2
  1804. fpmr f1, f0
  1805. fpmr f5, f0
  1806. fpmr f9, f0
  1807. mtspr CTR, r0
  1808. fpmr f13, f0
  1809. ble .L24
  1810. #else
  /* LN only: AORIG moves back by K shifted left by 1 + ZBASE_SHIFT. */
  1811. #ifdef LN
  1812. slwi r0, K, 1 + ZBASE_SHIFT
  1813. sub AORIG, AORIG, r0
  1814. #endif
  /* LN/RT setup: AO and BO both start KK (shifted left by 1 + ZBASE_SHIFT) past */
  /* AORIG and B; the trip count is TEMP = K - KK, and CTR = TEMP shifted right by 2. */
  1815. slwi r0 , KK, 1 + ZBASE_SHIFT
  1816. add AO, AORIG, r0
  1817. add BO, B, r0
  1818. sub TEMP, K, KK
  1819. addi AO2, AO, 2 * SIZE
  1820. fpmr f4, f0
  1821. addi BO, BO, - 4 * SIZE
  1822. fpmr f8, f0
  1823. addi BO2, BO, 2 * SIZE
  1824. fpmr f12, f0
  1825. fpmr f1, f0
  1826. fpmr f5, f0
  1827. fpmr f9, f0
  1828. fpmr f13, f0
  1829. srawi. r0, TEMP, 2
  1830. mtspr CTR, r0
  1831. ble .L24
  1832. #endif
  /* Preload operands for the first unrolled pass.  A9 and A10 are filled from BO/BO2, */
  /* so they carry B-side operands in this tile.  A8 is loaded inside the loop. */
  1833. LFPDUX A1, AO, INC4
  1834. LFPDUX B1, BO, INC4
  1835. LFPDUX A2, AO2, INC4
  1836. LFPDUX B2, BO2, INC4
  1837. LFPDUX A3, AO, INC4
  1838. LFPDUX B3, BO, INC4
  1839. LFPDUX A4, AO2, INC4
  1840. LFPDUX B4, BO2, INC4
  1841. LFPDUX A5, AO, INC4
  1842. LFPDUX B5, BO, INC4
  1843. LFPDUX A6, AO2, INC4
  1844. LFPDUX B6, BO2, INC4
  1845. LFPDUX A7, AO, INC4
  1846. LFPDUX A9, BO, INC4
  1847. LFPDUX A10, BO2, INC4
  /* bdz decrements CTR; with a single pass to run go directly to the tail at .L23. */
  1848. bdz- .L23
  1849. .align 4
  /* .L22: unrolled K loop for the 2-row by 2-column tile, four K steps per iteration. */
  /* Step operand pairs are (A1,A2 with B1,B2), (A3,A4 with B3,B4), (A5,A6 with B5,B6) */
  /* and (A7,A8 with A9,A10); each operand is reloaded for the next iteration right */
  /* after its last use.  Accumulators are f0, f4, f8, f12 and f1, f5, f9, f13. */
  /* FXCPMADD / FXCSMADD are macros defined outside this view. */
  1850. .L22:
  1851. FXCPMADD f0, B1, A1, f0
  1852. nop
  1853. FXCSMADD f4, B1, A1, f4
  1854. LFPDUX A8, AO2, INC4
  1855. FXCPMADD f8, B2, A1, f8
  1856. nop
  1857. FXCSMADD f12, B2, A1, f12
  1858. LFPDUX A1, AO, INC4
  1859. FXCPMADD f1, B1, A2, f1
  1860. nop
  1861. FXCSMADD f5, B1, A2, f5
  1862. LFPDUX B1, BO, INC4
  1863. FXCPMADD f9, B2, A2, f9
  1864. nop
  1865. FXCSMADD f13, B2, A2, f13
  1866. LFPDUX B2, BO2, INC4
  1867. FXCPMADD f0, B3, A3, f0
  1868. nop
  1869. FXCSMADD f4, B3, A3, f4
  1870. LFPDUX A2, AO2, INC4
  1871. FXCPMADD f8, B4, A3, f8
  1872. nop
  1873. FXCSMADD f12, B4, A3, f12
  1874. LFPDUX A3, AO, INC4
  1875. FXCPMADD f1, B3, A4, f1
  1876. nop
  1877. FXCSMADD f5, B3, A4, f5
  1878. LFPDUX B3, BO, INC4
  1879. FXCPMADD f9, B4, A4, f9
  1880. nop
  1881. FXCSMADD f13, B4, A4, f13
  1882. LFPDUX B4, BO2, INC4
  1883. FXCPMADD f0, B5, A5, f0
  1884. nop
  1885. FXCSMADD f4, B5, A5, f4
  1886. LFPDUX A4, AO2, INC4
  1887. FXCPMADD f8, B6, A5, f8
  1888. nop
  1889. FXCSMADD f12, B6, A5, f12
  1890. LFPDUX A5, AO, INC4
  1891. FXCPMADD f1, B5, A6, f1
  1892. nop
  1893. FXCSMADD f5, B5, A6, f5
  1894. LFPDUX B5, BO, INC4
  1895. FXCPMADD f9, B6, A6, f9
  1896. nop
  1897. FXCSMADD f13, B6, A6, f13
  1898. LFPDUX B6, BO2, INC4
  1899. FXCPMADD f0, A9, A7, f0
  1900. nop
  1901. FXCSMADD f4, A9, A7, f4
  1902. LFPDUX A6, AO2, INC4
  1903. FXCPMADD f8, A10, A7, f8
  1904. nop
  1905. FXCSMADD f12, A10, A7, f12
  1906. LFPDUX A7, AO, INC4
  1907. FXCPMADD f1, A9, A8, f1
  1908. nop
  1909. FXCSMADD f5, A9, A8, f5
  1910. LFPDUX A9, BO, INC4
  1911. FXCPMADD f9, A10, A8, f9
  1912. nop
  1913. FXCSMADD f13, A10, A8, f13
  1914. LFPDUX A10, BO2, INC4
  1915. bdnz+ .L22
  1916. .align 4
  /* .L23: tail pass.  Same four accumulation steps, but the only load is A8 (the one */
  /* operand not covered by the preload); nothing is fetched for a following pass. */
  1917. .L23:
  1918. FXCPMADD f0, B1, A1, f0
  1919. FXCSMADD f4, B1, A1, f4
  1920. LFPDUX A8, AO2, INC4
  1921. FXCPMADD f8, B2, A1, f8
  1922. FXCSMADD f12, B2, A1, f12
  1923. FXCPMADD f1, B1, A2, f1
  1924. FXCSMADD f5, B1, A2, f5
  1925. FXCPMADD f9, B2, A2, f9
  1926. FXCSMADD f13, B2, A2, f13
  1927. FXCPMADD f0, B3, A3, f0
  1928. FXCSMADD f4, B3, A3, f4
  1929. FXCPMADD f8, B4, A3, f8
  1930. FXCSMADD f12, B4, A3, f12
  1931. FXCPMADD f1, B3, A4, f1
  1932. FXCSMADD f5, B3, A4, f5
  1933. FXCPMADD f9, B4, A4, f9
  1934. FXCSMADD f13, B4, A4, f13
  1935. FXCPMADD f0, B5, A5, f0
  1936. FXCSMADD f4, B5, A5, f4
  1937. FXCPMADD f8, B6, A5, f8
  1938. FXCSMADD f12, B6, A5, f12
  1939. FXCPMADD f1, B5, A6, f1
  1940. FXCSMADD f5, B5, A6, f5
  1941. FXCPMADD f9, B6, A6, f9
  1942. FXCSMADD f13, B6, A6, f13
  1943. FXCPMADD f0, A9, A7, f0
  1944. FXCSMADD f4, A9, A7, f4
  1945. FXCPMADD f8, A10, A7, f8
  1946. FXCSMADD f12, A10, A7, f12
  1947. FXCPMADD f1, A9, A8, f1
  1948. FXCSMADD f5, A9, A8, f5
  1949. FXCPMADD f9, A10, A8, f9
  1950. FXCSMADD f13, A10, A8, f13
  1951. .align 4
  /* .L24: K remainder for the 2-row by 2-column tile. */
  /* CTR receives the low two bits of the trip count (KK for LT/RN, TEMP otherwise). */
  /* The masked value is never negative, so "ble" is taken exactly when it is zero. */
  1952. .L24:
  1953. #if defined(LT) || defined(RN)
  1954. andi. r0, KK, 3
  1955. mtspr CTR, r0
  1956. ble+ .L28
  1957. #else
  1958. andi. r0, TEMP, 3
  1959. mtspr CTR, r0
  1960. ble+ .L28
  1961. #endif
  /* Preload the operands of the first step; with one step to run skip to .L27. */
  1962. LFPDUX A1, AO, INC4
  1963. LFPDUX A2, AO2, INC4
  1964. LFPDUX B1, BO, INC4
  1965. LFPDUX B2, BO2, INC4
  1966. bdz- .L27
  1967. .align 4
  /* .L26: one K step per iteration, with the next step's operands loaded in between. */
  1968. .L26:
  1969. FXCPMADD f0, B1, A1, f0
  1970. FXCSMADD f4, B1, A1, f4
  1971. FXCPMADD f8, B2, A1, f8
  1972. FXCSMADD f12, B2, A1, f12
  1973. LFPDUX A1, AO, INC4
  1974. FXCPMADD f1, B1, A2, f1
  1975. FXCSMADD f5, B1, A2, f5
  1976. LFPDUX B1, BO, INC4
  1977. FXCPMADD f9, B2, A2, f9
  1978. FXCSMADD f13, B2, A2, f13
  1979. LFPDUX A2, AO2, INC4
  1980. LFPDUX B2, BO2, INC4
  1981. bdnz+ .L26
  1982. .align 4
  /* .L27: tail step, identical arithmetic with no loads. */
  1983. .L27:
  1984. FXCPMADD f0, B1, A1, f0
  1985. FXCSMADD f4, B1, A1, f4
  1986. FXCPMADD f8, B2, A1, f8
  1987. FXCSMADD f12, B2, A1, f12
  1988. FXCPMADD f1, B1, A2, f1
  1989. FXCSMADD f5, B1, A2, f5
  1990. FXCPMADD f9, B2, A2, f9
  1991. FXCSMADD f13, B2, A2, f13
  1992. .align 4
  /* .L28: finish the 2-row by 2-column tile: fold partial sums, subtract from the */
  /* right-hand side, run the variant-specific triangular solve, store, advance. */
  /* After the fold the totals are in f0, f1 (first column) and f8, f9 (second). */
  1993. .L28:
  1994. fpadd f0, f0, f4
  1995. fpadd f8, f8, f12
  1996. fpadd f1, f1, f5
  1997. fpadd f9, f9, f13
  /* LN/RT: recompute AO and BO from AORIG and B at index KK - 2.  Both preprocessor */
  /* arms are identical here because the tile is 2 rows by 2 columns. */
  1998. #if defined(LN) || defined(RT)
  1999. #ifdef LN
  2000. subi r0, KK, 2
  2001. #else
  2002. subi r0, KK, 2
  2003. #endif
  2004. slwi r0, r0, 1 + ZBASE_SHIFT
  2005. add AO, AORIG, r0
  2006. add BO, B, r0
  2007. addi AO2, AO, 2 * SIZE
  2008. addi BO, BO, - 4 * SIZE
  2009. addi BO2, BO, 2 * SIZE
  2010. #endif
  /* Load the four right-hand-side values into f16-f19 (from BO/BO2 for LN/LT, from */
  /* AO/AO2 otherwise) and rewind the pointers by the two INC4 updates each. */
  2011. #if defined(LN) || defined(LT)
  2012. LFPDUX f16, BO, INC4
  2013. LFPDUX f18, BO2, INC4
  2014. LFPDUX f17, BO, INC4
  2015. LFPDUX f19, BO2, INC4
  2016. subi BO, BO, 8 * SIZE
  2017. subi BO2, BO2, 8 * SIZE
  2018. #else
  2019. LFPDUX f16, AO, INC4
  2020. LFPDUX f17, AO2, INC4
  2021. LFPDUX f18, AO, INC4
  2022. LFPDUX f19, AO2, INC4
  2023. subi AO, AO, 8 * SIZE
  2024. subi AO2, AO2, 8 * SIZE
  2025. #endif
  /* Right-hand side minus accumulated products. */
  2026. fpsub f0, f16, f0
  2027. fpsub f1, f17, f1
  2028. fpsub f8, f18, f8
  2029. fpsub f9, f19, f9
  /* In every variant below: fxpmul + FXCXNPMA multiplies a value by one coefficient */
  /* (FXCXNPMA / FXCXNSMA are macros from outside this view; the scaling coefficient */
  /* is presumably a pre-inverted diagonal entry - TODO confirm), and fxcpnmsub + */
  /* FXCXNSMA subtracts coefficient times an already solved value. */
  /* LN: three coefficients, one AO2 position skipped; solve row 1 (f1/f9) with A3, */
  /* eliminate from row 0 with A2, then scale row 0 (f0/f8) with A1. */
  2030. #ifdef LN
  2031. LFPDUX A1, AO, INC4
  2032. add AO2, AO2, INC4
  2033. LFPDUX A2, AO, INC4
  2034. LFPDUX A3, AO2, INC4
  2035. subi AO, AO, 8 * SIZE
  2036. subi AO2, AO2, 8 * SIZE
  2037. fxpmul f4, A3, f1
  2038. fxpmul f5, A3, f9
  2039. FXCXNPMA f1, A3, f1, f4
  2040. FXCXNPMA f9, A3, f9, f5
  2041. fxcpnmsub f0, A2, f1, f0
  2042. fxcpnmsub f8, A2, f9, f8
  2043. FXCXNSMA f0, A2, f1, f0
  2044. FXCXNSMA f8, A2, f9, f8
  2045. fxpmul f4, A1, f0
  2046. fxpmul f5, A1, f8
  2047. FXCXNPMA f0, A1, f0, f4
  2048. FXCXNPMA f8, A1, f8, f5
  2049. #endif
  /* LT: three coefficients, one AO position skipped; scale row 0 with A1, eliminate */
  /* from row 1 with A2, then scale row 1 with A3. */
  2050. #ifdef LT
  2051. LFPDUX A1, AO, INC4
  2052. LFPDUX A2, AO2, INC4
  2053. add AO, AO, INC4
  2054. LFPDUX A3, AO2, INC4
  2055. subi AO, AO, 8 * SIZE
  2056. subi AO2, AO2, 8 * SIZE
  2057. fxpmul f4, A1, f0
  2058. fxpmul f5, A1, f8
  2059. FXCXNPMA f0, A1, f0, f4
  2060. FXCXNPMA f8, A1, f8, f5
  2061. fxcpnmsub f1, A2, f0, f1
  2062. fxcpnmsub f9, A2, f8, f9
  2063. FXCXNSMA f1, A2, f0, f1
  2064. FXCXNSMA f9, A2, f8, f9
  2065. fxpmul f6, A3, f1
  2066. fxpmul f7, A3, f9
  2067. FXCXNPMA f1, A3, f1, f6
  2068. FXCXNPMA f9, A3, f9, f7
  2069. #endif
  /* RN: coefficients come from BO/BO2, one BO position skipped; scale column 0 */
  /* (f0, f1) with A1, eliminate from column 1 with A2, scale column 1 (f8, f9) with A3. */
  2070. #ifdef RN
  2071. LFPDUX A1, BO, INC4
  2072. LFPDUX A2, BO2, INC4
  2073. add BO, BO, INC4
  2074. LFPDUX A3, BO2, INC4
  2075. subi BO, BO, 8 * SIZE
  2076. subi BO2, BO2, 8 * SIZE
  2077. fxpmul f4, A1, f0
  2078. fxpmul f5, A1, f1
  2079. FXCXNPMA f0, A1, f0, f4
  2080. FXCXNPMA f1, A1, f1, f5
  2081. fxcpnmsub f8, A2, f0, f8
  2082. fxcpnmsub f9, A2, f1, f9
  2083. FXCXNSMA f8, A2, f0, f8
  2084. FXCXNSMA f9, A2, f1, f9
  2085. fxpmul f4, A3, f8
  2086. fxpmul f5, A3, f9
  2087. FXCXNPMA f8, A3, f8, f4
  2088. FXCXNPMA f9, A3, f9, f5
  2089. #endif
  /* RT: coefficients come from BO/BO2, one BO2 position skipped; scale column 1 with */
  /* A3, eliminate from column 0 with A2, scale column 0 with A1. */
  2090. #ifdef RT
  2091. LFPDUX A1, BO, INC4
  2092. add BO2, BO2, INC4
  2093. LFPDUX A2, BO, INC4
  2094. LFPDUX A3, BO2, INC4
  2095. subi BO, BO, 8 * SIZE
  2096. subi BO2, BO2, 8 * SIZE
  2097. fxpmul f4, A3, f8
  2098. fxpmul f5, A3, f9
  2099. FXCXNPMA f8, A3, f8, f4
  2100. FXCXNPMA f9, A3, f9, f5
  2101. fxcpnmsub f0, A2, f8, f0
  2102. fxcpnmsub f1, A2, f9, f1
  2103. FXCXNSMA f0, A2, f8, f0
  2104. FXCXNSMA f1, A2, f9, f1
  2105. fxpmul f4, A1, f0
  2106. fxpmul f5, A1, f1
  2107. FXCXNPMA f0, A1, f0, f4
  2108. FXCXNPMA f1, A1, f1, f5
  2109. #endif
  /* LN: CO1/CO2 are moved back by 4 * SIZE before the stores. */
  2110. #ifdef LN
  2111. subi CO1, CO1, 4 * SIZE
  2112. subi CO2, CO2, 4 * SIZE
  2113. #endif
  /* Store the solved values back into the packed buffer (BO/BO2 for LN/LT, AO/AO2 */
  /* otherwise) and rewind the pointers. */
  2114. #if defined(LN) || defined(LT)
  2115. STFPDUX f0, BO, INC4
  2116. STFPDUX f8, BO2, INC4
  2117. STFPDUX f1, BO, INC4
  2118. STFPDUX f9, BO2, INC4
  2119. subi BO, BO, 8 * SIZE
  2120. subi BO2, BO2, 8 * SIZE
  2121. #else
  2122. STFPDUX f0, AO, INC4
  2123. STFPDUX f1, AO2, INC4
  2124. STFPDUX f8, AO, INC4
  2125. STFPDUX f9, AO2, INC4
  2126. subi AO, AO, 8 * SIZE
  2127. subi AO2, AO2, 8 * SIZE
  2128. #endif
  /* Store to C: primary half (STFDUX) then secondary half (STFSDUX) of each register; */
  /* f0, f1 through CO1 and f8, f9 through CO2. */
  2129. STFDUX f0, CO1, INC
  2130. STFSDUX f0, CO1, INC
  2131. STFDUX f1, CO1, INC
  2132. STFSDUX f1, CO1, INC
  2133. STFDUX f8, CO2, INC
  2134. STFSDUX f8, CO2, INC
  2135. STFDUX f9, CO2, INC
  2136. STFSDUX f9, CO2, INC
  2137. #ifdef LN
  2138. subi CO1, CO1, 4 * SIZE
  2139. subi CO2, CO2, 4 * SIZE
  2140. #endif
  /* RT: AORIG advances by K shifted left by 1 + ZBASE_SHIFT. */
  2141. #ifdef RT
  2142. slwi r0, K, 1 + ZBASE_SHIFT
  2143. add AORIG, AORIG, r0
  2144. #endif
  /* LT/RN: skip the remaining K - KK steps on both AO and BO (same byte offset). */
  2145. #if defined(LT) || defined(RN)
  2146. sub TEMP, K, KK
  2147. slwi r0, TEMP, 1 + ZBASE_SHIFT
  2148. add AO, AO, r0
  2149. add BO, BO, r0
  2150. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN. */
  2151. #ifdef LT
  2152. addi KK, KK, 2
  2153. #endif
  2154. #ifdef LN
  2155. subi KK, KK, 2
  2156. #endif
  /* Reload f0 from the stack slot at offset FZERO (presumably a stored zero). */
  2157. li r0, FZERO
  2158. lfpsx f0, SP, r0
  2159. .align 4
  /* .L30: 1-row remainder of M.  Skipped (branch to .L49) when bit 0 of M is clear. */
  2160. .L30:
  2161. andi. I, M, 1
  2162. beq .L49
  /* LT/RN setup: AO2 = AO + 2 * SIZE; BO and BO2 start 4 * SIZE and 2 * SIZE below B */
  /* because LFPDUX adds INC4 before each access.  f1, f2, f3 are copies of f0 */
  /* (presumably zero).  CTR = KK shifted right by 2; when that is not positive the */
  /* unrolled loop is skipped (branch to .L34). */
  2163. #if defined(LT) || defined(RN)
  2164. addi AO2, AO, 2 * SIZE
  2165. fpmr f1, f0
  2166. addi BO, B, - 4 * SIZE
  2167. fpmr f2, f0
  2168. addi BO2, B, - 2 * SIZE
  2169. fpmr f3, f0
  2170. srawi. r0, KK, 2
  2171. mtspr CTR, r0
  2172. ble .L34
  2173. #else
  /* LN only: AORIG moves back by K shifted left by 0 + ZBASE_SHIFT. */
  2174. #ifdef LN
  2175. slwi r0, K, 0 + ZBASE_SHIFT
  2176. sub AORIG, AORIG, r0
  2177. #endif
  /* LN/RT setup: AO = AORIG + KK shifted left by ZBASE_SHIFT, BO = B + KK shifted left */
  /* by 1 + ZBASE_SHIFT; the trip count is TEMP = K - KK, CTR = TEMP shifted right by 2. */
  2178. slwi r0 , KK, 0 + ZBASE_SHIFT
  2179. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2180. add AO, AORIG, r0
  2181. add BO, B, TEMP
  2182. sub TEMP, K, KK
  2183. addi AO2, AO, 2 * SIZE
  2184. fpmr f1, f0
  2185. addi BO, BO, - 4 * SIZE
  2186. fpmr f2, f0
  2187. addi BO2, BO, 2 * SIZE
  2188. fpmr f3, f0
  2189. srawi. r0, TEMP, 2
  2190. mtspr CTR, r0
  2191. ble .L34
  2192. #endif
  /* Preload operands for the first unrolled pass.  A5-A8 are filled from BO/BO2, so */
  /* they carry B-side operands in this tile. */
  2193. LFPDUX A1, AO, INC4
  2194. LFPDUX B1, BO, INC4
  2195. LFPDUX B2, BO2, INC4
  2196. LFPDUX A2, AO2, INC4
  2197. LFPDUX B3, BO, INC4
  2198. LFPDUX B4, BO2, INC4
  2199. LFPDUX A3, AO, INC4
  2200. LFPDUX A5, BO, INC4
  2201. LFPDUX A6, BO2, INC4
  2202. LFPDUX A4, AO2, INC4
  2203. LFPDUX A7, BO, INC4
  2204. LFPDUX A8, BO2, INC4
  /* bdz decrements CTR; with a single pass to run go directly to the tail at .L33. */
  2205. bdz- .L33
  2206. .align 4
  /* .L32: unrolled K loop for the 1-row by 2-column tile, four K steps per iteration. */
  /* Steps use (A1 with B1,B2), (A2 with B3,B4), (A3 with A5,A6), (A4 with A7,A8); */
  /* each operand is reloaded for the next iteration after its last use. */
  /* Accumulators are f0, f1 (first column) and f2, f3 (second column). */
  /* FXCPMADD / FXCSMADD are macros defined outside this view. */
  2207. .L32:
  2208. FXCPMADD f0, B1, A1, f0
  2209. FXCSMADD f1, B1, A1, f1
  2210. LFPDUX B1, BO, INC4
  2211. FXCPMADD f2, B2, A1, f2
  2212. FXCSMADD f3, B2, A1, f3
  2213. LFPDUX B2, BO2, INC4
  2214. LFPDUX A1, AO, INC4
  2215. FXCPMADD f0, B3, A2, f0
  2216. FXCSMADD f1, B3, A2, f1
  2217. LFPDUX B3, BO, INC4
  2218. FXCPMADD f2, B4, A2, f2
  2219. FXCSMADD f3, B4, A2, f3
  2220. LFPDUX B4, BO2, INC4
  2221. LFPDUX A2, AO2, INC4
  2222. FXCPMADD f0, A5, A3, f0
  2223. FXCSMADD f1, A5, A3, f1
  2224. LFPDUX A5, BO, INC4
  2225. FXCPMADD f2, A6, A3, f2
  2226. FXCSMADD f3, A6, A3, f3
  2227. LFPDUX A6, BO2, INC4
  2228. LFPDUX A3, AO, INC4
  2229. FXCPMADD f0, A7, A4, f0
  2230. FXCSMADD f1, A7, A4, f1
  2231. LFPDUX A7, BO, INC4
  2232. FXCPMADD f2, A8, A4, f2
  2233. FXCSMADD f3, A8, A4, f3
  2234. LFPDUX A8, BO2, INC4
  2235. LFPDUX A4, AO2, INC4
  2236. bdnz+ .L32
  2237. .align 4
  /* .L33: tail pass, the same four accumulation steps with no loads. */
  2238. .L33:
  2239. FXCPMADD f0, B1, A1, f0
  2240. FXCSMADD f1, B1, A1, f1
  2241. FXCPMADD f2, B2, A1, f2
  2242. FXCSMADD f3, B2, A1, f3
  2243. FXCPMADD f0, B3, A2, f0
  2244. FXCSMADD f1, B3, A2, f1
  2245. FXCPMADD f2, B4, A2, f2
  2246. FXCSMADD f3, B4, A2, f3
  2247. FXCPMADD f0, A5, A3, f0
  2248. FXCSMADD f1, A5, A3, f1
  2249. FXCPMADD f2, A6, A3, f2
  2250. FXCSMADD f3, A6, A3, f3
  2251. FXCPMADD f0, A7, A4, f0
  2252. FXCSMADD f1, A7, A4, f1
  2253. FXCPMADD f2, A8, A4, f2
  2254. FXCSMADD f3, A8, A4, f3
  2255. .align 4
  /* .L34: K remainder for the 1-row by 2-column tile. */
  /* CTR receives the low two bits of the trip count (KK for LT/RN, TEMP otherwise). */
  /* The masked value is never negative, so "ble" is taken exactly when it is zero. */
  2256. .L34:
  2257. #if defined(LT) || defined(RN)
  2258. andi. r0, KK, 3
  2259. mtspr CTR, r0
  2260. ble+ .L38
  2261. #else
  2262. andi. r0, TEMP, 3
  2263. mtspr CTR, r0
  2264. ble+ .L38
  2265. #endif
  /* The A operand is read with LFPDX (no pointer update) at AO + INC4, and AO is then */
  /* stepped separately by INC2, i.e. half the stride used for BO/BO2. */
  2266. LFPDX A1, AO, INC4
  2267. LFPDUX B1, BO, INC4
  2268. LFPDUX B2, BO2, INC4
  2269. add AO, AO, INC2
  /* With a single step to run, go directly to the tail at .L37. */
  2270. bdz- .L37
  2271. .align 4
  /* .L36: one K step per iteration, with the next step's operands loaded in between. */
  2272. .L36:
  2273. FXCPMADD f0, B1, A1, f0
  2274. FXCSMADD f1, B1, A1, f1
  2275. LFPDUX B1, BO, INC4
  2276. FXCPMADD f2, B2, A1, f2
  2277. FXCSMADD f3, B2, A1, f3
  2278. LFPDX A1, AO, INC4
  2279. LFPDUX B2, BO2, INC4
  2280. add AO, AO, INC2
  2281. bdnz+ .L36
  2282. .align 4
  /* .L37: tail step, identical arithmetic with no loads. */
  2283. .L37:
  2284. FXCPMADD f0, B1, A1, f0
  2285. FXCSMADD f1, B1, A1, f1
  2286. FXCPMADD f2, B2, A1, f2
  2287. FXCSMADD f3, B2, A1, f3
  2288. .align 4
  /* .L38: finish the 1-row by 2-column tile: fold partial sums, subtract from the */
  /* right-hand side, run the variant-specific solve, store, advance. */
  /* After the fold the totals are in f0 (first column) and f2 (second column). */
  2289. .L38:
  2290. fpadd f0, f0, f1
  2291. fpadd f2, f2, f3
  /* LN/RT: recompute AO and BO from AORIG and B.  The index is KK-1 for LN and KK-2 */
  /* otherwise, shifted left by 0 + ZBASE_SHIFT for A and by 1 + ZBASE_SHIFT for B. */
  2292. #if defined(LN) || defined(RT)
  2293. #ifdef LN
  2294. subi r0, KK, 1
  2295. #else
  2296. subi r0, KK, 2
  2297. #endif
  2298. slwi TEMP, r0, 0 + ZBASE_SHIFT
  2299. slwi r0, r0, 1 + ZBASE_SHIFT
  2300. add AO, AORIG, TEMP
  2301. add BO, B, r0
  2302. addi BO, BO, - 4 * SIZE
  2303. #endif
  /* The secondary pointers are re-derived in every variant. */
  2304. addi AO2, AO, 2 * SIZE
  2305. addi BO2, BO, 2 * SIZE
  /* Load the two right-hand-side values with LFPDX (address = pointer + INC4, pointer */
  /* unchanged): from BO/BO2 for LN/LT, from AO/AO2 otherwise. */
  2306. #if defined(LN) || defined(LT)
  2307. LFPDX f16, BO, INC4
  2308. LFPDX f17, BO2, INC4
  2309. #else
  2310. LFPDX f16, AO, INC4
  2311. LFPDX f17, AO2, INC4
  2312. #endif
  /* Right-hand side minus accumulated products. */
  2313. fpsub f0, f16, f0
  2314. fpsub f2, f17, f2
  /* LN and LT are the same here: a single coefficient A1 multiplies both values */
  /* (fxpmul + FXCXNPMA; FXCXNPMA is a macro from outside this view, and A1 is */
  /* presumably a pre-inverted diagonal entry - TODO confirm). */
  2315. #ifdef LN
  2316. LFPDX A1, AO, INC4
  2317. fxpmul f4, A1, f0
  2318. fxpmul f5, A1, f2
  2319. FXCXNPMA f0, A1, f0, f4
  2320. FXCXNPMA f2, A1, f2, f5
  2321. #endif
  2322. #ifdef LT
  2323. LFPDX A1, AO, INC4
  2324. fxpmul f4, A1, f0
  2325. fxpmul f5, A1, f2
  2326. FXCXNPMA f0, A1, f0, f4
  2327. FXCXNPMA f2, A1, f2, f5
  2328. #endif
  /* RN: three coefficients from BO/BO2 (one BO position skipped); scale f0 with A1, */
  /* subtract A2 times f0 from f2, then scale f2 with A3. */
  2329. #ifdef RN
  2330. LFPDUX A1, BO, INC4
  2331. LFPDUX A2, BO2, INC4
  2332. add BO, BO, INC4
  2333. LFPDUX A3, BO2, INC4
  2334. subi BO, BO, 8 * SIZE
  2335. subi BO2, BO2, 8 * SIZE
  2336. fxpmul f4, A1, f0
  2337. FXCXNPMA f0, A1, f0, f4
  2338. fxcpnmsub f2, A2, f0, f2
  2339. FXCXNSMA f2, A2, f0, f2
  2340. fxpmul f4, A3, f2
  2341. FXCXNPMA f2, A3, f2, f4
  2342. #endif
  /* RT: three coefficients from BO/BO2 (one BO2 position skipped); scale f2 with A3, */
  /* subtract A2 times f2 from f0, then scale f0 with A1. */
  2343. #ifdef RT
  2344. LFPDUX A1, BO, INC4
  2345. add BO2, BO2, INC4
  2346. LFPDUX A2, BO, INC4
  2347. LFPDUX A3, BO2, INC4
  2348. subi BO, BO, 8 * SIZE
  2349. subi BO2, BO2, 8 * SIZE
  2350. fxpmul f4, A3, f2
  2351. FXCXNPMA f2, A3, f2, f4
  2352. fxcpnmsub f0, A2, f2, f0
  2353. FXCXNSMA f0, A2, f2, f0
  2354. fxpmul f4, A1, f0
  2355. FXCXNPMA f0, A1, f0, f4
  2356. #endif
  /* LN: CO1/CO2 are moved back by 2 * SIZE before the stores. */
  2357. #ifdef LN
  2358. subi CO1, CO1, 2 * SIZE
  2359. subi CO2, CO2, 2 * SIZE
  2360. #endif
  /* Store the solved values back into the packed buffer with STFPDX (no pointer */
  /* update): BO/BO2 for LN/LT, AO/AO2 otherwise. */
  2361. #if defined(LN) || defined(LT)
  2362. STFPDX f0, BO, INC4
  2363. STFPDX f2, BO2, INC4
  2364. #else
  2365. STFPDX f0, AO, INC4
  2366. STFPDX f2, AO2, INC4
  2367. #endif
  /* Store to C: primary half (STFDUX) then secondary half (STFSDUX); f0 through CO1, */
  /* f2 through CO2. */
  2368. STFDUX f0, CO1, INC
  2369. STFSDUX f0, CO1, INC
  2370. STFDUX f2, CO2, INC
  2371. STFSDUX f2, CO2, INC
  2372. #ifdef LN
  2373. subi CO1, CO1, 2 * SIZE
  2374. subi CO2, CO2, 2 * SIZE
  2375. #endif
  /* RT: AORIG advances by K shifted left by 0 + ZBASE_SHIFT. */
  2376. #ifdef RT
  2377. slwi r0, K, 0 + ZBASE_SHIFT
  2378. add AORIG, AORIG, r0
  2379. #endif
  /* LT/RN: skip the remaining K - KK steps: AO by the count shifted left by */
  /* 0 + ZBASE_SHIFT, BO by the count shifted left by 1 + ZBASE_SHIFT. */
  2380. #if defined(LT) || defined(RN)
  2381. sub TEMP, K, KK
  2382. slwi r0, TEMP, 0 + ZBASE_SHIFT
  2383. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  2384. add AO, AO, r0
  2385. add BO, BO, TEMP
  2386. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN. */
  2387. #ifdef LT
  2388. addi KK, KK, 1
  2389. #endif
  2390. #ifdef LN
  2391. subi KK, KK, 1
  2392. #endif
  /* Reload f0 from the stack slot at offset FZERO (presumably a stored zero). */
  2393. li r0, FZERO
  2394. lfpsx f0, SP, r0
  2395. .align 4
  /* .L49: end of one iteration of the J loop (one 2-column panel finished). */
  /* LN: B advances by K shifted left by 1 + ZBASE_SHIFT. */
  2396. .L49:
  2397. #ifdef LN
  2398. slwi r0, K, 1 + ZBASE_SHIFT
  2399. add B, B, r0
  2400. #endif
  /* LT/RN: B is taken from the running pointer BO; the 4 * SIZE added here cancels */
  /* the - 4 * SIZE bias BO is given when a tile is set up. */
  2401. #if defined(LT) || defined(RN)
  2402. addi B, BO, 4 * SIZE
  2403. #endif
  /* KK moves by the panel width: +2 for RN, -2 for RT. */
  2404. #ifdef RN
  2405. addi KK, KK, 2
  2406. #endif
  2407. #ifdef RT
  2408. subi KK, KK, 2
  2409. #endif
  /* Decrement J and start the next panel at .L10 while J is still positive. */
  2410. addic. J, J, -1
  2411. bgt+ .L10
  2412. .align 4
  /* .L999: common exit.  Reloads r14-r31 and f14-f31 from the stack frame and returns. */
  /* The matching stores are presumably in the prologue, which is outside this view. */
  /* Step SP over the first 20 bytes of the frame (their use is not visible here). */
  2413. .L999:
  2414. addi SP, SP, 20
  /* lwzu adds 4 to SP and then loads from the new address: 18 consecutive words. */
  2415. lwzu r14, 4(SP)
  2416. lwzu r15, 4(SP)
  2417. lwzu r16, 4(SP)
  2418. lwzu r17, 4(SP)
  2419. lwzu r18, 4(SP)
  2420. lwzu r19, 4(SP)
  2421. lwzu r20, 4(SP)
  2422. lwzu r21, 4(SP)
  2423. lwzu r22, 4(SP)
  2424. lwzu r23, 4(SP)
  2425. lwzu r24, 4(SP)
  2426. lwzu r25, 4(SP)
  2427. lwzu r26, 4(SP)
  2428. lwzu r27, 4(SP)
  2429. lwzu r28, 4(SP)
  2430. lwzu r29, 4(SP)
  2431. lwzu r30, 4(SP)
  2432. lwzu r31, 4(SP)
  /* Re-position SP for the 16-byte register pairs; with r0 = 16 each lfpdux adds 16 */
  /* to SP and loads one pair, from f31 down to f14. */
  2433. subi SP, SP, 12
  2434. li r0, 16
  2435. lfpdux f31, SP, r0
  2436. lfpdux f30, SP, r0
  2437. lfpdux f29, SP, r0
  2438. lfpdux f28, SP, r0
  2439. lfpdux f27, SP, r0
  2440. lfpdux f26, SP, r0
  2441. lfpdux f25, SP, r0
  2442. lfpdux f24, SP, r0
  2443. lfpdux f23, SP, r0
  2444. lfpdux f22, SP, r0
  2445. lfpdux f21, SP, r0
  2446. lfpdux f20, SP, r0
  2447. lfpdux f19, SP, r0
  2448. lfpdux f18, SP, r0
  2449. lfpdux f17, SP, r0
  2450. lfpdux f16, SP, r0
  2451. lfpdux f15, SP, r0
  2452. lfpdux f14, SP, r0
  /* Step past the last pair and return to the caller. */
  2453. addi SP, SP, 16
  2454. blr
  2455. .align 4
  2456. EPILOGUE
  2457. #endif