You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_hummer_LT.S 51 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ------------------------------------------------------------------ */
  /* Build-time configuration for this kernel: symbolic names for the   */
  /* integer and floating-point registers used below, two stack-offset  */
  /* constants, and the choice of multiply-add mnemonics, which depends */
  /* on whether CONJ and LN/LT are defined.                             */
  /* ------------------------------------------------------------------ */
  /* ASSEMBLER is defined before common.h is included; how common.h     */
  /* reacts to it is not visible in this file.                          */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Discard any earlier ZERO definition; it is re-bound to r31 below. */
  40. #undef ZERO
  /* Byte offsets relative to SP after the prologue has run.  FZERO (16) */
  /* addresses the two zero words the prologue pushes just before f2 and */
  /* f1; the kernel reloads f0 from SP + FZERO to clear the accumulators.*/
  /* ALPHA is not referenced in the visible part of the file; offset 0   */
  /* is where the prologue leaves f1 -- presumably the alpha scalar,     */
  /* TODO confirm.                                                       */
  41. #define ALPHA 0
  42. #define FZERO 16
  /* Incoming size arguments; the prologue exits early if any is <= 0. */
  43. #define M r3
  44. #define N r4
  45. #define K r5
  /* Argument registers for the matrix pointers, the C leading dimension */
  /* and the offset.  They are only defined for linux / __FreeBSD__; no  */
  /* fallback for other platforms is visible here.                       */
  46. #if defined(linux) || defined(__FreeBSD__)
  47. #define A r6
  48. #define B r7
  49. #define C r8
  50. #define LDC r9
  51. #define OFFSET r10
  52. #endif
  /* TEMP: scratch register.  AORIG: base of the A panel, used by the   */
  /* LN / RT code paths.  KK: running offset counter seeded from OFFSET. */
  53. #define TEMP r11
  54. #define AORIG r12
  55. #define KK r14
  /* Constant byte strides, loaded once in the prologue:                */
  /* INCM1/3/5/7 = -1/-3/-5/-7 * SIZE, INC = 1 * SIZE, INC2 = 2 * SIZE,  */
  /* INC4 = 4 * SIZE.                                                    */
  56. #define INCM1 r15
  57. #define INCM3 r16
  58. #define INCM5 r17
  59. #define INCM7 r18
  60. #define INC2 r19
  61. #define INC r20
  62. #define INC4 r21
  /* Loop counters: I is derived from M, J from N. */
  63. #define I r22
  64. #define J r23
  /* Walking pointers into the A and B panels.  AO2 / BO2 run 2 * SIZE  */
  /* ahead of AO / BO so paired loads can interleave with stride INC4.   */
  65. #define AO r24
  66. #define BO r25
  67. #define AO2 r26
  68. #define BO2 r27
  /* Output pointers for the two C columns in flight (CO2 = CO1 + LDC). */
  69. #define CO1 r28
  70. #define CO2 r29
  71. #define ZERO r31
  /* Everything from here on is skipped when NEEDPARAM is defined; the  */
  /* matching #endif lies beyond the portion of the file shown here.     */
  72. #ifndef NEEDPARAM
  /* Floating-point register aliases: A1..A10 hold values loaded through */
  /* AO / AO2, B1..B6 values loaded through BO / BO2 (a few of the loops */
  /* reuse A9 / A10 for B-side loads).                                   */
  73. #define A1 f16
  74. #define A2 f17
  75. #define A3 f18
  76. #define A4 f19
  77. #define A5 f20
  78. #define A6 f21
  79. #define A7 f22
  80. #define A8 f23
  81. #define A9 f24
  82. #define A10 f25
  83. #define B1 f26
  84. #define B2 f27
  85. #define B3 f28
  86. #define B4 f29
  87. #define B5 f30
  88. #define B6 f31
  /* AP aliases B6 (f31); it is not referenced in the visible code. */
  89. #define AP B6
  /* Multiply-add pair used by the inner accumulation loops.  The       */
  /* mnemonics are chosen by CONJ and, when CONJ is set, by whether this */
  /* is a left-side (LN / LT) or right-side build.                       */
  /* NOTE(review): presumably this moves the conjugation sign between    */
  /* the A and B operands -- confirm against the FP2 instruction         */
  /* reference.                                                          */
  90. #ifndef CONJ
  91. #define FXCPMADD fxcpmadd
  92. #define FXCSMADD fxcxnpma
  93. #else
  94. #if defined(LN) || defined(LT)
  95. #define FXCPMADD fxcpnsma
  96. #define FXCSMADD fxcxma
  97. #else
  98. #define FXCPMADD fxcpmadd
  99. #define FXCSMADD fxcxnsma
  100. #endif
  101. #endif
  /* Cross multiply-add pair used by the triangular-solve step after the */
  /* accumulation loops; the two mnemonics are swapped when CONJ is      */
  /* defined.                                                            */
  102. #ifndef CONJ
  103. #define FXCXNPMA fxcxnpma
  104. #define FXCXNSMA fxcxnsma
  105. #else
  106. #define FXCXNPMA fxcxnsma
  107. #define FXCXNSMA fxcxnpma
  108. #endif
  109. PROLOGUE
  110. PROFCODE
  111. li r0, -16
  112. stfpdux f14, SP, r0
  113. stfpdux f15, SP, r0
  114. stfpdux f16, SP, r0
  115. stfpdux f17, SP, r0
  116. stfpdux f18, SP, r0
  117. stfpdux f19, SP, r0
  118. stfpdux f20, SP, r0
  119. stfpdux f21, SP, r0
  120. stfpdux f22, SP, r0
  121. stfpdux f23, SP, r0
  122. stfpdux f24, SP, r0
  123. stfpdux f25, SP, r0
  124. stfpdux f26, SP, r0
  125. stfpdux f27, SP, r0
  126. stfpdux f28, SP, r0
  127. stfpdux f29, SP, r0
  128. stfpdux f30, SP, r0
  129. stfpdux f31, SP, r0
  130. stwu r31, -4(SP)
  131. stwu r30, -4(SP)
  132. stwu r29, -4(SP)
  133. stwu r28, -4(SP)
  134. stwu r27, -4(SP)
  135. stwu r26, -4(SP)
  136. stwu r25, -4(SP)
  137. stwu r24, -4(SP)
  138. stwu r23, -4(SP)
  139. stwu r22, -4(SP)
  140. stwu r21, -4(SP)
  141. stwu r20, -4(SP)
  142. stwu r19, -4(SP)
  143. stwu r18, -4(SP)
  144. stwu r17, -4(SP)
  145. stwu r16, -4(SP)
  146. stwu r15, -4(SP)
  147. stwu r14, -4(SP)
  148. li r0, 0
  149. stwu r0, -4(SP)
  150. stwu r0, -4(SP)
  151. stfdu f2, -8(SP)
  152. stfdu f1, -8(SP)
  153. slwi LDC, LDC, ZBASE_SHIFT
  154. cmpwi cr0, M, 0
  155. ble .L999
  156. cmpwi cr0, N, 0
  157. ble .L999
  158. cmpwi cr0, K, 0
  159. ble .L999
  160. li INC, 1 * SIZE
  161. li INC2, 2 * SIZE
  162. li INC4, 4 * SIZE
  163. li INCM1, -1 * SIZE
  164. li INCM3, -3 * SIZE
  165. li INCM5, -5 * SIZE
  166. li INCM7, -7 * SIZE
  167. addi C, C, - 1 * SIZE
  168. #ifdef LN
  169. mullw r0, M, K
  170. slwi r0, r0, ZBASE_SHIFT
  171. add A, A, r0
  172. slwi r0, M, ZBASE_SHIFT
  173. add C, C, r0
  174. #endif
  175. #ifdef RN
  176. neg KK, OFFSET
  177. #endif
  178. #ifdef RT
  179. mullw r0, N, K
  180. slwi r0, r0, ZBASE_SHIFT
  181. add B, B, r0
  182. mullw r0, N, LDC
  183. add C, C, r0
  184. sub KK, N, OFFSET
  185. #endif
  186. srawi. J, N, 1
  187. ble .L50
  188. .align 4
  189. .L10:
  190. #ifdef RT
  191. slwi r0, K, 1 + ZBASE_SHIFT
  192. sub B, B, r0
  193. slwi r0, LDC, 1
  194. sub C, C, r0
  195. #endif
  196. mr CO1, C
  197. add CO2, C, LDC
  198. #ifdef LN
  199. add KK, M, OFFSET
  200. #endif
  201. #ifdef LT
  202. mr KK, OFFSET
  203. #endif
  204. #if defined(LN) || defined(RT)
  205. addi AORIG, A, -4 * SIZE
  206. #else
  207. addi AO, A, -4 * SIZE
  208. #endif
  209. #ifndef RT
  210. add C, CO2, LDC
  211. #endif
  212. li r0, FZERO
  213. lfpsx f0, SP, r0
  214. srawi. I, M, 2
  215. ble .L20
  216. .align 4
  217. .L11:
  218. #if defined(LT) || defined(RN)
  219. addi AO2, AO, 2 * SIZE
  220. fpmr f4, f0
  221. addi BO, B, - 4 * SIZE
  222. fpmr f8, f0
  223. addi BO2, B, - 2 * SIZE
  224. fpmr f12, f0
  225. fpmr f5, f0
  226. fpmr f9, f0
  227. fpmr f13, f0
  228. fpmr f2, f0
  229. fpmr f6, f0
  230. fpmr f10, f0
  231. fpmr f14, f0
  232. fpmr f3, f0
  233. fpmr f7, f0
  234. fpmr f11, f0
  235. fpmr f15, f0
  236. srawi. r0, KK, 2
  237. fpmr f1, f0
  238. mtspr CTR, r0
  239. ble .L14
  240. #else
  241. #ifdef LN
  242. slwi r0, K, 2 + ZBASE_SHIFT
  243. sub AORIG, AORIG, r0
  244. #endif
  245. slwi r0 , KK, 2 + ZBASE_SHIFT
  246. slwi TEMP, KK, 1 + ZBASE_SHIFT
  247. add AO, AORIG, r0
  248. add BO, B, TEMP
  249. sub TEMP, K, KK
  250. fpmr f5, f0
  251. fpmr f9, f0
  252. fpmr f13, f0
  253. fpmr f2, f0
  254. fpmr f6, f0
  255. fpmr f10, f0
  256. fpmr f14, f0
  257. fpmr f3, f0
  258. fpmr f7, f0
  259. fpmr f11, f0
  260. fpmr f15, f0
  261. addi AO2, AO, 2 * SIZE
  262. fpmr f4, f0
  263. addi BO, BO, - 4 * SIZE
  264. fpmr f8, f0
  265. addi BO2, BO, 2 * SIZE
  266. fpmr f12, f0
  267. srawi. r0, TEMP, 2
  268. fpmr f1, f0
  269. mtspr CTR, r0
  270. ble .L14
  271. #endif
  272. LFPDUX A1, AO, INC4
  273. fpmr f5, f0
  274. LFPDUX A3, AO, INC4
  275. fpmr f9, f0
  276. LFPDUX B1, BO, INC4
  277. fpmr f13, f0
  278. LFPDUX A5, AO, INC4
  279. fpmr f2, f0
  280. LFPDUX A6, AO, INC4
  281. fpmr f6, f0
  282. LFPDUX B3, BO, INC4
  283. fpmr f10, f0
  284. LFPDUX A7, AO, INC4
  285. fpmr f14, f0
  286. LFPDUX A8, AO, INC4
  287. fpmr f3, f0
  288. LFPDUX B5, BO, INC4
  289. fpmr f7, f0
  290. LFPDUX A9, AO, INC4
  291. fpmr f11, f0
  292. LFPDUX A2, AO2, INC4
  293. fpmr f15, f0
  294. LFPDUX B2, BO2, INC4
  295. bdz- .L13
  296. .align 4
  297. .L12:
  298. ## 1 ##
  299. FXCPMADD f0, B1, A1, f0
  300. nop
  301. FXCSMADD f4, B1, A1, f4
  302. nop
  303. FXCPMADD f8, B2, A1, f8
  304. LFPDUX B4, BO2, INC4
  305. FXCSMADD f12, B2, A1, f12
  306. LFPDUX B6, BO, INC4
  307. FXCPMADD f1, B1, A2, f1
  308. nop
  309. FXCSMADD f5, B1, A2, f5
  310. LFPDUX A4, AO2, INC4
  311. FXCPMADD f9, B2, A2, f9
  312. LFPDUX A10, AO, INC4
  313. FXCSMADD f13, B2, A2, f13
  314. nop
  315. FXCPMADD f2, B1, A3, f2
  316. nop
  317. FXCSMADD f6, B1, A3, f6
  318. nop
  319. FXCPMADD f10, B2, A3, f10
  320. nop
  321. FXCSMADD f14, B2, A3, f14
  322. nop
  323. FXCPMADD f3, B1, A4, f3
  324. nop
  325. FXCSMADD f7, B1, A4, f7
  326. LFPDUX A2, AO2, INC4
  327. FXCPMADD f11, B2, A4, f11
  328. LFPDUX A1, AO, INC4
  329. FXCSMADD f15, B2, A4, f15
  330. nop
  331. ## 2 ##
  332. FXCPMADD f0, B3, A5, f0
  333. nop
  334. FXCSMADD f4, B3, A5, f4
  335. nop
  336. FXCPMADD f8, B4, A5, f8
  337. LFPDUX B2, BO2, INC4
  338. FXCSMADD f12, B4, A5, f12
  339. LFPDUX B1, BO, INC4
  340. FXCPMADD f1, B3, A2, f1
  341. nop
  342. FXCSMADD f5, B3, A2, f5
  343. LFPDUX A4, AO2, INC4
  344. FXCPMADD f9, B4, A2, f9
  345. LFPDUX A3, AO, INC4
  346. FXCSMADD f13, B4, A2, f13
  347. nop
  348. FXCPMADD f2, B3, A6, f2
  349. nop
  350. FXCSMADD f6, B3, A6, f6
  351. nop
  352. FXCPMADD f10, B4, A6, f10
  353. nop
  354. FXCSMADD f14, B4, A6, f14
  355. nop
  356. FXCPMADD f3, B3, A4, f3
  357. nop
  358. FXCSMADD f7, B3, A4, f7
  359. LFPDUX A2, AO2, INC4
  360. FXCPMADD f11, B4, A4, f11
  361. LFPDUX A5, AO, INC4
  362. FXCSMADD f15, B4, A4, f15
  363. nop
  364. ## 3 ##
  365. FXCPMADD f0, B5, A7, f0
  366. nop
  367. FXCSMADD f4, B5, A7, f4
  368. nop
  369. FXCPMADD f8, B2, A7, f8
  370. LFPDUX B4, BO2, INC4
  371. FXCSMADD f12, B2, A7, f12
  372. LFPDUX B3, BO, INC4
  373. FXCPMADD f1, B5, A2, f1
  374. nop
  375. FXCSMADD f5, B5, A2, f5
  376. LFPDUX A4, AO2, INC4
  377. FXCPMADD f9, B2, A2, f9
  378. LFPDUX A6, AO, INC4
  379. FXCSMADD f13, B2, A2, f13
  380. nop
  381. FXCPMADD f2, B5, A8, f2
  382. nop
  383. FXCSMADD f6, B5, A8, f6
  384. nop
  385. FXCPMADD f10, B2, A8, f10
  386. nop
  387. FXCSMADD f14, B2, A8, f14
  388. nop
  389. FXCPMADD f3, B5, A4, f3
  390. nop
  391. FXCSMADD f7, B5, A4, f7
  392. LFPDUX A2, AO2, INC4
  393. FXCPMADD f11, B2, A4, f11
  394. LFPDUX A7, AO, INC4
  395. FXCSMADD f15, B2, A4, f15
  396. nop
  397. ## 4 ##
  398. FXCPMADD f0, B6, A9, f0
  399. nop
  400. FXCSMADD f4, B6, A9, f4
  401. nop
  402. FXCPMADD f8, B4, A9, f8
  403. LFPDUX B2, BO2, INC4
  404. FXCSMADD f12, B4, A9, f12
  405. LFPDUX B5, BO, INC4
  406. FXCPMADD f1, B6, A2, f1
  407. nop
  408. FXCSMADD f5, B6, A2, f5
  409. LFPDUX A4, AO2, INC4
  410. FXCPMADD f9, B4, A2, f9
  411. LFPDUX A8, AO, INC4
  412. FXCSMADD f13, B4, A2, f13
  413. nop
  414. FXCPMADD f2, B6, A10, f2
  415. nop
  416. FXCSMADD f6, B6, A10, f6
  417. nop
  418. FXCPMADD f10, B4, A10, f10
  419. nop
  420. FXCSMADD f14, B4, A10, f14
  421. nop
  422. FXCPMADD f3, B6, A4, f3
  423. LFPDUX A2, AO2, INC4
  424. FXCSMADD f7, B6, A4, f7
  425. LFPDUX A9, AO, INC4
  426. FXCPMADD f11, B4, A4, f11
  427. nop
  428. FXCSMADD f15, B4, A4, f15
  429. bdnz+ .L12
  430. .align 4
  431. .L13:
  432. ## 1 ##
  433. FXCPMADD f0, B1, A1, f0
  434. nop
  435. FXCSMADD f4, B1, A1, f4
  436. nop
  437. FXCPMADD f8, B2, A1, f8
  438. LFPDUX B4, BO2, INC4
  439. FXCSMADD f12, B2, A1, f12
  440. LFPDUX B6, BO, INC4
  441. FXCPMADD f1, B1, A2, f1
  442. nop
  443. FXCSMADD f5, B1, A2, f5
  444. LFPDUX A4, AO2, INC4
  445. FXCPMADD f9, B2, A2, f9
  446. LFPDUX A10, AO, INC4
  447. FXCSMADD f13, B2, A2, f13
  448. nop
  449. FXCPMADD f2, B1, A3, f2
  450. nop
  451. FXCSMADD f6, B1, A3, f6
  452. nop
  453. FXCPMADD f10, B2, A3, f10
  454. nop
  455. FXCSMADD f14, B2, A3, f14
  456. nop
  457. FXCPMADD f3, B1, A4, f3
  458. nop
  459. FXCSMADD f7, B1, A4, f7
  460. LFPDUX A2, AO2, INC4
  461. FXCPMADD f11, B2, A4, f11
  462. nop
  463. FXCSMADD f15, B2, A4, f15
  464. nop
  465. ## 2 ##
  466. FXCPMADD f0, B3, A5, f0
  467. nop
  468. FXCSMADD f4, B3, A5, f4
  469. nop
  470. FXCPMADD f8, B4, A5, f8
  471. LFPDUX B2, BO2, INC4
  472. FXCSMADD f12, B4, A5, f12
  473. nop
  474. FXCPMADD f1, B3, A2, f1
  475. nop
  476. FXCSMADD f5, B3, A2, f5
  477. LFPDUX A4, AO2, INC4
  478. FXCPMADD f9, B4, A2, f9
  479. nop
  480. FXCSMADD f13, B4, A2, f13
  481. nop
  482. FXCPMADD f2, B3, A6, f2
  483. nop
  484. FXCSMADD f6, B3, A6, f6
  485. nop
  486. FXCPMADD f10, B4, A6, f10
  487. nop
  488. FXCSMADD f14, B4, A6, f14
  489. nop
  490. FXCPMADD f3, B3, A4, f3
  491. nop
  492. FXCSMADD f7, B3, A4, f7
  493. LFPDUX A2, AO2, INC4
  494. FXCPMADD f11, B4, A4, f11
  495. nop
  496. FXCSMADD f15, B4, A4, f15
  497. nop
  498. ## 3 ##
  499. FXCPMADD f0, B5, A7, f0
  500. nop
  501. FXCSMADD f4, B5, A7, f4
  502. nop
  503. FXCPMADD f8, B2, A7, f8
  504. LFPDUX B4, BO2, INC4
  505. FXCSMADD f12, B2, A7, f12
  506. nop
  507. FXCPMADD f1, B5, A2, f1
  508. nop
  509. FXCSMADD f5, B5, A2, f5
  510. LFPDUX A4, AO2, INC4
  511. FXCPMADD f9, B2, A2, f9
  512. nop
  513. FXCSMADD f13, B2, A2, f13
  514. nop
  515. FXCPMADD f2, B5, A8, f2
  516. nop
  517. FXCSMADD f6, B5, A8, f6
  518. nop
  519. FXCPMADD f10, B2, A8, f10
  520. nop
  521. FXCSMADD f14, B2, A8, f14
  522. nop
  523. FXCPMADD f3, B5, A4, f3
  524. nop
  525. FXCSMADD f7, B5, A4, f7
  526. LFPDUX A2, AO2, INC4
  527. FXCPMADD f11, B2, A4, f11
  528. nop
  529. FXCSMADD f15, B2, A4, f15
  530. nop
  531. ## 4 ##
  532. FXCPMADD f0, B6, A9, f0
  533. nop
  534. FXCSMADD f4, B6, A9, f4
  535. nop
  536. FXCPMADD f8, B4, A9, f8
  537. nop
  538. FXCSMADD f12, B4, A9, f12
  539. nop
  540. FXCPMADD f1, B6, A2, f1
  541. nop
  542. FXCSMADD f5, B6, A2, f5
  543. LFPDUX A4, AO2, INC4
  544. FXCPMADD f9, B4, A2, f9
  545. nop
  546. FXCSMADD f13, B4, A2, f13
  547. nop
  548. FXCPMADD f2, B6, A10, f2
  549. nop
  550. FXCSMADD f6, B6, A10, f6
  551. nop
  552. FXCPMADD f10, B4, A10, f10
  553. nop
  554. FXCSMADD f14, B4, A10, f14
  555. nop
  556. FXCPMADD f3, B6, A4, f3
  557. nop
  558. FXCSMADD f7, B6, A4, f7
  559. nop
  560. FXCPMADD f11, B4, A4, f11
  561. nop
  562. FXCSMADD f15, B4, A4, f15
  563. nop
  564. .align 4
  565. .L14:
  566. #if defined(LT) || defined(RN)
  567. andi. r0, KK, 3
  568. mtspr CTR, r0
  569. ble+ .L18
  570. #else
  571. andi. r0, TEMP, 3
  572. mtspr CTR, r0
  573. ble+ .L18
  574. #endif
  575. .L15:
  576. LFPDUX A2, AO, INC4
  577. LFPDUX A4, AO2, INC4
  578. LFPDUX A10, BO, INC4
  579. LFPDUX B4, BO2, INC4
  580. bdz- .L17
  581. .align 4
  582. .L16:
  583. FXCPMADD f0, A10, A2, f0
  584. FXCSMADD f4, A10, A2, f4
  585. FXCPMADD f8, B4, A2, f8
  586. FXCSMADD f12, B4, A2, f12
  587. LFPDUX A2, AO, INC4
  588. FXCPMADD f1, A10, A4, f1
  589. FXCSMADD f5, A10, A4, f5
  590. FXCPMADD f9, B4, A4, f9
  591. FXCSMADD f13, B4, A4, f13
  592. LFPDUX A4, AO2, INC4
  593. FXCPMADD f2, A10, A2, f2
  594. FXCSMADD f6, A10, A2, f6
  595. FXCPMADD f10, B4, A2, f10
  596. FXCSMADD f14, B4, A2, f14
  597. LFPDUX A2, AO, INC4
  598. FXCPMADD f3, A10, A4, f3
  599. FXCSMADD f7, A10, A4, f7
  600. LFPDUX A10, BO, INC4
  601. FXCPMADD f11, B4, A4, f11
  602. FXCSMADD f15, B4, A4, f15
  603. LFPDUX A4, AO2, INC4
  604. LFPDUX B4, BO2, INC4
  605. bdnz+ .L16
  606. .align 4
  607. .L17:
  608. FXCPMADD f0, A10, A2, f0
  609. FXCSMADD f4, A10, A2, f4
  610. FXCPMADD f8, B4, A2, f8
  611. FXCSMADD f12, B4, A2, f12
  612. LFPDUX A2, AO, INC4
  613. FXCPMADD f1, A10, A4, f1
  614. FXCSMADD f5, A10, A4, f5
  615. FXCPMADD f9, B4, A4, f9
  616. FXCSMADD f13, B4, A4, f13
  617. LFPDUX A4, AO2, INC4
  618. FXCPMADD f2, A10, A2, f2
  619. FXCSMADD f6, A10, A2, f6
  620. FXCPMADD f10, B4, A2, f10
  621. FXCSMADD f14, B4, A2, f14
  622. FXCPMADD f3, A10, A4, f3
  623. FXCSMADD f7, A10, A4, f7
  624. FXCPMADD f11, B4, A4, f11
  625. FXCSMADD f15, B4, A4, f15
  626. .align 4
  627. .L18:
  628. fpadd f0, f0, f4
  629. fpadd f8, f8, f12
  630. fpadd f1, f1, f5
  631. fpadd f9, f9, f13
  632. fpadd f2, f2, f6
  633. fpadd f10, f10, f14
  634. fpadd f3, f3, f7
  635. fpadd f11, f11, f15
  636. #if defined(LN) || defined(RT)
  637. #ifdef LN
  638. subi r0, KK, 4
  639. #else
  640. subi r0, KK, 2
  641. #endif
  642. slwi TEMP, r0, 2 + ZBASE_SHIFT
  643. slwi r0, r0, 1 + ZBASE_SHIFT
  644. add AO, AORIG, TEMP
  645. add BO, B, r0
  646. addi AO2, AO, 2 * SIZE
  647. addi BO, BO, - 4 * SIZE
  648. addi BO2, BO, 2 * SIZE
  649. #endif
  650. #if defined(LN) || defined(LT)
  651. LFPDUX f16, BO, INC4
  652. LFPDUX f20, BO2, INC4
  653. LFPDUX f17, BO, INC4
  654. LFPDUX f21, BO2, INC4
  655. LFPDUX f18, BO, INC4
  656. LFPDUX f22, BO2, INC4
  657. LFPDUX f19, BO, INC4
  658. LFPDUX f23, BO2, INC4
  659. subi BO, BO, 16 * SIZE
  660. subi BO2, BO2, 16 * SIZE
  661. #else
  662. LFPDUX f16, AO, INC4
  663. LFPDUX f17, AO2, INC4
  664. LFPDUX f18, AO, INC4
  665. LFPDUX f19, AO2, INC4
  666. LFPDUX f20, AO, INC4
  667. LFPDUX f21, AO2, INC4
  668. LFPDUX f22, AO, INC4
  669. LFPDUX f23, AO2, INC4
  670. subi AO, AO, 16 * SIZE
  671. subi AO2, AO2, 16 * SIZE
  672. #endif
  673. fpsub f0, f16, f0
  674. fpsub f1, f17, f1
  675. fpsub f2, f18, f2
  676. fpsub f3, f19, f3
  677. fpsub f8, f20, f8
  678. fpsub f9, f21, f9
  679. fpsub f10, f22, f10
  680. fpsub f11, f23, f11
  681. #ifdef LN
  682. LFPDUX A1, AO, INC4
  683. add AO2, AO2, INC4
  684. add AO, AO, INC4
  685. add AO2, AO2, INC4
  686. LFPDUX A2, AO, INC4
  687. LFPDUX A3, AO2, INC4
  688. add AO, AO, INC4
  689. add AO2, AO2, INC4
  690. LFPDUX A4, AO, INC4
  691. LFPDUX A5, AO2, INC4
  692. LFPDUX A6, AO, INC4
  693. add AO2, AO2, INC4
  694. LFPDUX A7, AO, INC4
  695. LFPDUX A8, AO2, INC4
  696. LFPDUX A9, AO, INC4
  697. LFPDUX A10, AO2, INC4
  698. subi AO, AO, 32 * SIZE
  699. subi AO2, AO2, 32 * SIZE
  700. fxpmul f4, A10, f3
  701. fxpmul f5, A10, f11
  702. FXCXNPMA f3, A10, f3, f4
  703. FXCXNPMA f11, A10, f11, f5
  704. fxcpnmsub f2, A9, f3, f2
  705. fxcpnmsub f10, A9, f11, f10
  706. FXCXNSMA f2, A9, f3, f2
  707. FXCXNSMA f10, A9, f11, f10
  708. fxcpnmsub f1, A8, f3, f1
  709. fxcpnmsub f9, A8, f11, f9
  710. FXCXNSMA f1, A8, f3, f1
  711. FXCXNSMA f9, A8, f11, f9
  712. fxcpnmsub f0, A7, f3, f0
  713. fxcpnmsub f8, A7, f11, f8
  714. FXCXNSMA f0, A7, f3, f0
  715. FXCXNSMA f8, A7, f11, f8
  716. fxpmul f4, A6, f2
  717. fxpmul f5, A6, f10
  718. FXCXNPMA f2, A6, f2, f4
  719. FXCXNPMA f10, A6, f10, f5
  720. fxcpnmsub f1, A5, f2, f1
  721. fxcpnmsub f9, A5, f10, f9
  722. FXCXNSMA f1, A5, f2, f1
  723. FXCXNSMA f9, A5, f10, f9
  724. fxcpnmsub f0, A4, f2, f0
  725. fxcpnmsub f8, A4, f10, f8
  726. FXCXNSMA f0, A4, f2, f0
  727. FXCXNSMA f8, A4, f10, f8
  728. fxpmul f4, A3, f1
  729. fxpmul f5, A3, f9
  730. FXCXNPMA f1, A3, f1, f4
  731. FXCXNPMA f9, A3, f9, f5
  732. fxcpnmsub f0, A2, f1, f0
  733. fxcpnmsub f8, A2, f9, f8
  734. FXCXNSMA f0, A2, f1, f0
  735. FXCXNSMA f8, A2, f9, f8
  736. fxpmul f4, A1, f0
  737. fxpmul f5, A1, f8
  738. FXCXNPMA f0, A1, f0, f4
  739. FXCXNPMA f8, A1, f8, f5
  740. #endif
  741. #ifdef LT
  742. LFPDUX A1, AO, INC4
  743. LFPDUX A2, AO2, INC4
  744. LFPDUX A3, AO, INC4
  745. LFPDUX A4, AO2, INC4
  746. add AO, AO, INC4
  747. LFPDUX A5, AO2, INC4
  748. LFPDUX A6, AO, INC4
  749. LFPDUX A7, AO2, INC4
  750. add AO, AO, INC4
  751. add AO2, AO2, INC4
  752. LFPDUX A8, AO, INC4
  753. LFPDUX A9, AO2, INC4
  754. add AO, AO, INC4
  755. add AO2, AO2, INC4
  756. add AO, AO, INC4
  757. LFPDUX A10, AO2, INC4
  758. subi AO, AO, 32 * SIZE
  759. subi AO2, AO2, 32 * SIZE
  760. fxpmul f4, A1, f0
  761. fxpmul f5, A1, f8
  762. FXCXNPMA f0, A1, f0, f4
  763. FXCXNPMA f8, A1, f8, f5
  764. fxcpnmsub f1, A2, f0, f1
  765. fxcpnmsub f9, A2, f8, f9
  766. FXCXNSMA f1, A2, f0, f1
  767. FXCXNSMA f9, A2, f8, f9
  768. fxcpnmsub f2, A3, f0, f2
  769. fxcpnmsub f10, A3, f8, f10
  770. FXCXNSMA f2, A3, f0, f2
  771. FXCXNSMA f10, A3, f8, f10
  772. fxcpnmsub f3, A4, f0, f3
  773. fxcpnmsub f11, A4, f8, f11
  774. FXCXNSMA f3, A4, f0, f3
  775. FXCXNSMA f11, A4, f8, f11
  776. fxpmul f6, A5, f1
  777. fxpmul f7, A5, f9
  778. FXCXNPMA f1, A5, f1, f6
  779. FXCXNPMA f9, A5, f9, f7
  780. fxcpnmsub f2, A6, f1, f2
  781. fxcpnmsub f10, A6, f9, f10
  782. FXCXNSMA f2, A6, f1, f2
  783. FXCXNSMA f10, A6, f9, f10
  784. fxcpnmsub f3, A7, f1, f3
  785. fxcpnmsub f11, A7, f9, f11
  786. FXCXNSMA f3, A7, f1, f3
  787. FXCXNSMA f11, A7, f9, f11
  788. fxpmul f4, A8, f2
  789. fxpmul f5, A8, f10
  790. FXCXNPMA f2, A8, f2, f4
  791. FXCXNPMA f10, A8, f10, f5
  792. fxcpnmsub f3, A9, f2, f3
  793. fxcpnmsub f11, A9, f10, f11
  794. FXCXNSMA f3, A9, f2, f3
  795. FXCXNSMA f11, A9, f10, f11
  796. fxpmul f6, A10, f3
  797. fxpmul f7, A10, f11
  798. FXCXNPMA f3, A10, f3, f6
  799. FXCXNPMA f11, A10, f11, f7
  800. #endif
  801. #ifdef RN
  802. LFPDUX A1, BO, INC4
  803. LFPDUX A2, BO2, INC4
  804. add BO, BO, INC4
  805. LFPDUX A3, BO2, INC4
  806. subi BO, BO, 8 * SIZE
  807. subi BO2, BO2, 8 * SIZE
  808. fxpmul f4, A1, f0
  809. fxpmul f5, A1, f1
  810. fxpmul f6, A1, f2
  811. fxpmul f7, A1, f3
  812. FXCXNPMA f0, A1, f0, f4
  813. FXCXNPMA f1, A1, f1, f5
  814. FXCXNPMA f2, A1, f2, f6
  815. FXCXNPMA f3, A1, f3, f7
  816. fxcpnmsub f8, A2, f0, f8
  817. fxcpnmsub f9, A2, f1, f9
  818. fxcpnmsub f10, A2, f2, f10
  819. fxcpnmsub f11, A2, f3, f11
  820. FXCXNSMA f8, A2, f0, f8
  821. FXCXNSMA f9, A2, f1, f9
  822. FXCXNSMA f10, A2, f2, f10
  823. FXCXNSMA f11, A2, f3, f11
  824. fxpmul f4, A3, f8
  825. fxpmul f5, A3, f9
  826. fxpmul f6, A3, f10
  827. fxpmul f7, A3, f11
  828. FXCXNPMA f8, A3, f8, f4
  829. FXCXNPMA f9, A3, f9, f5
  830. FXCXNPMA f10, A3, f10, f6
  831. FXCXNPMA f11, A3, f11, f7
  832. #endif
  833. #ifdef RT
  834. LFPDUX A1, BO, INC4
  835. add BO2, BO2, INC4
  836. LFPDUX A2, BO, INC4
  837. LFPDUX A3, BO2, INC4
  838. subi BO, BO, 8 * SIZE
  839. subi BO2, BO2, 8 * SIZE
  840. fxpmul f4, A3, f8
  841. fxpmul f5, A3, f9
  842. fxpmul f6, A3, f10
  843. fxpmul f7, A3, f11
  844. FXCXNPMA f8, A3, f8, f4
  845. FXCXNPMA f9, A3, f9, f5
  846. FXCXNPMA f10, A3, f10, f6
  847. FXCXNPMA f11, A3, f11, f7
  848. fxcpnmsub f0, A2, f8, f0
  849. fxcpnmsub f1, A2, f9, f1
  850. fxcpnmsub f2, A2, f10, f2
  851. fxcpnmsub f3, A2, f11, f3
  852. FXCXNSMA f0, A2, f8, f0
  853. FXCXNSMA f1, A2, f9, f1
  854. FXCXNSMA f2, A2, f10, f2
  855. FXCXNSMA f3, A2, f11, f3
  856. fxpmul f4, A1, f0
  857. fxpmul f5, A1, f1
  858. fxpmul f6, A1, f2
  859. fxpmul f7, A1, f3
  860. FXCXNPMA f0, A1, f0, f4
  861. FXCXNPMA f1, A1, f1, f5
  862. FXCXNPMA f2, A1, f2, f6
  863. FXCXNPMA f3, A1, f3, f7
  864. #endif
  865. #ifdef LN
  866. subi CO1, CO1, 8 * SIZE
  867. subi CO2, CO2, 8 * SIZE
  868. #endif
  869. #if defined(LN) || defined(LT)
  870. STFPDUX f0, BO, INC4
  871. STFPDUX f8, BO2, INC4
  872. STFPDUX f1, BO, INC4
  873. STFPDUX f9, BO2, INC4
  874. STFPDUX f2, BO, INC4
  875. STFPDUX f10, BO2, INC4
  876. STFPDUX f3, BO, INC4
  877. STFPDUX f11, BO2, INC4
  878. subi BO, BO, 16 * SIZE
  879. subi BO2, BO2, 16 * SIZE
  880. #else
  881. STFPDUX f0, AO, INC4
  882. STFPDUX f1, AO2, INC4
  883. STFPDUX f2, AO, INC4
  884. STFPDUX f3, AO2, INC4
  885. STFPDUX f8, AO, INC4
  886. STFPDUX f9, AO2, INC4
  887. STFPDUX f10, AO, INC4
  888. STFPDUX f11, AO2, INC4
  889. subi AO, AO, 16 * SIZE
  890. subi AO2, AO2, 16 * SIZE
  891. #endif
  892. STFDUX f0, CO1, INC
  893. STFSDUX f0, CO1, INC
  894. STFDUX f1, CO1, INC
  895. STFSDUX f1, CO1, INC
  896. STFDUX f2, CO1, INC
  897. STFSDUX f2, CO1, INC
  898. STFDUX f3, CO1, INC
  899. STFSDUX f3, CO1, INC
  900. STFDUX f8, CO2, INC
  901. STFSDUX f8, CO2, INC
  902. STFDUX f9, CO2, INC
  903. STFSDUX f9, CO2, INC
  904. STFDUX f10, CO2, INC
  905. STFSDUX f10, CO2, INC
  906. STFDUX f11, CO2, INC
  907. STFSDUX f11, CO2, INC
  908. #ifdef LN
  909. subi CO1, CO1, 8 * SIZE
  910. subi CO2, CO2, 8 * SIZE
  911. #endif
  912. #ifdef RT
  913. slwi r0, K, 2 + ZBASE_SHIFT
  914. add AORIG, AORIG, r0
  915. #endif
  916. #if defined(LT) || defined(RN)
  917. sub TEMP, K, KK
  918. slwi r0, TEMP, 2 + ZBASE_SHIFT
  919. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  920. add AO, AO, r0
  921. add BO, BO, TEMP
  922. #endif
  923. #ifdef LT
  924. addi KK, KK, 4
  925. #endif
  926. #ifdef LN
  927. subi KK, KK, 4
  928. #endif
  929. addic. I, I, -1
  930. li r0, FZERO
  931. lfpsx f0, SP, r0
  932. bgt+ .L11
  933. .align 4
  934. .L20:
  935. andi. I, M, 2
  936. beq .L30
  937. #if defined(LT) || defined(RN)
  938. addi AO2, AO, 2 * SIZE
  939. fpmr f4, f0
  940. addi BO, B, - 4 * SIZE
  941. fpmr f8, f0
  942. addi BO2, B, - 2 * SIZE
  943. fpmr f12, f0
  944. srawi. r0, KK, 2
  945. fpmr f1, f0
  946. fpmr f5, f0
  947. fpmr f9, f0
  948. mtspr CTR, r0
  949. fpmr f13, f0
  950. ble .L24
  951. #else
  952. #ifdef LN
  953. slwi r0, K, 1 + ZBASE_SHIFT
  954. sub AORIG, AORIG, r0
  955. #endif
  956. slwi r0 , KK, 1 + ZBASE_SHIFT
  957. add AO, AORIG, r0
  958. add BO, B, r0
  959. sub TEMP, K, KK
  960. addi AO2, AO, 2 * SIZE
  961. fpmr f4, f0
  962. addi BO, BO, - 4 * SIZE
  963. fpmr f8, f0
  964. addi BO2, BO, 2 * SIZE
  965. fpmr f12, f0
  966. fpmr f1, f0
  967. fpmr f5, f0
  968. fpmr f9, f0
  969. fpmr f13, f0
  970. srawi. r0, TEMP, 2
  971. mtspr CTR, r0
  972. ble .L24
  973. #endif
  974. LFPDUX A1, AO, INC4
  975. LFPDUX B1, BO, INC4
  976. LFPDUX A2, AO2, INC4
  977. LFPDUX B2, BO2, INC4
  978. LFPDUX A3, AO, INC4
  979. LFPDUX B3, BO, INC4
  980. LFPDUX A4, AO2, INC4
  981. LFPDUX B4, BO2, INC4
  982. LFPDUX A5, AO, INC4
  983. LFPDUX B5, BO, INC4
  984. LFPDUX A6, AO2, INC4
  985. LFPDUX B6, BO2, INC4
  986. LFPDUX A7, AO, INC4
  987. LFPDUX A9, BO, INC4
  988. LFPDUX A10, BO2, INC4
  989. bdz- .L23
  990. .align 4
  991. .L22:
  992. FXCPMADD f0, B1, A1, f0
  993. nop
  994. FXCSMADD f4, B1, A1, f4
  995. LFPDUX A8, AO2, INC4
  996. FXCPMADD f8, B2, A1, f8
  997. nop
  998. FXCSMADD f12, B2, A1, f12
  999. LFPDUX A1, AO, INC4
  1000. FXCPMADD f1, B1, A2, f1
  1001. nop
  1002. FXCSMADD f5, B1, A2, f5
  1003. LFPDUX B1, BO, INC4
  1004. FXCPMADD f9, B2, A2, f9
  1005. nop
  1006. FXCSMADD f13, B2, A2, f13
  1007. LFPDUX B2, BO2, INC4
  1008. FXCPMADD f0, B3, A3, f0
  1009. nop
  1010. FXCSMADD f4, B3, A3, f4
  1011. LFPDUX A2, AO2, INC4
  1012. FXCPMADD f8, B4, A3, f8
  1013. nop
  1014. FXCSMADD f12, B4, A3, f12
  1015. LFPDUX A3, AO, INC4
  1016. FXCPMADD f1, B3, A4, f1
  1017. nop
  1018. FXCSMADD f5, B3, A4, f5
  1019. LFPDUX B3, BO, INC4
  1020. FXCPMADD f9, B4, A4, f9
  1021. nop
  1022. FXCSMADD f13, B4, A4, f13
  1023. LFPDUX B4, BO2, INC4
  1024. FXCPMADD f0, B5, A5, f0
  1025. nop
  1026. FXCSMADD f4, B5, A5, f4
  1027. LFPDUX A4, AO2, INC4
  1028. FXCPMADD f8, B6, A5, f8
  1029. nop
  1030. FXCSMADD f12, B6, A5, f12
  1031. LFPDUX A5, AO, INC4
  1032. FXCPMADD f1, B5, A6, f1
  1033. nop
  1034. FXCSMADD f5, B5, A6, f5
  1035. LFPDUX B5, BO, INC4
  1036. FXCPMADD f9, B6, A6, f9
  1037. nop
  1038. FXCSMADD f13, B6, A6, f13
  1039. LFPDUX B6, BO2, INC4
  1040. FXCPMADD f0, A9, A7, f0
  1041. nop
  1042. FXCSMADD f4, A9, A7, f4
  1043. LFPDUX A6, AO2, INC4
  1044. FXCPMADD f8, A10, A7, f8
  1045. nop
  1046. FXCSMADD f12, A10, A7, f12
  1047. LFPDUX A7, AO, INC4
  1048. FXCPMADD f1, A9, A8, f1
  1049. nop
  1050. FXCSMADD f5, A9, A8, f5
  1051. LFPDUX A9, BO, INC4
  1052. FXCPMADD f9, A10, A8, f9
  1053. nop
  1054. FXCSMADD f13, A10, A8, f13
  1055. LFPDUX A10, BO2, INC4
  1056. bdnz+ .L22
  1057. .align 4
  1058. .L23:
  1059. FXCPMADD f0, B1, A1, f0
  1060. FXCSMADD f4, B1, A1, f4
  1061. LFPDUX A8, AO2, INC4
  1062. FXCPMADD f8, B2, A1, f8
  1063. FXCSMADD f12, B2, A1, f12
  1064. FXCPMADD f1, B1, A2, f1
  1065. FXCSMADD f5, B1, A2, f5
  1066. FXCPMADD f9, B2, A2, f9
  1067. FXCSMADD f13, B2, A2, f13
  1068. FXCPMADD f0, B3, A3, f0
  1069. FXCSMADD f4, B3, A3, f4
  1070. FXCPMADD f8, B4, A3, f8
  1071. FXCSMADD f12, B4, A3, f12
  1072. FXCPMADD f1, B3, A4, f1
  1073. FXCSMADD f5, B3, A4, f5
  1074. FXCPMADD f9, B4, A4, f9
  1075. FXCSMADD f13, B4, A4, f13
  1076. FXCPMADD f0, B5, A5, f0
  1077. FXCSMADD f4, B5, A5, f4
  1078. FXCPMADD f8, B6, A5, f8
  1079. FXCSMADD f12, B6, A5, f12
  1080. FXCPMADD f1, B5, A6, f1
  1081. FXCSMADD f5, B5, A6, f5
  1082. FXCPMADD f9, B6, A6, f9
  1083. FXCSMADD f13, B6, A6, f13
  1084. FXCPMADD f0, A9, A7, f0
  1085. FXCSMADD f4, A9, A7, f4
  1086. FXCPMADD f8, A10, A7, f8
  1087. FXCSMADD f12, A10, A7, f12
  1088. FXCPMADD f1, A9, A8, f1
  1089. FXCSMADD f5, A9, A8, f5
  1090. FXCPMADD f9, A10, A8, f9
  1091. FXCSMADD f13, A10, A8, f13
  1092. .align 4
  1093. .L24:
  1094. #if defined(LT) || defined(RN)
  1095. andi. r0, KK, 3
  1096. mtspr CTR, r0
  1097. ble+ .L28
  1098. #else
  1099. andi. r0, TEMP, 3
  1100. mtspr CTR, r0
  1101. ble+ .L28
  1102. #endif
  1103. LFPDUX A1, AO, INC4
  1104. LFPDUX A2, AO2, INC4
  1105. LFPDUX B1, BO, INC4
  1106. LFPDUX B2, BO2, INC4
  1107. bdz- .L27
  1108. .align 4
  1109. .L26:
  1110. FXCPMADD f0, B1, A1, f0
  1111. FXCSMADD f4, B1, A1, f4
  1112. FXCPMADD f8, B2, A1, f8
  1113. FXCSMADD f12, B2, A1, f12
  1114. LFPDUX A1, AO, INC4
  1115. FXCPMADD f1, B1, A2, f1
  1116. FXCSMADD f5, B1, A2, f5
  1117. LFPDUX B1, BO, INC4
  1118. FXCPMADD f9, B2, A2, f9
  1119. FXCSMADD f13, B2, A2, f13
  1120. LFPDUX A2, AO2, INC4
  1121. LFPDUX B2, BO2, INC4
  1122. bdnz+ .L26
  1123. .align 4
  1124. .L27:
  1125. FXCPMADD f0, B1, A1, f0
  1126. FXCSMADD f4, B1, A1, f4
  1127. FXCPMADD f8, B2, A1, f8
  1128. FXCSMADD f12, B2, A1, f12
  1129. FXCPMADD f1, B1, A2, f1
  1130. FXCSMADD f5, B1, A2, f5
  1131. FXCPMADD f9, B2, A2, f9
  1132. FXCSMADD f13, B2, A2, f13
  1133. .align 4
  1134. .L28:
  1135. fpadd f0, f0, f4
  1136. fpadd f8, f8, f12
  1137. fpadd f1, f1, f5
  1138. fpadd f9, f9, f13
  1139. #if defined(LN) || defined(RT)
  1140. #ifdef LN
  1141. subi r0, KK, 2
  1142. #else
  1143. subi r0, KK, 2
  1144. #endif
  1145. slwi r0, r0, 1 + ZBASE_SHIFT
  1146. add AO, AORIG, r0
  1147. add BO, B, r0
  1148. addi AO2, AO, 2 * SIZE
  1149. addi BO, BO, - 4 * SIZE
  1150. addi BO2, BO, 2 * SIZE
  1151. #endif
  1152. #if defined(LN) || defined(LT)
  1153. LFPDUX f16, BO, INC4
  1154. LFPDUX f18, BO2, INC4
  1155. LFPDUX f17, BO, INC4
  1156. LFPDUX f19, BO2, INC4
  1157. subi BO, BO, 8 * SIZE
  1158. subi BO2, BO2, 8 * SIZE
  1159. #else
  1160. LFPDUX f16, AO, INC4
  1161. LFPDUX f17, AO2, INC4
  1162. LFPDUX f18, AO, INC4
  1163. LFPDUX f19, AO2, INC4
  1164. subi AO, AO, 8 * SIZE
  1165. subi AO2, AO2, 8 * SIZE
  1166. #endif
  1167. fpsub f0, f16, f0
  1168. fpsub f1, f17, f1
  1169. fpsub f8, f18, f8
  1170. fpsub f9, f19, f9
  1171. #ifdef LN
  1172. LFPDUX A1, AO, INC4
  1173. add AO2, AO2, INC4
  1174. LFPDUX A2, AO, INC4
  1175. LFPDUX A3, AO2, INC4
  1176. subi AO, AO, 8 * SIZE
  1177. subi AO2, AO2, 8 * SIZE
  1178. fxpmul f4, A3, f1
  1179. fxpmul f5, A3, f9
  1180. FXCXNPMA f1, A3, f1, f4
  1181. FXCXNPMA f9, A3, f9, f5
  1182. fxcpnmsub f0, A2, f1, f0
  1183. fxcpnmsub f8, A2, f9, f8
  1184. FXCXNSMA f0, A2, f1, f0
  1185. FXCXNSMA f8, A2, f9, f8
  1186. fxpmul f4, A1, f0
  1187. fxpmul f5, A1, f8
  1188. FXCXNPMA f0, A1, f0, f4
  1189. FXCXNPMA f8, A1, f8, f5
  1190. #endif
  1191. #ifdef LT
  1192. LFPDUX A1, AO, INC4
  1193. LFPDUX A2, AO2, INC4
  1194. add AO, AO, INC4
  1195. LFPDUX A3, AO2, INC4
  1196. subi AO, AO, 8 * SIZE
  1197. subi AO2, AO2, 8 * SIZE
  1198. fxpmul f4, A1, f0
  1199. fxpmul f5, A1, f8
  1200. FXCXNPMA f0, A1, f0, f4
  1201. FXCXNPMA f8, A1, f8, f5
  1202. fxcpnmsub f1, A2, f0, f1
  1203. fxcpnmsub f9, A2, f8, f9
  1204. FXCXNSMA f1, A2, f0, f1
  1205. FXCXNSMA f9, A2, f8, f9
  1206. fxpmul f6, A3, f1
  1207. fxpmul f7, A3, f9
  1208. FXCXNPMA f1, A3, f1, f6
  1209. FXCXNPMA f9, A3, f9, f7
  1210. #endif
  1211. #ifdef RN
  1212. LFPDUX A1, BO, INC4
  1213. LFPDUX A2, BO2, INC4
  1214. add BO, BO, INC4
  1215. LFPDUX A3, BO2, INC4
  1216. subi BO, BO, 8 * SIZE
  1217. subi BO2, BO2, 8 * SIZE
  1218. fxpmul f4, A1, f0
  1219. fxpmul f5, A1, f1
  1220. FXCXNPMA f0, A1, f0, f4
  1221. FXCXNPMA f1, A1, f1, f5
  1222. fxcpnmsub f8, A2, f0, f8
  1223. fxcpnmsub f9, A2, f1, f9
  1224. FXCXNSMA f8, A2, f0, f8
  1225. FXCXNSMA f9, A2, f1, f9
  1226. fxpmul f4, A3, f8
  1227. fxpmul f5, A3, f9
  1228. FXCXNPMA f8, A3, f8, f4
  1229. FXCXNPMA f9, A3, f9, f5
  1230. #endif
  1231. #ifdef RT
  1232. LFPDUX A1, BO, INC4
  1233. add BO2, BO2, INC4
  1234. LFPDUX A2, BO, INC4
  1235. LFPDUX A3, BO2, INC4
  1236. subi BO, BO, 8 * SIZE
  1237. subi BO2, BO2, 8 * SIZE
  1238. fxpmul f4, A3, f8
  1239. fxpmul f5, A3, f9
  1240. FXCXNPMA f8, A3, f8, f4
  1241. FXCXNPMA f9, A3, f9, f5
  1242. fxcpnmsub f0, A2, f8, f0
  1243. fxcpnmsub f1, A2, f9, f1
  1244. FXCXNSMA f0, A2, f8, f0
  1245. FXCXNSMA f1, A2, f9, f1
  1246. fxpmul f4, A1, f0
  1247. fxpmul f5, A1, f1
  1248. FXCXNPMA f0, A1, f0, f4
  1249. FXCXNPMA f1, A1, f1, f5
  1250. #endif
  1251. #ifdef LN
  1252. subi CO1, CO1, 4 * SIZE
  1253. subi CO2, CO2, 4 * SIZE
  1254. #endif
  1255. #if defined(LN) || defined(LT)
  1256. STFPDUX f0, BO, INC4
  1257. STFPDUX f8, BO2, INC4
  1258. STFPDUX f1, BO, INC4
  1259. STFPDUX f9, BO2, INC4
  1260. subi BO, BO, 8 * SIZE
  1261. subi BO2, BO2, 8 * SIZE
  1262. #else
  1263. STFPDUX f0, AO, INC4
  1264. STFPDUX f1, AO2, INC4
  1265. STFPDUX f8, AO, INC4
  1266. STFPDUX f9, AO2, INC4
  1267. subi AO, AO, 8 * SIZE
  1268. subi AO2, AO2, 8 * SIZE
  1269. #endif
  1270. STFDUX f0, CO1, INC
  1271. STFSDUX f0, CO1, INC
  1272. STFDUX f1, CO1, INC
  1273. STFSDUX f1, CO1, INC
  1274. STFDUX f8, CO2, INC
  1275. STFSDUX f8, CO2, INC
  1276. STFDUX f9, CO2, INC
  1277. STFSDUX f9, CO2, INC
  1278. #ifdef LN
  1279. subi CO1, CO1, 4 * SIZE
  1280. subi CO2, CO2, 4 * SIZE
  1281. #endif
  1282. #ifdef RT
  1283. slwi r0, K, 1 + ZBASE_SHIFT
  1284. add AORIG, AORIG, r0
  1285. #endif
  1286. #if defined(LT) || defined(RN)
  1287. sub TEMP, K, KK
  1288. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1289. add AO, AO, r0
  1290. add BO, BO, r0
  1291. #endif
  1292. #ifdef LT
  1293. addi KK, KK, 2
  1294. #endif
  1295. #ifdef LN
  1296. subi KK, KK, 2
  1297. #endif
  1298. li r0, FZERO
  1299. lfpsx f0, SP, r0
  1300. .align 4
  /* .L30: single leftover row (M & 1) of the panel whose results are written */
  /* through both CO1 and CO2. This part sets up the A/B read pointers,       */
  /* clears the accumulators and preloads operands for the k loop, which is   */
  /* unrolled four times.                                                     */
  1301. .L30:
  1302. andi. I, M, 1
  1303. beq .L49
  /* LT/RN: AO is not recomputed (only AO2 = AO + 2 * SIZE is derived); BO and */
  /* BO2 start below B because the loads that follow are update-form.         */
  /* f1-f3 are cleared by copying f0, which is expected to still hold the     */
  /* value loaded from the FZERO stack slot. Unrolled trip count = KK >> 2.   */
  1304. #if defined(LT) || defined(RN)
  1305. addi AO2, AO, 2 * SIZE
  1306. fpmr f1, f0
  1307. addi BO, B, - 4 * SIZE
  1308. fpmr f2, f0
  1309. addi BO2, B, - 2 * SIZE
  1310. fpmr f3, f0
  1311. srawi. r0, KK, 2
  1312. mtspr CTR, r0
  /* CR0 was set by srawi.; skip the unrolled loop when the count is <= 0. */
  1313. ble .L34
  1314. #else
  /* LN only: step AORIG back by K << (0 + ZBASE_SHIFT) bytes before use. */
  1315. #ifdef LN
  1316. slwi r0, K, 0 + ZBASE_SHIFT
  1317. sub AORIG, AORIG, r0
  1318. #endif
  /* LN/RT: AO = AORIG + (KK << (0 + ZBASE_SHIFT)),                         */
  /*        BO = B + (KK << (1 + ZBASE_SHIFT)) - 4 * SIZE,                  */
  /*        TEMP = K - KK is the k extent; unrolled trip count = TEMP >> 2. */
  1319. slwi r0 , KK, 0 + ZBASE_SHIFT
  1320. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1321. add AO, AORIG, r0
  1322. add BO, B, TEMP
  1323. sub TEMP, K, KK
  1324. addi AO2, AO, 2 * SIZE
  1325. fpmr f1, f0
  1326. addi BO, BO, - 4 * SIZE
  1327. fpmr f2, f0
  1328. addi BO2, BO, 2 * SIZE
  1329. fpmr f3, f0
  1330. srawi. r0, TEMP, 2
  1331. mtspr CTR, r0
  1332. ble .L34
  1333. #endif
  /* Preload one full unrolled pass: A1-A4 from AO/AO2 (alternating) and    */
  /* eight B-side values from BO/BO2 (alternating). Registers A5-A8 are     */
  /* loaded from BO/BO2 here, i.e. they serve as additional B operands.     */
  1334. LFPDUX A1, AO, INC4
  1335. LFPDUX B1, BO, INC4
  1336. LFPDUX B2, BO2, INC4
  1337. LFPDUX A2, AO2, INC4
  1338. LFPDUX B3, BO, INC4
  1339. LFPDUX B4, BO2, INC4
  1340. LFPDUX A3, AO, INC4
  1341. LFPDUX A5, BO, INC4
  1342. LFPDUX A6, BO2, INC4
  1343. LFPDUX A4, AO2, INC4
  1344. LFPDUX A7, BO, INC4
  1345. LFPDUX A8, BO2, INC4
  /* Decrement CTR; if that was the only pass, go straight to the drain. */
  1346. bdz- .L33
  1347. .align 4
  /* .L32: steady-state pass covering four k-steps. Each step accumulates   */
  /* into f0/f1 using the register fed from BO and into f2/f3 using the     */
  /* register fed from BO2, then reloads the operands it has just consumed  */
  /* so they are ready for the next pass. The upper-case FXC* mnemonics are */
  /* not defined in this part of the file (presumably macros set up earlier */
  /* - confirm).                                                            */
  1348. .L32:
  /* k-step 0: A1 with B1 / B2 */
  1349. FXCPMADD f0, B1, A1, f0
  1350. FXCSMADD f1, B1, A1, f1
  1351. LFPDUX B1, BO, INC4
  1352. FXCPMADD f2, B2, A1, f2
  1353. FXCSMADD f3, B2, A1, f3
  1354. LFPDUX B2, BO2, INC4
  1355. LFPDUX A1, AO, INC4
  /* k-step 1: A2 with B3 / B4 */
  1356. FXCPMADD f0, B3, A2, f0
  1357. FXCSMADD f1, B3, A2, f1
  1358. LFPDUX B3, BO, INC4
  1359. FXCPMADD f2, B4, A2, f2
  1360. FXCSMADD f3, B4, A2, f3
  1361. LFPDUX B4, BO2, INC4
  1362. LFPDUX A2, AO2, INC4
  /* k-step 2: A3 with A5 / A6 (B-side values held in A registers) */
  1363. FXCPMADD f0, A5, A3, f0
  1364. FXCSMADD f1, A5, A3, f1
  1365. LFPDUX A5, BO, INC4
  1366. FXCPMADD f2, A6, A3, f2
  1367. FXCSMADD f3, A6, A3, f3
  1368. LFPDUX A6, BO2, INC4
  1369. LFPDUX A3, AO, INC4
  /* k-step 3: A4 with A7 / A8 */
  1370. FXCPMADD f0, A7, A4, f0
  1371. FXCSMADD f1, A7, A4, f1
  1372. LFPDUX A7, BO, INC4
  1373. FXCPMADD f2, A8, A4, f2
  1374. FXCSMADD f3, A8, A4, f3
  1375. LFPDUX A8, BO2, INC4
  1376. LFPDUX A4, AO2, INC4
  1377. bdnz+ .L32
  1378. .align 4
  /* .L33: drain. Same sixteen multiply-adds as one pass of .L32, using the */
  /* operands already in registers and issuing no further loads. Also the   */
  /* direct target of the bdz- that precedes .L32.                          */
  1379. .L33:
  1380. FXCPMADD f0, B1, A1, f0
  1381. FXCSMADD f1, B1, A1, f1
  1382. FXCPMADD f2, B2, A1, f2
  1383. FXCSMADD f3, B2, A1, f3
  1384. FXCPMADD f0, B3, A2, f0
  1385. FXCSMADD f1, B3, A2, f1
  1386. FXCPMADD f2, B4, A2, f2
  1387. FXCSMADD f3, B4, A2, f3
  1388. FXCPMADD f0, A5, A3, f0
  1389. FXCSMADD f1, A5, A3, f1
  1390. FXCPMADD f2, A6, A3, f2
  1391. FXCSMADD f3, A6, A3, f3
  1392. FXCPMADD f0, A7, A4, f0
  1393. FXCSMADD f1, A7, A4, f1
  1394. FXCPMADD f2, A8, A4, f2
  1395. FXCSMADD f3, A8, A4, f3
  1396. .align 4
  /* .L34: k-steps left over after the 4x-unrolled loop (count & 3, where    */
  /* count is KK for LT/RN and TEMP otherwise). andi. with mask 3 can only   */
  /* yield 0..3, so the ble+ is taken exactly when nothing is left.          */
  1397. .L34:
  1398. #if defined(LT) || defined(RN)
  1399. andi. r0, KK, 3
  1400. mtspr CTR, r0
  1401. ble+ .L38
  1402. #else
  1403. andi. r0, TEMP, 3
  1404. mtspr CTR, r0
  1405. ble+ .L38
  1406. #endif
  /* A is read with a non-update load at AO + INC4 and AO is then advanced   */
  /* separately by INC2; BO/BO2 use update-form loads with INC4.             */
  1407. LFPDX A1, AO, INC4
  1408. LFPDUX B1, BO, INC4
  1409. LFPDUX B2, BO2, INC4
  1410. add AO, AO, INC2
  1411. bdz- .L37
  1412. .align 4
  /* .L36: one k-step per pass, reloading A1/B1/B2 for the next pass. */
  1413. .L36:
  1414. FXCPMADD f0, B1, A1, f0
  1415. FXCSMADD f1, B1, A1, f1
  1416. LFPDUX B1, BO, INC4
  1417. FXCPMADD f2, B2, A1, f2
  1418. FXCSMADD f3, B2, A1, f3
  1419. LFPDX A1, AO, INC4
  1420. LFPDUX B2, BO2, INC4
  1421. add AO, AO, INC2
  1422. bdnz+ .L36
  1423. .align 4
  /* .L37: final k-step, operands already loaded, no further loads. */
  1424. .L37:
  1425. FXCPMADD f0, B1, A1, f0
  1426. FXCSMADD f1, B1, A1, f1
  1427. FXCPMADD f2, B2, A1, f2
  1428. FXCSMADD f3, B2, A1, f3
  1429. .align 4
  /* .L38: finish the one-row tile of the two-column panel: combine the      */
  /* partial accumulators, subtract them from the stored right-hand side,    */
  /* apply the triangular factor for the active variant (LN/LT/RN/RT), write */
  /* the result back to the packed buffer and to C, then advance pointers.   */
  1430. .L38:
  /* f0 += f1 and f2 += f3: merge the FXCPMADD and FXCSMADD accumulators. */
  1431. fpadd f0, f0, f1
  1432. fpadd f2, f2, f3
  /* LN/RT: re-derive AO/BO from AORIG/B at offset KK - 1 (LN) or KK - 2    */
  /* (RT); A is scaled by 0 + ZBASE_SHIFT, B by 1 + ZBASE_SHIFT.            */
  1433. #if defined(LN) || defined(RT)
  1434. #ifdef LN
  1435. subi r0, KK, 1
  1436. #else
  1437. subi r0, KK, 2
  1438. #endif
  1439. slwi TEMP, r0, 0 + ZBASE_SHIFT
  1440. slwi r0, r0, 1 + ZBASE_SHIFT
  1441. add AO, AORIG, TEMP
  1442. add BO, B, r0
  1443. addi BO, BO, - 4 * SIZE
  1444. #endif
  1445. addi AO2, AO, 2 * SIZE
  1446. addi BO2, BO, 2 * SIZE
  /* Right-hand side comes from the B buffer for LN/LT and from the A       */
  /* buffer for RN/RT (non-update loads, pointers stay put).                */
  1447. #if defined(LN) || defined(LT)
  1448. LFPDX f16, BO, INC4
  1449. LFPDX f17, BO2, INC4
  1450. #else
  1451. LFPDX f16, AO, INC4
  1452. LFPDX f17, AO2, INC4
  1453. #endif
  /* f0 = f16 - f0, f2 = f17 - f2 */
  1454. fpsub f0, f16, f0
  1455. fpsub f2, f17, f2
  /* LN and LT: both values are scaled by the single element A1 read from A */
  /* (fxpmul followed by FXCXNPMA - appears to form a complex product;      */
  /* presumably A1 is a pre-inverted diagonal entry - TODO confirm against  */
  /* the packing code).                                                     */
  1456. #ifdef LN
  1457. LFPDX A1, AO, INC4
  1458. fxpmul f4, A1, f0
  1459. fxpmul f5, A1, f2
  1460. FXCXNPMA f0, A1, f0, f4
  1461. FXCXNPMA f2, A1, f2, f5
  1462. #endif
  1463. #ifdef LT
  1464. LFPDX A1, AO, INC4
  1465. fxpmul f4, A1, f0
  1466. fxpmul f5, A1, f2
  1467. FXCXNPMA f0, A1, f0, f4
  1468. FXCXNPMA f2, A1, f2, f5
  1469. #endif
  /* RN: three elements of a 2x2 block are read from B (one slot skipped),  */
  /* pointers are rewound by 8 * SIZE, then: scale f0 by A1, update f2 with */
  /* A2 and the new f0, scale f2 by A3.                                     */
  1470. #ifdef RN
  1471. LFPDUX A1, BO, INC4
  1472. LFPDUX A2, BO2, INC4
  1473. add BO, BO, INC4
  1474. LFPDUX A3, BO2, INC4
  1475. subi BO, BO, 8 * SIZE
  1476. subi BO2, BO2, 8 * SIZE
  1477. fxpmul f4, A1, f0
  1478. FXCXNPMA f0, A1, f0, f4
  1479. fxcpnmsub f2, A2, f0, f2
  1480. FXCXNSMA f2, A2, f0, f2
  1481. fxpmul f4, A3, f2
  1482. FXCXNPMA f2, A3, f2, f4
  1483. #endif
  /* RT: same 2x2 block walked in the opposite order: scale f2 by A3,       */
  /* update f0 with A2 and the new f2, scale f0 by A1.                      */
  1484. #ifdef RT
  1485. LFPDUX A1, BO, INC4
  1486. add BO2, BO2, INC4
  1487. LFPDUX A2, BO, INC4
  1488. LFPDUX A3, BO2, INC4
  1489. subi BO, BO, 8 * SIZE
  1490. subi BO2, BO2, 8 * SIZE
  1491. fxpmul f4, A3, f2
  1492. FXCXNPMA f2, A3, f2, f4
  1493. fxcpnmsub f0, A2, f2, f0
  1494. FXCXNSMA f0, A2, f2, f0
  1495. fxpmul f4, A1, f0
  1496. FXCXNPMA f0, A1, f0, f4
  1497. #endif
  /* LN: step the C pointers back by 2 * SIZE before the stores ... */
  1498. #ifdef LN
  1499. subi CO1, CO1, 2 * SIZE
  1500. subi CO2, CO2, 2 * SIZE
  1501. #endif
  /* Write the solved values back over the right-hand side they came from. */
  1502. #if defined(LN) || defined(LT)
  1503. STFPDX f0, BO, INC4
  1504. STFPDX f2, BO2, INC4
  1505. #else
  1506. STFPDX f0, AO, INC4
  1507. STFPDX f2, AO2, INC4
  1508. #endif
  /* Store to C: f0 through CO1 and f2 through CO2, each as an STFDUX /     */
  /* STFSDUX pair with update by INC.                                       */
  1509. STFDUX f0, CO1, INC
  1510. STFSDUX f0, CO1, INC
  1511. STFDUX f2, CO2, INC
  1512. STFSDUX f2, CO2, INC
  /* ... and back again afterwards. */
  1513. #ifdef LN
  1514. subi CO1, CO1, 2 * SIZE
  1515. subi CO2, CO2, 2 * SIZE
  1516. #endif
  /* RT: advance AORIG by K << (0 + ZBASE_SHIFT). */
  1517. #ifdef RT
  1518. slwi r0, K, 0 + ZBASE_SHIFT
  1519. add AORIG, AORIG, r0
  1520. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  1521. #if defined(LT) || defined(RN)
  1522. sub TEMP, K, KK
  1523. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1524. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1525. add AO, AO, r0
  1526. add BO, BO, TEMP
  1527. #endif
  /* KK moves by the one row just processed (forward for LT, back for LN). */
  1528. #ifdef LT
  1529. addi KK, KK, 1
  1530. #endif
  1531. #ifdef LN
  1532. subi KK, KK, 1
  1533. #endif
  /* Reload f0 from the FZERO slot on the stack for the next tile. */
  1534. li r0, FZERO
  1535. lfpsx f0, SP, r0
  1536. .align 4
  /* .L49: end of one two-column panel. Moves B past the panel, adjusts KK  */
  /* for the right-side variants, and loops back to .L10 while J > 0.       */
  1537. .L49:
  /* LN: B += K << (1 + ZBASE_SHIFT). */
  1538. #ifdef LN
  1539. slwi r0, K, 1 + ZBASE_SHIFT
  1540. add B, B, r0
  1541. #endif
  /* LT/RN: B resumes from where BO stopped, undoing its - 4 * SIZE bias. */
  1542. #if defined(LT) || defined(RN)
  1543. addi B, BO, 4 * SIZE
  1544. #endif
  /* KK moves by the two columns just processed (RN forward, RT back). */
  1545. #ifdef RN
  1546. addi KK, KK, 2
  1547. #endif
  1548. #ifdef RT
  1549. subi KK, KK, 2
  1550. #endif
  1551. addic. J, J, -1
  1552. bgt+ .L10
  1553. .align 4
  /* .L50: leftover single column (N & 1). Sets up B, C, KK and the A base  */
  /* pointer for this column, then enters the four-row tile loop if M >= 4. */
  1554. .L50:
  1555. andi. J, N, 1
  1556. beq .L999
  /* RT: step B back by K << (0 + ZBASE_SHIFT) and C back by LDC. */
  1557. #ifdef RT
  1558. slwi r0, K, 0 + ZBASE_SHIFT
  1559. sub B, B, r0
  1560. sub C, C, LDC
  1561. #endif
  /* Only one output pointer (CO1) is used for this column. */
  1562. mr CO1, C
  /* Initial KK: M + OFFSET for LN, OFFSET for LT. */
  1563. #ifdef LN
  1564. add KK, M, OFFSET
  1565. #endif
  1566. #ifdef LT
  1567. mr KK, OFFSET
  1568. #endif
  /* A base, biased by - 2 * SIZE: kept in AORIG for LN/RT, in AO otherwise. */
  1569. #if defined(LN) || defined(RT)
  1570. addi AORIG, A, -2 * SIZE
  1571. #else
  1572. addi AO, A, -2 * SIZE
  1573. #endif
  /* NOTE(review): this reads CO2, which is not set anywhere on this        */
  /* single-column path (only CO1 is). C is not read again in the code from */
  /* here to the return at .L999, so the value looks unused - confirm.      */
  1574. #ifndef RT
  1575. add C, CO2, LDC
  1576. #endif
  /* f0 = value stored in the FZERO stack slot; used to clear accumulators. */
  1577. li r0, FZERO
  1578. lfpsx f0, SP, r0
  /* I = M >> 2 four-row tiles; skip to .L60 when there are none. */
  1579. srawi. I, M, 2
  1580. ble .L60
  1581. .align 4
  /* .L51: top of the loop over four-row tiles of the single column. Sets   */
  /* up AO/BO, clears accumulators f1-f7 from f0 (f0 is expected to hold    */
  /* the FZERO value) and preloads operands for the 4x-unrolled k loop.     */
  1582. .L51:
  /* LT/RN: AO continues from its current position; BO = B - 2 * SIZE.      */
  /* Unrolled trip count = KK >> 2.                                         */
  1583. #if defined(LT) || defined(RN)
  1584. fpmr f4, f0
  1585. addi BO, B, - 2 * SIZE
  1586. fpmr f1, f0
  1587. fpmr f5, f0
  1588. fpmr f2, f0
  1589. fpmr f6, f0
  1590. fpmr f3, f0
  1591. fpmr f7, f0
  1592. srawi. r0, KK, 2
  1593. mtspr CTR, r0
  1594. ble .L54
  1595. #else
  /* LN only: step AORIG back by K << (2 + ZBASE_SHIFT). */
  1596. #ifdef LN
  1597. slwi r0, K, 2 + ZBASE_SHIFT
  1598. sub AORIG, AORIG, r0
  1599. #endif
  /* LN/RT: AO = AORIG + (KK << (2 + ZBASE_SHIFT)),                         */
  /*        BO = B + (KK << (0 + ZBASE_SHIFT)) - 2 * SIZE,                  */
  /*        TEMP = K - KK; unrolled trip count = TEMP >> 2.                 */
  1600. slwi r0 , KK, 2 + ZBASE_SHIFT
  1601. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1602. add AO, AORIG, r0
  1603. add BO, B, TEMP
  1604. sub TEMP, K, KK
  1605. fpmr f4, f0
  1606. addi BO, BO, - 2 * SIZE
  1607. fpmr f1, f0
  1608. fpmr f5, f0
  1609. fpmr f2, f0
  1610. fpmr f6, f0
  1611. fpmr f3, f0
  1612. fpmr f7, f0
  1613. srawi. r0, TEMP, 2
  1614. mtspr CTR, r0
  1615. ble .L54
  1616. #endif
  /* Preload: B1-B3 from BO and A1-A8 from AO (eight consecutive update     */
  /* loads, i.e. two k-steps of four A registers each). B4 is not loaded    */
  /* here; the loop body / drain loads it first.                            */
  1617. LFPDUX B1, BO, INC2
  1618. LFPDUX A1, AO, INC2
  1619. LFPDUX A2, AO, INC2
  1620. LFPDUX B2, BO, INC2
  1621. LFPDUX A3, AO, INC2
  1622. LFPDUX A4, AO, INC2
  1623. LFPDUX B3, BO, INC2
  1624. LFPDUX A5, AO, INC2
  1625. LFPDUX A6, AO, INC2
  1626. LFPDUX A7, AO, INC2
  1627. LFPDUX A8, AO, INC2
  /* Decrement CTR; a single pass goes straight to the drain at .L53. */
  1628. bdz- .L53
  1629. .align 4
  /* .L52: steady-state pass over four k-steps for the four-row tile. Each  */
  /* k-step pairs one B register with four A registers, accumulating into   */
  /* f0-f3 (FXCPMADD) and f4-f7 (FXCSMADD); every A register is reloaded    */
  /* right after its last use. The nop instructions have no architectural   */
  /* effect (presumably issue-slot padding - not verifiable from here).     */
  1630. .L52:
  /* k-step 0: B1 with A1-A4; B4 is fetched here for k-step 3. */
  1631. FXCPMADD f0, B1, A1, f0
  1632. LFPDUX B4, BO, INC2
  1633. FXCSMADD f4, B1, A1, f4
  1634. LFPDUX A1, AO, INC2
  1635. FXCPMADD f1, B1, A2, f1
  1636. nop
  1637. FXCSMADD f5, B1, A2, f5
  1638. LFPDUX A2, AO, INC2
  1639. FXCPMADD f2, B1, A3, f2
  1640. nop
  1641. FXCSMADD f6, B1, A3, f6
  1642. LFPDUX A3, AO, INC2
  1643. FXCPMADD f3, B1, A4, f3
  1644. nop
  1645. FXCSMADD f7, B1, A4, f7
  1646. LFPDUX A4, AO, INC2
  /* k-step 1: B2 with A5-A8; B1 reloaded for the next pass. */
  1647. FXCPMADD f0, B2, A5, f0
  1648. LFPDUX B1, BO, INC2
  1649. FXCSMADD f4, B2, A5, f4
  1650. LFPDUX A5, AO, INC2
  1651. FXCPMADD f1, B2, A6, f1
  1652. nop
  1653. FXCSMADD f5, B2, A6, f5
  1654. LFPDUX A6, AO, INC2
  1655. FXCPMADD f2, B2, A7, f2
  1656. nop
  1657. FXCSMADD f6, B2, A7, f6
  1658. LFPDUX A7, AO, INC2
  1659. FXCPMADD f3, B2, A8, f3
  1660. nop
  1661. FXCSMADD f7, B2, A8, f7
  1662. LFPDUX A8, AO, INC2
  /* k-step 2: B3 with A1-A4; B2 reloaded for the next pass. */
  1663. FXCPMADD f0, B3, A1, f0
  1664. LFPDUX B2, BO, INC2
  1665. FXCSMADD f4, B3, A1, f4
  1666. LFPDUX A1, AO, INC2
  1667. FXCPMADD f1, B3, A2, f1
  1668. nop
  1669. FXCSMADD f5, B3, A2, f5
  1670. LFPDUX A2, AO, INC2
  1671. FXCPMADD f2, B3, A3, f2
  1672. nop
  1673. FXCSMADD f6, B3, A3, f6
  1674. LFPDUX A3, AO, INC2
  1675. FXCPMADD f3, B3, A4, f3
  1676. nop
  1677. FXCSMADD f7, B3, A4, f7
  1678. LFPDUX A4, AO, INC2
  /* k-step 3: B4 with A5-A8; B3 reloaded for the next pass. */
  1679. FXCPMADD f0, B4, A5, f0
  1680. LFPDUX B3, BO, INC2
  1681. FXCSMADD f4, B4, A5, f4
  1682. LFPDUX A5, AO, INC2
  1683. FXCPMADD f1, B4, A6, f1
  1684. nop
  1685. FXCSMADD f5, B4, A6, f5
  1686. LFPDUX A6, AO, INC2
  1687. FXCPMADD f2, B4, A7, f2
  1688. nop
  1689. FXCSMADD f6, B4, A7, f6
  1690. LFPDUX A7, AO, INC2
  1691. FXCPMADD f3, B4, A8, f3
  1692. nop
  1693. FXCSMADD f7, B4, A8, f7
  1694. LFPDUX A8, AO, INC2
  1695. bdnz+ .L52
  1696. .align 4
  /* .L53: drain of the unrolled four-row loop. The first two k-steps still */
  /* load B4 and the A registers needed by k-steps 2 and 3; no B register   */
  /* is reloaded for a further pass, and the last two k-steps issue no      */
  /* loads at all.                                                          */
  1697. .L53:
  /* k-step 0: B1 with A1-A4 (fetches B4 and reloads A1-A4). */
  1698. FXCPMADD f0, B1, A1, f0
  1699. LFPDUX B4, BO, INC2
  1700. FXCSMADD f4, B1, A1, f4
  1701. LFPDUX A1, AO, INC2
  1702. FXCPMADD f1, B1, A2, f1
  1703. nop
  1704. FXCSMADD f5, B1, A2, f5
  1705. LFPDUX A2, AO, INC2
  1706. FXCPMADD f2, B1, A3, f2
  1707. nop
  1708. FXCSMADD f6, B1, A3, f6
  1709. LFPDUX A3, AO, INC2
  1710. FXCPMADD f3, B1, A4, f3
  1711. nop
  1712. FXCSMADD f7, B1, A4, f7
  1713. LFPDUX A4, AO, INC2
  /* k-step 1: B2 with A5-A8 (reloads A5-A8). */
  1714. FXCPMADD f0, B2, A5, f0
  1715. nop
  1716. FXCSMADD f4, B2, A5, f4
  1717. LFPDUX A5, AO, INC2
  1718. FXCPMADD f1, B2, A6, f1
  1719. nop
  1720. FXCSMADD f5, B2, A6, f5
  1721. LFPDUX A6, AO, INC2
  1722. FXCPMADD f2, B2, A7, f2
  1723. nop
  1724. FXCSMADD f6, B2, A7, f6
  1725. LFPDUX A7, AO, INC2
  1726. FXCPMADD f3, B2, A8, f3
  1727. nop
  1728. FXCSMADD f7, B2, A8, f7
  1729. LFPDUX A8, AO, INC2
  /* k-step 2: B3 with A1-A4, no loads. */
  1730. FXCPMADD f0, B3, A1, f0
  1731. FXCSMADD f4, B3, A1, f4
  1732. FXCPMADD f1, B3, A2, f1
  1733. FXCSMADD f5, B3, A2, f5
  1734. FXCPMADD f2, B3, A3, f2
  1735. FXCSMADD f6, B3, A3, f6
  1736. FXCPMADD f3, B3, A4, f3
  1737. FXCSMADD f7, B3, A4, f7
  /* k-step 3: B4 with A5-A8, no loads. */
  1738. FXCPMADD f0, B4, A5, f0
  1739. FXCSMADD f4, B4, A5, f4
  1740. FXCPMADD f1, B4, A6, f1
  1741. FXCSMADD f5, B4, A6, f5
  1742. FXCPMADD f2, B4, A7, f2
  1743. FXCSMADD f6, B4, A7, f6
  1744. FXCPMADD f3, B4, A8, f3
  1745. FXCSMADD f7, B4, A8, f7
  1746. .align 4
  /* .L54: k-steps left over after the 4x-unrolled loop of the four-row     */
  /* tile (count & 3; count is KK for LT/RN, TEMP otherwise). The masked    */
  /* value is 0..3, so ble+ is taken exactly when nothing is left.          */
  1747. .L54:
  1748. #if defined(LT) || defined(RN)
  1749. andi. r0, KK, 3
  1750. mtspr CTR, r0
  1751. ble+ .L58
  1752. #else
  1753. andi. r0, TEMP, 3
  1754. mtspr CTR, r0
  1755. ble+ .L58
  1756. #endif
  /* Preload one k-step: four A registers and one B register. */
  1757. LFPDUX A1, AO, INC2
  1758. LFPDUX B1, BO, INC2
  1759. LFPDUX A2, AO, INC2
  1760. LFPDUX A3, AO, INC2
  1761. LFPDUX A4, AO, INC2
  1762. bdz- .L57
  1763. .align 4
  /* .L56: one k-step per pass, reloading A1-A4 and B1 for the next pass. */
  1764. .L56:
  1765. FXCPMADD f0, B1, A1, f0
  1766. FXCSMADD f4, B1, A1, f4
  1767. LFPDUX A1, AO, INC2
  1768. FXCPMADD f1, B1, A2, f1
  1769. FXCSMADD f5, B1, A2, f5
  1770. LFPDUX A2, AO, INC2
  1771. FXCPMADD f2, B1, A3, f2
  1772. FXCSMADD f6, B1, A3, f6
  1773. LFPDUX A3, AO, INC2
  1774. FXCPMADD f3, B1, A4, f3
  1775. FXCSMADD f7, B1, A4, f7
  1776. LFPDUX A4, AO, INC2
  1777. LFPDUX B1, BO, INC2
  1778. bdnz+ .L56
  1779. .align 4
  /* .L57: final k-step with the operands already in registers. */
  1780. .L57:
  1781. FXCPMADD f0, B1, A1, f0
  1782. FXCSMADD f4, B1, A1, f4
  1783. FXCPMADD f1, B1, A2, f1
  1784. FXCSMADD f5, B1, A2, f5
  1785. FXCPMADD f2, B1, A3, f2
  1786. FXCSMADD f6, B1, A3, f6
  1787. FXCPMADD f3, B1, A4, f3
  1788. FXCSMADD f7, B1, A4, f7
  1789. .align 4
  /* .L58: finish the four-row tile of the single column: merge the         */
  /* accumulators, subtract them from the stored right-hand side, apply the */
  /* triangular factor for the active variant, write the result back to the */
  /* packed buffer and to C, advance pointers and loop over I.              */
  1790. .L58:
  /* f0-f3 += f4-f7: merge the FXCPMADD and FXCSMADD accumulators. */
  1791. fpadd f0, f0, f4
  1792. fpadd f1, f1, f5
  1793. fpadd f2, f2, f6
  1794. fpadd f3, f3, f7
  /* LN/RT: re-derive AO/BO from AORIG/B at offset KK - 4 (LN) or KK - 1    */
  /* (RT); A is scaled by 2 + ZBASE_SHIFT, B by 0 + ZBASE_SHIFT.            */
  1795. #if defined(LN) || defined(RT)
  1796. #ifdef LN
  1797. subi r0, KK, 4
  1798. #else
  1799. subi r0, KK, 1
  1800. #endif
  1801. slwi TEMP, r0, 2 + ZBASE_SHIFT
  1802. slwi r0, r0, 0 + ZBASE_SHIFT
  1803. add AO, AORIG, TEMP
  1804. add BO, B, r0
  1805. addi BO, BO, - 2 * SIZE
  1806. #endif
  /* Load the four right-hand-side values (from B for LN/LT, from A for     */
  /* RN/RT) and rewind the pointer by the 8 * SIZE just consumed.           */
  1807. #if defined(LN) || defined(LT)
  1808. LFPDUX f16, BO, INC2
  1809. LFPDUX f17, BO, INC2
  1810. LFPDUX f18, BO, INC2
  1811. LFPDUX f19, BO, INC2
  1812. subi BO, BO, 8 * SIZE
  1813. #else
  1814. LFPDUX f16, AO, INC2
  1815. LFPDUX f17, AO, INC2
  1816. LFPDUX f18, AO, INC2
  1817. LFPDUX f19, AO, INC2
  1818. subi AO, AO, 8 * SIZE
  1819. #endif
  /* fN = f(16 + N) - fN for N = 0..3 */
  1820. fpsub f0, f16, f0
  1821. fpsub f1, f17, f1
  1822. fpsub f2, f18, f2
  1823. fpsub f3, f19, f3
  /* LN: read ten elements of the 4x4 block of A, skipping 3, 2 and 1 slots */
  /* in successive groups (sixteen INC2 steps in total, undone by the       */
  /* 32 * SIZE rewind). The solve then runs from f3 back to f0: scale by    */
  /* the group's last element, then update the remaining rows with          */
  /* fxcpnmsub / FXCXNSMA pairs.                                            */
  1824. #ifdef LN
  1825. LFPDUX A1, AO, INC2
  1826. add AO, AO, INC2
  1827. add AO, AO, INC2
  1828. add AO, AO, INC2
  1829. LFPDUX A2, AO, INC2
  1830. LFPDUX A3, AO, INC2
  1831. add AO, AO, INC2
  1832. add AO, AO, INC2
  1833. LFPDUX A4, AO, INC2
  1834. LFPDUX A5, AO, INC2
  1835. LFPDUX A6, AO, INC2
  1836. add AO, AO, INC2
  1837. LFPDUX A7, AO, INC2
  1838. LFPDUX A8, AO, INC2
  1839. LFPDUX A9, AO, INC2
  1840. LFPDUX A10, AO, INC2
  1841. subi AO, AO, 32 * SIZE
  /* row 3: scale by A10, then update f2, f1, f0 with A9, A8, A7 */
  1842. fxpmul f4, A10, f3
  1843. FXCXNPMA f3, A10, f3, f4
  1844. fxcpnmsub f2, A9, f3, f2
  1845. FXCXNSMA f2, A9, f3, f2
  1846. fxcpnmsub f1, A8, f3, f1
  1847. FXCXNSMA f1, A8, f3, f1
  1848. fxcpnmsub f0, A7, f3, f0
  1849. FXCXNSMA f0, A7, f3, f0
  /* row 2: scale by A6, then update f1, f0 with A5, A4 */
  1850. fxpmul f4, A6, f2
  1851. FXCXNPMA f2, A6, f2, f4
  1852. fxcpnmsub f1, A5, f2, f1
  1853. FXCXNSMA f1, A5, f2, f1
  1854. fxcpnmsub f0, A4, f2, f0
  1855. FXCXNSMA f0, A4, f2, f0
  /* row 1: scale by A3, then update f0 with A2 */
  1856. fxpmul f4, A3, f1
  1857. FXCXNPMA f1, A3, f1, f4
  1858. fxcpnmsub f0, A2, f1, f0
  1859. FXCXNSMA f0, A2, f1, f0
  /* row 0: scale by A1 */
  1860. fxpmul f4, A1, f0
  1861. FXCXNPMA f0, A1, f0, f4
  1862. #endif
  /* LT: read ten elements in groups of 4, 3, 2, 1 with 1, 2, 3 slots       */
  /* skipped between groups (again sixteen INC2 steps, then rewind). The    */
  /* solve runs from f0 forward to f3.                                      */
  1863. #ifdef LT
  1864. LFPDUX A1, AO, INC2
  1865. LFPDUX A2, AO, INC2
  1866. LFPDUX A3, AO, INC2
  1867. LFPDUX A4, AO, INC2
  1868. add AO, AO, INC2
  1869. LFPDUX A5, AO, INC2
  1870. LFPDUX A6, AO, INC2
  1871. LFPDUX A7, AO, INC2
  1872. add AO, AO, INC2
  1873. add AO, AO, INC2
  1874. LFPDUX A8, AO, INC2
  1875. LFPDUX A9, AO, INC2
  1876. add AO, AO, INC2
  1877. add AO, AO, INC2
  1878. add AO, AO, INC2
  1879. LFPDUX A10, AO, INC2
  1880. subi AO, AO, 32 * SIZE
  /* row 0: scale by A1, then update f1, f2, f3 with A2, A3, A4 */
  1881. fxpmul f4, A1, f0
  1882. FXCXNPMA f0, A1, f0, f4
  1883. fxcpnmsub f1, A2, f0, f1
  1884. FXCXNSMA f1, A2, f0, f1
  1885. fxcpnmsub f2, A3, f0, f2
  1886. FXCXNSMA f2, A3, f0, f2
  1887. fxcpnmsub f3, A4, f0, f3
  1888. FXCXNSMA f3, A4, f0, f3
  /* row 1: scale by A5, then update f2, f3 with A6, A7 */
  1889. fxpmul f6, A5, f1
  1890. FXCXNPMA f1, A5, f1, f6
  1891. fxcpnmsub f2, A6, f1, f2
  1892. FXCXNSMA f2, A6, f1, f2
  1893. fxcpnmsub f3, A7, f1, f3
  1894. FXCXNSMA f3, A7, f1, f3
  /* row 2: scale by A8, then update f3 with A9 */
  1895. fxpmul f4, A8, f2
  1896. FXCXNPMA f2, A8, f2, f4
  1897. fxcpnmsub f3, A9, f2, f3
  1898. FXCXNSMA f3, A9, f2, f3
  /* row 3: scale by A10 */
  1899. fxpmul f6, A10, f3
  1900. FXCXNPMA f3, A10, f3, f6
  1901. #endif
  /* RN and RT: the B factor is a single element here, so all four values   */
  /* are scaled by A1 loaded from BO (the two branches are identical).      */
  1902. #ifdef RN
  1903. LFPDX A1, BO, INC2
  1904. fxpmul f4, A1, f0
  1905. fxpmul f5, A1, f1
  1906. fxpmul f6, A1, f2
  1907. fxpmul f7, A1, f3
  1908. FXCXNPMA f0, A1, f0, f4
  1909. FXCXNPMA f1, A1, f1, f5
  1910. FXCXNPMA f2, A1, f2, f6
  1911. FXCXNPMA f3, A1, f3, f7
  1912. #endif
  1913. #ifdef RT
  1914. LFPDX A1, BO, INC2
  1915. fxpmul f4, A1, f0
  1916. fxpmul f5, A1, f1
  1917. fxpmul f6, A1, f2
  1918. fxpmul f7, A1, f3
  1919. FXCXNPMA f0, A1, f0, f4
  1920. FXCXNPMA f1, A1, f1, f5
  1921. FXCXNPMA f2, A1, f2, f6
  1922. FXCXNPMA f3, A1, f3, f7
  1923. #endif
  /* LN: step CO1 back by 8 * SIZE before the stores ... */
  1924. #ifdef LN
  1925. subi CO1, CO1, 8 * SIZE
  1926. #endif
  /* Write the solved values back over the right-hand side, then rewind. */
  1927. #if defined(LN) || defined(LT)
  1928. STFPDUX f0, BO, INC2
  1929. STFPDUX f1, BO, INC2
  1930. STFPDUX f2, BO, INC2
  1931. STFPDUX f3, BO, INC2
  1932. subi BO, BO, 8 * SIZE
  1933. #else
  1934. STFPDUX f0, AO, INC2
  1935. STFPDUX f1, AO, INC2
  1936. STFPDUX f2, AO, INC2
  1937. STFPDUX f3, AO, INC2
  1938. subi AO, AO, 8 * SIZE
  1939. #endif
  /* Store f0-f3 to C through CO1, each as an STFDUX / STFSDUX pair. */
  1940. STFDUX f0, CO1, INC
  1941. STFSDUX f0, CO1, INC
  1942. STFDUX f1, CO1, INC
  1943. STFSDUX f1, CO1, INC
  1944. STFDUX f2, CO1, INC
  1945. STFSDUX f2, CO1, INC
  1946. STFDUX f3, CO1, INC
  1947. STFSDUX f3, CO1, INC
  /* ... and back again afterwards. */
  1948. #ifdef LN
  1949. subi CO1, CO1, 8 * SIZE
  1950. #endif
  /* RT: advance AORIG by K << (2 + ZBASE_SHIFT). */
  1951. #ifdef RT
  1952. slwi r0, K, 2 + ZBASE_SHIFT
  1953. add AORIG, AORIG, r0
  1954. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  1955. #if defined(LT) || defined(RN)
  1956. sub TEMP, K, KK
  1957. slwi r0, TEMP, 2 + ZBASE_SHIFT
  1958. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1959. add AO, AO, r0
  1960. add BO, BO, TEMP
  1961. #endif
  /* KK moves by the four rows just processed (forward for LT, back for LN). */
  1962. #ifdef LT
  1963. addi KK, KK, 4
  1964. #endif
  1965. #ifdef LN
  1966. subi KK, KK, 4
  1967. #endif
  /* addic. sets CR0 for the bgt+ below; the two instructions in between    */
  /* (reloading f0 from the FZERO slot) do not modify CR0.                  */
  1968. addic. I, I, -1
  1969. li r0, FZERO
  1970. lfpsx f0, SP, r0
  1971. bgt+ .L51
  1972. .align 4
  /* .L60: two-row tile (M & 2) of the single column. Sets up AO/BO, clears */
  /* f1-f3 from f0 (expected to hold the FZERO value) and preloads operands */
  /* for the 4x-unrolled k loop.                                            */
  1973. .L60:
  1974. andi. I, M, 2
  1975. beq .L70
  /* LT/RN: AO continues; BO = B - 2 * SIZE; trip count = KK >> 2. */
  1976. #if defined(LT) || defined(RN)
  1977. fpmr f1, f0
  1978. addi BO, B, - 2 * SIZE
  1979. fpmr f2, f0
  1980. fpmr f3, f0
  1981. srawi. r0, KK, 2
  1982. mtspr CTR, r0
  1983. ble .L64
  1984. #else
  /* LN only: step AORIG back by K << (1 + ZBASE_SHIFT). */
  1985. #ifdef LN
  1986. slwi r0, K, 1 + ZBASE_SHIFT
  1987. sub AORIG, AORIG, r0
  1988. #endif
  /* LN/RT: AO = AORIG + (KK << (1 + ZBASE_SHIFT)),                         */
  /*        BO = B + (KK << (0 + ZBASE_SHIFT)) - 2 * SIZE,                  */
  /*        TEMP = K - KK; trip count = TEMP >> 2.                          */
  1989. slwi r0 , KK, 1 + ZBASE_SHIFT
  1990. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1991. add AO, AORIG, r0
  1992. add BO, B, TEMP
  1993. sub TEMP, K, KK
  1994. fpmr f1, f0
  1995. addi BO, BO, - 2 * SIZE
  1996. fpmr f2, f0
  1997. fpmr f3, f0
  1998. srawi. r0, TEMP, 2
  1999. mtspr CTR, r0
  2000. ble .L64
  2001. #endif
  /* Preload one full pass: B1-B4 and A1-A8 (two A registers per k-step). */
  2002. LFPDUX B1, BO, INC2
  2003. LFPDUX A1, AO, INC2
  2004. LFPDUX A2, AO, INC2
  2005. LFPDUX B2, BO, INC2
  2006. LFPDUX A3, AO, INC2
  2007. LFPDUX A4, AO, INC2
  2008. LFPDUX B3, BO, INC2
  2009. LFPDUX A5, AO, INC2
  2010. LFPDUX A6, AO, INC2
  2011. LFPDUX B4, BO, INC2
  2012. LFPDUX A7, AO, INC2
  2013. LFPDUX A8, AO, INC2
  /* Decrement CTR; a single pass goes straight to the drain at .L63. */
  2014. bdz- .L63
  2015. .align 4
  /* .L62: steady-state pass over four k-steps for the two-row tile. Each   */
  /* k-step pairs one B register with two A registers: FXCPMADD results go  */
  /* to f0/f1, FXCSMADD results to f2/f3. Operands are reloaded after their */
  /* last use in the pass.                                                  */
  2016. .L62:
  /* k-step 0: B1 with A1, A2 */
  2017. FXCPMADD f0, B1, A1, f0
  2018. FXCSMADD f2, B1, A1, f2
  2019. LFPDUX A1, AO, INC2
  2020. FXCPMADD f1, B1, A2, f1
  2021. FXCSMADD f3, B1, A2, f3
  2022. LFPDUX A2, AO, INC2
  2023. LFPDUX B1, BO, INC2
  /* k-step 1: B2 with A3, A4 */
  2024. FXCPMADD f0, B2, A3, f0
  2025. FXCSMADD f2, B2, A3, f2
  2026. LFPDUX A3, AO, INC2
  2027. FXCPMADD f1, B2, A4, f1
  2028. FXCSMADD f3, B2, A4, f3
  2029. LFPDUX A4, AO, INC2
  2030. LFPDUX B2, BO, INC2
  /* k-step 2: B3 with A5, A6 */
  2031. FXCPMADD f0, B3, A5, f0
  2032. FXCSMADD f2, B3, A5, f2
  2033. LFPDUX A5, AO, INC2
  2034. FXCPMADD f1, B3, A6, f1
  2035. FXCSMADD f3, B3, A6, f3
  2036. LFPDUX A6, AO, INC2
  2037. LFPDUX B3, BO, INC2
  /* k-step 3: B4 with A7, A8 */
  2038. FXCPMADD f0, B4, A7, f0
  2039. FXCSMADD f2, B4, A7, f2
  2040. LFPDUX A7, AO, INC2
  2041. FXCPMADD f1, B4, A8, f1
  2042. FXCSMADD f3, B4, A8, f3
  2043. LFPDUX A8, AO, INC2
  2044. LFPDUX B4, BO, INC2
  2045. bdnz+ .L62
  2046. .align 4
  /* .L63: drain - the same sixteen multiply-adds with no further loads. */
  2047. .L63:
  2048. FXCPMADD f0, B1, A1, f0
  2049. FXCSMADD f2, B1, A1, f2
  2050. FXCPMADD f1, B1, A2, f1
  2051. FXCSMADD f3, B1, A2, f3
  2052. FXCPMADD f0, B2, A3, f0
  2053. FXCSMADD f2, B2, A3, f2
  2054. FXCPMADD f1, B2, A4, f1
  2055. FXCSMADD f3, B2, A4, f3
  2056. FXCPMADD f0, B3, A5, f0
  2057. FXCSMADD f2, B3, A5, f2
  2058. FXCPMADD f1, B3, A6, f1
  2059. FXCSMADD f3, B3, A6, f3
  2060. FXCPMADD f0, B4, A7, f0
  2061. FXCSMADD f2, B4, A7, f2
  2062. FXCPMADD f1, B4, A8, f1
  2063. FXCSMADD f3, B4, A8, f3
  2064. .align 4
  /* .L64: k-steps left over after the 4x-unrolled loop of the two-row tile */
  /* (count & 3; count is KK for LT/RN, TEMP otherwise). The masked value   */
  /* is 0..3, so ble+ is taken exactly when nothing is left.                */
  2065. .L64:
  2066. #if defined(LT) || defined(RN)
  2067. andi. r0, KK, 3
  2068. mtspr CTR, r0
  2069. ble+ .L68
  2070. #else
  2071. andi. r0, TEMP, 3
  2072. mtspr CTR, r0
  2073. ble+ .L68
  2074. #endif
  /* Preload one k-step: two A registers and one B register. */
  2075. LFPDUX A1, AO, INC2
  2076. LFPDUX B1, BO, INC2
  2077. LFPDUX A2, AO, INC2
  2078. bdz- .L67
  2079. .align 4
  /* .L66: one k-step per pass, reloading A1, A2 and B1 for the next pass. */
  2080. .L66:
  2081. FXCPMADD f0, B1, A1, f0
  2082. FXCSMADD f2, B1, A1, f2
  2083. LFPDUX A1, AO, INC2
  2084. FXCPMADD f1, B1, A2, f1
  2085. FXCSMADD f3, B1, A2, f3
  2086. LFPDUX B1, BO, INC2
  2087. LFPDUX A2, AO, INC2
  2088. bdnz+ .L66
  2089. .align 4
  /* .L67: final k-step with the operands already in registers. */
  2090. .L67:
  2091. FXCPMADD f0, B1, A1, f0
  2092. FXCSMADD f2, B1, A1, f2
  2093. FXCPMADD f1, B1, A2, f1
  2094. FXCSMADD f3, B1, A2, f3
  2095. .align 4
  /* .L68: finish the two-row tile of the single column: merge the          */
  /* accumulators, subtract them from the stored right-hand side, apply the */
  /* triangular factor for the active variant, write the result back to the */
  /* packed buffer and to C, then advance pointers and KK.                  */
  2096. .L68:
  /* f0 += f2 and f1 += f3: merge the FXCPMADD and FXCSMADD accumulators. */
  2097. fpadd f0, f0, f2
  2098. fpadd f1, f1, f3
  /* LN/RT: re-derive AO/BO from AORIG/B at offset KK - 2 (LN) or KK - 1    */
  /* (RT); A is scaled by 1 + ZBASE_SHIFT, B by 0 + ZBASE_SHIFT.            */
  2099. #if defined(LN) || defined(RT)
  2100. #ifdef LN
  2101. subi r0, KK, 2
  2102. #else
  2103. subi r0, KK, 1
  2104. #endif
  2105. slwi TEMP, r0, 1 + ZBASE_SHIFT
  2106. slwi r0, r0, 0 + ZBASE_SHIFT
  2107. add AO, AORIG, TEMP
  2108. add BO, B, r0
  2109. addi BO, BO, - 2 * SIZE
  2110. #endif
  /* Load the two right-hand-side values (from B for LN/LT, from A for      */
  /* RN/RT) and rewind the pointer by the 4 * SIZE just consumed.           */
  2111. #if defined(LN) || defined(LT)
  2112. LFPDUX f16, BO, INC2
  2113. LFPDUX f17, BO, INC2
  2114. subi BO, BO, 4 * SIZE
  2115. #else
  2116. LFPDUX f16, AO, INC2
  2117. LFPDUX f17, AO, INC2
  2118. subi AO, AO, 4 * SIZE
  2119. #endif
  /* f0 = f16 - f0, f1 = f17 - f1 */
  2120. fpsub f0, f16, f0
  2121. fpsub f1, f17, f1
  /* LN: read three elements of the 2x2 block of A (second slot skipped),   */
  /* rewind, then solve from f1 back to f0: scale f1 by A3, update f0 with  */
  /* A2 and the new f1, scale f0 by A1.                                     */
  2122. #ifdef LN
  2123. LFPDUX A1, AO, INC2
  2124. add AO, AO, INC2
  2125. LFPDUX A2, AO, INC2
  2126. LFPDUX A3, AO, INC2
  2127. subi AO, AO, 8 * SIZE
  2128. fxpmul f4, A3, f1
  2129. FXCXNPMA f1, A3, f1, f4
  2130. fxcpnmsub f0, A2, f1, f0
  2131. FXCXNSMA f0, A2, f1, f0
  2132. fxpmul f4, A1, f0
  2133. FXCXNPMA f0, A1, f0, f4
  2134. #endif
  /* LT: read three elements (third slot skipped), rewind, then solve from  */
  /* f0 forward to f1: scale f0 by A1, update f1 with A2 and the new f0,    */
  /* scale f1 by A3.                                                        */
  2135. #ifdef LT
  2136. LFPDUX A1, AO, INC2
  2137. LFPDUX A2, AO, INC2
  2138. add AO, AO, INC2
  2139. LFPDUX A3, AO, INC2
  2140. subi AO, AO, 8 * SIZE
  2141. fxpmul f4, A1, f0
  2142. FXCXNPMA f0, A1, f0, f4
  2143. fxcpnmsub f1, A2, f0, f1
  2144. FXCXNSMA f1, A2, f0, f1
  2145. fxpmul f6, A3, f1
  2146. FXCXNPMA f1, A3, f1, f6
  2147. #endif
  /* RN and RT: both values are scaled by the single element A1 loaded from */
  /* BO (the two branches are identical).                                   */
  2148. #ifdef RN
  2149. LFPDX A1, BO, INC2
  2150. fxpmul f4, A1, f0
  2151. fxpmul f5, A1, f1
  2152. FXCXNPMA f0, A1, f0, f4
  2153. FXCXNPMA f1, A1, f1, f5
  2154. #endif
  2155. #ifdef RT
  2156. LFPDX A1, BO, INC2
  2157. fxpmul f4, A1, f0
  2158. fxpmul f5, A1, f1
  2159. FXCXNPMA f0, A1, f0, f4
  2160. FXCXNPMA f1, A1, f1, f5
  2161. #endif
  /* LN: step CO1 back by 4 * SIZE before the stores ... */
  2162. #ifdef LN
  2163. subi CO1, CO1, 4 * SIZE
  2164. #endif
  /* Write the solved values back over the right-hand side, then rewind. */
  2165. #if defined(LN) || defined(LT)
  2166. STFPDUX f0, BO, INC2
  2167. STFPDUX f1, BO, INC2
  2168. subi BO, BO, 4 * SIZE
  2169. #else
  2170. STFPDUX f0, AO, INC2
  2171. STFPDUX f1, AO, INC2
  2172. subi AO, AO, 4 * SIZE
  2173. #endif
  /* Store f0 and f1 to C through CO1, each as an STFDUX / STFSDUX pair. */
  2174. STFDUX f0, CO1, INC
  2175. STFSDUX f0, CO1, INC
  2176. STFDUX f1, CO1, INC
  2177. STFSDUX f1, CO1, INC
  /* ... and back again afterwards. */
  2178. #ifdef LN
  2179. subi CO1, CO1, 4 * SIZE
  2180. #endif
  /* RT: advance AORIG by K << (1 + ZBASE_SHIFT). */
  2181. #ifdef RT
  2182. slwi r0, K, 1 + ZBASE_SHIFT
  2183. add AORIG, AORIG, r0
  2184. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  2185. #if defined(LT) || defined(RN)
  2186. sub TEMP, K, KK
  2187. slwi r0, TEMP, 1 + ZBASE_SHIFT
  2188. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  2189. add AO, AO, r0
  2190. add BO, BO, TEMP
  2191. #endif
  /* KK moves by the two rows just processed (forward for LT, back for LN). */
  2192. #ifdef LT
  2193. addi KK, KK, 2
  2194. #endif
  2195. #ifdef LN
  2196. subi KK, KK, 2
  2197. #endif
  /* Reload f0 from the FZERO slot on the stack for the next tile. */
  2198. li r0, FZERO
  2199. lfpsx f0, SP, r0
  2200. .align 4
  /* .L70: single leftover row (M & 1) of the single column. Here the k     */
  /* loop is unrolled eight times (shift by 3), unlike the wider tiles.     */
  /* Sets up AO/BO, clears f1-f3 from f0 (expected to hold the FZERO value) */
  /* and preloads eight A and eight B operands.                             */
  2201. .L70:
  2202. andi. I, M, 1
  2203. beq .L89
  /* LT/RN: AO continues; BO = B - 2 * SIZE; trip count = KK >> 3. */
  2204. #if defined(LT) || defined(RN)
  2205. addi BO, B, - 2 * SIZE
  2206. fpmr f1, f0
  2207. fpmr f2, f0
  2208. fpmr f3, f0
  2209. srawi. r0, KK, 3
  2210. mtspr CTR, r0
  2211. ble .L74
  2212. #else
  /* LN only: step AORIG back by K << (0 + ZBASE_SHIFT). */
  2213. #ifdef LN
  2214. slwi r0, K, 0 + ZBASE_SHIFT
  2215. sub AORIG, AORIG, r0
  2216. #endif
  /* LN/RT: A and B use the same byte offset KK << (0 + ZBASE_SHIFT);       */
  /* TEMP is then reused for K - KK; trip count = TEMP >> 3.                */
  2217. slwi TEMP, KK, 0 + ZBASE_SHIFT
  2218. add AO, AORIG, TEMP
  2219. add BO, B, TEMP
  2220. sub TEMP, K, KK
  2221. addi BO, BO, - 2 * SIZE
  2222. fpmr f1, f0
  2223. fpmr f2, f0
  2224. fpmr f3, f0
  2225. srawi. r0, TEMP, 3
  2226. mtspr CTR, r0
  2227. ble .L74
  2228. #endif
  /* Preload eight k-steps: A1-A8 from AO and B1-B6, A9, A10 from BO        */
  /* (A9/A10 hold B-side values here).                                      */
  2229. LFPDUX A1, AO, INC2
  2230. LFPDUX B1, BO, INC2
  2231. LFPDUX A2, AO, INC2
  2232. LFPDUX B2, BO, INC2
  2233. LFPDUX A3, AO, INC2
  2234. LFPDUX B3, BO, INC2
  2235. LFPDUX A4, AO, INC2
  2236. LFPDUX B4, BO, INC2
  2237. LFPDUX A5, AO, INC2
  2238. LFPDUX B5, BO, INC2
  2239. LFPDUX A6, AO, INC2
  2240. LFPDUX B6, BO, INC2
  2241. LFPDUX A7, AO, INC2
  2242. LFPDUX A9, BO, INC2
  2243. LFPDUX A8, AO, INC2
  2244. LFPDUX A10, BO, INC2
  /* Decrement CTR; a single pass goes straight to the drain at .L73. */
  2245. bdz- .L73
  2246. .align 4
  /* .L72: steady-state pass over eight k-steps for the one-row tile. Even  */
  /* k-steps accumulate into f0/f1 and odd k-steps into f2/f3, so there are */
  /* two independent accumulator pairs; both operands of each step are      */
  /* reloaded right after use.                                              */
  2247. .L72:
  2248. FXCPMADD f0, B1, A1, f0
  2249. FXCSMADD f1, B1, A1, f1
  2250. LFPDUX A1, AO, INC2
  2251. LFPDUX B1, BO, INC2
  2252. FXCPMADD f2, B2, A2, f2
  2253. FXCSMADD f3, B2, A2, f3
  2254. LFPDUX A2, AO, INC2
  2255. LFPDUX B2, BO, INC2
  2256. FXCPMADD f0, B3, A3, f0
  2257. FXCSMADD f1, B3, A3, f1
  2258. LFPDUX A3, AO, INC2
  2259. LFPDUX B3, BO, INC2
  2260. FXCPMADD f2, B4, A4, f2
  2261. FXCSMADD f3, B4, A4, f3
  2262. LFPDUX A4, AO, INC2
  2263. LFPDUX B4, BO, INC2
  2264. FXCPMADD f0, B5, A5, f0
  2265. FXCSMADD f1, B5, A5, f1
  2266. LFPDUX A5, AO, INC2
  2267. LFPDUX B5, BO, INC2
  2268. FXCPMADD f2, B6, A6, f2
  2269. FXCSMADD f3, B6, A6, f3
  2270. LFPDUX A6, AO, INC2
  2271. LFPDUX B6, BO, INC2
  /* The last two steps take their B operand from A9 / A10. */
  2272. FXCPMADD f0, A9, A7, f0
  2273. FXCSMADD f1, A9, A7, f1
  2274. LFPDUX A7, AO, INC2
  2275. LFPDUX A9, BO, INC2
  2276. FXCPMADD f2, A10, A8, f2
  2277. FXCSMADD f3, A10, A8, f3
  2278. LFPDUX A8, AO, INC2
  2279. LFPDUX A10, BO, INC2
  2280. bdnz+ .L72
  2281. .align 4
  /* .L73: drain - the same sixteen multiply-adds with no further loads. */
  2282. .L73:
  2283. FXCPMADD f0, B1, A1, f0
  2284. FXCSMADD f1, B1, A1, f1
  2285. FXCPMADD f2, B2, A2, f2
  2286. FXCSMADD f3, B2, A2, f3
  2287. FXCPMADD f0, B3, A3, f0
  2288. FXCSMADD f1, B3, A3, f1
  2289. FXCPMADD f2, B4, A4, f2
  2290. FXCSMADD f3, B4, A4, f3
  2291. FXCPMADD f0, B5, A5, f0
  2292. FXCSMADD f1, B5, A5, f1
  2293. FXCPMADD f2, B6, A6, f2
  2294. FXCSMADD f3, B6, A6, f3
  2295. FXCPMADD f0, A9, A7, f0
  2296. FXCSMADD f1, A9, A7, f1
  2297. FXCPMADD f2, A10, A8, f2
  2298. FXCSMADD f3, A10, A8, f3
  2299. .align 4
  /* .L74: k-steps left over after the 8x-unrolled loop of the one-row tile */
  /* (count & 7; count is KK for LT/RN, TEMP otherwise). The masked value   */
  /* is 0..7, so ble+ is taken exactly when nothing is left.                */
  2300. .L74:
  2301. #if defined(LT) || defined(RN)
  2302. andi. r0, KK, 7
  2303. mtspr CTR, r0
  2304. ble+ .L78
  2305. #else
  2306. andi. r0, TEMP, 7
  2307. mtspr CTR, r0
  2308. ble+ .L78
  2309. #endif
  /* Preload one k-step. */
  2310. LFPDUX A1, AO, INC2
  2311. LFPDUX B1, BO, INC2
  2312. bdz- .L77
  2313. .align 4
  /* .L76: one k-step per pass into f0/f1 only, reloading A1 and B1. */
  2314. .L76:
  2315. FXCPMADD f0, B1, A1, f0
  2316. FXCSMADD f1, B1, A1, f1
  2317. LFPDUX A1, AO, INC2
  2318. LFPDUX B1, BO, INC2
  2319. bdnz+ .L76
  2320. .align 4
  /* .L77: final k-step with the operands already in registers. */
  2321. .L77:
  2322. FXCPMADD f0, B1, A1, f0
  2323. FXCSMADD f1, B1, A1, f1
  2324. .align 4
  /* .L78: finish the one-row tile of the single column: fold the four      */
  /* accumulators into f0, subtract from the stored right-hand side, scale  */
  /* by the single factor element for the active variant, write the result  */
  /* back to the packed buffer and to C, then advance pointers and KK.      */
  2325. .L78:
  /* f0 += f2 and f1 += f3 merge the two accumulator pairs of the unrolled  */
  /* loop; f0 += f1 then merges the FXCPMADD and FXCSMADD parts.            */
  2326. fpadd f0, f0, f2
  2327. fpadd f1, f1, f3
  2328. fpadd f0, f0, f1
  /* LN/RT: re-derive AO/BO from AORIG/B at offset KK - 1 (both arms of the */
  /* inner #ifdef are identical), scaled by 0 + ZBASE_SHIFT for both.       */
  2329. #if defined(LN) || defined(RT)
  2330. #ifdef LN
  2331. subi r0, KK, 1
  2332. #else
  2333. subi r0, KK, 1
  2334. #endif
  2335. slwi TEMP, r0, 0 + ZBASE_SHIFT
  2336. add AO, AORIG, TEMP
  2337. add BO, B, TEMP
  2338. addi BO, BO, - 2 * SIZE
  2339. #endif
  /* Right-hand side: from B for LN/LT, from A for RN/RT (non-update load). */
  2340. #if defined(LN) || defined(LT)
  2341. LFPDX f16, BO, INC2
  2342. #else
  2343. LFPDX f16, AO, INC2
  2344. #endif
  /* f0 = f16 - f0 */
  2345. fpsub f0, f16, f0
  /* All four variants scale f0 by one element A1 (fxpmul + FXCXNPMA); it   */
  /* is read from AO for LN/LT and from BO for RN/RT.                       */
  2346. #ifdef LN
  2347. LFPDX A1, AO, INC2
  2348. fxpmul f4, A1, f0
  2349. FXCXNPMA f0, A1, f0, f4
  2350. #endif
  2351. #ifdef LT
  2352. LFPDX A1, AO, INC2
  2353. fxpmul f4, A1, f0
  2354. FXCXNPMA f0, A1, f0, f4
  2355. #endif
  2356. #ifdef RN
  2357. LFPDX A1, BO, INC2
  2358. fxpmul f4, A1, f0
  2359. FXCXNPMA f0, A1, f0, f4
  2360. #endif
  2361. #ifdef RT
  2362. LFPDX A1, BO, INC2
  2363. fxpmul f4, A1, f0
  2364. FXCXNPMA f0, A1, f0, f4
  2365. #endif
  /* LN: step CO1 back by 2 * SIZE before the stores ... */
  2366. #ifdef LN
  2367. subi CO1, CO1, 2 * SIZE
  2368. #endif
  /* Write the solved value back over the right-hand side. */
  2369. #if defined(LN) || defined(LT)
  2370. STFPDX f0, BO, INC2
  2371. #else
  2372. STFPDX f0, AO, INC2
  2373. #endif
  /* Store f0 to C through CO1 as an STFDUX / STFSDUX pair. */
  2374. STFDUX f0, CO1, INC
  2375. STFSDUX f0, CO1, INC
  /* ... and back again afterwards. */
  2376. #ifdef LN
  2377. subi CO1, CO1, 2 * SIZE
  2378. #endif
  /* RT: advance AORIG by K << (0 + ZBASE_SHIFT). */
  2379. #ifdef RT
  2380. slwi r0, K, 0 + ZBASE_SHIFT
  2381. add AORIG, AORIG, r0
  2382. #endif
  /* LT/RN: skip the remaining K - KK steps in both packed buffers. */
  2383. #if defined(LT) || defined(RN)
  2384. sub TEMP, K, KK
  2385. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  2386. add AO, AO, TEMP
  2387. add BO, BO, TEMP
  2388. #endif
  /* KK moves by the one row just processed (forward for LT, back for LN). */
  2389. #ifdef LT
  2390. addi KK, KK, 1
  2391. #endif
  2392. #ifdef LN
  2393. subi KK, KK, 1
  2394. #endif
  /* Reload f0 from the FZERO slot on the stack. */
  2395. li r0, FZERO
  2396. lfpsx f0, SP, r0
  2397. .align 4
  /* .L89: end of the single-column panel. Moves B past the panel and       */
  /* adjusts KK for the right-side variants, then falls through to the      */
  /* common exit at .L999.                                                  */
  2398. .L89:
  /* LN: B += K << (0 + ZBASE_SHIFT). */
  2399. #ifdef LN
  2400. slwi r0, K, 0 + ZBASE_SHIFT
  2401. add B, B, r0
  2402. #endif
  /* LT/RN: B resumes from where BO stopped, undoing its - 2 * SIZE bias. */
  2403. #if defined(LT) || defined(RN)
  2404. addi B, BO, 2 * SIZE
  2405. #endif
  /* KK moves by the one column just processed (RN forward, RT back). */
  2406. #ifdef RN
  2407. addi KK, KK, 1
  2408. #endif
  2409. #ifdef RT
  2410. subi KK, KK, 1
  2411. #endif
  2412. .align 4
  /* .L999: common exit. Restores the saved integer and floating-point      */
  /* registers from the stack frame and returns. The layout presumably      */
  /* mirrors the prologue, which is outside this part of the file.          */
  2413. .L999:
  /* Skip 20 bytes, then reload r14-r31 with lwzu: each load first advances */
  /* SP by 4 and then reads the word at the new SP.                         */
  2414. addi SP, SP, 20
  2415. lwzu r14, 4(SP)
  2416. lwzu r15, 4(SP)
  2417. lwzu r16, 4(SP)
  2418. lwzu r17, 4(SP)
  2419. lwzu r18, 4(SP)
  2420. lwzu r19, 4(SP)
  2421. lwzu r20, 4(SP)
  2422. lwzu r21, 4(SP)
  2423. lwzu r22, 4(SP)
  2424. lwzu r23, 4(SP)
  2425. lwzu r24, 4(SP)
  2426. lwzu r25, 4(SP)
  2427. lwzu r26, 4(SP)
  2428. lwzu r27, 4(SP)
  2429. lwzu r28, 4(SP)
  2430. lwzu r29, 4(SP)
  2431. lwzu r30, 4(SP)
  2432. lwzu r31, 4(SP)
  /* Re-bias SP by -12 so that the first update-form load below (SP + 16)   */
  /* lands 4 bytes past the r31 slot; f31 down to f14 then follow at        */
  /* 16-byte steps.                                                         */
  2433. subi SP, SP, 12
  2434. li r0, 16
  2435. lfpdux f31, SP, r0
  2436. lfpdux f30, SP, r0
  2437. lfpdux f29, SP, r0
  2438. lfpdux f28, SP, r0
  2439. lfpdux f27, SP, r0
  2440. lfpdux f26, SP, r0
  2441. lfpdux f25, SP, r0
  2442. lfpdux f24, SP, r0
  2443. lfpdux f23, SP, r0
  2444. lfpdux f22, SP, r0
  2445. lfpdux f21, SP, r0
  2446. lfpdux f20, SP, r0
  2447. lfpdux f19, SP, r0
  2448. lfpdux f18, SP, r0
  2449. lfpdux f17, SP, r0
  2450. lfpdux f16, SP, r0
  2451. lfpdux f15, SP, r0
  2452. lfpdux f14, SP, r0
  /* Step past the last 16-byte slot and return to the caller. */
  2453. addi SP, SP, 16
  2454. blr
  2455. .align 4
  2456. EPILOGUE
  2457. #endif