You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_hummer_LT.S 98 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Byte offsets from SP into the scratch area pushed at the end of the
     prologue: the incoming f1 is stored at offset 0 and two zero words
     are stored at offset 8. FZERO is used to reload f0 before the
     accumulators are cleared. */
  40. #define ALPHA 0
  41. #define FZERO 8
  /* Integer arguments. The routine branches to .L999 if any of M, N or K
     is <= 0. */
  42. #define M r3
  43. #define N r4
  44. #define K r5
  /* Pointer and stride arguments. These aliases are only defined for
     linux and FreeBSD builds. LDC is shifted left by BASE_SHIFT on entry. */
  45. #if defined(linux) || defined(__FreeBSD__)
  46. #define A r6
  47. #define B r7
  48. #define C r8
  49. #define LDC r9
  50. #define OFFSET r10
  51. #endif
  /* Scratch register for counts and offsets (for example K - KK). */
  52. #define TEMP r11
  /* Base pointer into A for the LN/RT variants, where AO is recomputed
     as AORIG plus an offset derived from KK. */
  53. #define AORIG r12
  /* Offset counter seeded from OFFSET: -OFFSET for RN, N - OFFSET for RT,
     M + OFFSET for LN, OFFSET for LT. */
  54. #define KK r14
  /* Index registers holding constant multiples of SIZE (SIZE is defined
     outside this file): -1, -4, -2, +2, +1 and +4 times SIZE. */
  55. #define INCM1 r15
  56. #define INCM4 r16
  57. #define INCM2 r17
  58. #define INC2 r19
  59. #define INC r20
  60. #define INC4 r21
  /* Loop counters: I is derived from M, J from N. */
  61. #define I r22
  62. #define J r23
  /* Working pointers: AO/AO2 walk A and BO/BO2 walk B. AO2 and BO2 are
     set 2 * SIZE beyond AO and BO respectively. */
  63. #define AO r24
  64. #define BO r25
  65. #define AO2 r26
  66. #define BO2 r27
  /* Four output pointers into C, spaced LDC apart, starting at C. */
  67. #define CO1 r28
  68. #define CO2 r29
  69. #define CO3 r30
  70. #define CO4 r31
  71. #ifndef NEEDPARAM
  /* Floating-point operand aliases: A1..A10 map to f16..f25 and B1..B6 to
     f26..f31. The names do not strictly bind a register to one matrix:
     for example A10 is loaded through BO in the .L15 remainder loop, and
     the code after .L18 addresses the same registers by their raw
     f16..f31 names. */
  72. #define A1 f16
  73. #define A2 f17
  74. #define A3 f18
  75. #define A4 f19
  76. #define A5 f20
  77. #define A6 f21
  78. #define A7 f22
  79. #define A8 f23
  80. #define A9 f24
  81. #define A10 f25
  82. #define B1 f26
  83. #define B2 f27
  84. #define B3 f28
  85. #define B4 f29
  86. #define B5 f30
  87. #define B6 f31
  /* AP is a second name for B6 (f31). */
  88. #define AP B6
  /* Routine entry. PROLOGUE and PROFCODE are macros defined outside this
     file. */
  89. PROLOGUE
  90. PROFCODE
  /* Save f14..f31 (18 registers). r0 = -16 is the update index, so SP
     steps down by 16 per store. */
  91. li r0, -16
  92. stfpdux f14, SP, r0
  93. stfpdux f15, SP, r0
  94. stfpdux f16, SP, r0
  95. stfpdux f17, SP, r0
  96. stfpdux f18, SP, r0
  97. stfpdux f19, SP, r0
  98. stfpdux f20, SP, r0
  99. stfpdux f21, SP, r0
  100. stfpdux f22, SP, r0
  101. stfpdux f23, SP, r0
  102. stfpdux f24, SP, r0
  103. stfpdux f25, SP, r0
  104. stfpdux f26, SP, r0
  105. stfpdux f27, SP, r0
  106. stfpdux f28, SP, r0
  107. stfpdux f29, SP, r0
  108. stfpdux f30, SP, r0
  109. stfpdux f31, SP, r0
  /* Save r31 down to r14 (18 registers), moving SP down 4 bytes per
     store. */
  110. stwu r31, -4(SP)
  111. stwu r30, -4(SP)
  112. stwu r29, -4(SP)
  113. stwu r28, -4(SP)
  114. stwu r27, -4(SP)
  115. stwu r26, -4(SP)
  116. stwu r25, -4(SP)
  117. stwu r24, -4(SP)
  118. stwu r23, -4(SP)
  119. stwu r22, -4(SP)
  120. stwu r21, -4(SP)
  121. stwu r20, -4(SP)
  122. stwu r19, -4(SP)
  123. stwu r18, -4(SP)
  124. stwu r17, -4(SP)
  125. stwu r16, -4(SP)
  126. stwu r15, -4(SP)
  127. stwu r14, -4(SP) # dummy
  /* Build the scratch area addressed by ALPHA and FZERO: push two zero
     words, then the incoming f1 below them. Afterwards f1 is at 0(SP) and
     the zero words are at 8(SP). */
  128. li r0, 0
  129. stwu r0, -4(SP)
  130. stwu r0, -4(SP)
  131. stfdu f1, -8(SP)
  /* Scale LDC by BASE_SHIFT (presumably element count -> byte stride;
     BASE_SHIFT is defined outside this file). */
  132. slwi LDC, LDC, BASE_SHIFT
  /* Branch to .L999 unless M, N and K are all positive. */
  133. cmpwi cr0, M, 0
  134. ble .L999
  135. cmpwi cr0, N, 0
  136. ble .L999
  137. cmpwi cr0, K, 0
  138. ble .L999
  /* Load the constant strides used as index operands by the update-form
     loads and stores. */
  139. li INC, 1 * SIZE
  140. li INC2, 2 * SIZE
  141. li INC4, 4 * SIZE
  142. li INCM1, -1 * SIZE
  143. li INCM2, -2 * SIZE
  144. li INCM4, -4 * SIZE
  /* Bias C down by one element. The later stores to C go through
     STFDUX/STFSDUX with INC (presumably pre-increment update forms;
     the macros are defined outside this file). */
  145. addi C, C, - 1 * SIZE
  /* LN: advance A by M * K elements and C by M elements. */
  146. #ifdef LN
  147. mullw r0, M, K
  148. slwi r0, r0, BASE_SHIFT
  149. add A, A, r0
  150. slwi r0, M, BASE_SHIFT
  151. add C, C, r0
  152. #endif
  /* RN: KK = -OFFSET. */
  153. #ifdef RN
  154. neg KK, OFFSET
  155. #endif
  /* RT: advance B by N * K elements and C by N * LDC (LDC is already
     scaled), then KK = N - OFFSET. */
  156. #ifdef RT
  157. mullw r0, N, K
  158. slwi r0, r0, BASE_SHIFT
  159. add B, B, r0
  160. mullw r0, N, LDC
  161. add C, C, r0
  162. sub KK, N, OFFSET
  163. #endif
  /* J = N >> 2 (arithmetic shift); skip to .L50 when the result is not
     positive. */
  164. srawi. J, N, 2
  165. ble .L50
  166. .align 4
  /* .L10: set-up for one pass that writes through the four pointers
     CO1..CO4. */
  167. .L10:
  /* RT: step B back by 4 * K elements and C back by 4 * LDC before use. */
  168. #ifdef RT
  169. slwi r0, K, 2 + BASE_SHIFT
  170. sub B, B, r0
  171. slwi r0, LDC, 2
  172. sub C, C, r0
  173. #endif
  /* CO1..CO4 = C, C + LDC, C + 2*LDC, C + 3*LDC. */
  174. mr CO1, C
  175. add CO2, C, LDC
  176. add CO3, CO2, LDC
  177. add CO4, CO3, LDC
  /* LN and LT re-seed KK here on every pass; RN and RT seeded it once
     before the loop. */
  178. #ifdef LN
  179. add KK, M, OFFSET
  180. #endif
  181. #ifdef LT
  182. mr KK, OFFSET
  183. #endif
  /* The A pointer is biased down by 4 * SIZE because the loads step it
     with INC4 through LFPDUX (presumably a pre-increment update form).
     LN/RT keep the biased base in AORIG; the other variants use AO
     directly. */
  184. #if defined(LN) || defined(RT)
  185. addi AORIG, A, -4 * SIZE
  186. #else
  187. addi AO, A, -4 * SIZE
  188. #endif
  /* Except for RT, move C on to CO4 + LDC. */
  189. #ifndef RT
  190. add C, CO4, LDC
  191. #endif
  /* Reload f0 from the zero words at FZERO(SP); the code that follows
     copies it into the other accumulators. */
  192. li r0, FZERO
  193. lfpsx f0, SP, r0
  /* I = M >> 3; go to .L20 when the result is not positive. */
  194. srawi. I, M, 3
  195. ble .L20
  196. .align 4
  197. .L11:
  198. #if defined(LT) || defined(RN)
  199. addi AO2, AO, 2 * SIZE
  200. fpmr f4, f0
  201. addi BO, B, - 4 * SIZE
  202. fpmr f8, f0
  203. addi BO2, B, - 2 * SIZE
  204. fpmr f12, f0
  205. fpmr f5, f0
  206. fpmr f9, f0
  207. fpmr f13, f0
  208. fpmr f2, f0
  209. fpmr f6, f0
  210. fpmr f10, f0
  211. fpmr f14, f0
  212. fpmr f3, f0
  213. fpmr f7, f0
  214. fpmr f11, f0
  215. fpmr f15, f0
  216. nop
  217. srawi. r0, KK, 2
  218. fpmr f1, f0
  219. mtspr CTR, r0
  220. ble .L14
  221. #else
  222. #ifdef LN
  223. slwi r0, K, 3 + BASE_SHIFT
  224. sub AORIG, AORIG, r0
  225. #endif
  226. slwi r0 , KK, 3 + BASE_SHIFT
  227. slwi TEMP, KK, 2 + BASE_SHIFT
  228. add AO, AORIG, r0
  229. add BO, B, TEMP
  230. sub TEMP, K, KK
  231. addi AO2, AO, 2 * SIZE
  232. fpmr f4, f0
  233. addi BO, BO, - 4 * SIZE
  234. fpmr f8, f0
  235. addi BO2, BO, 2 * SIZE
  236. fpmr f12, f0
  237. fpmr f5, f0
  238. fpmr f9, f0
  239. fpmr f13, f0
  240. fpmr f2, f0
  241. fpmr f6, f0
  242. fpmr f10, f0
  243. fpmr f14, f0
  244. fpmr f3, f0
  245. fpmr f7, f0
  246. fpmr f11, f0
  247. fpmr f15, f0
  248. nop
  249. srawi. r0, TEMP, 2
  250. fpmr f1, f0
  251. mtspr CTR, r0
  252. ble .L14
  253. #endif
  254. LFPDUX A1, AO, INC4
  255. fpmr f5, f0
  256. LFPDUX A3, AO, INC4
  257. fpmr f9, f0
  258. LFPDUX B1, BO, INC4
  259. fpmr f13, f0
  260. LFPDUX A5, AO, INC4
  261. fpmr f2, f0
  262. LFPDUX A6, AO, INC4
  263. fpmr f6, f0
  264. LFPDUX B3, BO, INC4
  265. fpmr f10, f0
  266. LFPDUX A7, AO, INC4
  267. fpmr f14, f0
  268. LFPDUX A8, AO, INC4
  269. fpmr f3, f0
  270. LFPDUX B5, BO, INC4
  271. fpmr f7, f0
  272. LFPDUX A9, AO, INC4
  273. fpmr f11, f0
  274. LFPDUX A2, AO2, INC4
  275. fpmr f15, f0
  276. LFPDUX B2, BO2, INC4
  277. bdz- .L13
  278. .align 4
  279. .L12:
  280. ## 1 ##
  281. fxcpmadd f0, B1, A1, f0
  282. nop
  283. fxcsmadd f4, B1, A1, f4
  284. nop
  285. fxcpmadd f8, B2, A1, f8
  286. LFPDUX B4, BO2, INC4
  287. fxcsmadd f12, B2, A1, f12
  288. LFPDUX B6, BO, INC4
  289. fxcpmadd f1, B1, A2, f1
  290. nop
  291. fxcsmadd f5, B1, A2, f5
  292. LFPDUX A4, AO2, INC4
  293. fxcpmadd f9, B2, A2, f9
  294. LFPDUX A10, AO, INC4
  295. fxcsmadd f13, B2, A2, f13
  296. nop
  297. fxcpmadd f2, B1, A3, f2
  298. nop
  299. fxcsmadd f6, B1, A3, f6
  300. nop
  301. fxcpmadd f10, B2, A3, f10
  302. nop
  303. fxcsmadd f14, B2, A3, f14
  304. nop
  305. fxcpmadd f3, B1, A4, f3
  306. nop
  307. fxcsmadd f7, B1, A4, f7
  308. LFPDUX A2, AO2, INC4
  309. fxcpmadd f11, B2, A4, f11
  310. LFPDUX A1, AO, INC4
  311. fxcsmadd f15, B2, A4, f15
  312. nop
  313. ## 2 ##
  314. fxcpmadd f0, B3, A5, f0
  315. nop
  316. fxcsmadd f4, B3, A5, f4
  317. nop
  318. fxcpmadd f8, B4, A5, f8
  319. LFPDUX B2, BO2, INC4
  320. fxcsmadd f12, B4, A5, f12
  321. LFPDUX B1, BO, INC4
  322. fxcpmadd f1, B3, A2, f1
  323. nop
  324. fxcsmadd f5, B3, A2, f5
  325. LFPDUX A4, AO2, INC4
  326. fxcpmadd f9, B4, A2, f9
  327. LFPDUX A3, AO, INC4
  328. fxcsmadd f13, B4, A2, f13
  329. nop
  330. fxcpmadd f2, B3, A6, f2
  331. nop
  332. fxcsmadd f6, B3, A6, f6
  333. nop
  334. fxcpmadd f10, B4, A6, f10
  335. nop
  336. fxcsmadd f14, B4, A6, f14
  337. nop
  338. fxcpmadd f3, B3, A4, f3
  339. nop
  340. fxcsmadd f7, B3, A4, f7
  341. LFPDUX A2, AO2, INC4
  342. fxcpmadd f11, B4, A4, f11
  343. LFPDUX A5, AO, INC4
  344. fxcsmadd f15, B4, A4, f15
  345. nop
  346. ## 3 ##
  347. fxcpmadd f0, B5, A7, f0
  348. nop
  349. fxcsmadd f4, B5, A7, f4
  350. nop
  351. fxcpmadd f8, B2, A7, f8
  352. LFPDUX B4, BO2, INC4
  353. fxcsmadd f12, B2, A7, f12
  354. LFPDUX B3, BO, INC4
  355. fxcpmadd f1, B5, A2, f1
  356. nop
  357. fxcsmadd f5, B5, A2, f5
  358. LFPDUX A4, AO2, INC4
  359. fxcpmadd f9, B2, A2, f9
  360. LFPDUX A6, AO, INC4
  361. fxcsmadd f13, B2, A2, f13
  362. nop
  363. fxcpmadd f2, B5, A8, f2
  364. nop
  365. fxcsmadd f6, B5, A8, f6
  366. nop
  367. fxcpmadd f10, B2, A8, f10
  368. nop
  369. fxcsmadd f14, B2, A8, f14
  370. nop
  371. fxcpmadd f3, B5, A4, f3
  372. nop
  373. fxcsmadd f7, B5, A4, f7
  374. LFPDUX A2, AO2, INC4
  375. fxcpmadd f11, B2, A4, f11
  376. LFPDUX A7, AO, INC4
  377. fxcsmadd f15, B2, A4, f15
  378. nop
  379. ## 4 ##
  380. fxcpmadd f0, B6, A9, f0
  381. nop
  382. fxcsmadd f4, B6, A9, f4
  383. nop
  384. fxcpmadd f8, B4, A9, f8
  385. LFPDUX B2, BO2, INC4
  386. fxcsmadd f12, B4, A9, f12
  387. LFPDUX B5, BO, INC4
  388. fxcpmadd f1, B6, A2, f1
  389. nop
  390. fxcsmadd f5, B6, A2, f5
  391. LFPDUX A4, AO2, INC4
  392. fxcpmadd f9, B4, A2, f9
  393. LFPDUX A8, AO, INC4
  394. fxcsmadd f13, B4, A2, f13
  395. nop
  396. fxcpmadd f2, B6, A10, f2
  397. nop
  398. fxcsmadd f6, B6, A10, f6
  399. nop
  400. fxcpmadd f10, B4, A10, f10
  401. nop
  402. fxcsmadd f14, B4, A10, f14
  403. nop
  404. fxcpmadd f3, B6, A4, f3
  405. LFPDUX A2, AO2, INC4
  406. fxcsmadd f7, B6, A4, f7
  407. LFPDUX A9, AO, INC4
  408. fxcpmadd f11, B4, A4, f11
  409. nop
  410. fxcsmadd f15, B4, A4, f15
  411. bdnz+ .L12
  412. .align 4
  413. .L13:
  414. ## 1 ##
  415. fxcpmadd f0, B1, A1, f0
  416. nop
  417. fxcsmadd f4, B1, A1, f4
  418. nop
  419. fxcpmadd f8, B2, A1, f8
  420. LFPDUX B4, BO2, INC4
  421. fxcsmadd f12, B2, A1, f12
  422. LFPDUX B6, BO, INC4
  423. fxcpmadd f1, B1, A2, f1
  424. nop
  425. fxcsmadd f5, B1, A2, f5
  426. LFPDUX A4, AO2, INC4
  427. fxcpmadd f9, B2, A2, f9
  428. LFPDUX A10, AO, INC4
  429. fxcsmadd f13, B2, A2, f13
  430. nop
  431. fxcpmadd f2, B1, A3, f2
  432. nop
  433. fxcsmadd f6, B1, A3, f6
  434. nop
  435. fxcpmadd f10, B2, A3, f10
  436. nop
  437. fxcsmadd f14, B2, A3, f14
  438. nop
  439. fxcpmadd f3, B1, A4, f3
  440. nop
  441. fxcsmadd f7, B1, A4, f7
  442. LFPDUX A2, AO2, INC4
  443. fxcpmadd f11, B2, A4, f11
  444. nop
  445. fxcsmadd f15, B2, A4, f15
  446. nop
  447. ## 2 ##
  448. fxcpmadd f0, B3, A5, f0
  449. nop
  450. fxcsmadd f4, B3, A5, f4
  451. nop
  452. fxcpmadd f8, B4, A5, f8
  453. LFPDUX B2, BO2, INC4
  454. fxcsmadd f12, B4, A5, f12
  455. nop
  456. fxcpmadd f1, B3, A2, f1
  457. nop
  458. fxcsmadd f5, B3, A2, f5
  459. LFPDUX A4, AO2, INC4
  460. fxcpmadd f9, B4, A2, f9
  461. nop
  462. fxcsmadd f13, B4, A2, f13
  463. nop
  464. fxcpmadd f2, B3, A6, f2
  465. nop
  466. fxcsmadd f6, B3, A6, f6
  467. nop
  468. fxcpmadd f10, B4, A6, f10
  469. nop
  470. fxcsmadd f14, B4, A6, f14
  471. nop
  472. fxcpmadd f3, B3, A4, f3
  473. nop
  474. fxcsmadd f7, B3, A4, f7
  475. LFPDUX A2, AO2, INC4
  476. fxcpmadd f11, B4, A4, f11
  477. nop
  478. fxcsmadd f15, B4, A4, f15
  479. nop
  480. ## 3 ##
  481. fxcpmadd f0, B5, A7, f0
  482. nop
  483. fxcsmadd f4, B5, A7, f4
  484. nop
  485. fxcpmadd f8, B2, A7, f8
  486. LFPDUX B4, BO2, INC4
  487. fxcsmadd f12, B2, A7, f12
  488. nop
  489. fxcpmadd f1, B5, A2, f1
  490. nop
  491. fxcsmadd f5, B5, A2, f5
  492. LFPDUX A4, AO2, INC4
  493. fxcpmadd f9, B2, A2, f9
  494. nop
  495. fxcsmadd f13, B2, A2, f13
  496. fxcpmadd f2, B5, A8, f2
  497. nop
  498. fxcsmadd f6, B5, A8, f6
  499. nop
  500. fxcpmadd f10, B2, A8, f10
  501. nop
  502. fxcsmadd f14, B2, A8, f14
  503. nop
  504. fxcpmadd f3, B5, A4, f3
  505. nop
  506. fxcsmadd f7, B5, A4, f7
  507. LFPDUX A2, AO2, INC4
  508. fxcpmadd f11, B2, A4, f11
  509. nop
  510. fxcsmadd f15, B2, A4, f15
  511. nop
  512. ## 4 ##
  513. fxcpmadd f0, B6, A9, f0
  514. nop
  515. fxcsmadd f4, B6, A9, f4
  516. nop
  517. fxcpmadd f8, B4, A9, f8
  518. nop
  519. fxcsmadd f12, B4, A9, f12
  520. nop
  521. fxcpmadd f1, B6, A2, f1
  522. nop
  523. fxcsmadd f5, B6, A2, f5
  524. LFPDUX A4, AO2, INC4
  525. fxcpmadd f9, B4, A2, f9
  526. nop
  527. fxcsmadd f13, B4, A2, f13
  528. nop
  529. fxcpmadd f2, B6, A10, f2
  530. nop
  531. fxcsmadd f6, B6, A10, f6
  532. nop
  533. fxcpmadd f10, B4, A10, f10
  534. nop
  535. fxcsmadd f14, B4, A10, f14
  536. nop
  537. fxcpmadd f3, B6, A4, f3
  538. nop
  539. fxcsmadd f7, B6, A4, f7
  540. nop
  541. fxcpmadd f11, B4, A4, f11
  542. nop
  543. fxcsmadd f15, B4, A4, f15
  544. nop
  545. .align 4
  /* .L14: remainder accumulation. CTR = count & 3, where count is KK for
     LT/RN and TEMP (set to K - KK earlier) otherwise. The masked value is
     never negative, so the ble to .L18 is taken only when it is zero. */
  546. .L14:
  547. #if defined(LT) || defined(RN)
  548. andi. r0, KK, 3
  549. mtspr CTR, r0
  550. ble+ .L18
  551. #else
  552. andi. r0, TEMP, 3
  553. mtspr CTR, r0
  554. ble+ .L18
  555. #endif
  556. .align 4
  /* .L15: preload the first operands through AO, AO2, BO and BO2. If CTR
     drops to zero here, only the final step at .L17 runs. */
  557. .L15:
  558. LFPDUX A2, AO, INC4
  559. LFPDUX A4, AO2, INC4
  560. LFPDUX A10, BO, INC4
  561. LFPDUX B4, BO2, INC4
  562. bdz- .L17
  563. .align 4
  /* .L16: one step per iteration. Sixteen fxcpmadd/fxcsmadd updates
     accumulate into f0..f15, using A10 and B4 (read through BO and BO2)
     against A2 and A4 (read through AO and AO2). A2 and A4 are reloaded
     mid-step, and the operands for the next step are loaded before the
     loop branch. */
  564. .L16:
  565. fxcpmadd f0, A10, A2, f0
  566. fxcsmadd f4, A10, A2, f4
  567. fxcpmadd f8, B4, A2, f8
  568. fxcsmadd f12, B4, A2, f12
  569. LFPDUX A2, AO, INC4
  570. fxcpmadd f1, A10, A4, f1
  571. fxcsmadd f5, A10, A4, f5
  572. fxcpmadd f9, B4, A4, f9
  573. fxcsmadd f13, B4, A4, f13
  574. LFPDUX A4, AO2, INC4
  575. fxcpmadd f2, A10, A2, f2
  576. fxcsmadd f6, A10, A2, f6
  577. fxcpmadd f10, B4, A2, f10
  578. fxcsmadd f14, B4, A2, f14
  579. LFPDUX A2, AO, INC4
  580. fxcpmadd f3, A10, A4, f3
  581. fxcsmadd f7, A10, A4, f7
  582. LFPDUX A10, BO, INC4
  583. fxcpmadd f11, B4, A4, f11
  584. fxcsmadd f15, B4, A4, f15
  585. LFPDUX A4, AO2, INC4
  586. LFPDUX B4, BO2, INC4
  587. bdnz+ .L16
  588. .align 4
  /* .L17: final step. It performs the same sixteen updates but only
     reloads A2 and A4 mid-step; nothing is loaded for a following
     iteration. */
  589. .L17:
  590. fxcpmadd f0, A10, A2, f0
  591. fxcsmadd f4, A10, A2, f4
  592. fxcpmadd f8, B4, A2, f8
  593. fxcsmadd f12, B4, A2, f12
  594. LFPDUX A2, AO, INC4
  595. fxcpmadd f1, A10, A4, f1
  596. fxcsmadd f5, A10, A4, f5
  597. fxcpmadd f9, B4, A4, f9
  598. fxcsmadd f13, B4, A4, f13
  599. LFPDUX A4, AO2, INC4
  600. fxcpmadd f2, A10, A2, f2
  601. fxcsmadd f6, A10, A2, f6
  602. fxcpmadd f10, B4, A2, f10
  603. fxcsmadd f14, B4, A2, f14
  604. fxcpmadd f3, A10, A4, f3
  605. fxcsmadd f7, A10, A4, f7
  606. fxcpmadd f11, B4, A4, f11
  607. fxcsmadd f15, B4, A4, f15
  608. .align 4
  609. .L18:
  610. #if defined(LN) || defined(RT)
  611. #ifdef LN
  612. subi r0, KK, 8
  613. #else
  614. subi r0, KK, 4
  615. #endif
  616. slwi TEMP, r0, 3 + BASE_SHIFT
  617. slwi r0, r0, 2 + BASE_SHIFT
  618. add AO, AORIG, TEMP
  619. add BO, B, r0
  620. addi AO2, AO, 2 * SIZE
  621. addi BO, BO, - 4 * SIZE
  622. addi BO2, BO, 2 * SIZE
  623. #endif
  624. #if defined(LN) || defined(LT)
  625. fpmr f24, f0
  626. LFPDUX f16, BO, INC4
  627. fpmr f25, f1
  628. nop
  629. fpmr f26, f2
  630. LFPDUX f17, BO2, INC4
  631. fpmr f27, f3
  632. nop
  633. fpmr f28, f8
  634. LFPDUX f18, BO, INC4
  635. fpmr f29, f9
  636. nop
  637. fpmr f30, f10
  638. LFPDUX f19, BO2, INC4
  639. fpmr f31, f11
  640. nop
  641. fsmfp f0, f4
  642. LFPDUX f20, BO, INC4
  643. fsmfp f1, f5
  644. nop
  645. fsmfp f2, f6
  646. LFPDUX f21, BO2, INC4
  647. fsmfp f3, f7
  648. nop
  649. fsmfp f8, f12
  650. LFPDUX f22, BO, INC4
  651. fsmfp f9, f13
  652. nop
  653. fsmfp f10, f14
  654. LFPDUX f23, BO2, INC4
  655. fsmfp f11, f15
  656. nop
  657. fsmtp f4, f24
  658. LFPDUX f24, BO, INC4
  659. fsmtp f5, f25
  660. nop
  661. fsmtp f6, f26
  662. LFPDUX f25, BO2, INC4
  663. fsmtp f7, f27
  664. nop
  665. fsmtp f12, f28
  666. LFPDUX f26, BO, INC4
  667. fsmtp f13, f29
  668. nop
  669. fsmtp f14, f30
  670. LFPDUX f27, BO2, INC4
  671. fsmtp f15, f31
  672. nop
  673. fpsub f0, f16, f0
  674. LFPDUX f28, BO, INC4
  675. fpsub f8, f17, f8
  676. nop
  677. fpsub f4, f18, f4
  678. LFPDUX f29, BO2, INC4
  679. fpsub f12, f19, f12
  680. nop
  681. fpsub f1, f20, f1
  682. LFPDUX f30, BO, INC4
  683. fpsub f9, f21, f9
  684. subi BO, BO, 32 * SIZE
  685. fpsub f5, f22, f5
  686. LFPDUX f31, BO2, INC4
  687. fpsub f13, f23, f13
  688. subi BO2, BO2, 32 * SIZE
  689. fpsub f2, f24, f2
  690. fpsub f10, f25, f10
  691. fpsub f6, f26, f6
  692. fpsub f14, f27, f14
  693. fpsub f3, f28, f3
  694. fpsub f11, f29, f11
  695. fpsub f7, f30, f7
  696. fpsub f15, f31, f15
  697. #else
  698. LFPDUX f16, AO, INC4
  699. LFPDUX f17, AO2, INC4
  700. LFPDUX f18, AO, INC4
  701. LFPDUX f19, AO2, INC4
  702. LFPDUX f20, AO, INC4
  703. LFPDUX f21, AO2, INC4
  704. LFPDUX f22, AO, INC4
  705. LFPDUX f23, AO2, INC4
  706. fpsub f0, f16, f0
  707. LFPDUX f24, AO, INC4
  708. fpsub f1, f17, f1
  709. LFPDUX f25, AO2, INC4
  710. fpsub f2, f18, f2
  711. LFPDUX f26, AO, INC4
  712. fpsub f3, f19, f3
  713. LFPDUX f27, AO2, INC4
  714. fpsub f4, f20, f4
  715. LFPDUX f28, AO, INC4
  716. fpsub f5, f21, f5
  717. LFPDUX f29, AO2, INC4
  718. fpsub f6, f22, f6
  719. LFPDUX f30, AO, INC4
  720. fpsub f7, f23, f7
  721. LFPDUX f31, AO2, INC4
  722. fpsub f8, f24, f8
  723. subi AO, AO, 32 * SIZE
  724. fpsub f9, f25, f9
  725. subi AO2, AO2, 32 * SIZE
  726. fpsub f10, f26, f10
  727. fpsub f11, f27, f11
  728. fpsub f12, f28, f12
  729. fpsub f13, f29, f13
  730. fpsub f14, f30, f14
  731. fpsub f15, f31, f15
  732. #endif
  733. #ifdef LN
  734. addi AO, AO, 68 * SIZE
  735. addi AO2, AO2, 68 * SIZE
  736. LFPDUX A1, AO2, INCM4
  737. LFPDUX A2, AO, INCM4
  738. LFPDUX A3, AO2, INCM4
  739. LFPDUX A4, AO, INCM4
  740. LFPDUX A5, AO2, INCM4
  741. LFPDUX A6, AO, INCM4
  742. LFPDUX A7, AO2, INCM4
  743. LFPDUX A8, AO, INCM4
  744. fxsmul f7, A1, f7
  745. fxsmul f15, A1, f15
  746. fxcpnmsub f3, A1, f7, f3
  747. fxcpnmsub f11, A1, f15, f11
  748. fxcsnmsub f6, A2, f7, f6
  749. fxcsnmsub f14, A2, f15, f14
  750. fxcpnmsub f2, A2, f7, f2
  751. fxcpnmsub f10, A2, f15, f10
  752. fxcsnmsub f5, A3, f7, f5
  753. fxcsnmsub f13, A3, f15, f13
  754. fxcpnmsub f1, A3, f7, f1
  755. fxcpnmsub f9, A3, f15, f9
  756. fxcsnmsub f4, A4, f7, f4
  757. fxcsnmsub f12, A4, f15, f12
  758. fxcpnmsub f0, A4, f7, f0
  759. fxcpnmsub f8, A4, f15, f8
  760. fxpmul f3, A5, f3
  761. fxpmul f11, A5, f11
  762. fxcsnmsub f6, A6, f3, f6
  763. fxcsnmsub f14, A6, f11, f14
  764. fxcpnmsub f2, A6, f3, f2
  765. fxcpnmsub f10, A6, f11, f10
  766. fxcsnmsub f5, A7, f3, f5
  767. fxcsnmsub f13, A7, f11, f13
  768. fxcpnmsub f1, A7, f3, f1
  769. fxcpnmsub f9, A7, f11, f9
  770. fxcsnmsub f4, A8, f3, f4
  771. fxcsnmsub f12, A8, f11, f12
  772. fxcpnmsub f0, A8, f3, f0
  773. fxcpnmsub f8, A8, f11, f8
  774. add AO2, AO2, INCM4
  775. LFPDUX A1, AO, INCM4
  776. LFPDUX A2, AO2, INCM4
  777. LFPDUX A3, AO, INCM4
  778. add AO2, AO2, INCM4
  779. LFPDUX A4, AO, INCM4
  780. LFPDUX A5, AO2, INCM4
  781. LFPDUX A6, AO, INCM4
  782. add AO2, AO2, INCM4
  783. add AO, AO, INCM4
  784. LFPDUX A7, AO2, INCM4
  785. LFPDUX A8, AO, INCM4
  786. fxsmul f6, A1, f6
  787. fxsmul f14, A1, f14
  788. fxcpnmsub f2, A1, f6, f2
  789. fxcpnmsub f10, A1, f14, f10
  790. fxcsnmsub f5, A2, f6, f5
  791. fxcsnmsub f13, A2, f14, f13
  792. fxcpnmsub f1, A2, f6, f1
  793. fxcpnmsub f9, A2, f14, f9
  794. fxcsnmsub f4, A3, f6, f4
  795. fxcsnmsub f12, A3, f14, f12
  796. fxcpnmsub f0, A3, f6, f0
  797. fxcpnmsub f8, A3, f14, f8
  798. fxpmul f2, A4, f2
  799. fxpmul f10, A4, f10
  800. fxcsnmsub f5, A5, f2, f5
  801. fxcsnmsub f13, A5, f10, f13
  802. fxcpnmsub f1, A5, f2, f1
  803. fxcpnmsub f9, A5, f10, f9
  804. fxcsnmsub f4, A6, f2, f4
  805. fxcsnmsub f12, A6, f10, f12
  806. fxcpnmsub f0, A6, f2, f0
  807. fxcpnmsub f8, A6, f10, f8
  808. fxsmul f5, A7, f5
  809. fxsmul f13, A7, f13
  810. fxcpnmsub f1, A7, f5, f1
  811. fxcpnmsub f9, A7, f13, f9
  812. fxcsnmsub f4, A8, f5, f4
  813. fxcsnmsub f12, A8, f13, f12
  814. fxcpnmsub f0, A8, f5, f0
  815. fxcpnmsub f8, A8, f13, f8
  816. add AO2, AO2, INCM4
  817. add AO, AO, INCM4
  818. LFPDUX A1, AO2, INCM4
  819. LFPDUX A2, AO, INCM4
  820. subi AO2, AO2, 8 * SIZE
  821. add AO, AO, INCM4
  822. LFPDUX A3, AO, INCM4
  823. subi AO2, AO2, 8 * SIZE
  824. add AO, AO, INCM4
  825. LFPDUX A4, AO, INCM4
  826. addi AO, AO, -4 * SIZE
  827. addi AO2, AO2, -4 * SIZE
  828. fxpmul f1, A1, f1
  829. fxpmul f9, A1, f9
  830. fxcsnmsub f4, A2, f1, f4
  831. fxcsnmsub f12, A2, f9, f12
  832. fxcpnmsub f0, A2, f1, f0
  833. fxcpnmsub f8, A2, f9, f8
  834. fxsmul f4, A3, f4
  835. fxsmul f12, A3, f12
  836. fxcpnmsub f0, A3, f4, f0
  837. fxcpnmsub f8, A3, f12, f8
  838. fxpmul f0, A4, f0
  839. fxpmul f8, A4, f8
  840. #endif
  841. #ifdef LT
  842. LFPDUX A1, AO, INC4
  843. LFPDUX A2, AO2, INC4
  844. LFPDUX A3, AO, INC4
  845. LFPDUX A4, AO2, INC4
  846. LFPDUX A5, AO, INC4
  847. LFPDUX A6, AO2, INC4
  848. LFPDUX A7, AO, INC4
  849. LFPDUX A8, AO2, INC4
  850. fxpmul f0, A1, f0
  851. fxpmul f8, A1, f8
  852. fxcsnmsub f4, A1, f0, f4
  853. fxcsnmsub f12, A1, f8, f12
  854. fxcpnmsub f1, A2, f0, f1
  855. fxcpnmsub f9, A2, f8, f9
  856. fxcsnmsub f5, A2, f0, f5
  857. fxcsnmsub f13, A2, f8, f13
  858. fxcpnmsub f2, A3, f0, f2
  859. fxcpnmsub f10, A3, f8, f10
  860. fxcsnmsub f6, A3, f0, f6
  861. fxcsnmsub f14, A3, f8, f14
  862. fxcpnmsub f3, A4, f0, f3
  863. fxcpnmsub f11, A4, f8, f11
  864. fxcsnmsub f7, A4, f0, f7
  865. fxcsnmsub f15, A4, f8, f15
  866. fxsmul f4, A5, f4
  867. fxsmul f12, A5, f12
  868. fxcpnmsub f1, A6, f4, f1
  869. fxcpnmsub f9, A6, f12, f9
  870. fxcsnmsub f5, A6, f4, f5
  871. fxcsnmsub f13, A6, f12, f13
  872. fxcpnmsub f2, A7, f4, f2
  873. fxcpnmsub f10, A7, f12, f10
  874. fxcsnmsub f6, A7, f4, f6
  875. fxcsnmsub f14, A7, f12, f14
  876. fxcpnmsub f3, A8, f4, f3
  877. fxcpnmsub f11, A8, f12, f11
  878. fxcsnmsub f7, A8, f4, f7
  879. fxcsnmsub f15, A8, f12, f15
  880. add AO, AO, INC4
  881. LFPDUX A1, AO2, INC4
  882. LFPDUX A2, AO, INC4
  883. LFPDUX A3, AO2, INC4
  884. add AO, AO, INC4
  885. LFPDUX A4, AO2, INC4
  886. LFPDUX A5, AO, INC4
  887. LFPDUX A6, AO2, INC4
  888. add AO, AO, INC4
  889. add AO2, AO2, INC4
  890. LFPDUX A7, AO, INC4
  891. LFPDUX A8, AO2, INC4
  892. fxpmul f1, A1, f1
  893. fxpmul f9, A1, f9
  894. fxcsnmsub f5, A1, f1, f5
  895. fxcsnmsub f13, A1, f9, f13
  896. fxcpnmsub f2, A2, f1, f2
  897. fxcpnmsub f10, A2, f9, f10
  898. fxcsnmsub f6, A2, f1, f6
  899. fxcsnmsub f14, A2, f9, f14
  900. fxcpnmsub f3, A3, f1, f3
  901. fxcpnmsub f11, A3, f9, f11
  902. fxcsnmsub f7, A3, f1, f7
  903. fxcsnmsub f15, A3, f9, f15
  904. fxsmul f5, A4, f5
  905. fxsmul f13, A4, f13
  906. fxcpnmsub f2, A5, f5, f2
  907. fxcpnmsub f10, A5, f13, f10
  908. fxcsnmsub f6, A5, f5, f6
  909. fxcsnmsub f14, A5, f13, f14
  910. fxcpnmsub f3, A6, f5, f3
  911. fxcpnmsub f11, A6, f13, f11
  912. fxcsnmsub f7, A6, f5, f7
  913. fxcsnmsub f15, A6, f13, f15
  914. fxpmul f2, A7, f2
  915. fxpmul f10, A7, f10
  916. fxcsnmsub f6, A7, f2, f6
  917. fxcsnmsub f14, A7, f10, f14
  918. fxcpnmsub f3, A8, f2, f3
  919. fxcpnmsub f11, A8, f10, f11
  920. fxcsnmsub f7, A8, f2, f7
  921. fxcsnmsub f15, A8, f10, f15
  922. add AO, AO, INC4
  923. add AO2, AO2, INC4
  924. LFPDUX A1, AO, INC4
  925. LFPDUX A2, AO2, INC4
  926. addi AO, AO, 8 * SIZE
  927. addi AO2, AO2, 4 * SIZE
  928. LFPDUX A3, AO2, INC4
  929. addi AO, AO, 8 * SIZE
  930. addi AO2, AO2, 4 * SIZE
  931. LFPDUX A4, AO2, INC4
  932. subi AO, AO, 64 * SIZE
  933. subi AO2, AO2, 64 * SIZE
  934. fxsmul f6, A1, f6
  935. fxsmul f14, A1, f14
  936. fxcpnmsub f3, A2, f6, f3
  937. fxcpnmsub f11, A2, f14, f11
  938. fxcsnmsub f7, A2, f6, f7
  939. fxcsnmsub f15, A2, f14, f15
  940. fxpmul f3, A3, f3
  941. fxpmul f11, A3, f11
  942. fxcsnmsub f7, A3, f3, f7
  943. fxcsnmsub f15, A3, f11, f15
  944. fxsmul f7, A4, f7
  945. fxsmul f15, A4, f15
  946. #endif
  947. #ifdef RN
  948. LFPDUX A1, BO, INC4
  949. LFPDUX A2, BO2, INC4
  950. LFPDUX A3, BO, INC4
  951. LFPDUX A4, BO2, INC4
  952. add BO, BO, INC4
  953. LFPDUX A5, BO2, INC4
  954. add BO, BO, INC4
  955. LFPDUX A6, BO2, INC4
  956. subi BO, BO, 16 * SIZE
  957. subi BO2, BO2, 16 * SIZE
  958. fxpmul f0, A1, f0
  959. fxpmul f1, A1, f1
  960. fxpmul f2, A1, f2
  961. fxpmul f3, A1, f3
  962. fxcsnmsub f4, A1, f0, f4
  963. fxcsnmsub f5, A1, f1, f5
  964. fxcsnmsub f6, A1, f2, f6
  965. fxcsnmsub f7, A1, f3, f7
  966. fxcpnmsub f8, A2, f0, f8
  967. fxcpnmsub f9, A2, f1, f9
  968. fxcpnmsub f10, A2, f2, f10
  969. fxcpnmsub f11, A2, f3, f11
  970. fxcsnmsub f12, A2, f0, f12
  971. fxcsnmsub f13, A2, f1, f13
  972. fxcsnmsub f14, A2, f2, f14
  973. fxcsnmsub f15, A2, f3, f15
  974. fxsmul f4, A3, f4
  975. fxsmul f5, A3, f5
  976. fxsmul f6, A3, f6
  977. fxsmul f7, A3, f7
  978. fxcpnmsub f8, A4, f4, f8
  979. fxcpnmsub f9, A4, f5, f9
  980. fxcpnmsub f10, A4, f6, f10
  981. fxcpnmsub f11, A4, f7, f11
  982. fxcsnmsub f12, A4, f4, f12
  983. fxcsnmsub f13, A4, f5, f13
  984. fxcsnmsub f14, A4, f6, f14
  985. fxcsnmsub f15, A4, f7, f15
  986. fxpmul f8, A5, f8
  987. fxpmul f9, A5, f9
  988. fxpmul f10, A5, f10
  989. fxpmul f11, A5, f11
  990. fxcsnmsub f12, A5, f8, f12
  991. fxcsnmsub f13, A5, f9, f13
  992. fxcsnmsub f14, A5, f10, f14
  993. fxcsnmsub f15, A5, f11, f15
  994. fxsmul f12, A6, f12
  995. fxsmul f13, A6, f13
  996. fxsmul f14, A6, f14
  997. fxsmul f15, A6, f15
  998. #endif
  999. #ifdef RT
  1000. addi BO, BO, 20 * SIZE
  1001. addi BO2, BO2, 20 * SIZE
  1002. LFPDUX A1, BO2, INCM4
  1003. LFPDUX A2, BO, INCM4
  1004. LFPDUX A3, BO2, INCM4
  1005. LFPDUX A4, BO, INCM4
  1006. add BO2, BO2, INCM4
  1007. LFPDUX A5, BO, INCM4
  1008. add BO2, BO2, INCM4
  1009. LFPDUX A6, BO, INCM4
  1010. subi BO, BO, 4 * SIZE
  1011. subi BO2, BO2, 4 * SIZE
  1012. fxsmul f12, A1, f12
  1013. fxsmul f13, A1, f13
  1014. fxsmul f14, A1, f14
  1015. fxsmul f15, A1, f15
  1016. fxcpnmsub f8, A1, f12, f8
  1017. fxcpnmsub f9, A1, f13, f9
  1018. fxcpnmsub f10, A1, f14, f10
  1019. fxcpnmsub f11, A1, f15, f11
  1020. fxcsnmsub f4, A2, f12, f4
  1021. fxcsnmsub f5, A2, f13, f5
  1022. fxcsnmsub f6, A2, f14, f6
  1023. fxcsnmsub f7, A2, f15, f7
  1024. fxcpnmsub f0, A2, f12, f0
  1025. fxcpnmsub f1, A2, f13, f1
  1026. fxcpnmsub f2, A2, f14, f2
  1027. fxcpnmsub f3, A2, f15, f3
  1028. fxpmul f8, A3, f8
  1029. fxpmul f9, A3, f9
  1030. fxpmul f10, A3, f10
  1031. fxpmul f11, A3, f11
  1032. fxcsnmsub f4, A4, f8, f4
  1033. fxcsnmsub f5, A4, f9, f5
  1034. fxcsnmsub f6, A4, f10, f6
  1035. fxcsnmsub f7, A4, f11, f7
  1036. fxcpnmsub f0, A4, f8, f0
  1037. fxcpnmsub f1, A4, f9, f1
  1038. fxcpnmsub f2, A4, f10, f2
  1039. fxcpnmsub f3, A4, f11, f3
  1040. fxsmul f4, A5, f4
  1041. fxsmul f5, A5, f5
  1042. fxsmul f6, A5, f6
  1043. fxsmul f7, A5, f7
  1044. fxcpnmsub f0, A5, f4, f0
  1045. fxcpnmsub f1, A5, f5, f1
  1046. fxcpnmsub f2, A5, f6, f2
  1047. fxcpnmsub f3, A5, f7, f3
  1048. fxpmul f0, A6, f0
  1049. fxpmul f1, A6, f1
  1050. fxpmul f2, A6, f2
  1051. fxpmul f3, A6, f3
  1052. #endif
  1053. #ifdef LN
  1054. subi CO1, CO1, 8 * SIZE
  1055. subi CO2, CO2, 8 * SIZE
  1056. subi CO3, CO3, 8 * SIZE
  1057. subi CO4, CO4, 8 * SIZE
  1058. #endif
  1059. #if defined(LN) || defined(LT)
  1060. STFPDUX f0, BO, INC4
  1061. STFPDUX f8, BO2, INC4
  1062. STFPDUX f4, BO, INC4
  1063. STFPDUX f12, BO2, INC4
  1064. STFPDUX f1, BO, INC4
  1065. STFPDUX f9, BO2, INC4
  1066. STFPDUX f5, BO, INC4
  1067. STFPDUX f13, BO2, INC4
  1068. STFPDUX f2, BO, INC4
  1069. STFPDUX f10, BO2, INC4
  1070. STFPDUX f6, BO, INC4
  1071. STFPDUX f14, BO2, INC4
  1072. STFPDUX f3, BO, INC4
  1073. STFPDUX f11, BO2, INC4
  1074. STFPDUX f7, BO, INC4
  1075. STFPDUX f15, BO2, INC4
  1076. subi BO, BO, 32 * SIZE
  1077. subi BO2, BO2, 32 * SIZE
  1078. STFDUX f0, CO1, INC
  1079. STFDUX f4, CO1, INC
  1080. STFDUX f1, CO1, INC
  1081. STFDUX f5, CO1, INC
  1082. STFDUX f2, CO1, INC
  1083. STFDUX f6, CO1, INC
  1084. STFDUX f3, CO1, INC
  1085. STFDUX f7, CO1, INC
  1086. STFSDUX f0, CO2, INC
  1087. STFSDUX f4, CO2, INC
  1088. STFSDUX f1, CO2, INC
  1089. STFSDUX f5, CO2, INC
  1090. STFSDUX f2, CO2, INC
  1091. STFSDUX f6, CO2, INC
  1092. STFSDUX f3, CO2, INC
  1093. STFSDUX f7, CO2, INC
  1094. STFDUX f8, CO3, INC
  1095. STFDUX f12, CO3, INC
  1096. STFDUX f9, CO3, INC
  1097. STFDUX f13, CO3, INC
  1098. STFDUX f10, CO3, INC
  1099. STFDUX f14, CO3, INC
  1100. STFDUX f11, CO3, INC
  1101. STFDUX f15, CO3, INC
  1102. STFSDUX f8, CO4, INC
  1103. STFSDUX f12, CO4, INC
  1104. STFSDUX f9, CO4, INC
  1105. STFSDUX f13, CO4, INC
  1106. STFSDUX f10, CO4, INC
  1107. STFSDUX f14, CO4, INC
  1108. STFSDUX f11, CO4, INC
  1109. STFSDUX f15, CO4, INC
  1110. #else
  1111. STFPDUX f0, AO, INC4
  1112. STFPDUX f1, AO2, INC4
  1113. STFPDUX f2, AO, INC4
  1114. STFPDUX f3, AO2, INC4
  1115. STFPDUX f4, AO, INC4
  1116. STFPDUX f5, AO2, INC4
  1117. STFPDUX f6, AO, INC4
  1118. STFPDUX f7, AO2, INC4
  1119. STFPDUX f8, AO, INC4
  1120. STFPDUX f9, AO2, INC4
  1121. STFPDUX f10, AO, INC4
  1122. STFPDUX f11, AO2, INC4
  1123. STFPDUX f12, AO, INC4
  1124. STFPDUX f13, AO2, INC4
  1125. STFPDUX f14, AO, INC4
  1126. STFPDUX f15, AO2, INC4
  1127. subi AO, AO, 32 * SIZE
  1128. subi AO2, AO2, 32 * SIZE
  1129. STFDUX f0, CO1, INC
  1130. STFSDUX f0, CO1, INC
  1131. STFDUX f1, CO1, INC
  1132. STFSDUX f1, CO1, INC
  1133. STFDUX f2, CO1, INC
  1134. STFSDUX f2, CO1, INC
  1135. STFDUX f3, CO1, INC
  1136. STFSDUX f3, CO1, INC
  1137. STFDUX f4, CO2, INC
  1138. STFSDUX f4, CO2, INC
  1139. STFDUX f5, CO2, INC
  1140. STFSDUX f5, CO2, INC
  1141. STFDUX f6, CO2, INC
  1142. STFSDUX f6, CO2, INC
  1143. STFDUX f7, CO2, INC
  1144. STFSDUX f7, CO2, INC
  1145. STFDUX f8, CO3, INC
  1146. STFSDUX f8, CO3, INC
  1147. STFDUX f9, CO3, INC
  1148. STFSDUX f9, CO3, INC
  1149. STFDUX f10, CO3, INC
  1150. STFSDUX f10, CO3, INC
  1151. STFDUX f11, CO3, INC
  1152. STFSDUX f11, CO3, INC
  1153. STFDUX f12, CO4, INC
  1154. STFSDUX f12, CO4, INC
  1155. STFDUX f13, CO4, INC
  1156. STFSDUX f13, CO4, INC
  1157. STFDUX f14, CO4, INC
  1158. STFSDUX f14, CO4, INC
  1159. STFDUX f15, CO4, INC
  1160. STFSDUX f15, CO4, INC
  1161. #endif
  1162. #ifdef LN
  1163. subi CO1, CO1, 8 * SIZE
  1164. subi CO2, CO2, 8 * SIZE
  1165. subi CO3, CO3, 8 * SIZE
  1166. subi CO4, CO4, 8 * SIZE
  1167. #endif
  1168. #ifdef RT
  1169. slwi r0, K, 3 + BASE_SHIFT
  1170. add AORIG, AORIG, r0
  1171. #endif
  1172. #if defined(LT) || defined(RN)
  1173. sub TEMP, K, KK
  1174. slwi r0, TEMP, 3 + BASE_SHIFT
  1175. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1176. add AO, AO, r0
  1177. add BO, BO, TEMP
  1178. #endif
  1179. #ifdef LT
  1180. addi KK, KK, 8
  1181. #endif
  1182. #ifdef LN
  1183. subi KK, KK, 8
  1184. #endif
  1185. addic. I, I, -1
  1186. li r0, FZERO
  1187. lfpsx f0, SP, r0
  1188. bgt+ .L11
  /* .L20: row-remainder tile, executed only when bit 2 of M is set (M & 4);  */
  /* otherwise control goes straight to .L30.                                 */
  1189. .align 4
  1190. .L20:
  1191. andi. I, M, 4
  1192. beq .L30
  /* LT/RN path: AO keeps its running position; BO/BO2 restart from B with a  */
  /* bias of -4 and -2 elements (presumably cancelled by the first update-    */
  /* form load with INC4 -- TODO confirm INC4 definition).                    */
  /* f4,f8,f12,f1,f5,f9,f13 are seeded from f0; on the fall-through path f0   */
  /* was just reloaded from the FZERO stack slot (presumably zero).           */
  /* CTR = KK >> 2; "ble" tests the CR0 result of srawi. and skips the        */
  /* unrolled loop when that count is <= 0.                                   */
  1193. #if defined(LT) || defined(RN)
  1194. addi AO2, AO, 2 * SIZE
  1195. fpmr f4, f0
  1196. addi BO, B, - 4 * SIZE
  1197. fpmr f8, f0
  1198. addi BO2, B, - 2 * SIZE
  1199. fpmr f12, f0
  1200. srawi. r0, KK, 2
  1201. fpmr f1, f0
  1202. fpmr f5, f0
  1203. fpmr f9, f0
  1204. mtspr CTR, r0
  1205. fpmr f13, f0
  1206. ble .L24
  1207. #else
  /* LN/RT path: for LN, AORIG is first moved back by K << (2 + BASE_SHIFT).  */
  /* AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << (2 + BASE_SHIFT)),*/
  /* then BO is biased by -4 elements and BO2 = BO + 2 elements.              */
  /* TEMP = K - KK is kept for the remainder code at .L24; CTR = TEMP >> 2.   */
  1208. #ifdef LN
  1209. slwi r0, K, 2 + BASE_SHIFT
  1210. sub AORIG, AORIG, r0
  1211. #endif
  1212. slwi r0 , KK, 2 + BASE_SHIFT
  1213. slwi TEMP, KK, 2 + BASE_SHIFT
  1214. add AO, AORIG, r0
  1215. add BO, B, TEMP
  1216. sub TEMP, K, KK
  1217. addi AO2, AO, 2 * SIZE
  1218. fpmr f4, f0
  1219. addi BO, BO, - 4 * SIZE
  1220. fpmr f8, f0
  1221. addi BO2, BO, 2 * SIZE
  1222. fpmr f12, f0
  1223. srawi. r0, TEMP, 2
  1224. fpmr f1, f0
  1225. fpmr f5, f0
  1226. fpmr f9, f0
  1227. mtspr CTR, r0
  1228. fpmr f13, f0
  1229. ble .L24
  1230. #endif
  /* Preload operands for the first pass of .L22: A1..A7 from AO/AO2 and      */
  /* B1..B6, A9, A10 from BO/BO2 (A9/A10 hold BO-side data here). A8 is not   */
  /* preloaded; it is loaded inside .L22 / .L23.                              */
  1231. LFPDUX A1, AO, INC4
  1232. LFPDUX B1, BO, INC4
  1233. LFPDUX A2, AO2, INC4
  1234. LFPDUX B2, BO2, INC4
  1235. LFPDUX A3, AO, INC4
  1236. LFPDUX B3, BO, INC4
  1237. LFPDUX A4, AO2, INC4
  1238. LFPDUX B4, BO2, INC4
  1239. LFPDUX A5, AO, INC4
  1240. LFPDUX B5, BO, INC4
  1241. LFPDUX A6, AO2, INC4
  1242. LFPDUX B6, BO2, INC4
  1243. LFPDUX A7, AO, INC4
  1244. LFPDUX A9, BO, INC4
  1245. LFPDUX A10, BO2, INC4
  /* bdz decrements CTR; if only one unrolled pass remains, go to the drain.  */
  1246. bdz- .L23
  /* .L22: unrolled accumulation loop for the M & 4 tile. Each pass issues    */
  /* four groups of eight multiply-adds into f0,f4,f8,f12,f1,f5,f9,f13 and,   */
  /* between them, reloads every operand register for the next pass.          */
  /* The nop/load after each multiply-add pair looks like deliberate issue    */
  /* scheduling -- do not reorder without measuring.                          */
  1247. .align 4
  1248. .L22:
  /* group 1: A1/A2 against B1/B2 (A8 for group 4 is loaded here) */
  1249. fxcpmadd f0, B1, A1, f0
  1250. nop
  1251. fxcsmadd f4, B1, A1, f4
  1252. LFPDUX A8, AO2, INC4
  1253. fxcpmadd f8, B2, A1, f8
  1254. nop
  1255. fxcsmadd f12, B2, A1, f12
  1256. LFPDUX A1, AO, INC4
  1257. fxcpmadd f1, B1, A2, f1
  1258. nop
  1259. fxcsmadd f5, B1, A2, f5
  1260. LFPDUX B1, BO, INC4
  1261. fxcpmadd f9, B2, A2, f9
  1262. nop
  1263. fxcsmadd f13, B2, A2, f13
  1264. LFPDUX B2, BO2, INC4
  /* group 2: A3/A4 against B3/B4 */
  1265. fxcpmadd f0, B3, A3, f0
  1266. nop
  1267. fxcsmadd f4, B3, A3, f4
  1268. LFPDUX A2, AO2, INC4
  1269. fxcpmadd f8, B4, A3, f8
  1270. nop
  1271. fxcsmadd f12, B4, A3, f12
  1272. LFPDUX A3, AO, INC4
  1273. fxcpmadd f1, B3, A4, f1
  1274. nop
  1275. fxcsmadd f5, B3, A4, f5
  1276. LFPDUX B3, BO, INC4
  1277. fxcpmadd f9, B4, A4, f9
  1278. nop
  1279. fxcsmadd f13, B4, A4, f13
  1280. LFPDUX B4, BO2, INC4
  /* group 3: A5/A6 against B5/B6 */
  1281. fxcpmadd f0, B5, A5, f0
  1282. nop
  1283. fxcsmadd f4, B5, A5, f4
  1284. LFPDUX A4, AO2, INC4
  1285. fxcpmadd f8, B6, A5, f8
  1286. nop
  1287. fxcsmadd f12, B6, A5, f12
  1288. LFPDUX A5, AO, INC4
  1289. fxcpmadd f1, B5, A6, f1
  1290. nop
  1291. fxcsmadd f5, B5, A6, f5
  1292. LFPDUX B5, BO, INC4
  1293. fxcpmadd f9, B6, A6, f9
  1294. nop
  1295. fxcsmadd f13, B6, A6, f13
  1296. LFPDUX B6, BO2, INC4
  /* group 4: A7/A8 against A9/A10 (A9/A10 carry BO-side data) */
  1297. fxcpmadd f0, A9, A7, f0
  1298. nop
  1299. fxcsmadd f4, A9, A7, f4
  1300. LFPDUX A6, AO2, INC4
  1301. fxcpmadd f8, A10, A7, f8
  1302. nop
  1303. fxcsmadd f12, A10, A7, f12
  1304. LFPDUX A7, AO, INC4
  1305. fxcpmadd f1, A9, A8, f1
  1306. nop
  1307. fxcsmadd f5, A9, A8, f5
  1308. LFPDUX A9, BO, INC4
  1309. fxcpmadd f9, A10, A8, f9
  1310. nop
  1311. fxcsmadd f13, A10, A8, f13
  1312. LFPDUX A10, BO2, INC4
  /* decrement CTR and repeat while it is non-zero */
  1313. bdnz+ .L22
  /* .L23: drain of the .L22 pipeline. Performs the same 32 multiply-adds on  */
  /* the operands already in registers; the only load is A8, which neither    */
  /* the preload nor the previous loop pass fetched for this final pass.      */
  1314. .align 4
  1315. .L23:
  1316. fxcpmadd f0, B1, A1, f0
  1317. fxcsmadd f4, B1, A1, f4
  1318. LFPDUX A8, AO2, INC4
  1319. fxcpmadd f8, B2, A1, f8
  1320. fxcsmadd f12, B2, A1, f12
  1321. fxcpmadd f1, B1, A2, f1
  1322. fxcsmadd f5, B1, A2, f5
  1323. fxcpmadd f9, B2, A2, f9
  1324. fxcsmadd f13, B2, A2, f13
  1325. fxcpmadd f0, B3, A3, f0
  1326. fxcsmadd f4, B3, A3, f4
  1327. fxcpmadd f8, B4, A3, f8
  1328. fxcsmadd f12, B4, A3, f12
  1329. fxcpmadd f1, B3, A4, f1
  1330. fxcsmadd f5, B3, A4, f5
  1331. fxcpmadd f9, B4, A4, f9
  1332. fxcsmadd f13, B4, A4, f13
  1333. fxcpmadd f0, B5, A5, f0
  1334. fxcsmadd f4, B5, A5, f4
  1335. fxcpmadd f8, B6, A5, f8
  1336. fxcsmadd f12, B6, A5, f12
  1337. fxcpmadd f1, B5, A6, f1
  1338. fxcsmadd f5, B5, A6, f5
  1339. fxcpmadd f9, B6, A6, f9
  1340. fxcsmadd f13, B6, A6, f13
  1341. fxcpmadd f0, A9, A7, f0
  1342. fxcsmadd f4, A9, A7, f4
  1343. fxcpmadd f8, A10, A7, f8
  1344. fxcsmadd f12, A10, A7, f12
  1345. fxcpmadd f1, A9, A8, f1
  1346. fxcsmadd f5, A9, A8, f5
  1347. fxcpmadd f9, A10, A8, f9
  1348. fxcsmadd f13, A10, A8, f13
  /* .L24: remainder of the inner dimension for the M & 4 tile.               */
  /* CTR = (KK or TEMP) & 3, matching the count used by the unrolled loop     */
  /* setup. The masked value is 0..3, so "ble" is taken exactly when it is 0. */
  1349. .align 4
  1350. .L24:
  1351. #if defined(LT) || defined(RN)
  1352. andi. r0, KK, 3
  1353. mtspr CTR, r0
  1354. ble+ .L28
  1355. #else
  1356. andi. r0, TEMP, 3
  1357. mtspr CTR, r0
  1358. ble+ .L28
  1359. #endif
  /* Preload one step of operands; go straight to the drain if only one step. */
  1360. LFPDUX A1, AO, INC4
  1361. LFPDUX A2, AO2, INC4
  1362. LFPDUX B1, BO, INC4
  1363. LFPDUX B2, BO2, INC4
  1364. bdz- .L27
  /* .L26: single-step remainder loop. Eight multiply-adds on A1/A2 x B1/B2,  */
  /* interleaved with the loads of the next step's operands.                  */
  1365. .align 4
  1366. .L26:
  1367. fxcpmadd f0, B1, A1, f0
  1368. fxcsmadd f4, B1, A1, f4
  1369. fxcpmadd f8, B2, A1, f8
  1370. fxcsmadd f12, B2, A1, f12
  1371. LFPDUX A1, AO, INC4
  1372. fxcpmadd f1, B1, A2, f1
  1373. fxcsmadd f5, B1, A2, f5
  1374. LFPDUX B1, BO, INC4
  1375. fxcpmadd f9, B2, A2, f9
  1376. fxcsmadd f13, B2, A2, f13
  1377. LFPDUX A2, AO2, INC4
  1378. LFPDUX B2, BO2, INC4
  1379. bdnz+ .L26
  /* .L27: final remainder step -- same eight multiply-adds as .L26, with no  */
  /* further loads.                                                           */
  1380. .align 4
  1381. .L27:
  1382. fxcpmadd f0, B1, A1, f0
  1383. fxcsmadd f4, B1, A1, f4
  1384. fxcpmadd f8, B2, A1, f8
  1385. fxcsmadd f12, B2, A1, f12
  1386. fxcpmadd f1, B1, A2, f1
  1387. fxcsmadd f5, B1, A2, f5
  1388. fxcpmadd f9, B2, A2, f9
  1389. fxcsmadd f13, B2, A2, f13
  /* .L28: accumulation for the M & 4 tile is complete. Reposition the packed */
  /* pointers (LN/RT only), load the stored right-hand-side values and        */
  /* replace each accumulator with (loaded value - accumulator).              */
  1390. .align 4
  1391. .L28:
  /* LN/RT: r0 = KK - 4 in both branches (the two arms are identical here).   */
  /* AO = AORIG + (r0 << (2 + BASE_SHIFT)), BO = B + (r0 << (2 + BASE_SHIFT)),*/
  /* then the same -4 / +2 element bias as in the tile setup is applied.      */
  1392. #if defined(LN) || defined(RT)
  1393. #ifdef LN
  1394. subi r0, KK, 4
  1395. #else
  1396. subi r0, KK, 4
  1397. #endif
  1398. slwi TEMP, r0, 2 + BASE_SHIFT
  1399. slwi r0, r0, 2 + BASE_SHIFT
  1400. add AO, AORIG, TEMP
  1401. add BO, B, r0
  1402. addi AO2, AO, 2 * SIZE
  1403. addi BO, BO, - 4 * SIZE
  1404. addi BO2, BO, 2 * SIZE
  1405. #endif
  /* LN/LT: exchange register halves between the pairs (f0,f4), (f1,f5),      */
  /* (f8,f12), (f9,f13) using f24,f25,f28,f29 as temporaries.                 */
  /* NOTE(review): this looks like a 2x2 transpose so the register layout     */
  /* matches the packed B buffer -- confirm against the FP2 ISA reference.    */
  /* Then load eight register pairs from BO/BO2, rewind both pointers by      */
  /* 16 * SIZE, and subtract the accumulators from the loaded values.         */
  1406. #if defined(LN) || defined(LT)
  1407. fpmr f24, f0
  1408. fpmr f25, f1
  1409. fpmr f28, f8
  1410. fpmr f29, f9
  1411. fsmfp f0, f4
  1412. fsmfp f1, f5
  1413. fsmfp f8, f12
  1414. fsmfp f9, f13
  1415. fsmtp f4, f24
  1416. fsmtp f5, f25
  1417. fsmtp f12, f28
  1418. fsmtp f13, f29
  1419. LFPDUX f16, BO, INC4
  1420. LFPDUX f17, BO2, INC4
  1421. LFPDUX f18, BO, INC4
  1422. LFPDUX f19, BO2, INC4
  1423. LFPDUX f20, BO, INC4
  1424. LFPDUX f21, BO2, INC4
  1425. LFPDUX f22, BO, INC4
  1426. LFPDUX f23, BO2, INC4
  1427. subi BO, BO, 16 * SIZE
  1428. subi BO2, BO2, 16 * SIZE
  1429. fpsub f0, f16, f0
  1430. fpsub f8, f17, f8
  1431. fpsub f4, f18, f4
  1432. fpsub f12, f19, f12
  1433. fpsub f1, f20, f1
  1434. fpsub f9, f21, f9
  1435. fpsub f5, f22, f5
  1436. fpsub f13, f23, f13
  1437. #else
  /* RN/RT: no half exchange; the values are loaded from AO/AO2 instead,      */
  /* both pointers are rewound by 16 * SIZE, and the pairing order differs    */
  /* (f0,f1,f4,f5,f8,f9,f12,f13).                                             */
  1438. LFPDUX f16, AO, INC4
  1439. LFPDUX f17, AO2, INC4
  1440. LFPDUX f18, AO, INC4
  1441. LFPDUX f19, AO2, INC4
  1442. LFPDUX f20, AO, INC4
  1443. LFPDUX f21, AO2, INC4
  1444. LFPDUX f22, AO, INC4
  1445. LFPDUX f23, AO2, INC4
  1446. subi AO, AO, 16 * SIZE
  1447. subi AO2, AO2, 16 * SIZE
  1448. fpsub f0, f16, f0
  1449. fpsub f1, f17, f1
  1450. fpsub f4, f18, f4
  1451. fpsub f5, f19, f5
  1452. fpsub f8, f20, f8
  1453. fpsub f9, f21, f9
  1454. fpsub f12, f22, f12
  1455. fpsub f13, f23, f13
  1456. #endif
  /* Substitution step for the M & 4 tile; exactly one of the four variants   */
  /* below is expected to be compiled in. Each variant loads six coefficient  */
  /* pairs (A1..A6) and applies them using only multiplies and negated        */
  /* multiply-subtracts -- there is no divide.                                */
  /* NOTE(review): that implies the diagonal coefficients are stored as       */
  /* reciprocals by the packing code -- confirm against the packing routine.  */

  /* LN: coefficients are read walking backwards (INCM4) from +20 * SIZE;     */
  /* the bare "add AO2, AO2, INCM4" lines skip entries that are not needed.   */
  /* Net pointer change is zero if INCM4 is -4 * SIZE (TODO confirm).         */
  /* Order of solution: f5/f13, then f1/f9, then f4/f12, then f0/f8.          */
  1457. #ifdef LN
  1458. addi AO, AO, 20 * SIZE
  1459. addi AO2, AO2, 20 * SIZE
  1460. LFPDUX A1, AO2, INCM4
  1461. LFPDUX A2, AO, INCM4
  1462. LFPDUX A3, AO2, INCM4
  1463. LFPDUX A4, AO, INCM4
  1464. add AO2, AO2, INCM4
  1465. LFPDUX A5, AO, INCM4
  1466. add AO2, AO2, INCM4
  1467. LFPDUX A6, AO, INCM4
  1468. addi AO, AO, -4 * SIZE
  1469. addi AO2, AO2, -4 * SIZE
  1470. fxsmul f5, A1, f5
  1471. fxsmul f13, A1, f13
  1472. fxcpnmsub f1, A1, f5, f1
  1473. fxcpnmsub f9, A1, f13, f9
  1474. fxcsnmsub f4, A2, f5, f4
  1475. fxcsnmsub f12, A2, f13, f12
  1476. fxcpnmsub f0, A2, f5, f0
  1477. fxcpnmsub f8, A2, f13, f8
  1478. fxpmul f1, A3, f1
  1479. fxpmul f9, A3, f9
  1480. fxcsnmsub f4, A4, f1, f4
  1481. fxcsnmsub f12, A4, f9, f12
  1482. fxcpnmsub f0, A4, f1, f0
  1483. fxcpnmsub f8, A4, f9, f8
  1484. fxsmul f4, A5, f4
  1485. fxsmul f12, A5, f12
  1486. fxcpnmsub f0, A5, f4, f0
  1487. fxcpnmsub f8, A5, f12, f8
  1488. fxpmul f0, A6, f0
  1489. fxpmul f8, A6, f8
  1490. #endif
  /* LT: coefficients are read forwards from AO/AO2; both pointers are        */
  /* rewound by 16 * SIZE afterwards (net zero if INC4 is 4 * SIZE).          */
  /* Order of solution: f0/f8, then f4/f12, then f1/f9, then f5/f13.          */
  1491. #ifdef LT
  1492. LFPDUX A1, AO, INC4
  1493. LFPDUX A2, AO2, INC4
  1494. LFPDUX A3, AO, INC4
  1495. LFPDUX A4, AO2, INC4
  1496. add AO, AO, INC4
  1497. LFPDUX A5, AO2, INC4
  1498. add AO, AO, INC4
  1499. LFPDUX A6, AO2, INC4
  1500. subi AO, AO, 16 * SIZE
  1501. subi AO2, AO2, 16 * SIZE
  1502. fxpmul f0, A1, f0
  1503. fxpmul f8, A1, f8
  1504. fxcsnmsub f4, A1, f0, f4
  1505. fxcsnmsub f12, A1, f8, f12
  1506. fxcpnmsub f1, A2, f0, f1
  1507. fxcpnmsub f9, A2, f8, f9
  1508. fxcsnmsub f5, A2, f0, f5
  1509. fxcsnmsub f13, A2, f8, f13
  1510. fxsmul f4, A3, f4
  1511. fxsmul f12, A3, f12
  1512. fxcpnmsub f1, A4, f4, f1
  1513. fxcpnmsub f9, A4, f12, f9
  1514. fxcsnmsub f5, A4, f4, f5
  1515. fxcsnmsub f13, A4, f12, f13
  1516. fxpmul f1, A5, f1
  1517. fxpmul f9, A5, f9
  1518. fxcsnmsub f5, A5, f1, f5
  1519. fxcsnmsub f13, A5, f9, f13
  1520. fxsmul f5, A6, f5
  1521. fxsmul f13, A6, f13
  1522. #endif
  /* RN: same forward pattern as LT but the coefficients come from BO/BO2     */
  /* and the register pairs are (f0,f1), (f4,f5), (f8,f9), (f12,f13).         */
  1523. #ifdef RN
  1524. LFPDUX A1, BO, INC4
  1525. LFPDUX A2, BO2, INC4
  1526. LFPDUX A3, BO, INC4
  1527. LFPDUX A4, BO2, INC4
  1528. add BO, BO, INC4
  1529. LFPDUX A5, BO2, INC4
  1530. add BO, BO, INC4
  1531. LFPDUX A6, BO2, INC4
  1532. subi BO, BO, 16 * SIZE
  1533. subi BO2, BO2, 16 * SIZE
  1534. fxpmul f0, A1, f0
  1535. fxpmul f1, A1, f1
  1536. fxcsnmsub f4, A1, f0, f4
  1537. fxcsnmsub f5, A1, f1, f5
  1538. fxcpnmsub f8, A2, f0, f8
  1539. fxcpnmsub f9, A2, f1, f9
  1540. fxcsnmsub f12, A2, f0, f12
  1541. fxcsnmsub f13, A2, f1, f13
  1542. fxsmul f4, A3, f4
  1543. fxsmul f5, A3, f5
  1544. fxcpnmsub f8, A4, f4, f8
  1545. fxcpnmsub f9, A4, f5, f9
  1546. fxcsnmsub f12, A4, f4, f12
  1547. fxcsnmsub f13, A4, f5, f13
  1548. fxpmul f8, A5, f8
  1549. fxpmul f9, A5, f9
  1550. fxcsnmsub f12, A5, f8, f12
  1551. fxcsnmsub f13, A5, f9, f13
  1552. fxsmul f12, A6, f12
  1553. fxsmul f13, A6, f13
  1554. #endif
  /* RT: same backward pattern as LN but on BO/BO2, starting from             */
  /* +20 * SIZE; solution order is f12/f13, f8/f9, f4/f5, f0/f1.              */
  1555. #ifdef RT
  1556. addi BO, BO, 20 * SIZE
  1557. addi BO2, BO2, 20 * SIZE
  1558. LFPDUX A1, BO2, INCM4
  1559. LFPDUX A2, BO, INCM4
  1560. LFPDUX A3, BO2, INCM4
  1561. LFPDUX A4, BO, INCM4
  1562. add BO2, BO2, INCM4
  1563. LFPDUX A5, BO, INCM4
  1564. add BO2, BO2, INCM4
  1565. LFPDUX A6, BO, INCM4
  1566. subi BO, BO, 4 * SIZE
  1567. subi BO2, BO2, 4 * SIZE
  1568. fxsmul f12, A1, f12
  1569. fxsmul f13, A1, f13
  1570. fxcpnmsub f8, A1, f12, f8
  1571. fxcpnmsub f9, A1, f13, f9
  1572. fxcsnmsub f4, A2, f12, f4
  1573. fxcsnmsub f5, A2, f13, f5
  1574. fxcpnmsub f0, A2, f12, f0
  1575. fxcpnmsub f1, A2, f13, f1
  1576. fxpmul f8, A3, f8
  1577. fxpmul f9, A3, f9
  1578. fxcsnmsub f4, A4, f8, f4
  1579. fxcsnmsub f5, A4, f9, f5
  1580. fxcpnmsub f0, A4, f8, f0
  1581. fxcpnmsub f1, A4, f9, f1
  1582. fxsmul f4, A5, f4
  1583. fxsmul f5, A5, f5
  1584. fxcpnmsub f0, A5, f4, f0
  1585. fxcpnmsub f1, A5, f5, f1
  1586. fxpmul f0, A6, f0
  1587. fxpmul f1, A6, f1
  1588. #endif
  /* Store phase for the M & 4 tile: write the solved values back to the      */
  /* packed buffer and to the four C columns, then advance pointers and KK.   */

  /* LN: step the four C column pointers back by 4 elements before storing.   */
  1589. #ifdef LN
  1590. subi CO1, CO1, 4 * SIZE
  1591. subi CO2, CO2, 4 * SIZE
  1592. subi CO3, CO3, 4 * SIZE
  1593. subi CO4, CO4, 4 * SIZE
  1594. #endif
  /* LN/LT: results go back to BO/BO2 (rewound by 16 * SIZE afterwards).      */
  /* For C, STFDUX writes go to CO1/CO3 and STFSDUX writes go to CO2/CO4      */
  /* (presumably the primary and secondary register halves respectively).     */
  1595. #if defined(LN) || defined(LT)
  1596. STFPDUX f0, BO, INC4
  1597. STFPDUX f8, BO2, INC4
  1598. STFPDUX f4, BO, INC4
  1599. STFPDUX f12, BO2, INC4
  1600. STFPDUX f1, BO, INC4
  1601. STFPDUX f9, BO2, INC4
  1602. STFPDUX f5, BO, INC4
  1603. STFPDUX f13, BO2, INC4
  1604. subi BO, BO, 16 * SIZE
  1605. subi BO2, BO2, 16 * SIZE
  1606. STFDUX f0, CO1, INC
  1607. STFDUX f4, CO1, INC
  1608. STFDUX f1, CO1, INC
  1609. STFDUX f5, CO1, INC
  1610. STFSDUX f0, CO2, INC
  1611. STFSDUX f4, CO2, INC
  1612. STFSDUX f1, CO2, INC
  1613. STFSDUX f5, CO2, INC
  1614. STFDUX f8, CO3, INC
  1615. STFDUX f12, CO3, INC
  1616. STFDUX f9, CO3, INC
  1617. STFDUX f13, CO3, INC
  1618. STFSDUX f8, CO4, INC
  1619. STFSDUX f12, CO4, INC
  1620. STFSDUX f9, CO4, INC
  1621. STFSDUX f13, CO4, INC
  1622. #else
  /* RN/RT: results go back to AO/AO2 (rewound by 16 * SIZE afterwards).      */
  /* Here both halves of each register are written to the same C column.      */
  1623. STFPDUX f0, AO, INC4
  1624. STFPDUX f1, AO2, INC4
  1625. STFPDUX f4, AO, INC4
  1626. STFPDUX f5, AO2, INC4
  1627. STFPDUX f8, AO, INC4
  1628. STFPDUX f9, AO2, INC4
  1629. STFPDUX f12, AO, INC4
  1630. STFPDUX f13, AO2, INC4
  1631. subi AO, AO, 16 * SIZE
  1632. subi AO2, AO2, 16 * SIZE
  1633. STFDUX f0, CO1, INC
  1634. STFSDUX f0, CO1, INC
  1635. STFDUX f1, CO1, INC
  1636. STFSDUX f1, CO1, INC
  1637. STFDUX f4, CO2, INC
  1638. STFSDUX f4, CO2, INC
  1639. STFDUX f5, CO2, INC
  1640. STFSDUX f5, CO2, INC
  1641. STFDUX f8, CO3, INC
  1642. STFSDUX f8, CO3, INC
  1643. STFDUX f9, CO3, INC
  1644. STFSDUX f9, CO3, INC
  1645. STFDUX f12, CO4, INC
  1646. STFSDUX f12, CO4, INC
  1647. STFDUX f13, CO4, INC
  1648. STFSDUX f13, CO4, INC
  1649. #endif
  /* LN: undo the advance made by the update-form stores so the C pointers    */
  /* end up 4 elements before where this tile started.                        */
  1650. #ifdef LN
  1651. subi CO1, CO1, 4 * SIZE
  1652. subi CO2, CO2, 4 * SIZE
  1653. subi CO3, CO3, 4 * SIZE
  1654. subi CO4, CO4, 4 * SIZE
  1655. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT). */
  1656. #ifdef RT
  1657. slwi r0, K, 2 + BASE_SHIFT
  1658. add AORIG, AORIG, r0
  1659. #endif
  /* LT/RN: advance AO and BO by (K - KK) << (2 + BASE_SHIFT) each. */
  1660. #if defined(LT) || defined(RN)
  1661. sub TEMP, K, KK
  1662. slwi r0, TEMP, 2 + BASE_SHIFT
  1663. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1664. add AO, AO, r0
  1665. add BO, BO, TEMP
  1666. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN. */
  1667. #ifdef LT
  1668. addi KK, KK, 4
  1669. #endif
  1670. #ifdef LN
  1671. subi KK, KK, 4
  1672. #endif
  /* Reload f0 from the FZERO stack slot so the next tile can seed its        */
  /* accumulators from it.                                                    */
  1673. li r0, FZERO
  1674. lfpsx f0, SP, r0
  /* .L30: row-remainder tile, executed only when bit 1 of M is set (M & 2);  */
  /* otherwise control goes to .L40.                                          */
  1675. .align 4
  1676. .L30:
  1677. andi. I, M, 2
  1678. beq .L40
  /* LT/RN path: BO/BO2 restart from B (biased -4 / -2 elements); f4, f8 and  */
  /* f12 are seeded from f0. CTR = KK >> 2; skip to .L34 when that is <= 0.   */
  1679. #if defined(LT) || defined(RN)
  1680. addi AO2, AO, 2 * SIZE
  1681. fpmr f4, f0
  1682. addi BO, B, - 4 * SIZE
  1683. fpmr f8, f0
  1684. addi BO2, B, - 2 * SIZE
  1685. fpmr f12, f0
  1686. srawi. r0, KK, 2
  1687. mtspr CTR, r0
  1688. ble .L34
  1689. #else
  /* LN/RT path: for LN, AORIG -= K << (1 + BASE_SHIFT).                      */
  /* AO = AORIG + (KK << (1 + BASE_SHIFT)); BO = B + (KK << (2 + BASE_SHIFT)).*/
  /* TEMP = K - KK is kept for .L34; CTR = TEMP >> 2.                         */
  1690. #ifdef LN
  1691. slwi r0, K, 1 + BASE_SHIFT
  1692. sub AORIG, AORIG, r0
  1693. #endif
  1694. slwi r0 , KK, 1 + BASE_SHIFT
  1695. slwi TEMP, KK, 2 + BASE_SHIFT
  1696. add AO, AORIG, r0
  1697. add BO, B, TEMP
  1698. sub TEMP, K, KK
  1699. addi AO2, AO, 2 * SIZE
  1700. fpmr f4, f0
  1701. addi BO, BO, - 4 * SIZE
  1702. fpmr f8, f0
  1703. addi BO2, BO, 2 * SIZE
  1704. fpmr f12, f0
  1705. srawi. r0, TEMP, 2
  1706. mtspr CTR, r0
  1707. ble .L34
  1708. #endif
  /* Preload a full pass of operands: A1..A4 from AO/AO2, and B1..B4 plus     */
  /* A5..A8 from BO/BO2 (A5..A8 hold BO-side data in this tile).              */
  1709. LFPDUX A1, AO, INC4
  1710. LFPDUX B1, BO, INC4
  1711. LFPDUX B2, BO2, INC4
  1712. LFPDUX A2, AO2, INC4
  1713. LFPDUX B3, BO, INC4
  1714. LFPDUX B4, BO2, INC4
  1715. LFPDUX A3, AO, INC4
  1716. LFPDUX A5, BO, INC4
  1717. LFPDUX A6, BO2, INC4
  1718. LFPDUX A4, AO2, INC4
  1719. LFPDUX A7, BO, INC4
  1720. LFPDUX A8, BO2, INC4
  1721. bdz- .L33
  /* .L32: unrolled accumulation loop for the M & 2 tile. Four groups of four */
  /* multiply-adds into f0,f4,f8,f12; each group reloads its own operands for */
  /* the next pass.                                                           */
  1722. .align 4
  1723. .L32:
  /* group 1: A1 against B1/B2 */
  1724. fxcpmadd f0, B1, A1, f0
  1725. fxcsmadd f4, B1, A1, f4
  1726. LFPDUX B1, BO, INC4
  1727. fxcpmadd f8, B2, A1, f8
  1728. fxcsmadd f12, B2, A1, f12
  1729. LFPDUX B2, BO2, INC4
  1730. LFPDUX A1, AO, INC4
  /* group 2: A2 against B3/B4 */
  1731. fxcpmadd f0, B3, A2, f0
  1732. fxcsmadd f4, B3, A2, f4
  1733. LFPDUX B3, BO, INC4
  1734. fxcpmadd f8, B4, A2, f8
  1735. fxcsmadd f12, B4, A2, f12
  1736. LFPDUX B4, BO2, INC4
  1737. LFPDUX A2, AO2, INC4
  /* group 3: A3 against A5/A6 */
  1738. fxcpmadd f0, A5, A3, f0
  1739. fxcsmadd f4, A5, A3, f4
  1740. LFPDUX A5, BO, INC4
  1741. fxcpmadd f8, A6, A3, f8
  1742. fxcsmadd f12, A6, A3, f12
  1743. LFPDUX A6, BO2, INC4
  1744. LFPDUX A3, AO, INC4
  /* group 4: A4 against A7/A8 */
  1745. fxcpmadd f0, A7, A4, f0
  1746. fxcsmadd f4, A7, A4, f4
  1747. LFPDUX A7, BO, INC4
  1748. fxcpmadd f8, A8, A4, f8
  1749. fxcsmadd f12, A8, A4, f12
  1750. LFPDUX A8, BO2, INC4
  1751. LFPDUX A4, AO2, INC4
  1752. bdnz+ .L32
  /* .L33: drain of .L32 -- the same sixteen multiply-adds on the operands    */
  /* already in registers, with no loads.                                     */
  1753. .align 4
  1754. .L33:
  1755. fxcpmadd f0, B1, A1, f0
  1756. fxcsmadd f4, B1, A1, f4
  1757. fxcpmadd f8, B2, A1, f8
  1758. fxcsmadd f12, B2, A1, f12
  1759. fxcpmadd f0, B3, A2, f0
  1760. fxcsmadd f4, B3, A2, f4
  1761. fxcpmadd f8, B4, A2, f8
  1762. fxcsmadd f12, B4, A2, f12
  1763. fxcpmadd f0, A5, A3, f0
  1764. fxcsmadd f4, A5, A3, f4
  1765. fxcpmadd f8, A6, A3, f8
  1766. fxcsmadd f12, A6, A3, f12
  1767. fxcpmadd f0, A7, A4, f0
  1768. fxcsmadd f4, A7, A4, f4
  1769. fxcpmadd f8, A8, A4, f8
  1770. fxcsmadd f12, A8, A4, f12
  /* .L34: remainder of the inner dimension for the M & 2 tile.               */
  /* CTR = (KK or TEMP) & 3; the masked value is 0..3, so "ble" is taken      */
  /* exactly when it is 0.                                                    */
  1771. .align 4
  1772. .L34:
  1773. #if defined(LT) || defined(RN)
  1774. andi. r0, KK, 3
  1775. mtspr CTR, r0
  1776. ble+ .L38
  1777. #else
  1778. andi. r0, TEMP, 3
  1779. mtspr CTR, r0
  1780. ble+ .L38
  1781. #endif
  /* A1 is loaded with the non-update form (LFPDX) at offset INC4 and AO is   */
  /* then advanced separately by INC2, i.e. AO moves by INC2 per step here    */
  /* rather than by INC4.                                                     */
  1782. LFPDX A1, AO, INC4
  1783. LFPDUX B1, BO, INC4
  1784. LFPDUX B2, BO2, INC4
  1785. add AO, AO, INC2
  1786. bdz- .L37
  /* .L36: single-step remainder loop for the M & 2 tile: four multiply-adds  */
  /* on A1 x B1/B2, interleaved with the loads for the next step. AO again    */
  /* advances by INC2 after a non-update load at offset INC4.                 */
  1787. .align 4
  1788. .L36:
  1789. fxcpmadd f0, B1, A1, f0
  1790. fxcsmadd f4, B1, A1, f4
  1791. LFPDUX B1, BO, INC4
  1792. fxcpmadd f8, B2, A1, f8
  1793. fxcsmadd f12, B2, A1, f12
  1794. LFPDX A1, AO, INC4
  1795. LFPDUX B2, BO2, INC4
  1796. add AO, AO, INC2
  1797. bdnz+ .L36
  /* .L37: final remainder step for the M & 2 tile -- no further loads. */
  1798. .align 4
  1799. .L37:
  1800. fxcpmadd f0, B1, A1, f0
  1801. fxcsmadd f4, B1, A1, f4
  1802. fxcpmadd f8, B2, A1, f8
  1803. fxcsmadd f12, B2, A1, f12
  /* .L38: accumulation for the M & 2 tile is complete. Reposition the packed */
  /* pointers (LN/RT only), load the stored right-hand-side values and        */
  /* replace each accumulator with (loaded value - accumulator).              */
  1804. .align 4
  1805. .L38:
  /* LN/RT: r0 = KK - 2 for LN, KK - 4 otherwise.                             */
  /* AO = AORIG + (r0 << (1 + BASE_SHIFT)); BO = B + (r0 << (2 + BASE_SHIFT)).*/
  1806. #if defined(LN) || defined(RT)
  1807. #ifdef LN
  1808. subi r0, KK, 2
  1809. #else
  1810. subi r0, KK, 4
  1811. #endif
  1812. slwi TEMP, r0, 1 + BASE_SHIFT
  1813. slwi r0, r0, 2 + BASE_SHIFT
  1814. add AO, AORIG, TEMP
  1815. add BO, B, r0
  1816. addi AO2, AO, 2 * SIZE
  1817. addi BO, BO, - 4 * SIZE
  1818. addi BO2, BO, 2 * SIZE
  1819. #endif
  /* LN/LT: exchange register halves between (f0,f4) and (f8,f12) via the     */
  /* temporaries f24/f28 (NOTE(review): looks like a 2x2 transpose -- confirm */
  /* against the FP2 ISA reference), load four pairs from BO/BO2, rewind both */
  /* by 8 * SIZE, then subtract the accumulators from the loaded values.      */
  1820. #if defined(LN) || defined(LT)
  1821. fpmr f24, f0
  1822. fpmr f28, f8
  1823. fsmfp f0, f4
  1824. fsmfp f8, f12
  1825. fsmtp f4, f24
  1826. fsmtp f12, f28
  1827. LFPDUX f16, BO, INC4
  1828. LFPDUX f17, BO2, INC4
  1829. LFPDUX f18, BO, INC4
  1830. LFPDUX f19, BO2, INC4
  1831. subi BO, BO, 8 * SIZE
  1832. subi BO2, BO2, 8 * SIZE
  1833. fpsub f0, f16, f0
  1834. fpsub f8, f17, f8
  1835. fpsub f4, f18, f4
  1836. fpsub f12, f19, f12
  1837. #else
  /* RN/RT: values come from AO/AO2 instead, with no half exchange. */
  1838. LFPDUX f16, AO, INC4
  1839. LFPDUX f17, AO2, INC4
  1840. LFPDUX f18, AO, INC4
  1841. LFPDUX f19, AO2, INC4
  1842. subi AO, AO, 8 * SIZE
  1843. subi AO2, AO2, 8 * SIZE
  1844. fpsub f0, f16, f0
  1845. fpsub f4, f17, f4
  1846. fpsub f8, f18, f8
  1847. fpsub f12, f19, f12
  1848. #endif
  /* Substitution step for the M & 2 tile; exactly one of the four variants   */
  /* below is expected to be compiled in. Only multiplies and negated         */
  /* multiply-subtracts are used -- there is no divide.                       */
  /* NOTE(review): that implies the diagonal coefficients are stored as       */
  /* reciprocals by the packing code -- confirm against the packing routine.  */

  /* LN: two coefficient pairs read backwards from +8 * SIZE; solves f4/f12   */
  /* first, then f0/f8. Net pointer change is zero if INCM4 is -4 * SIZE.     */
  1849. #ifdef LN
  1850. addi AO, AO, 8 * SIZE
  1851. addi AO2, AO2, 8 * SIZE
  1852. LFPDUX A1, AO2, INCM4
  1853. LFPDUX A2, AO, INCM4
  1854. addi AO, AO, -4 * SIZE
  1855. addi AO2, AO2, -4 * SIZE
  1856. fxsmul f4, A1, f4
  1857. fxsmul f12, A1, f12
  1858. fxcpnmsub f0, A1, f4, f0
  1859. fxcpnmsub f8, A1, f12, f8
  1860. fxpmul f0, A2, f0
  1861. fxpmul f8, A2, f8
  1862. #endif
  /* LT: two coefficient pairs read forwards; solves f0/f8 first, then        */
  /* f4/f12. Pointers are rewound by 4 * SIZE afterwards.                     */
  1863. #ifdef LT
  1864. LFPDUX A1, AO, INC4
  1865. LFPDUX A2, AO2, INC4
  1866. subi AO, AO, 4 * SIZE
  1867. subi AO2, AO2, 4 * SIZE
  1868. fxpmul f0, A1, f0
  1869. fxpmul f8, A1, f8
  1870. fxcsnmsub f4, A1, f0, f4
  1871. fxcsnmsub f12, A1, f8, f12
  1872. fxsmul f4, A2, f4
  1873. fxsmul f12, A2, f12
  1874. #endif
  /* RN: six coefficient pairs read forwards from BO/BO2 (rewound by          */
  /* 16 * SIZE); solution order is f0, f4, f8, f12.                           */
  1875. #ifdef RN
  1876. LFPDUX A1, BO, INC4
  1877. LFPDUX A2, BO2, INC4
  1878. LFPDUX A3, BO, INC4
  1879. LFPDUX A4, BO2, INC4
  1880. add BO, BO, INC4
  1881. LFPDUX A5, BO2, INC4
  1882. add BO, BO, INC4
  1883. LFPDUX A6, BO2, INC4
  1884. subi BO, BO, 16 * SIZE
  1885. subi BO2, BO2, 16 * SIZE
  1886. fxpmul f0, A1, f0
  1887. fxcsnmsub f4, A1, f0, f4
  1888. fxcpnmsub f8, A2, f0, f8
  1889. fxcsnmsub f12, A2, f0, f12
  1890. fxsmul f4, A3, f4
  1891. fxcpnmsub f8, A4, f4, f8
  1892. fxcsnmsub f12, A4, f4, f12
  1893. fxpmul f8, A5, f8
  1894. fxcsnmsub f12, A5, f8, f12
  1895. fxsmul f12, A6, f12
  1896. #endif
  /* RT: six coefficient pairs read backwards from +20 * SIZE on BO/BO2;      */
  /* solution order is f12, f8, f4, f0.                                       */
  1897. #ifdef RT
  1898. addi BO, BO, 20 * SIZE
  1899. addi BO2, BO2, 20 * SIZE
  1900. LFPDUX A1, BO2, INCM4
  1901. LFPDUX A2, BO, INCM4
  1902. LFPDUX A3, BO2, INCM4
  1903. LFPDUX A4, BO, INCM4
  1904. add BO2, BO2, INCM4
  1905. LFPDUX A5, BO, INCM4
  1906. add BO2, BO2, INCM4
  1907. LFPDUX A6, BO, INCM4
  1908. subi BO, BO, 4 * SIZE
  1909. subi BO2, BO2, 4 * SIZE
  1910. fxsmul f12, A1, f12
  1911. fxcpnmsub f8, A1, f12, f8
  1912. fxcsnmsub f4, A2, f12, f4
  1913. fxcpnmsub f0, A2, f12, f0
  1914. fxpmul f8, A3, f8
  1915. fxcsnmsub f4, A4, f8, f4
  1916. fxcpnmsub f0, A4, f8, f0
  1917. fxsmul f4, A5, f4
  1918. fxcpnmsub f0, A5, f4, f0
  1919. fxpmul f0, A6, f0
  1920. #endif
  /* Store phase for the M & 2 tile: write the solved values back to the      */
  /* packed buffer and to the four C columns, then advance pointers and KK.   */

  /* LN: step the four C column pointers back by 2 elements before storing.   */
  1921. #ifdef LN
  1922. subi CO1, CO1, 2 * SIZE
  1923. subi CO2, CO2, 2 * SIZE
  1924. subi CO3, CO3, 2 * SIZE
  1925. subi CO4, CO4, 2 * SIZE
  1926. #endif
  /* LN/LT: results go back to BO/BO2 (rewound by 8 * SIZE afterwards);       */
  /* STFDUX writes go to CO1/CO3 and STFSDUX writes go to CO2/CO4.            */
  1927. #if defined(LN) || defined(LT)
  1928. STFPDUX f0, BO, INC4
  1929. STFPDUX f8, BO2, INC4
  1930. STFPDUX f4, BO, INC4
  1931. STFPDUX f12, BO2, INC4
  1932. subi BO, BO, 8 * SIZE
  1933. subi BO2, BO2, 8 * SIZE
  1934. STFDUX f0, CO1, INC
  1935. STFDUX f4, CO1, INC
  1936. STFSDUX f0, CO2, INC
  1937. STFSDUX f4, CO2, INC
  1938. STFDUX f8, CO3, INC
  1939. STFDUX f12, CO3, INC
  1940. STFSDUX f8, CO4, INC
  1941. STFSDUX f12, CO4, INC
  1942. #else
  /* RN/RT: results go back to AO/AO2 (rewound by 8 * SIZE afterwards);       */
  /* both halves of each register go to the same C column.                    */
  1943. STFPDUX f0, AO, INC4
  1944. STFPDUX f4, AO2, INC4
  1945. STFPDUX f8, AO, INC4
  1946. STFPDUX f12, AO2, INC4
  1947. subi AO, AO, 8 * SIZE
  1948. subi AO2, AO2, 8 * SIZE
  1949. STFDUX f0, CO1, INC
  1950. STFSDUX f0, CO1, INC
  1951. STFDUX f4, CO2, INC
  1952. STFSDUX f4, CO2, INC
  1953. STFDUX f8, CO3, INC
  1954. STFSDUX f8, CO3, INC
  1955. STFDUX f12, CO4, INC
  1956. STFSDUX f12, CO4, INC
  1957. #endif
  /* LN: undo the advance made by the update-form stores. */
  1958. #ifdef LN
  1959. subi CO1, CO1, 2 * SIZE
  1960. subi CO2, CO2, 2 * SIZE
  1961. subi CO3, CO3, 2 * SIZE
  1962. subi CO4, CO4, 2 * SIZE
  1963. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  1964. #ifdef RT
  1965. slwi r0, K, 1 + BASE_SHIFT
  1966. add AORIG, AORIG, r0
  1967. #endif
  /* LT/RN: AO += (K - KK) << (1 + BASE_SHIFT); BO += (K - KK) << (2 + BASE_SHIFT). */
  1968. #if defined(LT) || defined(RN)
  1969. sub TEMP, K, KK
  1970. slwi r0, TEMP, 1 + BASE_SHIFT
  1971. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1972. add AO, AO, r0
  1973. add BO, BO, TEMP
  1974. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN. */
  1975. #ifdef LT
  1976. addi KK, KK, 2
  1977. #endif
  1978. #ifdef LN
  1979. subi KK, KK, 2
  1980. #endif
  /* Reload f0 from the FZERO stack slot for the next tile's accumulators. */
  1981. li r0, FZERO
  1982. lfpsx f0, SP, r0
  /* .L40: row-remainder tile, executed only when bit 0 of M is set (M & 1);  */
  /* otherwise control goes to .L49.                                          */
  1983. .align 4
  1984. .L40:
  1985. andi. I, M, 1
  1986. beq .L49
  /* LT/RN path: BO/BO2 restart from B (biased -4 / -2 elements); f1, f2, f3  */
  /* are seeded from f0. Note the shift is 3 here: CTR = KK >> 3, and the     */
  /* matching remainder mask at .L44 is 7.                                    */
  1987. #if defined(LT) || defined(RN)
  1988. addi AO2, AO, 2 * SIZE
  1989. fpmr f1, f0
  1990. addi BO, B, - 4 * SIZE
  1991. fpmr f2, f0
  1992. addi BO2, B, - 2 * SIZE
  1993. fpmr f3, f0
  1994. srawi. r0, KK, 3
  1995. mtspr CTR, r0
  1996. ble .L44
  1997. #else
  /* LN/RT path: for LN, AORIG -= K << (0 + BASE_SHIFT).                      */
  /* AO = AORIG + (KK << (0 + BASE_SHIFT)); BO = B + (KK << (2 + BASE_SHIFT)).*/
  /* TEMP = K - KK is kept for .L44; CTR = TEMP >> 3.                         */
  1998. #ifdef LN
  1999. slwi r0, K, 0 + BASE_SHIFT
  2000. sub AORIG, AORIG, r0
  2001. #endif
  2002. slwi r0 , KK, 0 + BASE_SHIFT
  2003. slwi TEMP, KK, 2 + BASE_SHIFT
  2004. add AO, AORIG, r0
  2005. add BO, B, TEMP
  2006. sub TEMP, K, KK
  2007. addi AO2, AO, 2 * SIZE
  2008. fpmr f1, f0
  2009. addi BO, BO, - 4 * SIZE
  2010. fpmr f2, f0
  2011. addi BO2, BO, 2 * SIZE
  2012. fpmr f3, f0
  2013. srawi. r0, TEMP, 3
  2014. mtspr CTR, r0
  2015. ble .L44
  2016. #endif
  /* Preload A1..A4 from AO/AO2, and B1..B4 plus A5..A8 from BO/BO2           */
  /* (A5..A8 hold BO-side data in this tile).                                 */
  2017. LFPDUX A1, AO, INC4
  2018. LFPDUX B1, BO, INC4
  2019. LFPDUX B2, BO2, INC4
  2020. LFPDUX A2, AO2, INC4
  2021. LFPDUX B3, BO, INC4
  2022. LFPDUX B4, BO2, INC4
  2023. LFPDUX A3, AO, INC4
  2024. LFPDUX A5, BO, INC4
  2025. LFPDUX A6, BO2, INC4
  2026. LFPDUX A4, AO2, INC4
  2027. LFPDUX A7, BO, INC4
  2028. LFPDUX A8, BO2, INC4
  2029. bdz- .L43
  /* .L42: unrolled accumulation loop for the M & 1 tile. Operand roles are   */
  /* swapped relative to the wider tiles: the AO-side register is the first   */
  /* multiplicand and the BO-side register the second. The fxcpmadd results   */
  /* accumulate in f0/f1 and the fxcsmadd results in f2/f3; the two sets are  */
  /* added together at .L48.                                                  */
  /* NOTE(review): each AO-side register presumably carries two consecutive   */
  /* inner-dimension elements (primary/secondary) -- confirm against the ISA. */
  2030. .align 4
  2031. .L42:
  /* A1 against B1..B4 */
  2032. fxcpmadd f0, A1, B1, f0
  2033. LFPDUX B1, BO, INC4
  2034. fxcpmadd f1, A1, B2, f1
  2035. LFPDUX B2, BO2, INC4
  2036. fxcsmadd f2, A1, B3, f2
  2037. LFPDUX B3, BO, INC4
  2038. fxcsmadd f3, A1, B4, f3
  2039. LFPDUX B4, BO2, INC4
  2040. LFPDUX A1, AO, INC4
  /* A2 against A5..A8 */
  2041. fxcpmadd f0, A2, A5, f0
  2042. LFPDUX A5, BO, INC4
  2043. fxcpmadd f1, A2, A6, f1
  2044. LFPDUX A6, BO2, INC4
  2045. fxcsmadd f2, A2, A7, f2
  2046. LFPDUX A7, BO, INC4
  2047. fxcsmadd f3, A2, A8, f3
  2048. LFPDUX A8, BO2, INC4
  2049. LFPDUX A2, AO2, INC4
  /* A3 against the freshly reloaded B1..B4 */
  2050. fxcpmadd f0, A3, B1, f0
  2051. LFPDUX B1, BO, INC4
  2052. fxcpmadd f1, A3, B2, f1
  2053. LFPDUX B2, BO2, INC4
  2054. fxcsmadd f2, A3, B3, f2
  2055. LFPDUX B3, BO, INC4
  2056. fxcsmadd f3, A3, B4, f3
  2057. LFPDUX B4, BO2, INC4
  2058. LFPDUX A3, AO, INC4
  /* A4 against the freshly reloaded A5..A8 */
  2059. fxcpmadd f0, A4, A5, f0
  2060. LFPDUX A5, BO, INC4
  2061. fxcpmadd f1, A4, A6, f1
  2062. LFPDUX A6, BO2, INC4
  2063. fxcsmadd f2, A4, A7, f2
  2064. LFPDUX A7, BO, INC4
  2065. fxcsmadd f3, A4, A8, f3
  2066. LFPDUX A8, BO2, INC4
  2067. LFPDUX A4, AO2, INC4
  2068. bdnz+ .L42
  /* .L43: drain of .L42. The first two groups still reload B1..B4 and        */
  /* A5..A8 (needed by the A3 and A4 groups); no AO-side loads are issued.    */
  2069. .align 4
  2070. .L43:
  2071. fxcpmadd f0, A1, B1, f0
  2072. LFPDUX B1, BO, INC4
  2073. fxcpmadd f1, A1, B2, f1
  2074. LFPDUX B2, BO2, INC4
  2075. fxcsmadd f2, A1, B3, f2
  2076. LFPDUX B3, BO, INC4
  2077. fxcsmadd f3, A1, B4, f3
  2078. LFPDUX B4, BO2, INC4
  2079. fxcpmadd f0, A2, A5, f0
  2080. LFPDUX A5, BO, INC4
  2081. fxcpmadd f1, A2, A6, f1
  2082. LFPDUX A6, BO2, INC4
  2083. fxcsmadd f2, A2, A7, f2
  2084. LFPDUX A7, BO, INC4
  2085. fxcsmadd f3, A2, A8, f3
  2086. LFPDUX A8, BO2, INC4
  2087. fxcpmadd f0, A3, B1, f0
  2088. fxcpmadd f1, A3, B2, f1
  2089. fxcsmadd f2, A3, B3, f2
  2090. fxcsmadd f3, A3, B4, f3
  2091. fxcpmadd f0, A4, A5, f0
  2092. fxcpmadd f1, A4, A6, f1
  2093. fxcsmadd f2, A4, A7, f2
  2094. fxcsmadd f3, A4, A8, f3
  /* .L44: remainder of the inner dimension for the M & 1 tile.               */
  /* CTR = (KK or TEMP) & 7, matching the >> 3 used by the tile setup. The    */
  /* masked value is 0..7, so "ble" is taken exactly when it is 0.            */
  2095. .align 4
  2096. .L44:
  2097. #if defined(LT) || defined(RN)
  2098. andi. r0, KK, 7
  2099. mtspr CTR, r0
  2100. ble+ .L48
  2101. #else
  2102. andi. r0, TEMP, 7
  2103. mtspr CTR, r0
  2104. ble+ .L48
  2105. #endif
  /* A1 is loaded with LFDX (non-update form) at offset INC4 and AO is then   */
  /* advanced separately by INC, i.e. AO moves by INC per step here.          */
  2106. LFDX A1, AO, INC4
  2107. LFPDUX B1, BO, INC4
  2108. LFPDUX B2, BO2, INC4
  2109. add AO, AO, INC
  2110. bdz- .L47
  /* .L46: single-step remainder loop for the M & 1 tile. Only f0 and f1 are  */
  /* updated here (no fxcsmadd), with the next step's loads interleaved.      */
  2111. .align 4
  2112. .L46:
  2113. fxcpmadd f0, A1, B1, f0
  2114. LFPDUX B1, BO, INC4
  2115. fxcpmadd f1, A1, B2, f1
  2116. LFDX A1, AO, INC4
  2117. LFPDUX B2, BO2, INC4
  2118. add AO, AO, INC
  2119. bdnz+ .L46
  /* .L47: final remainder step for the M & 1 tile; AO2 is then re-derived    */
  /* from AO because the remainder loop advanced AO alone.                    */
  2120. .align 4
  2121. .L47:
  2122. fxcpmadd f0, A1, B1, f0
  2123. fxcpmadd f1, A1, B2, f1
  2124. addi AO2, AO, 2 * SIZE
  /* .L48: accumulation for the M & 1 tile is complete. Fold the second       */
  /* accumulator set into the first (f0 += f2, f1 += f3), reposition the      */
  /* packed pointers (LN/RT only), then subtract from the stored values.      */
  2125. .align 4
  2126. .L48:
  2127. fpadd f0, f0, f2
  2128. fpadd f1, f1, f3
  /* LN/RT: r0 = KK - 1 for LN, KK - 4 otherwise.                             */
  /* AO = AORIG + (r0 << (0 + BASE_SHIFT)); BO = B + (r0 << (2 + BASE_SHIFT)).*/
  2129. #if defined(LN) || defined(RT)
  2130. #ifdef LN
  2131. subi r0, KK, 1
  2132. #else
  2133. subi r0, KK, 4
  2134. #endif
  2135. slwi TEMP, r0, 0 + BASE_SHIFT
  2136. slwi r0, r0, 2 + BASE_SHIFT
  2137. add AO, AORIG, TEMP
  2138. add BO, B, r0
  2139. addi AO2, AO, 2 * SIZE
  2140. addi BO, BO, - 4 * SIZE
  2141. addi BO2, BO, 2 * SIZE
  2142. #endif
  /* Non-update loads (LFPDX): the pointers are left unchanged, so no rewind  */
  /* is needed in this tile. LN/LT read from BO/BO2, RN/RT from AO/AO2.       */
  2143. #if defined(LN) || defined(LT)
  2144. LFPDX f16, BO, INC4
  2145. LFPDX f17, BO2, INC4
  2146. fpsub f0, f16, f0
  2147. fpsub f1, f17, f1
  2148. #else
  2149. LFPDX f16, AO, INC4
  2150. LFPDX f17, AO2, INC4
  2151. fpsub f0, f16, f0
  2152. fpsub f1, f17, f1
  2153. #endif
  /* Substitution step for the M & 1 tile. Only multiplies and negated        */
  /* multiply-subtracts are used -- there is no divide.                       */
  /* NOTE(review): that implies the diagonal coefficients are stored as       */
  /* reciprocals by the packing code -- confirm against the packing routine.  */

  /* LN/LT: a single coefficient from AO scales both result registers.        */
  2154. #if defined(LN) || defined(LT)
  2155. LFPDX A1, AO, INC4
  2156. fxpmul f0, A1, f0
  2157. fxpmul f1, A1, f1
  2158. #endif
  /* RN: ten scalar coefficients are read from BO at element offsets          */
  /* 0,1,2,3,5,6,7,10,11,15 (plus the constant 4 that offsets the -4 bias on  */
  /* BO). fsmtp copies are used to split f0/f1 into four scalars              */
  /* (f0,f2,f1,f3), which are solved in that order with scalar fmul/fnmsub    */
  /* and then recombined with fsmfp.                                          */
  2159. #ifdef RN
  2160. LFD A1, (4 + 0) * SIZE(BO)
  2161. LFD A2, (4 + 1) * SIZE(BO)
  2162. LFD A3, (4 + 2) * SIZE(BO)
  2163. LFD A4, (4 + 3) * SIZE(BO)
  2164. LFD A5, (4 + 5) * SIZE(BO)
  2165. LFD A6, (4 + 6) * SIZE(BO)
  2166. LFD A7, (4 + 7) * SIZE(BO)
  2167. LFD A8, (4 + 10) * SIZE(BO)
  2168. LFD A9, (4 + 11) * SIZE(BO)
  2169. LFD A10, (4 + 15) * SIZE(BO)
  2170. fsmtp f2, f0
  2171. fsmtp f3, f1
  2172. fmul f0, A1, f0
  2173. fnmsub f2, A2, f0, f2
  2174. fnmsub f1, A3, f0, f1
  2175. fnmsub f3, A4, f0, f3
  2176. fmul f2, A5, f2
  2177. fnmsub f1, A6, f2, f1
  2178. fnmsub f3, A7, f2, f3
  2179. fmul f1, A8, f1
  2180. fnmsub f3, A9, f1, f3
  2181. fmul f3, A10, f3
  2182. fsmfp f0, f2
  2183. fsmfp f1, f3
  2184. #endif
  /* RT: the mirror image of RN -- offsets 15,14,13,12,10,9,8,5,4,0 and       */
  /* solution order f3, f1, f2, f0.                                           */
  2185. #ifdef RT
  2186. LFD A1, (4 + 15) * SIZE(BO)
  2187. LFD A2, (4 + 14) * SIZE(BO)
  2188. LFD A3, (4 + 13) * SIZE(BO)
  2189. LFD A4, (4 + 12) * SIZE(BO)
  2190. LFD A5, (4 + 10) * SIZE(BO)
  2191. LFD A6, (4 + 9) * SIZE(BO)
  2192. LFD A7, (4 + 8) * SIZE(BO)
  2193. LFD A8, (4 + 5) * SIZE(BO)
  2194. LFD A9, (4 + 4) * SIZE(BO)
  2195. LFD A10, (4 + 0) * SIZE(BO)
  2196. fsmtp f2, f0
  2197. fsmtp f3, f1
  2198. fmul f3, A1, f3
  2199. fnmsub f1, A2, f3, f1
  2200. fnmsub f2, A3, f3, f2
  2201. fnmsub f0, A4, f3, f0
  2202. fmul f1, A5, f1
  2203. fnmsub f2, A6, f1, f2
  2204. fnmsub f0, A7, f1, f0
  2205. fmul f2, A8, f2
  2206. fnmsub f0, A9, f2, f0
  2207. fmul f0, A10, f0
  2208. fsmfp f0, f2
  2209. fsmfp f1, f3
  2210. #endif
  /* Store phase for the M & 1 tile. The packed-buffer stores use the         */
  /* non-update form (STFPDX), so BO/BO2 or AO/AO2 are left unchanged.        */
  2211. #if defined(LN) || defined(LT)
  2212. STFPDX f0, BO, INC4
  2213. STFPDX f1, BO2, INC4
  2214. #else
  2215. STFPDX f0, AO, INC4
  2216. STFPDX f1, AO2, INC4
  2217. #endif
  /* LN: step the four C column pointers back by 1 element before storing.    */
  2218. #ifdef LN
  2219. subi CO1, CO1, 1 * SIZE
  2220. subi CO2, CO2, 1 * SIZE
  2221. subi CO3, CO3, 1 * SIZE
  2222. subi CO4, CO4, 1 * SIZE
  2223. #endif
  /* One value per C column: f0 feeds CO1 (STFDUX) and CO2 (STFSDUX),         */
  /* f1 feeds CO3 and CO4 likewise. This is common to all four variants.      */
  2224. STFDUX f0, CO1, INC
  2225. STFSDUX f0, CO2, INC
  2226. STFDUX f1, CO3, INC
  2227. STFSDUX f1, CO4, INC
  /* LN: undo the advance made by the update-form stores. */
  2228. #ifdef LN
  2229. subi CO1, CO1, 1 * SIZE
  2230. subi CO2, CO2, 1 * SIZE
  2231. subi CO3, CO3, 1 * SIZE
  2232. subi CO4, CO4, 1 * SIZE
  2233. #endif
  /* RT: AORIG += K << (0 + BASE_SHIFT). */
  2234. #ifdef RT
  2235. slwi r0, K, 0 + BASE_SHIFT
  2236. add AORIG, AORIG, r0
  2237. #endif
  /* LT/RN: AO += (K - KK) << (0 + BASE_SHIFT); BO += (K - KK) << (2 + BASE_SHIFT). */
  2238. #if defined(LT) || defined(RN)
  2239. sub TEMP, K, KK
  2240. slwi r0, TEMP, 0 + BASE_SHIFT
  2241. slwi TEMP, TEMP, 2 + BASE_SHIFT
  2242. add AO, AO, r0
  2243. add BO, BO, TEMP
  2244. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN. */
  2245. #ifdef LT
  2246. addi KK, KK, 1
  2247. #endif
  2248. #ifdef LN
  2249. subi KK, KK, 1
  2250. #endif
  /* .L49: end of one J iteration. Advance B to the next panel, adjust KK for */
  /* the right-side variants, then loop back to .L10 while J > 0.             */
  2251. .align 4
  2252. .L49:
  /* LN: B += K << (2 + BASE_SHIFT). */
  2253. #ifdef LN
  2254. slwi r0, K, 2 + BASE_SHIFT
  2255. add B, B, r0
  2256. #endif
  /* LT/RN: B is taken from the running BO, adding back the 4-element bias.   */
  2257. #if defined(LT) || defined(RN)
  2258. addi B, BO, 4 * SIZE
  2259. #endif
  /* KK moves by 4: +4 for RN, -4 for RT. */
  2260. #ifdef RN
  2261. addi KK, KK, 4
  2262. #endif
  2263. #ifdef RT
  2264. subi KK, KK, 4
  2265. #endif
  /* J -= 1 (recording form sets CR0); repeat the outer loop if J > 0. */
  2266. addic. J, J, -1
  2267. bgt+ .L10
  /* .L50: column-remainder panel, executed only when bit 1 of N is set       */
  /* (N & 2); otherwise control goes to .L90.                                 */
  2268. .align 4
  2269. .L50:
  2270. andi. J, N, 2
  2271. beq .L90
  /* RT: step B back by K << (1 + BASE_SHIFT) and C back by LDC << 1 before   */
  /* processing the panel.                                                    */
  2272. #ifdef RT
  2273. slwi r0, K, 1 + BASE_SHIFT
  2274. sub B, B, r0
  2275. slwi r0, LDC, 1
  2276. sub C, C, r0
  2277. #endif
  /* Two output column pointers: CO1 = C, CO2 = C + LDC. */
  2278. mr CO1, C
  2279. add CO2, C, LDC
  /* Starting KK: M + OFFSET for LN, OFFSET for LT (left unchanged otherwise).*/
  2280. #ifdef LN
  2281. add KK, M, OFFSET
  2282. #endif
  2283. #ifdef LT
  2284. mr KK, OFFSET
  2285. #endif
  /* The A pointer is biased by -2 elements (presumably for update-form loads */
  /* with INC2 -- TODO confirm): LN/RT keep it in AORIG, LT/RN in AO.         */
  2286. #if defined(LN) || defined(RT)
  2287. addi AORIG, A, -2 * SIZE
  2288. #else
  2289. addi AO, A, -2 * SIZE
  2290. #endif
  /* Except for RT, advance C past the two columns handled here. */
  2291. #ifndef RT
  2292. add C, CO2, LDC
  2293. #endif
  /* Load f0 from the FZERO stack slot; I = M >> 3 is the tile count, and the */
  /* tile loop is skipped (to .L60) when it is <= 0.                          */
  2294. li r0, FZERO
  2295. lfpsx f0, SP, r0
  2296. srawi. I, M, 3
  2297. ble .L60
  /* .L51: per-tile setup inside the N & 2 panel (entered I = M >> 3 times,   */
  /* per the counter set up before this label). Seeds f1..f7 from f0, sets    */
  /* the packed pointers and preloads operands for .L52.                      */
  2298. .align 4
  2299. .L51:
  /* LT/RN path: BO restarts from B with a -2 element bias; a single BO       */
  /* pointer is used in this panel (no BO2). CTR = KK >> 2; skip to .L54 when */
  /* that is <= 0.                                                            */
  2300. #if defined(LT) || defined(RN)
  2301. fpmr f4, f0
  2302. addi BO, B, - 2 * SIZE
  2303. fpmr f1, f0
  2304. fpmr f5, f0
  2305. fpmr f2, f0
  2306. fpmr f6, f0
  2307. srawi. r0, KK, 2
  2308. fpmr f3, f0
  2309. mtspr CTR, r0
  2310. fpmr f7, f0
  2311. ble .L54
  2312. #else
  /* LN/RT path: for LN, AORIG -= K << (3 + BASE_SHIFT).                      */
  /* AO = AORIG + (KK << (3 + BASE_SHIFT)); BO = B + (KK << (1 + BASE_SHIFT)) */
  /* minus the 2-element bias. TEMP = K - KK; CTR = TEMP >> 2.                */
  2313. #ifdef LN
  2314. slwi r0, K, 3 + BASE_SHIFT
  2315. sub AORIG, AORIG, r0
  2316. #endif
  2317. slwi r0 , KK, 3 + BASE_SHIFT
  2318. slwi TEMP, KK, 1 + BASE_SHIFT
  2319. add AO, AORIG, r0
  2320. add BO, B, TEMP
  2321. sub TEMP, K, KK
  2322. fpmr f4, f0
  2323. addi BO, BO, - 2 * SIZE
  2324. fpmr f1, f0
  2325. fpmr f5, f0
  2326. fpmr f2, f0
  2327. fpmr f6, f0
  2328. srawi. r0, TEMP, 2
  2329. fpmr f3, f0
  2330. mtspr CTR, r0
  2331. fpmr f7, f0
  2332. ble .L54
  2333. #endif
  /* Preload B1..B3 from BO and A1..A8 from AO, all stepping by INC2.         */
  /* B4 is not preloaded; it is loaded at the top of .L52 / .L53.             */
  2334. LFPDUX B1, BO, INC2
  2335. LFPDUX A1, AO, INC2
  2336. LFPDUX A2, AO, INC2
  2337. LFPDUX B2, BO, INC2
  2338. LFPDUX A3, AO, INC2
  2339. LFPDUX A4, AO, INC2
  2340. LFPDUX B3, BO, INC2
  2341. LFPDUX A5, AO, INC2
  2342. LFPDUX A6, AO, INC2
  2343. LFPDUX A7, AO, INC2
  2344. LFPDUX A8, AO, INC2
  2345. bdz- .L53
  /* .L52: unrolled accumulation loop for the tile set up at .L51. Each pass  */
  /* issues four groups of eight multiply-adds into f0..f7, one group per     */
  /* BO-side register (B1, B2, B3, B4), each against four AO-side registers.  */
  /* Every multiply-add pair is followed by a load for a later group or a     */
  /* nop; this looks like deliberate issue scheduling -- do not reorder       */
  /* without measuring.                                                       */
  2346. .align 4
  2347. .L52:
  /* group 1: B1 against A1..A4 (B4 for group 4 is loaded here) */
  2348. fxcpmadd f0, B1, A1, f0
  2349. LFPDUX B4, BO, INC2
  2350. fxcsmadd f4, B1, A1, f4
  2351. LFPDUX A1, AO, INC2
  2352. fxcpmadd f1, B1, A2, f1
  2353. nop
  2354. fxcsmadd f5, B1, A2, f5
  2355. LFPDUX A2, AO, INC2
  2356. fxcpmadd f2, B1, A3, f2
  2357. nop
  2358. fxcsmadd f6, B1, A3, f6
  2359. LFPDUX A3, AO, INC2
  2360. fxcpmadd f3, B1, A4, f3
  2361. nop
  2362. fxcsmadd f7, B1, A4, f7
  2363. LFPDUX A4, AO, INC2
  /* group 2: B2 against A5..A8 */
  2364. fxcpmadd f0, B2, A5, f0
  2365. LFPDUX B1, BO, INC2
  2366. fxcsmadd f4, B2, A5, f4
  2367. LFPDUX A5, AO, INC2
  2368. fxcpmadd f1, B2, A6, f1
  2369. nop
  2370. fxcsmadd f5, B2, A6, f5
  2371. LFPDUX A6, AO, INC2
  2372. fxcpmadd f2, B2, A7, f2
  2373. nop
  2374. fxcsmadd f6, B2, A7, f6
  2375. LFPDUX A7, AO, INC2
  2376. fxcpmadd f3, B2, A8, f3
  2377. nop
  2378. fxcsmadd f7, B2, A8, f7
  2379. LFPDUX A8, AO, INC2
  /* group 3: B3 against the reloaded A1..A4 */
  2380. fxcpmadd f0, B3, A1, f0
  2381. LFPDUX B2, BO, INC2
  2382. fxcsmadd f4, B3, A1, f4
  2383. LFPDUX A1, AO, INC2
  2384. fxcpmadd f1, B3, A2, f1
  2385. nop
  2386. fxcsmadd f5, B3, A2, f5
  2387. LFPDUX A2, AO, INC2
  2388. fxcpmadd f2, B3, A3, f2
  2389. nop
  2390. fxcsmadd f6, B3, A3, f6
  2391. LFPDUX A3, AO, INC2
  2392. fxcpmadd f3, B3, A4, f3
  2393. nop
  2394. fxcsmadd f7, B3, A4, f7
  2395. LFPDUX A4, AO, INC2
  /* group 4: B4 against the reloaded A5..A8 */
  2396. fxcpmadd f0, B4, A5, f0
  2397. LFPDUX B3, BO, INC2
  2398. fxcsmadd f4, B4, A5, f4
  2399. LFPDUX A5, AO, INC2
  2400. fxcpmadd f1, B4, A6, f1
  2401. nop
  2402. fxcsmadd f5, B4, A6, f5
  2403. LFPDUX A6, AO, INC2
  2404. fxcpmadd f2, B4, A7, f2
  2405. nop
  2406. fxcsmadd f6, B4, A7, f6
  2407. LFPDUX A7, AO, INC2
  2408. fxcpmadd f3, B4, A8, f3
  2409. nop
  2410. fxcsmadd f7, B4, A8, f7
  2411. LFPDUX A8, AO, INC2
  /* decrement CTR and repeat while it is non-zero */
  2412. bdnz+ .L52
  2413. .align 4
  /*
   * .L53: final pass of the unrolled loop (reached by fall-through from
   * .L52 or directly via bdz when only one pass was requested).  It
   * performs the same 32 multiply-adds as one .L52 pass, but only issues
   * the loads still needed inside this pass (B4 and the second set of
   * A1..A8); nothing is prefetched for a following pass.
   */
  2414. .L53:
  /* B1 against A1..A4, refilling A1..A4 for the B3 group below */
  2415. fxcpmadd f0, B1, A1, f0
  2416. LFPDUX B4, BO, INC2
  2417. fxcsmadd f4, B1, A1, f4
  2418. LFPDUX A1, AO, INC2
  2419. fxcpmadd f1, B1, A2, f1
  2420. nop
  2421. fxcsmadd f5, B1, A2, f5
  2422. LFPDUX A2, AO, INC2
  2423. fxcpmadd f2, B1, A3, f2
  2424. nop
  2425. fxcsmadd f6, B1, A3, f6
  2426. LFPDUX A3, AO, INC2
  2427. fxcpmadd f3, B1, A4, f3
  2428. nop
  2429. fxcsmadd f7, B1, A4, f7
  2430. LFPDUX A4, AO, INC2
  /* B2 against A5..A8, refilling A5..A8 for the B4 group below */
  2431. fxcpmadd f0, B2, A5, f0
  2432. nop
  2433. fxcsmadd f4, B2, A5, f4
  2434. LFPDUX A5, AO, INC2
  2435. fxcpmadd f1, B2, A6, f1
  2436. nop
  2437. fxcsmadd f5, B2, A6, f5
  2438. LFPDUX A6, AO, INC2
  2439. fxcpmadd f2, B2, A7, f2
  2440. nop
  2441. fxcsmadd f6, B2, A7, f6
  2442. LFPDUX A7, AO, INC2
  2443. fxcpmadd f3, B2, A8, f3
  2444. nop
  2445. fxcsmadd f7, B2, A8, f7
  2446. LFPDUX A8, AO, INC2
  /* B3 and B4 groups: arithmetic only, no further loads */
  2447. fxcpmadd f0, B3, A1, f0
  2448. fxcsmadd f4, B3, A1, f4
  2449. fxcpmadd f1, B3, A2, f1
  2450. fxcsmadd f5, B3, A2, f5
  2451. fxcpmadd f2, B3, A3, f2
  2452. fxcsmadd f6, B3, A3, f6
  2453. fxcpmadd f3, B3, A4, f3
  2454. fxcsmadd f7, B3, A4, f7
  2455. fxcpmadd f0, B4, A5, f0
  2456. fxcsmadd f4, B4, A5, f4
  2457. fxcpmadd f1, B4, A6, f1
  2458. fxcsmadd f5, B4, A6, f5
  2459. fxcpmadd f2, B4, A7, f2
  2460. fxcsmadd f6, B4, A7, f6
  2461. fxcpmadd f3, B4, A8, f3
  2462. fxcsmadd f7, B4, A8, f7
  2463. .align 4
  /*
   * .L54/.L56/.L57: remainder loop for the f0-f7 tile.  CTR is set to
   * the low two bits of the trip count (KK for LT/RN, TEMP otherwise),
   * i.e. the steps left over by the 4x unrolled loop above.  When that
   * is zero the whole section is skipped (branch to .L58).
   */
  2464. .L54:
  2465. #if defined(LT) || defined(RN)
  2466. andi. r0, KK, 3
  2467. mtspr CTR, r0
  2468. ble+ .L58
  2469. #else
  2470. andi. r0, TEMP, 3
  2471. mtspr CTR, r0
  2472. ble+ .L58
  2473. #endif
  /* preload one step: four A pairs and one B pair */
  2474. LFPDUX A1, AO, INC2
  2475. LFPDUX B1, BO, INC2
  2476. LFPDUX A2, AO, INC2
  2477. LFPDUX A3, AO, INC2
  2478. LFPDUX A4, AO, INC2
  /* a single remaining step goes straight to the drain at .L57 */
  2479. bdz- .L57
  2480. .align 4
  /* .L56: one step per pass, reloading A1..A4 and B1 for the next one */
  2481. .L56:
  2482. fxcpmadd f0, B1, A1, f0
  2483. fxcsmadd f4, B1, A1, f4
  2484. LFPDUX A1, AO, INC2
  2485. fxcpmadd f1, B1, A2, f1
  2486. fxcsmadd f5, B1, A2, f5
  2487. LFPDUX A2, AO, INC2
  2488. fxcpmadd f2, B1, A3, f2
  2489. fxcsmadd f6, B1, A3, f6
  2490. LFPDUX A3, AO, INC2
  2491. fxcpmadd f3, B1, A4, f3
  2492. fxcsmadd f7, B1, A4, f7
  2493. LFPDUX A4, AO, INC2
  2494. LFPDUX B1, BO, INC2
  2495. bdnz+ .L56
  2496. .align 4
  /* .L57: last step, arithmetic only */
  2497. .L57:
  2498. fxcpmadd f0, B1, A1, f0
  2499. fxcsmadd f4, B1, A1, f4
  2500. fxcpmadd f1, B1, A2, f1
  2501. fxcsmadd f5, B1, A2, f5
  2502. fxcpmadd f2, B1, A3, f2
  2503. fxcsmadd f6, B1, A3, f6
  2504. fxcpmadd f3, B1, A4, f3
  2505. fxcsmadd f7, B1, A4, f7
  2506. .align 4
  /*
   * .L58: solve-and-store stage for the tile accumulated in f0-f7.
   * Steps: (1) LN/RT only - point AO/BO at the tile being solved,
   * (2) replace the accumulators by (packed value - accumulator),
   * (3) run the substitution sequence for the selected variant
   * (LN, LT, RN or RT), (4) write the result to the packed buffer and
   * to C through CO1/CO2, (5) advance AO/BO/AORIG/KK, decrement I and
   * loop back to .L51 while I > 0.
   * NOTE(review): diagonal entries are applied with multiplies
   * (fxpmul/fxsmul), so they are presumably stored pre-inverted by the
   * packing routine - confirm against the packing code.
   */
  2507. .L58:
  /*
   * LN/RT: AO = AORIG + ((KK - n) << (3 + BASE_SHIFT)) and
   * BO = B + ((KK - n) << (1 + BASE_SHIFT)) - 2 * SIZE,
   * with n = 8 for LN and n = 2 for RT.
   */
  2508. #if defined(LN) || defined(RT)
  2509. #ifdef LN
  2510. subi r0, KK, 8
  2511. #else
  2512. subi r0, KK, 2
  2513. #endif
  2514. slwi TEMP, r0, 3 + BASE_SHIFT
  2515. slwi r0, r0, 1 + BASE_SHIFT
  2516. add AO, AORIG, TEMP
  2517. add BO, B, r0
  2518. addi BO, BO, - 2 * SIZE
  2519. #endif
  /*
   * LN/LT: regroup register halves between fN and fN+4 (f24-f27 hold
   * the saved copies), then subtract the accumulators from the eight
   * pairs read from BO.  BO is moved back by 16 * SIZE afterwards.
   */
  2520. #if defined(LN) || defined(LT)
  2521. fpmr f24, f0
  2522. fpmr f25, f1
  2523. fpmr f26, f2
  2524. fpmr f27, f3
  2525. fsmfp f0, f4
  2526. fsmfp f1, f5
  2527. fsmfp f2, f6
  2528. fsmfp f3, f7
  2529. fsmtp f4, f24
  2530. fsmtp f5, f25
  2531. fsmtp f6, f26
  2532. fsmtp f7, f27
  2533. LFPDUX f16, BO, INC2
  2534. LFPDUX f17, BO, INC2
  2535. LFPDUX f18, BO, INC2
  2536. LFPDUX f19, BO, INC2
  2537. LFPDUX f20, BO, INC2
  2538. LFPDUX f21, BO, INC2
  2539. LFPDUX f22, BO, INC2
  2540. LFPDUX f23, BO, INC2
  2541. subi BO, BO, 16 * SIZE
  2542. fpsub f0, f16, f0
  2543. fpsub f4, f17, f4
  2544. fpsub f1, f18, f1
  2545. fpsub f5, f19, f5
  2546. fpsub f2, f20, f2
  2547. fpsub f6, f21, f6
  2548. fpsub f3, f22, f3
  2549. fpsub f7, f23, f7
  2550. #else
  /* RN/RT: same subtraction, but the eight pairs come from AO */
  2551. LFPDUX f16, AO, INC2
  2552. LFPDUX f17, AO, INC2
  2553. LFPDUX f18, AO, INC2
  2554. LFPDUX f19, AO, INC2
  2555. LFPDUX f20, AO, INC2
  2556. LFPDUX f21, AO, INC2
  2557. LFPDUX f22, AO, INC2
  2558. LFPDUX f23, AO, INC2
  2559. subi AO, AO, 16 * SIZE
  2560. fpsub f0, f16, f0
  2561. fpsub f1, f17, f1
  2562. fpsub f2, f18, f2
  2563. fpsub f3, f19, f3
  2564. fpsub f4, f20, f4
  2565. fpsub f5, f21, f5
  2566. fpsub f6, f22, f6
  2567. fpsub f7, f23, f7
  2568. #endif
  /*
   * LN: AO is moved 66 * SIZE forward and then read with the negative
   * stride INCM2.  Registers are scaled in the order
   * f7, f3, f6, f2, f5, f1, f4, f0; after each scaling the result is
   * eliminated from the registers not yet scaled.  The bare
   * "add AO, AO, INCM2" lines step over pairs that are not loaded.
   * The pointer adjustments add up to zero, so AO leaves this section
   * with the value it had on entry.
   */
  2569. #ifdef LN
  2570. addi AO, AO, 66 * SIZE
  2571. LFPDUX A1, AO, INCM2
  2572. LFPDUX A2, AO, INCM2
  2573. LFPDUX A3, AO, INCM2
  2574. LFPDUX A4, AO, INCM2
  2575. LFPDUX A5, AO, INCM2
  2576. LFPDUX A6, AO, INCM2
  2577. LFPDUX A7, AO, INCM2
  2578. LFPDUX A8, AO, INCM2
  2579. fxsmul f7, A1, f7
  2580. fxcpnmsub f3, A1, f7, f3
  2581. fxcsnmsub f6, A2, f7, f6
  2582. fxcpnmsub f2, A2, f7, f2
  2583. fxcsnmsub f5, A3, f7, f5
  2584. fxcpnmsub f1, A3, f7, f1
  2585. fxcsnmsub f4, A4, f7, f4
  2586. fxcpnmsub f0, A4, f7, f0
  2587. fxpmul f3, A5, f3
  2588. fxcsnmsub f6, A6, f3, f6
  2589. fxcpnmsub f2, A6, f3, f2
  2590. fxcsnmsub f5, A7, f3, f5
  2591. fxcpnmsub f1, A7, f3, f1
  2592. fxcsnmsub f4, A8, f3, f4
  2593. fxcpnmsub f0, A8, f3, f0
  2594. add AO, AO, INCM2
  2595. LFPDUX A1, AO, INCM2
  2596. LFPDUX A2, AO, INCM2
  2597. LFPDUX A3, AO, INCM2
  2598. add AO, AO, INCM2
  2599. LFPDUX A4, AO, INCM2
  2600. LFPDUX A5, AO, INCM2
  2601. LFPDUX A6, AO, INCM2
  2602. add AO, AO, INCM2
  2603. add AO, AO, INCM2
  2604. LFPDUX A7, AO, INCM2
  2605. LFPDUX A8, AO, INCM2
  2606. fxsmul f6, A1, f6
  2607. fxcpnmsub f2, A1, f6, f2
  2608. fxcsnmsub f5, A2, f6, f5
  2609. fxcpnmsub f1, A2, f6, f1
  2610. fxcsnmsub f4, A3, f6, f4
  2611. fxcpnmsub f0, A3, f6, f0
  2612. fxpmul f2, A4, f2
  2613. fxcsnmsub f5, A5, f2, f5
  2614. fxcpnmsub f1, A5, f2, f1
  2615. fxcsnmsub f4, A6, f2, f4
  2616. fxcpnmsub f0, A6, f2, f0
  2617. fxsmul f5, A7, f5
  2618. fxcpnmsub f1, A7, f5, f1
  2619. fxcsnmsub f4, A8, f5, f4
  2620. fxcpnmsub f0, A8, f5, f0
  2621. add AO, AO, INCM2
  2622. add AO, AO, INCM2
  2623. LFPDUX A1, AO, INCM2
  2624. LFPDUX A2, AO, INCM2
  2625. subi AO, AO, 6 * SIZE
  2626. LFPDUX A3, AO, INCM2
  2627. subi AO, AO, 6 * SIZE
  2628. LFPDUX A4, AO, INCM2
  2629. addi AO, AO, -2 * SIZE
  2630. fxpmul f1, A1, f1
  2631. fxcsnmsub f4, A2, f1, f4
  2632. fxcpnmsub f0, A2, f1, f0
  2633. fxsmul f4, A3, f4
  2634. fxcpnmsub f0, A3, f4, f0
  2635. fxpmul f0, A4, f0
  2636. #endif
  /*
   * LT: forward counterpart, reading AO with stride INC2.  Registers
   * are scaled in the order f0, f4, f1, f5, f2, f6, f3, f7.  The bare
   * "add AO, AO, INC2" / "addi AO, AO, 6 * SIZE" lines step over pairs
   * that are not loaded; "subi AO, AO, 64 * SIZE" undoes the total
   * advance so AO is unchanged on exit.
   */
  2637. #ifdef LT
  2638. LFPDUX A1, AO, INC2
  2639. LFPDUX A2, AO, INC2
  2640. LFPDUX A3, AO, INC2
  2641. LFPDUX A4, AO, INC2
  2642. LFPDUX A5, AO, INC2
  2643. LFPDUX A6, AO, INC2
  2644. LFPDUX A7, AO, INC2
  2645. LFPDUX A8, AO, INC2
  2646. fxpmul f0, A1, f0
  2647. fxcsnmsub f4, A1, f0, f4
  2648. fxcpnmsub f1, A2, f0, f1
  2649. fxcsnmsub f5, A2, f0, f5
  2650. fxcpnmsub f2, A3, f0, f2
  2651. fxcsnmsub f6, A3, f0, f6
  2652. fxcpnmsub f3, A4, f0, f3
  2653. fxcsnmsub f7, A4, f0, f7
  2654. fxsmul f4, A5, f4
  2655. fxcpnmsub f1, A6, f4, f1
  2656. fxcsnmsub f5, A6, f4, f5
  2657. fxcpnmsub f2, A7, f4, f2
  2658. fxcsnmsub f6, A7, f4, f6
  2659. fxcpnmsub f3, A8, f4, f3
  2660. fxcsnmsub f7, A8, f4, f7
  2661. add AO, AO, INC2
  2662. LFPDUX A1, AO, INC2
  2663. LFPDUX A2, AO, INC2
  2664. LFPDUX A3, AO, INC2
  2665. add AO, AO, INC2
  2666. LFPDUX A4, AO, INC2
  2667. LFPDUX A5, AO, INC2
  2668. LFPDUX A6, AO, INC2
  2669. add AO, AO, INC2
  2670. add AO, AO, INC2
  2671. LFPDUX A7, AO, INC2
  2672. LFPDUX A8, AO, INC2
  2673. fxpmul f1, A1, f1
  2674. fxcsnmsub f5, A1, f1, f5
  2675. fxcpnmsub f2, A2, f1, f2
  2676. fxcsnmsub f6, A2, f1, f6
  2677. fxcpnmsub f3, A3, f1, f3
  2678. fxcsnmsub f7, A3, f1, f7
  2679. fxsmul f5, A4, f5
  2680. fxcpnmsub f2, A5, f5, f2
  2681. fxcsnmsub f6, A5, f5, f6
  2682. fxcpnmsub f3, A6, f5, f3
  2683. fxcsnmsub f7, A6, f5, f7
  2684. fxpmul f2, A7, f2
  2685. fxcsnmsub f6, A7, f2, f6
  2686. fxcpnmsub f3, A8, f2, f3
  2687. fxcsnmsub f7, A8, f2, f7
  2688. add AO, AO, INC2
  2689. add AO, AO, INC2
  2690. LFPDUX A1, AO, INC2
  2691. LFPDUX A2, AO, INC2
  2692. addi AO, AO, 6 * SIZE
  2693. LFPDUX A3, AO, INC2
  2694. addi AO, AO, 6 * SIZE
  2695. LFPDUX A4, AO, INC2
  2696. subi AO, AO, 64 * SIZE
  2697. fxsmul f6, A1, f6
  2698. fxcpnmsub f3, A2, f6, f3
  2699. fxcsnmsub f7, A2, f6, f7
  2700. fxpmul f3, A3, f3
  2701. fxcsnmsub f7, A3, f3, f7
  2702. fxsmul f7, A4, f7
  2703. #endif
  /*
   * RN: two coefficient pairs are read from BO (BO restored after).
   * f0-f3 are scaled first, then eliminated from f4-f7, which are
   * scaled last.
   */
  2704. #ifdef RN
  2705. LFPDUX A1, BO, INC2
  2706. LFPDUX A2, BO, INC2
  2707. subi BO, BO, 4 * SIZE
  2708. fxpmul f0, A1, f0
  2709. fxpmul f1, A1, f1
  2710. fxpmul f2, A1, f2
  2711. fxpmul f3, A1, f3
  2712. fxcsnmsub f4, A1, f0, f4
  2713. fxcsnmsub f5, A1, f1, f5
  2714. fxcsnmsub f6, A1, f2, f6
  2715. fxcsnmsub f7, A1, f3, f7
  2716. fxsmul f4, A2, f4
  2717. fxsmul f5, A2, f5
  2718. fxsmul f6, A2, f6
  2719. fxsmul f7, A2, f7
  2720. #endif
  /*
   * RT: mirror image of RN - the first pair goes to A2, the second to
   * A1; f4-f7 are scaled first, eliminated from f0-f3, then f0-f3 are
   * scaled.
   */
  2721. #ifdef RT
  2722. LFPDUX A2, BO, INC2
  2723. LFPDUX A1, BO, INC2
  2724. subi BO, BO, 4 * SIZE
  2725. fxsmul f4, A1, f4
  2726. fxsmul f5, A1, f5
  2727. fxsmul f6, A1, f6
  2728. fxsmul f7, A1, f7
  2729. fxcpnmsub f0, A1, f4, f0
  2730. fxcpnmsub f1, A1, f5, f1
  2731. fxcpnmsub f2, A1, f6, f2
  2732. fxcpnmsub f3, A1, f7, f3
  2733. fxpmul f0, A2, f0
  2734. fxpmul f1, A2, f1
  2735. fxpmul f2, A2, f2
  2736. fxpmul f3, A2, f3
  2737. #endif
  /* LN: step CO1/CO2 back by 8 * SIZE before the stores below */
  2738. #ifdef LN
  2739. subi CO1, CO1, 8 * SIZE
  2740. subi CO2, CO2, 8 * SIZE
  2741. #endif
  /*
   * LN/LT: store the eight pairs to BO (pointer restored), then store
   * one half of each register through CO1 (STFDUX) and the other half
   * through CO2 (STFSDUX).
   */
  2742. #if defined(LN) || defined(LT)
  2743. STFPDUX f0, BO, INC2
  2744. STFPDUX f4, BO, INC2
  2745. STFPDUX f1, BO, INC2
  2746. STFPDUX f5, BO, INC2
  2747. STFPDUX f2, BO, INC2
  2748. STFPDUX f6, BO, INC2
  2749. STFPDUX f3, BO, INC2
  2750. STFPDUX f7, BO, INC2
  2751. subi BO, BO, 16 * SIZE
  2752. STFDUX f0, CO1, INC
  2753. STFDUX f4, CO1, INC
  2754. STFDUX f1, CO1, INC
  2755. STFDUX f5, CO1, INC
  2756. STFDUX f2, CO1, INC
  2757. STFDUX f6, CO1, INC
  2758. STFDUX f3, CO1, INC
  2759. STFDUX f7, CO1, INC
  2760. STFSDUX f0, CO2, INC
  2761. STFSDUX f4, CO2, INC
  2762. STFSDUX f1, CO2, INC
  2763. STFSDUX f5, CO2, INC
  2764. STFSDUX f2, CO2, INC
  2765. STFSDUX f6, CO2, INC
  2766. STFSDUX f3, CO2, INC
  2767. STFSDUX f7, CO2, INC
  2768. #else
  /*
   * RN/RT: store the eight pairs to AO (pointer restored); both halves
   * of f0-f3 go through CO1 and both halves of f4-f7 through CO2.
   */
  2769. STFPDUX f0, AO, INC2
  2770. STFPDUX f1, AO, INC2
  2771. STFPDUX f2, AO, INC2
  2772. STFPDUX f3, AO, INC2
  2773. STFPDUX f4, AO, INC2
  2774. STFPDUX f5, AO, INC2
  2775. STFPDUX f6, AO, INC2
  2776. STFPDUX f7, AO, INC2
  2777. subi AO, AO, 16 * SIZE
  2778. STFDUX f0, CO1, INC
  2779. STFSDUX f0, CO1, INC
  2780. STFDUX f1, CO1, INC
  2781. STFSDUX f1, CO1, INC
  2782. STFDUX f2, CO1, INC
  2783. STFSDUX f2, CO1, INC
  2784. STFDUX f3, CO1, INC
  2785. STFSDUX f3, CO1, INC
  2786. STFDUX f4, CO2, INC
  2787. STFSDUX f4, CO2, INC
  2788. STFDUX f5, CO2, INC
  2789. STFSDUX f5, CO2, INC
  2790. STFDUX f6, CO2, INC
  2791. STFSDUX f6, CO2, INC
  2792. STFDUX f7, CO2, INC
  2793. STFSDUX f7, CO2, INC
  2794. #endif
  /* LN: undo the advance made by the update-form stores above */
  2795. #ifdef LN
  2796. subi CO1, CO1, 8 * SIZE
  2797. subi CO2, CO2, 8 * SIZE
  2798. #endif
  /* RT: AORIG += K << (3 + BASE_SHIFT) */
  2799. #ifdef RT
  2800. slwi r0, K, 3 + BASE_SHIFT
  2801. add AORIG, AORIG, r0
  2802. #endif
  /* LT/RN: advance AO and BO past the remaining K - KK steps */
  2803. #if defined(LT) || defined(RN)
  2804. sub TEMP, K, KK
  2805. slwi r0, TEMP, 3 + BASE_SHIFT
  2806. slwi TEMP, TEMP, 1 + BASE_SHIFT
  2807. add AO, AO, r0
  2808. add BO, BO, TEMP
  2809. #endif
  /* KK moves by the tile height: +8 for LT, -8 for LN */
  2810. #ifdef LT
  2811. addi KK, KK, 8
  2812. #endif
  2813. #ifdef LN
  2814. subi KK, KK, 8
  2815. #endif
  /*
   * Decrement I; reload f0 from the stack slot at SP + FZERO
   * (presumably a zero constant for the next tile's accumulators -
   * TODO confirm); loop to .L51 while I > 0.
   */
  2816. addic. I, I, -1
  2817. li r0, FZERO
  2818. lfpsx f0, SP, r0
  2819. bgt+ .L51
  2820. .align 4
  /*
   * .L60: set-up for the tile handled when (M & 4) is non-zero; skipped
   * (branch to .L70) otherwise.  Clears accumulators f1-f3 from f0,
   * positions AO/BO, loads CTR with the number of 4x unrolled passes
   * (trip count >> 2) and preloads A1-A8 / B1-B4 for .L62.
   * With no full pass to run, control goes to the remainder code at
   * .L64; with exactly one, bdz jumps straight to the drain at .L63.
   */
  2821. .L60:
  2822. andi. I, M, 4
  2823. beq .L70
  2824. #if defined(LT) || defined(RN)
  /* LT/RN: B is read from its start; trip count is KK */
  2825. fpmr f1, f0
  2826. addi BO, B, - 2 * SIZE
  2827. fpmr f2, f0
  2828. fpmr f3, f0
  2829. srawi. r0, KK, 2
  2830. mtspr CTR, r0
  2831. ble .L64
  2832. #else
  2833. #ifdef LN
  /* LN: AORIG -= K << (2 + BASE_SHIFT) */
  2834. slwi r0, K, 2 + BASE_SHIFT
  2835. sub AORIG, AORIG, r0
  2836. #endif
  /*
   * LN/RT: start KK steps into both packed panels; trip count is
   * TEMP = K - KK.
   */
  2837. slwi r0 , KK, 2 + BASE_SHIFT
  2838. slwi TEMP, KK, 1 + BASE_SHIFT
  2839. add AO, AORIG, r0
  2840. add BO, B, TEMP
  2841. sub TEMP, K, KK
  2842. fpmr f1, f0
  /*
   * Apply the -2 * SIZE pre-update bias to the BO computed above.
   * This used to be "addi BO, B, ...", which discarded the KK offset
   * added two instructions earlier; the 8-row tile set-up earlier in
   * this file uses the BO-relative form.
   */
  2843. addi BO, BO, - 2 * SIZE
  2844. fpmr f2, f0
  2845. fpmr f3, f0
  2846. srawi. r0, TEMP, 2
  2847. mtspr CTR, r0
  2848. ble .L64
  2849. #endif
  /* preload four B pairs and eight A pairs for the first pass */
  2850. LFPDUX B1, BO, INC2
  2851. LFPDUX A1, AO, INC2
  2852. LFPDUX A2, AO, INC2
  2853. LFPDUX B2, BO, INC2
  2854. LFPDUX A3, AO, INC2
  2855. LFPDUX A4, AO, INC2
  2856. LFPDUX B3, BO, INC2
  2857. LFPDUX A5, AO, INC2
  2858. LFPDUX A6, AO, INC2
  2859. LFPDUX B4, BO, INC2
  2860. LFPDUX A7, AO, INC2
  2861. LFPDUX A8, AO, INC2
  2862. bdz- .L63
  2863. .align 4
  /*
   * .L62: steady-state body of the unrolled loop for the tile kept in
   * f0-f3.  Each pass uses B1..B4 once; every B pair is combined with
   * two A pairs (fxcpmadd into f0/f1, fxcsmadd into f2/f3).  Each
   * register is reloaded right after its last use so the next pass (or
   * the drain at .L63) finds A1-A8 / B1-B4 already filled.
   */
  2864. .L62:
  2865. fxcpmadd f0, B1, A1, f0
  2866. fxcsmadd f2, B1, A1, f2
  2867. LFPDUX A1, AO, INC2
  2868. fxcpmadd f1, B1, A2, f1
  2869. fxcsmadd f3, B1, A2, f3
  2870. LFPDUX A2, AO, INC2
  2871. LFPDUX B1, BO, INC2
  2872. fxcpmadd f0, B2, A3, f0
  2873. fxcsmadd f2, B2, A3, f2
  2874. LFPDUX A3, AO, INC2
  2875. fxcpmadd f1, B2, A4, f1
  2876. fxcsmadd f3, B2, A4, f3
  2877. LFPDUX A4, AO, INC2
  2878. LFPDUX B2, BO, INC2
  2879. fxcpmadd f0, B3, A5, f0
  2880. fxcsmadd f2, B3, A5, f2
  2881. LFPDUX A5, AO, INC2
  2882. fxcpmadd f1, B3, A6, f1
  2883. fxcsmadd f3, B3, A6, f3
  2884. LFPDUX A6, AO, INC2
  2885. LFPDUX B3, BO, INC2
  2886. fxcpmadd f0, B4, A7, f0
  2887. fxcsmadd f2, B4, A7, f2
  2888. LFPDUX A7, AO, INC2
  2889. fxcpmadd f1, B4, A8, f1
  2890. fxcsmadd f3, B4, A8, f3
  2891. LFPDUX A8, AO, INC2
  2892. LFPDUX B4, BO, INC2
  /* decrement CTR and repeat while it is non-zero */
  2893. bdnz+ .L62
  2894. .align 4
  /*
   * .L63: final pass of the f0-f3 unrolled loop.  Same sixteen
   * multiply-adds as one .L62 pass, operating on the registers already
   * loaded; no further loads are issued.
   */
  2895. .L63:
  2896. fxcpmadd f0, B1, A1, f0
  2897. fxcsmadd f2, B1, A1, f2
  2898. fxcpmadd f1, B1, A2, f1
  2899. fxcsmadd f3, B1, A2, f3
  2900. fxcpmadd f0, B2, A3, f0
  2901. fxcsmadd f2, B2, A3, f2
  2902. fxcpmadd f1, B2, A4, f1
  2903. fxcsmadd f3, B2, A4, f3
  2904. fxcpmadd f0, B3, A5, f0
  2905. fxcsmadd f2, B3, A5, f2
  2906. fxcpmadd f1, B3, A6, f1
  2907. fxcsmadd f3, B3, A6, f3
  2908. fxcpmadd f0, B4, A7, f0
  2909. fxcsmadd f2, B4, A7, f2
  2910. fxcpmadd f1, B4, A8, f1
  2911. fxcsmadd f3, B4, A8, f3
  2912. .align 4
  /*
   * .L64/.L66/.L67: remainder loop for the f0-f3 tile.  CTR receives
   * the low two bits of the trip count (KK for LT/RN, TEMP otherwise);
   * when that is zero the section is skipped (branch to .L68).
   */
  2913. .L64:
  2914. #if defined(LT) || defined(RN)
  2915. andi. r0, KK, 3
  2916. mtspr CTR, r0
  2917. ble+ .L68
  2918. #else
  2919. andi. r0, TEMP, 3
  2920. mtspr CTR, r0
  2921. ble+ .L68
  2922. #endif
  /* preload one step: two A pairs and one B pair */
  2923. LFPDUX A1, AO, INC2
  2924. LFPDUX B1, BO, INC2
  2925. LFPDUX A2, AO, INC2
  /* a single remaining step goes straight to the drain at .L67 */
  2926. bdz- .L67
  2927. .align 4
  /* .L66: one step per pass, reloading A1, A2 and B1 for the next one */
  2928. .L66:
  2929. fxcpmadd f0, B1, A1, f0
  2930. fxcsmadd f2, B1, A1, f2
  2931. LFPDUX A1, AO, INC2
  2932. fxcpmadd f1, B1, A2, f1
  2933. fxcsmadd f3, B1, A2, f3
  2934. LFPDUX B1, BO, INC2
  2935. LFPDUX A2, AO, INC2
  2936. bdnz+ .L66
  2937. .align 4
  /* .L67: last step, arithmetic only */
  2938. .L67:
  2939. fxcpmadd f0, B1, A1, f0
  2940. fxcsmadd f2, B1, A1, f2
  2941. fxcpmadd f1, B1, A2, f1
  2942. fxcsmadd f3, B1, A2, f3
  2943. .align 4
  /*
   * .L68: solve-and-store stage for the tile accumulated in f0-f3.
   * Same structure as the larger tile: optional AO/BO repositioning
   * (LN/RT), subtraction of the accumulators from the packed values,
   * variant-specific substitution, stores to the packed buffer and to
   * C via CO1/CO2, then pointer/KK bookkeeping.  Falls through to .L70.
   * NOTE(review): diagonal entries are applied with multiplies, so they
   * are presumably stored pre-inverted - confirm with the packing code.
   */
  2944. .L68:
  /*
   * LN/RT: AO = AORIG + ((KK - n) << (2 + BASE_SHIFT)) and
   * BO = B + ((KK - n) << (1 + BASE_SHIFT)) - 2 * SIZE,
   * with n = 4 for LN and n = 2 for RT.
   */
  2945. #if defined(LN) || defined(RT)
  2946. #ifdef LN
  2947. subi r0, KK, 4
  2948. #else
  2949. subi r0, KK, 2
  2950. #endif
  2951. slwi TEMP, r0, 2 + BASE_SHIFT
  2952. slwi r0, r0, 1 + BASE_SHIFT
  2953. add AO, AORIG, TEMP
  2954. add BO, B, r0
  2955. addi BO, BO, - 2 * SIZE
  2956. #endif
  /*
   * LN/LT: regroup register halves between f0/f2 and f1/f3 (f24/f25
   * hold the saved copies), then subtract from four pairs read from
   * BO; BO is restored afterwards.
   */
  2957. #if defined(LN) || defined(LT)
  2958. fpmr f24, f0
  2959. fpmr f25, f1
  2960. fsmfp f0, f2
  2961. fsmfp f1, f3
  2962. fsmtp f2, f24
  2963. fsmtp f3, f25
  2964. LFPDUX f16, BO, INC2
  2965. LFPDUX f17, BO, INC2
  2966. LFPDUX f18, BO, INC2
  2967. LFPDUX f19, BO, INC2
  2968. subi BO, BO, 8 * SIZE
  2969. fpsub f0, f16, f0
  2970. fpsub f2, f17, f2
  2971. fpsub f1, f18, f1
  2972. fpsub f3, f19, f3
  2973. #else
  /* RN/RT: same subtraction with the four pairs read from AO */
  2974. LFPDUX f16, AO, INC2
  2975. LFPDUX f17, AO, INC2
  2976. LFPDUX f18, AO, INC2
  2977. LFPDUX f19, AO, INC2
  2978. subi AO, AO, 8 * SIZE
  2979. fpsub f0, f16, f0
  2980. fpsub f1, f17, f1
  2981. fpsub f2, f18, f2
  2982. fpsub f3, f19, f3
  2983. #endif
  /*
   * LN: AO is moved 18 * SIZE forward and read with the negative
   * stride INCM2 (bare "add AO, AO, INCM2" lines step over unused
   * pairs); the adjustments sum to zero.  Registers are scaled in the
   * order f3, f1, f2, f0.
   */
  2984. #ifdef LN
  2985. addi AO, AO, 18 * SIZE
  2986. LFPDUX A1, AO, INCM2
  2987. LFPDUX A2, AO, INCM2
  2988. LFPDUX A3, AO, INCM2
  2989. LFPDUX A4, AO, INCM2
  2990. add AO, AO, INCM2
  2991. LFPDUX A5, AO, INCM2
  2992. add AO, AO, INCM2
  2993. LFPDUX A6, AO, INCM2
  2994. subi AO, AO, 2 * SIZE
  2995. fxsmul f3, A1, f3
  2996. fxcpnmsub f1, A1, f3, f1
  2997. fxcsnmsub f2, A2, f3, f2
  2998. fxcpnmsub f0, A2, f3, f0
  2999. fxpmul f1, A3, f1
  3000. fxcsnmsub f2, A4, f1, f2
  3001. fxcpnmsub f0, A4, f1, f0
  3002. fxsmul f2, A5, f2
  3003. fxcpnmsub f0, A5, f2, f0
  3004. fxpmul f0, A6, f0
  3005. #endif
  /*
   * LT: forward counterpart with stride INC2; AO is restored by the
   * "subi AO, AO, 16 * SIZE".  Registers are scaled in the order
   * f0, f2, f1, f3.
   */
  3006. #ifdef LT
  3007. LFPDUX A1, AO, INC2
  3008. LFPDUX A2, AO, INC2
  3009. LFPDUX A3, AO, INC2
  3010. LFPDUX A4, AO, INC2
  3011. add AO, AO, INC2
  3012. LFPDUX A5, AO, INC2
  3013. add AO, AO, INC2
  3014. LFPDUX A6, AO, INC2
  3015. subi AO, AO, 16 * SIZE
  3016. fxpmul f0, A1, f0
  3017. fxcsnmsub f2, A1, f0, f2
  3018. fxcpnmsub f1, A2, f0, f1
  3019. fxcsnmsub f3, A2, f0, f3
  3020. fxsmul f2, A3, f2
  3021. fxcpnmsub f1, A4, f2, f1
  3022. fxcsnmsub f3, A4, f2, f3
  3023. fxpmul f1, A5, f1
  3024. fxcsnmsub f3, A5, f1, f3
  3025. fxsmul f3, A6, f3
  3026. #endif
  /* RN: scale f0/f1, eliminate them from f2/f3, then scale f2/f3 */
  3027. #ifdef RN
  3028. LFPDUX A1, BO, INC2
  3029. LFPDUX A2, BO, INC2
  3030. subi BO, BO, 4 * SIZE
  3031. fxpmul f0, A1, f0
  3032. fxpmul f1, A1, f1
  3033. fxcsnmsub f2, A1, f0, f2
  3034. fxcsnmsub f3, A1, f1, f3
  3035. fxsmul f2, A2, f2
  3036. fxsmul f3, A2, f3
  3037. #endif
  /* RT: scale f2/f3, eliminate them from f0/f1, then scale f0/f1 */
  3038. #ifdef RT
  3039. LFPDUX A2, BO, INC2
  3040. LFPDUX A1, BO, INC2
  3041. subi BO, BO, 4 * SIZE
  3042. fxsmul f2, A1, f2
  3043. fxsmul f3, A1, f3
  3044. fxcpnmsub f0, A1, f2, f0
  3045. fxcpnmsub f1, A1, f3, f1
  3046. fxpmul f0, A2, f0
  3047. fxpmul f1, A2, f1
  3048. #endif
  /* LN: step CO1/CO2 back by 4 * SIZE before the stores below */
  3049. #ifdef LN
  3050. subi CO1, CO1, 4 * SIZE
  3051. subi CO2, CO2, 4 * SIZE
  3052. #endif
  /*
   * LN/LT: pairs go back to BO (pointer restored); one half of each
   * register is stored through CO1, the other half through CO2.
   */
  3053. #if defined(LN) || defined(LT)
  3054. STFPDUX f0, BO, INC2
  3055. STFPDUX f2, BO, INC2
  3056. STFPDUX f1, BO, INC2
  3057. STFPDUX f3, BO, INC2
  3058. subi BO, BO, 8 * SIZE
  3059. STFDUX f0, CO1, INC
  3060. STFDUX f2, CO1, INC
  3061. STFDUX f1, CO1, INC
  3062. STFDUX f3, CO1, INC
  3063. STFSDUX f0, CO2, INC
  3064. STFSDUX f2, CO2, INC
  3065. STFSDUX f1, CO2, INC
  3066. STFSDUX f3, CO2, INC
  3067. #else
  /*
   * RN/RT: pairs go back to AO (pointer restored); f0/f1 are stored
   * through CO1 and f2/f3 through CO2, both halves each.
   */
  3068. STFPDUX f0, AO, INC2
  3069. STFPDUX f1, AO, INC2
  3070. STFPDUX f2, AO, INC2
  3071. STFPDUX f3, AO, INC2
  3072. subi AO, AO, 8 * SIZE
  3073. STFDUX f0, CO1, INC
  3074. STFSDUX f0, CO1, INC
  3075. STFDUX f1, CO1, INC
  3076. STFSDUX f1, CO1, INC
  3077. STFDUX f2, CO2, INC
  3078. STFSDUX f2, CO2, INC
  3079. STFDUX f3, CO2, INC
  3080. STFSDUX f3, CO2, INC
  3081. #endif
  /* LN: undo the advance made by the update-form stores above */
  3082. #ifdef LN
  3083. subi CO1, CO1, 4 * SIZE
  3084. subi CO2, CO2, 4 * SIZE
  3085. #endif
  /* RT: AORIG += K << (2 + BASE_SHIFT) */
  3086. #ifdef RT
  3087. slwi r0, K, 2 + BASE_SHIFT
  3088. add AORIG, AORIG, r0
  3089. #endif
  /* LT/RN: advance AO and BO past the remaining K - KK steps */
  3090. #if defined(LT) || defined(RN)
  3091. sub TEMP, K, KK
  3092. slwi r0, TEMP, 2 + BASE_SHIFT
  3093. slwi TEMP, TEMP, 1 + BASE_SHIFT
  3094. add AO, AO, r0
  3095. add BO, BO, TEMP
  3096. #endif
  /* KK moves by the tile height: +4 for LT, -4 for LN */
  3097. #ifdef LT
  3098. addi KK, KK, 4
  3099. #endif
  3100. #ifdef LN
  3101. subi KK, KK, 4
  3102. #endif
  /*
   * Reload f0 from the stack slot at SP + FZERO (presumably a zero
   * constant for the next tile's accumulators - TODO confirm).
   */
  3103. li r0, FZERO
  3104. lfpsx f0, SP, r0
  3105. .align 4
  /*
   * .L70: set-up for the tile handled when (M & 2) is non-zero; skipped
   * (branch to .L80) otherwise.  Clears accumulators f1-f3 from f0,
   * positions AO/BO, loads CTR with the number of 8x unrolled passes
   * (trip count >> 3) and preloads eight A pairs and eight B pairs for
   * .L72.  Registers A9/A10 receive the seventh and eighth B pair.
   * With no full pass to run, control goes to the remainder code at
   * .L74; with exactly one, bdz jumps straight to the drain at .L73.
   */
  3106. .L70:
  3107. andi. I, M, 2
  3108. beq .L80
  3109. #if defined(LT) || defined(RN)
  /* LT/RN: B is read from its start; trip count is KK */
  3110. addi BO, B, - 2 * SIZE
  3111. fpmr f1, f0
  3112. fpmr f2, f0
  3113. fpmr f3, f0
  3114. srawi. r0, KK, 3
  3115. mtspr CTR, r0
  3116. ble .L74
  3117. #else
  3118. #ifdef LN
  /* LN: AORIG -= K << (1 + BASE_SHIFT) */
  3119. slwi r0, K, 1 + BASE_SHIFT
  3120. sub AORIG, AORIG, r0
  3121. #endif
  /*
   * LN/RT: start KK steps into both packed panels; trip count is
   * TEMP = K - KK.
   */
  3122. slwi r0 , KK, 1 + BASE_SHIFT
  3123. slwi TEMP, KK, 1 + BASE_SHIFT
  3124. add AO, AORIG, r0
  3125. add BO, B, TEMP
  3126. sub TEMP, K, KK
  /*
   * Apply the -2 * SIZE pre-update bias to the BO computed above.
   * This used to be "addi BO, B, ...", which discarded the KK offset
   * added two instructions earlier; the 8-row tile set-up earlier in
   * this file uses the BO-relative form.
   */
  3127. addi BO, BO, - 2 * SIZE
  3128. fpmr f1, f0
  3129. fpmr f2, f0
  3130. fpmr f3, f0
  3131. srawi. r0, TEMP, 3
  3132. mtspr CTR, r0
  3133. ble .L74
  3134. #endif
  /* preload eight A pairs and eight B pairs for the first pass */
  3135. LFPDUX A1, AO, INC2
  3136. LFPDUX B1, BO, INC2
  3137. LFPDUX A2, AO, INC2
  3138. LFPDUX B2, BO, INC2
  3139. LFPDUX A3, AO, INC2
  3140. LFPDUX B3, BO, INC2
  3141. LFPDUX A4, AO, INC2
  3142. LFPDUX B4, BO, INC2
  3143. LFPDUX A5, AO, INC2
  3144. LFPDUX B5, BO, INC2
  3145. LFPDUX A6, AO, INC2
  3146. LFPDUX B6, BO, INC2
  3147. LFPDUX A7, AO, INC2
  3148. LFPDUX A9, BO, INC2
  3149. LFPDUX A8, AO, INC2
  3150. LFPDUX A10, BO, INC2
  3151. bdz- .L73
  3152. .align 4
  /*
   * .L72: steady-state body of the 8x unrolled loop for the 2-row tile.
   * Each step pairs one A register with one B register: fxcpmadd
   * accumulates into f0 (even steps) or f2 (odd steps), fxcsmadd into
   * f1 or f3.  The two accumulator sets are summed later at .L78.
   * A9/A10 act as the seventh and eighth B registers.  Both operands
   * are reloaded immediately after use for the next pass.
   */
  3153. .L72:
  3154. fxcpmadd f0, B1, A1, f0
  3155. fxcsmadd f1, B1, A1, f1
  3156. LFPDUX A1, AO, INC2
  3157. LFPDUX B1, BO, INC2
  3158. fxcpmadd f2, B2, A2, f2
  3159. fxcsmadd f3, B2, A2, f3
  3160. LFPDUX A2, AO, INC2
  3161. LFPDUX B2, BO, INC2
  3162. fxcpmadd f0, B3, A3, f0
  3163. fxcsmadd f1, B3, A3, f1
  3164. LFPDUX A3, AO, INC2
  3165. LFPDUX B3, BO, INC2
  3166. fxcpmadd f2, B4, A4, f2
  3167. fxcsmadd f3, B4, A4, f3
  3168. LFPDUX A4, AO, INC2
  3169. LFPDUX B4, BO, INC2
  3170. fxcpmadd f0, B5, A5, f0
  3171. fxcsmadd f1, B5, A5, f1
  3172. LFPDUX A5, AO, INC2
  3173. LFPDUX B5, BO, INC2
  3174. fxcpmadd f2, B6, A6, f2
  3175. fxcsmadd f3, B6, A6, f3
  3176. LFPDUX A6, AO, INC2
  3177. LFPDUX B6, BO, INC2
  3178. fxcpmadd f0, A9, A7, f0
  3179. fxcsmadd f1, A9, A7, f1
  3180. LFPDUX A7, AO, INC2
  3181. LFPDUX A9, BO, INC2
  3182. fxcpmadd f2, A10, A8, f2
  3183. fxcsmadd f3, A10, A8, f3
  3184. LFPDUX A8, AO, INC2
  3185. LFPDUX A10, BO, INC2
  /* decrement CTR and repeat while it is non-zero */
  3186. bdnz+ .L72
  3187. .align 4
  /*
   * .L73: final pass of the 2-row unrolled loop.  Same sixteen
   * multiply-adds as one .L72 pass on the registers already loaded;
   * no further loads are issued.
   */
  3188. .L73:
  3189. fxcpmadd f0, B1, A1, f0
  3190. fxcsmadd f1, B1, A1, f1
  3191. fxcpmadd f2, B2, A2, f2
  3192. fxcsmadd f3, B2, A2, f3
  3193. fxcpmadd f0, B3, A3, f0
  3194. fxcsmadd f1, B3, A3, f1
  3195. fxcpmadd f2, B4, A4, f2
  3196. fxcsmadd f3, B4, A4, f3
  3197. fxcpmadd f0, B5, A5, f0
  3198. fxcsmadd f1, B5, A5, f1
  3199. fxcpmadd f2, B6, A6, f2
  3200. fxcsmadd f3, B6, A6, f3
  3201. fxcpmadd f0, A9, A7, f0
  3202. fxcsmadd f1, A9, A7, f1
  3203. fxcpmadd f2, A10, A8, f2
  3204. fxcsmadd f3, A10, A8, f3
  3205. .align 4
  /*
   * .L74/.L76/.L77: remainder loop for the 2-row tile.  CTR receives
   * the low three bits of the trip count (KK for LT/RN, TEMP
   * otherwise), i.e. the steps left over by the 8x unrolled loop; when
   * that is zero the section is skipped (branch to .L78).
   */
  3206. .L74:
  3207. #if defined(LT) || defined(RN)
  3208. andi. r0, KK, 7
  3209. mtspr CTR, r0
  3210. ble+ .L78
  3211. #else
  3212. andi. r0, TEMP, 7
  3213. mtspr CTR, r0
  3214. ble+ .L78
  3215. #endif
  /* preload one step */
  3216. LFPDUX A1, AO, INC2
  3217. LFPDUX B1, BO, INC2
  /* a single remaining step goes straight to the drain at .L77 */
  3218. bdz- .L77
  3219. .align 4
  /* .L76: one step per pass, accumulating into f0/f1 only */
  3220. .L76:
  3221. fxcpmadd f0, B1, A1, f0
  3222. fxcsmadd f1, B1, A1, f1
  3223. LFPDUX A1, AO, INC2
  3224. LFPDUX B1, BO, INC2
  3225. bdnz+ .L76
  3226. .align 4
  /* .L77: last step, arithmetic only */
  3227. .L77:
  3228. fxcpmadd f0, B1, A1, f0
  3229. fxcsmadd f1, B1, A1, f1
  3230. .align 4
  /*
   * .L78: solve-and-store stage for the 2-row tile.  First folds the
   * second accumulator set into the first (f0 += f2, f1 += f3), then
   * follows the usual sequence: optional AO/BO repositioning (LN/RT),
   * subtraction from the packed values, variant-specific substitution,
   * stores to the packed buffer and to C via CO1/CO2, pointer/KK
   * bookkeeping.  Falls through to .L80.
   * NOTE(review): diagonal entries are applied with multiplies, so they
   * are presumably stored pre-inverted - confirm with the packing code.
   */
  3231. .L78:
  3232. fpadd f0, f0, f2
  3233. fpadd f1, f1, f3
  /*
   * LN/RT: AO = AORIG + ((KK - 2) << (1 + BASE_SHIFT)) and
   * BO = B + ((KK - 2) << (1 + BASE_SHIFT)) - 2 * SIZE.
   * Both preprocessor branches subtract the same constant here.
   */
  3234. #if defined(LN) || defined(RT)
  3235. #ifdef LN
  3236. subi r0, KK, 2
  3237. #else
  3238. subi r0, KK, 2
  3239. #endif
  3240. slwi TEMP, r0, 1 + BASE_SHIFT
  3241. slwi r0, r0, 1 + BASE_SHIFT
  3242. add AO, AORIG, TEMP
  3243. add BO, B, r0
  3244. addi BO, BO, - 2 * SIZE
  3245. #endif
  /*
   * LN/LT: regroup register halves between f0 and f1 (f24 holds the
   * saved copy of f0), then subtract from two pairs read from BO;
   * BO is restored afterwards.
   */
  3246. #if defined(LN) || defined(LT)
  3247. fpmr f24, f0
  3248. fsmfp f0, f1
  3249. fsmtp f1, f24
  3250. LFPDUX f16, BO, INC2
  3251. LFPDUX f17, BO, INC2
  3252. subi BO, BO, 4 * SIZE
  3253. fpsub f0, f16, f0
  3254. fpsub f1, f17, f1
  3255. #else
  /* RN/RT: same subtraction with the two pairs read from AO */
  3256. LFPDUX f16, AO, INC2
  3257. LFPDUX f17, AO, INC2
  3258. subi AO, AO, 4 * SIZE
  3259. fpsub f0, f16, f0
  3260. fpsub f1, f17, f1
  3261. #endif
  /* LN: scale f1, eliminate it from f0, scale f0 (AO restored) */
  3262. #ifdef LN
  3263. LFPDUX A1, AO, INC2
  3264. LFPDUX A2, AO, INC2
  3265. addi AO, AO, -4 * SIZE
  3266. fxsmul f1, A2, f1
  3267. fxcpnmsub f0, A2, f1, f0
  3268. fxpmul f0, A1, f0
  3269. #endif
  /* LT: scale f0, eliminate it from f1, scale f1 (AO restored) */
  3270. #ifdef LT
  3271. LFPDUX A1, AO, INC2
  3272. LFPDUX A2, AO, INC2
  3273. addi AO, AO, -4 * SIZE
  3274. fxpmul f0, A1, f0
  3275. fxcsnmsub f1, A1, f0, f1
  3276. fxsmul f1, A2, f1
  3277. #endif
  /* RN: same order as LT, coefficients read from BO (BO restored) */
  3278. #ifdef RN
  3279. LFPDUX A1, BO, INC2
  3280. LFPDUX A2, BO, INC2
  3281. subi BO, BO, 4 * SIZE
  3282. fxpmul f0, A1, f0
  3283. fxcsnmsub f1, A1, f0, f1
  3284. fxsmul f1, A2, f1
  3285. #endif
  /* RT: scale f1, eliminate it from f0, scale f0 (BO restored) */
  3286. #ifdef RT
  3287. LFPDUX A2, BO, INC2
  3288. LFPDUX A1, BO, INC2
  3289. subi BO, BO, 4 * SIZE
  3290. fxsmul f1, A1, f1
  3291. fxcpnmsub f0, A1, f1, f0
  3292. fxpmul f0, A2, f0
  3293. #endif
  /* LN: step CO1/CO2 back by 2 * SIZE before the stores below */
  3294. #ifdef LN
  3295. subi CO1, CO1, 2 * SIZE
  3296. subi CO2, CO2, 2 * SIZE
  3297. #endif
  /*
   * LN/LT: pairs go back to BO (pointer restored); one half of each
   * register is stored through CO1, the other half through CO2.
   */
  3298. #if defined(LN) || defined(LT)
  3299. STFPDUX f0, BO, INC2
  3300. STFPDUX f1, BO, INC2
  3301. subi BO, BO, 4 * SIZE
  3302. STFDUX f0, CO1, INC
  3303. STFDUX f1, CO1, INC
  3304. STFSDUX f0, CO2, INC
  3305. STFSDUX f1, CO2, INC
  3306. #else
  /*
   * RN/RT: pairs go back to AO (pointer restored); both halves of f0
   * are stored through CO1 and both halves of f1 through CO2.
   */
  3307. STFPDUX f0, AO, INC2
  3308. STFPDUX f1, AO, INC2
  3309. subi AO, AO, 4 * SIZE
  3310. STFDUX f0, CO1, INC
  3311. STFSDUX f0, CO1, INC
  3312. STFDUX f1, CO2, INC
  3313. STFSDUX f1, CO2, INC
  3314. #endif
  /* LN: undo the advance made by the update-form stores above */
  3315. #ifdef LN
  3316. subi CO1, CO1, 2 * SIZE
  3317. subi CO2, CO2, 2 * SIZE
  3318. #endif
  /* RT: AORIG += K << (1 + BASE_SHIFT) */
  3319. #ifdef RT
  3320. slwi r0, K, 1 + BASE_SHIFT
  3321. add AORIG, AORIG, r0
  3322. #endif
  /* LT/RN: advance AO and BO past the remaining K - KK steps */
  3323. #if defined(LT) || defined(RN)
  3324. sub TEMP, K, KK
  3325. slwi r0, TEMP, 1 + BASE_SHIFT
  3326. slwi TEMP, TEMP, 1 + BASE_SHIFT
  3327. add AO, AO, r0
  3328. add BO, BO, TEMP
  3329. #endif
  /* KK moves by the tile height: +2 for LT, -2 for LN */
  3330. #ifdef LT
  3331. addi KK, KK, 2
  3332. #endif
  3333. #ifdef LN
  3334. subi KK, KK, 2
  3335. #endif
  /*
   * Reload f0 from the stack slot at SP + FZERO (presumably a zero
   * constant for the next tile's accumulators - TODO confirm).
   */
  3336. li r0, FZERO
  3337. lfpsx f0, SP, r0
  3338. .align 4
  /*
   * .L80: set-up for the single-row tile handled when (M & 1) is
   * non-zero; skipped (branch to .L89) otherwise.  Clears accumulators
   * f1-f3 from f0, positions AO/BO, loads CTR with the number of 8x
   * unrolled passes (trip count >> 3) and preloads four A pairs and
   * four B pairs for .L82.
   * With no full pass to run, control goes to the remainder code at
   * .L84; with exactly one, bdz jumps straight to the drain at .L83.
   */
  3339. .L80:
  3340. andi. I, M, 1
  3341. beq .L89
  3342. #if defined(LT) || defined(RN)
  /* LT/RN: B is read from its start; trip count is KK */
  3343. addi BO, B, - 2 * SIZE
  3344. fpmr f1, f0
  3345. fpmr f2, f0
  3346. fpmr f3, f0
  3347. srawi. r0, KK, 3
  3348. mtspr CTR, r0
  3349. ble .L84
  3350. #else
  3351. #ifdef LN
  /* LN: AORIG -= K << (0 + BASE_SHIFT) */
  3352. slwi r0, K, 0 + BASE_SHIFT
  3353. sub AORIG, AORIG, r0
  3354. #endif
  /*
   * LN/RT: start KK steps into both packed panels; trip count is
   * TEMP = K - KK.
   */
  3355. slwi r0 , KK, 0 + BASE_SHIFT
  3356. slwi TEMP, KK, 1 + BASE_SHIFT
  3357. add AO, AORIG, r0
  3358. add BO, B, TEMP
  3359. sub TEMP, K, KK
  /*
   * Apply the -2 * SIZE pre-update bias to the BO computed above.
   * This used to be "addi BO, B, ...", which discarded the KK offset
   * added two instructions earlier; the 8-row tile set-up earlier in
   * this file uses the BO-relative form.
   */
  3360. addi BO, BO, - 2 * SIZE
  3361. fpmr f1, f0
  3362. fpmr f2, f0
  3363. fpmr f3, f0
  3364. srawi. r0, TEMP, 3
  3365. mtspr CTR, r0
  3366. ble .L84
  3367. #endif
  /* preload four A pairs and four B pairs for the first pass */
  3368. LFPDUX B1, BO, INC2
  3369. LFPDUX A1, AO, INC2
  3370. LFPDUX A2, AO, INC2
  3371. LFPDUX B2, BO, INC2
  3372. LFPDUX A3, AO, INC2
  3373. LFPDUX A4, AO, INC2
  3374. LFPDUX B3, BO, INC2
  3375. LFPDUX B4, BO, INC2
  3376. bdz- .L83
  3377. .align 4
  /*
   * .L82: steady-state body of the 8x unrolled loop for the single-row
   * tile.  Here the A register is the scalar source: each A pair is
   * used twice, once through fxcpmadd and once through fxcsmadd, each
   * time against a different B pair.  Four accumulators (f0-f3) are
   * kept and summed later at .L88.  One pass consumes four A pairs and
   * eight B pairs.
   */
  3378. .L82:
  3379. fxcpmadd f0, A1, B1, f0
  3380. LFPDUX B1, BO, INC2
  3381. fxcsmadd f1, A1, B2, f1
  3382. LFPDUX B2, BO, INC2
  3383. LFPDUX A1, AO, INC2
  3384. fxcpmadd f2, A2, B3, f2
  3385. LFPDUX B3, BO, INC2
  3386. fxcsmadd f3, A2, B4, f3
  3387. LFPDUX B4, BO, INC2
  3388. LFPDUX A2, AO, INC2
  3389. fxcpmadd f0, A3, B1, f0
  3390. LFPDUX B1, BO, INC2
  3391. fxcsmadd f1, A3, B2, f1
  3392. LFPDUX B2, BO, INC2
  3393. LFPDUX A3, AO, INC2
  3394. fxcpmadd f2, A4, B3, f2
  3395. LFPDUX B3, BO, INC2
  3396. fxcsmadd f3, A4, B4, f3
  3397. LFPDUX B4, BO, INC2
  3398. LFPDUX A4, AO, INC2
  /* decrement CTR and repeat while it is non-zero */
  3399. bdnz+ .L82
  3400. .align 4
  /*
   * .L83: final pass of the single-row unrolled loop.  Same eight
   * multiply-adds as one .L82 pass; B1-B4 are reloaded once for the
   * second half, A1-A4 are not reloaded.
   */
  3401. .L83:
  3402. fxcpmadd f0, A1, B1, f0
  3403. LFPDUX B1, BO, INC2
  3404. fxcsmadd f1, A1, B2, f1
  3405. LFPDUX B2, BO, INC2
  3406. fxcpmadd f2, A2, B3, f2
  3407. LFPDUX B3, BO, INC2
  3408. fxcsmadd f3, A2, B4, f3
  3409. LFPDUX B4, BO, INC2
  3410. fxcpmadd f0, A3, B1, f0
  3411. fxcsmadd f1, A3, B2, f1
  3412. fxcpmadd f2, A4, B3, f2
  3413. fxcsmadd f3, A4, B4, f3
  3414. .align 4
  /*
   * .L84/.L86/.L87: remainder loop for the single-row tile.  CTR
   * receives the low three bits of the trip count (KK for LT/RN, TEMP
   * otherwise); when that is zero the section is skipped (branch to
   * .L88).  A is read with the non-update load LFDX at AO + INC2 and AO
   * is then advanced by INC, while B is still read as a pair with the
   * update-form LFPDUX.
   */
  3415. .L84:
  3416. #if defined(LT) || defined(RN)
  3417. andi. r0, KK, 7
  3418. mtspr CTR, r0
  3419. ble+ .L88
  3420. #else
  3421. andi. r0, TEMP, 7
  3422. mtspr CTR, r0
  3423. ble+ .L88
  3424. #endif
  /* preload one step */
  3425. LFDX A1, AO, INC2
  3426. LFPDUX B1, BO, INC2
  3427. add AO, AO, INC
  /* a single remaining step goes straight to the drain at .L87 */
  3428. bdz- .L87
  3429. .align 4
  /* .L86: one step per pass, accumulating into f0 only */
  3430. .L86:
  3431. fxcpmadd f0, A1, B1, f0
  3432. LFDX A1, AO, INC2
  3433. LFPDUX B1, BO, INC2
  3434. add AO, AO, INC
  3435. bdnz+ .L86
  3436. .align 4
  /* .L87: last step, arithmetic only */
  3437. .L87:
  3438. fxcpmadd f0, A1, B1, f0
  3439. .align 4
  /*
   * .L88: solve-and-store stage for the single-row tile.  Sums the four
   * partial accumulators into f0, then: optional AO/BO repositioning
   * (LN/RT), subtraction from the packed pair, variant-specific
   * substitution, stores to the packed buffer and to C via CO1/CO2,
   * pointer/KK bookkeeping.  Falls through to .L89.
   * All packed-buffer accesses here use the non-update forms
   * (LFPDX/STFPDX/LFD), so AO and BO are not moved by them.
   * NOTE(review): diagonal entries are applied with multiplies, so they
   * are presumably stored pre-inverted - confirm with the packing code.
   */
  3440. .L88:
  3441. fpadd f0, f0, f1
  3442. fpadd f2, f2, f3
  3443. fpadd f0, f0, f2
  /*
   * LN/RT: AO = AORIG + ((KK - n) << (0 + BASE_SHIFT)) and
   * BO = B + ((KK - n) << (1 + BASE_SHIFT)) - 2 * SIZE,
   * with n = 1 for LN and n = 2 for RT.
   */
  3444. #if defined(LN) || defined(RT)
  3445. #ifdef LN
  3446. subi r0, KK, 1
  3447. #else
  3448. subi r0, KK, 2
  3449. #endif
  3450. slwi TEMP, r0, 0 + BASE_SHIFT
  3451. slwi r0, r0, 1 + BASE_SHIFT
  3452. add AO, AORIG, TEMP
  3453. add BO, B, r0
  3454. addi BO, BO, - 2 * SIZE
  3455. #endif
  /* f0 = packed pair - f0; the pair comes from BO (LN/LT) or AO (RN/RT) */
  3456. #if defined(LN) || defined(LT)
  3457. LFPDX f16, BO, INC2
  3458. fpsub f0, f16, f0
  3459. #else
  3460. LFPDX f16, AO, INC2
  3461. fpsub f0, f16, f0
  3462. #endif
  /* LN and LT: a single scaling of f0 by the value loaded into A1 */
  3463. #ifdef LN
  3464. LFPDX A1, AO, INC2
  3465. fxpmul f0, A1, f0
  3466. #endif
  3467. #ifdef LT
  3468. LFPDX A1, AO, INC2
  3469. fxpmul f0, A1, f0
  3470. #endif
  /*
   * RN: scalar substitution.  Three coefficients are read at fixed
   * offsets from BO; f1 takes one half of f0 (fsmtp), f0 is scaled,
   * eliminated from f1, f1 is scaled, and the two results are merged
   * back into f0 (fsmfp).
   */
  3471. #ifdef RN
  3472. LFD A1, (2 + 0) * SIZE(BO)
  3473. LFD A2, (2 + 1) * SIZE(BO)
  3474. LFD A3, (2 + 3) * SIZE(BO)
  3475. fsmtp f1, f0
  3476. fmul f0, A1, f0
  3477. fnmsub f1, A2, f0, f1
  3478. fmul f1, A3, f1
  3479. fsmfp f0, f1
  3480. #endif
  /* RT: same scheme in the opposite order (f1 first, then f0) */
  3481. #ifdef RT
  3482. LFD A1, (2 + 3) * SIZE(BO)
  3483. LFD A2, (2 + 2) * SIZE(BO)
  3484. LFD A3, (2 + 0) * SIZE(BO)
  3485. fsmtp f1, f0
  3486. fmul f1, A1, f1
  3487. fnmsub f0, A2, f1, f0
  3488. fmul f0, A3, f0
  3489. fsmfp f0, f1
  3490. #endif
  /* LN: step CO1/CO2 back by 1 * SIZE before the stores below */
  3491. #ifdef LN
  3492. subi CO1, CO1, 1 * SIZE
  3493. subi CO2, CO2, 1 * SIZE
  3494. #endif
  /*
   * LN/LT: the pair goes back to BO; the two halves of f0 are stored
   * through CO1 (STFDUX) and CO2 (STFSDUX).
   * RN/RT: the pair goes back to AO; f0 is stored through CO1 and f1
   * through CO2, both with STFDUX.
   */
  3495. #if defined(LN) || defined(LT)
  3496. STFPDX f0, BO, INC2
  3497. STFDUX f0, CO1, INC
  3498. STFSDUX f0, CO2, INC
  3499. #else
  3500. STFPDX f0, AO, INC2
  3501. STFDUX f0, CO1, INC
  3502. STFDUX f1, CO2, INC
  3503. #endif
  /* LN: undo the advance made by the update-form stores above */
  3504. #ifdef LN
  3505. subi CO1, CO1, 1 * SIZE
  3506. subi CO2, CO2, 1 * SIZE
  3507. #endif
  /* RT: AORIG += K << (0 + BASE_SHIFT) */
  3508. #ifdef RT
  3509. slwi r0, K, 0 + BASE_SHIFT
  3510. add AORIG, AORIG, r0
  3511. #endif
  /* LT/RN: advance AO and BO past the remaining K - KK steps */
  3512. #if defined(LT) || defined(RN)
  3513. sub TEMP, K, KK
  3514. slwi r0, TEMP, 0 + BASE_SHIFT
  3515. slwi TEMP, TEMP, 1 + BASE_SHIFT
  3516. add AO, AO, r0
  3517. add BO, BO, TEMP
  3518. #endif
  /* KK moves by the tile height: +1 for LT, -1 for LN */
  3519. #ifdef LT
  3520. addi KK, KK, 1
  3521. #endif
  3522. #ifdef LN
  3523. subi KK, KK, 1
  3524. #endif
  3525. .align 4
  /*
   * .L89: end of the two-column pass.  Advances B to the next column
   * panel and moves KK by two for the right-side variants.
   */
  3526. .L89:
  /* LN: B += K << (1 + BASE_SHIFT) */
  3527. #ifdef LN
  3528. slwi r0, K, 1 + BASE_SHIFT
  3529. add B, B, r0
  3530. #endif
  /* LT/RN: B continues where BO stopped (undoing the -2 * SIZE bias) */
  3531. #if defined(LT) || defined(RN)
  3532. addi B, BO, 2 * SIZE
  3533. #endif
  3534. #ifdef RN
  3535. addi KK, KK, 2
  3536. #endif
  3537. #ifdef RT
  3538. subi KK, KK, 2
  3539. #endif
  3540. .align 4
  /*
   * .L90: entry to the single-column pass, taken only when (N & 1) is
   * non-zero; otherwise control goes to .L999.  Sets up the column
   * pointer CO1, the KK counter, the A base pointer and the row-block
   * counter I = M >> 3; with no 8-row block, control goes to .L100.
   */
  3541. .L90:
  3542. andi. J, N, 1
  3543. beq .L999
  /* RT: step B back by K << (0 + BASE_SHIFT) and C back by LDC */
  3544. #ifdef RT
  3545. slwi r0, K, 0 + BASE_SHIFT
  3546. sub B, B, r0
  3547. sub C, C, LDC
  3548. #endif
  3549. mr CO1, C
  /* initial KK: M + OFFSET for LN, OFFSET for LT */
  3550. #ifdef LN
  3551. add KK, M, OFFSET
  3552. #endif
  3553. #ifdef LT
  3554. mr KK, OFFSET
  3555. #endif
  /* A base, biased by -2 * SIZE for the update-form loads */
  3556. #if defined(LN) || defined(RT)
  3557. addi AORIG, A, -2 * SIZE
  3558. #else
  3559. addi AO, A, -2 * SIZE
  3560. #endif
  /* all variants except RT advance C by LDC */
  3561. #ifndef RT
  3562. add C, CO1, LDC
  3563. #endif
  /*
   * f0 is loaded from the stack slot at SP + FZERO (presumably a zero
   * constant used to clear accumulators - TODO confirm).
   */
  3564. li r0, FZERO
  3565. lfpsx f0, SP, r0
  3566. srawi. I, M, 3
  3567. ble .L100
  3568. .align 4
  3569. .L91:
  3570. #if defined(LT) || defined(RN)
  3571. fpmr f1, f0
  3572. addi BO, B, - 2 * SIZE
  3573. fpmr f2, f0
  3574. fpmr f3, f0
  3575. srawi. r0, KK, 2
  3576. mtspr CTR, r0
  3577. ble .L94
  3578. #else
  3579. #ifdef LN
  3580. slwi r0, K, 3 + BASE_SHIFT
  3581. sub AORIG, AORIG, r0
  3582. #endif
  3583. slwi r0 , KK, 3 + BASE_SHIFT
  3584. slwi TEMP, KK, 0 + BASE_SHIFT
  3585. add AO, AORIG, r0
  3586. add BO, B, TEMP
  3587. sub TEMP, K, KK
  3588. fpmr f1, f0
  3589. addi BO, B, - 2 * SIZE
  3590. fpmr f2, f0
  3591. fpmr f3, f0
  3592. srawi. r0, TEMP, 2
  3593. mtspr CTR, r0
  3594. ble .L94
  3595. #endif
  3596. LFPDUX B1, BO, INC2
  3597. LFPDUX A1, AO, INC2
  3598. LFPDUX A2, AO, INC2
  3599. LFPDUX A3, AO, INC2
  3600. LFPDUX A4, AO, INC2
  3601. LFPDUX B2, BO, INC2
  3602. LFPDUX A5, AO, INC2
  3603. LFPDUX A6, AO, INC2
  3604. LFPDUX A7, AO, INC2
  3605. LFPDUX A8, AO, INC2
  3606. bdz- .L93
  3607. .align 4
  /* .L92: steady-state loop of the 8-row tile, 4 k steps per iteration.
   * Each multiply-add consumes a previously loaded A pair and is followed
   * by the load that refills that register for a later step.  B1 and B2
   * each feed two k steps (fxcpmadd, then fxcsmadd - per the Double Hummer
   * ISA these use the primary and secondary half of the B register).
   * Loops while the decremented CTR is non-zero.
   */
  3608. .L92:
  3609. fxcpmadd f0, B1, A1, f0
  3610. LFPDUX A1, AO, INC2
  3611. fxcpmadd f1, B1, A2, f1
  3612. LFPDUX A2, AO, INC2
  3613. fxcpmadd f2, B1, A3, f2
  3614. LFPDUX A3, AO, INC2
  3615. fxcpmadd f3, B1, A4, f3
  3616. LFPDUX A4, AO, INC2
  3617. fxcsmadd f0, B1, A5, f0
  3618. LFPDUX A5, AO, INC2
  3619. fxcsmadd f1, B1, A6, f1
  3620. LFPDUX A6, AO, INC2
  3621. fxcsmadd f2, B1, A7, f2
  3622. LFPDUX A7, AO, INC2
  3623. fxcsmadd f3, B1, A8, f3
  3624. LFPDUX A8, AO, INC2
  3625. LFPDUX B1, BO, INC2
  3626. fxcpmadd f0, B2, A1, f0
  3627. LFPDUX A1, AO, INC2
  3628. fxcpmadd f1, B2, A2, f1
  3629. LFPDUX A2, AO, INC2
  3630. fxcpmadd f2, B2, A3, f2
  3631. LFPDUX A3, AO, INC2
  3632. fxcpmadd f3, B2, A4, f3
  3633. LFPDUX A4, AO, INC2
  3634. fxcsmadd f0, B2, A5, f0
  3635. LFPDUX A5, AO, INC2
  3636. fxcsmadd f1, B2, A6, f1
  3637. LFPDUX A6, AO, INC2
  3638. fxcsmadd f2, B2, A7, f2
  3639. LFPDUX A7, AO, INC2
  3640. fxcsmadd f3, B2, A8, f3
  3641. LFPDUX A8, AO, INC2
  3642. LFPDUX B2, BO, INC2
  3643. bdnz+ .L92
  3644. .align 4
  /* .L93: drain of the 8-row pipelined loop (last 4 k steps).
   * The first half still reloads A1..A8 for the second half; the second
   * half issues no loads, so AO/BO stop right after the consumed data.
   */
  3645. .L93:
  3646. fxcpmadd f0, B1, A1, f0
  3647. LFPDUX A1, AO, INC2
  3648. fxcpmadd f1, B1, A2, f1
  3649. LFPDUX A2, AO, INC2
  3650. fxcpmadd f2, B1, A3, f2
  3651. LFPDUX A3, AO, INC2
  3652. fxcpmadd f3, B1, A4, f3
  3653. LFPDUX A4, AO, INC2
  3654. fxcsmadd f0, B1, A5, f0
  3655. LFPDUX A5, AO, INC2
  3656. fxcsmadd f1, B1, A6, f1
  3657. LFPDUX A6, AO, INC2
  3658. fxcsmadd f2, B1, A7, f2
  3659. LFPDUX A7, AO, INC2
  3660. fxcsmadd f3, B1, A8, f3
  3661. LFPDUX A8, AO, INC2
  3662. fxcpmadd f0, B2, A1, f0
  3663. fxcpmadd f1, B2, A2, f1
  3664. fxcpmadd f2, B2, A3, f2
  3665. fxcpmadd f3, B2, A4, f3
  3666. fxcsmadd f0, B2, A5, f0
  3667. fxcsmadd f1, B2, A6, f1
  3668. fxcsmadd f2, B2, A7, f2
  3669. fxcsmadd f3, B2, A8, f3
  3670. .align 4
  /* .L94/.L96/.L97: k remainder of the 8-row tile (count & 3 steps).
   * The count comes from KK (LT/RN) or TEMP (LN/RT); a zero remainder
   * goes straight to .L98.  Each step uses one scalar B value (LFDX is
   * non-updating, so BO is advanced explicitly by INC) and four A pairs.
   */
  3671. .L94:
  3672. #if defined(LT) || defined(RN)
  3673. andi. r0, KK, 3
  3674. mtspr CTR, r0
  3675. ble+ .L98
  3676. #else
  3677. andi. r0, TEMP, 3
  3678. mtspr CTR, r0
  3679. ble+ .L98
  3680. #endif
  3681. LFDX B1, BO, INC2
  3682. LFPDUX A1, AO, INC2
  3683. LFPDUX A2, AO, INC2
  3684. LFPDUX A3, AO, INC2
  3685. LFPDUX A4, AO, INC2
  3686. add BO, BO, INC
  3687. bdz- .L97
  3688. .align 4
  /* .L96: one k step per iteration, operands for the next step are
   * loaded while the current one is accumulated. */
  3689. .L96:
  3690. fxcpmadd f0, B1, A1, f0
  3691. LFPDUX A1, AO, INC2
  3692. fxcpmadd f1, B1, A2, f1
  3693. LFPDUX A2, AO, INC2
  3694. fxcpmadd f2, B1, A3, f2
  3695. LFPDUX A3, AO, INC2
  3696. fxcpmadd f3, B1, A4, f3
  3697. LFDX B1, BO, INC2
  3698. LFPDUX A4, AO, INC2
  3699. add BO, BO, INC
  3700. bdnz+ .L96
  3701. .align 4
  /* .L97: final remainder step, no further loads. */
  3702. .L97:
  3703. fxcpmadd f0, B1, A1, f0
  3704. fxcpmadd f1, B1, A2, f1
  3705. fxcpmadd f2, B1, A3, f2
  3706. fxcpmadd f3, B1, A4, f3
  3707. .align 4
  /* .L98: solve and store for the 8-row x 1-column tile.
   * f0..f3 hold the accumulated products.  The code
   *   1. repositions AO/BO on the diagonal block (LN/RT only),
   *   2. forms rhs - accumulated (rhs read from BO for LN/LT, AO for RN/RT),
   *   3. applies the triangular solve (LN/LT) or a single scale (RN/RT),
   *   4. writes the result back to the packed buffer and to C via CO1,
   *   5. advances pointers/KK and loops to .L91 while I > 0.
   */
  3708. .L98:
  3709. #if defined(LN) || defined(RT)
  3710. #ifdef LN
  3711. subi r0, KK, 8
  3712. #else
  3713. subi r0, KK, 1
  3714. #endif
  /* AO = AORIG + r0 * 8 elements, BO = B + r0 elements - 2 * SIZE bias. */
  3715. slwi TEMP, r0, 3 + BASE_SHIFT
  3716. slwi r0, r0, 0 + BASE_SHIFT
  3717. add AO, AORIG, TEMP
  3718. add BO, B, r0
  3719. addi BO, BO, - 2 * SIZE
  3720. #endif
  /* Load the 8 right-hand-side values as four pairs, rewind the pointer
   * by the 8 * SIZE the update-form loads advanced it, and subtract the
   * accumulated products. */
  3721. #if defined(LN) || defined(LT)
  3722. LFPDUX f16, BO, INC2
  3723. LFPDUX f17, BO, INC2
  3724. LFPDUX f18, BO, INC2
  3725. LFPDUX f19, BO, INC2
  3726. subi BO, BO, 8 * SIZE
  3727. fpsub f0, f16, f0
  3728. fpsub f1, f17, f1
  3729. fpsub f2, f18, f2
  3730. fpsub f3, f19, f3
  3731. #else
  3732. LFPDUX f16, AO, INC2
  3733. LFPDUX f17, AO, INC2
  3734. LFPDUX f18, AO, INC2
  3735. LFPDUX f19, AO, INC2
  3736. subi AO, AO, 8 * SIZE
  3737. fpsub f0, f16, f0
  3738. fpsub f1, f17, f1
  3739. fpsub f2, f18, f2
  3740. fpsub f3, f19, f3
  3741. #endif
  3742. #ifdef LN
  /* LN: backward substitution over the 8x8 block, last row first.
   * fsmtp moves the secondary halves into f4..f7 so scalar fmul/fnmsub
   * can address all eight values; the order is f0,f4,f1,f5,f2,f6,f3,f7.
   * The "(2 + n)" displacements offset the -2 * SIZE pointer bias.
   * Diagonal entries are multiplied, not divided (presumably the packed
   * block stores reciprocals of the diagonal - confirm with the packer). */
  3743. fsmtp f4, f0
  3744. fsmtp f5, f1
  3745. fsmtp f6, f2
  3746. fsmtp f7, f3
  3747. LFD A1, (2 + 63) * SIZE(AO)
  3748. LFD A2, (2 + 62) * SIZE(AO)
  3749. LFD A3, (2 + 61) * SIZE(AO)
  3750. LFD A4, (2 + 60) * SIZE(AO)
  3751. LFD A5, (2 + 59) * SIZE(AO)
  3752. LFD A6, (2 + 58) * SIZE(AO)
  3753. LFD A7, (2 + 57) * SIZE(AO)
  3754. LFD A8, (2 + 56) * SIZE(AO)
  3755. fmul f7, A1, f7
  3756. fnmsub f3, A2, f7, f3
  3757. fnmsub f6, A3, f7, f6
  3758. fnmsub f2, A4, f7, f2
  3759. fnmsub f5, A5, f7, f5
  3760. fnmsub f1, A6, f7, f1
  3761. fnmsub f4, A7, f7, f4
  3762. fnmsub f0, A8, f7, f0
  3763. LFD A1, (2 + 54) * SIZE(AO)
  3764. LFD A2, (2 + 53) * SIZE(AO)
  3765. LFD A3, (2 + 52) * SIZE(AO)
  3766. LFD A4, (2 + 51) * SIZE(AO)
  3767. LFD A5, (2 + 50) * SIZE(AO)
  3768. LFD A6, (2 + 49) * SIZE(AO)
  3769. LFD A7, (2 + 48) * SIZE(AO)
  3770. fmul f3, A1, f3
  3771. fnmsub f6, A2, f3, f6
  3772. fnmsub f2, A3, f3, f2
  3773. fnmsub f5, A4, f3, f5
  3774. fnmsub f1, A5, f3, f1
  3775. fnmsub f4, A6, f3, f4
  3776. fnmsub f0, A7, f3, f0
  3777. LFD A1, (2 + 45) * SIZE(AO)
  3778. LFD A2, (2 + 44) * SIZE(AO)
  3779. LFD A3, (2 + 43) * SIZE(AO)
  3780. LFD A4, (2 + 42) * SIZE(AO)
  3781. LFD A5, (2 + 41) * SIZE(AO)
  3782. LFD A6, (2 + 40) * SIZE(AO)
  3783. fmul f6, A1, f6
  3784. fnmsub f2, A2, f6, f2
  3785. fnmsub f5, A3, f6, f5
  3786. fnmsub f1, A4, f6, f1
  3787. fnmsub f4, A5, f6, f4
  3788. fnmsub f0, A6, f6, f0
  3789. LFD A1, (2 + 36) * SIZE(AO)
  3790. LFD A2, (2 + 35) * SIZE(AO)
  3791. LFD A3, (2 + 34) * SIZE(AO)
  3792. LFD A4, (2 + 33) * SIZE(AO)
  3793. LFD A5, (2 + 32) * SIZE(AO)
  3794. fmul f2, A1, f2
  3795. fnmsub f5, A2, f2, f5
  3796. fnmsub f1, A3, f2, f1
  3797. fnmsub f4, A4, f2, f4
  3798. fnmsub f0, A5, f2, f0
  3799. LFD A1, (2 + 27) * SIZE(AO)
  3800. LFD A2, (2 + 26) * SIZE(AO)
  3801. LFD A3, (2 + 25) * SIZE(AO)
  3802. LFD A4, (2 + 24) * SIZE(AO)
  3803. fmul f5, A1, f5
  3804. fnmsub f1, A2, f5, f1
  3805. fnmsub f4, A3, f5, f4
  3806. fnmsub f0, A4, f5, f0
  3807. LFD A1, (2 + 18) * SIZE(AO)
  3808. LFD A2, (2 + 17) * SIZE(AO)
  3809. LFD A3, (2 + 16) * SIZE(AO)
  3810. fmul f1, A1, f1
  3811. fnmsub f4, A2, f1, f4
  3812. fnmsub f0, A3, f1, f0
  3813. LFD A1, (2 + 9) * SIZE(AO)
  3814. LFD A2, (2 + 8) * SIZE(AO)
  3815. fmul f4, A1, f4
  3816. fnmsub f0, A2, f4, f0
  3817. LFD A1, (2 + 0) * SIZE(AO)
  3818. fmul f0, A1, f0
  /* Re-pack the scalar results into pairs f0..f3. */
  3819. fsmfp f0, f4
  3820. fsmfp f1, f5
  3821. fsmfp f2, f6
  3822. fsmfp f3, f7
  3823. #endif
  3824. #ifdef LT
  /* LT: forward substitution over the 8x8 block, first row first, using
   * the same f0,f4,f1,f5,f2,f6,f3,f7 ordering as above.  Each stage
   * scales the current unknown by its diagonal entry (stride 9 in the
   * 8x8 block) and eliminates it from the remaining ones. */
  3825. fsmtp f4, f0
  3826. fsmtp f5, f1
  3827. fsmtp f6, f2
  3828. fsmtp f7, f3
  3829. LFD A1, (2 + 0) * SIZE(AO)
  3830. LFD A2, (2 + 1) * SIZE(AO)
  3831. LFD A3, (2 + 2) * SIZE(AO)
  3832. LFD A4, (2 + 3) * SIZE(AO)
  3833. LFD A5, (2 + 4) * SIZE(AO)
  3834. LFD A6, (2 + 5) * SIZE(AO)
  3835. LFD A7, (2 + 6) * SIZE(AO)
  3836. LFD A8, (2 + 7) * SIZE(AO)
  3837. fmul f0, A1, f0
  3838. fnmsub f4, A2, f0, f4
  3839. fnmsub f1, A3, f0, f1
  3840. fnmsub f5, A4, f0, f5
  3841. fnmsub f2, A5, f0, f2
  3842. fnmsub f6, A6, f0, f6
  3843. fnmsub f3, A7, f0, f3
  3844. fnmsub f7, A8, f0, f7
  3845. LFD A1, (2 + 9) * SIZE(AO)
  3846. LFD A2, (2 + 10) * SIZE(AO)
  3847. LFD A3, (2 + 11) * SIZE(AO)
  3848. LFD A4, (2 + 12) * SIZE(AO)
  3849. LFD A5, (2 + 13) * SIZE(AO)
  3850. LFD A6, (2 + 14) * SIZE(AO)
  3851. LFD A7, (2 + 15) * SIZE(AO)
  3852. fmul f4, A1, f4
  3853. fnmsub f1, A2, f4, f1
  3854. fnmsub f5, A3, f4, f5
  3855. fnmsub f2, A4, f4, f2
  3856. fnmsub f6, A5, f4, f6
  3857. fnmsub f3, A6, f4, f3
  3858. fnmsub f7, A7, f4, f7
  3859. LFD A1, (2 + 18) * SIZE(AO)
  3860. LFD A2, (2 + 19) * SIZE(AO)
  3861. LFD A3, (2 + 20) * SIZE(AO)
  3862. LFD A4, (2 + 21) * SIZE(AO)
  3863. LFD A5, (2 + 22) * SIZE(AO)
  3864. LFD A6, (2 + 23) * SIZE(AO)
  3865. fmul f1, A1, f1
  3866. fnmsub f5, A2, f1, f5
  3867. fnmsub f2, A3, f1, f2
  3868. fnmsub f6, A4, f1, f6
  3869. fnmsub f3, A5, f1, f3
  3870. fnmsub f7, A6, f1, f7
  3871. LFD A1, (2 + 27) * SIZE(AO)
  3872. LFD A2, (2 + 28) * SIZE(AO)
  3873. LFD A3, (2 + 29) * SIZE(AO)
  3874. LFD A4, (2 + 30) * SIZE(AO)
  3875. LFD A5, (2 + 31) * SIZE(AO)
  3876. fmul f5, A1, f5
  3877. fnmsub f2, A2, f5, f2
  3878. fnmsub f6, A3, f5, f6
  3879. fnmsub f3, A4, f5, f3
  3880. fnmsub f7, A5, f5, f7
  3881. LFD A1, (2 + 36) * SIZE(AO)
  3882. LFD A2, (2 + 37) * SIZE(AO)
  3883. LFD A3, (2 + 38) * SIZE(AO)
  3884. LFD A4, (2 + 39) * SIZE(AO)
  3885. fmul f2, A1, f2
  3886. fnmsub f6, A2, f2, f6
  3887. fnmsub f3, A3, f2, f3
  3888. fnmsub f7, A4, f2, f7
  3889. LFD A1, (2 + 45) * SIZE(AO)
  3890. LFD A2, (2 + 46) * SIZE(AO)
  3891. LFD A3, (2 + 47) * SIZE(AO)
  3892. fmul f6, A1, f6
  3893. fnmsub f3, A2, f6, f3
  3894. fnmsub f7, A3, f6, f7
  3895. LFD A1, (2 + 54) * SIZE(AO)
  3896. LFD A2, (2 + 55) * SIZE(AO)
  3897. fmul f3, A1, f3
  3898. fnmsub f7, A2, f3, f7
  3899. LFD A1, (2 + 63) * SIZE(AO)
  3900. fmul f7, A1, f7
  3901. fsmfp f0, f4
  3902. fsmfp f1, f5
  3903. fsmfp f2, f6
  3904. fsmfp f3, f7
  3905. #endif
  /* RN/RT: with a single column the solve is one scale by the value
   * loaded from BO + INC2. */
  3906. #ifdef RN
  3907. LFPDX A1, BO, INC2
  3908. fxpmul f0, A1, f0
  3909. fxpmul f1, A1, f1
  3910. fxpmul f2, A1, f2
  3911. fxpmul f3, A1, f3
  3912. #endif
  3913. #ifdef RT
  3914. LFPDX A1, BO, INC2
  3915. fxpmul f0, A1, f0
  3916. fxpmul f1, A1, f1
  3917. fxpmul f2, A1, f2
  3918. fxpmul f3, A1, f3
  3919. #endif
  /* LN walks C backwards: step CO1 back 8 elements before the stores and
   * again after them (the stores themselves advance CO1 by 8 * INC). */
  3920. #ifdef LN
  3921. subi CO1, CO1, 8 * SIZE
  3922. #endif
  /* Write the solved values back to the packed buffer (BO or AO, then
   * rewound) and to C, one element at a time: STFDUX stores the primary
   * half, STFSDUX the secondary half. */
  3923. #if defined(LN) || defined(LT)
  3924. STFPDUX f0, BO, INC2
  3925. STFPDUX f1, BO, INC2
  3926. STFPDUX f2, BO, INC2
  3927. STFPDUX f3, BO, INC2
  3928. subi BO, BO, 8 * SIZE
  3929. STFDUX f0, CO1, INC
  3930. STFSDUX f0, CO1, INC
  3931. STFDUX f1, CO1, INC
  3932. STFSDUX f1, CO1, INC
  3933. STFDUX f2, CO1, INC
  3934. STFSDUX f2, CO1, INC
  3935. STFDUX f3, CO1, INC
  3936. STFSDUX f3, CO1, INC
  3937. #else
  3938. STFPDUX f0, AO, INC2
  3939. STFPDUX f1, AO, INC2
  3940. STFPDUX f2, AO, INC2
  3941. STFPDUX f3, AO, INC2
  3942. subi AO, AO, 8 * SIZE
  3943. STFDUX f0, CO1, INC
  3944. STFSDUX f0, CO1, INC
  3945. STFDUX f1, CO1, INC
  3946. STFSDUX f1, CO1, INC
  3947. STFDUX f2, CO1, INC
  3948. STFSDUX f2, CO1, INC
  3949. STFDUX f3, CO1, INC
  3950. STFSDUX f3, CO1, INC
  3951. #endif
  3952. #ifdef LN
  3953. subi CO1, CO1, 8 * SIZE
  3954. #endif
  3955. #ifdef RT
  3956. slwi r0, K, 3 + BASE_SHIFT
  3957. add AORIG, AORIG, r0
  3958. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile so AO points at
   * the next tile and BO at the end of the B column. */
  3959. #if defined(LT) || defined(RN)
  3960. sub TEMP, K, KK
  3961. slwi r0, TEMP, 3 + BASE_SHIFT
  3962. slwi TEMP, TEMP, 0 + BASE_SHIFT
  3963. add AO, AO, r0
  3964. add BO, BO, TEMP
  3965. #endif
  3966. #ifdef LT
  3967. addi KK, KK, 8
  3968. #endif
  3969. #ifdef LN
  3970. subi KK, KK, 8
  3971. #endif
  /* Next 8-row tile: decrement I, reload f0 from SP + FZERO, loop. */
  3972. addic. I, I, -1
  3973. li r0, FZERO
  3974. lfpsx f0, SP, r0
  3975. bgt+ .L91
  3976. .align 4
  /* .L100: 4-row x 1-column tile, taken when M & 4 is set (else .L110).
   * Clears f1..f3 from f0, positions AO/BO, loads CTR with the number of
   * 8-deep k iterations and preloads the first operands.
   *   LT/RN : k count is KK, BO starts at B - 2 * SIZE.
   *   LN/RT : k count is TEMP = K - KK; AO and BO are both advanced by KK
   *           elements of their panel so they start at the same k index.
   */
  3977. .L100:
  3978. andi. I, M, 4
  3979. beq .L110
  3980. #if defined(LT) || defined(RN)
  3981. addi BO, B, - 2 * SIZE
  3982. fpmr f1, f0
  3983. fpmr f2, f0
  3984. fpmr f3, f0
  3985. srawi. r0, KK, 3
  3986. mtspr CTR, r0
  3987. ble .L104
  3988. #else
  3989. #ifdef LN
  3990. slwi r0, K, 2 + BASE_SHIFT
  3991. sub AORIG, AORIG, r0
  3992. #endif
  3993. slwi r0 , KK, 2 + BASE_SHIFT
  3994. slwi TEMP, KK, 0 + BASE_SHIFT
  3995. add AO, AORIG, r0
  3996. add BO, B, TEMP
  3997. sub TEMP, K, KK
  /* Apply the -2 * SIZE update-form bias to the KK-offset pointer computed
   * above (BO = B + KK * SIZE - 2 * SIZE); basing it on B would drop the
   * KK offset and desynchronise BO from AO. */
  3998. addi BO, BO, - 2 * SIZE
  3999. fpmr f1, f0
  4000. fpmr f2, f0
  4001. fpmr f3, f0
  4002. srawi. r0, TEMP, 3
  4003. mtspr CTR, r0
  4004. ble .L104
  4005. #endif
  /* Preload four B pairs and eight A pairs for the pipelined loop. */
  4006. LFPDUX B1, BO, INC2
  4007. LFPDUX A1, AO, INC2
  4008. LFPDUX A2, AO, INC2
  4009. LFPDUX A3, AO, INC2
  4010. LFPDUX A4, AO, INC2
  4011. LFPDUX B2, BO, INC2
  4012. LFPDUX A5, AO, INC2
  4013. LFPDUX A6, AO, INC2
  4014. LFPDUX A7, AO, INC2
  4015. LFPDUX A8, AO, INC2
  4016. LFPDUX B3, BO, INC2
  4017. LFPDUX B4, BO, INC2
  4018. bdz- .L103
  4019. .align 4
  /* .L102: steady-state loop of the 4-row tile, 8 k steps per iteration.
   * Even k steps accumulate into f0/f1 (fxcpmadd) and odd k steps into
   * f2/f3 (fxcsmadd); the two partial sums are combined at .L108.
   * Every consumed register is reloaded for a later step.
   */
  4020. .L102:
  4021. fxcpmadd f0, B1, A1, f0
  4022. LFPDUX A1, AO, INC2
  4023. fxcpmadd f1, B1, A2, f1
  4024. LFPDUX A2, AO, INC2
  4025. fxcsmadd f2, B1, A3, f2
  4026. LFPDUX A3, AO, INC2
  4027. fxcsmadd f3, B1, A4, f3
  4028. LFPDUX A4, AO, INC2
  4029. LFPDUX B1, BO, INC2
  4030. fxcpmadd f0, B2, A5, f0
  4031. LFPDUX A5, AO, INC2
  4032. fxcpmadd f1, B2, A6, f1
  4033. LFPDUX A6, AO, INC2
  4034. fxcsmadd f2, B2, A7, f2
  4035. LFPDUX A7, AO, INC2
  4036. fxcsmadd f3, B2, A8, f3
  4037. LFPDUX A8, AO, INC2
  4038. LFPDUX B2, BO, INC2
  4039. fxcpmadd f0, B3, A1, f0
  4040. LFPDUX A1, AO, INC2
  4041. fxcpmadd f1, B3, A2, f1
  4042. LFPDUX A2, AO, INC2
  4043. fxcsmadd f2, B3, A3, f2
  4044. LFPDUX A3, AO, INC2
  4045. fxcsmadd f3, B3, A4, f3
  4046. LFPDUX A4, AO, INC2
  4047. LFPDUX B3, BO, INC2
  4048. fxcpmadd f0, B4, A5, f0
  4049. LFPDUX A5, AO, INC2
  4050. fxcpmadd f1, B4, A6, f1
  4051. LFPDUX A6, AO, INC2
  4052. fxcsmadd f2, B4, A7, f2
  4053. LFPDUX A7, AO, INC2
  4054. fxcsmadd f3, B4, A8, f3
  4055. LFPDUX A8, AO, INC2
  4056. LFPDUX B4, BO, INC2
  4057. bdnz+ .L102
  4058. .align 4
  /* .L103: drain of the 4-row pipelined loop (last 8 k steps).
   * The first half reloads A1..A8 for the second half; B3/B4 were already
   * loaded, and the second half issues no loads.
   */
  4059. .L103:
  4060. fxcpmadd f0, B1, A1, f0
  4061. LFPDUX A1, AO, INC2
  4062. fxcpmadd f1, B1, A2, f1
  4063. LFPDUX A2, AO, INC2
  4064. fxcsmadd f2, B1, A3, f2
  4065. LFPDUX A3, AO, INC2
  4066. fxcsmadd f3, B1, A4, f3
  4067. LFPDUX A4, AO, INC2
  4068. fxcpmadd f0, B2, A5, f0
  4069. LFPDUX A5, AO, INC2
  4070. fxcpmadd f1, B2, A6, f1
  4071. LFPDUX A6, AO, INC2
  4072. fxcsmadd f2, B2, A7, f2
  4073. LFPDUX A7, AO, INC2
  4074. fxcsmadd f3, B2, A8, f3
  4075. LFPDUX A8, AO, INC2
  4076. fxcpmadd f0, B3, A1, f0
  4077. fxcpmadd f1, B3, A2, f1
  4078. fxcsmadd f2, B3, A3, f2
  4079. fxcsmadd f3, B3, A4, f3
  4080. fxcpmadd f0, B4, A5, f0
  4081. fxcpmadd f1, B4, A6, f1
  4082. fxcsmadd f2, B4, A7, f2
  4083. fxcsmadd f3, B4, A8, f3
  4084. .align 4
  /* .L104/.L106/.L107: k remainder of the 4-row tile (count & 7 steps).
   * The count comes from KK (LT/RN) or TEMP (LN/RT); a zero remainder
   * goes straight to .L108.  Each step uses one scalar B value (BO is
   * advanced explicitly by INC) and two A pairs, accumulating in f0/f1.
   */
  4085. .L104:
  4086. #if defined(LT) || defined(RN)
  4087. andi. r0, KK, 7
  4088. mtspr CTR, r0
  4089. ble+ .L108
  4090. #else
  4091. andi. r0, TEMP, 7
  4092. mtspr CTR, r0
  4093. ble+ .L108
  4094. #endif
  4095. LFPDUX A1, AO, INC2
  4096. LFDX B1, BO, INC2
  4097. LFPDUX A2, AO, INC2
  4098. add BO, BO, INC
  4099. bdz- .L107
  4100. .align 4
  /* .L106: one k step per iteration with the next operands preloaded. */
  4101. .L106:
  4102. fxcpmadd f0, B1, A1, f0
  4103. LFPDUX A1, AO, INC2
  4104. fxcpmadd f1, B1, A2, f1
  4105. LFDX B1, BO, INC2
  4106. LFPDUX A2, AO, INC2
  4107. add BO, BO, INC
  4108. bdnz+ .L106
  4109. .align 4
  /* .L107: final remainder step, no further loads. */
  4110. .L107:
  4111. fxcpmadd f0, B1, A1, f0
  4112. fxcpmadd f1, B1, A2, f1
  4113. .align 4
  /* .L108: solve and store for the 4-row x 1-column tile.
   * Combines the even/odd-k partial sums (f0 += f2, f1 += f3), forms
   * rhs - accumulated, applies the 4x4 triangular solve (LN/LT) or a
   * single scale (RN/RT), writes the result to the packed buffer and to
   * C, then advances pointers/KK and reloads f0 from SP + FZERO.
   */
  4114. .L108:
  4115. fpadd f0, f0, f2
  4116. fpadd f1, f1, f3
  4117. #if defined(LN) || defined(RT)
  4118. #ifdef LN
  4119. subi r0, KK, 4
  4120. #else
  4121. subi r0, KK, 1
  4122. #endif
  /* AO = AORIG + r0 * 4 elements, BO = B + r0 elements - 2 * SIZE bias. */
  4123. slwi TEMP, r0, 2 + BASE_SHIFT
  4124. slwi r0, r0, 0 + BASE_SHIFT
  4125. add AO, AORIG, TEMP
  4126. add BO, B, r0
  4127. addi BO, BO, - 2 * SIZE
  4128. #endif
  /* Load the 4 right-hand-side values, rewind the pointer, subtract. */
  4129. #if defined(LN) || defined(LT)
  4130. LFPDUX f16, BO, INC2
  4131. LFPDUX f17, BO, INC2
  4132. subi BO, BO, 4 * SIZE
  4133. fpsub f0, f16, f0
  4134. fpsub f1, f17, f1
  4135. #else
  4136. LFPDUX f16, AO, INC2
  4137. LFPDUX f17, AO, INC2
  4138. subi AO, AO, 4 * SIZE
  4139. fpsub f0, f16, f0
  4140. fpsub f1, f17, f1
  4141. #endif
  4142. #ifdef LN
  /* LN: backward substitution over the 4x4 block, last row first.
   * Value order is f0,f4,f1,f5; diagonal entries sit at stride 5 and are
   * multiplied (presumably stored as reciprocals - confirm). */
  4143. fsmtp f4, f0
  4144. fsmtp f5, f1
  4145. LFD A1, (2 + 15) * SIZE(AO)
  4146. LFD A2, (2 + 14) * SIZE(AO)
  4147. LFD A3, (2 + 13) * SIZE(AO)
  4148. LFD A4, (2 + 12) * SIZE(AO)
  4149. fmul f5, A1, f5
  4150. fnmsub f1, A2, f5, f1
  4151. fnmsub f4, A3, f5, f4
  4152. fnmsub f0, A4, f5, f0
  4153. LFD A1, (2 + 10) * SIZE(AO)
  4154. LFD A2, (2 + 9) * SIZE(AO)
  4155. LFD A3, (2 + 8) * SIZE(AO)
  4156. fmul f1, A1, f1
  4157. fnmsub f4, A2, f1, f4
  4158. fnmsub f0, A3, f1, f0
  4159. LFD A1, (2 + 5) * SIZE(AO)
  4160. LFD A2, (2 + 4) * SIZE(AO)
  4161. fmul f4, A1, f4
  4162. fnmsub f0, A2, f4, f0
  4163. LFD A1, (2 + 0) * SIZE(AO)
  4164. fmul f0, A1, f0
  4165. fsmfp f0, f4
  4166. fsmfp f1, f5
  4167. #endif
  4168. #ifdef LT
  /* LT: forward substitution over the 4x4 block, first row first. */
  4169. fsmtp f4, f0
  4170. fsmtp f5, f1
  4171. LFD A1, (2 + 0) * SIZE(AO)
  4172. LFD A2, (2 + 1) * SIZE(AO)
  4173. LFD A3, (2 + 2) * SIZE(AO)
  4174. LFD A4, (2 + 3) * SIZE(AO)
  4175. fmul f0, A1, f0
  4176. fnmsub f4, A2, f0, f4
  4177. fnmsub f1, A3, f0, f1
  4178. fnmsub f5, A4, f0, f5
  4179. LFD A1, (2 + 5) * SIZE(AO)
  4180. LFD A2, (2 + 6) * SIZE(AO)
  4181. LFD A3, (2 + 7) * SIZE(AO)
  4182. fmul f4, A1, f4
  4183. fnmsub f1, A2, f4, f1
  4184. fnmsub f5, A3, f4, f5
  4185. LFD A1, (2 + 10) * SIZE(AO)
  4186. LFD A2, (2 + 11) * SIZE(AO)
  4187. fmul f1, A1, f1
  4188. fnmsub f5, A2, f1, f5
  4189. LFD A1, (2 + 15) * SIZE(AO)
  4190. fmul f5, A1, f5
  4191. fsmfp f0, f4
  4192. fsmfp f1, f5
  4193. #endif
  /* RN/RT: single scale by the value loaded from BO + INC2. */
  4194. #ifdef RN
  4195. LFPDX A1, BO, INC2
  4196. fxpmul f0, A1, f0
  4197. fxpmul f1, A1, f1
  4198. #endif
  4199. #ifdef RT
  4200. LFPDX A1, BO, INC2
  4201. fxpmul f0, A1, f0
  4202. fxpmul f1, A1, f1
  4203. #endif
  /* LN steps CO1 back 4 elements before and after the stores. */
  4204. #ifdef LN
  4205. subi CO1, CO1, 4 * SIZE
  4206. #endif
  /* Store to the packed buffer (then rewind it) and element-wise to C. */
  4207. #if defined(LN) || defined(LT)
  4208. STFPDUX f0, BO, INC2
  4209. STFPDUX f1, BO, INC2
  4210. subi BO, BO, 4 * SIZE
  4211. STFDUX f0, CO1, INC
  4212. STFSDUX f0, CO1, INC
  4213. STFDUX f1, CO1, INC
  4214. STFSDUX f1, CO1, INC
  4215. #else
  4216. STFPDUX f0, AO, INC2
  4217. STFPDUX f1, AO, INC2
  4218. subi AO, AO, 4 * SIZE
  4219. STFDUX f0, CO1, INC
  4220. STFSDUX f0, CO1, INC
  4221. STFDUX f1, CO1, INC
  4222. STFSDUX f1, CO1, INC
  4223. #endif
  4224. #ifdef LN
  4225. subi CO1, CO1, 4 * SIZE
  4226. #endif
  4227. #ifdef RT
  4228. slwi r0, K, 2 + BASE_SHIFT
  4229. add AORIG, AORIG, r0
  4230. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile. */
  4231. #if defined(LT) || defined(RN)
  4232. sub TEMP, K, KK
  4233. slwi r0, TEMP, 2 + BASE_SHIFT
  4234. slwi TEMP, TEMP, 0 + BASE_SHIFT
  4235. add AO, AO, r0
  4236. add BO, BO, TEMP
  4237. #endif
  4238. #ifdef LT
  4239. addi KK, KK, 4
  4240. #endif
  4241. #ifdef LN
  4242. subi KK, KK, 4
  4243. #endif
  4244. li r0, FZERO
  4245. lfpsx f0, SP, r0
  4246. .align 4
  /* .L110: 2-row x 1-column tile, taken when M & 2 is set (else .L120).
   * Clears f1..f3 from f0, positions AO/BO, loads CTR with the number of
   * 8-deep k iterations and preloads eight A pairs and four B pairs.
   *   LT/RN : k count is KK, BO starts at B - 2 * SIZE.
   *   LN/RT : k count is TEMP = K - KK; AO and BO are both advanced by KK
   *           elements of their panel so they start at the same k index.
   */
  4247. .L110:
  4248. andi. I, M, 2
  4249. beq .L120
  4250. #if defined(LT) || defined(RN)
  4251. addi BO, B, - 2 * SIZE
  4252. fpmr f1, f0
  4253. fpmr f2, f0
  4254. fpmr f3, f0
  4255. srawi. r0, KK, 3
  4256. mtspr CTR, r0
  4257. ble .L114
  4258. #else
  4259. #ifdef LN
  4260. slwi r0, K, 1 + BASE_SHIFT
  4261. sub AORIG, AORIG, r0
  4262. #endif
  4263. slwi r0 , KK, 1 + BASE_SHIFT
  4264. slwi TEMP, KK, 0 + BASE_SHIFT
  4265. add AO, AORIG, r0
  4266. add BO, B, TEMP
  4267. sub TEMP, K, KK
  /* Apply the -2 * SIZE update-form bias to the KK-offset pointer computed
   * above (BO = B + KK * SIZE - 2 * SIZE); basing it on B would drop the
   * KK offset and desynchronise BO from AO. */
  4268. addi BO, BO, - 2 * SIZE
  4269. fpmr f1, f0
  4270. fpmr f2, f0
  4271. fpmr f3, f0
  4272. srawi. r0, TEMP, 3
  4273. mtspr CTR, r0
  4274. ble .L114
  4275. #endif
  4276. LFPDUX A1, AO, INC2
  4277. LFPDUX A2, AO, INC2
  4278. LFPDUX B1, BO, INC2
  4279. LFPDUX A3, AO, INC2
  4280. LFPDUX A4, AO, INC2
  4281. LFPDUX B2, BO, INC2
  4282. LFPDUX A5, AO, INC2
  4283. LFPDUX A6, AO, INC2
  4284. LFPDUX B3, BO, INC2
  4285. LFPDUX A7, AO, INC2
  4286. LFPDUX A8, AO, INC2
  4287. LFPDUX B4, BO, INC2
  4288. bdz- .L113
  4289. .align 4
  /* .L112: steady-state loop of the 2-row tile, 8 k steps per iteration.
   * Each A pair covers one k step; each B pair feeds two (fxcpmadd then
   * fxcsmadd).  Products are spread over f0..f3 and summed at .L118.
   */
  4290. .L112:
  4291. fxcpmadd f0, B1, A1, f0
  4292. LFPDUX A1, AO, INC2
  4293. fxcsmadd f1, B1, A2, f1
  4294. LFPDUX A2, AO, INC2
  4295. LFPDUX B1, BO, INC2
  4296. fxcpmadd f2, B2, A3, f2
  4297. LFPDUX A3, AO, INC2
  4298. fxcsmadd f3, B2, A4, f3
  4299. LFPDUX A4, AO, INC2
  4300. LFPDUX B2, BO, INC2
  4301. fxcpmadd f0, B3, A5, f0
  4302. LFPDUX A5, AO, INC2
  4303. fxcsmadd f1, B3, A6, f1
  4304. LFPDUX A6, AO, INC2
  4305. LFPDUX B3, BO, INC2
  4306. fxcpmadd f2, B4, A7, f2
  4307. LFPDUX A7, AO, INC2
  4308. fxcsmadd f3, B4, A8, f3
  4309. LFPDUX A8, AO, INC2
  4310. LFPDUX B4, BO, INC2
  4311. bdnz+ .L112
  4312. .align 4
  /* .L113: drain of the 2-row pipelined loop - consumes the eight A pairs
   * and four B pairs already in registers, with no further loads. */
  4313. .L113:
  4314. fxcpmadd f0, B1, A1, f0
  4315. fxcsmadd f1, B1, A2, f1
  4316. fxcpmadd f2, B2, A3, f2
  4317. fxcsmadd f3, B2, A4, f3
  4318. fxcpmadd f0, B3, A5, f0
  4319. fxcsmadd f1, B3, A6, f1
  4320. fxcpmadd f2, B4, A7, f2
  4321. fxcsmadd f3, B4, A8, f3
  4322. .align 4
  /* .L114/.L116/.L117: k remainder of the 2-row tile (count & 7 steps).
   * The count comes from KK (LT/RN) or TEMP (LN/RT); a zero remainder
   * goes straight to .L118.  Each step uses one A pair and one scalar B
   * value (BO is advanced explicitly by INC), accumulating in f0.
   */
  4323. .L114:
  4324. #if defined(LT) || defined(RN)
  4325. andi. r0, KK, 7
  4326. mtspr CTR, r0
  4327. ble+ .L118
  4328. #else
  4329. andi. r0, TEMP, 7
  4330. mtspr CTR, r0
  4331. ble+ .L118
  4332. #endif
  4333. LFPDUX A1, AO, INC2
  4334. LFDX B1, BO, INC2
  4335. add BO, BO, INC
  4336. bdz- .L117
  4337. .align 4
  /* .L116: one k step per iteration with the next operands preloaded. */
  4338. .L116:
  4339. fxcpmadd f0, B1, A1, f0
  4340. LFPDUX A1, AO, INC2
  4341. LFDX B1, BO, INC2
  4342. add BO, BO, INC
  4343. bdnz+ .L116
  4344. .align 4
  /* .L117: final remainder step. */
  4345. .L117:
  4346. fxcpmadd f0, B1, A1, f0
  4347. .align 4
  /* .L118: solve and store for the 2-row x 1-column tile.
   * Sums the four partial accumulators into f0, forms rhs - accumulated,
   * applies the 2x2 triangular solve (LN/LT) or a single scale (RN/RT),
   * writes the result to the packed buffer and to C, then advances
   * pointers/KK and reloads f0 from SP + FZERO.
   */
  4348. .L118:
  4349. fpadd f0, f0, f1
  4350. fpadd f2, f3, f2
  4351. fpadd f0, f0, f2
  4352. #if defined(LN) || defined(RT)
  4353. #ifdef LN
  4354. subi r0, KK, 2
  4355. #else
  4356. subi r0, KK, 1
  4357. #endif
  /* AO = AORIG + r0 * 2 elements, BO = B + r0 elements - 2 * SIZE bias. */
  4358. slwi TEMP, r0, 1 + BASE_SHIFT
  4359. slwi r0, r0, 0 + BASE_SHIFT
  4360. add AO, AORIG, TEMP
  4361. add BO, B, r0
  4362. addi BO, BO, - 2 * SIZE
  4363. #endif
  /* Non-updating pair load of the right-hand side, so no rewind needed. */
  4364. #if defined(LN) || defined(LT)
  4365. LFPDX f16, BO, INC2
  4366. fpsub f0, f16, f0
  4367. #else
  4368. LFPDX f16, AO, INC2
  4369. fpsub f0, f16, f0
  4370. #endif
  4371. #ifdef LN
  /* LN: 2x2 backward substitution; f4 holds the second row value.
   * Entries 3 and 0 are the diagonal, entry 2 the off-diagonal. */
  4372. fsmtp f4, f0
  4373. LFD A1, (2 + 3) * SIZE(AO)
  4374. LFD A2, (2 + 2) * SIZE(AO)
  4375. LFD A3, (2 + 0) * SIZE(AO)
  4376. fmul f4, A1, f4
  4377. fnmsub f0, A2, f4, f0
  4378. fmul f0, A3, f0
  4379. fsmfp f0, f4
  4380. #endif
  4381. #ifdef LT
  /* LT: 2x2 forward substitution; entries 0 and 3 are the diagonal,
   * entry 1 the off-diagonal. */
  4382. fsmtp f4, f0
  4383. LFD A1, (2 + 0) * SIZE(AO)
  4384. LFD A2, (2 + 1) * SIZE(AO)
  4385. LFD A3, (2 + 3) * SIZE(AO)
  4386. fmul f0, A1, f0
  4387. fnmsub f4, A2, f0, f4
  4388. fmul f4, A3, f4
  4389. fsmfp f0, f4
  4390. #endif
  4391. #ifdef RN
  4392. LFPDX A1, BO, INC2
  4393. fxpmul f0, A1, f0
  4394. #endif
  4395. #ifdef RT
  4396. LFPDX A1, BO, INC2
  4397. fxpmul f0, A1, f0
  4398. #endif
  /* LN steps CO1 back 2 elements before and after the stores. */
  4399. #ifdef LN
  4400. subi CO1, CO1, 2 * SIZE
  4401. #endif
  /* Non-updating pair store to the packed buffer, then both halves to C. */
  4402. #if defined(LN) || defined(LT)
  4403. STFPDX f0, BO, INC2
  4404. STFDUX f0, CO1, INC
  4405. STFSDUX f0, CO1, INC
  4406. #else
  4407. STFPDX f0, AO, INC2
  4408. STFDUX f0, CO1, INC
  4409. STFSDUX f0, CO1, INC
  4410. #endif
  4411. #ifdef LN
  4412. subi CO1, CO1, 2 * SIZE
  4413. #endif
  4414. #ifdef RT
  4415. slwi r0, K, 1 + BASE_SHIFT
  4416. add AORIG, AORIG, r0
  4417. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile. */
  4418. #if defined(LT) || defined(RN)
  4419. sub TEMP, K, KK
  4420. slwi r0, TEMP, 1 + BASE_SHIFT
  4421. slwi TEMP, TEMP, 0 + BASE_SHIFT
  4422. add AO, AO, r0
  4423. add BO, BO, TEMP
  4424. #endif
  4425. #ifdef LT
  4426. addi KK, KK, 2
  4427. #endif
  4428. #ifdef LN
  4429. subi KK, KK, 2
  4430. #endif
  4431. li r0, FZERO
  4432. lfpsx f0, SP, r0
  4433. .align 4
  /* .L120: 1-row x 1-column tile, taken when M & 1 is set (else .L129).
   * Clears f1..f3 from f0, positions AO/BO, loads CTR with the number of
   * 8-deep k iterations and preloads four A pairs and four B pairs.
   *   LT/RN : k count is KK, BO starts at B - 2 * SIZE.
   *   LN/RT : k count is TEMP = K - KK; AO and BO are both advanced by KK
   *           elements so they start at the same k index.
   */
  4434. .L120:
  4435. andi. I, M, 1
  4436. beq .L129
  4437. #if defined(LT) || defined(RN)
  4438. addi BO, B, - 2 * SIZE
  4439. fpmr f1, f0
  4440. fpmr f2, f0
  4441. fpmr f3, f0
  4442. srawi. r0, KK, 3
  4443. mtspr CTR, r0
  4444. ble .L124
  4445. #else
  4446. #ifdef LN
  4447. slwi r0, K, 0 + BASE_SHIFT
  4448. sub AORIG, AORIG, r0
  4449. #endif
  4450. slwi r0 , KK, 0 + BASE_SHIFT
  4451. slwi TEMP, KK, 0 + BASE_SHIFT
  4452. add AO, AORIG, r0
  4453. add BO, B, TEMP
  4454. sub TEMP, K, KK
  /* Apply the -2 * SIZE update-form bias to the KK-offset pointer computed
   * above (BO = B + KK * SIZE - 2 * SIZE); basing it on B would drop the
   * KK offset and desynchronise BO from AO. */
  4455. addi BO, BO, - 2 * SIZE
  4456. fpmr f1, f0
  4457. fpmr f2, f0
  4458. fpmr f3, f0
  4459. srawi. r0, TEMP, 3
  4460. mtspr CTR, r0
  4461. ble .L124
  4462. #endif
  4463. LFPDUX A1, AO, INC2
  4464. LFPDUX B1, BO, INC2
  4465. LFPDUX A2, AO, INC2
  4466. LFPDUX B2, BO, INC2
  4467. LFPDUX A3, AO, INC2
  4468. LFPDUX B3, BO, INC2
  4469. LFPDUX A4, AO, INC2
  4470. LFPDUX B4, BO, INC2
  4471. bdz- .L123
  4472. .align 4
  /* .L122: steady-state loop of the 1-row tile, 8 k steps per iteration.
   * Each fpmadd multiplies an A pair by a B pair element-wise (two k
   * steps), accumulating into f0..f3; operands are reloaded after use.
   */
  4473. .L122:
  4474. fpmadd f0, A1, B1, f0
  4475. LFPDUX A1, AO, INC2
  4476. LFPDUX B1, BO, INC2
  4477. fpmadd f1, A2, B2, f1
  4478. LFPDUX A2, AO, INC2
  4479. LFPDUX B2, BO, INC2
  4480. fpmadd f2, A3, B3, f2
  4481. LFPDUX A3, AO, INC2
  4482. LFPDUX B3, BO, INC2
  4483. fpmadd f3, A4, B4, f3
  4484. LFPDUX A4, AO, INC2
  4485. LFPDUX B4, BO, INC2
  4486. bdnz+ .L122
  4487. .align 4
  /* .L123: drain of the 1-row pipelined loop - consumes the operands
   * already in registers, with no further loads. */
  4488. .L123:
  4489. fpmadd f0, A1, B1, f0
  4490. fpmadd f1, A2, B2, f1
  4491. fpmadd f2, A3, B3, f2
  4492. fpmadd f3, A4, B4, f3
  4493. .align 4
  /* .L124/.L126/.L127: k remainder of the 1-row tile (count & 7 steps).
   * The count comes from KK (LT/RN) or TEMP (LN/RT); a zero remainder
   * goes straight to .L128.  Scalar loads are non-updating, so AO and BO
   * are each advanced explicitly by INC per step.
   */
  4494. .L124:
  4495. #if defined(LT) || defined(RN)
  4496. andi. r0, KK, 7
  4497. mtspr CTR, r0
  4498. ble+ .L128
  4499. #else
  4500. andi. r0, TEMP, 7
  4501. mtspr CTR, r0
  4502. ble+ .L128
  4503. #endif
  4504. LFDX A1, AO, INC2
  4505. LFDX B1, BO, INC2
  4506. add AO, AO, INC
  4507. add BO, BO, INC
  4508. bdz- .L127
  4509. .align 4
  /* .L126: one scalar multiply-add per iteration, next operands preloaded. */
  4510. .L126:
  4511. fmadd f0, A1, B1, f0
  4512. LFDX A1, AO, INC2
  4513. LFDX B1, BO, INC2
  4514. add AO, AO, INC
  4515. add BO, BO, INC
  4516. bdnz+ .L126
  4517. .align 4
  /* .L127: final remainder step. */
  4518. .L127:
  4519. fmadd f0, A1, B1, f0
  4520. .align 4
  /* .L128: solve and store for the 1-row x 1-column tile.
   * Sums f0..f3, then folds the secondary half into the primary
   * (fsmtp + fadd) to obtain the scalar dot product.  Forms
   * rhs - accumulated, scales by one triangular entry, and stores the
   * result to the packed buffer and to C.
   */
  4521. .L128:
  4522. fpadd f0, f0, f1
  4523. fpadd f2, f2, f3
  4524. fpadd f0, f0, f2
  4525. fsmtp f1, f0
  4526. fadd f0, f0, f1
  4527. #if defined(LN) || defined(RT)
  /* Both branches subtract 1 here (tile height and width are both 1). */
  4528. #ifdef LN
  4529. subi r0, KK, 1
  4530. #else
  4531. subi r0, KK, 1
  4532. #endif
  4533. slwi TEMP, r0, 0 + BASE_SHIFT
  4534. slwi r0, r0, 0 + BASE_SHIFT
  4535. add AO, AORIG, TEMP
  4536. add BO, B, r0
  4537. addi BO, BO, - 2 * SIZE
  4538. #endif
  4539. #if defined(LN) || defined(LT)
  4540. LFDX f16, BO, INC2
  4541. fsub f0, f16, f0
  4542. #else
  4543. LFDX f16, AO, INC2
  4544. fsub f0, f16, f0
  4545. #endif
  /* Scale by the single triangular entry: read from AO for LN/LT and from
   * BO for RN/RT (presumably a stored reciprocal - confirm). */
  4546. #ifdef LN
  4547. LFD A1, (2 + 0) * SIZE(AO)
  4548. fmul f0, A1, f0
  4549. #endif
  4550. #ifdef LT
  4551. LFD A1, (2 + 0) * SIZE(AO)
  4552. fmul f0, A1, f0
  4553. #endif
  4554. #ifdef RN
  4555. LFDX A1, BO, INC2
  4556. fmul f0, A1, f0
  4557. #endif
  4558. #ifdef RT
  4559. LFDX A1, BO, INC2
  4560. fmul f0, A1, f0
  4561. #endif
  /* LN steps CO1 back 1 element before and after the store. */
  4562. #ifdef LN
  4563. subi CO1, CO1, 1 * SIZE
  4564. #endif
  4565. #if defined(LN) || defined(LT)
  4566. STFDX f0, BO, INC2
  4567. STFDUX f0, CO1, INC
  4568. #else
  4569. STFDX f0, AO, INC2
  4570. STFDUX f0, CO1, INC
  4571. #endif
  4572. #ifdef LN
  4573. subi CO1, CO1, 1 * SIZE
  4574. #endif
  4575. #ifdef RT
  4576. slwi r0, K, 0 + BASE_SHIFT
  4577. add AORIG, AORIG, r0
  4578. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile. */
  4579. #if defined(LT) || defined(RN)
  4580. sub TEMP, K, KK
  4581. slwi r0, TEMP, 0 + BASE_SHIFT
  4582. slwi TEMP, TEMP, 0 + BASE_SHIFT
  4583. add AO, AO, r0
  4584. add BO, BO, TEMP
  4585. #endif
  4586. #ifdef LT
  4587. addi KK, KK, 1
  4588. #endif
  4589. #ifdef LN
  4590. subi KK, KK, 1
  4591. #endif
  4592. .align 4
  /* .L129: bookkeeping after the single-column panel.
   *   LN    : B += K << BASE_SHIFT
   *   LT/RN : B  = BO + 2 * SIZE (undoes the -2 * SIZE bias applied to BO)
   *   RN/RT : KK is stepped by +1 / -1 respectively.
   */
  4593. .L129:
  4594. #ifdef LN
  4595. slwi r0, K, 0 + BASE_SHIFT
  4596. add B, B, r0
  4597. #endif
  4598. #if defined(LT) || defined(RN)
  4599. addi B, BO, 2 * SIZE
  4600. #endif
  4601. #ifdef RN
  4602. addi KK, KK, 1
  4603. #endif
  4604. #ifdef RT
  4605. subi KK, KK, 1
  4606. #endif
  4607. .align 4
  /* .L999: common exit - restore saved registers and return.
   * r14..r31 are reloaded with update-form word loads (SP is first moved
   * by +12 so the first load reads SP + 16 relative to the entry value of
   * this block, and moved back by -12 afterwards).  f31..f14 are then
   * reloaded as pairs with a 16-byte update stride, SP gets a final +16,
   * and the routine returns.  The matching save sequence is in the
   * prologue, outside this view.
   */
  4608. .L999:
  4609. addi SP, SP, 12
  4610. lwzu r14, 4(SP)
  4611. lwzu r15, 4(SP)
  4612. lwzu r16, 4(SP)
  4613. lwzu r17, 4(SP)
  4614. lwzu r18, 4(SP)
  4615. lwzu r19, 4(SP)
  4616. lwzu r20, 4(SP)
  4617. lwzu r21, 4(SP)
  4618. lwzu r22, 4(SP)
  4619. lwzu r23, 4(SP)
  4620. lwzu r24, 4(SP)
  4621. lwzu r25, 4(SP)
  4622. lwzu r26, 4(SP)
  4623. lwzu r27, 4(SP)
  4624. lwzu r28, 4(SP)
  4625. lwzu r29, 4(SP)
  4626. lwzu r30, 4(SP)
  4627. lwzu r31, 4(SP)
  4628. subi SP, SP, 12
  /* r0 = 16: byte stride between consecutive saved floating-point pairs. */
  4629. li r0, 16
  4630. lfpdux f31, SP, r0
  4631. lfpdux f30, SP, r0
  4632. lfpdux f29, SP, r0
  4633. lfpdux f28, SP, r0
  4634. lfpdux f27, SP, r0
  4635. lfpdux f26, SP, r0
  4636. lfpdux f25, SP, r0
  4637. lfpdux f24, SP, r0
  4638. lfpdux f23, SP, r0
  4639. lfpdux f22, SP, r0
  4640. lfpdux f21, SP, r0
  4641. lfpdux f20, SP, r0
  4642. lfpdux f19, SP, r0
  4643. lfpdux f18, SP, r0
  4644. lfpdux f17, SP, r0
  4645. lfpdux f16, SP, r0
  4646. lfpdux f15, SP, r0
  4647. lfpdux f14, SP, r0
  4648. addi SP, SP, 16
  4649. blr
  4650. EPILOGUE
  4651. #endif