You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel_hummer.S 79 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #undef ZERO /* Integer-register and stack-slot names used by the kernel follow. Drop any earlier ZERO (presumably from common.h - TODO confirm) so it can be re-bound to r31 below. */
  41. #define ALPHA 0 /* Byte offset from SP of the f1/f2 pair the prologue stores last; loaded into AP at .L14/.L24/.L34. */
  42. #define FZERO 16 /* Byte offset from SP of the two zero words the prologue stores; loaded (lfpsx) to clear f0 and, under TRMMKERNEL, f30. */
  43. #define M r3 /* Kernel exits if <= 0. M >> 2 is the .L11 trip count; M & 2 and M & 1 gate the .L20 and .L30 sections. */
  44. #define N r4 /* Kernel exits if <= 0. N >> 1 is the .L10 trip count; N & 1 gates the .L50 section. */
  45. #define K r5 /* Kernel exits if <= 0. Non-TRMM builds put K >> 2 and K & 3 into CTR; TRMM builds derive the count from K and KK instead. */
  46. #if defined(linux) || defined(__FreeBSD__) /* No definitions of A..OFFSET for other platforms are visible in this part of the file. */
  47. #define A r6 /* Base from which AO is re-derived at .L10 and .L50. */
  48. #define B r7 /* Base from which BO/BO2 are derived; advanced to BO + 4 * SIZE at .L49. */
  49. #define C r8 /* Checked for 2 * SIZE alignment on entry (misaligned -> .L1000), biased by -2 * SIZE, then copied to CO1 and stepped by LDC. */
  50. #define LDC r9 /* Shifted left by ZBASE_SHIFT on entry; afterwards the byte step from CO1 to CO2 and from CO2 to the next C. */
  51. #define OFFSET r10 /* Only read under TRMMKERNEL, where it seeds KK (negated when LEFT is not defined). */
  52. #endif
  53. #define TEMP r11 /* Scratch; in the visible code it is only written on TRMMKERNEL paths. */
  54. #define KK r14 /* TRMMKERNEL-only running offset, seeded from OFFSET and bumped after each tile (LEFT) or column pass (!LEFT). */
  55. #define INCM1 r15 /* Loaded with -1 * SIZE on the aligned-C path; not otherwise referenced in the visible portion. */
  56. #define INCM3 r16 /* Loaded with -2 * SIZE (note: not -3); rewinds CO1/CO2 before the stores at .L28. */
  57. #define INCM5 r17 /* Loaded with -4 * SIZE (note: not -5); not otherwise referenced in the visible portion. */
  58. #define INCM7 r18 /* Loaded with -6 * SIZE (note: not -7); rewinds CO1/CO2 before the stores at .L18. */
  59. #define INC2 r19 /* Loaded with 2 * SIZE; update step for CO1/CO2 loads and stores. */
  60. #define INC r20 /* Loaded with 1 * SIZE; not otherwise referenced in the visible portion. */
  61. #define INC4 r21 /* Loaded with 4 * SIZE; update step for the AO/AO2/BO/BO2 loads. */
  62. #define I r22 /* Counter for the .L11 loop; also receives the M & 2 / M & 1 test results. */
  63. #define J r23 /* Counter for the .L10 loop; also receives the N & 1 test result. */
  64. #define AO r24 /* Load-with-update pointer derived from A. */
  65. #define BO r25 /* Load-with-update pointer derived from B. */
  66. #define AO2 r26 /* Second pointer into the same stream as AO, set to AO + 2 * SIZE. */
  67. #define BO2 r27 /* Second pointer into the same stream as BO, kept 2 * SIZE ahead of it. */
  68. #define CO1 r28 /* Load/store pointer for the first output column of the current pass (copied from C). */
  69. #define CO2 r29 /* Load/store pointer for the second output column (C + LDC). */
  70. #define ZERO r31 /* Re-binding of ZERO after the #undef above; not referenced in the visible portion. */
  71. #ifndef NEEDPARAM
  72. #define A1 f16 /* A1-A10 and B1-B6 name f16-f31 as operand registers; f0-f15 are used directly as accumulators in the loops. */
  73. #define A2 f17 /* The A/B prefix is not a strict source indicator: these registers are also loaded from BO and from CO1/CO2 where the schedule needs a free register. */
  74. #define A3 f18
  75. #define A4 f19
  76. #define A5 f20
  77. #define A6 f21
  78. #define A7 f22
  79. #define A8 f23
  80. #define A9 f24 /* Loaded from BO in the .L20 section. */
  81. #define A10 f25 /* Loaded from BO/BO2 in the .L15 and .L20 sections. */
  82. #define B1 f26
  83. #define B2 f27
  84. #define B3 f28
  85. #define B4 f29
  86. #define B5 f30 /* Under TRMMKERNEL, f30 is reloaded with zero from the FZERO slot before the C update. */
  87. #define B6 f31
  88. #define AP B6 /* Shares f31 with B6; loaded from the ALPHA slot once the multiply loops have finished with B6. */
  89. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  90. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  91. #define FXCPMADD fxcpmadd /* For these eight build variants the first accumulate of each pair is fxcpmadd ... */
  92. #define FXCSMADD fxcxnpma /* ... and the second (into the partner accumulator) is fxcxnpma. */
  93. #else /* All other variants; presumably these differ in conjugation handling - confirm against the build flags. */
  94. #define FXCPMADD fxcpnsma
  95. #define FXCSMADD fxcxma
  96. #endif
  97. PROLOGUE
  98. PROFCODE
  99. li r0, -16
  100. stfpdux f14, SP, r0
  101. stfpdux f15, SP, r0
  102. stfpdux f16, SP, r0
  103. stfpdux f17, SP, r0
  104. stfpdux f18, SP, r0
  105. stfpdux f19, SP, r0
  106. stfpdux f20, SP, r0
  107. stfpdux f21, SP, r0
  108. stfpdux f22, SP, r0
  109. stfpdux f23, SP, r0
  110. stfpdux f24, SP, r0
  111. stfpdux f25, SP, r0
  112. stfpdux f26, SP, r0
  113. stfpdux f27, SP, r0
  114. stfpdux f28, SP, r0
  115. stfpdux f29, SP, r0
  116. stfpdux f30, SP, r0
  117. stfpdux f31, SP, r0
  118. stwu r31, -4(SP)
  119. stwu r30, -4(SP)
  120. stwu r29, -4(SP)
  121. stwu r28, -4(SP)
  122. stwu r27, -4(SP)
  123. stwu r26, -4(SP)
  124. stwu r25, -4(SP)
  125. stwu r24, -4(SP)
  126. stwu r23, -4(SP)
  127. stwu r22, -4(SP)
  128. stwu r21, -4(SP)
  129. stwu r20, -4(SP)
  130. stwu r19, -4(SP)
  131. stwu r18, -4(SP)
  132. stwu r17, -4(SP)
  133. stwu r16, -4(SP)
  134. stwu r15, -4(SP)
  135. stwu r14, -4(SP)
  136. li r0, 0
  137. stwu r0, -4(SP)
  138. stwu r0, -4(SP)
  139. stfdu f2, -8(SP)
  140. stfdu f1, -8(SP)
  141. slwi LDC, LDC, ZBASE_SHIFT
  142. cmpwi cr0, M, 0
  143. ble .L999
  144. cmpwi cr0, N, 0
  145. ble .L999
  146. cmpwi cr0, K, 0
  147. ble .L999
  148. #if defined(TRMMKERNEL) && !defined(LEFT)
  149. neg KK, OFFSET
  150. #endif
  151. andi. r0, C, 2 * SIZE - 1
  152. bne .L1000
  153. li INC, 1 * SIZE
  154. li INC2, 2 * SIZE
  155. li INC4, 4 * SIZE
  156. li INCM1, -1 * SIZE
  157. li INCM3, -2 * SIZE
  158. li INCM5, -4 * SIZE
  159. li INCM7, -6 * SIZE
  160. addi C, C, - 2 * SIZE
  161. srawi. J, N, 1
  162. ble .L50
  163. .align 4
  164. .L10:
  165. mr CO1, C
  166. add CO2, C, LDC
  167. add C, CO2, LDC
  168. #if defined(TRMMKERNEL) && defined(LEFT)
  169. mr KK, OFFSET
  170. #endif
  171. addi AO, A, -4 * SIZE
  172. li r0, FZERO
  173. lfpsx f0, SP, r0
  174. srawi. I, M, 2
  175. ble .L20
  176. .align 4
  177. .L11:
  178. #if defined(TRMMKERNEL)
  179. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  180. addi AO2, AO, 2 * SIZE
  181. fpmr f4, f0
  182. addi BO, B, - 4 * SIZE
  183. fpmr f8, f0
  184. addi BO2, B, - 2 * SIZE
  185. fpmr f12, f0
  186. #else
  187. slwi TEMP, KK, 2 + ZBASE_SHIFT
  188. slwi r0, KK, 1 + ZBASE_SHIFT
  189. add AO, AO, TEMP
  190. add BO, B, r0
  191. addi AO2, AO, 2 * SIZE
  192. fpmr f4, f0
  193. addi BO, BO, - 4 * SIZE
  194. fpmr f8, f0
  195. addi BO2, BO, 2 * SIZE
  196. fpmr f12, f0
  197. #endif
  198. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  199. sub TEMP, K, KK
  200. #elif defined(LEFT)
  201. addi TEMP, KK, 4
  202. #else
  203. addi TEMP, KK, 2
  204. #endif
  205. srawi. r0, TEMP, 2
  206. fpmr f1, f0
  207. mtspr CTR, r0
  208. ble .L14
  209. #else
  210. addi AO2, AO, 2 * SIZE
  211. fpmr f4, f0
  212. addi BO, B, - 4 * SIZE
  213. fpmr f8, f0
  214. addi BO2, B, - 2 * SIZE
  215. fpmr f12, f0
  216. srawi. r0, K, 2
  217. fpmr f1, f0
  218. mtspr CTR, r0
  219. ble .L14
  220. #endif
  221. LFPDUX A1, AO, INC4
  222. fpmr f5, f0
  223. LFPDUX A3, AO, INC4
  224. fpmr f9, f0
  225. LFPDUX B1, BO, INC4
  226. fpmr f13, f0
  227. LFPDUX A5, AO, INC4
  228. fpmr f2, f0
  229. LFPDUX A6, AO, INC4
  230. fpmr f6, f0
  231. LFPDUX B3, BO, INC4
  232. fpmr f10, f0
  233. LFPDUX A7, AO, INC4
  234. fpmr f14, f0
  235. LFPDUX A8, AO, INC4
  236. fpmr f3, f0
  237. LFPDUX B5, BO, INC4
  238. fpmr f7, f0
  239. LFPDUX A9, AO, INC4
  240. fpmr f11, f0
  241. LFPDUX A2, AO2, INC4
  242. fpmr f15, f0
  243. LFPDUX B2, BO2, INC4
  244. bdz- .L13
  245. .align 4
  246. .L12:
  247. ## 1 ##
  248. FXCPMADD f0, B1, A1, f0
  249. nop
  250. FXCSMADD f4, B1, A1, f4
  251. nop
  252. FXCPMADD f8, B2, A1, f8
  253. LFPDUX B4, BO2, INC4
  254. FXCSMADD f12, B2, A1, f12
  255. LFPDUX B6, BO, INC4
  256. FXCPMADD f1, B1, A2, f1
  257. nop
  258. FXCSMADD f5, B1, A2, f5
  259. LFPDUX A4, AO2, INC4
  260. FXCPMADD f9, B2, A2, f9
  261. LFPDUX A10, AO, INC4
  262. FXCSMADD f13, B2, A2, f13
  263. nop
  264. FXCPMADD f2, B1, A3, f2
  265. nop
  266. FXCSMADD f6, B1, A3, f6
  267. nop
  268. FXCPMADD f10, B2, A3, f10
  269. nop
  270. FXCSMADD f14, B2, A3, f14
  271. nop
  272. FXCPMADD f3, B1, A4, f3
  273. nop
  274. FXCSMADD f7, B1, A4, f7
  275. LFPDUX A2, AO2, INC4
  276. FXCPMADD f11, B2, A4, f11
  277. LFPDUX A1, AO, INC4
  278. FXCSMADD f15, B2, A4, f15
  279. nop
  280. ## 2 ##
  281. FXCPMADD f0, B3, A5, f0
  282. nop
  283. FXCSMADD f4, B3, A5, f4
  284. nop
  285. FXCPMADD f8, B4, A5, f8
  286. LFPDUX B2, BO2, INC4
  287. FXCSMADD f12, B4, A5, f12
  288. LFPDUX B1, BO, INC4
  289. FXCPMADD f1, B3, A2, f1
  290. nop
  291. FXCSMADD f5, B3, A2, f5
  292. LFPDUX A4, AO2, INC4
  293. FXCPMADD f9, B4, A2, f9
  294. LFPDUX A3, AO, INC4
  295. FXCSMADD f13, B4, A2, f13
  296. nop
  297. FXCPMADD f2, B3, A6, f2
  298. nop
  299. FXCSMADD f6, B3, A6, f6
  300. nop
  301. FXCPMADD f10, B4, A6, f10
  302. nop
  303. FXCSMADD f14, B4, A6, f14
  304. nop
  305. FXCPMADD f3, B3, A4, f3
  306. nop
  307. FXCSMADD f7, B3, A4, f7
  308. LFPDUX A2, AO2, INC4
  309. FXCPMADD f11, B4, A4, f11
  310. LFPDUX A5, AO, INC4
  311. FXCSMADD f15, B4, A4, f15
  312. nop
  313. ## 3 ##
  314. FXCPMADD f0, B5, A7, f0
  315. nop
  316. FXCSMADD f4, B5, A7, f4
  317. nop
  318. FXCPMADD f8, B2, A7, f8
  319. LFPDUX B4, BO2, INC4
  320. FXCSMADD f12, B2, A7, f12
  321. LFPDUX B3, BO, INC4
  322. FXCPMADD f1, B5, A2, f1
  323. nop
  324. FXCSMADD f5, B5, A2, f5
  325. LFPDUX A4, AO2, INC4
  326. FXCPMADD f9, B2, A2, f9
  327. LFPDUX A6, AO, INC4
  328. FXCSMADD f13, B2, A2, f13
  329. nop
  330. FXCPMADD f2, B5, A8, f2
  331. nop
  332. FXCSMADD f6, B5, A8, f6
  333. nop
  334. FXCPMADD f10, B2, A8, f10
  335. nop
  336. FXCSMADD f14, B2, A8, f14
  337. nop
  338. FXCPMADD f3, B5, A4, f3
  339. nop
  340. FXCSMADD f7, B5, A4, f7
  341. LFPDUX A2, AO2, INC4
  342. FXCPMADD f11, B2, A4, f11
  343. LFPDUX A7, AO, INC4
  344. FXCSMADD f15, B2, A4, f15
  345. nop
  346. ## 4 ##
  347. FXCPMADD f0, B6, A9, f0
  348. nop
  349. FXCSMADD f4, B6, A9, f4
  350. nop
  351. FXCPMADD f8, B4, A9, f8
  352. LFPDUX B2, BO2, INC4
  353. FXCSMADD f12, B4, A9, f12
  354. LFPDUX B5, BO, INC4
  355. FXCPMADD f1, B6, A2, f1
  356. nop
  357. FXCSMADD f5, B6, A2, f5
  358. LFPDUX A4, AO2, INC4
  359. FXCPMADD f9, B4, A2, f9
  360. LFPDUX A8, AO, INC4
  361. FXCSMADD f13, B4, A2, f13
  362. nop
  363. FXCPMADD f2, B6, A10, f2
  364. nop
  365. FXCSMADD f6, B6, A10, f6
  366. nop
  367. FXCPMADD f10, B4, A10, f10
  368. nop
  369. FXCSMADD f14, B4, A10, f14
  370. nop
  371. FXCPMADD f3, B6, A4, f3
  372. LFPDUX A2, AO2, INC4
  373. FXCSMADD f7, B6, A4, f7
  374. LFPDUX A9, AO, INC4
  375. FXCPMADD f11, B4, A4, f11
  376. nop
  377. FXCSMADD f15, B4, A4, f15
  378. bdnz+ .L12
  379. .align 4
  380. .L13:
  381. ## 1 ##
  382. FXCPMADD f0, B1, A1, f0
  383. nop
  384. FXCSMADD f4, B1, A1, f4
  385. nop
  386. FXCPMADD f8, B2, A1, f8
  387. LFPDUX B4, BO2, INC4
  388. FXCSMADD f12, B2, A1, f12
  389. LFPDUX B6, BO, INC4
  390. FXCPMADD f1, B1, A2, f1
  391. nop
  392. FXCSMADD f5, B1, A2, f5
  393. LFPDUX A4, AO2, INC4
  394. FXCPMADD f9, B2, A2, f9
  395. LFPDUX A10, AO, INC4
  396. FXCSMADD f13, B2, A2, f13
  397. nop
  398. FXCPMADD f2, B1, A3, f2
  399. nop
  400. FXCSMADD f6, B1, A3, f6
  401. nop
  402. FXCPMADD f10, B2, A3, f10
  403. nop
  404. FXCSMADD f14, B2, A3, f14
  405. nop
  406. FXCPMADD f3, B1, A4, f3
  407. nop
  408. FXCSMADD f7, B1, A4, f7
  409. LFPDUX A2, AO2, INC4
  410. FXCPMADD f11, B2, A4, f11
  411. #ifndef TRMMKERNEL
  412. LFPDUX A1, CO1, INC2
  413. #else
  414. nop
  415. #endif
  416. FXCSMADD f15, B2, A4, f15
  417. nop
  418. ## 2 ##
  419. FXCPMADD f0, B3, A5, f0
  420. nop
  421. FXCSMADD f4, B3, A5, f4
  422. nop
  423. FXCPMADD f8, B4, A5, f8
  424. LFPDUX B2, BO2, INC4
  425. FXCSMADD f12, B4, A5, f12
  426. #ifndef TRMMKERNEL
  427. LFPDUX B1, CO1, INC2
  428. #else
  429. nop
  430. #endif
  431. FXCPMADD f1, B3, A2, f1
  432. nop
  433. FXCSMADD f5, B3, A2, f5
  434. LFPDUX A4, AO2, INC4
  435. FXCPMADD f9, B4, A2, f9
  436. #ifndef TRMMKERNEL
  437. LFPDUX A3, CO1, INC2
  438. #else
  439. nop
  440. #endif
  441. FXCSMADD f13, B4, A2, f13
  442. nop
  443. FXCPMADD f2, B3, A6, f2
  444. nop
  445. FXCSMADD f6, B3, A6, f6
  446. nop
  447. FXCPMADD f10, B4, A6, f10
  448. nop
  449. FXCSMADD f14, B4, A6, f14
  450. nop
  451. FXCPMADD f3, B3, A4, f3
  452. nop
  453. FXCSMADD f7, B3, A4, f7
  454. LFPDUX A2, AO2, INC4
  455. FXCPMADD f11, B4, A4, f11
  456. #ifndef TRMMKERNEL
  457. LFPDUX A5, CO1, INC2
  458. #else
  459. nop
  460. #endif
  461. FXCSMADD f15, B4, A4, f15
  462. nop
  463. ## 3 ##
  464. FXCPMADD f0, B5, A7, f0
  465. nop
  466. FXCSMADD f4, B5, A7, f4
  467. nop
  468. FXCPMADD f8, B2, A7, f8
  469. LFPDUX B4, BO2, INC4
  470. FXCSMADD f12, B2, A7, f12
  471. #ifndef TRMMKERNEL
  472. LFPDUX B3, CO2, INC2
  473. #else
  474. nop
  475. #endif
  476. FXCPMADD f1, B5, A2, f1
  477. nop
  478. FXCSMADD f5, B5, A2, f5
  479. LFPDUX A4, AO2, INC4
  480. FXCPMADD f9, B2, A2, f9
  481. #ifndef TRMMKERNEL
  482. LFPDUX A6, CO2, INC2
  483. #else
  484. nop
  485. #endif
  486. FXCSMADD f13, B2, A2, f13
  487. FXCPMADD f2, B5, A8, f2
  488. nop
  489. FXCSMADD f6, B5, A8, f6
  490. nop
  491. FXCPMADD f10, B2, A8, f10
  492. nop
  493. FXCSMADD f14, B2, A8, f14
  494. nop
  495. FXCPMADD f3, B5, A4, f3
  496. nop
  497. FXCSMADD f7, B5, A4, f7
  498. LFPDUX A2, AO2, INC4
  499. FXCPMADD f11, B2, A4, f11
  500. #ifndef TRMMKERNEL
  501. LFPDUX A7, CO2, INC2
  502. #else
  503. nop
  504. #endif
  505. FXCSMADD f15, B2, A4, f15
  506. nop
  507. ## 4 ##
  508. FXCPMADD f0, B6, A9, f0
  509. nop
  510. FXCSMADD f4, B6, A9, f4
  511. nop
  512. FXCPMADD f8, B4, A9, f8
  513. #ifndef TRMMKERNEL
  514. LFPDUX B2, CO2, INC2
  515. #else
  516. nop
  517. #endif
  518. FXCSMADD f12, B4, A9, f12
  519. FXCPMADD f1, B6, A2, f1
  520. nop
  521. FXCSMADD f5, B6, A2, f5
  522. LFPDUX A4, AO2, INC4
  523. FXCPMADD f9, B4, A2, f9
  524. nop
  525. FXCSMADD f13, B4, A2, f13
  526. nop
  527. FXCPMADD f2, B6, A10, f2
  528. FXCSMADD f6, B6, A10, f6
  529. FXCPMADD f10, B4, A10, f10
  530. FXCSMADD f14, B4, A10, f14
  531. FXCPMADD f3, B6, A4, f3
  532. FXCSMADD f7, B6, A4, f7
  533. FXCPMADD f11, B4, A4, f11
  534. FXCSMADD f15, B4, A4, f15
  535. .align 4
  536. .L14:
  537. li r0, ALPHA
  538. lfpdx AP, SP, r0
  539. #ifdef TRMMKERNEL
  540. li r0, FZERO
  541. lfpsx f30, SP, r0
  542. #endif
  543. #if defined(TRMMKERNEL)
  544. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  545. sub TEMP, K, KK
  546. #elif defined(LEFT)
  547. addi TEMP, KK, 4
  548. #else
  549. addi TEMP, KK, 2
  550. #endif
  551. andi. r0, TEMP, 3
  552. mtspr CTR, r0
  553. ble+ .L18
  554. cmpwi cr0, TEMP, 3
  555. bgt+ .L15
  556. #else
  557. andi. r0, K, 3
  558. mtspr CTR, r0
  559. ble+ .L18
  560. cmpwi cr0, K, 3
  561. bgt+ .L15
  562. #endif
  563. #ifndef TRMMKERNEL
  564. LFPDUX A1, CO1, INC2
  565. fpmr f5, f0
  566. LFPDUX B1, CO1, INC2
  567. fpmr f9, f0
  568. LFPDUX A3, CO1, INC2
  569. fpmr f13, f0
  570. LFPDUX A5, CO1, INC2
  571. fpmr f2, f0
  572. LFPDUX B3, CO2, INC2
  573. fpmr f6, f0
  574. LFPDUX A6, CO2, INC2
  575. fpmr f10, f0
  576. LFPDUX A7, CO2, INC2
  577. fpmr f14, f0
  578. LFPDUX B2, CO2, INC2
  579. fpmr f3, f0
  580. #else
  581. fpmr f5, f0
  582. fpmr f9, f0
  583. fpmr f13, f0
  584. fpmr f2, f0
  585. fpmr f6, f0
  586. fpmr f10, f0
  587. fpmr f14, f0
  588. fpmr f3, f0
  589. #endif
  590. fpmr f7, f0
  591. fpmr f11, f0
  592. fpmr f15, f0
  593. .align 4
  594. .L15:
  595. LFPDUX A2, AO, INC4
  596. LFPDUX A4, AO2, INC4
  597. LFPDUX A10, BO, INC4
  598. LFPDUX B4, BO2, INC4
  599. bdz- .L17
  600. .align 4
  601. .L16:
  602. FXCPMADD f0, A10, A2, f0
  603. FXCSMADD f4, A10, A2, f4
  604. FXCPMADD f8, B4, A2, f8
  605. FXCSMADD f12, B4, A2, f12
  606. LFPDUX A2, AO, INC4
  607. FXCPMADD f1, A10, A4, f1
  608. FXCSMADD f5, A10, A4, f5
  609. FXCPMADD f9, B4, A4, f9
  610. FXCSMADD f13, B4, A4, f13
  611. LFPDUX A4, AO2, INC4
  612. FXCPMADD f2, A10, A2, f2
  613. FXCSMADD f6, A10, A2, f6
  614. FXCPMADD f10, B4, A2, f10
  615. FXCSMADD f14, B4, A2, f14
  616. LFPDUX A2, AO, INC4
  617. FXCPMADD f3, A10, A4, f3
  618. FXCSMADD f7, A10, A4, f7
  619. LFPDUX A10, BO, INC4
  620. FXCPMADD f11, B4, A4, f11
  621. FXCSMADD f15, B4, A4, f15
  622. LFPDUX A4, AO2, INC4
  623. LFPDUX B4, BO2, INC4
  624. bdnz+ .L16
  625. .align 4
  626. .L17:
  627. FXCPMADD f0, A10, A2, f0
  628. FXCSMADD f4, A10, A2, f4
  629. FXCPMADD f8, B4, A2, f8
  630. FXCSMADD f12, B4, A2, f12
  631. LFPDUX A2, AO, INC4
  632. FXCPMADD f1, A10, A4, f1
  633. FXCSMADD f5, A10, A4, f5
  634. FXCPMADD f9, B4, A4, f9
  635. FXCSMADD f13, B4, A4, f13
  636. LFPDUX A4, AO2, INC4
  637. FXCPMADD f2, A10, A2, f2
  638. FXCSMADD f6, A10, A2, f6
  639. FXCPMADD f10, B4, A2, f10
  640. FXCSMADD f14, B4, A2, f14
  641. FXCPMADD f3, A10, A4, f3
  642. FXCSMADD f7, A10, A4, f7
  643. FXCPMADD f11, B4, A4, f11
  644. FXCSMADD f15, B4, A4, f15
  645. .align 4
  646. .L18:
  647. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  648. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  649. fpadd f0, f0, f4
  650. fpadd f8, f8, f12
  651. fpadd f1, f1, f5
  652. fpadd f9, f9, f13
  653. fpadd f2, f2, f6
  654. fpadd f10, f10, f14
  655. fpadd f3, f3, f7
  656. fpadd f11, f11, f15
  657. #else
  658. fpsub f0, f0, f4
  659. fpsub f8, f8, f12
  660. fpsub f1, f1, f5
  661. fpsub f9, f9, f13
  662. fpsub f2, f2, f6
  663. fpsub f10, f10, f14
  664. fpsub f3, f3, f7
  665. fpsub f11, f11, f15
  666. #endif
  667. #ifndef TRMMKERNEL
  668. fxcpmadd A1, f0, AP, A1
  669. fxcpmadd B1, f1, AP, B1
  670. fxcpmadd A3, f2, AP, A3
  671. fxcpmadd A5, f3, AP, A5
  672. fxcxnpma f0, f0, AP, A1
  673. fxcpmadd B3, f8, AP, B3
  674. fxcxnpma f1, f1, AP, B1
  675. fxcpmadd A6, f9, AP, A6
  676. fxcxnpma f2, f2, AP, A3
  677. fxcpmadd A7, f10, AP, A7
  678. fxcxnpma f3, f3, AP, A5
  679. fxcpmadd B2, f11, AP, B2
  680. fxcxnpma f8, f8, AP, B3
  681. STFPDUX f0, CO1, INCM7
  682. fxcxnpma f9, f9, AP, A6
  683. STFPDUX f1, CO1, INC2
  684. fxcxnpma f10, f10, AP, A7
  685. STFPDUX f2, CO1, INC2
  686. fxcxnpma f11, f11, AP, B2
  687. STFPDUX f3, CO1, INC2
  688. STFPDUX f8, CO2, INCM7
  689. STFPDUX f9, CO2, INC2
  690. STFPDUX f10, CO2, INC2
  691. STFPDUX f11, CO2, INC2
  692. #else
  693. fxcpmadd f12, f0, AP, f30
  694. fxcpmadd f13, f1, AP, f30
  695. fxcpmadd f14, f2, AP, f30
  696. fxcpmadd f15, f3, AP, f30
  697. fxcxnpma f0, f0, AP, f12
  698. fxcxnpma f1, f1, AP, f13
  699. fxcxnpma f2, f2, AP, f14
  700. fxcxnpma f3, f3, AP, f15
  701. fxcpmadd f16, f8, AP, f30
  702. fxcpmadd f17, f9, AP, f30
  703. fxcpmadd f18, f10, AP, f30
  704. fxcpmadd f19, f11, AP, f30
  705. fxcxnpma f8, f8, AP, f16
  706. fxcxnpma f9, f9, AP, f17
  707. fxcxnpma f10, f10, AP, f18
  708. fxcxnpma f11, f11, AP, f19
  709. STFPDUX f0, CO1, INC2
  710. STFPDUX f1, CO1, INC2
  711. STFPDUX f2, CO1, INC2
  712. STFPDUX f3, CO1, INC2
  713. STFPDUX f8, CO2, INC2
  714. STFPDUX f9, CO2, INC2
  715. STFPDUX f10, CO2, INC2
  716. STFPDUX f11, CO2, INC2
  717. #endif
  718. #ifdef TRMMKERNEL
  719. #if ( defined(LEFT) && defined(TRANSA)) || \
  720. (!defined(LEFT) && !defined(TRANSA))
  721. sub TEMP, K, KK
  722. #ifdef LEFT
  723. addi TEMP, TEMP, -4
  724. #else
  725. addi TEMP, TEMP, -2
  726. #endif
  727. slwi r0, TEMP, 2 + ZBASE_SHIFT
  728. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  729. add AO, AO, r0
  730. add BO, BO, TEMP
  731. #endif
  732. #ifdef LEFT
  733. addi KK, KK, 4
  734. #endif
  735. #endif
  736. addic. I, I, -1
  737. li r0, FZERO
  738. lfpsx f0, SP, r0
  739. bgt+ .L11
  740. .align 4
  741. .L20:
  742. andi. I, M, 2
  743. beq .L30
  744. #if defined(TRMMKERNEL)
  745. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  746. addi AO2, AO, 2 * SIZE
  747. fpmr f4, f0
  748. addi BO, B, - 4 * SIZE
  749. fpmr f8, f0
  750. addi BO2, B, - 2 * SIZE
  751. fpmr f12, f0
  752. #else
  753. slwi TEMP, KK, 1 + ZBASE_SHIFT
  754. slwi r0, KK, 1 + ZBASE_SHIFT
  755. add AO, AO, TEMP
  756. add BO, B, r0
  757. addi AO2, AO, 2 * SIZE
  758. fpmr f4, f0
  759. addi BO, BO, - 4 * SIZE
  760. fpmr f8, f0
  761. addi BO2, BO, 2 * SIZE
  762. fpmr f12, f0
  763. #endif
  764. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  765. sub TEMP, K, KK
  766. #elif defined(LEFT)
  767. addi TEMP, KK, 2
  768. #else
  769. addi TEMP, KK, 2
  770. #endif
  771. srawi. r0, TEMP, 2
  772. fpmr f1, f0
  773. fpmr f5, f0
  774. fpmr f9, f0
  775. mtspr CTR, r0
  776. fpmr f13, f0
  777. ble .L24
  778. #else
  779. addi AO2, AO, 2 * SIZE
  780. fpmr f4, f0
  781. addi BO, B, - 4 * SIZE
  782. fpmr f8, f0
  783. addi BO2, B, - 2 * SIZE
  784. fpmr f12, f0
  785. srawi. r0, K, 2
  786. fpmr f1, f0
  787. fpmr f5, f0
  788. fpmr f9, f0
  789. mtspr CTR, r0
  790. fpmr f13, f0
  791. ble .L24
  792. #endif
  793. LFPDUX A1, AO, INC4
  794. LFPDUX B1, BO, INC4
  795. LFPDUX A2, AO2, INC4
  796. LFPDUX B2, BO2, INC4
  797. LFPDUX A3, AO, INC4
  798. LFPDUX B3, BO, INC4
  799. LFPDUX A4, AO2, INC4
  800. LFPDUX B4, BO2, INC4
  801. LFPDUX A5, AO, INC4
  802. LFPDUX B5, BO, INC4
  803. LFPDUX A6, AO2, INC4
  804. LFPDUX B6, BO2, INC4
  805. LFPDUX A7, AO, INC4
  806. LFPDUX A9, BO, INC4
  807. LFPDUX A10, BO2, INC4
  808. bdz- .L23
  809. .align 4
  810. .L22:
  811. FXCPMADD f0, B1, A1, f0
  812. nop
  813. FXCSMADD f4, B1, A1, f4
  814. LFPDUX A8, AO2, INC4
  815. FXCPMADD f8, B2, A1, f8
  816. nop
  817. FXCSMADD f12, B2, A1, f12
  818. LFPDUX A1, AO, INC4
  819. FXCPMADD f1, B1, A2, f1
  820. nop
  821. FXCSMADD f5, B1, A2, f5
  822. LFPDUX B1, BO, INC4
  823. FXCPMADD f9, B2, A2, f9
  824. nop
  825. FXCSMADD f13, B2, A2, f13
  826. LFPDUX B2, BO2, INC4
  827. FXCPMADD f0, B3, A3, f0
  828. nop
  829. FXCSMADD f4, B3, A3, f4
  830. LFPDUX A2, AO2, INC4
  831. FXCPMADD f8, B4, A3, f8
  832. nop
  833. FXCSMADD f12, B4, A3, f12
  834. LFPDUX A3, AO, INC4
  835. FXCPMADD f1, B3, A4, f1
  836. nop
  837. FXCSMADD f5, B3, A4, f5
  838. LFPDUX B3, BO, INC4
  839. FXCPMADD f9, B4, A4, f9
  840. nop
  841. FXCSMADD f13, B4, A4, f13
  842. LFPDUX B4, BO2, INC4
  843. FXCPMADD f0, B5, A5, f0
  844. nop
  845. FXCSMADD f4, B5, A5, f4
  846. LFPDUX A4, AO2, INC4
  847. FXCPMADD f8, B6, A5, f8
  848. nop
  849. FXCSMADD f12, B6, A5, f12
  850. LFPDUX A5, AO, INC4
  851. FXCPMADD f1, B5, A6, f1
  852. nop
  853. FXCSMADD f5, B5, A6, f5
  854. LFPDUX B5, BO, INC4
  855. FXCPMADD f9, B6, A6, f9
  856. nop
  857. FXCSMADD f13, B6, A6, f13
  858. LFPDUX B6, BO2, INC4
  859. FXCPMADD f0, A9, A7, f0
  860. nop
  861. FXCSMADD f4, A9, A7, f4
  862. LFPDUX A6, AO2, INC4
  863. FXCPMADD f8, A10, A7, f8
  864. nop
  865. FXCSMADD f12, A10, A7, f12
  866. LFPDUX A7, AO, INC4
  867. FXCPMADD f1, A9, A8, f1
  868. nop
  869. FXCSMADD f5, A9, A8, f5
  870. LFPDUX A9, BO, INC4
  871. FXCPMADD f9, A10, A8, f9
  872. nop
  873. FXCSMADD f13, A10, A8, f13
  874. LFPDUX A10, BO2, INC4
  875. bdnz+ .L22
  876. .align 4
  877. .L23:
  878. FXCPMADD f0, B1, A1, f0
  879. FXCSMADD f4, B1, A1, f4
  880. LFPDUX A8, AO2, INC4
  881. FXCPMADD f8, B2, A1, f8
  882. FXCSMADD f12, B2, A1, f12
  883. FXCPMADD f1, B1, A2, f1
  884. FXCSMADD f5, B1, A2, f5
  885. FXCPMADD f9, B2, A2, f9
  886. FXCSMADD f13, B2, A2, f13
  887. FXCPMADD f0, B3, A3, f0
  888. FXCSMADD f4, B3, A3, f4
  889. FXCPMADD f8, B4, A3, f8
  890. FXCSMADD f12, B4, A3, f12
  891. FXCPMADD f1, B3, A4, f1
  892. FXCSMADD f5, B3, A4, f5
  893. FXCPMADD f9, B4, A4, f9
  894. FXCSMADD f13, B4, A4, f13
  895. FXCPMADD f0, B5, A5, f0
  896. FXCSMADD f4, B5, A5, f4
  897. FXCPMADD f8, B6, A5, f8
  898. FXCSMADD f12, B6, A5, f12
  899. FXCPMADD f1, B5, A6, f1
  900. FXCSMADD f5, B5, A6, f5
  901. FXCPMADD f9, B6, A6, f9
  902. FXCSMADD f13, B6, A6, f13
  903. FXCPMADD f0, A9, A7, f0
  904. FXCSMADD f4, A9, A7, f4
  905. FXCPMADD f8, A10, A7, f8
  906. FXCSMADD f12, A10, A7, f12
  907. FXCPMADD f1, A9, A8, f1
  908. FXCSMADD f5, A9, A8, f5
  909. FXCPMADD f9, A10, A8, f9
  910. FXCSMADD f13, A10, A8, f13
  911. .align 4
  912. .L24:
  913. li r0, ALPHA
  914. lfpdx AP, SP, r0
  915. #ifdef TRMMKERNEL
  916. li r0, FZERO
  917. lfpsx f30, SP, r0
  918. #endif
  919. #if defined(TRMMKERNEL)
  920. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  921. sub TEMP, K, KK
  922. #elif defined(LEFT)
  923. addi TEMP, KK, 2
  924. #else
  925. addi TEMP, KK, 2
  926. #endif
  927. andi. r0, TEMP, 3
  928. mtspr CTR, r0
  929. #else
  930. andi. r0, K, 3
  931. mtspr CTR, r0
  932. #endif
  933. ble+ .L28
  934. LFPDUX A1, AO, INC4
  935. LFPDUX A2, AO2, INC4
  936. LFPDUX B1, BO, INC4
  937. LFPDUX B2, BO2, INC4
  938. bdz- .L27
  939. .align 4
  940. .L26:
  941. FXCPMADD f0, B1, A1, f0
  942. FXCSMADD f4, B1, A1, f4
  943. FXCPMADD f8, B2, A1, f8
  944. FXCSMADD f12, B2, A1, f12
  945. LFPDUX A1, AO, INC4
  946. FXCPMADD f1, B1, A2, f1
  947. FXCSMADD f5, B1, A2, f5
  948. LFPDUX B1, BO, INC4
  949. FXCPMADD f9, B2, A2, f9
  950. FXCSMADD f13, B2, A2, f13
  951. LFPDUX A2, AO2, INC4
  952. LFPDUX B2, BO2, INC4
  953. bdnz+ .L26
  954. .align 4
  955. .L27:
  956. FXCPMADD f0, B1, A1, f0
  957. FXCSMADD f4, B1, A1, f4
  958. FXCPMADD f8, B2, A1, f8
  959. FXCSMADD f12, B2, A1, f12
  960. FXCPMADD f1, B1, A2, f1
  961. FXCSMADD f5, B1, A2, f5
  962. FXCPMADD f9, B2, A2, f9
  963. FXCSMADD f13, B2, A2, f13
  964. .align 4
  965. .L28:
  966. #ifndef TRMMKERNEL
  967. LFPDUX A1, CO1, INC2
  968. LFPDUX A2, CO1, INC2
  969. LFPDUX A3, CO2, INC2
  970. LFPDUX A4, CO2, INC2
  971. #endif
  972. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  973. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  974. fpadd f0, f0, f4
  975. fpadd f8, f8, f12
  976. fpadd f1, f1, f5
  977. fpadd f9, f9, f13
  978. #else
  979. fpsub f0, f0, f4
  980. fpsub f8, f8, f12
  981. fpsub f1, f1, f5
  982. fpsub f9, f9, f13
  983. #endif
  984. #ifndef TRMMKERNEL
  985. fxcpmadd A1, f0, AP, A1
  986. fxcpmadd A2, f1, AP, A2
  987. fxcpmadd A3, f8, AP, A3
  988. fxcpmadd A4, f9, AP, A4
  989. fxcxnpma f0, f0, AP, A1
  990. fxcxnpma f1, f1, AP, A2
  991. fxcxnpma f8, f8, AP, A3
  992. fxcxnpma f9, f9, AP, A4
  993. STFPDUX f0, CO1, INCM3
  994. STFPDUX f1, CO1, INC2
  995. STFPDUX f8, CO2, INCM3
  996. STFPDUX f9, CO2, INC2
  997. #else
  998. fxcpmadd f12, f0, AP, f30
  999. fxcpmadd f13, f1, AP, f30
  1000. fxcpmadd f14, f8, AP, f30
  1001. fxcpmadd f15, f9, AP, f30
  1002. fxcxnpma f0, f0, AP, f12
  1003. fxcxnpma f1, f1, AP, f13
  1004. fxcxnpma f8, f8, AP, f14
  1005. fxcxnpma f9, f9, AP, f15
  1006. STFPDUX f0, CO1, INC2
  1007. STFPDUX f1, CO1, INC2
  1008. STFPDUX f8, CO2, INC2
  1009. STFPDUX f9, CO2, INC2
  1010. #endif
  1011. #ifdef TRMMKERNEL
  1012. #if ( defined(LEFT) && defined(TRANSA)) || \
  1013. (!defined(LEFT) && !defined(TRANSA))
  1014. sub TEMP, K, KK
  1015. #ifdef LEFT
  1016. addi TEMP, TEMP, -2
  1017. #else
  1018. addi TEMP, TEMP, -2
  1019. #endif
  1020. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1021. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1022. add AO, AO, r0
  1023. add BO, BO, TEMP
  1024. #endif
  1025. #ifdef LEFT
  1026. addi KK, KK, 2
  1027. #endif
  1028. #endif
  1029. li r0, FZERO
  1030. lfpsx f0, SP, r0
  1031. .align 4
  1032. .L30:
  1033. andi. I, M, 1
  1034. beq .L49
  1035. #if defined(TRMMKERNEL)
  1036. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1037. addi AO2, AO, 2 * SIZE
  1038. fpmr f1, f0
  1039. addi BO, B, - 4 * SIZE
  1040. fpmr f2, f0
  1041. addi BO2, B, - 2 * SIZE
  1042. fpmr f3, f0
  1043. #else
  1044. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1045. slwi r0, KK, 1 + ZBASE_SHIFT
  1046. add AO, AO, TEMP
  1047. add BO, B, r0
  1048. addi AO2, AO, 2 * SIZE
  1049. fpmr f1, f0
  1050. addi BO, BO, - 4 * SIZE
  1051. fpmr f2, f0
  1052. addi BO2, BO, 2 * SIZE
  1053. fpmr f3, f0
  1054. #endif
  1055. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1056. sub TEMP, K, KK
  1057. #elif defined(LEFT)
  1058. addi TEMP, KK, 1
  1059. #else
  1060. addi TEMP, KK, 2
  1061. #endif
  1062. srawi. r0, TEMP, 2
  1063. mtspr CTR, r0
  1064. ble .L34
  1065. #else
  1066. addi AO2, AO, 2 * SIZE
  1067. fpmr f1, f0
  1068. addi BO, B, - 4 * SIZE
  1069. fpmr f2, f0
  1070. addi BO2, B, - 2 * SIZE
  1071. fpmr f3, f0
  1072. srawi. r0, K, 2
  1073. mtspr CTR, r0
  1074. ble .L34
  1075. #endif
  1076. LFPDUX A1, AO, INC4
  1077. LFPDUX B1, BO, INC4
  1078. LFPDUX B2, BO2, INC4
  1079. LFPDUX A2, AO2, INC4
  1080. LFPDUX B3, BO, INC4
  1081. LFPDUX B4, BO2, INC4
  1082. LFPDUX A3, AO, INC4
  1083. LFPDUX A5, BO, INC4
  1084. LFPDUX A6, BO2, INC4
  1085. LFPDUX A4, AO2, INC4
  1086. LFPDUX A7, BO, INC4
  1087. LFPDUX A8, BO2, INC4
  1088. bdz- .L33
  1089. .align 4
  1090. .L32:
  1091. FXCPMADD f0, B1, A1, f0
  1092. FXCSMADD f1, B1, A1, f1
  1093. LFPDUX B1, BO, INC4
  1094. FXCPMADD f2, B2, A1, f2
  1095. FXCSMADD f3, B2, A1, f3
  1096. LFPDUX B2, BO2, INC4
  1097. LFPDUX A1, AO, INC4
  1098. FXCPMADD f0, B3, A2, f0
  1099. FXCSMADD f1, B3, A2, f1
  1100. LFPDUX B3, BO, INC4
  1101. FXCPMADD f2, B4, A2, f2
  1102. FXCSMADD f3, B4, A2, f3
  1103. LFPDUX B4, BO2, INC4
  1104. LFPDUX A2, AO2, INC4
  1105. FXCPMADD f0, A5, A3, f0
  1106. FXCSMADD f1, A5, A3, f1
  1107. LFPDUX A5, BO, INC4
  1108. FXCPMADD f2, A6, A3, f2
  1109. FXCSMADD f3, A6, A3, f3
  1110. LFPDUX A6, BO2, INC4
  1111. LFPDUX A3, AO, INC4
  1112. FXCPMADD f0, A7, A4, f0
  1113. FXCSMADD f1, A7, A4, f1
  1114. LFPDUX A7, BO, INC4
  1115. FXCPMADD f2, A8, A4, f2
  1116. FXCSMADD f3, A8, A4, f3
  1117. LFPDUX A8, BO2, INC4
  1118. LFPDUX A4, AO2, INC4
  1119. bdnz+ .L32
  1120. .align 4
  1121. .L33:
  1122. FXCPMADD f0, B1, A1, f0
  1123. FXCSMADD f1, B1, A1, f1
  1124. FXCPMADD f2, B2, A1, f2
  1125. FXCSMADD f3, B2, A1, f3
  1126. FXCPMADD f0, B3, A2, f0
  1127. FXCSMADD f1, B3, A2, f1
  1128. FXCPMADD f2, B4, A2, f2
  1129. FXCSMADD f3, B4, A2, f3
  1130. FXCPMADD f0, A5, A3, f0
  1131. FXCSMADD f1, A5, A3, f1
  1132. FXCPMADD f2, A6, A3, f2
  1133. FXCSMADD f3, A6, A3, f3
  1134. FXCPMADD f0, A7, A4, f0
  1135. FXCSMADD f1, A7, A4, f1
  1136. FXCPMADD f2, A8, A4, f2
  1137. FXCSMADD f3, A8, A4, f3
  1138. .align 4
  1139. .L34:
  1140. li r0, ALPHA
  1141. lfpdx AP, SP, r0
  1142. #ifdef TRMMKERNEL
  1143. li r0, FZERO
  1144. lfpsx f30, SP, r0
  1145. #endif
  1146. #if defined(TRMMKERNEL)
  1147. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1148. sub TEMP, K, KK
  1149. #elif defined(LEFT)
  1150. addi TEMP, KK, 1
  1151. #else
  1152. addi TEMP, KK, 2
  1153. #endif
  1154. andi. r0, TEMP, 3
  1155. mtspr CTR, r0
  1156. #else
  1157. andi. r0, K, 3
  1158. mtspr CTR, r0
  1159. #endif
  1160. ble+ .L38
  1161. LFPDX A1, AO, INC4
  1162. LFPDUX B1, BO, INC4
  1163. LFPDUX B2, BO2, INC4
  1164. add AO, AO, INC2
  1165. bdz- .L37
  1166. .align 4
  1167. .L36:
  1168. FXCPMADD f0, B1, A1, f0
  1169. FXCSMADD f1, B1, A1, f1
  1170. LFPDUX B1, BO, INC4
  1171. FXCPMADD f2, B2, A1, f2
  1172. FXCSMADD f3, B2, A1, f3
  1173. LFPDX A1, AO, INC4
  1174. LFPDUX B2, BO2, INC4
  1175. add AO, AO, INC2
  1176. bdnz+ .L36
  1177. .align 4
  1178. .L37:
  1179. FXCPMADD f0, B1, A1, f0
  1180. FXCSMADD f1, B1, A1, f1
  1181. FXCPMADD f2, B2, A1, f2
  1182. FXCSMADD f3, B2, A1, f3
  1183. .align 4
  1184. .L38: /* write-back: fold accumulator pairs, combine with AP, store one value through CO1 and one through CO2 */
  1185. #ifndef TRMMKERNEL
  1186. LFPDX A1, CO1, INC2 /* non-TRMM build: fetch the current C values (no pointer update) */
  1187. LFPDX A2, CO2, INC2
  1188. #endif
  1189. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1190. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1191. fpadd f0, f0, f1 /* add or subtract of the paired accumulators is selected by the NN/NT/... variant macros */
  1192. fpadd f2, f2, f3
  1193. #else
  1194. fpsub f0, f0, f1
  1195. fpsub f2, f2, f3
  1196. #endif
  1197. #ifndef TRMMKERNEL
  1198. fxcpmadd A1, f0, AP, A1 /* two-step update with AP on top of the loaded C value (presumably the complex scaling by alpha - confirm against the FP2 ISA) */
  1199. fxcpmadd A2, f2, AP, A2
  1200. fxcxnpma f0, f0, AP, A1
  1201. fxcxnpma f2, f2, AP, A2
  1202. #else
  1203. fxcpmadd f12, f0, AP, f30 /* TRMM build: same two-step update, but the addend is f30 instead of a loaded C value */
  1204. fxcpmadd f13, f2, AP, f30
  1205. fxcxnpma f0, f0, AP, f12
  1206. fxcxnpma f2, f2, AP, f13
  1207. #endif
  1208. STFPDUX f0, CO1, INC2 /* store with update: CO1/CO2 move on by INC2 */
  1209. STFPDUX f2, CO2, INC2
  1210. #ifdef TRMMKERNEL
  1211. #if ( defined(LEFT) && defined(TRANSA)) || \
  1212. (!defined(LEFT) && !defined(TRANSA))
  1213. sub TEMP, K, KK
  1214. #ifdef LEFT
  1215. addi TEMP, TEMP, -1
  1216. #else
  1217. addi TEMP, TEMP, -2
  1218. #endif
  1219. slwi r0, TEMP, 0 + ZBASE_SHIFT
  1220. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  1221. add AO, AO, r0 /* advance AO by TEMP << ZBASE_SHIFT */
  1222. add BO, BO, TEMP /* advance BO by TEMP << (1 + ZBASE_SHIFT) */
  1223. #endif
  1224. #ifdef LEFT
  1225. addi KK, KK, 1
  1226. #endif
  1227. #endif
  1228. li r0, FZERO
  1229. lfpsx f0, SP, r0 /* reload f0 from SP + FZERO so later code can copy it into fresh accumulators */
  1230. .align 4
  1231. .L49: /* tail of the J loop; its head, .L10, is outside this view */
  1232. #if defined(TRMMKERNEL) && !defined(LEFT)
  1233. addi KK, KK, 2 /* KK += 2 (presumably the two columns just processed - confirm) */
  1234. #endif
  1235. addi B, BO, 4 * SIZE /* B <- BO + 4*SIZE for the next pass */
  1236. addic. J, J, -1
  1237. bgt+ .L10 /* repeat while J > 0 */
  1238. .align 4
  1239. .L50: /* remaining single column: entered only when N & 1 is set */
  1240. andi. J, N, 1
  1241. beq .L999 /* N even: nothing left, go to the exit sequence */
  1242. mr CO1, C
  1243. #if defined(TRMMKERNEL) && defined(LEFT)
  1244. mr KK, OFFSET
  1245. #endif
  1246. addi AO, A, -2 * SIZE /* AO starts 2*SIZE below A; presumably compensates for the update-form loads with INC2 - confirm */
  1247. li r0, FZERO
  1248. lfpsx f0, SP, r0 /* f0 <- value at SP + FZERO */
  1249. srawi. I, M, 2 /* I = M >> 2 */
  1250. ble .L60 /* no group of four rows: continue with the M & 2 case */
  1251. .align 4
  1252. .L51: /* per-tile setup for the I loop (I = M >> 2): init f1-f7 from f0, position BO (and AO under TRMM), set CTR, preload operands */
  1253. #if defined(TRMMKERNEL)
  1254. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1255. fpmr f4, f0 /* copy f0 into the other accumulators */
  1256. addi BO, B, - 2 * SIZE
  1257. fpmr f1, f0
  1258. fpmr f5, f0
  1259. fpmr f2, f0
  1260. fpmr f6, f0
  1261. #else
  1262. slwi TEMP, KK, 2 + ZBASE_SHIFT
  1263. slwi r0, KK, 0 + ZBASE_SHIFT
  1264. add AO, AO, TEMP /* AO += KK << (2 + ZBASE_SHIFT) */
  1265. add BO, B, r0 /* BO = B + (KK << ZBASE_SHIFT), biased by -2*SIZE two lines below */
  1266. fpmr f4, f0
  1267. addi BO, BO, - 2 * SIZE
  1268. fpmr f1, f0
  1269. fpmr f5, f0
  1270. fpmr f2, f0
  1271. fpmr f6, f0
  1272. #endif
  1273. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1274. sub TEMP, K, KK
  1275. #elif defined(LEFT)
  1276. addi TEMP, KK, 4
  1277. #else
  1278. addi TEMP, KK, 1
  1279. #endif
  1280. srawi. r0, TEMP, 2 /* CTR = TEMP >> 2 passes of the unrolled loop */
  1281. fpmr f3, f0
  1282. mtspr CTR, r0
  1283. fpmr f7, f0
  1284. ble .L54 /* no full pass: go to the remainder code */
  1285. #else
  1286. srawi. r0, K, 2 /* CTR = K >> 2 passes of the unrolled loop */
  1287. fpmr f4, f0
  1288. addi BO, B, - 2 * SIZE
  1289. fpmr f1, f0
  1290. fpmr f5, f0
  1291. fpmr f2, f0
  1292. fpmr f6, f0
  1293. fpmr f3, f0
  1294. mtspr CTR, r0
  1295. fpmr f7, f0
  1296. ble .L54 /* no full pass: go to the remainder code */
  1297. #endif
  1298. LFPDUX B1, BO, INC2 /* preload B1-B3 and A1-A8 for the first pass */
  1299. LFPDUX A1, AO, INC2
  1300. LFPDUX A2, AO, INC2
  1301. LFPDUX B2, BO, INC2
  1302. LFPDUX A3, AO, INC2
  1303. LFPDUX A4, AO, INC2
  1304. LFPDUX B3, BO, INC2
  1305. LFPDUX A5, AO, INC2
  1306. LFPDUX A6, AO, INC2
  1307. LFPDUX A7, AO, INC2
  1308. LFPDUX A8, AO, INC2
  1309. bdz- .L53 /* CTR was 1: only the final pass at .L53 is needed */
  1310. .align 4
  1311. .L52: /* unrolled loop: one pass consumes B1, B2, B3, B4 in turn against A1-A4 / A5-A8, accumulating into f0-f7 and reloading operands for the next pass */
  1312. FXCPMADD f0, B1, A1, f0
  1313. LFPDUX B4, BO, INC2 /* B4 is the only B operand not preloaded before the loop */
  1314. FXCSMADD f4, B1, A1, f4
  1315. LFPDUX A1, AO, INC2
  1316. FXCPMADD f1, B1, A2, f1
  1317. nop
  1318. FXCSMADD f5, B1, A2, f5
  1319. LFPDUX A2, AO, INC2
  1320. FXCPMADD f2, B1, A3, f2
  1321. nop
  1322. FXCSMADD f6, B1, A3, f6
  1323. LFPDUX A3, AO, INC2
  1324. FXCPMADD f3, B1, A4, f3
  1325. nop
  1326. FXCSMADD f7, B1, A4, f7
  1327. LFPDUX A4, AO, INC2
  1328. FXCPMADD f0, B2, A5, f0 /* second step of the pass: B2 against A5-A8 */
  1329. LFPDUX B1, BO, INC2
  1330. FXCSMADD f4, B2, A5, f4
  1331. LFPDUX A5, AO, INC2
  1332. FXCPMADD f1, B2, A6, f1
  1333. nop
  1334. FXCSMADD f5, B2, A6, f5
  1335. LFPDUX A6, AO, INC2
  1336. FXCPMADD f2, B2, A7, f2
  1337. nop
  1338. FXCSMADD f6, B2, A7, f6
  1339. LFPDUX A7, AO, INC2
  1340. FXCPMADD f3, B2, A8, f3
  1341. nop
  1342. FXCSMADD f7, B2, A8, f7
  1343. LFPDUX A8, AO, INC2
  1344. FXCPMADD f0, B3, A1, f0 /* third step: B3 against the reloaded A1-A4 */
  1345. LFPDUX B2, BO, INC2
  1346. FXCSMADD f4, B3, A1, f4
  1347. LFPDUX A1, AO, INC2
  1348. FXCPMADD f1, B3, A2, f1
  1349. nop
  1350. FXCSMADD f5, B3, A2, f5
  1351. LFPDUX A2, AO, INC2
  1352. FXCPMADD f2, B3, A3, f2
  1353. nop
  1354. FXCSMADD f6, B3, A3, f6
  1355. LFPDUX A3, AO, INC2
  1356. FXCPMADD f3, B3, A4, f3
  1357. nop
  1358. FXCSMADD f7, B3, A4, f7
  1359. LFPDUX A4, AO, INC2
  1360. FXCPMADD f0, B4, A5, f0 /* fourth step: B4 against the reloaded A5-A8 */
  1361. LFPDUX B3, BO, INC2
  1362. FXCSMADD f4, B4, A5, f4
  1363. LFPDUX A5, AO, INC2
  1364. FXCPMADD f1, B4, A6, f1
  1365. nop
  1366. FXCSMADD f5, B4, A6, f5
  1367. LFPDUX A6, AO, INC2
  1368. FXCPMADD f2, B4, A7, f2
  1369. nop
  1370. FXCSMADD f6, B4, A7, f6
  1371. LFPDUX A7, AO, INC2
  1372. FXCPMADD f3, B4, A8, f3
  1373. nop
  1374. FXCSMADD f7, B4, A8, f7
  1375. LFPDUX A8, AO, INC2
  1376. bdnz+ .L52
  1377. .align 4
  1378. .L53: /* final unrolled pass: same arithmetic as .L52, but B1-B3 are not reloaded and the last two steps issue no loads */
  1379. FXCPMADD f0, B1, A1, f0
  1380. LFPDUX B4, BO, INC2
  1381. FXCSMADD f4, B1, A1, f4
  1382. LFPDUX A1, AO, INC2
  1383. FXCPMADD f1, B1, A2, f1
  1384. nop
  1385. FXCSMADD f5, B1, A2, f5
  1386. LFPDUX A2, AO, INC2
  1387. FXCPMADD f2, B1, A3, f2
  1388. nop
  1389. FXCSMADD f6, B1, A3, f6
  1390. LFPDUX A3, AO, INC2
  1391. FXCPMADD f3, B1, A4, f3
  1392. nop
  1393. FXCSMADD f7, B1, A4, f7
  1394. LFPDUX A4, AO, INC2
  1395. FXCPMADD f0, B2, A5, f0
  1396. nop
  1397. FXCSMADD f4, B2, A5, f4
  1398. LFPDUX A5, AO, INC2
  1399. FXCPMADD f1, B2, A6, f1
  1400. nop
  1401. FXCSMADD f5, B2, A6, f5
  1402. LFPDUX A6, AO, INC2
  1403. FXCPMADD f2, B2, A7, f2
  1404. nop
  1405. FXCSMADD f6, B2, A7, f6
  1406. LFPDUX A7, AO, INC2
  1407. FXCPMADD f3, B2, A8, f3
  1408. nop
  1409. FXCSMADD f7, B2, A8, f7
  1410. LFPDUX A8, AO, INC2
  1411. FXCPMADD f0, B3, A1, f0 /* from here on only arithmetic: all operands are already loaded */
  1412. FXCSMADD f4, B3, A1, f4
  1413. FXCPMADD f1, B3, A2, f1
  1414. FXCSMADD f5, B3, A2, f5
  1415. FXCPMADD f2, B3, A3, f2
  1416. FXCSMADD f6, B3, A3, f6
  1417. FXCPMADD f3, B3, A4, f3
  1418. FXCSMADD f7, B3, A4, f7
  1419. FXCPMADD f0, B4, A5, f0
  1420. FXCSMADD f4, B4, A5, f4
  1421. FXCPMADD f1, B4, A6, f1
  1422. FXCSMADD f5, B4, A6, f5
  1423. FXCPMADD f2, B4, A7, f2
  1424. FXCSMADD f6, B4, A7, f6
  1425. FXCPMADD f3, B4, A8, f3
  1426. FXCSMADD f7, B4, A8, f7
  1427. .align 4
  1428. .L54: /* K remainder for the tile using A1-A4: (K & 3), or (TEMP & 3) under TRMMKERNEL, single-step iterations */
  1429. li r0, ALPHA
  1430. lfpdx AP, SP, r0 /* AP <- pair loaded from SP + ALPHA */
  1431. #ifdef TRMMKERNEL
  1432. li r0, FZERO
  1433. lfpsx f30, SP, r0 /* f30 <- value at SP + FZERO; the TRMM write-back at .L58 uses it as the addend */
  1434. #endif
  1435. #if defined(TRMMKERNEL)
  1436. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1437. sub TEMP, K, KK
  1438. #elif defined(LEFT)
  1439. addi TEMP, KK, 4
  1440. #else
  1441. addi TEMP, KK, 1
  1442. #endif
  1443. andi. r0, TEMP, 3
  1444. mtspr CTR, r0
  1445. #else
  1446. andi. r0, K, 3
  1447. mtspr CTR, r0 /* CTR = number of leftover iterations */
  1448. #endif
  1449. ble+ .L58 /* remainder is zero: go straight to the write-back */
  1450. LFPDUX A1, AO, INC2 /* load operands for the first iteration */
  1451. LFPDUX B1, BO, INC2
  1452. LFPDUX A2, AO, INC2
  1453. LFPDUX A3, AO, INC2
  1454. LFPDUX A4, AO, INC2
  1455. bdz- .L57 /* CTR was 1: skip the loop, .L57 consumes the operands just loaded */
  1456. .align 4
  1457. .L56: /* loop body: one B operand against A1-A4, reloading each operand after its last use */
  1458. FXCPMADD f0, B1, A1, f0
  1459. FXCSMADD f4, B1, A1, f4
  1460. LFPDUX A1, AO, INC2
  1461. FXCPMADD f1, B1, A2, f1
  1462. FXCSMADD f5, B1, A2, f5
  1463. LFPDUX A2, AO, INC2
  1464. FXCPMADD f2, B1, A3, f2
  1465. FXCSMADD f6, B1, A3, f6
  1466. LFPDUX A3, AO, INC2
  1467. FXCPMADD f3, B1, A4, f3
  1468. FXCSMADD f7, B1, A4, f7
  1469. LFPDUX A4, AO, INC2
  1470. LFPDUX B1, BO, INC2
  1471. bdnz+ .L56
  1472. .align 4
  1473. .L57: /* last iteration: operands are already in registers, no further loads */
  1474. FXCPMADD f0, B1, A1, f0
  1475. FXCSMADD f4, B1, A1, f4
  1476. FXCPMADD f1, B1, A2, f1
  1477. FXCSMADD f5, B1, A2, f5
  1478. FXCPMADD f2, B1, A3, f2
  1479. FXCSMADD f6, B1, A3, f6
  1480. FXCPMADD f3, B1, A4, f3
  1481. FXCSMADD f7, B1, A4, f7
  1482. .align 4
  1483. .L58: /* write-back: fold f4-f7 into f0-f3, combine with AP, store four values through CO1, then close the I loop */
  1484. #ifndef TRMMKERNEL
  1485. LFPDUX A1, CO1, INC2 /* non-TRMM build: fetch four current C values; each load also advances CO1 */
  1486. LFPDUX A2, CO1, INC2
  1487. LFPDUX A3, CO1, INC2
  1488. LFPDUX A4, CO1, INC2
  1489. #endif
  1490. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1491. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1492. fpadd f0, f0, f4 /* add or subtract of the paired accumulators is selected by the NN/NT/... variant macros */
  1493. fpadd f1, f1, f5
  1494. fpadd f2, f2, f6
  1495. fpadd f3, f3, f7
  1496. #else
  1497. fpsub f0, f0, f4
  1498. fpsub f1, f1, f5
  1499. fpsub f2, f2, f6
  1500. fpsub f3, f3, f7
  1501. #endif
  1502. #ifndef TRMMKERNEL
  1503. fxcpmadd A1, f0, AP, A1 /* two-step update with AP on top of the loaded C values (presumably the complex scaling by alpha - confirm against the FP2 ISA) */
  1504. fxcpmadd A2, f1, AP, A2
  1505. fxcpmadd A3, f2, AP, A3
  1506. fxcpmadd A4, f3, AP, A4
  1507. fxcxnpma f0, f0, AP, A1
  1508. fxcxnpma f1, f1, AP, A2
  1509. fxcxnpma f2, f2, AP, A3
  1510. fxcxnpma f3, f3, AP, A4
  1511. STFPDUX f0, CO1, INCM7 /* first store steps CO1 by INCM7 (set outside this view; presumably rewinds past the four loads - confirm) */
  1512. STFPDUX f1, CO1, INC2
  1513. STFPDUX f2, CO1, INC2
  1514. STFPDUX f3, CO1, INC2
  1515. #else
  1516. fxcpmadd f12, f0, AP, f30 /* TRMM build: the addend is f30 instead of a loaded C value */
  1517. fxcpmadd f13, f1, AP, f30
  1518. fxcpmadd f14, f2, AP, f30
  1519. fxcpmadd f15, f3, AP, f30
  1520. fxcxnpma f0, f0, AP, f12
  1521. fxcxnpma f1, f1, AP, f13
  1522. fxcxnpma f2, f2, AP, f14
  1523. fxcxnpma f3, f3, AP, f15
  1524. STFPDUX f0, CO1, INC2 /* no C loads happened on this path, so every store simply advances by INC2 */
  1525. STFPDUX f1, CO1, INC2
  1526. STFPDUX f2, CO1, INC2
  1527. STFPDUX f3, CO1, INC2
  1528. #endif
  1529. #ifdef TRMMKERNEL
  1530. #if ( defined(LEFT) && defined(TRANSA)) || \
  1531. (!defined(LEFT) && !defined(TRANSA))
  1532. sub TEMP, K, KK
  1533. #ifdef LEFT
  1534. addi TEMP, TEMP, -4
  1535. #else
  1536. addi TEMP, TEMP, -1
  1537. #endif
  1538. slwi r0, TEMP, 2 + ZBASE_SHIFT
  1539. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1540. add AO, AO, r0 /* advance AO by TEMP << (2 + ZBASE_SHIFT) */
  1541. add BO, BO, TEMP /* advance BO by TEMP << ZBASE_SHIFT */
  1542. #endif
  1543. #ifdef LEFT
  1544. addi KK, KK, 4
  1545. #endif
  1546. #endif
  1547. addic. I, I, -1 /* sets CR0 for the bgt below; the intervening li/lfpsx do not touch CR0 */
  1548. li r0, FZERO
  1549. lfpsx f0, SP, r0 /* reload f0 from SP + FZERO for the next tile */
  1550. bgt+ .L51 /* repeat while I > 0 */
  1551. .align 4
  1552. .L60: /* M & 2 case: set up the tile using accumulators f0-f3, then preload operands */
  1553. andi. I, M, 2
  1554. beq .L70 /* bit 1 of M clear: continue with the M & 1 case */
  1555. #if defined(TRMMKERNEL)
  1556. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1557. addi BO, B, - 2 * SIZE
  1558. fpmr f1, f0 /* copy f0 into the other accumulators */
  1559. #else
  1560. slwi TEMP, KK, 1 + ZBASE_SHIFT
  1561. slwi r0, KK, 0 + ZBASE_SHIFT
  1562. add AO, AO, TEMP /* AO += KK << (1 + ZBASE_SHIFT) */
  1563. add BO, B, r0 /* BO = B + (KK << ZBASE_SHIFT), biased by -2*SIZE on the next line */
  1564. addi BO, BO, - 2 * SIZE
  1565. fpmr f1, f0
  1566. #endif
  1567. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1568. sub TEMP, K, KK
  1569. #elif defined(LEFT)
  1570. addi TEMP, KK, 2
  1571. #else
  1572. addi TEMP, KK, 1
  1573. #endif
  1574. srawi. r0, TEMP, 2 /* CTR = TEMP >> 2 passes of the unrolled loop */
  1575. fpmr f2, f0
  1576. mtspr CTR, r0
  1577. fpmr f3, f0
  1578. ble .L64 /* no full pass: go to the remainder code */
  1579. #else
  1580. srawi. r0, K, 2 /* CTR = K >> 2 passes of the unrolled loop */
  1581. fpmr f1, f0
  1582. addi BO, B, - 2 * SIZE
  1583. fpmr f2, f0
  1584. mtspr CTR, r0
  1585. fpmr f3, f0
  1586. ble .L64 /* no full pass: go to the remainder code */
  1587. #endif
  1588. LFPDUX B1, BO, INC2 /* preload B1-B4 and A1-A8 for the first pass */
  1589. LFPDUX A1, AO, INC2
  1590. LFPDUX A2, AO, INC2
  1591. LFPDUX B2, BO, INC2
  1592. LFPDUX A3, AO, INC2
  1593. LFPDUX A4, AO, INC2
  1594. LFPDUX B3, BO, INC2
  1595. LFPDUX A5, AO, INC2
  1596. LFPDUX A6, AO, INC2
  1597. LFPDUX B4, BO, INC2
  1598. LFPDUX A7, AO, INC2
  1599. LFPDUX A8, AO, INC2
  1600. bdz- .L63 /* CTR was 1: only the final pass at .L63 is needed */
  1601. .align 4
  1602. .L62: /* unrolled loop: B1..B4 each paired with two A operands, accumulating into f0-f3 and reloading every operand after its last use */
  1603. FXCPMADD f0, B1, A1, f0
  1604. FXCSMADD f2, B1, A1, f2
  1605. LFPDUX A1, AO, INC2
  1606. FXCPMADD f1, B1, A2, f1
  1607. FXCSMADD f3, B1, A2, f3
  1608. LFPDUX A2, AO, INC2
  1609. LFPDUX B1, BO, INC2
  1610. FXCPMADD f0, B2, A3, f0
  1611. FXCSMADD f2, B2, A3, f2
  1612. LFPDUX A3, AO, INC2
  1613. FXCPMADD f1, B2, A4, f1
  1614. FXCSMADD f3, B2, A4, f3
  1615. LFPDUX A4, AO, INC2
  1616. LFPDUX B2, BO, INC2
  1617. FXCPMADD f0, B3, A5, f0
  1618. FXCSMADD f2, B3, A5, f2
  1619. LFPDUX A5, AO, INC2
  1620. FXCPMADD f1, B3, A6, f1
  1621. FXCSMADD f3, B3, A6, f3
  1622. LFPDUX A6, AO, INC2
  1623. LFPDUX B3, BO, INC2
  1624. FXCPMADD f0, B4, A7, f0
  1625. FXCSMADD f2, B4, A7, f2
  1626. LFPDUX A7, AO, INC2
  1627. FXCPMADD f1, B4, A8, f1
  1628. FXCSMADD f3, B4, A8, f3
  1629. LFPDUX A8, AO, INC2
  1630. LFPDUX B4, BO, INC2
  1631. bdnz+ .L62
  1632. .align 4
  1633. .L63: /* final pass: same arithmetic as .L62 with no loads */
  1634. FXCPMADD f0, B1, A1, f0
  1635. FXCSMADD f2, B1, A1, f2
  1636. FXCPMADD f1, B1, A2, f1
  1637. FXCSMADD f3, B1, A2, f3
  1638. FXCPMADD f0, B2, A3, f0
  1639. FXCSMADD f2, B2, A3, f2
  1640. FXCPMADD f1, B2, A4, f1
  1641. FXCSMADD f3, B2, A4, f3
  1642. FXCPMADD f0, B3, A5, f0
  1643. FXCSMADD f2, B3, A5, f2
  1644. FXCPMADD f1, B3, A6, f1
  1645. FXCSMADD f3, B3, A6, f3
  1646. FXCPMADD f0, B4, A7, f0
  1647. FXCSMADD f2, B4, A7, f2
  1648. FXCPMADD f1, B4, A8, f1
  1649. FXCSMADD f3, B4, A8, f3
  1650. .align 4
  1651. .L64: /* K remainder for the tile using A1-A2: (K & 3), or (TEMP & 3) under TRMMKERNEL, single-step iterations */
  1652. li r0, ALPHA
  1653. lfpdx AP, SP, r0 /* AP <- pair loaded from SP + ALPHA */
  1654. #ifdef TRMMKERNEL
  1655. li r0, FZERO
  1656. lfpsx f30, SP, r0 /* f30 <- value at SP + FZERO; the TRMM write-back at .L68 uses it as the addend */
  1657. #endif
  1658. #if defined(TRMMKERNEL)
  1659. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1660. sub TEMP, K, KK
  1661. #elif defined(LEFT)
  1662. addi TEMP, KK, 2
  1663. #else
  1664. addi TEMP, KK, 1
  1665. #endif
  1666. andi. r0, TEMP, 3
  1667. mtspr CTR, r0
  1668. #else
  1669. andi. r0, K, 3
  1670. mtspr CTR, r0 /* CTR = number of leftover iterations */
  1671. #endif
  1672. ble+ .L68 /* remainder is zero: go straight to the write-back */
  1673. LFPDUX A1, AO, INC2 /* load operands for the first iteration */
  1674. LFPDUX B1, BO, INC2
  1675. LFPDUX A2, AO, INC2
  1676. bdz- .L67 /* CTR was 1: skip the loop, .L67 consumes the operands just loaded */
  1677. .align 4
  1678. .L66: /* loop body: accumulate with the current operands while loading the next set */
  1679. FXCPMADD f0, B1, A1, f0
  1680. FXCSMADD f2, B1, A1, f2
  1681. LFPDUX A1, AO, INC2
  1682. FXCPMADD f1, B1, A2, f1
  1683. FXCSMADD f3, B1, A2, f3
  1684. LFPDUX B1, BO, INC2
  1685. LFPDUX A2, AO, INC2
  1686. bdnz+ .L66
  1687. .align 4
  1688. .L67: /* last iteration: operands are already in registers, no further loads */
  1689. FXCPMADD f0, B1, A1, f0
  1690. FXCSMADD f2, B1, A1, f2
  1691. FXCPMADD f1, B1, A2, f1
  1692. FXCSMADD f3, B1, A2, f3
  1693. .align 4
  1694. .L68: /* write-back: fold f2/f3 into f0/f1, combine with AP, store two values through CO1 */
  1695. #ifndef TRMMKERNEL
  1696. LFPDUX A1, CO1, INC2 /* non-TRMM build: fetch two current C values; each load also advances CO1 */
  1697. LFPDUX A2, CO1, INC2
  1698. #endif
  1699. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1700. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1701. fpadd f0, f0, f2 /* add or subtract of the paired accumulators is selected by the NN/NT/... variant macros */
  1702. fpadd f1, f1, f3
  1703. #else
  1704. fpsub f0, f0, f2
  1705. fpsub f1, f1, f3
  1706. #endif
  1707. #ifndef TRMMKERNEL
  1708. fxcpmadd A1, f0, AP, A1 /* two-step update with AP on top of the loaded C values (presumably the complex scaling by alpha - confirm against the FP2 ISA) */
  1709. fxcpmadd A2, f1, AP, A2
  1710. fxcxnpma f0, f0, AP, A1
  1711. fxcxnpma f1, f1, AP, A2
  1712. STFPDUX f0, CO1, INCM3 /* first store steps CO1 by INCM3 (set outside this view; presumably rewinds past the two loads - confirm) */
  1713. STFPDUX f1, CO1, INC2
  1714. #else
  1715. fxcpmadd f12, f0, AP, f30 /* TRMM build: the addend is f30 instead of a loaded C value */
  1716. fxcpmadd f13, f1, AP, f30
  1717. fxcxnpma f0, f0, AP, f12
  1718. fxcxnpma f1, f1, AP, f13
  1719. STFPDUX f0, CO1, INC2
  1720. STFPDUX f1, CO1, INC2
  1721. #endif
  1722. #ifdef TRMMKERNEL
  1723. #if ( defined(LEFT) && defined(TRANSA)) || \
  1724. (!defined(LEFT) && !defined(TRANSA))
  1725. sub TEMP, K, KK
  1726. #ifdef LEFT
  1727. addi TEMP, TEMP, -2
  1728. #else
  1729. addi TEMP, TEMP, -1
  1730. #endif
  1731. slwi r0, TEMP, 1 + ZBASE_SHIFT
  1732. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  1733. add AO, AO, r0 /* advance AO by TEMP << (1 + ZBASE_SHIFT) */
  1734. add BO, BO, TEMP /* advance BO by TEMP << ZBASE_SHIFT */
  1735. #endif
  1736. #ifdef LEFT
  1737. addi KK, KK, 2
  1738. #endif
  1739. #endif
  1740. li r0, FZERO
  1741. lfpsx f0, SP, r0 /* reload f0 from SP + FZERO for the next tile */
  1742. .align 4
  1743. .L70: /* M & 1 case: set up the last tile (accumulators f0-f3), K unrolled by 8 here, then preload operands */
  1744. andi. I, M, 1
  1745. beq .L89 /* M even: nothing left for this column */
  1746. #if defined(TRMMKERNEL)
  1747. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1748. addi BO, B, - 2 * SIZE
  1749. fpmr f1, f0 /* copy f0 into the other accumulators */
  1750. #else
  1751. slwi TEMP, KK, 0 + ZBASE_SHIFT
  1752. slwi r0, KK, 0 + ZBASE_SHIFT
  1753. add AO, AO, TEMP /* AO += KK << ZBASE_SHIFT */
  1754. add BO, B, r0 /* BO = B + (KK << ZBASE_SHIFT), biased by -2*SIZE on the next line */
  1755. addi BO, BO, - 2 * SIZE
  1756. fpmr f1, f0
  1757. #endif
  1758. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1759. sub TEMP, K, KK
  1760. #elif defined(LEFT)
  1761. addi TEMP, KK, 1
  1762. #else
  1763. addi TEMP, KK, 1
  1764. #endif
  1765. srawi. r0, TEMP, 3 /* CTR = TEMP >> 3 passes of the unrolled loop */
  1766. fpmr f2, f0
  1767. mtspr CTR, r0
  1768. fpmr f3, f0
  1769. ble .L74 /* no full pass: go to the remainder code */
  1770. #else
  1771. addi BO, B, - 2 * SIZE
  1772. fpmr f1, f0
  1773. srawi. r0, K, 3 /* CTR = K >> 3 passes of the unrolled loop */
  1774. fpmr f2, f0
  1775. mtspr CTR, r0
  1776. fpmr f3, f0
  1777. ble .L74 /* no full pass: go to the remainder code */
  1778. #endif
  1779. LFPDUX A1, AO, INC2 /* preload eight operands from AO and eight from BO */
  1780. LFPDUX B1, BO, INC2
  1781. LFPDUX A2, AO, INC2
  1782. LFPDUX B2, BO, INC2
  1783. LFPDUX A3, AO, INC2
  1784. LFPDUX B3, BO, INC2
  1785. LFPDUX A4, AO, INC2
  1786. LFPDUX B4, BO, INC2
  1787. LFPDUX A5, AO, INC2
  1788. LFPDUX B5, BO, INC2
  1789. LFPDUX A6, AO, INC2
  1790. LFPDUX B6, BO, INC2
  1791. LFPDUX A7, AO, INC2
  1792. LFPDUX A9, BO, INC2 /* A9 is loaded from BO: it serves as the seventh B-side operand */
  1793. LFPDUX A8, AO, INC2
  1794. LFPDUX A10, BO, INC2 /* A10 is loaded from BO: it serves as the eighth B-side operand */
  1795. bdz- .L73 /* CTR was 1: only the final pass at .L73 is needed */
  1796. .align 4
  1797. .L72: /* unrolled loop: eight operand pairs per pass, alternating between accumulators f0/f1 and f2/f3, reloading each pair after use */
  1798. FXCPMADD f0, B1, A1, f0
  1799. FXCSMADD f1, B1, A1, f1
  1800. LFPDUX A1, AO, INC2
  1801. LFPDUX B1, BO, INC2
  1802. FXCPMADD f2, B2, A2, f2
  1803. FXCSMADD f3, B2, A2, f3
  1804. LFPDUX A2, AO, INC2
  1805. LFPDUX B2, BO, INC2
  1806. FXCPMADD f0, B3, A3, f0
  1807. FXCSMADD f1, B3, A3, f1
  1808. LFPDUX A3, AO, INC2
  1809. LFPDUX B3, BO, INC2
  1810. FXCPMADD f2, B4, A4, f2
  1811. FXCSMADD f3, B4, A4, f3
  1812. LFPDUX A4, AO, INC2
  1813. LFPDUX B4, BO, INC2
  1814. FXCPMADD f0, B5, A5, f0
  1815. FXCSMADD f1, B5, A5, f1
  1816. LFPDUX A5, AO, INC2
  1817. LFPDUX B5, BO, INC2
  1818. FXCPMADD f2, B6, A6, f2
  1819. FXCSMADD f3, B6, A6, f3
  1820. LFPDUX A6, AO, INC2
  1821. LFPDUX B6, BO, INC2
  1822. FXCPMADD f0, A9, A7, f0 /* A9 holds a value loaded from BO */
  1823. FXCSMADD f1, A9, A7, f1
  1824. LFPDUX A7, AO, INC2
  1825. LFPDUX A9, BO, INC2
  1826. FXCPMADD f2, A10, A8, f2 /* A10 holds a value loaded from BO */
  1827. FXCSMADD f3, A10, A8, f3
  1828. LFPDUX A8, AO, INC2
  1829. LFPDUX A10, BO, INC2
  1830. bdnz+ .L72
  1831. .align 4
  1832. .L73: /* final pass: same arithmetic as .L72 with no loads */
  1833. FXCPMADD f0, B1, A1, f0
  1834. FXCSMADD f1, B1, A1, f1
  1835. FXCPMADD f2, B2, A2, f2
  1836. FXCSMADD f3, B2, A2, f3
  1837. FXCPMADD f0, B3, A3, f0
  1838. FXCSMADD f1, B3, A3, f1
  1839. FXCPMADD f2, B4, A4, f2
  1840. FXCSMADD f3, B4, A4, f3
  1841. FXCPMADD f0, B5, A5, f0
  1842. FXCSMADD f1, B5, A5, f1
  1843. FXCPMADD f2, B6, A6, f2
  1844. FXCSMADD f3, B6, A6, f3
  1845. FXCPMADD f0, A9, A7, f0
  1846. FXCSMADD f1, A9, A7, f1
  1847. FXCPMADD f2, A10, A8, f2
  1848. FXCSMADD f3, A10, A8, f3
  1849. .align 4
  1850. .L74: /* K remainder for the last tile: (K & 7), or (TEMP & 7) under TRMMKERNEL, single-step iterations */
  1851. li r0, ALPHA
  1852. lfpdx AP, SP, r0 /* AP <- pair loaded from SP + ALPHA */
  1853. #ifdef TRMMKERNEL
  1854. li r0, FZERO
  1855. lfpsx f30, SP, r0 /* f30 <- value at SP + FZERO; the TRMM write-back at .L78 uses it as the addend */
  1856. #endif
  1857. #if defined(TRMMKERNEL)
  1858. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1859. sub TEMP, K, KK
  1860. #elif defined(LEFT)
  1861. addi TEMP, KK, 1
  1862. #else
  1863. addi TEMP, KK, 1
  1864. #endif
  1865. andi. r0, TEMP, 7
  1866. mtspr CTR, r0
  1867. #else
  1868. andi. r0, K, 7
  1869. mtspr CTR, r0 /* CTR = number of leftover iterations */
  1870. #endif
  1871. ble+ .L78 /* remainder is zero: go straight to the write-back */
  1872. LFPDUX A1, AO, INC2 /* load operands for the first iteration */
  1873. LFPDUX B1, BO, INC2
  1874. bdz- .L77 /* CTR was 1: skip the loop, .L77 consumes the operands just loaded */
  1875. .align 4
  1876. .L76: /* loop body: accumulate into f0/f1, then load the next operand pair */
  1877. FXCPMADD f0, B1, A1, f0
  1878. FXCSMADD f1, B1, A1, f1
  1879. LFPDUX A1, AO, INC2
  1880. LFPDUX B1, BO, INC2
  1881. bdnz+ .L76
  1882. .align 4
  1883. .L77: /* last iteration: operands are already in registers, no further loads */
  1884. FXCPMADD f0, B1, A1, f0
  1885. FXCSMADD f1, B1, A1, f1
  1886. .align 4
  1887. .L78: /* write-back: merge the two accumulator sets, fold the pair, combine with AP, store one value through CO1 */
  1888. #ifndef TRMMKERNEL
  1889. LFPDX A1, CO1, INC2 /* non-TRMM build: fetch the current C value (no pointer update) */
  1890. #endif
  1891. fpadd f0, f0, f2 /* f2/f3 were used alternately with f0/f1 in the unrolled loop; merge them first */
  1892. fpadd f1, f1, f3
  1893. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1894. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1895. fpadd f0, f0, f1 /* add or subtract of the paired accumulators is selected by the NN/NT/... variant macros */
  1896. #else
  1897. fpsub f0, f0, f1
  1898. #endif
  1899. #ifndef TRMMKERNEL
  1900. fxcpmadd A1, f0, AP, A1 /* two-step update with AP on top of the loaded C value (presumably the complex scaling by alpha - confirm against the FP2 ISA) */
  1901. fxcxnpma f0, f0, AP, A1
  1902. #else
  1903. fxcpmadd f12, f0, AP, f30 /* TRMM build: the addend is f30 instead of a loaded C value */
  1904. fxcxnpma f0, f0, AP, f12
  1905. #endif
  1906. STFPDUX f0, CO1, INC2
  1907. li r0, FZERO
  1908. lfpsx f0, SP, r0 /* reload f0 from SP + FZERO */
  1909. .align 4
  1910. .L89: /* end of the single-column section */
  1911. addi B, BO, 2 * SIZE /* B <- BO + 2*SIZE */
  1912. .align 4
  1913. .L999: /* exit sequence: reload r14-r31 and f14-f31 from the stack and return (presumably mirrors the prologue, which is outside this view) */
  1914. addi SP, SP, 20 /* step SP so that the first lwzu below reads from the original SP + 24 */
  1915. lwzu r14, 4(SP) /* each lwzu loads from SP + 4 and leaves SP at that address */
  1916. lwzu r15, 4(SP)
  1917. lwzu r16, 4(SP)
  1918. lwzu r17, 4(SP)
  1919. lwzu r18, 4(SP)
  1920. lwzu r19, 4(SP)
  1921. lwzu r20, 4(SP)
  1922. lwzu r21, 4(SP)
  1923. lwzu r22, 4(SP)
  1924. lwzu r23, 4(SP)
  1925. lwzu r24, 4(SP)
  1926. lwzu r25, 4(SP)
  1927. lwzu r26, 4(SP)
  1928. lwzu r27, 4(SP)
  1929. lwzu r28, 4(SP)
  1930. lwzu r29, 4(SP)
  1931. lwzu r30, 4(SP)
  1932. lwzu r31, 4(SP)
  1933. subi SP, SP, 12 /* back SP up by 12 so the first paired load below reads 4 bytes past the r31 slot */
  1934. li r0, 16 /* stride for the paired FP reloads */
  1935. lfpdux f31, SP, r0 /* each lfpdux advances SP by r0 (16) and loads from there; order is f31 down to f14 */
  1936. lfpdux f30, SP, r0
  1937. lfpdux f29, SP, r0
  1938. lfpdux f28, SP, r0
  1939. lfpdux f27, SP, r0
  1940. lfpdux f26, SP, r0
  1941. lfpdux f25, SP, r0
  1942. lfpdux f24, SP, r0
  1943. lfpdux f23, SP, r0
  1944. lfpdux f22, SP, r0
  1945. lfpdux f21, SP, r0
  1946. lfpdux f20, SP, r0
  1947. lfpdux f19, SP, r0
  1948. lfpdux f18, SP, r0
  1949. lfpdux f17, SP, r0
  1950. lfpdux f16, SP, r0
  1951. lfpdux f15, SP, r0
  1952. lfpdux f14, SP, r0
  1953. addi SP, SP, 16 /* step past the last FP slot */
  1954. blr /* return to the caller */
  1955. .align 4
  1956. .L1000: /* setup for the code path that follows; the branch that reaches this label is outside this view */
  1957. li INC, 1 * SIZE /* stride registers are loaded with fixed multiples of SIZE */
  1958. li INC2, 2 * SIZE
  1959. li INC4, 4 * SIZE
  1960. li INCM1, -1 * SIZE
  1961. li INCM3, -3 * SIZE
  1962. li INCM5, -5 * SIZE
  1963. li INCM7, -7 * SIZE
  1964. addi C, C, - 1 * SIZE /* bias C by -1*SIZE; the C accesses at .L1013/.L1014 start with a step of INC */
  1965. srawi. J, N, 1 /* J = N >> 1 */
  1966. ble .L1050 /* J <= 0: skip the .L1010 loop */
  1967. .align 4
  1968. .L1010:
  1969. mr CO1, C
  1970. add CO2, C, LDC
  1971. add C, CO2, LDC
  1972. #if defined(TRMMKERNEL) && defined(LEFT)
  1973. mr KK, OFFSET
  1974. #endif
  1975. addi AO, A, -4 * SIZE
  1976. li r0, FZERO
  1977. lfpsx f0, SP, r0
  1978. srawi. I, M, 2
  1979. ble .L1020
  1980. .align 4
  1981. .L1011:
  1982. #if defined(TRMMKERNEL)
  1983. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1984. addi AO2, AO, 2 * SIZE
  1985. fpmr f4, f0
  1986. addi BO, B, - 4 * SIZE
  1987. fpmr f8, f0
  1988. addi BO2, B, - 2 * SIZE
  1989. fpmr f12, f0
  1990. #else
  1991. slwi TEMP, KK, 2 + ZBASE_SHIFT
  1992. slwi r0, KK, 1 + ZBASE_SHIFT
  1993. add AO, AO, TEMP
  1994. add BO, B, r0
  1995. addi AO2, AO, 2 * SIZE
  1996. fpmr f4, f0
  1997. addi BO, BO, - 4 * SIZE
  1998. fpmr f8, f0
  1999. addi BO2, BO, 2 * SIZE
  2000. fpmr f12, f0
  2001. #endif
  2002. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2003. sub TEMP, K, KK
  2004. #elif defined(LEFT)
  2005. addi TEMP, KK, 4
  2006. #else
  2007. addi TEMP, KK, 2
  2008. #endif
  2009. srawi. r0, TEMP, 2
  2010. fpmr f1, f0
  2011. mtspr CTR, r0
  2012. ble .L1014
  2013. #else
  2014. addi AO2, AO, 2 * SIZE
  2015. fpmr f4, f0
  2016. addi BO, B, - 4 * SIZE
  2017. fpmr f8, f0
  2018. addi BO2, B, - 2 * SIZE
  2019. fpmr f12, f0
  2020. srawi. r0, K, 2
  2021. fpmr f1, f0
  2022. mtspr CTR, r0
  2023. ble .L1014
  2024. #endif
  2025. LFPDUX A1, AO, INC4
  2026. fpmr f5, f0
  2027. LFPDUX A3, AO, INC4
  2028. fpmr f9, f0
  2029. LFPDUX B1, BO, INC4
  2030. fpmr f13, f0
  2031. LFPDUX A5, AO, INC4
  2032. fpmr f2, f0
  2033. LFPDUX A6, AO, INC4
  2034. fpmr f6, f0
  2035. LFPDUX B3, BO, INC4
  2036. fpmr f10, f0
  2037. LFPDUX A7, AO, INC4
  2038. fpmr f14, f0
  2039. LFPDUX A8, AO, INC4
  2040. fpmr f3, f0
  2041. LFPDUX B5, BO, INC4
  2042. fpmr f7, f0
  2043. LFPDUX A9, AO, INC4
  2044. fpmr f11, f0
  2045. LFPDUX A2, AO2, INC4
  2046. fpmr f15, f0
  2047. LFPDUX B2, BO2, INC4
  2048. bdz- .L1013
  2049. .align 4
  2050. .L1012:
  2051. ## 1 ##
  2052. FXCPMADD f0, B1, A1, f0
  2053. nop
  2054. FXCSMADD f4, B1, A1, f4
  2055. nop
  2056. FXCPMADD f8, B2, A1, f8
  2057. LFPDUX B4, BO2, INC4
  2058. FXCSMADD f12, B2, A1, f12
  2059. LFPDUX B6, BO, INC4
  2060. FXCPMADD f1, B1, A2, f1
  2061. nop
  2062. FXCSMADD f5, B1, A2, f5
  2063. LFPDUX A4, AO2, INC4
  2064. FXCPMADD f9, B2, A2, f9
  2065. LFPDUX A10, AO, INC4
  2066. FXCSMADD f13, B2, A2, f13
  2067. nop
  2068. FXCPMADD f2, B1, A3, f2
  2069. nop
  2070. FXCSMADD f6, B1, A3, f6
  2071. nop
  2072. FXCPMADD f10, B2, A3, f10
  2073. nop
  2074. FXCSMADD f14, B2, A3, f14
  2075. nop
  2076. FXCPMADD f3, B1, A4, f3
  2077. nop
  2078. FXCSMADD f7, B1, A4, f7
  2079. LFPDUX A2, AO2, INC4
  2080. FXCPMADD f11, B2, A4, f11
  2081. LFPDUX A1, AO, INC4
  2082. FXCSMADD f15, B2, A4, f15
  2083. nop
  2084. ## 2 ##
  2085. FXCPMADD f0, B3, A5, f0
  2086. nop
  2087. FXCSMADD f4, B3, A5, f4
  2088. nop
  2089. FXCPMADD f8, B4, A5, f8
  2090. LFPDUX B2, BO2, INC4
  2091. FXCSMADD f12, B4, A5, f12
  2092. LFPDUX B1, BO, INC4
  2093. FXCPMADD f1, B3, A2, f1
  2094. nop
  2095. FXCSMADD f5, B3, A2, f5
  2096. LFPDUX A4, AO2, INC4
  2097. FXCPMADD f9, B4, A2, f9
  2098. LFPDUX A3, AO, INC4
  2099. FXCSMADD f13, B4, A2, f13
  2100. nop
  2101. FXCPMADD f2, B3, A6, f2
  2102. nop
  2103. FXCSMADD f6, B3, A6, f6
  2104. nop
  2105. FXCPMADD f10, B4, A6, f10
  2106. nop
  2107. FXCSMADD f14, B4, A6, f14
  2108. nop
  2109. FXCPMADD f3, B3, A4, f3
  2110. nop
  2111. FXCSMADD f7, B3, A4, f7
  2112. LFPDUX A2, AO2, INC4
  2113. FXCPMADD f11, B4, A4, f11
  2114. LFPDUX A5, AO, INC4
  2115. FXCSMADD f15, B4, A4, f15
  2116. nop
  2117. ## 3 ##
  2118. FXCPMADD f0, B5, A7, f0
  2119. nop
  2120. FXCSMADD f4, B5, A7, f4
  2121. nop
  2122. FXCPMADD f8, B2, A7, f8
  2123. LFPDUX B4, BO2, INC4
  2124. FXCSMADD f12, B2, A7, f12
  2125. LFPDUX B3, BO, INC4
  2126. FXCPMADD f1, B5, A2, f1
  2127. nop
  2128. FXCSMADD f5, B5, A2, f5
  2129. LFPDUX A4, AO2, INC4
  2130. FXCPMADD f9, B2, A2, f9
  2131. LFPDUX A6, AO, INC4
  2132. FXCSMADD f13, B2, A2, f13
  2133. nop
  2134. FXCPMADD f2, B5, A8, f2
  2135. nop
  2136. FXCSMADD f6, B5, A8, f6
  2137. nop
  2138. FXCPMADD f10, B2, A8, f10
  2139. nop
  2140. FXCSMADD f14, B2, A8, f14
  2141. nop
  2142. FXCPMADD f3, B5, A4, f3
  2143. nop
  2144. FXCSMADD f7, B5, A4, f7
  2145. LFPDUX A2, AO2, INC4
  2146. FXCPMADD f11, B2, A4, f11
  2147. LFPDUX A7, AO, INC4
  2148. FXCSMADD f15, B2, A4, f15
  2149. nop
  2150. ## 4 ##
  2151. FXCPMADD f0, B6, A9, f0
  2152. nop
  2153. FXCSMADD f4, B6, A9, f4
  2154. nop
  2155. FXCPMADD f8, B4, A9, f8
  2156. LFPDUX B2, BO2, INC4
  2157. FXCSMADD f12, B4, A9, f12
  2158. LFPDUX B5, BO, INC4
  2159. FXCPMADD f1, B6, A2, f1
  2160. nop
  2161. FXCSMADD f5, B6, A2, f5
  2162. LFPDUX A4, AO2, INC4
  2163. FXCPMADD f9, B4, A2, f9
  2164. LFPDUX A8, AO, INC4
  2165. FXCSMADD f13, B4, A2, f13
  2166. nop
  2167. FXCPMADD f2, B6, A10, f2
  2168. nop
  2169. FXCSMADD f6, B6, A10, f6
  2170. nop
  2171. FXCPMADD f10, B4, A10, f10
  2172. nop
  2173. FXCSMADD f14, B4, A10, f14
  2174. nop
  2175. FXCPMADD f3, B6, A4, f3
  2176. LFPDUX A2, AO2, INC4
  2177. FXCSMADD f7, B6, A4, f7
  2178. LFPDUX A9, AO, INC4
  2179. FXCPMADD f11, B4, A4, f11
  2180. nop
  2181. FXCSMADD f15, B4, A4, f15
  2182. bdnz+ .L1012
  2183. .align 4
  2184. .L1013:
  2185. ## 1 ##
  2186. FXCPMADD f0, B1, A1, f0
  2187. nop
  2188. FXCSMADD f4, B1, A1, f4
  2189. nop
  2190. FXCPMADD f8, B2, A1, f8
  2191. LFPDUX B4, BO2, INC4
  2192. FXCSMADD f12, B2, A1, f12
  2193. LFPDUX B6, BO, INC4
  2194. FXCPMADD f1, B1, A2, f1
  2195. nop
  2196. FXCSMADD f5, B1, A2, f5
  2197. LFPDUX A4, AO2, INC4
  2198. FXCPMADD f9, B2, A2, f9
  2199. LFPDUX A10, AO, INC4
  2200. FXCSMADD f13, B2, A2, f13
  2201. nop
  2202. FXCPMADD f2, B1, A3, f2
  2203. nop
  2204. FXCSMADD f6, B1, A3, f6
  2205. nop
  2206. FXCPMADD f10, B2, A3, f10
  2207. nop
  2208. FXCSMADD f14, B2, A3, f14
  2209. nop
  2210. FXCPMADD f3, B1, A4, f3
  2211. nop
  2212. FXCSMADD f7, B1, A4, f7
  2213. LFPDUX A2, AO2, INC4
  2214. FXCPMADD f11, B2, A4, f11
  2215. #ifndef TRMMKERNEL
  2216. LFDUX A1, CO1, INC
  2217. #else
  2218. nop
  2219. #endif
  2220. FXCSMADD f15, B2, A4, f15
  2221. nop
  2222. ## 2 ##
  2223. FXCPMADD f0, B3, A5, f0
  2224. nop
  2225. FXCSMADD f4, B3, A5, f4
  2226. nop
  2227. FXCPMADD f8, B4, A5, f8
  2228. LFPDUX B2, BO2, INC4
  2229. FXCSMADD f12, B4, A5, f12
  2230. #ifndef TRMMKERNEL
  2231. LFDUX B1, CO1, INC2
  2232. #else
  2233. nop
  2234. #endif
  2235. FXCPMADD f1, B3, A2, f1
  2236. nop
  2237. FXCSMADD f5, B3, A2, f5
  2238. LFPDUX A4, AO2, INC4
  2239. FXCPMADD f9, B4, A2, f9
  2240. #ifndef TRMMKERNEL
  2241. LFDUX A3, CO1, INC2
  2242. #else
  2243. nop
  2244. #endif
  2245. FXCSMADD f13, B4, A2, f13
  2246. nop
  2247. FXCPMADD f2, B3, A6, f2
  2248. nop
  2249. FXCSMADD f6, B3, A6, f6
  2250. nop
  2251. FXCPMADD f10, B4, A6, f10
  2252. nop
  2253. FXCSMADD f14, B4, A6, f14
  2254. nop
  2255. FXCPMADD f3, B3, A4, f3
  2256. nop
  2257. FXCSMADD f7, B3, A4, f7
  2258. LFPDUX A2, AO2, INC4
  2259. FXCPMADD f11, B4, A4, f11
  2260. #ifndef TRMMKERNEL
  2261. LFDUX A5, CO1, INC2
  2262. #else
  2263. nop
  2264. #endif
  2265. FXCSMADD f15, B4, A4, f15
  2266. nop
  2267. ## 3 ##
  2268. FXCPMADD f0, B5, A7, f0
  2269. nop
  2270. FXCSMADD f4, B5, A7, f4
  2271. nop
  2272. FXCPMADD f8, B2, A7, f8
  2273. LFPDUX B4, BO2, INC4
  2274. FXCSMADD f12, B2, A7, f12
  2275. #ifndef TRMMKERNEL
  2276. LFSDUX A1, CO1, INCM5
  2277. #else
  2278. nop
  2279. #endif
  2280. FXCPMADD f1, B5, A2, f1
  2281. nop
  2282. FXCSMADD f5, B5, A2, f5
  2283. LFPDUX A4, AO2, INC4
  2284. FXCPMADD f9, B2, A2, f9
  2285. #ifndef TRMMKERNEL
  2286. LFSDUX B1, CO1, INC2
  2287. #else
  2288. nop
  2289. #endif
  2290. FXCSMADD f13, B2, A2, f13
  2291. nop
  2292. FXCPMADD f2, B5, A8, f2
  2293. nop
  2294. FXCSMADD f6, B5, A8, f6
  2295. nop
  2296. FXCPMADD f10, B2, A8, f10
  2297. nop
  2298. FXCSMADD f14, B2, A8, f14
  2299. nop
  2300. FXCPMADD f3, B5, A4, f3
  2301. nop
  2302. FXCSMADD f7, B5, A4, f7
  2303. LFPDUX A2, AO2, INC4
  2304. FXCPMADD f11, B2, A4, f11
  2305. #ifndef TRMMKERNEL
  2306. LFSDUX A3, CO1, INC2
  2307. #else
  2308. nop
  2309. #endif
  2310. FXCSMADD f15, B2, A4, f15
  2311. nop
  2312. ## 4 ##
  2313. FXCPMADD f0, B6, A9, f0
  2314. nop
  2315. FXCSMADD f4, B6, A9, f4
  2316. nop
  2317. FXCPMADD f8, B4, A9, f8
  2318. #ifndef TRMMKERNEL
  2319. LFSDUX A5, CO1, INC2
  2320. #else
  2321. nop
  2322. #endif
  2323. FXCSMADD f12, B4, A9, f12
  2324. #ifndef TRMMKERNEL
  2325. LFDUX B3, CO2, INC
  2326. #else
  2327. nop
  2328. #endif
  2329. FXCPMADD f1, B6, A2, f1
  2330. nop
  2331. FXCSMADD f5, B6, A2, f5
  2332. LFPDUX A4, AO2, INC4
  2333. FXCPMADD f9, B4, A2, f9
  2334. #ifndef TRMMKERNEL
  2335. LFDUX A6, CO2, INC2
  2336. #else
  2337. nop
  2338. #endif
  2339. FXCSMADD f13, B4, A2, f13
  2340. nop
  2341. FXCPMADD f2, B6, A10, f2
  2342. nop
  2343. FXCSMADD f6, B6, A10, f6
  2344. nop
  2345. FXCPMADD f10, B4, A10, f10
  2346. nop
  2347. FXCSMADD f14, B4, A10, f14
  2348. #ifndef TRMMKERNEL
  2349. LFDUX A7, CO2, INC2
  2350. #else
  2351. nop
  2352. #endif
  2353. FXCPMADD f3, B6, A4, f3
  2354. nop
  2355. FXCSMADD f7, B6, A4, f7
  2356. nop
  2357. FXCPMADD f11, B4, A4, f11
  2358. nop
  2359. FXCSMADD f15, B4, A4, f15
  2360. #ifndef TRMMKERNEL
  2361. LFDUX B2, CO2, INC2
  2362. #else
  2363. nop
  2364. #endif
  2365. .align 4
  2366. .L1014:
  2367. li r0, ALPHA
  2368. lfpdx AP, SP, r0
  2369. #ifdef TRMMKERNEL
  2370. li r0, FZERO
  2371. lfpsx f30, SP, r0
  2372. #endif
  2373. #if defined(TRMMKERNEL)
  2374. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2375. sub TEMP, K, KK
  2376. #elif defined(LEFT)
  2377. addi TEMP, KK, 4
  2378. #else
  2379. addi TEMP, KK, 2
  2380. #endif
  2381. andi. r0, TEMP, 3
  2382. mtspr CTR, r0
  2383. ble+ .L1018
  2384. cmpwi cr0, TEMP, 3
  2385. bgt+ .L1015
  2386. #else
  2387. andi. r0, K, 3
  2388. mtspr CTR, r0
  2389. ble+ .L1018
  2390. cmpwi cr0, K, 3
  2391. bgt+ .L1015
  2392. #endif
  2393. #ifndef TRMMKERNEL
  2394. LFDUX A1, CO1, INC
  2395. fpmr f5, f0
  2396. LFDUX B1, CO1, INC2
  2397. fpmr f9, f0
  2398. LFDUX A3, CO1, INC2
  2399. fpmr f13, f0
  2400. LFDUX A5, CO1, INC2
  2401. fpmr f2, f0
  2402. LFSDUX A1, CO1, INCM5
  2403. fpmr f6, f0
  2404. LFSDUX B1, CO1, INC2
  2405. fpmr f10, f0
  2406. LFSDUX A3, CO1, INC2
  2407. fpmr f14, f0
  2408. LFSDUX A5, CO1, INC2
  2409. fpmr f3, f0
  2410. LFDUX B3, CO2, INC
  2411. fpmr f7, f0
  2412. LFDUX A6, CO2, INC2
  2413. fpmr f11, f0
  2414. LFDUX A7, CO2, INC2
  2415. fpmr f15, f0
  2416. LFDUX B2, CO2, INC2
  2417. #else
  2418. fpmr f5, f0
  2419. fpmr f9, f0
  2420. fpmr f13, f0
  2421. fpmr f2, f0
  2422. fpmr f6, f0
  2423. fpmr f10, f0
  2424. fpmr f14, f0
  2425. fpmr f3, f0
  2426. fpmr f7, f0
  2427. fpmr f11, f0
  2428. fpmr f15, f0
  2429. #endif
  /* Remainder loop for the tile accumulated in f0-f15.  CTR was set from
   * the low two bits of the K count (K, or TEMP under TRMMKERNEL) by the
   * code just above, so this runs at most three times. */
  2430. .align 4
  2431. .L1015:
  /* Preload one operand from each of the four streams AO, AO2, BO, BO2. */
  2432. LFPDUX A2, AO, INC4
  2433. LFPDUX A4, AO2, INC4
  2434. LFPDUX A10, BO, INC4
  2435. LFPDUX B4, BO2, INC4
  /* bdz decrements CTR first: if this was the last iteration, only the
   * non-prefetching tail at .L1017 remains. */
  2436. bdz- .L1017
  2437. .align 4
  2438. .L1016:
  /* Each pass reads AO and AO2 twice and BO and BO2 once; A2/A4 are reused
   * for the second pair of loads as soon as their first values are consumed. */
  2439. FXCPMADD f0, A10, A2, f0
  2440. FXCSMADD f4, A10, A2, f4
  2441. FXCPMADD f8, B4, A2, f8
  2442. FXCSMADD f12, B4, A2, f12
  2443. LFPDUX A2, AO, INC4
  2444. FXCPMADD f1, A10, A4, f1
  2445. FXCSMADD f5, A10, A4, f5
  2446. FXCPMADD f9, B4, A4, f9
  2447. FXCSMADD f13, B4, A4, f13
  2448. LFPDUX A4, AO2, INC4
  2449. FXCPMADD f2, A10, A2, f2
  2450. FXCSMADD f6, A10, A2, f6
  2451. FXCPMADD f10, B4, A2, f10
  2452. FXCSMADD f14, B4, A2, f14
  2453. LFPDUX A2, AO, INC4
  2454. FXCPMADD f3, A10, A4, f3
  2455. FXCSMADD f7, A10, A4, f7
  2456. LFPDUX A10, BO, INC4
  2457. FXCPMADD f11, B4, A4, f11
  2458. FXCSMADD f15, B4, A4, f15
  2459. LFPDUX A4, AO2, INC4
  2460. LFPDUX B4, BO2, INC4
  2461. bdnz+ .L1016
  2462. .align 4
  2463. .L1017:
  /* Last iteration: same arithmetic as .L1016, but only the second A2/A4
   * pair is loaded; nothing is fetched for a following pass. */
  2464. FXCPMADD f0, A10, A2, f0
  2465. FXCSMADD f4, A10, A2, f4
  2466. FXCPMADD f8, B4, A2, f8
  2467. FXCSMADD f12, B4, A2, f12
  2468. LFPDUX A2, AO, INC4
  2469. FXCPMADD f1, A10, A4, f1
  2470. FXCSMADD f5, A10, A4, f5
  2471. FXCPMADD f9, B4, A4, f9
  2472. FXCSMADD f13, B4, A4, f13
  2473. LFPDUX A4, AO2, INC4
  2474. FXCPMADD f2, A10, A2, f2
  2475. FXCSMADD f6, A10, A2, f6
  2476. FXCPMADD f10, B4, A2, f10
  2477. FXCSMADD f14, B4, A2, f14
  2478. FXCPMADD f3, A10, A4, f3
  2479. FXCSMADD f7, A10, A4, f7
  2480. FXCPMADD f11, B4, A4, f11
  2481. FXCSMADD f15, B4, A4, f15
  /* Write-back for the tile accumulated in f0-f15: fold the sixteen
   * accumulators into eight results, combine them with AP, store f0-f3
   * through CO1 and f8-f11 through CO2, then advance to the next tile. */
  2482. .align 4
  2483. .L1018:
  /* The variant macros select whether the partner sums (f4-f7, f12-f15)
   * are added to or subtracted from the primary sums. */
  2484. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  2485. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2486. fpadd f0, f0, f4
  2487. fpadd f8, f8, f12
  2488. fpadd f1, f1, f5
  2489. fpadd f9, f9, f13
  2490. fpadd f2, f2, f6
  2491. fpadd f10, f10, f14
  2492. fpadd f3, f3, f7
  2493. fpadd f11, f11, f15
  2494. #else
  2495. fpsub f0, f0, f4
  2496. fpsub f8, f8, f12
  2497. fpsub f1, f1, f5
  2498. fpsub f9, f9, f13
  2499. fpsub f2, f2, f6
  2500. fpsub f10, f10, f14
  2501. fpsub f3, f3, f7
  2502. fpsub f11, f11, f15
  2503. #endif
  2504. #ifndef TRMMKERNEL
  /* Non-TRMM path: previously loaded C values (A1, B1, A3, A5 for CO1;
   * B3, A6, A7, B2 for CO2) are the addends of the first multiply-add.
   * The remaining CO2 loads (LFSDUX) are interleaved with that arithmetic.
   * NOTE(review): INCM5/INCM7 presumably rewind the C pointers to the
   * start of the tile - confirm against the INC* setup. */
  2505. fxcpmadd A1, f0, AP, A1
  2506. LFSDUX B3, CO2, INCM5
  2507. fxcpmadd B1, f1, AP, B1
  2508. LFSDUX A6, CO2, INC2
  2509. fxcpmadd A3, f2, AP, A3
  2510. LFSDUX A7, CO2, INC2
  2511. fxcpmadd A5, f3, AP, A5
  2512. LFSDUX B2, CO2, INC2
  2513. fxcxnpma f0, f0, AP, A1
  2514. fxcpmadd B3, f8, AP, B3
  2515. fxcxnpma f1, f1, AP, B1
  2516. fxcpmadd A6, f9, AP, A6
  2517. fxcxnpma f2, f2, AP, A3
  2518. fxcpmadd A7, f10, AP, A7
  2519. fxcxnpma f3, f3, AP, A5
  2520. STFDUX f0, CO1, INCM7
  2521. fxcpmadd B2, f11, AP, B2
  2522. STFSDUX f0, CO1, INC
  2523. fxcxnpma f8, f8, AP, B3
  2524. STFDUX f1, CO1, INC
  2525. STFSDUX f1, CO1, INC
  2526. fxcxnpma f9, f9, AP, A6
  2527. STFDUX f2, CO1, INC
  2528. STFSDUX f2, CO1, INC
  2529. fxcxnpma f10, f10, AP, A7
  2530. STFDUX f3, CO1, INC
  2531. STFSDUX f3, CO1, INC
  2532. fxcxnpma f11, f11, AP, B2
  2533. STFDUX f8, CO2, INCM7
  2534. #else
  /* TRMM path: no C values are read; f30 is the addend instead.
   * NOTE(review): f30 presumably holds zero (other sections load it from
   * the FZERO stack slot) - confirm it is set before this point. */
  2535. fxcpmadd f12, f0, AP, f30
  2536. fxcpmadd f13, f1, AP, f30
  2537. fxcpmadd f14, f2, AP, f30
  2538. fxcpmadd f15, f3, AP, f30
  2539. fxcxnpma f0, f0, AP, f12
  2540. fxcxnpma f1, f1, AP, f13
  2541. fxcxnpma f2, f2, AP, f14
  2542. fxcxnpma f3, f3, AP, f15
  2543. fxcpmadd f16, f8, AP, f30
  2544. fxcpmadd f17, f9, AP, f30
  2545. fxcpmadd f18, f10, AP, f30
  2546. fxcpmadd f19, f11, AP, f30
  2547. fxcxnpma f8, f8, AP, f16
  2548. fxcxnpma f9, f9, AP, f17
  2549. fxcxnpma f10, f10, AP, f18
  2550. fxcxnpma f11, f11, AP, f19
  2551. STFDUX f0, CO1, INC
  2552. STFSDUX f0, CO1, INC
  2553. STFDUX f1, CO1, INC
  2554. STFSDUX f1, CO1, INC
  2555. STFDUX f2, CO1, INC
  2556. STFSDUX f2, CO1, INC
  2557. STFDUX f3, CO1, INC
  2558. STFSDUX f3, CO1, INC
  2559. STFDUX f8, CO2, INC
  2560. #endif
  /* Stores shared by both paths: the rest of the CO2 results. */
  2561. STFSDUX f8, CO2, INC
  2562. STFDUX f9, CO2, INC
  2563. STFSDUX f9, CO2, INC
  2564. STFDUX f10, CO2, INC
  2565. STFSDUX f10, CO2, INC
  2566. STFDUX f11, CO2, INC
  2567. STFSDUX f11, CO2, INC
  2568. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: TEMP = K - KK - (4 if LEFT, else 2); AO advances by
   * TEMP << (2 + ZBASE_SHIFT) and BO by TEMP << (1 + ZBASE_SHIFT). */
  2569. #if ( defined(LEFT) && defined(TRANSA)) || \
  2570. (!defined(LEFT) && !defined(TRANSA))
  2571. sub TEMP, K, KK
  2572. #ifdef LEFT
  2573. addi TEMP, TEMP, -4
  2574. #else
  2575. addi TEMP, TEMP, -2
  2576. #endif
  2577. slwi r0, TEMP, 2 + ZBASE_SHIFT
  2578. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  2579. add AO, AO, r0
  2580. add BO, BO, TEMP
  2581. #endif
  2582. #ifdef LEFT
  2583. addi KK, KK, 4
  2584. #endif
  2585. #endif
  /* Next tile: decrement I, reload f0 from the FZERO stack slot, and loop
   * to .L1011 while I is still positive (bgt tests the addic. result). */
  2586. addic. I, I, -1
  2587. li r0, FZERO
  2588. lfpsx f0, SP, r0
  2589. bgt+ .L1011
  /* Section taken when bit 1 of M is set (otherwise skip to .L1030).
   * Sets up AO2/BO/BO2, copies f0 into accumulators f1, f4, f5, f8, f9,
   * f12 and f13, sets CTR = count >> 2 and preloads operands for the
   * unrolled loop at .L1022.  The count is K, or TEMP under TRMMKERNEL. */
  2590. .align 4
  2591. .L1020:
  2592. andi. I, M, 2
  2593. beq .L1030
  2594. #if defined(TRMMKERNEL)
  2595. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2596. addi AO2, AO, 2 * SIZE
  2597. fpmr f4, f0
  2598. addi BO, B, - 4 * SIZE
  2599. fpmr f8, f0
  2600. addi BO2, B, - 2 * SIZE
  2601. fpmr f12, f0
  2602. #else
  /* Offset both panels by KK << (1 + ZBASE_SHIFT) before deriving the
   * biased stream pointers. */
  2603. slwi TEMP, KK, 1 + ZBASE_SHIFT
  2604. slwi r0, KK, 1 + ZBASE_SHIFT
  2605. add AO, AO, TEMP
  2606. add BO, B, r0
  2607. addi AO2, AO, 2 * SIZE
  2608. fpmr f4, f0
  2609. addi BO, BO, - 4 * SIZE
  2610. fpmr f8, f0
  2611. addi BO2, BO, 2 * SIZE
  2612. fpmr f12, f0
  2613. #endif
  /* TRMM inner count: K - KK, or KK + 2 (both remaining cases add 2). */
  2614. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2615. sub TEMP, K, KK
  2616. #elif defined(LEFT)
  2617. addi TEMP, KK, 2
  2618. #else
  2619. addi TEMP, KK, 2
  2620. #endif
  2621. srawi. r0, TEMP, 2
  2622. fpmr f1, f0
  2623. fpmr f5, f0
  2624. fpmr f9, f0
  2625. mtspr CTR, r0
  2626. fpmr f13, f0
  /* No full group of four: go straight to the remainder handling. */
  2627. ble .L1024
  2628. #else
  2629. addi AO2, AO, 2 * SIZE
  2630. fpmr f4, f0
  2631. addi BO, B, - 4 * SIZE
  2632. fpmr f8, f0
  2633. addi BO2, B, - 2 * SIZE
  2634. fpmr f12, f0
  2635. srawi. r0, K, 2
  2636. fpmr f1, f0
  2637. fpmr f5, f0
  2638. fpmr f9, f0
  2639. mtspr CTR, r0
  2640. fpmr f13, f0
  2641. ble .L1024
  2642. #endif
  /* Preload every operand the first pass needs except A8, which the loop
   * (or the tail) loads itself. */
  2643. LFPDUX A1, AO, INC4
  2644. LFPDUX B1, BO, INC4
  2645. LFPDUX A2, AO2, INC4
  2646. LFPDUX B2, BO2, INC4
  2647. LFPDUX A3, AO, INC4
  2648. LFPDUX B3, BO, INC4
  2649. LFPDUX A4, AO2, INC4
  2650. LFPDUX B4, BO2, INC4
  2651. LFPDUX A5, AO, INC4
  2652. LFPDUX B5, BO, INC4
  2653. LFPDUX A6, AO2, INC4
  2654. LFPDUX B6, BO2, INC4
  2655. LFPDUX A7, AO, INC4
  2656. LFPDUX A9, BO, INC4
  2657. LFPDUX A10, BO2, INC4
  /* Only one pass requested: run the non-prefetching tail. */
  2658. bdz- .L1023
  /* Unrolled main loop for accumulators f0/f1/f4/f5/f8/f9/f12/f13.
   * Each pass consumes four operand groups (B1/B2, B3/B4, B5/B6, A9/A10
   * against A1/A2, A3/A4, A5/A6, A7/A8), matching CTR = count >> 2, and
   * reloads each register right after its last use for the next pass.
   * NOTE(review): the nops look like issue-slot padding - do not remove
   * without measuring. */
  2659. .align 4
  2660. .L1022:
  2661. FXCPMADD f0, B1, A1, f0
  2662. nop
  2663. FXCSMADD f4, B1, A1, f4
  2664. LFPDUX A8, AO2, INC4
  2665. FXCPMADD f8, B2, A1, f8
  2666. nop
  2667. FXCSMADD f12, B2, A1, f12
  2668. LFPDUX A1, AO, INC4
  2669. FXCPMADD f1, B1, A2, f1
  2670. nop
  2671. FXCSMADD f5, B1, A2, f5
  2672. LFPDUX B1, BO, INC4
  2673. FXCPMADD f9, B2, A2, f9
  2674. nop
  2675. FXCSMADD f13, B2, A2, f13
  2676. LFPDUX B2, BO2, INC4
  2677. FXCPMADD f0, B3, A3, f0
  2678. nop
  2679. FXCSMADD f4, B3, A3, f4
  2680. LFPDUX A2, AO2, INC4
  2681. FXCPMADD f8, B4, A3, f8
  2682. nop
  2683. FXCSMADD f12, B4, A3, f12
  2684. LFPDUX A3, AO, INC4
  2685. FXCPMADD f1, B3, A4, f1
  2686. nop
  2687. FXCSMADD f5, B3, A4, f5
  2688. LFPDUX B3, BO, INC4
  2689. FXCPMADD f9, B4, A4, f9
  2690. nop
  2691. FXCSMADD f13, B4, A4, f13
  2692. LFPDUX B4, BO2, INC4
  2693. FXCPMADD f0, B5, A5, f0
  2694. nop
  2695. FXCSMADD f4, B5, A5, f4
  2696. LFPDUX A4, AO2, INC4
  2697. FXCPMADD f8, B6, A5, f8
  2698. nop
  2699. FXCSMADD f12, B6, A5, f12
  2700. LFPDUX A5, AO, INC4
  2701. FXCPMADD f1, B5, A6, f1
  2702. nop
  2703. FXCSMADD f5, B5, A6, f5
  2704. LFPDUX B5, BO, INC4
  2705. FXCPMADD f9, B6, A6, f9
  2706. nop
  2707. FXCSMADD f13, B6, A6, f13
  2708. LFPDUX B6, BO2, INC4
  2709. FXCPMADD f0, A9, A7, f0
  2710. nop
  2711. FXCSMADD f4, A9, A7, f4
  2712. LFPDUX A6, AO2, INC4
  2713. FXCPMADD f8, A10, A7, f8
  2714. nop
  2715. FXCSMADD f12, A10, A7, f12
  2716. LFPDUX A7, AO, INC4
  2717. FXCPMADD f1, A9, A8, f1
  2718. nop
  2719. FXCSMADD f5, A9, A8, f5
  2720. LFPDUX A9, BO, INC4
  2721. FXCPMADD f9, A10, A8, f9
  2722. nop
  2723. FXCSMADD f13, A10, A8, f13
  2724. LFPDUX A10, BO2, INC4
  /* Decrement CTR and repeat while it is non-zero. */
  2725. bdnz+ .L1022
  /* Final pass of the unrolled loop: the same 32 multiply-adds as .L1022,
   * but the only load is A8 (not yet fetched); no operands are prefetched
   * for a following pass. */
  2726. .align 4
  2727. .L1023:
  2728. FXCPMADD f0, B1, A1, f0
  2729. FXCSMADD f4, B1, A1, f4
  2730. LFPDUX A8, AO2, INC4
  2731. FXCPMADD f8, B2, A1, f8
  2732. FXCSMADD f12, B2, A1, f12
  2733. FXCPMADD f1, B1, A2, f1
  2734. FXCSMADD f5, B1, A2, f5
  2735. FXCPMADD f9, B2, A2, f9
  2736. FXCSMADD f13, B2, A2, f13
  2737. FXCPMADD f0, B3, A3, f0
  2738. FXCSMADD f4, B3, A3, f4
  2739. FXCPMADD f8, B4, A3, f8
  2740. FXCSMADD f12, B4, A3, f12
  2741. FXCPMADD f1, B3, A4, f1
  2742. FXCSMADD f5, B3, A4, f5
  2743. FXCPMADD f9, B4, A4, f9
  2744. FXCSMADD f13, B4, A4, f13
  2745. FXCPMADD f0, B5, A5, f0
  2746. FXCSMADD f4, B5, A5, f4
  2747. FXCPMADD f8, B6, A5, f8
  2748. FXCSMADD f12, B6, A5, f12
  2749. FXCPMADD f1, B5, A6, f1
  2750. FXCSMADD f5, B5, A6, f5
  2751. FXCPMADD f9, B6, A6, f9
  2752. FXCSMADD f13, B6, A6, f13
  2753. FXCPMADD f0, A9, A7, f0
  2754. FXCSMADD f4, A9, A7, f4
  2755. FXCPMADD f8, A10, A7, f8
  2756. FXCSMADD f12, A10, A7, f12
  2757. FXCPMADD f1, A9, A8, f1
  2758. FXCSMADD f5, A9, A8, f5
  2759. FXCPMADD f9, A10, A8, f9
  2760. FXCSMADD f13, A10, A8, f13
  /* Loads AP from the ALPHA stack slot (and, under TRMMKERNEL, f30 from the
   * FZERO slot), then runs the remainder loop CTR = count & 3 times, where
   * the count is K or the TRMM-adjusted TEMP. */
  2761. .align 4
  2762. .L1024:
  2763. li r0, ALPHA
  2764. lfpdx AP, SP, r0
  2765. #ifdef TRMMKERNEL
  2766. li r0, FZERO
  2767. lfpsx f30, SP, r0
  2768. #endif
  2769. #if defined(TRMMKERNEL)
  2770. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2771. sub TEMP, K, KK
  2772. #elif defined(LEFT)
  2773. addi TEMP, KK, 2
  2774. #else
  2775. addi TEMP, KK, 2
  2776. #endif
  2777. andi. r0, TEMP, 3
  2778. mtspr CTR, r0
  2779. #else
  2780. andi. r0, K, 3
  2781. mtspr CTR, r0
  2782. #endif
  /* andi. set CR0; a zero remainder skips straight to the write-back. */
  2783. ble+ .L1028
  2784. LFPDUX A1, AO, INC4
  2785. LFPDUX A2, AO2, INC4
  2786. LFPDUX B1, BO, INC4
  2787. LFPDUX B2, BO2, INC4
  /* Single remaining step: take the non-prefetching tail. */
  2788. bdz- .L1027
  2789. .align 4
  2790. .L1026:
  /* One step per pass, reloading A1/A2/B1/B2 for the next one. */
  2791. FXCPMADD f0, B1, A1, f0
  2792. FXCSMADD f4, B1, A1, f4
  2793. FXCPMADD f8, B2, A1, f8
  2794. FXCSMADD f12, B2, A1, f12
  2795. LFPDUX A1, AO, INC4
  2796. FXCPMADD f1, B1, A2, f1
  2797. FXCSMADD f5, B1, A2, f5
  2798. LFPDUX B1, BO, INC4
  2799. FXCPMADD f9, B2, A2, f9
  2800. FXCSMADD f13, B2, A2, f13
  2801. LFPDUX A2, AO2, INC4
  2802. LFPDUX B2, BO2, INC4
  2803. bdnz+ .L1026
  2804. .align 4
  2805. .L1027:
  /* Last step: arithmetic only. */
  2806. FXCPMADD f0, B1, A1, f0
  2807. FXCSMADD f4, B1, A1, f4
  2808. FXCPMADD f8, B2, A1, f8
  2809. FXCSMADD f12, B2, A1, f12
  2810. FXCPMADD f1, B1, A2, f1
  2811. FXCSMADD f5, B1, A2, f5
  2812. FXCPMADD f9, B2, A2, f9
  2813. FXCSMADD f13, B2, A2, f13
  /* Write-back for the section started at .L1020: fold the eight
   * accumulators into f0, f1 (stored through CO1) and f8, f9 (stored
   * through CO2), combining with AP on the way. */
  2814. .align 4
  2815. .L1028:
  2816. #ifndef TRMMKERNEL
  /* Non-TRMM: read the existing C values into A1/A2 (via CO1) and A3/A4
   * (via CO2); each register is filled by an LFDUX plus an LFSDUX. */
  2817. LFDUX A1, CO1, INC
  2818. LFDUX A2, CO1, INC2
  2819. LFDUX A3, CO2, INC
  2820. LFDUX A4, CO2, INC2
  2821. LFSDUX A1, CO1, INCM1
  2822. LFSDUX A2, CO1, INC2
  2823. LFSDUX A3, CO2, INCM1
  2824. LFSDUX A4, CO2, INC2
  2825. #endif
  /* Variant macros pick add or subtract when folding the partner sums. */
  2826. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  2827. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2828. fpadd f0, f0, f4
  2829. fpadd f8, f8, f12
  2830. fpadd f1, f1, f5
  2831. fpadd f9, f9, f13
  2832. #else
  2833. fpsub f0, f0, f4
  2834. fpsub f8, f8, f12
  2835. fpsub f1, f1, f5
  2836. fpsub f9, f9, f13
  2837. #endif
  2838. #ifndef TRMMKERNEL
  2839. fxcpmadd A1, f0, AP, A1
  2840. fxcpmadd A2, f1, AP, A2
  2841. fxcpmadd A3, f8, AP, A3
  2842. fxcpmadd A4, f9, AP, A4
  2843. fxcxnpma f0, f0, AP, A1
  2844. fxcxnpma f1, f1, AP, A2
  2845. fxcxnpma f8, f8, AP, A3
  2846. fxcxnpma f9, f9, AP, A4
  /* NOTE(review): INCM3 presumably rewinds the C pointer past the values
   * just read - confirm against the INC* setup. */
  2847. STFDUX f0, CO1, INCM3
  2848. STFSDUX f0, CO1, INC
  2849. STFDUX f1, CO1, INC
  2850. STFSDUX f1, CO1, INC
  2851. STFDUX f8, CO2, INCM3
  2852. STFSDUX f8, CO2, INC
  2853. STFDUX f9, CO2, INC
  2854. STFSDUX f9, CO2, INC
  2855. #else
  /* TRMM: f30 (loaded from the FZERO slot at .L1024) replaces the C
   * values as addend, and the stores simply walk forward. */
  2856. fxcpmadd f12, f0, AP, f30
  2857. fxcpmadd f13, f1, AP, f30
  2858. fxcpmadd f14, f8, AP, f30
  2859. fxcpmadd f15, f9, AP, f30
  2860. fxcxnpma f0, f0, AP, f12
  2861. fxcxnpma f1, f1, AP, f13
  2862. fxcxnpma f8, f8, AP, f14
  2863. fxcxnpma f9, f9, AP, f15
  2864. STFDUX f0, CO1, INC
  2865. STFSDUX f0, CO1, INC
  2866. STFDUX f1, CO1, INC
  2867. STFSDUX f1, CO1, INC
  2868. STFDUX f8, CO2, INC
  2869. STFSDUX f8, CO2, INC
  2870. STFDUX f9, CO2, INC
  2871. STFSDUX f9, CO2, INC
  2872. #endif
  2873. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: TEMP = K - KK - 2; AO and BO both advance by
   * TEMP << (1 + ZBASE_SHIFT); KK += 2 when LEFT. */
  2874. #if ( defined(LEFT) && defined(TRANSA)) || \
  2875. (!defined(LEFT) && !defined(TRANSA))
  2876. sub TEMP, K, KK
  2877. #ifdef LEFT
  2878. addi TEMP, TEMP, -2
  2879. #else
  2880. addi TEMP, TEMP, -2
  2881. #endif
  2882. slwi r0, TEMP, 1 + ZBASE_SHIFT
  2883. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  2884. add AO, AO, r0
  2885. add BO, BO, TEMP
  2886. #endif
  2887. #ifdef LEFT
  2888. addi KK, KK, 2
  2889. #endif
  2890. #endif
  /* Reload f0 from the FZERO stack slot for the next section. */
  2891. li r0, FZERO
  2892. lfpsx f0, SP, r0
  /* Section taken when bit 0 of M is set (otherwise skip to .L1049).
   * Sets up AO2/BO/BO2, copies f0 into f1-f3, sets CTR = count >> 2 and
   * preloads operands for the unrolled loop at .L1032. */
  2893. .align 4
  2894. .L1030:
  2895. andi. I, M, 1
  2896. beq .L1049
  2897. #if defined(TRMMKERNEL)
  2898. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2899. addi AO2, AO, 2 * SIZE
  2900. fpmr f1, f0
  2901. addi BO, B, - 4 * SIZE
  2902. fpmr f2, f0
  2903. addi BO2, B, - 2 * SIZE
  2904. fpmr f3, f0
  2905. #else
  /* AO is offset by KK << ZBASE_SHIFT and B by KK << (1 + ZBASE_SHIFT). */
  2906. slwi TEMP, KK, 0 + ZBASE_SHIFT
  2907. slwi r0, KK, 1 + ZBASE_SHIFT
  2908. add AO, AO, TEMP
  2909. add BO, B, r0
  2910. addi AO2, AO, 2 * SIZE
  2911. fpmr f1, f0
  2912. addi BO, BO, - 4 * SIZE
  2913. fpmr f2, f0
  2914. addi BO2, BO, 2 * SIZE
  2915. fpmr f3, f0
  2916. #endif
  /* TRMM inner count: K - KK, or KK + 1 (LEFT), or KK + 2 (otherwise). */
  2917. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2918. sub TEMP, K, KK
  2919. #elif defined(LEFT)
  2920. addi TEMP, KK, 1
  2921. #else
  2922. addi TEMP, KK, 2
  2923. #endif
  2924. srawi. r0, TEMP, 2
  2925. mtspr CTR, r0
  2926. ble .L1034
  2927. #else
  2928. addi AO2, AO, 2 * SIZE
  2929. fpmr f1, f0
  2930. addi BO, B, - 4 * SIZE
  2931. fpmr f2, f0
  2932. addi BO2, B, - 2 * SIZE
  2933. fpmr f3, f0
  2934. srawi. r0, K, 2
  2935. mtspr CTR, r0
  2936. ble .L1034
  2937. #endif
  /* Preload four A operands (alternating AO/AO2) and eight B operands
   * (alternating BO/BO2). */
  2938. LFPDUX A1, AO, INC4
  2939. LFPDUX B1, BO, INC4
  2940. LFPDUX B2, BO2, INC4
  2941. LFPDUX A2, AO2, INC4
  2942. LFPDUX B3, BO, INC4
  2943. LFPDUX B4, BO2, INC4
  2944. LFPDUX A3, AO, INC4
  2945. LFPDUX A5, BO, INC4
  2946. LFPDUX A6, BO2, INC4
  2947. LFPDUX A4, AO2, INC4
  2948. LFPDUX A7, BO, INC4
  2949. LFPDUX A8, BO2, INC4
  /* Only one pass requested: run the non-prefetching tail. */
  2950. bdz- .L1033
  /* Unrolled main loop for accumulators f0-f3.  Each pass consumes four
   * operand groups (A1 with B1/B2, A2 with B3/B4, A3 with A5/A6, A4 with
   * A7/A8; here A5-A8 hold values read from BO/BO2) and reloads each
   * register after its last use. */
  2951. .align 4
  2952. .L1032:
  2953. FXCPMADD f0, B1, A1, f0
  2954. FXCSMADD f1, B1, A1, f1
  2955. LFPDUX B1, BO, INC4
  2956. FXCPMADD f2, B2, A1, f2
  2957. FXCSMADD f3, B2, A1, f3
  2958. LFPDUX B2, BO2, INC4
  2959. LFPDUX A1, AO, INC4
  2960. FXCPMADD f0, B3, A2, f0
  2961. FXCSMADD f1, B3, A2, f1
  2962. LFPDUX B3, BO, INC4
  2963. FXCPMADD f2, B4, A2, f2
  2964. FXCSMADD f3, B4, A2, f3
  2965. LFPDUX B4, BO2, INC4
  2966. LFPDUX A2, AO2, INC4
  2967. FXCPMADD f0, A5, A3, f0
  2968. FXCSMADD f1, A5, A3, f1
  2969. LFPDUX A5, BO, INC4
  2970. FXCPMADD f2, A6, A3, f2
  2971. FXCSMADD f3, A6, A3, f3
  2972. LFPDUX A6, BO2, INC4
  2973. LFPDUX A3, AO, INC4
  2974. FXCPMADD f0, A7, A4, f0
  2975. FXCSMADD f1, A7, A4, f1
  2976. LFPDUX A7, BO, INC4
  2977. FXCPMADD f2, A8, A4, f2
  2978. FXCSMADD f3, A8, A4, f3
  2979. LFPDUX A8, BO2, INC4
  2980. LFPDUX A4, AO2, INC4
  /* Decrement CTR and repeat while it is non-zero. */
  2981. bdnz+ .L1032
  /* Final pass of the unrolled loop: the same sixteen multiply-adds as
   * .L1032 with no loads, using the operands already in registers. */
  2982. .align 4
  2983. .L1033:
  2984. FXCPMADD f0, B1, A1, f0
  2985. FXCSMADD f1, B1, A1, f1
  2986. FXCPMADD f2, B2, A1, f2
  2987. FXCSMADD f3, B2, A1, f3
  2988. FXCPMADD f0, B3, A2, f0
  2989. FXCSMADD f1, B3, A2, f1
  2990. FXCPMADD f2, B4, A2, f2
  2991. FXCSMADD f3, B4, A2, f3
  2992. FXCPMADD f0, A5, A3, f0
  2993. FXCSMADD f1, A5, A3, f1
  2994. FXCPMADD f2, A6, A3, f2
  2995. FXCSMADD f3, A6, A3, f3
  2996. FXCPMADD f0, A7, A4, f0
  2997. FXCSMADD f1, A7, A4, f1
  2998. FXCPMADD f2, A8, A4, f2
  2999. FXCSMADD f3, A8, A4, f3
  /* Loads AP from the ALPHA stack slot (and, under TRMMKERNEL, f30 from the
   * FZERO slot), then runs the remainder loop CTR = count & 3 times for
   * accumulators f0-f3. */
  3000. .align 4
  3001. .L1034:
  3002. li r0, ALPHA
  3003. lfpdx AP, SP, r0
  3004. #ifdef TRMMKERNEL
  3005. li r0, FZERO
  3006. lfpsx f30, SP, r0
  3007. #endif
  3008. #if defined(TRMMKERNEL)
  3009. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3010. sub TEMP, K, KK
  3011. #elif defined(LEFT)
  3012. addi TEMP, KK, 1
  3013. #else
  3014. addi TEMP, KK, 2
  3015. #endif
  3016. andi. r0, TEMP, 3
  3017. mtspr CTR, r0
  3018. #else
  3019. andi. r0, K, 3
  3020. mtspr CTR, r0
  3021. #endif
  /* A zero remainder skips straight to the write-back. */
  3022. ble+ .L1038
  /* A1 is fetched with LFPDX here and AO is then bumped by INC2 with an
   * explicit add, unlike the LFPDUX/INC4 pattern used for BO and BO2. */
  3023. LFPDX A1, AO, INC4
  3024. LFPDUX B1, BO, INC4
  3025. LFPDUX B2, BO2, INC4
  3026. add AO, AO, INC2
  3027. bdz- .L1037
  3028. .align 4
  3029. .L1036:
  3030. FXCPMADD f0, B1, A1, f0
  3031. FXCSMADD f1, B1, A1, f1
  3032. LFPDUX B1, BO, INC4
  3033. FXCPMADD f2, B2, A1, f2
  3034. FXCSMADD f3, B2, A1, f3
  3035. LFPDX A1, AO, INC4
  3036. LFPDUX B2, BO2, INC4
  3037. add AO, AO, INC2
  3038. bdnz+ .L1036
  3039. .align 4
  3040. .L1037:
  /* Last step: arithmetic only. */
  3041. FXCPMADD f0, B1, A1, f0
  3042. FXCSMADD f1, B1, A1, f1
  3043. FXCPMADD f2, B2, A1, f2
  3044. FXCSMADD f3, B2, A1, f3
  /* Write-back for the section started at .L1030: fold f1 into f0 and f3
   * into f2, combine with AP, and store f0 through CO1 and f2 through CO2. */
  3045. .align 4
  3046. .L1038:
  3047. #ifndef TRMMKERNEL
  /* Non-TRMM: read the existing C values into A1 (CO1) and A2 (CO2). */
  3048. LFDUX A1, CO1, INC
  3049. LFDUX A2, CO2, INC
  3050. LFSDUX A1, CO1, INC
  3051. LFSDUX A2, CO2, INC
  3052. #endif
  /* Variant macros pick add or subtract when folding the partner sums. */
  3053. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  3054. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3055. fpadd f0, f0, f1
  3056. fpadd f2, f2, f3
  3057. #else
  3058. fpsub f0, f0, f1
  3059. fpsub f2, f2, f3
  3060. #endif
  3061. #ifndef TRMMKERNEL
  3062. fxcpmadd A1, f0, AP, A1
  3063. fxcpmadd A2, f2, AP, A2
  3064. fxcxnpma f0, f0, AP, A1
  3065. fxcxnpma f2, f2, AP, A2
  /* NOTE(review): INCM1 presumably steps the C pointer back over the
   * values just read - confirm against the INC* setup. */
  3066. STFDUX f0, CO1, INCM1
  3067. STFSDUX f0, CO1, INC
  3068. STFDUX f2, CO2, INCM1
  3069. STFSDUX f2, CO2, INC
  3070. #else
  /* TRMM: f30 (loaded from the FZERO slot at .L1034) is the addend. */
  3071. fxcpmadd f12, f0, AP, f30
  3072. fxcpmadd f13, f2, AP, f30
  3073. fxcxnpma f0, f0, AP, f12
  3074. fxcxnpma f2, f2, AP, f13
  3075. STFDUX f0, CO1, INC
  3076. STFSDUX f0, CO1, INC
  3077. STFDUX f2, CO2, INC
  3078. STFSDUX f2, CO2, INC
  3079. #endif
  3080. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: TEMP = K - KK - (1 if LEFT, else 2); AO advances by
   * TEMP << ZBASE_SHIFT and BO by TEMP << (1 + ZBASE_SHIFT); KK += 1 when
   * LEFT. */
  3081. #if ( defined(LEFT) && defined(TRANSA)) || \
  3082. (!defined(LEFT) && !defined(TRANSA))
  3083. sub TEMP, K, KK
  3084. #ifdef LEFT
  3085. addi TEMP, TEMP, -1
  3086. #else
  3087. addi TEMP, TEMP, -2
  3088. #endif
  3089. slwi r0, TEMP, 0 + ZBASE_SHIFT
  3090. slwi TEMP, TEMP, 1 + ZBASE_SHIFT
  3091. add AO, AO, r0
  3092. add BO, BO, TEMP
  3093. #endif
  3094. #ifdef LEFT
  3095. addi KK, KK, 1
  3096. #endif
  3097. #endif
  /* Reload f0 from the FZERO stack slot for the next section. */
  3098. li r0, FZERO
  3099. lfpsx f0, SP, r0
  /* End of one J iteration: under TRMMKERNEL without LEFT, KK grows by 2;
   * B is moved to BO + 4 * SIZE; J is decremented and control returns to
   * .L1010 while J is still positive. */
  3100. .align 4
  3101. .L1049:
  3102. #if defined(TRMMKERNEL) && !defined(LEFT)
  3103. addi KK, KK, 2
  3104. #endif
  3105. addi B, BO, 4 * SIZE
  3106. addic. J, J, -1
  3107. bgt+ .L1010
  /* Entry to the pass taken when bit 0 of N is set (otherwise jump to
   * .L10999).  Only CO1 is set up here (from C); under TRMMKERNEL with
   * LEFT, KK restarts from OFFSET.  AO is biased to A - 2 * SIZE, f0 is
   * loaded from the FZERO stack slot, and I = M >> 2 counts the passes
   * through .L1051 (none if that is not positive). */
  3108. .align 4
  3109. .L1050:
  3110. andi. J, N, 1
  3111. beq .L10999
  3112. mr CO1, C
  3113. #if defined(TRMMKERNEL) && defined(LEFT)
  3114. mr KK, OFFSET
  3115. #endif
  3116. addi AO, A, -2 * SIZE
  3117. li r0, FZERO
  3118. lfpsx f0, SP, r0
  3119. srawi. I, M, 2
  3120. ble .L1060
  /* Per-iteration setup for the loop counted by I = M >> 2 in this pass.
   * Copies f0 into f1-f7, points BO at B - 2 * SIZE (after the TRMM offset
   * where applicable), sets CTR = count >> 2 and preloads operands for the
   * unrolled loop at .L1052.  A single AO stream and a single BO stream
   * are used here, both stepping by INC2. */
  3121. .align 4
  3122. .L1051:
  3123. #if defined(TRMMKERNEL)
  3124. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  3125. fpmr f4, f0
  3126. addi BO, B, - 2 * SIZE
  3127. fpmr f1, f0
  3128. fpmr f5, f0
  3129. fpmr f2, f0
  3130. fpmr f6, f0
  3131. #else
  /* AO is offset by KK << (2 + ZBASE_SHIFT) and B by KK << ZBASE_SHIFT. */
  3132. slwi TEMP, KK, 2 + ZBASE_SHIFT
  3133. slwi r0, KK, 0 + ZBASE_SHIFT
  3134. add AO, AO, TEMP
  3135. add BO, B, r0
  3136. fpmr f4, f0
  3137. addi BO, BO, - 2 * SIZE
  3138. fpmr f1, f0
  3139. fpmr f5, f0
  3140. fpmr f2, f0
  3141. fpmr f6, f0
  3142. #endif
  /* TRMM inner count: K - KK, or KK + 4 (LEFT), or KK + 1 (otherwise). */
  3143. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3144. sub TEMP, K, KK
  3145. #elif defined(LEFT)
  3146. addi TEMP, KK, 4
  3147. #else
  3148. addi TEMP, KK, 1
  3149. #endif
  3150. srawi. r0, TEMP, 2
  3151. fpmr f3, f0
  3152. mtspr CTR, r0
  3153. fpmr f7, f0
  3154. ble .L1054
  3155. #else
  3156. srawi. r0, K, 2
  3157. fpmr f4, f0
  3158. addi BO, B, - 2 * SIZE
  3159. fpmr f1, f0
  3160. fpmr f5, f0
  3161. fpmr f2, f0
  3162. fpmr f6, f0
  3163. fpmr f3, f0
  3164. mtspr CTR, r0
  3165. fpmr f7, f0
  3166. ble .L1054
  3167. #endif
  /* Preload B1-B3 and A1-A8; B4 is fetched inside the loop/tail. */
  3168. LFPDUX B1, BO, INC2
  3169. LFPDUX A1, AO, INC2
  3170. LFPDUX A2, AO, INC2
  3171. LFPDUX B2, BO, INC2
  3172. LFPDUX A3, AO, INC2
  3173. LFPDUX A4, AO, INC2
  3174. LFPDUX B3, BO, INC2
  3175. LFPDUX A5, AO, INC2
  3176. LFPDUX A6, AO, INC2
  3177. LFPDUX A7, AO, INC2
  3178. LFPDUX A8, AO, INC2
  /* Only one pass requested: run the tail at .L1053. */
  3179. bdz- .L1053
  /* Unrolled main loop for accumulators f0-f7.  Each pass applies B1, B2,
   * B3 and B4 in turn to four A registers (A1-A4, then A5-A8, then the
   * reloaded A1-A4, then the reloaded A5-A8), reloading every register
   * right after its last use.
   * NOTE(review): the nops look like issue-slot padding - do not remove
   * without measuring. */
  3180. .align 4
  3181. .L1052:
  3182. FXCPMADD f0, B1, A1, f0
  3183. LFPDUX B4, BO, INC2
  3184. FXCSMADD f4, B1, A1, f4
  3185. LFPDUX A1, AO, INC2
  3186. FXCPMADD f1, B1, A2, f1
  3187. nop
  3188. FXCSMADD f5, B1, A2, f5
  3189. LFPDUX A2, AO, INC2
  3190. FXCPMADD f2, B1, A3, f2
  3191. nop
  3192. FXCSMADD f6, B1, A3, f6
  3193. LFPDUX A3, AO, INC2
  3194. FXCPMADD f3, B1, A4, f3
  3195. nop
  3196. FXCSMADD f7, B1, A4, f7
  3197. LFPDUX A4, AO, INC2
  3198. FXCPMADD f0, B2, A5, f0
  3199. LFPDUX B1, BO, INC2
  3200. FXCSMADD f4, B2, A5, f4
  3201. LFPDUX A5, AO, INC2
  3202. FXCPMADD f1, B2, A6, f1
  3203. nop
  3204. FXCSMADD f5, B2, A6, f5
  3205. LFPDUX A6, AO, INC2
  3206. FXCPMADD f2, B2, A7, f2
  3207. nop
  3208. FXCSMADD f6, B2, A7, f6
  3209. LFPDUX A7, AO, INC2
  3210. FXCPMADD f3, B2, A8, f3
  3211. nop
  3212. FXCSMADD f7, B2, A8, f7
  3213. LFPDUX A8, AO, INC2
  3214. FXCPMADD f0, B3, A1, f0
  3215. LFPDUX B2, BO, INC2
  3216. FXCSMADD f4, B3, A1, f4
  3217. LFPDUX A1, AO, INC2
  3218. FXCPMADD f1, B3, A2, f1
  3219. nop
  3220. FXCSMADD f5, B3, A2, f5
  3221. LFPDUX A2, AO, INC2
  3222. FXCPMADD f2, B3, A3, f2
  3223. nop
  3224. FXCSMADD f6, B3, A3, f6
  3225. LFPDUX A3, AO, INC2
  3226. FXCPMADD f3, B3, A4, f3
  3227. nop
  3228. FXCSMADD f7, B3, A4, f7
  3229. LFPDUX A4, AO, INC2
  3230. FXCPMADD f0, B4, A5, f0
  3231. LFPDUX B3, BO, INC2
  3232. FXCSMADD f4, B4, A5, f4
  3233. LFPDUX A5, AO, INC2
  3234. FXCPMADD f1, B4, A6, f1
  3235. nop
  3236. FXCSMADD f5, B4, A6, f5
  3237. LFPDUX A6, AO, INC2
  3238. FXCPMADD f2, B4, A7, f2
  3239. nop
  3240. FXCSMADD f6, B4, A7, f6
  3241. LFPDUX A7, AO, INC2
  3242. FXCPMADD f3, B4, A8, f3
  3243. nop
  3244. FXCSMADD f7, B4, A8, f7
  3245. LFPDUX A8, AO, INC2
  /* Decrement CTR and repeat while it is non-zero. */
  3246. bdnz+ .L1052
  /* Final pass of the unrolled loop.  The first two groups (B1, B2) still
   * fetch B4 and the second set of A1-A8 needed by the last two groups;
   * the B3 and B4 groups are pure arithmetic with nothing prefetched. */
  3247. .align 4
  3248. .L1053:
  3249. FXCPMADD f0, B1, A1, f0
  3250. LFPDUX B4, BO, INC2
  3251. FXCSMADD f4, B1, A1, f4
  3252. LFPDUX A1, AO, INC2
  3253. FXCPMADD f1, B1, A2, f1
  3254. nop
  3255. FXCSMADD f5, B1, A2, f5
  3256. LFPDUX A2, AO, INC2
  3257. FXCPMADD f2, B1, A3, f2
  3258. nop
  3259. FXCSMADD f6, B1, A3, f6
  3260. LFPDUX A3, AO, INC2
  3261. FXCPMADD f3, B1, A4, f3
  3262. nop
  3263. FXCSMADD f7, B1, A4, f7
  3264. LFPDUX A4, AO, INC2
  3265. FXCPMADD f0, B2, A5, f0
  3266. nop
  3267. FXCSMADD f4, B2, A5, f4
  3268. LFPDUX A5, AO, INC2
  3269. FXCPMADD f1, B2, A6, f1
  3270. nop
  3271. FXCSMADD f5, B2, A6, f5
  3272. LFPDUX A6, AO, INC2
  3273. FXCPMADD f2, B2, A7, f2
  3274. nop
  3275. FXCSMADD f6, B2, A7, f6
  3276. LFPDUX A7, AO, INC2
  3277. FXCPMADD f3, B2, A8, f3
  3278. nop
  3279. FXCSMADD f7, B2, A8, f7
  3280. LFPDUX A8, AO, INC2
  3281. FXCPMADD f0, B3, A1, f0
  3282. FXCSMADD f4, B3, A1, f4
  3283. FXCPMADD f1, B3, A2, f1
  3284. FXCSMADD f5, B3, A2, f5
  3285. FXCPMADD f2, B3, A3, f2
  3286. FXCSMADD f6, B3, A3, f6
  3287. FXCPMADD f3, B3, A4, f3
  3288. FXCSMADD f7, B3, A4, f7
  3289. FXCPMADD f0, B4, A5, f0
  3290. FXCSMADD f4, B4, A5, f4
  3291. FXCPMADD f1, B4, A6, f1
  3292. FXCSMADD f5, B4, A6, f5
  3293. FXCPMADD f2, B4, A7, f2
  3294. FXCSMADD f6, B4, A7, f6
  3295. FXCPMADD f3, B4, A8, f3
  3296. FXCSMADD f7, B4, A8, f7
  /* Loads AP from the ALPHA stack slot (and, under TRMMKERNEL, f30 from the
   * FZERO slot), then runs the remainder loop CTR = count & 3 times for
   * accumulators f0-f7. */
  3297. .align 4
  3298. .L1054:
  3299. li r0, ALPHA
  3300. lfpdx AP, SP, r0
  3301. #ifdef TRMMKERNEL
  3302. li r0, FZERO
  3303. lfpsx f30, SP, r0
  3304. #endif
  3305. #if defined(TRMMKERNEL)
  3306. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3307. sub TEMP, K, KK
  3308. #elif defined(LEFT)
  3309. addi TEMP, KK, 4
  3310. #else
  3311. addi TEMP, KK, 1
  3312. #endif
  3313. andi. r0, TEMP, 3
  3314. mtspr CTR, r0
  3315. #else
  3316. andi. r0, K, 3
  3317. mtspr CTR, r0
  3318. #endif
  /* A zero remainder skips straight to the write-back. */
  3319. ble+ .L1058
  3320. LFPDUX A1, AO, INC2
  3321. LFPDUX B1, BO, INC2
  3322. LFPDUX A2, AO, INC2
  3323. LFPDUX A3, AO, INC2
  3324. LFPDUX A4, AO, INC2
  3325. bdz- .L1057
  3326. .align 4
  3327. .L1056:
  /* One step per pass: B1 against A1-A4, each reloaded after use. */
  3328. FXCPMADD f0, B1, A1, f0
  3329. FXCSMADD f4, B1, A1, f4
  3330. LFPDUX A1, AO, INC2
  3331. FXCPMADD f1, B1, A2, f1
  3332. FXCSMADD f5, B1, A2, f5
  3333. LFPDUX A2, AO, INC2
  3334. FXCPMADD f2, B1, A3, f2
  3335. FXCSMADD f6, B1, A3, f6
  3336. LFPDUX A3, AO, INC2
  3337. FXCPMADD f3, B1, A4, f3
  3338. FXCSMADD f7, B1, A4, f7
  3339. LFPDUX A4, AO, INC2
  3340. LFPDUX B1, BO, INC2
  3341. bdnz+ .L1056
  3342. .align 4
  3343. .L1057:
  /* Last step: arithmetic only. */
  3344. FXCPMADD f0, B1, A1, f0
  3345. FXCSMADD f4, B1, A1, f4
  3346. FXCPMADD f1, B1, A2, f1
  3347. FXCSMADD f5, B1, A2, f5
  3348. FXCPMADD f2, B1, A3, f2
  3349. FXCSMADD f6, B1, A3, f6
  3350. FXCPMADD f3, B1, A4, f3
  3351. FXCSMADD f7, B1, A4, f7
  /* Write-back for the section started at .L1051: fold f4-f7 into f0-f3,
   * combine with AP, store all four results through CO1, then loop back
   * to .L1051 while I is still positive. */
  3352. .align 4
  3353. .L1058:
  3354. #ifndef TRMMKERNEL
  /* Non-TRMM: read the existing C values into A1-A4 via CO1. */
  3355. LFDUX A1, CO1, INC
  3356. LFDUX A2, CO1, INC2
  3357. LFDUX A3, CO1, INC2
  3358. LFDUX A4, CO1, INC2
  3359. LFSDUX A1, CO1, INCM5
  3360. LFSDUX A2, CO1, INC2
  3361. LFSDUX A3, CO1, INC2
  3362. LFSDUX A4, CO1, INC2
  3363. #endif
  /* Variant macros pick add or subtract when folding the partner sums. */
  3364. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  3365. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3366. fpadd f0, f0, f4
  3367. fpadd f1, f1, f5
  3368. fpadd f2, f2, f6
  3369. fpadd f3, f3, f7
  3370. #else
  3371. fpsub f0, f0, f4
  3372. fpsub f1, f1, f5
  3373. fpsub f2, f2, f6
  3374. fpsub f3, f3, f7
  3375. #endif
  3376. #ifndef TRMMKERNEL
  3377. fxcpmadd A1, f0, AP, A1
  3378. fxcpmadd A2, f1, AP, A2
  3379. fxcpmadd A3, f2, AP, A3
  3380. fxcpmadd A4, f3, AP, A4
  3381. fxcxnpma f0, f0, AP, A1
  3382. fxcxnpma f1, f1, AP, A2
  3383. fxcxnpma f2, f2, AP, A3
  3384. fxcxnpma f3, f3, AP, A4
  /* NOTE(review): INCM7 presumably rewinds CO1 to the start of the values
   * just read - confirm against the INC* setup. */
  3385. STFDUX f0, CO1, INCM7
  3386. STFSDUX f0, CO1, INC
  3387. STFDUX f1, CO1, INC
  3388. STFSDUX f1, CO1, INC
  3389. STFDUX f2, CO1, INC
  3390. STFSDUX f2, CO1, INC
  3391. STFDUX f3, CO1, INC
  3392. STFSDUX f3, CO1, INC
  3393. #else
  /* TRMM: f30 (loaded from the FZERO slot at .L1054) is the addend. */
  3394. fxcpmadd f12, f0, AP, f30
  3395. fxcpmadd f13, f1, AP, f30
  3396. fxcpmadd f14, f2, AP, f30
  3397. fxcpmadd f15, f3, AP, f30
  3398. fxcxnpma f0, f0, AP, f12
  3399. fxcxnpma f1, f1, AP, f13
  3400. fxcxnpma f2, f2, AP, f14
  3401. fxcxnpma f3, f3, AP, f15
  3402. STFDUX f0, CO1, INC
  3403. STFSDUX f0, CO1, INC
  3404. STFDUX f1, CO1, INC
  3405. STFSDUX f1, CO1, INC
  3406. STFDUX f2, CO1, INC
  3407. STFSDUX f2, CO1, INC
  3408. STFDUX f3, CO1, INC
  3409. STFSDUX f3, CO1, INC
  3410. #endif
  3411. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: TEMP = K - KK - (4 if LEFT, else 1); AO advances by
   * TEMP << (2 + ZBASE_SHIFT) and BO by TEMP << ZBASE_SHIFT; KK += 4 when
   * LEFT. */
  3412. #if ( defined(LEFT) && defined(TRANSA)) || \
  3413. (!defined(LEFT) && !defined(TRANSA))
  3414. sub TEMP, K, KK
  3415. #ifdef LEFT
  3416. addi TEMP, TEMP, -4
  3417. #else
  3418. addi TEMP, TEMP, -1
  3419. #endif
  3420. slwi r0, TEMP, 2 + ZBASE_SHIFT
  3421. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3422. add AO, AO, r0
  3423. add BO, BO, TEMP
  3424. #endif
  3425. #ifdef LEFT
  3426. addi KK, KK, 4
  3427. #endif
  3428. #endif
  /* Next iteration: decrement I, reload f0 from the FZERO stack slot, and
   * loop while I is still positive (bgt tests the addic. result). */
  3429. addic. I, I, -1
  3430. li r0, FZERO
  3431. lfpsx f0, SP, r0
  3432. bgt+ .L1051
  /* Section taken when bit 1 of M is set (otherwise skip to .L1070).
   * Copies f0 into f1-f3, points BO at B - 2 * SIZE (after the TRMM offset
   * where applicable), sets CTR = count >> 2 and preloads operands for the
   * unrolled loop at .L1062. */
  3433. .align 4
  3434. .L1060:
  3435. andi. I, M, 2
  3436. beq .L1070
  3437. #if defined(TRMMKERNEL)
  3438. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  3439. addi BO, B, - 2 * SIZE
  3440. fpmr f1, f0
  3441. #else
  /* AO is offset by KK << (1 + ZBASE_SHIFT) and B by KK << ZBASE_SHIFT. */
  3442. slwi TEMP, KK, 1 + ZBASE_SHIFT
  3443. slwi r0, KK, 0 + ZBASE_SHIFT
  3444. add AO, AO, TEMP
  3445. add BO, B, r0
  3446. addi BO, BO, - 2 * SIZE
  3447. fpmr f1, f0
  3448. #endif
  /* TRMM inner count: K - KK, or KK + 2 (LEFT), or KK + 1 (otherwise). */
  3449. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3450. sub TEMP, K, KK
  3451. #elif defined(LEFT)
  3452. addi TEMP, KK, 2
  3453. #else
  3454. addi TEMP, KK, 1
  3455. #endif
  3456. srawi. r0, TEMP, 2
  3457. fpmr f2, f0
  3458. mtspr CTR, r0
  3459. fpmr f3, f0
  3460. ble .L1064
  3461. #else
  3462. srawi. r0, K, 2
  3463. fpmr f1, f0
  3464. addi BO, B, - 2 * SIZE
  3465. fpmr f2, f0
  3466. mtspr CTR, r0
  3467. fpmr f3, f0
  3468. ble .L1064
  3469. #endif
  /* Preload B1-B4 and A1-A8 for the first pass. */
  3470. LFPDUX B1, BO, INC2
  3471. LFPDUX A1, AO, INC2
  3472. LFPDUX A2, AO, INC2
  3473. LFPDUX B2, BO, INC2
  3474. LFPDUX A3, AO, INC2
  3475. LFPDUX A4, AO, INC2
  3476. LFPDUX B3, BO, INC2
  3477. LFPDUX A5, AO, INC2
  3478. LFPDUX A6, AO, INC2
  3479. LFPDUX B4, BO, INC2
  3480. LFPDUX A7, AO, INC2
  3481. LFPDUX A8, AO, INC2
  /* Only one pass requested: run the non-prefetching tail. */
  3482. bdz- .L1063
  /* Unrolled main loop for accumulators f0-f3.  Each pass applies B1 to
   * A1/A2, B2 to A3/A4, B3 to A5/A6 and B4 to A7/A8, reloading every
   * register after its last use. */
  3483. .align 4
  3484. .L1062:
  3485. FXCPMADD f0, B1, A1, f0
  3486. FXCSMADD f2, B1, A1, f2
  3487. LFPDUX A1, AO, INC2
  3488. FXCPMADD f1, B1, A2, f1
  3489. FXCSMADD f3, B1, A2, f3
  3490. LFPDUX A2, AO, INC2
  3491. LFPDUX B1, BO, INC2
  3492. FXCPMADD f0, B2, A3, f0
  3493. FXCSMADD f2, B2, A3, f2
  3494. LFPDUX A3, AO, INC2
  3495. FXCPMADD f1, B2, A4, f1
  3496. FXCSMADD f3, B2, A4, f3
  3497. LFPDUX A4, AO, INC2
  3498. LFPDUX B2, BO, INC2
  3499. FXCPMADD f0, B3, A5, f0
  3500. FXCSMADD f2, B3, A5, f2
  3501. LFPDUX A5, AO, INC2
  3502. FXCPMADD f1, B3, A6, f1
  3503. FXCSMADD f3, B3, A6, f3
  3504. LFPDUX A6, AO, INC2
  3505. LFPDUX B3, BO, INC2
  3506. FXCPMADD f0, B4, A7, f0
  3507. FXCSMADD f2, B4, A7, f2
  3508. LFPDUX A7, AO, INC2
  3509. FXCPMADD f1, B4, A8, f1
  3510. FXCSMADD f3, B4, A8, f3
  3511. LFPDUX A8, AO, INC2
  3512. LFPDUX B4, BO, INC2
  /* Decrement CTR and repeat while it is non-zero. */
  3513. bdnz+ .L1062
  /* Final pass of the unrolled loop: the same sixteen multiply-adds as
   * .L1062 with no loads, using the operands already in registers. */
  3514. .align 4
  3515. .L1063:
  3516. FXCPMADD f0, B1, A1, f0
  3517. FXCSMADD f2, B1, A1, f2
  3518. FXCPMADD f1, B1, A2, f1
  3519. FXCSMADD f3, B1, A2, f3
  3520. FXCPMADD f0, B2, A3, f0
  3521. FXCSMADD f2, B2, A3, f2
  3522. FXCPMADD f1, B2, A4, f1
  3523. FXCSMADD f3, B2, A4, f3
  3524. FXCPMADD f0, B3, A5, f0
  3525. FXCSMADD f2, B3, A5, f2
  3526. FXCPMADD f1, B3, A6, f1
  3527. FXCSMADD f3, B3, A6, f3
  3528. FXCPMADD f0, B4, A7, f0
  3529. FXCSMADD f2, B4, A7, f2
  3530. FXCPMADD f1, B4, A8, f1
  3531. FXCSMADD f3, B4, A8, f3
  /* Loads AP from the ALPHA stack slot (and, under TRMMKERNEL, f30 from the
   * FZERO slot), then runs the remainder loop CTR = count & 3 times for
   * accumulators f0-f3. */
  3532. .align 4
  3533. .L1064:
  3534. li r0, ALPHA
  3535. lfpdx AP, SP, r0
  3536. #ifdef TRMMKERNEL
  3537. li r0, FZERO
  3538. lfpsx f30, SP, r0
  3539. #endif
  3540. #if defined(TRMMKERNEL)
  3541. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3542. sub TEMP, K, KK
  3543. #elif defined(LEFT)
  3544. addi TEMP, KK, 2
  3545. #else
  3546. addi TEMP, KK, 1
  3547. #endif
  3548. andi. r0, TEMP, 3
  3549. mtspr CTR, r0
  3550. #else
  3551. andi. r0, K, 3
  3552. mtspr CTR, r0
  3553. #endif
  /* A zero remainder skips straight to the write-back. */
  3554. ble+ .L1068
  3555. LFPDUX A1, AO, INC2
  3556. LFPDUX B1, BO, INC2
  3557. LFPDUX A2, AO, INC2
  3558. bdz- .L1067
  3559. .align 4
  3560. .L1066:
  /* One step per pass: B1 against A1 and A2, each reloaded after use. */
  3561. FXCPMADD f0, B1, A1, f0
  3562. FXCSMADD f2, B1, A1, f2
  3563. LFPDUX A1, AO, INC2
  3564. FXCPMADD f1, B1, A2, f1
  3565. FXCSMADD f3, B1, A2, f3
  3566. LFPDUX B1, BO, INC2
  3567. LFPDUX A2, AO, INC2
  3568. bdnz+ .L1066
  3569. .align 4
  3570. .L1067:
  /* Last step: arithmetic only. */
  3571. FXCPMADD f0, B1, A1, f0
  3572. FXCSMADD f2, B1, A1, f2
  3573. FXCPMADD f1, B1, A2, f1
  3574. FXCSMADD f3, B1, A2, f3
  /* Write-back for the section started at .L1060: fold f2 into f0 and f3
   * into f1, combine with AP, and store both results through CO1. */
  3575. .align 4
  3576. .L1068:
  3577. #ifndef TRMMKERNEL
  /* Non-TRMM: read the existing C values into A1 and A2 via CO1. */
  3578. LFDUX A1, CO1, INC
  3579. LFDUX A2, CO1, INC2
  3580. LFSDUX A1, CO1, INCM1
  3581. LFSDUX A2, CO1, INC2
  3582. #endif
  /* Variant macros pick add or subtract when folding the partner sums. */
  3583. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  3584. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3585. fpadd f0, f0, f2
  3586. fpadd f1, f1, f3
  3587. #else
  3588. fpsub f0, f0, f2
  3589. fpsub f1, f1, f3
  3590. #endif
  3591. #ifndef TRMMKERNEL
  3592. fxcpmadd A1, f0, AP, A1
  3593. fxcpmadd A2, f1, AP, A2
  3594. fxcxnpma f0, f0, AP, A1
  3595. fxcxnpma f1, f1, AP, A2
  /* NOTE(review): INCM3 presumably rewinds CO1 over the values just read -
   * confirm against the INC* setup. */
  3596. STFDUX f0, CO1, INCM3
  3597. STFSDUX f0, CO1, INC
  3598. STFDUX f1, CO1, INC
  3599. STFSDUX f1, CO1, INC
  3600. #else
  /* TRMM: f30 (loaded from the FZERO slot at .L1064) is the addend. */
  3601. fxcpmadd f12, f0, AP, f30
  3602. fxcpmadd f13, f1, AP, f30
  3603. fxcxnpma f0, f0, AP, f12
  3604. fxcxnpma f1, f1, AP, f13
  3605. STFDUX f0, CO1, INC
  3606. STFSDUX f0, CO1, INC
  3607. STFDUX f1, CO1, INC
  3608. STFSDUX f1, CO1, INC
  3609. #endif
  3610. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: TEMP = K - KK - (2 if LEFT, else 1); AO advances by
   * TEMP << (1 + ZBASE_SHIFT) and BO by TEMP << ZBASE_SHIFT; KK += 2 when
   * LEFT. */
  3611. #if ( defined(LEFT) && defined(TRANSA)) || \
  3612. (!defined(LEFT) && !defined(TRANSA))
  3613. sub TEMP, K, KK
  3614. #ifdef LEFT
  3615. addi TEMP, TEMP, -2
  3616. #else
  3617. addi TEMP, TEMP, -1
  3618. #endif
  3619. slwi r0, TEMP, 1 + ZBASE_SHIFT
  3620. slwi TEMP, TEMP, 0 + ZBASE_SHIFT
  3621. add AO, AO, r0
  3622. add BO, BO, TEMP
  3623. #endif
  3624. #ifdef LEFT
  3625. addi KK, KK, 2
  3626. #endif
  3627. #endif
  /* Reload f0 from the FZERO stack slot for the next section. */
  3628. li r0, FZERO
  3629. lfpsx f0, SP, r0
  /* .L1070: setup for the case where bit 0 of M is set; when M & 1 is zero
     the whole section is skipped by branching to .L1089. */
  3630. .align 4
  3631. .L1070:
  3632. andi. I, M, 1
  3633. beq .L1089
  3634. #if defined(TRMMKERNEL)
  /* TRMM: choose the starting BO (and, in the second arm, AO).
     First arm:  BO = B - 2 * SIZE.
     Second arm: AO += KK << ZBASE_SHIFT and
                 BO = B + (KK << ZBASE_SHIFT) - 2 * SIZE.
     Both arms also copy f0 into f1. */
  3635. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  3636. addi BO, B, - 2 * SIZE
  3637. fpmr f1, f0
  3638. #else
  3639. slwi TEMP, KK, 0 + ZBASE_SHIFT
  3640. slwi r0, KK, 0 + ZBASE_SHIFT
  3641. add AO, AO, TEMP
  3642. add BO, B, r0
  3643. addi BO, BO, - 2 * SIZE
  3644. fpmr f1, f0
  3645. #endif
  /* TEMP = iteration count for this tile: K - KK in the first arm,
     KK + 1 otherwise (the #elif and #else arms are identical here). */
  3646. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3647. sub TEMP, K, KK
  3648. #elif defined(LEFT)
  3649. addi TEMP, KK, 1
  3650. #else
  3651. addi TEMP, KK, 1
  3652. #endif
  /* CTR = TEMP >> 3 (arithmetic shift), i.e. the number of passes of the
     8-way unrolled loop.  f2 and f3 are seeded from f0 in between.  The
     ble tests the condition recorded by srawi. and skips to .L1074 when
     the shifted count is not positive. */
  3653. srawi. r0, TEMP, 3
  3654. fpmr f2, f0
  3655. mtspr CTR, r0
  3656. fpmr f3, f0
  3657. ble .L1074
  3658. #else
  /* Non-TRMM: BO = B - 2 * SIZE, f1..f3 seeded from f0, CTR = K >> 3;
     skip to .L1074 when the shifted count is not positive. */
  3659. addi BO, B, - 2 * SIZE
  3660. fpmr f1, f0
  3661. srawi. r0, K, 3
  3662. fpmr f2, f0
  3663. mtspr CTR, r0
  3664. fpmr f3, f0
  3665. ble .L1074
  3666. #endif
  /* Preload for the 8-way unrolled loop: eight operands through AO
     (A1..A8) and eight through BO (B1..B6, then A9 and A10, which are
     used here as two further BO-side registers). */
  3667. LFPDUX A1, AO, INC2
  3668. LFPDUX B1, BO, INC2
  3669. LFPDUX A2, AO, INC2
  3670. LFPDUX B2, BO, INC2
  3671. LFPDUX A3, AO, INC2
  3672. LFPDUX B3, BO, INC2
  3673. LFPDUX A4, AO, INC2
  3674. LFPDUX B4, BO, INC2
  3675. LFPDUX A5, AO, INC2
  3676. LFPDUX B5, BO, INC2
  3677. LFPDUX A6, AO, INC2
  3678. LFPDUX B6, BO, INC2
  3679. LFPDUX A7, AO, INC2
  3680. LFPDUX A9, BO, INC2
  3681. LFPDUX A8, AO, INC2
  3682. LFPDUX A10, BO, INC2
  /* bdz decrements CTR before testing it; with a single pass requested,
     go directly to the drain at .L1073. */
  3683. bdz- .L1073
  3684. .align 4
  3685. .L1072:
  /* Loop body: eight FXCPMADD/FXCSMADD pairs per pass.  Pairs 1, 3, 5, 7
     accumulate into f0/f1 and pairs 2, 4, 6, 8 into f2/f3.  Right after
     each pair, the two registers it used are reloaded for the next pass. */
  3686. FXCPMADD f0, B1, A1, f0
  3687. FXCSMADD f1, B1, A1, f1
  3688. LFPDUX A1, AO, INC2
  3689. LFPDUX B1, BO, INC2
  3690. FXCPMADD f2, B2, A2, f2
  3691. FXCSMADD f3, B2, A2, f3
  3692. LFPDUX A2, AO, INC2
  3693. LFPDUX B2, BO, INC2
  3694. FXCPMADD f0, B3, A3, f0
  3695. FXCSMADD f1, B3, A3, f1
  3696. LFPDUX A3, AO, INC2
  3697. LFPDUX B3, BO, INC2
  3698. FXCPMADD f2, B4, A4, f2
  3699. FXCSMADD f3, B4, A4, f3
  3700. LFPDUX A4, AO, INC2
  3701. LFPDUX B4, BO, INC2
  3702. FXCPMADD f0, B5, A5, f0
  3703. FXCSMADD f1, B5, A5, f1
  3704. LFPDUX A5, AO, INC2
  3705. LFPDUX B5, BO, INC2
  3706. FXCPMADD f2, B6, A6, f2
  3707. FXCSMADD f3, B6, A6, f3
  3708. LFPDUX A6, AO, INC2
  3709. LFPDUX B6, BO, INC2
  3710. FXCPMADD f0, A9, A7, f0
  3711. FXCSMADD f1, A9, A7, f1
  3712. LFPDUX A7, AO, INC2
  3713. LFPDUX A9, BO, INC2
  3714. FXCPMADD f2, A10, A8, f2
  3715. FXCSMADD f3, A10, A8, f3
  3716. LFPDUX A8, AO, INC2
  3717. LFPDUX A10, BO, INC2
  3718. bdnz+ .L1072
  3719. .align 4
  3720. .L1073:
  /* Drain: the same eight pairs as the loop body, applied to the operands
     already in registers, with no further loads from AO or BO. */
  3721. FXCPMADD f0, B1, A1, f0
  3722. FXCSMADD f1, B1, A1, f1
  3723. FXCPMADD f2, B2, A2, f2
  3724. FXCSMADD f3, B2, A2, f3
  3725. FXCPMADD f0, B3, A3, f0
  3726. FXCSMADD f1, B3, A3, f1
  3727. FXCPMADD f2, B4, A4, f2
  3728. FXCSMADD f3, B4, A4, f3
  3729. FXCPMADD f0, B5, A5, f0
  3730. FXCSMADD f1, B5, A5, f1
  3731. FXCPMADD f2, B6, A6, f2
  3732. FXCSMADD f3, B6, A6, f3
  3733. FXCPMADD f0, A9, A7, f0
  3734. FXCSMADD f1, A9, A7, f1
  3735. FXCPMADD f2, A10, A8, f2
  3736. FXCSMADD f3, A10, A8, f3
  /* .L1074: load the scaling operand and run the iterations left over
     after the 8-way unrolled loop. */
  3737. .align 4
  3738. .L1074:
  /* AP is loaded from the stack slot at offset ALPHA. */
  3739. li r0, ALPHA
  3740. lfpdx AP, SP, r0
  /* TRMM only: f30 is loaded from the stack slot at offset FZERO; the
     TRMM write-back at .L1078 uses it as its addend. */
  3741. #ifdef TRMMKERNEL
  3742. li r0, FZERO
  3743. lfpsx f30, SP, r0
  3744. #endif
  /* CTR = count & 7, where count is TEMP (recomputed exactly as in the
     setup at .L1070) for TRMM builds and K otherwise. */
  3745. #if defined(TRMMKERNEL)
  3746. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3747. sub TEMP, K, KK
  3748. #elif defined(LEFT)
  3749. addi TEMP, KK, 1
  3750. #else
  3751. addi TEMP, KK, 1
  3752. #endif
  3753. andi. r0, TEMP, 7
  3754. mtspr CTR, r0
  3755. #else
  3756. andi. r0, K, 7
  3757. mtspr CTR, r0
  3758. #endif
  /* The branch tests the condition recorded by andi.: nothing left over
     means going straight to the write-back at .L1078. */
  3759. ble+ .L1078
  /* Preload one operand through AO and one through BO; with exactly one
     iteration left, skip the loop body and use the drain at .L1077. */
  3760. LFPDUX A1, AO, INC2
  3761. LFPDUX B1, BO, INC2
  3762. bdz- .L1077
  3763. .align 4
  3764. .L1076:
  /* Loop body: one FXCPMADD/FXCSMADD pair into f0/f1, then reload A1 and
     B1 for the next pass. */
  3765. FXCPMADD f0, B1, A1, f0
  3766. FXCSMADD f1, B1, A1, f1
  3767. LFPDUX A1, AO, INC2
  3768. LFPDUX B1, BO, INC2
  3769. bdnz+ .L1076
  3770. .align 4
  3771. .L1077:
  /* Drain: apply the last loaded A1/B1 pair without further loads. */
  3772. FXCPMADD f0, B1, A1, f0
  3773. FXCSMADD f1, B1, A1, f1
  /* .L1078: write-back for the tile accumulated in f0..f3.  The partial
     sums are folded into f0, combined with AP and stored as two values
     through CO1. */
  3774. .align 4
  3775. .L1078:
  /* Non-TRMM build only: fetch two existing C values through CO1 into A1
     and A2, stepping CO1 by INC each time. */
  3776. #ifndef TRMMKERNEL
  3777. LFDUX A1, CO1, INC
  3778. LFDUX A2, CO1, INC
  3779. #endif
  /* Merge the two accumulator sets: f0 += f2, f1 += f3. */
  3780. fpadd f0, f0, f2
  3781. fpadd f1, f1, f3
  /* Presumably packs the value held in A2 into the other half of A1 so
     that A1 carries both loaded C values - confirm against the FP2
     definition of fsmfp.
     NOTE(review): this instruction is outside the #ifndef, so it also
     runs in TRMMKERNEL builds, where A1/A2 are not loaded above and the
     TRMM path below does not name them. */
  3782. fsmfp A1, A2
  /* Fold f1 into f0: added for the NN/NT/TN/TT/RN/RT/CN/CT variants,
     subtracted for every other variant. */
  3783. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  3784. defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3785. fpadd f0, f0, f1
  3786. #else
  3787. fpsub f0, f0, f1
  3788. #endif
  /* Non-TRMM: two-step update with AP on top of A1, then store both
     halves of f0.  CO1 is stepped by INCM1 for the first store and by INC
     for the second. */
  3789. #ifndef TRMMKERNEL
  3790. fxcpmadd A1, f0, AP, A1
  3791. fxcxnpma f0, f0, AP, A1
  3792. STFDUX f0, CO1, INCM1
  3793. STFSDUX f0, CO1, INC
  3794. #else
  /* TRMM: same two-step update with f30 as the addend and f12 as the
     intermediate; C is not read, so both stores step CO1 by INC. */
  3795. fxcpmadd f12, f0, AP, f30
  3796. fxcxnpma f0, f0, AP, f12
  3797. STFDUX f0, CO1, INC
  3798. STFSDUX f0, CO1, INC
  3799. #endif
  /* Reload f0 from the stack slot at offset FZERO. */
  3800. li r0, FZERO
  3801. lfpsx f0, SP, r0
  /* .L1089: B is set to BO + 2 * SIZE. */
  3802. .align 4
  3803. .L1089:
  3804. addi B, BO, 2 * SIZE
  3805. .align 4
  3806. .L10999:
  /* Common exit: restore the saved registers and return.
     NOTE(review): the offsets used below must mirror the save sequence in
     the prologue, which should be checked when touching this code. */
  /* Step SP by 20, then reload r14..r31 (18 registers).  Each lwzu reads
     the word at SP + 4 and writes that address back into SP, so SP walks
     upward 4 bytes per register. */
  3807. addi SP, SP, 20
  3808. lwzu r14, 4(SP)
  3809. lwzu r15, 4(SP)
  3810. lwzu r16, 4(SP)
  3811. lwzu r17, 4(SP)
  3812. lwzu r18, 4(SP)
  3813. lwzu r19, 4(SP)
  3814. lwzu r20, 4(SP)
  3815. lwzu r21, 4(SP)
  3816. lwzu r22, 4(SP)
  3817. lwzu r23, 4(SP)
  3818. lwzu r24, 4(SP)
  3819. lwzu r25, 4(SP)
  3820. lwzu r26, 4(SP)
  3821. lwzu r27, 4(SP)
  3822. lwzu r28, 4(SP)
  3823. lwzu r29, 4(SP)
  3824. lwzu r30, 4(SP)
  3825. lwzu r31, 4(SP)
  /* Back SP up by 12, then reload f31 down to f14 (18 registers) with an
     update-form load that advances SP by r0 = 16 each time. */
  3826. subi SP, SP, 12
  3827. li r0, 16
  3828. lfpdux f31, SP, r0
  3829. lfpdux f30, SP, r0
  3830. lfpdux f29, SP, r0
  3831. lfpdux f28, SP, r0
  3832. lfpdux f27, SP, r0
  3833. lfpdux f26, SP, r0
  3834. lfpdux f25, SP, r0
  3835. lfpdux f24, SP, r0
  3836. lfpdux f23, SP, r0
  3837. lfpdux f22, SP, r0
  3838. lfpdux f21, SP, r0
  3839. lfpdux f20, SP, r0
  3840. lfpdux f19, SP, r0
  3841. lfpdux f18, SP, r0
  3842. lfpdux f17, SP, r0
  3843. lfpdux f16, SP, r0
  3844. lfpdux f15, SP, r0
  3845. lfpdux f14, SP, r0
  /* Final SP adjustment by 16, then return to the caller. */
  3846. addi SP, SP, 16
  3847. blr
  3848. .align 4
  3849. EPILOGUE
  3850. #endif