You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_hummer.S 122 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build switches, stack-slot offsets and integer register aliases used
   * by the kernel that follows.
   *
   * ALPHA and FZERO are byte offsets from SP as it stands once the
   * prologue has finished.  The prologue pushes two zero words and then
   * stores f1 with "stfdu f1, -8(SP)", so:
   *   ALPHA (0) is the slot read back later by "lfd AP, ALPHA(SP)";
   *   FZERO (8) is the slot holding the zero words, read into f0 by
   *             "li r0, FZERO / lfpsx f0, SP, r0" and then copied into
   *             the other accumulators with fpmr.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define ALPHA 0
  41. #define FZERO 8
  /*
   * Dimension arguments.  Each one is compared against zero on entry and
   * the kernel branches to .L999 when it is <= 0.  M feeds the I counter,
   * N feeds the J counter, and K (shifted / masked) is loaded into CTR
   * for the inner loops.
   */
  42. #define M r3
  43. #define N r4
  44. #define K r5
  /*
   * Pointer / stride arguments; this file only defines them for linux and
   * FreeBSD builds.  AO and BO are derived from A and B, CO1..CO4 from C
   * and LDC.  LDC is shifted left by BASE_SHIFT on entry
   * (NOTE(review): presumably element count -> byte stride; BASE_SHIFT
   * comes from common.h - confirm).  OFFSET seeds KK in TRMMKERNEL builds.
   */
  45. #if defined(linux) || defined(__FreeBSD__)
  46. #define A r6
  47. #define B r7
  48. #define C r8
  49. #define LDC r9
  50. #define OFFSET r10
  51. #endif
  /* TEMP: scratch register; KK: running offset, used only in the
     TRMMKERNEL paths of the code visible in this part of the file. */
  52. #define TEMP r11
  53. #define KK r14
  /*
   * Index-register constants for the update-form loads and stores.
   * The names do not match the values assigned in the high-performance
   * path: INCM3 = -2 * SIZE, INCM5 = -5 * SIZE, INCM7 = -6 * SIZE.
   * INCM1 is not assigned anywhere in the portion of the kernel visible
   * alongside these definitions.
   * INC = 1 * SIZE, INC2 = 2 * SIZE, INC4 = 4 * SIZE.
   */
  54. #define INCM1 r15
  55. #define INCM3 r16
  56. #define INCM5 r17
  57. #define INCM7 r18
  58. #define INC2 r19
  59. #define INC r20
  60. #define INC4 r21
  /* Loop counters: I is derived from M (M >> 3, then M & 4, M & 2, M & 1),
     J from N (N >> 2). */
  61. #define I r22
  62. #define J r23
  /*
   * Running pointers into the A and B panels.  They are pre-biased
   * (AO = A - 4 * SIZE, BO = B - 4 * SIZE) because every access is an
   * update-form load that adds INC4 first; AO2 / BO2 trail their partner
   * by 2 * SIZE.
   */
  63. #define AO r24
  64. #define BO r25
  65. #define AO2 r26
  66. #define BO2 r27
  /* Output pointers: CO1 = C and each following one is the previous
     plus LDC (set up at .L10). */
  67. #define CO1 r28
  68. #define CO2 r29
  69. #define CO3 r30
  70. #define CO4 r31
  71. #ifndef NEEDPARAM
  /*
   * Floating-point register aliases for operands.
   *   A1..A10 -> f16..f25
   *   B1..B6  -> f26..f31
   * The accumulators are not aliased; the loops refer to f0..f15 by
   * number.  In non-TRMMKERNEL builds several of these operand aliases
   * are also reused to hold values loaded through CO1..CO4 before the
   * final fxcpmadd with AP.
   */
  72. #define A1 f16
  73. #define A2 f17
  74. #define A3 f18
  75. #define A4 f19
  76. #define A5 f20
  77. #define A6 f21
  78. #define A7 f22
  79. #define A8 f23
  80. #define A9 f24
  81. #define A10 f25
  82. #define B1 f26
  83. #define B2 f27
  84. #define B3 f28
  85. #define B4 f29
  86. #define B5 f30
  87. #define B6 f31
  /*
   * AP shares f31 with B6.  It is loaded with "lfd AP, ALPHA(SP)" at
   * .L14 / .L24 / .L34, i.e. only after the unrolled loops that use B6
   * have finished; the remainder loops that follow those labels do not
   * reference B6.  Keep that ordering in mind before using B6 in any
   * code placed after the AP load.
   */
  88. #define AP B6
  89. PROLOGUE
  90. PROFCODE
  91. li r0, -16
  92. stfpdux f14, SP, r0
  93. stfpdux f15, SP, r0
  94. stfpdux f16, SP, r0
  95. stfpdux f17, SP, r0
  96. stfpdux f18, SP, r0
  97. stfpdux f19, SP, r0
  98. stfpdux f20, SP, r0
  99. stfpdux f21, SP, r0
  100. stfpdux f22, SP, r0
  101. stfpdux f23, SP, r0
  102. stfpdux f24, SP, r0
  103. stfpdux f25, SP, r0
  104. stfpdux f26, SP, r0
  105. stfpdux f27, SP, r0
  106. stfpdux f28, SP, r0
  107. stfpdux f29, SP, r0
  108. stfpdux f30, SP, r0
  109. stfpdux f31, SP, r0
  110. stwu r31, -4(SP)
  111. stwu r30, -4(SP)
  112. stwu r29, -4(SP)
  113. stwu r28, -4(SP)
  114. stwu r27, -4(SP)
  115. stwu r26, -4(SP)
  116. stwu r25, -4(SP)
  117. stwu r24, -4(SP)
  118. stwu r23, -4(SP)
  119. stwu r22, -4(SP)
  120. stwu r21, -4(SP)
  121. stwu r20, -4(SP)
  122. stwu r19, -4(SP)
  123. stwu r18, -4(SP)
  124. stwu r17, -4(SP)
  125. stwu r16, -4(SP)
  126. stwu r15, -4(SP)
  127. stwu r14, -4(SP) # dummy
  128. li r0, 0
  129. stwu r0, -4(SP)
  130. stwu r0, -4(SP)
  131. stfdu f1, -8(SP)
  132. slwi LDC, LDC, BASE_SHIFT
  133. cmpwi cr0, M, 0
  134. ble .L999
  135. cmpwi cr0, N, 0
  136. ble .L999
  137. cmpwi cr0, K, 0
  138. ble .L999
  139. li INC, 1 * SIZE
  140. li INC2, 2 * SIZE
  141. li INC4, 4 * SIZE
  142. #if defined(TRMMKERNEL) && !defined(LEFT)
  143. neg KK, OFFSET
  144. #endif
  145. andi. r0, C, 2 * SIZE - 1
  146. bne .L1000
  147. andi. r0, LDC, 2 * SIZE - 1
  148. bne .L1000
  149. /* High performance version */
  150. li INCM3, -2 * SIZE
  151. li INCM5, -5 * SIZE
  152. li INCM7, -6 * SIZE
  153. addi C, C, - 2 * SIZE
  154. srawi. J, N, 2
  155. ble .L50
  156. .align 4
  157. .L10:
  158. mr CO1, C
  159. add CO2, C, LDC
  160. add CO3, CO2, LDC
  161. add CO4, CO3, LDC
  162. add C, CO4, LDC
  163. #if defined(TRMMKERNEL) && defined(LEFT)
  164. mr KK, OFFSET
  165. #endif
  166. addi AO, A, -4 * SIZE
  167. li r0, FZERO
  168. lfpsx f0, SP, r0
  169. srawi. I, M, 3
  170. ble .L20
  171. .align 4
  172. .L11:
  173. #if defined(TRMMKERNEL)
  174. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  175. addi AO2, AO, 2 * SIZE
  176. fpmr f4, f0
  177. addi BO, B, - 4 * SIZE
  178. fpmr f8, f0
  179. addi BO2, B, - 2 * SIZE
  180. fpmr f12, f0
  181. #else
  182. slwi TEMP, KK, 3 + BASE_SHIFT
  183. slwi r0, KK, 2 + BASE_SHIFT
  184. add AO, AO, TEMP
  185. add BO, B, r0
  186. addi AO2, AO, 2 * SIZE
  187. fpmr f4, f0
  188. addi BO, BO, - 4 * SIZE
  189. fpmr f8, f0
  190. addi BO2, BO, 2 * SIZE
  191. fpmr f12, f0
  192. #endif
  193. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  194. sub TEMP, K, KK
  195. #elif defined(LEFT)
  196. addi TEMP, KK, 8
  197. #else
  198. addi TEMP, KK, 4
  199. #endif
  200. srawi. TEMP, TEMP, 2
  201. fpmr f1, f0
  202. mtspr CTR, TEMP
  203. ble .L14
  204. #else
  205. addi AO2, AO, 2 * SIZE
  206. fpmr f4, f0
  207. addi BO, B, - 4 * SIZE
  208. fpmr f8, f0
  209. addi BO2, B, - 2 * SIZE
  210. fpmr f12, f0
  211. srawi. r0, K, 2
  212. fpmr f1, f0
  213. mtspr CTR, r0
  214. ble .L14
  215. #endif
  216. LFPDUX A1, AO, INC4
  217. fpmr f5, f0
  218. LFPDUX A3, AO, INC4
  219. fpmr f9, f0
  220. LFPDUX B1, BO, INC4
  221. fpmr f13, f0
  222. LFPDUX A5, AO, INC4
  223. fpmr f2, f0
  224. LFPDUX A6, AO, INC4
  225. fpmr f6, f0
  226. LFPDUX B3, BO, INC4
  227. fpmr f10, f0
  228. LFPDUX A7, AO, INC4
  229. fpmr f14, f0
  230. LFPDUX A8, AO, INC4
  231. fpmr f3, f0
  232. LFPDUX B5, BO, INC4
  233. fpmr f7, f0
  234. LFPDUX A9, AO, INC4
  235. fpmr f11, f0
  236. LFPDUX A2, AO2, INC4
  237. fpmr f15, f0
  238. LFPDUX B2, BO2, INC4
  239. bdz- .L13
  240. .align 4
  241. .L12:
  242. ## 1 ##
  243. fxcpmadd f0, B1, A1, f0
  244. nop
  245. fxcsmadd f4, B1, A1, f4
  246. nop
  247. fxcpmadd f8, B2, A1, f8
  248. LFPDUX B4, BO2, INC4
  249. fxcsmadd f12, B2, A1, f12
  250. LFPDUX B6, BO, INC4
  251. fxcpmadd f1, B1, A2, f1
  252. nop
  253. fxcsmadd f5, B1, A2, f5
  254. LFPDUX A4, AO2, INC4
  255. fxcpmadd f9, B2, A2, f9
  256. LFPDUX A10, AO, INC4
  257. fxcsmadd f13, B2, A2, f13
  258. nop
  259. fxcpmadd f2, B1, A3, f2
  260. nop
  261. fxcsmadd f6, B1, A3, f6
  262. nop
  263. fxcpmadd f10, B2, A3, f10
  264. nop
  265. fxcsmadd f14, B2, A3, f14
  266. nop
  267. fxcpmadd f3, B1, A4, f3
  268. nop
  269. fxcsmadd f7, B1, A4, f7
  270. LFPDUX A2, AO2, INC4
  271. fxcpmadd f11, B2, A4, f11
  272. LFPDUX A1, AO, INC4
  273. fxcsmadd f15, B2, A4, f15
  274. nop
  275. ## 2 ##
  276. fxcpmadd f0, B3, A5, f0
  277. nop
  278. fxcsmadd f4, B3, A5, f4
  279. nop
  280. fxcpmadd f8, B4, A5, f8
  281. LFPDUX B2, BO2, INC4
  282. fxcsmadd f12, B4, A5, f12
  283. LFPDUX B1, BO, INC4
  284. fxcpmadd f1, B3, A2, f1
  285. nop
  286. fxcsmadd f5, B3, A2, f5
  287. LFPDUX A4, AO2, INC4
  288. fxcpmadd f9, B4, A2, f9
  289. LFPDUX A3, AO, INC4
  290. fxcsmadd f13, B4, A2, f13
  291. nop
  292. fxcpmadd f2, B3, A6, f2
  293. nop
  294. fxcsmadd f6, B3, A6, f6
  295. nop
  296. fxcpmadd f10, B4, A6, f10
  297. nop
  298. fxcsmadd f14, B4, A6, f14
  299. nop
  300. fxcpmadd f3, B3, A4, f3
  301. nop
  302. fxcsmadd f7, B3, A4, f7
  303. LFPDUX A2, AO2, INC4
  304. fxcpmadd f11, B4, A4, f11
  305. LFPDUX A5, AO, INC4
  306. fxcsmadd f15, B4, A4, f15
  307. nop
  308. ## 3 ##
  309. fxcpmadd f0, B5, A7, f0
  310. nop
  311. fxcsmadd f4, B5, A7, f4
  312. nop
  313. fxcpmadd f8, B2, A7, f8
  314. LFPDUX B4, BO2, INC4
  315. fxcsmadd f12, B2, A7, f12
  316. LFPDUX B3, BO, INC4
  317. fxcpmadd f1, B5, A2, f1
  318. nop
  319. fxcsmadd f5, B5, A2, f5
  320. LFPDUX A4, AO2, INC4
  321. fxcpmadd f9, B2, A2, f9
  322. LFPDUX A6, AO, INC4
  323. fxcsmadd f13, B2, A2, f13
  324. nop
  325. fxcpmadd f2, B5, A8, f2
  326. nop
  327. fxcsmadd f6, B5, A8, f6
  328. nop
  329. fxcpmadd f10, B2, A8, f10
  330. nop
  331. fxcsmadd f14, B2, A8, f14
  332. nop
  333. fxcpmadd f3, B5, A4, f3
  334. nop
  335. fxcsmadd f7, B5, A4, f7
  336. LFPDUX A2, AO2, INC4
  337. fxcpmadd f11, B2, A4, f11
  338. LFPDUX A7, AO, INC4
  339. fxcsmadd f15, B2, A4, f15
  340. nop
  341. ## 4 ##
  342. fxcpmadd f0, B6, A9, f0
  343. nop
  344. fxcsmadd f4, B6, A9, f4
  345. nop
  346. fxcpmadd f8, B4, A9, f8
  347. LFPDUX B2, BO2, INC4
  348. fxcsmadd f12, B4, A9, f12
  349. LFPDUX B5, BO, INC4
  350. fxcpmadd f1, B6, A2, f1
  351. nop
  352. fxcsmadd f5, B6, A2, f5
  353. LFPDUX A4, AO2, INC4
  354. fxcpmadd f9, B4, A2, f9
  355. LFPDUX A8, AO, INC4
  356. fxcsmadd f13, B4, A2, f13
  357. nop
  358. fxcpmadd f2, B6, A10, f2
  359. nop
  360. fxcsmadd f6, B6, A10, f6
  361. nop
  362. fxcpmadd f10, B4, A10, f10
  363. nop
  364. fxcsmadd f14, B4, A10, f14
  365. nop
  366. fxcpmadd f3, B6, A4, f3
  367. LFPDUX A2, AO2, INC4
  368. fxcsmadd f7, B6, A4, f7
  369. LFPDUX A9, AO, INC4
  370. fxcpmadd f11, B4, A4, f11
  371. nop
  372. fxcsmadd f15, B4, A4, f15
  373. bdnz+ .L12
  374. .align 4
  375. .L13:
  376. ## 1 ##
  377. fxcpmadd f0, B1, A1, f0
  378. nop
  379. fxcsmadd f4, B1, A1, f4
  380. nop
  381. fxcpmadd f8, B2, A1, f8
  382. LFPDUX B4, BO2, INC4
  383. fxcsmadd f12, B2, A1, f12
  384. LFPDUX B6, BO, INC4
  385. fxcpmadd f1, B1, A2, f1
  386. nop
  387. fxcsmadd f5, B1, A2, f5
  388. LFPDUX A4, AO2, INC4
  389. fxcpmadd f9, B2, A2, f9
  390. LFPDUX A10, AO, INC4
  391. fxcsmadd f13, B2, A2, f13
  392. nop
  393. fxcpmadd f2, B1, A3, f2
  394. nop
  395. fxcsmadd f6, B1, A3, f6
  396. nop
  397. fxcpmadd f10, B2, A3, f10
  398. nop
  399. fxcsmadd f14, B2, A3, f14
  400. nop
  401. fxcpmadd f3, B1, A4, f3
  402. nop
  403. fxcsmadd f7, B1, A4, f7
  404. LFPDUX A2, AO2, INC4
  405. fxcpmadd f11, B2, A4, f11
  406. #ifndef TRMMKERNEL
  407. LFPDUX A1, CO1, INC2
  408. #else
  409. nop
  410. #endif
  411. fxcsmadd f15, B2, A4, f15
  412. nop
  413. ## 2 ##
  414. fxcpmadd f0, B3, A5, f0
  415. nop
  416. fxcsmadd f4, B3, A5, f4
  417. nop
  418. fxcpmadd f8, B4, A5, f8
  419. LFPDUX B2, BO2, INC4
  420. fxcsmadd f12, B4, A5, f12
  421. #ifndef TRMMKERNEL
  422. LFPDUX B1, CO1, INC4
  423. #else
  424. nop
  425. #endif
  426. fxcpmadd f1, B3, A2, f1
  427. nop
  428. fxcsmadd f5, B3, A2, f5
  429. LFPDUX A4, AO2, INC4
  430. fxcpmadd f9, B4, A2, f9
  431. #ifndef TRMMKERNEL
  432. LFPDUX A3, CO2, INC2
  433. #else
  434. nop
  435. #endif
  436. fxcsmadd f13, B4, A2, f13
  437. nop
  438. fxcpmadd f2, B3, A6, f2
  439. nop
  440. fxcsmadd f6, B3, A6, f6
  441. nop
  442. fxcpmadd f10, B4, A6, f10
  443. nop
  444. fxcsmadd f14, B4, A6, f14
  445. nop
  446. fxcpmadd f3, B3, A4, f3
  447. nop
  448. fxcsmadd f7, B3, A4, f7
  449. LFPDUX A2, AO2, INC4
  450. fxcpmadd f11, B4, A4, f11
  451. #ifndef TRMMKERNEL
  452. LFPDUX A5, CO2, INC4
  453. #else
  454. nop
  455. #endif
  456. fxcsmadd f15, B4, A4, f15
  457. nop
  458. ## 3 ##
  459. fxcpmadd f0, B5, A7, f0
  460. nop
  461. fxcsmadd f4, B5, A7, f4
  462. nop
  463. fxcpmadd f8, B2, A7, f8
  464. LFPDUX B4, BO2, INC4
  465. fxcsmadd f12, B2, A7, f12
  466. #ifndef TRMMKERNEL
  467. LFPDUX B3, CO3, INC2
  468. #else
  469. nop
  470. #endif
  471. fxcpmadd f1, B5, A2, f1
  472. nop
  473. fxcsmadd f5, B5, A2, f5
  474. LFPDUX A4, AO2, INC4
  475. fxcpmadd f9, B2, A2, f9
  476. #ifndef TRMMKERNEL
  477. LFPDUX A6, CO3, INC4
  478. #else
  479. nop
  480. #endif
  481. fxcsmadd f13, B2, A2, f13
  482. nop
  483. fxcpmadd f2, B5, A8, f2
  484. nop
  485. fxcsmadd f6, B5, A8, f6
  486. nop
  487. fxcpmadd f10, B2, A8, f10
  488. nop
  489. fxcsmadd f14, B2, A8, f14
  490. nop
  491. fxcpmadd f3, B5, A4, f3
  492. nop
  493. fxcsmadd f7, B5, A4, f7
  494. LFPDUX A2, AO2, INC4
  495. fxcpmadd f11, B2, A4, f11
  496. #ifndef TRMMKERNEL
  497. LFPDUX A7, CO4, INC2
  498. #else
  499. nop
  500. #endif
  501. fxcsmadd f15, B2, A4, f15
  502. nop
  503. ## 4 ##
  504. fxcpmadd f0, B6, A9, f0
  505. nop
  506. fxcsmadd f4, B6, A9, f4
  507. nop
  508. fxcpmadd f8, B4, A9, f8
  509. nop
  510. fxcsmadd f12, B4, A9, f12
  511. #ifndef TRMMKERNEL
  512. LFPDUX B2, CO4, INC4
  513. #else
  514. nop
  515. #endif
  516. fxcpmadd f1, B6, A2, f1
  517. nop
  518. fxcsmadd f5, B6, A2, f5
  519. LFPDUX A4, AO2, INC4
  520. fxcpmadd f9, B4, A2, f9
  521. #ifndef TRMMKERNEL
  522. LFPDUX B5, CO1, INCM3
  523. #else
  524. nop
  525. #endif
  526. fxcsmadd f13, B4, A2, f13
  527. nop
  528. fxcpmadd f2, B6, A10, f2
  529. nop
  530. fxcsmadd f6, B6, A10, f6
  531. nop
  532. fxcpmadd f10, B4, A10, f10
  533. nop
  534. fxcsmadd f14, B4, A10, f14
  535. #ifndef TRMMKERNEL
  536. LFPDUX A8, CO1, INC4
  537. #else
  538. nop
  539. #endif
  540. fxcpmadd f3, B6, A4, f3
  541. nop
  542. fxcsmadd f7, B6, A4, f7
  543. nop
  544. fxcpmadd f11, B4, A4, f11
  545. nop
  546. fxcsmadd f15, B4, A4, f15
  547. #ifndef TRMMKERNEL
  548. LFPDUX A9, CO2, INCM3
  549. #else
  550. nop
  551. #endif
  552. .align 4
  553. .L14:
  554. lfd AP, ALPHA(SP)
  555. #ifdef TRMMKERNEL
  556. fsmfp AP, AP
  557. #endif
  558. #if defined(TRMMKERNEL)
  559. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  560. sub TEMP, K, KK
  561. #elif defined(LEFT)
  562. addi TEMP, KK, 8
  563. #else
  564. addi TEMP, KK, 4
  565. #endif
  566. andi. r0, TEMP, 3
  567. mtspr CTR, r0
  568. ble+ .L18
  569. cmpwi cr0, TEMP, 3
  570. bgt+ .L15
  571. #else
  572. andi. r0, K, 3
  573. mtspr CTR, r0
  574. ble+ .L18
  575. cmpwi cr0, K, 3
  576. bgt+ .L15
  577. #endif
  578. #ifndef TRMMKERNEL
  579. LFPDUX A1, CO1, INC2
  580. fpmr f5, f0
  581. LFPDUX B1, CO1, INC4
  582. fpmr f9, f0
  583. LFPDUX A3, CO2, INC2
  584. fpmr f13, f0
  585. LFPDUX A5, CO2, INC4
  586. fpmr f2, f0
  587. LFPDUX B3, CO3, INC2
  588. fpmr f6, f0
  589. LFPDUX A6, CO3, INC4
  590. fpmr f10, f0
  591. LFPDUX A7, CO4, INC2
  592. fpmr f14, f0
  593. LFPDUX B2, CO4, INC4
  594. fpmr f3, f0
  595. LFPDUX B5, CO1, INCM3
  596. fpmr f7, f0
  597. LFPDUX A8, CO1, INC4
  598. fpmr f11, f0
  599. LFPDUX A9, CO2, INCM3
  600. fpmr f15, f0
  601. #else
  602. fpmr f5, f0
  603. fpmr f9, f0
  604. fpmr f13, f0
  605. fpmr f2, f0
  606. fpmr f6, f0
  607. fpmr f10, f0
  608. fpmr f14, f0
  609. fpmr f3, f0
  610. fpmr f7, f0
  611. fpmr f11, f0
  612. fpmr f15, f0
  613. nop
  614. #endif
  615. .align 4
  616. .L15:
  617. LFPDUX A2, AO, INC4
  618. LFPDUX A4, AO2, INC4
  619. LFPDUX A10, BO, INC4
  620. LFPDUX B4, BO2, INC4
  621. bdz- .L17
  622. .align 4
  623. .L16:
  624. fxcpmadd f0, A10, A2, f0
  625. fxcsmadd f4, A10, A2, f4
  626. fxcpmadd f8, B4, A2, f8
  627. fxcsmadd f12, B4, A2, f12
  628. LFPDUX A2, AO, INC4
  629. fxcpmadd f1, A10, A4, f1
  630. fxcsmadd f5, A10, A4, f5
  631. fxcpmadd f9, B4, A4, f9
  632. fxcsmadd f13, B4, A4, f13
  633. LFPDUX A4, AO2, INC4
  634. fxcpmadd f2, A10, A2, f2
  635. fxcsmadd f6, A10, A2, f6
  636. fxcpmadd f10, B4, A2, f10
  637. fxcsmadd f14, B4, A2, f14
  638. LFPDUX A2, AO, INC4
  639. fxcpmadd f3, A10, A4, f3
  640. fxcsmadd f7, A10, A4, f7
  641. LFPDUX A10, BO, INC4
  642. fxcpmadd f11, B4, A4, f11
  643. fxcsmadd f15, B4, A4, f15
  644. LFPDUX A4, AO2, INC4
  645. LFPDUX B4, BO2, INC4
  646. bdnz+ .L16
  647. .align 4
  648. .L17:
  649. fxcpmadd f0, A10, A2, f0
  650. fxcsmadd f4, A10, A2, f4
  651. fxcpmadd f8, B4, A2, f8
  652. fxcsmadd f12, B4, A2, f12
  653. LFPDUX A2, AO, INC4
  654. fxcpmadd f1, A10, A4, f1
  655. fxcsmadd f5, A10, A4, f5
  656. fxcpmadd f9, B4, A4, f9
  657. fxcsmadd f13, B4, A4, f13
  658. LFPDUX A4, AO2, INC4
  659. fxcpmadd f2, A10, A2, f2
  660. fxcsmadd f6, A10, A2, f6
  661. fxcpmadd f10, B4, A2, f10
  662. fxcsmadd f14, B4, A2, f14
  663. fxcpmadd f3, A10, A4, f3
  664. fxcsmadd f7, A10, A4, f7
  665. fxcpmadd f11, B4, A4, f11
  666. fxcsmadd f15, B4, A4, f15
  667. .align 4
  668. .L18:
  669. #ifndef TRMMKERNEL
  670. fxcpmadd f0, AP, f0, A1
  671. LFPDUX B4, CO2, INC4
  672. fxcpmadd f1, AP, f1, B5
  673. LFPDUX A2, CO3, INCM3
  674. fxcpmadd f2, AP, f2, B1
  675. LFPDUX A4, CO3, INC4
  676. fxcpmadd f3, AP, f3, A8
  677. LFPDUX A10, CO4, INCM3
  678. fxcpmadd f4, AP, f4, A3
  679. LFPDUX A1, CO4, INC4
  680. fxcpmadd f5, AP, f5, A9
  681. STFPDUX f0, CO1, INCM7
  682. fxcpmadd f6, AP, f6, A5
  683. STFPDUX f1, CO1, INC2
  684. fxcpmadd f7, AP, f7, B4
  685. STFPDUX f2, CO1, INC2
  686. fxcpmadd f8, AP, f8, B3
  687. STFPDUX f3, CO1, INC2
  688. fxcpmadd f9, AP, f9, A2
  689. STFPDUX f4, CO2, INCM7
  690. fxcpmadd f10, AP, f10, A6
  691. STFPDUX f5, CO2, INC2
  692. fxcpmadd f11, AP, f11, A4
  693. STFPDUX f6, CO2, INC2
  694. fxcpmadd f12, AP, f12, A7
  695. STFPDUX f7, CO2, INC2
  696. fxcpmadd f13, AP, f13, A10
  697. STFPDUX f8, CO3, INCM7
  698. fxcpmadd f14, AP, f14, B2
  699. STFPDUX f9, CO3, INC2
  700. fxcpmadd f15, AP, f15, A1
  701. STFPDUX f10, CO3, INC2
  702. STFPDUX f11, CO3, INC2
  703. STFPDUX f12, CO4, INCM7
  704. STFPDUX f13, CO4, INC2
  705. STFPDUX f14, CO4, INC2
  706. STFPDUX f15, CO4, INC2
  707. #else
  708. fpmul f0, AP, f0
  709. fpmul f1, AP, f1
  710. fpmul f2, AP, f2
  711. fpmul f3, AP, f3
  712. fpmul f4, AP, f4
  713. fpmul f5, AP, f5
  714. STFPDUX f0, CO1, INC2
  715. fpmul f6, AP, f6
  716. STFPDUX f1, CO1, INC2
  717. fpmul f7, AP, f7
  718. STFPDUX f2, CO1, INC2
  719. fpmul f8, AP, f8
  720. STFPDUX f3, CO1, INC2
  721. fpmul f9, AP, f9
  722. STFPDUX f4, CO2, INC2
  723. fpmul f10, AP, f10
  724. STFPDUX f5, CO2, INC2
  725. fpmul f11, AP, f11
  726. STFPDUX f6, CO2, INC2
  727. fpmul f12, AP, f12
  728. STFPDUX f7, CO2, INC2
  729. fpmul f13, AP, f13
  730. STFPDUX f8, CO3, INC2
  731. fpmul f14, AP, f14
  732. STFPDUX f9, CO3, INC2
  733. fpmul f15, AP, f15
  734. STFPDUX f10, CO3, INC2
  735. STFPDUX f11, CO3, INC2
  736. STFPDUX f12, CO4, INC2
  737. STFPDUX f13, CO4, INC2
  738. STFPDUX f14, CO4, INC2
  739. STFPDUX f15, CO4, INC2
  740. #endif
  741. #ifdef TRMMKERNEL
  742. #if ( defined(LEFT) && defined(TRANSA)) || \
  743. (!defined(LEFT) && !defined(TRANSA))
  744. sub TEMP, K, KK
  745. #ifdef LEFT
  746. addi TEMP, TEMP, -8
  747. #else
  748. addi TEMP, TEMP, -4
  749. #endif
  750. slwi r0, TEMP, 3 + BASE_SHIFT
  751. slwi TEMP, TEMP, 2 + BASE_SHIFT
  752. add AO, AO, r0
  753. add BO, BO, TEMP
  754. #endif
  755. #ifdef LEFT
  756. addi KK, KK, 8
  757. #endif
  758. #endif
  759. addic. I, I, -1
  760. li r0, FZERO
  761. lfpsx f0, SP, r0
  762. bgt+ .L11
  763. .align 4
  764. .L20:
  765. andi. I, M, 4
  766. beq .L30
  767. #if defined(TRMMKERNEL)
  768. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  769. addi AO2, AO, 2 * SIZE
  770. fpmr f4, f0
  771. addi BO, B, - 4 * SIZE
  772. fpmr f8, f0
  773. addi BO2, B, - 2 * SIZE
  774. fpmr f12, f0
  775. #else
  776. slwi TEMP, KK, 2 + BASE_SHIFT
  777. slwi r0, KK, 2 + BASE_SHIFT
  778. add AO, AO, TEMP
  779. add BO, B, r0
  780. addi AO2, AO, 2 * SIZE
  781. fpmr f4, f0
  782. addi BO, BO, - 4 * SIZE
  783. fpmr f8, f0
  784. addi BO2, BO, 2 * SIZE
  785. fpmr f12, f0
  786. #endif
  787. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  788. sub TEMP, K, KK
  789. #elif defined(LEFT)
  790. addi TEMP, KK, 4
  791. #else
  792. addi TEMP, KK, 4
  793. #endif
  794. srawi. TEMP, TEMP, 2
  795. fpmr f1, f0
  796. fpmr f5, f0
  797. fpmr f9, f0
  798. mtspr CTR, TEMP
  799. fpmr f13, f0
  800. ble .L24
  801. #else
  802. addi AO2, AO, 2 * SIZE
  803. fpmr f4, f0
  804. addi BO, B, - 4 * SIZE
  805. fpmr f8, f0
  806. addi BO2, B, - 2 * SIZE
  807. fpmr f12, f0
  808. srawi. r0, K, 2
  809. fpmr f1, f0
  810. fpmr f5, f0
  811. fpmr f9, f0
  812. mtspr CTR, r0
  813. fpmr f13, f0
  814. ble .L24
  815. #endif
  816. LFPDUX A1, AO, INC4
  817. LFPDUX B1, BO, INC4
  818. LFPDUX A2, AO2, INC4
  819. LFPDUX B2, BO2, INC4
  820. LFPDUX A3, AO, INC4
  821. LFPDUX B3, BO, INC4
  822. LFPDUX A4, AO2, INC4
  823. LFPDUX B4, BO2, INC4
  824. LFPDUX A5, AO, INC4
  825. LFPDUX B5, BO, INC4
  826. LFPDUX A6, AO2, INC4
  827. LFPDUX B6, BO2, INC4
  828. LFPDUX A7, AO, INC4
  829. LFPDUX A9, BO, INC4
  830. LFPDUX A10, BO2, INC4
  831. bdz- .L23
  832. .align 4
  833. .L22:
  834. fxcpmadd f0, B1, A1, f0
  835. nop
  836. fxcsmadd f4, B1, A1, f4
  837. LFPDUX A8, AO2, INC4
  838. fxcpmadd f8, B2, A1, f8
  839. nop
  840. fxcsmadd f12, B2, A1, f12
  841. LFPDUX A1, AO, INC4
  842. fxcpmadd f1, B1, A2, f1
  843. nop
  844. fxcsmadd f5, B1, A2, f5
  845. LFPDUX B1, BO, INC4
  846. fxcpmadd f9, B2, A2, f9
  847. nop
  848. fxcsmadd f13, B2, A2, f13
  849. LFPDUX B2, BO2, INC4
  850. fxcpmadd f0, B3, A3, f0
  851. nop
  852. fxcsmadd f4, B3, A3, f4
  853. LFPDUX A2, AO2, INC4
  854. fxcpmadd f8, B4, A3, f8
  855. nop
  856. fxcsmadd f12, B4, A3, f12
  857. LFPDUX A3, AO, INC4
  858. fxcpmadd f1, B3, A4, f1
  859. nop
  860. fxcsmadd f5, B3, A4, f5
  861. LFPDUX B3, BO, INC4
  862. fxcpmadd f9, B4, A4, f9
  863. nop
  864. fxcsmadd f13, B4, A4, f13
  865. LFPDUX B4, BO2, INC4
  866. fxcpmadd f0, B5, A5, f0
  867. nop
  868. fxcsmadd f4, B5, A5, f4
  869. LFPDUX A4, AO2, INC4
  870. fxcpmadd f8, B6, A5, f8
  871. nop
  872. fxcsmadd f12, B6, A5, f12
  873. LFPDUX A5, AO, INC4
  874. fxcpmadd f1, B5, A6, f1
  875. nop
  876. fxcsmadd f5, B5, A6, f5
  877. LFPDUX B5, BO, INC4
  878. fxcpmadd f9, B6, A6, f9
  879. nop
  880. fxcsmadd f13, B6, A6, f13
  881. LFPDUX B6, BO2, INC4
  882. fxcpmadd f0, A9, A7, f0
  883. nop
  884. fxcsmadd f4, A9, A7, f4
  885. LFPDUX A6, AO2, INC4
  886. fxcpmadd f8, A10, A7, f8
  887. nop
  888. fxcsmadd f12, A10, A7, f12
  889. LFPDUX A7, AO, INC4
  890. fxcpmadd f1, A9, A8, f1
  891. nop
  892. fxcsmadd f5, A9, A8, f5
  893. LFPDUX A9, BO, INC4
  894. fxcpmadd f9, A10, A8, f9
  895. nop
  896. fxcsmadd f13, A10, A8, f13
  897. LFPDUX A10, BO2, INC4
  898. bdnz+ .L22
  899. .align 4
  900. .L23:
  901. fxcpmadd f0, B1, A1, f0
  902. fxcsmadd f4, B1, A1, f4
  903. LFPDUX A8, AO2, INC4
  904. fxcpmadd f8, B2, A1, f8
  905. fxcsmadd f12, B2, A1, f12
  906. fxcpmadd f1, B1, A2, f1
  907. fxcsmadd f5, B1, A2, f5
  908. fxcpmadd f9, B2, A2, f9
  909. fxcsmadd f13, B2, A2, f13
  910. fxcpmadd f0, B3, A3, f0
  911. fxcsmadd f4, B3, A3, f4
  912. fxcpmadd f8, B4, A3, f8
  913. fxcsmadd f12, B4, A3, f12
  914. fxcpmadd f1, B3, A4, f1
  915. fxcsmadd f5, B3, A4, f5
  916. fxcpmadd f9, B4, A4, f9
  917. fxcsmadd f13, B4, A4, f13
  918. fxcpmadd f0, B5, A5, f0
  919. fxcsmadd f4, B5, A5, f4
  920. fxcpmadd f8, B6, A5, f8
  921. fxcsmadd f12, B6, A5, f12
  922. fxcpmadd f1, B5, A6, f1
  923. fxcsmadd f5, B5, A6, f5
  924. fxcpmadd f9, B6, A6, f9
  925. fxcsmadd f13, B6, A6, f13
  926. fxcpmadd f0, A9, A7, f0
  927. fxcsmadd f4, A9, A7, f4
  928. fxcpmadd f8, A10, A7, f8
  929. fxcsmadd f12, A10, A7, f12
  930. fxcpmadd f1, A9, A8, f1
  931. fxcsmadd f5, A9, A8, f5
  932. fxcpmadd f9, A10, A8, f9
  933. fxcsmadd f13, A10, A8, f13
  934. .align 4
  935. .L24:
  936. lfd AP, ALPHA(SP)
  937. #ifdef TRMMKERNEL
  938. fsmfp AP, AP
  939. #endif
  940. #if defined(TRMMKERNEL)
  941. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  942. sub TEMP, K, KK
  943. #elif defined(LEFT)
  944. addi TEMP, KK, 4
  945. #else
  946. addi TEMP, KK, 4
  947. #endif
  948. andi. TEMP, TEMP, 3
  949. mtspr CTR, TEMP
  950. #else
  951. andi. r0, K, 3
  952. mtspr CTR, r0
  953. #endif
  954. ble+ .L28
  955. LFPDUX A1, AO, INC4
  956. LFPDUX A2, AO2, INC4
  957. LFPDUX B1, BO, INC4
  958. LFPDUX B2, BO2, INC4
  959. bdz- .L27
  960. .align 4
  961. .L26:
  962. fxcpmadd f0, B1, A1, f0
  963. fxcsmadd f4, B1, A1, f4
  964. fxcpmadd f8, B2, A1, f8
  965. fxcsmadd f12, B2, A1, f12
  966. LFPDUX A1, AO, INC4
  967. fxcpmadd f1, B1, A2, f1
  968. fxcsmadd f5, B1, A2, f5
  969. LFPDUX B1, BO, INC4
  970. fxcpmadd f9, B2, A2, f9
  971. fxcsmadd f13, B2, A2, f13
  972. LFPDUX A2, AO2, INC4
  973. LFPDUX B2, BO2, INC4
  974. bdnz+ .L26
  975. .align 4
  976. .L27:
  977. fxcpmadd f0, B1, A1, f0
  978. fxcsmadd f4, B1, A1, f4
  979. fxcpmadd f8, B2, A1, f8
  980. fxcsmadd f12, B2, A1, f12
  981. fxcpmadd f1, B1, A2, f1
  982. fxcsmadd f5, B1, A2, f5
  983. fxcpmadd f9, B2, A2, f9
  984. fxcsmadd f13, B2, A2, f13
  985. .align 4
  986. .L28:
  987. #ifndef TRMMKERNEL
  988. LFPDUX A1, CO1, INC2
  989. LFPDUX B1, CO1, INC2
  990. LFPDUX B3, CO2, INC2
  991. LFPDUX A6, CO2, INC2
  992. LFPDUX B5, CO3, INC2
  993. LFPDUX A8, CO3, INC2
  994. LFPDUX A2, CO4, INC2
  995. LFPDUX A4, CO4, INC2
  996. fxcpmadd f0, AP, f0, A1
  997. fxcpmadd f1, AP, f1, B1
  998. fxcpmadd f4, AP, f4, B3
  999. fxcpmadd f5, AP, f5, A6
  1000. fxcpmadd f8, AP, f8, B5
  1001. fxcpmadd f9, AP, f9, A8
  1002. STFPDUX f0, CO1, INCM3
  1003. fxcpmadd f12, AP, f12, A2
  1004. STFPDUX f1, CO1, INC2
  1005. fxcpmadd f13, AP, f13, A4
  1006. STFPDUX f4, CO2, INCM3
  1007. STFPDUX f5, CO2, INC2
  1008. STFPDUX f8, CO3, INCM3
  1009. STFPDUX f9, CO3, INC2
  1010. STFPDUX f12, CO4, INCM3
  1011. STFPDUX f13, CO4, INC2
  1012. #else
  1013. fpmul f0, AP, f0
  1014. fpmul f1, AP, f1
  1015. fpmul f4, AP, f4
  1016. fpmul f5, AP, f5
  1017. fpmul f8, AP, f8
  1018. fpmul f9, AP, f9
  1019. STFPDUX f0, CO1, INC2
  1020. fpmul f12, AP, f12
  1021. STFPDUX f1, CO1, INC2
  1022. fpmul f13, AP, f13
  1023. STFPDUX f4, CO2, INC2
  1024. STFPDUX f5, CO2, INC2
  1025. STFPDUX f8, CO3, INC2
  1026. STFPDUX f9, CO3, INC2
  1027. STFPDUX f12, CO4, INC2
  1028. STFPDUX f13, CO4, INC2
  1029. #endif
  1030. #ifdef TRMMKERNEL
  1031. #if ( defined(LEFT) && defined(TRANSA)) || \
  1032. (!defined(LEFT) && !defined(TRANSA))
  1033. sub TEMP, K, KK
  1034. #ifdef LEFT
  1035. addi TEMP, TEMP, -4
  1036. #else
  1037. addi TEMP, TEMP, -4
  1038. #endif
  1039. slwi r0, TEMP, 2 + BASE_SHIFT
  1040. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1041. add AO, AO, r0
  1042. add BO, BO, TEMP
  1043. #endif
  1044. #ifdef LEFT
  1045. addi KK, KK, 4
  1046. #endif
  1047. #endif
  1048. li r0, FZERO
  1049. lfpsx f0, SP, r0
  1050. .align 4
  1051. .L30:
  1052. andi. I, M, 2
  1053. beq .L40
  1054. #if defined(TRMMKERNEL)
  1055. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1056. addi AO2, AO, 2 * SIZE
  1057. fpmr f1, f0
  1058. addi BO, B, - 4 * SIZE
  1059. fpmr f2, f0
  1060. addi BO2, B, - 2 * SIZE
  1061. fpmr f3, f0
  1062. #else
  1063. slwi TEMP, KK, 1 + BASE_SHIFT
  1064. slwi r0, KK, 2 + BASE_SHIFT
  1065. add AO, AO, TEMP
  1066. add BO, B, r0
  1067. addi AO2, AO, 2 * SIZE
  1068. fpmr f1, f0
  1069. addi BO, BO, - 4 * SIZE
  1070. fpmr f2, f0
  1071. addi BO2, BO, 2 * SIZE
  1072. fpmr f3, f0
  1073. #endif
  1074. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1075. sub TEMP, K, KK
  1076. #elif defined(LEFT)
  1077. addi TEMP, KK, 2
  1078. #else
  1079. addi TEMP, KK, 4
  1080. #endif
  1081. srawi. r0, TEMP, 2
  1082. mtspr CTR, r0
  1083. ble .L34
  1084. #else
  1085. addi AO2, AO, 2 * SIZE
  1086. fpmr f1, f0
  1087. addi BO, B, - 4 * SIZE
  1088. fpmr f2, f0
  1089. addi BO2, B, - 2 * SIZE
  1090. fpmr f3, f0
  1091. srawi. r0, K, 2
  1092. mtspr CTR, r0
  1093. ble .L34
  1094. #endif
  1095. LFPDUX A1, AO, INC4
  1096. LFPDUX B1, BO, INC4
  1097. LFPDUX B2, BO2, INC4
  1098. LFPDUX A2, AO2, INC4
  1099. LFPDUX B3, BO, INC4
  1100. LFPDUX B4, BO2, INC4
  1101. LFPDUX A3, AO, INC4
  1102. LFPDUX A5, BO, INC4
  1103. LFPDUX A6, BO2, INC4
  1104. LFPDUX A4, AO2, INC4
  1105. LFPDUX A7, BO, INC4
  1106. LFPDUX A8, BO2, INC4
  1107. bdz- .L33
  1108. .align 4
  1109. .L32:
  1110. fxcpmadd f0, B1, A1, f0
  1111. fxcsmadd f1, B1, A1, f1
  1112. LFPDUX B1, BO, INC4
  1113. fxcpmadd f2, B2, A1, f2
  1114. fxcsmadd f3, B2, A1, f3
  1115. LFPDUX B2, BO2, INC4
  1116. LFPDUX A1, AO, INC4
  1117. fxcpmadd f0, B3, A2, f0
  1118. fxcsmadd f1, B3, A2, f1
  1119. LFPDUX B3, BO, INC4
  1120. fxcpmadd f2, B4, A2, f2
  1121. fxcsmadd f3, B4, A2, f3
  1122. LFPDUX B4, BO2, INC4
  1123. LFPDUX A2, AO2, INC4
  1124. fxcpmadd f0, A5, A3, f0
  1125. fxcsmadd f1, A5, A3, f1
  1126. LFPDUX A5, BO, INC4
  1127. fxcpmadd f2, A6, A3, f2
  1128. fxcsmadd f3, A6, A3, f3
  1129. LFPDUX A6, BO2, INC4
  1130. LFPDUX A3, AO, INC4
  1131. fxcpmadd f0, A7, A4, f0
  1132. fxcsmadd f1, A7, A4, f1
  1133. LFPDUX A7, BO, INC4
  1134. fxcpmadd f2, A8, A4, f2
  1135. fxcsmadd f3, A8, A4, f3
  1136. LFPDUX A8, BO2, INC4
  1137. LFPDUX A4, AO2, INC4
  1138. bdnz+ .L32
  1139. .align 4
  1140. .L33:
  1141. fxcpmadd f0, B1, A1, f0
  1142. fxcsmadd f1, B1, A1, f1
  1143. fxcpmadd f2, B2, A1, f2
  1144. fxcsmadd f3, B2, A1, f3
  1145. fxcpmadd f0, B3, A2, f0
  1146. fxcsmadd f1, B3, A2, f1
  1147. fxcpmadd f2, B4, A2, f2
  1148. fxcsmadd f3, B4, A2, f3
  1149. fxcpmadd f0, A5, A3, f0
  1150. fxcsmadd f1, A5, A3, f1
  1151. fxcpmadd f2, A6, A3, f2
  1152. fxcsmadd f3, A6, A3, f3
  1153. fxcpmadd f0, A7, A4, f0
  1154. fxcsmadd f1, A7, A4, f1
  1155. fxcpmadd f2, A8, A4, f2
  1156. fxcsmadd f3, A8, A4, f3
  1157. .align 4
  1158. .L34:
  1159. lfd AP, ALPHA(SP)
  1160. #ifdef TRMMKERNEL
  1161. fsmfp AP, AP
  1162. #endif
  1163. #if defined(TRMMKERNEL)
  1164. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1165. sub TEMP, K, KK
  1166. #elif defined(LEFT)
  1167. addi TEMP, KK, 2
  1168. #else
  1169. addi TEMP, KK, 4
  1170. #endif
  1171. andi. TEMP, TEMP, 3
  1172. mtspr CTR, TEMP
  1173. #else
  1174. andi. r0, K, 3
  1175. mtspr CTR, r0
  1176. #endif
  1177. ble+ .L38
  1178. LFPDX A1, AO, INC4
  1179. LFPDUX B1, BO, INC4
  1180. LFPDUX B2, BO2, INC4
  1181. add AO, AO, INC2
  1182. bdz- .L37
  1183. .align 4
  1184. .L36:
  1185. fxcpmadd f0, B1, A1, f0
  1186. fxcsmadd f1, B1, A1, f1
  1187. LFPDUX B1, BO, INC4
  1188. fxcpmadd f2, B2, A1, f2
  1189. fxcsmadd f3, B2, A1, f3
  1190. LFPDX A1, AO, INC4
  1191. LFPDUX B2, BO2, INC4
  1192. add AO, AO, INC2
  1193. bdnz+ .L36
  1194. .align 4
  1195. .L37:
  1196. fxcpmadd f0, B1, A1, f0
  1197. fxcsmadd f1, B1, A1, f1
  1198. fxcpmadd f2, B2, A1, f2
  1199. fxcsmadd f3, B2, A1, f3
  1200. .align 4
  1201. .L38: /* write-back of the 2-row x 4-column tile: f0..f3 go to CO1..CO4 as one pair each */
  1202. #ifndef TRMMKERNEL
  1203. LFPDX A1, CO1, INC2 /* GEMM: read the current C pair at CO1+INC2, pointer not moved */
  1204. LFPDX A2, CO2, INC2
  1205. LFPDX A3, CO3, INC2
  1206. LFPDX A4, CO4, INC2
  1207. fxcpmadd f0, AP, f0, A1 /* f0 = AP.primary * f0 + existing C */
  1208. fxcpmadd f1, AP, f1, A2
  1209. fxcpmadd f2, AP, f2, A3
  1210. fxcpmadd f3, AP, f3, A4
  1211. #else
  1212. fpmul f0, AP, f0 /* TRMM: scale only, C is not read */
  1213. fpmul f1, AP, f1
  1214. fpmul f2, AP, f2
  1215. fpmul f3, AP, f3
  1216. #endif
  1217. STFPDUX f0, CO1, INC2 /* store the pair and advance CO1 by INC2 */
  1218. STFPDUX f1, CO2, INC2
  1219. STFPDUX f2, CO3, INC2
  1220. STFPDUX f3, CO4, INC2
  1221. #ifdef TRMMKERNEL
  1222. #if ( defined(LEFT) && defined(TRANSA)) || \
  1223. (!defined(LEFT) && !defined(TRANSA))
  1224. sub TEMP, K, KK /* TEMP = K - KK */
  1225. #ifdef LEFT
  1226. addi TEMP, TEMP, -2 /* LEFT: TEMP = K - KK - 2 */
  1227. #else
  1228. addi TEMP, TEMP, -4 /* otherwise: TEMP = K - KK - 4 */
  1229. #endif
  1230. slwi r0, TEMP, 1 + BASE_SHIFT /* r0 = TEMP << (1 + BASE_SHIFT): offset applied to AO */
  1231. slwi TEMP, TEMP, 2 + BASE_SHIFT /* TEMP <<= (2 + BASE_SHIFT): offset applied to BO */
  1232. add AO, AO, r0 /* step AO over the part of the A panel this tile did not read (presumably; confirm against packing layout) */
  1233. add BO, BO, TEMP /* same for the B panel */
  1234. #endif
  1235. #ifdef LEFT
  1236. addi KK, KK, 2 /* LEFT: KK advances by 2 after this tile */
  1237. #endif
  1238. #endif
  1239. li r0, FZERO /* r0 = offset of the FZERO stack slot */
  1240. lfpsx f0, SP, r0 /* reload f0 from SP+FZERO (presumably zero, going by the name); the next tile copies it into its accumulators */
  1241. .align 4
  1242. .L40: /* 1-row x 4-column tile: runs only when bit 0 of M is set; sets up pointers, clears accumulators, preloads operands */
  1243. andi. I, M, 1
  1244. beq .L49 /* bit clear: no single-row tile in this panel */
  1245. #if defined(TRMMKERNEL)
  1246. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1247. addi AO2, AO, 2 * SIZE /* second A stream starts 2 elements after AO */
  1248. fpmr f1, f0 /* f1..f3 are cleared by copying f0, which the preceding code reloaded from the FZERO slot */
  1249. addi BO, B, - 4 * SIZE /* BO/BO2 are biased so the first update-form loads (+INC4) land on the first two B pairs (confirm INC4 = 4 * SIZE) */
  1250. fpmr f2, f0
  1251. addi BO2, B, - 2 * SIZE
  1252. fpmr f3, f0
  1253. #else
  1254. slwi TEMP, KK, 0 + BASE_SHIFT /* TEMP = KK << BASE_SHIFT: offset for AO */
  1255. slwi r0, KK, 2 + BASE_SHIFT /* r0 = KK << (2 + BASE_SHIFT): offset for B */
  1256. add AO, AO, TEMP
  1257. add BO, B, r0
  1258. addi AO2, AO, 2 * SIZE
  1259. fpmr f1, f0
  1260. addi BO, BO, - 4 * SIZE /* same bias as the other branch, applied after the KK offset */
  1261. fpmr f2, f0
  1262. addi BO2, BO, 2 * SIZE /* BO2 = BO + 2 * SIZE, i.e. B + r0 - 2 * SIZE */
  1263. fpmr f3, f0
  1264. #endif
  1265. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1266. sub TEMP, K, KK /* TEMP = K - KK */
  1267. #elif defined(LEFT)
  1268. addi TEMP, KK, 1 /* TEMP = KK + 1 */
  1269. #else
  1270. addi TEMP, KK, 4 /* TEMP = KK + 4 */
  1271. #endif
  1272. srawi. r0, TEMP, 3 /* unrolled-loop trip count = TEMP / 8 */
  1273. mtspr CTR, r0
  1274. ble .L44 /* fewer than 8 steps: remainder code only */
  1275. #else
  1276. addi AO2, AO, 2 * SIZE /* GEMM path: same pointer/accumulator setup without the KK offsets */
  1277. fpmr f1, f0
  1278. addi BO, B, - 4 * SIZE
  1279. fpmr f2, f0
  1280. addi BO2, B, - 2 * SIZE
  1281. fpmr f3, f0
  1282. srawi. r0, K, 3 /* unrolled-loop trip count = K / 8 */
  1283. mtspr CTR, r0
  1284. ble .L44
  1285. #endif
  1286. LFPDUX A1, AO, INC4 /* preload: A1..A4 alternate between the AO and AO2 streams */
  1287. LFPDUX B1, BO, INC4 /* B1..B4 and A5..A8 are all loaded from the BO/BO2 streams (A5..A8 serve as extra B registers here) */
  1288. LFPDUX B2, BO2, INC4
  1289. LFPDUX A2, AO2, INC4
  1290. LFPDUX B3, BO, INC4
  1291. LFPDUX B4, BO2, INC4
  1292. LFPDUX A3, AO, INC4
  1293. LFPDUX A5, BO, INC4
  1294. LFPDUX A6, BO2, INC4
  1295. LFPDUX A4, AO2, INC4
  1296. LFPDUX A7, BO, INC4
  1297. LFPDUX A8, BO2, INC4
  1298. bdz- .L43 /* exactly one unrolled trip: go to the drain block */
  1299. .align 4
  1300. .L42:
  1301. fxcpmadd f0, A1, B1, f0
  1302. LFPDUX B1, BO, INC4
  1303. fxcpmadd f1, A1, B2, f1
  1304. LFPDUX B2, BO2, INC4
  1305. fxcsmadd f2, A1, B3, f2
  1306. LFPDUX B3, BO, INC4
  1307. fxcsmadd f3, A1, B4, f3
  1308. LFPDUX B4, BO2, INC4
  1309. LFPDUX A1, AO, INC4
  1310. fxcpmadd f0, A2, A5, f0
  1311. LFPDUX A5, BO, INC4
  1312. fxcpmadd f1, A2, A6, f1
  1313. LFPDUX A6, BO2, INC4
  1314. fxcsmadd f2, A2, A7, f2
  1315. LFPDUX A7, BO, INC4
  1316. fxcsmadd f3, A2, A8, f3
  1317. LFPDUX A8, BO2, INC4
  1318. LFPDUX A2, AO2, INC4
  1319. fxcpmadd f0, A3, B1, f0
  1320. LFPDUX B1, BO, INC4
  1321. fxcpmadd f1, A3, B2, f1
  1322. LFPDUX B2, BO2, INC4
  1323. fxcsmadd f2, A3, B3, f2
  1324. LFPDUX B3, BO, INC4
  1325. fxcsmadd f3, A3, B4, f3
  1326. LFPDUX B4, BO2, INC4
  1327. LFPDUX A3, AO, INC4
  1328. fxcpmadd f0, A4, A5, f0
  1329. LFPDUX A5, BO, INC4
  1330. fxcpmadd f1, A4, A6, f1
  1331. LFPDUX A6, BO2, INC4
  1332. fxcsmadd f2, A4, A7, f2
  1333. LFPDUX A7, BO, INC4
  1334. fxcsmadd f3, A4, A8, f3
  1335. LFPDUX A8, BO2, INC4
  1336. LFPDUX A4, AO2, INC4
  1337. bdnz+ .L42
  1338. .align 4
  1339. .L43:
  1340. fxcpmadd f0, A1, B1, f0
  1341. LFPDUX B1, BO, INC4
  1342. fxcpmadd f1, A1, B2, f1
  1343. LFPDUX B2, BO2, INC4
  1344. fxcsmadd f2, A1, B3, f2
  1345. LFPDUX B3, BO, INC4
  1346. fxcsmadd f3, A1, B4, f3
  1347. LFPDUX B4, BO2, INC4
  1348. fxcpmadd f0, A2, A5, f0
  1349. LFPDUX A5, BO, INC4
  1350. fxcpmadd f1, A2, A6, f1
  1351. LFPDUX A6, BO2, INC4
  1352. fxcsmadd f2, A2, A7, f2
  1353. LFPDUX A7, BO, INC4
  1354. fxcsmadd f3, A2, A8, f3
  1355. LFPDUX A8, BO2, INC4
  1356. fxcpmadd f0, A3, B1, f0
  1357. fxcpmadd f1, A3, B2, f1
  1358. fxcsmadd f2, A3, B3, f2
  1359. fxcsmadd f3, A3, B4, f3
  1360. fxcpmadd f0, A4, A5, f0
  1361. fxcpmadd f1, A4, A6, f1
  1362. fxcsmadd f2, A4, A7, f2
  1363. fxcsmadd f3, A4, A8, f3
  1364. .align 4
  1365. .L44: /* K-remainder of the 1x4 tile: runs (count & 7) single-k steps into f0/f1 */
  1366. lfd AP, ALPHA(SP) /* scale factor from the ALPHA slot addressed off SP */
  1367. #ifdef TRMMKERNEL
  1368. fsmfp AP, AP /* TRMM uses fpmul at .L48, so both halves of AP must carry the factor */
  1369. #endif
  1370. #if defined(TRMMKERNEL)
  1371. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1372. sub TEMP, K, KK /* TEMP = K - KK */
  1373. #elif defined(LEFT)
  1374. addi TEMP, KK, 1 /* TEMP = KK + 1 */
  1375. #else
  1376. addi TEMP, KK, 4 /* TEMP = KK + 4 */
  1377. #endif
  1378. andi. TEMP, TEMP, 7 /* leftover step count; also sets CR0 for the ble+ below */
  1379. mtspr CTR, TEMP
  1380. #else
  1381. andi. r0, K, 7 /* GEMM: leftover = K & 7 */
  1382. mtspr CTR, r0
  1383. #endif
  1384. ble+ .L48 /* nothing left over: go to the write-back */
  1385. LFDX A1, AO, INC4 /* single A element read at AO+INC4 (presumably into the primary half; only the primary is used below) */
  1386. LFPDUX B1, BO, INC4 /* B pair via BO */
  1387. LFPDUX B2, BO2, INC4 /* B pair via BO2 */
  1388. add AO, AO, INC /* AO advances by INC per step */
  1389. bdz- .L47 /* only one step: skip the loop and just drain */
  1390. .align 4
  1391. .L46: /* steady state: one k step per trip, only f0 and f1 are touched */
  1392. fxcpmadd f0, A1, B1, f0 /* f0 += A1.primary * B1 */
  1393. LFPDUX B1, BO, INC4
  1394. fxcpmadd f1, A1, B2, f1 /* f1 += A1.primary * B2 */
  1395. LFDX A1, AO, INC4
  1396. LFPDUX B2, BO2, INC4
  1397. add AO, AO, INC
  1398. bdnz+ .L46
  1399. .align 4
  1400. .L47: /* drain: consume the last loaded A1/B1/B2 */
  1401. fxcpmadd f0, A1, B1, f0
  1402. fxcpmadd f1, A1, B2, f1
  1403. .align 4
  1404. .L48: /* write-back of the 1x4 tile: merge the partial sums, scale, scatter one element to each of CO1..CO4 */
  1405. #ifndef TRMMKERNEL
  1406. LFDX A1, CO1, INC2 /* GEMM: fetch the existing C element of each of the four columns */
  1407. LFDX A2, CO2, INC2
  1408. LFDX A3, CO3, INC2
  1409. LFDX A4, CO4, INC2
  1410. fpadd f0, f0, f2 /* the unrolled loop split each pair's sum across two registers; fold f2 into f0 */
  1411. fpadd f1, f1, f3 /* ... and f3 into f1 */
  1412. fsmfp A1, A2 /* pack: A1 = (C from CO1, C from CO2) */
  1413. fsmfp A3, A4 /* pack: A3 = (C from CO3, C from CO4) */
  1414. fxcpmadd f0, AP, f0, A1 /* f0 = AP.primary * f0 + packed C */
  1415. fxcpmadd f1, AP, f1, A3
  1416. #else
  1417. fpadd f0, f0, f2 /* TRMM: same fold, then scale only */
  1418. fpadd f1, f1, f3
  1419. fpmul f0, AP, f0
  1420. fpmul f1, AP, f1
  1421. #endif
  1422. STFDX f0, CO1, INC2 /* primary half of f0 -> column addressed by CO1 */
  1423. STFSDX f0, CO2, INC2 /* secondary half of f0 -> column addressed by CO2 */
  1424. STFDX f1, CO3, INC2
  1425. STFSDX f1, CO4, INC2
  1426. #ifdef TRMMKERNEL
  1427. #if ( defined(LEFT) && defined(TRANSA)) || \
  1428. (!defined(LEFT) && !defined(TRANSA))
  1429. sub TEMP, K, KK /* TEMP = K - KK */
  1430. #ifdef LEFT
  1431. addi TEMP, TEMP, -1 /* LEFT: TEMP = K - KK - 1 */
  1432. #else
  1433. addi TEMP, TEMP, -4 /* otherwise: TEMP = K - KK - 4 */
  1434. #endif
  1435. slwi r0, TEMP, 0 + BASE_SHIFT /* r0 = TEMP << BASE_SHIFT: offset applied to AO */
  1436. slwi TEMP, TEMP, 2 + BASE_SHIFT /* TEMP <<= (2 + BASE_SHIFT): offset applied to BO */
  1437. add AO, AO, r0
  1438. add BO, BO, TEMP
  1439. #endif
  1440. #ifdef LEFT
  1441. addi KK, KK, 1 /* LEFT: KK advances by 1 after this tile */
  1442. #endif
  1443. #endif
  1444. .align 4
  1445. .L49: /* end of one pass over a 4-column panel */
  1446. #if defined(TRMMKERNEL) && !defined(LEFT)
  1447. addi KK, KK, 4 /* TRMM, non-LEFT: KK advances by 4 per panel */
  1448. #endif
  1449. addi B, BO, 4 * SIZE /* next panel's B base = BO + 4 * SIZE (undoes the negative bias BO was given at tile setup) */
  1450. addic. J, J, -1 /* one panel done; record form sets CR0 for the branch */
  1451. bgt+ .L10 /* more panels left: back to .L10 (outside this excerpt) */
  1452. .align 4
  1453. .L50: /* 2-column panel: taken only when bit 1 of N is set */
  1454. andi. J, N, 2
  1455. beq .L90 /* bit clear: skip to the 1-column panel test */
  1456. mr CO1, C /* CO1 -> first column of this panel */
  1457. add CO2, C, LDC /* CO2 -> second column, LDC past the first */
  1458. add C, CO2, LDC /* C now points past both columns */
  1459. #if defined(TRMMKERNEL) && defined(LEFT)
  1460. mr KK, OFFSET /* TRMM, LEFT: KK restarts from OFFSET for each panel */
  1461. #endif
  1462. addi AO, A, -2 * SIZE /* AO is biased by -2 elements for the update-form loads that follow (confirm INC2 = 2 * SIZE) */
  1463. li r0, FZERO
  1464. lfpsx f0, SP, r0 /* f0 reloaded from SP+FZERO (presumably zero); the tiles copy it into their accumulators */
  1465. srawi. I, M, 3 /* I = M / 8: number of 8-row tiles */
  1466. ble .L60 /* none: go to the M & 4 tile */
  1467. .align 4
  1468. .L51:
  1469. #if defined(TRMMKERNEL)
  1470. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1471. fpmr f4, f0
  1472. addi BO, B, - 2 * SIZE
  1473. fpmr f1, f0
  1474. fpmr f5, f0
  1475. fpmr f2, f0
  1476. fpmr f6, f0
  1477. #else
  1478. slwi TEMP, KK, 3 + BASE_SHIFT
  1479. slwi r0, KK, 1 + BASE_SHIFT
  1480. add AO, AO, TEMP
  1481. add BO, B, r0
  1482. fpmr f4, f0
  1483. addi BO, BO, - 2 * SIZE
  1484. fpmr f1, f0
  1485. fpmr f5, f0
  1486. fpmr f2, f0
  1487. fpmr f6, f0
  1488. #endif
  1489. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1490. sub TEMP, K, KK
  1491. #elif defined(LEFT)
  1492. addi TEMP, KK, 8
  1493. #else
  1494. addi TEMP, KK, 2
  1495. #endif
  1496. srawi. r0, TEMP, 2
  1497. fpmr f3, f0
  1498. mtspr CTR, r0
  1499. fpmr f7, f0
  1500. ble .L54
  1501. #else
  1502. fpmr f4, f0
  1503. addi BO, B, - 2 * SIZE
  1504. fpmr f1, f0
  1505. fpmr f5, f0
  1506. fpmr f2, f0
  1507. fpmr f6, f0
  1508. srawi. r0, K, 2
  1509. fpmr f3, f0
  1510. mtspr CTR, r0
  1511. fpmr f7, f0
  1512. ble .L54
  1513. #endif
  1514. LFPDUX B1, BO, INC2
  1515. LFPDUX A1, AO, INC2
  1516. LFPDUX A2, AO, INC2
  1517. LFPDUX B2, BO, INC2
  1518. LFPDUX A3, AO, INC2
  1519. LFPDUX A4, AO, INC2
  1520. LFPDUX B3, BO, INC2
  1521. LFPDUX A5, AO, INC2
  1522. LFPDUX A6, AO, INC2
  1523. LFPDUX A7, AO, INC2
  1524. LFPDUX A8, AO, INC2
  1525. bdz- .L53
  1526. .align 4
  1527. .L52:
  1528. fxcpmadd f0, B1, A1, f0
  1529. LFPDUX B4, BO, INC2
  1530. fxcsmadd f4, B1, A1, f4
  1531. LFPDUX A1, AO, INC2
  1532. fxcpmadd f1, B1, A2, f1
  1533. nop
  1534. fxcsmadd f5, B1, A2, f5
  1535. LFPDUX A2, AO, INC2
  1536. fxcpmadd f2, B1, A3, f2
  1537. nop
  1538. fxcsmadd f6, B1, A3, f6
  1539. LFPDUX A3, AO, INC2
  1540. fxcpmadd f3, B1, A4, f3
  1541. nop
  1542. fxcsmadd f7, B1, A4, f7
  1543. LFPDUX A4, AO, INC2
  1544. fxcpmadd f0, B2, A5, f0
  1545. LFPDUX B1, BO, INC2
  1546. fxcsmadd f4, B2, A5, f4
  1547. LFPDUX A5, AO, INC2
  1548. fxcpmadd f1, B2, A6, f1
  1549. nop
  1550. fxcsmadd f5, B2, A6, f5
  1551. LFPDUX A6, AO, INC2
  1552. fxcpmadd f2, B2, A7, f2
  1553. nop
  1554. fxcsmadd f6, B2, A7, f6
  1555. LFPDUX A7, AO, INC2
  1556. fxcpmadd f3, B2, A8, f3
  1557. nop
  1558. fxcsmadd f7, B2, A8, f7
  1559. LFPDUX A8, AO, INC2
  1560. fxcpmadd f0, B3, A1, f0
  1561. LFPDUX B2, BO, INC2
  1562. fxcsmadd f4, B3, A1, f4
  1563. LFPDUX A1, AO, INC2
  1564. fxcpmadd f1, B3, A2, f1
  1565. nop
  1566. fxcsmadd f5, B3, A2, f5
  1567. LFPDUX A2, AO, INC2
  1568. fxcpmadd f2, B3, A3, f2
  1569. nop
  1570. fxcsmadd f6, B3, A3, f6
  1571. LFPDUX A3, AO, INC2
  1572. fxcpmadd f3, B3, A4, f3
  1573. nop
  1574. fxcsmadd f7, B3, A4, f7
  1575. LFPDUX A4, AO, INC2
  1576. fxcpmadd f0, B4, A5, f0
  1577. LFPDUX B3, BO, INC2
  1578. fxcsmadd f4, B4, A5, f4
  1579. LFPDUX A5, AO, INC2
  1580. fxcpmadd f1, B4, A6, f1
  1581. nop
  1582. fxcsmadd f5, B4, A6, f5
  1583. LFPDUX A6, AO, INC2
  1584. fxcpmadd f2, B4, A7, f2
  1585. nop
  1586. fxcsmadd f6, B4, A7, f6
  1587. LFPDUX A7, AO, INC2
  1588. fxcpmadd f3, B4, A8, f3
  1589. nop
  1590. fxcsmadd f7, B4, A8, f7
  1591. LFPDUX A8, AO, INC2
  1592. bdnz+ .L52
  1593. .align 4
  1594. .L53:
  1595. fxcpmadd f0, B1, A1, f0
  1596. LFPDUX B4, BO, INC2
  1597. fxcsmadd f4, B1, A1, f4
  1598. LFPDUX A1, AO, INC2
  1599. fxcpmadd f1, B1, A2, f1
  1600. nop
  1601. fxcsmadd f5, B1, A2, f5
  1602. LFPDUX A2, AO, INC2
  1603. fxcpmadd f2, B1, A3, f2
  1604. nop
  1605. fxcsmadd f6, B1, A3, f6
  1606. LFPDUX A3, AO, INC2
  1607. fxcpmadd f3, B1, A4, f3
  1608. nop
  1609. fxcsmadd f7, B1, A4, f7
  1610. LFPDUX A4, AO, INC2
  1611. fxcpmadd f0, B2, A5, f0
  1612. nop
  1613. fxcsmadd f4, B2, A5, f4
  1614. LFPDUX A5, AO, INC2
  1615. fxcpmadd f1, B2, A6, f1
  1616. nop
  1617. fxcsmadd f5, B2, A6, f5
  1618. LFPDUX A6, AO, INC2
  1619. fxcpmadd f2, B2, A7, f2
  1620. nop
  1621. fxcsmadd f6, B2, A7, f6
  1622. LFPDUX A7, AO, INC2
  1623. fxcpmadd f3, B2, A8, f3
  1624. nop
  1625. fxcsmadd f7, B2, A8, f7
  1626. LFPDUX A8, AO, INC2
  1627. fxcpmadd f0, B3, A1, f0
  1628. fxcsmadd f4, B3, A1, f4
  1629. fxcpmadd f1, B3, A2, f1
  1630. fxcsmadd f5, B3, A2, f5
  1631. fxcpmadd f2, B3, A3, f2
  1632. fxcsmadd f6, B3, A3, f6
  1633. fxcpmadd f3, B3, A4, f3
  1634. fxcsmadd f7, B3, A4, f7
  1635. fxcpmadd f0, B4, A5, f0
  1636. fxcsmadd f4, B4, A5, f4
  1637. fxcpmadd f1, B4, A6, f1
  1638. fxcsmadd f5, B4, A6, f5
  1639. fxcpmadd f2, B4, A7, f2
  1640. fxcsmadd f6, B4, A7, f6
  1641. fxcpmadd f3, B4, A8, f3
  1642. fxcsmadd f7, B4, A8, f7
  1643. .align 4
  1644. .L54:
  1645. lfd AP, ALPHA(SP)
  1646. #ifdef TRMMKERNEL
  1647. fsmfp AP, AP
  1648. #endif
  1649. #if defined(TRMMKERNEL)
  1650. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1651. sub TEMP, K, KK
  1652. #elif defined(LEFT)
  1653. addi TEMP, KK, 8
  1654. #else
  1655. addi TEMP, KK, 2
  1656. #endif
  1657. andi. TEMP, TEMP, 3
  1658. mtspr CTR, TEMP
  1659. #else
  1660. andi. r0, K, 3
  1661. mtspr CTR, r0
  1662. #endif
  1663. ble+ .L58
  1664. LFPDUX A1, AO, INC2
  1665. LFPDUX B1, BO, INC2
  1666. LFPDUX A2, AO, INC2
  1667. LFPDUX A3, AO, INC2
  1668. LFPDUX A4, AO, INC2
  1669. bdz- .L57
  1670. .align 4
  1671. .L56:
  1672. fxcpmadd f0, B1, A1, f0
  1673. fxcsmadd f4, B1, A1, f4
  1674. LFPDUX A1, AO, INC2
  1675. fxcpmadd f1, B1, A2, f1
  1676. fxcsmadd f5, B1, A2, f5
  1677. LFPDUX A2, AO, INC2
  1678. fxcpmadd f2, B1, A3, f2
  1679. fxcsmadd f6, B1, A3, f6
  1680. LFPDUX A3, AO, INC2
  1681. fxcpmadd f3, B1, A4, f3
  1682. fxcsmadd f7, B1, A4, f7
  1683. LFPDUX A4, AO, INC2
  1684. LFPDUX B1, BO, INC2
  1685. bdnz+ .L56
  1686. .align 4
  1687. .L57:
  1688. fxcpmadd f0, B1, A1, f0
  1689. fxcsmadd f4, B1, A1, f4
  1690. fxcpmadd f1, B1, A2, f1
  1691. fxcsmadd f5, B1, A2, f5
  1692. fxcpmadd f2, B1, A3, f2
  1693. fxcsmadd f6, B1, A3, f6
  1694. fxcpmadd f3, B1, A4, f3
  1695. fxcsmadd f7, B1, A4, f7
  1696. .align 4
  1697. .L58:
  1698. #ifndef TRMMKERNEL
  1699. LFPDUX A1, CO1, INC2
  1700. LFPDUX B1, CO1, INC2
  1701. LFPDUX A3, CO1, INC2
  1702. LFPDUX A5, CO1, INC2
  1703. LFPDUX B3, CO2, INC2
  1704. LFPDUX A6, CO2, INC2
  1705. LFPDUX A7, CO2, INC2
  1706. LFPDUX B2, CO2, INC2
  1707. fxcpmadd f0, AP, f0, A1
  1708. fxcpmadd f1, AP, f1, B1
  1709. fxcpmadd f2, AP, f2, A3
  1710. fxcpmadd f3, AP, f3, A5
  1711. fxcpmadd f4, AP, f4, B3
  1712. fxcpmadd f5, AP, f5, A6
  1713. STFPDUX f0, CO1, INCM7
  1714. fxcpmadd f6, AP, f6, A7
  1715. STFPDUX f1, CO1, INC2
  1716. fxcpmadd f7, AP, f7, B2
  1717. STFPDUX f2, CO1, INC2
  1718. STFPDUX f3, CO1, INC2
  1719. STFPDUX f4, CO2, INCM7
  1720. STFPDUX f5, CO2, INC2
  1721. STFPDUX f6, CO2, INC2
  1722. STFPDUX f7, CO2, INC2
  1723. #else
  1724. fpmul f0, AP, f0
  1725. fpmul f1, AP, f1
  1726. fpmul f2, AP, f2
  1727. fpmul f3, AP, f3
  1728. fpmul f4, AP, f4
  1729. fpmul f5, AP, f5
  1730. STFPDUX f0, CO1, INC2
  1731. fpmul f6, AP, f6
  1732. STFPDUX f1, CO1, INC2
  1733. fpmul f7, AP, f7
  1734. STFPDUX f2, CO1, INC2
  1735. STFPDUX f3, CO1, INC2
  1736. STFPDUX f4, CO2, INC2
  1737. STFPDUX f5, CO2, INC2
  1738. STFPDUX f6, CO2, INC2
  1739. STFPDUX f7, CO2, INC2
  1740. #endif
  1741. #ifdef TRMMKERNEL
  1742. #if ( defined(LEFT) && defined(TRANSA)) || \
  1743. (!defined(LEFT) && !defined(TRANSA))
  1744. sub TEMP, K, KK
  1745. #ifdef LEFT
  1746. addi TEMP, TEMP, -8
  1747. #else
  1748. addi TEMP, TEMP, -2
  1749. #endif
  1750. slwi r0, TEMP, 3 + BASE_SHIFT
  1751. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1752. add AO, AO, r0
  1753. add BO, BO, TEMP
  1754. #endif
  1755. #ifdef LEFT
  1756. addi KK, KK, 8
  1757. #endif
  1758. #endif
  1759. addic. I, I, -1
  1760. li r0, FZERO
  1761. lfpsx f0, SP, r0
  1762. bgt+ .L51
  1763. .align 4
  1764. .L60:
  1765. andi. I, M, 4
  1766. beq .L70
  1767. #if defined(TRMMKERNEL)
  1768. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1769. addi BO, B, - 2 * SIZE
  1770. fpmr f1, f0
  1771. #else
  1772. slwi TEMP, KK, 2 + BASE_SHIFT
  1773. slwi r0, KK, 1 + BASE_SHIFT
  1774. add AO, AO, TEMP
  1775. add BO, B, r0
  1776. addi BO, BO, - 2 * SIZE
  1777. fpmr f1, f0
  1778. #endif
  1779. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1780. sub TEMP, K, KK
  1781. #elif defined(LEFT)
  1782. addi TEMP, KK, 4
  1783. #else
  1784. addi TEMP, KK, 2
  1785. #endif
  1786. fpmr f2, f0
  1787. srawi. r0, TEMP, 2
  1788. mtspr CTR, r0
  1789. fpmr f3, f0
  1790. ble .L64
  1791. #else
  1792. srawi. r0, K, 2
  1793. fpmr f1, f0
  1794. addi BO, B, - 2 * SIZE
  1795. fpmr f2, f0
  1796. mtspr CTR, r0
  1797. fpmr f3, f0
  1798. ble .L64
  1799. #endif
  1800. LFPDUX B1, BO, INC2
  1801. LFPDUX A1, AO, INC2
  1802. LFPDUX A2, AO, INC2
  1803. LFPDUX B2, BO, INC2
  1804. LFPDUX A3, AO, INC2
  1805. LFPDUX A4, AO, INC2
  1806. LFPDUX B3, BO, INC2
  1807. LFPDUX A5, AO, INC2
  1808. LFPDUX A6, AO, INC2
  1809. LFPDUX B4, BO, INC2
  1810. LFPDUX A7, AO, INC2
  1811. LFPDUX A8, AO, INC2
  1812. bdz- .L63
  1813. .align 4
  1814. .L62:
  1815. fxcpmadd f0, B1, A1, f0
  1816. fxcsmadd f2, B1, A1, f2
  1817. LFPDUX A1, AO, INC2
  1818. fxcpmadd f1, B1, A2, f1
  1819. fxcsmadd f3, B1, A2, f3
  1820. LFPDUX A2, AO, INC2
  1821. LFPDUX B1, BO, INC2
  1822. fxcpmadd f0, B2, A3, f0
  1823. fxcsmadd f2, B2, A3, f2
  1824. LFPDUX A3, AO, INC2
  1825. fxcpmadd f1, B2, A4, f1
  1826. fxcsmadd f3, B2, A4, f3
  1827. LFPDUX A4, AO, INC2
  1828. LFPDUX B2, BO, INC2
  1829. fxcpmadd f0, B3, A5, f0
  1830. fxcsmadd f2, B3, A5, f2
  1831. LFPDUX A5, AO, INC2
  1832. fxcpmadd f1, B3, A6, f1
  1833. fxcsmadd f3, B3, A6, f3
  1834. LFPDUX A6, AO, INC2
  1835. LFPDUX B3, BO, INC2
  1836. fxcpmadd f0, B4, A7, f0
  1837. fxcsmadd f2, B4, A7, f2
  1838. LFPDUX A7, AO, INC2
  1839. fxcpmadd f1, B4, A8, f1
  1840. fxcsmadd f3, B4, A8, f3
  1841. LFPDUX A8, AO, INC2
  1842. LFPDUX B4, BO, INC2
  1843. bdnz+ .L62
  1844. .align 4
  1845. .L63:
  1846. fxcpmadd f0, B1, A1, f0
  1847. fxcsmadd f2, B1, A1, f2
  1848. fxcpmadd f1, B1, A2, f1
  1849. fxcsmadd f3, B1, A2, f3
  1850. fxcpmadd f0, B2, A3, f0
  1851. fxcsmadd f2, B2, A3, f2
  1852. fxcpmadd f1, B2, A4, f1
  1853. fxcsmadd f3, B2, A4, f3
  1854. fxcpmadd f0, B3, A5, f0
  1855. fxcsmadd f2, B3, A5, f2
  1856. fxcpmadd f1, B3, A6, f1
  1857. fxcsmadd f3, B3, A6, f3
  1858. fxcpmadd f0, B4, A7, f0
  1859. fxcsmadd f2, B4, A7, f2
  1860. fxcpmadd f1, B4, A8, f1
  1861. fxcsmadd f3, B4, A8, f3
  1862. .align 4
  1863. .L64:
  1864. lfd AP, ALPHA(SP)
  1865. #ifdef TRMMKERNEL
  1866. fsmfp AP, AP
  1867. #endif
  1868. #if defined(TRMMKERNEL)
  1869. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1870. sub TEMP, K, KK
  1871. #elif defined(LEFT)
  1872. addi TEMP, KK, 4
  1873. #else
  1874. addi TEMP, KK, 2
  1875. #endif
  1876. andi. TEMP, TEMP, 3
  1877. mtspr CTR, TEMP
  1878. #else
  1879. andi. r0, K, 3
  1880. mtspr CTR, r0
  1881. #endif
  1882. ble+ .L68
  1883. LFPDUX A1, AO, INC2
  1884. LFPDUX B1, BO, INC2
  1885. LFPDUX A2, AO, INC2
  1886. bdz- .L67
  1887. .align 4
  1888. .L66:
  1889. fxcpmadd f0, B1, A1, f0
  1890. fxcsmadd f2, B1, A1, f2
  1891. LFPDUX A1, AO, INC2
  1892. fxcpmadd f1, B1, A2, f1
  1893. fxcsmadd f3, B1, A2, f3
  1894. LFPDUX B1, BO, INC2
  1895. LFPDUX A2, AO, INC2
  1896. bdnz+ .L66
  1897. .align 4
  1898. .L67:
  1899. fxcpmadd f0, B1, A1, f0
  1900. fxcsmadd f2, B1, A1, f2
  1901. fxcpmadd f1, B1, A2, f1
  1902. fxcsmadd f3, B1, A2, f3
  1903. .align 4
  1904. .L68:
  1905. #ifndef TRMMKERNEL
  1906. LFPDUX A1, CO1, INC2
  1907. LFPDUX A2, CO1, INC2
  1908. LFPDUX A3, CO2, INC2
  1909. LFPDUX A4, CO2, INC2
  1910. fxcpmadd f0, AP, f0, A1
  1911. fxcpmadd f1, AP, f1, A2
  1912. fxcpmadd f2, AP, f2, A3
  1913. fxcpmadd f3, AP, f3, A4
  1914. STFPDUX f0, CO1, INCM3
  1915. STFPDUX f1, CO1, INC2
  1916. STFPDUX f2, CO2, INCM3
  1917. STFPDUX f3, CO2, INC2
  1918. #else
  1919. fpmul f0, AP, f0
  1920. fpmul f1, AP, f1
  1921. fpmul f2, AP, f2
  1922. fpmul f3, AP, f3
  1923. STFPDUX f0, CO1, INC2
  1924. STFPDUX f1, CO1, INC2
  1925. STFPDUX f2, CO2, INC2
  1926. STFPDUX f3, CO2, INC2
  1927. #endif
  1928. #ifdef TRMMKERNEL
  1929. #if ( defined(LEFT) && defined(TRANSA)) || \
  1930. (!defined(LEFT) && !defined(TRANSA))
  1931. sub TEMP, K, KK
  1932. #ifdef LEFT
  1933. addi TEMP, TEMP, -4
  1934. #else
  1935. addi TEMP, TEMP, -2
  1936. #endif
  1937. slwi r0, TEMP, 2 + BASE_SHIFT
  1938. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1939. add AO, AO, r0
  1940. add BO, BO, TEMP
  1941. #endif
  1942. #ifdef LEFT
  1943. addi KK, KK, 4
  1944. #endif
  1945. #endif
  1946. li r0, FZERO
  1947. lfpsx f0, SP, r0
  1948. .align 4
  1949. .L70:
  1950. andi. I, M, 2
  1951. beq .L80
  1952. #if defined(TRMMKERNEL)
  1953. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1954. addi BO, B, - 2 * SIZE
  1955. fpmr f1, f0
  1956. #else
  1957. slwi TEMP, KK, 1 + BASE_SHIFT
  1958. slwi r0, KK, 1 + BASE_SHIFT
  1959. add AO, AO, TEMP
  1960. add BO, B, r0
  1961. addi BO, BO, - 2 * SIZE
  1962. fpmr f1, f0
  1963. #endif
  1964. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1965. sub TEMP, K, KK
  1966. #elif defined(LEFT)
  1967. addi TEMP, KK, 2
  1968. #else
  1969. addi TEMP, KK, 2
  1970. #endif
  1971. srawi. r0, TEMP, 3
  1972. fpmr f2, f0
  1973. mtspr CTR, r0
  1974. fpmr f3, f0
  1975. ble .L74
  1976. #else
  1977. addi BO, B, - 2 * SIZE
  1978. fpmr f1, f0
  1979. srawi. r0, K, 3
  1980. fpmr f2, f0
  1981. mtspr CTR, r0
  1982. fpmr f3, f0
  1983. ble .L74
  1984. #endif
  1985. LFPDUX A1, AO, INC2
  1986. LFPDUX B1, BO, INC2
  1987. LFPDUX A2, AO, INC2
  1988. LFPDUX B2, BO, INC2
  1989. LFPDUX A3, AO, INC2
  1990. LFPDUX B3, BO, INC2
  1991. LFPDUX A4, AO, INC2
  1992. LFPDUX B4, BO, INC2
  1993. LFPDUX A5, AO, INC2
  1994. LFPDUX B5, BO, INC2
  1995. LFPDUX A6, AO, INC2
  1996. LFPDUX B6, BO, INC2
  1997. LFPDUX A7, AO, INC2
  1998. LFPDUX A9, BO, INC2
  1999. LFPDUX A8, AO, INC2
  2000. LFPDUX A10, BO, INC2
  2001. bdz- .L73
  2002. .align 4
  2003. .L72:
  2004. fxcpmadd f0, B1, A1, f0
  2005. fxcsmadd f1, B1, A1, f1
  2006. LFPDUX A1, AO, INC2
  2007. LFPDUX B1, BO, INC2
  2008. fxcpmadd f2, B2, A2, f2
  2009. fxcsmadd f3, B2, A2, f3
  2010. LFPDUX A2, AO, INC2
  2011. LFPDUX B2, BO, INC2
  2012. fxcpmadd f0, B3, A3, f0
  2013. fxcsmadd f1, B3, A3, f1
  2014. LFPDUX A3, AO, INC2
  2015. LFPDUX B3, BO, INC2
  2016. fxcpmadd f2, B4, A4, f2
  2017. fxcsmadd f3, B4, A4, f3
  2018. LFPDUX A4, AO, INC2
  2019. LFPDUX B4, BO, INC2
  2020. fxcpmadd f0, B5, A5, f0
  2021. fxcsmadd f1, B5, A5, f1
  2022. LFPDUX A5, AO, INC2
  2023. LFPDUX B5, BO, INC2
  2024. fxcpmadd f2, B6, A6, f2
  2025. fxcsmadd f3, B6, A6, f3
  2026. LFPDUX A6, AO, INC2
  2027. LFPDUX B6, BO, INC2
  2028. fxcpmadd f0, A9, A7, f0
  2029. fxcsmadd f1, A9, A7, f1
  2030. LFPDUX A7, AO, INC2
  2031. LFPDUX A9, BO, INC2
  2032. fxcpmadd f2, A10, A8, f2
  2033. fxcsmadd f3, A10, A8, f3
  2034. LFPDUX A8, AO, INC2
  2035. LFPDUX A10, BO, INC2
  2036. bdnz+ .L72
  2037. .align 4
  2038. .L73:
  2039. fxcpmadd f0, B1, A1, f0
  2040. fxcsmadd f1, B1, A1, f1
  2041. fxcpmadd f2, B2, A2, f2
  2042. fxcsmadd f3, B2, A2, f3
  2043. fxcpmadd f0, B3, A3, f0
  2044. fxcsmadd f1, B3, A3, f1
  2045. fxcpmadd f2, B4, A4, f2
  2046. fxcsmadd f3, B4, A4, f3
  2047. fxcpmadd f0, B5, A5, f0
  2048. fxcsmadd f1, B5, A5, f1
  2049. fxcpmadd f2, B6, A6, f2
  2050. fxcsmadd f3, B6, A6, f3
  2051. fxcpmadd f0, A9, A7, f0
  2052. fxcsmadd f1, A9, A7, f1
  2053. fxcpmadd f2, A10, A8, f2
  2054. fxcsmadd f3, A10, A8, f3
  2055. .align 4
  2056. .L74:
  2057. lfd AP, ALPHA(SP)
  2058. #ifdef TRMMKERNEL
  2059. fsmfp AP, AP
  2060. #endif
  2061. #if defined(TRMMKERNEL)
  2062. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2063. sub TEMP, K, KK
  2064. #elif defined(LEFT)
  2065. addi TEMP, KK, 2
  2066. #else
  2067. addi TEMP, KK, 2
  2068. #endif
  2069. andi. TEMP, TEMP, 7
  2070. mtspr CTR, TEMP
  2071. #else
  2072. andi. r0, K, 7
  2073. mtspr CTR, r0
  2074. #endif
  2075. ble+ .L78
  2076. LFPDUX A1, AO, INC2
  2077. LFPDUX B1, BO, INC2
  2078. bdz- .L77
  2079. .align 4
  2080. .L76:
  2081. fxcpmadd f0, B1, A1, f0
  2082. fxcsmadd f1, B1, A1, f1
  2083. LFPDUX A1, AO, INC2
  2084. LFPDUX B1, BO, INC2
  2085. bdnz+ .L76
  2086. .align 4
  2087. .L77:
  2088. fxcpmadd f0, B1, A1, f0
  2089. fxcsmadd f1, B1, A1, f1
  2090. .align 4
  2091. .L78: /* write-back of the 2-row x 2-column tile: fold partial sums, scale, store one pair per column */
  2092. #ifndef TRMMKERNEL
  2093. LFPDX A1, CO1, INC2 /* GEMM: existing C pair at CO1+INC2, pointer not moved */
  2094. LFPDX B3, CO2, INC2 /* existing C pair at CO2+INC2 (B3 is used as scratch here) */
  2095. fpadd f0, f0, f2 /* the unrolled loop alternated between (f0,f1) and (f2,f3); fold them */
  2096. fpadd f1, f1, f3
  2097. fxcpmadd f0, AP, f0, A1 /* f0 = AP.primary * f0 + existing C */
  2098. fxcpmadd f1, AP, f1, B3
  2099. #else
  2100. fpadd f0, f0, f2 /* TRMM: same fold, then scale only */
  2101. fpadd f1, f1, f3
  2102. fpmul f0, AP, f0
  2103. fpmul f1, AP, f1
  2104. #endif
  2105. STFPDUX f0, CO1, INC2 /* store the pair and advance CO1 by INC2 */
  2106. STFPDUX f1, CO2, INC2
  2107. #ifdef TRMMKERNEL
  2108. #if ( defined(LEFT) && defined(TRANSA)) || \
  2109. (!defined(LEFT) && !defined(TRANSA))
  2110. sub TEMP, K, KK /* TEMP = K - KK */
  2111. #ifdef LEFT
  2112. addi TEMP, TEMP, -2 /* both branches subtract 2 for this tile */
  2113. #else
  2114. addi TEMP, TEMP, -2
  2115. #endif
  2116. slwi r0, TEMP, 1 + BASE_SHIFT /* offset applied to AO */
  2117. slwi TEMP, TEMP, 1 + BASE_SHIFT /* offset applied to BO (same shift as for AO) */
  2118. add AO, AO, r0
  2119. add BO, BO, TEMP
  2120. #endif
  2121. #ifdef LEFT
  2122. addi KK, KK, 2 /* LEFT: KK advances by 2 after this tile */
  2123. #endif
  2124. #endif
  2125. li r0, FZERO
  2126. lfpsx f0, SP, r0 /* reload f0 from SP+FZERO (presumably zero) for the next tile */
  2127. .align 4
  2128. .L80: /* 1-row x 2-column tile: runs only when bit 0 of M is set; sets up BO, clears accumulators, preloads operands */
  2129. andi. I, M, 1
  2130. beq .L89 /* bit clear: no single-row tile in this panel */
  2131. #if defined(TRMMKERNEL)
  2132. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2133. addi BO, B, - 2 * SIZE /* BO biased by -2 elements for the update-form loads (confirm INC2 = 2 * SIZE) */
  2134. fpmr f1, f0 /* f1..f3 cleared by copying f0, reloaded from the FZERO slot by the preceding code */
  2135. fpmr f2, f0
  2136. fpmr f3, f0
  2137. #else
  2138. slwi TEMP, KK, 0 + BASE_SHIFT /* TEMP = KK << BASE_SHIFT: offset for AO */
  2139. slwi r0, KK, 1 + BASE_SHIFT /* r0 = KK << (1 + BASE_SHIFT): offset for B */
  2140. add AO, AO, TEMP
  2141. add BO, B, r0
  2142. addi BO, BO, - 2 * SIZE /* same bias, applied after the KK offset */
  2143. fpmr f1, f0
  2144. fpmr f2, f0
  2145. fpmr f3, f0
  2146. #endif
  2147. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2148. sub TEMP, K, KK /* TEMP = K - KK */
  2149. #elif defined(LEFT)
  2150. addi TEMP, KK, 1 /* TEMP = KK + 1 */
  2151. #else
  2152. addi TEMP, KK, 2 /* TEMP = KK + 2 */
  2153. #endif
  2154. srawi. r0, TEMP, 3 /* unrolled-loop trip count = TEMP / 8 */
  2155. mtspr CTR, r0
  2156. ble .L84 /* fewer than 8 steps: remainder code only */
  2157. #else
  2158. addi BO, B, - 2 * SIZE /* GEMM path: same setup without the KK offsets */
  2159. fpmr f1, f0
  2160. fpmr f2, f0
  2161. fpmr f3, f0
  2162. srawi. r0, K, 3 /* unrolled-loop trip count = K / 8 */
  2163. mtspr CTR, r0
  2164. ble .L84
  2165. #endif
  2166. LFPDUX B1, BO, INC2 /* preload four A pairs and four B pairs for the first trip */
  2167. LFPDUX A1, AO, INC2
  2168. LFPDUX A2, AO, INC2
  2169. LFPDUX B2, BO, INC2
  2170. LFPDUX A3, AO, INC2
  2171. LFPDUX A4, AO, INC2
  2172. LFPDUX B3, BO, INC2
  2173. LFPDUX B4, BO, INC2
  2174. bdz- .L83 /* exactly one unrolled trip: go to the drain block */
  2175. .align 4
  2176. .L82: /* unrolled main loop of the 1x2 tile: each A pair feeds two FMAs (primary half, then secondary half) against successive B pairs */
  2177. fxcpmadd f0, A1, B1, f0 /* f0 += A1.primary * B1 */
  2178. LFPDUX B1, BO, INC2 /* reload each B register right after its use to feed the second half of the trip */
  2179. fxcsmadd f1, A1, B2, f1 /* f1 += A1.secondary * B2 */
  2180. LFPDUX B2, BO, INC2
  2181. LFPDUX A1, AO, INC2 /* A1 for the next trip */
  2182. fxcpmadd f2, A2, B3, f2 /* f2 += A2.primary * B3 */
  2183. LFPDUX B3, BO, INC2
  2184. fxcsmadd f3, A2, B4, f3 /* f3 += A2.secondary * B4 */
  2185. LFPDUX B4, BO, INC2
  2186. LFPDUX A2, AO, INC2
  2187. fxcpmadd f0, A3, B1, f0 /* second half: same pattern with A3/A4 and the freshly loaded B1..B4 */
  2188. LFPDUX B1, BO, INC2
  2189. fxcsmadd f1, A3, B2, f1
  2190. LFPDUX B2, BO, INC2
  2191. LFPDUX A3, AO, INC2
  2192. fxcpmadd f2, A4, B3, f2
  2193. LFPDUX B3, BO, INC2
  2194. fxcsmadd f3, A4, B4, f3
  2195. LFPDUX B4, BO, INC2
  2196. LFPDUX A4, AO, INC2
  2197. bdnz+ .L82 /* decrement CTR, loop while it is non-zero */
  2198. .align 4
  2199. .L83: /* drain: final trip, B is reloaded once for the A3/A4 half but no A loads are issued */
  2200. fxcpmadd f0, A1, B1, f0
  2201. LFPDUX B1, BO, INC2
  2202. fxcsmadd f1, A1, B2, f1
  2203. LFPDUX B2, BO, INC2
  2204. fxcpmadd f2, A2, B3, f2
  2205. LFPDUX B3, BO, INC2
  2206. fxcsmadd f3, A2, B4, f3
  2207. LFPDUX B4, BO, INC2
  2208. fxcpmadd f0, A3, B1, f0
  2209. fxcsmadd f1, A3, B2, f1
  2210. fxcpmadd f2, A4, B3, f2
  2211. fxcsmadd f3, A4, B4, f3
  2212. .align 4
  2213. .L84: /* K-remainder of the 1x2 tile: runs (count & 7) single-k steps into f0 */
  2214. lfd AP, ALPHA(SP) /* scale factor from the ALPHA slot addressed off SP */
  2215. #ifdef TRMMKERNEL
  2216. fsmfp AP, AP /* TRMM uses fpmul at .L88, so both halves of AP must carry the factor */
  2217. #endif
  2218. #if defined(TRMMKERNEL)
  2219. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2220. sub TEMP, K, KK /* TEMP = K - KK */
  2221. #elif defined(LEFT)
  2222. addi TEMP, KK, 1 /* TEMP = KK + 1 */
  2223. #else
  2224. addi TEMP, KK, 2 /* TEMP = KK + 2 */
  2225. #endif
  2226. andi. TEMP, TEMP, 7 /* leftover step count; also sets CR0 for the ble+ below */
  2227. mtspr CTR, TEMP
  2228. #else
  2229. andi. r0, K, 7 /* GEMM: leftover = K & 7 */
  2230. mtspr CTR, r0
  2231. #endif
  2232. ble+ .L88 /* nothing left over: go to the write-back */
  2233. LFDX A1, AO, INC2 /* single A element read at AO+INC2; AO is advanced separately below */
  2234. LFPDUX B1, BO, INC2 /* B pair via BO, pointer updated by INC2 */
  2235. add AO, AO, INC /* AO advances by INC per step */
  2236. bdz- .L87 /* only one step: skip the loop and just drain */
  2237. .align 4
  2238. .L86: /* steady state: one FMA per k step */
  2239. fxcpmadd f0, A1, B1, f0 /* f0 += A1.primary * B1 */
  2240. LFDX A1, AO, INC2
  2241. LFPDUX B1, BO, INC2
  2242. add AO, AO, INC
  2243. bdnz+ .L86
  2244. .align 4
  2245. .L87: /* drain: consume the last loaded A1/B1 */
  2246. fxcpmadd f0, A1, B1, f0
  2247. .align 4
  2248. .L88: /* write-back of the 1x2 tile: reduce f0..f3 to one pair, scale, store one element to each of CO1 and CO2 */
  2249. #ifndef TRMMKERNEL
  2250. LFDX A1, CO1, INC2 /* GEMM: existing C element of the first column */
  2251. LFDX A2, CO2, INC2 /* existing C element of the second column */
  2252. fpadd f0, f0, f1 /* tree-reduce the four partial accumulators ... */
  2253. fpadd f2, f2, f3
  2254. fsmfp A1, A2 /* pack: A1 = (C from CO1, C from CO2) */
  2255. fpadd f0, f0, f2 /* ... into f0 */
  2256. fxcpmadd f0, AP, f0, A1 /* f0 = AP.primary * f0 + packed C */
  2257. #else
  2258. fpadd f0, f0, f1 /* TRMM: same reduction, then scale only */
  2259. fpadd f2, f2, f3
  2260. fsmfp A1, A2 /* NOTE(review): A1/A2 are not loaded from C on this path and A1 is not read again in this block; looks like a harmless leftover from the GEMM branch, confirm before removing */
  2261. fpadd f0, f0, f2
  2262. fpmul f0, AP, f0
  2263. #endif
  2264. STFDX f0, CO1, INC2 /* primary half of f0 -> column addressed by CO1 */
  2265. STFSDX f0, CO2, INC2 /* secondary half of f0 -> column addressed by CO2 */
  2266. #ifdef TRMMKERNEL
  2267. #if ( defined(LEFT) && defined(TRANSA)) || \
  2268. (!defined(LEFT) && !defined(TRANSA))
  2269. sub TEMP, K, KK /* TEMP = K - KK */
  2270. #ifdef LEFT
  2271. addi TEMP, TEMP, -1 /* LEFT: TEMP = K - KK - 1 */
  2272. #else
  2273. addi TEMP, TEMP, -2 /* otherwise: TEMP = K - KK - 2 */
  2274. #endif
  2275. slwi r0, TEMP, 0 + BASE_SHIFT /* r0 = TEMP << BASE_SHIFT: offset applied to AO */
  2276. slwi TEMP, TEMP, 1 + BASE_SHIFT /* TEMP <<= (1 + BASE_SHIFT): offset applied to BO */
  2277. add AO, AO, r0
  2278. add BO, BO, TEMP
  2279. #endif
  2280. #ifdef LEFT
  2281. addi KK, KK, 1 /* LEFT: KK advances by 1 after this tile */
  2282. #endif
  2283. #endif
  2284. .align 4
  2285. .L89: /* end of the 2-column panel */
  2286. #if defined(TRMMKERNEL) && !defined(LEFT)
  2287. addi KK, KK, 2 /* TRMM, non-LEFT: KK advances by 2 for this panel */
  2288. #endif
  2289. addi B, BO, 2 * SIZE /* next panel's B base = BO + 2 * SIZE (undoes the -2 * SIZE bias applied at tile setup) */
  2290. .align 4
  2291. .L90: /* 1-column panel: taken only when bit 0 of N is set */
  2292. andi. J, N, 1
  2293. beq .L999 /* bit clear: nothing left, go to .L999 (outside this excerpt) */
  2294. #if defined(TRMMKERNEL) && defined(LEFT)
  2295. mr KK, OFFSET /* TRMM, LEFT: KK restarts from OFFSET for this panel */
  2296. #endif
  2297. mr CO1, C /* single output column */
  2298. addi AO, A, -2 * SIZE /* AO biased by -2 elements for the update-form loads that follow (confirm INC2 = 2 * SIZE) */
  2299. li r0, FZERO
  2300. lfpsx f0, SP, r0 /* f0 reloaded from SP+FZERO (presumably zero); tiles copy it into their accumulators */
  2301. srawi. I, M, 3 /* I = M / 8: number of 8-row tiles */
  2302. ble .L100 /* none: go to the M & 4 tile */
  2303. .align 4
  2304. .L91:
  2305. #if defined(TRMMKERNEL)
  2306. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2307. addi BO, B, - 2 * SIZE
  2308. fpmr f1, f0
  2309. #else
  2310. slwi TEMP, KK, 3 + BASE_SHIFT
  2311. slwi r0, KK, 0 + BASE_SHIFT
  2312. add AO, AO, TEMP
  2313. add BO, B, r0
  2314. addi BO, BO, - 2 * SIZE
  2315. fpmr f1, f0
  2316. #endif
  2317. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2318. sub TEMP, K, KK
  2319. #elif defined(LEFT)
  2320. addi TEMP, KK, 8
  2321. #else
  2322. addi TEMP, KK, 1
  2323. #endif
  2324. fpmr f2, f0
  2325. srawi. r0, TEMP, 2
  2326. fpmr f3, f0
  2327. mtspr CTR, r0
  2328. ble .L94
  2329. #else
  2330. srawi. r0, K, 2
  2331. fpmr f1, f0
  2332. addi BO, B, - 2 * SIZE
  2333. fpmr f2, f0
  2334. fpmr f3, f0
  2335. mtspr CTR, r0
  2336. ble .L94
  2337. #endif
  2338. LFPDUX B1, BO, INC2
  2339. LFPDUX A1, AO, INC2
  2340. LFPDUX A2, AO, INC2
  2341. LFPDUX A3, AO, INC2
  2342. LFPDUX A4, AO, INC2
  2343. LFPDUX B2, BO, INC2
  2344. LFPDUX A5, AO, INC2
  2345. LFPDUX A6, AO, INC2
  2346. LFPDUX A7, AO, INC2
  2347. LFPDUX A8, AO, INC2
  2348. bdz- .L93
  2349. .align 4
  2350. .L92:
  2351. fxcpmadd f0, B1, A1, f0
  2352. LFPDUX A1, AO, INC2
  2353. fxcpmadd f1, B1, A2, f1
  2354. LFPDUX A2, AO, INC2
  2355. fxcpmadd f2, B1, A3, f2
  2356. LFPDUX A3, AO, INC2
  2357. fxcpmadd f3, B1, A4, f3
  2358. LFPDUX A4, AO, INC2
  2359. fxcsmadd f0, B1, A5, f0
  2360. LFPDUX A5, AO, INC2
  2361. fxcsmadd f1, B1, A6, f1
  2362. LFPDUX A6, AO, INC2
  2363. fxcsmadd f2, B1, A7, f2
  2364. LFPDUX A7, AO, INC2
  2365. fxcsmadd f3, B1, A8, f3
  2366. LFPDUX A8, AO, INC2
  2367. LFPDUX B1, BO, INC2
  2368. fxcpmadd f0, B2, A1, f0
  2369. LFPDUX A1, AO, INC2
  2370. fxcpmadd f1, B2, A2, f1
  2371. LFPDUX A2, AO, INC2
  2372. fxcpmadd f2, B2, A3, f2
  2373. LFPDUX A3, AO, INC2
  2374. fxcpmadd f3, B2, A4, f3
  2375. LFPDUX A4, AO, INC2
  2376. fxcsmadd f0, B2, A5, f0
  2377. LFPDUX A5, AO, INC2
  2378. fxcsmadd f1, B2, A6, f1
  2379. LFPDUX A6, AO, INC2
  2380. fxcsmadd f2, B2, A7, f2
  2381. LFPDUX A7, AO, INC2
  2382. fxcsmadd f3, B2, A8, f3
  2383. LFPDUX A8, AO, INC2
  2384. LFPDUX B2, BO, INC2
  2385. bdnz+ .L92
  2386. .align 4
  2387. .L93:
  2388. fxcpmadd f0, B1, A1, f0
  2389. LFPDUX A1, AO, INC2
  2390. fxcpmadd f1, B1, A2, f1
  2391. LFPDUX A2, AO, INC2
  2392. fxcpmadd f2, B1, A3, f2
  2393. LFPDUX A3, AO, INC2
  2394. fxcpmadd f3, B1, A4, f3
  2395. LFPDUX A4, AO, INC2
  2396. fxcsmadd f0, B1, A5, f0
  2397. LFPDUX A5, AO, INC2
  2398. fxcsmadd f1, B1, A6, f1
  2399. LFPDUX A6, AO, INC2
  2400. fxcsmadd f2, B1, A7, f2
  2401. LFPDUX A7, AO, INC2
  2402. fxcsmadd f3, B1, A8, f3
  2403. LFPDUX A8, AO, INC2
  2404. fxcpmadd f0, B2, A1, f0
  2405. fxcpmadd f1, B2, A2, f1
  2406. fxcpmadd f2, B2, A3, f2
  2407. fxcpmadd f3, B2, A4, f3
  2408. fxcsmadd f0, B2, A5, f0
  2409. fxcsmadd f1, B2, A6, f1
  2410. fxcsmadd f2, B2, A7, f2
  2411. fxcsmadd f3, B2, A8, f3
  2412. .align 4
  /*
   * .L94: leftover-K handling for the accumulators f0..f3.
   * Loads AP from ALPHA(SP), then sets CTR to (K & 3), or to (TEMP & 3) in
   * the TRMM build where TEMP is derived from K and KK. A zero count
   * branches to .L98.
   * NOTE(review): fsmfp AP, AP presumably replicates the primary half of AP
   * into its secondary half for the later fpmul - confirm against the ISA.
   */
  2413. .L94:
  2414. lfd AP, ALPHA(SP)
  2415. #ifdef TRMMKERNEL
  2416. fsmfp AP, AP
  2417. #endif
  2418. #if defined(TRMMKERNEL)
  2419. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2420. sub TEMP, K, KK
  2421. #elif defined(LEFT)
  2422. addi TEMP, KK, 8
  2423. #else
  2424. addi TEMP, KK, 1
  2425. #endif
  2426. andi. TEMP, TEMP, 3
  2427. mtspr CTR, TEMP
  2428. #else
  2429. andi. r0, K, 3
  2430. mtspr CTR, r0
  2431. #endif
  /* andi. cannot produce a negative result here, so "ble" acts as "branch if zero". */
  2432. ble+ .L98
  /*
   * Preload one B value and four A registers. BO is advanced by an explicit
   * add after LFDX, whereas the A loads use the updating LFPDUX form.
   */
  2433. LFDX B1, BO, INC2
  2434. LFPDUX A1, AO, INC2
  2435. LFPDUX A2, AO, INC2
  2436. LFPDUX A3, AO, INC2
  2437. LFPDUX A4, AO, INC2
  2438. add BO, BO, INC
  /* A single leftover iteration goes straight to the load-free pass at .L97. */
  2439. bdz- .L97
  2440. .align 4
  /* .L96: four multiply-adds into f0..f3 per iteration, reloading A1..A4 and B1. */
  2441. .L96:
  2442. fxcpmadd f0, B1, A1, f0
  2443. LFPDUX A1, AO, INC2
  2444. fxcpmadd f1, B1, A2, f1
  2445. LFPDUX A2, AO, INC2
  2446. fxcpmadd f2, B1, A3, f2
  2447. LFPDUX A3, AO, INC2
  2448. fxcpmadd f3, B1, A4, f3
  2449. LFDX B1, BO, INC2
  2450. LFPDUX A4, AO, INC2
  2451. add BO, BO, INC
  2452. bdnz+ .L96
  2453. .align 4
  /* .L97: last leftover iteration; consumes the preloaded values, no loads. */
  2454. .L97:
  2455. fxcpmadd f0, B1, A1, f0
  2456. fxcpmadd f1, B1, A2, f1
  2457. fxcpmadd f2, B1, A3, f2
  2458. fxcpmadd f3, B1, A4, f3
  2459. .align 4
  /*
   * .L98: write f0..f3 out through CO1 and advance to the next I iteration.
   * Non-TRMM build: four values are first read through CO1 (LFPDUX, INC2),
   * combined with AP and f0..f3 by fxcpmadd, and stored back; the first store
   * steps CO1 by INCM7 and the remaining three by INC2.
   * TRMM build: f0..f3 are multiplied by AP (fpmul) and stored without
   * reading through CO1 first.
   * NOTE(review): the values of INC2/INCM7 are set outside this view.
   */
  2460. .L98:
  2461. #ifndef TRMMKERNEL
  2462. LFPDUX A1, CO1, INC2
  2463. LFPDUX B1, CO1, INC2
  2464. LFPDUX A3, CO1, INC2
  2465. LFPDUX A5, CO1, INC2
  2466. fxcpmadd f0, AP, f0, A1
  2467. fxcpmadd f1, AP, f1, B1
  2468. fxcpmadd f2, AP, f2, A3
  2469. fxcpmadd f3, AP, f3, A5
  2470. STFPDUX f0, CO1, INCM7
  2471. STFPDUX f1, CO1, INC2
  2472. STFPDUX f2, CO1, INC2
  2473. STFPDUX f3, CO1, INC2
  2474. #else
  2475. fpmul f0, AP, f0
  2476. fpmul f1, AP, f1
  2477. fpmul f2, AP, f2
  2478. fpmul f3, AP, f3
  2479. STFPDUX f0, CO1, INC2
  2480. STFPDUX f1, CO1, INC2
  2481. STFPDUX f2, CO1, INC2
  2482. STFPDUX f3, CO1, INC2
  2483. #endif
  /*
   * TRMM only: for the LEFT&&TRANSA / !LEFT&&!TRANSA variants, advance AO by
   * (K - KK - n) << (3 + BASE_SHIFT) and BO by (K - KK - n) << BASE_SHIFT,
   * with n = 8 when LEFT is defined and 1 otherwise. With LEFT, KK grows by 8.
   */
  2484. #ifdef TRMMKERNEL
  2485. #if ( defined(LEFT) && defined(TRANSA)) || \
  2486. (!defined(LEFT) && !defined(TRANSA))
  2487. sub TEMP, K, KK
  2488. #ifdef LEFT
  2489. addi TEMP, TEMP, -8
  2490. #else
  2491. addi TEMP, TEMP, -1
  2492. #endif
  2493. slwi r0, TEMP, 3 + BASE_SHIFT
  2494. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2495. add AO, AO, r0
  2496. add BO, BO, TEMP
  2497. #endif
  2498. #ifdef LEFT
  2499. addi KK, KK, 8
  2500. #endif
  2501. #endif
  /*
   * Decrement I and reload f0 from offset FZERO on the stack (presumably a
   * stored zero - confirm where FZERO is written). li and lfpsx sit between
   * addic. and bgt; the branch still tests the CR0 result of addic.
   */
  2502. addic. I, I, -1
  2503. li r0, FZERO
  2504. lfpsx f0, SP, r0
  2505. bgt+ .L91
  2506. .align 4
  /*
   * .L100: set-up for the case where bit 2 of M is set (M & 4); otherwise
   * skip to .L110. Copies f0 into f1..f3, points BO at B - 2*SIZE (plus a
   * KK-scaled offset in some TRMM variants, which also advance AO), and
   * loads CTR with the 8-way unrolled trip count: K >> 3, or TEMP >> 3 in
   * the TRMM build. A count <= 0 skips the unrolled loop and goes to .L104.
   */
  2507. .L100:
  2508. andi. I, M, 4
  2509. beq .L110
  2510. #if defined(TRMMKERNEL)
  2511. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2512. addi BO, B, - 2 * SIZE
  2513. fpmr f1, f0
  2514. fpmr f2, f0
  2515. fpmr f3, f0
  2516. #else
  /* AO += KK << (2 + BASE_SHIFT); BO = B + (KK << BASE_SHIFT) - 2*SIZE. */
  2517. slwi TEMP, KK, 2 + BASE_SHIFT
  2518. slwi r0, KK, 0 + BASE_SHIFT
  2519. add AO, AO, TEMP
  2520. add BO, B, r0
  2521. fpmr f1, f0
  2522. addi BO, BO, - 2 * SIZE
  2523. fpmr f2, f0
  2524. fpmr f3, f0
  2525. #endif
  2526. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2527. sub TEMP, K, KK
  2528. #elif defined(LEFT)
  2529. addi TEMP, KK, 4
  2530. #else
  2531. addi TEMP, KK, 1
  2532. #endif
  2533. srawi. r0, TEMP, 3
  2534. mtspr CTR, r0
  2535. ble .L104
  2536. #else
  2537. addi BO, B, - 2 * SIZE
  2538. fpmr f1, f0
  2539. fpmr f2, f0
  2540. fpmr f3, f0
  2541. srawi. r0, K, 3
  2542. mtspr CTR, r0
  2543. ble .L104
  2544. #endif
  /* Preload B1..B4 and A1..A8 for the first pass of the unrolled loop. */
  2545. LFPDUX B1, BO, INC2
  2546. LFPDUX A1, AO, INC2
  2547. LFPDUX A2, AO, INC2
  2548. LFPDUX A3, AO, INC2
  2549. LFPDUX A4, AO, INC2
  2550. LFPDUX B2, BO, INC2
  2551. LFPDUX A5, AO, INC2
  2552. LFPDUX A6, AO, INC2
  2553. LFPDUX A7, AO, INC2
  2554. LFPDUX A8, AO, INC2
  2555. LFPDUX B3, BO, INC2
  2556. LFPDUX B4, BO, INC2
  /* A trip count of one goes directly to the final pass at .L103. */
  2557. bdz- .L103
  2558. .align 4
  /*
   * .L102: unrolled main loop for the M & 4 case. Each iteration issues
   * sixteen multiply-adds into f0..f3 (four per B register, B1..B4), sixteen
   * updating loads from AO and four from BO. For each B register, f0/f1 are
   * updated with fxcpmadd and f2/f3 with fxcsmadd; .L108 later adds f2 into
   * f0 and f3 into f1.
   */
  2559. .L102:
  2560. fxcpmadd f0, B1, A1, f0
  2561. LFPDUX A1, AO, INC2
  2562. fxcpmadd f1, B1, A2, f1
  2563. LFPDUX A2, AO, INC2
  2564. fxcsmadd f2, B1, A3, f2
  2565. LFPDUX A3, AO, INC2
  2566. fxcsmadd f3, B1, A4, f3
  2567. LFPDUX A4, AO, INC2
  2568. LFPDUX B1, BO, INC2
  2569. fxcpmadd f0, B2, A5, f0
  2570. LFPDUX A5, AO, INC2
  2571. fxcpmadd f1, B2, A6, f1
  2572. LFPDUX A6, AO, INC2
  2573. fxcsmadd f2, B2, A7, f2
  2574. LFPDUX A7, AO, INC2
  2575. fxcsmadd f3, B2, A8, f3
  2576. LFPDUX A8, AO, INC2
  2577. LFPDUX B2, BO, INC2
  2578. fxcpmadd f0, B3, A1, f0
  2579. LFPDUX A1, AO, INC2
  2580. fxcpmadd f1, B3, A2, f1
  2581. LFPDUX A2, AO, INC2
  2582. fxcsmadd f2, B3, A3, f2
  2583. LFPDUX A3, AO, INC2
  2584. fxcsmadd f3, B3, A4, f3
  2585. LFPDUX A4, AO, INC2
  2586. LFPDUX B3, BO, INC2
  2587. fxcpmadd f0, B4, A5, f0
  2588. LFPDUX A5, AO, INC2
  2589. fxcpmadd f1, B4, A6, f1
  2590. LFPDUX A6, AO, INC2
  2591. fxcsmadd f2, B4, A7, f2
  2592. LFPDUX A7, AO, INC2
  2593. fxcsmadd f3, B4, A8, f3
  2594. LFPDUX A8, AO, INC2
  2595. LFPDUX B4, BO, INC2
  2596. bdnz+ .L102
  2597. .align 4
  /*
   * .L103: final pass of the .L102 loop. Same sixteen multiply-adds, but
   * only the first eight are interleaved with loads (eight from AO); nothing
   * is loaded from BO, and the B3/B4 half runs entirely on registers.
   */
  2598. .L103:
  2599. fxcpmadd f0, B1, A1, f0
  2600. LFPDUX A1, AO, INC2
  2601. fxcpmadd f1, B1, A2, f1
  2602. LFPDUX A2, AO, INC2
  2603. fxcsmadd f2, B1, A3, f2
  2604. LFPDUX A3, AO, INC2
  2605. fxcsmadd f3, B1, A4, f3
  2606. LFPDUX A4, AO, INC2
  2607. fxcpmadd f0, B2, A5, f0
  2608. LFPDUX A5, AO, INC2
  2609. fxcpmadd f1, B2, A6, f1
  2610. LFPDUX A6, AO, INC2
  2611. fxcsmadd f2, B2, A7, f2
  2612. LFPDUX A7, AO, INC2
  2613. fxcsmadd f3, B2, A8, f3
  2614. LFPDUX A8, AO, INC2
  2615. fxcpmadd f0, B3, A1, f0
  2616. fxcpmadd f1, B3, A2, f1
  2617. fxcsmadd f2, B3, A3, f2
  2618. fxcsmadd f3, B3, A4, f3
  2619. fxcpmadd f0, B4, A5, f0
  2620. fxcpmadd f1, B4, A6, f1
  2621. fxcsmadd f2, B4, A7, f2
  2622. fxcsmadd f3, B4, A8, f3
  2623. .align 4
  /*
   * .L104: leftover-K handling for the M & 4 case. Loads AP from ALPHA(SP)
   * and sets CTR to (K & 7), or (TEMP & 7) in the TRMM build; a zero count
   * branches to .L108.
   * NOTE(review): fsmfp AP, AP presumably replicates the primary half of AP
   * into its secondary half - confirm against the ISA docs.
   */
  2624. .L104:
  2625. lfd AP, ALPHA(SP)
  2626. #ifdef TRMMKERNEL
  2627. fsmfp AP, AP
  2628. #endif
  2629. #if defined(TRMMKERNEL)
  2630. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2631. sub TEMP, K, KK
  2632. #elif defined(LEFT)
  2633. addi TEMP, KK, 4
  2634. #else
  2635. addi TEMP, KK, 1
  2636. #endif
  2637. andi. TEMP, TEMP, 7
  2638. mtspr CTR, TEMP
  2639. #else
  2640. andi. r0, K, 7
  2641. mtspr CTR, r0
  2642. #endif
  2643. ble+ .L108
  /* Preload two A registers and one B value; BO is advanced by an explicit add. */
  2644. LFPDUX A1, AO, INC2
  2645. LFDX B1, BO, INC2
  2646. LFPDUX A2, AO, INC2
  2647. add BO, BO, INC
  2648. bdz- .L107
  2649. .align 4
  /* .L106: two multiply-adds (f0, f1) per leftover iteration, with reloads. */
  2650. .L106:
  2651. fxcpmadd f0, B1, A1, f0
  2652. LFPDUX A1, AO, INC2
  2653. fxcpmadd f1, B1, A2, f1
  2654. LFDX B1, BO, INC2
  2655. LFPDUX A2, AO, INC2
  2656. add BO, BO, INC
  2657. bdnz+ .L106
  2658. .align 4
  /* .L107: last leftover iteration, no loads. */
  2659. .L107:
  2660. fxcpmadd f0, B1, A1, f0
  2661. fxcpmadd f1, B1, A2, f1
  2662. .align 4
  /*
   * .L108: write-back for the M & 4 case. f2 is added into f0 and f3 into f1
   * (fpadd), merging the accumulators used in .L102/.L103.
   * Non-TRMM build: two values are read through CO1, combined with AP by
   * fxcpmadd, and stored back (first store steps CO1 by INCM3, second by INC2).
   * TRMM build: the sums are multiplied by AP and stored directly.
   */
  2663. .L108:
  2664. #ifndef TRMMKERNEL
  2665. LFPDUX A1, CO1, INC2
  2666. LFPDUX B1, CO1, INC2
  2667. fpadd f0, f0, f2
  2668. fpadd f1, f1, f3
  2669. fxcpmadd f0, AP, f0, A1
  2670. fxcpmadd f1, AP, f1, B1
  2671. STFPDUX f0, CO1, INCM3
  2672. STFPDUX f1, CO1, INC2
  2673. #else
  2674. fpadd f0, f0, f2
  2675. fpadd f1, f1, f3
  2676. fpmul f0, AP, f0
  2677. fpmul f1, AP, f1
  2678. STFPDUX f0, CO1, INC2
  2679. STFPDUX f1, CO1, INC2
  2680. #endif
  /*
   * TRMM only: advance AO by (K - KK - n) << (2 + BASE_SHIFT) and BO by
   * (K - KK - n) << BASE_SHIFT (n = 4 with LEFT, else 1) for the
   * LEFT&&TRANSA / !LEFT&&!TRANSA variants; with LEFT, KK grows by 4.
   */
  2681. #ifdef TRMMKERNEL
  2682. #if ( defined(LEFT) && defined(TRANSA)) || \
  2683. (!defined(LEFT) && !defined(TRANSA))
  2684. sub TEMP, K, KK
  2685. #ifdef LEFT
  2686. addi TEMP, TEMP, -4
  2687. #else
  2688. addi TEMP, TEMP, -1
  2689. #endif
  2690. slwi r0, TEMP, 2 + BASE_SHIFT
  2691. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2692. add AO, AO, r0
  2693. add BO, BO, TEMP
  2694. #endif
  2695. #ifdef LEFT
  2696. addi KK, KK, 4
  2697. #endif
  2698. #endif
  /* Reload f0 from stack offset FZERO before the next section reuses it. */
  2699. li r0, FZERO
  2700. lfpsx f0, SP, r0
  2701. .align 4
  /*
   * .L110: set-up for the case where bit 1 of M is set (M & 2); otherwise
   * skip to .L120. Copies f0 into f1..f3, points BO at B - 2*SIZE (plus a
   * KK-scaled offset in some TRMM variants, which also advance AO), and
   * loads CTR with K >> 3 (TEMP >> 3 for TRMM). A count <= 0 goes to .L114.
   */
  2702. .L110:
  2703. andi. I, M, 2
  2704. beq .L120
  2705. #if defined(TRMMKERNEL)
  2706. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2707. addi BO, B, - 2 * SIZE
  2708. fpmr f1, f0
  2709. fpmr f2, f0
  2710. fpmr f3, f0
  2711. #else
  /* AO += KK << (1 + BASE_SHIFT); BO = B + (KK << BASE_SHIFT) - 2*SIZE. */
  2712. slwi TEMP, KK, 1 + BASE_SHIFT
  2713. slwi r0, KK, 0 + BASE_SHIFT
  2714. add AO, AO, TEMP
  2715. add BO, B, r0
  2716. fpmr f1, f0
  2717. addi BO, BO, - 2 * SIZE
  2718. fpmr f2, f0
  2719. fpmr f3, f0
  2720. #endif
  2721. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2722. sub TEMP, K, KK
  2723. #elif defined(LEFT)
  2724. addi TEMP, KK, 2
  2725. #else
  2726. addi TEMP, KK, 1
  2727. #endif
  2728. srawi. r0, TEMP, 3
  2729. mtspr CTR, r0
  2730. ble .L114
  2731. #else
  2732. addi BO, B, - 2 * SIZE
  2733. fpmr f1, f0
  2734. fpmr f2, f0
  2735. fpmr f3, f0
  2736. srawi. r0, K, 3
  2737. mtspr CTR, r0
  2738. ble .L114
  2739. #endif
  /* Preload A1..A8 and B1..B4 for the first pass of the unrolled loop. */
  2740. LFPDUX A1, AO, INC2
  2741. LFPDUX A2, AO, INC2
  2742. LFPDUX B1, BO, INC2
  2743. LFPDUX A3, AO, INC2
  2744. LFPDUX A4, AO, INC2
  2745. LFPDUX B2, BO, INC2
  2746. LFPDUX A5, AO, INC2
  2747. LFPDUX A6, AO, INC2
  2748. LFPDUX B3, BO, INC2
  2749. LFPDUX A7, AO, INC2
  2750. LFPDUX A8, AO, INC2
  2751. LFPDUX B4, BO, INC2
  2752. bdz- .L113
  2753. .align 4
  /*
   * .L112: unrolled main loop for the M & 2 case. Each iteration issues
   * eight multiply-adds spread over f0..f3 (an fxcpmadd/fxcsmadd pair per B
   * register), eight updating loads from AO and four from BO. The four
   * accumulators are summed later at .L118.
   */
  2754. .L112:
  2755. fxcpmadd f0, B1, A1, f0
  2756. LFPDUX A1, AO, INC2
  2757. fxcsmadd f1, B1, A2, f1
  2758. LFPDUX A2, AO, INC2
  2759. LFPDUX B1, BO, INC2
  2760. fxcpmadd f2, B2, A3, f2
  2761. LFPDUX A3, AO, INC2
  2762. fxcsmadd f3, B2, A4, f3
  2763. LFPDUX A4, AO, INC2
  2764. LFPDUX B2, BO, INC2
  2765. fxcpmadd f0, B3, A5, f0
  2766. LFPDUX A5, AO, INC2
  2767. fxcsmadd f1, B3, A6, f1
  2768. LFPDUX A6, AO, INC2
  2769. LFPDUX B3, BO, INC2
  2770. fxcpmadd f2, B4, A7, f2
  2771. LFPDUX A7, AO, INC2
  2772. fxcsmadd f3, B4, A8, f3
  2773. LFPDUX A8, AO, INC2
  2774. LFPDUX B4, BO, INC2
  2775. bdnz+ .L112
  2776. .align 4
  /* .L113: final pass - the same eight multiply-adds with no loads. */
  2777. .L113:
  2778. fxcpmadd f0, B1, A1, f0
  2779. fxcsmadd f1, B1, A2, f1
  2780. fxcpmadd f2, B2, A3, f2
  2781. fxcsmadd f3, B2, A4, f3
  2782. fxcpmadd f0, B3, A5, f0
  2783. fxcsmadd f1, B3, A6, f1
  2784. fxcpmadd f2, B4, A7, f2
  2785. fxcsmadd f3, B4, A8, f3
  2786. .align 4
  /*
   * .L114: leftover-K handling for the M & 2 case. Loads AP from ALPHA(SP)
   * and sets CTR to (K & 7), or (TEMP & 7) in the TRMM build; a zero count
   * branches to .L118.
   */
  2787. .L114:
  2788. lfd AP, ALPHA(SP)
  2789. #ifdef TRMMKERNEL
  2790. fsmfp AP, AP
  2791. #endif
  2792. #if defined(TRMMKERNEL)
  2793. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2794. sub TEMP, K, KK
  2795. #elif defined(LEFT)
  2796. addi TEMP, KK, 2
  2797. #else
  2798. addi TEMP, KK, 1
  2799. #endif
  2800. andi. TEMP, TEMP, 7
  2801. mtspr CTR, TEMP
  2802. #else
  2803. andi. r0, K, 7
  2804. mtspr CTR, r0
  2805. #endif
  2806. ble+ .L118
  /* Preload one A register and one B value; BO is advanced by an explicit add. */
  2807. LFPDUX A1, AO, INC2
  2808. LFDX B1, BO, INC2
  2809. add BO, BO, INC
  2810. bdz- .L117
  2811. .align 4
  /* .L116: one multiply-add into f0 per leftover iteration, with reloads. */
  2812. .L116:
  2813. fxcpmadd f0, B1, A1, f0
  2814. LFPDUX A1, AO, INC2
  2815. LFDX B1, BO, INC2
  2816. add BO, BO, INC
  2817. bdnz+ .L116
  2818. .align 4
  /* .L117: last leftover iteration, no loads. */
  2819. .L117:
  2820. fxcpmadd f0, B1, A1, f0
  2821. .align 4
  /*
   * .L118: write-back for the M & 2 case. f0..f3 are summed into f0 with
   * three fpadd instructions. The non-TRMM build reads one value through
   * CO1 (LFPDX) and combines it with AP and the sum by fxcpmadd; the TRMM
   * build multiplies the sum by AP. The result is placed in f1 so that f0
   * can be reloaded from stack offset FZERO before the store.
   * NOTE(review): LFPDX appears to be the non-updating load, since the store
   * that follows applies INC2 to CO1 itself - confirm the macro definition.
   */
  2822. .L118:
  2823. #ifndef TRMMKERNEL
  2824. LFPDX A1, CO1, INC2
  2825. fpadd f0, f0, f1
  2826. fpadd f2, f3, f2
  2827. fpadd f0, f0, f2
  2828. fxcpmadd f1, AP, f0, A1
  2829. li r0, FZERO
  2830. lfpsx f0, SP, r0
  2831. STFPDUX f1, CO1, INC2
  2832. #else
  2833. fpadd f0, f0, f1
  2834. fpadd f2, f3, f2
  2835. fpadd f0, f0, f2
  2836. fpmul f1, AP, f0
  2837. li r0, FZERO
  2838. lfpsx f0, SP, r0
  2839. STFPDUX f1, CO1, INC2
  2840. #endif
  /*
   * TRMM only: advance AO by (K - KK - n) << (1 + BASE_SHIFT) and BO by
   * (K - KK - n) << BASE_SHIFT (n = 2 with LEFT, else 1) for the
   * LEFT&&TRANSA / !LEFT&&!TRANSA variants; with LEFT, KK grows by 2.
   */
  2841. #ifdef TRMMKERNEL
  2842. #if ( defined(LEFT) && defined(TRANSA)) || \
  2843. (!defined(LEFT) && !defined(TRANSA))
  2844. sub TEMP, K, KK
  2845. #ifdef LEFT
  2846. addi TEMP, TEMP, -2
  2847. #else
  2848. addi TEMP, TEMP, -1
  2849. #endif
  2850. slwi r0, TEMP, 1 + BASE_SHIFT
  2851. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2852. add AO, AO, r0
  2853. add BO, BO, TEMP
  2854. #endif
  2855. #ifdef LEFT
  2856. addi KK, KK, 2
  2857. #endif
  2858. #endif
  2859. .align 4
  /*
   * .L120: set-up for the case where bit 0 of M is set (M & 1); otherwise
   * jump to the epilogue at .L999. Copies f0 into f1..f3, points BO at
   * B - 2*SIZE (plus a KK-scaled offset in some TRMM variants, which also
   * advance AO), and loads CTR with K >> 3 (TEMP >> 3 for TRMM). A count
   * <= 0 goes to .L124.
   */
  2860. .L120:
  2861. andi. I, M, 1
  2862. beq .L999
  2863. #if defined(TRMMKERNEL)
  2864. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2865. addi BO, B, - 2 * SIZE
  2866. fpmr f1, f0
  2867. fpmr f2, f0
  2868. fpmr f3, f0
  2869. #else
  /* AO and BO are both offset by KK << BASE_SHIFT in this variant. */
  2870. slwi TEMP, KK, 0 + BASE_SHIFT
  2871. slwi r0, KK, 0 + BASE_SHIFT
  2872. add AO, AO, TEMP
  2873. add BO, B, r0
  2874. fpmr f1, f0
  2875. addi BO, BO, - 2 * SIZE
  2876. fpmr f2, f0
  2877. fpmr f3, f0
  2878. #endif
  /* The #elif and #else arms below are identical (both add 1 to KK). */
  2879. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2880. sub TEMP, K, KK
  2881. #elif defined(LEFT)
  2882. addi TEMP, KK, 1
  2883. #else
  2884. addi TEMP, KK, 1
  2885. #endif
  2886. srawi. r0, TEMP, 3
  2887. mtspr CTR, r0
  2888. ble .L124
  2889. #else
  2890. addi BO, B, - 2 * SIZE
  2891. fpmr f1, f0
  2892. fpmr f2, f0
  2893. fpmr f3, f0
  2894. srawi. r0, K, 3
  2895. mtspr CTR, r0
  2896. ble .L124
  2897. #endif
  /* Preload A1..A4 and B1..B4 for the first pass of the unrolled loop. */
  2898. LFPDUX A1, AO, INC2
  2899. LFPDUX B1, BO, INC2
  2900. LFPDUX A2, AO, INC2
  2901. LFPDUX B2, BO, INC2
  2902. LFPDUX A3, AO, INC2
  2903. LFPDUX B3, BO, INC2
  2904. LFPDUX A4, AO, INC2
  2905. LFPDUX B4, BO, INC2
  2906. bdz- .L123
  2907. .align 4
  /*
   * .L122: unrolled main loop for the M & 1 case. Each iteration issues four
   * fpmadd instructions (A1*B1 -> f0 ... A4*B4 -> f3) and reloads all four
   * A and four B registers. The partial sums are combined at .L128.
   */
  2908. .L122:
  2909. fpmadd f0, A1, B1, f0
  2910. LFPDUX A1, AO, INC2
  2911. LFPDUX B1, BO, INC2
  2912. fpmadd f1, A2, B2, f1
  2913. LFPDUX A2, AO, INC2
  2914. LFPDUX B2, BO, INC2
  2915. fpmadd f2, A3, B3, f2
  2916. LFPDUX A3, AO, INC2
  2917. LFPDUX B3, BO, INC2
  2918. fpmadd f3, A4, B4, f3
  2919. LFPDUX A4, AO, INC2
  2920. LFPDUX B4, BO, INC2
  2921. bdnz+ .L122
  2922. .align 4
  /* .L123: final pass - the same four fpmadd instructions with no loads. */
  2923. .L123:
  2924. fpmadd f0, A1, B1, f0
  2925. fpmadd f1, A2, B2, f1
  2926. fpmadd f2, A3, B3, f2
  2927. fpmadd f3, A4, B4, f3
  2928. .align 4
  /*
   * .L124: leftover-K handling for the M & 1 case. Loads AP from ALPHA(SP)
   * and sets CTR to (K & 7), or (TEMP & 7) in the TRMM build; a zero count
   * branches to .L128. The leftover loop uses the scalar fmadd instruction.
   */
  2929. .L124:
  2930. lfd AP, ALPHA(SP)
  2931. #ifdef TRMMKERNEL
  2932. fsmfp AP, AP
  2933. #endif
  2934. #if defined(TRMMKERNEL)
  2935. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2936. sub TEMP, K, KK
  2937. #elif defined(LEFT)
  2938. addi TEMP, KK, 1
  2939. #else
  2940. addi TEMP, KK, 1
  2941. #endif
  2942. andi. TEMP, TEMP, 7
  2943. mtspr CTR, TEMP
  2944. #else
  2945. andi. r0, K, 7
  2946. mtspr CTR, r0
  2947. #endif
  2948. ble+ .L128
  /* Preload one A and one B value; both pointers are advanced by explicit adds. */
  2949. LFDX A1, AO, INC2
  2950. LFDX B1, BO, INC2
  2951. add AO, AO, INC
  2952. add BO, BO, INC
  2953. bdz- .L127
  2954. .align 4
  /* .L126: f0 = A1 * B1 + f0 per leftover iteration, then reload A1 and B1. */
  2955. .L126:
  2956. fmadd f0, A1, B1, f0
  2957. LFDX A1, AO, INC2
  2958. LFDX B1, BO, INC2
  2959. add AO, AO, INC
  2960. add BO, BO, INC
  2961. bdnz+ .L126
  2962. .align 4
  /* .L127: last leftover iteration, no loads. */
  2963. .L127:
  2964. fmadd f0, A1, B1, f0
  2965. .align 4
  /*
   * .L128: write-back for the M & 1 case. f0..f3 are summed into f0 with
   * three fpadd instructions; fsmtp + fadd then fold f0 once more into a
   * single value. The non-TRMM build computes f0 = AP * f0 + A1 with the
   * value read through CO1; the TRMM build multiplies by AP. The result is
   * stored with STFDUX through CO1, after which control falls into .L999.
   * NOTE(review): fsmtp f1, f0 presumably moves the secondary half of f0
   * into the primary half of f1 so fadd sums both halves - confirm.
   */
  2966. .L128:
  2967. #ifndef TRMMKERNEL
  2968. LFDX A1, CO1, INC2
  2969. fpadd f0, f0, f1
  2970. fpadd f2, f2, f3
  2971. fpadd f0, f0, f2
  2972. fsmtp f1, f0
  2973. fadd f0, f0, f1
  2974. fmadd f0, AP, f0, A1
  2975. #else
  2976. fpadd f0, f0, f1
  2977. fpadd f2, f2, f3
  2978. fpadd f0, f0, f2
  2979. fsmtp f1, f0
  2980. fadd f0, f0, f1
  2981. fpmul f0, AP, f0
  2982. #endif
  2983. STFDUX f0, CO1, INC2
  2984. .align 4
  /*
   * .L999: epilogue. Restores r14..r31 and f14..f31 from the stack and
   * returns with blr.
   * Integer registers: SP is bumped by 12, then eighteen lwzu instructions
   * each advance SP by 4 and load one register (r14 first, r31 last); SP is
   * then moved back by 12.
   * Floating-point registers: eighteen lfpdux instructions each advance SP
   * by 16 (held in r0) and load one register, f31 first and f14 last. A
   * final addi steps SP by another 16.
   * NOTE(review): this must mirror the save sequence in the prologue, which
   * is outside this view.
   */
  2985. .L999:
  2986. addi SP, SP, 12
  2987. lwzu r14, 4(SP)
  2988. lwzu r15, 4(SP)
  2989. lwzu r16, 4(SP)
  2990. lwzu r17, 4(SP)
  2991. lwzu r18, 4(SP)
  2992. lwzu r19, 4(SP)
  2993. lwzu r20, 4(SP)
  2994. lwzu r21, 4(SP)
  2995. lwzu r22, 4(SP)
  2996. lwzu r23, 4(SP)
  2997. lwzu r24, 4(SP)
  2998. lwzu r25, 4(SP)
  2999. lwzu r26, 4(SP)
  3000. lwzu r27, 4(SP)
  3001. lwzu r28, 4(SP)
  3002. lwzu r29, 4(SP)
  3003. lwzu r30, 4(SP)
  3004. lwzu r31, 4(SP)
  3005. subi SP, SP, 12
  3006. li r0, 16
  3007. lfpdux f31, SP, r0
  3008. lfpdux f30, SP, r0
  3009. lfpdux f29, SP, r0
  3010. lfpdux f28, SP, r0
  3011. lfpdux f27, SP, r0
  3012. lfpdux f26, SP, r0
  3013. lfpdux f25, SP, r0
  3014. lfpdux f24, SP, r0
  3015. lfpdux f23, SP, r0
  3016. lfpdux f22, SP, r0
  3017. lfpdux f21, SP, r0
  3018. lfpdux f20, SP, r0
  3019. lfpdux f19, SP, r0
  3020. lfpdux f18, SP, r0
  3021. lfpdux f17, SP, r0
  3022. lfpdux f16, SP, r0
  3023. lfpdux f15, SP, r0
  3024. lfpdux f14, SP, r0
  3025. addi SP, SP, 16
  3026. blr
  3027. .align 4
  /*
   * .L1000: entry of a second code path, placed after the blr above; the
   * branch that reaches it is outside this view.
   * Sets the negative stride registers INCM1/INCM3/INCM5/INCM7 to
   * -1/-3/-5/-7 elements, moves C back by one element, and sets J = N >> 2.
   * If J <= 0 the four-column loop is skipped (branch to .L1050).
   * NOTE(review): the stores in this path use separate STFDUX/STFSDUX
   * half-stores, so this is presumably the path for a C that is not
   * pair-aligned - confirm at the dispatch site.
   */
  3028. .L1000:
  3029. li INCM1, -1 * SIZE
  3030. li INCM3, -3 * SIZE
  3031. li INCM5, -5 * SIZE
  3032. li INCM7, -7 * SIZE
  3033. addi C, C, - 1 * SIZE
  3034. srawi. J, N, 2
  3035. ble .L1050
  3036. .align 4
  /*
   * .L1010: top of the J loop body. Derives four output pointers
   * (CO1 = C, CO2 = C + LDC, CO3 = C + 2*LDC, CO4 = C + 3*LDC) and advances
   * C by 4*LDC. For TRMM with LEFT, KK is reset to OFFSET. AO is set to
   * A - 4*SIZE, f0 is loaded from stack offset FZERO, and I = M >> 3; if
   * I <= 0 the eight-row loop is skipped (branch to .L1020).
   */
  3037. .L1010:
  3038. mr CO1, C
  3039. add CO2, C, LDC
  3040. add CO3, CO2, LDC
  3041. add CO4, CO3, LDC
  3042. add C, CO4, LDC
  3043. #if defined(TRMMKERNEL) && defined(LEFT)
  3044. mr KK, OFFSET
  3045. #endif
  3046. addi AO, A, -4 * SIZE
  3047. li r0, FZERO
  3048. lfpsx f0, SP, r0
  3049. srawi. I, M, 3
  3050. ble .L1020
  3051. .align 4
  /*
   * .L1011: set-up for one pass of the I loop (I = M >> 3) with sixteen
   * accumulators f0..f15. Two read pointers are kept per operand:
   * AO2 = AO + 2*SIZE, BO = B - 4*SIZE and BO2 = B - 2*SIZE (with extra
   * KK-scaled offsets in some TRMM variants). CTR gets the 4-way unrolled
   * trip count K >> 2 (TEMP >> 2 for TRMM); a count <= 0 goes to .L1014
   * with only f4, f8, f12 and f1 copied from f0 so far.
   */
  3052. .L1011:
  3053. #if defined(TRMMKERNEL)
  3054. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  3055. addi AO2, AO, 2 * SIZE
  3056. fpmr f4, f0
  3057. addi BO, B, - 4 * SIZE
  3058. fpmr f8, f0
  3059. addi BO2, B, - 2 * SIZE
  3060. fpmr f12, f0
  3061. #else
  /* AO += KK << (3 + BASE_SHIFT); BO = B + (KK << (2 + BASE_SHIFT)) - 4*SIZE. */
  3062. slwi TEMP, KK, 3 + BASE_SHIFT
  3063. slwi r0, KK, 2 + BASE_SHIFT
  3064. add AO, AO, TEMP
  3065. add BO, B, r0
  3066. addi AO2, AO, 2 * SIZE
  3067. fpmr f4, f0
  3068. addi BO, BO, - 4 * SIZE
  3069. fpmr f8, f0
  3070. addi BO2, BO, 2 * SIZE
  3071. fpmr f12, f0
  3072. #endif
  3073. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3074. sub TEMP, K, KK
  3075. #elif defined(LEFT)
  3076. addi TEMP, KK, 8
  3077. #else
  3078. addi TEMP, KK, 4
  3079. #endif
  3080. srawi. TEMP, TEMP, 2
  3081. fpmr f1, f0
  3082. mtspr CTR, TEMP
  3083. ble .L1014
  3084. #else
  3085. addi AO2, AO, 2 * SIZE
  3086. fpmr f4, f0
  3087. addi BO, B, - 4 * SIZE
  3088. fpmr f8, f0
  3089. addi BO2, B, - 2 * SIZE
  3090. fpmr f12, f0
  3091. srawi. r0, K, 2
  3092. fpmr f1, f0
  3093. mtspr CTR, r0
  3094. ble .L1014
  3095. #endif
  /*
   * Preload the first operands while copying f0 into the eleven remaining
   * accumulators (f5, f9, f13, f2, f6, f10, f14, f3, f7, f11, f15).
   */
  3096. LFPDUX A1, AO, INC4
  3097. fpmr f5, f0
  3098. LFPDUX A3, AO, INC4
  3099. fpmr f9, f0
  3100. LFPDUX B1, BO, INC4
  3101. fpmr f13, f0
  3102. LFPDUX A5, AO, INC4
  3103. fpmr f2, f0
  3104. LFPDUX A6, AO, INC4
  3105. fpmr f6, f0
  3106. LFPDUX B3, BO, INC4
  3107. fpmr f10, f0
  3108. LFPDUX A7, AO, INC4
  3109. fpmr f14, f0
  3110. LFPDUX A8, AO, INC4
  3111. fpmr f3, f0
  3112. LFPDUX B5, BO, INC4
  3113. fpmr f7, f0
  3114. LFPDUX A9, AO, INC4
  3115. fpmr f11, f0
  3116. LFPDUX A2, AO2, INC4
  3117. fpmr f15, f0
  3118. LFPDUX B2, BO2, INC4
  /* A trip count of one goes directly to the final pass at .L1013. */
  3119. bdz- .L1013
  3120. .align 4
  /*
   * .L1012: unrolled main loop for the sixteen-accumulator case. The body is
   * four sections ("## 1 ##" .. "## 4 ##"), each issuing sixteen
   * multiply-adds: every accumulator f0..f15 is updated once per section.
   * Within a section the accumulator groups f0-f3, f4-f7, f8-f11 and f12-f15
   * pair fxcpmadd/fxcsmadd with one of two B registers.
   * Loads for later sections (through AO, AO2, BO, BO2 with stride INC4) are
   * slotted between the arithmetic; the remaining slots are filled with nop.
   * NOTE(review): the nop placement is presumably issue-slot scheduling for
   * the target core - do not reorder without measuring.
   */
  3121. .L1012:
  3122. ## 1 ##
  3123. fxcpmadd f0, B1, A1, f0
  3124. nop
  3125. fxcsmadd f4, B1, A1, f4
  3126. nop
  3127. fxcpmadd f8, B2, A1, f8
  3128. LFPDUX B4, BO2, INC4
  3129. fxcsmadd f12, B2, A1, f12
  3130. LFPDUX B6, BO, INC4
  3131. fxcpmadd f1, B1, A2, f1
  3132. nop
  3133. fxcsmadd f5, B1, A2, f5
  3134. LFPDUX A4, AO2, INC4
  3135. fxcpmadd f9, B2, A2, f9
  3136. LFPDUX A10, AO, INC4
  3137. fxcsmadd f13, B2, A2, f13
  3138. nop
  3139. fxcpmadd f2, B1, A3, f2
  3140. nop
  3141. fxcsmadd f6, B1, A3, f6
  3142. nop
  3143. fxcpmadd f10, B2, A3, f10
  3144. nop
  3145. fxcsmadd f14, B2, A3, f14
  3146. nop
  3147. fxcpmadd f3, B1, A4, f3
  3148. nop
  3149. fxcsmadd f7, B1, A4, f7
  3150. LFPDUX A2, AO2, INC4
  3151. fxcpmadd f11, B2, A4, f11
  3152. LFPDUX A1, AO, INC4
  3153. fxcsmadd f15, B2, A4, f15
  3154. nop
  3155. ## 2 ##
  3156. fxcpmadd f0, B3, A5, f0
  3157. nop
  3158. fxcsmadd f4, B3, A5, f4
  3159. nop
  3160. fxcpmadd f8, B4, A5, f8
  3161. LFPDUX B2, BO2, INC4
  3162. fxcsmadd f12, B4, A5, f12
  3163. LFPDUX B1, BO, INC4
  3164. fxcpmadd f1, B3, A2, f1
  3165. nop
  3166. fxcsmadd f5, B3, A2, f5
  3167. LFPDUX A4, AO2, INC4
  3168. fxcpmadd f9, B4, A2, f9
  3169. LFPDUX A3, AO, INC4
  3170. fxcsmadd f13, B4, A2, f13
  3171. nop
  3172. fxcpmadd f2, B3, A6, f2
  3173. nop
  3174. fxcsmadd f6, B3, A6, f6
  3175. nop
  3176. fxcpmadd f10, B4, A6, f10
  3177. nop
  3178. fxcsmadd f14, B4, A6, f14
  3179. nop
  3180. fxcpmadd f3, B3, A4, f3
  3181. nop
  3182. fxcsmadd f7, B3, A4, f7
  3183. LFPDUX A2, AO2, INC4
  3184. fxcpmadd f11, B4, A4, f11
  3185. LFPDUX A5, AO, INC4
  3186. fxcsmadd f15, B4, A4, f15
  3187. nop
  3188. ## 3 ##
  3189. fxcpmadd f0, B5, A7, f0
  3190. nop
  3191. fxcsmadd f4, B5, A7, f4
  3192. nop
  3193. fxcpmadd f8, B2, A7, f8
  3194. LFPDUX B4, BO2, INC4
  3195. fxcsmadd f12, B2, A7, f12
  3196. LFPDUX B3, BO, INC4
  3197. fxcpmadd f1, B5, A2, f1
  3198. nop
  3199. fxcsmadd f5, B5, A2, f5
  3200. LFPDUX A4, AO2, INC4
  3201. fxcpmadd f9, B2, A2, f9
  3202. LFPDUX A6, AO, INC4
  3203. fxcsmadd f13, B2, A2, f13
  3204. nop
  3205. fxcpmadd f2, B5, A8, f2
  3206. nop
  3207. fxcsmadd f6, B5, A8, f6
  3208. nop
  3209. fxcpmadd f10, B2, A8, f10
  3210. nop
  3211. fxcsmadd f14, B2, A8, f14
  3212. nop
  3213. fxcpmadd f3, B5, A4, f3
  3214. nop
  3215. fxcsmadd f7, B5, A4, f7
  3216. LFPDUX A2, AO2, INC4
  3217. fxcpmadd f11, B2, A4, f11
  3218. LFPDUX A7, AO, INC4
  3219. fxcsmadd f15, B2, A4, f15
  3220. nop
  3221. ## 4 ##
  3222. fxcpmadd f0, B6, A9, f0
  3223. nop
  3224. fxcsmadd f4, B6, A9, f4
  3225. nop
  3226. fxcpmadd f8, B4, A9, f8
  3227. LFPDUX B2, BO2, INC4
  3228. fxcsmadd f12, B4, A9, f12
  3229. LFPDUX B5, BO, INC4
  3230. fxcpmadd f1, B6, A2, f1
  3231. nop
  3232. fxcsmadd f5, B6, A2, f5
  3233. LFPDUX A4, AO2, INC4
  3234. fxcpmadd f9, B4, A2, f9
  3235. LFPDUX A8, AO, INC4
  3236. fxcsmadd f13, B4, A2, f13
  3237. nop
  3238. fxcpmadd f2, B6, A10, f2
  3239. nop
  3240. fxcsmadd f6, B6, A10, f6
  3241. nop
  3242. fxcpmadd f10, B4, A10, f10
  3243. nop
  3244. fxcsmadd f14, B4, A10, f14
  3245. nop
  3246. fxcpmadd f3, B6, A4, f3
  3247. LFPDUX A2, AO2, INC4
  3248. fxcsmadd f7, B6, A4, f7
  3249. LFPDUX A9, AO, INC4
  3250. fxcpmadd f11, B4, A4, f11
  3251. nop
  3252. fxcsmadd f15, B4, A4, f15
  3253. bdnz+ .L1012
  3254. .align 4
  /*
   * .L1013: final pass of the .L1012 loop. The arithmetic is the same four
   * sections of sixteen multiply-adds, but the loads that would fetch
   * operands for a following iteration are dropped.
   * In the non-TRMM build several of the freed slots instead start reading
   * output values through CO1 and CO2 (LFDUX / LFSDUX into A1, B1, A3, A5,
   * B3, A6, A7, B2), reusing registers once their last use as operands in
   * this pass is over. The TRMM build keeps a nop in each of those slots.
   * NOTE(review): LFDUX and LFSDUX presumably fill the primary and secondary
   * half of the target register respectively - confirm the macro definitions.
   */
  3255. .L1013:
  3256. ## 1 ##
  3257. fxcpmadd f0, B1, A1, f0
  3258. nop
  3259. fxcsmadd f4, B1, A1, f4
  3260. nop
  3261. fxcpmadd f8, B2, A1, f8
  3262. LFPDUX B4, BO2, INC4
  3263. fxcsmadd f12, B2, A1, f12
  3264. LFPDUX B6, BO, INC4
  3265. fxcpmadd f1, B1, A2, f1
  3266. nop
  3267. fxcsmadd f5, B1, A2, f5
  3268. LFPDUX A4, AO2, INC4
  3269. fxcpmadd f9, B2, A2, f9
  3270. LFPDUX A10, AO, INC4
  3271. fxcsmadd f13, B2, A2, f13
  3272. nop
  3273. fxcpmadd f2, B1, A3, f2
  3274. nop
  3275. fxcsmadd f6, B1, A3, f6
  3276. nop
  3277. fxcpmadd f10, B2, A3, f10
  3278. nop
  3279. fxcsmadd f14, B2, A3, f14
  3280. nop
  3281. fxcpmadd f3, B1, A4, f3
  3282. nop
  3283. fxcsmadd f7, B1, A4, f7
  3284. LFPDUX A2, AO2, INC4
  3285. fxcpmadd f11, B2, A4, f11
  3286. #ifndef TRMMKERNEL
  3287. LFDUX A1, CO1, INC
  3288. #else
  3289. nop
  3290. #endif
  3291. fxcsmadd f15, B2, A4, f15
  3292. nop
  3293. ## 2 ##
  3294. fxcpmadd f0, B3, A5, f0
  3295. nop
  3296. fxcsmadd f4, B3, A5, f4
  3297. nop
  3298. fxcpmadd f8, B4, A5, f8
  3299. LFPDUX B2, BO2, INC4
  3300. fxcsmadd f12, B4, A5, f12
  3301. #ifndef TRMMKERNEL
  3302. LFDUX B1, CO1, INC2
  3303. #else
  3304. nop
  3305. #endif
  3306. fxcpmadd f1, B3, A2, f1
  3307. nop
  3308. fxcsmadd f5, B3, A2, f5
  3309. LFPDUX A4, AO2, INC4
  3310. fxcpmadd f9, B4, A2, f9
  3311. #ifndef TRMMKERNEL
  3312. LFDUX A3, CO1, INC2
  3313. #else
  3314. nop
  3315. #endif
  3316. fxcsmadd f13, B4, A2, f13
  3317. nop
  3318. fxcpmadd f2, B3, A6, f2
  3319. nop
  3320. fxcsmadd f6, B3, A6, f6
  3321. nop
  3322. fxcpmadd f10, B4, A6, f10
  3323. nop
  3324. fxcsmadd f14, B4, A6, f14
  3325. nop
  3326. fxcpmadd f3, B3, A4, f3
  3327. nop
  3328. fxcsmadd f7, B3, A4, f7
  3329. LFPDUX A2, AO2, INC4
  3330. fxcpmadd f11, B4, A4, f11
  3331. #ifndef TRMMKERNEL
  3332. LFDUX A5, CO1, INC2
  3333. #else
  3334. nop
  3335. #endif
  3336. fxcsmadd f15, B4, A4, f15
  3337. nop
  3338. ## 3 ##
  3339. fxcpmadd f0, B5, A7, f0
  3340. nop
  3341. fxcsmadd f4, B5, A7, f4
  3342. nop
  3343. fxcpmadd f8, B2, A7, f8
  3344. LFPDUX B4, BO2, INC4
  3345. fxcsmadd f12, B2, A7, f12
  3346. #ifndef TRMMKERNEL
  3347. LFSDUX A1, CO1, INCM5
  3348. #else
  3349. nop
  3350. #endif
  3351. fxcpmadd f1, B5, A2, f1
  3352. nop
  3353. fxcsmadd f5, B5, A2, f5
  3354. LFPDUX A4, AO2, INC4
  3355. fxcpmadd f9, B2, A2, f9
  3356. #ifndef TRMMKERNEL
  3357. LFSDUX B1, CO1, INC2
  3358. #else
  3359. nop
  3360. #endif
  3361. fxcsmadd f13, B2, A2, f13
  3362. nop
  3363. fxcpmadd f2, B5, A8, f2
  3364. nop
  3365. fxcsmadd f6, B5, A8, f6
  3366. nop
  3367. fxcpmadd f10, B2, A8, f10
  3368. nop
  3369. fxcsmadd f14, B2, A8, f14
  3370. nop
  3371. fxcpmadd f3, B5, A4, f3
  3372. nop
  3373. fxcsmadd f7, B5, A4, f7
  3374. LFPDUX A2, AO2, INC4
  3375. fxcpmadd f11, B2, A4, f11
  3376. #ifndef TRMMKERNEL
  3377. LFSDUX A3, CO1, INC2
  3378. #else
  3379. nop
  3380. #endif
  3381. fxcsmadd f15, B2, A4, f15
  3382. nop
  3383. ## 4 ##
  3384. fxcpmadd f0, B6, A9, f0
  3385. nop
  3386. fxcsmadd f4, B6, A9, f4
  3387. nop
  3388. fxcpmadd f8, B4, A9, f8
  3389. #ifndef TRMMKERNEL
  3390. LFSDUX A5, CO1, INC2
  3391. #else
  3392. nop
  3393. #endif
  3394. fxcsmadd f12, B4, A9, f12
  3395. #ifndef TRMMKERNEL
  3396. LFDUX B3, CO2, INC
  3397. #else
  3398. nop
  3399. #endif
  3400. fxcpmadd f1, B6, A2, f1
  3401. nop
  3402. fxcsmadd f5, B6, A2, f5
  3403. LFPDUX A4, AO2, INC4
  3404. fxcpmadd f9, B4, A2, f9
  3405. #ifndef TRMMKERNEL
  3406. LFDUX A6, CO2, INC2
  3407. #else
  3408. nop
  3409. #endif
  3410. fxcsmadd f13, B4, A2, f13
  3411. nop
  3412. fxcpmadd f2, B6, A10, f2
  3413. nop
  3414. fxcsmadd f6, B6, A10, f6
  3415. nop
  3416. fxcpmadd f10, B4, A10, f10
  3417. nop
  3418. fxcsmadd f14, B4, A10, f14
  3419. #ifndef TRMMKERNEL
  3420. LFDUX A7, CO2, INC2
  3421. #else
  3422. nop
  3423. #endif
  3424. fxcpmadd f3, B6, A4, f3
  3425. nop
  3426. fxcsmadd f7, B6, A4, f7
  3427. nop
  3428. fxcpmadd f11, B4, A4, f11
  3429. nop
  3430. fxcsmadd f15, B4, A4, f15
  3431. #ifndef TRMMKERNEL
  3432. LFDUX B2, CO2, INC2
  3433. #else
  3434. nop
  3435. #endif
  3436. .align 4
  /*
   * .L1014: leftover-K dispatch for the sixteen-accumulator case.
   * Loads AP from ALPHA(SP) and sets CTR to (K & 3), or (TEMP & 3) for TRMM.
   * - Zero leftover count: branch to the write-back at .L1018.
   * - Count source (K or TEMP) greater than 3: the unrolled loop and .L1013
   *   have already run, so branch straight to the leftover loop at .L1015.
   * - Otherwise the unrolled loop was skipped: copy f0 into the eleven
   *   accumulators that .L1011 had not yet initialised and, in the non-TRMM
   *   build, issue the same CO1/CO2 loads that .L1013 would have performed.
   */
  3437. .L1014:
  3438. lfd AP, ALPHA(SP)
  3439. #ifdef TRMMKERNEL
  3440. fsmfp AP, AP
  3441. #endif
  3442. #if defined(TRMMKERNEL)
  3443. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3444. sub TEMP, K, KK
  3445. #elif defined(LEFT)
  3446. addi TEMP, KK, 8
  3447. #else
  3448. addi TEMP, KK, 4
  3449. #endif
  3450. andi. r0, TEMP, 3
  3451. mtspr CTR, r0
  3452. ble+ .L1018
  3453. cmpwi cr0, TEMP, 3
  3454. bgt+ .L1015
  3455. #else
  3456. andi. r0, K, 3
  3457. mtspr CTR, r0
  3458. ble+ .L1018
  3459. cmpwi cr0, K, 3
  3460. bgt+ .L1015
  3461. #endif
  3462. #ifndef TRMMKERNEL
  3463. LFDUX A1, CO1, INC
  3464. fpmr f5, f0
  3465. LFDUX B1, CO1, INC2
  3466. fpmr f9, f0
  3467. LFDUX A3, CO1, INC2
  3468. fpmr f13, f0
  3469. LFDUX A5, CO1, INC2
  3470. fpmr f2, f0
  3471. LFSDUX A1, CO1, INCM5
  3472. fpmr f6, f0
  3473. LFSDUX B1, CO1, INC2
  3474. fpmr f10, f0
  3475. LFSDUX A3, CO1, INC2
  3476. fpmr f14, f0
  3477. LFSDUX A5, CO1, INC2
  3478. fpmr f3, f0
  3479. LFDUX B3, CO2, INC
  3480. fpmr f7, f0
  3481. LFDUX A6, CO2, INC2
  3482. fpmr f11, f0
  3483. LFDUX A7, CO2, INC2
  3484. fpmr f15, f0
  3485. LFDUX B2, CO2, INC2
  3486. #else
  3487. fpmr f5, f0
  3488. fpmr f9, f0
  3489. fpmr f13, f0
  3490. fpmr f2, f0
  3491. fpmr f6, f0
  3492. fpmr f10, f0
  3493. fpmr f14, f0
  3494. fpmr f3, f0
  3495. fpmr f7, f0
  3496. fpmr f11, f0
  3497. fpmr f15, f0
  3498. nop
  3499. #endif
  3500. .align 4
  /*
   * .L1015: leftover-K loop for the sixteen-accumulator case (CTR was set at
   * .L1014). Only A2/A4 (through AO/AO2) and A10/B4 (through BO/BO2) are
   * used as operand registers here; A10 and B4 act as the B-side operands.
   * Other registers are left alone - in the non-TRMM build several of them
   * already hold values read through CO1/CO2.
   */
  3501. .L1015:
  3502. LFPDUX A2, AO, INC4
  3503. LFPDUX A4, AO2, INC4
  3504. LFPDUX A10, BO, INC4
  3505. LFPDUX B4, BO2, INC4
  3506. bdz- .L1017
  3507. .align 4
  /* .L1016: sixteen multiply-adds per iteration, reloading operands as they are used up. */
  3508. .L1016:
  3509. fxcpmadd f0, A10, A2, f0
  3510. fxcsmadd f4, A10, A2, f4
  3511. fxcpmadd f8, B4, A2, f8
  3512. fxcsmadd f12, B4, A2, f12
  3513. LFPDUX A2, AO, INC4
  3514. fxcpmadd f1, A10, A4, f1
  3515. fxcsmadd f5, A10, A4, f5
  3516. fxcpmadd f9, B4, A4, f9
  3517. fxcsmadd f13, B4, A4, f13
  3518. LFPDUX A4, AO2, INC4
  3519. fxcpmadd f2, A10, A2, f2
  3520. fxcsmadd f6, A10, A2, f6
  3521. fxcpmadd f10, B4, A2, f10
  3522. fxcsmadd f14, B4, A2, f14
  3523. LFPDUX A2, AO, INC4
  3524. fxcpmadd f3, A10, A4, f3
  3525. fxcsmadd f7, A10, A4, f7
  3526. LFPDUX A10, BO, INC4
  3527. fxcpmadd f11, B4, A4, f11
  3528. fxcsmadd f15, B4, A4, f15
  3529. LFPDUX A4, AO2, INC4
  3530. LFPDUX B4, BO2, INC4
  3531. bdnz+ .L1016
  3532. .align 4
  /*
   * .L1017: last leftover iteration. A2 and A4 are each reloaded once for
   * the second half of this pass; nothing is loaded for a following pass.
   */
  3533. .L1017:
  3534. fxcpmadd f0, A10, A2, f0
  3535. fxcsmadd f4, A10, A2, f4
  3536. fxcpmadd f8, B4, A2, f8
  3537. fxcsmadd f12, B4, A2, f12
  3538. LFPDUX A2, AO, INC4
  3539. fxcpmadd f1, A10, A4, f1
  3540. fxcsmadd f5, A10, A4, f5
  3541. fxcpmadd f9, B4, A4, f9
  3542. fxcsmadd f13, B4, A4, f13
  3543. LFPDUX A4, AO2, INC4
  3544. fxcpmadd f2, A10, A2, f2
  3545. fxcsmadd f6, A10, A2, f6
  3546. fxcpmadd f10, B4, A2, f10
  3547. fxcsmadd f14, B4, A2, f14
  3548. fxcpmadd f3, A10, A4, f3
  3549. fxcsmadd f7, A10, A4, f7
  3550. fxcpmadd f11, B4, A4, f11
  3551. fxcsmadd f15, B4, A4, f15
  3552. .align 4
  /*
   * .L1018: write-back of the sixteen accumulators through CO1..CO4, then
   * advance to the next I iteration.
   * Non-TRMM build: finishes reading the existing output values (the CO1 and
   * first CO2 loads were issued earlier, at .L1013 or .L1014), combines each
   * accumulator with AP and the loaded value by fxcpmadd, and stores every
   * accumulator as an STFDUX/STFSDUX pair. The first store on each output
   * pointer steps it by INCM7; the rest step by INC.
   * TRMM build: each accumulator is multiplied by AP (fpmul) and stored with
   * the same STFDUX/STFSDUX pairs, without reading the output first.
   * The stores from "STFSDUX f12" onward are shared by both builds.
   * NOTE(review): STFDUX/STFSDUX presumably store the primary/secondary half
   * of the register - confirm the macro definitions.
   */
  3553. .L1018:
  3554. #ifndef TRMMKERNEL
  3555. LFSDUX B3, CO2, INCM5
  3556. LFSDUX A6, CO2, INC2
  3557. LFSDUX A7, CO2, INC2
  3558. LFSDUX B2, CO2, INC2
  3559. LFDUX B5, CO3, INC
  3560. LFDUX A8, CO3, INC2
  3561. LFDUX A9, CO3, INC2
  3562. LFDUX B4, CO3, INC2
  3563. LFSDUX B5, CO3, INCM5
  3564. LFSDUX A8, CO3, INC2
  3565. LFSDUX A9, CO3, INC2
  3566. LFSDUX B4, CO3, INC2
  3567. LFDUX A2, CO4, INC
  3568. LFDUX A4, CO4, INC2
  3569. fxcpmadd f0, AP, f0, A1
  3570. LFDUX A10, CO4, INC2
  /* A1 is reused for a CO4 value once f0 has consumed its CO1 value. */
  3571. LFDUX A1, CO4, INC2
  3572. fxcpmadd f1, AP, f1, B1
  3573. LFSDUX A2, CO4, INCM5
  3574. LFSDUX A4, CO4, INC2
  3575. fxcpmadd f2, AP, f2, A3
  3576. LFSDUX A10, CO4, INC2
  3577. LFSDUX A1, CO4, INC2
  3578. fxcpmadd f3, AP, f3, A5
  3579. STFDUX f0, CO1, INCM7
  3580. STFSDUX f0, CO1, INC
  3581. fxcpmadd f4, AP, f4, B3
  3582. STFDUX f1, CO1, INC
  3583. STFSDUX f1, CO1, INC
  3584. fxcpmadd f5, AP, f5, A6
  3585. STFDUX f2, CO1, INC
  3586. STFSDUX f2, CO1, INC
  3587. fxcpmadd f6, AP, f6, A7
  3588. STFDUX f3, CO1, INC
  3589. STFSDUX f3, CO1, INC
  3590. fxcpmadd f7, AP, f7, B2
  3591. STFDUX f4, CO2, INCM7
  3592. STFSDUX f4, CO2, INC
  3593. fxcpmadd f8, AP, f8, B5
  3594. STFDUX f5, CO2, INC
  3595. STFSDUX f5, CO2, INC
  3596. fxcpmadd f9, AP, f9, A8
  3597. STFDUX f6, CO2, INC
  3598. STFSDUX f6, CO2, INC
  3599. fxcpmadd f10, AP, f10, A9
  3600. STFDUX f7, CO2, INC
  3601. STFSDUX f7, CO2, INC
  3602. fxcpmadd f11, AP, f11, B4
  3603. STFDUX f8, CO3, INCM7
  3604. STFSDUX f8, CO3, INC
  3605. fxcpmadd f12, AP, f12, A2
  3606. STFDUX f9, CO3, INC
  3607. STFSDUX f9, CO3, INC
  3608. fxcpmadd f13, AP, f13, A4
  3609. STFDUX f10, CO3, INC
  3610. STFSDUX f10, CO3, INC
  3611. fxcpmadd f14, AP, f14, A10
  3612. STFDUX f11, CO3, INC
  3613. STFSDUX f11, CO3, INC
  3614. fxcpmadd f15, AP, f15, A1
  3615. STFDUX f12, CO4, INCM7
  3616. #else
  3617. fpmul f0, AP, f0
  3618. fpmul f1, AP, f1
  3619. fpmul f2, AP, f2
  3620. fpmul f3, AP, f3
  3621. STFDUX f0, CO1, INC
  3622. STFSDUX f0, CO1, INC
  3623. fpmul f4, AP, f4
  3624. STFDUX f1, CO1, INC
  3625. STFSDUX f1, CO1, INC
  3626. fpmul f5, AP, f5
  3627. STFDUX f2, CO1, INC
  3628. STFSDUX f2, CO1, INC
  3629. fpmul f6, AP, f6
  3630. STFDUX f3, CO1, INC
  3631. STFSDUX f3, CO1, INC
  3632. fpmul f7, AP, f7
  3633. STFDUX f4, CO2, INC
  3634. STFSDUX f4, CO2, INC
  3635. fpmul f8, AP, f8
  3636. STFDUX f5, CO2, INC
  3637. STFSDUX f5, CO2, INC
  3638. fpmul f9, AP, f9
  3639. STFDUX f6, CO2, INC
  3640. STFSDUX f6, CO2, INC
  3641. fpmul f10, AP, f10
  3642. STFDUX f7, CO2, INC
  3643. STFSDUX f7, CO2, INC
  3644. fpmul f11, AP, f11
  3645. STFDUX f8, CO3, INC
  3646. STFSDUX f8, CO3, INC
  3647. fpmul f12, AP, f12
  3648. STFDUX f9, CO3, INC
  3649. STFSDUX f9, CO3, INC
  3650. fpmul f13, AP, f13
  3651. STFDUX f10, CO3, INC
  3652. STFSDUX f10, CO3, INC
  3653. fpmul f14, AP, f14
  3654. STFDUX f11, CO3, INC
  3655. STFSDUX f11, CO3, INC
  3656. fpmul f15, AP, f15
  3657. STFDUX f12, CO4, INC
  3658. #endif
  3659. STFSDUX f12, CO4, INC
  3660. STFDUX f13, CO4, INC
  3661. STFSDUX f13, CO4, INC
  3662. STFDUX f14, CO4, INC
  3663. STFSDUX f14, CO4, INC
  3664. STFDUX f15, CO4, INC
  3665. STFSDUX f15, CO4, INC
  /*
   * TRMM only: advance AO by (K - KK - n) << (3 + BASE_SHIFT) and BO by
   * (K - KK - n) << (2 + BASE_SHIFT) (n = 8 with LEFT, else 4) for the
   * LEFT&&TRANSA / !LEFT&&!TRANSA variants; with LEFT, KK grows by 8.
   */
  3666. #ifdef TRMMKERNEL
  3667. #if ( defined(LEFT) && defined(TRANSA)) || \
  3668. (!defined(LEFT) && !defined(TRANSA))
  3669. sub TEMP, K, KK
  3670. #ifdef LEFT
  3671. addi TEMP, TEMP, -8
  3672. #else
  3673. addi TEMP, TEMP, -4
  3674. #endif
  3675. slwi r0, TEMP, 3 + BASE_SHIFT
  3676. slwi TEMP, TEMP, 2 + BASE_SHIFT
  3677. add AO, AO, r0
  3678. add BO, BO, TEMP
  3679. #endif
  3680. #ifdef LEFT
  3681. addi KK, KK, 8
  3682. #endif
  3683. #endif
  /* Decrement I, reload f0 from stack offset FZERO, and loop while I > 0. */
  3684. addic. I, I, -1
  3685. li r0, FZERO
  3686. lfpsx f0, SP, r0
  3687. bgt+ .L1011
  3688. .align 4
  3689. .L1020:
  3690. andi. I, M, 4
  3691. beq .L1030
  3692. #if defined(TRMMKERNEL)
  3693. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  3694. addi AO2, AO, 2 * SIZE
  3695. fpmr f4, f0
  3696. addi BO, B, - 4 * SIZE
  3697. fpmr f8, f0
  3698. addi BO2, B, - 2 * SIZE
  3699. fpmr f12, f0
  3700. #else
  3701. slwi TEMP, KK, 2 + BASE_SHIFT
  3702. slwi r0, KK, 2 + BASE_SHIFT
  3703. add AO, AO, TEMP
  3704. add BO, B, r0
  3705. addi AO2, AO, 2 * SIZE
  3706. fpmr f4, f0
  3707. addi BO, BO, - 4 * SIZE
  3708. fpmr f8, f0
  3709. addi BO2, BO, 2 * SIZE
  3710. fpmr f12, f0
  3711. #endif
  3712. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3713. sub TEMP, K, KK
  3714. #elif defined(LEFT)
  3715. addi TEMP, KK, 4
  3716. #else
  3717. addi TEMP, KK, 4
  3718. #endif
  3719. srawi. TEMP, TEMP, 2
  3720. fpmr f1, f0
  3721. fpmr f5, f0
  3722. fpmr f9, f0
  3723. mtspr CTR, TEMP
  3724. fpmr f13, f0
  3725. ble .L1024
  3726. #else
  3727. addi AO2, AO, 2 * SIZE
  3728. fpmr f4, f0
  3729. addi BO, B, - 4 * SIZE
  3730. fpmr f8, f0
  3731. addi BO2, B, - 2 * SIZE
  3732. fpmr f12, f0
  3733. srawi. r0, K, 2
  3734. fpmr f1, f0
  3735. fpmr f5, f0
  3736. fpmr f9, f0
  3737. mtspr CTR, r0
  3738. fpmr f13, f0
  3739. ble .L1024
  3740. #endif
  3741. LFPDUX A1, AO, INC4
  3742. LFPDUX B1, BO, INC4
  3743. LFPDUX A2, AO2, INC4
  3744. LFPDUX B2, BO2, INC4
  3745. LFPDUX A3, AO, INC4
  3746. LFPDUX B3, BO, INC4
  3747. LFPDUX A4, AO2, INC4
  3748. LFPDUX B4, BO2, INC4
  3749. LFPDUX A5, AO, INC4
  3750. LFPDUX B5, BO, INC4
  3751. LFPDUX A6, AO2, INC4
  3752. LFPDUX B6, BO2, INC4
  3753. LFPDUX A7, AO, INC4
  3754. LFPDUX A9, BO, INC4
  3755. LFPDUX A10, BO2, INC4
  3756. bdz- .L1023
  3757. .align 4
  3758. .L1022: /* 4x4 main K loop: 4 k-steps per pass, with loads for the next pass interleaved between the multiply-adds */
  3759. fxcpmadd f0, B1, A1, f0 /* presumably f0 += B1(primary) * A1 pair, per the Double Hummer cross-copy forms -- confirm against ISA doc */
  3760. nop /* NOTE(review): nops presumably fill issue slots for scheduling -- confirm */
  3761. fxcsmadd f4, B1, A1, f4 /* presumably f4 += B1(secondary) * A1 pair */
  3762. LFPDUX A8, AO2, INC4 /* A8 is consumed at the end of this same pass */
  3763. fxcpmadd f8, B2, A1, f8
  3764. nop
  3765. fxcsmadd f12, B2, A1, f12
  3766. LFPDUX A1, AO, INC4 /* A1 is dead for this pass: refill for the next one */
  3767. fxcpmadd f1, B1, A2, f1
  3768. nop
  3769. fxcsmadd f5, B1, A2, f5
  3770. LFPDUX B1, BO, INC4
  3771. fxcpmadd f9, B2, A2, f9
  3772. nop
  3773. fxcsmadd f13, B2, A2, f13
  3774. LFPDUX B2, BO2, INC4
  3775. fxcpmadd f0, B3, A3, f0 /* second k-step: A3/A4 with B3/B4 */
  3776. nop
  3777. fxcsmadd f4, B3, A3, f4
  3778. LFPDUX A2, AO2, INC4
  3779. fxcpmadd f8, B4, A3, f8
  3780. nop
  3781. fxcsmadd f12, B4, A3, f12
  3782. LFPDUX A3, AO, INC4
  3783. fxcpmadd f1, B3, A4, f1
  3784. nop
  3785. fxcsmadd f5, B3, A4, f5
  3786. LFPDUX B3, BO, INC4
  3787. fxcpmadd f9, B4, A4, f9
  3788. nop
  3789. fxcsmadd f13, B4, A4, f13
  3790. LFPDUX B4, BO2, INC4
  3791. fxcpmadd f0, B5, A5, f0 /* third k-step: A5/A6 with B5/B6 */
  3792. nop
  3793. fxcsmadd f4, B5, A5, f4
  3794. LFPDUX A4, AO2, INC4
  3795. fxcpmadd f8, B6, A5, f8
  3796. nop
  3797. fxcsmadd f12, B6, A5, f12
  3798. LFPDUX A5, AO, INC4
  3799. fxcpmadd f1, B5, A6, f1
  3800. nop
  3801. fxcsmadd f5, B5, A6, f5
  3802. LFPDUX B5, BO, INC4
  3803. fxcpmadd f9, B6, A6, f9
  3804. nop
  3805. fxcsmadd f13, B6, A6, f13
  3806. LFPDUX B6, BO2, INC4
  3807. fxcpmadd f0, A9, A7, f0 /* fourth k-step: A7/A8 with the B data held in A9/A10 */
  3808. nop
  3809. fxcsmadd f4, A9, A7, f4
  3810. LFPDUX A6, AO2, INC4
  3811. fxcpmadd f8, A10, A7, f8
  3812. nop
  3813. fxcsmadd f12, A10, A7, f12
  3814. LFPDUX A7, AO, INC4
  3815. fxcpmadd f1, A9, A8, f1
  3816. nop
  3817. fxcsmadd f5, A9, A8, f5
  3818. LFPDUX A9, BO, INC4
  3819. fxcpmadd f9, A10, A8, f9
  3820. nop
  3821. fxcsmadd f13, A10, A8, f13
  3822. LFPDUX A10, BO2, INC4
  3823. bdnz+ .L1022 /* decrement CTR, loop while it is non-zero */
  3824. .align 4
  3825. .L1023: /* 4x4 drain: last pass of the unrolled loop, same multiply-adds as .L1022 but nothing is preloaded for a further pass */
  3826. fxcpmadd f0, B1, A1, f0
  3827. fxcsmadd f4, B1, A1, f4
  3828. LFPDUX A8, AO2, INC4 /* the only operand not already loaded */
  3829. fxcpmadd f8, B2, A1, f8
  3830. fxcsmadd f12, B2, A1, f12
  3831. fxcpmadd f1, B1, A2, f1
  3832. fxcsmadd f5, B1, A2, f5
  3833. fxcpmadd f9, B2, A2, f9
  3834. fxcsmadd f13, B2, A2, f13
  3835. fxcpmadd f0, B3, A3, f0 /* second k-step */
  3836. fxcsmadd f4, B3, A3, f4
  3837. fxcpmadd f8, B4, A3, f8
  3838. fxcsmadd f12, B4, A3, f12
  3839. fxcpmadd f1, B3, A4, f1
  3840. fxcsmadd f5, B3, A4, f5
  3841. fxcpmadd f9, B4, A4, f9
  3842. fxcsmadd f13, B4, A4, f13
  3843. fxcpmadd f0, B5, A5, f0 /* third k-step */
  3844. fxcsmadd f4, B5, A5, f4
  3845. fxcpmadd f8, B6, A5, f8
  3846. fxcsmadd f12, B6, A5, f12
  3847. fxcpmadd f1, B5, A6, f1
  3848. fxcsmadd f5, B5, A6, f5
  3849. fxcpmadd f9, B6, A6, f9
  3850. fxcsmadd f13, B6, A6, f13
  3851. fxcpmadd f0, A9, A7, f0 /* fourth k-step (A9/A10 hold B data) */
  3852. fxcsmadd f4, A9, A7, f4
  3853. fxcpmadd f8, A10, A7, f8
  3854. fxcsmadd f12, A10, A7, f12
  3855. fxcpmadd f1, A9, A8, f1
  3856. fxcsmadd f5, A9, A8, f5
  3857. fxcpmadd f9, A10, A8, f9
  3858. fxcsmadd f13, A10, A8, f13
  3859. .align 4
  3860. .L1024: /* 4x4 K remainder: load alpha, then process (trip count & 3) leftover k-steps one at a time */
  3861. lfd AP, ALPHA(SP) /* alpha is read from the ALPHA slot of the stack frame */
  3862. #ifdef TRMMKERNEL
  3863. fsmfp AP, AP /* NOTE(review): presumably copies alpha into the secondary half, needed by the fpmul write-back -- confirm */
  3864. #endif
  3865. #if defined(TRMMKERNEL)
  3866. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3867. sub TEMP, K, KK /* recompute the same trip count used for the main loop */
  3868. #elif defined(LEFT)
  3869. addi TEMP, KK, 4
  3870. #else
  3871. addi TEMP, KK, 4
  3872. #endif
  3873. andi. TEMP, TEMP, 3 /* k-steps left over after the 4x unrolled loop */
  3874. mtspr CTR, TEMP
  3875. #else
  3876. andi. r0, K, 3
  3877. mtspr CTR, r0
  3878. #endif
  3879. ble+ .L1028 /* no leftover k-steps: go to write-back */
  3880. LFPDUX A1, AO, INC4 /* operands for the first leftover k-step */
  3881. LFPDUX A2, AO2, INC4
  3882. LFPDUX B1, BO, INC4
  3883. LFPDUX B2, BO2, INC4
  3884. bdz- .L1027 /* exactly one leftover step: skip the loop */
  3885. .align 4
  3886. .L1026: /* one k-step per pass, reloading A1/A2/B1/B2 for the next pass */
  3887. fxcpmadd f0, B1, A1, f0
  3888. fxcsmadd f4, B1, A1, f4
  3889. fxcpmadd f8, B2, A1, f8
  3890. fxcsmadd f12, B2, A1, f12
  3891. LFPDUX A1, AO, INC4
  3892. fxcpmadd f1, B1, A2, f1
  3893. fxcsmadd f5, B1, A2, f5
  3894. LFPDUX B1, BO, INC4
  3895. fxcpmadd f9, B2, A2, f9
  3896. fxcsmadd f13, B2, A2, f13
  3897. LFPDUX A2, AO2, INC4
  3898. LFPDUX B2, BO2, INC4
  3899. bdnz+ .L1026
  3900. .align 4
  3901. .L1027: /* final leftover k-step, no further loads */
  3902. fxcpmadd f0, B1, A1, f0
  3903. fxcsmadd f4, B1, A1, f4
  3904. fxcpmadd f8, B2, A1, f8
  3905. fxcsmadd f12, B2, A1, f12
  3906. fxcpmadd f1, B1, A2, f1
  3907. fxcsmadd f5, B1, A2, f5
  3908. fxcpmadd f9, B2, A2, f9
  3909. fxcsmadd f13, B2, A2, f13
  3910. .align 4
  3911. .L1028: /* 4x4 write-back: four values through each of CO1..CO4, then TRMM pointer/KK bookkeeping */
  3912. #ifndef TRMMKERNEL /* GEMM path: result = alpha * accumulator + existing C */
  3913. LFDUX A1, CO1, INC /* gather existing C values; assumes INC/INC2/INCM1 are +1/+2/-1 element strides as named -- TODO confirm */
  3914. LFDUX B1, CO1, INC2
  3915. LFDUX B3, CO2, INC
  3916. LFDUX A6, CO2, INC2
  3917. LFSDUX A1, CO1, INCM1 /* step back to fill the secondary halves of the same registers */
  3918. LFSDUX B1, CO1, INC2
  3919. LFSDUX B3, CO2, INCM1
  3920. LFSDUX A6, CO2, INC2
  3921. LFDUX B5, CO3, INC
  3922. LFDUX A8, CO3, INC2
  3923. LFDUX A2, CO4, INC
  3924. LFDUX A4, CO4, INC2
  3925. fxcpmadd f0, AP, f0, A1 /* f0 = alpha * f0 + C values held in A1 */
  3926. LFSDUX B5, CO3, INCM1
  3927. LFSDUX A8, CO3, INC2
  3928. fxcpmadd f1, AP, f1, B1
  3929. LFSDUX A2, CO4, INCM1
  3930. LFSDUX A4, CO4, INC2
  3931. fxcpmadd f4, AP, f4, B3
  3932. STFDUX f0, CO1, INCM3 /* rewind CO1 (INCM3) to the first loaded slot and store in order */
  3933. STFSDUX f0, CO1, INC
  3934. fxcpmadd f5, AP, f5, A6
  3935. STFDUX f1, CO1, INC
  3936. STFSDUX f1, CO1, INC
  3937. fxcpmadd f8, AP, f8, B5
  3938. STFDUX f4, CO2, INCM3
  3939. STFSDUX f4, CO2, INC
  3940. fxcpmadd f9, AP, f9, A8
  3941. STFDUX f5, CO2, INC
  3942. STFSDUX f5, CO2, INC
  3943. fxcpmadd f12, AP, f12, A2
  3944. STFDUX f8, CO3, INCM3
  3945. STFSDUX f8, CO3, INC
  3946. fxcpmadd f13, AP, f13, A4
  3947. STFDUX f9, CO3, INC
  3948. STFSDUX f9, CO3, INC
  3949. STFDUX f12, CO4, INCM3
  3950. STFSDUX f12, CO4, INC
  3951. STFDUX f13, CO4, INC
  3952. STFSDUX f13, CO4, INC
  3953. #else /* TRMM path: result = alpha * accumulator; C is not read */
  3954. fpmul f0, AP, f0
  3955. fpmul f1, AP, f1
  3956. fpmul f4, AP, f4
  3957. STFDUX f0, CO1, INC
  3958. STFSDUX f0, CO1, INC
  3959. fpmul f5, AP, f5
  3960. STFDUX f1, CO1, INC
  3961. STFSDUX f1, CO1, INC
  3962. fpmul f8, AP, f8
  3963. STFDUX f4, CO2, INC
  3964. STFSDUX f4, CO2, INC
  3965. fpmul f9, AP, f9
  3966. STFDUX f5, CO2, INC
  3967. STFSDUX f5, CO2, INC
  3968. fpmul f12, AP, f12
  3969. STFDUX f8, CO3, INC
  3970. STFSDUX f8, CO3, INC
  3971. fpmul f13, AP, f13
  3972. STFDUX f9, CO3, INC
  3973. STFSDUX f9, CO3, INC
  3974. STFDUX f12, CO4, INC
  3975. STFSDUX f12, CO4, INC
  3976. STFDUX f13, CO4, INC
  3977. STFSDUX f13, CO4, INC
  3978. #endif
  3979. #ifdef TRMMKERNEL
  3980. #if ( defined(LEFT) && defined(TRANSA)) || \
  3981. (!defined(LEFT) && !defined(TRANSA))
  3982. sub TEMP, K, KK /* TEMP = K - KK - 4: k-steps not visited by this tile */
  3983. #ifdef LEFT
  3984. addi TEMP, TEMP, -4
  3985. #else
  3986. addi TEMP, TEMP, -4
  3987. #endif
  3988. slwi r0, TEMP, 2 + BASE_SHIFT
  3989. slwi TEMP, TEMP, 2 + BASE_SHIFT
  3990. add AO, AO, r0 /* move AO and BO past the unvisited k-steps */
  3991. add BO, BO, TEMP
  3992. #endif
  3993. #ifdef LEFT
  3994. addi KK, KK, 4 /* LEFT: KK grows by the 4 positions handled in this tile */
  3995. #endif
  3996. #endif
  3997. li r0, FZERO
  3998. lfpsx f0, SP, r0 /* reload f0 from FZERO(SP); the next tile copies it into its accumulators */
  3999. .align 4
  4000. .L1030: /* 2x4 tile, run only when bit 1 of M is set; accumulators f0..f3 (one per output pointer CO1..CO4) */
  4001. andi. I, M, 2
  4002. beq .L1040 /* (M & 2) == 0: skip */
  4003. #if defined(TRMMKERNEL)
  4004. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  4005. addi AO2, AO, 2 * SIZE
  4006. fpmr f1, f0 /* init accumulators from f0 (reloaded from FZERO(SP) by the preceding code) */
  4007. addi BO, B, - 4 * SIZE
  4008. fpmr f2, f0
  4009. addi BO2, B, - 2 * SIZE
  4010. fpmr f3, f0
  4011. #else /* other TRMM variants: advance AO by KK << (1 + BASE_SHIFT) and BO by KK << (2 + BASE_SHIFT) first */
  4012. slwi TEMP, KK, 1 + BASE_SHIFT
  4013. slwi r0, KK, 2 + BASE_SHIFT
  4014. add AO, AO, TEMP
  4015. add BO, B, r0
  4016. addi AO2, AO, 2 * SIZE
  4017. fpmr f1, f0
  4018. addi BO, BO, - 4 * SIZE
  4019. fpmr f2, f0
  4020. addi BO2, BO, 2 * SIZE
  4021. fpmr f3, f0
  4022. #endif
  4023. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4024. sub TEMP, K, KK /* TRMM trip count: K - KK, KK + 2 (LEFT) or KK + 4 */
  4025. #elif defined(LEFT)
  4026. addi TEMP, KK, 2
  4027. #else
  4028. addi TEMP, KK, 4
  4029. #endif
  4030. srawi. r0, TEMP, 2 /* unrolled loop consumes 4 k-steps per pass */
  4031. mtspr CTR, r0
  4032. ble .L1034 /* no full pass: remainder only */
  4033. #else /* plain GEMM: trip count is K */
  4034. addi AO2, AO, 2 * SIZE
  4035. fpmr f1, f0
  4036. addi BO, B, - 4 * SIZE
  4037. fpmr f2, f0
  4038. addi BO2, B, - 2 * SIZE
  4039. fpmr f3, f0
  4040. srawi. r0, K, 2
  4041. mtspr CTR, r0
  4042. ble .L1034
  4043. #endif
  4044. LFPDUX A1, AO, INC4 /* preload all operands of the first pass: A1..A4 from A, B1..B4 and A5..A8 from B */
  4045. LFPDUX B1, BO, INC4
  4046. LFPDUX B2, BO2, INC4
  4047. LFPDUX A2, AO2, INC4
  4048. LFPDUX B3, BO, INC4
  4049. LFPDUX B4, BO2, INC4
  4050. LFPDUX A3, AO, INC4
  4051. LFPDUX A5, BO, INC4 /* A5..A8 carry B data (loaded via BO/BO2) */
  4052. LFPDUX A6, BO2, INC4
  4053. LFPDUX A4, AO2, INC4
  4054. LFPDUX A7, BO, INC4
  4055. LFPDUX A8, BO2, INC4
  4056. bdz- .L1033 /* single pass: run only the drain */
  4057. .align 4
  4058. .L1032: /* 2x4 main K loop: 4 k-steps per pass, each operand reloaded for the next pass right after its last use */
  4059. fxcpmadd f0, B1, A1, f0 /* first k-step: A1 with B1/B2 */
  4060. fxcsmadd f1, B1, A1, f1
  4061. LFPDUX B1, BO, INC4
  4062. fxcpmadd f2, B2, A1, f2
  4063. fxcsmadd f3, B2, A1, f3
  4064. LFPDUX B2, BO2, INC4
  4065. LFPDUX A1, AO, INC4
  4066. fxcpmadd f0, B3, A2, f0 /* second k-step: A2 with B3/B4 */
  4067. fxcsmadd f1, B3, A2, f1
  4068. LFPDUX B3, BO, INC4
  4069. fxcpmadd f2, B4, A2, f2
  4070. fxcsmadd f3, B4, A2, f3
  4071. LFPDUX B4, BO2, INC4
  4072. LFPDUX A2, AO2, INC4
  4073. fxcpmadd f0, A5, A3, f0 /* third k-step: A3 with the B data in A5/A6 */
  4074. fxcsmadd f1, A5, A3, f1
  4075. LFPDUX A5, BO, INC4
  4076. fxcpmadd f2, A6, A3, f2
  4077. fxcsmadd f3, A6, A3, f3
  4078. LFPDUX A6, BO2, INC4
  4079. LFPDUX A3, AO, INC4
  4080. fxcpmadd f0, A7, A4, f0 /* fourth k-step: A4 with the B data in A7/A8 */
  4081. fxcsmadd f1, A7, A4, f1
  4082. LFPDUX A7, BO, INC4
  4083. fxcpmadd f2, A8, A4, f2
  4084. fxcsmadd f3, A8, A4, f3
  4085. LFPDUX A8, BO2, INC4
  4086. LFPDUX A4, AO2, INC4
  4087. bdnz+ .L1032 /* decrement CTR, loop while non-zero */
  4088. .align 4
  4089. .L1033: /* 2x4 drain: final pass of the unrolled loop using the already loaded operands, no further loads */
  4090. fxcpmadd f0, B1, A1, f0
  4091. fxcsmadd f1, B1, A1, f1
  4092. fxcpmadd f2, B2, A1, f2
  4093. fxcsmadd f3, B2, A1, f3
  4094. fxcpmadd f0, B3, A2, f0
  4095. fxcsmadd f1, B3, A2, f1
  4096. fxcpmadd f2, B4, A2, f2
  4097. fxcsmadd f3, B4, A2, f3
  4098. fxcpmadd f0, A5, A3, f0 /* A5..A8 hold B data here */
  4099. fxcsmadd f1, A5, A3, f1
  4100. fxcpmadd f2, A6, A3, f2
  4101. fxcsmadd f3, A6, A3, f3
  4102. fxcpmadd f0, A7, A4, f0
  4103. fxcsmadd f1, A7, A4, f1
  4104. fxcpmadd f2, A8, A4, f2
  4105. fxcsmadd f3, A8, A4, f3
  4106. .align 4
  4107. .L1034: /* 2x4 K remainder: load alpha, then process (trip count & 3) leftover k-steps one at a time */
  4108. lfd AP, ALPHA(SP)
  4109. #ifdef TRMMKERNEL
  4110. fsmfp AP, AP /* NOTE(review): presumably copies alpha into the secondary half for the fpmul write-back -- confirm */
  4111. #endif
  4112. #if defined(TRMMKERNEL)
  4113. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4114. sub TEMP, K, KK /* same trip count as used for the main loop */
  4115. #elif defined(LEFT)
  4116. addi TEMP, KK, 2
  4117. #else
  4118. addi TEMP, KK, 4
  4119. #endif
  4120. andi. TEMP, TEMP, 3
  4121. mtspr CTR, TEMP
  4122. #else
  4123. andi. r0, K, 3
  4124. mtspr CTR, r0
  4125. #endif
  4126. ble+ .L1038 /* no leftover k-steps */
  4127. LFPDX A1, AO, INC4 /* non-updating load at AO + INC4 ... */
  4128. LFPDUX B1, BO, INC4
  4129. LFPDUX B2, BO2, INC4
  4130. add AO, AO, INC2 /* ... then AO advances by INC2 only (one A pair per k-step) */
  4131. bdz- .L1037 /* exactly one leftover step */
  4132. .align 4
  4133. .L1036: /* one k-step per pass */
  4134. fxcpmadd f0, B1, A1, f0
  4135. fxcsmadd f1, B1, A1, f1
  4136. LFPDUX B1, BO, INC4
  4137. fxcpmadd f2, B2, A1, f2
  4138. fxcsmadd f3, B2, A1, f3
  4139. LFPDX A1, AO, INC4
  4140. LFPDUX B2, BO2, INC4
  4141. add AO, AO, INC2
  4142. bdnz+ .L1036
  4143. .align 4
  4144. .L1037: /* final leftover k-step, no further loads */
  4145. fxcpmadd f0, B1, A1, f0
  4146. fxcsmadd f1, B1, A1, f1
  4147. fxcpmadd f2, B2, A1, f2
  4148. fxcsmadd f3, B2, A1, f3
  4149. .align 4
  4150. .L1038: /* 2x4 write-back: two values through each of CO1..CO4, then TRMM pointer/KK bookkeeping */
  4151. #ifndef TRMMKERNEL /* GEMM path: result = alpha * accumulator + existing C */
  4152. LFDUX A1, CO1, INC /* primary halves: first value of each output pointer */
  4153. LFDUX A2, CO2, INC
  4154. LFDUX A3, CO3, INC
  4155. LFDUX A4, CO4, INC
  4156. LFSDUX A1, CO1, INC /* secondary halves: second value */
  4157. LFSDUX A2, CO2, INC
  4158. LFSDUX A3, CO3, INC
  4159. LFSDUX A4, CO4, INC
  4160. fxcpmadd f0, AP, f0, A1
  4161. fxcpmadd f1, AP, f1, A2
  4162. fxcpmadd f2, AP, f2, A3
  4163. fxcpmadd f3, AP, f3, A4
  4164. STFDUX f0, CO1, INCM1 /* step back (INCM1) to the first loaded slot, then store both values */
  4165. STFSDUX f0, CO1, INC
  4166. STFDUX f1, CO2, INCM1
  4167. STFSDUX f1, CO2, INC
  4168. STFDUX f2, CO3, INCM1
  4169. STFSDUX f2, CO3, INC
  4170. STFDUX f3, CO4, INCM1
  4171. STFSDUX f3, CO4, INC
  4172. #else /* TRMM path: result = alpha * accumulator; C is not read */
  4173. fpmul f0, AP, f0
  4174. fpmul f1, AP, f1
  4175. fpmul f2, AP, f2
  4176. fpmul f3, AP, f3
  4177. STFDUX f0, CO1, INC
  4178. STFSDUX f0, CO1, INC
  4179. STFDUX f1, CO2, INC
  4180. STFSDUX f1, CO2, INC
  4181. STFDUX f2, CO3, INC
  4182. STFSDUX f2, CO3, INC
  4183. STFDUX f3, CO4, INC
  4184. STFSDUX f3, CO4, INC
  4185. #endif
  4186. #ifdef TRMMKERNEL
  4187. #if ( defined(LEFT) && defined(TRANSA)) || \
  4188. (!defined(LEFT) && !defined(TRANSA))
  4189. sub TEMP, K, KK /* TEMP = K - KK - (2 if LEFT else 4): k-steps not visited by this tile */
  4190. #ifdef LEFT
  4191. addi TEMP, TEMP, -2
  4192. #else
  4193. addi TEMP, TEMP, -4
  4194. #endif
  4195. slwi r0, TEMP, 1 + BASE_SHIFT
  4196. slwi TEMP, TEMP, 2 + BASE_SHIFT
  4197. add AO, AO, r0 /* move AO and BO past the unvisited k-steps */
  4198. add BO, BO, TEMP
  4199. #endif
  4200. #ifdef LEFT
  4201. addi KK, KK, 2
  4202. #endif
  4203. #endif
  4204. li r0, FZERO
  4205. lfpsx f0, SP, r0 /* reload f0 from FZERO(SP) for the next tile's accumulator init */
  4206. .align 4
  4207. .L1040: /* 1x4 tile, run only when bit 0 of M is set; f0..f3 hold partial sums that are combined in .L1048 */
  4208. andi. I, M, 1
  4209. beq .L1049 /* (M & 1) == 0: skip */
  4210. #if defined(TRMMKERNEL)
  4211. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  4212. addi AO2, AO, 2 * SIZE
  4213. fpmr f1, f0 /* init accumulators from f0 (reloaded from FZERO(SP) by the preceding code) */
  4214. addi BO, B, - 4 * SIZE
  4215. fpmr f2, f0
  4216. addi BO2, B, - 2 * SIZE
  4217. fpmr f3, f0
  4218. #else /* other TRMM variants: advance AO by KK << (0 + BASE_SHIFT) and BO by KK << (2 + BASE_SHIFT) first */
  4219. slwi TEMP, KK, 0 + BASE_SHIFT
  4220. slwi r0, KK, 2 + BASE_SHIFT
  4221. add AO, AO, TEMP
  4222. add BO, B, r0
  4223. addi AO2, AO, 2 * SIZE
  4224. fpmr f1, f0
  4225. addi BO, BO, - 4 * SIZE
  4226. fpmr f2, f0
  4227. addi BO2, BO, 2 * SIZE
  4228. fpmr f3, f0
  4229. #endif
  4230. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4231. sub TEMP, K, KK /* TRMM trip count: K - KK, KK + 1 (LEFT) or KK + 4 */
  4232. #elif defined(LEFT)
  4233. addi TEMP, KK, 1
  4234. #else
  4235. addi TEMP, KK, 4
  4236. #endif
  4237. srawi. r0, TEMP, 3 /* this tile unrolls by 8 k-steps per pass */
  4238. mtspr CTR, r0
  4239. ble .L1044 /* no full pass: remainder only */
  4240. #else /* plain GEMM: trip count is K */
  4241. addi AO2, AO, 2 * SIZE
  4242. fpmr f1, f0
  4243. addi BO, B, - 4 * SIZE
  4244. fpmr f2, f0
  4245. addi BO2, B, - 2 * SIZE
  4246. fpmr f3, f0
  4247. srawi. r0, K, 3
  4248. mtspr CTR, r0
  4249. ble .L1044
  4250. #endif
  4251. LFPDUX A1, AO, INC4 /* preload: A1..A4 from A; B1..B4 and A5..A8 from B */
  4252. LFPDUX B1, BO, INC4
  4253. LFPDUX B2, BO2, INC4
  4254. LFPDUX A2, AO2, INC4
  4255. LFPDUX B3, BO, INC4
  4256. LFPDUX B4, BO2, INC4
  4257. LFPDUX A3, AO, INC4
  4258. LFPDUX A5, BO, INC4 /* A5..A8 carry B data (loaded via BO/BO2) */
  4259. LFPDUX A6, BO2, INC4
  4260. LFPDUX A4, AO2, INC4
  4261. LFPDUX A7, BO, INC4
  4262. LFPDUX A8, BO2, INC4
  4263. bdz- .L1043 /* single pass: run only the drain */
  4264. .align 4
  4265. .L1042: /* 1x4 main K loop, 8 k-steps per pass: each A register supplies two k-steps (cp form, then cs form) */
  4266. fxcpmadd f0, A1, B1, f0 /* presumably A1(primary) times the B pairs B1/B2 -- confirm against ISA doc */
  4267. LFPDUX B1, BO, INC4
  4268. fxcpmadd f1, A1, B2, f1
  4269. LFPDUX B2, BO2, INC4
  4270. fxcsmadd f2, A1, B3, f2 /* presumably A1(secondary) times the next B pairs B3/B4, summed separately in f2/f3 */
  4271. LFPDUX B3, BO, INC4
  4272. fxcsmadd f3, A1, B4, f3
  4273. LFPDUX B4, BO2, INC4
  4274. LFPDUX A1, AO, INC4
  4275. fxcpmadd f0, A2, A5, f0 /* A5..A8 hold B data */
  4276. LFPDUX A5, BO, INC4
  4277. fxcpmadd f1, A2, A6, f1
  4278. LFPDUX A6, BO2, INC4
  4279. fxcsmadd f2, A2, A7, f2
  4280. LFPDUX A7, BO, INC4
  4281. fxcsmadd f3, A2, A8, f3
  4282. LFPDUX A8, BO2, INC4
  4283. LFPDUX A2, AO2, INC4
  4284. fxcpmadd f0, A3, B1, f0 /* B1..B4 were refilled earlier in this same pass */
  4285. LFPDUX B1, BO, INC4
  4286. fxcpmadd f1, A3, B2, f1
  4287. LFPDUX B2, BO2, INC4
  4288. fxcsmadd f2, A3, B3, f2
  4289. LFPDUX B3, BO, INC4
  4290. fxcsmadd f3, A3, B4, f3
  4291. LFPDUX B4, BO2, INC4
  4292. LFPDUX A3, AO, INC4
  4293. fxcpmadd f0, A4, A5, f0
  4294. LFPDUX A5, BO, INC4
  4295. fxcpmadd f1, A4, A6, f1
  4296. LFPDUX A6, BO2, INC4
  4297. fxcsmadd f2, A4, A7, f2
  4298. LFPDUX A7, BO, INC4
  4299. fxcsmadd f3, A4, A8, f3
  4300. LFPDUX A8, BO2, INC4
  4301. LFPDUX A4, AO2, INC4
  4302. bdnz+ .L1042 /* decrement CTR, loop while non-zero */
  4303. .align 4
  4304. .L1043: /* 1x4 drain: final unrolled pass; still loads the B operands for its second half, but no A data for a further pass */
  4305. fxcpmadd f0, A1, B1, f0
  4306. LFPDUX B1, BO, INC4 /* B data needed below by the A3 group */
  4307. fxcpmadd f1, A1, B2, f1
  4308. LFPDUX B2, BO2, INC4
  4309. fxcsmadd f2, A1, B3, f2
  4310. LFPDUX B3, BO, INC4
  4311. fxcsmadd f3, A1, B4, f3
  4312. LFPDUX B4, BO2, INC4
  4313. fxcpmadd f0, A2, A5, f0
  4314. LFPDUX A5, BO, INC4 /* B data needed below by the A4 group */
  4315. fxcpmadd f1, A2, A6, f1
  4316. LFPDUX A6, BO2, INC4
  4317. fxcsmadd f2, A2, A7, f2
  4318. LFPDUX A7, BO, INC4
  4319. fxcsmadd f3, A2, A8, f3
  4320. LFPDUX A8, BO2, INC4
  4321. fxcpmadd f0, A3, B1, f0
  4322. fxcpmadd f1, A3, B2, f1
  4323. fxcsmadd f2, A3, B3, f2
  4324. fxcsmadd f3, A3, B4, f3
  4325. fxcpmadd f0, A4, A5, f0
  4326. fxcpmadd f1, A4, A6, f1
  4327. fxcsmadd f2, A4, A7, f2
  4328. fxcsmadd f3, A4, A8, f3
  4329. .align 4
  4330. .L1044: /* 1x4 K remainder: load alpha, then process (trip count & 7) leftover k-steps one at a time */
  4331. lfd AP, ALPHA(SP)
  4332. #ifdef TRMMKERNEL
  4333. fsmfp AP, AP /* NOTE(review): presumably copies alpha into the secondary half for the fpmul write-back -- confirm */
  4334. #endif
  4335. #if defined(TRMMKERNEL)
  4336. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4337. sub TEMP, K, KK /* same trip count as used for the main loop */
  4338. #elif defined(LEFT)
  4339. addi TEMP, KK, 1
  4340. #else
  4341. addi TEMP, KK, 4
  4342. #endif
  4343. andi. TEMP, TEMP, 7 /* mask is 7 because the main loop unrolls by 8 */
  4344. mtspr CTR, TEMP
  4345. #else
  4346. andi. r0, K, 7
  4347. mtspr CTR, r0
  4348. #endif
  4349. ble+ .L1048 /* no leftover k-steps */
  4350. LFDX A1, AO, INC4 /* non-updating single-value load at AO + INC4 ... */
  4351. LFPDUX B1, BO, INC4
  4352. LFPDUX B2, BO2, INC4
  4353. add AO, AO, INC /* ... then AO advances by INC only (one A value per k-step) */
  4354. bdz- .L1047 /* exactly one leftover step */
  4355. .align 4
  4356. .L1046: /* one k-step per pass; only f0/f1 are updated here */
  4357. fxcpmadd f0, A1, B1, f0
  4358. LFPDUX B1, BO, INC4
  4359. fxcpmadd f1, A1, B2, f1
  4360. LFDX A1, AO, INC4
  4361. LFPDUX B2, BO2, INC4
  4362. add AO, AO, INC
  4363. bdnz+ .L1046
  4364. .align 4
  4365. .L1047: /* final leftover k-step, no further loads */
  4366. fxcpmadd f0, A1, B1, f0
  4367. fxcpmadd f1, A1, B2, f1
  4368. .align 4
  4369. .L1048: /* 1x4 write-back: fold the split partial sums, scale by alpha, store one value through each of CO1..CO4 */
  4370. #ifndef TRMMKERNEL /* GEMM path: result = alpha * accumulator + existing C */
  4371. LFDX A1, CO1, INC /* non-updating loads: CO1/CO3 values into primary halves ... */
  4372. LFDX B3, CO3, INC
  4373. LFSDX A1, CO2, INC /* ... CO2/CO4 values into the secondary halves of the same registers */
  4374. LFSDX B3, CO4, INC
  4375. fpadd f0, f0, f2 /* combine the two partial-sum sets built by the cp and cs multiply-adds */
  4376. fpadd f1, f1, f3
  4377. fxcpmadd f0, AP, f0, A1
  4378. fxcpmadd f1, AP, f1, B3
  4379. #else /* TRMM path: result = alpha * accumulator; C is not read */
  4380. fpadd f0, f0, f2
  4381. fpadd f1, f1, f3
  4382. fpmul f0, AP, f0
  4383. fpmul f1, AP, f1
  4384. #endif
  4385. STFDUX f0, CO1, INC /* primary of f0 -> CO1, secondary -> CO2; likewise f1 -> CO3/CO4 */
  4386. STFSDUX f0, CO2, INC
  4387. STFDUX f1, CO3, INC
  4388. STFSDUX f1, CO4, INC
  4389. #ifdef TRMMKERNEL
  4390. #if ( defined(LEFT) && defined(TRANSA)) || \
  4391. (!defined(LEFT) && !defined(TRANSA))
  4392. sub TEMP, K, KK /* TEMP = K - KK - (1 if LEFT else 4): k-steps not visited by this tile */
  4393. #ifdef LEFT
  4394. addi TEMP, TEMP, -1
  4395. #else
  4396. addi TEMP, TEMP, -4
  4397. #endif
  4398. slwi r0, TEMP, 0 + BASE_SHIFT
  4399. slwi TEMP, TEMP, 2 + BASE_SHIFT
  4400. add AO, AO, r0 /* move AO and BO past the unvisited k-steps */
  4401. add BO, BO, TEMP
  4402. #endif
  4403. #ifdef LEFT
  4404. addi KK, KK, 1
  4405. #endif
  4406. #endif
  4407. .align 4
  4408. .L1049: /* end of one pass of the four-output-pointer (CO1..CO4) panel loop */
  4409. #if defined(TRMMKERNEL) && !defined(LEFT)
  4410. addi KK, KK, 4 /* non-LEFT TRMM: KK grows by 4 per panel */
  4411. #endif
  4412. addi B, BO, 4 * SIZE /* next panel of B starts 4 elements past the final BO */
  4413. addic. J, J, -1
  4414. bgt+ .L1010 /* more panels remain (J > 0): loop back */
  4415. .align 4
  4416. .L1050: /* two-output-pointer panel, run only when bit 1 of N is set: set up CO1/CO2, AO and the M/8 tile counter */
  4417. andi. J, N, 2
  4418. beq .L1090 /* (N & 2) == 0: skip this panel */
  4419. mr CO1, C
  4420. add CO2, C, LDC /* CO2 = C + LDC */
  4421. add C, CO2, LDC /* advance C by 2 * LDC for whatever follows this panel */
  4422. #if defined(TRMMKERNEL) && defined(LEFT)
  4423. mr KK, OFFSET /* LEFT TRMM: KK restarts from OFFSET for each panel */
  4424. #endif
  4425. addi AO, A, -2 * SIZE /* biased by -2 elements; loads in this panel use update-form addressing with INC2 */
  4426. li r0, FZERO
  4427. lfpsx f0, SP, r0 /* f0 loaded from FZERO(SP); copied into the accumulators below */
  4428. srawi. I, M, 3 /* number of 8-wide tiles */
  4429. ble .L1060 /* none: go to the M & 4 tile */
  4430. .align 4
  4431. .L1051: /* 8x2 tile (repeated I = M >> 3 times): f0..f3 accumulate for CO1, f4..f7 for CO2 */
  4432. #if defined(TRMMKERNEL)
  4433. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  4434. fpmr f4, f0 /* init accumulators from f0 (loaded from FZERO(SP)) */
  4435. addi BO, B, - 2 * SIZE
  4436. fpmr f1, f0
  4437. fpmr f5, f0
  4438. fpmr f2, f0
  4439. fpmr f6, f0
  4440. #else /* other TRMM variants: advance AO by KK << (3 + BASE_SHIFT) and BO by KK << (1 + BASE_SHIFT) first */
  4441. slwi TEMP, KK, 3 + BASE_SHIFT
  4442. slwi r0, KK, 1 + BASE_SHIFT
  4443. add AO, AO, TEMP
  4444. add BO, B, r0
  4445. fpmr f4, f0
  4446. addi BO, BO, - 2 * SIZE
  4447. fpmr f1, f0
  4448. fpmr f5, f0
  4449. fpmr f2, f0
  4450. fpmr f6, f0
  4451. #endif
  4452. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4453. sub TEMP, K, KK /* TRMM trip count: K - KK, KK + 8 (LEFT) or KK + 2 */
  4454. #elif defined(LEFT)
  4455. addi TEMP, KK, 8
  4456. #else
  4457. addi TEMP, KK, 2
  4458. #endif
  4459. srawi. r0, TEMP, 2 /* unrolled loop consumes 4 k-steps per pass */
  4460. fpmr f3, f0
  4461. mtspr CTR, r0
  4462. fpmr f7, f0
  4463. ble .L1054 /* no full pass: remainder only */
  4464. #else /* plain GEMM: trip count is K */
  4465. fpmr f4, f0
  4466. addi BO, B, - 2 * SIZE
  4467. fpmr f1, f0
  4468. fpmr f5, f0
  4469. fpmr f2, f0
  4470. fpmr f6, f0
  4471. srawi. r0, K, 2
  4472. fpmr f3, f0
  4473. mtspr CTR, r0
  4474. fpmr f7, f0
  4475. ble .L1054
  4476. #endif
  4477. LFPDUX B1, BO, INC2 /* preload B1..B3 and A1..A8; B4 is loaded inside the loop/drain */
  4478. LFPDUX A1, AO, INC2
  4479. LFPDUX A2, AO, INC2
  4480. LFPDUX B2, BO, INC2
  4481. LFPDUX A3, AO, INC2
  4482. LFPDUX A4, AO, INC2
  4483. LFPDUX B3, BO, INC2
  4484. LFPDUX A5, AO, INC2
  4485. LFPDUX A6, AO, INC2
  4486. LFPDUX A7, AO, INC2
  4487. LFPDUX A8, AO, INC2
  4488. bdz- .L1053 /* single pass: run only the drain */
  4489. .align 4
  4490. .L1052: /* 8x2 main K loop: 4 k-steps per pass; A1..A4 and A5..A8 alternate as the 8 A values of a k-step */
  4491. fxcpmadd f0, B1, A1, f0 /* first k-step: B1 against A1..A4 */
  4492. LFPDUX B4, BO, INC2 /* B4 is consumed by the fourth k-step of this same pass */
  4493. fxcsmadd f4, B1, A1, f4
  4494. LFPDUX A1, AO, INC2 /* refill A1..A4 for the third k-step */
  4495. fxcpmadd f1, B1, A2, f1
  4496. nop /* NOTE(review): nops presumably fill issue slots for scheduling -- confirm */
  4497. fxcsmadd f5, B1, A2, f5
  4498. LFPDUX A2, AO, INC2
  4499. fxcpmadd f2, B1, A3, f2
  4500. nop
  4501. fxcsmadd f6, B1, A3, f6
  4502. LFPDUX A3, AO, INC2
  4503. fxcpmadd f3, B1, A4, f3
  4504. nop
  4505. fxcsmadd f7, B1, A4, f7
  4506. LFPDUX A4, AO, INC2
  4507. fxcpmadd f0, B2, A5, f0 /* second k-step: B2 against A5..A8 */
  4508. LFPDUX B1, BO, INC2 /* B1 for the next pass */
  4509. fxcsmadd f4, B2, A5, f4
  4510. LFPDUX A5, AO, INC2 /* refill A5..A8 for the fourth k-step */
  4511. fxcpmadd f1, B2, A6, f1
  4512. nop
  4513. fxcsmadd f5, B2, A6, f5
  4514. LFPDUX A6, AO, INC2
  4515. fxcpmadd f2, B2, A7, f2
  4516. nop
  4517. fxcsmadd f6, B2, A7, f6
  4518. LFPDUX A7, AO, INC2
  4519. fxcpmadd f3, B2, A8, f3
  4520. nop
  4521. fxcsmadd f7, B2, A8, f7
  4522. LFPDUX A8, AO, INC2
  4523. fxcpmadd f0, B3, A1, f0 /* third k-step: B3 against the refilled A1..A4 */
  4524. LFPDUX B2, BO, INC2
  4525. fxcsmadd f4, B3, A1, f4
  4526. LFPDUX A1, AO, INC2 /* from here on the A loads feed the next pass */
  4527. fxcpmadd f1, B3, A2, f1
  4528. nop
  4529. fxcsmadd f5, B3, A2, f5
  4530. LFPDUX A2, AO, INC2
  4531. fxcpmadd f2, B3, A3, f2
  4532. nop
  4533. fxcsmadd f6, B3, A3, f6
  4534. LFPDUX A3, AO, INC2
  4535. fxcpmadd f3, B3, A4, f3
  4536. nop
  4537. fxcsmadd f7, B3, A4, f7
  4538. LFPDUX A4, AO, INC2
  4539. fxcpmadd f0, B4, A5, f0 /* fourth k-step: B4 against the refilled A5..A8 */
  4540. LFPDUX B3, BO, INC2
  4541. fxcsmadd f4, B4, A5, f4
  4542. LFPDUX A5, AO, INC2
  4543. fxcpmadd f1, B4, A6, f1
  4544. nop
  4545. fxcsmadd f5, B4, A6, f5
  4546. LFPDUX A6, AO, INC2
  4547. fxcpmadd f2, B4, A7, f2
  4548. nop
  4549. fxcsmadd f6, B4, A7, f6
  4550. LFPDUX A7, AO, INC2
  4551. fxcpmadd f3, B4, A8, f3
  4552. nop
  4553. fxcsmadd f7, B4, A8, f7
  4554. LFPDUX A8, AO, INC2
  4555. bdnz+ .L1052 /* decrement CTR, loop while non-zero */
  4556. .align 4
  4557. .L1053: /* 8x2 drain: final unrolled pass; loads only what its own third and fourth k-steps need */
  4558. fxcpmadd f0, B1, A1, f0 /* first k-step */
  4559. LFPDUX B4, BO, INC2 /* B operand of the fourth k-step */
  4560. fxcsmadd f4, B1, A1, f4
  4561. LFPDUX A1, AO, INC2 /* A1..A4 for the third k-step */
  4562. fxcpmadd f1, B1, A2, f1
  4563. nop
  4564. fxcsmadd f5, B1, A2, f5
  4565. LFPDUX A2, AO, INC2
  4566. fxcpmadd f2, B1, A3, f2
  4567. nop
  4568. fxcsmadd f6, B1, A3, f6
  4569. LFPDUX A3, AO, INC2
  4570. fxcpmadd f3, B1, A4, f3
  4571. nop
  4572. fxcsmadd f7, B1, A4, f7
  4573. LFPDUX A4, AO, INC2
  4574. fxcpmadd f0, B2, A5, f0 /* second k-step */
  4575. nop
  4576. fxcsmadd f4, B2, A5, f4
  4577. LFPDUX A5, AO, INC2 /* A5..A8 for the fourth k-step */
  4578. fxcpmadd f1, B2, A6, f1
  4579. nop
  4580. fxcsmadd f5, B2, A6, f5
  4581. LFPDUX A6, AO, INC2
  4582. fxcpmadd f2, B2, A7, f2
  4583. nop
  4584. fxcsmadd f6, B2, A7, f6
  4585. LFPDUX A7, AO, INC2
  4586. fxcpmadd f3, B2, A8, f3
  4587. nop
  4588. fxcsmadd f7, B2, A8, f7
  4589. LFPDUX A8, AO, INC2
  4590. fxcpmadd f0, B3, A1, f0 /* third k-step, no more loads from here */
  4591. fxcsmadd f4, B3, A1, f4
  4592. fxcpmadd f1, B3, A2, f1
  4593. fxcsmadd f5, B3, A2, f5
  4594. fxcpmadd f2, B3, A3, f2
  4595. fxcsmadd f6, B3, A3, f6
  4596. fxcpmadd f3, B3, A4, f3
  4597. fxcsmadd f7, B3, A4, f7
  4598. fxcpmadd f0, B4, A5, f0 /* fourth k-step */
  4599. fxcsmadd f4, B4, A5, f4
  4600. fxcpmadd f1, B4, A6, f1
  4601. fxcsmadd f5, B4, A6, f5
  4602. fxcpmadd f2, B4, A7, f2
  4603. fxcsmadd f6, B4, A7, f6
  4604. fxcpmadd f3, B4, A8, f3
  4605. fxcsmadd f7, B4, A8, f7
  4606. .align 4
  4607. .L1054: /* 8x2 K remainder: load alpha, then process (trip count & 3) leftover k-steps one at a time */
  4608. lfd AP, ALPHA(SP)
  4609. #ifdef TRMMKERNEL
  4610. fsmfp AP, AP /* NOTE(review): presumably copies alpha into the secondary half for the fpmul write-back -- confirm */
  4611. #endif
  4612. #if defined(TRMMKERNEL)
  4613. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4614. sub TEMP, K, KK /* same trip count as used for the main loop */
  4615. #elif defined(LEFT)
  4616. addi TEMP, KK, 8
  4617. #else
  4618. addi TEMP, KK, 2
  4619. #endif
  4620. andi. TEMP, TEMP, 3
  4621. mtspr CTR, TEMP
  4622. #else
  4623. andi. r0, K, 3
  4624. mtspr CTR, r0
  4625. #endif
  4626. ble+ .L1058 /* no leftover k-steps */
  4627. LFPDUX A1, AO, INC2 /* operands of the first leftover k-step: four A pairs, one B pair */
  4628. LFPDUX B1, BO, INC2
  4629. LFPDUX A2, AO, INC2
  4630. LFPDUX A3, AO, INC2
  4631. LFPDUX A4, AO, INC2
  4632. bdz- .L1057 /* exactly one leftover step */
  4633. .align 4
  4634. .L1056: /* one k-step per pass, reloading A1..A4 and B1 for the next pass */
  4635. fxcpmadd f0, B1, A1, f0
  4636. fxcsmadd f4, B1, A1, f4
  4637. LFPDUX A1, AO, INC2
  4638. fxcpmadd f1, B1, A2, f1
  4639. fxcsmadd f5, B1, A2, f5
  4640. LFPDUX A2, AO, INC2
  4641. fxcpmadd f2, B1, A3, f2
  4642. fxcsmadd f6, B1, A3, f6
  4643. LFPDUX A3, AO, INC2
  4644. fxcpmadd f3, B1, A4, f3
  4645. fxcsmadd f7, B1, A4, f7
  4646. LFPDUX A4, AO, INC2
  4647. LFPDUX B1, BO, INC2
  4648. bdnz+ .L1056
  4649. .align 4
  4650. .L1057: /* final leftover k-step, no further loads */
  4651. fxcpmadd f0, B1, A1, f0
  4652. fxcsmadd f4, B1, A1, f4
  4653. fxcpmadd f1, B1, A2, f1
  4654. fxcsmadd f5, B1, A2, f5
  4655. fxcpmadd f2, B1, A3, f2
  4656. fxcsmadd f6, B1, A3, f6
  4657. fxcpmadd f3, B1, A4, f3
  4658. fxcsmadd f7, B1, A4, f7
  4659. .align 4
  4660. .L1058: /* 8x2 write-back: eight values through CO1 and eight through CO2, TRMM bookkeeping, then loop to the next 8-wide tile */
  4661. #ifndef TRMMKERNEL /* GEMM path: result = alpha * accumulator + existing C */
  4662. LFDUX A1, CO1, INC /* gather existing C values; assumes INC/INC2/INCM5/INCM7 are +1/+2/-5/-7 element strides as named -- TODO confirm */
  4663. LFDUX B1, CO1, INC2
  4664. LFDUX A3, CO1, INC2
  4665. LFDUX A5, CO1, INC2
  4666. LFSDUX A1, CO1, INCM5 /* step back to fill the secondary halves of the same registers */
  4667. LFSDUX B1, CO1, INC2
  4668. LFSDUX A3, CO1, INC2
  4669. LFSDUX A5, CO1, INC2
  4670. LFDUX B3, CO2, INC
  4671. LFDUX A6, CO2, INC2
  4672. LFDUX A7, CO2, INC2
  4673. LFDUX B2, CO2, INC2
  4674. fxcpmadd f0, AP, f0, A1 /* f0 = alpha * f0 + C values held in A1 */
  4675. LFSDUX B3, CO2, INCM5
  4676. LFSDUX A6, CO2, INC2
  4677. fxcpmadd f1, AP, f1, B1
  4678. LFSDUX A7, CO2, INC2
  4679. LFSDUX B2, CO2, INC2
  4680. fxcpmadd f2, AP, f2, A3
  4681. STFDUX f0, CO1, INCM7 /* rewind CO1 (INCM7) to the first loaded slot and store in order */
  4682. STFSDUX f0, CO1, INC
  4683. fxcpmadd f3, AP, f3, A5
  4684. STFDUX f1, CO1, INC
  4685. STFSDUX f1, CO1, INC
  4686. fxcpmadd f4, AP, f4, B3
  4687. STFDUX f2, CO1, INC
  4688. STFSDUX f2, CO1, INC
  4689. fxcpmadd f5, AP, f5, A6
  4690. STFDUX f3, CO1, INC
  4691. STFSDUX f3, CO1, INC
  4692. fxcpmadd f6, AP, f6, A7
  4693. STFDUX f4, CO2, INCM7
  4694. STFSDUX f4, CO2, INC
  4695. fxcpmadd f7, AP, f7, B2
  4696. STFDUX f5, CO2, INC
  4697. STFSDUX f5, CO2, INC
  4698. STFDUX f6, CO2, INC
  4699. STFSDUX f6, CO2, INC
  4700. STFDUX f7, CO2, INC
  4701. STFSDUX f7, CO2, INC
  4702. #else /* TRMM path: result = alpha * accumulator; C is not read */
  4703. fpmul f0, AP, f0
  4704. fpmul f1, AP, f1
  4705. fpmul f2, AP, f2
  4706. STFDUX f0, CO1, INC
  4707. STFSDUX f0, CO1, INC
  4708. fpmul f3, AP, f3
  4709. STFDUX f1, CO1, INC
  4710. STFSDUX f1, CO1, INC
  4711. fpmul f4, AP, f4
  4712. STFDUX f2, CO1, INC
  4713. STFSDUX f2, CO1, INC
  4714. fpmul f5, AP, f5
  4715. STFDUX f3, CO1, INC
  4716. STFSDUX f3, CO1, INC
  4717. fpmul f6, AP, f6
  4718. STFDUX f4, CO2, INC
  4719. STFSDUX f4, CO2, INC
  4720. fpmul f7, AP, f7
  4721. STFDUX f5, CO2, INC
  4722. STFSDUX f5, CO2, INC
  4723. STFDUX f6, CO2, INC
  4724. STFSDUX f6, CO2, INC
  4725. STFDUX f7, CO2, INC
  4726. STFSDUX f7, CO2, INC
  4727. #endif
  4728. #ifdef TRMMKERNEL
  4729. #if ( defined(LEFT) && defined(TRANSA)) || \
  4730. (!defined(LEFT) && !defined(TRANSA))
  4731. sub TEMP, K, KK /* TEMP = K - KK - (8 if LEFT else 2): k-steps not visited by this tile */
  4732. #ifdef LEFT
  4733. addi TEMP, TEMP, -8
  4734. #else
  4735. addi TEMP, TEMP, -2
  4736. #endif
  4737. slwi r0, TEMP, 3 + BASE_SHIFT
  4738. slwi TEMP, TEMP, 1 + BASE_SHIFT
  4739. add AO, AO, r0 /* move AO and BO past the unvisited k-steps */
  4740. add BO, BO, TEMP
  4741. #endif
  4742. #ifdef LEFT
  4743. addi KK, KK, 8
  4744. #endif
  4745. #endif
  4746. addic. I, I, -1 /* one 8-wide tile done */
  4747. li r0, FZERO
  4748. lfpsx f0, SP, r0 /* reload f0 from FZERO(SP) for the next accumulator init; does not disturb CR0 set by addic. */
  4749. bgt+ .L1051 /* more 8-wide tiles remain (I > 0) */
  4750. .align 4
  4751. .L1060: /* 4x2 tile, run only when bit 2 of M is set; f0,f1 accumulate for CO1 and f2,f3 for CO2 */
  4752. andi. I, M, 4
  4753. beq .L1070 /* (M & 4) == 0: skip */
  4754. #if defined(TRMMKERNEL)
  4755. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  4756. addi BO, B, - 2 * SIZE
  4757. fpmr f1, f0 /* init accumulators from f0 (reloaded from FZERO(SP) by the preceding code) */
  4758. #else /* other TRMM variants: advance AO by KK << (2 + BASE_SHIFT) and BO by KK << (1 + BASE_SHIFT) first */
  4759. slwi TEMP, KK, 2 + BASE_SHIFT
  4760. slwi r0, KK, 1 + BASE_SHIFT
  4761. add AO, AO, TEMP
  4762. add BO, B, r0
  4763. addi BO, BO, - 2 * SIZE
  4764. fpmr f1, f0
  4765. #endif
  4766. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4767. sub TEMP, K, KK /* TRMM trip count: K - KK, KK + 4 (LEFT) or KK + 2 */
  4768. #elif defined(LEFT)
  4769. addi TEMP, KK, 4
  4770. #else
  4771. addi TEMP, KK, 2
  4772. #endif
  4773. fpmr f2, f0
  4774. srawi. r0, TEMP, 2 /* unrolled loop consumes 4 k-steps per pass */
  4775. mtspr CTR, r0
  4776. fpmr f3, f0
  4777. ble .L1064 /* no full pass: remainder only */
  4778. #else /* plain GEMM: trip count is K */
  4779. srawi. r0, K, 2
  4780. fpmr f1, f0
  4781. addi BO, B, - 2 * SIZE
  4782. fpmr f2, f0
  4783. mtspr CTR, r0
  4784. fpmr f3, f0
  4785. ble .L1064
  4786. #endif
  4787. LFPDUX B1, BO, INC2 /* preload all operands of the first pass: B1..B4 and A1..A8 */
  4788. LFPDUX A1, AO, INC2
  4789. LFPDUX A2, AO, INC2
  4790. LFPDUX B2, BO, INC2
  4791. LFPDUX A3, AO, INC2
  4792. LFPDUX A4, AO, INC2
  4793. LFPDUX B3, BO, INC2
  4794. LFPDUX A5, AO, INC2
  4795. LFPDUX A6, AO, INC2
  4796. LFPDUX B4, BO, INC2
  4797. LFPDUX A7, AO, INC2
  4798. LFPDUX A8, AO, INC2
  4799. bdz- .L1063 /* single pass: run only the drain */
  4800. .align 4
  4801. .L1062: /* 4x2 main K loop: 4 k-steps per pass, each operand reloaded for the next pass right after its last use */
  4802. fxcpmadd f0, B1, A1, f0 /* first k-step: B1 against A1/A2 */
  4803. fxcsmadd f2, B1, A1, f2
  4804. LFPDUX A1, AO, INC2
  4805. fxcpmadd f1, B1, A2, f1
  4806. fxcsmadd f3, B1, A2, f3
  4807. LFPDUX A2, AO, INC2
  4808. LFPDUX B1, BO, INC2
  4809. fxcpmadd f0, B2, A3, f0 /* second k-step: B2 against A3/A4 */
  4810. fxcsmadd f2, B2, A3, f2
  4811. LFPDUX A3, AO, INC2
  4812. fxcpmadd f1, B2, A4, f1
  4813. fxcsmadd f3, B2, A4, f3
  4814. LFPDUX A4, AO, INC2
  4815. LFPDUX B2, BO, INC2
  4816. fxcpmadd f0, B3, A5, f0 /* third k-step: B3 against A5/A6 */
  4817. fxcsmadd f2, B3, A5, f2
  4818. LFPDUX A5, AO, INC2
  4819. fxcpmadd f1, B3, A6, f1
  4820. fxcsmadd f3, B3, A6, f3
  4821. LFPDUX A6, AO, INC2
  4822. LFPDUX B3, BO, INC2
  4823. fxcpmadd f0, B4, A7, f0 /* fourth k-step: B4 against A7/A8 */
  4824. fxcsmadd f2, B4, A7, f2
  4825. LFPDUX A7, AO, INC2
  4826. fxcpmadd f1, B4, A8, f1
  4827. fxcsmadd f3, B4, A8, f3
  4828. LFPDUX A8, AO, INC2
  4829. LFPDUX B4, BO, INC2
  4830. bdnz+ .L1062 /* decrement CTR, loop while non-zero */
  4831. .align 4
  4832. .L1063: /* 4x2 drain: final pass of the unrolled loop using the already loaded operands, no further loads */
  4833. fxcpmadd f0, B1, A1, f0
  4834. fxcsmadd f2, B1, A1, f2
  4835. fxcpmadd f1, B1, A2, f1
  4836. fxcsmadd f3, B1, A2, f3
  4837. fxcpmadd f0, B2, A3, f0
  4838. fxcsmadd f2, B2, A3, f2
  4839. fxcpmadd f1, B2, A4, f1
  4840. fxcsmadd f3, B2, A4, f3
  4841. fxcpmadd f0, B3, A5, f0
  4842. fxcsmadd f2, B3, A5, f2
  4843. fxcpmadd f1, B3, A6, f1
  4844. fxcsmadd f3, B3, A6, f3
  4845. fxcpmadd f0, B4, A7, f0
  4846. fxcsmadd f2, B4, A7, f2
  4847. fxcpmadd f1, B4, A8, f1
  4848. fxcsmadd f3, B4, A8, f3
  4849. .align 4
  4850. .L1064: /* 4x2 K remainder: load alpha, then process (trip count & 3) leftover k-steps one at a time */
  4851. lfd AP, ALPHA(SP)
  4852. #ifdef TRMMKERNEL
  4853. fsmfp AP, AP /* NOTE(review): presumably copies alpha into the secondary half for the fpmul write-back -- confirm */
  4854. #endif
  4855. #if defined(TRMMKERNEL)
  4856. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4857. sub TEMP, K, KK /* same trip count as used for the main loop */
  4858. #elif defined(LEFT)
  4859. addi TEMP, KK, 4
  4860. #else
  4861. addi TEMP, KK, 2
  4862. #endif
  4863. andi. TEMP, TEMP, 3
  4864. mtspr CTR, TEMP
  4865. #else
  4866. andi. r0, K, 3
  4867. mtspr CTR, r0
  4868. #endif
  4869. ble+ .L1068 /* no leftover k-steps */
  4870. LFPDUX A1, AO, INC2 /* operands of the first leftover k-step: two A pairs, one B pair */
  4871. LFPDUX B1, BO, INC2
  4872. LFPDUX A2, AO, INC2
  4873. bdz- .L1067 /* exactly one leftover step */
  4874. .align 4
  4875. .L1066: /* one k-step per pass */
  4876. fxcpmadd f0, B1, A1, f0
  4877. fxcsmadd f2, B1, A1, f2
  4878. LFPDUX A1, AO, INC2
  4879. fxcpmadd f1, B1, A2, f1
  4880. fxcsmadd f3, B1, A2, f3
  4881. LFPDUX B1, BO, INC2
  4882. LFPDUX A2, AO, INC2
  4883. bdnz+ .L1066
  4884. .align 4
  4885. .L1067: /* final leftover k-step, no further loads */
  4886. fxcpmadd f0, B1, A1, f0
  4887. fxcsmadd f2, B1, A1, f2
  4888. fxcpmadd f1, B1, A2, f1
  4889. fxcsmadd f3, B1, A2, f3
  4890. .align 4
  4891. .L1068:
  4892. #ifndef TRMMKERNEL
  4893. LFDUX A1, CO1, INC
  4894. LFDUX A2, CO1, INC2
  4895. LFDUX A3, CO2, INC
  4896. LFDUX A4, CO2, INC2
  4897. LFSDUX A1, CO1, INCM1
  4898. LFSDUX A2, CO1, INC2
  4899. LFSDUX A3, CO2, INCM1
  4900. LFSDUX A4, CO2, INC2
  4901. fxcpmadd f0, AP, f0, A1
  4902. fxcpmadd f1, AP, f1, A2
  4903. fxcpmadd f2, AP, f2, A3
  4904. STFDUX f0, CO1, INCM3
  4905. STFSDUX f0, CO1, INC
  4906. fxcpmadd f3, AP, f3, A4
  4907. STFDUX f1, CO1, INC
  4908. STFSDUX f1, CO1, INC
  4909. STFDUX f2, CO2, INCM3
  4910. STFSDUX f2, CO2, INC
  4911. STFDUX f3, CO2, INC
  4912. STFSDUX f3, CO2, INC
  4913. #else
  4914. fpmul f0, AP, f0
  4915. fpmul f1, AP, f1
  4916. fpmul f2, AP, f2
  4917. STFDUX f0, CO1, INC
  4918. STFSDUX f0, CO1, INC
  4919. fpmul f3, AP, f3
  4920. STFDUX f1, CO1, INC
  4921. STFSDUX f1, CO1, INC
  4922. STFDUX f2, CO2, INC
  4923. STFSDUX f2, CO2, INC
  4924. STFDUX f3, CO2, INC
  4925. STFSDUX f3, CO2, INC
  4926. #endif
  4927. #ifdef TRMMKERNEL
  4928. #if ( defined(LEFT) && defined(TRANSA)) || \
  4929. (!defined(LEFT) && !defined(TRANSA))
  4930. sub TEMP, K, KK
  4931. #ifdef LEFT
  4932. addi TEMP, TEMP, -4
  4933. #else
  4934. addi TEMP, TEMP, -2
  4935. #endif
  4936. slwi r0, TEMP, 2 + BASE_SHIFT
  4937. slwi TEMP, TEMP, 1 + BASE_SHIFT
  4938. add AO, AO, r0
  4939. add BO, BO, TEMP
  4940. #endif
  4941. #ifdef LEFT
  4942. addi KK, KK, 4
  4943. #endif
  4944. #endif
  4945. li r0, FZERO
  4946. lfpsx f0, SP, r0
  4947. .align 4
  /* .L1070: tile handled when bit 1 of M is set (two remaining rows). */
  /* Results are written through both CO1 and CO2. If the bit is clear */
  /* control skips to .L1080. On entry f0 holds the value last loaded */
  /* from the FZERO stack slot (presumably 0.0 -- the slot is filled */
  /* outside this excerpt); f1-f3 are seeded from it with fpmr. */
  4948. .L1070:
  4949. andi. I, M, 2
  4950. beq .L1080
  4951. #if defined(TRMMKERNEL)
  /* TRMM: either start BO at B - 2*SIZE, or first skip KK iterations */
  /* in both AO and BO (each offset is KK << (1 + BASE_SHIFT)). */
  4952. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  4953. addi BO, B, - 2 * SIZE
  4954. fpmr f1, f0
  4955. #else
  4956. slwi TEMP, KK, 1 + BASE_SHIFT
  4957. slwi r0, KK, 1 + BASE_SHIFT
  4958. add AO, AO, TEMP
  4959. add BO, B, r0
  4960. addi BO, BO, - 2 * SIZE
  4961. fpmr f1, f0
  4962. #endif
  /* TEMP = iteration count for this tile: K - KK, or KK + 2. */
  /* (Both LEFT and !LEFT add 2 here.) */
  4963. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4964. sub TEMP, K, KK
  4965. #elif defined(LEFT)
  4966. addi TEMP, KK, 2
  4967. #else
  4968. addi TEMP, KK, 2
  4969. #endif
  /* CTR = count / 8 (the main loop below consumes 8 A/B pairs per pass); */
  /* go straight to the remainder code at .L1074 when that is <= 0. */
  4970. srawi. r0, TEMP, 3
  4971. fpmr f2, f0
  4972. mtspr CTR, r0
  4973. fpmr f3, f0
  4974. ble .L1074
  4975. #else
  /* Plain GEMM: BO = B - 2*SIZE, CTR = K / 8. */
  4976. addi BO, B, - 2 * SIZE
  4977. fpmr f1, f0
  4978. srawi. r0, K, 3
  4979. fpmr f2, f0
  4980. mtspr CTR, r0
  4981. fpmr f3, f0
  4982. ble .L1074
  4983. #endif
  /* Preload eight register pairs from AO (A1-A8) and eight from BO */
  /* (B1-B6, then A9 and A10 are used as the 7th and 8th BO registers). */
  /* Every load is update-form and advances its pointer by INC2. */
  4984. LFPDUX A1, AO, INC2
  4985. LFPDUX B1, BO, INC2
  4986. LFPDUX A2, AO, INC2
  4987. LFPDUX B2, BO, INC2
  4988. LFPDUX A3, AO, INC2
  4989. LFPDUX B3, BO, INC2
  4990. LFPDUX A4, AO, INC2
  4991. LFPDUX B4, BO, INC2
  4992. LFPDUX A5, AO, INC2
  4993. LFPDUX B5, BO, INC2
  4994. LFPDUX A6, AO, INC2
  4995. LFPDUX B6, BO, INC2
  4996. LFPDUX A7, AO, INC2
  4997. LFPDUX A9, BO, INC2
  4998. LFPDUX A8, AO, INC2
  4999. LFPDUX A10, BO, INC2
  /* Only one pass requested: skip the steady-state loop and drain. */
  5000. bdz- .L1073
  5001. .align 4
  /* Steady-state loop. Each step issues a primary-scaled (fxcpmadd) and */
  /* a secondary-scaled (fxcsmadd) multiply-add of one AO pair by one BO */
  /* pair, then reloads both operands for the next pass. Steps alternate */
  /* between the accumulator sets f0/f1 and f2/f3. */
  5002. .L1072:
  5003. fxcpmadd f0, B1, A1, f0
  5004. fxcsmadd f1, B1, A1, f1
  5005. LFPDUX A1, AO, INC2
  5006. LFPDUX B1, BO, INC2
  5007. fxcpmadd f2, B2, A2, f2
  5008. fxcsmadd f3, B2, A2, f3
  5009. LFPDUX A2, AO, INC2
  5010. LFPDUX B2, BO, INC2
  5011. fxcpmadd f0, B3, A3, f0
  5012. fxcsmadd f1, B3, A3, f1
  5013. LFPDUX A3, AO, INC2
  5014. LFPDUX B3, BO, INC2
  5015. fxcpmadd f2, B4, A4, f2
  5016. fxcsmadd f3, B4, A4, f3
  5017. LFPDUX A4, AO, INC2
  5018. LFPDUX B4, BO, INC2
  5019. fxcpmadd f0, B5, A5, f0
  5020. fxcsmadd f1, B5, A5, f1
  5021. LFPDUX A5, AO, INC2
  5022. LFPDUX B5, BO, INC2
  5023. fxcpmadd f2, B6, A6, f2
  5024. fxcsmadd f3, B6, A6, f3
  5025. LFPDUX A6, AO, INC2
  5026. LFPDUX B6, BO, INC2
  5027. fxcpmadd f0, A9, A7, f0
  5028. fxcsmadd f1, A9, A7, f1
  5029. LFPDUX A7, AO, INC2
  5030. LFPDUX A9, BO, INC2
  5031. fxcpmadd f2, A10, A8, f2
  5032. fxcsmadd f3, A10, A8, f3
  5033. LFPDUX A8, AO, INC2
  5034. LFPDUX A10, BO, INC2
  5035. bdnz+ .L1072
  5036. .align 4
  /* Drain: consume the eight pairs already in registers, no more loads. */
  5037. .L1073:
  5038. fxcpmadd f0, B1, A1, f0
  5039. fxcsmadd f1, B1, A1, f1
  5040. fxcpmadd f2, B2, A2, f2
  5041. fxcsmadd f3, B2, A2, f3
  5042. fxcpmadd f0, B3, A3, f0
  5043. fxcsmadd f1, B3, A3, f1
  5044. fxcpmadd f2, B4, A4, f2
  5045. fxcsmadd f3, B4, A4, f3
  5046. fxcpmadd f0, B5, A5, f0
  5047. fxcsmadd f1, B5, A5, f1
  5048. fxcpmadd f2, B6, A6, f2
  5049. fxcsmadd f3, B6, A6, f3
  5050. fxcpmadd f0, A9, A7, f0
  5051. fxcsmadd f1, A9, A7, f1
  5052. fxcpmadd f2, A10, A8, f2
  5053. fxcsmadd f3, A10, A8, f3
  5054. .align 4
  /* Remainder: load ALPHA from the stack frame into AP, then run the */
  /* leftover (count & 7) iterations one pair at a time. */
  5055. .L1074:
  5056. lfd AP, ALPHA(SP)
  5057. #ifdef TRMMKERNEL
  /* TRMM stores use fpmul on both halves, so AP's primary half is */
  /* copied into its secondary half here. */
  5058. fsmfp AP, AP
  5059. #endif
  5060. #if defined(TRMMKERNEL)
  5061. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5062. sub TEMP, K, KK
  5063. #elif defined(LEFT)
  5064. addi TEMP, KK, 2
  5065. #else
  5066. addi TEMP, KK, 2
  5067. #endif
  5068. andi. TEMP, TEMP, 7
  5069. mtspr CTR, TEMP
  5070. #else
  5071. andi. r0, K, 7
  5072. mtspr CTR, r0
  5073. #endif
  /* Nothing left over: go directly to the store sequence. */
  5074. ble+ .L1078
  5075. LFPDUX A1, AO, INC2
  5076. LFPDUX B1, BO, INC2
  5077. bdz- .L1077
  5078. .align 4
  5079. .L1076:
  5080. fxcpmadd f0, B1, A1, f0
  5081. fxcsmadd f1, B1, A1, f1
  5082. LFPDUX A1, AO, INC2
  5083. LFPDUX B1, BO, INC2
  5084. bdnz+ .L1076
  5085. .align 4
  /* Final leftover iteration (operands already loaded). */
  5086. .L1077:
  5087. fxcpmadd f0, B1, A1, f0
  5088. fxcsmadd f1, B1, A1, f1
  5089. .align 4
  /* Store. f2 is folded into f0 and f3 into f1 first. */
  5090. .L1078:
  5091. #ifndef TRMMKERNEL
  /* GEMM: read the current C values (primary then secondary half, each */
  /* load stepping the pointer by INC), compute AP * acc + C, step the */
  /* pointer by INCM1 (presumably back one element -- defined outside */
  /* this excerpt) and write both halves. */
  5092. LFDUX A1, CO1, INC
  5093. LFDUX B3, CO2, INC
  5094. LFSDUX A1, CO1, INC
  5095. LFSDUX B3, CO2, INC
  5096. fpadd f0, f0, f2
  5097. fpadd f1, f1, f3
  5098. fxcpmadd f0, AP, f0, A1
  5099. fxcpmadd f1, AP, f1, B3
  5100. STFDUX f0, CO1, INCM1
  5101. STFSDUX f0, CO1, INC
  5102. STFDUX f1, CO2, INCM1
  5103. STFSDUX f1, CO2, INC
  5104. #else
  /* TRMM: C is not read; the scaled accumulators are written directly. */
  5105. fpadd f0, f0, f2
  5106. fpadd f1, f1, f3
  5107. fpmul f0, AP, f0
  5108. fpmul f1, AP, f1
  5109. STFDUX f0, CO1, INC
  5110. STFSDUX f0, CO1, INC
  5111. STFDUX f1, CO2, INC
  5112. STFSDUX f1, CO2, INC
  5113. #endif
  5114. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: advance AO and BO past the K - KK - 2 iterations */
  /* this tile did not execute, and (LEFT) bump KK by 2. */
  5115. #if ( defined(LEFT) && defined(TRANSA)) || \
  5116. (!defined(LEFT) && !defined(TRANSA))
  5117. sub TEMP, K, KK
  5118. #ifdef LEFT
  5119. addi TEMP, TEMP, -2
  5120. #else
  5121. addi TEMP, TEMP, -2
  5122. #endif
  5123. slwi r0, TEMP, 1 + BASE_SHIFT
  5124. slwi TEMP, TEMP, 1 + BASE_SHIFT
  5125. add AO, AO, r0
  5126. add BO, BO, TEMP
  5127. #endif
  5128. #ifdef LEFT
  5129. addi KK, KK, 2
  5130. #endif
  5131. #endif
  /* Reload f0 from the FZERO stack slot for the next tile. */
  5132. li r0, FZERO
  5133. lfpsx f0, SP, r0
  5134. .align 4
  /* .L1080: tile handled when bit 0 of M is set (one remaining row). */
  /* Output goes to one element via CO1 and one via CO2. Here the roles */
  /* of the multiply-add operands are swapped relative to the wider */
  /* tiles: the AO register supplies the scalar half (primary for */
  /* fxcpmadd, secondary for fxcsmadd) and the BO register is the pair. */
  5135. .L1080:
  5136. andi. I, M, 1
  5137. beq .L1089
  5138. #if defined(TRMMKERNEL)
  /* TRMM pointer setup: AO skips KK << BASE_SHIFT bytes, BO skips */
  /* KK << (1 + BASE_SHIFT) bytes, or BO simply starts at B - 2*SIZE. */
  5139. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  5140. addi BO, B, - 2 * SIZE
  5141. fpmr f1, f0
  5142. fpmr f2, f0
  5143. fpmr f3, f0
  5144. #else
  5145. slwi TEMP, KK, 0 + BASE_SHIFT
  5146. slwi r0, KK, 1 + BASE_SHIFT
  5147. add AO, AO, TEMP
  5148. add BO, B, r0
  5149. addi BO, BO, - 2 * SIZE
  5150. fpmr f1, f0
  5151. fpmr f2, f0
  5152. fpmr f3, f0
  5153. #endif
  /* TEMP = iteration count: K - KK, KK + 1 (LEFT) or KK + 2 (!LEFT). */
  5154. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5155. sub TEMP, K, KK
  5156. #elif defined(LEFT)
  5157. addi TEMP, KK, 1
  5158. #else
  5159. addi TEMP, KK, 2
  5160. #endif
  /* CTR = count / 8; skip to the remainder code when <= 0. */
  5161. srawi. r0, TEMP, 3
  5162. mtspr CTR, r0
  5163. ble .L1084
  5164. #else
  5165. addi BO, B, - 2 * SIZE
  5166. fpmr f1, f0
  5167. fpmr f2, f0
  5168. fpmr f3, f0
  5169. srawi. r0, K, 3
  5170. mtspr CTR, r0
  5171. ble .L1084
  5172. #endif
  /* Preload four AO pairs (A1-A4) and the first four BO pairs (B1-B4). */
  5173. LFPDUX B1, BO, INC2
  5174. LFPDUX A1, AO, INC2
  5175. LFPDUX A2, AO, INC2
  5176. LFPDUX B2, BO, INC2
  5177. LFPDUX A3, AO, INC2
  5178. LFPDUX A4, AO, INC2
  5179. LFPDUX B3, BO, INC2
  5180. LFPDUX B4, BO, INC2
  5181. bdz- .L1083
  5182. .align 4
  /* Steady-state loop: per pass, four AO pairs and eight BO pairs are */
  /* consumed. Each AO register feeds two multiply-adds (primary half, */
  /* then secondary half) against two different BO pairs, and B1-B4 are */
  /* each reloaded twice per pass. */
  5183. .L1082:
  5184. fxcpmadd f0, A1, B1, f0
  5185. LFPDUX B1, BO, INC2
  5186. fxcsmadd f1, A1, B2, f1
  5187. LFPDUX B2, BO, INC2
  5188. LFPDUX A1, AO, INC2
  5189. fxcpmadd f2, A2, B3, f2
  5190. LFPDUX B3, BO, INC2
  5191. fxcsmadd f3, A2, B4, f3
  5192. LFPDUX B4, BO, INC2
  5193. LFPDUX A2, AO, INC2
  5194. fxcpmadd f0, A3, B1, f0
  5195. LFPDUX B1, BO, INC2
  5196. fxcsmadd f1, A3, B2, f1
  5197. LFPDUX B2, BO, INC2
  5198. LFPDUX A3, AO, INC2
  5199. fxcpmadd f2, A4, B3, f2
  5200. LFPDUX B3, BO, INC2
  5201. fxcsmadd f3, A4, B4, f3
  5202. LFPDUX B4, BO, INC2
  5203. LFPDUX A4, AO, INC2
  5204. bdnz+ .L1082
  5205. .align 4
  /* Drain: the first half still needs four more BO loads (for A3/A4); */
  /* the second half runs from registers only. */
  5206. .L1083:
  5207. fxcpmadd f0, A1, B1, f0
  5208. LFPDUX B1, BO, INC2
  5209. fxcsmadd f1, A1, B2, f1
  5210. LFPDUX B2, BO, INC2
  5211. fxcpmadd f2, A2, B3, f2
  5212. LFPDUX B3, BO, INC2
  5213. fxcsmadd f3, A2, B4, f3
  5214. LFPDUX B4, BO, INC2
  5215. fxcpmadd f0, A3, B1, f0
  5216. fxcsmadd f1, A3, B2, f1
  5217. fxcpmadd f2, A4, B3, f2
  5218. fxcsmadd f3, A4, B4, f3
  5219. .align 4
  /* Remainder: load ALPHA into AP and run the leftover (count & 7) */
  /* iterations one at a time. */
  5220. .L1084:
  5221. lfd AP, ALPHA(SP)
  5222. #ifdef TRMMKERNEL
  5223. fsmfp AP, AP
  5224. #endif
  5225. #if defined(TRMMKERNEL)
  5226. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5227. sub TEMP, K, KK
  5228. #elif defined(LEFT)
  5229. addi TEMP, KK, 1
  5230. #else
  5231. addi TEMP, KK, 2
  5232. #endif
  5233. andi. TEMP, TEMP, 7
  5234. mtspr CTR, TEMP
  5235. #else
  5236. andi. r0, K, 7
  5237. mtspr CTR, r0
  5238. #endif
  5239. ble+ .L1088
  /* One AO element is read with a non-update load at AO + INC2, then */
  /* AO is advanced by INC separately; BO still advances a full pair. */
  5240. LFDX A1, AO, INC2
  5241. LFPDUX B1, BO, INC2
  5242. add AO, AO, INC
  5243. bdz- .L1087
  5244. .align 4
  5245. .L1086:
  5246. fxcpmadd f0, A1, B1, f0
  5247. LFDX A1, AO, INC2
  5248. LFPDUX B1, BO, INC2
  5249. add AO, AO, INC
  5250. bdnz+ .L1086
  5251. .align 4
  5252. .L1087:
  5253. fxcpmadd f0, A1, B1, f0
  5254. .align 4
  /* Store: reduce f0+f1+f2+f3 into f0, scale by AP, then write the */
  /* primary half through CO1 and the secondary half through CO2. */
  5255. .L1088:
  5256. #ifndef TRMMKERNEL
  /* GEMM: fetch one C element via CO1 and one via CO2 (non-update */
  /* loads); fsmfp packs the CO2 value into the secondary half of A1 so */
  /* a single fxcpmadd adds both. */
  5257. LFDX A1, CO1, INC
  5258. LFDX A2, CO2, INC
  5259. fpadd f0, f0, f1
  5260. fpadd f2, f2, f3
  5261. fsmfp A1, A2
  5262. fpadd f0, f0, f2
  5263. fxcpmadd f0, AP, f0, A1
  5264. #else
  /* NOTE(review): in this TRMM branch A1/A2 are not loaded from C and */
  /* A1 is not read by the fpmul below, so the fsmfp looks redundant -- */
  /* confirm before removing. */
  5265. fpadd f0, f0, f1
  5266. fpadd f2, f2, f3
  5267. fsmfp A1, A2
  5268. fpadd f0, f0, f2
  5269. fpmul f0, AP, f0
  5270. #endif
  5271. STFDUX f0, CO1, INC
  5272. STFSDUX f0, CO2, INC
  5273. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: skip the iterations not executed by this tile */
  /* (AO by TEMP << BASE_SHIFT, BO by TEMP << (1 + BASE_SHIFT)) and, */
  /* for LEFT, bump KK by 1. */
  5274. #if ( defined(LEFT) && defined(TRANSA)) || \
  5275. (!defined(LEFT) && !defined(TRANSA))
  5276. sub TEMP, K, KK
  5277. #ifdef LEFT
  5278. addi TEMP, TEMP, -1
  5279. #else
  5280. addi TEMP, TEMP, -2
  5281. #endif
  5282. slwi r0, TEMP, 0 + BASE_SHIFT
  5283. slwi TEMP, TEMP, 1 + BASE_SHIFT
  5284. add AO, AO, r0
  5285. add BO, BO, TEMP
  5286. #endif
  5287. #ifdef LEFT
  5288. addi KK, KK, 1
  5289. #endif
  5290. #endif
  5291. .align 4
  /* .L1089: bookkeeping after the last row tile above. For TRMM without */
  /* LEFT, KK advances by 2. B is then set to BO + 2*SIZE (BO is kept */
  /* biased by -2*SIZE while it is being walked). */
  5292. .L1089:
  5293. #if defined(TRMMKERNEL) && !defined(LEFT)
  5294. addi KK, KK, 2
  5295. #endif
  5296. addi B, BO, 2 * SIZE
  5297. .align 4
  /* .L1090: entry to the code that runs when bit 0 of N is set. If the */
  /* bit is clear, control goes straight to the epilogue at .L10999. */
  /* Otherwise: (TRMM, LEFT) KK is reset to OFFSET, CO1 is set to C, AO */
  /* to A - 2*SIZE, f0 is reloaded from the FZERO stack slot, and */
  /* I = M / 8 counts the 8-row tiles (none -> .L10100). */
  5298. .L1090:
  5299. andi. J, N, 1
  5300. beq .L10999
  5301. #if defined(TRMMKERNEL) && defined(LEFT)
  5302. mr KK, OFFSET
  5303. #endif
  5304. mr CO1, C
  5305. addi AO, A, -2 * SIZE
  5306. li r0, FZERO
  5307. lfpsx f0, SP, r0
  5308. srawi. I, M, 3
  5309. ble .L10100
  5310. .align 4
  /* .L1091: loop over 8-row tiles, repeated I times, writing through */
  /* CO1 only. Accumulators f0-f3 each hold one register pair of the */
  /* result. One BO pair supplies two scalars: its primary half scales */
  /* A1-A4 (fxcpmadd) and its secondary half scales A5-A8 (fxcsmadd). */
  5311. .L1091:
  5312. #if defined(TRMMKERNEL)
  /* TRMM pointer setup: BO = B - 2*SIZE, or skip KK iterations */
  /* (AO by KK << (3 + BASE_SHIFT), BO by KK << BASE_SHIFT). */
  5313. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  5314. addi BO, B, - 2 * SIZE
  5315. fpmr f1, f0
  5316. #else
  5317. slwi TEMP, KK, 3 + BASE_SHIFT
  5318. slwi r0, KK, 0 + BASE_SHIFT
  5319. add AO, AO, TEMP
  5320. add BO, B, r0
  5321. addi BO, BO, - 2 * SIZE
  5322. fpmr f1, f0
  5323. #endif
  /* TEMP = iteration count: K - KK, KK + 8 (LEFT) or KK + 1 (!LEFT). */
  5324. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5325. sub TEMP, K, KK
  5326. #elif defined(LEFT)
  5327. addi TEMP, KK, 8
  5328. #else
  5329. addi TEMP, KK, 1
  5330. #endif
  /* CTR = count / 4: each pass of the main loop uses two BO pairs. */
  5331. fpmr f2, f0
  5332. srawi. r0, TEMP, 2
  5333. fpmr f3, f0
  5334. mtspr CTR, r0
  5335. ble .L1094
  5336. #else
  5337. srawi. r0, K, 2
  5338. fpmr f1, f0
  5339. addi BO, B, - 2 * SIZE
  5340. fpmr f2, f0
  5341. fpmr f3, f0
  5342. mtspr CTR, r0
  5343. ble .L1094
  5344. #endif
  /* Preload B1, B2 and eight AO pairs A1-A8. */
  5345. LFPDUX B1, BO, INC2
  5346. LFPDUX A1, AO, INC2
  5347. LFPDUX A2, AO, INC2
  5348. LFPDUX A3, AO, INC2
  5349. LFPDUX A4, AO, INC2
  5350. LFPDUX B2, BO, INC2
  5351. LFPDUX A5, AO, INC2
  5352. LFPDUX A6, AO, INC2
  5353. LFPDUX A7, AO, INC2
  5354. LFPDUX A8, AO, INC2
  5355. bdz- .L1093
  5356. .align 4
  /* Steady-state loop: every multiply-add is followed by the reload of */
  /* the AO register it just consumed; B1 and B2 are refreshed after */
  /* their eighth use. */
  5357. .L1092:
  5358. fxcpmadd f0, B1, A1, f0
  5359. LFPDUX A1, AO, INC2
  5360. fxcpmadd f1, B1, A2, f1
  5361. LFPDUX A2, AO, INC2
  5362. fxcpmadd f2, B1, A3, f2
  5363. LFPDUX A3, AO, INC2
  5364. fxcpmadd f3, B1, A4, f3
  5365. LFPDUX A4, AO, INC2
  5366. fxcsmadd f0, B1, A5, f0
  5367. LFPDUX A5, AO, INC2
  5368. fxcsmadd f1, B1, A6, f1
  5369. LFPDUX A6, AO, INC2
  5370. fxcsmadd f2, B1, A7, f2
  5371. LFPDUX A7, AO, INC2
  5372. fxcsmadd f3, B1, A8, f3
  5373. LFPDUX A8, AO, INC2
  5374. LFPDUX B1, BO, INC2
  5375. fxcpmadd f0, B2, A1, f0
  5376. LFPDUX A1, AO, INC2
  5377. fxcpmadd f1, B2, A2, f1
  5378. LFPDUX A2, AO, INC2
  5379. fxcpmadd f2, B2, A3, f2
  5380. LFPDUX A3, AO, INC2
  5381. fxcpmadd f3, B2, A4, f3
  5382. LFPDUX A4, AO, INC2
  5383. fxcsmadd f0, B2, A5, f0
  5384. LFPDUX A5, AO, INC2
  5385. fxcsmadd f1, B2, A6, f1
  5386. LFPDUX A6, AO, INC2
  5387. fxcsmadd f2, B2, A7, f2
  5388. LFPDUX A7, AO, INC2
  5389. fxcsmadd f3, B2, A8, f3
  5390. LFPDUX A8, AO, INC2
  5391. LFPDUX B2, BO, INC2
  5392. bdnz+ .L1092
  5393. .align 4
  /* Drain: the B1 half still reloads A1-A8 for the B2 half, which then */
  /* runs from registers only. */
  5394. .L1093:
  5395. fxcpmadd f0, B1, A1, f0
  5396. LFPDUX A1, AO, INC2
  5397. fxcpmadd f1, B1, A2, f1
  5398. LFPDUX A2, AO, INC2
  5399. fxcpmadd f2, B1, A3, f2
  5400. LFPDUX A3, AO, INC2
  5401. fxcpmadd f3, B1, A4, f3
  5402. LFPDUX A4, AO, INC2
  5403. fxcsmadd f0, B1, A5, f0
  5404. LFPDUX A5, AO, INC2
  5405. fxcsmadd f1, B1, A6, f1
  5406. LFPDUX A6, AO, INC2
  5407. fxcsmadd f2, B1, A7, f2
  5408. LFPDUX A7, AO, INC2
  5409. fxcsmadd f3, B1, A8, f3
  5410. LFPDUX A8, AO, INC2
  5411. fxcpmadd f0, B2, A1, f0
  5412. fxcpmadd f1, B2, A2, f1
  5413. fxcpmadd f2, B2, A3, f2
  5414. fxcpmadd f3, B2, A4, f3
  5415. fxcsmadd f0, B2, A5, f0
  5416. fxcsmadd f1, B2, A6, f1
  5417. fxcsmadd f2, B2, A7, f2
  5418. fxcsmadd f3, B2, A8, f3
  5419. .align 4
  /* Remainder: load ALPHA into AP, then run the leftover (count & 3) */
  /* iterations. */
  5420. .L1094:
  5421. lfd AP, ALPHA(SP)
  5422. #ifdef TRMMKERNEL
  5423. fsmfp AP, AP
  5424. #endif
  5425. #if defined(TRMMKERNEL)
  5426. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5427. sub TEMP, K, KK
  5428. #elif defined(LEFT)
  5429. addi TEMP, KK, 8
  5430. #else
  5431. addi TEMP, KK, 1
  5432. #endif
  5433. andi. TEMP, TEMP, 3
  5434. mtspr CTR, TEMP
  5435. #else
  5436. andi. r0, K, 3
  5437. mtspr CTR, r0
  5438. #endif
  5439. ble+ .L1098
  /* A single BO element is read with a non-update load at BO + INC2; */
  /* BO is then advanced by INC with an explicit add. */
  5440. LFDX B1, BO, INC2
  5441. LFPDUX A1, AO, INC2
  5442. LFPDUX A2, AO, INC2
  5443. LFPDUX A3, AO, INC2
  5444. LFPDUX A4, AO, INC2
  5445. add BO, BO, INC
  5446. bdz- .L1097
  5447. .align 4
  5448. .L1096:
  5449. fxcpmadd f0, B1, A1, f0
  5450. LFPDUX A1, AO, INC2
  5451. fxcpmadd f1, B1, A2, f1
  5452. LFPDUX A2, AO, INC2
  5453. fxcpmadd f2, B1, A3, f2
  5454. LFPDUX A3, AO, INC2
  5455. fxcpmadd f3, B1, A4, f3
  5456. LFDX B1, BO, INC2
  5457. LFPDUX A4, AO, INC2
  5458. add BO, BO, INC
  5459. bdnz+ .L1096
  5460. .align 4
  5461. .L1097:
  5462. fxcpmadd f0, B1, A1, f0
  5463. fxcpmadd f1, B1, A2, f1
  5464. fxcpmadd f2, B1, A3, f2
  5465. fxcpmadd f3, B1, A4, f3
  5466. .align 4
  /* Store eight results through CO1. */
  5467. .L1098:
  5468. #ifndef TRMMKERNEL
  /* GEMM: the primary halves of A1/B1/A3/A5 are loaded first (pointer */
  /* stepping by INC, then INC2 three times), CO1 is stepped by INCM5, */
  /* and the secondary halves are loaded the same way. After */
  /* AP * acc + C is formed, CO1 is stepped by INCM7 for the first */
  /* store. (INCM5/INCM7 are set up outside this excerpt; presumably */
  /* -5 and -7 elements.) */
  5469. LFDUX A1, CO1, INC
  5470. LFDUX B1, CO1, INC2
  5471. LFDUX A3, CO1, INC2
  5472. LFDUX A5, CO1, INC2
  5473. LFSDUX A1, CO1, INCM5
  5474. LFSDUX B1, CO1, INC2
  5475. LFSDUX A3, CO1, INC2
  5476. LFSDUX A5, CO1, INC2
  5477. fxcpmadd f0, AP, f0, A1
  5478. fxcpmadd f1, AP, f1, B1
  5479. fxcpmadd f2, AP, f2, A3
  5480. STFDUX f0, CO1, INCM7
  5481. STFSDUX f0, CO1, INC
  5482. fxcpmadd f3, AP, f3, A5
  5483. #else
  /* TRMM: scale by AP only; C is not read. */
  5484. fpmul f0, AP, f0
  5485. fpmul f1, AP, f1
  5486. fpmul f2, AP, f2
  5487. STFDUX f0, CO1, INC
  5488. STFSDUX f0, CO1, INC
  5489. fpmul f3, AP, f3
  5490. #endif
  /* Common tail: write f1, f2, f3 (primary then secondary half each). */
  5491. STFDUX f1, CO1, INC
  5492. STFSDUX f1, CO1, INC
  5493. STFDUX f2, CO1, INC
  5494. STFSDUX f2, CO1, INC
  5495. STFDUX f3, CO1, INC
  5496. STFSDUX f3, CO1, INC
  5497. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: skip the iterations this tile did not execute */
  /* and, for LEFT, bump KK by 8. */
  5498. #if ( defined(LEFT) && defined(TRANSA)) || \
  5499. (!defined(LEFT) && !defined(TRANSA))
  5500. sub TEMP, K, KK
  5501. #ifdef LEFT
  5502. addi TEMP, TEMP, -8
  5503. #else
  5504. addi TEMP, TEMP, -1
  5505. #endif
  5506. slwi r0, TEMP, 3 + BASE_SHIFT
  5507. slwi TEMP, TEMP, 0 + BASE_SHIFT
  5508. add AO, AO, r0
  5509. add BO, BO, TEMP
  5510. #endif
  5511. #ifdef LEFT
  5512. addi KK, KK, 8
  5513. #endif
  5514. #endif
  /* Next 8-row tile: decrement I, reload f0 from the FZERO slot, and */
  /* loop while I > 0 (the condition comes from addic.). */
  5515. addic. I, I, -1
  5516. li r0, FZERO
  5517. lfpsx f0, SP, r0
  5518. bgt+ .L1091
  5519. .align 4
  /* .L10100: tile handled when bit 2 of M is set (four rows), writing */
  /* through CO1 only. f0/f1 accumulate products scaled by the primary */
  /* half of each BO pair, f2/f3 those scaled by the secondary half; */
  /* the two sets are added together before the store. */
  5520. .L10100:
  5521. andi. I, M, 4
  5522. beq .L10110
  5523. #if defined(TRMMKERNEL)
  /* TRMM pointer setup: BO = B - 2*SIZE, or skip KK iterations */
  /* (AO by KK << (2 + BASE_SHIFT), BO by KK << BASE_SHIFT). */
  5524. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  5525. addi BO, B, - 2 * SIZE
  5526. fpmr f1, f0
  5527. fpmr f2, f0
  5528. fpmr f3, f0
  5529. #else
  5530. slwi TEMP, KK, 2 + BASE_SHIFT
  5531. slwi r0, KK, 0 + BASE_SHIFT
  5532. add AO, AO, TEMP
  5533. add BO, B, r0
  5534. fpmr f1, f0
  5535. addi BO, BO, - 2 * SIZE
  5536. fpmr f2, f0
  5537. fpmr f3, f0
  5538. #endif
  /* TEMP = iteration count: K - KK, KK + 4 (LEFT) or KK + 1 (!LEFT). */
  5539. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5540. sub TEMP, K, KK
  5541. #elif defined(LEFT)
  5542. addi TEMP, KK, 4
  5543. #else
  5544. addi TEMP, KK, 1
  5545. #endif
  /* CTR = count / 8: one pass of the main loop uses four BO pairs. */
  5546. srawi. r0, TEMP, 3
  5547. mtspr CTR, r0
  5548. ble .L10104
  5549. #else
  5550. addi BO, B, - 2 * SIZE
  5551. fpmr f1, f0
  5552. fpmr f2, f0
  5553. fpmr f3, f0
  5554. srawi. r0, K, 3
  5555. mtspr CTR, r0
  5556. ble .L10104
  5557. #endif
  /* Preload B1-B4 and eight AO pairs A1-A8. */
  5558. LFPDUX B1, BO, INC2
  5559. LFPDUX A1, AO, INC2
  5560. LFPDUX A2, AO, INC2
  5561. LFPDUX A3, AO, INC2
  5562. LFPDUX A4, AO, INC2
  5563. LFPDUX B2, BO, INC2
  5564. LFPDUX A5, AO, INC2
  5565. LFPDUX A6, AO, INC2
  5566. LFPDUX A7, AO, INC2
  5567. LFPDUX A8, AO, INC2
  5568. LFPDUX B3, BO, INC2
  5569. LFPDUX B4, BO, INC2
  5570. bdz- .L10103
  5571. .align 4
  /* Steady-state loop: for each BO pair, two fxcpmadd use its primary */
  /* half and two fxcsmadd use its secondary half; every consumed */
  /* register is reloaded immediately for the next pass. */
  5572. .L10102:
  5573. fxcpmadd f0, B1, A1, f0
  5574. LFPDUX A1, AO, INC2
  5575. fxcpmadd f1, B1, A2, f1
  5576. LFPDUX A2, AO, INC2
  5577. fxcsmadd f2, B1, A3, f2
  5578. LFPDUX A3, AO, INC2
  5579. fxcsmadd f3, B1, A4, f3
  5580. LFPDUX A4, AO, INC2
  5581. LFPDUX B1, BO, INC2
  5582. fxcpmadd f0, B2, A5, f0
  5583. LFPDUX A5, AO, INC2
  5584. fxcpmadd f1, B2, A6, f1
  5585. LFPDUX A6, AO, INC2
  5586. fxcsmadd f2, B2, A7, f2
  5587. LFPDUX A7, AO, INC2
  5588. fxcsmadd f3, B2, A8, f3
  5589. LFPDUX A8, AO, INC2
  5590. LFPDUX B2, BO, INC2
  5591. fxcpmadd f0, B3, A1, f0
  5592. LFPDUX A1, AO, INC2
  5593. fxcpmadd f1, B3, A2, f1
  5594. LFPDUX A2, AO, INC2
  5595. fxcsmadd f2, B3, A3, f2
  5596. LFPDUX A3, AO, INC2
  5597. fxcsmadd f3, B3, A4, f3
  5598. LFPDUX A4, AO, INC2
  5599. LFPDUX B3, BO, INC2
  5600. fxcpmadd f0, B4, A5, f0
  5601. LFPDUX A5, AO, INC2
  5602. fxcpmadd f1, B4, A6, f1
  5603. LFPDUX A6, AO, INC2
  5604. fxcsmadd f2, B4, A7, f2
  5605. LFPDUX A7, AO, INC2
  5606. fxcsmadd f3, B4, A8, f3
  5607. LFPDUX A8, AO, INC2
  5608. LFPDUX B4, BO, INC2
  5609. bdnz+ .L10102
  5610. .align 4
  /* Drain: the B1/B2 half reloads A1-A8 for the B3/B4 half, which runs */
  /* from registers only. */
  5611. .L10103:
  5612. fxcpmadd f0, B1, A1, f0
  5613. LFPDUX A1, AO, INC2
  5614. fxcpmadd f1, B1, A2, f1
  5615. LFPDUX A2, AO, INC2
  5616. fxcsmadd f2, B1, A3, f2
  5617. LFPDUX A3, AO, INC2
  5618. fxcsmadd f3, B1, A4, f3
  5619. LFPDUX A4, AO, INC2
  5620. fxcpmadd f0, B2, A5, f0
  5621. LFPDUX A5, AO, INC2
  5622. fxcpmadd f1, B2, A6, f1
  5623. LFPDUX A6, AO, INC2
  5624. fxcsmadd f2, B2, A7, f2
  5625. LFPDUX A7, AO, INC2
  5626. fxcsmadd f3, B2, A8, f3
  5627. LFPDUX A8, AO, INC2
  5628. fxcpmadd f0, B3, A1, f0
  5629. fxcpmadd f1, B3, A2, f1
  5630. fxcsmadd f2, B3, A3, f2
  5631. fxcsmadd f3, B3, A4, f3
  5632. fxcpmadd f0, B4, A5, f0
  5633. fxcpmadd f1, B4, A6, f1
  5634. fxcsmadd f2, B4, A7, f2
  5635. fxcsmadd f3, B4, A8, f3
  5636. .align 4
  /* Remainder: load ALPHA into AP, then run the leftover (count & 7) */
  /* iterations using only the primary half of B1. */
  5637. .L10104:
  5638. lfd AP, ALPHA(SP)
  5639. #ifdef TRMMKERNEL
  5640. fsmfp AP, AP
  5641. #endif
  5642. #if defined(TRMMKERNEL)
  5643. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5644. sub TEMP, K, KK
  5645. #elif defined(LEFT)
  5646. addi TEMP, KK, 4
  5647. #else
  5648. addi TEMP, KK, 1
  5649. #endif
  5650. andi. TEMP, TEMP, 7
  5651. mtspr CTR, TEMP
  5652. #else
  5653. andi. r0, K, 7
  5654. mtspr CTR, r0
  5655. #endif
  5656. ble+ .L10108
  5657. LFPDUX A1, AO, INC2
  5658. LFDX B1, BO, INC2
  5659. LFPDUX A2, AO, INC2
  5660. add BO, BO, INC
  5661. bdz- .L10107
  5662. .align 4
  5663. .L10106:
  5664. fxcpmadd f0, B1, A1, f0
  5665. LFPDUX A1, AO, INC2
  5666. fxcpmadd f1, B1, A2, f1
  5667. LFDX B1, BO, INC2
  5668. LFPDUX A2, AO, INC2
  5669. add BO, BO, INC
  5670. bdnz+ .L10106
  5671. .align 4
  5672. .L10107:
  5673. fxcpmadd f0, B1, A1, f0
  5674. fxcpmadd f1, B1, A2, f1
  5675. .align 4
  /* Store four results through CO1 after folding f2 into f0 and f3 */
  /* into f1. */
  5676. .L10108:
  5677. #ifndef TRMMKERNEL
  /* GEMM: load the existing C values into A1/B1 (primary halves, then */
  /* secondary halves after stepping CO1 by INCM1), form AP * acc + C, */
  /* and step CO1 by INCM3 for the first store. */
  5678. LFDUX A1, CO1, INC
  5679. LFDUX B1, CO1, INC2
  5680. LFSDUX A1, CO1, INCM1
  5681. LFSDUX B1, CO1, INC2
  5682. fpadd f0, f0, f2
  5683. fpadd f1, f1, f3
  5684. fxcpmadd f0, AP, f0, A1
  5685. fxcpmadd f1, AP, f1, B1
  5686. STFDUX f0, CO1, INCM3
  5687. STFSDUX f0, CO1, INC
  5688. #else
  /* TRMM: scale by AP only; C is not read. */
  5689. fpadd f0, f0, f2
  5690. fpadd f1, f1, f3
  5691. fpmul f0, AP, f0
  5692. fpmul f1, AP, f1
  5693. STFDUX f0, CO1, INC
  5694. STFSDUX f0, CO1, INC
  5695. #endif
  5696. STFDUX f1, CO1, INC
  5697. STFSDUX f1, CO1, INC
  5698. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: skip the iterations this tile did not execute */
  /* and, for LEFT, bump KK by 4. */
  5699. #if ( defined(LEFT) && defined(TRANSA)) || \
  5700. (!defined(LEFT) && !defined(TRANSA))
  5701. sub TEMP, K, KK
  5702. #ifdef LEFT
  5703. addi TEMP, TEMP, -4
  5704. #else
  5705. addi TEMP, TEMP, -1
  5706. #endif
  5707. slwi r0, TEMP, 2 + BASE_SHIFT
  5708. slwi TEMP, TEMP, 0 + BASE_SHIFT
  5709. add AO, AO, r0
  5710. add BO, BO, TEMP
  5711. #endif
  5712. #ifdef LEFT
  5713. addi KK, KK, 4
  5714. #endif
  5715. #endif
  /* Reload f0 from the FZERO stack slot for the next tile. */
  5716. li r0, FZERO
  5717. lfpsx f0, SP, r0
  5718. .align 4
  /* .L10110: tile handled when bit 1 of M is set (two rows), writing */
  /* through CO1 only. Each BO pair serves two AO pairs: its primary */
  /* half via fxcpmadd and its secondary half via fxcsmadd. The four */
  /* accumulators f0-f3 are summed into one pair before the store. */
  5719. .L10110:
  5720. andi. I, M, 2
  5721. beq .L10120
  5722. #if defined(TRMMKERNEL)
  /* TRMM pointer setup: BO = B - 2*SIZE, or skip KK iterations */
  /* (AO by KK << (1 + BASE_SHIFT), BO by KK << BASE_SHIFT). */
  5723. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  5724. addi BO, B, - 2 * SIZE
  5725. fpmr f1, f0
  5726. fpmr f2, f0
  5727. fpmr f3, f0
  5728. #else
  5729. slwi TEMP, KK, 1 + BASE_SHIFT
  5730. slwi r0, KK, 0 + BASE_SHIFT
  5731. add AO, AO, TEMP
  5732. add BO, B, r0
  5733. fpmr f1, f0
  5734. addi BO, BO, - 2 * SIZE
  5735. fpmr f2, f0
  5736. fpmr f3, f0
  5737. #endif
  /* TEMP = iteration count: K - KK, KK + 2 (LEFT) or KK + 1 (!LEFT). */
  5738. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5739. sub TEMP, K, KK
  5740. #elif defined(LEFT)
  5741. addi TEMP, KK, 2
  5742. #else
  5743. addi TEMP, KK, 1
  5744. #endif
  /* CTR = count / 8; skip to the remainder code when <= 0. */
  5745. srawi. r0, TEMP, 3
  5746. mtspr CTR, r0
  5747. ble .L10114
  5748. #else
  5749. addi BO, B, - 2 * SIZE
  5750. fpmr f1, f0
  5751. fpmr f2, f0
  5752. fpmr f3, f0
  5753. srawi. r0, K, 3
  5754. mtspr CTR, r0
  5755. ble .L10114
  5756. #endif
  /* Preload eight AO pairs (A1-A8) and four BO pairs (B1-B4). */
  5757. LFPDUX A1, AO, INC2
  5758. LFPDUX A2, AO, INC2
  5759. LFPDUX B1, BO, INC2
  5760. LFPDUX A3, AO, INC2
  5761. LFPDUX A4, AO, INC2
  5762. LFPDUX B2, BO, INC2
  5763. LFPDUX A5, AO, INC2
  5764. LFPDUX A6, AO, INC2
  5765. LFPDUX B3, BO, INC2
  5766. LFPDUX A7, AO, INC2
  5767. LFPDUX A8, AO, INC2
  5768. LFPDUX B4, BO, INC2
  5769. bdz- .L10113
  5770. .align 4
  /* Steady-state loop: compute with a register, then reload it. */
  5771. .L10112:
  5772. fxcpmadd f0, B1, A1, f0
  5773. LFPDUX A1, AO, INC2
  5774. fxcsmadd f1, B1, A2, f1
  5775. LFPDUX A2, AO, INC2
  5776. LFPDUX B1, BO, INC2
  5777. fxcpmadd f2, B2, A3, f2
  5778. LFPDUX A3, AO, INC2
  5779. fxcsmadd f3, B2, A4, f3
  5780. LFPDUX A4, AO, INC2
  5781. LFPDUX B2, BO, INC2
  5782. fxcpmadd f0, B3, A5, f0
  5783. LFPDUX A5, AO, INC2
  5784. fxcsmadd f1, B3, A6, f1
  5785. LFPDUX A6, AO, INC2
  5786. LFPDUX B3, BO, INC2
  5787. fxcpmadd f2, B4, A7, f2
  5788. LFPDUX A7, AO, INC2
  5789. fxcsmadd f3, B4, A8, f3
  5790. LFPDUX A8, AO, INC2
  5791. LFPDUX B4, BO, INC2
  5792. bdnz+ .L10112
  5793. .align 4
  /* Drain: consume the registers already loaded, no further loads. */
  5794. .L10113:
  5795. fxcpmadd f0, B1, A1, f0
  5796. fxcsmadd f1, B1, A2, f1
  5797. fxcpmadd f2, B2, A3, f2
  5798. fxcsmadd f3, B2, A4, f3
  5799. fxcpmadd f0, B3, A5, f0
  5800. fxcsmadd f1, B3, A6, f1
  5801. fxcpmadd f2, B4, A7, f2
  5802. fxcsmadd f3, B4, A8, f3
  5803. .align 4
  /* Remainder: load ALPHA into AP, then run the leftover (count & 7) */
  /* iterations with one AO pair and one BO element each. */
  5804. .L10114:
  5805. lfd AP, ALPHA(SP)
  5806. #ifdef TRMMKERNEL
  5807. fsmfp AP, AP
  5808. #endif
  5809. #if defined(TRMMKERNEL)
  5810. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5811. sub TEMP, K, KK
  5812. #elif defined(LEFT)
  5813. addi TEMP, KK, 2
  5814. #else
  5815. addi TEMP, KK, 1
  5816. #endif
  5817. andi. TEMP, TEMP, 7
  5818. mtspr CTR, TEMP
  5819. #else
  5820. andi. r0, K, 7
  5821. mtspr CTR, r0
  5822. #endif
  5823. ble+ .L10118
  5824. LFPDUX A1, AO, INC2
  5825. LFDX B1, BO, INC2
  5826. add BO, BO, INC
  5827. bdz- .L10117
  5828. .align 4
  5829. .L10116:
  5830. fxcpmadd f0, B1, A1, f0
  5831. LFPDUX A1, AO, INC2
  5832. LFDX B1, BO, INC2
  5833. add BO, BO, INC
  5834. bdnz+ .L10116
  5835. .align 4
  5836. .L10117:
  5837. fxcpmadd f0, B1, A1, f0
  5838. .align 4
  /* Store two results through CO1. The scaled result is produced in */
  /* f1 so that f0 can be reloaded from the FZERO slot before the */
  /* stores are issued. */
  5839. .L10118:
  5840. #ifndef TRMMKERNEL
  /* GEMM: two consecutive C elements are loaded into the primary */
  /* halves of A1 and A2; fsmfp packs A2's value into the secondary */
  /* half of A1 so one fxcpmadd adds both. */
  5841. LFDUX A1, CO1, INC
  5842. LFDUX A2, CO1, INC
  5843. fpadd f0, f0, f1
  5844. fpadd f2, f3, f2
  5845. fsmfp A1, A2
  5846. fpadd f0, f0, f2
  5847. fxcpmadd f1, AP, f0, A1
  5848. li r0, FZERO
  5849. lfpsx f0, SP, r0
  5850. STFDUX f1, CO1, INCM1
  5851. STFSDUX f1, CO1, INC
  5852. #else
  /* NOTE(review): in this TRMM branch A1/A2 are not loaded from C and */
  /* A1 is not read by the fpmul below, so the fsmfp looks redundant -- */
  /* confirm before removing. */
  5853. fpadd f0, f0, f1
  5854. fpadd f2, f3, f2
  5855. fsmfp A1, A2
  5856. fpadd f0, f0, f2
  5857. fpmul f1, AP, f0
  5858. li r0, FZERO
  5859. lfpsx f0, SP, r0
  5860. STFDUX f1, CO1, INC
  5861. STFSDUX f1, CO1, INC
  5862. #endif
  5863. #ifdef TRMMKERNEL
  /* TRMM bookkeeping: skip the iterations this tile did not execute */
  /* and, for LEFT, bump KK by 2. */
  5864. #if ( defined(LEFT) && defined(TRANSA)) || \
  5865. (!defined(LEFT) && !defined(TRANSA))
  5866. sub TEMP, K, KK
  5867. #ifdef LEFT
  5868. addi TEMP, TEMP, -2
  5869. #else
  5870. addi TEMP, TEMP, -1
  5871. #endif
  5872. slwi r0, TEMP, 1 + BASE_SHIFT
  5873. slwi TEMP, TEMP, 0 + BASE_SHIFT
  5874. add AO, AO, r0
  5875. add BO, BO, TEMP
  5876. #endif
  5877. #ifdef LEFT
  5878. addi KK, KK, 2
  5879. #endif
  5880. #endif
  5881. .align 4
  /* .L10120: tile handled when bit 0 of M is set (a single row), */
  /* writing one element through CO1. The main loop uses the parallel */
  /* fpmadd, where primary multiplies primary and secondary multiplies */
  /* secondary; both halves of the accumulators are summed at the end. */
  5882. .L10120:
  5883. andi. I, M, 1
  5884. beq .L10999
  5885. #if defined(TRMMKERNEL)
  /* TRMM pointer setup: BO = B - 2*SIZE, or skip KK iterations in both */
  /* AO and BO (each by KK << BASE_SHIFT). */
  5886. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  5887. addi BO, B, - 2 * SIZE
  5888. fpmr f1, f0
  5889. fpmr f2, f0
  5890. fpmr f3, f0
  5891. #else
  5892. slwi TEMP, KK, 0 + BASE_SHIFT
  5893. slwi r0, KK, 0 + BASE_SHIFT
  5894. add AO, AO, TEMP
  5895. add BO, B, r0
  5896. fpmr f1, f0
  5897. addi BO, BO, - 2 * SIZE
  5898. fpmr f2, f0
  5899. fpmr f3, f0
  5900. #endif
  /* TEMP = iteration count: K - KK or KK + 1. */
  5901. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5902. sub TEMP, K, KK
  5903. #elif defined(LEFT)
  5904. addi TEMP, KK, 1
  5905. #else
  5906. addi TEMP, KK, 1
  5907. #endif
  /* CTR = count / 8: one pass uses four AO pairs and four BO pairs. */
  5908. srawi. r0, TEMP, 3
  5909. mtspr CTR, r0
  5910. ble .L10124
  5911. #else
  5912. addi BO, B, - 2 * SIZE
  5913. fpmr f1, f0
  5914. fpmr f2, f0
  5915. fpmr f3, f0
  5916. srawi. r0, K, 3
  5917. mtspr CTR, r0
  5918. ble .L10124
  5919. #endif
  /* Preload A1-A4 and B1-B4. */
  5920. LFPDUX A1, AO, INC2
  5921. LFPDUX B1, BO, INC2
  5922. LFPDUX A2, AO, INC2
  5923. LFPDUX B2, BO, INC2
  5924. LFPDUX A3, AO, INC2
  5925. LFPDUX B3, BO, INC2
  5926. LFPDUX A4, AO, INC2
  5927. LFPDUX B4, BO, INC2
  5928. bdz- .L10123
  5929. .align 4
  /* Steady-state loop: four independent accumulators, each multiply-add */
  /* followed by the reload of its two operands. */
  5930. .L10122:
  5931. fpmadd f0, A1, B1, f0
  5932. LFPDUX A1, AO, INC2
  5933. LFPDUX B1, BO, INC2
  5934. fpmadd f1, A2, B2, f1
  5935. LFPDUX A2, AO, INC2
  5936. LFPDUX B2, BO, INC2
  5937. fpmadd f2, A3, B3, f2
  5938. LFPDUX A3, AO, INC2
  5939. LFPDUX B3, BO, INC2
  5940. fpmadd f3, A4, B4, f3
  5941. LFPDUX A4, AO, INC2
  5942. LFPDUX B4, BO, INC2
  5943. bdnz+ .L10122
  5944. .align 4
  /* Drain: consume the four pairs already loaded. */
  5945. .L10123:
  5946. fpmadd f0, A1, B1, f0
  5947. fpmadd f1, A2, B2, f1
  5948. fpmadd f2, A3, B3, f2
  5949. fpmadd f3, A4, B4, f3
  5950. .align 4
  /* Remainder: load ALPHA into AP, then run the leftover (count & 7) */
  /* iterations with scalar loads and the scalar fmadd, which touches */
  /* only f0. */
  5951. .L10124:
  5952. lfd AP, ALPHA(SP)
  5953. #ifdef TRMMKERNEL
  5954. fsmfp AP, AP
  5955. #endif
  5956. #if defined(TRMMKERNEL)
  5957. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5958. sub TEMP, K, KK
  5959. #elif defined(LEFT)
  5960. addi TEMP, KK, 1
  5961. #else
  5962. addi TEMP, KK, 1
  5963. #endif
  5964. andi. TEMP, TEMP, 7
  5965. mtspr CTR, TEMP
  5966. #else
  5967. andi. r0, K, 7
  5968. mtspr CTR, r0
  5969. #endif
  5970. ble+ .L10128
  /* Non-update loads at pointer + INC2, with both pointers then */
  /* advanced by INC using explicit adds. */
  5971. LFDX A1, AO, INC2
  5972. LFDX B1, BO, INC2
  5973. add AO, AO, INC
  5974. add BO, BO, INC
  5975. bdz- .L10127
  5976. .align 4
  5977. .L10126:
  5978. fmadd f0, A1, B1, f0
  5979. LFDX A1, AO, INC2
  5980. LFDX B1, BO, INC2
  5981. add AO, AO, INC
  5982. add BO, BO, INC
  5983. bdnz+ .L10126
  5984. .align 4
  5985. .L10127:
  5986. fmadd f0, A1, B1, f0
  5987. .align 4
  /* Store: add f0-f3 pairwise, then fsmtp copies the secondary half of */
  /* the sum into the primary half of f1 so a scalar fadd can combine */
  /* the two halves. GEMM adds AP * sum to the C element loaded through */
  /* CO1; TRMM stores AP * sum. */
  5988. .L10128:
  5989. #ifndef TRMMKERNEL
  5990. LFDX A1, CO1, INC
  5991. fpadd f0, f0, f1
  5992. fpadd f2, f2, f3
  5993. fpadd f0, f0, f2
  5994. fsmtp f1, f0
  5995. fadd f0, f0, f1
  5996. fmadd f0, AP, f0, A1
  5997. STFDUX f0, CO1, INC
  5998. #else
  5999. fpadd f0, f0, f1
  6000. fpadd f2, f2, f3
  6001. fpadd f0, f0, f2
  6002. fsmtp f1, f0
  6003. fadd f0, f0, f1
  6004. fmul f0, AP, f0
  6005. STFDUX f0, CO1, INC
  6006. #endif
  6007. .align 4
  /* .L10999: common exit. Restores r14-r31 with update-form word loads */
  /* (4 bytes apart, after biasing SP by +12), then f31 down to f14 */
  /* with update-form paired loads 16 bytes apart, steps SP by a final */
  /* 16 and returns. The matching save sequence is in the prologue, */
  /* which is outside this excerpt. */
  6008. .L10999:
  /* Bias SP by 12 so the first lwzu (offset 4) lands 16 bytes above */
  /* the current SP. */
  6009. addi SP, SP, 12
  6010. lwzu r14, 4(SP)
  6011. lwzu r15, 4(SP)
  6012. lwzu r16, 4(SP)
  6013. lwzu r17, 4(SP)
  6014. lwzu r18, 4(SP)
  6015. lwzu r19, 4(SP)
  6016. lwzu r20, 4(SP)
  6017. lwzu r21, 4(SP)
  6018. lwzu r22, 4(SP)
  6019. lwzu r23, 4(SP)
  6020. lwzu r24, 4(SP)
  6021. lwzu r25, 4(SP)
  6022. lwzu r26, 4(SP)
  6023. lwzu r27, 4(SP)
  6024. lwzu r28, 4(SP)
  6025. lwzu r29, 4(SP)
  6026. lwzu r30, 4(SP)
  6027. lwzu r31, 4(SP)
  /* Undo the 12-byte bias, then use r0 = 16 as the stride for the */
  /* paired floating-point restores. */
  6028. subi SP, SP, 12
  6029. li r0, 16
  6030. lfpdux f31, SP, r0
  6031. lfpdux f30, SP, r0
  6032. lfpdux f29, SP, r0
  6033. lfpdux f28, SP, r0
  6034. lfpdux f27, SP, r0
  6035. lfpdux f26, SP, r0
  6036. lfpdux f25, SP, r0
  6037. lfpdux f24, SP, r0
  6038. lfpdux f23, SP, r0
  6039. lfpdux f22, SP, r0
  6040. lfpdux f21, SP, r0
  6041. lfpdux f20, SP, r0
  6042. lfpdux f19, SP, r0
  6043. lfpdux f18, SP, r0
  6044. lfpdux f17, SP, r0
  6045. lfpdux f16, SP, r0
  6046. lfpdux f15, SP, r0
  6047. lfpdux f14, SP, r0
  6048. addi SP, SP, 16
  6049. blr
  /* EPILOGUE is a macro defined outside this excerpt. */
  6050. EPILOGUE
  6051. #endif