You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_ppc440.S 44 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef __64BIT__ /* LOAD: pointer-width integer load for the current build */
  41. #define LOAD lwz /* 32-bit build: load word */
  42. #else
  43. #define LOAD ld /* 64-bit build: load doubleword */
  44. #endif
  45. #ifdef __64BIT__ /* stack frame layout, 64-bit build */
  46. #define STACKSIZE 320 /* bytes the prologue subtracts from SP */
  47. #define ALPHA 296(SP) /* slot where the prologue spills f1; reloaded into f30 before each write-back */
  48. #define FZERO 304(SP) /* slot the prologue fills with a zero word; read with lfs to clear accumulators */
  49. #else /* stack frame layout, 32-bit build */
  50. #define STACKSIZE 240 /* bytes the prologue subtracts from SP */
  51. #define ALPHA 224(SP) /* same role as the 64-bit ALPHA slot */
  52. #define FZERO 232(SP) /* same role as the 64-bit FZERO slot */
  53. #endif
  54. #define M r3 /* drives the inner I loop: M >> 2 full tiles, then M & 2 and M & 1 tails */
  55. #define N r4 /* drives the outer J loop: N >> 2 panels, then an N & 2 tail */
  56. #define K r5 /* accumulation depth: K >> 2 unrolled iterations plus K & 3 remainder */
  57. #ifdef linux /* argument registers on Linux */
  58. #ifndef __64BIT__ /* Linux, 32-bit */
  59. #define A r6 /* copied into AO at the start of every J iteration */
  60. #define B r7 /* start of the current B panel; set to BO at the end of every J iteration */
  61. #define C r8 /* base for CO1..CO4; advanced past the panel inside the J loop */
  62. #define LDC r9 /* scaled by BASE_SHIFT in the prologue; spacing between CO1..CO4 */
  63. #define OFFSET r10 /* TRMMKERNEL only: seeds KK */
  64. #else /* Linux, 64-bit */
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6 /* TRMMKERNEL prologue loads this from the caller's frame */
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__) /* argument registers on AIX / Apple */
  73. #if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit build with DOUBLE defined */
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7 /* prologue loads LDC from the caller's frame in this variant */
  78. #define OFFSET r6 /* TRMMKERNEL prologue loads this from the caller's frame */
  79. #else /* every other AIX / Apple build */
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6 /* TRMMKERNEL prologue loads this from the caller's frame */
  85. #endif
  86. #endif
  87. #define AORIG r18 /* NOTE(review): no use seen in the reviewed portion - confirm; r18 is saved only under TRMMKERNEL */
  88. #define TEMP r19 /* scratch for TRMMKERNEL loop-count and pointer-offset arithmetic; r19 is saved only under TRMMKERNEL */
  89. #define KK r20 /* TRMMKERNEL running offset: set from OFFSET (LEFT) or -OFFSET (!LEFT) */
  90. #define I r21 /* inner-loop counter over M tiles */
  91. #define J r22 /* outer-loop counter over N panels */
  92. #define AO r23 /* running pointer into A */
  93. #define BO r24 /* running pointer into B */
  94. #define CO1 r25 /* first of four C pointers spaced LDC apart */
  95. #define CO2 r26 /* CO1 + LDC */
  96. #define CO3 r27 /* CO2 + LDC */
  97. #define CO4 r28 /* CO3 + LDC */
  98. #define PREA r29 /* NOTE(review): no use seen in the reviewed portion - confirm */
  99. PROLOGUE
  100. PROFCODE
  101. addi SP, SP, -STACKSIZE
  102. li r0, 0
  103. stfd f14, 0(SP)
  104. stfd f15, 8(SP)
  105. stfd f16, 16(SP)
  106. stfd f17, 24(SP)
  107. stfd f18, 32(SP)
  108. stfd f19, 40(SP)
  109. stfd f20, 48(SP)
  110. stfd f21, 56(SP)
  111. stfd f22, 64(SP)
  112. stfd f23, 72(SP)
  113. stfd f24, 80(SP)
  114. stfd f25, 88(SP)
  115. stfd f26, 96(SP)
  116. stfd f27, 104(SP)
  117. stfd f28, 112(SP)
  118. stfd f29, 120(SP)
  119. stfd f30, 128(SP)
  120. stfd f31, 136(SP)
  121. #ifdef __64BIT__
  122. std r31, 144(SP)
  123. std r30, 152(SP)
  124. std r29, 160(SP)
  125. std r28, 168(SP)
  126. std r27, 176(SP)
  127. std r26, 184(SP)
  128. std r25, 192(SP)
  129. std r24, 200(SP)
  130. std r23, 208(SP)
  131. std r22, 216(SP)
  132. std r21, 224(SP)
  133. std r20, 232(SP)
  134. #if defined(TRMMKERNEL)
  135. std r19, 240(SP)
  136. std r18, 248(SP)
  137. #endif
  138. #else
  139. stw r31, 144(SP)
  140. stw r30, 148(SP)
  141. stw r29, 152(SP)
  142. stw r28, 156(SP)
  143. stw r27, 160(SP)
  144. stw r26, 164(SP)
  145. stw r25, 168(SP)
  146. stw r24, 172(SP)
  147. stw r23, 176(SP)
  148. stw r22, 180(SP)
  149. stw r21, 184(SP)
  150. stw r20, 188(SP)
  151. #if defined(TRMMKERNEL)
  152. stw r19, 192(SP)
  153. stw r18, 196(SP)
  154. #endif
  155. #endif
  156. stfd f1, ALPHA
  157. stw r0, FZERO
  158. #if defined(_AIX) || defined(__APPLE__)
  159. #if !defined(__64BIT__) && defined(DOUBLE)
  160. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  161. #endif
  162. #endif
  163. slwi LDC, LDC, BASE_SHIFT
  164. #if defined(TRMMKERNEL)
  165. #if defined(linux) && defined(__64BIT__)
  166. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  167. #endif
  168. #if defined(_AIX) || defined(__APPLE__)
  169. #ifdef __64BIT__
  170. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  171. #else
  172. #ifdef DOUBLE
  173. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  174. #else
  175. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  176. #endif
  177. #endif
  178. #endif
  179. #endif
  180. #if defined(TRMMKERNEL) && !defined(LEFT)
  181. neg KK, OFFSET
  182. #endif
  183. cmpwi cr0, M, 0
  184. ble .L999
  185. cmpwi cr0, N, 0
  186. ble .L999
  187. cmpwi cr0, K, 0
  188. ble .L999
  189. srawi. J, N, 2
  190. ble .L40
  191. .align 4
  192. #define A1 f16
  193. #define A2 f17
  194. #define A3 f18
  195. #define A4 f19
  196. #define A5 f20
  197. #define A6 f21
  198. #define B1 f22
  199. #define B2 f23
  200. #define B3 f24
  201. #define B4 f25
  202. #define B5 f26
  203. #define B6 f27
  204. #define B7 f28
  205. #define B8 f29
  206. #define B9 f30
  207. #define B10 f31
  208. .L10:
  209. mr CO1, C
  210. add CO2, C, LDC
  211. add CO3, CO2, LDC
  212. add CO4, CO3, LDC
  213. #if defined(TRMMKERNEL) && defined(LEFT)
  214. mr KK, OFFSET
  215. #endif
  216. lfs f0, FZERO
  217. fmr f1, f0
  218. fmr f2, f0
  219. fmr f3, f0
  220. fmr f4, f0
  221. fmr f5, f0
  222. fmr f6, f0
  223. fmr f7, f0
  224. fmr f8, f0
  225. fmr f9, f0
  226. fmr f10, f0
  227. fmr f11, f0
  228. fmr f12, f0
  229. fmr f13, f0
  230. fmr f14, f0
  231. fmr f15, f0
  232. srawi. I, M, 2
  233. mr AO, A
  234. add C, CO4, LDC
  235. ble .L20
  236. .align 4
  237. .L11:
  238. #if defined(TRMMKERNEL)
  239. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  240. LFD A1, 0 * SIZE(AO)
  241. LFD A2, 1 * SIZE(AO)
  242. LFD A4, 4 * SIZE(AO)
  243. LFD A5, 8 * SIZE(AO)
  244. LFD B1, 0 * SIZE(B)
  245. LFD B2, 1 * SIZE(B)
  246. LFD B3, 2 * SIZE(B)
  247. LFD B4, 3 * SIZE(B)
  248. LFD B5, 4 * SIZE(B)
  249. LFD B6, 8 * SIZE(B)
  250. LFD B7, 12 * SIZE(B)
  251. mr BO, B
  252. #else
  253. slwi r0, KK, 2 + BASE_SHIFT
  254. add AO, AO, r0
  255. add BO, B, r0
  256. LFD A1, 0 * SIZE(AO)
  257. LFD A2, 1 * SIZE(AO)
  258. LFD A4, 4 * SIZE(AO)
  259. LFD A5, 8 * SIZE(AO)
  260. LFD B1, 0 * SIZE(BO)
  261. LFD B2, 1 * SIZE(BO)
  262. LFD B3, 2 * SIZE(BO)
  263. LFD B4, 3 * SIZE(BO)
  264. LFD B5, 4 * SIZE(BO)
  265. LFD B6, 8 * SIZE(BO)
  266. LFD B7, 12 * SIZE(BO)
  267. #endif
  268. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  269. sub TEMP, K, KK
  270. #elif defined(LEFT)
  271. addi TEMP, KK, 4
  272. #else
  273. addi TEMP, KK, 4
  274. #endif
  275. srawi. TEMP, TEMP, 2
  276. mtspr CTR, TEMP
  277. ble .L15
  278. #else
  279. LFD A1, 0 * SIZE(AO)
  280. LFD A2, 1 * SIZE(AO)
  281. LFD A4, 4 * SIZE(AO)
  282. LFD A5, 8 * SIZE(AO)
  283. LFD B1, 0 * SIZE(B)
  284. LFD B2, 1 * SIZE(B)
  285. LFD B3, 2 * SIZE(B)
  286. LFD B4, 3 * SIZE(B)
  287. LFD B5, 4 * SIZE(B)
  288. LFD B6, 8 * SIZE(B)
  289. LFD B7, 12 * SIZE(B)
  290. srawi. r0, K, 2
  291. mtspr CTR, r0
  292. mr BO, B
  293. ble .L15
  294. #endif
  295. .align 4
  296. .L12:
  297. FMADD f0, A1, B1, f0
  298. LFD A3, 2 * SIZE(AO)
  299. FMADD f4, A1, B2, f4
  300. LFD A6, 12 * SIZE(AO)
  301. FMADD f8, A1, B3, f8
  302. nop
  303. FMADD f12, A1, B4, f12
  304. nop
  305. FMADD f1, A2, B1, f1
  306. LFD A1, 3 * SIZE(AO)
  307. FMADD f5, A2, B2, f5
  308. nop
  309. FMADD f9, A2, B3, f9
  310. nop
  311. FMADD f13, A2, B4, f13
  312. nop
  313. FMADD f2, A3, B1, f2
  314. nop
  315. FMADD f6, A3, B2, f6
  316. LFD B8, 5 * SIZE(BO)
  317. FMADD f10, A3, B3, f10
  318. LFD B9, 6 * SIZE(BO)
  319. FMADD f14, A3, B4, f14
  320. LFD B10, 7 * SIZE(BO)
  321. FMADD f3, A1, B1, f3
  322. LFD A2, 5 * SIZE(AO)
  323. FMADD f7, A1, B2, f7
  324. LFD B1, 16 * SIZE(BO)
  325. FMADD f11, A1, B3, f11
  326. nop
  327. FMADD f15, A1, B4, f15
  328. nop
  329. FMADD f0, A4, B5, f0
  330. LFD A3, 6 * SIZE(AO)
  331. FMADD f4, A4, B8, f4
  332. LFD A1, 16 * SIZE(AO)
  333. FMADD f8, A4, B9, f8
  334. nop
  335. FMADD f12, A4, B10, f12
  336. nop
  337. FMADD f1, A2, B5, f1
  338. LFD A4, 7 * SIZE(AO)
  339. FMADD f5, A2, B8, f5
  340. nop
  341. FMADD f9, A2, B9, f9
  342. nop
  343. FMADD f13, A2, B10, f13
  344. nop
  345. FMADD f2, A3, B5, f2
  346. nop
  347. FMADD f6, A3, B8, f6
  348. LFD B2, 9 * SIZE(BO)
  349. FMADD f10, A3, B9, f10
  350. LFD B3, 10 * SIZE(BO)
  351. FMADD f14, A3, B10, f14
  352. LFD B4, 11 * SIZE(BO)
  353. FMADD f3, A4, B5, f3
  354. LFD A2, 9 * SIZE(AO)
  355. FMADD f7, A4, B8, f7
  356. LFD B5, 20 * SIZE(BO)
  357. FMADD f11, A4, B9, f11
  358. nop
  359. FMADD f15, A4, B10, f15
  360. nop
  361. FMADD f0, A5, B6, f0
  362. LFD A3, 10 * SIZE(AO)
  363. FMADD f4, A5, B2, f4
  364. LFD A4, 20 * SIZE(AO)
  365. FMADD f8, A5, B3, f8
  366. nop
  367. FMADD f12, A5, B4, f12
  368. nop
  369. FMADD f1, A2, B6, f1
  370. LFD A5, 11 * SIZE(AO)
  371. FMADD f5, A2, B2, f5
  372. nop
  373. FMADD f9, A2, B3, f9
  374. nop
  375. FMADD f13, A2, B4, f13
  376. nop
  377. FMADD f2, A3, B6, f2
  378. nop
  379. FMADD f6, A3, B2, f6
  380. LFD B8, 13 * SIZE(BO)
  381. FMADD f10, A3, B3, f10
  382. LFD B9, 14 * SIZE(BO)
  383. FMADD f14, A3, B4, f14
  384. LFD B10,15 * SIZE(BO)
  385. FMADD f3, A5, B6, f3
  386. LFD A2, 13 * SIZE(AO)
  387. FMADD f7, A5, B2, f7
  388. LFD B6, 24 * SIZE(BO)
  389. FMADD f11, A5, B3, f11
  390. nop
  391. FMADD f15, A5, B4, f15
  392. nop
  393. FMADD f0, A6, B7, f0
  394. LFD A3, 14 * SIZE(AO)
  395. FMADD f4, A6, B8, f4
  396. LFD A5, 24 * SIZE(AO)
  397. FMADD f8, A6, B9, f8
  398. nop
  399. FMADD f12, A6, B10, f12
  400. nop
  401. FMADD f1, A2, B7, f1
  402. LFD A6, 15 * SIZE(AO)
  403. FMADD f5, A2, B8, f5
  404. nop
  405. FMADD f9, A2, B9, f9
  406. nop
  407. FMADD f13, A2, B10, f13
  408. nop
  409. FMADD f2, A3, B7, f2
  410. addi AO, AO, 16 * SIZE
  411. FMADD f6, A3, B8, f6
  412. LFD B2, 17 * SIZE(BO)
  413. FMADD f10, A3, B9, f10
  414. LFD B3, 18 * SIZE(BO)
  415. FMADD f14, A3, B10, f14
  416. LFD B4, 19 * SIZE(BO)
  417. FMADD f3, A6, B7, f3
  418. LFD A2, 1 * SIZE(AO)
  419. FMADD f7, A6, B8, f7
  420. LFD B7, 28 * SIZE(BO)
  421. FMADD f11, A6, B9, f11
  422. addi BO, BO, 16 * SIZE
  423. FMADD f15, A6, B10, f15
  424. bdnz .L12
  425. .align 4
  426. .L15:
  427. lfd f30, ALPHA
  428. #if defined(TRMMKERNEL)
  429. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  430. sub TEMP, K, KK
  431. #elif defined(LEFT)
  432. addi TEMP, KK, 4
  433. #else
  434. addi TEMP, KK, 4
  435. #endif
  436. andi. TEMP, TEMP, 3
  437. mtspr CTR, TEMP
  438. #else
  439. andi. r0, K, 3
  440. mtspr CTR, r0
  441. #endif
  442. ble+ .L18
  443. .align 4
  444. .L16:
  445. FMADD f0, A1, B1, f0
  446. LFD A3, 2 * SIZE(AO)
  447. FMADD f4, A1, B2, f4
  448. FMADD f8, A1, B3, f8
  449. FMADD f12, A1, B4, f12
  450. LFD A4, 3 * SIZE(AO)
  451. FMADD f1, A2, B1, f1
  452. FMADD f5, A2, B2, f5
  453. FMADD f9, A2, B3, f9
  454. FMADD f13, A2, B4, f13
  455. LFDU A1, 4 * SIZE(AO)
  456. FMADD f2, A3, B1, f2
  457. FMADD f6, A3, B2, f6
  458. FMADD f10, A3, B3, f10
  459. FMADD f14, A3, B4, f14
  460. LFD A2, 1 * SIZE(AO)
  461. FMADD f3, A4, B1, f3
  462. LFDU B1, 4 * SIZE(BO)
  463. FMADD f7, A4, B2, f7
  464. LFD B2, 1 * SIZE(BO)
  465. FMADD f11, A4, B3, f11
  466. LFD B3, 2 * SIZE(BO)
  467. FMADD f15, A4, B4, f15
  468. LFD B4, 3 * SIZE(BO)
  469. bdnz .L16
  470. .align 4
  471. .L18:
  472. #ifndef TRMMKERNEL
  473. LFD f16, 0 * SIZE(CO1)
  474. LFD f17, 1 * SIZE(CO1)
  475. LFD f18, 2 * SIZE(CO1)
  476. LFD f19, 3 * SIZE(CO1)
  477. LFD f20, 0 * SIZE(CO2)
  478. LFD f21, 1 * SIZE(CO2)
  479. LFD f22, 2 * SIZE(CO2)
  480. LFD f23, 3 * SIZE(CO2)
  481. FMADD f0, f0, f30, f16
  482. LFD f16, 0 * SIZE(CO3)
  483. FMADD f1, f1, f30, f17
  484. LFD f17, 1 * SIZE(CO3)
  485. FMADD f2, f2, f30, f18
  486. LFD f18, 2 * SIZE(CO3)
  487. FMADD f3, f3, f30, f19
  488. LFD f19, 3 * SIZE(CO3)
  489. FMADD f4, f4, f30, f20
  490. LFD f20, 0 * SIZE(CO4)
  491. FMADD f5, f5, f30, f21
  492. LFD f21, 1 * SIZE(CO4)
  493. FMADD f6, f6, f30, f22
  494. LFD f22, 2 * SIZE(CO4)
  495. FMADD f7, f7, f30, f23
  496. LFD f23, 3 * SIZE(CO4)
  497. FMADD f8, f8, f30, f16
  498. FMADD f9, f9, f30, f17
  499. FMADD f10, f10, f30, f18
  500. FMADD f11, f11, f30, f19
  501. FMADD f12, f12, f30, f20
  502. FMADD f13, f13, f30, f21
  503. FMADD f14, f14, f30, f22
  504. FMADD f15, f15, f30, f23
  505. #else
  506. FMUL f0, f0, f30
  507. FMUL f1, f1, f30
  508. FMUL f2, f2, f30
  509. FMUL f3, f3, f30
  510. FMUL f4, f4, f30
  511. FMUL f5, f5, f30
  512. FMUL f6, f6, f30
  513. FMUL f7, f7, f30
  514. FMUL f8, f8, f30
  515. FMUL f9, f9, f30
  516. FMUL f10, f10, f30
  517. FMUL f11, f11, f30
  518. FMUL f12, f12, f30
  519. FMUL f13, f13, f30
  520. FMUL f14, f14, f30
  521. FMUL f15, f15, f30
  522. #endif
  523. STFD f0, 0 * SIZE(CO1)
  524. STFD f1, 1 * SIZE(CO1)
  525. STFD f2, 2 * SIZE(CO1)
  526. STFD f3, 3 * SIZE(CO1)
  527. lfs f0, FZERO
  528. fmr f1, f0
  529. fmr f2, f0
  530. fmr f3, f0
  531. STFD f4, 0 * SIZE(CO2)
  532. fmr f4, f0
  533. STFD f5, 1 * SIZE(CO2)
  534. fmr f5, f0
  535. STFD f6, 2 * SIZE(CO2)
  536. fmr f6, f0
  537. STFD f7, 3 * SIZE(CO2)
  538. fmr f7, f0
  539. STFD f8, 0 * SIZE(CO3)
  540. fmr f8, f0
  541. STFD f9, 1 * SIZE(CO3)
  542. fmr f9, f0
  543. STFD f10, 2 * SIZE(CO3)
  544. fmr f10, f0
  545. STFD f11, 3 * SIZE(CO3)
  546. fmr f11, f0
  547. STFD f12, 0 * SIZE(CO4)
  548. fmr f12, f0
  549. STFD f13, 1 * SIZE(CO4)
  550. fmr f13, f0
  551. STFD f14, 2 * SIZE(CO4)
  552. fmr f14, f0
  553. STFD f15, 3 * SIZE(CO4)
  554. fmr f15, f0
  555. addi CO1, CO1, 4 * SIZE
  556. addi CO2, CO2, 4 * SIZE
  557. addi CO3, CO3, 4 * SIZE
  558. addi CO4, CO4, 4 * SIZE
  559. #ifdef TRMMKERNEL
  560. #if ( defined(LEFT) && defined(TRANSA)) || \
  561. (!defined(LEFT) && !defined(TRANSA))
  562. sub TEMP, K, KK
  563. #ifdef LEFT
  564. addi TEMP, TEMP, -4
  565. #else
  566. addi TEMP, TEMP, -4
  567. #endif
  568. slwi TEMP, TEMP, 2 + BASE_SHIFT
  569. add AO, AO, TEMP
  570. add BO, BO, TEMP
  571. #endif
  572. #ifdef LEFT
  573. addi KK, KK, 4
  574. #endif
  575. #endif
  576. addic. I, I, -1
  577. bgt+ .L11
  578. .align 4
  579. .L20:
  580. andi. I, M, 2
  581. ble .L30
  582. #if defined(TRMMKERNEL)
  583. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  584. LFD f16, 0 * SIZE(AO)
  585. LFD f17, 1 * SIZE(AO)
  586. LFD f18, 2 * SIZE(AO)
  587. LFD f19, 3 * SIZE(AO)
  588. LFD f20, 0 * SIZE(B)
  589. LFD f21, 1 * SIZE(B)
  590. LFD f22, 2 * SIZE(B)
  591. LFD f23, 3 * SIZE(B)
  592. LFD f24, 4 * SIZE(B)
  593. LFD f25, 5 * SIZE(B)
  594. LFD f26, 6 * SIZE(B)
  595. LFD f27, 7 * SIZE(B)
  596. mr BO, B
  597. #else
  598. slwi r0, KK, 1 + BASE_SHIFT
  599. slwi TEMP, KK, 2 + BASE_SHIFT
  600. add AO, AO, r0
  601. add BO, B, TEMP
  602. LFD f16, 0 * SIZE(AO)
  603. LFD f17, 1 * SIZE(AO)
  604. LFD f18, 2 * SIZE(AO)
  605. LFD f19, 3 * SIZE(AO)
  606. LFD f20, 0 * SIZE(BO)
  607. LFD f21, 1 * SIZE(BO)
  608. LFD f22, 2 * SIZE(BO)
  609. LFD f23, 3 * SIZE(BO)
  610. LFD f24, 4 * SIZE(BO)
  611. LFD f25, 5 * SIZE(BO)
  612. LFD f26, 6 * SIZE(BO)
  613. LFD f27, 7 * SIZE(BO)
  614. #endif
  615. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  616. sub TEMP, K, KK
  617. #elif defined(LEFT)
  618. addi TEMP, KK, 2
  619. #else
  620. addi TEMP, KK, 4
  621. #endif
  622. srawi. TEMP, TEMP, 2
  623. mtspr CTR, TEMP
  624. #else
  625. LFD f16, 0 * SIZE(AO)
  626. LFD f17, 1 * SIZE(AO)
  627. LFD f18, 2 * SIZE(AO)
  628. LFD f19, 3 * SIZE(AO)
  629. LFD f20, 0 * SIZE(B)
  630. LFD f21, 1 * SIZE(B)
  631. LFD f22, 2 * SIZE(B)
  632. LFD f23, 3 * SIZE(B)
  633. LFD f24, 4 * SIZE(B)
  634. LFD f25, 5 * SIZE(B)
  635. LFD f26, 6 * SIZE(B)
  636. LFD f27, 7 * SIZE(B)
  637. srawi. r0, K, 2
  638. mtspr CTR, r0
  639. mr BO, B
  640. #endif
  641. ble .L25
  642. .align 5
  643. .L22:
  644. FMADD f0, f16, f20, f0
  645. nop
  646. FMADD f1, f17, f20, f1
  647. LFD f20, 8 * SIZE(BO)
  648. FMADD f4, f16, f21, f4
  649. nop
  650. FMADD f5, f17, f21, f5
  651. LFD f21, 9 * SIZE(BO)
  652. FMADD f8, f16, f22, f8
  653. nop
  654. FMADD f9, f17, f22, f9
  655. LFD f22, 10 * SIZE(BO)
  656. FMADD f12, f16, f23, f12
  657. LFD f16, 4 * SIZE(AO)
  658. FMADD f13, f17, f23, f13
  659. LFD f23, 11 * SIZE(BO)
  660. FMADD f2, f18, f24, f2
  661. LFD f17, 5 * SIZE(AO)
  662. FMADD f3, f19, f24, f3
  663. LFD f24, 12 * SIZE(BO)
  664. FMADD f6, f18, f25, f6
  665. nop
  666. FMADD f7, f19, f25, f7
  667. LFD f25, 13 * SIZE(BO)
  668. FMADD f10, f18, f26, f10
  669. nop
  670. FMADD f11, f19, f26, f11
  671. LFD f26, 14 * SIZE(BO)
  672. FMADD f14, f18, f27, f14
  673. LFD f18, 6 * SIZE(AO)
  674. FMADD f15, f19, f27, f15
  675. LFD f27, 15 * SIZE(BO)
  676. FMADD f0, f16, f20, f0
  677. LFD f19, 7 * SIZE(AO)
  678. FMADD f1, f17, f20, f1
  679. LFDU f20, 16 * SIZE(BO)
  680. FMADD f4, f16, f21, f4
  681. nop
  682. FMADD f5, f17, f21, f5
  683. LFD f21, 1 * SIZE(BO)
  684. FMADD f8, f16, f22, f8
  685. nop
  686. FMADD f9, f17, f22, f9
  687. LFD f22, 2 * SIZE(BO)
  688. FMADD f12, f16, f23, f12
  689. LFDU f16, 8 * SIZE(AO)
  690. FMADD f13, f17, f23, f13
  691. LFD f23, 3 * SIZE(BO)
  692. FMADD f2, f18, f24, f2
  693. LFD f17, 1 * SIZE(AO)
  694. FMADD f3, f19, f24, f3
  695. LFD f24, 4 * SIZE(BO)
  696. FMADD f6, f18, f25, f6
  697. nop
  698. FMADD f7, f19, f25, f7
  699. LFD f25, 5 * SIZE(BO)
  700. FMADD f10, f18, f26, f10
  701. nop
  702. FMADD f11, f19, f26, f11
  703. LFD f26, 6 * SIZE(BO)
  704. FMADD f14, f18, f27, f14
  705. LFD f18, 2 * SIZE(AO)
  706. FMADD f15, f19, f27, f15
  707. LFD f19, 3 * SIZE(AO)
  708. LFD f27, 7 * SIZE(BO)
  709. bdnz .L22
  710. fadd f0, f2, f0
  711. fadd f1, f3, f1
  712. fadd f4, f6, f4
  713. fadd f5, f7, f5
  714. fadd f8, f10, f8
  715. fadd f9, f11, f9
  716. fadd f12, f14, f12
  717. fadd f13, f15, f13
  718. .align 4
  719. .L25:
  720. lfd f30, ALPHA
  721. #if defined(TRMMKERNEL)
  722. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  723. sub TEMP, K, KK
  724. #elif defined(LEFT)
  725. addi TEMP, KK, 2
  726. #else
  727. addi TEMP, KK, 4
  728. #endif
  729. andi. TEMP, TEMP, 3
  730. mtspr CTR, TEMP
  731. #else
  732. andi. r0, K, 3
  733. mtspr CTR, r0
  734. #endif
  735. ble+ .L28
  736. .align 4
  737. .L26:
  738. FMADD f0, f16, f20, f0
  739. nop
  740. FMADD f1, f17, f20, f1
  741. LFDU f20, 4 * SIZE(BO)
  742. FMADD f4, f16, f21, f4
  743. nop
  744. FMADD f5, f17, f21, f5
  745. LFD f21, 1 * SIZE(BO)
  746. FMADD f8, f16, f22, f8
  747. nop
  748. FMADD f9, f17, f22, f9
  749. LFD f22, 2 * SIZE(BO)
  750. FMADD f12, f16, f23, f12
  751. LFDU f16, 2 * SIZE(AO)
  752. FMADD f13, f17, f23, f13
  753. LFD f17, 1 * SIZE(AO)
  754. LFD f23, 3 * SIZE(BO)
  755. bdnz .L26
  756. .align 4
  757. .L28:
  758. #ifndef TRMMKERNEL
  759. LFD f16, 0 * SIZE(CO1)
  760. LFD f17, 1 * SIZE(CO1)
  761. LFD f18, 0 * SIZE(CO2)
  762. LFD f19, 1 * SIZE(CO2)
  763. FMADD f0, f0, f30, f16
  764. FMADD f1, f1, f30, f17
  765. FMADD f4, f4, f30, f18
  766. FMADD f5, f5, f30, f19
  767. LFD f20, 0 * SIZE(CO3)
  768. LFD f21, 1 * SIZE(CO3)
  769. LFD f22, 0 * SIZE(CO4)
  770. LFD f23, 1 * SIZE(CO4)
  771. FMADD f8, f8, f30, f20
  772. FMADD f9, f9, f30, f21
  773. FMADD f12, f12, f30, f22
  774. FMADD f13, f13, f30, f23
  775. #else
  776. FMUL f0, f0, f30
  777. FMUL f1, f1, f30
  778. FMUL f4, f4, f30
  779. FMUL f5, f5, f30
  780. FMUL f8, f8, f30
  781. FMUL f9, f9, f30
  782. FMUL f12, f12, f30
  783. FMUL f13, f13, f30
  784. #endif
  785. STFD f0, 0 * SIZE(CO1)
  786. STFD f1, 1 * SIZE(CO1)
  787. STFD f4, 0 * SIZE(CO2)
  788. STFD f5, 1 * SIZE(CO2)
  789. lfs f0, FZERO
  790. fmr f1, f0
  791. fmr f2, f0
  792. fmr f3, f0
  793. STFD f8, 0 * SIZE(CO3)
  794. STFD f9, 1 * SIZE(CO3)
  795. STFD f12, 0 * SIZE(CO4)
  796. STFD f13, 1 * SIZE(CO4)
  797. fmr f4, f0
  798. fmr f5, f0
  799. fmr f6, f0
  800. fmr f7, f0
  801. fmr f8, f0
  802. fmr f9, f0
  803. fmr f10, f0
  804. fmr f11, f0
  805. fmr f12, f0
  806. fmr f13, f0
  807. fmr f14, f0
  808. fmr f15, f0
  809. addi CO1, CO1, 2 * SIZE
  810. addi CO2, CO2, 2 * SIZE
  811. addi CO3, CO3, 2 * SIZE
  812. addi CO4, CO4, 2 * SIZE
  813. #ifdef TRMMKERNEL
  814. #if ( defined(LEFT) && defined(TRANSA)) || \
  815. (!defined(LEFT) && !defined(TRANSA))
  816. sub TEMP, K, KK
  817. #ifdef LEFT
  818. addi TEMP, TEMP, -2
  819. #else
  820. addi TEMP, TEMP, -4
  821. #endif
  822. slwi r0, TEMP, 1 + BASE_SHIFT
  823. slwi TEMP, TEMP, 2 + BASE_SHIFT
  824. add AO, AO, r0
  825. add BO, BO, TEMP
  826. #endif
  827. #ifdef LEFT
  828. addi KK, KK, 2
  829. #endif
  830. #endif
  831. .align 4
  832. .L30:
  833. andi. I, M, 1
  834. ble .L39
  835. #if defined(TRMMKERNEL)
  836. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  837. LFD f16, 0 * SIZE(AO)
  838. LFD f17, 1 * SIZE(AO)
  839. LFD f18, 2 * SIZE(AO)
  840. LFD f19, 3 * SIZE(AO)
  841. LFD f20, 0 * SIZE(B)
  842. LFD f21, 1 * SIZE(B)
  843. LFD f22, 2 * SIZE(B)
  844. LFD f23, 3 * SIZE(B)
  845. LFD f24, 4 * SIZE(B)
  846. LFD f25, 5 * SIZE(B)
  847. LFD f26, 6 * SIZE(B)
  848. LFD f27, 7 * SIZE(B)
  849. mr BO, B
  850. #else
  851. slwi r0, KK, 0 + BASE_SHIFT
  852. slwi TEMP, KK, 2 + BASE_SHIFT
  853. add AO, AO, r0
  854. add BO, B, TEMP
  855. LFD f16, 0 * SIZE(AO)
  856. LFD f17, 1 * SIZE(AO)
  857. LFD f18, 2 * SIZE(AO)
  858. LFD f19, 3 * SIZE(AO)
  859. LFD f20, 0 * SIZE(BO)
  860. LFD f21, 1 * SIZE(BO)
  861. LFD f22, 2 * SIZE(BO)
  862. LFD f23, 3 * SIZE(BO)
  863. LFD f24, 4 * SIZE(BO)
  864. LFD f25, 5 * SIZE(BO)
  865. LFD f26, 6 * SIZE(BO)
  866. LFD f27, 7 * SIZE(BO)
  867. #endif
  868. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  869. sub TEMP, K, KK
  870. #elif defined(LEFT)
  871. addi TEMP, KK, 1
  872. #else
  873. addi TEMP, KK, 4
  874. #endif
  875. srawi. TEMP, TEMP, 2
  876. mtspr CTR, TEMP
  877. #else
  878. LFD f16, 0 * SIZE(AO)
  879. LFD f17, 1 * SIZE(AO)
  880. LFD f18, 2 * SIZE(AO)
  881. LFD f19, 3 * SIZE(AO)
  882. LFD f20, 0 * SIZE(B)
  883. LFD f21, 1 * SIZE(B)
  884. LFD f22, 2 * SIZE(B)
  885. LFD f23, 3 * SIZE(B)
  886. LFD f24, 4 * SIZE(B)
  887. LFD f25, 5 * SIZE(B)
  888. LFD f26, 6 * SIZE(B)
  889. LFD f27, 7 * SIZE(B)
  890. srawi. r0, K, 2
  891. mtspr CTR, r0
  892. mr BO, B
  893. #endif
  894. ble .L35
  895. .align 5
  896. .L32:
  897. FMADD f0, f16, f20, f0
  898. LFD f20, 8 * SIZE(BO)
  899. FMADD f4, f16, f21, f4
  900. LFD f21, 9 * SIZE(BO)
  901. FMADD f8, f16, f22, f8
  902. LFD f22, 10 * SIZE(BO)
  903. FMADD f12, f16, f23, f12
  904. LFD f23, 11 * SIZE(BO)
  905. LFDU f16, 4 * SIZE(AO)
  906. FMADD f1, f17, f24, f1
  907. LFD f24, 12 * SIZE(BO)
  908. FMADD f5, f17, f25, f5
  909. LFD f25, 13 * SIZE(BO)
  910. FMADD f9, f17, f26, f9
  911. LFD f26, 14 * SIZE(BO)
  912. FMADD f13, f17, f27, f13
  913. LFD f27, 15 * SIZE(BO)
  914. LFD f17, 1 * SIZE(AO)
  915. FMADD f0, f18, f20, f0
  916. LFDU f20, 16 * SIZE(BO)
  917. FMADD f4, f18, f21, f4
  918. LFD f21, 1 * SIZE(BO)
  919. FMADD f8, f18, f22, f8
  920. LFD f22, 2 * SIZE(BO)
  921. FMADD f12, f18, f23, f12
  922. LFD f23, 3 * SIZE(BO)
  923. LFD f18, 2 * SIZE(AO)
  924. FMADD f1, f19, f24, f1
  925. LFD f24, 4 * SIZE(BO)
  926. FMADD f5, f19, f25, f5
  927. LFD f25, 5 * SIZE(BO)
  928. FMADD f9, f19, f26, f9
  929. LFD f26, 6 * SIZE(BO)
  930. FMADD f13, f19, f27, f13
  931. LFD f27, 7 * SIZE(BO)
  932. LFD f19, 3 * SIZE(AO)
  933. bdnz .L32
  934. fadd f0, f1, f0
  935. fadd f4, f5, f4
  936. fadd f8, f9, f8
  937. fadd f12, f13, f12
  938. .align 4
  939. .L35:
  940. lfd f30, ALPHA
  941. #if defined(TRMMKERNEL)
  942. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  943. sub TEMP, K, KK
  944. #elif defined(LEFT)
  945. addi TEMP, KK, 1
  946. #else
  947. addi TEMP, KK, 4
  948. #endif
  949. andi. TEMP, TEMP, 3
  950. mtspr CTR, TEMP
  951. #else
  952. andi. r0, K, 3
  953. mtspr CTR, r0
  954. #endif
  955. ble+ .L38
  956. .align 4
  957. .L36:
  958. FMADD f0, f16, f20, f0
  959. LFDU f20, 4 * SIZE(BO)
  960. FMADD f4, f16, f21, f4
  961. LFD f21, 1 * SIZE(BO)
  962. FMADD f8, f16, f22, f8
  963. LFD f22, 2 * SIZE(BO)
  964. FMADD f12, f16, f23, f12
  965. LFDU f16, 1 * SIZE(AO)
  966. LFD f23, 3 * SIZE(BO)
  967. bdnz .L36
  968. .align 4
  969. .L38: /* write-back for the single-row (M & 1) tail of the 4-column panel */
  970. #ifndef TRMMKERNEL
  971. LFD f16, 0 * SIZE(CO1) /* GEMM path: fetch the four existing C values */
  972. LFD f18, 0 * SIZE(CO2)
  973. LFD f20, 0 * SIZE(CO3)
  974. LFD f22, 0 * SIZE(CO4)
  975. FMADD f0, f0, f30, f16 /* f30 holds ALPHA (loaded at .L35): acc * alpha + C */
  976. FMADD f4, f4, f30, f18
  977. FMADD f8, f8, f30, f20
  978. FMADD f12, f12, f30, f22
  979. #else
  980. FMUL f0, f0, f30 /* TRMM path: C is not read, result is acc * alpha */
  981. FMUL f4, f4, f30
  982. FMUL f8, f8, f30
  983. FMUL f12, f12, f30
  984. #endif
  985. STFD f0, 0 * SIZE(CO1) /* one element per C pointer */
  986. STFD f4, 0 * SIZE(CO2)
  987. STFD f8, 0 * SIZE(CO3)
  988. STFD f12, 0 * SIZE(CO4)
  989. lfs f0, FZERO /* re-zero the accumulators this tail used (f0/f1, f4/f5, f8/f9, f12/f13) */
  990. fmr f1, f0
  991. fmr f4, f0
  992. fmr f5, f0
  993. fmr f8, f0
  994. fmr f9, f0
  995. fmr f12, f0
  996. fmr f13, f0
  997. #ifdef TRMMKERNEL
  998. #if ( defined(LEFT) && defined(TRANSA)) || \
  999. (!defined(LEFT) && !defined(TRANSA))
  1000. sub TEMP, K, KK /* TEMP = K - KK, then minus this tile's extent below */
  1001. #ifdef LEFT
  1002. addi TEMP, TEMP, -1 /* LEFT: tile is 1 row of A */
  1003. #else
  1004. addi TEMP, TEMP, -4 /* !LEFT: tile is 4 columns of B */
  1005. #endif
  1006. slwi r0, TEMP, 0 + BASE_SHIFT /* A advance: TEMP * 1 element */
  1007. slwi TEMP, TEMP, 2 + BASE_SHIFT /* B advance: TEMP * 4 elements */
  1008. add AO, AO, r0
  1009. add BO, BO, TEMP
  1010. #endif
  1011. #ifdef LEFT
  1012. addi KK, KK, 1 /* this tail handles one row, so KK steps by 1, matching the KK + 1 and -1 adjustments used for this tile */
  1013. #endif
  1014. #endif
  1015. .align 4
  1016. .L39:
  1017. #if defined(TRMMKERNEL) && !defined(LEFT)
  1018. addi KK, KK, 4
  1019. #endif
  1020. mr B, BO
  1021. addic. J, J, -1
  1022. bgt .L10
  1023. .align 4
  1024. .L40:
  1025. mr CO1, C
  1026. add CO2, C, LDC
  1027. andi. J, N, 2
  1028. ble .L70
  1029. #if defined(TRMMKERNEL) && defined(LEFT)
  1030. mr KK, OFFSET
  1031. #endif
  1032. lfs f0, FZERO
  1033. fmr f1, f0
  1034. fmr f2, f0
  1035. fmr f3, f0
  1036. fmr f4, f0
  1037. fmr f5, f0
  1038. fmr f6, f0
  1039. fmr f7, f0
  1040. srawi. I, M, 2
  1041. add C, CO2, LDC
  1042. mr AO, A
  1043. ble .L50
  1044. .align 4
  1045. .L41:
  1046. #if defined(TRMMKERNEL)
  1047. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1048. LFD f16, 0 * SIZE(AO)
  1049. LFD f17, 1 * SIZE(AO)
  1050. LFD f18, 2 * SIZE(AO)
  1051. LFD f19, 3 * SIZE(AO)
  1052. LFD f20, 0 * SIZE(B)
  1053. LFD f21, 1 * SIZE(B)
  1054. LFD f22, 2 * SIZE(B)
  1055. LFD f23, 3 * SIZE(B)
  1056. mr BO, B
  1057. #else
  1058. slwi r0, KK, 2 + BASE_SHIFT
  1059. slwi TEMP, KK, 1 + BASE_SHIFT
  1060. add AO, AO, r0
  1061. add BO, B, TEMP
  1062. LFD f16, 0 * SIZE(AO)
  1063. LFD f17, 1 * SIZE(AO)
  1064. LFD f18, 2 * SIZE(AO)
  1065. LFD f19, 3 * SIZE(AO)
  1066. LFD f20, 0 * SIZE(BO)
  1067. LFD f21, 1 * SIZE(BO)
  1068. LFD f22, 2 * SIZE(BO)
  1069. LFD f23, 3 * SIZE(BO)
  1070. #endif
  1071. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1072. sub TEMP, K, KK
  1073. #elif defined(LEFT)
  1074. addi TEMP, KK, 4
  1075. #else
  1076. addi TEMP, KK, 2
  1077. #endif
  1078. srawi. TEMP, TEMP, 2
  1079. mtspr CTR, TEMP
  1080. #else
  1081. LFD f16, 0 * SIZE(AO)
  1082. LFD f17, 1 * SIZE(AO)
  1083. LFD f18, 2 * SIZE(AO)
  1084. LFD f19, 3 * SIZE(AO)
  1085. LFD f20, 0 * SIZE(B)
  1086. LFD f21, 1 * SIZE(B)
  1087. LFD f22, 2 * SIZE(B)
  1088. LFD f23, 3 * SIZE(B)
  1089. srawi. r0, K, 2
  1090. mtspr CTR, r0
  1091. mr BO, B
  1092. #endif
  1093. ble .L45
  1094. .align 5
  1095. .L42:
  1096. FMADD f0, f16, f20, f0
  1097. FMADD f1, f17, f20, f1
  1098. FMADD f2, f18, f20, f2
  1099. FMADD f3, f19, f20, f3
  1100. LFD f20, 4 * SIZE(BO)
  1101. FMADD f4, f16, f21, f4
  1102. LFD f16, 4 * SIZE(AO)
  1103. FMADD f5, f17, f21, f5
  1104. LFD f17, 5 * SIZE(AO)
  1105. FMADD f6, f18, f21, f6
  1106. LFD f18, 6 * SIZE(AO)
  1107. FMADD f7, f19, f21, f7
  1108. LFD f19, 7 * SIZE(AO)
  1109. FMADD f0, f16, f22, f0
  1110. LFD f21, 5 * SIZE(BO)
  1111. FMADD f1, f17, f22, f1
  1112. FMADD f2, f18, f22, f2
  1113. FMADD f3, f19, f22, f3
  1114. LFD f22, 6 * SIZE(BO)
  1115. FMADD f4, f16, f23, f4
  1116. LFD f16, 8 * SIZE(AO)
  1117. FMADD f5, f17, f23, f5
  1118. LFD f17, 9 * SIZE(AO)
  1119. FMADD f6, f18, f23, f6
  1120. LFD f18, 10 * SIZE(AO)
  1121. FMADD f7, f19, f23, f7
  1122. LFD f19, 11 * SIZE(AO)
  1123. FMADD f0, f16, f20, f0
  1124. LFD f23, 7 * SIZE(BO)
  1125. FMADD f1, f17, f20, f1
  1126. FMADD f2, f18, f20, f2
  1127. FMADD f3, f19, f20, f3
  1128. LFDU f20, 8 * SIZE(BO)
  1129. FMADD f4, f16, f21, f4
  1130. LFD f16, 12 * SIZE(AO)
  1131. FMADD f5, f17, f21, f5
  1132. LFD f17, 13 * SIZE(AO)
  1133. FMADD f6, f18, f21, f6
  1134. LFD f18, 14 * SIZE(AO)
  1135. FMADD f7, f19, f21, f7
  1136. LFD f19, 15 * SIZE(AO)
  1137. FMADD f0, f16, f22, f0
  1138. LFD f21, 1 * SIZE(BO)
  1139. FMADD f1, f17, f22, f1
  1140. FMADD f2, f18, f22, f2
  1141. FMADD f3, f19, f22, f3
  1142. LFD f22, 2 * SIZE(BO)
  1143. FMADD f4, f16, f23, f4
  1144. LFDU f16, 16 * SIZE(AO)
  1145. FMADD f5, f17, f23, f5
  1146. LFD f17, 1 * SIZE(AO)
  1147. FMADD f6, f18, f23, f6
  1148. LFD f18, 2 * SIZE(AO)
  1149. FMADD f7, f19, f23, f7
  1150. LFD f19, 3 * SIZE(AO)
  1151. LFD f23, 3 * SIZE(BO)
  1152. bdnz .L42
  1153. .align 4
  1154. .L45:
  1155. lfd f30, ALPHA
  1156. #if defined(TRMMKERNEL)
  1157. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1158. sub TEMP, K, KK
  1159. #elif defined(LEFT)
  1160. addi TEMP, KK, 4
  1161. #else
  1162. addi TEMP, KK, 2
  1163. #endif
  1164. andi. TEMP, TEMP, 3
  1165. mtspr CTR, TEMP
  1166. #else
  1167. andi. r0, K, 3
  1168. mtspr CTR, r0
  1169. #endif
  1170. ble+ .L48
  1171. .align 4
  1172. .L46:
  1173. FMADD f0, f16, f20, f0
  1174. FMADD f1, f17, f20, f1
  1175. FMADD f2, f18, f20, f2
  1176. FMADD f3, f19, f20, f3
  1177. LFDU f20, 2 * SIZE(BO)
  1178. FMADD f4, f16, f21, f4
  1179. LFDU f16, 4 * SIZE(AO)
  1180. FMADD f5, f17, f21, f5
  1181. LFD f17, 1 * SIZE(AO)
  1182. FMADD f6, f18, f21, f6
  1183. LFD f18, 2 * SIZE(AO)
  1184. FMADD f7, f19, f21, f7
  1185. LFD f19, 3 * SIZE(AO)
  1186. LFD f21, 1 * SIZE(BO)
  1187. bdnz .L46
  1188. .align 4
  1189. .L48:
  1190. #ifndef TRMMKERNEL
  1191. LFD f16, 0 * SIZE(CO1)
  1192. LFD f17, 1 * SIZE(CO1)
  1193. LFD f18, 2 * SIZE(CO1)
  1194. LFD f19, 3 * SIZE(CO1)
  1195. LFD f20, 0 * SIZE(CO2)
  1196. LFD f21, 1 * SIZE(CO2)
  1197. LFD f22, 2 * SIZE(CO2)
  1198. LFD f23, 3 * SIZE(CO2)
  1199. FMADD f0, f0, f30, f16
  1200. FMADD f1, f1, f30, f17
  1201. FMADD f2, f2, f30, f18
  1202. FMADD f3, f3, f30, f19
  1203. FMADD f4, f4, f30, f20
  1204. FMADD f5, f5, f30, f21
  1205. FMADD f6, f6, f30, f22
  1206. FMADD f7, f7, f30, f23
  1207. #else
  1208. FMUL f0, f0, f30
  1209. FMUL f1, f1, f30
  1210. FMUL f2, f2, f30
  1211. FMUL f3, f3, f30
  1212. FMUL f4, f4, f30
  1213. FMUL f5, f5, f30
  1214. FMUL f6, f6, f30
  1215. FMUL f7, f7, f30
  1216. #endif
  1217. STFD f0, 0 * SIZE(CO1)
  1218. STFD f1, 1 * SIZE(CO1)
  1219. STFD f2, 2 * SIZE(CO1)
  1220. STFD f3, 3 * SIZE(CO1)
  1221. lfs f0, FZERO
  1222. fmr f1, f0
  1223. fmr f2, f0
  1224. fmr f3, f0
  1225. STFD f4, 0 * SIZE(CO2)
  1226. STFD f5, 1 * SIZE(CO2)
  1227. STFD f6, 2 * SIZE(CO2)
  1228. STFD f7, 3 * SIZE(CO2)
  1229. fmr f4, f0
  1230. fmr f5, f0
  1231. fmr f6, f0
  1232. fmr f7, f0
  1233. addi CO1, CO1, 4 * SIZE
  1234. addi CO2, CO2, 4 * SIZE
  1235. #ifdef TRMMKERNEL
  1236. #if ( defined(LEFT) && defined(TRANSA)) || \
  1237. (!defined(LEFT) && !defined(TRANSA))
  1238. sub TEMP, K, KK
  1239. #ifdef LEFT
  1240. addi TEMP, TEMP, -4
  1241. #else
  1242. addi TEMP, TEMP, -2
  1243. #endif
  1244. slwi r0, TEMP, 2 + BASE_SHIFT
  1245. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1246. add AO, AO, r0
  1247. add BO, BO, TEMP
  1248. #endif
  1249. #ifdef LEFT
  1250. addi KK, KK, 4
  1251. #endif
  1252. #endif
  1253. addic. I, I, -1
  1254. bgt+ .L41
  /*
   * .L50 -- 2x2 micro-tile of the two-column (N & 2) panel.
   *
   * Runs at most once, when bit 1 of M is set.  Register use in this block:
   *   f16-f19  A operands, two per k-step, read through AO
   *   f20-f27  B operands, two per k-step, read through BO
   *   f0-f3    products of k-steps 0 and 2 of every unrolled group
   *   f4-f7    products of k-steps 1 and 3, folded into f0-f3 at .L58
   *   f30      alpha
   * Result layout: f0,f1 go to CO1[0],CO1[1]; f2,f3 go to CO2[0],CO2[1].
   *
   * NOTE(review): LFD/LFDU/FMADD/FADD/FMUL/STFD, SIZE, BASE_SHIFT and the
   * symbolic register names are defined outside this excerpt.  LFDU is
   * assumed to be the update-form load (it also advances its base register)
   * and f0-f7 are assumed to be zero on entry -- confirm against the headers
   * and the code that branches here.
   */
  1255. .align 4
  1256. .L50:
  1257. andi. I, M, 2
  1258. ble .L60
  1259. #if defined(TRMMKERNEL)
  1260. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK skip: preload from the current AO and from the head of B. */
  1261. LFD f16, 0 * SIZE(AO)
  1262. LFD f17, 1 * SIZE(AO)
  1263. LFD f18, 2 * SIZE(AO)
  1264. LFD f19, 3 * SIZE(AO)
  1265. LFD f20, 0 * SIZE(B)
  1266. LFD f21, 1 * SIZE(B)
  1267. LFD f22, 2 * SIZE(B)
  1268. LFD f23, 3 * SIZE(B)
  1269. LFD f24, 4 * SIZE(B)
  1270. LFD f25, 5 * SIZE(B)
  1271. LFD f26, 6 * SIZE(B)
  1272. LFD f27, 7 * SIZE(B)
  1273. mr BO, B
  1274. #else
  /* Skip KK k-steps first.  A and B both hold two elements per k-step   */
  /* for this tile, hence the identical shift of 1 + BASE_SHIFT.         */
  1275. slwi r0, KK, 1 + BASE_SHIFT
  1276. slwi TEMP, KK, 1 + BASE_SHIFT
  1277. add AO, AO, r0
  1278. add BO, B, TEMP
  1279. LFD f16, 0 * SIZE(AO)
  1280. LFD f17, 1 * SIZE(AO)
  1281. LFD f18, 2 * SIZE(AO)
  1282. LFD f19, 3 * SIZE(AO)
  1283. LFD f20, 0 * SIZE(BO)
  1284. LFD f21, 1 * SIZE(BO)
  1285. LFD f22, 2 * SIZE(BO)
  1286. LFD f23, 3 * SIZE(BO)
  1287. LFD f24, 4 * SIZE(BO)
  1288. LFD f25, 5 * SIZE(BO)
  1289. LFD f26, 6 * SIZE(BO)
  1290. LFD f27, 7 * SIZE(BO)
  1291. #endif
  /* TEMP = number of k-steps this tile has to process (K - KK or KK + 2). */
  1292. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1293. sub TEMP, K, KK
  1294. #elif defined(LEFT)
  1295. addi TEMP, KK, 2
  1296. #else
  1297. addi TEMP, KK, 2
  1298. #endif
  /* CTR = TEMP / 4: the main loop below handles four k-steps per pass. */
  1299. srawi. TEMP, TEMP, 2
  1300. mtspr CTR, TEMP
  1301. #else
  /* Plain GEMM: preload the first operands and run over all K k-steps. */
  1302. LFD f16, 0 * SIZE(AO)
  1303. LFD f17, 1 * SIZE(AO)
  1304. LFD f18, 2 * SIZE(AO)
  1305. LFD f19, 3 * SIZE(AO)
  1306. LFD f20, 0 * SIZE(B)
  1307. LFD f21, 1 * SIZE(B)
  1308. LFD f22, 2 * SIZE(B)
  1309. LFD f23, 3 * SIZE(B)
  1310. LFD f24, 4 * SIZE(B)
  1311. LFD f25, 5 * SIZE(B)
  1312. LFD f26, 6 * SIZE(B)
  1313. LFD f27, 7 * SIZE(B)
  1314. srawi. r0, K, 2
  1315. mtspr CTR, r0
  1316. mr BO, B
  1317. #endif
  /* CR0 still reflects the srawi. above (mtspr and mr leave it alone):  */
  /* skip the unrolled loop when the quotient is <= 0.                   */
  1318. ble .L55
  1319. .align 5
  /* Main loop: four k-steps per pass.  Each pass reads 8 elements of A  */
  /* and 8 of B; the loads for the next k-steps are interleaved with the */
  /* multiply-adds of the current ones.                                  */
  1320. .L52:
  1321. FMADD f0, f16, f20, f0
  1322. FMADD f1, f17, f20, f1
  1323. LFDU f20, 8 * SIZE(BO)
  1324. FMADD f2, f16, f21, f2
  1325. LFD f16, 4 * SIZE(AO)
  1326. FMADD f3, f17, f21, f3
  1327. LFD f17, 5 * SIZE(AO)
  1328. FMADD f4, f18, f22, f4
  1329. LFD f21, 1 * SIZE(BO)
  1330. FMADD f5, f19, f22, f5
  1331. LFD f22, 2 * SIZE(BO)
  1332. FMADD f6, f18, f23, f6
  1333. LFD f18, 6 * SIZE(AO)
  1334. FMADD f7, f19, f23, f7
  1335. LFD f19, 7 * SIZE(AO)
  1336. FMADD f0, f16, f24, f0
  1337. LFD f23, 3 * SIZE(BO)
  1338. FMADD f1, f17, f24, f1
  1339. LFD f24, 4 * SIZE(BO)
  1340. FMADD f2, f16, f25, f2
  1341. LFDU f16, 8 * SIZE(AO)
  1342. FMADD f3, f17, f25, f3
  1343. LFD f17, 1 * SIZE(AO)
  1344. FMADD f4, f18, f26, f4
  1345. LFD f25, 5 * SIZE(BO)
  1346. FMADD f5, f19, f26, f5
  1347. LFD f26, 6 * SIZE(BO)
  1348. FMADD f6, f18, f27, f6
  1349. LFD f18, 2 * SIZE(AO)
  1350. FMADD f7, f19, f27, f7
  1351. LFD f19, 3 * SIZE(AO)
  1352. LFD f27, 7 * SIZE(BO)
  1353. bdnz .L52
  1354. .align 4
  /* .L55: load alpha and set CTR to the k-step remainder (count & 3). */
  1355. .L55:
  1356. lfd f30, ALPHA
  1357. #if defined(TRMMKERNEL)
  1358. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1359. sub TEMP, K, KK
  1360. #elif defined(LEFT)
  1361. addi TEMP, KK, 2
  1362. #else
  1363. addi TEMP, KK, 2
  1364. #endif
  1365. andi. TEMP, TEMP, 3
  1366. mtspr CTR, TEMP
  1367. #else
  1368. andi. r0, K, 3
  1369. mtspr CTR, r0
  1370. #endif
  /* Remainder of zero: go straight to the write-back. */
  1371. ble+ .L58
  1372. .align 4
  /* Remainder loop: one k-step per pass, accumulating into f0-f3 only. */
  1373. .L56:
  1374. FMADD f0, f16, f20, f0
  1375. FMADD f1, f17, f20, f1
  1376. LFDU f20, 2 * SIZE(BO)
  1377. FMADD f2, f16, f21, f2
  1378. LFDU f16, 2 * SIZE(AO)
  1379. FMADD f3, f17, f21, f3
  1380. LFD f17, 1 * SIZE(AO)
  1381. LFD f21, 1 * SIZE(BO)
  1382. bdnz .L56
  1383. .align 4
  /* .L58: fold the two accumulator sets, scale by alpha and store.      */
  /* Plain GEMM adds the scaled sum to the values already in C; the TRMM */
  /* build overwrites C with the scaled sum.                             */
  1384. .L58:
  1385. #ifndef TRMMKERNEL
  1386. LFD f16, 0 * SIZE(CO1)
  1387. LFD f17, 1 * SIZE(CO1)
  1388. LFD f18, 0 * SIZE(CO2)
  1389. LFD f19, 1 * SIZE(CO2)
  1390. FADD f0, f4, f0
  1391. FADD f1, f5, f1
  1392. FADD f2, f6, f2
  1393. FADD f3, f7, f3
  1394. FMADD f0, f0, f30, f16
  1395. FMADD f1, f1, f30, f17
  1396. FMADD f2, f2, f30, f18
  1397. FMADD f3, f3, f30, f19
  1398. #else
  1399. FADD f0, f4, f0
  1400. FADD f1, f5, f1
  1401. FADD f2, f6, f2
  1402. FADD f3, f7, f3
  1403. FMUL f0, f0, f30
  1404. FMUL f1, f1, f30
  1405. FMUL f2, f2, f30
  1406. FMUL f3, f3, f30
  1407. #endif
  1408. STFD f0, 0 * SIZE(CO1)
  1409. STFD f1, 1 * SIZE(CO1)
  1410. STFD f2, 0 * SIZE(CO2)
  1411. STFD f3, 1 * SIZE(CO2)
  /* Clear f0-f7 for the next tile and step both C pointers by two elements. */
  1412. lfs f0, FZERO
  1413. fmr f1, f0
  1414. fmr f2, f0
  1415. fmr f3, f0
  1416. fmr f4, f0
  1417. fmr f5, f0
  1418. fmr f6, f0
  1419. fmr f7, f0
  1420. addi CO1, CO1, 2 * SIZE
  1421. addi CO2, CO2, 2 * SIZE
  1422. #ifdef TRMMKERNEL
  1423. #if ( defined(LEFT) && defined(TRANSA)) || \
  1424. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 2 k-steps that were not processed above; move AO    */
  /* and BO past them (two elements each per k-step).                    */
  1425. sub TEMP, K, KK
  1426. #ifdef LEFT
  1427. addi TEMP, TEMP, -2
  1428. #else
  1429. addi TEMP, TEMP, -2
  1430. #endif
  1431. slwi r0, TEMP, 1 + BASE_SHIFT
  1432. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1433. add AO, AO, r0
  1434. add BO, BO, TEMP
  1435. #endif
  1436. #ifdef LEFT
  /* Left-side TRMM: KK advances by the two rows just finished. */
  1437. addi KK, KK, 2
  1438. #endif
  1439. #endif
  /*
   * .L60 -- 1x2 micro-tile of the two-column (N & 2) panel.
   *
   * Runs at most once, when bit 0 of M is set.  Register use in this block:
   *   f16-f19  A operands, one per k-step, read through AO
   *   f20-f27  B operands, two per k-step, read through BO
   *   f0,f1    products of k-steps 0 and 2 of every unrolled group
   *   f2,f3    products of k-steps 1 and 3, folded into f0,f1 at .L68
   *   f30      alpha
   * Result layout: f0 goes to CO1[0], f1 goes to CO2[0].
   *
   * NOTE(review): the load/FMA macros, SIZE and BASE_SHIFT are defined
   * outside this excerpt; LFDU is assumed to be the update-form load that
   * also advances its base register -- confirm against the headers.
   */
  1440. .align 4
  1441. .L60:
  1442. andi. I, M, 1
  1443. ble .L69
  1444. #if defined(TRMMKERNEL)
  1445. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK skip: preload from the current AO and from the head of B. */
  1446. LFD f16, 0 * SIZE(AO)
  1447. LFD f17, 1 * SIZE(AO)
  1448. LFD f18, 2 * SIZE(AO)
  1449. LFD f19, 3 * SIZE(AO)
  1450. LFD f20, 0 * SIZE(B)
  1451. LFD f21, 1 * SIZE(B)
  1452. LFD f22, 2 * SIZE(B)
  1453. LFD f23, 3 * SIZE(B)
  1454. LFD f24, 4 * SIZE(B)
  1455. LFD f25, 5 * SIZE(B)
  1456. LFD f26, 6 * SIZE(B)
  1457. LFD f27, 7 * SIZE(B)
  1458. mr BO, B
  1459. #else
  /* Skip KK k-steps: one element of A per k-step (shift 0 + BASE_SHIFT) */
  /* and two elements of B per k-step (shift 1 + BASE_SHIFT).            */
  1460. slwi r0, KK, 0 + BASE_SHIFT
  1461. slwi TEMP, KK, 1 + BASE_SHIFT
  1462. add AO, AO, r0
  1463. add BO, B, TEMP
  1464. LFD f16, 0 * SIZE(AO)
  1465. LFD f17, 1 * SIZE(AO)
  1466. LFD f18, 2 * SIZE(AO)
  1467. LFD f19, 3 * SIZE(AO)
  1468. LFD f20, 0 * SIZE(BO)
  1469. LFD f21, 1 * SIZE(BO)
  1470. LFD f22, 2 * SIZE(BO)
  1471. LFD f23, 3 * SIZE(BO)
  1472. LFD f24, 4 * SIZE(BO)
  1473. LFD f25, 5 * SIZE(BO)
  1474. LFD f26, 6 * SIZE(BO)
  1475. LFD f27, 7 * SIZE(BO)
  1476. #endif
  /* TEMP = number of k-steps to process: K - KK, KK + 1 or KK + 2. */
  1477. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1478. sub TEMP, K, KK
  1479. #elif defined(LEFT)
  1480. addi TEMP, KK, 1
  1481. #else
  1482. addi TEMP, KK, 2
  1483. #endif
  /* CTR = TEMP / 4: the main loop below handles four k-steps per pass. */
  1484. srawi. TEMP, TEMP, 2
  1485. mtspr CTR, TEMP
  1486. #else
  /* Plain GEMM: preload the first operands and run over all K k-steps. */
  1487. LFD f16, 0 * SIZE(AO)
  1488. LFD f17, 1 * SIZE(AO)
  1489. LFD f18, 2 * SIZE(AO)
  1490. LFD f19, 3 * SIZE(AO)
  1491. LFD f20, 0 * SIZE(B)
  1492. LFD f21, 1 * SIZE(B)
  1493. LFD f22, 2 * SIZE(B)
  1494. LFD f23, 3 * SIZE(B)
  1495. LFD f24, 4 * SIZE(B)
  1496. LFD f25, 5 * SIZE(B)
  1497. LFD f26, 6 * SIZE(B)
  1498. LFD f27, 7 * SIZE(B)
  1499. srawi. r0, K, 2
  1500. mtspr CTR, r0
  1501. mr BO, B
  1502. #endif
  /* CR0 still reflects the srawi. above: skip the loop when the count is <= 0. */
  1503. ble .L65
  1504. .align 5
  /* Main loop: four k-steps per pass, reading 4 elements of A and 8 of B. */
  1505. .L62:
  1506. FMADD f0, f16, f20, f0
  1507. LFDU f20, 8 * SIZE(BO)
  1508. FMADD f1, f16, f21, f1
  1509. LFDU f16, 4 * SIZE(AO)
  1510. LFD f21, 1 * SIZE(BO)
  1511. FMADD f2, f17, f22, f2
  1512. LFD f22, 2 * SIZE(BO)
  1513. FMADD f3, f17, f23, f3
  1514. LFD f17, 1 * SIZE(AO)
  1515. LFD f23, 3 * SIZE(BO)
  1516. FMADD f0, f18, f24, f0
  1517. LFD f24, 4 * SIZE(BO)
  1518. FMADD f1, f18, f25, f1
  1519. LFD f18, 2 * SIZE(AO)
  1520. LFD f25, 5 * SIZE(BO)
  1521. FMADD f2, f19, f26, f2
  1522. LFD f26, 6 * SIZE(BO)
  1523. FMADD f3, f19, f27, f3
  1524. LFD f19, 3 * SIZE(AO)
  1525. LFD f27, 7 * SIZE(BO)
  1526. bdnz .L62
  1527. .align 4
  /* .L65: load alpha and set CTR to the k-step remainder (count & 3). */
  1528. .L65:
  1529. lfd f30, ALPHA
  1530. #if defined(TRMMKERNEL)
  1531. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1532. sub TEMP, K, KK
  1533. #elif defined(LEFT)
  1534. addi TEMP, KK, 1
  1535. #else
  1536. addi TEMP, KK, 2
  1537. #endif
  1538. andi. TEMP, TEMP, 3
  1539. mtspr CTR, TEMP
  1540. #else
  1541. andi. r0, K, 3
  1542. mtspr CTR, r0
  1543. #endif
  /* Remainder of zero: go straight to the write-back. */
  1544. ble+ .L68
  1545. .align 4
  /* Remainder loop: one k-step per pass, accumulating into f0 and f1. */
  1546. .L66:
  1547. FMADD f0, f16, f20, f0
  1548. LFDU f20, 2 * SIZE(BO)
  1549. FMADD f1, f16, f21, f1
  1550. LFDU f16, 1 * SIZE(AO)
  1551. LFD f21, 1 * SIZE(BO)
  1552. bdnz .L66
  1553. .align 4
  /* .L68: fold f2,f3 into f0,f1, scale by alpha and store one element   */
  /* per C column.  Plain GEMM adds to the existing C values; the TRMM   */
  /* build overwrites them.                                              */
  1554. .L68:
  1555. #ifndef TRMMKERNEL
  1556. LFD f16, 0 * SIZE(CO1)
  1557. LFD f18, 0 * SIZE(CO2)
  1558. FADD f0, f2, f0
  1559. FADD f1, f3, f1
  1560. FMADD f0, f0, f30, f16
  1561. FMADD f1, f1, f30, f18
  1562. #else
  1563. FADD f0, f2, f0
  1564. FADD f1, f3, f1
  1565. FMUL f0, f0, f30
  1566. FMUL f1, f1, f30
  1567. #endif
  1568. STFD f0, 0 * SIZE(CO1)
  1569. STFD f1, 0 * SIZE(CO2)
  /* Only f0, f1, f4 and f5 are cleared here; f2 and f3 keep their       */
  /* values.  The single-column path at .L70 further down clears f0-f3   */
  /* again before they are reused.                                       */
  1570. lfs f0, FZERO
  1571. fmr f1, f0
  1572. fmr f4, f0
  1573. fmr f5, f0
  1574. #ifdef TRMMKERNEL
  1575. #if ( defined(LEFT) && defined(TRANSA)) || \
  1576. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = k-steps not processed above (K - KK - 1 or K - KK - 2); move */
  /* AO past them at one element per step and BO at two per step.        */
  1577. sub TEMP, K, KK
  1578. #ifdef LEFT
  1579. addi TEMP, TEMP, -1
  1580. #else
  1581. addi TEMP, TEMP, -2
  1582. #endif
  1583. slwi r0, TEMP, 0 + BASE_SHIFT
  1584. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1585. add AO, AO, r0
  1586. add BO, BO, TEMP
  1587. #endif
  1588. #ifdef LEFT
  /* Left-side TRMM: KK advances by the single row just finished. */
  1589. addi KK, KK, 1
  1590. #endif
  1591. #endif
  /*
   * .L69 -- end of the two-column (N & 2) panel.
   * Right-side TRMM builds advance KK by 2, the number of columns just
   * processed.  B then takes the current value of BO, so the following
   * single-column section starts reading packed B from there
   * (presumably just past this panel's data -- confirm).
   */
  1592. .align 4
  1593. .L69:
  1594. #if defined(TRMMKERNEL) && !defined(LEFT)
  1595. addi KK, KK, 2
  1596. #endif
  1597. mr B, BO
  /*
   * .L70 -- set-up for the single-column (N & 1) panel.
   * Leaves for the epilogue (.L999) when N is even; otherwise points CO1
   * at C, restarts AO at A, clears the accumulators f0-f3 and computes
   * I = M / 4, the number of 4-row tiles handled by the .L71 loop.
   */
  1598. .align 4
  1599. .L70:
  1600. mr CO1, C
  1601. andi. J, N, 1
  1602. ble .L999
  1603. #if defined(TRMMKERNEL) && defined(LEFT)
  /* Left-side TRMM: each panel restarts KK from OFFSET. */
  1604. mr KK, OFFSET
  1605. #endif
  1606. lfs f0, FZERO
  1607. fmr f1, f0
  1608. fmr f2, f0
  1609. fmr f3, f0
  1610. srawi. I, M, 2
  1611. mr AO, A
  /* CR0 comes from the srawi. above (mr does not alter it): no 4-row    */
  /* tiles when M / 4 <= 0, so continue with the 2-row tile at .L80.     */
  1612. ble .L80
  /*
   * .L71 -- 4x1 micro-tile loop of the single-column (N & 1) panel.
   *
   * Executed I = M / 4 times.  Register use in this block:
   *   f16-f19  A operands, four per k-step, read through AO
   *   f20-f23  B operands, one per k-step, read through BO
   *   f0-f3    accumulators for the four rows of the tile
   *   f30      alpha
   * Result layout: f0..f3 go to CO1[0..3].
   *
   * NOTE(review): the load/FMA macros, SIZE and BASE_SHIFT are defined
   * outside this excerpt; LFDU is assumed to be the update-form load that
   * also advances its base register -- confirm against the headers.
   *
   * Change from the original listing: the plain-GEMM arm used to carry
   * its own "ble .L75" in addition to the one after the #endif.  CR0 is
   * not modified between the two, so the second could never be taken
   * after the first fell through.  Both arms now share the single branch
   * after the #endif, as the neighbouring tiles do.
   */
  1613. .align 4
  1614. .L71:
  1615. #if defined(TRMMKERNEL)
  1616. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK skip: preload from the current AO and from the head of B. */
  1617. LFD f16, 0 * SIZE(AO)
  1618. LFD f17, 1 * SIZE(AO)
  1619. LFD f18, 2 * SIZE(AO)
  1620. LFD f19, 3 * SIZE(AO)
  1621. LFD f20, 0 * SIZE(B)
  1622. LFD f21, 1 * SIZE(B)
  1623. LFD f22, 2 * SIZE(B)
  1624. LFD f23, 3 * SIZE(B)
  1625. mr BO, B
  1626. #else
  /* Skip KK k-steps: four elements of A per k-step (shift 2 + BASE_SHIFT) */
  /* and one element of B per k-step (shift 0 + BASE_SHIFT).               */
  1627. slwi r0, KK, 2 + BASE_SHIFT
  1628. slwi TEMP, KK, 0 + BASE_SHIFT
  1629. add AO, AO, r0
  1630. add BO, B, TEMP
  1631. LFD f16, 0 * SIZE(AO)
  1632. LFD f17, 1 * SIZE(AO)
  1633. LFD f18, 2 * SIZE(AO)
  1634. LFD f19, 3 * SIZE(AO)
  1635. LFD f20, 0 * SIZE(BO)
  1636. LFD f21, 1 * SIZE(BO)
  1637. LFD f22, 2 * SIZE(BO)
  1638. LFD f23, 3 * SIZE(BO)
  1639. #endif
  /* TEMP = number of k-steps to process: K - KK, KK + 4 or KK + 1. */
  1640. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1641. sub TEMP, K, KK
  1642. #elif defined(LEFT)
  1643. addi TEMP, KK, 4
  1644. #else
  1645. addi TEMP, KK, 1
  1646. #endif
  /* CTR = TEMP / 4: the main loop below handles four k-steps per pass. */
  1647. srawi. TEMP, TEMP, 2
  1648. mtspr CTR, TEMP
  1649. #else
  /* Plain GEMM: preload the first operands and run over all K k-steps. */
  1650. LFD f16, 0 * SIZE(AO)
  1651. LFD f17, 1 * SIZE(AO)
  1652. LFD f18, 2 * SIZE(AO)
  1653. LFD f19, 3 * SIZE(AO)
  1654. LFD f20, 0 * SIZE(B)
  1655. LFD f21, 1 * SIZE(B)
  1656. LFD f22, 2 * SIZE(B)
  1657. LFD f23, 3 * SIZE(B)
  1658. srawi. r0, K, 2
  1659. mtspr CTR, r0
  1660. mr BO, B
  1662. #endif
  /* Shared by both arms: CR0 still reflects the srawi. above (mtspr and */
  /* mr leave it alone), so skip the loop when the count is <= 0.        */
  1663. ble .L75
  1664. .align 5
  /* Main loop: four k-steps per pass, reading 16 elements of A and 4 of B. */
  1665. .L72:
  1666. FMADD f0, f16, f20, f0
  1667. LFD f16, 4 * SIZE(AO)
  1668. FMADD f1, f17, f20, f1
  1669. LFD f17, 5 * SIZE(AO)
  1670. FMADD f2, f18, f20, f2
  1671. LFD f18, 6 * SIZE(AO)
  1672. FMADD f3, f19, f20, f3
  1673. LFD f19, 7 * SIZE(AO)
  1674. LFDU f20, 4 * SIZE(BO)
  1675. FMADD f0, f16, f21, f0
  1676. LFD f16, 8 * SIZE(AO)
  1677. FMADD f1, f17, f21, f1
  1678. LFD f17, 9 * SIZE(AO)
  1679. FMADD f2, f18, f21, f2
  1680. LFD f18, 10 * SIZE(AO)
  1681. FMADD f3, f19, f21, f3
  1682. LFD f19, 11 * SIZE(AO)
  1683. LFD f21, 1 * SIZE(BO)
  1684. FMADD f0, f16, f22, f0
  1685. LFD f16, 12 * SIZE(AO)
  1686. FMADD f1, f17, f22, f1
  1687. LFD f17, 13 * SIZE(AO)
  1688. FMADD f2, f18, f22, f2
  1689. LFD f18, 14 * SIZE(AO)
  1690. FMADD f3, f19, f22, f3
  1691. LFD f19, 15 * SIZE(AO)
  1692. LFD f22, 2 * SIZE(BO)
  1693. FMADD f0, f16, f23, f0
  1694. LFDU f16, 16 * SIZE(AO)
  1695. FMADD f1, f17, f23, f1
  1696. LFD f17, 1 * SIZE(AO)
  1697. FMADD f2, f18, f23, f2
  1698. LFD f18, 2 * SIZE(AO)
  1699. FMADD f3, f19, f23, f3
  1700. LFD f19, 3 * SIZE(AO)
  1701. LFD f23, 3 * SIZE(BO)
  1702. bdnz .L72
  1703. .align 4
  /* .L75: load alpha and set CTR to the k-step remainder (count & 3). */
  1704. .L75:
  1705. lfd f30, ALPHA
  1706. #if defined(TRMMKERNEL)
  1707. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1708. sub TEMP, K, KK
  1709. #elif defined(LEFT)
  1710. addi TEMP, KK, 4
  1711. #else
  1712. addi TEMP, KK, 1
  1713. #endif
  1714. andi. TEMP, TEMP, 3
  1715. mtspr CTR, TEMP
  1716. #else
  1717. andi. r0, K, 3
  1718. mtspr CTR, r0
  1719. #endif
  /* Remainder of zero: go straight to the write-back. */
  1720. ble+ .L78
  1721. .align 4
  /* Remainder loop: one k-step per pass. */
  1722. .L76:
  1723. FMADD f0, f16, f20, f0
  1724. LFDU f16, 4 * SIZE(AO)
  1725. FMADD f1, f17, f20, f1
  1726. LFD f17, 1 * SIZE(AO)
  1727. FMADD f2, f18, f20, f2
  1728. LFD f18, 2 * SIZE(AO)
  1729. FMADD f3, f19, f20, f3
  1730. LFDU f20, 1 * SIZE(BO)
  1731. LFD f19, 3 * SIZE(AO)
  1732. bdnz .L76
  1733. .align 4
  /* .L78: scale by alpha and store four elements of the C column.       */
  /* Plain GEMM adds to the existing C values; the TRMM build overwrites */
  /* them.                                                               */
  1734. .L78:
  1735. #ifndef TRMMKERNEL
  1736. LFD f16, 0 * SIZE(CO1)
  1737. LFD f17, 1 * SIZE(CO1)
  1738. LFD f18, 2 * SIZE(CO1)
  1739. LFD f19, 3 * SIZE(CO1)
  1740. FMADD f0, f0, f30, f16
  1741. FMADD f1, f1, f30, f17
  1742. FMADD f2, f2, f30, f18
  1743. FMADD f3, f3, f30, f19
  1744. #else
  1745. FMUL f0, f0, f30
  1746. FMUL f1, f1, f30
  1747. FMUL f2, f2, f30
  1748. FMUL f3, f3, f30
  1749. #endif
  1750. STFD f0, 0 * SIZE(CO1)
  1751. STFD f1, 1 * SIZE(CO1)
  1752. STFD f2, 2 * SIZE(CO1)
  1753. STFD f3, 3 * SIZE(CO1)
  /* Clear the accumulators for the next tile. */
  1754. lfs f0, FZERO
  1755. fmr f1, f0
  1756. fmr f2, f0
  1757. fmr f3, f0
  1758. #ifdef TRMMKERNEL
  1759. #if ( defined(LEFT) && defined(TRANSA)) || \
  1760. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = k-steps not processed above (K - KK - 4 or K - KK - 1); move */
  /* AO past them at four elements per step and BO at one per step.      */
  1761. sub TEMP, K, KK
  1762. #ifdef LEFT
  1763. addi TEMP, TEMP, -4
  1764. #else
  1765. addi TEMP, TEMP, -1
  1766. #endif
  1767. slwi r0 , TEMP, 2 + BASE_SHIFT
  1768. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1769. add AO, AO, r0
  1770. add BO, BO, TEMP
  1771. #endif
  1772. #ifdef LEFT
  /* Left-side TRMM: KK advances by the four rows just finished. */
  1773. addi KK, KK, 4
  1774. #endif
  1775. #endif
  /* Next tile: step CO1 by four elements and loop while I > 0. */
  1776. addi CO1, CO1, 4 * SIZE
  1777. addic. I, I, -1
  1778. bgt+ .L71
  /*
   * .L80 -- 2x1 micro-tile of the single-column (N & 1) panel.
   *
   * Runs at most once, when bit 1 of M is set.  Register use in this block:
   *   f16-f19  A operands, two per k-step, read through AO
   *   f20-f23  B operands, one per k-step, read through BO
   *   f0,f1    products of k-steps 0 and 2 of every unrolled group
   *   f2,f3    products of k-steps 1 and 3, folded into f0,f1 at .L88
   *   f30      alpha
   * Result layout: f0,f1 go to CO1[0],CO1[1].
   *
   * NOTE(review): the load/FMA macros, SIZE and BASE_SHIFT are defined
   * outside this excerpt; LFDU is assumed to be the update-form load that
   * also advances its base register -- confirm against the headers.
   */
  1779. .align 4
  1780. .L80:
  1781. andi. I, M, 2
  1782. ble .L90
  1783. #if defined(TRMMKERNEL)
  1784. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK skip: preload from the current AO and from the head of B. */
  1785. LFD f16, 0 * SIZE(AO)
  1786. LFD f17, 1 * SIZE(AO)
  1787. LFD f18, 2 * SIZE(AO)
  1788. LFD f19, 3 * SIZE(AO)
  1789. LFD f20, 0 * SIZE(B)
  1790. LFD f21, 1 * SIZE(B)
  1791. LFD f22, 2 * SIZE(B)
  1792. LFD f23, 3 * SIZE(B)
  1793. mr BO, B
  1794. #else
  /* Skip KK k-steps: two elements of A per k-step (shift 1 + BASE_SHIFT) */
  /* and one element of B per k-step (shift 0 + BASE_SHIFT).              */
  1795. slwi r0, KK, 1 + BASE_SHIFT
  1796. slwi TEMP, KK, 0 + BASE_SHIFT
  1797. add AO, AO, r0
  1798. add BO, B, TEMP
  1799. LFD f16, 0 * SIZE(AO)
  1800. LFD f17, 1 * SIZE(AO)
  1801. LFD f18, 2 * SIZE(AO)
  1802. LFD f19, 3 * SIZE(AO)
  1803. LFD f20, 0 * SIZE(BO)
  1804. LFD f21, 1 * SIZE(BO)
  1805. LFD f22, 2 * SIZE(BO)
  1806. LFD f23, 3 * SIZE(BO)
  1807. #endif
  /* TEMP = number of k-steps to process: K - KK, KK + 2 or KK + 1. */
  1808. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1809. sub TEMP, K, KK
  1810. #elif defined(LEFT)
  1811. addi TEMP, KK, 2
  1812. #else
  1813. addi TEMP, KK, 1
  1814. #endif
  /* CTR = TEMP / 4: the main loop below handles four k-steps per pass. */
  1815. srawi. TEMP, TEMP, 2
  1816. mtspr CTR, TEMP
  1817. #else
  /* Plain GEMM: preload the first operands and run over all K k-steps. */
  1818. LFD f16, 0 * SIZE(AO)
  1819. LFD f17, 1 * SIZE(AO)
  1820. LFD f18, 2 * SIZE(AO)
  1821. LFD f19, 3 * SIZE(AO)
  1822. LFD f20, 0 * SIZE(B)
  1823. LFD f21, 1 * SIZE(B)
  1824. LFD f22, 2 * SIZE(B)
  1825. LFD f23, 3 * SIZE(B)
  1826. srawi. r0, K, 2
  1827. mtspr CTR, r0
  1828. mr BO, B
  1829. #endif
  /* CR0 still reflects the srawi. above: skip the loop when the count is <= 0. */
  1830. ble .L85
  1831. .align 5
  /* Main loop: four k-steps per pass, reading 8 elements of A and 4 of B. */
  1832. .L82:
  1833. FMADD f0, f16, f20, f0
  1834. LFD f16, 4 * SIZE(AO)
  1835. FMADD f1, f17, f20, f1
  1836. LFDU f20, 4 * SIZE(BO)
  1837. LFD f17, 5 * SIZE(AO)
  1838. FMADD f2, f18, f21, f2
  1839. LFD f18, 6 * SIZE(AO)
  1840. FMADD f3, f19, f21, f3
  1841. LFD f21, 1 * SIZE(BO)
  1842. LFD f19, 7 * SIZE(AO)
  1843. FMADD f0, f16, f22, f0
  1844. LFDU f16, 8 * SIZE(AO)
  1845. FMADD f1, f17, f22, f1
  1846. LFD f22, 2 * SIZE(BO)
  1847. LFD f17, 1 * SIZE(AO)
  1848. FMADD f2, f18, f23, f2
  1849. LFD f18, 2 * SIZE(AO)
  1850. FMADD f3, f19, f23, f3
  1851. LFD f23, 3 * SIZE(BO)
  1852. LFD f19, 3 * SIZE(AO)
  1853. bdnz .L82
  1854. .align 4
  /* .L85: load alpha and set CTR to the k-step remainder (count & 3). */
  1855. .L85:
  1856. lfd f30, ALPHA
  1857. #if defined(TRMMKERNEL)
  1858. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1859. sub TEMP, K, KK
  1860. #elif defined(LEFT)
  1861. addi TEMP, KK, 2
  1862. #else
  1863. addi TEMP, KK, 1
  1864. #endif
  1865. andi. TEMP, TEMP, 3
  1866. mtspr CTR, TEMP
  1867. #else
  1868. andi. r0, K, 3
  1869. mtspr CTR, r0
  1870. #endif
  /* Remainder of zero: go straight to the write-back. */
  1871. ble+ .L88
  1872. .align 4
  /* Remainder loop: one k-step per pass, accumulating into f0 and f1. */
  1873. .L86:
  1874. FMADD f0, f16, f20, f0
  1875. LFDU f16, 2 * SIZE(AO)
  1876. FMADD f1, f17, f20, f1
  1877. LFDU f20, 1 * SIZE(BO)
  1878. LFD f17, 1 * SIZE(AO)
  1879. bdnz .L86
  1880. .align 4
  /* .L88: fold f2,f3 into f0,f1, scale by alpha and store two elements  */
  /* of the C column.  Plain GEMM adds to the existing C values; the     */
  /* TRMM build overwrites them.                                         */
  1881. .L88:
  1882. #ifndef TRMMKERNEL
  1883. LFD f16, 0 * SIZE(CO1)
  1884. LFD f17, 1 * SIZE(CO1)
  1885. FADD f0, f2, f0
  1886. FADD f1, f3, f1
  1887. FMADD f0, f0, f30, f16
  1888. FMADD f1, f1, f30, f17
  1889. #else
  1890. FADD f0, f2, f0
  1891. FADD f1, f3, f1
  1892. FMUL f0, f0, f30
  1893. FMUL f1, f1, f30
  1894. #endif
  1895. STFD f0, 0 * SIZE(CO1)
  1896. STFD f1, 1 * SIZE(CO1)
  /* Clear f0-f3 for the next tile and step CO1 by two elements. */
  1897. lfs f0, FZERO
  1898. fmr f1, f0
  1899. fmr f2, f0
  1900. fmr f3, f0
  1901. addi CO1, CO1, 2 * SIZE
  1902. #ifdef TRMMKERNEL
  1903. #if ( defined(LEFT) && defined(TRANSA)) || \
  1904. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = k-steps not processed above (K - KK - 2 or K - KK - 1); move */
  /* AO past them at two elements per step and BO at one per step.       */
  1905. sub TEMP, K, KK
  1906. #ifdef LEFT
  1907. addi TEMP, TEMP, -2
  1908. #else
  1909. addi TEMP, TEMP, -1
  1910. #endif
  1911. slwi r0 , TEMP, 1 + BASE_SHIFT
  1912. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1913. add AO, AO, r0
  1914. add BO, BO, TEMP
  1915. #endif
  1916. #ifdef LEFT
  /* Left-side TRMM: KK advances by the two rows just finished. */
  1917. addi KK, KK, 2
  1918. #endif
  1919. #endif
  /*
   * .L90 -- 1x1 micro-tile of the single-column (N & 1) panel: one dot
   * product over the k dimension.
   *
   * Runs at most once, when bit 0 of M is set; otherwise control goes to
   * the epilogue at .L999.  Register use in this block:
   *   f16-f19  A operands, one per k-step, read through AO
   *   f20-f23  B operands, one per k-step, read through BO
   *   f0-f3    four partial sums, added together at .L98
   *   f30      alpha
   * Unlike the wider tiles, the main loop here is unrolled by eight
   * k-steps (count >> 3, remainder count & 7).
   *
   * NOTE(review): the load/FMA macros, SIZE and BASE_SHIFT are defined
   * outside this excerpt; LFDU is assumed to be the update-form load that
   * also advances its base register -- confirm against the headers.
   */
  1920. .align 4
  1921. .L90:
  1922. andi. I, M, 1
  1923. ble .L999
  1924. #if defined(TRMMKERNEL)
  1925. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* No KK skip: preload from the current AO and from the head of B. */
  1926. LFD f16, 0 * SIZE(AO)
  1927. LFD f17, 1 * SIZE(AO)
  1928. LFD f18, 2 * SIZE(AO)
  1929. LFD f19, 3 * SIZE(AO)
  1930. LFD f20, 0 * SIZE(B)
  1931. LFD f21, 1 * SIZE(B)
  1932. LFD f22, 2 * SIZE(B)
  1933. LFD f23, 3 * SIZE(B)
  1934. mr BO, B
  1935. #else
  /* Skip KK k-steps: one element of A and one of B per k-step. */
  1936. slwi r0, KK, 0 + BASE_SHIFT
  1937. slwi TEMP, KK, 0 + BASE_SHIFT
  1938. add AO, AO, r0
  1939. add BO, B, TEMP
  1940. LFD f16, 0 * SIZE(AO)
  1941. LFD f17, 1 * SIZE(AO)
  1942. LFD f18, 2 * SIZE(AO)
  1943. LFD f19, 3 * SIZE(AO)
  1944. LFD f20, 0 * SIZE(BO)
  1945. LFD f21, 1 * SIZE(BO)
  1946. LFD f22, 2 * SIZE(BO)
  1947. LFD f23, 3 * SIZE(BO)
  1948. #endif
  /* TEMP = number of k-steps to process: K - KK or KK + 1. */
  1949. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1950. sub TEMP, K, KK
  1951. #elif defined(LEFT)
  1952. addi TEMP, KK, 1
  1953. #else
  1954. addi TEMP, KK, 1
  1955. #endif
  /* CTR = TEMP / 8: the main loop below handles eight k-steps per pass. */
  1956. srawi. TEMP, TEMP, 3
  1957. mtspr CTR, TEMP
  1958. #else
  /* Plain GEMM: preload the first operands and run over all K k-steps. */
  1959. LFD f16, 0 * SIZE(AO)
  1960. LFD f17, 1 * SIZE(AO)
  1961. LFD f18, 2 * SIZE(AO)
  1962. LFD f19, 3 * SIZE(AO)
  1963. LFD f20, 0 * SIZE(B)
  1964. LFD f21, 1 * SIZE(B)
  1965. LFD f22, 2 * SIZE(B)
  1966. LFD f23, 3 * SIZE(B)
  1967. srawi. r0, K, 3
  1968. mtspr CTR, r0
  1969. mr BO, B
  1970. #endif
  /* CR0 still reflects the srawi. above: skip the loop when the count is <= 0. */
  1971. ble .L95
  1972. .align 5
  /* Main loop: eight k-steps per pass, two per partial sum f0-f3,       */
  /* reading 8 elements of A and 8 of B.                                 */
  1973. .L92:
  1974. FMADD f0, f16, f20, f0
  1975. LFD f16, 4 * SIZE(AO)
  1976. LFD f20, 4 * SIZE(BO)
  1977. FMADD f1, f17, f21, f1
  1978. LFD f17, 5 * SIZE(AO)
  1979. LFD f21, 5 * SIZE(BO)
  1980. FMADD f2, f18, f22, f2
  1981. LFD f18, 6 * SIZE(AO)
  1982. LFD f22, 6 * SIZE(BO)
  1983. FMADD f3, f19, f23, f3
  1984. LFD f19, 7 * SIZE(AO)
  1985. LFD f23, 7 * SIZE(BO)
  1986. FMADD f0, f16, f20, f0
  1987. LFDU f16, 8 * SIZE(AO)
  1988. LFDU f20, 8 * SIZE(BO)
  1989. FMADD f1, f17, f21, f1
  1990. LFD f17, 1 * SIZE(AO)
  1991. LFD f21, 1 * SIZE(BO)
  1992. FMADD f2, f18, f22, f2
  1993. LFD f18, 2 * SIZE(AO)
  1994. LFD f22, 2 * SIZE(BO)
  1995. FMADD f3, f19, f23, f3
  1996. LFD f19, 3 * SIZE(AO)
  1997. LFD f23, 3 * SIZE(BO)
  1998. bdnz .L92
  1999. .align 4
  /* .L95: load alpha and set CTR to the k-step remainder (count & 7). */
  2000. .L95:
  2001. lfd f30, ALPHA
  2002. #if defined(TRMMKERNEL)
  2003. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2004. sub TEMP, K, KK
  2005. #elif defined(LEFT)
  2006. addi TEMP, KK, 1
  2007. #else
  2008. addi TEMP, KK, 1
  2009. #endif
  2010. andi. TEMP, TEMP, 7
  2011. mtspr CTR, TEMP
  2012. #else
  2013. andi. r0, K, 7
  2014. mtspr CTR, r0
  2015. #endif
  /* Remainder of zero: go straight to the write-back. */
  2016. ble+ .L98
  2017. .align 4
  /* Remainder loop: one k-step per pass, accumulating into f0. */
  2018. .L96:
  2019. FMADD f0, f16, f20, f0
  2020. LFDU f16, 1 * SIZE(AO)
  2021. LFDU f20, 1 * SIZE(BO)
  2022. bdnz .L96
  2023. .align 4
  /* .L98: reduce the four partial sums as (f0 + f1) + (f2 + f3), scale  */
  /* by alpha and store the single C element.  Plain GEMM adds to the    */
  /* existing C value; the TRMM build overwrites it.  Control then falls */
  /* through into the epilogue, so nothing is reset here.                */
  2024. .L98:
  2025. #ifndef TRMMKERNEL
  2026. LFD f16, 0 * SIZE(CO1)
  2027. FADD f0, f1, f0
  2028. FADD f2, f3, f2
  2029. FADD f0, f2, f0
  2030. FMADD f0, f0, f30, f16
  2031. #else
  2032. FADD f0, f1, f0
  2033. FADD f2, f3, f2
  2034. FADD f0, f2, f0
  2035. FMUL f0, f0, f30
  2036. #endif
  2037. STFD f0, 0 * SIZE(CO1)
  /*
   * .L999 -- common exit path.
   * Sets the return value in r3 to 0, reloads f14-f31 and the saved
   * general-purpose registers from the stack frame, releases the frame
   * (STACKSIZE bytes) and returns.
   *
   * NOTE(review): the slot offsets below must match the stores done by the
   * prologue, which lies outside this excerpt -- confirm when editing.
   */
  2038. .align 4
  2039. .L999:
  /* addi with a literal 0 in the base-register field loads the immediate: r3 = 0. */
  2040. addi r3, 0, 0
  /* Floating-point registers f14-f31: eighteen 8-byte slots at SP+0 .. SP+136. */
  2041. lfd f14, 0(SP)
  2042. lfd f15, 8(SP)
  2043. lfd f16, 16(SP)
  2044. lfd f17, 24(SP)
  2045. lfd f18, 32(SP)
  2046. lfd f19, 40(SP)
  2047. lfd f20, 48(SP)
  2048. lfd f21, 56(SP)
  2049. lfd f22, 64(SP)
  2050. lfd f23, 72(SP)
  2051. lfd f24, 80(SP)
  2052. lfd f25, 88(SP)
  2053. lfd f26, 96(SP)
  2054. lfd f27, 104(SP)
  2055. lfd f28, 112(SP)
  2056. lfd f29, 120(SP)
  2057. lfd f30, 128(SP)
  2058. lfd f31, 136(SP)
  2059. #ifdef __64BIT__
  /* 64-bit build: 8-byte GPR slots starting at SP+144. */
  2060. ld r31, 144(SP)
  2061. ld r30, 152(SP)
  2062. ld r29, 160(SP)
  2063. ld r28, 168(SP)
  2064. ld r27, 176(SP)
  2065. ld r26, 184(SP)
  2066. ld r25, 192(SP)
  2067. ld r24, 200(SP)
  2068. ld r23, 208(SP)
  2069. ld r22, 216(SP)
  2070. ld r21, 224(SP)
  2071. ld r20, 232(SP)
  2072. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  /* r19 and r18 are reloaded only in the TRMM/TRSM builds. */
  2073. ld r19, 240(SP)
  2074. ld r18, 248(SP)
  2075. #endif
  2076. #else
  /* 32-bit build: 4-byte GPR slots starting at SP+144. */
  2077. lwz r31, 144(SP)
  2078. lwz r30, 148(SP)
  2079. lwz r29, 152(SP)
  2080. lwz r28, 156(SP)
  2081. lwz r27, 160(SP)
  2082. lwz r26, 164(SP)
  2083. lwz r25, 168(SP)
  2084. lwz r24, 172(SP)
  2085. lwz r23, 176(SP)
  2086. lwz r22, 180(SP)
  2087. lwz r21, 184(SP)
  2088. lwz r20, 188(SP)
  2089. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  /* r19 and r18 are reloaded only in the TRMM/TRSM builds. */
  2090. lwz r19, 192(SP)
  2091. lwz r18, 196(SP)
  2092. #endif
  2093. #endif
  /* Release the stack frame and return to the caller. */
  2094. addi SP, SP, STACKSIZE
  2095. blr
  2096. EPILOGUE