You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_kernel.S 48 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined before common.h is included; presumably selects its assembler-side definitions -- confirm in common.h */
  39. #include "common.h" /* expected to supply the macros used below but not defined in this file (PROLOGUE, PROFCODE, LL, SIZE, BASE_SHIFT, LFD, STFD, FMADD, FMUL, DCBT, DCBTST, FRAMESLOT, SP) */
  40. #ifndef __64BIT__ /* LOAD: load mnemonic chosen by word size */
  41. #define LOAD lwz /* 32-bit build */
  42. #else
  43. #define LOAD ld /* 64-bit build */
  44. #endif
  45. #ifdef __64BIT__ /* stack frame layout, 64-bit build */
  46. #define STACKSIZE 320 /* bytes subtracted from SP by the prologue */
  47. #define ALPHA 296(SP) /* slot where the prologue stores f1; reloaded into f30 as the scale factor before each write-back */
  48. #define FZERO 304(SP) /* slot the prologue fills with a zero word; read with lfs to clear the accumulators */
  49. #else /* stack frame layout, 32-bit build */
  50. #define STACKSIZE 240 /* bytes subtracted from SP by the prologue */
  51. #define ALPHA 224(SP) /* same role as in the 64-bit layout */
  52. #define FZERO 232(SP) /* same role as in the 64-bit layout */
  53. #endif
  54. #define M r3 /* row count; tiled as M >> 2 four-row tiles, then M & 2 and M & 1 tails */
  55. #define N r4 /* column count; tiled as N >> 2 four-column panels, then an N & 2 tail */
  56. #define K r5 /* inner (reduction) dimension; the kernel returns early if M, N or K is <= 0 */
  57. #if defined(linux) || defined(__FreeBSD__) /* incoming argument registers on Linux / FreeBSD (presumably dictated by the platform ABI) */
  58. #ifndef __64BIT__
  59. #define A r6 /* base of the A panel; copied into AO at the start of every column panel */
  60. #define B r7 /* base of the current B panel; advanced with "mr B, BO" after each column panel */
  61. #define C r8 /* base of the current C column panel */
  62. #define LDC r9 /* C column stride; scaled to bytes by the prologue (slwi by BASE_SHIFT) */
  63. #define OFFSET r10 /* TRMM offset; only read when TRMMKERNEL is defined */
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6 /* under TRMMKERNEL the prologue loads this from the caller's frame (FRAMESLOT(0)) */
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__) /* incoming argument registers on AIX / Darwin */
  73. #if !defined(__64BIT__) && defined(DOUBLE) /* 32-bit double-precision build */
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7 /* loaded by the prologue from the caller's frame (FRAMESLOT(0)) in this configuration */
  78. #define OFFSET r6 /* under TRMMKERNEL loaded by the prologue from FRAMESLOT(1) */
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6 /* under TRMMKERNEL loaded by the prologue from FRAMESLOT(0) */
  85. #endif
  86. #endif
  87. #define TEMP r18 /* scratch for TRMM loop counts and pointer adjustments; saved by the prologue only under TRMMKERNEL */
  88. #define KK r19 /* running TRMM offset (initialised from OFFSET or -OFFSET); saved by the prologue only under TRMMKERNEL */
  89. #define BB r20 /* byte offset used with "dcbt B, BB" to prefetch ahead in B */
  90. #define I r21 /* row-tile loop counter */
  91. #define J r22 /* column-panel loop counter */
  92. #define AO r23 /* running read pointer into A */
  93. #define BO r24 /* running read pointer into B */
  94. #define CO1 r25 /* write pointer for the first C column of the current panel */
  95. #define CO2 r26 /* CO1 + LDC */
  96. #define CO3 r27 /* CO2 + LDC */
  97. #define CO4 r28 /* CO3 + LDC */
  98. #define PREA r29 /* prefetch distance passed to DCBT together with AO */
  99. #define PREB r30 /* prefetch distance passed to DCBT together with BO */
  100. #define PREC r31 /* prefetch distance passed to DCBTST together with the CO pointers */
  101. #ifndef NEEDPARAM
  102. PROLOGUE
  103. PROFCODE
  104. addi SP, SP, -STACKSIZE
  105. li r0, 0
  106. stfd f14, 0(SP)
  107. stfd f15, 8(SP)
  108. stfd f16, 16(SP)
  109. stfd f17, 24(SP)
  110. stfd f18, 32(SP)
  111. stfd f19, 40(SP)
  112. stfd f20, 48(SP)
  113. stfd f21, 56(SP)
  114. stfd f22, 64(SP)
  115. stfd f23, 72(SP)
  116. stfd f24, 80(SP)
  117. stfd f25, 88(SP)
  118. stfd f26, 96(SP)
  119. stfd f27, 104(SP)
  120. stfd f28, 112(SP)
  121. stfd f29, 120(SP)
  122. stfd f30, 128(SP)
  123. stfd f31, 136(SP)
  124. #ifdef __64BIT__
  125. std r31, 144(SP)
  126. std r30, 152(SP)
  127. std r29, 160(SP)
  128. std r28, 168(SP)
  129. std r27, 176(SP)
  130. std r26, 184(SP)
  131. std r25, 192(SP)
  132. std r24, 200(SP)
  133. std r23, 208(SP)
  134. std r22, 216(SP)
  135. std r21, 224(SP)
  136. std r20, 232(SP)
  137. #if defined(TRMMKERNEL)
  138. std r19, 240(SP)
  139. std r18, 248(SP)
  140. #endif
  141. #else
  142. stw r31, 144(SP)
  143. stw r30, 148(SP)
  144. stw r29, 152(SP)
  145. stw r28, 156(SP)
  146. stw r27, 160(SP)
  147. stw r26, 164(SP)
  148. stw r25, 168(SP)
  149. stw r24, 172(SP)
  150. stw r23, 176(SP)
  151. stw r22, 180(SP)
  152. stw r21, 184(SP)
  153. stw r20, 188(SP)
  154. #if defined(TRMMKERNEL)
  155. stw r19, 192(SP)
  156. stw r18, 196(SP)
  157. #endif
  158. #endif
  159. stfd f1, ALPHA
  160. stw r0, FZERO
  161. #if defined(_AIX) || defined(__APPLE__)
  162. #if !defined(__64BIT__) && defined(DOUBLE)
  163. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  164. #endif
  165. #endif
  166. slwi LDC, LDC, BASE_SHIFT
  167. #if defined(TRMMKERNEL)
  168. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  169. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  170. #endif
  171. #if defined(_AIX) || defined(__APPLE__)
  172. #ifdef __64BIT__
  173. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  174. #else
  175. #ifdef DOUBLE
  176. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  177. #else
  178. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  179. #endif
  180. #endif
  181. #endif
  182. #endif
  183. #if defined(TRMMKERNEL) && !defined(LEFT)
  184. neg KK, OFFSET
  185. #endif
  186. cmpwi cr0, M, 0
  187. ble LL(999)
  188. cmpwi cr0, N, 0
  189. ble LL(999)
  190. cmpwi cr0, K, 0
  191. ble LL(999)
  192. #ifndef PREFETCHTEST
  193. /* Normal prefetch */
  194. #ifdef PPC970
  195. li PREC, 4 * SIZE
  196. #endif
  197. #ifdef POWER4
  198. li PREC, 4 * SIZE /* is 12 best? */
  199. #endif
  200. #ifdef POWER5
  201. li PREC, 3 * SIZE
  202. #endif
  203. #else
  204. #if defined(linux) || defined(__FreeBSD__)
  205. #ifndef __64BIT__
  206. mr PREA, r10
  207. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  208. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  209. #else
  210. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  211. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  212. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  213. #endif
  214. #endif
  215. #if defined(_AIX) || defined(__APPLE__)
  216. #ifdef __64BIT__
  217. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  218. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  219. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  220. #else
  221. #ifdef DOUBLE
  222. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  223. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  224. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  225. #else
  226. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  227. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  228. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  229. #endif
  230. #endif
  231. #endif
  232. #endif
  233. #ifndef PREFETCHTEST
  234. #ifdef PPC970
  235. #ifdef ALLOC_HUGETLB
  236. li PREA, (16 * 1 * SIZE)
  237. li PREB, (16 * 5 * SIZE)
  238. #else
  239. li PREA, (16 * 19 * SIZE)
  240. li PREB, (16 * 8 * SIZE)
  241. #endif
  242. #endif
  243. #ifdef POWER4
  244. #ifdef ALLOC_HUGETLB
  245. li PREA, (16 * 1 * SIZE)
  246. li PREB, (16 * 1 * SIZE)
  247. #else
  248. li PREA, (16 * 2 * SIZE)
  249. li PREB, (16 * 2 * SIZE)
  250. #endif
  251. #endif
  252. #ifdef POWER5
  253. #ifdef ALLOC_HUGETLB
  254. li PREA, (16 * 7 * SIZE)
  255. li PREB, (16 * 7 * SIZE)
  256. #else
  257. li PREA, (16 * 12 * SIZE)
  258. li PREB, (16 * 6 * SIZE)
  259. #endif
  260. #endif
  261. #endif
  262. srawi. J, N, 2
  263. ble LL(40)
  264. .align 4
  265. LL(10):
  266. mr CO1, C
  267. add CO2, C, LDC
  268. add CO3, CO2, LDC
  269. add CO4, CO3, LDC
  270. #if defined(TRMMKERNEL) && defined(LEFT)
  271. mr KK, OFFSET
  272. #endif
  273. slwi BB, K, BASE_SHIFT + 2
  274. lfs f0, FZERO
  275. fmr f1, f0
  276. fmr f2, f0
  277. fmr f3, f0
  278. fmr f4, f0
  279. fmr f5, f0
  280. fmr f6, f0
  281. fmr f7, f0
  282. fmr f8, f0
  283. fmr f9, f0
  284. fmr f10, f0
  285. fmr f11, f0
  286. fmr f12, f0
  287. fmr f13, f0
  288. fmr f14, f0
  289. fmr f15, f0
  290. srawi. I, M, 2
  291. mr AO, A
  292. add C, CO4, LDC
  293. ble LL(20)
  294. .align 4
  295. LL(11):
  296. #if defined(TRMMKERNEL)
  297. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  298. LFD f16, 0 * SIZE(AO)
  299. LFD f17, 1 * SIZE(AO)
  300. LFD f18, 2 * SIZE(AO)
  301. LFD f19, 3 * SIZE(AO)
  302. LFD f20, 0 * SIZE(B)
  303. LFD f21, 1 * SIZE(B)
  304. LFD f22, 2 * SIZE(B)
  305. LFD f23, 3 * SIZE(B)
  306. #ifdef POWER5
  307. LFD f28, 4 * SIZE(B)
  308. LFD f29, 5 * SIZE(B)
  309. LFD f30, 6 * SIZE(B)
  310. LFD f31, 7 * SIZE(B)
  311. #endif
  312. mr BO, B
  313. #else
  314. slwi r0, KK, 2 + BASE_SHIFT
  315. add AO, AO, r0
  316. add BO, B, r0
  317. LFD f16, 0 * SIZE(AO)
  318. LFD f17, 1 * SIZE(AO)
  319. LFD f18, 2 * SIZE(AO)
  320. LFD f19, 3 * SIZE(AO)
  321. LFD f20, 0 * SIZE(BO)
  322. LFD f21, 1 * SIZE(BO)
  323. LFD f22, 2 * SIZE(BO)
  324. LFD f23, 3 * SIZE(BO)
  325. #ifdef POWER5
  326. LFD f28, 4 * SIZE(BO)
  327. LFD f29, 5 * SIZE(BO)
  328. LFD f30, 6 * SIZE(BO)
  329. LFD f31, 7 * SIZE(BO)
  330. #endif
  331. #endif
  332. DCBTST(CO1, PREC)
  333. DCBTST(CO2, PREC)
  334. DCBTST(CO3, PREC)
  335. DCBTST(CO4, PREC)
  336. dcbt B, BB
  337. addi BB, BB, 16 * SIZE
  338. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  339. sub TEMP, K, KK
  340. #elif defined(LEFT)
  341. addi TEMP, KK, 4
  342. #else
  343. addi TEMP, KK, 4
  344. #endif
  345. srawi. TEMP, TEMP, 2
  346. mtspr CTR, TEMP
  347. ble LL(15)
  348. #else
  349. LFD f16, 0 * SIZE(AO)
  350. LFD f17, 1 * SIZE(AO)
  351. LFD f18, 2 * SIZE(AO)
  352. LFD f19, 3 * SIZE(AO)
  353. LFD f20, 0 * SIZE(B)
  354. LFD f21, 1 * SIZE(B)
  355. LFD f22, 2 * SIZE(B)
  356. LFD f23, 3 * SIZE(B)
  357. #ifdef POWER5
  358. LFD f28, 4 * SIZE(B)
  359. LFD f29, 5 * SIZE(B)
  360. LFD f30, 6 * SIZE(B)
  361. LFD f31, 7 * SIZE(B)
  362. #endif
  363. DCBTST(CO1, PREC)
  364. DCBTST(CO2, PREC)
  365. DCBTST(CO3, PREC)
  366. DCBTST(CO4, PREC)
  367. dcbt B, BB
  368. addi BB, BB, 16 * SIZE
  369. srawi. r0, K, 2
  370. mtspr CTR, r0
  371. mr BO, B
  372. ble LL(15)
  373. #endif
  374. .align 4
  375. LL(12):
  376. FMADD f0, f16, f20, f0
  377. FMADD f5, f17, f21, f5
  378. FMADD f10, f18, f22, f10
  379. FMADD f15, f19, f23, f15
  380. #if defined(ALLOC_HUGETLB) && !defined(POWER5)
  381. LFD f28, 4 * SIZE(BO)
  382. LFD f29, 5 * SIZE(BO)
  383. LFD f30, 6 * SIZE(BO)
  384. LFD f31, 7 * SIZE(BO)
  385. #endif
  386. FMADD f1, f17, f20, f1
  387. FMADD f2, f18, f20, f2
  388. FMADD f3, f19, f20, f3
  389. FMADD f4, f16, f21, f4
  390. #if !defined(ALLOC_HUGETLB) && !defined(POWER5)
  391. LFD f28, 4 * SIZE(BO)
  392. LFD f29, 5 * SIZE(BO)
  393. LFD f30, 6 * SIZE(BO)
  394. LFD f31, 7 * SIZE(BO)
  395. #endif
  396. LFD f24, 4 * SIZE(AO)
  397. LFD f25, 5 * SIZE(AO)
  398. LFD f26, 6 * SIZE(AO)
  399. LFD f27, 7 * SIZE(AO)
  400. FMADD f6, f18, f21, f6
  401. FMADD f7, f19, f21, f7
  402. FMADD f8, f16, f22, f8
  403. FMADD f9, f17, f22, f9
  404. FMADD f11, f19, f22, f11
  405. FMADD f12, f16, f23, f12
  406. FMADD f13, f17, f23, f13
  407. FMADD f14, f18, f23, f14
  408. LFD f20, 8 * SIZE(BO)
  409. LFD f21, 9 * SIZE(BO)
  410. LFD f22, 10 * SIZE(BO)
  411. LFD f23, 11 * SIZE(BO)
  412. FMADD f0, f24, f28, f0
  413. FMADD f5, f25, f29, f5
  414. FMADD f10, f26, f30, f10
  415. FMADD f15, f27, f31, f15
  416. LFD f16, 8 * SIZE(AO)
  417. LFD f17, 9 * SIZE(AO)
  418. LFD f18, 10 * SIZE(AO)
  419. LFD f19, 11 * SIZE(AO)
  420. FMADD f1, f25, f28, f1
  421. FMADD f2, f26, f28, f2
  422. FMADD f3, f27, f28, f3
  423. FMADD f4, f24, f29, f4
  424. FMADD f6, f26, f29, f6
  425. FMADD f7, f27, f29, f7
  426. FMADD f8, f24, f30, f8
  427. FMADD f9, f25, f30, f9
  428. FMADD f11, f27, f30, f11
  429. FMADD f12, f24, f31, f12
  430. FMADD f13, f25, f31, f13
  431. FMADD f14, f26, f31, f14
  432. LFD f28, 12 * SIZE(BO)
  433. LFD f29, 13 * SIZE(BO)
  434. LFD f30, 14 * SIZE(BO)
  435. LFD f31, 15 * SIZE(BO)
  436. FMADD f0, f16, f20, f0
  437. FMADD f5, f17, f21, f5
  438. FMADD f10, f18, f22, f10
  439. FMADD f15, f19, f23, f15
  440. LFD f24, 12 * SIZE(AO)
  441. LFD f25, 13 * SIZE(AO)
  442. LFD f26, 14 * SIZE(AO)
  443. LFD f27, 15 * SIZE(AO)
  444. FMADD f1, f17, f20, f1
  445. FMADD f2, f18, f20, f2
  446. FMADD f3, f19, f20, f3
  447. FMADD f4, f16, f21, f4
  448. FMADD f6, f18, f21, f6
  449. FMADD f7, f19, f21, f7
  450. FMADD f8, f16, f22, f8
  451. FMADD f9, f17, f22, f9
  452. FMADD f11, f19, f22, f11
  453. FMADD f12, f16, f23, f12
  454. FMADD f13, f17, f23, f13
  455. FMADD f14, f18, f23, f14
  456. #ifndef POWER5
  457. LFD f16, 16 * SIZE(AO)
  458. LFD f17, 17 * SIZE(AO)
  459. LFD f18, 18 * SIZE(AO)
  460. LFD f19, 19 * SIZE(AO)
  461. #else
  462. LFD f20, 16 * SIZE(BO)
  463. LFD f21, 17 * SIZE(BO)
  464. LFD f22, 18 * SIZE(BO)
  465. LFD f23, 19 * SIZE(BO)
  466. #endif
  467. FMADD f0, f24, f28, f0
  468. FMADD f5, f25, f29, f5
  469. FMADD f10, f26, f30, f10
  470. FMADD f15, f27, f31, f15
  471. #ifndef POWER5
  472. LFD f20, 16 * SIZE(BO)
  473. LFD f21, 17 * SIZE(BO)
  474. LFD f22, 18 * SIZE(BO)
  475. LFD f23, 19 * SIZE(BO)
  476. #else
  477. LFD f16, 16 * SIZE(AO)
  478. LFD f17, 17 * SIZE(AO)
  479. LFD f18, 18 * SIZE(AO)
  480. LFD f19, 19 * SIZE(AO)
  481. #endif
  482. FMADD f1, f25, f28, f1
  483. FMADD f2, f26, f28, f2
  484. FMADD f3, f27, f28, f3
  485. FMADD f4, f24, f29, f4
  486. FMADD f6, f26, f29, f6
  487. FMADD f7, f27, f29, f7
  488. FMADD f8, f24, f30, f8
  489. FMADD f9, f25, f30, f9
  490. FMADD f11, f27, f30, f11
  491. FMADD f12, f24, f31, f12
  492. FMADD f13, f25, f31, f13
  493. FMADD f14, f26, f31, f14
  494. #if (L2_SIZE == 1024976) && defined (ALLOC_HUGETLB)
  495. nop
  496. nop
  497. nop
  498. nop
  499. #endif
  500. #ifdef POWER5
  501. LFD f28, 20 * SIZE(BO)
  502. LFD f29, 21 * SIZE(BO)
  503. LFD f30, 22 * SIZE(BO)
  504. LFD f31, 23 * SIZE(BO)
  505. #endif
  506. addi AO, AO, 16 * SIZE
  507. addi BO, BO, 16 * SIZE
  508. #ifdef PPC970
  509. #ifndef ALLOC_HUGETLB
  510. DCBT(AO, PREA)
  511. #endif
  512. DCBT(BO, PREB)
  513. #endif
  514. #ifdef POWER4
  515. #ifndef ALLOC_HUGETLB
  516. DCBT(AO, PREA)
  517. #endif
  518. DCBT(BO, PREB)
  519. #endif
  520. #ifdef POWER5
  521. #ifndef ALLOC_HUGETLB
  522. DCBT(BO, PREB)
  523. DCBT(AO, PREA)
  524. #endif
  525. #endif
  526. bdnz LL(12)
  527. .align 4
  528. LL(15):
  529. lfd f30, ALPHA
  530. #if defined(TRMMKERNEL)
  531. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  532. sub TEMP, K, KK
  533. #elif defined(LEFT)
  534. addi TEMP, KK, 4
  535. #else
  536. addi TEMP, KK, 4
  537. #endif
  538. andi. TEMP, TEMP, 3
  539. mtspr CTR, TEMP
  540. #else
  541. andi. r0, K, 3
  542. mtspr CTR, r0
  543. #endif
  544. ble+ LL(18)
  545. .align 4
  546. LL(16):
  547. FMADD f0, f16, f20, f0
  548. FMADD f5, f17, f21, f5
  549. FMADD f10, f18, f22, f10
  550. FMADD f15, f19, f23, f15
  551. FMADD f1, f17, f20, f1
  552. FMADD f2, f18, f20, f2
  553. FMADD f3, f19, f20, f3
  554. FMADD f4, f16, f21, f4
  555. FMADD f6, f18, f21, f6
  556. FMADD f7, f19, f21, f7
  557. FMADD f8, f16, f22, f8
  558. FMADD f9, f17, f22, f9
  559. FMADD f11, f19, f22, f11
  560. FMADD f12, f16, f23, f12
  561. FMADD f13, f17, f23, f13
  562. FMADD f14, f18, f23, f14
  563. LFD f16, 4 * SIZE(AO)
  564. LFD f17, 5 * SIZE(AO)
  565. LFD f18, 6 * SIZE(AO)
  566. LFD f19, 7 * SIZE(AO)
  567. LFD f20, 4 * SIZE(BO)
  568. LFD f21, 5 * SIZE(BO)
  569. LFD f22, 6 * SIZE(BO)
  570. LFD f23, 7 * SIZE(BO)
  571. addi BO, BO, 4 * SIZE
  572. addi AO, AO, 4 * SIZE
  573. bdnz LL(16)
  574. .align 4
  575. LL(18):
  576. #ifndef TRMMKERNEL
  577. LFD f16, 0 * SIZE(CO1)
  578. LFD f17, 1 * SIZE(CO1)
  579. LFD f18, 2 * SIZE(CO1)
  580. LFD f19, 3 * SIZE(CO1)
  581. LFD f20, 0 * SIZE(CO2)
  582. LFD f21, 1 * SIZE(CO2)
  583. LFD f22, 2 * SIZE(CO2)
  584. LFD f23, 3 * SIZE(CO2)
  585. FMADD f0, f0, f30, f16
  586. FMADD f1, f1, f30, f17
  587. FMADD f2, f2, f30, f18
  588. FMADD f3, f3, f30, f19
  589. FMADD f4, f4, f30, f20
  590. FMADD f5, f5, f30, f21
  591. FMADD f6, f6, f30, f22
  592. FMADD f7, f7, f30, f23
  593. LFD f16, 0 * SIZE(CO3)
  594. LFD f17, 1 * SIZE(CO3)
  595. LFD f18, 2 * SIZE(CO3)
  596. LFD f19, 3 * SIZE(CO3)
  597. LFD f20, 0 * SIZE(CO4)
  598. LFD f21, 1 * SIZE(CO4)
  599. LFD f22, 2 * SIZE(CO4)
  600. LFD f23, 3 * SIZE(CO4)
  601. FMADD f8, f8, f30, f16
  602. FMADD f9, f9, f30, f17
  603. FMADD f10, f10, f30, f18
  604. FMADD f11, f11, f30, f19
  605. FMADD f12, f12, f30, f20
  606. FMADD f13, f13, f30, f21
  607. FMADD f14, f14, f30, f22
  608. FMADD f15, f15, f30, f23
  609. #else
  610. FMUL f0, f0, f30
  611. FMUL f1, f1, f30
  612. FMUL f2, f2, f30
  613. FMUL f3, f3, f30
  614. FMUL f4, f4, f30
  615. FMUL f5, f5, f30
  616. FMUL f6, f6, f30
  617. FMUL f7, f7, f30
  618. FMUL f8, f8, f30
  619. FMUL f9, f9, f30
  620. FMUL f10, f10, f30
  621. FMUL f11, f11, f30
  622. FMUL f12, f12, f30
  623. FMUL f13, f13, f30
  624. FMUL f14, f14, f30
  625. FMUL f15, f15, f30
  626. #endif
  627. STFD f0, 0 * SIZE(CO1)
  628. STFD f1, 1 * SIZE(CO1)
  629. STFD f2, 2 * SIZE(CO1)
  630. STFD f3, 3 * SIZE(CO1)
  631. lfs f0, FZERO
  632. fmr f1, f0
  633. fmr f2, f0
  634. fmr f3, f0
  635. STFD f4, 0 * SIZE(CO2)
  636. STFD f5, 1 * SIZE(CO2)
  637. STFD f6, 2 * SIZE(CO2)
  638. STFD f7, 3 * SIZE(CO2)
  639. fmr f4, f0
  640. fmr f5, f0
  641. fmr f6, f0
  642. fmr f7, f0
  643. STFD f8, 0 * SIZE(CO3)
  644. STFD f9, 1 * SIZE(CO3)
  645. STFD f10, 2 * SIZE(CO3)
  646. STFD f11, 3 * SIZE(CO3)
  647. fmr f8, f0
  648. fmr f9, f0
  649. fmr f10, f0
  650. fmr f11, f0
  651. STFD f12, 0 * SIZE(CO4)
  652. STFD f13, 1 * SIZE(CO4)
  653. STFD f14, 2 * SIZE(CO4)
  654. STFD f15, 3 * SIZE(CO4)
  655. fmr f12, f0
  656. fmr f13, f0
  657. fmr f14, f0
  658. fmr f15, f0
  659. addi CO1, CO1, 4 * SIZE
  660. addi CO2, CO2, 4 * SIZE
  661. addi CO3, CO3, 4 * SIZE
  662. addi CO4, CO4, 4 * SIZE
  663. #ifdef TRMMKERNEL
  664. #if ( defined(LEFT) && defined(TRANSA)) || \
  665. (!defined(LEFT) && !defined(TRANSA))
  666. sub TEMP, K, KK
  667. #ifdef LEFT
  668. addi TEMP, TEMP, -4
  669. #else
  670. addi TEMP, TEMP, -4
  671. #endif
  672. slwi TEMP, TEMP, 2 + BASE_SHIFT
  673. add AO, AO, TEMP
  674. add BO, BO, TEMP
  675. #endif
  676. #ifdef LEFT
  677. addi KK, KK, 4
  678. #endif
  679. #endif
  680. addic. I, I, -1
  681. bgt+ LL(11)
  682. .align 4
  683. LL(20):
  684. andi. I, M, 2
  685. ble LL(30)
  686. #if defined(TRMMKERNEL)
  687. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  688. LFD f16, 0 * SIZE(AO)
  689. LFD f17, 1 * SIZE(AO)
  690. LFD f18, 2 * SIZE(AO)
  691. LFD f19, 3 * SIZE(AO)
  692. LFD f20, 0 * SIZE(B)
  693. LFD f21, 1 * SIZE(B)
  694. LFD f22, 2 * SIZE(B)
  695. LFD f23, 3 * SIZE(B)
  696. LFD f24, 4 * SIZE(B)
  697. LFD f25, 5 * SIZE(B)
  698. LFD f26, 6 * SIZE(B)
  699. LFD f27, 7 * SIZE(B)
  700. mr BO, B
  701. #else
  702. slwi r0, KK, 1 + BASE_SHIFT
  703. slwi TEMP, KK, 2 + BASE_SHIFT
  704. add AO, AO, r0
  705. add BO, B, TEMP
  706. LFD f16, 0 * SIZE(AO)
  707. LFD f17, 1 * SIZE(AO)
  708. LFD f18, 2 * SIZE(AO)
  709. LFD f19, 3 * SIZE(AO)
  710. LFD f20, 0 * SIZE(BO)
  711. LFD f21, 1 * SIZE(BO)
  712. LFD f22, 2 * SIZE(BO)
  713. LFD f23, 3 * SIZE(BO)
  714. LFD f24, 4 * SIZE(BO)
  715. LFD f25, 5 * SIZE(BO)
  716. LFD f26, 6 * SIZE(BO)
  717. LFD f27, 7 * SIZE(BO)
  718. #endif
  719. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  720. sub TEMP, K, KK
  721. #elif defined(LEFT)
  722. addi TEMP, KK, 2
  723. #else
  724. addi TEMP, KK, 4
  725. #endif
  726. srawi. TEMP, TEMP, 2
  727. mtspr CTR, TEMP
  728. #else
  729. LFD f16, 0 * SIZE(AO)
  730. LFD f17, 1 * SIZE(AO)
  731. LFD f18, 2 * SIZE(AO)
  732. LFD f19, 3 * SIZE(AO)
  733. LFD f20, 0 * SIZE(B)
  734. LFD f21, 1 * SIZE(B)
  735. LFD f22, 2 * SIZE(B)
  736. LFD f23, 3 * SIZE(B)
  737. LFD f24, 4 * SIZE(B)
  738. LFD f25, 5 * SIZE(B)
  739. LFD f26, 6 * SIZE(B)
  740. LFD f27, 7 * SIZE(B)
  741. srawi. r0, K, 2
  742. mtspr CTR, r0
  743. mr BO, B
  744. #endif
  745. ble LL(25)
  746. .align 5
  747. LL(22):
  748. FMADD f0, f16, f20, f0
  749. FMADD f1, f17, f20, f1
  750. FMADD f4, f16, f21, f4
  751. FMADD f5, f17, f21, f5
  752. FMADD f8, f16, f22, f8
  753. FMADD f9, f17, f22, f9
  754. FMADD f12, f16, f23, f12
  755. FMADD f13, f17, f23, f13
  756. LFD f20, 8 * SIZE(BO)
  757. LFD f21, 9 * SIZE(BO)
  758. LFD f22, 10 * SIZE(BO)
  759. LFD f23, 11 * SIZE(BO)
  760. FMADD f2, f18, f24, f2
  761. FMADD f3, f19, f24, f3
  762. FMADD f6, f18, f25, f6
  763. FMADD f7, f19, f25, f7
  764. FMADD f10, f18, f26, f10
  765. FMADD f11, f19, f26, f11
  766. FMADD f14, f18, f27, f14
  767. FMADD f15, f19, f27, f15
  768. LFD f16, 4 * SIZE(AO)
  769. LFD f17, 5 * SIZE(AO)
  770. LFD f18, 6 * SIZE(AO)
  771. LFD f19, 7 * SIZE(AO)
  772. FMADD f0, f16, f20, f0
  773. FMADD f1, f17, f20, f1
  774. FMADD f4, f16, f21, f4
  775. FMADD f5, f17, f21, f5
  776. LFD f24, 12 * SIZE(BO)
  777. LFD f25, 13 * SIZE(BO)
  778. LFD f26, 14 * SIZE(BO)
  779. LFD f27, 15 * SIZE(BO)
  780. FMADD f8, f16, f22, f8
  781. FMADD f9, f17, f22, f9
  782. FMADD f12, f16, f23, f12
  783. FMADD f13, f17, f23, f13
  784. LFD f20, 16 * SIZE(BO)
  785. LFD f21, 17 * SIZE(BO)
  786. LFD f22, 18 * SIZE(BO)
  787. LFD f23, 19 * SIZE(BO)
  788. FMADD f2, f18, f24, f2
  789. FMADD f3, f19, f24, f3
  790. FMADD f6, f18, f25, f6
  791. FMADD f7, f19, f25, f7
  792. FMADD f10, f18, f26, f10
  793. FMADD f11, f19, f26, f11
  794. FMADD f14, f18, f27, f14
  795. FMADD f15, f19, f27, f15
  796. LFD f16, 8 * SIZE(AO)
  797. LFD f17, 9 * SIZE(AO)
  798. LFD f18, 10 * SIZE(AO)
  799. LFD f19, 11 * SIZE(AO)
  800. LFD f24, 20 * SIZE(BO)
  801. LFD f25, 21 * SIZE(BO)
  802. LFD f26, 22 * SIZE(BO)
  803. LFD f27, 23 * SIZE(BO)
  804. addi AO, AO, 8 * SIZE
  805. addi BO, BO, 16 * SIZE
  806. DCBT(BO, PREB)
  807. bdnz LL(22)
  808. fadd f0, f2, f0
  809. fadd f1, f3, f1
  810. fadd f4, f6, f4
  811. fadd f5, f7, f5
  812. fadd f8, f10, f8
  813. fadd f9, f11, f9
  814. fadd f12, f14, f12
  815. fadd f13, f15, f13
  816. .align 4
  817. LL(25):
  818. lfd f30, ALPHA
  819. #if defined(TRMMKERNEL)
  820. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  821. sub TEMP, K, KK
  822. #elif defined(LEFT)
  823. addi TEMP, KK, 2
  824. #else
  825. addi TEMP, KK, 4
  826. #endif
  827. andi. TEMP, TEMP, 3
  828. mtspr CTR, TEMP
  829. #else
  830. andi. r0, K, 3
  831. mtspr CTR, r0
  832. #endif
  833. ble+ LL(28)
  834. .align 4
  835. LL(26):
  836. FMADD f0, f16, f20, f0
  837. FMADD f1, f17, f20, f1
  838. FMADD f4, f16, f21, f4
  839. FMADD f5, f17, f21, f5
  840. FMADD f8, f16, f22, f8
  841. FMADD f9, f17, f22, f9
  842. FMADD f12, f16, f23, f12
  843. FMADD f13, f17, f23, f13
  844. LFD f16, 2 * SIZE(AO)
  845. LFD f17, 3 * SIZE(AO)
  846. LFD f20, 4 * SIZE(BO)
  847. LFD f21, 5 * SIZE(BO)
  848. LFD f22, 6 * SIZE(BO)
  849. LFD f23, 7 * SIZE(BO)
  850. addi BO, BO, 4 * SIZE
  851. addi AO, AO, 2 * SIZE
  852. bdnz LL(26)
  853. .align 4
  854. LL(28):
  855. #ifndef TRMMKERNEL
  856. LFD f16, 0 * SIZE(CO1)
  857. LFD f17, 1 * SIZE(CO1)
  858. LFD f18, 0 * SIZE(CO2)
  859. LFD f19, 1 * SIZE(CO2)
  860. FMADD f0, f0, f30, f16
  861. FMADD f1, f1, f30, f17
  862. FMADD f4, f4, f30, f18
  863. FMADD f5, f5, f30, f19
  864. LFD f20, 0 * SIZE(CO3)
  865. LFD f21, 1 * SIZE(CO3)
  866. LFD f22, 0 * SIZE(CO4)
  867. LFD f23, 1 * SIZE(CO4)
  868. FMADD f8, f8, f30, f20
  869. FMADD f9, f9, f30, f21
  870. FMADD f12, f12, f30, f22
  871. FMADD f13, f13, f30, f23
  872. #else
  873. FMUL f0, f0, f30
  874. FMUL f1, f1, f30
  875. FMUL f4, f4, f30
  876. FMUL f5, f5, f30
  877. FMUL f8, f8, f30
  878. FMUL f9, f9, f30
  879. FMUL f12, f12, f30
  880. FMUL f13, f13, f30
  881. #endif
  882. STFD f0, 0 * SIZE(CO1)
  883. STFD f1, 1 * SIZE(CO1)
  884. STFD f4, 0 * SIZE(CO2)
  885. STFD f5, 1 * SIZE(CO2)
  886. lfs f0, FZERO
  887. fmr f1, f0
  888. fmr f2, f0
  889. fmr f3, f0
  890. STFD f8, 0 * SIZE(CO3)
  891. STFD f9, 1 * SIZE(CO3)
  892. STFD f12, 0 * SIZE(CO4)
  893. STFD f13, 1 * SIZE(CO4)
  894. fmr f4, f0
  895. fmr f5, f0
  896. fmr f6, f0
  897. fmr f7, f0
  898. fmr f8, f0
  899. fmr f9, f0
  900. fmr f10, f0
  901. fmr f11, f0
  902. fmr f12, f0
  903. fmr f13, f0
  904. fmr f14, f0
  905. fmr f15, f0
  906. addi CO1, CO1, 2 * SIZE
  907. addi CO2, CO2, 2 * SIZE
  908. addi CO3, CO3, 2 * SIZE
  909. addi CO4, CO4, 2 * SIZE
  910. #ifdef TRMMKERNEL
  911. #if ( defined(LEFT) && defined(TRANSA)) || \
  912. (!defined(LEFT) && !defined(TRANSA))
  913. sub TEMP, K, KK
  914. #ifdef LEFT
  915. addi TEMP, TEMP, -2
  916. #else
  917. addi TEMP, TEMP, -4
  918. #endif
  919. slwi r0, TEMP, 1 + BASE_SHIFT
  920. slwi TEMP, TEMP, 2 + BASE_SHIFT
  921. add AO, AO, r0
  922. add BO, BO, TEMP
  923. #endif
  924. #ifdef LEFT
  925. addi KK, KK, 2
  926. #endif
  927. #endif
  928. .align 4
  929. LL(30):
  930. andi. I, M, 1
  931. ble LL(39)
  932. #if defined(TRMMKERNEL)
  933. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  934. LFD f16, 0 * SIZE(AO)
  935. LFD f17, 1 * SIZE(AO)
  936. LFD f18, 2 * SIZE(AO)
  937. LFD f19, 3 * SIZE(AO)
  938. LFD f20, 0 * SIZE(B)
  939. LFD f21, 1 * SIZE(B)
  940. LFD f22, 2 * SIZE(B)
  941. LFD f23, 3 * SIZE(B)
  942. LFD f24, 4 * SIZE(B)
  943. LFD f25, 5 * SIZE(B)
  944. LFD f26, 6 * SIZE(B)
  945. LFD f27, 7 * SIZE(B)
  946. mr BO, B
  947. #else
  948. slwi r0, KK, 0 + BASE_SHIFT
  949. slwi TEMP, KK, 2 + BASE_SHIFT
  950. add AO, AO, r0
  951. add BO, B, TEMP
  952. LFD f16, 0 * SIZE(AO)
  953. LFD f17, 1 * SIZE(AO)
  954. LFD f18, 2 * SIZE(AO)
  955. LFD f19, 3 * SIZE(AO)
  956. LFD f20, 0 * SIZE(BO)
  957. LFD f21, 1 * SIZE(BO)
  958. LFD f22, 2 * SIZE(BO)
  959. LFD f23, 3 * SIZE(BO)
  960. LFD f24, 4 * SIZE(BO)
  961. LFD f25, 5 * SIZE(BO)
  962. LFD f26, 6 * SIZE(BO)
  963. LFD f27, 7 * SIZE(BO)
  964. #endif
  965. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  966. sub TEMP, K, KK
  967. #elif defined(LEFT)
  968. addi TEMP, KK, 1
  969. #else
  970. addi TEMP, KK, 4
  971. #endif
  972. srawi. TEMP, TEMP, 2
  973. mtspr CTR, TEMP
  974. #else
  975. LFD f16, 0 * SIZE(AO)
  976. LFD f17, 1 * SIZE(AO)
  977. LFD f18, 2 * SIZE(AO)
  978. LFD f19, 3 * SIZE(AO)
  979. LFD f20, 0 * SIZE(B)
  980. LFD f21, 1 * SIZE(B)
  981. LFD f22, 2 * SIZE(B)
  982. LFD f23, 3 * SIZE(B)
  983. LFD f24, 4 * SIZE(B)
  984. LFD f25, 5 * SIZE(B)
  985. LFD f26, 6 * SIZE(B)
  986. LFD f27, 7 * SIZE(B)
  987. srawi. r0, K, 2
  988. mtspr CTR, r0
  989. mr BO, B
  990. #endif
  991. ble LL(35)
  992. .align 5
  993. LL(32):
  994. FMADD f0, f16, f20, f0
  995. FMADD f4, f16, f21, f4
  996. FMADD f8, f16, f22, f8
  997. FMADD f12, f16, f23, f12
  998. LFD f20, 8 * SIZE(BO)
  999. LFD f21, 9 * SIZE(BO)
  1000. LFD f22, 10 * SIZE(BO)
  1001. LFD f23, 11 * SIZE(BO)
  1002. FMADD f1, f17, f24, f1
  1003. FMADD f5, f17, f25, f5
  1004. FMADD f9, f17, f26, f9
  1005. FMADD f13, f17, f27, f13
  1006. LFD f24, 12 * SIZE(BO)
  1007. LFD f25, 13 * SIZE(BO)
  1008. LFD f26, 14 * SIZE(BO)
  1009. LFD f27, 15 * SIZE(BO)
  1010. FMADD f0, f18, f20, f0
  1011. FMADD f4, f18, f21, f4
  1012. FMADD f8, f18, f22, f8
  1013. FMADD f12, f18, f23, f12
  1014. LFD f20, 16 * SIZE(BO)
  1015. LFD f21, 17 * SIZE(BO)
  1016. LFD f22, 18 * SIZE(BO)
  1017. LFD f23, 19 * SIZE(BO)
  1018. FMADD f1, f19, f24, f1
  1019. FMADD f5, f19, f25, f5
  1020. FMADD f9, f19, f26, f9
  1021. FMADD f13, f19, f27, f13
  1022. LFD f16, 4 * SIZE(AO)
  1023. LFD f17, 5 * SIZE(AO)
  1024. LFD f18, 6 * SIZE(AO)
  1025. LFD f19, 7 * SIZE(AO)
  1026. LFD f24, 20 * SIZE(BO)
  1027. LFD f25, 21 * SIZE(BO)
  1028. LFD f26, 22 * SIZE(BO)
  1029. LFD f27, 23 * SIZE(BO)
  1030. addi AO, AO, 4 * SIZE
  1031. addi BO, BO, 16 * SIZE
  1032. DCBT(BO, PREB)
  1033. bdnz LL(32)
  1034. fadd f0, f1, f0
  1035. fadd f4, f5, f4
  1036. fadd f8, f9, f8
  1037. fadd f12, f13, f12
  1038. .align 4
  1039. LL(35):
  1040. lfd f30, ALPHA
  1041. #if defined(TRMMKERNEL)
  1042. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1043. sub TEMP, K, KK
  1044. #elif defined(LEFT)
  1045. addi TEMP, KK, 1
  1046. #else
  1047. addi TEMP, KK, 4
  1048. #endif
  1049. andi. TEMP, TEMP, 3
  1050. mtspr CTR, TEMP
  1051. #else
  1052. andi. r0, K, 3
  1053. mtspr CTR, r0
  1054. #endif
  1055. ble+ LL(38)
  1056. .align 4
  1057. LL(36):
  1058. FMADD f0, f16, f20, f0
  1059. FMADD f4, f16, f21, f4
  1060. FMADD f8, f16, f22, f8
  1061. FMADD f12, f16, f23, f12
  1062. LFD f16, 1 * SIZE(AO)
  1063. LFD f20, 4 * SIZE(BO)
  1064. LFD f21, 5 * SIZE(BO)
  1065. LFD f22, 6 * SIZE(BO)
  1066. LFD f23, 7 * SIZE(BO)
  1067. addi BO, BO, 4 * SIZE
  1068. addi AO, AO, 1 * SIZE
  1069. bdnz LL(36)
  1070. .align 4
  1071. LL(38): /* write-back for the 1-row x 4-column tail tile: one element in each of the columns CO1..CO4; f30 holds ALPHA (loaded at LL(35)) */
  1072. #ifndef TRMMKERNEL
  1073. LFD f16, 0 * SIZE(CO1) /* GEMM build: fetch the existing C element of each column */
  1074. LFD f18, 0 * SIZE(CO2)
  1075. LFD f20, 0 * SIZE(CO3)
  1076. LFD f22, 0 * SIZE(CO4)
  1077. FMADD f0, f0, f30, f16 /* C + alpha * accumulator */
  1078. FMADD f4, f4, f30, f18
  1079. FMADD f8, f8, f30, f20
  1080. FMADD f12, f12, f30, f22
  1081. #else
  1082. FMUL f0, f0, f30 /* TRMM build: C is overwritten, so only scale the accumulators by alpha */
  1083. FMUL f4, f4, f30
  1084. FMUL f8, f8, f30
  1085. FMUL f12, f12, f30
  1086. #endif
  1087. STFD f0, 0 * SIZE(CO1) /* store the four results */
  1088. STFD f4, 0 * SIZE(CO2)
  1089. STFD f8, 0 * SIZE(CO3)
  1090. STFD f12, 0 * SIZE(CO4)
  1091. lfs f0, FZERO /* reload 0.0 and clear the accumulators this tile used (f0/f1, f4/f5, f8/f9, f12/f13) */
  1092. fmr f1, f0
  1093. fmr f4, f0
  1094. fmr f5, f0
  1095. fmr f8, f0
  1096. fmr f9, f0
  1097. fmr f12, f0
  1098. fmr f13, f0
  1099. #ifdef TRMMKERNEL
  1100. #if ( defined(LEFT) && defined(TRANSA)) || \
  1101. (!defined(LEFT) && !defined(TRANSA))
  1102. sub TEMP, K, KK /* in this configuration the tile looped KK + 1 (LEFT) or KK + 4 (!LEFT) times; TEMP = K-steps left unread */
  1103. #ifdef LEFT
  1104. addi TEMP, TEMP, -1
  1105. #else
  1106. addi TEMP, TEMP, -4
  1107. #endif
  1108. slwi r0, TEMP, 0 + BASE_SHIFT /* bytes of A to skip: 1 element per K-step */
  1109. slwi TEMP, TEMP, 2 + BASE_SHIFT /* bytes of B to skip: 4 elements per K-step */
  1110. add AO, AO, r0 /* move both read pointers past the part of the panels this tile did not consume */
  1111. add BO, BO, TEMP
  1112. #endif
  1113. #ifdef LEFT
  1114. addi KK, KK, 1 /* this tile covers exactly one row, so advance the TRMM offset by 1 (was 2; the 2-row tile at LL(28) adds 2 and the 4-row tile at LL(18) adds 4) */
  1115. #endif
  1116. #endif
  1117. .align 4
  1118. LL(39):
  1119. #if defined(TRMMKERNEL) && !defined(LEFT)
  1120. addi KK, KK, 4
  1121. #endif
  1122. mr B, BO
  1123. addic. J, J, -1
  1124. bgt LL(10)
  1125. .align 4
  1126. LL(40):
  1127. mr CO1, C
  1128. add CO2, C, LDC
  1129. andi. J, N, 2
  1130. ble LL(70)
  1131. #if defined(TRMMKERNEL) && defined(LEFT)
  1132. mr KK, OFFSET
  1133. #endif
  1134. lfs f0, FZERO
  1135. fmr f1, f0
  1136. fmr f2, f0
  1137. fmr f3, f0
  1138. fmr f4, f0
  1139. fmr f5, f0
  1140. fmr f6, f0
  1141. fmr f7, f0
  1142. srawi. I, M, 2
  1143. add C, CO2, LDC
  1144. mr AO, A
  1145. ble LL(50)
  1146. .align 4
  /* LL(41)..LL(48): one tile of 4 A-values x 2 B-values per k-step,        */
  /* repeated I times.  f0-f3 accumulate products with the first B value,   */
  /* f4-f7 with the second.                                                 */
  /* NOTE(review): LFD/STFD/FMADD/FMUL, SIZE, BASE_SHIFT, DCBT/DCBTST, PREB */
  /* and PREC are macros defined outside this chunk; comments below assume  */
  /* FMADD d,a,b,c computes d = a*b + c and BASE_SHIFT = log2(SIZE).        */
  1147. LL(41):
  1148. #if defined(TRMMKERNEL)
  1149. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Start at the current AO and at the beginning of B; preload 4 A, 4 B.   */
  1150. LFD f16, 0 * SIZE(AO)
  1151. LFD f17, 1 * SIZE(AO)
  1152. LFD f18, 2 * SIZE(AO)
  1153. LFD f19, 3 * SIZE(AO)
  1154. LFD f20, 0 * SIZE(B)
  1155. LFD f21, 1 * SIZE(B)
  1156. LFD f22, 2 * SIZE(B)
  1157. LFD f23, 3 * SIZE(B)
  1158. mr BO, B
  1159. #else
  /* Skip KK k-steps first: AO += KK*4 elements, BO = B + KK*2 elements.    */
  1160. slwi r0, KK, 2 + BASE_SHIFT
  1161. slwi TEMP, KK, 1 + BASE_SHIFT
  1162. add AO, AO, r0
  1163. add BO, B, TEMP
  1164. LFD f16, 0 * SIZE(AO)
  1165. LFD f17, 1 * SIZE(AO)
  1166. LFD f18, 2 * SIZE(AO)
  1167. LFD f19, 3 * SIZE(AO)
  1168. LFD f20, 0 * SIZE(BO)
  1169. LFD f21, 1 * SIZE(BO)
  1170. LFD f22, 2 * SIZE(BO)
  1171. LFD f23, 3 * SIZE(BO)
  1172. #endif
  1173. DCBTST(CO1, PREC)
  1174. DCBTST(CO2, PREC)
  /* TEMP = number of k-steps for this tile: K - KK, KK + 4 or KK + 2.      */
  1175. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1176. sub TEMP, K, KK
  1177. #elif defined(LEFT)
  1178. addi TEMP, KK, 4
  1179. #else
  1180. addi TEMP, KK, 2
  1181. #endif
  /* CTR = TEMP >> 2: the main loop consumes four k-steps per iteration.    */
  1182. srawi. TEMP, TEMP, 2
  1183. mtspr CTR, TEMP
  1184. #else
  /* Plain GEMM: preload from AO and B, CTR = K >> 2, BO = B.               */
  1185. LFD f16, 0 * SIZE(AO)
  1186. LFD f17, 1 * SIZE(AO)
  1187. LFD f18, 2 * SIZE(AO)
  1188. LFD f19, 3 * SIZE(AO)
  1189. LFD f20, 0 * SIZE(B)
  1190. LFD f21, 1 * SIZE(B)
  1191. LFD f22, 2 * SIZE(B)
  1192. LFD f23, 3 * SIZE(B)
  1193. DCBTST(CO1, PREC)
  1194. DCBTST(CO2, PREC)
  1195. srawi. r0, K, 2
  1196. mtspr CTR, r0
  1197. mr BO, B
  1198. #endif
  /* Fewer than four k-steps: go to the remainder handling.                 */
  1199. ble LL(45)
  1200. .align 5
  /* LL(42): main loop, four k-steps per iteration.  Operands for the next  */
  /* step are loaded while the current one is accumulated; on exit          */
  /* f16-f19 / f20-f23 already hold the data for the following k-steps.     */
  1201. LL(42):
  1202. FMADD f0, f16, f20, f0
  1203. FMADD f1, f17, f20, f1
  1204. FMADD f2, f18, f20, f2
  1205. FMADD f3, f19, f20, f3
  1206. FMADD f4, f16, f21, f4
  1207. FMADD f5, f17, f21, f5
  1208. FMADD f6, f18, f21, f6
  1209. FMADD f7, f19, f21, f7
  1210. LFD f16, 4 * SIZE(AO)
  1211. LFD f17, 5 * SIZE(AO)
  1212. LFD f18, 6 * SIZE(AO)
  1213. LFD f19, 7 * SIZE(AO)
  1214. FMADD f0, f16, f22, f0
  1215. FMADD f1, f17, f22, f1
  1216. FMADD f2, f18, f22, f2
  1217. FMADD f3, f19, f22, f3
  1218. FMADD f4, f16, f23, f4
  1219. FMADD f5, f17, f23, f5
  1220. FMADD f6, f18, f23, f6
  1221. FMADD f7, f19, f23, f7
  1222. LFD f16, 8 * SIZE(AO)
  1223. LFD f17, 9 * SIZE(AO)
  1224. LFD f18, 10 * SIZE(AO)
  1225. LFD f19, 11 * SIZE(AO)
  1226. LFD f20, 4 * SIZE(BO)
  1227. LFD f21, 5 * SIZE(BO)
  1228. LFD f22, 6 * SIZE(BO)
  1229. LFD f23, 7 * SIZE(BO)
  1230. FMADD f0, f16, f20, f0
  1231. FMADD f1, f17, f20, f1
  1232. FMADD f2, f18, f20, f2
  1233. FMADD f3, f19, f20, f3
  1234. FMADD f4, f16, f21, f4
  1235. FMADD f5, f17, f21, f5
  1236. FMADD f6, f18, f21, f6
  1237. FMADD f7, f19, f21, f7
  1238. LFD f16, 12 * SIZE(AO)
  1239. LFD f17, 13 * SIZE(AO)
  1240. LFD f18, 14 * SIZE(AO)
  1241. LFD f19, 15 * SIZE(AO)
  1242. FMADD f0, f16, f22, f0
  1243. FMADD f1, f17, f22, f1
  1244. FMADD f2, f18, f22, f2
  1245. FMADD f3, f19, f22, f3
  1246. FMADD f4, f16, f23, f4
  1247. FMADD f5, f17, f23, f5
  1248. FMADD f6, f18, f23, f6
  1249. FMADD f7, f19, f23, f7
  1250. LFD f16, 16 * SIZE(AO)
  1251. LFD f17, 17 * SIZE(AO)
  1252. LFD f18, 18 * SIZE(AO)
  1253. LFD f19, 19 * SIZE(AO)
  1254. LFD f20, 8 * SIZE(BO)
  1255. LFD f21, 9 * SIZE(BO)
  1256. LFD f22, 10 * SIZE(BO)
  1257. LFD f23, 11 * SIZE(BO)
  /* Four k-steps consumed: 16 A elements and 8 B elements.                 */
  1258. addi AO, AO, 16 * SIZE
  1259. addi BO, BO, 8 * SIZE
  1260. DCBT(BO, PREB)
  1261. bdnz LL(42)
  1262. .align 4
  /* LL(45): load the scale factor and set CTR to the leftover k-steps      */
  /* (step count & 3), using the same step count as computed before LL(42). */
  1263. LL(45):
  1264. lfd f30, ALPHA
  1265. #if defined(TRMMKERNEL)
  1266. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1267. sub TEMP, K, KK
  1268. #elif defined(LEFT)
  1269. addi TEMP, KK, 4
  1270. #else
  1271. addi TEMP, KK, 2
  1272. #endif
  1273. andi. TEMP, TEMP, 3
  1274. mtspr CTR, TEMP
  1275. #else
  1276. andi. r0, K, 3
  1277. mtspr CTR, r0
  1278. #endif
  /* No leftover steps: go straight to the store.                           */
  1279. ble+ LL(48)
  1280. .align 4
  /* LL(46): one k-step per iteration (4 A elements, 2 B elements).         */
  1281. LL(46):
  1282. FMADD f0, f16, f20, f0
  1283. FMADD f1, f17, f20, f1
  1284. FMADD f2, f18, f20, f2
  1285. FMADD f3, f19, f20, f3
  1286. FMADD f4, f16, f21, f4
  1287. FMADD f5, f17, f21, f5
  1288. FMADD f6, f18, f21, f6
  1289. FMADD f7, f19, f21, f7
  1290. LFD f16, 4 * SIZE(AO)
  1291. LFD f17, 5 * SIZE(AO)
  1292. LFD f18, 6 * SIZE(AO)
  1293. LFD f19, 7 * SIZE(AO)
  1294. LFD f20, 2 * SIZE(BO)
  1295. LFD f21, 3 * SIZE(BO)
  1296. addi BO, BO, 2 * SIZE
  1297. addi AO, AO, 4 * SIZE
  1298. bdnz LL(46)
  1299. .align 4
  /* LL(48): write the tile.  GEMM: result = acc * f30 + value loaded from  */
  /* CO1/CO2.  TRMM: result = acc * f30 (the destination is not read).      */
  1300. LL(48):
  1301. #ifndef TRMMKERNEL
  1302. LFD f16, 0 * SIZE(CO1)
  1303. LFD f17, 1 * SIZE(CO1)
  1304. LFD f18, 2 * SIZE(CO1)
  1305. LFD f19, 3 * SIZE(CO1)
  1306. LFD f20, 0 * SIZE(CO2)
  1307. LFD f21, 1 * SIZE(CO2)
  1308. LFD f22, 2 * SIZE(CO2)
  1309. LFD f23, 3 * SIZE(CO2)
  1310. FMADD f0, f0, f30, f16
  1311. FMADD f1, f1, f30, f17
  1312. FMADD f2, f2, f30, f18
  1313. FMADD f3, f3, f30, f19
  1314. FMADD f4, f4, f30, f20
  1315. FMADD f5, f5, f30, f21
  1316. FMADD f6, f6, f30, f22
  1317. FMADD f7, f7, f30, f23
  1318. #else
  1319. FMUL f0, f0, f30
  1320. FMUL f1, f1, f30
  1321. FMUL f2, f2, f30
  1322. FMUL f3, f3, f30
  1323. FMUL f4, f4, f30
  1324. FMUL f5, f5, f30
  1325. FMUL f6, f6, f30
  1326. FMUL f7, f7, f30
  1327. #endif
  /* Stores are interleaved with re-zeroing the accumulators for the next   */
  /* tile.                                                                  */
  1328. STFD f0, 0 * SIZE(CO1)
  1329. STFD f1, 1 * SIZE(CO1)
  1330. STFD f2, 2 * SIZE(CO1)
  1331. STFD f3, 3 * SIZE(CO1)
  1332. lfs f0, FZERO
  1333. fmr f1, f0
  1334. fmr f2, f0
  1335. fmr f3, f0
  1336. STFD f4, 0 * SIZE(CO2)
  1337. STFD f5, 1 * SIZE(CO2)
  1338. STFD f6, 2 * SIZE(CO2)
  1339. STFD f7, 3 * SIZE(CO2)
  1340. fmr f4, f0
  1341. fmr f5, f0
  1342. fmr f6, f0
  1343. fmr f7, f0
  1344. addi CO1, CO1, 4 * SIZE
  1345. addi CO2, CO2, 4 * SIZE
  1346. #ifdef TRMMKERNEL
  1347. #if ( defined(LEFT) && defined(TRANSA)) || \
  1348. (!defined(LEFT) && !defined(TRANSA))
  /* Skip the k-steps this tile did not consume: TEMP = K - KK - (4 or 2),  */
  /* AO += TEMP*4 elements, BO += TEMP*2 elements.                          */
  1349. sub TEMP, K, KK
  1350. #ifdef LEFT
  1351. addi TEMP, TEMP, -4
  1352. #else
  1353. addi TEMP, TEMP, -2
  1354. #endif
  1355. slwi r0, TEMP, 2 + BASE_SHIFT
  1356. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1357. add AO, AO, r0
  1358. add BO, BO, TEMP
  1359. #endif
  1360. #ifdef LEFT
  /* LEFT: KK advances by the four rows just written.                       */
  1361. addi KK, KK, 4
  1362. #endif
  1363. #endif
  /* Next group of four rows while I > 0.                                   */
  1364. addic. I, I, -1
  1365. bgt+ LL(41)
  1366. .align 4
  /* LL(50)..LL(58): tile of 2 A-values x 2 B-values per k-step, executed   */
  /* when bit 1 of M is set.  In the main loop f0-f3 collect even k-steps   */
  /* and f4-f7 odd k-steps; the two halves are added together at LL(58).    */
  /* NOTE(review): assumes FMADD d,a,b,c is d = a*b + c and that            */
  /* BASE_SHIFT = log2(SIZE); both are defined outside this chunk.          */
  1367. LL(50):
  1368. andi. I, M, 2
  1369. ble LL(60)
  1370. #if defined(TRMMKERNEL)
  1371. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Preload 4 A elements (two k-steps) and 8 B elements (four k-steps).    */
  1372. LFD f16, 0 * SIZE(AO)
  1373. LFD f17, 1 * SIZE(AO)
  1374. LFD f18, 2 * SIZE(AO)
  1375. LFD f19, 3 * SIZE(AO)
  1376. LFD f20, 0 * SIZE(B)
  1377. LFD f21, 1 * SIZE(B)
  1378. LFD f22, 2 * SIZE(B)
  1379. LFD f23, 3 * SIZE(B)
  1380. LFD f24, 4 * SIZE(B)
  1381. LFD f25, 5 * SIZE(B)
  1382. LFD f26, 6 * SIZE(B)
  1383. LFD f27, 7 * SIZE(B)
  1384. mr BO, B
  1385. #else
  /* Skip KK k-steps first: AO += KK*2 elements, BO = B + KK*2 elements.    */
  1386. slwi r0, KK, 1 + BASE_SHIFT
  1387. slwi TEMP, KK, 1 + BASE_SHIFT
  1388. add AO, AO, r0
  1389. add BO, B, TEMP
  1390. LFD f16, 0 * SIZE(AO)
  1391. LFD f17, 1 * SIZE(AO)
  1392. LFD f18, 2 * SIZE(AO)
  1393. LFD f19, 3 * SIZE(AO)
  1394. LFD f20, 0 * SIZE(BO)
  1395. LFD f21, 1 * SIZE(BO)
  1396. LFD f22, 2 * SIZE(BO)
  1397. LFD f23, 3 * SIZE(BO)
  1398. LFD f24, 4 * SIZE(BO)
  1399. LFD f25, 5 * SIZE(BO)
  1400. LFD f26, 6 * SIZE(BO)
  1401. LFD f27, 7 * SIZE(BO)
  1402. #endif
  /* TEMP = k-steps for this tile: K - KK, or KK + 2 (both other branches). */
  1403. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1404. sub TEMP, K, KK
  1405. #elif defined(LEFT)
  1406. addi TEMP, KK, 2
  1407. #else
  1408. addi TEMP, KK, 2
  1409. #endif
  1410. srawi. TEMP, TEMP, 2
  1411. mtspr CTR, TEMP
  1412. #else
  /* Plain GEMM: same preload, CTR = K >> 2, BO = B.                        */
  1413. LFD f16, 0 * SIZE(AO)
  1414. LFD f17, 1 * SIZE(AO)
  1415. LFD f18, 2 * SIZE(AO)
  1416. LFD f19, 3 * SIZE(AO)
  1417. LFD f20, 0 * SIZE(B)
  1418. LFD f21, 1 * SIZE(B)
  1419. LFD f22, 2 * SIZE(B)
  1420. LFD f23, 3 * SIZE(B)
  1421. LFD f24, 4 * SIZE(B)
  1422. LFD f25, 5 * SIZE(B)
  1423. LFD f26, 6 * SIZE(B)
  1424. LFD f27, 7 * SIZE(B)
  1425. srawi. r0, K, 2
  1426. mtspr CTR, r0
  1427. mr BO, B
  1428. #endif
  1429. ble LL(55)
  1430. .align 5
  /* LL(52): main loop, four k-steps per iteration (8 A, 8 B elements).     */
  /* f20-f23 / f24-f27 are reloaded one iteration ahead of their use.       */
  1431. LL(52):
  1432. FMADD f0, f16, f20, f0
  1433. FMADD f1, f17, f20, f1
  1434. FMADD f2, f16, f21, f2
  1435. FMADD f3, f17, f21, f3
  1436. FMADD f4, f18, f22, f4
  1437. FMADD f5, f19, f22, f5
  1438. FMADD f6, f18, f23, f6
  1439. FMADD f7, f19, f23, f7
  1440. LFD f16, 4 * SIZE(AO)
  1441. LFD f17, 5 * SIZE(AO)
  1442. LFD f18, 6 * SIZE(AO)
  1443. LFD f19, 7 * SIZE(AO)
  1444. LFD f20, 8 * SIZE(BO)
  1445. LFD f21, 9 * SIZE(BO)
  1446. LFD f22, 10 * SIZE(BO)
  1447. LFD f23, 11 * SIZE(BO)
  1448. FMADD f0, f16, f24, f0
  1449. FMADD f1, f17, f24, f1
  1450. FMADD f2, f16, f25, f2
  1451. FMADD f3, f17, f25, f3
  1452. FMADD f4, f18, f26, f4
  1453. FMADD f5, f19, f26, f5
  1454. FMADD f6, f18, f27, f6
  1455. FMADD f7, f19, f27, f7
  1456. LFD f16, 8 * SIZE(AO)
  1457. LFD f17, 9 * SIZE(AO)
  1458. LFD f18, 10 * SIZE(AO)
  1459. LFD f19, 11 * SIZE(AO)
  1460. LFD f24, 12 * SIZE(BO)
  1461. LFD f25, 13 * SIZE(BO)
  1462. LFD f26, 14 * SIZE(BO)
  1463. LFD f27, 15 * SIZE(BO)
  1464. addi AO, AO, 8 * SIZE
  1465. addi BO, BO, 8 * SIZE
  1466. DCBT(BO, PREB)
  1467. bdnz LL(52)
  1468. .align 4
  /* LL(55): load the scale factor; CTR = leftover k-steps (count & 3).     */
  1469. LL(55):
  1470. lfd f30, ALPHA
  1471. #if defined(TRMMKERNEL)
  1472. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1473. sub TEMP, K, KK
  1474. #elif defined(LEFT)
  1475. addi TEMP, KK, 2
  1476. #else
  1477. addi TEMP, KK, 2
  1478. #endif
  1479. andi. TEMP, TEMP, 3
  1480. mtspr CTR, TEMP
  1481. #else
  1482. andi. r0, K, 3
  1483. mtspr CTR, r0
  1484. #endif
  1485. ble+ LL(58)
  1486. .align 4
  /* LL(56): one k-step per iteration; only f0-f3 are updated here.         */
  1487. LL(56):
  1488. FMADD f0, f16, f20, f0
  1489. FMADD f1, f17, f20, f1
  1490. FMADD f2, f16, f21, f2
  1491. FMADD f3, f17, f21, f3
  1492. LFD f16, 2 * SIZE(AO)
  1493. LFD f17, 3 * SIZE(AO)
  1494. LFD f20, 2 * SIZE(BO)
  1495. LFD f21, 3 * SIZE(BO)
  1496. addi BO, BO, 2 * SIZE
  1497. addi AO, AO, 2 * SIZE
  1498. bdnz LL(56)
  1499. .align 4
  /* LL(58): fold f4-f7 into f0-f3, scale by f30 and store.  GEMM adds the  */
  /* values loaded from CO1/CO2; TRMM stores the scaled sums only.          */
  1500. LL(58):
  1501. #ifndef TRMMKERNEL
  1502. LFD f16, 0 * SIZE(CO1)
  1503. LFD f17, 1 * SIZE(CO1)
  1504. LFD f18, 0 * SIZE(CO2)
  1505. LFD f19, 1 * SIZE(CO2)
  1506. FADD f0, f4, f0
  1507. FADD f1, f5, f1
  1508. FADD f2, f6, f2
  1509. FADD f3, f7, f3
  1510. FMADD f0, f0, f30, f16
  1511. FMADD f1, f1, f30, f17
  1512. FMADD f2, f2, f30, f18
  1513. FMADD f3, f3, f30, f19
  1514. #else
  1515. FADD f0, f4, f0
  1516. FADD f1, f5, f1
  1517. FADD f2, f6, f2
  1518. FADD f3, f7, f3
  1519. FMUL f0, f0, f30
  1520. FMUL f1, f1, f30
  1521. FMUL f2, f2, f30
  1522. FMUL f3, f3, f30
  1523. #endif
  1524. STFD f0, 0 * SIZE(CO1)
  1525. STFD f1, 1 * SIZE(CO1)
  1526. STFD f2, 0 * SIZE(CO2)
  1527. STFD f3, 1 * SIZE(CO2)
  /* Re-zero the accumulators for the next tile.                            */
  1528. lfs f0, FZERO
  1529. fmr f1, f0
  1530. fmr f2, f0
  1531. fmr f3, f0
  1532. fmr f4, f0
  1533. fmr f5, f0
  1534. fmr f6, f0
  1535. fmr f7, f0
  1536. addi CO1, CO1, 2 * SIZE
  1537. addi CO2, CO2, 2 * SIZE
  1538. #ifdef TRMMKERNEL
  1539. #if ( defined(LEFT) && defined(TRANSA)) || \
  1540. (!defined(LEFT) && !defined(TRANSA))
  /* Skip unconsumed k-steps: TEMP = K - KK - 2; AO and BO both advance by  */
  /* TEMP*2 elements.                                                       */
  1541. sub TEMP, K, KK
  1542. #ifdef LEFT
  1543. addi TEMP, TEMP, -2
  1544. #else
  1545. addi TEMP, TEMP, -2
  1546. #endif
  1547. slwi r0, TEMP, 1 + BASE_SHIFT
  1548. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1549. add AO, AO, r0
  1550. add BO, BO, TEMP
  1551. #endif
  1552. #ifdef LEFT
  /* LEFT: KK advances by the two rows just written.                        */
  1553. addi KK, KK, 2
  1554. #endif
  1555. #endif
  1556. .align 4
  /* LL(60)..LL(68): tile of 1 A-value x 2 B-values per k-step, executed    */
  /* when bit 0 of M is set.  In the main loop f0/f1 and f2/f3 take         */
  /* alternate k-steps and are added together at LL(68).                    */
  /* NOTE(review): assumes FMADD d,a,b,c is d = a*b + c and that            */
  /* BASE_SHIFT = log2(SIZE); both are defined outside this chunk.          */
  1557. LL(60):
  1558. andi. I, M, 1
  1559. ble LL(69)
  1560. #if defined(TRMMKERNEL)
  1561. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Preload 4 A elements and 8 B elements (four k-steps of each).          */
  1562. LFD f16, 0 * SIZE(AO)
  1563. LFD f17, 1 * SIZE(AO)
  1564. LFD f18, 2 * SIZE(AO)
  1565. LFD f19, 3 * SIZE(AO)
  1566. LFD f20, 0 * SIZE(B)
  1567. LFD f21, 1 * SIZE(B)
  1568. LFD f22, 2 * SIZE(B)
  1569. LFD f23, 3 * SIZE(B)
  1570. LFD f24, 4 * SIZE(B)
  1571. LFD f25, 5 * SIZE(B)
  1572. LFD f26, 6 * SIZE(B)
  1573. LFD f27, 7 * SIZE(B)
  1574. mr BO, B
  1575. #else
  /* Skip KK k-steps first: AO += KK elements, BO = B + KK*2 elements.      */
  1576. slwi r0, KK, 0 + BASE_SHIFT
  1577. slwi TEMP, KK, 1 + BASE_SHIFT
  1578. add AO, AO, r0
  1579. add BO, B, TEMP
  1580. LFD f16, 0 * SIZE(AO)
  1581. LFD f17, 1 * SIZE(AO)
  1582. LFD f18, 2 * SIZE(AO)
  1583. LFD f19, 3 * SIZE(AO)
  1584. LFD f20, 0 * SIZE(BO)
  1585. LFD f21, 1 * SIZE(BO)
  1586. LFD f22, 2 * SIZE(BO)
  1587. LFD f23, 3 * SIZE(BO)
  1588. LFD f24, 4 * SIZE(BO)
  1589. LFD f25, 5 * SIZE(BO)
  1590. LFD f26, 6 * SIZE(BO)
  1591. LFD f27, 7 * SIZE(BO)
  1592. #endif
  /* TEMP = k-steps for this tile: K - KK, KK + 1 or KK + 2.                */
  1593. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1594. sub TEMP, K, KK
  1595. #elif defined(LEFT)
  1596. addi TEMP, KK, 1
  1597. #else
  1598. addi TEMP, KK, 2
  1599. #endif
  1600. srawi. TEMP, TEMP, 2
  1601. mtspr CTR, TEMP
  1602. #else
  /* Plain GEMM: same preload, CTR = K >> 2, BO = B.                        */
  1603. LFD f16, 0 * SIZE(AO)
  1604. LFD f17, 1 * SIZE(AO)
  1605. LFD f18, 2 * SIZE(AO)
  1606. LFD f19, 3 * SIZE(AO)
  1607. LFD f20, 0 * SIZE(B)
  1608. LFD f21, 1 * SIZE(B)
  1609. LFD f22, 2 * SIZE(B)
  1610. LFD f23, 3 * SIZE(B)
  1611. LFD f24, 4 * SIZE(B)
  1612. LFD f25, 5 * SIZE(B)
  1613. LFD f26, 6 * SIZE(B)
  1614. LFD f27, 7 * SIZE(B)
  1615. srawi. r0, K, 2
  1616. mtspr CTR, r0
  1617. mr BO, B
  1618. #endif
  1619. ble LL(65)
  1620. .align 5
  /* LL(62): main loop, four k-steps per iteration (4 A, 8 B elements).     */
  1621. LL(62):
  1622. FMADD f0, f16, f20, f0
  1623. FMADD f1, f16, f21, f1
  1624. FMADD f2, f17, f22, f2
  1625. FMADD f3, f17, f23, f3
  1626. LFD f20, 8 * SIZE(BO)
  1627. LFD f21, 9 * SIZE(BO)
  1628. LFD f22, 10 * SIZE(BO)
  1629. LFD f23, 11 * SIZE(BO)
  1630. FMADD f0, f18, f24, f0
  1631. FMADD f1, f18, f25, f1
  1632. FMADD f2, f19, f26, f2
  1633. FMADD f3, f19, f27, f3
  1634. LFD f16, 4 * SIZE(AO)
  1635. LFD f17, 5 * SIZE(AO)
  1636. LFD f18, 6 * SIZE(AO)
  1637. LFD f19, 7 * SIZE(AO)
  1638. LFD f24, 12 * SIZE(BO)
  1639. LFD f25, 13 * SIZE(BO)
  1640. LFD f26, 14 * SIZE(BO)
  1641. LFD f27, 15 * SIZE(BO)
  1642. addi AO, AO, 4 * SIZE
  1643. addi BO, BO, 8 * SIZE
  1644. bdnz LL(62)
  1645. .align 4
  /* LL(65): load the scale factor; CTR = leftover k-steps (count & 3).     */
  1646. LL(65):
  1647. lfd f30, ALPHA
  1648. #if defined(TRMMKERNEL)
  1649. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1650. sub TEMP, K, KK
  1651. #elif defined(LEFT)
  1652. addi TEMP, KK, 1
  1653. #else
  1654. addi TEMP, KK, 2
  1655. #endif
  1656. andi. TEMP, TEMP, 3
  1657. mtspr CTR, TEMP
  1658. #else
  1659. andi. r0, K, 3
  1660. mtspr CTR, r0
  1661. #endif
  1662. ble+ LL(68)
  1663. .align 4
  /* LL(66): one k-step per iteration; only f0/f1 are updated here.         */
  1664. LL(66):
  1665. FMADD f0, f16, f20, f0
  1666. FMADD f1, f16, f21, f1
  1667. LFD f16, 1 * SIZE(AO)
  1668. LFD f20, 2 * SIZE(BO)
  1669. LFD f21, 3 * SIZE(BO)
  1670. addi BO, BO, 2 * SIZE
  1671. addi AO, AO, 1 * SIZE
  1672. bdnz LL(66)
  1673. .align 4
  /* LL(68): fold f2/f3 into f0/f1, scale by f30 and store one element to   */
  /* each of CO1 and CO2 (GEMM adds the value previously loaded from there). */
  1674. LL(68):
  1675. #ifndef TRMMKERNEL
  1676. LFD f16, 0 * SIZE(CO1)
  1677. LFD f18, 0 * SIZE(CO2)
  1678. FADD f0, f2, f0
  1679. FADD f1, f3, f1
  1680. FMADD f0, f0, f30, f16
  1681. FMADD f1, f1, f30, f18
  1682. #else
  1683. FADD f0, f2, f0
  1684. FADD f1, f3, f1
  1685. FMUL f0, f0, f30
  1686. FMUL f1, f1, f30
  1687. #endif
  1688. STFD f0, 0 * SIZE(CO1)
  1689. STFD f1, 0 * SIZE(CO2)
  /* Only f0, f1, f4 and f5 are cleared here; f2 and f3 keep their values.  */
  1690. lfs f0, FZERO
  1691. fmr f1, f0
  1692. fmr f4, f0
  1693. fmr f5, f0
  1694. #ifdef TRMMKERNEL
  1695. #if ( defined(LEFT) && defined(TRANSA)) || \
  1696. (!defined(LEFT) && !defined(TRANSA))
  /* Skip unconsumed k-steps: TEMP = K - KK - (1 or 2); AO += TEMP          */
  /* elements, BO += TEMP*2 elements.                                       */
  1697. sub TEMP, K, KK
  1698. #ifdef LEFT
  1699. addi TEMP, TEMP, -1
  1700. #else
  1701. addi TEMP, TEMP, -2
  1702. #endif
  1703. slwi r0, TEMP, 0 + BASE_SHIFT
  1704. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1705. add AO, AO, r0
  1706. add BO, BO, TEMP
  1707. #endif
  1708. #ifdef LEFT
  /* LEFT: KK advances by the single row just written.                      */
  1709. addi KK, KK, 1
  1710. #endif
  1711. #endif
  1712. .align 4
  /* LL(69): end of the two-column panel.                                   */
  1713. LL(69):
  1714. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* TRMM, !LEFT: KK advances by 2.                                         */
  1715. addi KK, KK, 2
  1716. #endif
  /* B continues from where BO ended up.                                    */
  1717. mr B, BO
  1718. .align 4
  /* LL(70): set-up for the single-column panel, executed only when bit 0   */
  /* of N is set; otherwise control goes to the exit path at LL(999).       */
  1719. LL(70):
  1720. mr CO1, C
  1721. andi. J, N, 1
  1722. ble LL(999)
  1723. #if defined(TRMMKERNEL) && defined(LEFT)
  /* TRMM, LEFT: KK restarts from OFFSET for this panel.                    */
  1724. mr KK, OFFSET
  1725. #endif
  /* Clear accumulators f0-f3 from the FZERO constant.                      */
  1726. lfs f0, FZERO
  1727. fmr f1, f0
  1728. fmr f2, f0
  1729. fmr f3, f0
  /* I = M >> 2; AO restarts at A; skip the 4-row loop when I <= 0 (the mr  */
  /* in between does not alter CR0).                                        */
  1730. srawi. I, M, 2
  1731. mr AO, A
  1732. ble LL(80)
  1733. .align 4
  /* LL(71)..LL(78): tile of 4 A-values x 1 B-value per k-step, repeated I  */
  /* times.  f0-f3 are the four accumulators.                               */
  /* NOTE(review): assumes FMADD d,a,b,c is d = a*b + c and that            */
  /* BASE_SHIFT = log2(SIZE); both are defined outside this chunk.          */
  1734. LL(71):
  1735. #if defined(TRMMKERNEL)
  1736. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Preload 4 A elements (one k-step) and 4 B elements (four k-steps).     */
  1737. LFD f16, 0 * SIZE(AO)
  1738. LFD f17, 1 * SIZE(AO)
  1739. LFD f18, 2 * SIZE(AO)
  1740. LFD f19, 3 * SIZE(AO)
  1741. LFD f20, 0 * SIZE(B)
  1742. LFD f21, 1 * SIZE(B)
  1743. LFD f22, 2 * SIZE(B)
  1744. LFD f23, 3 * SIZE(B)
  1745. mr BO, B
  1746. #else
  /* Skip KK k-steps first: AO += KK*4 elements, BO = B + KK elements.      */
  1747. slwi r0, KK, 2 + BASE_SHIFT
  1748. slwi TEMP, KK, 0 + BASE_SHIFT
  1749. add AO, AO, r0
  1750. add BO, B, TEMP
  1751. LFD f16, 0 * SIZE(AO)
  1752. LFD f17, 1 * SIZE(AO)
  1753. LFD f18, 2 * SIZE(AO)
  1754. LFD f19, 3 * SIZE(AO)
  1755. LFD f20, 0 * SIZE(BO)
  1756. LFD f21, 1 * SIZE(BO)
  1757. LFD f22, 2 * SIZE(BO)
  1758. LFD f23, 3 * SIZE(BO)
  1759. #endif
  1760. DCBTST(CO1, PREC)
  /* TEMP = k-steps for this tile: K - KK, KK + 4 or KK + 1.                */
  1761. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1762. sub TEMP, K, KK
  1763. #elif defined(LEFT)
  1764. addi TEMP, KK, 4
  1765. #else
  1766. addi TEMP, KK, 1
  1767. #endif
  1768. srawi. TEMP, TEMP, 2
  1769. mtspr CTR, TEMP
  1770. #else
  /* Plain GEMM: same preload, CTR = K >> 2, BO = B.                        */
  1771. LFD f16, 0 * SIZE(AO)
  1772. LFD f17, 1 * SIZE(AO)
  1773. LFD f18, 2 * SIZE(AO)
  1774. LFD f19, 3 * SIZE(AO)
  1775. LFD f20, 0 * SIZE(B)
  1776. LFD f21, 1 * SIZE(B)
  1777. LFD f22, 2 * SIZE(B)
  1778. LFD f23, 3 * SIZE(B)
  1779. DCBTST(CO1, PREC)
  1780. srawi. r0, K, 2
  1781. mtspr CTR, r0
  1782. mr BO, B
  1783. #endif
  1784. ble LL(75)
  1785. .align 5
  /* LL(72): main loop, four k-steps per iteration (16 A, 4 B elements).    */
  /* f20..f23 supply the B value for steps 0..3 respectively.               */
  1786. LL(72):
  1787. FMADD f0, f16, f20, f0
  1788. FMADD f1, f17, f20, f1
  1789. FMADD f2, f18, f20, f2
  1790. FMADD f3, f19, f20, f3
  1791. LFD f16, 4 * SIZE(AO)
  1792. LFD f17, 5 * SIZE(AO)
  1793. LFD f18, 6 * SIZE(AO)
  1794. LFD f19, 7 * SIZE(AO)
  1795. FMADD f0, f16, f21, f0
  1796. FMADD f1, f17, f21, f1
  1797. FMADD f2, f18, f21, f2
  1798. FMADD f3, f19, f21, f3
  1799. LFD f16, 8 * SIZE(AO)
  1800. LFD f17, 9 * SIZE(AO)
  1801. LFD f18, 10 * SIZE(AO)
  1802. LFD f19, 11 * SIZE(AO)
  1803. FMADD f0, f16, f22, f0
  1804. FMADD f1, f17, f22, f1
  1805. FMADD f2, f18, f22, f2
  1806. FMADD f3, f19, f22, f3
  1807. LFD f16, 12 * SIZE(AO)
  1808. LFD f17, 13 * SIZE(AO)
  1809. LFD f18, 14 * SIZE(AO)
  1810. LFD f19, 15 * SIZE(AO)
  1811. FMADD f0, f16, f23, f0
  1812. FMADD f1, f17, f23, f1
  1813. FMADD f2, f18, f23, f2
  1814. FMADD f3, f19, f23, f3
  1815. LFD f16, 16 * SIZE(AO)
  1816. LFD f17, 17 * SIZE(AO)
  1817. LFD f18, 18 * SIZE(AO)
  1818. LFD f19, 19 * SIZE(AO)
  1819. LFD f20, 4 * SIZE(BO)
  1820. LFD f21, 5 * SIZE(BO)
  1821. LFD f22, 6 * SIZE(BO)
  1822. LFD f23, 7 * SIZE(BO)
  1823. addi AO, AO, 16 * SIZE
  1824. addi BO, BO, 4 * SIZE
  1825. DCBT(BO, PREB)
  1826. bdnz LL(72)
  1827. .align 4
  /* LL(75): load the scale factor; CTR = leftover k-steps (count & 3).     */
  1828. LL(75):
  1829. lfd f30, ALPHA
  1830. #if defined(TRMMKERNEL)
  1831. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1832. sub TEMP, K, KK
  1833. #elif defined(LEFT)
  1834. addi TEMP, KK, 4
  1835. #else
  1836. addi TEMP, KK, 1
  1837. #endif
  1838. andi. TEMP, TEMP, 3
  1839. mtspr CTR, TEMP
  1840. #else
  1841. andi. r0, K, 3
  1842. mtspr CTR, r0
  1843. #endif
  1844. ble+ LL(78)
  1845. .align 4
  /* LL(76): one k-step per iteration (4 A elements, 1 B element).          */
  1846. LL(76):
  1847. FMADD f0, f16, f20, f0
  1848. FMADD f1, f17, f20, f1
  1849. FMADD f2, f18, f20, f2
  1850. FMADD f3, f19, f20, f3
  1851. LFD f16, 4 * SIZE(AO)
  1852. LFD f17, 5 * SIZE(AO)
  1853. LFD f18, 6 * SIZE(AO)
  1854. LFD f19, 7 * SIZE(AO)
  1855. LFD f20, 1 * SIZE(BO)
  1856. addi BO, BO, 1 * SIZE
  1857. addi AO, AO, 4 * SIZE
  1858. bdnz LL(76)
  1859. .align 4
  /* LL(78): scale by f30 and store four elements at CO1 (GEMM adds the     */
  /* values previously loaded from CO1; TRMM stores the scaled sums only).  */
  1860. LL(78):
  1861. #ifndef TRMMKERNEL
  1862. LFD f16, 0 * SIZE(CO1)
  1863. LFD f17, 1 * SIZE(CO1)
  1864. LFD f18, 2 * SIZE(CO1)
  1865. LFD f19, 3 * SIZE(CO1)
  1866. FMADD f0, f0, f30, f16
  1867. FMADD f1, f1, f30, f17
  1868. FMADD f2, f2, f30, f18
  1869. FMADD f3, f3, f30, f19
  1870. #else
  1871. FMUL f0, f0, f30
  1872. FMUL f1, f1, f30
  1873. FMUL f2, f2, f30
  1874. FMUL f3, f3, f30
  1875. #endif
  1876. STFD f0, 0 * SIZE(CO1)
  1877. STFD f1, 1 * SIZE(CO1)
  1878. STFD f2, 2 * SIZE(CO1)
  1879. STFD f3, 3 * SIZE(CO1)
  /* Re-zero the accumulators for the next tile.                            */
  1880. lfs f0, FZERO
  1881. fmr f1, f0
  1882. fmr f2, f0
  1883. fmr f3, f0
  1884. #ifdef TRMMKERNEL
  1885. #if ( defined(LEFT) && defined(TRANSA)) || \
  1886. (!defined(LEFT) && !defined(TRANSA))
  /* Skip unconsumed k-steps: TEMP = K - KK - (4 or 1); AO += TEMP*4        */
  /* elements, BO += TEMP elements.                                         */
  1887. sub TEMP, K, KK
  1888. #ifdef LEFT
  1889. addi TEMP, TEMP, -4
  1890. #else
  1891. addi TEMP, TEMP, -1
  1892. #endif
  1893. slwi r0 , TEMP, 2 + BASE_SHIFT
  1894. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1895. add AO, AO, r0
  1896. add BO, BO, TEMP
  1897. #endif
  1898. #ifdef LEFT
  /* LEFT: KK advances by the four rows just written.                       */
  1899. addi KK, KK, 4
  1900. #endif
  1901. #endif
  /* Advance the output pointer and loop while I > 0.                       */
  1902. addi CO1, CO1, 4 * SIZE
  1903. addic. I, I, -1
  1904. bgt+ LL(71)
  1905. .align 4
  /* LL(80)..LL(88): tile of 2 A-values x 1 B-value per k-step, executed    */
  /* when bit 1 of M is set.  In the main loop f0/f1 and f2/f3 take         */
  /* alternate k-steps and are added together at LL(88).                    */
  /* NOTE(review): assumes FMADD d,a,b,c is d = a*b + c and that            */
  /* BASE_SHIFT = log2(SIZE); both are defined outside this chunk.          */
  1906. LL(80):
  1907. andi. I, M, 2
  1908. ble LL(90)
  1909. #if defined(TRMMKERNEL)
  1910. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Preload 4 A elements (two k-steps) and 4 B elements (four k-steps).    */
  1911. LFD f16, 0 * SIZE(AO)
  1912. LFD f17, 1 * SIZE(AO)
  1913. LFD f18, 2 * SIZE(AO)
  1914. LFD f19, 3 * SIZE(AO)
  1915. LFD f20, 0 * SIZE(B)
  1916. LFD f21, 1 * SIZE(B)
  1917. LFD f22, 2 * SIZE(B)
  1918. LFD f23, 3 * SIZE(B)
  1919. mr BO, B
  1920. #else
  /* Skip KK k-steps first: AO += KK*2 elements, BO = B + KK elements.      */
  1921. slwi r0, KK, 1 + BASE_SHIFT
  1922. slwi TEMP, KK, 0 + BASE_SHIFT
  1923. add AO, AO, r0
  1924. add BO, B, TEMP
  1925. LFD f16, 0 * SIZE(AO)
  1926. LFD f17, 1 * SIZE(AO)
  1927. LFD f18, 2 * SIZE(AO)
  1928. LFD f19, 3 * SIZE(AO)
  1929. LFD f20, 0 * SIZE(BO)
  1930. LFD f21, 1 * SIZE(BO)
  1931. LFD f22, 2 * SIZE(BO)
  1932. LFD f23, 3 * SIZE(BO)
  1933. #endif
  /* TEMP = k-steps for this tile: K - KK, KK + 2 or KK + 1.                */
  1934. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1935. sub TEMP, K, KK
  1936. #elif defined(LEFT)
  1937. addi TEMP, KK, 2
  1938. #else
  1939. addi TEMP, KK, 1
  1940. #endif
  1941. srawi. TEMP, TEMP, 2
  1942. mtspr CTR, TEMP
  1943. #else
  /* Plain GEMM: same preload, CTR = K >> 2, BO = B.                        */
  1944. LFD f16, 0 * SIZE(AO)
  1945. LFD f17, 1 * SIZE(AO)
  1946. LFD f18, 2 * SIZE(AO)
  1947. LFD f19, 3 * SIZE(AO)
  1948. LFD f20, 0 * SIZE(B)
  1949. LFD f21, 1 * SIZE(B)
  1950. LFD f22, 2 * SIZE(B)
  1951. LFD f23, 3 * SIZE(B)
  1952. srawi. r0, K, 2
  1953. mtspr CTR, r0
  1954. mr BO, B
  1955. #endif
  1956. ble LL(85)
  1957. .align 5
  /* LL(82): main loop, four k-steps per iteration (8 A, 4 B elements).     */
  1958. LL(82):
  1959. FMADD f0, f16, f20, f0
  1960. FMADD f1, f17, f20, f1
  1961. FMADD f2, f18, f21, f2
  1962. FMADD f3, f19, f21, f3
  1963. LFD f16, 4 * SIZE(AO)
  1964. LFD f17, 5 * SIZE(AO)
  1965. LFD f18, 6 * SIZE(AO)
  1966. LFD f19, 7 * SIZE(AO)
  1967. FMADD f0, f16, f22, f0
  1968. FMADD f1, f17, f22, f1
  1969. FMADD f2, f18, f23, f2
  1970. FMADD f3, f19, f23, f3
  1971. LFD f16, 8 * SIZE(AO)
  1972. LFD f17, 9 * SIZE(AO)
  1973. LFD f18, 10 * SIZE(AO)
  1974. LFD f19, 11 * SIZE(AO)
  1975. LFD f20, 4 * SIZE(BO)
  1976. LFD f21, 5 * SIZE(BO)
  1977. LFD f22, 6 * SIZE(BO)
  1978. LFD f23, 7 * SIZE(BO)
  1979. addi AO, AO, 8 * SIZE
  1980. addi BO, BO, 4 * SIZE
  1981. DCBT(BO, PREB)
  1982. bdnz LL(82)
  1983. .align 4
  /* LL(85): load the scale factor; CTR = leftover k-steps (count & 3).     */
  1984. LL(85):
  1985. lfd f30, ALPHA
  1986. #if defined(TRMMKERNEL)
  1987. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1988. sub TEMP, K, KK
  1989. #elif defined(LEFT)
  1990. addi TEMP, KK, 2
  1991. #else
  1992. addi TEMP, KK, 1
  1993. #endif
  1994. andi. TEMP, TEMP, 3
  1995. mtspr CTR, TEMP
  1996. #else
  1997. andi. r0, K, 3
  1998. mtspr CTR, r0
  1999. #endif
  2000. ble+ LL(88)
  2001. .align 4
  /* LL(86): one k-step per iteration; only f0/f1 are updated here.         */
  2002. LL(86):
  2003. FMADD f0, f16, f20, f0
  2004. FMADD f1, f17, f20, f1
  2005. LFD f16, 2 * SIZE(AO)
  2006. LFD f17, 3 * SIZE(AO)
  2007. LFD f20, 1 * SIZE(BO)
  2008. addi BO, BO, 1 * SIZE
  2009. addi AO, AO, 2 * SIZE
  2010. bdnz LL(86)
  2011. .align 4
  /* LL(88): fold f2/f3 into f0/f1, scale by f30 and store two elements at  */
  /* CO1 (GEMM adds the values previously loaded from CO1).                 */
  2012. LL(88):
  2013. #ifndef TRMMKERNEL
  2014. LFD f16, 0 * SIZE(CO1)
  2015. LFD f17, 1 * SIZE(CO1)
  2016. FADD f0, f2, f0
  2017. FADD f1, f3, f1
  2018. FMADD f0, f0, f30, f16
  2019. FMADD f1, f1, f30, f17
  2020. #else
  2021. FADD f0, f2, f0
  2022. FADD f1, f3, f1
  2023. FMUL f0, f0, f30
  2024. FMUL f1, f1, f30
  2025. #endif
  2026. STFD f0, 0 * SIZE(CO1)
  2027. STFD f1, 1 * SIZE(CO1)
  /* Re-zero the accumulators for the next tile.                            */
  2028. lfs f0, FZERO
  2029. fmr f1, f0
  2030. fmr f2, f0
  2031. fmr f3, f0
  2032. addi CO1, CO1, 2 * SIZE
  2033. #ifdef TRMMKERNEL
  2034. #if ( defined(LEFT) && defined(TRANSA)) || \
  2035. (!defined(LEFT) && !defined(TRANSA))
  /* Skip unconsumed k-steps: TEMP = K - KK - (2 or 1); AO += TEMP*2        */
  /* elements, BO += TEMP elements.                                         */
  2036. sub TEMP, K, KK
  2037. #ifdef LEFT
  2038. addi TEMP, TEMP, -2
  2039. #else
  2040. addi TEMP, TEMP, -1
  2041. #endif
  2042. slwi r0 , TEMP, 1 + BASE_SHIFT
  2043. slwi TEMP, TEMP, 0 + BASE_SHIFT
  2044. add AO, AO, r0
  2045. add BO, BO, TEMP
  2046. #endif
  2047. #ifdef LEFT
  /* LEFT: KK advances by the two rows just written.                        */
  2048. addi KK, KK, 2
  2049. #endif
  2050. #endif
  2051. .align 4
  /* LL(90)..LL(98): 1 A-value x 1 B-value per k-step (a dot product),      */
  /* executed when bit 0 of M is set.  The main loop handles eight k-steps  */
  /* per iteration spread over f0-f3, which are summed at LL(98).           */
  /* NOTE(review): assumes FMADD d,a,b,c is d = a*b + c and that            */
  /* BASE_SHIFT = log2(SIZE); both are defined outside this chunk.          */
  2052. LL(90):
  2053. andi. I, M, 1
  2054. ble LL(999)
  2055. #if defined(TRMMKERNEL)
  2056. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* Preload 4 A and 4 B elements (four k-steps of each).                   */
  2057. LFD f16, 0 * SIZE(AO)
  2058. LFD f17, 1 * SIZE(AO)
  2059. LFD f18, 2 * SIZE(AO)
  2060. LFD f19, 3 * SIZE(AO)
  2061. LFD f20, 0 * SIZE(B)
  2062. LFD f21, 1 * SIZE(B)
  2063. LFD f22, 2 * SIZE(B)
  2064. LFD f23, 3 * SIZE(B)
  2065. mr BO, B
  2066. #else
  /* Skip KK k-steps first: AO += KK elements, BO = B + KK elements.        */
  2067. slwi r0, KK, 0 + BASE_SHIFT
  2068. slwi TEMP, KK, 0 + BASE_SHIFT
  2069. add AO, AO, r0
  2070. add BO, B, TEMP
  2071. LFD f16, 0 * SIZE(AO)
  2072. LFD f17, 1 * SIZE(AO)
  2073. LFD f18, 2 * SIZE(AO)
  2074. LFD f19, 3 * SIZE(AO)
  2075. LFD f20, 0 * SIZE(BO)
  2076. LFD f21, 1 * SIZE(BO)
  2077. LFD f22, 2 * SIZE(BO)
  2078. LFD f23, 3 * SIZE(BO)
  2079. #endif
  /* TEMP = k-steps for this tile: K - KK, or KK + 1 (both other branches). */
  2080. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2081. sub TEMP, K, KK
  2082. #elif defined(LEFT)
  2083. addi TEMP, KK, 1
  2084. #else
  2085. addi TEMP, KK, 1
  2086. #endif
  /* CTR = TEMP >> 3: eight k-steps per main-loop iteration.                */
  2087. srawi. TEMP, TEMP, 3
  2088. mtspr CTR, TEMP
  2089. #else
  /* Plain GEMM: same preload, CTR = K >> 3, BO = B.                        */
  2090. LFD f16, 0 * SIZE(AO)
  2091. LFD f17, 1 * SIZE(AO)
  2092. LFD f18, 2 * SIZE(AO)
  2093. LFD f19, 3 * SIZE(AO)
  2094. LFD f20, 0 * SIZE(B)
  2095. LFD f21, 1 * SIZE(B)
  2096. LFD f22, 2 * SIZE(B)
  2097. LFD f23, 3 * SIZE(B)
  2098. srawi. r0, K, 3
  2099. mtspr CTR, r0
  2100. mr BO, B
  2101. #endif
  2102. ble LL(95)
  2103. .align 5
  /* LL(92): main loop, eight k-steps per iteration (8 A, 8 B elements).    */
  2104. LL(92):
  2105. FMADD f0, f16, f20, f0
  2106. FMADD f1, f17, f21, f1
  2107. FMADD f2, f18, f22, f2
  2108. FMADD f3, f19, f23, f3
  2109. LFD f16, 4 * SIZE(AO)
  2110. LFD f17, 5 * SIZE(AO)
  2111. LFD f18, 6 * SIZE(AO)
  2112. LFD f19, 7 * SIZE(AO)
  2113. LFD f20, 4 * SIZE(BO)
  2114. LFD f21, 5 * SIZE(BO)
  2115. LFD f22, 6 * SIZE(BO)
  2116. LFD f23, 7 * SIZE(BO)
  2117. FMADD f0, f16, f20, f0
  2118. FMADD f1, f17, f21, f1
  2119. FMADD f2, f18, f22, f2
  2120. FMADD f3, f19, f23, f3
  2121. LFD f16, 8 * SIZE(AO)
  2122. LFD f17, 9 * SIZE(AO)
  2123. LFD f18, 10 * SIZE(AO)
  2124. LFD f19, 11 * SIZE(AO)
  2125. LFD f20, 8 * SIZE(BO)
  2126. LFD f21, 9 * SIZE(BO)
  2127. LFD f22, 10 * SIZE(BO)
  2128. LFD f23, 11 * SIZE(BO)
  2129. addi AO, AO, 8 * SIZE
  2130. addi BO, BO, 8 * SIZE
  2131. bdnz LL(92)
  2132. .align 4
  /* LL(95): load the scale factor; CTR = leftover k-steps (count & 7).     */
  2133. LL(95):
  2134. lfd f30, ALPHA
  2135. #if defined(TRMMKERNEL)
  2136. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2137. sub TEMP, K, KK
  2138. #elif defined(LEFT)
  2139. addi TEMP, KK, 1
  2140. #else
  2141. addi TEMP, KK, 1
  2142. #endif
  2143. andi. TEMP, TEMP, 7
  2144. mtspr CTR, TEMP
  2145. #else
  2146. andi. r0, K, 7
  2147. mtspr CTR, r0
  2148. #endif
  2149. ble+ LL(98)
  2150. .align 4
  /* LL(96): one k-step per iteration, accumulated into f0 only.            */
  2151. LL(96):
  2152. FMADD f0, f16, f20, f0
  2153. LFD f16, 1 * SIZE(AO)
  2154. LFD f20, 1 * SIZE(BO)
  2155. addi BO, BO, 1 * SIZE
  2156. addi AO, AO, 1 * SIZE
  2157. bdnz LL(96)
  2158. .align 4
  /* LL(98): f0 = (f0 + f1) + (f2 + f3), scaled by f30 and stored to CO1    */
  /* (GEMM adds the value previously loaded from CO1).  Control then falls  */
  /* through to LL(999) with no further pointer or KK updates.              */
  2159. LL(98):
  2160. #ifndef TRMMKERNEL
  2161. LFD f16, 0 * SIZE(CO1)
  2162. FADD f0, f1, f0
  2163. FADD f2, f3, f2
  2164. FADD f0, f2, f0
  2165. FMADD f0, f0, f30, f16
  2166. #else
  2167. FADD f0, f1, f0
  2168. FADD f2, f3, f2
  2169. FADD f0, f2, f0
  2170. FMUL f0, f0, f30
  2171. #endif
  2172. STFD f0, 0 * SIZE(CO1)
  2173. .align 4
  /* LL(999): common exit.  Sets r3 to 0, reloads f14-f31 and r20-r31 (plus */
  /* r18/r19 in TRMM/TRSM builds) from the frame at SP, releases STACKSIZE  */
  /* bytes of stack and returns.                                            */
  /* NOTE(review): the matching stores are in the prologue, outside this    */
  /* chunk - offsets here must stay in sync with it.                        */
  2174. LL(999):
  /* r3 = 0 (addi with RA = 0 uses the literal value zero).                 */
  2175. addi r3, 0, 0
  /* Floating-point registers: 18 doublewords at SP+0 .. SP+136.            */
  2176. lfd f14, 0(SP)
  2177. lfd f15, 8(SP)
  2178. lfd f16, 16(SP)
  2179. lfd f17, 24(SP)
  2180. lfd f18, 32(SP)
  2181. lfd f19, 40(SP)
  2182. lfd f20, 48(SP)
  2183. lfd f21, 56(SP)
  2184. lfd f22, 64(SP)
  2185. lfd f23, 72(SP)
  2186. lfd f24, 80(SP)
  2187. lfd f25, 88(SP)
  2188. lfd f26, 96(SP)
  2189. lfd f27, 104(SP)
  2190. lfd f28, 112(SP)
  2191. lfd f29, 120(SP)
  2192. lfd f30, 128(SP)
  2193. lfd f31, 136(SP)
  2194. #ifdef __64BIT__
  /* 64-bit build: integer registers are 8-byte slots starting at SP+144.   */
  2195. ld r31, 144(SP)
  2196. ld r30, 152(SP)
  2197. ld r29, 160(SP)
  2198. ld r28, 168(SP)
  2199. ld r27, 176(SP)
  2200. ld r26, 184(SP)
  2201. ld r25, 192(SP)
  2202. ld r24, 200(SP)
  2203. ld r23, 208(SP)
  2204. ld r22, 216(SP)
  2205. ld r21, 224(SP)
  2206. ld r20, 232(SP)
  2207. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  2208. ld r19, 240(SP)
  2209. ld r18, 248(SP)
  2210. #endif
  2211. #else
  /* 32-bit build: integer registers are 4-byte slots starting at SP+144.   */
  2212. lwz r31, 144(SP)
  2213. lwz r30, 148(SP)
  2214. lwz r29, 152(SP)
  2215. lwz r28, 156(SP)
  2216. lwz r27, 160(SP)
  2217. lwz r26, 164(SP)
  2218. lwz r25, 168(SP)
  2219. lwz r24, 172(SP)
  2220. lwz r23, 176(SP)
  2221. lwz r22, 180(SP)
  2222. lwz r21, 184(SP)
  2223. lwz r20, 188(SP)
  2224. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  2225. lwz r19, 192(SP)
  2226. lwz r18, 196(SP)
  2227. #endif
  2228. #endif
  /* Pop the frame and return to the caller.                                */
  2229. addi SP, SP, STACKSIZE
  2230. blr
  2231. EPILOGUE
  2232. #endif