You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_cell.S 47 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by pointer width (lwz for 32-bit, ld for 64-bit). */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /*
   * Stack frame layout.
   *   STACKSIZE - bytes subtracted from SP in the prologue.
   *   ALPHA     - frame slot that receives f1 in the prologue and is reloaded
   *               into f30 before each write-back of C.
   *   FZERO     - frame slot that receives a zero word in the prologue and is
   *               read back with lfs to clear the accumulators.
   */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /*
   * Register aliases.
   *
   * M, N, K are the dimension arguments: each is tested for <= 0 in the
   * prologue and then used as a loop bound (M for the row blocks, N for the
   * column panels, K for the inner product length).
   */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /*
   * A, B, C, LDC and OFFSET arrive in different registers depending on the
   * platform ABI and on whether the build is 64-bit / DOUBLE.
   * A and B are the input operand pointers, C is the output pointer,
   * LDC is the column stride of C (scaled to bytes in the prologue) and
   * OFFSET is only read when TRMMKERNEL is defined.
   */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /*
   * Working registers (all non-volatile, saved in the prologue):
   *   AORIG   - not referenced in the part of the file shown here
   *   TEMP    - scratch for TRMM loop counts and pointer adjustments
   *   KK      - running k offset used only by the TRMMKERNEL paths
   *   I, J    - row-block counter and column-panel counter
   *   AO, BO  - moving read pointers into A and B
   *   CO1-CO4 - write pointers for up to four consecutive columns of C
   *   PREA, PREB, PREC - byte offsets used by the dcbt / dcbtst prefetch hints
   */
  87. #define AORIG r18
  88. #define TEMP r19
  89. #define KK r20
  90. #define I r21
  91. #define J r22
  92. #define AO r23
  93. #define BO r24
  94. #define CO1 r25
  95. #define CO2 r26
  96. #define CO3 r27
  97. #define CO4 r28
  98. #define PREA r29
  99. #define PREB r30
  100. #define PREC r31
  101. #ifndef NEEDPARAM
  102. #ifndef DOUBLE
  103. #include "../sparam.h"
  104. #else
  105. #include "../dparam.h"
  106. #endif
  /*
   * Function entry.
   * Allocates the frame, saves the non-volatile FP and integer registers the
   * kernel uses, parks alpha and a zero constant in the frame, fetches the
   * arguments that do not arrive in registers, and returns early through
   * LL(999) (defined outside this view) when M, N or K is not positive.
   */
  107. PROLOGUE
  108. PROFCODE
  109. addi SP, SP, -STACKSIZE
  /* r0 = 0; stored to FZERO below. */
  110. li r0, 0
  /* Save f14-f31 at frame offsets 0..136. */
  111. stfd f14, 0(SP)
  112. stfd f15, 8(SP)
  113. stfd f16, 16(SP)
  114. stfd f17, 24(SP)
  115. stfd f18, 32(SP)
  116. stfd f19, 40(SP)
  117. stfd f20, 48(SP)
  118. stfd f21, 56(SP)
  119. stfd f22, 64(SP)
  120. stfd f23, 72(SP)
  121. stfd f24, 80(SP)
  122. stfd f25, 88(SP)
  123. stfd f26, 96(SP)
  124. stfd f27, 104(SP)
  125. stfd f28, 112(SP)
  126. stfd f29, 120(SP)
  127. stfd f30, 128(SP)
  128. stfd f31, 136(SP)
  /* Save r20-r31; r18 and r19 (AORIG, TEMP) are saved only for TRMMKERNEL builds. */
  129. #ifdef __64BIT__
  130. std r31, 144(SP)
  131. std r30, 152(SP)
  132. std r29, 160(SP)
  133. std r28, 168(SP)
  134. std r27, 176(SP)
  135. std r26, 184(SP)
  136. std r25, 192(SP)
  137. std r24, 200(SP)
  138. std r23, 208(SP)
  139. std r22, 216(SP)
  140. std r21, 224(SP)
  141. std r20, 232(SP)
  142. #if defined(TRMMKERNEL)
  143. std r19, 240(SP)
  144. std r18, 248(SP)
  145. #endif
  146. #else
  147. stw r31, 144(SP)
  148. stw r30, 148(SP)
  149. stw r29, 152(SP)
  150. stw r28, 156(SP)
  151. stw r27, 160(SP)
  152. stw r26, 164(SP)
  153. stw r25, 168(SP)
  154. stw r24, 172(SP)
  155. stw r23, 176(SP)
  156. stw r22, 180(SP)
  157. stw r21, 184(SP)
  158. stw r20, 188(SP)
  159. #if defined(TRMMKERNEL)
  160. stw r19, 192(SP)
  161. stw r18, 196(SP)
  162. #endif
  163. #endif
  /* Park alpha (f1 on entry) and a 32-bit zero in the frame. */
  164. stfd f1, ALPHA
  165. stw r0, FZERO
  /* On 32-bit AIX/Apple DOUBLE builds LDC is read from the caller's frame. */
  166. #if defined(_AIX) || defined(__APPLE__)
  167. #if !defined(__64BIT__) && defined(DOUBLE)
  168. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  169. #endif
  170. #endif
  /* Shift LDC left by BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT comes from common.h). */
  171. slwi LDC, LDC, BASE_SHIFT
  /* TRMM only: fetch OFFSET from the caller's frame where the ABI puts it there. */
  172. #if defined(TRMMKERNEL)
  173. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  174. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  175. #endif
  176. #if defined(_AIX) || defined(__APPLE__)
  177. #ifdef __64BIT__
  178. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  179. #else
  180. #ifdef DOUBLE
  181. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  182. #else
  183. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  184. #endif
  185. #endif
  186. #endif
  187. #endif
  /* TRMM without LEFT: KK starts at -OFFSET and grows by the panel width after each panel. */
  188. #if defined(TRMMKERNEL) && !defined(LEFT)
  189. neg KK, OFFSET
  190. #endif
  /* Nothing to do when any dimension is <= 0. */
  191. cmpwi cr0, M, 0
  192. ble LL(999)
  193. cmpwi cr0, N, 0
  194. ble LL(999)
  195. cmpwi cr0, K, 0
  196. ble LL(999)
  /*
   * Prefetch-distance setup for PREA / PREB / PREC, the byte offsets handed
   * to the dcbt / dcbtst hints in the loops below.
   *
   * Normal build (PREFETCHTEST undefined): fixed constants are used; the
   * block at the bottom sets all three registers.
   * PREFETCHTEST build: the three values are read from r10 or from slots in
   * the caller's frame, at positions that depend on the ABI (presumably
   * tuning parameters passed by a test harness - confirm against the caller).
   *
   * Fixes relative to the previous text:
   *   - the caller-supplied loads are now in the #else branch, so they run
   *     only for PREFETCHTEST builds; before, they ran in normal builds
   *     (where the constants below overwrite them) and PREFETCHTEST builds
   *     left the registers unset;
   *   - a stray "xc" token in front of the AIX/Apple 64-bit "ld PREA" is gone.
   */
  197. #ifndef PREFETCHTEST
  198. /* Normal prefetch */
  199. #ifdef CELL
  200. li PREC, 4 * SIZE
  201. #endif
  #else
  202. #if defined(linux) || defined(__FreeBSD__)
  203. #ifndef __64BIT__
  204. mr PREA, r10
  205. lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
  206. lwz PREC, FRAMESLOT(1) + STACKSIZE(SP)
  207. #else
  208. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  209. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  210. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  211. #endif
  212. #endif
  213. #if defined(_AIX) || defined(__APPLE__)
  214. #ifdef __64BIT__
  215. ld PREA, FRAMESLOT(0) + STACKSIZE(SP)
  216. ld PREB, FRAMESLOT(1) + STACKSIZE(SP)
  217. ld PREC, FRAMESLOT(2) + STACKSIZE(SP)
  218. #else
  219. #ifdef DOUBLE
  220. lwz PREA, FRAMESLOT(1) + STACKSIZE(SP)
  221. lwz PREB, FRAMESLOT(2) + STACKSIZE(SP)
  222. lwz PREC, FRAMESLOT(3) + STACKSIZE(SP)
  223. #else
  224. lwz PREA, FRAMESLOT(0) + STACKSIZE(SP)
  225. lwz PREB, FRAMESLOT(1) + STACKSIZE(SP)
  226. lwz PREC, FRAMESLOT(2) + STACKSIZE(SP)
  227. #endif
  228. #endif
  229. #endif
  230. #endif
  /* Normal build: final constants (this PREC value replaces the CELL one set above). */
  231. #ifndef PREFETCHTEST
  232. li PREC, 3 * SIZE
  233. li PREA, 16 * 12 * SIZE
  234. li PREB, 16 * 12 * SIZE
  235. #endif
  /*
   * Outer loop over panels of four columns: J = N >> 2.
   * When there is no full panel, control goes straight to LL(40).
   */
  236. srawi. J, N, 2
  237. ble LL(40)
  238. .align 4
  239. LL(10):
  /* CO1..CO4 = the four consecutive columns of C written by this panel. */
  240. mr CO1, C
  241. add CO2, C, LDC
  242. add CO3, CO2, LDC
  243. add CO4, CO3, LDC
  /* TRMM with LEFT: KK restarts from OFFSET for every panel. */
  244. #if defined(TRMMKERNEL) && defined(LEFT)
  245. mr KK, OFFSET
  246. #endif
  /* Clear the sixteen accumulators f0-f15 from the zero word stored at FZERO. */
  247. lfs f0, FZERO
  248. fmr f1, f0
  249. fmr f2, f0
  250. fmr f3, f0
  251. fmr f4, f0
  252. fmr f5, f0
  253. fmr f6, f0
  254. fmr f7, f0
  255. fmr f8, f0
  256. fmr f9, f0
  257. fmr f10, f0
  258. fmr f11, f0
  259. fmr f12, f0
  260. fmr f13, f0
  261. fmr f14, f0
  262. fmr f15, f0
  /* I = M >> 2 blocks of four rows; AO restarts at A; C moves on to the next panel. */
  263. srawi. I, M, 2
  264. mr AO, A
  265. add C, CO4, LDC
  /* No full four-row block: handle the row tails starting at LL(20). */
  266. ble LL(20)
  /*
   * LL(11): set-up for one 4-row by 4-column tile.
   * Preloads the first A values (f16-f19, f24-f26) and B values (f20-f23,
   * f28-f30) consumed by the unrolled loop at LL(12), issues store-prefetch
   * hints for the four C columns, and puts the number of 4-step iterations
   * in CTR. Branches to LL(15) when that count is not positive.
   */
  267. .align 4
  268. LL(11):
  269. #if defined(TRMMKERNEL)
  /* TRMM, first variant: start from the beginning of the B panel. */
  270. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  271. LFD f16, 0 * SIZE(AO)
  272. LFD f17, 1 * SIZE(AO)
  273. LFD f18, 2 * SIZE(AO)
  274. LFD f19, 3 * SIZE(AO)
  275. LFD f20, 0 * SIZE(B)
  276. LFD f21, 1 * SIZE(B)
  277. LFD f22, 2 * SIZE(B)
  278. LFD f23, 3 * SIZE(B)
  279. LFD f24, 4 * SIZE(AO)
  280. LFD f25, 5 * SIZE(AO)
  281. LFD f26, 6 * SIZE(AO)
  282. LFD f28, 4 * SIZE(B)
  283. LFD f29, 5 * SIZE(B)
  284. LFD f30, 6 * SIZE(B)
  285. mr BO, B
  286. #else
  /* TRMM, second variant: skip KK steps of four elements in both A and B. */
  287. slwi r0, KK, 2 + BASE_SHIFT
  288. add AO, AO, r0
  289. add BO, B, r0
  290. LFD f16, 0 * SIZE(AO)
  291. LFD f17, 1 * SIZE(AO)
  292. LFD f18, 2 * SIZE(AO)
  293. LFD f19, 3 * SIZE(AO)
  294. LFD f20, 0 * SIZE(BO)
  295. LFD f21, 1 * SIZE(BO)
  296. LFD f22, 2 * SIZE(BO)
  297. LFD f23, 3 * SIZE(BO)
  298. LFD f24, 4 * SIZE(AO)
  299. LFD f25, 5 * SIZE(AO)
  300. LFD f26, 6 * SIZE(AO)
  301. LFD f28, 4 * SIZE(BO)
  302. LFD f29, 5 * SIZE(BO)
  303. LFD f30, 6 * SIZE(BO)
  304. #endif
  /* Store-prefetch hints for the C tile. */
  305. dcbtst CO1, PREC
  306. dcbtst CO2, PREC
  307. dcbtst CO3, PREC
  308. dcbtst CO4, PREC
  /* TRMM inner length: K - KK, or KK + 4 (tile is 4 wide in both directions). */
  309. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  310. sub TEMP, K, KK
  311. #elif defined(LEFT)
  312. addi TEMP, KK, 4
  313. #else
  314. addi TEMP, KK, 4
  315. #endif
  /* CTR = length / 4 (the loop at LL(12) consumes four steps per pass). */
  316. srawi. TEMP, TEMP, 2
  317. mtspr CTR, TEMP
  318. ble LL(15)
  319. #else
  /* Plain GEMM: preload from AO and B, CTR = K / 4, BO = B. */
  320. LFD f16, 0 * SIZE(AO)
  321. LFD f17, 1 * SIZE(AO)
  322. LFD f18, 2 * SIZE(AO)
  323. LFD f19, 3 * SIZE(AO)
  324. LFD f20, 0 * SIZE(B)
  325. LFD f21, 1 * SIZE(B)
  326. LFD f22, 2 * SIZE(B)
  327. LFD f23, 3 * SIZE(B)
  328. LFD f24, 4 * SIZE(AO)
  329. LFD f25, 5 * SIZE(AO)
  330. LFD f26, 6 * SIZE(AO)
  331. LFD f28, 4 * SIZE(B)
  332. LFD f29, 5 * SIZE(B)
  333. LFD f30, 6 * SIZE(B)
  334. dcbtst CO1, PREC
  335. dcbtst CO2, PREC
  336. dcbtst CO3, PREC
  337. dcbtst CO4, PREC
  338. srawi. r0, K, 2
  339. mtspr CTR, r0
  340. mr BO, B
  341. ble LL(15)
  342. #endif
  /*
   * LL(12): main loop for the 4x4 tile, four inner-product steps per pass.
   *
   * Accumulator layout: f0-f3 hold column 0 (rows 0-3), f4-f7 column 1,
   * f8-f11 column 2, f12-f15 column 3 - this is the order in which LL(18)
   * stores them to CO1..CO4.
   * A values alternate between f16-f19 and f24-f27, B values between
   * f20-f23 and f28-f31; each register is reloaded right after its last use
   * so the loads for the next step overlap the current multiplies.
   * The loads at offsets 16..22 at the bottom refill the registers that the
   * set-up code preloaded, ready for the next pass (AO and BO advance by
   * 16 elements just afterwards).
   *
   * NOP1 / NOP2 move a register onto itself, so they change no state; they
   * fill the slots between FMADDs that carry no load (presumably to keep the
   * issue pattern regular on the target core - not stated in this file).
   */
  343. .align 4
  344. #define NOP1 mr r18, r18
  345. #define NOP2 mr r19, r19
  346. LL(12):
  /* Step 0: A in f16-f19, B in f20-f23; also fetch f31 / f27 needed by step 1. */
  347. FMADD f0, f16, f20, f0
  348. dcbt AO, PREA
  349. FMADD f4, f16, f21, f4
  350. dcbt BO, PREB
  351. FMADD f8, f16, f22, f8
  352. LFD f31, 7 * SIZE(BO)
  353. FMADD f12, f16, f23, f12
  354. LFD f27, 7 * SIZE(AO)
  355. FMADD f1, f17, f20, f1
  356. LFD f16, 8 * SIZE(AO)
  357. FMADD f5, f17, f21, f5
  358. NOP2
  359. FMADD f9, f17, f22, f9
  360. NOP1
  361. FMADD f13, f17, f23, f13
  362. LFD f17, 9 * SIZE(AO)
  363. FMADD f2, f18, f20, f2
  364. NOP1
  365. FMADD f6, f18, f21, f6
  366. NOP2
  367. FMADD f10, f18, f22, f10
  368. NOP1
  369. FMADD f14, f18, f23, f14
  370. LFD f18, 10 * SIZE(AO)
  371. FMADD f3, f19, f20, f3
  372. LFD f20, 8 * SIZE(BO)
  373. FMADD f7, f19, f21, f7
  374. LFD f21, 9 * SIZE(BO)
  375. FMADD f11, f19, f22, f11
  376. LFD f22, 10 * SIZE(BO)
  377. FMADD f15, f19, f23, f15
  378. LFD f19, 11 * SIZE(AO)
  /* Step 1: A in f24-f27, B in f28-f31. */
  379. FMADD f0, f24, f28, f0
  380. LFD f23, 11 * SIZE(BO)
  381. FMADD f4, f24, f29, f4
  382. NOP2
  383. FMADD f8, f24, f30, f8
  384. NOP1
  385. FMADD f12, f24, f31, f12
  386. LFD f24, 12 * SIZE(AO)
  387. FMADD f1, f25, f28, f1
  388. NOP1
  389. FMADD f5, f25, f29, f5
  390. NOP2
  391. FMADD f9, f25, f30, f9
  392. NOP1
  393. FMADD f13, f25, f31, f13
  394. LFD f25, 13 * SIZE(AO)
  395. FMADD f2, f26, f28, f2
  396. NOP1
  397. FMADD f6, f26, f29, f6
  398. NOP2
  399. FMADD f10, f26, f30, f10
  400. NOP1
  401. FMADD f14, f26, f31, f14
  402. LFD f26, 14 * SIZE(AO)
  403. FMADD f3, f27, f28, f3
  404. LFD f28, 12 * SIZE(BO)
  405. FMADD f7, f27, f29, f7
  406. LFD f29, 13 * SIZE(BO)
  407. FMADD f11, f27, f30, f11
  408. LFD f30, 14 * SIZE(BO)
  409. FMADD f15, f27, f31, f15
  410. LFD f27, 15 * SIZE(AO)
  /* Step 2: A in f16-f19 (elements 8-11), B in f20-f23 (elements 8-11). */
  411. FMADD f0, f16, f20, f0
  412. LFD f31, 15 * SIZE(BO)
  413. FMADD f4, f16, f21, f4
  414. NOP2
  415. FMADD f8, f16, f22, f8
  416. NOP1
  417. FMADD f12, f16, f23, f12
  418. LFD f16, 16 * SIZE(AO)
  419. FMADD f1, f17, f20, f1
  420. NOP1
  421. FMADD f5, f17, f21, f5
  422. NOP2
  423. FMADD f9, f17, f22, f9
  424. NOP1
  425. FMADD f13, f17, f23, f13
  426. LFD f17, 17 * SIZE(AO)
  427. FMADD f2, f18, f20, f2
  428. NOP1
  429. FMADD f6, f18, f21, f6
  430. NOP2
  431. FMADD f10, f18, f22, f10
  432. NOP1
  433. FMADD f14, f18, f23, f14
  434. LFD f18, 18 * SIZE(AO)
  435. FMADD f3, f19, f20, f3
  436. LFD f20, 16 * SIZE(BO)
  437. FMADD f7, f19, f21, f7
  438. LFD f21, 17 * SIZE(BO)
  439. FMADD f11, f19, f22, f11
  440. LFD f22, 18 * SIZE(BO)
  441. FMADD f15, f19, f23, f15
  442. LFD f19, 19 * SIZE(AO)
  /* Step 3: A in f24-f27 (elements 12-15), B in f28-f31 (elements 12-15). */
  443. FMADD f0, f24, f28, f0
  444. LFD f23, 19 * SIZE(BO)
  445. FMADD f4, f24, f29, f4
  446. NOP2
  447. FMADD f8, f24, f30, f8
  448. NOP1
  449. FMADD f12, f24, f31, f12
  450. LFD f24, 20 * SIZE(AO)
  451. FMADD f1, f25, f28, f1
  452. NOP1
  453. FMADD f5, f25, f29, f5
  454. NOP2
  455. FMADD f9, f25, f30, f9
  456. NOP1
  457. FMADD f13, f25, f31, f13
  458. LFD f25, 21 * SIZE(AO)
  459. FMADD f2, f26, f28, f2
  460. NOP1
  461. FMADD f6, f26, f29, f6
  462. NOP2
  463. FMADD f10, f26, f30, f10
  464. NOP1
  465. FMADD f14, f26, f31, f14
  466. LFD f26, 22 * SIZE(AO)
  467. FMADD f3, f27, f28, f3
  468. LFD f28, 20 * SIZE(BO)
  469. FMADD f7, f27, f29, f7
  470. LFD f29, 21 * SIZE(BO)
  471. FMADD f11, f27, f30, f11
  472. LFD f30, 22 * SIZE(BO)
  473. FMADD f15, f27, f31, f15
  /* Advance both streams by four steps of four elements and loop on CTR. */
  474. addi AO, AO, 16 * SIZE
  475. addi BO, BO, 16 * SIZE
  476. bdnz LL(12)
  /*
   * LL(15): remainder of the inner product for the 4x4 tile.
   * Reloads alpha into f30 (f30 was a B preload register until here), sets
   * CTR to the inner length modulo 4 and skips to LL(18) when it is zero.
   */
  477. .align 4
  478. LL(15):
  479. lfd f30, ALPHA
  480. #if defined(TRMMKERNEL)
  /* Same TRMM length as computed in LL(11); only its low two bits are used here. */
  481. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  482. sub TEMP, K, KK
  483. #elif defined(LEFT)
  484. addi TEMP, KK, 4
  485. #else
  486. addi TEMP, KK, 4
  487. #endif
  488. andi. TEMP, TEMP, 3
  489. mtspr CTR, TEMP
  490. #else
  491. andi. r0, K, 3
  492. mtspr CTR, r0
  493. #endif
  494. ble+ LL(18)
  495. .align 4
  /* LL(16): one inner-product step per pass; reloads A/B for the next step and advances by 4 elements. */
  496. LL(16):
  497. FMADD f0, f16, f20, f0
  498. FMADD f5, f17, f21, f5
  499. FMADD f10, f18, f22, f10
  500. FMADD f15, f19, f23, f15
  501. FMADD f1, f17, f20, f1
  502. FMADD f2, f18, f20, f2
  503. FMADD f3, f19, f20, f3
  504. FMADD f4, f16, f21, f4
  505. FMADD f6, f18, f21, f6
  506. FMADD f7, f19, f21, f7
  507. FMADD f8, f16, f22, f8
  508. FMADD f9, f17, f22, f9
  509. FMADD f11, f19, f22, f11
  510. FMADD f12, f16, f23, f12
  511. FMADD f13, f17, f23, f13
  512. FMADD f14, f18, f23, f14
  513. LFD f16, 4 * SIZE(AO)
  514. LFD f17, 5 * SIZE(AO)
  515. LFD f18, 6 * SIZE(AO)
  516. LFD f19, 7 * SIZE(AO)
  517. LFD f20, 4 * SIZE(BO)
  518. LFD f21, 5 * SIZE(BO)
  519. LFD f22, 6 * SIZE(BO)
  520. LFD f23, 7 * SIZE(BO)
  521. addi BO, BO, 4 * SIZE
  522. addi AO, AO, 4 * SIZE
  523. bdnz LL(16)
  /*
   * LL(18): write back the 4x4 tile.
   * Plain GEMM: C = acc * alpha + C (old C is loaded first).
   * TRMM:       C = acc * alpha     (old C is not read).
   * The accumulators are cleared again between the stores, the four column
   * pointers advance by four elements, and the loop returns to LL(11) while
   * I is still positive.
   */
  524. .align 4
  525. LL(18):
  526. #ifndef TRMMKERNEL
  /* Columns 0 and 1. */
  527. LFD f16, 0 * SIZE(CO1)
  528. LFD f17, 1 * SIZE(CO1)
  529. LFD f18, 2 * SIZE(CO1)
  530. LFD f19, 3 * SIZE(CO1)
  531. LFD f20, 0 * SIZE(CO2)
  532. LFD f21, 1 * SIZE(CO2)
  533. LFD f22, 2 * SIZE(CO2)
  534. LFD f23, 3 * SIZE(CO2)
  535. FMADD f0, f0, f30, f16
  536. FMADD f1, f1, f30, f17
  537. FMADD f2, f2, f30, f18
  538. FMADD f3, f3, f30, f19
  539. FMADD f4, f4, f30, f20
  540. FMADD f5, f5, f30, f21
  541. FMADD f6, f6, f30, f22
  542. FMADD f7, f7, f30, f23
  /* Columns 2 and 3. */
  543. LFD f16, 0 * SIZE(CO3)
  544. LFD f17, 1 * SIZE(CO3)
  545. LFD f18, 2 * SIZE(CO3)
  546. LFD f19, 3 * SIZE(CO3)
  547. LFD f20, 0 * SIZE(CO4)
  548. LFD f21, 1 * SIZE(CO4)
  549. LFD f22, 2 * SIZE(CO4)
  550. LFD f23, 3 * SIZE(CO4)
  551. FMADD f8, f8, f30, f16
  552. FMADD f9, f9, f30, f17
  553. FMADD f10, f10, f30, f18
  554. FMADD f11, f11, f30, f19
  555. FMADD f12, f12, f30, f20
  556. FMADD f13, f13, f30, f21
  557. FMADD f14, f14, f30, f22
  558. FMADD f15, f15, f30, f23
  559. #else
  560. FMUL f0, f0, f30
  561. FMUL f1, f1, f30
  562. FMUL f2, f2, f30
  563. FMUL f3, f3, f30
  564. FMUL f4, f4, f30
  565. FMUL f5, f5, f30
  566. FMUL f6, f6, f30
  567. FMUL f7, f7, f30
  568. FMUL f8, f8, f30
  569. FMUL f9, f9, f30
  570. FMUL f10, f10, f30
  571. FMUL f11, f11, f30
  572. FMUL f12, f12, f30
  573. FMUL f13, f13, f30
  574. FMUL f14, f14, f30
  575. FMUL f15, f15, f30
  576. #endif
  /* Store each column, then zero its accumulators for the next tile. */
  577. STFD f0, 0 * SIZE(CO1)
  578. STFD f1, 1 * SIZE(CO1)
  579. STFD f2, 2 * SIZE(CO1)
  580. STFD f3, 3 * SIZE(CO1)
  581. lfs f0, FZERO
  582. fmr f1, f0
  583. fmr f2, f0
  584. fmr f3, f0
  585. STFD f4, 0 * SIZE(CO2)
  586. STFD f5, 1 * SIZE(CO2)
  587. STFD f6, 2 * SIZE(CO2)
  588. STFD f7, 3 * SIZE(CO2)
  589. fmr f4, f0
  590. fmr f5, f0
  591. fmr f6, f0
  592. fmr f7, f0
  593. STFD f8, 0 * SIZE(CO3)
  594. STFD f9, 1 * SIZE(CO3)
  595. STFD f10, 2 * SIZE(CO3)
  596. STFD f11, 3 * SIZE(CO3)
  597. fmr f8, f0
  598. fmr f9, f0
  599. fmr f10, f0
  600. fmr f11, f0
  601. STFD f12, 0 * SIZE(CO4)
  602. STFD f13, 1 * SIZE(CO4)
  603. STFD f14, 2 * SIZE(CO4)
  604. STFD f15, 3 * SIZE(CO4)
  605. fmr f12, f0
  606. fmr f13, f0
  607. fmr f14, f0
  608. fmr f15, f0
  609. addi CO1, CO1, 4 * SIZE
  610. addi CO2, CO2, 4 * SIZE
  611. addi CO3, CO3, 4 * SIZE
  612. addi CO4, CO4, 4 * SIZE
  /* TRMM: step AO and BO over the (K - KK - 4) steps this tile did not visit; LEFT also advances KK by the tile height. */
  613. #ifdef TRMMKERNEL
  614. #if ( defined(LEFT) && defined(TRANSA)) || \
  615. (!defined(LEFT) && !defined(TRANSA))
  616. sub TEMP, K, KK
  617. #ifdef LEFT
  618. addi TEMP, TEMP, -4
  619. #else
  620. addi TEMP, TEMP, -4
  621. #endif
  622. slwi TEMP, TEMP, 2 + BASE_SHIFT
  623. add AO, AO, TEMP
  624. add BO, BO, TEMP
  625. #endif
  626. #ifdef LEFT
  627. addi KK, KK, 4
  628. #endif
  629. #endif
  /* Next four-row block. */
  630. addic. I, I, -1
  631. bgt+ LL(11)
  /*
   * LL(20): set-up for the 2-row by 4-column tail (taken when M has bit 1 set).
   * Preloads two inner-product steps: f16/f17 and f18/f19 from A (two rows
   * each), f20-f23 and f24-f27 from B (four columns each). CTR receives the
   * number of 4-step passes for LL(22); a non-positive count goes to LL(25).
   */
  632. .align 4
  633. LL(20):
  634. andi. I, M, 2
  635. ble LL(30)
  636. #if defined(TRMMKERNEL)
  637. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  638. LFD f16, 0 * SIZE(AO)
  639. LFD f17, 1 * SIZE(AO)
  640. LFD f18, 2 * SIZE(AO)
  641. LFD f19, 3 * SIZE(AO)
  642. LFD f20, 0 * SIZE(B)
  643. LFD f21, 1 * SIZE(B)
  644. LFD f22, 2 * SIZE(B)
  645. LFD f23, 3 * SIZE(B)
  646. LFD f24, 4 * SIZE(B)
  647. LFD f25, 5 * SIZE(B)
  648. LFD f26, 6 * SIZE(B)
  649. LFD f27, 7 * SIZE(B)
  650. mr BO, B
  651. #else
  /* Skip KK steps: two elements per step in A, four per step in B. */
  652. slwi r0, KK, 1 + BASE_SHIFT
  653. slwi TEMP, KK, 2 + BASE_SHIFT
  654. add AO, AO, r0
  655. add BO, B, TEMP
  656. LFD f16, 0 * SIZE(AO)
  657. LFD f17, 1 * SIZE(AO)
  658. LFD f18, 2 * SIZE(AO)
  659. LFD f19, 3 * SIZE(AO)
  660. LFD f20, 0 * SIZE(BO)
  661. LFD f21, 1 * SIZE(BO)
  662. LFD f22, 2 * SIZE(BO)
  663. LFD f23, 3 * SIZE(BO)
  664. LFD f24, 4 * SIZE(BO)
  665. LFD f25, 5 * SIZE(BO)
  666. LFD f26, 6 * SIZE(BO)
  667. LFD f27, 7 * SIZE(BO)
  668. #endif
  /* TRMM inner length: K - KK, or KK + 2 (tile height, LEFT) / KK + 4 (tile width, otherwise). */
  669. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  670. sub TEMP, K, KK
  671. #elif defined(LEFT)
  672. addi TEMP, KK, 2
  673. #else
  674. addi TEMP, KK, 4
  675. #endif
  676. srawi. TEMP, TEMP, 2
  677. mtspr CTR, TEMP
  678. #else
  679. LFD f16, 0 * SIZE(AO)
  680. LFD f17, 1 * SIZE(AO)
  681. LFD f18, 2 * SIZE(AO)
  682. LFD f19, 3 * SIZE(AO)
  683. LFD f20, 0 * SIZE(B)
  684. LFD f21, 1 * SIZE(B)
  685. LFD f22, 2 * SIZE(B)
  686. LFD f23, 3 * SIZE(B)
  687. LFD f24, 4 * SIZE(B)
  688. LFD f25, 5 * SIZE(B)
  689. LFD f26, 6 * SIZE(B)
  690. LFD f27, 7 * SIZE(B)
  691. srawi. r0, K, 2
  692. mtspr CTR, r0
  693. mr BO, B
  694. #endif
  695. ble LL(25)
  /*
   * LL(22): main loop for the 2x4 tail, four inner-product steps per pass
   * (AO advances 8 elements, BO 16).
   * Two accumulator sets are used so consecutive steps do not depend on each
   * other: even steps add into f0,f1 / f4,f5 / f8,f9 / f12,f13 and odd steps
   * into f2,f3 / f6,f7 / f10,f11 / f14,f15. The fadd sequence after the loop
   * folds the odd set into the even one; it is skipped when the loop is
   * bypassed, in which case the odd set is still zero.
   */
  696. .align 5
  697. LL(22):
  /* Step 0 (rows in f16/f17, B in f20-f23). */
  698. FMADD f0, f16, f20, f0
  699. FMADD f1, f17, f20, f1
  700. FMADD f4, f16, f21, f4
  701. FMADD f5, f17, f21, f5
  702. FMADD f8, f16, f22, f8
  703. FMADD f9, f17, f22, f9
  704. FMADD f12, f16, f23, f12
  705. FMADD f13, f17, f23, f13
  706. LFD f20, 8 * SIZE(BO)
  707. LFD f21, 9 * SIZE(BO)
  708. LFD f22, 10 * SIZE(BO)
  709. LFD f23, 11 * SIZE(BO)
  /* Step 1 (rows in f18/f19, B in f24-f27). */
  710. FMADD f2, f18, f24, f2
  711. FMADD f3, f19, f24, f3
  712. FMADD f6, f18, f25, f6
  713. FMADD f7, f19, f25, f7
  714. FMADD f10, f18, f26, f10
  715. FMADD f11, f19, f26, f11
  716. FMADD f14, f18, f27, f14
  717. FMADD f15, f19, f27, f15
  718. LFD f16, 4 * SIZE(AO)
  719. LFD f17, 5 * SIZE(AO)
  720. LFD f18, 6 * SIZE(AO)
  721. LFD f19, 7 * SIZE(AO)
  /* Step 2. */
  722. FMADD f0, f16, f20, f0
  723. FMADD f1, f17, f20, f1
  724. FMADD f4, f16, f21, f4
  725. FMADD f5, f17, f21, f5
  726. LFD f24, 12 * SIZE(BO)
  727. LFD f25, 13 * SIZE(BO)
  728. LFD f26, 14 * SIZE(BO)
  729. LFD f27, 15 * SIZE(BO)
  730. FMADD f8, f16, f22, f8
  731. FMADD f9, f17, f22, f9
  732. FMADD f12, f16, f23, f12
  733. FMADD f13, f17, f23, f13
  734. LFD f20, 16 * SIZE(BO)
  735. LFD f21, 17 * SIZE(BO)
  736. LFD f22, 18 * SIZE(BO)
  737. LFD f23, 19 * SIZE(BO)
  /* Step 3. */
  738. FMADD f2, f18, f24, f2
  739. FMADD f3, f19, f24, f3
  740. FMADD f6, f18, f25, f6
  741. FMADD f7, f19, f25, f7
  742. FMADD f10, f18, f26, f10
  743. FMADD f11, f19, f26, f11
  744. FMADD f14, f18, f27, f14
  745. FMADD f15, f19, f27, f15
  /* Refill the preload registers for the next pass, then advance the pointers. */
  746. LFD f16, 8 * SIZE(AO)
  747. LFD f17, 9 * SIZE(AO)
  748. LFD f18, 10 * SIZE(AO)
  749. LFD f19, 11 * SIZE(AO)
  750. LFD f24, 20 * SIZE(BO)
  751. LFD f25, 21 * SIZE(BO)
  752. LFD f26, 22 * SIZE(BO)
  753. LFD f27, 23 * SIZE(BO)
  754. addi AO, AO, 8 * SIZE
  755. addi BO, BO, 16 * SIZE
  756. dcbt 0, BO, PREB
  757. bdnz LL(22)
  /* Fold the odd-step accumulators into the even-step ones. */
  758. fadd f0, f2, f0
  759. fadd f1, f3, f1
  760. fadd f4, f6, f4
  761. fadd f5, f7, f5
  762. fadd f8, f10, f8
  763. fadd f9, f11, f9
  764. fadd f12, f14, f12
  765. fadd f13, f15, f13
  /*
   * LL(25): remainder of the inner product for the 2x4 tail.
   * Loads alpha into f30, sets CTR to the inner length modulo 4 and goes to
   * LL(28) when nothing is left.
   */
  766. .align 4
  767. LL(25):
  768. lfd f30, ALPHA
  769. #if defined(TRMMKERNEL)
  770. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  771. sub TEMP, K, KK
  772. #elif defined(LEFT)
  773. addi TEMP, KK, 2
  774. #else
  775. addi TEMP, KK, 4
  776. #endif
  777. andi. TEMP, TEMP, 3
  778. mtspr CTR, TEMP
  779. #else
  780. andi. r0, K, 3
  781. mtspr CTR, r0
  782. #endif
  783. ble+ LL(28)
  784. .align 4
  /* LL(26): one step per pass - two rows of A (f16/f17) against four columns of B (f20-f23). */
  785. LL(26):
  786. FMADD f0, f16, f20, f0
  787. FMADD f1, f17, f20, f1
  788. FMADD f4, f16, f21, f4
  789. FMADD f5, f17, f21, f5
  790. FMADD f8, f16, f22, f8
  791. FMADD f9, f17, f22, f9
  792. FMADD f12, f16, f23, f12
  793. FMADD f13, f17, f23, f13
  794. LFD f16, 2 * SIZE(AO)
  795. LFD f17, 3 * SIZE(AO)
  796. LFD f20, 4 * SIZE(BO)
  797. LFD f21, 5 * SIZE(BO)
  798. LFD f22, 6 * SIZE(BO)
  799. LFD f23, 7 * SIZE(BO)
  800. addi BO, BO, 4 * SIZE
  801. addi AO, AO, 2 * SIZE
  802. bdnz LL(26)
  /*
   * LL(28): write back the 2x4 tail.
   * Results live in f0,f1 (column 0), f4,f5 (column 1), f8,f9 (column 2) and
   * f12,f13 (column 3). Plain GEMM adds alpha * acc to the old C values;
   * TRMM stores alpha * acc. All sixteen accumulators are cleared afterwards
   * and the column pointers advance by two elements.
   */
  803. .align 4
  804. LL(28):
  805. #ifndef TRMMKERNEL
  806. LFD f16, 0 * SIZE(CO1)
  807. LFD f17, 1 * SIZE(CO1)
  808. LFD f18, 0 * SIZE(CO2)
  809. LFD f19, 1 * SIZE(CO2)
  810. FMADD f0, f0, f30, f16
  811. FMADD f1, f1, f30, f17
  812. FMADD f4, f4, f30, f18
  813. FMADD f5, f5, f30, f19
  814. LFD f20, 0 * SIZE(CO3)
  815. LFD f21, 1 * SIZE(CO3)
  816. LFD f22, 0 * SIZE(CO4)
  817. LFD f23, 1 * SIZE(CO4)
  818. FMADD f8, f8, f30, f20
  819. FMADD f9, f9, f30, f21
  820. FMADD f12, f12, f30, f22
  821. FMADD f13, f13, f30, f23
  822. #else
  823. FMUL f0, f0, f30
  824. FMUL f1, f1, f30
  825. FMUL f4, f4, f30
  826. FMUL f5, f5, f30
  827. FMUL f8, f8, f30
  828. FMUL f9, f9, f30
  829. FMUL f12, f12, f30
  830. FMUL f13, f13, f30
  831. #endif
  832. STFD f0, 0 * SIZE(CO1)
  833. STFD f1, 1 * SIZE(CO1)
  834. STFD f4, 0 * SIZE(CO2)
  835. STFD f5, 1 * SIZE(CO2)
  /* Re-zero the accumulators, interleaved with the remaining stores. */
  836. lfs f0, FZERO
  837. fmr f1, f0
  838. fmr f2, f0
  839. fmr f3, f0
  840. STFD f8, 0 * SIZE(CO3)
  841. STFD f9, 1 * SIZE(CO3)
  842. STFD f12, 0 * SIZE(CO4)
  843. STFD f13, 1 * SIZE(CO4)
  844. fmr f4, f0
  845. fmr f5, f0
  846. fmr f6, f0
  847. fmr f7, f0
  848. fmr f8, f0
  849. fmr f9, f0
  850. fmr f10, f0
  851. fmr f11, f0
  852. fmr f12, f0
  853. fmr f13, f0
  854. fmr f14, f0
  855. fmr f15, f0
  856. addi CO1, CO1, 2 * SIZE
  857. addi CO2, CO2, 2 * SIZE
  858. addi CO3, CO3, 2 * SIZE
  859. addi CO4, CO4, 2 * SIZE
  /* TRMM: step AO (2 elements per step) and BO (4 per step) over the unvisited steps; LEFT advances KK by 2. */
  860. #ifdef TRMMKERNEL
  861. #if ( defined(LEFT) && defined(TRANSA)) || \
  862. (!defined(LEFT) && !defined(TRANSA))
  863. sub TEMP, K, KK
  864. #ifdef LEFT
  865. addi TEMP, TEMP, -2
  866. #else
  867. addi TEMP, TEMP, -4
  868. #endif
  869. slwi r0, TEMP, 1 + BASE_SHIFT
  870. slwi TEMP, TEMP, 2 + BASE_SHIFT
  871. add AO, AO, r0
  872. add BO, BO, TEMP
  873. #endif
  874. #ifdef LEFT
  875. addi KK, KK, 2
  876. #endif
  877. #endif
  /*
   * LL(30): set-up for the single-row by 4-column tail (taken when M is odd).
   * f16-f19 receive the A values of four consecutive steps (one element per
   * step), f20-f23 and f24-f27 the B values of the first two steps. CTR
   * receives the number of 4-step passes for LL(32); a non-positive count
   * goes to LL(35).
   */
  878. .align 4
  879. LL(30):
  880. andi. I, M, 1
  881. ble LL(39)
  882. #if defined(TRMMKERNEL)
  883. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  884. LFD f16, 0 * SIZE(AO)
  885. LFD f17, 1 * SIZE(AO)
  886. LFD f18, 2 * SIZE(AO)
  887. LFD f19, 3 * SIZE(AO)
  888. LFD f20, 0 * SIZE(B)
  889. LFD f21, 1 * SIZE(B)
  890. LFD f22, 2 * SIZE(B)
  891. LFD f23, 3 * SIZE(B)
  892. LFD f24, 4 * SIZE(B)
  893. LFD f25, 5 * SIZE(B)
  894. LFD f26, 6 * SIZE(B)
  895. LFD f27, 7 * SIZE(B)
  896. mr BO, B
  897. #else
  /* Skip KK steps: one element per step in A, four per step in B. */
  898. slwi r0, KK, 0 + BASE_SHIFT
  899. slwi TEMP, KK, 2 + BASE_SHIFT
  900. add AO, AO, r0
  901. add BO, B, TEMP
  902. LFD f16, 0 * SIZE(AO)
  903. LFD f17, 1 * SIZE(AO)
  904. LFD f18, 2 * SIZE(AO)
  905. LFD f19, 3 * SIZE(AO)
  906. LFD f20, 0 * SIZE(BO)
  907. LFD f21, 1 * SIZE(BO)
  908. LFD f22, 2 * SIZE(BO)
  909. LFD f23, 3 * SIZE(BO)
  910. LFD f24, 4 * SIZE(BO)
  911. LFD f25, 5 * SIZE(BO)
  912. LFD f26, 6 * SIZE(BO)
  913. LFD f27, 7 * SIZE(BO)
  914. #endif
  /* TRMM inner length: K - KK, or KK + 1 (LEFT) / KK + 4 (otherwise). */
  915. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  916. sub TEMP, K, KK
  917. #elif defined(LEFT)
  918. addi TEMP, KK, 1
  919. #else
  920. addi TEMP, KK, 4
  921. #endif
  922. srawi. TEMP, TEMP, 2
  923. mtspr CTR, TEMP
  924. #else
  925. LFD f16, 0 * SIZE(AO)
  926. LFD f17, 1 * SIZE(AO)
  927. LFD f18, 2 * SIZE(AO)
  928. LFD f19, 3 * SIZE(AO)
  929. LFD f20, 0 * SIZE(B)
  930. LFD f21, 1 * SIZE(B)
  931. LFD f22, 2 * SIZE(B)
  932. LFD f23, 3 * SIZE(B)
  933. LFD f24, 4 * SIZE(B)
  934. LFD f25, 5 * SIZE(B)
  935. LFD f26, 6 * SIZE(B)
  936. LFD f27, 7 * SIZE(B)
  937. srawi. r0, K, 2
  938. mtspr CTR, r0
  939. mr BO, B
  940. #endif
  941. ble LL(35)
  /*
   * LL(32): main loop for the 1x4 tail, four inner-product steps per pass
   * (AO advances 4 elements, BO 16).
   * Even steps (A in f16, f18) accumulate into f0, f4, f8, f12; odd steps
   * (A in f17, f19) into f1, f5, f9, f13. The fadd sequence after the loop
   * folds the odd set into the even one.
   */
  942. .align 5
  943. LL(32):
  /* Step 0. */
  944. FMADD f0, f16, f20, f0
  945. FMADD f4, f16, f21, f4
  946. FMADD f8, f16, f22, f8
  947. FMADD f12, f16, f23, f12
  948. LFD f20, 8 * SIZE(BO)
  949. LFD f21, 9 * SIZE(BO)
  950. LFD f22, 10 * SIZE(BO)
  951. LFD f23, 11 * SIZE(BO)
  /* Step 1. */
  952. FMADD f1, f17, f24, f1
  953. FMADD f5, f17, f25, f5
  954. FMADD f9, f17, f26, f9
  955. FMADD f13, f17, f27, f13
  956. LFD f24, 12 * SIZE(BO)
  957. LFD f25, 13 * SIZE(BO)
  958. LFD f26, 14 * SIZE(BO)
  959. LFD f27, 15 * SIZE(BO)
  /* Step 2. */
  960. FMADD f0, f18, f20, f0
  961. FMADD f4, f18, f21, f4
  962. FMADD f8, f18, f22, f8
  963. FMADD f12, f18, f23, f12
  964. LFD f20, 16 * SIZE(BO)
  965. LFD f21, 17 * SIZE(BO)
  966. LFD f22, 18 * SIZE(BO)
  967. LFD f23, 19 * SIZE(BO)
  /* Step 3. */
  968. FMADD f1, f19, f24, f1
  969. FMADD f5, f19, f25, f5
  970. FMADD f9, f19, f26, f9
  971. FMADD f13, f19, f27, f13
  /* Refill the preload registers for the next pass, then advance the pointers. */
  972. LFD f16, 4 * SIZE(AO)
  973. LFD f17, 5 * SIZE(AO)
  974. LFD f18, 6 * SIZE(AO)
  975. LFD f19, 7 * SIZE(AO)
  976. LFD f24, 20 * SIZE(BO)
  977. LFD f25, 21 * SIZE(BO)
  978. LFD f26, 22 * SIZE(BO)
  979. LFD f27, 23 * SIZE(BO)
  980. addi AO, AO, 4 * SIZE
  981. addi BO, BO, 16 * SIZE
  982. dcbt 0, BO, PREB
  983. bdnz LL(32)
  /* Fold the odd-step accumulators into the even-step ones. */
  984. fadd f0, f1, f0
  985. fadd f4, f5, f4
  986. fadd f8, f9, f8
  987. fadd f12, f13, f12
  /*
   * LL(35): remainder of the inner product for the 1x4 tail.
   * Loads alpha into f30, sets CTR to the inner length modulo 4 and goes to
   * LL(38) when nothing is left.
   */
  988. .align 4
  989. LL(35):
  990. lfd f30, ALPHA
  991. #if defined(TRMMKERNEL)
  992. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  993. sub TEMP, K, KK
  994. #elif defined(LEFT)
  995. addi TEMP, KK, 1
  996. #else
  997. addi TEMP, KK, 4
  998. #endif
  999. andi. TEMP, TEMP, 3
  1000. mtspr CTR, TEMP
  1001. #else
  1002. andi. r0, K, 3
  1003. mtspr CTR, r0
  1004. #endif
  1005. ble+ LL(38)
  1006. .align 4
  /* LL(36): one step per pass - one A element (f16) against four B elements (f20-f23). */
  1007. LL(36):
  1008. FMADD f0, f16, f20, f0
  1009. FMADD f4, f16, f21, f4
  1010. FMADD f8, f16, f22, f8
  1011. FMADD f12, f16, f23, f12
  1012. LFD f16, 1 * SIZE(AO)
  1013. LFD f20, 4 * SIZE(BO)
  1014. LFD f21, 5 * SIZE(BO)
  1015. LFD f22, 6 * SIZE(BO)
  1016. LFD f23, 7 * SIZE(BO)
  1017. addi BO, BO, 4 * SIZE
  1018. addi AO, AO, 1 * SIZE
  1019. bdnz LL(36)
  /*
   * LL(38): write back the single-row by 4-column tail.
   * Results live in f0, f4, f8, f12 (one per column). Plain GEMM adds
   * alpha * acc to the old C values; TRMM stores alpha * acc. The eight
   * accumulators this path used (f0,f1,f4,f5,f8,f9,f12,f13) are cleared
   * afterwards.
   *
   * Fix: with LEFT defined, KK now advances by 1 - the number of rows this
   * tail handles - instead of 2. This matches the other counts used for the
   * same tail (KK + 1 for the loop length, -1 in the pointer adjustment) and
   * the 4-row / 2-row paths, which advance KK by 4 and 2.
   */
  1020. .align 4
  1021. LL(38):
  1022. #ifndef TRMMKERNEL
  1023. LFD f16, 0 * SIZE(CO1)
  1024. LFD f18, 0 * SIZE(CO2)
  1025. LFD f20, 0 * SIZE(CO3)
  1026. LFD f22, 0 * SIZE(CO4)
  1027. FMADD f0, f0, f30, f16
  1028. FMADD f4, f4, f30, f18
  1029. FMADD f8, f8, f30, f20
  1030. FMADD f12, f12, f30, f22
  1031. #else
  1032. FMUL f0, f0, f30
  1033. FMUL f4, f4, f30
  1034. FMUL f8, f8, f30
  1035. FMUL f12, f12, f30
  1036. #endif
  1037. STFD f0, 0 * SIZE(CO1)
  1038. STFD f4, 0 * SIZE(CO2)
  1039. STFD f8, 0 * SIZE(CO3)
  1040. STFD f12, 0 * SIZE(CO4)
  /* Re-zero the accumulators touched by the 1x4 path. */
  1041. lfs f0, FZERO
  1042. fmr f1, f0
  1043. fmr f4, f0
  1044. fmr f5, f0
  1045. fmr f8, f0
  1046. fmr f9, f0
  1047. fmr f12, f0
  1048. fmr f13, f0
  /* TRMM: step AO (1 element per step) and BO (4 per step) over the unvisited steps. */
  1049. #ifdef TRMMKERNEL
  1050. #if ( defined(LEFT) && defined(TRANSA)) || \
  1051. (!defined(LEFT) && !defined(TRANSA))
  1052. sub TEMP, K, KK
  1053. #ifdef LEFT
  1054. addi TEMP, TEMP, -1
  1055. #else
  1056. addi TEMP, TEMP, -4
  1057. #endif
  1058. slwi r0, TEMP, 0 + BASE_SHIFT
  1059. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1060. add AO, AO, r0
  1061. add BO, BO, TEMP
  1062. #endif
  /* One row was processed, so KK moves on by one. */
  1063. #ifdef LEFT
  1064. addi KK, KK, 1
  1065. #endif
  1066. #endif
  /*
   * LL(39): end of one four-column panel.
   * TRMM without LEFT advances KK by the panel width. B takes the value of
   * BO, i.e. the position the B read pointer reached while processing this
   * panel, and the outer loop repeats while J is still positive.
   */
  1067. .align 4
  1068. LL(39):
  1069. #if defined(TRMMKERNEL) && !defined(LEFT)
  1070. addi KK, KK, 4
  1071. #endif
  1072. mr B, BO
  1073. addic. J, J, -1
  1074. bgt LL(10)
  /*
   * LL(40): two-column tail of N (taken when N has bit 1 set; otherwise
   * control continues at LL(70), outside this view).
   * Sets up the two column pointers, clears accumulators f0-f7, and prepares
   * the loop over four-row blocks (I = M >> 2); with no full block the row
   * tails start at LL(50), also outside this view.
   */
  1075. .align 4
  1076. LL(40):
  1077. mr CO1, C
  1078. add CO2, C, LDC
  1079. andi. J, N, 2
  1080. ble LL(70)
  /* TRMM with LEFT: KK restarts from OFFSET for this panel. */
  1081. #if defined(TRMMKERNEL) && defined(LEFT)
  1082. mr KK, OFFSET
  1083. #endif
  1084. lfs f0, FZERO
  1085. fmr f1, f0
  1086. fmr f2, f0
  1087. fmr f3, f0
  1088. fmr f4, f0
  1089. fmr f5, f0
  1090. fmr f6, f0
  1091. fmr f7, f0
  /* C moves past the two columns handled here; AO restarts at A. */
  1092. srawi. I, M, 2
  1093. add C, CO2, LDC
  1094. mr AO, A
  1095. ble LL(50)
  /*
   * LL(41): set-up for one 4-row by 2-column tile.
   * Preloads one step of A (f16-f19, four rows) and two steps of B
   * (f20/f21 and f22/f23, two columns each), issues prefetch hints for the
   * two C columns, and puts the number of 4-step passes in CTR. A
   * non-positive count goes to LL(45).
   */
  1096. .align 4
  1097. LL(41):
  1098. #if defined(TRMMKERNEL)
  1099. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1100. LFD f16, 0 * SIZE(AO)
  1101. LFD f17, 1 * SIZE(AO)
  1102. LFD f18, 2 * SIZE(AO)
  1103. LFD f19, 3 * SIZE(AO)
  1104. LFD f20, 0 * SIZE(B)
  1105. LFD f21, 1 * SIZE(B)
  1106. LFD f22, 2 * SIZE(B)
  1107. LFD f23, 3 * SIZE(B)
  1108. mr BO, B
  1109. #else
  /* Skip KK steps: four elements per step in A, two per step in B. */
  1110. slwi r0, KK, 2 + BASE_SHIFT
  1111. slwi TEMP, KK, 1 + BASE_SHIFT
  1112. add AO, AO, r0
  1113. add BO, B, TEMP
  1114. LFD f16, 0 * SIZE(AO)
  1115. LFD f17, 1 * SIZE(AO)
  1116. LFD f18, 2 * SIZE(AO)
  1117. LFD f19, 3 * SIZE(AO)
  1118. LFD f20, 0 * SIZE(BO)
  1119. LFD f21, 1 * SIZE(BO)
  1120. LFD f22, 2 * SIZE(BO)
  1121. LFD f23, 3 * SIZE(BO)
  1122. #endif
  1123. dcbt CO1, PREC
  1124. dcbt CO2, PREC
  /* TRMM inner length: K - KK, or KK + 4 (tile height, LEFT) / KK + 2 (tile width, otherwise). */
  1125. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1126. sub TEMP, K, KK
  1127. #elif defined(LEFT)
  1128. addi TEMP, KK, 4
  1129. #else
  1130. addi TEMP, KK, 2
  1131. #endif
  1132. srawi. TEMP, TEMP, 2
  1133. mtspr CTR, TEMP
  1134. #else
  1135. LFD f16, 0 * SIZE(AO)
  1136. LFD f17, 1 * SIZE(AO)
  1137. LFD f18, 2 * SIZE(AO)
  1138. LFD f19, 3 * SIZE(AO)
  1139. LFD f20, 0 * SIZE(B)
  1140. LFD f21, 1 * SIZE(B)
  1141. LFD f22, 2 * SIZE(B)
  1142. LFD f23, 3 * SIZE(B)
  1143. dcbt CO1, PREC
  1144. dcbt CO2, PREC
  1145. srawi. r0, K, 2
  1146. mtspr CTR, r0
  1147. mr BO, B
  1148. #endif
  1149. ble LL(45)
  /*
   * LL(42): main loop for the 4x2 tile, four inner-product steps per pass
   * (AO advances 16 elements, BO 8).
   * f0-f3 accumulate column 0 (rows 0-3) and f4-f7 column 1. A is reloaded
   * into f16-f19 before every step; B is consumed two steps at a time from
   * f20/f21 (first step of a pair) and f22/f23 (second step).
   */
  1150. .align 5
  1151. LL(42):
  /* Step 0. */
  1152. FMADD f0, f16, f20, f0
  1153. FMADD f1, f17, f20, f1
  1154. FMADD f2, f18, f20, f2
  1155. FMADD f3, f19, f20, f3
  1156. FMADD f4, f16, f21, f4
  1157. FMADD f5, f17, f21, f5
  1158. FMADD f6, f18, f21, f6
  1159. FMADD f7, f19, f21, f7
  1160. LFD f16, 4 * SIZE(AO)
  1161. LFD f17, 5 * SIZE(AO)
  1162. LFD f18, 6 * SIZE(AO)
  1163. LFD f19, 7 * SIZE(AO)
  /* Step 1. */
  1164. FMADD f0, f16, f22, f0
  1165. FMADD f1, f17, f22, f1
  1166. FMADD f2, f18, f22, f2
  1167. FMADD f3, f19, f22, f3
  1168. FMADD f4, f16, f23, f4
  1169. FMADD f5, f17, f23, f5
  1170. FMADD f6, f18, f23, f6
  1171. FMADD f7, f19, f23, f7
  1172. LFD f16, 8 * SIZE(AO)
  1173. LFD f17, 9 * SIZE(AO)
  1174. LFD f18, 10 * SIZE(AO)
  1175. LFD f19, 11 * SIZE(AO)
  1176. LFD f20, 4 * SIZE(BO)
  1177. LFD f21, 5 * SIZE(BO)
  1178. LFD f22, 6 * SIZE(BO)
  1179. LFD f23, 7 * SIZE(BO)
  /* Step 2. */
  1180. FMADD f0, f16, f20, f0
  1181. FMADD f1, f17, f20, f1
  1182. FMADD f2, f18, f20, f2
  1183. FMADD f3, f19, f20, f3
  1184. FMADD f4, f16, f21, f4
  1185. FMADD f5, f17, f21, f5
  1186. FMADD f6, f18, f21, f6
  1187. FMADD f7, f19, f21, f7
  1188. LFD f16, 12 * SIZE(AO)
  1189. LFD f17, 13 * SIZE(AO)
  1190. LFD f18, 14 * SIZE(AO)
  1191. LFD f19, 15 * SIZE(AO)
  /* Step 3. */
  1192. FMADD f0, f16, f22, f0
  1193. FMADD f1, f17, f22, f1
  1194. FMADD f2, f18, f22, f2
  1195. FMADD f3, f19, f22, f3
  1196. FMADD f4, f16, f23, f4
  1197. FMADD f5, f17, f23, f5
  1198. FMADD f6, f18, f23, f6
  1199. FMADD f7, f19, f23, f7
  /* Refill the preload registers for the next pass, then advance the pointers. */
  1200. LFD f16, 16 * SIZE(AO)
  1201. LFD f17, 17 * SIZE(AO)
  1202. LFD f18, 18 * SIZE(AO)
  1203. LFD f19, 19 * SIZE(AO)
  1204. LFD f20, 8 * SIZE(BO)
  1205. LFD f21, 9 * SIZE(BO)
  1206. LFD f22, 10 * SIZE(BO)
  1207. LFD f23, 11 * SIZE(BO)
  1208. addi AO, AO, 16 * SIZE
  1209. addi BO, BO, 8 * SIZE
  1210. dcbt 0, BO, PREB
  1211. bdnz LL(42)
  1212. .align 4
  /* LL(45): load alpha into f30 and set CTR to the number of k-steps left
     over after the 4-way unrolled loop LL(42) (length & 3). */
  1213. LL(45):
  1214. lfd f30, ALPHA
  1215. #if defined(TRMMKERNEL)
  /* TRMM build: the length is K - KK, KK + 4 or KK + 2 depending on
     LEFT / TRANSA, mirroring the computation done before LL(42) */
  1216. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1217. sub TEMP, K, KK
  1218. #elif defined(LEFT)
  1219. addi TEMP, KK, 4
  1220. #else
  1221. addi TEMP, KK, 2
  1222. #endif
  1223. andi. TEMP, TEMP, 3
  1224. mtspr CTR, TEMP
  1225. #else
  1226. andi. r0, K, 3
  1227. mtspr CTR, r0
  1228. #endif
  /* andi. set CR0; with mask 3 the result is never negative, so this
     branch is taken exactly when the remainder is zero */
  1229. ble+ LL(48)
  1230. .align 4
  /* LL(46): remainder loop, one k-step per iteration (CTR iterations).
     f0-f3 += A[0..3] * f20, f4-f7 += A[0..3] * f21. */
  1231. LL(46):
  1232. FMADD f0, f16, f20, f0
  1233. FMADD f1, f17, f20, f1
  1234. FMADD f2, f18, f20, f2
  1235. FMADD f3, f19, f20, f3
  1236. FMADD f4, f16, f21, f4
  1237. FMADD f5, f17, f21, f5
  1238. FMADD f6, f18, f21, f6
  1239. FMADD f7, f19, f21, f7
  /* load the operands of the next k-step, then advance AO by 4 and BO by 2
     elements */
  1240. LFD f16, 4 * SIZE(AO)
  1241. LFD f17, 5 * SIZE(AO)
  1242. LFD f18, 6 * SIZE(AO)
  1243. LFD f19, 7 * SIZE(AO)
  1244. LFD f20, 2 * SIZE(BO)
  1245. LFD f21, 3 * SIZE(BO)
  1246. addi BO, BO, 2 * SIZE
  1247. addi AO, AO, 4 * SIZE
  1248. bdnz LL(46)
  1249. .align 4
  /* LL(48): scale the eight accumulated sums by alpha (f30), store f0-f3 to
     CO1[0..3] and f4-f7 to CO2[0..3], clear the accumulators, advance the
     pointers and loop back to LL(41) while I > 0. */
  1250. LL(48):
  1251. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 / CO2 */
  1252. LFD f16, 0 * SIZE(CO1)
  1253. LFD f17, 1 * SIZE(CO1)
  1254. LFD f18, 2 * SIZE(CO1)
  1255. LFD f19, 3 * SIZE(CO1)
  1256. LFD f20, 0 * SIZE(CO2)
  1257. LFD f21, 1 * SIZE(CO2)
  1258. LFD f22, 2 * SIZE(CO2)
  1259. LFD f23, 3 * SIZE(CO2)
  1260. FMADD f0, f0, f30, f16
  1261. FMADD f1, f1, f30, f17
  1262. FMADD f2, f2, f30, f18
  1263. FMADD f3, f3, f30, f19
  1264. FMADD f4, f4, f30, f20
  1265. FMADD f5, f5, f30, f21
  1266. FMADD f6, f6, f30, f22
  1267. FMADD f7, f7, f30, f23
  1268. #else
  /* TRMM build: result = sum * alpha; the old contents are not read */
  1269. FMUL f0, f0, f30
  1270. FMUL f1, f1, f30
  1271. FMUL f2, f2, f30
  1272. FMUL f3, f3, f30
  1273. FMUL f4, f4, f30
  1274. FMUL f5, f5, f30
  1275. FMUL f6, f6, f30
  1276. FMUL f7, f7, f30
  1277. #endif
  1278. STFD f0, 0 * SIZE(CO1)
  1279. STFD f1, 1 * SIZE(CO1)
  1280. STFD f2, 2 * SIZE(CO1)
  1281. STFD f3, 3 * SIZE(CO1)
  /* re-initialise f0-f3 from FZERO (presumably 0.0; defined outside this
     view) as soon as they have been stored */
  1282. lfs f0, FZERO
  1283. fmr f1, f0
  1284. fmr f2, f0
  1285. fmr f3, f0
  1286. STFD f4, 0 * SIZE(CO2)
  1287. STFD f5, 1 * SIZE(CO2)
  1288. STFD f6, 2 * SIZE(CO2)
  1289. STFD f7, 3 * SIZE(CO2)
  1290. fmr f4, f0
  1291. fmr f5, f0
  1292. fmr f6, f0
  1293. fmr f7, f0
  1294. addi CO1, CO1, 4 * SIZE
  1295. addi CO2, CO2, 4 * SIZE
  1296. #ifdef TRMMKERNEL
  1297. #if ( defined(LEFT) && defined(TRANSA)) || \
  1298. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 4 (LEFT) or K - KK - 2; AO advances by
     TEMP << (2 + BASE_SHIFT) bytes, BO by TEMP << (1 + BASE_SHIFT) bytes */
  1299. sub TEMP, K, KK
  1300. #ifdef LEFT
  1301. addi TEMP, TEMP, -4
  1302. #else
  1303. addi TEMP, TEMP, -2
  1304. #endif
  1305. slwi r0, TEMP, 2 + BASE_SHIFT
  1306. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1307. add AO, AO, r0
  1308. add BO, BO, TEMP
  1309. #endif
  1310. #ifdef LEFT
  1311. addi KK, KK, 4
  1312. #endif
  1313. #endif
  /* I counts the remaining tiles of this size; repeat while I > 0 */
  1314. addic. I, I, -1
  1315. bgt+ LL(41)
  1316. .align 4
  /* LL(50): entered when the 4-wide tiles are done.  If bit 1 of M is set,
     process a tile that yields two values for CO1 and two for CO2 (stored
     at LL(58)); otherwise skip to LL(60).  This part preloads f16-f19 from
     A and f20-f27 from B and sets CTR = length >> 2. */
  1317. LL(50):
  1318. andi. I, M, 2
  1319. ble LL(60)
  1320. #if defined(TRMMKERNEL)
  1321. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* start at the current AO and at the beginning of B */
  1322. LFD f16, 0 * SIZE(AO)
  1323. LFD f17, 1 * SIZE(AO)
  1324. LFD f18, 2 * SIZE(AO)
  1325. LFD f19, 3 * SIZE(AO)
  1326. LFD f20, 0 * SIZE(B)
  1327. LFD f21, 1 * SIZE(B)
  1328. LFD f22, 2 * SIZE(B)
  1329. LFD f23, 3 * SIZE(B)
  1330. LFD f24, 4 * SIZE(B)
  1331. LFD f25, 5 * SIZE(B)
  1332. LFD f26, 6 * SIZE(B)
  1333. LFD f27, 7 * SIZE(B)
  1334. mr BO, B
  1335. #else
  /* skip KK k-steps first: both AO and BO move by KK << (1 + BASE_SHIFT)
     bytes */
  1336. slwi r0, KK, 1 + BASE_SHIFT
  1337. slwi TEMP, KK, 1 + BASE_SHIFT
  1338. add AO, AO, r0
  1339. add BO, B, TEMP
  1340. LFD f16, 0 * SIZE(AO)
  1341. LFD f17, 1 * SIZE(AO)
  1342. LFD f18, 2 * SIZE(AO)
  1343. LFD f19, 3 * SIZE(AO)
  1344. LFD f20, 0 * SIZE(BO)
  1345. LFD f21, 1 * SIZE(BO)
  1346. LFD f22, 2 * SIZE(BO)
  1347. LFD f23, 3 * SIZE(BO)
  1348. LFD f24, 4 * SIZE(BO)
  1349. LFD f25, 5 * SIZE(BO)
  1350. LFD f26, 6 * SIZE(BO)
  1351. LFD f27, 7 * SIZE(BO)
  1352. #endif
  /* loop length for the TRMM build: K - KK, or KK + 2 (the #elif and #else
     arms are identical here) */
  1353. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1354. sub TEMP, K, KK
  1355. #elif defined(LEFT)
  1356. addi TEMP, KK, 2
  1357. #else
  1358. addi TEMP, KK, 2
  1359. #endif
  1360. srawi. TEMP, TEMP, 2
  1361. mtspr CTR, TEMP
  1362. #else
  /* plain GEMM build: full length K, starting at AO and B */
  1363. LFD f16, 0 * SIZE(AO)
  1364. LFD f17, 1 * SIZE(AO)
  1365. LFD f18, 2 * SIZE(AO)
  1366. LFD f19, 3 * SIZE(AO)
  1367. LFD f20, 0 * SIZE(B)
  1368. LFD f21, 1 * SIZE(B)
  1369. LFD f22, 2 * SIZE(B)
  1370. LFD f23, 3 * SIZE(B)
  1371. LFD f24, 4 * SIZE(B)
  1372. LFD f25, 5 * SIZE(B)
  1373. LFD f26, 6 * SIZE(B)
  1374. LFD f27, 7 * SIZE(B)
  1375. srawi. r0, K, 2
  1376. mtspr CTR, r0
  1377. mr BO, B
  1378. #endif
  /* uses the condition set by srawi. above: no unrolled passes to run */
  1379. ble LL(55)
  1380. .align 5
  /* LL(52): unrolled loop, four k-steps per pass (8 elements of A and 8 of
     B).  Even k-steps accumulate into f0-f3 and odd k-steps into f4-f7;
     the two partial sums are added together at LL(58). */
  1381. LL(52):
  /* k-steps 0 and 1: A pairs f16/f17 and f18/f19, B pairs f20/f21 and
     f22/f23 */
  1382. FMADD f0, f16, f20, f0
  1383. FMADD f1, f17, f20, f1
  1384. FMADD f2, f16, f21, f2
  1385. FMADD f3, f17, f21, f3
  1386. FMADD f4, f18, f22, f4
  1387. FMADD f5, f19, f22, f5
  1388. FMADD f6, f18, f23, f6
  1389. FMADD f7, f19, f23, f7
  /* A for k-steps 2 and 3; f20-f23 are refilled for the NEXT pass */
  1390. LFD f16, 4 * SIZE(AO)
  1391. LFD f17, 5 * SIZE(AO)
  1392. LFD f18, 6 * SIZE(AO)
  1393. LFD f19, 7 * SIZE(AO)
  1394. LFD f20, 8 * SIZE(BO)
  1395. LFD f21, 9 * SIZE(BO)
  1396. LFD f22, 10 * SIZE(BO)
  1397. LFD f23, 11 * SIZE(BO)
  /* k-steps 2 and 3 use the B values held in f24-f27 */
  1398. FMADD f0, f16, f24, f0
  1399. FMADD f1, f17, f24, f1
  1400. FMADD f2, f16, f25, f2
  1401. FMADD f3, f17, f25, f3
  1402. FMADD f4, f18, f26, f4
  1403. FMADD f5, f19, f26, f5
  1404. FMADD f6, f18, f27, f6
  1405. FMADD f7, f19, f27, f7
  /* read ahead for the next pass, then advance both pointers by 8 */
  1406. LFD f16, 8 * SIZE(AO)
  1407. LFD f17, 9 * SIZE(AO)
  1408. LFD f18, 10 * SIZE(AO)
  1409. LFD f19, 11 * SIZE(AO)
  1410. LFD f24, 12 * SIZE(BO)
  1411. LFD f25, 13 * SIZE(BO)
  1412. LFD f26, 14 * SIZE(BO)
  1413. LFD f27, 15 * SIZE(BO)
  1414. addi AO, AO, 8 * SIZE
  1415. addi BO, BO, 8 * SIZE
  /* cache-touch hint relative to BO; PREB is defined outside this view */
  1416. dcbt 0, BO, PREB
  1417. bdnz LL(52)
  1418. .align 4
  /* LL(55): load alpha into f30 and set CTR to the k-steps left over after
     LL(52) (length & 3); go straight to LL(58) if there are none. */
  1419. LL(55):
  1420. lfd f30, ALPHA
  1421. #if defined(TRMMKERNEL)
  /* same length as used before LL(52): K - KK, or KK + 2 */
  1422. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1423. sub TEMP, K, KK
  1424. #elif defined(LEFT)
  1425. addi TEMP, KK, 2
  1426. #else
  1427. addi TEMP, KK, 2
  1428. #endif
  1429. andi. TEMP, TEMP, 3
  1430. mtspr CTR, TEMP
  1431. #else
  1432. andi. r0, K, 3
  1433. mtspr CTR, r0
  1434. #endif
  /* taken when the masked remainder is zero */
  1435. ble+ LL(58)
  1436. .align 4
  /* LL(56): remainder loop, one k-step per iteration: two A values
     (f16, f17) times two B values (f20, f21) accumulated into f0-f3. */
  1437. LL(56):
  1438. FMADD f0, f16, f20, f0
  1439. FMADD f1, f17, f20, f1
  1440. FMADD f2, f16, f21, f2
  1441. FMADD f3, f17, f21, f3
  /* fetch the next k-step, then advance AO and BO by 2 elements each */
  1442. LFD f16, 2 * SIZE(AO)
  1443. LFD f17, 3 * SIZE(AO)
  1444. LFD f20, 2 * SIZE(BO)
  1445. LFD f21, 3 * SIZE(BO)
  1446. addi BO, BO, 2 * SIZE
  1447. addi AO, AO, 2 * SIZE
  1448. bdnz LL(56)
  1449. .align 4
  /* LL(58): combine the two partial sums (f4-f7 into f0-f3), scale by alpha
     (f30) and store two values to CO1 and two to CO2; then clear f0-f7 and
     advance the pointers. */
  1450. LL(58):
  1451. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 / CO2 */
  1452. LFD f16, 0 * SIZE(CO1)
  1453. LFD f17, 1 * SIZE(CO1)
  1454. LFD f18, 0 * SIZE(CO2)
  1455. LFD f19, 1 * SIZE(CO2)
  1456. FADD f0, f4, f0
  1457. FADD f1, f5, f1
  1458. FADD f2, f6, f2
  1459. FADD f3, f7, f3
  1460. FMADD f0, f0, f30, f16
  1461. FMADD f1, f1, f30, f17
  1462. FMADD f2, f2, f30, f18
  1463. FMADD f3, f3, f30, f19
  1464. #else
  /* TRMM build: result = sum * alpha */
  1465. FADD f0, f4, f0
  1466. FADD f1, f5, f1
  1467. FADD f2, f6, f2
  1468. FADD f3, f7, f3
  1469. FMUL f0, f0, f30
  1470. FMUL f1, f1, f30
  1471. FMUL f2, f2, f30
  1472. FMUL f3, f3, f30
  1473. #endif
  1474. STFD f0, 0 * SIZE(CO1)
  1475. STFD f1, 1 * SIZE(CO1)
  1476. STFD f2, 0 * SIZE(CO2)
  1477. STFD f3, 1 * SIZE(CO2)
  /* re-initialise all eight accumulators from FZERO (presumably 0.0) */
  1478. lfs f0, FZERO
  1479. fmr f1, f0
  1480. fmr f2, f0
  1481. fmr f3, f0
  1482. fmr f4, f0
  1483. fmr f5, f0
  1484. fmr f6, f0
  1485. fmr f7, f0
  1486. addi CO1, CO1, 2 * SIZE
  1487. addi CO2, CO2, 2 * SIZE
  1488. #ifdef TRMMKERNEL
  1489. #if ( defined(LEFT) && defined(TRANSA)) || \
  1490. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 2 in either case; AO and BO both advance by
     TEMP << (1 + BASE_SHIFT) bytes */
  1491. sub TEMP, K, KK
  1492. #ifdef LEFT
  1493. addi TEMP, TEMP, -2
  1494. #else
  1495. addi TEMP, TEMP, -2
  1496. #endif
  1497. slwi r0, TEMP, 1 + BASE_SHIFT
  1498. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1499. add AO, AO, r0
  1500. add BO, BO, TEMP
  1501. #endif
  1502. #ifdef LEFT
  1503. addi KK, KK, 2
  1504. #endif
  1505. #endif
  1506. .align 4
  /* LL(60): if bit 0 of M is set, process a final tile that yields one
     value for CO1 and one for CO2 (stored at LL(68)); otherwise skip to
     LL(69).  Preloads f16-f19 from A and f20-f27 from B and sets
     CTR = length >> 2. */
  1507. LL(60):
  1508. andi. I, M, 1
  1509. ble LL(69)
  1510. #if defined(TRMMKERNEL)
  1511. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* start at the current AO and at the beginning of B */
  1512. LFD f16, 0 * SIZE(AO)
  1513. LFD f17, 1 * SIZE(AO)
  1514. LFD f18, 2 * SIZE(AO)
  1515. LFD f19, 3 * SIZE(AO)
  1516. LFD f20, 0 * SIZE(B)
  1517. LFD f21, 1 * SIZE(B)
  1518. LFD f22, 2 * SIZE(B)
  1519. LFD f23, 3 * SIZE(B)
  1520. LFD f24, 4 * SIZE(B)
  1521. LFD f25, 5 * SIZE(B)
  1522. LFD f26, 6 * SIZE(B)
  1523. LFD f27, 7 * SIZE(B)
  1524. mr BO, B
  1525. #else
  /* skip KK k-steps first: AO moves by KK << BASE_SHIFT bytes, BO by
     KK << (1 + BASE_SHIFT) bytes */
  1526. slwi r0, KK, 0 + BASE_SHIFT
  1527. slwi TEMP, KK, 1 + BASE_SHIFT
  1528. add AO, AO, r0
  1529. add BO, B, TEMP
  1530. LFD f16, 0 * SIZE(AO)
  1531. LFD f17, 1 * SIZE(AO)
  1532. LFD f18, 2 * SIZE(AO)
  1533. LFD f19, 3 * SIZE(AO)
  1534. LFD f20, 0 * SIZE(BO)
  1535. LFD f21, 1 * SIZE(BO)
  1536. LFD f22, 2 * SIZE(BO)
  1537. LFD f23, 3 * SIZE(BO)
  1538. LFD f24, 4 * SIZE(BO)
  1539. LFD f25, 5 * SIZE(BO)
  1540. LFD f26, 6 * SIZE(BO)
  1541. LFD f27, 7 * SIZE(BO)
  1542. #endif
  /* loop length for the TRMM build: K - KK, KK + 1 or KK + 2 */
  1543. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1544. sub TEMP, K, KK
  1545. #elif defined(LEFT)
  1546. addi TEMP, KK, 1
  1547. #else
  1548. addi TEMP, KK, 2
  1549. #endif
  1550. srawi. TEMP, TEMP, 2
  1551. mtspr CTR, TEMP
  1552. #else
  /* plain GEMM build: full length K, starting at AO and B */
  1553. LFD f16, 0 * SIZE(AO)
  1554. LFD f17, 1 * SIZE(AO)
  1555. LFD f18, 2 * SIZE(AO)
  1556. LFD f19, 3 * SIZE(AO)
  1557. LFD f20, 0 * SIZE(B)
  1558. LFD f21, 1 * SIZE(B)
  1559. LFD f22, 2 * SIZE(B)
  1560. LFD f23, 3 * SIZE(B)
  1561. LFD f24, 4 * SIZE(B)
  1562. LFD f25, 5 * SIZE(B)
  1563. LFD f26, 6 * SIZE(B)
  1564. LFD f27, 7 * SIZE(B)
  1565. srawi. r0, K, 2
  1566. mtspr CTR, r0
  1567. mr BO, B
  1568. #endif
  /* uses the condition set by srawi. above: no unrolled passes to run */
  1569. ble LL(65)
  1570. .align 5
  /* LL(62): unrolled loop, four k-steps per pass (4 elements of A, 8 of B).
     k-steps 0 and 2 accumulate into f0/f1, k-steps 1 and 3 into f2/f3; the
     partial sums are added together at LL(68). */
  1571. LL(62):
  /* k-steps 0 and 1: A in f16 / f17, B pairs f20/f21 and f22/f23 */
  1572. FMADD f0, f16, f20, f0
  1573. FMADD f1, f16, f21, f1
  1574. FMADD f2, f17, f22, f2
  1575. FMADD f3, f17, f23, f3
  /* f20-f23 are refilled for the NEXT pass */
  1576. LFD f20, 8 * SIZE(BO)
  1577. LFD f21, 9 * SIZE(BO)
  1578. LFD f22, 10 * SIZE(BO)
  1579. LFD f23, 11 * SIZE(BO)
  /* k-steps 2 and 3: A in f18 / f19, B pairs f24/f25 and f26/f27 */
  1580. FMADD f0, f18, f24, f0
  1581. FMADD f1, f18, f25, f1
  1582. FMADD f2, f19, f26, f2
  1583. FMADD f3, f19, f27, f3
  /* read ahead for the next pass, then advance AO by 4 and BO by 8 */
  1584. LFD f16, 4 * SIZE(AO)
  1585. LFD f17, 5 * SIZE(AO)
  1586. LFD f18, 6 * SIZE(AO)
  1587. LFD f19, 7 * SIZE(AO)
  1588. LFD f24, 12 * SIZE(BO)
  1589. LFD f25, 13 * SIZE(BO)
  1590. LFD f26, 14 * SIZE(BO)
  1591. LFD f27, 15 * SIZE(BO)
  1592. addi AO, AO, 4 * SIZE
  1593. addi BO, BO, 8 * SIZE
  1594. bdnz LL(62)
  1595. .align 4
  /* LL(65): load alpha into f30 and set CTR to the k-steps left over after
     LL(62) (length & 3); go straight to LL(68) if there are none. */
  1596. LL(65):
  1597. lfd f30, ALPHA
  1598. #if defined(TRMMKERNEL)
  /* same length as used before LL(62): K - KK, KK + 1 or KK + 2 */
  1599. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1600. sub TEMP, K, KK
  1601. #elif defined(LEFT)
  1602. addi TEMP, KK, 1
  1603. #else
  1604. addi TEMP, KK, 2
  1605. #endif
  1606. andi. TEMP, TEMP, 3
  1607. mtspr CTR, TEMP
  1608. #else
  1609. andi. r0, K, 3
  1610. mtspr CTR, r0
  1611. #endif
  /* taken when the masked remainder is zero */
  1612. ble+ LL(68)
  1613. .align 4
  /* LL(66): remainder loop, one k-step per iteration: one A value (f16)
     times two B values (f20, f21) accumulated into f0 / f1. */
  1614. LL(66):
  1615. FMADD f0, f16, f20, f0
  1616. FMADD f1, f16, f21, f1
  /* fetch the next k-step, then advance AO by 1 and BO by 2 elements */
  1617. LFD f16, 1 * SIZE(AO)
  1618. LFD f20, 2 * SIZE(BO)
  1619. LFD f21, 3 * SIZE(BO)
  1620. addi BO, BO, 2 * SIZE
  1621. addi AO, AO, 1 * SIZE
  1622. bdnz LL(66)
  1623. .align 4
  /* LL(68): fold the partial sums f2 / f3 into f0 / f1, scale by alpha
     (f30) and store one value to CO1 and one to CO2. */
  1624. LL(68):
  1625. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 / CO2 */
  1626. LFD f16, 0 * SIZE(CO1)
  1627. LFD f18, 0 * SIZE(CO2)
  1628. FADD f0, f2, f0
  1629. FADD f1, f3, f1
  1630. FMADD f0, f0, f30, f16
  1631. FMADD f1, f1, f30, f18
  1632. #else
  /* TRMM build: result = sum * alpha */
  1633. FADD f0, f2, f0
  1634. FADD f1, f3, f1
  1635. FMUL f0, f0, f30
  1636. FMUL f1, f1, f30
  1637. #endif
  1638. STFD f0, 0 * SIZE(CO1)
  1639. STFD f1, 0 * SIZE(CO2)
  /* only f0, f1, f4 and f5 are cleared here; f2 and f3 keep their values.
     LL(70) reloads f0-f3 from FZERO before they are accumulated into
     again. */
  1640. lfs f0, FZERO
  1641. fmr f1, f0
  1642. fmr f4, f0
  1643. fmr f5, f0
  1644. #ifdef TRMMKERNEL
  1645. #if ( defined(LEFT) && defined(TRANSA)) || \
  1646. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 1 (LEFT) or K - KK - 2; AO advances by
     TEMP << BASE_SHIFT bytes, BO by TEMP << (1 + BASE_SHIFT) bytes */
  1647. sub TEMP, K, KK
  1648. #ifdef LEFT
  1649. addi TEMP, TEMP, -1
  1650. #else
  1651. addi TEMP, TEMP, -2
  1652. #endif
  1653. slwi r0, TEMP, 0 + BASE_SHIFT
  1654. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1655. add AO, AO, r0
  1656. add BO, BO, TEMP
  1657. #endif
  1658. #ifdef LEFT
  1659. addi KK, KK, 1
  1660. #endif
  1661. #endif
  1662. .align 4
  /* LL(69): end of the section that writes through CO1 and CO2.  In a
     non-LEFT TRMM build KK is advanced by 2; B is then set to the current
     BO so the following section continues from there. */
  1663. LL(69):
  1664. #if defined(TRMMKERNEL) && !defined(LEFT)
  1665. addi KK, KK, 2
  1666. #endif
  1667. mr B, BO
  1668. .align 4
  /* LL(70): section executed only when bit 0 of N is set; it writes through
     CO1 alone.  Otherwise control goes to the exit code at LL(999). */
  1669. LL(70):
  1670. mr CO1, C
  1671. andi. J, N, 1
  1672. ble LL(999)
  1673. #if defined(TRMMKERNEL) && defined(LEFT)
  /* restart KK from OFFSET for this section */
  1674. mr KK, OFFSET
  1675. #endif
  /* clear accumulators f0-f3 from FZERO (presumably 0.0) */
  1676. lfs f0, FZERO
  1677. fmr f1, f0
  1678. fmr f2, f0
  1679. fmr f3, f0
  /* I = M >> 2 tiles of four values; AO restarts at A.  The branch uses the
     condition set by srawi. and skips to LL(80) when I <= 0. */
  1680. srawi. I, M, 2
  1681. mr AO, A
  1682. ble LL(80)
  1683. .align 4
  /* LL(71): top of the per-tile loop (repeated I times, see LL(78)) for a
     tile that yields four values for CO1.  Preloads f16-f19 from A and
     f20-f23 from B and sets CTR = length >> 2. */
  1684. LL(71):
  1685. #if defined(TRMMKERNEL)
  1686. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* start at the current AO and at the beginning of B */
  1687. LFD f16, 0 * SIZE(AO)
  1688. LFD f17, 1 * SIZE(AO)
  1689. LFD f18, 2 * SIZE(AO)
  1690. LFD f19, 3 * SIZE(AO)
  1691. LFD f20, 0 * SIZE(B)
  1692. LFD f21, 1 * SIZE(B)
  1693. LFD f22, 2 * SIZE(B)
  1694. LFD f23, 3 * SIZE(B)
  1695. mr BO, B
  1696. #else
  /* skip KK k-steps first: AO moves by KK << (2 + BASE_SHIFT) bytes, BO by
     KK << BASE_SHIFT bytes */
  1697. slwi r0, KK, 2 + BASE_SHIFT
  1698. slwi TEMP, KK, 0 + BASE_SHIFT
  1699. add AO, AO, r0
  1700. add BO, B, TEMP
  1701. LFD f16, 0 * SIZE(AO)
  1702. LFD f17, 1 * SIZE(AO)
  1703. LFD f18, 2 * SIZE(AO)
  1704. LFD f19, 3 * SIZE(AO)
  1705. LFD f20, 0 * SIZE(BO)
  1706. LFD f21, 1 * SIZE(BO)
  1707. LFD f22, 2 * SIZE(BO)
  1708. LFD f23, 3 * SIZE(BO)
  1709. #endif
  /* cache-touch hint for the output location; PREC is defined outside this
     view */
  1710. dcbt CO1, PREC
  /* loop length for the TRMM build: K - KK, KK + 4 or KK + 1 */
  1711. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1712. sub TEMP, K, KK
  1713. #elif defined(LEFT)
  1714. addi TEMP, KK, 4
  1715. #else
  1716. addi TEMP, KK, 1
  1717. #endif
  1718. srawi. TEMP, TEMP, 2
  1719. mtspr CTR, TEMP
  1720. #else
  /* plain GEMM build: full length K, starting at AO and B */
  1721. LFD f16, 0 * SIZE(AO)
  1722. LFD f17, 1 * SIZE(AO)
  1723. LFD f18, 2 * SIZE(AO)
  1724. LFD f19, 3 * SIZE(AO)
  1725. LFD f20, 0 * SIZE(B)
  1726. LFD f21, 1 * SIZE(B)
  1727. LFD f22, 2 * SIZE(B)
  1728. LFD f23, 3 * SIZE(B)
  1729. dcbt CO1, PREC
  1730. srawi. r0, K, 2
  1731. mtspr CTR, r0
  1732. mr BO, B
  1733. #endif
  /* uses the condition set by srawi. above: no unrolled passes to run */
  1734. ble LL(75)
  1735. .align 5
  /* LL(72): unrolled loop, four k-steps per pass (16 elements of A, 4 of
     B).  Every k-step multiplies four A values by one B value and adds the
     products to f0-f3. */
  1736. LL(72):
  /* k-step 0: B value f20 */
  1737. FMADD f0, f16, f20, f0
  1738. FMADD f1, f17, f20, f1
  1739. FMADD f2, f18, f20, f2
  1740. FMADD f3, f19, f20, f3
  /* k-step 1: A from AO[4..7], B value f21 */
  1741. LFD f16, 4 * SIZE(AO)
  1742. LFD f17, 5 * SIZE(AO)
  1743. LFD f18, 6 * SIZE(AO)
  1744. LFD f19, 7 * SIZE(AO)
  1745. FMADD f0, f16, f21, f0
  1746. FMADD f1, f17, f21, f1
  1747. FMADD f2, f18, f21, f2
  1748. FMADD f3, f19, f21, f3
  /* k-step 2: A from AO[8..11], B value f22 */
  1749. LFD f16, 8 * SIZE(AO)
  1750. LFD f17, 9 * SIZE(AO)
  1751. LFD f18, 10 * SIZE(AO)
  1752. LFD f19, 11 * SIZE(AO)
  1753. FMADD f0, f16, f22, f0
  1754. FMADD f1, f17, f22, f1
  1755. FMADD f2, f18, f22, f2
  1756. FMADD f3, f19, f22, f3
  /* k-step 3: A from AO[12..15], B value f23 */
  1757. LFD f16, 12 * SIZE(AO)
  1758. LFD f17, 13 * SIZE(AO)
  1759. LFD f18, 14 * SIZE(AO)
  1760. LFD f19, 15 * SIZE(AO)
  1761. FMADD f0, f16, f23, f0
  1762. FMADD f1, f17, f23, f1
  1763. FMADD f2, f18, f23, f2
  1764. FMADD f3, f19, f23, f3
  /* read ahead for the next pass, then advance AO by 16 and BO by 4 */
  1765. LFD f16, 16 * SIZE(AO)
  1766. LFD f17, 17 * SIZE(AO)
  1767. LFD f18, 18 * SIZE(AO)
  1768. LFD f19, 19 * SIZE(AO)
  1769. LFD f20, 4 * SIZE(BO)
  1770. LFD f21, 5 * SIZE(BO)
  1771. LFD f22, 6 * SIZE(BO)
  1772. LFD f23, 7 * SIZE(BO)
  1773. addi AO, AO, 16 * SIZE
  1774. addi BO, BO, 4 * SIZE
  /* cache-touch hint relative to BO; PREB is defined outside this view */
  1775. dcbt 0, BO, PREB
  1776. bdnz LL(72)
  1777. .align 4
  /* LL(75): load alpha into f30 and set CTR to the k-steps left over after
     LL(72) (length & 3); go straight to LL(78) if there are none. */
  1778. LL(75):
  1779. lfd f30, ALPHA
  1780. #if defined(TRMMKERNEL)
  /* same length as used before LL(72): K - KK, KK + 4 or KK + 1 */
  1781. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1782. sub TEMP, K, KK
  1783. #elif defined(LEFT)
  1784. addi TEMP, KK, 4
  1785. #else
  1786. addi TEMP, KK, 1
  1787. #endif
  1788. andi. TEMP, TEMP, 3
  1789. mtspr CTR, TEMP
  1790. #else
  1791. andi. r0, K, 3
  1792. mtspr CTR, r0
  1793. #endif
  /* taken when the masked remainder is zero */
  1794. ble+ LL(78)
  1795. .align 4
  /* LL(76): remainder loop, one k-step per iteration: four A values
     (f16-f19) times one B value (f20) accumulated into f0-f3. */
  1796. LL(76):
  1797. FMADD f0, f16, f20, f0
  1798. FMADD f1, f17, f20, f1
  1799. FMADD f2, f18, f20, f2
  1800. FMADD f3, f19, f20, f3
  /* fetch the next k-step, then advance AO by 4 and BO by 1 element */
  1801. LFD f16, 4 * SIZE(AO)
  1802. LFD f17, 5 * SIZE(AO)
  1803. LFD f18, 6 * SIZE(AO)
  1804. LFD f19, 7 * SIZE(AO)
  1805. LFD f20, 1 * SIZE(BO)
  1806. addi BO, BO, 1 * SIZE
  1807. addi AO, AO, 4 * SIZE
  1808. bdnz LL(76)
  1809. .align 4
  /* LL(78): scale f0-f3 by alpha (f30) and store them to CO1[0..3], clear
     the accumulators, advance the pointers and loop back to LL(71) while
     I > 0. */
  1810. LL(78):
  1811. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 */
  1812. LFD f16, 0 * SIZE(CO1)
  1813. LFD f17, 1 * SIZE(CO1)
  1814. LFD f18, 2 * SIZE(CO1)
  1815. LFD f19, 3 * SIZE(CO1)
  1816. FMADD f0, f0, f30, f16
  1817. FMADD f1, f1, f30, f17
  1818. FMADD f2, f2, f30, f18
  1819. FMADD f3, f3, f30, f19
  1820. #else
  /* TRMM build: result = sum * alpha */
  1821. FMUL f0, f0, f30
  1822. FMUL f1, f1, f30
  1823. FMUL f2, f2, f30
  1824. FMUL f3, f3, f30
  1825. #endif
  1826. STFD f0, 0 * SIZE(CO1)
  1827. STFD f1, 1 * SIZE(CO1)
  1828. STFD f2, 2 * SIZE(CO1)
  1829. STFD f3, 3 * SIZE(CO1)
  /* re-initialise the accumulators from FZERO (presumably 0.0) */
  1830. lfs f0, FZERO
  1831. fmr f1, f0
  1832. fmr f2, f0
  1833. fmr f3, f0
  1834. #ifdef TRMMKERNEL
  1835. #if ( defined(LEFT) && defined(TRANSA)) || \
  1836. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 4 (LEFT) or K - KK - 1; AO advances by
     TEMP << (2 + BASE_SHIFT) bytes, BO by TEMP << BASE_SHIFT bytes */
  1837. sub TEMP, K, KK
  1838. #ifdef LEFT
  1839. addi TEMP, TEMP, -4
  1840. #else
  1841. addi TEMP, TEMP, -1
  1842. #endif
  1843. slwi r0 , TEMP, 2 + BASE_SHIFT
  1844. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1845. add AO, AO, r0
  1846. add BO, BO, TEMP
  1847. #endif
  1848. #ifdef LEFT
  1849. addi KK, KK, 4
  1850. #endif
  1851. #endif
  1852. addi CO1, CO1, 4 * SIZE
  /* I counts the remaining tiles of this size; repeat while I > 0 */
  1853. addic. I, I, -1
  1854. bgt+ LL(71)
  1855. .align 4
  /* LL(80): if bit 1 of M is set, process a tile that yields two values for
     CO1 (stored at LL(88)); otherwise skip to LL(90).  Preloads f16-f19
     from A and f20-f23 from B and sets CTR = length >> 2. */
  1856. LL(80):
  1857. andi. I, M, 2
  1858. ble LL(90)
  1859. #if defined(TRMMKERNEL)
  1860. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* start at the current AO and at the beginning of B */
  1861. LFD f16, 0 * SIZE(AO)
  1862. LFD f17, 1 * SIZE(AO)
  1863. LFD f18, 2 * SIZE(AO)
  1864. LFD f19, 3 * SIZE(AO)
  1865. LFD f20, 0 * SIZE(B)
  1866. LFD f21, 1 * SIZE(B)
  1867. LFD f22, 2 * SIZE(B)
  1868. LFD f23, 3 * SIZE(B)
  1869. mr BO, B
  1870. #else
  /* skip KK k-steps first: AO moves by KK << (1 + BASE_SHIFT) bytes, BO by
     KK << BASE_SHIFT bytes */
  1871. slwi r0, KK, 1 + BASE_SHIFT
  1872. slwi TEMP, KK, 0 + BASE_SHIFT
  1873. add AO, AO, r0
  1874. add BO, B, TEMP
  1875. LFD f16, 0 * SIZE(AO)
  1876. LFD f17, 1 * SIZE(AO)
  1877. LFD f18, 2 * SIZE(AO)
  1878. LFD f19, 3 * SIZE(AO)
  1879. LFD f20, 0 * SIZE(BO)
  1880. LFD f21, 1 * SIZE(BO)
  1881. LFD f22, 2 * SIZE(BO)
  1882. LFD f23, 3 * SIZE(BO)
  1883. #endif
  /* loop length for the TRMM build: K - KK, KK + 2 or KK + 1 */
  1884. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1885. sub TEMP, K, KK
  1886. #elif defined(LEFT)
  1887. addi TEMP, KK, 2
  1888. #else
  1889. addi TEMP, KK, 1
  1890. #endif
  1891. srawi. TEMP, TEMP, 2
  1892. mtspr CTR, TEMP
  1893. #else
  /* plain GEMM build: full length K, starting at AO and B */
  1894. LFD f16, 0 * SIZE(AO)
  1895. LFD f17, 1 * SIZE(AO)
  1896. LFD f18, 2 * SIZE(AO)
  1897. LFD f19, 3 * SIZE(AO)
  1898. LFD f20, 0 * SIZE(B)
  1899. LFD f21, 1 * SIZE(B)
  1900. LFD f22, 2 * SIZE(B)
  1901. LFD f23, 3 * SIZE(B)
  1902. srawi. r0, K, 2
  1903. mtspr CTR, r0
  1904. mr BO, B
  1905. #endif
  /* uses the condition set by srawi. above: no unrolled passes to run */
  1906. ble LL(85)
  1907. .align 5
  /* LL(82): unrolled loop, four k-steps per pass (8 elements of A, 4 of B).
     Even k-steps accumulate into f0/f1 and odd k-steps into f2/f3; the
     partial sums are added together at LL(88). */
  1908. LL(82):
  /* k-steps 0 and 1: A pairs f16/f17 and f18/f19, B values f20 and f21 */
  1909. FMADD f0, f16, f20, f0
  1910. FMADD f1, f17, f20, f1
  1911. FMADD f2, f18, f21, f2
  1912. FMADD f3, f19, f21, f3
  /* k-steps 2 and 3: A from AO[4..7], B values f22 and f23 */
  1913. LFD f16, 4 * SIZE(AO)
  1914. LFD f17, 5 * SIZE(AO)
  1915. LFD f18, 6 * SIZE(AO)
  1916. LFD f19, 7 * SIZE(AO)
  1917. FMADD f0, f16, f22, f0
  1918. FMADD f1, f17, f22, f1
  1919. FMADD f2, f18, f23, f2
  1920. FMADD f3, f19, f23, f3
  /* read ahead for the next pass, then advance AO by 8 and BO by 4 */
  1921. LFD f16, 8 * SIZE(AO)
  1922. LFD f17, 9 * SIZE(AO)
  1923. LFD f18, 10 * SIZE(AO)
  1924. LFD f19, 11 * SIZE(AO)
  1925. LFD f20, 4 * SIZE(BO)
  1926. LFD f21, 5 * SIZE(BO)
  1927. LFD f22, 6 * SIZE(BO)
  1928. LFD f23, 7 * SIZE(BO)
  1929. addi AO, AO, 8 * SIZE
  1930. addi BO, BO, 4 * SIZE
  /* cache-touch hint relative to BO; PREB is defined outside this view */
  1931. dcbt 0, BO, PREB
  1932. bdnz LL(82)
  1933. .align 4
  /* LL(85): load alpha into f30 and set CTR to the k-steps left over after
     LL(82) (length & 3); go straight to LL(88) if there are none. */
  1934. LL(85):
  1935. lfd f30, ALPHA
  1936. #if defined(TRMMKERNEL)
  /* same length as used before LL(82): K - KK, KK + 2 or KK + 1 */
  1937. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1938. sub TEMP, K, KK
  1939. #elif defined(LEFT)
  1940. addi TEMP, KK, 2
  1941. #else
  1942. addi TEMP, KK, 1
  1943. #endif
  1944. andi. TEMP, TEMP, 3
  1945. mtspr CTR, TEMP
  1946. #else
  1947. andi. r0, K, 3
  1948. mtspr CTR, r0
  1949. #endif
  /* taken when the masked remainder is zero */
  1950. ble+ LL(88)
  1951. .align 4
  /* LL(86): remainder loop, one k-step per iteration: two A values
     (f16, f17) times one B value (f20) accumulated into f0 / f1. */
  1952. LL(86):
  1953. FMADD f0, f16, f20, f0
  1954. FMADD f1, f17, f20, f1
  /* fetch the next k-step, then advance AO by 2 and BO by 1 element */
  1955. LFD f16, 2 * SIZE(AO)
  1956. LFD f17, 3 * SIZE(AO)
  1957. LFD f20, 1 * SIZE(BO)
  1958. addi BO, BO, 1 * SIZE
  1959. addi AO, AO, 2 * SIZE
  1960. bdnz LL(86)
  1961. .align 4
  /* LL(88): fold the partial sums f2 / f3 into f0 / f1, scale by alpha
     (f30) and store two values to CO1; then clear f0-f3 and advance the
     pointers. */
  1962. LL(88):
  1963. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 */
  1964. LFD f16, 0 * SIZE(CO1)
  1965. LFD f17, 1 * SIZE(CO1)
  1966. FADD f0, f2, f0
  1967. FADD f1, f3, f1
  1968. FMADD f0, f0, f30, f16
  1969. FMADD f1, f1, f30, f17
  1970. #else
  /* TRMM build: result = sum * alpha */
  1971. FADD f0, f2, f0
  1972. FADD f1, f3, f1
  1973. FMUL f0, f0, f30
  1974. FMUL f1, f1, f30
  1975. #endif
  1976. STFD f0, 0 * SIZE(CO1)
  1977. STFD f1, 1 * SIZE(CO1)
  /* re-initialise the accumulators from FZERO (presumably 0.0) */
  1978. lfs f0, FZERO
  1979. fmr f1, f0
  1980. fmr f2, f0
  1981. fmr f3, f0
  1982. addi CO1, CO1, 2 * SIZE
  1983. #ifdef TRMMKERNEL
  1984. #if ( defined(LEFT) && defined(TRANSA)) || \
  1985. (!defined(LEFT) && !defined(TRANSA))
  /* TEMP = K - KK - 2 (LEFT) or K - KK - 1; AO advances by
     TEMP << (1 + BASE_SHIFT) bytes, BO by TEMP << BASE_SHIFT bytes */
  1986. sub TEMP, K, KK
  1987. #ifdef LEFT
  1988. addi TEMP, TEMP, -2
  1989. #else
  1990. addi TEMP, TEMP, -1
  1991. #endif
  1992. slwi r0 , TEMP, 1 + BASE_SHIFT
  1993. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1994. add AO, AO, r0
  1995. add BO, BO, TEMP
  1996. #endif
  1997. #ifdef LEFT
  1998. addi KK, KK, 2
  1999. #endif
  2000. #endif
  2001. .align 4
  /* LL(90): if bit 0 of M is set, compute one last value for CO1 (stored at
     LL(98)); otherwise go to the exit code at LL(999).  Preloads f16-f19
     from A and f20-f23 from B.  Unlike the other tiles the unrolled loop
     here covers eight k-steps per pass, so CTR = length >> 3. */
  2002. LL(90):
  2003. andi. I, M, 1
  2004. ble LL(999)
  2005. #if defined(TRMMKERNEL)
  2006. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* start at the current AO and at the beginning of B */
  2007. LFD f16, 0 * SIZE(AO)
  2008. LFD f17, 1 * SIZE(AO)
  2009. LFD f18, 2 * SIZE(AO)
  2010. LFD f19, 3 * SIZE(AO)
  2011. LFD f20, 0 * SIZE(B)
  2012. LFD f21, 1 * SIZE(B)
  2013. LFD f22, 2 * SIZE(B)
  2014. LFD f23, 3 * SIZE(B)
  2015. mr BO, B
  2016. #else
  /* skip KK k-steps first: AO and BO both move by KK << BASE_SHIFT bytes */
  2017. slwi r0, KK, 0 + BASE_SHIFT
  2018. slwi TEMP, KK, 0 + BASE_SHIFT
  2019. add AO, AO, r0
  2020. add BO, B, TEMP
  2021. LFD f16, 0 * SIZE(AO)
  2022. LFD f17, 1 * SIZE(AO)
  2023. LFD f18, 2 * SIZE(AO)
  2024. LFD f19, 3 * SIZE(AO)
  2025. LFD f20, 0 * SIZE(BO)
  2026. LFD f21, 1 * SIZE(BO)
  2027. LFD f22, 2 * SIZE(BO)
  2028. LFD f23, 3 * SIZE(BO)
  2029. #endif
  /* loop length for the TRMM build: K - KK, or KK + 1 (the #elif and #else
     arms are identical here) */
  2030. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2031. sub TEMP, K, KK
  2032. #elif defined(LEFT)
  2033. addi TEMP, KK, 1
  2034. #else
  2035. addi TEMP, KK, 1
  2036. #endif
  2037. srawi. TEMP, TEMP, 3
  2038. mtspr CTR, TEMP
  2039. #else
  /* plain GEMM build: full length K, starting at AO and B */
  2040. LFD f16, 0 * SIZE(AO)
  2041. LFD f17, 1 * SIZE(AO)
  2042. LFD f18, 2 * SIZE(AO)
  2043. LFD f19, 3 * SIZE(AO)
  2044. LFD f20, 0 * SIZE(B)
  2045. LFD f21, 1 * SIZE(B)
  2046. LFD f22, 2 * SIZE(B)
  2047. LFD f23, 3 * SIZE(B)
  2048. srawi. r0, K, 3
  2049. mtspr CTR, r0
  2050. mr BO, B
  2051. #endif
  /* uses the condition set by srawi. above: no unrolled passes to run */
  2052. ble LL(95)
  2053. .align 5
  /* LL(92): unrolled dot-product loop, eight k-steps per pass (8 elements
     of A and 8 of B).  Four independent partial sums are kept in f0-f3 and
     added together at LL(98). */
  2054. LL(92):
  /* k-steps 0-3 */
  2055. FMADD f0, f16, f20, f0
  2056. FMADD f1, f17, f21, f1
  2057. FMADD f2, f18, f22, f2
  2058. FMADD f3, f19, f23, f3
  2059. LFD f16, 4 * SIZE(AO)
  2060. LFD f17, 5 * SIZE(AO)
  2061. LFD f18, 6 * SIZE(AO)
  2062. LFD f19, 7 * SIZE(AO)
  2063. LFD f20, 4 * SIZE(BO)
  2064. LFD f21, 5 * SIZE(BO)
  2065. LFD f22, 6 * SIZE(BO)
  2066. LFD f23, 7 * SIZE(BO)
  /* k-steps 4-7 */
  2067. FMADD f0, f16, f20, f0
  2068. FMADD f1, f17, f21, f1
  2069. FMADD f2, f18, f22, f2
  2070. FMADD f3, f19, f23, f3
  /* read ahead for the next pass, then advance both pointers by 8 */
  2071. LFD f16, 8 * SIZE(AO)
  2072. LFD f17, 9 * SIZE(AO)
  2073. LFD f18, 10 * SIZE(AO)
  2074. LFD f19, 11 * SIZE(AO)
  2075. LFD f20, 8 * SIZE(BO)
  2076. LFD f21, 9 * SIZE(BO)
  2077. LFD f22, 10 * SIZE(BO)
  2078. LFD f23, 11 * SIZE(BO)
  2079. addi AO, AO, 8 * SIZE
  2080. addi BO, BO, 8 * SIZE
  2081. bdnz LL(92)
  2082. .align 4
  /* LL(95): load alpha into f30 and set CTR to the k-steps left over after
     the 8-way unrolled loop LL(92) (length & 7); go straight to LL(98) if
     there are none. */
  2083. LL(95):
  2084. lfd f30, ALPHA
  2085. #if defined(TRMMKERNEL)
  /* same length as used before LL(92): K - KK, or KK + 1 */
  2086. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2087. sub TEMP, K, KK
  2088. #elif defined(LEFT)
  2089. addi TEMP, KK, 1
  2090. #else
  2091. addi TEMP, KK, 1
  2092. #endif
  2093. andi. TEMP, TEMP, 7
  2094. mtspr CTR, TEMP
  2095. #else
  2096. andi. r0, K, 7
  2097. mtspr CTR, r0
  2098. #endif
  /* taken when the masked remainder is zero */
  2099. ble+ LL(98)
  2100. .align 4
  /* LL(96): remainder loop, one k-step per iteration: f0 += f16 * f20, then
     fetch the next A and B values and advance both pointers by 1 element. */
  2101. LL(96):
  2102. FMADD f0, f16, f20, f0
  2103. LFD f16, 1 * SIZE(AO)
  2104. LFD f20, 1 * SIZE(BO)
  2105. addi BO, BO, 1 * SIZE
  2106. addi AO, AO, 1 * SIZE
  2107. bdnz LL(96)
  2108. .align 4
  /* LL(98): reduce the four partial sums to f0 = (f0 + f1) + (f2 + f3),
     scale by alpha (f30) and store the single result to CO1.  The
     accumulators are not cleared; control falls into the exit code. */
  2109. LL(98):
  2110. #ifndef TRMMKERNEL
  /* non-TRMM build: result = sum * alpha + value currently at CO1 */
  2111. LFD f16, 0 * SIZE(CO1)
  2112. FADD f0, f1, f0
  2113. FADD f2, f3, f2
  2114. FADD f0, f2, f0
  2115. FMADD f0, f0, f30, f16
  2116. #else
  /* TRMM build: result = sum * alpha */
  2117. FADD f0, f1, f0
  2118. FADD f2, f3, f2
  2119. FADD f0, f2, f0
  2120. FMUL f0, f0, f30
  2121. #endif
  2122. STFD f0, 0 * SIZE(CO1)
  2123. .align 4
  /* LL(999): common exit.  Sets r3 to 0 (presumably the return value),
     reloads f14-f31 and the saved general-purpose registers from the stack
     frame, releases the frame and returns.  The matching stores are in the
     prologue, outside this view. */
  2124. LL(999):
  /* addi with register operand 0 uses the literal value 0, so r3 = 0 */
  2125. addi r3, 0, 0
  /* floating-point registers: 8-byte slots at SP + 0 .. SP + 136 */
  2126. lfd f14, 0(SP)
  2127. lfd f15, 8(SP)
  2128. lfd f16, 16(SP)
  2129. lfd f17, 24(SP)
  2130. lfd f18, 32(SP)
  2131. lfd f19, 40(SP)
  2132. lfd f20, 48(SP)
  2133. lfd f21, 56(SP)
  2134. lfd f22, 64(SP)
  2135. lfd f23, 72(SP)
  2136. lfd f24, 80(SP)
  2137. lfd f25, 88(SP)
  2138. lfd f26, 96(SP)
  2139. lfd f27, 104(SP)
  2140. lfd f28, 112(SP)
  2141. lfd f29, 120(SP)
  2142. lfd f30, 128(SP)
  2143. lfd f31, 136(SP)
  2144. #ifdef __64BIT__
  /* 64-bit build: general-purpose registers in 8-byte slots from SP + 144 */
  2145. ld r31, 144(SP)
  2146. ld r30, 152(SP)
  2147. ld r29, 160(SP)
  2148. ld r28, 168(SP)
  2149. ld r27, 176(SP)
  2150. ld r26, 184(SP)
  2151. ld r25, 192(SP)
  2152. ld r24, 200(SP)
  2153. ld r23, 208(SP)
  2154. ld r22, 216(SP)
  2155. ld r21, 224(SP)
  2156. ld r20, 232(SP)
  /* r19 and r18 are restored only in TRMM / TRSM builds */
  2157. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  2158. ld r19, 240(SP)
  2159. ld r18, 248(SP)
  2160. #endif
  2161. #else
  /* 32-bit build: general-purpose registers in 4-byte slots from SP + 144 */
  2162. lwz r31, 144(SP)
  2163. lwz r30, 148(SP)
  2164. lwz r29, 152(SP)
  2165. lwz r28, 156(SP)
  2166. lwz r27, 160(SP)
  2167. lwz r26, 164(SP)
  2168. lwz r25, 168(SP)
  2169. lwz r24, 172(SP)
  2170. lwz r23, 176(SP)
  2171. lwz r22, 180(SP)
  2172. lwz r21, 184(SP)
  2173. lwz r20, 188(SP)
  /* r19 and r18 are restored only in TRMM / TRSM builds */
  2174. #if defined(TRMMKERNEL) || defined(TRSMKERNEL)
  2175. lwz r19, 192(SP)
  2176. lwz r18, 196(SP)
  2177. #endif
  2178. #endif
  /* release the STACKSIZE-byte frame and return to the caller */
  2179. addi SP, SP, STACKSIZE
  2180. blr
  2181. EPILOGUE
  2182. #endif