You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_power6.S 47 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration for the kernel below: integer load mnemonic,
     stack-frame layout, and symbolic names for the argument and working
     registers. Nothing in this section emits code by itself. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* LOAD: integer load mnemonic chosen by pointer width: lwz (32-bit word)
     when __64BIT__ is not defined, ld (64-bit doubleword) when it is. */
  40. #ifndef __64BIT__
  41. #define LOAD lwz
  42. #else
  43. #define LOAD ld
  44. #endif
  /* Stack frame. STACKSIZE is the amount subtracted from SP on entry.
     ALPHA is the slot where the entry code stores f1 (stfd) and from which
     f30 is reloaded (lfd) before each write-back to C. FZERO is the slot
     that receives a zero word (stw of r0 after li r0, 0) and is read back
     with lfs to clear the accumulators. Both slots lie above the
     callee-saved register save area filled in by the prologue. */
  45. #ifdef __64BIT__
  46. #define STACKSIZE 320
  47. #define ALPHA 296(SP)
  48. #define FZERO 304(SP)
  49. #else
  50. #define STACKSIZE 240
  51. #define ALPHA 224(SP)
  52. #define FZERO 232(SP)
  53. #endif
  /* Problem sizes. The entry code branches to LL(999) when any of the three
     is <= 0. NOTE(review): presumably the GEMM dimensions m, n, k as passed
     by the caller -- confirm against the caller's prototype. */
  54. #define M r3
  55. #define N r4
  56. #define K r5
  /* Remaining arguments: A and B are the inputs that AO/BO walk, C is the
     output base, LDC is the C stride (scaled by BASE_SHIFT on entry), and
     OFFSET is read only when TRMMKERNEL is defined. The register that holds
     each one depends on platform and word size. NOTE(review): the different
     layouts presumably follow each ABI's argument-register assignment after
     the floating-point alpha argument -- verify against the ABI documents. */
  /* linux / FreeBSD */
  57. #if defined(linux) || defined(__FreeBSD__)
  58. #ifndef __64BIT__
  59. #define A r6
  60. #define B r7
  61. #define C r8
  62. #define LDC r9
  63. #define OFFSET r10
  64. #else
  /* 64-bit: under TRMMKERNEL the entry code overwrites r6 with OFFSET
     loaded from FRAMESLOT(0) + STACKSIZE(SP). */
  65. #define A r7
  66. #define B r8
  67. #define C r9
  68. #define LDC r10
  69. #define OFFSET r6
  70. #endif
  71. #endif
  /* AIX / Apple */
  72. #if defined(_AIX) || defined(__APPLE__)
  73. #if !defined(__64BIT__) && defined(DOUBLE)
  /* 32-bit DOUBLE: the entry code loads LDC (r7) from
     FRAMESLOT(0) + STACKSIZE(SP) and, under TRMMKERNEL, OFFSET (r6)
     from FRAMESLOT(1) + STACKSIZE(SP). */
  74. #define A r8
  75. #define B r9
  76. #define C r10
  77. #define LDC r7
  78. #define OFFSET r6
  79. #else
  /* Otherwise: under TRMMKERNEL, OFFSET is loaded from
     FRAMESLOT(0) + STACKSIZE(SP) (ld on 64-bit, lwz on 32-bit). */
  80. #define A r7
  81. #define B r8
  82. #define C r9
  83. #define LDC r10
  84. #define OFFSET r6
  85. #endif
  86. #endif
  /* Working registers. r21-r31 are always saved by the prologue; r19 and
     r20 (TEMP, KK) are saved only when TRMMKERNEL is defined. */
  /* TEMP: scratch for the TRMM trip-count and pointer-offset arithmetic. */
  87. #define TEMP r19
  /* KK: running TRMM offset; initialised from OFFSET (negated when LEFT is
     not defined) and advanced after each tile. */
  88. #define KK r20
  /* BB: offset added to B by the dcbtst prefetches at LL(15); initialised
     to K << (BASE_SHIFT + 2) and bumped by 16 * SIZE after each use. */
  89. #define BB r21
  /* I: row-block counter inside one column panel (e.g. M >> 2 at LL(10)). */
  90. #define I r22
  /* J: column-panel counter (N >> 2 for the four-column panels). */
  91. #define J r23
  /* AO / BO: running pointers into A and B for the current tile. */
  92. #define AO r24
  93. #define BO r25
  /* CO1..CO4: output pointers for up to four consecutive columns of C,
     spaced LDC apart starting at C. */
  94. #define CO1 r26
  95. #define CO2 r27
  96. #define CO3 r28
  97. #define CO4 r29
  /* PREA: offset used by dcbt on AO; set to (16 * 3) * SIZE at entry. */
  98. #define PREA r30
  /* PREC: offset used by dcbtst on CO1..CO4; set to 3 * SIZE at entry. */
  99. #define PREC r31
  100. #ifndef NEEDPARAM
  101. PROLOGUE
  102. PROFCODE
  103. addi SP, SP, -STACKSIZE
  104. li r0, 0
  105. stfd f14, 0(SP)
  106. stfd f15, 8(SP)
  107. stfd f16, 16(SP)
  108. stfd f17, 24(SP)
  109. stfd f18, 32(SP)
  110. stfd f19, 40(SP)
  111. stfd f20, 48(SP)
  112. stfd f21, 56(SP)
  113. stfd f22, 64(SP)
  114. stfd f23, 72(SP)
  115. stfd f24, 80(SP)
  116. stfd f25, 88(SP)
  117. stfd f26, 96(SP)
  118. stfd f27, 104(SP)
  119. stfd f28, 112(SP)
  120. stfd f29, 120(SP)
  121. stfd f30, 128(SP)
  122. stfd f31, 136(SP)
  123. #ifdef __64BIT__
  124. std r31, 144(SP)
  125. std r30, 152(SP)
  126. std r29, 160(SP)
  127. std r28, 168(SP)
  128. std r27, 176(SP)
  129. std r26, 184(SP)
  130. std r25, 192(SP)
  131. std r24, 200(SP)
  132. std r23, 208(SP)
  133. std r22, 216(SP)
  134. std r21, 224(SP)
  135. #ifdef TRMMKERNEL
  136. std r20, 232(SP)
  137. std r19, 240(SP)
  138. #endif
  139. #else
  140. stw r31, 144(SP)
  141. stw r30, 148(SP)
  142. stw r29, 152(SP)
  143. stw r28, 156(SP)
  144. stw r27, 160(SP)
  145. stw r26, 164(SP)
  146. stw r25, 168(SP)
  147. stw r24, 172(SP)
  148. stw r23, 176(SP)
  149. stw r22, 180(SP)
  150. stw r21, 184(SP)
  151. #ifdef TRMMKERNEL
  152. stw r20, 188(SP)
  153. stw r19, 192(SP)
  154. #endif
  155. #endif
  156. stfd f1, ALPHA
  157. stw r0, FZERO
  158. #if defined(_AIX) || defined(__APPLE__)
  159. #if !defined(__64BIT__) && defined(DOUBLE)
  160. lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
  161. #endif
  162. #endif
  163. slwi LDC, LDC, BASE_SHIFT
  164. #if defined(TRMMKERNEL)
  165. #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
  166. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  167. #endif
  168. #if defined(_AIX) || defined(__APPLE__)
  169. #ifdef __64BIT__
  170. ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  171. #else
  172. #ifdef DOUBLE
  173. lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
  174. #else
  175. lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
  176. #endif
  177. #endif
  178. #endif
  179. #endif
  180. #if defined(TRMMKERNEL) && !defined(LEFT)
  181. neg KK, OFFSET
  182. #endif
  183. cmpwi cr0, M, 0
  184. ble LL(999)
  185. cmpwi cr0, N, 0
  186. ble LL(999)
  187. cmpwi cr0, K, 0
  188. ble LL(999)
  189. lfs f0, FZERO
  190. li PREA, (16 * 3) * SIZE
  191. srawi. J, N, 2
  192. li PREC, 3 * SIZE
  193. ble LL(40)
  194. .align 4
  195. LL(10):
  196. mr CO1, C
  197. fmr f1, f0
  198. add CO2, C, LDC
  199. fmr f2, f0
  200. add CO3, CO2, LDC
  201. fmr f3, f0
  202. add CO4, CO3, LDC
  203. fmr f4, f0
  204. #if defined(TRMMKERNEL) && defined(LEFT)
  205. mr KK, OFFSET
  206. #endif
  207. slwi BB, K, BASE_SHIFT + 2
  208. fmr f5, f0
  209. srawi. I, M, 2
  210. fmr f6, f0
  211. mr AO, A
  212. fmr f7, f0
  213. add C, CO4, LDC
  214. fmr f8, f0
  215. fmr f9, f0
  216. fmr f10, f0
  217. fmr f11, f0
  218. fmr f12, f0
  219. fmr f13, f0
  220. fmr f14, f0
  221. fmr f15, f0
  222. ble LL(20)
  223. .align 4
  224. LL(11):
  225. #if defined(TRMMKERNEL)
  226. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  227. LFD f16, 0 * SIZE(AO)
  228. LFD f20, 0 * SIZE(B)
  229. LFD f17, 1 * SIZE(AO)
  230. LFD f21, 1 * SIZE(B)
  231. LFD f18, 2 * SIZE(AO)
  232. LFD f22, 2 * SIZE(B)
  233. LFD f19, 3 * SIZE(AO)
  234. LFD f23, 3 * SIZE(B)
  235. mr BO, B
  236. #else
  237. slwi r0, KK, 2 + BASE_SHIFT
  238. add AO, AO, r0
  239. add BO, B, r0
  240. LFD f16, 0 * SIZE(AO)
  241. LFD f20, 0 * SIZE(BO)
  242. LFD f17, 1 * SIZE(AO)
  243. LFD f21, 1 * SIZE(BO)
  244. LFD f18, 2 * SIZE(AO)
  245. LFD f22, 2 * SIZE(BO)
  246. LFD f19, 3 * SIZE(AO)
  247. LFD f23, 3 * SIZE(BO)
  248. #endif
  249. dcbtst CO1, PREC
  250. dcbtst CO2, PREC
  251. dcbtst CO3, PREC
  252. dcbtst CO4, PREC
  253. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  254. sub TEMP, K, KK
  255. #elif defined(LEFT)
  256. addi TEMP, KK, 4
  257. #else
  258. addi TEMP, KK, 4
  259. #endif
  260. srawi. TEMP, TEMP, 3
  261. mtspr CTR, TEMP
  262. ble LL(15)
  263. #else
  264. LFD f16, 0 * SIZE(AO)
  265. LFD f20, 0 * SIZE(B)
  266. LFD f17, 1 * SIZE(AO)
  267. LFD f21, 1 * SIZE(B)
  268. LFD f18, 2 * SIZE(AO)
  269. LFD f22, 2 * SIZE(B)
  270. LFD f19, 3 * SIZE(AO)
  271. LFD f23, 3 * SIZE(B)
  272. dcbtst CO1, PREC
  273. dcbtst CO2, PREC
  274. dcbtst CO3, PREC
  275. dcbtst CO4, PREC
  276. srawi. r0, K, 3
  277. mtctr r0
  278. mr BO, B
  279. ble LL(15)
  280. #endif
  281. .align 4
  282. LL(12):
  283. dcbt AO, PREA
  284. FMADD f0, f16, f20, f0
  285. nop
  286. FMADD f4, f16, f21, f4
  287. FMADD f8, f16, f22, f8
  288. FMADD f12, f16, f23, f12
  289. LFD f24, 4 * SIZE(AO)
  290. LFD f25, 5 * SIZE(AO)
  291. FMADD f1, f17, f20, f1
  292. FMADD f5, f17, f21, f5
  293. LFD f28, 4 * SIZE(BO)
  294. LFD f29, 5 * SIZE(BO)
  295. FMADD f9, f17, f22, f9
  296. FMADD f13, f17, f23, f13
  297. FMADD f2, f18, f20, f2
  298. FMADD f6, f18, f21, f6
  299. FMADD f10, f18, f22, f10
  300. FMADD f14, f18, f23, f14
  301. LFD f26, 6 * SIZE(AO)
  302. LFD f27, 7 * SIZE(AO)
  303. FMADD f3, f19, f20, f3
  304. FMADD f7, f19, f21, f7
  305. LFD f30, 6 * SIZE(BO)
  306. LFD f31, 7 * SIZE(BO)
  307. FMADD f11, f19, f22, f11
  308. FMADD f15, f19, f23, f15
  309. FMADD f0, f24, f28, f0
  310. FMADD f4, f24, f29, f4
  311. FMADD f8, f24, f30, f8
  312. FMADD f12, f24, f31, f12
  313. LFD f16, 8 * SIZE(AO)
  314. LFD f17, 9 * SIZE(AO)
  315. FMADD f1, f25, f28, f1
  316. FMADD f5, f25, f29, f5
  317. LFD f20, 8 * SIZE(BO)
  318. LFD f21, 9 * SIZE(BO)
  319. FMADD f9, f25, f30, f9
  320. FMADD f13, f25, f31, f13
  321. FMADD f2, f26, f28, f2
  322. FMADD f6, f26, f29, f6
  323. FMADD f10, f26, f30, f10
  324. FMADD f14, f26, f31, f14
  325. LFD f18, 10 * SIZE(AO)
  326. LFD f19, 11 * SIZE(AO)
  327. FMADD f3, f27, f28, f3
  328. FMADD f7, f27, f29, f7
  329. LFD f22, 10 * SIZE(BO)
  330. LFD f23, 11 * SIZE(BO)
  331. FMADD f11, f27, f30, f11
  332. FMADD f15, f27, f31, f15
  333. FMADD f0, f16, f20, f0
  334. FMADD f4, f16, f21, f4
  335. FMADD f8, f16, f22, f8
  336. FMADD f12, f16, f23, f12
  337. LFD f24, 12 * SIZE(AO)
  338. LFD f25, 13 * SIZE(AO)
  339. FMADD f1, f17, f20, f1
  340. FMADD f5, f17, f21, f5
  341. LFD f28, 12 * SIZE(BO)
  342. LFD f29, 13 * SIZE(BO)
  343. FMADD f9, f17, f22, f9
  344. FMADD f13, f17, f23, f13
  345. FMADD f2, f18, f20, f2
  346. FMADD f6, f18, f21, f6
  347. FMADD f10, f18, f22, f10
  348. FMADD f14, f18, f23, f14
  349. LFD f26, 14 * SIZE(AO)
  350. LFD f27, 15 * SIZE(AO)
  351. FMADD f3, f19, f20, f3
  352. FMADD f7, f19, f21, f7
  353. LFD f30, 14 * SIZE(BO)
  354. LFD f31, 15 * SIZE(BO)
  355. FMADD f11, f19, f22, f11
  356. FMADD f15, f19, f23, f15
  357. FMADD f0, f24, f28, f0
  358. FMADD f4, f24, f29, f4
  359. FMADD f8, f24, f30, f8
  360. FMADD f12, f24, f31, f12
  361. LFD f16, 16 * SIZE(AO)
  362. LFD f17, 17 * SIZE(AO)
  363. FMADD f1, f25, f28, f1
  364. FMADD f5, f25, f29, f5
  365. LFD f20, 16 * SIZE(BO)
  366. LFD f21, 17 * SIZE(BO)
  367. FMADD f9, f25, f30, f9
  368. FMADD f13, f25, f31, f13
  369. FMADD f2, f26, f28, f2
  370. FMADD f6, f26, f29, f6
  371. FMADD f10, f26, f30, f10
  372. FMADD f14, f26, f31, f14
  373. LFD f18, 18 * SIZE(AO)
  374. LFD f19, 19 * SIZE(AO)
  375. FMADD f3, f27, f28, f3
  376. FMADD f7, f27, f29, f7
  377. LFD f22, 18 * SIZE(BO)
  378. LFD f23, 19 * SIZE(BO)
  379. FMADD f11, f27, f30, f11
  380. FMADD f15, f27, f31, f15
  381. FMADD f0, f16, f20, f0
  382. FMADD f4, f16, f21, f4
  383. FMADD f8, f16, f22, f8
  384. FMADD f12, f16, f23, f12
  385. LFD f24, 20 * SIZE(AO)
  386. LFD f25, 21 * SIZE(AO)
  387. FMADD f1, f17, f20, f1
  388. FMADD f5, f17, f21, f5
  389. LFD f28, 20 * SIZE(BO)
  390. LFD f29, 21 * SIZE(BO)
  391. FMADD f9, f17, f22, f9
  392. FMADD f13, f17, f23, f13
  393. FMADD f2, f18, f20, f2
  394. FMADD f6, f18, f21, f6
  395. FMADD f10, f18, f22, f10
  396. FMADD f14, f18, f23, f14
  397. LFD f26, 22 * SIZE(AO)
  398. LFD f27, 23 * SIZE(AO)
  399. FMADD f3, f19, f20, f3
  400. FMADD f7, f19, f21, f7
  401. LFD f30, 22 * SIZE(BO)
  402. LFD f31, 23 * SIZE(BO)
  403. FMADD f11, f19, f22, f11
  404. FMADD f15, f19, f23, f15
  405. FMADD f0, f24, f28, f0
  406. FMADD f4, f24, f29, f4
  407. FMADD f8, f24, f30, f8
  408. FMADD f12, f24, f31, f12
  409. LFD f16, 24 * SIZE(AO)
  410. LFD f17, 25 * SIZE(AO)
  411. FMADD f1, f25, f28, f1
  412. FMADD f5, f25, f29, f5
  413. LFD f20, 24 * SIZE(BO)
  414. LFD f21, 25 * SIZE(BO)
  415. FMADD f9, f25, f30, f9
  416. FMADD f13, f25, f31, f13
  417. FMADD f2, f26, f28, f2
  418. FMADD f6, f26, f29, f6
  419. FMADD f10, f26, f30, f10
  420. FMADD f14, f26, f31, f14
  421. LFD f18, 26 * SIZE(AO)
  422. LFD f19, 27 * SIZE(AO)
  423. FMADD f3, f27, f28, f3
  424. FMADD f7, f27, f29, f7
  425. LFD f22, 26 * SIZE(BO)
  426. LFD f23, 27 * SIZE(BO)
  427. FMADD f11, f27, f30, f11
  428. FMADD f15, f27, f31, f15
  429. FMADD f0, f16, f20, f0
  430. FMADD f4, f16, f21, f4
  431. FMADD f8, f16, f22, f8
  432. FMADD f12, f16, f23, f12
  433. LFD f24, 28 * SIZE(AO)
  434. LFD f25, 29 * SIZE(AO)
  435. FMADD f1, f17, f20, f1
  436. FMADD f5, f17, f21, f5
  437. LFD f28, 28 * SIZE(BO)
  438. LFD f29, 29 * SIZE(BO)
  439. FMADD f9, f17, f22, f9
  440. FMADD f13, f17, f23, f13
  441. FMADD f2, f18, f20, f2
  442. FMADD f6, f18, f21, f6
  443. FMADD f10, f18, f22, f10
  444. FMADD f14, f18, f23, f14
  445. LFD f26, 30 * SIZE(AO)
  446. LFD f27, 31 * SIZE(AO)
  447. FMADD f3, f19, f20, f3
  448. FMADD f7, f19, f21, f7
  449. LFD f30, 30 * SIZE(BO)
  450. LFD f31, 31 * SIZE(BO)
  451. FMADD f11, f19, f22, f11
  452. FMADD f15, f19, f23, f15
  453. FMADD f0, f24, f28, f0
  454. FMADD f4, f24, f29, f4
  455. FMADD f8, f24, f30, f8
  456. FMADD f12, f24, f31, f12
  457. LFD f16, 32 * SIZE(AO)
  458. LFD f17, 33 * SIZE(AO)
  459. FMADD f1, f25, f28, f1
  460. FMADD f5, f25, f29, f5
  461. LFD f20, 32 * SIZE(BO)
  462. LFD f21, 33 * SIZE(BO)
  463. FMADD f9, f25, f30, f9
  464. FMADD f13, f25, f31, f13
  465. LFD f18, 34 * SIZE(AO)
  466. LFD f19, 35 * SIZE(AO)
  467. FMADD f2, f26, f28, f2
  468. FMADD f6, f26, f29, f6
  469. LFD f22, 34 * SIZE(BO)
  470. LFD f23, 35 * SIZE(BO)
  471. FMADD f10, f26, f30, f10
  472. FMADD f14, f26, f31, f14
  473. FMADD f3, f27, f28, f3
  474. FMADD f7, f27, f29, f7
  475. FMADD f11, f27, f30, f11
  476. FMADD f15, f27, f31, f15
  477. addi AO, AO, 32 * SIZE
  478. addi BO, BO, 32 * SIZE
  479. bdnz LL(12)
  480. .align 4
  481. LL(15):
  482. lfd f30, ALPHA
  483. dcbtst B, BB
  484. addi BB, BB, 16 * SIZE
  485. dcbtst B, BB
  486. addi BB, BB, 16 * SIZE
  487. #if defined(TRMMKERNEL)
  488. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  489. sub TEMP, K, KK
  490. #elif defined(LEFT)
  491. addi TEMP, KK, 4
  492. #else
  493. addi TEMP, KK, 4
  494. #endif
  495. andi. TEMP, TEMP, 7
  496. mtspr CTR, TEMP
  497. #else
  498. andi. r0, K, 7
  499. mtspr CTR, r0
  500. #endif
  501. ble+ LL(18)
  502. .align 4
  503. LL(16):
  504. FMADD f0, f16, f20, f0
  505. FMADD f4, f16, f21, f4
  506. FMADD f8, f16, f22, f8
  507. FMADD f12, f16, f23, f12
  508. FMADD f1, f17, f20, f1
  509. FMADD f5, f17, f21, f5
  510. FMADD f9, f17, f22, f9
  511. FMADD f13, f17, f23, f13
  512. FMADD f2, f18, f20, f2
  513. FMADD f6, f18, f21, f6
  514. FMADD f10, f18, f22, f10
  515. FMADD f14, f18, f23, f14
  516. FMADD f3, f19, f20, f3
  517. FMADD f7, f19, f21, f7
  518. FMADD f11, f19, f22, f11
  519. FMADD f15, f19, f23, f15
  520. LFD f16, 4 * SIZE(AO)
  521. LFD f17, 5 * SIZE(AO)
  522. LFD f18, 6 * SIZE(AO)
  523. LFD f19, 7 * SIZE(AO)
  524. LFD f20, 4 * SIZE(BO)
  525. LFD f21, 5 * SIZE(BO)
  526. LFD f22, 6 * SIZE(BO)
  527. LFD f23, 7 * SIZE(BO)
  528. addi BO, BO, 4 * SIZE
  529. addi AO, AO, 4 * SIZE
  530. nop
  531. bdnz LL(16)
  532. .align 4
  533. LL(18):
  534. #ifndef TRMMKERNEL
  535. LFD f16, 0 * SIZE(CO1)
  536. LFD f17, 1 * SIZE(CO1)
  537. LFD f18, 2 * SIZE(CO1)
  538. LFD f19, 3 * SIZE(CO1)
  539. LFD f20, 0 * SIZE(CO2)
  540. LFD f21, 1 * SIZE(CO2)
  541. LFD f22, 2 * SIZE(CO2)
  542. LFD f23, 3 * SIZE(CO2)
  543. FMADD f0, f0, f30, f16
  544. FMADD f1, f1, f30, f17
  545. FMADD f2, f2, f30, f18
  546. FMADD f3, f3, f30, f19
  547. FMADD f4, f4, f30, f20
  548. FMADD f5, f5, f30, f21
  549. FMADD f6, f6, f30, f22
  550. FMADD f7, f7, f30, f23
  551. LFD f16, 0 * SIZE(CO3)
  552. LFD f17, 1 * SIZE(CO3)
  553. LFD f18, 2 * SIZE(CO3)
  554. LFD f19, 3 * SIZE(CO3)
  555. LFD f20, 0 * SIZE(CO4)
  556. LFD f21, 1 * SIZE(CO4)
  557. LFD f22, 2 * SIZE(CO4)
  558. LFD f23, 3 * SIZE(CO4)
  559. FMADD f8, f8, f30, f16
  560. FMADD f9, f9, f30, f17
  561. FMADD f10, f10, f30, f18
  562. FMADD f11, f11, f30, f19
  563. FMADD f12, f12, f30, f20
  564. FMADD f13, f13, f30, f21
  565. FMADD f14, f14, f30, f22
  566. FMADD f15, f15, f30, f23
  567. #else
  568. FMUL f0, f0, f30
  569. FMUL f1, f1, f30
  570. FMUL f2, f2, f30
  571. FMUL f3, f3, f30
  572. FMUL f4, f4, f30
  573. FMUL f5, f5, f30
  574. FMUL f6, f6, f30
  575. FMUL f7, f7, f30
  576. FMUL f8, f8, f30
  577. FMUL f9, f9, f30
  578. FMUL f10, f10, f30
  579. FMUL f11, f11, f30
  580. FMUL f12, f12, f30
  581. FMUL f13, f13, f30
  582. FMUL f14, f14, f30
  583. FMUL f15, f15, f30
  584. #endif
  585. STFD f0, 0 * SIZE(CO1)
  586. STFD f1, 1 * SIZE(CO1)
  587. STFD f2, 2 * SIZE(CO1)
  588. STFD f3, 3 * SIZE(CO1)
  589. lfs f0, FZERO
  590. fmr f1, f0
  591. fmr f2, f0
  592. fmr f3, f0
  593. STFD f4, 0 * SIZE(CO2)
  594. STFD f5, 1 * SIZE(CO2)
  595. STFD f6, 2 * SIZE(CO2)
  596. STFD f7, 3 * SIZE(CO2)
  597. fmr f4, f0
  598. fmr f5, f0
  599. fmr f6, f0
  600. fmr f7, f0
  601. STFD f8, 0 * SIZE(CO3)
  602. STFD f9, 1 * SIZE(CO3)
  603. STFD f10, 2 * SIZE(CO3)
  604. STFD f11, 3 * SIZE(CO3)
  605. fmr f8, f0
  606. fmr f9, f0
  607. fmr f10, f0
  608. fmr f11, f0
  609. STFD f12, 0 * SIZE(CO4)
  610. STFD f13, 1 * SIZE(CO4)
  611. STFD f14, 2 * SIZE(CO4)
  612. STFD f15, 3 * SIZE(CO4)
  613. fmr f12, f0
  614. fmr f13, f0
  615. fmr f14, f0
  616. fmr f15, f0
  617. addi CO1, CO1, 4 * SIZE
  618. addi CO2, CO2, 4 * SIZE
  619. addi CO3, CO3, 4 * SIZE
  620. addi CO4, CO4, 4 * SIZE
  621. #ifdef TRMMKERNEL
  622. #if ( defined(LEFT) && defined(TRANSA)) || \
  623. (!defined(LEFT) && !defined(TRANSA))
  624. sub TEMP, K, KK
  625. #ifdef LEFT
  626. addi TEMP, TEMP, -4
  627. #else
  628. addi TEMP, TEMP, -4
  629. #endif
  630. slwi TEMP, TEMP, 2 + BASE_SHIFT
  631. add AO, AO, TEMP
  632. add BO, BO, TEMP
  633. #endif
  634. #ifdef LEFT
  635. addi KK, KK, 4
  636. #endif
  637. #endif
  638. addic. I, I, -1
  639. bgt+ LL(11)
  640. .align 4
  641. LL(20):
  642. andi. I, M, 2
  643. ble LL(30)
  644. #if defined(TRMMKERNEL)
  645. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  646. LFD f16, 0 * SIZE(AO)
  647. LFD f17, 1 * SIZE(AO)
  648. LFD f18, 2 * SIZE(AO)
  649. LFD f19, 3 * SIZE(AO)
  650. LFD f20, 0 * SIZE(B)
  651. LFD f21, 1 * SIZE(B)
  652. LFD f22, 2 * SIZE(B)
  653. LFD f23, 3 * SIZE(B)
  654. LFD f24, 4 * SIZE(B)
  655. LFD f25, 5 * SIZE(B)
  656. LFD f26, 6 * SIZE(B)
  657. LFD f27, 7 * SIZE(B)
  658. mr BO, B
  659. #else
  660. slwi r0, KK, 1 + BASE_SHIFT
  661. slwi TEMP, KK, 2 + BASE_SHIFT
  662. add AO, AO, r0
  663. add BO, B, TEMP
  664. LFD f16, 0 * SIZE(AO)
  665. LFD f17, 1 * SIZE(AO)
  666. LFD f18, 2 * SIZE(AO)
  667. LFD f19, 3 * SIZE(AO)
  668. LFD f20, 0 * SIZE(BO)
  669. LFD f21, 1 * SIZE(BO)
  670. LFD f22, 2 * SIZE(BO)
  671. LFD f23, 3 * SIZE(BO)
  672. LFD f24, 4 * SIZE(BO)
  673. LFD f25, 5 * SIZE(BO)
  674. LFD f26, 6 * SIZE(BO)
  675. LFD f27, 7 * SIZE(BO)
  676. #endif
  677. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  678. sub TEMP, K, KK
  679. #elif defined(LEFT)
  680. addi TEMP, KK, 2
  681. #else
  682. addi TEMP, KK, 4
  683. #endif
  684. srawi. TEMP, TEMP, 2
  685. mtspr CTR, TEMP
  686. #else
  687. LFD f16, 0 * SIZE(AO)
  688. LFD f17, 1 * SIZE(AO)
  689. LFD f18, 2 * SIZE(AO)
  690. LFD f19, 3 * SIZE(AO)
  691. LFD f20, 0 * SIZE(B)
  692. LFD f21, 1 * SIZE(B)
  693. LFD f22, 2 * SIZE(B)
  694. LFD f23, 3 * SIZE(B)
  695. LFD f24, 4 * SIZE(B)
  696. LFD f25, 5 * SIZE(B)
  697. LFD f26, 6 * SIZE(B)
  698. LFD f27, 7 * SIZE(B)
  699. srawi. r0, K, 2
  700. mtspr CTR, r0
  701. mr BO, B
  702. #endif
  703. ble LL(25)
  704. .align 5
  705. LL(22):
  706. FMADD f0, f16, f20, f0
  707. FMADD f1, f17, f20, f1
  708. FMADD f4, f16, f21, f4
  709. FMADD f5, f17, f21, f5
  710. FMADD f8, f16, f22, f8
  711. FMADD f9, f17, f22, f9
  712. FMADD f12, f16, f23, f12
  713. FMADD f13, f17, f23, f13
  714. LFD f20, 8 * SIZE(BO)
  715. LFD f21, 9 * SIZE(BO)
  716. LFD f22, 10 * SIZE(BO)
  717. LFD f23, 11 * SIZE(BO)
  718. FMADD f0, f18, f24, f0
  719. FMADD f1, f19, f24, f1
  720. FMADD f4, f18, f25, f4
  721. FMADD f5, f19, f25, f5
  722. FMADD f8, f18, f26, f8
  723. FMADD f9, f19, f26, f9
  724. FMADD f12, f18, f27, f12
  725. FMADD f13, f19, f27, f13
  726. LFD f16, 4 * SIZE(AO)
  727. LFD f17, 5 * SIZE(AO)
  728. LFD f18, 6 * SIZE(AO)
  729. LFD f19, 7 * SIZE(AO)
  730. FMADD f0, f16, f20, f0
  731. FMADD f1, f17, f20, f1
  732. FMADD f4, f16, f21, f4
  733. FMADD f5, f17, f21, f5
  734. LFD f24, 12 * SIZE(BO)
  735. LFD f25, 13 * SIZE(BO)
  736. LFD f26, 14 * SIZE(BO)
  737. LFD f27, 15 * SIZE(BO)
  738. FMADD f8, f16, f22, f8
  739. FMADD f9, f17, f22, f9
  740. FMADD f12, f16, f23, f12
  741. FMADD f13, f17, f23, f13
  742. LFD f20, 16 * SIZE(BO)
  743. LFD f21, 17 * SIZE(BO)
  744. LFD f22, 18 * SIZE(BO)
  745. LFD f23, 19 * SIZE(BO)
  746. FMADD f0, f18, f24, f0
  747. FMADD f1, f19, f24, f1
  748. FMADD f4, f18, f25, f4
  749. FMADD f5, f19, f25, f5
  750. FMADD f8, f18, f26, f8
  751. FMADD f9, f19, f26, f9
  752. FMADD f12, f18, f27, f12
  753. FMADD f13, f19, f27, f13
  754. LFD f16, 8 * SIZE(AO)
  755. LFD f17, 9 * SIZE(AO)
  756. LFD f18, 10 * SIZE(AO)
  757. LFD f19, 11 * SIZE(AO)
  758. LFD f24, 20 * SIZE(BO)
  759. LFD f25, 21 * SIZE(BO)
  760. LFD f26, 22 * SIZE(BO)
  761. LFD f27, 23 * SIZE(BO)
  762. addi AO, AO, 8 * SIZE
  763. addi BO, BO, 16 * SIZE
  764. bdnz LL(22)
  765. .align 4
  766. LL(25):
  767. lfd f30, ALPHA
  768. #if defined(TRMMKERNEL)
  769. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  770. sub TEMP, K, KK
  771. #elif defined(LEFT)
  772. addi TEMP, KK, 2
  773. #else
  774. addi TEMP, KK, 4
  775. #endif
  776. andi. TEMP, TEMP, 3
  777. mtspr CTR, TEMP
  778. #else
  779. andi. r0, K, 3
  780. mtspr CTR, r0
  781. #endif
  782. ble+ LL(28)
  783. .align 4
  784. LL(26):
  785. FMADD f0, f16, f20, f0
  786. FMADD f1, f17, f20, f1
  787. FMADD f4, f16, f21, f4
  788. FMADD f5, f17, f21, f5
  789. FMADD f8, f16, f22, f8
  790. FMADD f9, f17, f22, f9
  791. FMADD f12, f16, f23, f12
  792. FMADD f13, f17, f23, f13
  793. LFD f16, 2 * SIZE(AO)
  794. LFD f17, 3 * SIZE(AO)
  795. LFD f20, 4 * SIZE(BO)
  796. LFD f21, 5 * SIZE(BO)
  797. LFD f22, 6 * SIZE(BO)
  798. LFD f23, 7 * SIZE(BO)
  799. addi BO, BO, 4 * SIZE
  800. addi AO, AO, 2 * SIZE
  801. bdnz LL(26)
  802. .align 4
  803. LL(28):
  804. #ifndef TRMMKERNEL
  805. LFD f16, 0 * SIZE(CO1)
  806. LFD f17, 1 * SIZE(CO1)
  807. LFD f18, 0 * SIZE(CO2)
  808. LFD f19, 1 * SIZE(CO2)
  809. FMADD f0, f0, f30, f16
  810. FMADD f1, f1, f30, f17
  811. FMADD f4, f4, f30, f18
  812. FMADD f5, f5, f30, f19
  813. LFD f20, 0 * SIZE(CO3)
  814. LFD f21, 1 * SIZE(CO3)
  815. LFD f22, 0 * SIZE(CO4)
  816. LFD f23, 1 * SIZE(CO4)
  817. FMADD f8, f8, f30, f20
  818. FMADD f9, f9, f30, f21
  819. FMADD f12, f12, f30, f22
  820. FMADD f13, f13, f30, f23
  821. #else
  822. FMUL f0, f0, f30
  823. FMUL f1, f1, f30
  824. FMUL f4, f4, f30
  825. FMUL f5, f5, f30
  826. FMUL f8, f8, f30
  827. FMUL f9, f9, f30
  828. FMUL f12, f12, f30
  829. FMUL f13, f13, f30
  830. #endif
  831. STFD f0, 0 * SIZE(CO1)
  832. STFD f1, 1 * SIZE(CO1)
  833. STFD f4, 0 * SIZE(CO2)
  834. STFD f5, 1 * SIZE(CO2)
  835. lfs f0, FZERO
  836. fmr f1, f0
  837. fmr f2, f0
  838. fmr f3, f0
  839. STFD f8, 0 * SIZE(CO3)
  840. STFD f9, 1 * SIZE(CO3)
  841. STFD f12, 0 * SIZE(CO4)
  842. STFD f13, 1 * SIZE(CO4)
  843. fmr f4, f0
  844. fmr f5, f0
  845. fmr f6, f0
  846. fmr f7, f0
  847. fmr f8, f0
  848. fmr f9, f0
  849. fmr f10, f0
  850. fmr f11, f0
  851. fmr f12, f0
  852. fmr f13, f0
  853. fmr f14, f0
  854. fmr f15, f0
  855. addi CO1, CO1, 2 * SIZE
  856. addi CO2, CO2, 2 * SIZE
  857. addi CO3, CO3, 2 * SIZE
  858. addi CO4, CO4, 2 * SIZE
  859. #ifdef TRMMKERNEL
  860. #if ( defined(LEFT) && defined(TRANSA)) || \
  861. (!defined(LEFT) && !defined(TRANSA))
  862. sub TEMP, K, KK
  863. #ifdef LEFT
  864. addi TEMP, TEMP, -2
  865. #else
  866. addi TEMP, TEMP, -4
  867. #endif
  868. slwi r0, TEMP, 1 + BASE_SHIFT
  869. slwi TEMP, TEMP, 2 + BASE_SHIFT
  870. add AO, AO, r0
  871. add BO, BO, TEMP
  872. #endif
  873. #ifdef LEFT
  874. addi KK, KK, 2
  875. #endif
  876. #endif
  877. .align 4
  878. LL(30):
  879. andi. I, M, 1
  880. ble LL(39)
  881. #if defined(TRMMKERNEL)
  882. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  883. LFD f16, 0 * SIZE(AO)
  884. LFD f17, 1 * SIZE(AO)
  885. LFD f18, 2 * SIZE(AO)
  886. LFD f19, 3 * SIZE(AO)
  887. LFD f20, 0 * SIZE(B)
  888. LFD f21, 1 * SIZE(B)
  889. LFD f22, 2 * SIZE(B)
  890. LFD f23, 3 * SIZE(B)
  891. LFD f24, 4 * SIZE(B)
  892. LFD f25, 5 * SIZE(B)
  893. LFD f26, 6 * SIZE(B)
  894. LFD f27, 7 * SIZE(B)
  895. mr BO, B
  896. #else
  897. slwi r0, KK, 0 + BASE_SHIFT
  898. slwi TEMP, KK, 2 + BASE_SHIFT
  899. add AO, AO, r0
  900. add BO, B, TEMP
  901. LFD f16, 0 * SIZE(AO)
  902. LFD f17, 1 * SIZE(AO)
  903. LFD f18, 2 * SIZE(AO)
  904. LFD f19, 3 * SIZE(AO)
  905. LFD f20, 0 * SIZE(BO)
  906. LFD f21, 1 * SIZE(BO)
  907. LFD f22, 2 * SIZE(BO)
  908. LFD f23, 3 * SIZE(BO)
  909. LFD f24, 4 * SIZE(BO)
  910. LFD f25, 5 * SIZE(BO)
  911. LFD f26, 6 * SIZE(BO)
  912. LFD f27, 7 * SIZE(BO)
  913. #endif
  914. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  915. sub TEMP, K, KK
  916. #elif defined(LEFT)
  917. addi TEMP, KK, 1
  918. #else
  919. addi TEMP, KK, 4
  920. #endif
  921. srawi. TEMP, TEMP, 2
  922. mtspr CTR, TEMP
  923. #else
  924. LFD f16, 0 * SIZE(AO)
  925. LFD f17, 1 * SIZE(AO)
  926. LFD f18, 2 * SIZE(AO)
  927. LFD f19, 3 * SIZE(AO)
  928. LFD f20, 0 * SIZE(B)
  929. LFD f21, 1 * SIZE(B)
  930. LFD f22, 2 * SIZE(B)
  931. LFD f23, 3 * SIZE(B)
  932. LFD f24, 4 * SIZE(B)
  933. LFD f25, 5 * SIZE(B)
  934. LFD f26, 6 * SIZE(B)
  935. LFD f27, 7 * SIZE(B)
  936. srawi. r0, K, 2
  937. mtspr CTR, r0
  938. mr BO, B
  939. #endif
  940. ble LL(35)
  941. .align 5
  942. LL(32):
  943. FMADD f0, f16, f20, f0
  944. FMADD f4, f16, f21, f4
  945. FMADD f8, f16, f22, f8
  946. FMADD f12, f16, f23, f12
  947. LFD f20, 8 * SIZE(BO)
  948. LFD f21, 9 * SIZE(BO)
  949. LFD f22, 10 * SIZE(BO)
  950. LFD f23, 11 * SIZE(BO)
  951. FMADD f0, f17, f24, f0
  952. FMADD f4, f17, f25, f4
  953. FMADD f8, f17, f26, f8
  954. FMADD f12, f17, f27, f12
  955. LFD f24, 12 * SIZE(BO)
  956. LFD f25, 13 * SIZE(BO)
  957. LFD f26, 14 * SIZE(BO)
  958. LFD f27, 15 * SIZE(BO)
  959. FMADD f0, f18, f20, f0
  960. FMADD f4, f18, f21, f4
  961. FMADD f8, f18, f22, f8
  962. FMADD f12, f18, f23, f12
  963. LFD f20, 16 * SIZE(BO)
  964. LFD f21, 17 * SIZE(BO)
  965. LFD f22, 18 * SIZE(BO)
  966. LFD f23, 19 * SIZE(BO)
  967. FMADD f0, f19, f24, f0
  968. FMADD f4, f19, f25, f4
  969. FMADD f8, f19, f26, f8
  970. FMADD f12, f19, f27, f12
  971. LFD f16, 4 * SIZE(AO)
  972. LFD f17, 5 * SIZE(AO)
  973. LFD f18, 6 * SIZE(AO)
  974. LFD f19, 7 * SIZE(AO)
  975. LFD f24, 20 * SIZE(BO)
  976. LFD f25, 21 * SIZE(BO)
  977. LFD f26, 22 * SIZE(BO)
  978. LFD f27, 23 * SIZE(BO)
  979. addi AO, AO, 4 * SIZE
  980. addi BO, BO, 16 * SIZE
  981. bdnz LL(32)
  982. .align 4
  983. LL(35):
  984. lfd f30, ALPHA
  985. #if defined(TRMMKERNEL)
  986. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  987. sub TEMP, K, KK
  988. #elif defined(LEFT)
  989. addi TEMP, KK, 1
  990. #else
  991. addi TEMP, KK, 4
  992. #endif
  993. andi. TEMP, TEMP, 3
  994. mtspr CTR, TEMP
  995. #else
  996. andi. r0, K, 3
  997. mtspr CTR, r0
  998. #endif
  999. ble+ LL(38)
  1000. .align 4
  1001. LL(36):
  1002. FMADD f0, f16, f20, f0
  1003. FMADD f4, f16, f21, f4
  1004. FMADD f8, f16, f22, f8
  1005. FMADD f12, f16, f23, f12
  1006. LFD f16, 1 * SIZE(AO)
  1007. LFD f20, 4 * SIZE(BO)
  1008. LFD f21, 5 * SIZE(BO)
  1009. LFD f22, 6 * SIZE(BO)
  1010. LFD f23, 7 * SIZE(BO)
  1011. addi BO, BO, 4 * SIZE
  1012. addi AO, AO, 1 * SIZE
  1013. bdnz LL(36)
  1014. .align 4
  1015. LL(38):
  1016. #ifndef TRMMKERNEL
  1017. LFD f16, 0 * SIZE(CO1)
  1018. LFD f18, 0 * SIZE(CO2)
  1019. LFD f20, 0 * SIZE(CO3)
  1020. LFD f22, 0 * SIZE(CO4)
  1021. FMADD f0, f0, f30, f16
  1022. FMADD f4, f4, f30, f18
  1023. FMADD f8, f8, f30, f20
  1024. FMADD f12, f12, f30, f22
  1025. #else
  1026. FMUL f0, f0, f30
  1027. FMUL f4, f4, f30
  1028. FMUL f8, f8, f30
  1029. FMUL f12, f12, f30
  1030. #endif
  1031. STFD f0, 0 * SIZE(CO1)
  1032. STFD f4, 0 * SIZE(CO2)
  1033. STFD f8, 0 * SIZE(CO3)
  1034. STFD f12, 0 * SIZE(CO4)
  1035. lfs f0, FZERO
  1036. fmr f1, f0
  1037. fmr f4, f0
  1038. fmr f5, f0
  1039. fmr f8, f0
  1040. fmr f9, f0
  1041. fmr f12, f0
  1042. fmr f13, f0
  1043. #ifdef TRMMKERNEL
  1044. #if ( defined(LEFT) && defined(TRANSA)) || \
  1045. (!defined(LEFT) && !defined(TRANSA))
  1046. sub TEMP, K, KK
  1047. #ifdef LEFT
  1048. addi TEMP, TEMP, -1
  1049. #else
  1050. addi TEMP, TEMP, -4
  1051. #endif
  1052. slwi r0, TEMP, 0 + BASE_SHIFT
  1053. slwi TEMP, TEMP, 2 + BASE_SHIFT
  1054. add AO, AO, r0
  1055. add BO, BO, TEMP
  1056. #endif
  1057. #ifdef LEFT
  1058. addi KK, KK, 2
  1059. #endif
  1060. #endif
  1061. .align 4
  1062. LL(39):
  1063. #if defined(TRMMKERNEL) && !defined(LEFT)
  1064. addi KK, KK, 4
  1065. #endif
  1066. lfs f0, FZERO
  1067. mr B, BO
  1068. addic. J, J, -1
  1069. bgt LL(10)
  1070. .align 4
  1071. LL(40):
  1072. mr CO1, C
  1073. add CO2, C, LDC
  1074. andi. J, N, 2
  1075. ble LL(70)
  1076. #if defined(TRMMKERNEL) && defined(LEFT)
  1077. mr KK, OFFSET
  1078. #endif
  1079. lfs f0, FZERO
  1080. fmr f1, f0
  1081. fmr f2, f0
  1082. fmr f3, f0
  1083. fmr f4, f0
  1084. fmr f5, f0
  1085. fmr f6, f0
  1086. fmr f7, f0
  1087. srawi. I, M, 2
  1088. add C, CO2, LDC
  1089. mr AO, A
  1090. ble LL(50)
  1091. .align 4
  1092. LL(41):
  1093. #if defined(TRMMKERNEL)
  1094. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1095. LFD f16, 0 * SIZE(AO)
  1096. LFD f17, 1 * SIZE(AO)
  1097. LFD f18, 2 * SIZE(AO)
  1098. LFD f19, 3 * SIZE(AO)
  1099. LFD f20, 0 * SIZE(B)
  1100. LFD f21, 1 * SIZE(B)
  1101. LFD f22, 2 * SIZE(B)
  1102. LFD f23, 3 * SIZE(B)
  1103. mr BO, B
  1104. #else
  1105. slwi r0, KK, 2 + BASE_SHIFT
  1106. slwi TEMP, KK, 1 + BASE_SHIFT
  1107. add AO, AO, r0
  1108. add BO, B, TEMP
  1109. LFD f16, 0 * SIZE(AO)
  1110. LFD f17, 1 * SIZE(AO)
  1111. LFD f18, 2 * SIZE(AO)
  1112. LFD f19, 3 * SIZE(AO)
  1113. LFD f20, 0 * SIZE(BO)
  1114. LFD f21, 1 * SIZE(BO)
  1115. LFD f22, 2 * SIZE(BO)
  1116. LFD f23, 3 * SIZE(BO)
  1117. #endif
  1118. dcbtst CO1, PREC
  1119. dcbtst CO2, PREC
  1120. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1121. sub TEMP, K, KK
  1122. #elif defined(LEFT)
  1123. addi TEMP, KK, 4
  1124. #else
  1125. addi TEMP, KK, 2
  1126. #endif
  1127. srawi. TEMP, TEMP, 2
  1128. mtspr CTR, TEMP
  1129. #else
  1130. LFD f16, 0 * SIZE(AO)
  1131. LFD f17, 1 * SIZE(AO)
  1132. LFD f18, 2 * SIZE(AO)
  1133. LFD f19, 3 * SIZE(AO)
  1134. LFD f20, 0 * SIZE(B)
  1135. LFD f21, 1 * SIZE(B)
  1136. LFD f22, 2 * SIZE(B)
  1137. LFD f23, 3 * SIZE(B)
  1138. dcbtst CO1, PREC
  1139. dcbtst CO2, PREC
  1140. srawi. r0, K, 2
  1141. mtspr CTR, r0
  1142. mr BO, B
  1143. #endif
  1144. ble LL(45)
  /* LL(42): main k loop of the 4x2 tile, four k-steps per iteration.    */
  /* f0-f3 accumulate the four A values times the first B value of each  */
  /* step, f4-f7 the same A values times the second B value.             */
  /* B values alternate between (f20,f21) and (f22,f23); A values are    */
  /* reloaded into f16-f19 between steps. The loads at the bottom fetch  */
  /* the operands for the next iteration before AO/BO are advanced, so   */
  /* after the last pass f16-f21 already hold the operands for LL(46).   */
  /* NOTE(review): that final preload reads past the data consumed by    */
  /* this loop - presumably the packed buffers allow it; confirm.        */
  1145. .align 5
  1146. LL(42):
  /* k-step 0: A[0..3] x B[0..1] */
  1147. FMADD f0, f16, f20, f0
  1148. FMADD f1, f17, f20, f1
  1149. FMADD f2, f18, f20, f2
  1150. FMADD f3, f19, f20, f3
  1151. FMADD f4, f16, f21, f4
  1152. FMADD f5, f17, f21, f5
  1153. FMADD f6, f18, f21, f6
  1154. FMADD f7, f19, f21, f7
  1155. LFD f16, 4 * SIZE(AO)
  1156. LFD f17, 5 * SIZE(AO)
  1157. LFD f18, 6 * SIZE(AO)
  1158. LFD f19, 7 * SIZE(AO)
  /* k-step 1: A[4..7] x B[2..3] */
  1159. FMADD f0, f16, f22, f0
  1160. FMADD f1, f17, f22, f1
  1161. FMADD f2, f18, f22, f2
  1162. FMADD f3, f19, f22, f3
  1163. FMADD f4, f16, f23, f4
  1164. FMADD f5, f17, f23, f5
  1165. FMADD f6, f18, f23, f6
  1166. FMADD f7, f19, f23, f7
  1167. LFD f16, 8 * SIZE(AO)
  1168. LFD f17, 9 * SIZE(AO)
  1169. LFD f18, 10 * SIZE(AO)
  1170. LFD f19, 11 * SIZE(AO)
  1171. LFD f20, 4 * SIZE(BO)
  1172. LFD f21, 5 * SIZE(BO)
  1173. LFD f22, 6 * SIZE(BO)
  1174. LFD f23, 7 * SIZE(BO)
  /* k-step 2: A[8..11] x B[4..5] */
  1175. FMADD f0, f16, f20, f0
  1176. FMADD f1, f17, f20, f1
  1177. FMADD f2, f18, f20, f2
  1178. FMADD f3, f19, f20, f3
  1179. FMADD f4, f16, f21, f4
  1180. FMADD f5, f17, f21, f5
  1181. FMADD f6, f18, f21, f6
  1182. FMADD f7, f19, f21, f7
  1183. LFD f16, 12 * SIZE(AO)
  1184. LFD f17, 13 * SIZE(AO)
  1185. LFD f18, 14 * SIZE(AO)
  1186. LFD f19, 15 * SIZE(AO)
  /* k-step 3: A[12..15] x B[6..7] */
  1187. FMADD f0, f16, f22, f0
  1188. FMADD f1, f17, f22, f1
  1189. FMADD f2, f18, f22, f2
  1190. FMADD f3, f19, f22, f3
  1191. FMADD f4, f16, f23, f4
  1192. FMADD f5, f17, f23, f5
  1193. FMADD f6, f18, f23, f6
  1194. FMADD f7, f19, f23, f7
  /* Preload operands for the next iteration (or for the tail loop). */
  1195. LFD f16, 16 * SIZE(AO)
  1196. LFD f17, 17 * SIZE(AO)
  1197. LFD f18, 18 * SIZE(AO)
  1198. LFD f19, 19 * SIZE(AO)
  1199. LFD f20, 8 * SIZE(BO)
  1200. LFD f21, 9 * SIZE(BO)
  1201. LFD f22, 10 * SIZE(BO)
  1202. LFD f23, 11 * SIZE(BO)
  /* Consumed 4 steps: 16 A values and 8 B values. */
  1203. addi AO, AO, 16 * SIZE
  1204. addi BO, BO, 8 * SIZE
  1205. bdnz LL(42)
  /* LL(45): load alpha into f30 and set CTR to the leftover k-steps     */
  /* (step count & 3) of the 4x2 tile. The step count is recomputed the  */
  /* same way as in LL(41).                                              */
  1206. .align 4
  1207. LL(45):
  1208. lfd f30, ALPHA
  1209. #if defined(TRMMKERNEL)
  1210. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1211. sub TEMP, K, KK
  1212. #elif defined(LEFT)
  1213. addi TEMP, KK, 4
  1214. #else
  1215. addi TEMP, KK, 2
  1216. #endif
  1217. andi. TEMP, TEMP, 3
  1218. mtspr CTR, TEMP
  1219. #else
  1220. andi. r0, K, 3
  1221. mtspr CTR, r0
  1222. #endif
  /* andi. yields 0..3, so ble is taken only when no steps remain;      */
  /* the '+' suffix is a static predict-taken hint.                     */
  1223. ble+ LL(48)
  /* LL(46): tail loop of the 4x2 tile, one k-step per iteration.        */
  /* Uses the operands already in f16-f21, then loads the next step's    */
  /* 4 A values and 2 B values before advancing AO by 4 and BO by 2.     */
  1224. .align 4
  1225. LL(46):
  1226. FMADD f0, f16, f20, f0
  1227. FMADD f1, f17, f20, f1
  1228. FMADD f2, f18, f20, f2
  1229. FMADD f3, f19, f20, f3
  1230. FMADD f4, f16, f21, f4
  1231. FMADD f5, f17, f21, f5
  1232. FMADD f6, f18, f21, f6
  1233. FMADD f7, f19, f21, f7
  1234. LFD f16, 4 * SIZE(AO)
  1235. LFD f17, 5 * SIZE(AO)
  1236. LFD f18, 6 * SIZE(AO)
  1237. LFD f19, 7 * SIZE(AO)
  1238. LFD f20, 2 * SIZE(BO)
  1239. LFD f21, 3 * SIZE(BO)
  1240. addi BO, BO, 2 * SIZE
  1241. addi AO, AO, 4 * SIZE
  1242. bdnz LL(46)
  /* LL(48): write back the 4x2 tile. Without TRMMKERNEL each result is  */
  /* acc * alpha + C (C is read from CO1/CO2 first); with TRMMKERNEL it  */
  /* is acc * alpha only. The accumulators are re-zeroed between the     */
  /* stores so the next tile starts from 0, then CO1/CO2 move on by 4.   */
  1243. .align 4
  1244. LL(48):
  1245. #ifndef TRMMKERNEL
  1246. LFD f16, 0 * SIZE(CO1)
  1247. LFD f17, 1 * SIZE(CO1)
  1248. LFD f18, 2 * SIZE(CO1)
  1249. LFD f19, 3 * SIZE(CO1)
  1250. LFD f20, 0 * SIZE(CO2)
  1251. LFD f21, 1 * SIZE(CO2)
  1252. LFD f22, 2 * SIZE(CO2)
  1253. LFD f23, 3 * SIZE(CO2)
  1254. FMADD f0, f0, f30, f16
  1255. FMADD f1, f1, f30, f17
  1256. FMADD f2, f2, f30, f18
  1257. FMADD f3, f3, f30, f19
  1258. FMADD f4, f4, f30, f20
  1259. FMADD f5, f5, f30, f21
  1260. FMADD f6, f6, f30, f22
  1261. FMADD f7, f7, f30, f23
  1262. #else
  1263. FMUL f0, f0, f30
  1264. FMUL f1, f1, f30
  1265. FMUL f2, f2, f30
  1266. FMUL f3, f3, f30
  1267. FMUL f4, f4, f30
  1268. FMUL f5, f5, f30
  1269. FMUL f6, f6, f30
  1270. FMUL f7, f7, f30
  1271. #endif
  1272. STFD f0, 0 * SIZE(CO1)
  1273. STFD f1, 1 * SIZE(CO1)
  1274. STFD f2, 2 * SIZE(CO1)
  1275. STFD f3, 3 * SIZE(CO1)
  /* f0-f3 are stored; safe to clear them while f4-f7 are still pending. */
  1276. lfs f0, FZERO
  1277. fmr f1, f0
  1278. fmr f2, f0
  1279. fmr f3, f0
  1280. STFD f4, 0 * SIZE(CO2)
  1281. STFD f5, 1 * SIZE(CO2)
  1282. STFD f6, 2 * SIZE(CO2)
  1283. STFD f7, 3 * SIZE(CO2)
  1284. fmr f4, f0
  1285. fmr f5, f0
  1286. fmr f6, f0
  1287. fmr f7, f0
  1288. addi CO1, CO1, 4 * SIZE
  1289. addi CO2, CO2, 4 * SIZE
  /* TRMM only: in the variants below, move AO/BO past the k-steps this  */
  /* tile did not visit: TEMP = K - KK - 4 (LEFT) or K - KK - 2, scaled  */
  /* by 4 values per step for AO and 2 values per step for BO.           */
  /* With LEFT, KK then grows by the 4 rows just finished.               */
  1290. #ifdef TRMMKERNEL
  1291. #if ( defined(LEFT) && defined(TRANSA)) || \
  1292. (!defined(LEFT) && !defined(TRANSA))
  1293. sub TEMP, K, KK
  1294. #ifdef LEFT
  1295. addi TEMP, TEMP, -4
  1296. #else
  1297. addi TEMP, TEMP, -2
  1298. #endif
  1299. slwi r0, TEMP, 2 + BASE_SHIFT
  1300. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1301. add AO, AO, r0
  1302. add BO, BO, TEMP
  1303. #endif
  1304. #ifdef LEFT
  1305. addi KK, KK, 4
  1306. #endif
  1307. #endif
  /* Next block of four rows, if any remain. */
  1308. addic. I, I, -1
  1309. bgt+ LL(41)
  /* LL(50): 2x2 tile, executed only when bit 1 of M is set (M & 2).     */
  /* Preloads 4 A values (two k-steps) into f16-f19 and 8 B values       */
  /* (four k-steps) into f20-f27, then sets CTR to the number of         */
  /* 4-step iterations.                                                  */
  1310. .align 4
  1311. LL(50):
  1312. andi. I, M, 2
  1313. ble LL(60)
  1314. #if defined(TRMMKERNEL)
  1315. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1316. LFD f16, 0 * SIZE(AO)
  1317. LFD f17, 1 * SIZE(AO)
  1318. LFD f18, 2 * SIZE(AO)
  1319. LFD f19, 3 * SIZE(AO)
  1320. LFD f20, 0 * SIZE(B)
  1321. LFD f21, 1 * SIZE(B)
  1322. LFD f22, 2 * SIZE(B)
  1323. LFD f23, 3 * SIZE(B)
  1324. LFD f24, 4 * SIZE(B)
  1325. LFD f25, 5 * SIZE(B)
  1326. LFD f26, 6 * SIZE(B)
  1327. LFD f27, 7 * SIZE(B)
  1328. mr BO, B
  1329. #else
  /* Skip KK k-steps: both panels use 2 values per step here, hence the */
  /* identical shift of 1 + BASE_SHIFT for AO and BO.                   */
  1330. slwi r0, KK, 1 + BASE_SHIFT
  1331. slwi TEMP, KK, 1 + BASE_SHIFT
  1332. add AO, AO, r0
  1333. add BO, B, TEMP
  1334. LFD f16, 0 * SIZE(AO)
  1335. LFD f17, 1 * SIZE(AO)
  1336. LFD f18, 2 * SIZE(AO)
  1337. LFD f19, 3 * SIZE(AO)
  1338. LFD f20, 0 * SIZE(BO)
  1339. LFD f21, 1 * SIZE(BO)
  1340. LFD f22, 2 * SIZE(BO)
  1341. LFD f23, 3 * SIZE(BO)
  1342. LFD f24, 4 * SIZE(BO)
  1343. LFD f25, 5 * SIZE(BO)
  1344. LFD f26, 6 * SIZE(BO)
  1345. LFD f27, 7 * SIZE(BO)
  1346. #endif
  /* TEMP = k-steps: K - KK, or KK + 2 (same value for both remaining   */
  /* variants because the tile is 2 wide in both directions).           */
  1347. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1348. sub TEMP, K, KK
  1349. #elif defined(LEFT)
  1350. addi TEMP, KK, 2
  1351. #else
  1352. addi TEMP, KK, 2
  1353. #endif
  1354. srawi. TEMP, TEMP, 2
  1355. mtspr CTR, TEMP
  1356. #else
  1357. LFD f16, 0 * SIZE(AO)
  1358. LFD f17, 1 * SIZE(AO)
  1359. LFD f18, 2 * SIZE(AO)
  1360. LFD f19, 3 * SIZE(AO)
  1361. LFD f20, 0 * SIZE(B)
  1362. LFD f21, 1 * SIZE(B)
  1363. LFD f22, 2 * SIZE(B)
  1364. LFD f23, 3 * SIZE(B)
  1365. LFD f24, 4 * SIZE(B)
  1366. LFD f25, 5 * SIZE(B)
  1367. LFD f26, 6 * SIZE(B)
  1368. LFD f27, 7 * SIZE(B)
  1369. srawi. r0, K, 2
  1370. mtspr CTR, r0
  1371. mr BO, B
  1372. #endif
  /* Fewer than four k-steps: only the tail at LL(55) runs. */
  1373. ble LL(55)
  /* LL(52): main k loop of the 2x2 tile, four k-steps per iteration.    */
  /* f0/f1 accumulate the two A values times the first B value of each   */
  /* step, f2/f3 the same A values times the second B value.             */
  /* Steps 0-1 use B in f20-f23, steps 2-3 use B in f24-f27; each half   */
  /* is refilled for the next iteration right after it has been used.    */
  1374. .align 5
  1375. LL(52):
  /* k-steps 0 and 1 */
  1376. FMADD f0, f16, f20, f0
  1377. FMADD f1, f17, f20, f1
  1378. FMADD f2, f16, f21, f2
  1379. FMADD f3, f17, f21, f3
  1380. FMADD f0, f18, f22, f0
  1381. FMADD f1, f19, f22, f1
  1382. FMADD f2, f18, f23, f2
  1383. FMADD f3, f19, f23, f3
  1384. LFD f16, 4 * SIZE(AO)
  1385. LFD f17, 5 * SIZE(AO)
  1386. LFD f18, 6 * SIZE(AO)
  1387. LFD f19, 7 * SIZE(AO)
  1388. LFD f20, 8 * SIZE(BO)
  1389. LFD f21, 9 * SIZE(BO)
  1390. LFD f22, 10 * SIZE(BO)
  1391. LFD f23, 11 * SIZE(BO)
  /* k-steps 2 and 3 */
  1392. FMADD f0, f16, f24, f0
  1393. FMADD f1, f17, f24, f1
  1394. FMADD f2, f16, f25, f2
  1395. FMADD f3, f17, f25, f3
  1396. FMADD f0, f18, f26, f0
  1397. FMADD f1, f19, f26, f1
  1398. FMADD f2, f18, f27, f2
  1399. FMADD f3, f19, f27, f3
  /* Preload for the next iteration (or for the tail loop LL(56)). */
  1400. LFD f16, 8 * SIZE(AO)
  1401. LFD f17, 9 * SIZE(AO)
  1402. LFD f18, 10 * SIZE(AO)
  1403. LFD f19, 11 * SIZE(AO)
  1404. LFD f24, 12 * SIZE(BO)
  1405. LFD f25, 13 * SIZE(BO)
  1406. LFD f26, 14 * SIZE(BO)
  1407. LFD f27, 15 * SIZE(BO)
  /* Consumed 4 steps: 8 A values and 8 B values. */
  1408. addi AO, AO, 8 * SIZE
  1409. addi BO, BO, 8 * SIZE
  1410. bdnz LL(52)
  /* LL(55): load alpha into f30 and set CTR to the leftover k-steps     */
  /* (step count & 3) of the 2x2 tile; skip to LL(58) when none remain.  */
  1411. .align 4
  1412. LL(55):
  1413. lfd f30, ALPHA
  1414. #if defined(TRMMKERNEL)
  1415. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1416. sub TEMP, K, KK
  1417. #elif defined(LEFT)
  1418. addi TEMP, KK, 2
  1419. #else
  1420. addi TEMP, KK, 2
  1421. #endif
  1422. andi. TEMP, TEMP, 3
  1423. mtspr CTR, TEMP
  1424. #else
  1425. andi. r0, K, 3
  1426. mtspr CTR, r0
  1427. #endif
  1428. ble+ LL(58)
  /* LL(56): tail loop of the 2x2 tile, one k-step per iteration:        */
  /* 2 A values (f16,f17) times 2 B values (f20,f21), then reload both   */
  /* pairs and advance AO and BO by 2 values each.                       */
  1429. .align 4
  1430. LL(56):
  1431. FMADD f0, f16, f20, f0
  1432. FMADD f1, f17, f20, f1
  1433. FMADD f2, f16, f21, f2
  1434. FMADD f3, f17, f21, f3
  1435. LFD f16, 2 * SIZE(AO)
  1436. LFD f17, 3 * SIZE(AO)
  1437. LFD f20, 2 * SIZE(BO)
  1438. LFD f21, 3 * SIZE(BO)
  1439. addi BO, BO, 2 * SIZE
  1440. addi AO, AO, 2 * SIZE
  1441. bdnz LL(56)
  /* LL(58): write back the 2x2 tile: f0,f1 go to CO1[0..1] and f2,f3    */
  /* to CO2[0..1], as acc * alpha + C (plain GEMM) or acc * alpha        */
  /* (TRMMKERNEL). Then clear the accumulators and advance CO1/CO2 by 2. */
  1442. .align 4
  1443. LL(58):
  1444. #ifndef TRMMKERNEL
  1445. LFD f16, 0 * SIZE(CO1)
  1446. LFD f17, 1 * SIZE(CO1)
  1447. LFD f18, 0 * SIZE(CO2)
  1448. LFD f19, 1 * SIZE(CO2)
  1449. FMADD f0, f0, f30, f16
  1450. FMADD f1, f1, f30, f17
  1451. FMADD f2, f2, f30, f18
  1452. FMADD f3, f3, f30, f19
  1453. #else
  1454. FMUL f0, f0, f30
  1455. FMUL f1, f1, f30
  1456. FMUL f2, f2, f30
  1457. FMUL f3, f3, f30
  1458. #endif
  1459. STFD f0, 0 * SIZE(CO1)
  1460. STFD f1, 1 * SIZE(CO1)
  1461. STFD f2, 0 * SIZE(CO2)
  1462. STFD f3, 1 * SIZE(CO2)
  /* NOTE(review): the 2x2 tile only writes f0-f3, so clearing f4-f7    */
  /* here looks redundant (harmless) - confirm before removing.         */
  1463. lfs f0, FZERO
  1464. fmr f1, f0
  1465. fmr f2, f0
  1466. fmr f3, f0
  1467. fmr f4, f0
  1468. fmr f5, f0
  1469. fmr f6, f0
  1470. fmr f7, f0
  1471. addi CO1, CO1, 2 * SIZE
  1472. addi CO2, CO2, 2 * SIZE
  /* TRMM only: advance AO/BO over the K - KK - 2 k-steps not visited   */
  /* (2 values per step on both panels); with LEFT, KK grows by 2.      */
  1473. #ifdef TRMMKERNEL
  1474. #if ( defined(LEFT) && defined(TRANSA)) || \
  1475. (!defined(LEFT) && !defined(TRANSA))
  1476. sub TEMP, K, KK
  1477. #ifdef LEFT
  1478. addi TEMP, TEMP, -2
  1479. #else
  1480. addi TEMP, TEMP, -2
  1481. #endif
  1482. slwi r0, TEMP, 1 + BASE_SHIFT
  1483. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1484. add AO, AO, r0
  1485. add BO, BO, TEMP
  1486. #endif
  1487. #ifdef LEFT
  1488. addi KK, KK, 2
  1489. #endif
  1490. #endif
  /* LL(60): 1x2 tile, executed only when M is odd (M & 1).              */
  /* Preloads 4 A values (four k-steps, one value each) into f16-f19     */
  /* and 8 B values (four k-steps, two values each) into f20-f27, then   */
  /* sets CTR to the number of 4-step iterations.                        */
  1491. .align 4
  1492. LL(60):
  1493. andi. I, M, 1
  1494. ble LL(69)
  1495. #if defined(TRMMKERNEL)
  1496. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1497. LFD f16, 0 * SIZE(AO)
  1498. LFD f17, 1 * SIZE(AO)
  1499. LFD f18, 2 * SIZE(AO)
  1500. LFD f19, 3 * SIZE(AO)
  1501. LFD f20, 0 * SIZE(B)
  1502. LFD f21, 1 * SIZE(B)
  1503. LFD f22, 2 * SIZE(B)
  1504. LFD f23, 3 * SIZE(B)
  1505. LFD f24, 4 * SIZE(B)
  1506. LFD f25, 5 * SIZE(B)
  1507. LFD f26, 6 * SIZE(B)
  1508. LFD f27, 7 * SIZE(B)
  1509. mr BO, B
  1510. #else
  /* Skip KK k-steps: 1 A value per step (shift 0 + BASE_SHIFT) and     */
  /* 2 B values per step (shift 1 + BASE_SHIFT).                        */
  1511. slwi r0, KK, 0 + BASE_SHIFT
  1512. slwi TEMP, KK, 1 + BASE_SHIFT
  1513. add AO, AO, r0
  1514. add BO, B, TEMP
  1515. LFD f16, 0 * SIZE(AO)
  1516. LFD f17, 1 * SIZE(AO)
  1517. LFD f18, 2 * SIZE(AO)
  1518. LFD f19, 3 * SIZE(AO)
  1519. LFD f20, 0 * SIZE(BO)
  1520. LFD f21, 1 * SIZE(BO)
  1521. LFD f22, 2 * SIZE(BO)
  1522. LFD f23, 3 * SIZE(BO)
  1523. LFD f24, 4 * SIZE(BO)
  1524. LFD f25, 5 * SIZE(BO)
  1525. LFD f26, 6 * SIZE(BO)
  1526. LFD f27, 7 * SIZE(BO)
  1527. #endif
  /* TEMP = k-steps: K - KK, KK + 1 (LEFT) or KK + 2. */
  1528. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1529. sub TEMP, K, KK
  1530. #elif defined(LEFT)
  1531. addi TEMP, KK, 1
  1532. #else
  1533. addi TEMP, KK, 2
  1534. #endif
  1535. srawi. TEMP, TEMP, 2
  1536. mtspr CTR, TEMP
  1537. #else
  1538. LFD f16, 0 * SIZE(AO)
  1539. LFD f17, 1 * SIZE(AO)
  1540. LFD f18, 2 * SIZE(AO)
  1541. LFD f19, 3 * SIZE(AO)
  1542. LFD f20, 0 * SIZE(B)
  1543. LFD f21, 1 * SIZE(B)
  1544. LFD f22, 2 * SIZE(B)
  1545. LFD f23, 3 * SIZE(B)
  1546. LFD f24, 4 * SIZE(B)
  1547. LFD f25, 5 * SIZE(B)
  1548. LFD f26, 6 * SIZE(B)
  1549. LFD f27, 7 * SIZE(B)
  1550. srawi. r0, K, 2
  1551. mtspr CTR, r0
  1552. mr BO, B
  1553. #endif
  /* Fewer than four k-steps: only the tail at LL(65) runs. */
  1554. ble LL(65)
  /* LL(62): main k loop of the 1x2 tile, four k-steps per iteration.    */
  /* f0 accumulates A times the first B value of each step, f1 A times   */
  /* the second. f16..f19 hold the A value of steps 0..3; B comes from   */
  /* f20-f23 (steps 0-1) and f24-f27 (steps 2-3).                        */
  1555. .align 5
  1556. LL(62):
  /* k-steps 0 and 1 */
  1557. FMADD f0, f16, f20, f0
  1558. FMADD f1, f16, f21, f1
  1559. FMADD f0, f17, f22, f0
  1560. FMADD f1, f17, f23, f1
  1561. LFD f20, 8 * SIZE(BO)
  1562. LFD f21, 9 * SIZE(BO)
  1563. LFD f22, 10 * SIZE(BO)
  1564. LFD f23, 11 * SIZE(BO)
  /* k-steps 2 and 3 */
  1565. FMADD f0, f18, f24, f0
  1566. FMADD f1, f18, f25, f1
  1567. FMADD f0, f19, f26, f0
  1568. FMADD f1, f19, f27, f1
  /* Preload for the next iteration (or for the tail loop LL(66)). */
  1569. LFD f16, 4 * SIZE(AO)
  1570. LFD f17, 5 * SIZE(AO)
  1571. LFD f18, 6 * SIZE(AO)
  1572. LFD f19, 7 * SIZE(AO)
  1573. LFD f24, 12 * SIZE(BO)
  1574. LFD f25, 13 * SIZE(BO)
  1575. LFD f26, 14 * SIZE(BO)
  1576. LFD f27, 15 * SIZE(BO)
  /* Consumed 4 steps: 4 A values and 8 B values. */
  1577. addi AO, AO, 4 * SIZE
  1578. addi BO, BO, 8 * SIZE
  1579. bdnz LL(62)
  /* LL(65): load alpha into f30 and set CTR to the leftover k-steps     */
  /* (step count & 3) of the 1x2 tile; skip to LL(68) when none remain.  */
  1580. .align 4
  1581. LL(65):
  1582. lfd f30, ALPHA
  1583. #if defined(TRMMKERNEL)
  1584. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1585. sub TEMP, K, KK
  1586. #elif defined(LEFT)
  1587. addi TEMP, KK, 1
  1588. #else
  1589. addi TEMP, KK, 2
  1590. #endif
  1591. andi. TEMP, TEMP, 3
  1592. mtspr CTR, TEMP
  1593. #else
  1594. andi. r0, K, 3
  1595. mtspr CTR, r0
  1596. #endif
  1597. ble+ LL(68)
  /* LL(66): tail loop of the 1x2 tile, one k-step per iteration:        */
  /* one A value (f16) times two B values (f20,f21), then reload and     */
  /* advance AO by 1 value and BO by 2 values.                           */
  1598. .align 4
  1599. LL(66):
  1600. FMADD f0, f16, f20, f0
  1601. FMADD f1, f16, f21, f1
  1602. LFD f16, 1 * SIZE(AO)
  1603. LFD f20, 2 * SIZE(BO)
  1604. LFD f21, 3 * SIZE(BO)
  1605. addi BO, BO, 2 * SIZE
  1606. addi AO, AO, 1 * SIZE
  1607. bdnz LL(66)
  /* LL(68): write back the 1x2 tile: f0 goes to CO1[0], f1 to CO2[0],   */
  /* as acc * alpha + C (plain GEMM) or acc * alpha (TRMMKERNEL).        */
  /* CO1/CO2 are not advanced here; this is the last tile of the pass.   */
  1608. .align 4
  1609. LL(68):
  1610. #ifndef TRMMKERNEL
  1611. LFD f16, 0 * SIZE(CO1)
  1612. LFD f18, 0 * SIZE(CO2)
  1613. FMADD f0, f0, f30, f16
  1614. FMADD f1, f1, f30, f18
  1615. #else
  1616. FMUL f0, f0, f30
  1617. FMUL f1, f1, f30
  1618. #endif
  1619. STFD f0, 0 * SIZE(CO1)
  1620. STFD f1, 0 * SIZE(CO2)
  /* Clears f0, f1, f4 and f5 only; f2/f3 are not written by this tile. */
  1621. lfs f0, FZERO
  1622. fmr f1, f0
  1623. fmr f4, f0
  1624. fmr f5, f0
  /* TRMM only: advance AO/BO over the k-steps not visited              */
  /* (K - KK - 1 with LEFT, K - KK - 2 otherwise; 1 A value and         */
  /* 2 B values per step); with LEFT, KK grows by 1.                    */
  1625. #ifdef TRMMKERNEL
  1626. #if ( defined(LEFT) && defined(TRANSA)) || \
  1627. (!defined(LEFT) && !defined(TRANSA))
  1628. sub TEMP, K, KK
  1629. #ifdef LEFT
  1630. addi TEMP, TEMP, -1
  1631. #else
  1632. addi TEMP, TEMP, -2
  1633. #endif
  1634. slwi r0, TEMP, 0 + BASE_SHIFT
  1635. slwi TEMP, TEMP, 1 + BASE_SHIFT
  1636. add AO, AO, r0
  1637. add BO, BO, TEMP
  1638. #endif
  1639. #ifdef LEFT
  1640. addi KK, KK, 1
  1641. #endif
  1642. #endif
  /* LL(69): end of the two-column pass. For TRMM without LEFT, KK grows */
  /* by the 2 columns just processed. B is then set to BO, i.e. to where */
  /* the B pointer ended up after the last tile.                         */
  1643. .align 4
  1644. LL(69):
  1645. #if defined(TRMMKERNEL) && !defined(LEFT)
  1646. addi KK, KK, 2
  1647. #endif
  1648. mr B, BO
  /* LL(70): single-column pass, executed only when N is odd (N & 1).    */
  /* CO1 takes over C; for TRMM with LEFT, KK restarts from OFFSET.      */
  /* Clears f0-f3 and enters the loop over blocks of four rows           */
  /* (I = M >> 2), skipping to LL(80) when there is no such block.       */
  1649. .align 4
  1650. LL(70):
  1651. mr CO1, C
  1652. andi. J, N, 1
  1653. ble LL(999)
  1654. #if defined(TRMMKERNEL) && defined(LEFT)
  1655. mr KK, OFFSET
  1656. #endif
  1657. lfs f0, FZERO
  1658. fmr f1, f0
  1659. fmr f2, f0
  1660. fmr f3, f0
  /* mr does not alter CR0, so ble still tests the srawi. result. */
  1661. srawi. I, M, 2
  1662. mr AO, A
  1663. ble LL(80)
  /* LL(71): set up one 4x1 tile (4 A values and 1 B value per k step).  */
  /* Preloads 4 A values (one step) into f16-f19 and 4 B values (four    */
  /* steps) into f20-f23, hints the C row with dcbtst, and sets CTR to   */
  /* the number of 4-step iterations.                                    */
  1664. .align 4
  1665. LL(71):
  1666. #if defined(TRMMKERNEL)
  1667. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1668. LFD f16, 0 * SIZE(AO)
  1669. LFD f17, 1 * SIZE(AO)
  1670. LFD f18, 2 * SIZE(AO)
  1671. LFD f19, 3 * SIZE(AO)
  1672. LFD f20, 0 * SIZE(B)
  1673. LFD f21, 1 * SIZE(B)
  1674. LFD f22, 2 * SIZE(B)
  1675. LFD f23, 3 * SIZE(B)
  1676. mr BO, B
  1677. #else
  /* Skip KK k-steps: 4 A values per step (shift 2 + BASE_SHIFT) and    */
  /* 1 B value per step (shift 0 + BASE_SHIFT).                         */
  1678. slwi r0, KK, 2 + BASE_SHIFT
  1679. slwi TEMP, KK, 0 + BASE_SHIFT
  1680. add AO, AO, r0
  1681. add BO, B, TEMP
  1682. LFD f16, 0 * SIZE(AO)
  1683. LFD f17, 1 * SIZE(AO)
  1684. LFD f18, 2 * SIZE(AO)
  1685. LFD f19, 3 * SIZE(AO)
  1686. LFD f20, 0 * SIZE(BO)
  1687. LFD f21, 1 * SIZE(BO)
  1688. LFD f22, 2 * SIZE(BO)
  1689. LFD f23, 3 * SIZE(BO)
  1690. #endif
  1691. dcbtst CO1, PREC
  /* TEMP = k-steps: K - KK, KK + 4 (LEFT) or KK + 1. */
  1692. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1693. sub TEMP, K, KK
  1694. #elif defined(LEFT)
  1695. addi TEMP, KK, 4
  1696. #else
  1697. addi TEMP, KK, 1
  1698. #endif
  1699. srawi. TEMP, TEMP, 2
  1700. mtspr CTR, TEMP
  1701. #else
  1702. LFD f16, 0 * SIZE(AO)
  1703. LFD f17, 1 * SIZE(AO)
  1704. LFD f18, 2 * SIZE(AO)
  1705. LFD f19, 3 * SIZE(AO)
  1706. LFD f20, 0 * SIZE(B)
  1707. LFD f21, 1 * SIZE(B)
  1708. LFD f22, 2 * SIZE(B)
  1709. LFD f23, 3 * SIZE(B)
  1710. dcbtst CO1, PREC
  1711. srawi. r0, K, 2
  1712. mtspr CTR, r0
  1713. mr BO, B
  1714. #endif
  /* Fewer than four k-steps: only the tail at LL(75) runs. */
  1715. ble LL(75)
  /* LL(72): main k loop of the 4x1 tile, four k-steps per iteration.    */
  /* f0-f3 accumulate the four A values of each step times that step's   */
  /* single B value (f20, f21, f22, f23 for steps 0..3). A is reloaded   */
  /* into f16-f19 after every step; B is reloaded once at the bottom.    */
  1716. .align 5
  1717. LL(72):
  /* k-step 0 */
  1718. FMADD f0, f16, f20, f0
  1719. FMADD f1, f17, f20, f1
  1720. FMADD f2, f18, f20, f2
  1721. FMADD f3, f19, f20, f3
  1722. LFD f16, 4 * SIZE(AO)
  1723. LFD f17, 5 * SIZE(AO)
  1724. LFD f18, 6 * SIZE(AO)
  1725. LFD f19, 7 * SIZE(AO)
  /* k-step 1 */
  1726. FMADD f0, f16, f21, f0
  1727. FMADD f1, f17, f21, f1
  1728. FMADD f2, f18, f21, f2
  1729. FMADD f3, f19, f21, f3
  1730. LFD f16, 8 * SIZE(AO)
  1731. LFD f17, 9 * SIZE(AO)
  1732. LFD f18, 10 * SIZE(AO)
  1733. LFD f19, 11 * SIZE(AO)
  /* k-step 2 */
  1734. FMADD f0, f16, f22, f0
  1735. FMADD f1, f17, f22, f1
  1736. FMADD f2, f18, f22, f2
  1737. FMADD f3, f19, f22, f3
  1738. LFD f16, 12 * SIZE(AO)
  1739. LFD f17, 13 * SIZE(AO)
  1740. LFD f18, 14 * SIZE(AO)
  1741. LFD f19, 15 * SIZE(AO)
  /* k-step 3 */
  1742. FMADD f0, f16, f23, f0
  1743. FMADD f1, f17, f23, f1
  1744. FMADD f2, f18, f23, f2
  1745. FMADD f3, f19, f23, f3
  /* Preload for the next iteration (or for the tail loop LL(76)). */
  1746. LFD f16, 16 * SIZE(AO)
  1747. LFD f17, 17 * SIZE(AO)
  1748. LFD f18, 18 * SIZE(AO)
  1749. LFD f19, 19 * SIZE(AO)
  1750. LFD f20, 4 * SIZE(BO)
  1751. LFD f21, 5 * SIZE(BO)
  1752. LFD f22, 6 * SIZE(BO)
  1753. LFD f23, 7 * SIZE(BO)
  /* Consumed 4 steps: 16 A values and 4 B values. */
  1754. addi AO, AO, 16 * SIZE
  1755. addi BO, BO, 4 * SIZE
  1756. bdnz LL(72)
  /* LL(75): load alpha into f30 and set CTR to the leftover k-steps     */
  /* (step count & 3) of the 4x1 tile; skip to LL(78) when none remain.  */
  1757. .align 4
  1758. LL(75):
  1759. lfd f30, ALPHA
  1760. #if defined(TRMMKERNEL)
  1761. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1762. sub TEMP, K, KK
  1763. #elif defined(LEFT)
  1764. addi TEMP, KK, 4
  1765. #else
  1766. addi TEMP, KK, 1
  1767. #endif
  1768. andi. TEMP, TEMP, 3
  1769. mtspr CTR, TEMP
  1770. #else
  1771. andi. r0, K, 3
  1772. mtspr CTR, r0
  1773. #endif
  1774. ble+ LL(78)
  /* LL(76): tail loop of the 4x1 tile, one k-step per iteration:        */
  /* four A values (f16-f19) times one B value (f20), then reload and    */
  /* advance AO by 4 values and BO by 1 value.                           */
  1775. .align 4
  1776. LL(76):
  1777. FMADD f0, f16, f20, f0
  1778. FMADD f1, f17, f20, f1
  1779. FMADD f2, f18, f20, f2
  1780. FMADD f3, f19, f20, f3
  1781. LFD f16, 4 * SIZE(AO)
  1782. LFD f17, 5 * SIZE(AO)
  1783. LFD f18, 6 * SIZE(AO)
  1784. LFD f19, 7 * SIZE(AO)
  1785. LFD f20, 1 * SIZE(BO)
  1786. addi BO, BO, 1 * SIZE
  1787. addi AO, AO, 4 * SIZE
  1788. bdnz LL(76)
  /* LL(78): write back the 4x1 tile to CO1[0..3] as acc * alpha + C     */
  /* (plain GEMM) or acc * alpha (TRMMKERNEL), clear f0-f3, advance CO1  */
  /* by 4 and loop back to LL(71) while row blocks remain.               */
  1789. .align 4
  1790. LL(78):
  1791. #ifndef TRMMKERNEL
  1792. LFD f16, 0 * SIZE(CO1)
  1793. LFD f17, 1 * SIZE(CO1)
  1794. LFD f18, 2 * SIZE(CO1)
  1795. LFD f19, 3 * SIZE(CO1)
  1796. FMADD f0, f0, f30, f16
  1797. FMADD f1, f1, f30, f17
  1798. FMADD f2, f2, f30, f18
  1799. FMADD f3, f3, f30, f19
  1800. #else
  1801. FMUL f0, f0, f30
  1802. FMUL f1, f1, f30
  1803. FMUL f2, f2, f30
  1804. FMUL f3, f3, f30
  1805. #endif
  1806. STFD f0, 0 * SIZE(CO1)
  1807. STFD f1, 1 * SIZE(CO1)
  1808. STFD f2, 2 * SIZE(CO1)
  1809. STFD f3, 3 * SIZE(CO1)
  1810. lfs f0, FZERO
  1811. fmr f1, f0
  1812. fmr f2, f0
  1813. fmr f3, f0
  /* TRMM only: advance AO/BO over the k-steps not visited              */
  /* (K - KK - 4 with LEFT, K - KK - 1 otherwise; 4 A values and        */
  /* 1 B value per step); with LEFT, KK grows by 4.                     */
  1814. #ifdef TRMMKERNEL
  1815. #if ( defined(LEFT) && defined(TRANSA)) || \
  1816. (!defined(LEFT) && !defined(TRANSA))
  1817. sub TEMP, K, KK
  1818. #ifdef LEFT
  1819. addi TEMP, TEMP, -4
  1820. #else
  1821. addi TEMP, TEMP, -1
  1822. #endif
  1823. slwi r0 , TEMP, 2 + BASE_SHIFT
  1824. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1825. add AO, AO, r0
  1826. add BO, BO, TEMP
  1827. #endif
  1828. #ifdef LEFT
  1829. addi KK, KK, 4
  1830. #endif
  1831. #endif
  1832. addi CO1, CO1, 4 * SIZE
  1833. addic. I, I, -1
  1834. bgt+ LL(71)
  /* LL(80): 2x1 tile, executed only when bit 1 of M is set (M & 2).     */
  /* Preloads 4 A values (two k-steps) into f16-f19 and 4 B values       */
  /* (four k-steps) into f20-f23, then sets CTR to the number of         */
  /* 4-step iterations.                                                  */
  1835. .align 4
  1836. LL(80):
  1837. andi. I, M, 2
  1838. ble LL(90)
  1839. #if defined(TRMMKERNEL)
  1840. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1841. LFD f16, 0 * SIZE(AO)
  1842. LFD f17, 1 * SIZE(AO)
  1843. LFD f18, 2 * SIZE(AO)
  1844. LFD f19, 3 * SIZE(AO)
  1845. LFD f20, 0 * SIZE(B)
  1846. LFD f21, 1 * SIZE(B)
  1847. LFD f22, 2 * SIZE(B)
  1848. LFD f23, 3 * SIZE(B)
  1849. mr BO, B
  1850. #else
  /* Skip KK k-steps: 2 A values per step (shift 1 + BASE_SHIFT) and    */
  /* 1 B value per step (shift 0 + BASE_SHIFT).                         */
  1851. slwi r0, KK, 1 + BASE_SHIFT
  1852. slwi TEMP, KK, 0 + BASE_SHIFT
  1853. add AO, AO, r0
  1854. add BO, B, TEMP
  1855. LFD f16, 0 * SIZE(AO)
  1856. LFD f17, 1 * SIZE(AO)
  1857. LFD f18, 2 * SIZE(AO)
  1858. LFD f19, 3 * SIZE(AO)
  1859. LFD f20, 0 * SIZE(BO)
  1860. LFD f21, 1 * SIZE(BO)
  1861. LFD f22, 2 * SIZE(BO)
  1862. LFD f23, 3 * SIZE(BO)
  1863. #endif
  /* TEMP = k-steps: K - KK, KK + 2 (LEFT) or KK + 1. */
  1864. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1865. sub TEMP, K, KK
  1866. #elif defined(LEFT)
  1867. addi TEMP, KK, 2
  1868. #else
  1869. addi TEMP, KK, 1
  1870. #endif
  1871. srawi. TEMP, TEMP, 2
  1872. mtspr CTR, TEMP
  1873. #else
  1874. LFD f16, 0 * SIZE(AO)
  1875. LFD f17, 1 * SIZE(AO)
  1876. LFD f18, 2 * SIZE(AO)
  1877. LFD f19, 3 * SIZE(AO)
  1878. LFD f20, 0 * SIZE(B)
  1879. LFD f21, 1 * SIZE(B)
  1880. LFD f22, 2 * SIZE(B)
  1881. LFD f23, 3 * SIZE(B)
  1882. srawi. r0, K, 2
  1883. mtspr CTR, r0
  1884. mr BO, B
  1885. #endif
  /* Fewer than four k-steps: only the tail at LL(85) runs. */
  1886. ble LL(85)
  /* LL(82): main k loop of the 2x1 tile, four k-steps per iteration.    */
  /* f0/f1 accumulate the two A values of each step times that step's    */
  /* single B value (f20..f23 for steps 0..3). f16,f17 hold A for even   */
  /* steps and f18,f19 for odd steps.                                    */
  1887. .align 5
  1888. LL(82):
  /* k-steps 0 and 1 */
  1889. FMADD f0, f16, f20, f0
  1890. FMADD f1, f17, f20, f1
  1891. FMADD f0, f18, f21, f0
  1892. FMADD f1, f19, f21, f1
  1893. LFD f16, 4 * SIZE(AO)
  1894. LFD f17, 5 * SIZE(AO)
  1895. LFD f18, 6 * SIZE(AO)
  1896. LFD f19, 7 * SIZE(AO)
  /* k-steps 2 and 3 */
  1897. FMADD f0, f16, f22, f0
  1898. FMADD f1, f17, f22, f1
  1899. FMADD f0, f18, f23, f0
  1900. FMADD f1, f19, f23, f1
  /* Preload for the next iteration (or for the tail loop LL(86)). */
  1901. LFD f16, 8 * SIZE(AO)
  1902. LFD f17, 9 * SIZE(AO)
  1903. LFD f18, 10 * SIZE(AO)
  1904. LFD f19, 11 * SIZE(AO)
  1905. LFD f20, 4 * SIZE(BO)
  1906. LFD f21, 5 * SIZE(BO)
  1907. LFD f22, 6 * SIZE(BO)
  1908. LFD f23, 7 * SIZE(BO)
  /* Consumed 4 steps: 8 A values and 4 B values. */
  1909. addi AO, AO, 8 * SIZE
  1910. addi BO, BO, 4 * SIZE
  1911. bdnz LL(82)
  /* LL(85): load alpha into f30 and set CTR to the leftover k-steps     */
  /* (step count & 3) of the 2x1 tile; skip to LL(88) when none remain.  */
  1912. .align 4
  1913. LL(85):
  1914. lfd f30, ALPHA
  1915. #if defined(TRMMKERNEL)
  1916. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1917. sub TEMP, K, KK
  1918. #elif defined(LEFT)
  1919. addi TEMP, KK, 2
  1920. #else
  1921. addi TEMP, KK, 1
  1922. #endif
  1923. andi. TEMP, TEMP, 3
  1924. mtspr CTR, TEMP
  1925. #else
  1926. andi. r0, K, 3
  1927. mtspr CTR, r0
  1928. #endif
  1929. ble+ LL(88)
  /* LL(86): tail loop of the 2x1 tile, one k-step per iteration:        */
  /* two A values (f16,f17) times one B value (f20), then reload and     */
  /* advance AO by 2 values and BO by 1 value.                           */
  1930. .align 4
  1931. LL(86):
  1932. FMADD f0, f16, f20, f0
  1933. FMADD f1, f17, f20, f1
  1934. LFD f16, 2 * SIZE(AO)
  1935. LFD f17, 3 * SIZE(AO)
  1936. LFD f20, 1 * SIZE(BO)
  1937. addi BO, BO, 1 * SIZE
  1938. addi AO, AO, 2 * SIZE
  1939. bdnz LL(86)
  /* LL(88): write back the 2x1 tile to CO1[0..1] as acc * alpha + C     */
  /* (plain GEMM) or acc * alpha (TRMMKERNEL), then advance CO1 by 2.    */
  1940. .align 4
  1941. LL(88):
  1942. #ifndef TRMMKERNEL
  1943. LFD f16, 0 * SIZE(CO1)
  1944. LFD f17, 1 * SIZE(CO1)
  1945. FMADD f0, f0, f30, f16
  1946. FMADD f1, f1, f30, f17
  1947. #else
  1948. FMUL f0, f0, f30
  1949. FMUL f1, f1, f30
  1950. #endif
  1951. STFD f0, 0 * SIZE(CO1)
  1952. STFD f1, 1 * SIZE(CO1)
  /* Only f0 is cleared: the 1x1 tile that may follow accumulates in f0 */
  /* alone, so the stale value left in f1 is never read again.          */
  1953. lfs f0, FZERO
  1954. addi CO1, CO1, 2 * SIZE
  /* TRMM only: advance AO/BO over the k-steps not visited              */
  /* (K - KK - 2 with LEFT, K - KK - 1 otherwise; 2 A values and        */
  /* 1 B value per step); with LEFT, KK grows by 2.                     */
  1955. #ifdef TRMMKERNEL
  1956. #if ( defined(LEFT) && defined(TRANSA)) || \
  1957. (!defined(LEFT) && !defined(TRANSA))
  1958. sub TEMP, K, KK
  1959. #ifdef LEFT
  1960. addi TEMP, TEMP, -2
  1961. #else
  1962. addi TEMP, TEMP, -1
  1963. #endif
  1964. slwi r0 , TEMP, 1 + BASE_SHIFT
  1965. slwi TEMP, TEMP, 0 + BASE_SHIFT
  1966. add AO, AO, r0
  1967. add BO, BO, TEMP
  1968. #endif
  1969. #ifdef LEFT
  1970. addi KK, KK, 2
  1971. #endif
  1972. #endif
  /* LL(90): 1x1 tile (a dot product over k), executed only when M is    */
  /* odd (M & 1). Preloads 4 A values into f16-f19 and 4 B values into   */
  /* f20-f23. Unlike the other tiles, this loop is unrolled by 8, so CTR */
  /* is the step count shifted right by 3.                               */
  1973. .align 4
  1974. LL(90):
  1975. andi. I, M, 1
  1976. ble LL(999)
  1977. #if defined(TRMMKERNEL)
  1978. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1979. LFD f16, 0 * SIZE(AO)
  1980. LFD f17, 1 * SIZE(AO)
  1981. LFD f18, 2 * SIZE(AO)
  1982. LFD f19, 3 * SIZE(AO)
  1983. LFD f20, 0 * SIZE(B)
  1984. LFD f21, 1 * SIZE(B)
  1985. LFD f22, 2 * SIZE(B)
  1986. LFD f23, 3 * SIZE(B)
  1987. mr BO, B
  1988. #else
  /* Skip KK k-steps: 1 value per step on both panels. */
  1989. slwi r0, KK, 0 + BASE_SHIFT
  1990. slwi TEMP, KK, 0 + BASE_SHIFT
  1991. add AO, AO, r0
  1992. add BO, B, TEMP
  1993. LFD f16, 0 * SIZE(AO)
  1994. LFD f17, 1 * SIZE(AO)
  1995. LFD f18, 2 * SIZE(AO)
  1996. LFD f19, 3 * SIZE(AO)
  1997. LFD f20, 0 * SIZE(BO)
  1998. LFD f21, 1 * SIZE(BO)
  1999. LFD f22, 2 * SIZE(BO)
  2000. LFD f23, 3 * SIZE(BO)
  2001. #endif
  /* TEMP = k-steps: K - KK, or KK + 1 in both remaining variants. */
  2002. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2003. sub TEMP, K, KK
  2004. #elif defined(LEFT)
  2005. addi TEMP, KK, 1
  2006. #else
  2007. addi TEMP, KK, 1
  2008. #endif
  2009. srawi. TEMP, TEMP, 3
  2010. mtspr CTR, TEMP
  2011. #else
  2012. LFD f16, 0 * SIZE(AO)
  2013. LFD f17, 1 * SIZE(AO)
  2014. LFD f18, 2 * SIZE(AO)
  2015. LFD f19, 3 * SIZE(AO)
  2016. LFD f20, 0 * SIZE(B)
  2017. LFD f21, 1 * SIZE(B)
  2018. LFD f22, 2 * SIZE(B)
  2019. LFD f23, 3 * SIZE(B)
  2020. srawi. r0, K, 3
  2021. mtspr CTR, r0
  2022. mr BO, B
  2023. #endif
  /* Fewer than eight k-steps: only the tail at LL(95) runs. */
  2024. ble LL(95)
  /* LL(92): main k loop of the 1x1 tile, eight k-steps per iteration.   */
  /* Every product is accumulated into f0 alone, so the FMADDs form one  */
  /* serial dependency chain. Two groups of four steps, each followed by */
  /* a reload of f16-f19 (A) and f20-f23 (B).                            */
  2025. .align 5
  2026. LL(92):
  /* k-steps 0..3 */
  2027. FMADD f0, f16, f20, f0
  2028. FMADD f0, f17, f21, f0
  2029. FMADD f0, f18, f22, f0
  2030. FMADD f0, f19, f23, f0
  2031. LFD f16, 4 * SIZE(AO)
  2032. LFD f17, 5 * SIZE(AO)
  2033. LFD f18, 6 * SIZE(AO)
  2034. LFD f19, 7 * SIZE(AO)
  2035. LFD f20, 4 * SIZE(BO)
  2036. LFD f21, 5 * SIZE(BO)
  2037. LFD f22, 6 * SIZE(BO)
  2038. LFD f23, 7 * SIZE(BO)
  /* k-steps 4..7 */
  2039. FMADD f0, f16, f20, f0
  2040. FMADD f0, f17, f21, f0
  2041. FMADD f0, f18, f22, f0
  2042. FMADD f0, f19, f23, f0
  /* Preload for the next iteration (or for the tail loop LL(96)). */
  2043. LFD f16, 8 * SIZE(AO)
  2044. LFD f17, 9 * SIZE(AO)
  2045. LFD f18, 10 * SIZE(AO)
  2046. LFD f19, 11 * SIZE(AO)
  2047. LFD f20, 8 * SIZE(BO)
  2048. LFD f21, 9 * SIZE(BO)
  2049. LFD f22, 10 * SIZE(BO)
  2050. LFD f23, 11 * SIZE(BO)
  /* Consumed 8 steps: 8 A values and 8 B values. */
  2051. addi AO, AO, 8 * SIZE
  2052. addi BO, BO, 8 * SIZE
  2053. bdnz LL(92)
  /* LL(95): load alpha into f30 and set CTR to the leftover k-steps of  */
  /* the 1x1 tile. The mask is 7 (not 3) to match the unroll-by-8 main   */
  /* loop; skip to LL(98) when none remain.                              */
  2054. .align 4
  2055. LL(95):
  2056. lfd f30, ALPHA
  2057. #if defined(TRMMKERNEL)
  2058. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2059. sub TEMP, K, KK
  2060. #elif defined(LEFT)
  2061. addi TEMP, KK, 1
  2062. #else
  2063. addi TEMP, KK, 1
  2064. #endif
  2065. andi. TEMP, TEMP, 7
  2066. mtspr CTR, TEMP
  2067. #else
  2068. andi. r0, K, 7
  2069. mtspr CTR, r0
  2070. #endif
  2071. ble+ LL(98)
  /* LL(96): tail loop of the 1x1 tile, one k-step per iteration:        */
  /* f0 += f16 * f20, then reload one A and one B value and advance both */
  /* pointers by 1 value.                                                */
  2072. .align 4
  2073. LL(96):
  2074. FMADD f0, f16, f20, f0
  2075. LFD f16, 1 * SIZE(AO)
  2076. LFD f20, 1 * SIZE(BO)
  2077. addi BO, BO, 1 * SIZE
  2078. addi AO, AO, 1 * SIZE
  2079. bdnz LL(96)
  /* LL(98): write back the 1x1 tile to CO1[0] as f0 * alpha + C (plain  */
  /* GEMM) or f0 * alpha (TRMMKERNEL). No pointer or KK bookkeeping      */
  /* follows; execution falls through to the epilogue at LL(999).        */
  2080. .align 4
  2081. LL(98):
  2082. #ifndef TRMMKERNEL
  2083. LFD f16, 0 * SIZE(CO1)
  2084. FMADD f0, f0, f30, f16
  2085. #else
  2086. FMUL f0, f0, f30
  2087. #endif
  2088. STFD f0, 0 * SIZE(CO1)
  /* LL(999): common exit. Sets r3 to 0 (presumably the routine's return */
  /* value - confirm against the caller), reloads f14-f31 and r21-r31    */
  /* (plus r19/r20 when TRMMKERNEL is defined) from the stack frame,     */
  /* releases the frame by adding STACKSIZE to SP and returns with blr.  */
  /* The matching saves are in the prologue, outside this chunk.         */
  2089. .align 4
  2090. LL(999):
  /* addi with rA = 0 uses the literal 0, so this is r3 = 0. */
  2091. addi r3, 0, 0
  /* FP registers: 8-byte slots at offsets 0..136. */
  2092. lfd f14, 0(SP)
  2093. lfd f15, 8(SP)
  2094. lfd f16, 16(SP)
  2095. lfd f17, 24(SP)
  2096. lfd f18, 32(SP)
  2097. lfd f19, 40(SP)
  2098. lfd f20, 48(SP)
  2099. lfd f21, 56(SP)
  2100. lfd f22, 64(SP)
  2101. lfd f23, 72(SP)
  2102. lfd f24, 80(SP)
  2103. lfd f25, 88(SP)
  2104. lfd f26, 96(SP)
  2105. lfd f27, 104(SP)
  2106. lfd f28, 112(SP)
  2107. lfd f29, 120(SP)
  2108. lfd f30, 128(SP)
  2109. lfd f31, 136(SP)
  /* Integer registers: 8-byte slots (ld) in 64-bit builds, 4-byte      */
  /* slots (lwz) otherwise, both starting at offset 144.                */
  2110. #ifdef __64BIT__
  2111. ld r31, 144(SP)
  2112. ld r30, 152(SP)
  2113. ld r29, 160(SP)
  2114. ld r28, 168(SP)
  2115. ld r27, 176(SP)
  2116. ld r26, 184(SP)
  2117. ld r25, 192(SP)
  2118. ld r24, 200(SP)
  2119. ld r23, 208(SP)
  2120. ld r22, 216(SP)
  2121. ld r21, 224(SP)
  2122. #ifdef TRMMKERNEL
  2123. ld r20, 232(SP)
  2124. ld r19, 240(SP)
  2125. #endif
  2126. #else
  2127. lwz r31, 144(SP)
  2128. lwz r30, 148(SP)
  2129. lwz r29, 152(SP)
  2130. lwz r28, 156(SP)
  2131. lwz r27, 160(SP)
  2132. lwz r26, 164(SP)
  2133. lwz r25, 168(SP)
  2134. lwz r24, 172(SP)
  2135. lwz r23, 176(SP)
  2136. lwz r22, 180(SP)
  2137. lwz r21, 184(SP)
  2138. #ifdef TRMMKERNEL
  2139. lwz r20, 188(SP)
  2140. lwz r19, 192(SP)
  2141. #endif
  2142. #endif
  2143. addi SP, SP, STACKSIZE
  2144. blr
  /* EPILOGUE is a macro defined outside this chunk. */
  2145. EPILOGUE
  2146. #endif