You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel.S 109 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance for the A and B streams. Further down it is used as */
  /* (PREFETCHSIZE + 0) * SIZE, added to AOFFSET / BOFFSET to form PREA / PREB. */
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE (16 * 8)
  42. #else
  43. #define PREFETCHSIZE (32 * 8)
  44. #endif
  /* PREC is started at C1 + CPREFETCHSIZE * SIZE; CPREFETCH is the lfetch */
  /* variant issued on PREC before the inner loop. */
  45. #define CPREFETCHSIZE 7
  46. #define CPREFETCH lfetch.excl.nt1
  /* Stacked registers r32 and up. In the visible code: N >> 2 seeds J, */
  /* M >> 2 seeds I, K seeds the inner trip count L, A seeds AOFFSET, B seeds */
  /* BOFFSET and C seeds the column pointers C1..C4. */
  47. #define M r32
  48. #define N r33
  49. #define K r34
  50. #define A r37
  51. #define B r38
  52. #define C r39
  /* LDC is loaded from the memory stack at SP + 16 in the prologue and then */
  /* shifted left by ZBASE_SHIFT. */
  53. #define LDC r35
  /* Loop counters and running pointers. r15 is first used as the address */
  /* SP + 24 in the prologue before it takes on the role of I. */
  54. #define I r15
  55. #define J r16
  56. #define AOFFSET r17
  57. #define BOFFSET r18
  58. #define TEMP r19
  59. #define L r20
  /* C1..C4 address four consecutive columns of C (stride LDC); C5..C8 are */
  /* set to C1..C4 + 4 * SIZE. */
  60. #define C1 r21
  61. #define C2 r22
  62. #define C3 r23
  63. #define C4 r24
  64. #define C5 r25
  65. #define C6 r26
  66. #define C7 r27
  67. #define C8 r28
  /* Prefetch pointers for A, B and C. */
  68. #define PREA r8
  69. #define PREB r9
  70. #define PREC r10
  71. #define SP r12
  /* Copies of ar.lc, the predicate registers (pr) and ar.pfs taken in the */
  /* prologue. */
  72. #define ARLC r29
  73. #define PR r30
  74. #define ARPFS r31
  /* Multipliers applied to the accumulators when C is updated after the */
  /* inner loop (presumably the real and imaginary parts of alpha - confirm */
  /* against the caller). */
  75. #define ALPHA_R f8
  76. #define ALPHA_I f9
  /* Local registers used by the TRMMKERNEL build: OFFSET is loaded from */
  /* SP + 24, KK is derived from OFFSET, and KK8 = KK << ZBASE_SHIFT. */
  77. #define AORIG loc0
  78. #define KK loc1
  79. #define KK8 loc2
  80. #define OFFSET loc3
  /* Choose add or subtract forms according to which variant macro */
  /* (NN, NT, ... RR) is defined. FMA_A / FMA_B are used in the inner loop */
  /* for every product that involves the second (odd-numbered) register of a */
  /* B pair. */
  81. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  82. defined(CC) || defined(CR) || defined(RC) || defined(RR)
  83. #define FCALC_A FSUB
  84. #define FCALC_B FADD
  85. #define FMA_A FNMA
  86. #define FMA_B FMA
  87. #else
  88. #define FCALC_A FADD
  89. #define FCALC_B FSUB
  90. #define FMA_A FMA
  91. #define FMA_B FNMA
  92. #endif
  /* FCALC_C / FCALC_D are used in the C update after the inner loop. */
  93. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  94. defined(NC) || defined(TC) || defined(NR) || defined(TR)
  95. #define FCALC_C FMA
  96. #define FCALC_D FNMA
  97. #else
  98. #define FCALC_C FNMA
  99. #define FCALC_D FMA
  100. #endif
  /* Routine entry. Fetches LDC (and OFFSET for TRMMKERNEL) from the memory */
  /* stack, saves pr and ar.lc, clears eight accumulators (f64, f65, f80, */
  /* f81, f96, f97, f112, f113), computes J = N >> 2 and branches to .L050 */
  /* when J <= 0. */
  101. PROLOGUE
  102. .prologue
  103. PROFCODE
  104. { .mfi
  105. #ifdef TRMMKERNEL
  /* The TRMM build needs the local registers (loc0..loc3 aliases): */
  /* 8 inputs + 8 locals. */
  106. .save ar.pfs, ARPFS
  107. alloc ARPFS = ar.pfs, 8, 8, 0, 0
  108. #else
  109. nop __LINE__
  110. #endif
  111. mov f64 = f0
  /* r14 = address of the stack slot holding LDC. */
  112. adds r14 = 16, SP
  113. }
  114. { .mfi
  115. nop __LINE__
  116. mov f65 = f0
  /* r15 = address of the stack slot holding OFFSET (read only under */
  /* TRMMKERNEL); r15 is reused as I later. */
  117. adds r15 = 24, SP
  118. }
  119. ;;
  120. { .mfi
  121. ld8 LDC = [r14]
  122. mov f81 = f0
  123. mov PR = pr
  124. }
  125. { .mfi
  126. #ifdef TRMMKERNEL
  127. ld8 OFFSET = [r15]
  128. #else
  129. nop __LINE__
  130. #endif
  131. mov f96 = f0
  /* J = N / 4. */
  132. shr J = N, 2
  133. }
  134. ;;
  135. { .mfi
  /* LDC <<= ZBASE_SHIFT, so LDC can be added directly to the C pointers. */
  136. shladd LDC = LDC, ZBASE_SHIFT, r0
  137. mov f97 = f0
  138. mov AOFFSET = A
  139. }
  140. { .mfi
  141. nop __LINE__
  142. mov f113 = f0
  143. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* KK = -OFFSET. */
  144. sub KK = r0, OFFSET
  145. #endif
  146. }
  147. ;;
  148. .body
  149. { .mfi
  150. nop __LINE__
  151. mov f80 = f0
  /* Keep the caller's ar.lc; the inner loop overwrites it. */
  152. mov ARLC = ar.lc
  153. }
  154. { .mfb
  /* p6 = (0 >= J): nothing to do in this section, go to .L050. */
  155. cmp.ge p6, p0 = 0, J
  156. mov f112 = f0
  157. (p6) br.cond.dpnt .L050
  158. }
  159. ;;
  /* .L010: set up one panel of four C columns. Forms the column pointers */
  /* C1..C4, advances C past the four columns, decrements J, computes */
  /* I = M >> 2 and branches to .L020 when I == 0. */
  160. .align 16
  161. .L010:
  162. { .mmi
  163. mov C1 = C // coffset1 = c + 0 * ldc
  164. add C2 = LDC, C // coffset2 = c + 1 * ldc
  165. shr I = M, 2
  166. }
  167. { .mmi
  168. adds J = -1, J
  169. #if defined(TRMMKERNEL) && defined(LEFT)
  170. mov KK = OFFSET
  171. #else
  172. nop __LINE__
  173. #endif
  174. nop __LINE__
  175. }
  176. ;;
  177. { .mmi
  178. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  179. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc
  180. #ifdef TRMMKERNEL
  /* KK8 = KK << ZBASE_SHIFT. */
  181. shladd KK8 = KK, ZBASE_SHIFT, r0
  182. #else
  183. nop __LINE__
  184. #endif
  185. }
  186. { .mib
  /* p6 = (I == 0): skip ahead to .L020. */
  187. cmp.eq p6, p7 = 0, I
  188. shladd C = LDC, 2, C // coffset += 4 * ldc (shift by 2)
  189. (p6) br.cond.dpnt .L020
  190. }
  191. ;;
  /* .L011: setup for one pass through the .L012 inner loop. */
  /* - Preloads the first A values into f32..f39 and B values into f48..f55. */
  /* - Clears the accumulators not already cleared (the full set is */
  /*   f64-f71, f80-f87, f96-f103, f112-f119). */
  /* - Derives the loop count: L = K + 1 (TRMM: adjusted length + 1), */
  /*   p12 = (bit 0 of L is 0), then ar.lc = (L >> 1) - 1, so the loop body */
  /*   runs (L >> 1) times. */
  /* - Sets p3 true, forms C5..C8 = C1..C4 + 4 * SIZE, prefetches C through */
  /*   PREC and initialises PREA / PREB. */
  192. .align 16
  193. .L011:
  194. #if !defined(TRMMKERNEL) || \
  195. defined(TRMMKERNEL) && \
  196. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  197. { .mfb
  198. LDFPD f48, f49 = [B]
  199. mov f66 = f0
  200. nop __LINE__
  201. }
  202. { .mfb
  /* The first B pair was read straight from B; BOFFSET starts one pair in. */
  203. adds BOFFSET = 2 * SIZE, B
  204. mov f67 = f0
  205. nop __LINE__
  206. }
  207. ;;
  208. #else
  209. { .mfi
  /* Start both streams KK8 * 4 bytes in (shladd by 2). */
  210. shladd BOFFSET = KK8, 2, B
  211. mov f66 = f0
  212. shladd AOFFSET = KK8, 2, AOFFSET
  213. }
  214. ;;
  215. { .mfi
  216. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  217. mov f67 = f0
  218. nop __LINE__
  219. }
  220. ;;
  221. #endif
  222. { .mfi
  223. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  224. mov f82 = f0
  225. #ifndef TRMMKERNEL
  226. nop __LINE__
  227. #else
  /* TRMM: the inner-product length is K - KK or KK + 4, depending on */
  /* LEFT / TRANSA. */
  228. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  229. sub L = K, KK
  230. #elif defined(LEFT)
  231. adds L = 4, KK
  232. #else
  233. adds L = 4, KK
  234. #endif
  235. #endif
  236. }
  237. { .mfi
  238. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  239. mov f83 = f0
  240. adds PREC = CPREFETCHSIZE * SIZE, C1
  241. }
  242. ;;
  243. { .mfi
  244. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  245. mov f98 = f0
  /* L = length + 1, so that L >> 1 below rounds the pass count up. */
  246. #ifndef TRMMKERNEL
  247. adds L = 1, K
  248. #else
  249. adds L = 1, L
  250. #endif
  251. }
  252. { .mfi
  253. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  254. mov f99 = f0
  255. adds C5 = 4 * SIZE, C1
  256. }
  257. ;;
  258. { .mfi
  259. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  260. mov f114 = f0
  /* p12 = (bit 0 of L is 0), i.e. the length is odd; the loop then */
  /* skips its second half on the final pass. */
  261. tbit.z p12, p0 = L, 0
  262. }
  263. { .mfi
  264. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  265. mov f115 = f0
  266. adds C6 = 4 * SIZE, C2
  267. }
  268. ;;
  269. { .mfi
  270. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  271. mov f68 = f0
  /* Number of loop passes (each pass covers up to two k-steps). */
  272. shr L = L, 1
  273. }
  274. { .mfi
  /* setf.d from r0 also produces zero, issued from the M slot of the */
  /* bundle rather than the F slot. */
  275. setf.d f86 = r0
  276. mov f69 = f0
  277. adds C7 = 4 * SIZE, C3
  278. }
  279. ;;
  280. { .mfi
  281. CPREFETCH [PREC], LDC
  282. mov f84 = f0
  /* br.cloop counts ar.lc down to zero, hence passes - 1. */
  283. adds L = -1, L
  284. }
  285. { .mfi
  286. setf.d f87 = r0
  287. mov f85 = f0
  288. adds C8 = 4 * SIZE, C4
  289. }
  290. ;;
  291. { .mfi
  292. CPREFETCH [PREC], LDC
  293. mov f100 = f0
  294. mov ar.lc = L
  295. }
  296. { .mfi
  297. setf.d f102 = r0
  298. mov f101 = f0
  /* p3 = true: the second half of the loop body is enabled by default. */
  299. cmp.eq p3, p0 = r0, r0
  300. }
  301. ;;
  302. { .mfi
  303. CPREFETCH [PREC], LDC
  304. mov f116 = f0
  305. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  306. }
  307. { .mfi
  308. setf.d f103 = r0
  309. mov f117 = f0
  310. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  311. }
  312. ;;
  313. { .mfi
  314. CPREFETCH [PREC]
  315. mov f70 = f0
  316. nop __LINE__
  317. }
  318. { .mmf
  319. setf.d f118 = r0
  320. setf.d f119 = r0
  321. mov f71 = f0
  322. }
  323. ;;
  /* .L012: inner loop, one pass per br.cloop iteration (ar.lc was set in */
  /* .L011). Each pass has 64 numbered steps of two floating-point ops each. */
  /* */
  /* Steps 1-32 multiply the A values in f32..f39 by the B values in */
  /* f48..f55. Steps 33-64 do the same with f40..f47 and f56..f63 and are */
  /* predicated on p3. */
  /* */
  /* Accumulation pattern for an A pair (a0, a1) and a B pair (b0, b1) with */
  /* accumulator pair (even, odd): */
  /*   even += a0 * b0 (FMA)      odd  = FMA_B(a0, b1, odd) */
  /*   odd  += a1 * b0 (FMA)      even = FMA_A(a1, b1, even) */
  /* */
  /* Predicates: */
  /*   p3 - second half enabled. True on entry; when p12 is set it is */
  /*        recomputed as (L != 0), so it is false on the final pass. It */
  /*        also guards the loads of f40..f47 / f56..f63 in steps 3-10. */
  /*   p4 - (L != 0): another pass follows, so reload f32..f39 / f48..f55. */
  /*   p5 - (L == 0): final pass. When neither TRMMKERNEL nor BETAZERO is */
  /*        defined, the current C values are loaded into f72..f79, */
  /*        f88..f95, f104..f111 and f120..f127 through C1..C8, and each */
  /*        pointer is stepped back by 3 * SIZE after its fourth load. */
  /* */
  /* NOTE(review): several bundles in steps 3-16 list only two instructions */
  /* under a three-slot template - confirm the assembler pads them as */
  /* intended. */
  324. .align 16
  325. .L012:
  326. /* 1 */
  327. { .mfi
  328. lfetch.nt1 [PREA], 16 * SIZE
  329. FMA f64 = f32, f48, f64 // A1 * B1
  330. nop __LINE__
  331. }
  332. { .mfb
  /* Odd length only: disable the second half on the final pass. */
  333. (p12) cmp.ne p3, p0 = 0, L
  334. FMA_B f65 = f32, f49, f65 // A1 * B2
  335. nop __LINE__
  336. }
  337. ;;
  338. /* 2 */
  339. { .mfi
  340. lfetch.nt1 [PREB], 16 * SIZE
  341. FMA f80 = f32, f50, f80 // A1 * B3
  342. nop __LINE__
  343. }
  344. { .mfb
  /* p4 = more passes follow, p5 = this is the final pass. */
  345. cmp.ne p4, p5 = 0, L
  346. FMA_B f81 = f32, f51, f81 // A1 * B4
  347. nop __LINE__
  348. }
  349. ;;
  350. /* 3 */
  351. { .mfb
  352. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  353. FMA f96 = f32, f52, f96 // A1 * B5
  354. nop __LINE__
  355. }
  356. { .mfb
  357. FMA_B f97 = f32, f53, f97 // A1 * B6
  358. nop __LINE__
  359. }
  360. ;;
  361. /* 4 */
  362. { .mfb
  363. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  364. FMA f112 = f32, f54, f112 // A1 * B7
  365. nop __LINE__
  366. }
  367. { .mfb
  368. FMA_B f113 = f32, f55, f113 // A1 * B8
  369. nop __LINE__
  370. }
  371. ;;
  372. /* 5 */
  373. { .mfb
  374. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  375. FMA f65 = f33, f48, f65 // A2 * B1
  376. nop __LINE__
  377. }
  378. { .mfb
  379. FMA_A f64 = f33, f49, f64 // A2 * B2
  380. nop __LINE__
  381. }
  382. ;;
  383. /* 6 */
  384. { .mfb
  385. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  386. FMA f81 = f33, f50, f81 // A2 * B3
  387. nop __LINE__
  388. }
  389. { .mfb
  390. FMA_A f80 = f33, f51, f80 // A2 * B4
  391. nop __LINE__
  392. }
  393. ;;
  394. /* 7 */
  395. { .mfb
  396. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  397. FMA f97 = f33, f52, f97 // A2 * B5
  398. nop __LINE__
  399. }
  400. { .mfb
  401. FMA_A f96 = f33, f53, f96 // A2 * B6
  402. nop __LINE__
  403. }
  404. ;;
  405. /* 8 */
  406. { .mfb
  407. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  408. FMA f113 = f33, f54, f113 // A2 * B7
  409. nop __LINE__
  410. }
  411. { .mfb
  412. FMA_A f112 = f33, f55, f112 // A2 * B8
  413. nop __LINE__
  414. }
  415. ;;
  416. /* 9 */
  417. { .mfb
  418. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  419. FMA f66 = f34, f48, f66 // A3 * B1
  420. nop __LINE__
  421. }
  422. { .mfb
  423. FMA_B f67 = f34, f49, f67 // A3 * B2
  424. nop __LINE__
  425. }
  426. ;;
  427. /* 10 */
  428. { .mfb
  429. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  430. FMA f82 = f34, f50, f82 // A3 * B3
  431. nop __LINE__
  432. }
  433. { .mfb
  434. FMA_B f83 = f34, f51, f83 // A3 * B4
  435. nop __LINE__
  436. }
  437. ;;
  438. /* 11 */
  439. { .mfb
  440. FMA f98 = f34, f52, f98 // A3 * B5
  441. nop __LINE__
  442. }
  443. { .mfb
  444. nop __LINE__
  445. FMA_B f99 = f34, f53, f99 // A3 * B6
  446. nop __LINE__
  447. }
  448. ;;
  449. /* 12 */
  450. { .mfb
  451. FMA f114 = f34, f54, f114 // A3 * B7
  452. nop __LINE__
  453. }
  454. { .mfb
  455. nop __LINE__
  456. FMA_B f115 = f34, f55, f115 // A3 * B8
  457. nop __LINE__
  458. }
  459. ;;
  460. /* 13 */
  461. { .mfb
  462. nop __LINE__
  463. FMA f67 = f35, f48, f67 // A4 * B1
  464. }
  465. { .mfb
  466. nop __LINE__
  467. FMA_A f66 = f35, f49, f66 // A4 * B2
  468. nop __LINE__
  469. }
  470. ;;
  471. /* 14 */
  472. { .mfb
  473. FMA f83 = f35, f50, f83 // A4 * B3
  474. nop __LINE__
  475. }
  476. { .mfb
  477. nop __LINE__
  478. FMA_A f82 = f35, f51, f82 // A4 * B4
  479. nop __LINE__
  480. }
  481. ;;
  482. /* 15 */
  483. { .mfb
  484. FMA f99 = f35, f52, f99 // A4 * B5
  485. nop __LINE__
  486. }
  487. { .mfb
  488. nop __LINE__
  489. FMA_A f98 = f35, f53, f98 // A4 * B6
  490. nop __LINE__
  491. }
  492. ;;
  493. /* 16 */
  494. { .mfb
  495. FMA f115 = f35, f54, f115 // A4 * B7
  496. nop __LINE__
  497. }
  498. { .mfb
  499. nop __LINE__
  500. FMA_A f114 = f35, f55, f114 // A4 * B8
  501. nop __LINE__
  502. }
  503. ;;
  504. /* 17 */
  505. { .mfb
  506. nop __LINE__
  507. FMA f68 = f36, f48, f68 // A5 * B1
  508. nop __LINE__
  509. }
  510. { .mfb
  511. nop __LINE__
  512. FMA_B f69 = f36, f49, f69 // A5 * B2
  513. nop __LINE__
  514. }
  515. ;;
  516. /* 18 */
  517. { .mfb
  518. nop __LINE__
  519. FMA f84 = f36, f50, f84 // A5 * B3
  520. nop __LINE__
  521. }
  522. { .mfb
  523. nop __LINE__
  524. FMA_B f85 = f36, f51, f85 // A5 * B4
  525. nop __LINE__
  526. }
  527. ;;
  528. /* 19 */
  529. { .mfb
  530. nop __LINE__
  531. FMA f100 = f36, f52, f100 // A5 * B5
  532. nop __LINE__
  533. }
  534. { .mfb
  535. nop __LINE__
  536. FMA_B f101 = f36, f53, f101 // A5 * B6
  537. nop __LINE__
  538. }
  539. ;;
  540. /* 20 */
  541. { .mfb
  542. nop __LINE__
  543. FMA f116 = f36, f54, f116 // A5 * B7
  544. nop __LINE__
  545. }
  546. { .mfb
  547. nop __LINE__
  548. FMA_B f117 = f36, f55, f117 // A5 * B8
  549. nop __LINE__
  550. }
  551. ;;
  552. /* 21 */
  553. { .mfb
  554. nop __LINE__
  555. FMA f69 = f37, f48, f69 // A6 * B1
  556. nop __LINE__
  557. }
  558. { .mfb
  559. nop __LINE__
  560. FMA_A f68 = f37, f49, f68 // A6 * B2
  561. nop __LINE__
  562. }
  563. ;;
  564. /* 22 */
  565. { .mfb
  566. nop __LINE__
  567. FMA f85 = f37, f50, f85 // A6 * B3
  568. nop __LINE__
  569. }
  570. { .mfb
  571. nop __LINE__
  572. FMA_A f84 = f37, f51, f84 // A6 * B4
  573. nop __LINE__
  574. }
  575. ;;
  576. /* 23 */
  577. { .mfb
  578. nop __LINE__
  579. FMA f101 = f37, f52, f101 // A6 * B5
  580. nop __LINE__
  581. }
  582. { .mfb
  583. nop __LINE__
  584. FMA_A f100 = f37, f53, f100 // A6 * B6
  585. nop __LINE__
  586. }
  587. ;;
  588. /* 24 */
  589. { .mfb
  590. nop __LINE__
  591. FMA f117 = f37, f54, f117 // A6 * B7
  592. nop __LINE__
  593. }
  594. { .mfb
  595. nop __LINE__
  596. FMA_A f116 = f37, f55, f116 // A6 * B8
  597. nop __LINE__
  598. }
  599. ;;
  600. /* 25 */
  601. { .mfb
  602. nop __LINE__
  603. FMA f70 = f38, f48, f70 // A7 * B1
  604. nop __LINE__
  605. }
  606. { .mfb
  607. nop __LINE__
  608. FMA_B f71 = f38, f49, f71 // A7 * B2
  609. nop __LINE__
  610. }
  611. ;;
  612. /* 26 */
  613. { .mfb
  614. nop __LINE__
  615. FMA f86 = f38, f50, f86 // A7 * B3
  616. nop __LINE__
  617. }
  618. { .mfb
  619. nop __LINE__
  620. FMA_B f87 = f38, f51, f87 // A7 * B4
  621. nop __LINE__
  622. }
  623. ;;
  624. /* 27 */
  625. { .mfb
  626. nop __LINE__
  627. FMA f102 = f38, f52, f102 // A7 * B5
  628. nop __LINE__
  629. }
  630. { .mfb
  631. nop __LINE__
  632. FMA_B f103 = f38, f53, f103 // A7 * B6
  633. nop __LINE__
  634. }
  635. ;;
  636. /* 28 */
  637. { .mfb
  638. nop __LINE__
  639. FMA f118 = f38, f54, f118 // A7 * B7
  640. nop __LINE__
  641. }
  642. { .mfb
  643. nop __LINE__
  644. FMA_B f119 = f38, f55, f119 // A7 * B8
  645. nop __LINE__
  646. }
  647. ;;
  648. /* 29 */
  649. { .mfb
  650. nop __LINE__
  651. FMA f71 = f39, f48, f71 // A8 * B1
  652. nop __LINE__
  653. }
  654. { .mfb
  655. nop __LINE__
  656. FMA_A f70 = f39, f49, f70 // A8 * B2
  657. nop __LINE__
  658. }
  659. ;;
  660. /* 30 */
  661. { .mfb
  /* From here on, p4 loads refill f32..f39 / f48..f55 for the next pass. */
  662. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  663. FMA f87 = f39, f50, f87 // A8 * B3
  664. nop __LINE__
  665. }
  666. { .mfb
  667. nop __LINE__
  668. FMA_A f86 = f39, f51, f86 // A8 * B4
  669. nop __LINE__
  670. }
  671. ;;
  672. /* 31 */
  673. { .mfb
  674. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  675. FMA f103 = f39, f52, f103 // A8 * B5
  676. nop __LINE__
  677. }
  678. { .mfb
  679. nop __LINE__
  680. FMA_A f102 = f39, f53, f102 // A8 * B6
  681. nop __LINE__
  682. }
  683. ;;
  684. /* 32 */
  685. { .mfb
  686. nop __LINE__
  687. FMA f119 = f39, f54, f119 // A8 * B7
  688. nop __LINE__
  689. }
  690. { .mfb
  691. nop __LINE__
  692. FMA_A f118 = f39, f55, f118 // A8 * B8
  693. nop __LINE__
  694. }
  695. ;;
  /* Second half: same products on f40..f47 x f56..f63, all under p3. */
  696. /* 33 */
  697. { .mfb
  698. nop __LINE__
  699. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  700. nop __LINE__
  701. }
  702. { .mfb
  703. nop __LINE__
  704. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  705. nop __LINE__
  706. }
  707. ;;
  708. /* 34 */
  709. { .mfb
  710. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  711. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  712. nop __LINE__
  713. }
  714. { .mfb
  715. nop __LINE__
  716. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  717. nop __LINE__
  718. }
  719. ;;
  720. /* 35 */
  721. { .mfb
  722. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  723. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  724. nop __LINE__
  725. }
  726. { .mfb
  727. nop __LINE__
  728. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  729. nop __LINE__
  730. }
  731. ;;
  732. /* 36 */
  733. { .mfb
  734. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  735. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  736. nop __LINE__
  737. }
  738. { .mfb
  739. nop __LINE__
  740. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  741. nop __LINE__
  742. }
  743. ;;
  744. /* 37 */
  745. { .mfb
  746. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  747. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  748. nop __LINE__
  749. }
  750. { .mfb
  751. nop __LINE__
  752. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  753. nop __LINE__
  754. }
  755. ;;
  756. /* 38 */
  757. { .mfb
  758. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  759. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  760. nop __LINE__
  761. }
  762. { .mfb
  763. nop __LINE__
  764. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  765. nop __LINE__
  766. }
  767. ;;
  768. /* 39 */
  769. { .mfb
  770. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  771. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  772. nop __LINE__
  773. }
  774. { .mfb
  775. nop __LINE__
  776. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  777. nop __LINE__
  778. }
  779. ;;
  /* Steps 40-55: on the final pass (p5) fetch the existing C values, four */
  /* per pointer, through C1/C5, C2/C6, C3/C7 and C4/C8. */
  780. /* 40 */
  781. { .mfb
  782. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  783. (p5) LDFD f72 = [C1], SIZE
  784. #else
  785. nop __LINE__
  786. #endif
  787. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  788. nop __LINE__
  789. }
  790. { .mfb
  791. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  792. (p5) LDFD f76 = [C5], SIZE
  793. #else
  794. nop __LINE__
  795. #endif
  796. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  797. nop __LINE__
  798. }
  799. ;;
  800. /* 41 */
  801. { .mfb
  802. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  803. (p5) LDFD f73 = [C1], SIZE
  804. #else
  805. nop __LINE__
  806. #endif
  807. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  808. nop __LINE__
  809. }
  810. { .mfb
  811. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  812. (p5) LDFD f77 = [C5], SIZE
  813. #else
  814. nop __LINE__
  815. #endif
  816. (p3) FMA_B f67 = f42, f57, f67 // A3 * B2
  817. nop __LINE__
  818. }
  819. ;;
  820. /* 42 */
  821. { .mfb
  822. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  823. (p5) LDFD f74 = [C1], SIZE
  824. #else
  825. nop __LINE__
  826. #endif
  827. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  828. nop __LINE__
  829. }
  830. { .mfb
  831. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  832. (p5) LDFD f78 = [C5], SIZE
  833. #else
  834. nop __LINE__
  835. #endif
  836. (p3) FMA_B f83 = f42, f59, f83 // A3 * B4
  837. nop __LINE__
  838. }
  839. ;;
  840. /* 43 */
  841. { .mfb
  842. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  843. (p5) LDFD f75 = [C1], -3 * SIZE
  844. #else
  845. nop __LINE__
  846. #endif
  847. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  848. nop __LINE__
  849. }
  850. { .mfb
  851. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  852. (p5) LDFD f79 = [C5], -3 * SIZE
  853. #else
  854. nop __LINE__
  855. #endif
  856. (p3) FMA_B f99 = f42, f61, f99 // A3 * B6
  857. nop __LINE__
  858. }
  859. ;;
  860. /* 44 */
  861. { .mfb
  862. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  863. (p5) LDFD f88 = [C2], SIZE
  864. #else
  865. nop __LINE__
  866. #endif
  867. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  868. nop __LINE__
  869. }
  870. { .mfb
  871. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  872. (p5) LDFD f92 = [C6], SIZE
  873. #else
  874. nop __LINE__
  875. #endif
  876. (p3) FMA_B f115 = f42, f63, f115 // A3 * B8
  877. nop __LINE__
  878. }
  879. ;;
  880. /* 45 */
  881. { .mfb
  882. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  883. (p5) LDFD f89 = [C2], SIZE
  884. #else
  885. nop __LINE__
  886. #endif
  887. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  888. nop __LINE__
  889. }
  890. { .mfb
  891. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  892. (p5) LDFD f93 = [C6], SIZE
  893. #else
  894. nop __LINE__
  895. #endif
  896. (p3) FMA_A f66 = f43, f57, f66 // A4 * B2
  897. nop __LINE__
  898. }
  899. ;;
  900. /* 46 */
  901. { .mfb
  902. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  903. (p5) LDFD f90 = [C2], SIZE
  904. #else
  905. nop __LINE__
  906. #endif
  907. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  908. nop __LINE__
  909. }
  910. { .mfb
  911. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  912. (p5) LDFD f94 = [C6], SIZE
  913. #else
  914. nop __LINE__
  915. #endif
  916. (p3) FMA_A f82 = f43, f59, f82 // A4 * B4
  917. nop __LINE__
  918. }
  919. ;;
  920. /* 47 */
  921. { .mfb
  922. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  923. (p5) LDFD f91 = [C2], -3 * SIZE
  924. #else
  925. nop __LINE__
  926. #endif
  927. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  928. nop __LINE__
  929. }
  930. { .mfb
  931. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  932. (p5) LDFD f95 = [C6], -3 * SIZE
  933. #else
  934. nop __LINE__
  935. #endif
  936. (p3) FMA_A f98 = f43, f61, f98 // A4 * B6
  937. nop __LINE__
  938. }
  939. ;;
  940. /* 48 */
  941. { .mfb
  942. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  943. (p5) LDFD f104 = [C3], SIZE
  944. #else
  945. nop __LINE__
  946. #endif
  947. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  948. nop __LINE__
  949. }
  950. { .mfb
  951. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  952. (p5) LDFD f108 = [C7], SIZE
  953. #else
  954. nop __LINE__
  955. #endif
  956. (p3) FMA_A f114 = f43, f63, f114 // A4 * B8
  957. nop __LINE__
  958. }
  959. ;;
  960. /* 49 */
  961. { .mfb
  962. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  963. (p5) LDFD f105 = [C3], SIZE
  964. #else
  965. nop __LINE__
  966. #endif
  967. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  968. nop __LINE__
  969. }
  970. { .mfb
  971. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  972. (p5) LDFD f109 = [C7], SIZE
  973. #else
  974. nop __LINE__
  975. #endif
  976. (p3) FMA_B f69 = f44, f57, f69 // A5 * B2
  977. nop __LINE__
  978. }
  979. ;;
  980. /* 50 */
  981. { .mfb
  982. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  983. (p5) LDFD f106 = [C3], SIZE
  984. #else
  985. nop __LINE__
  986. #endif
  987. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  988. nop __LINE__
  989. }
  990. { .mfb
  991. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  992. (p5) LDFD f110 = [C7], SIZE
  993. #else
  994. nop __LINE__
  995. #endif
  996. (p3) FMA_B f85 = f44, f59, f85 // A5 * B4
  997. nop __LINE__
  998. }
  999. ;;
  1000. /* 51 */
  1001. { .mfb
  1002. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1003. (p5) LDFD f107 = [C3], -3 * SIZE
  1004. #else
  1005. nop __LINE__
  1006. #endif
  1007. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  1008. nop __LINE__
  1009. }
  1010. { .mfb
  1011. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1012. (p5) LDFD f111 = [C7], -3 * SIZE
  1013. #else
  1014. nop __LINE__
  1015. #endif
  1016. (p3) FMA_B f101 = f44, f61, f101 // A5 * B6
  1017. nop __LINE__
  1018. }
  1019. ;;
  1020. /* 52 */
  1021. { .mfb
  1022. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1023. (p5) LDFD f120 = [C4], SIZE
  1024. #else
  1025. nop __LINE__
  1026. #endif
  1027. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  1028. nop __LINE__
  1029. }
  1030. { .mfb
  1031. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1032. (p5) LDFD f124 = [C8], SIZE
  1033. #else
  1034. nop __LINE__
  1035. #endif
  1036. (p3) FMA_B f117 = f44, f63, f117 // A5 * B8
  1037. nop __LINE__
  1038. }
  1039. ;;
  1040. /* 53 */
  1041. { .mfb
  1042. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1043. (p5) LDFD f121 = [C4], SIZE
  1044. #else
  1045. nop __LINE__
  1046. #endif
  1047. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  1048. nop __LINE__
  1049. }
  1050. { .mfb
  1051. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1052. (p5) LDFD f125 = [C8], SIZE
  1053. #else
  1054. nop __LINE__
  1055. #endif
  1056. (p3) FMA_A f68 = f45, f57, f68 // A6 * B2
  1057. nop __LINE__
  1058. }
  1059. ;;
  1060. /* 54 */
  1061. { .mfb
  1062. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1063. (p5) LDFD f122 = [C4], SIZE
  1064. #else
  1065. nop __LINE__
  1066. #endif
  1067. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  1068. nop __LINE__
  1069. }
  1070. { .mfb
  1071. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1072. (p5) LDFD f126 = [C8], SIZE
  1073. #else
  1074. nop __LINE__
  1075. #endif
  1076. (p3) FMA_A f84 = f45, f59, f84 // A6 * B4
  1077. nop __LINE__
  1078. }
  1079. ;;
  1080. /* 55 */
  1081. { .mfb
  1082. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1083. (p5) LDFD f123 = [C4], -3 * SIZE
  1084. #else
  1085. nop __LINE__
  1086. #endif
  1087. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  1088. nop __LINE__
  1089. }
  1090. { .mfb
  1091. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1092. (p5) LDFD f127 = [C8], -3 * SIZE
  1093. #else
  1094. nop __LINE__
  1095. #endif
  1096. (p3) FMA_A f100 = f45, f61, f100 // A6 * B6
  1097. nop __LINE__
  1098. }
  1099. ;;
  1100. /* 56 */
  1101. { .mfb
  1102. nop __LINE__
  1103. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  1104. nop __LINE__
  1105. }
  1106. { .mfb
  1107. nop __LINE__
  1108. (p3) FMA_A f116 = f45, f63, f116 // A6 * B8
  1109. nop __LINE__
  1110. }
  1111. ;;
  1112. /* 57 */
  1113. { .mfb
  1114. nop __LINE__
  1115. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  1116. nop __LINE__
  1117. }
  1118. { .mfb
  1119. nop __LINE__
  1120. (p3) FMA_B f71 = f46, f57, f71 // A7 * B2
  1121. nop __LINE__
  1122. }
  1123. ;;
  1124. /* 58 */
  1125. { .mfb
  1126. nop __LINE__
  1127. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  1128. nop __LINE__
  1129. }
  1130. { .mfb
  1131. nop __LINE__
  1132. (p3) FMA_B f87 = f46, f59, f87 // A7 * B4
  1133. nop __LINE__
  1134. }
  1135. ;;
  1136. /* 59 */
  1137. { .mfb
  1138. nop __LINE__
  1139. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  1140. nop __LINE__
  1141. }
  1142. { .mfb
  1143. nop __LINE__
  1144. (p3) FMA_B f103 = f46, f61, f103 // A7 * B6
  1145. nop __LINE__
  1146. }
  1147. ;;
  1148. /* 60 */
  1149. { .mfb
  1150. nop __LINE__
  1151. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  1152. nop __LINE__
  1153. }
  1154. { .mfb
  1155. nop __LINE__
  1156. (p3) FMA_B f119 = f46, f63, f119 // A7 * B8
  1157. nop __LINE__
  1158. }
  1159. ;;
  1160. /* 61 */
  1161. { .mfb
  1162. nop __LINE__
  1163. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  1164. nop __LINE__
  1165. }
  1166. { .mfb
  1167. nop __LINE__
  1168. (p3) FMA_A f70 = f47, f57, f70 // A8 * B2
  1169. nop __LINE__
  1170. }
  1171. ;;
  1172. /* 62 */
  1173. { .mfb
  1174. nop __LINE__
  1175. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  1176. nop __LINE__
  1177. }
  1178. { .mfb
  1179. nop __LINE__
  1180. (p3) FMA_A f86 = f47, f59, f86 // A8 * B4
  1181. nop __LINE__
  1182. }
  1183. ;;
  1184. /* 63 */
  1185. { .mfb
  1186. nop __LINE__
  1187. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  1188. nop __LINE__
  1189. }
  1190. { .mfb
  1191. nop __LINE__
  1192. (p3) FMA_A f102 = f47, f61, f102 // A8 * B6
  1193. nop __LINE__
  1194. }
  1195. ;;
  1196. /* 64 */
  1197. { .mfi
  1198. nop __LINE__
  1199. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  /* L mirrors the remaining pass count; the compares in steps 1-2 of the */
  /* next pass test it against zero. */
  1200. adds L = -1, L
  1201. }
  1202. { .mfb
  1203. nop __LINE__
  1204. (p3) FMA_A f118 = f47, f63, f118 // A8 * B8
  1205. br.cloop.sptk.few .L012
  1206. }
  1207. ;;
  1208. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1209. { .mfb
  1210. nop __LINE__
  1211. FMA f72 = ALPHA_R, f64, f72
  1212. nop __LINE__
  1213. }
  1214. { .mfb
  1215. nop __LINE__
  1216. FMA f76 = ALPHA_R, f68, f76
  1217. nop __LINE__
  1218. }
  1219. ;;
  1220. { .mfb
  1221. nop __LINE__
  1222. FCALC_C f73 = ALPHA_R, f65, f73
  1223. nop __LINE__
  1224. }
  1225. { .mfb
  1226. nop __LINE__
  1227. FCALC_C f77 = ALPHA_R, f69, f77
  1228. nop __LINE__
  1229. }
  1230. ;;
  1231. { .mfb
  1232. nop __LINE__
  1233. FMA f74 = ALPHA_R, f66, f74
  1234. nop __LINE__
  1235. }
  1236. { .mfb
  1237. nop __LINE__
  1238. FMA f78 = ALPHA_R, f70, f78
  1239. nop __LINE__
  1240. }
  1241. ;;
  1242. { .mfb
  1243. nop __LINE__
  1244. FCALC_C f75 = ALPHA_R, f67, f75
  1245. nop __LINE__
  1246. }
  1247. { .mfb
  1248. nop __LINE__
  1249. FCALC_C f79 = ALPHA_R, f71, f79
  1250. nop __LINE__
  1251. }
  1252. ;;
  1253. { .mfb
  1254. nop __LINE__
  1255. FCALC_D f72 = ALPHA_I, f65, f72
  1256. nop __LINE__
  1257. }
  1258. { .mfb
  1259. nop __LINE__
  1260. FCALC_D f76 = ALPHA_I, f69, f76
  1261. nop __LINE__
  1262. }
  1263. ;;
  1264. { .mfb
  1265. nop __LINE__
  1266. FMA f73 = ALPHA_I, f64, f73
  1267. nop __LINE__
  1268. }
  1269. { .mfb
  1270. nop __LINE__
  1271. FMA f77 = ALPHA_I, f68, f77
  1272. nop __LINE__
  1273. }
  1274. ;;
  1275. { .mfb
  1276. nop __LINE__
  1277. FCALC_D f74 = ALPHA_I, f67, f74
  1278. nop __LINE__
  1279. }
  1280. { .mfb
  1281. nop __LINE__
  1282. FCALC_D f78 = ALPHA_I, f71, f78
  1283. nop __LINE__
  1284. }
  1285. ;;
  1286. { .mfb
  1287. nop __LINE__
  1288. FMA f75 = ALPHA_I, f66, f75
  1289. nop __LINE__
  1290. }
  1291. { .mfb
  1292. nop __LINE__
  1293. FMA f79 = ALPHA_I, f70, f79
  1294. nop __LINE__
  1295. }
  1296. ;;
  1297. { .mfb
  1298. STFD [C1] = f72, SIZE
  1299. FMA f88 = ALPHA_R, f80, f88
  1300. nop __LINE__
  1301. }
  1302. { .mfb
  1303. STFD [C5] = f76, SIZE
  1304. FMA f92 = ALPHA_R, f84, f92
  1305. nop __LINE__
  1306. }
  1307. ;;
  1308. { .mfb
  1309. STFD [C1] = f73, SIZE
  1310. FCALC_C f89 = ALPHA_R, f81, f89
  1311. nop __LINE__
  1312. }
  1313. { .mfb
  1314. STFD [C5] = f77, SIZE
  1315. FCALC_C f93 = ALPHA_R, f85, f93
  1316. nop __LINE__
  1317. }
  1318. ;;
  1319. { .mfb
  1320. STFD [C1] = f74, SIZE
  1321. FMA f90 = ALPHA_R, f82, f90
  1322. nop __LINE__
  1323. }
  1324. { .mfb
  1325. STFD [C5] = f78, SIZE
  1326. FMA f94 = ALPHA_R, f86, f94
  1327. nop __LINE__
  1328. }
  1329. ;;
  1330. { .mfb
  1331. STFD [C1] = f75, 5 * SIZE
  1332. FCALC_C f91 = ALPHA_R, f83, f91
  1333. nop __LINE__
  1334. }
  1335. { .mfb
  1336. STFD [C5] = f79, 5 * SIZE
  1337. FCALC_C f95 = ALPHA_R, f87, f95
  1338. nop __LINE__
  1339. }
  1340. ;;
  1341. { .mfb
  1342. nop __LINE__
  1343. FCALC_D f88 = ALPHA_I, f81, f88
  1344. nop __LINE__
  1345. }
  1346. { .mfb
  1347. nop __LINE__
  1348. FCALC_D f92 = ALPHA_I, f85, f92
  1349. nop __LINE__
  1350. }
  1351. ;;
  1352. { .mfb
  1353. nop __LINE__
  1354. FMA f89 = ALPHA_I, f80, f89
  1355. nop __LINE__
  1356. }
  1357. { .mfb
  1358. nop __LINE__
  1359. FMA f93 = ALPHA_I, f84, f93
  1360. nop __LINE__
  1361. }
  1362. ;;
  1363. { .mfb
  1364. nop __LINE__
  1365. FCALC_D f90 = ALPHA_I, f83, f90
  1366. nop __LINE__
  1367. }
  1368. { .mfb
  1369. nop __LINE__
  1370. FCALC_D f94 = ALPHA_I, f87, f94
  1371. nop __LINE__
  1372. }
  1373. ;;
  1374. { .mfb
  1375. nop __LINE__
  1376. FMA f91 = ALPHA_I, f82, f91
  1377. nop __LINE__
  1378. }
  1379. { .mfb
  1380. nop __LINE__
  1381. FMA f95 = ALPHA_I, f86, f95
  1382. nop __LINE__
  1383. }
  1384. ;;
  1385. { .mfb
  1386. STFD [C2] = f88, SIZE
  1387. FMA f104 = ALPHA_R, f96, f104
  1388. nop __LINE__
  1389. }
  1390. { .mfb
  1391. STFD [C6] = f92, SIZE
  1392. FMA f108 = ALPHA_R, f100, f108
  1393. nop __LINE__
  1394. }
  1395. ;;
  1396. { .mfb
  1397. STFD [C2] = f89, SIZE
  1398. FCALC_C f105 = ALPHA_R, f97, f105
  1399. nop __LINE__
  1400. }
  1401. { .mfb
  1402. STFD [C6] = f93, SIZE
  1403. FCALC_C f109 = ALPHA_R, f101, f109
  1404. nop __LINE__
  1405. }
  1406. ;;
  1407. { .mfb
  1408. STFD [C2] = f90, SIZE
  1409. FMA f106 = ALPHA_R, f98, f106
  1410. nop __LINE__
  1411. }
  1412. { .mfb
  1413. STFD [C6] = f94, SIZE
  1414. FMA f110 = ALPHA_R, f102, f110
  1415. nop __LINE__
  1416. }
  1417. ;;
  1418. { .mfb
  1419. STFD [C2] = f91, 5 * SIZE
  1420. FCALC_C f107 = ALPHA_R, f99, f107
  1421. nop __LINE__
  1422. }
  1423. { .mfb
  1424. STFD [C6] = f95, 5 * SIZE
  1425. FCALC_C f111 = ALPHA_R, f103, f111
  1426. nop __LINE__
  1427. }
  1428. ;;
  1429. { .mfb
  1430. nop __LINE__
  1431. FCALC_D f104 = ALPHA_I, f97, f104
  1432. nop __LINE__
  1433. }
  1434. { .mfb
  1435. nop __LINE__
  1436. FCALC_D f108 = ALPHA_I, f101, f108
  1437. nop __LINE__
  1438. }
  1439. ;;
  1440. { .mfb
  1441. nop __LINE__
  1442. FMA f105 = ALPHA_I, f96, f105
  1443. nop __LINE__
  1444. }
  1445. { .mfb
  1446. nop __LINE__
  1447. FMA f109 = ALPHA_I, f100, f109
  1448. nop __LINE__
  1449. }
  1450. ;;
  1451. { .mfb
  1452. nop __LINE__
  1453. FCALC_D f106 = ALPHA_I, f99, f106
  1454. nop __LINE__
  1455. }
  1456. { .mfb
  1457. nop __LINE__
  1458. FCALC_D f110 = ALPHA_I, f103, f110
  1459. nop __LINE__
  1460. }
  1461. ;;
  1462. { .mfb
  1463. nop __LINE__
  1464. FMA f107 = ALPHA_I, f98, f107
  1465. nop __LINE__
  1466. }
  1467. { .mfb
  1468. nop __LINE__
  1469. FMA f111 = ALPHA_I, f102, f111
  1470. nop __LINE__
  1471. }
  1472. ;;
  1473. { .mfb
  1474. STFD [C3] = f104, SIZE
  1475. FMA f120 = ALPHA_R, f112, f120
  1476. nop __LINE__
  1477. }
  1478. { .mfb
  1479. STFD [C7] = f108, SIZE
  1480. FMA f124 = ALPHA_R, f116, f124
  1481. nop __LINE__
  1482. }
  1483. ;;
  1484. { .mfb
  1485. STFD [C3] = f105, SIZE
  1486. FCALC_C f121 = ALPHA_R, f113, f121
  1487. nop __LINE__
  1488. }
  1489. { .mfb
  1490. STFD [C7] = f109, SIZE
  1491. FCALC_C f125 = ALPHA_R, f117, f125
  1492. nop __LINE__
  1493. }
  1494. ;;
  1495. { .mfb
  1496. STFD [C3] = f106, SIZE
  1497. FMA f122 = ALPHA_R, f114, f122
  1498. nop __LINE__
  1499. }
  1500. { .mfb
  1501. STFD [C7] = f110, SIZE
  1502. FMA f126 = ALPHA_R, f118, f126
  1503. nop __LINE__
  1504. }
  1505. ;;
  1506. { .mfb
  1507. STFD [C3] = f107, 5 * SIZE
  1508. FCALC_C f123 = ALPHA_R, f115, f123
  1509. nop __LINE__
  1510. }
  1511. { .mfb
  1512. STFD [C7] = f111, 5 * SIZE
  1513. FCALC_C f127 = ALPHA_R, f119, f127
  1514. nop __LINE__
  1515. }
  1516. ;;
  1517. { .mfb
  1518. nop __LINE__
  1519. FCALC_D f120 = ALPHA_I, f113, f120
  1520. nop __LINE__
  1521. }
  1522. { .mfb
  1523. nop __LINE__
  1524. FCALC_D f124 = ALPHA_I, f117, f124
  1525. nop __LINE__
  1526. }
  1527. ;;
  1528. { .mfb
  1529. nop __LINE__
  1530. FMA f121 = ALPHA_I, f112, f121
  1531. nop __LINE__
  1532. }
  1533. { .mfb
  1534. nop __LINE__
  1535. FMA f125 = ALPHA_I, f116, f125
  1536. nop __LINE__
  1537. }
  1538. ;;
  1539. { .mfb
  1540. nop __LINE__
  1541. FCALC_D f122 = ALPHA_I, f115, f122
  1542. nop __LINE__
  1543. }
  1544. { .mfb
  1545. nop __LINE__
  1546. FCALC_D f126 = ALPHA_I, f119, f126
  1547. nop __LINE__
  1548. }
  1549. ;;
  1550. { .mfi
  1551. nop __LINE__
  1552. FMA f123 = ALPHA_I, f114, f123
  1553. cmp.ne p6, p0 = 1, I
  1554. }
  1555. { .mfb
  1556. nop __LINE__
  1557. FMA f127 = ALPHA_I, f118, f127
  1558. nop __LINE__
  1559. }
  1560. ;;
  1561. { .mfi
  1562. STFD [C4] = f120, SIZE
  1563. mov f64 = f0
  1564. adds I = -1, I
  1565. }
  1566. { .mfb
  1567. STFD [C8] = f124, SIZE
  1568. mov f65 = f0
  1569. nop __LINE__
  1570. }
  1571. ;;
  1572. { .mfi
  1573. STFD [C4] = f121, SIZE
  1574. mov f80 = f0
  1575. and TEMP = 3, M
  1576. }
  1577. { .mfb
  1578. STFD [C8] = f125, SIZE
  1579. mov f81 = f0
  1580. nop __LINE__
  1581. }
  1582. ;;
  1583. { .mfi
  1584. STFD [C4] = f122, SIZE
  1585. mov f96 = f0
  1586. cmp.ne p8, p9 = r0, TEMP
  1587. }
  1588. { .mfb
  1589. STFD [C8] = f126, SIZE
  1590. mov f97 = f0
  1591. nop __LINE__
  1592. }
  1593. ;;
  1594. { .mfb
  1595. STFD [C4] = f123, 5 * SIZE
  1596. mov f112 = f0
  1597. nop __LINE__
  1598. }
  1599. { .mfb
  1600. STFD [C8] = f127, 5 * SIZE
  1601. mov f113 = f0
  1602. (p6) br.cond.dptk .L011
  1603. }
  1604. ;;
  1605. #else
  1606. { .mfb
  1607. nop __LINE__
  1608. FMPY f72 = ALPHA_R, f64
  1609. nop __LINE__
  1610. }
  1611. { .mfb
  1612. nop __LINE__
  1613. FMPY f76 = ALPHA_R, f68
  1614. nop __LINE__
  1615. }
  1616. ;;
  1617. { .mfb
  1618. nop __LINE__
  1619. FCALC_C f73 = ALPHA_R, f65, f0
  1620. nop __LINE__
  1621. }
  1622. { .mfb
  1623. nop __LINE__
  1624. FCALC_C f77 = ALPHA_R, f69, f0
  1625. nop __LINE__
  1626. }
  1627. ;;
  1628. { .mfb
  1629. nop __LINE__
  1630. FMPY f74 = ALPHA_R, f66
  1631. nop __LINE__
  1632. }
  1633. { .mfb
  1634. nop __LINE__
  1635. FMPY f78 = ALPHA_R, f70
  1636. nop __LINE__
  1637. }
  1638. ;;
  1639. { .mfb
  1640. nop __LINE__
  1641. FCALC_C f75 = ALPHA_R, f67, f0
  1642. nop __LINE__
  1643. }
  1644. { .mfb
  1645. nop __LINE__
  1646. FCALC_C f79 = ALPHA_R, f71, f0
  1647. nop __LINE__
  1648. }
  1649. ;;
  1650. { .mfb
  1651. nop __LINE__
  1652. FCALC_D f72 = ALPHA_I, f65, f72
  1653. nop __LINE__
  1654. }
  1655. { .mfb
  1656. nop __LINE__
  1657. FCALC_D f76 = ALPHA_I, f69, f76
  1658. nop __LINE__
  1659. }
  1660. ;;
  1661. { .mfb
  1662. nop __LINE__
  1663. FMA f73 = ALPHA_I, f64, f73
  1664. nop __LINE__
  1665. }
  1666. { .mfb
  1667. nop __LINE__
  1668. FMA f77 = ALPHA_I, f68, f77
  1669. nop __LINE__
  1670. }
  1671. ;;
  1672. { .mfb
  1673. nop __LINE__
  1674. FCALC_D f74 = ALPHA_I, f67, f74
  1675. nop __LINE__
  1676. }
  1677. { .mfb
  1678. nop __LINE__
  1679. FCALC_D f78 = ALPHA_I, f71, f78
  1680. nop __LINE__
  1681. }
  1682. ;;
  1683. { .mfb
  1684. nop __LINE__
  1685. FMA f75 = ALPHA_I, f66, f75
  1686. nop __LINE__
  1687. }
  1688. { .mfb
  1689. nop __LINE__
  1690. FMA f79 = ALPHA_I, f70, f79
  1691. nop __LINE__
  1692. }
  1693. ;;
  1694. { .mfb
  1695. STFD [C1] = f72, SIZE
  1696. FMPY f88 = ALPHA_R, f80
  1697. nop __LINE__
  1698. }
  1699. { .mfb
  1700. STFD [C5] = f76, SIZE
  1701. FMPY f92 = ALPHA_R, f84
  1702. nop __LINE__
  1703. }
  1704. ;;
  1705. { .mfb
  1706. STFD [C1] = f73, SIZE
  1707. FCALC_C f89 = ALPHA_R, f81, f0
  1708. nop __LINE__
  1709. }
  1710. { .mfb
  1711. STFD [C5] = f77, SIZE
  1712. FCALC_C f93 = ALPHA_R, f85, f0
  1713. nop __LINE__
  1714. }
  1715. ;;
  1716. { .mfb
  1717. STFD [C1] = f74, SIZE
  1718. FMPY f90 = ALPHA_R, f82
  1719. nop __LINE__
  1720. }
  1721. { .mfb
  1722. STFD [C5] = f78, SIZE
  1723. FMPY f94 = ALPHA_R, f86
  1724. nop __LINE__
  1725. }
  1726. ;;
  1727. { .mfb
  1728. STFD [C1] = f75, 5 * SIZE
  1729. FCALC_C f91 = ALPHA_R, f83, f0
  1730. nop __LINE__
  1731. }
  1732. { .mfb
  1733. STFD [C5] = f79, 5 * SIZE
  1734. FCALC_C f95 = ALPHA_R, f87, f0
  1735. nop __LINE__
  1736. }
  1737. ;;
  1738. { .mfb
  1739. nop __LINE__
  1740. FCALC_D f88 = ALPHA_I, f81, f88
  1741. nop __LINE__
  1742. }
  1743. { .mfb
  1744. nop __LINE__
  1745. FCALC_D f92 = ALPHA_I, f85, f92
  1746. nop __LINE__
  1747. }
  1748. ;;
  1749. { .mfb
  1750. nop __LINE__
  1751. FMA f89 = ALPHA_I, f80, f89
  1752. nop __LINE__
  1753. }
  1754. { .mfb
  1755. nop __LINE__
  1756. FMA f93 = ALPHA_I, f84, f93
  1757. nop __LINE__
  1758. }
  1759. ;;
  1760. { .mfb
  1761. nop __LINE__
  1762. FCALC_D f90 = ALPHA_I, f83, f90
  1763. nop __LINE__
  1764. }
  1765. { .mfb
  1766. nop __LINE__
  1767. FCALC_D f94 = ALPHA_I, f87, f94
  1768. nop __LINE__
  1769. }
  1770. ;;
  1771. { .mfb
  1772. nop __LINE__
  1773. FMA f91 = ALPHA_I, f82, f91
  1774. nop __LINE__
  1775. }
  1776. { .mfb
  1777. nop __LINE__
  1778. FMA f95 = ALPHA_I, f86, f95
  1779. nop __LINE__
  1780. }
  1781. ;;
  1782. { .mfb
  1783. STFD [C2] = f88, SIZE
  1784. FMPY f104 = ALPHA_R, f96
  1785. nop __LINE__
  1786. }
  1787. { .mfb
  1788. STFD [C6] = f92, SIZE
  1789. FMPY f108 = ALPHA_R, f100
  1790. nop __LINE__
  1791. }
  1792. ;;
  1793. { .mfb
  1794. STFD [C2] = f89, SIZE
  1795. FCALC_C f105 = ALPHA_R, f97, f0
  1796. nop __LINE__
  1797. }
  1798. { .mfb
  1799. STFD [C6] = f93, SIZE
  1800. FCALC_C f109 = ALPHA_R, f101, f0
  1801. nop __LINE__
  1802. }
  1803. ;;
  1804. { .mfb
  1805. STFD [C2] = f90, SIZE
  1806. FMPY f106 = ALPHA_R, f98
  1807. nop __LINE__
  1808. }
  1809. { .mfb
  1810. STFD [C6] = f94, SIZE
  1811. FMPY f110 = ALPHA_R, f102
  1812. nop __LINE__
  1813. }
  1814. ;;
  1815. { .mfb
  1816. STFD [C2] = f91, 5 * SIZE
  1817. FCALC_C f107 = ALPHA_R, f99, f0
  1818. nop __LINE__
  1819. }
  1820. { .mfb
  1821. STFD [C6] = f95, 5 * SIZE
  1822. FCALC_C f111 = ALPHA_R, f103, f0
  1823. nop __LINE__
  1824. }
  1825. ;;
  1826. { .mfb
  1827. nop __LINE__
  1828. FCALC_D f104 = ALPHA_I, f97, f104
  1829. nop __LINE__
  1830. }
  1831. { .mfb
  1832. nop __LINE__
  1833. FCALC_D f108 = ALPHA_I, f101, f108
  1834. nop __LINE__
  1835. }
  1836. ;;
  1837. { .mfb
  1838. nop __LINE__
  1839. FMA f105 = ALPHA_I, f96, f105
  1840. nop __LINE__
  1841. }
  1842. { .mfb
  1843. nop __LINE__
  1844. FMA f109 = ALPHA_I, f100, f109
  1845. nop __LINE__
  1846. }
  1847. ;;
  1848. { .mfb
  1849. nop __LINE__
  1850. FCALC_D f106 = ALPHA_I, f99, f106
  1851. nop __LINE__
  1852. }
  1853. { .mfb
  1854. nop __LINE__
  1855. FCALC_D f110 = ALPHA_I, f103, f110
  1856. nop __LINE__
  1857. }
  1858. ;;
  1859. { .mfb
  1860. nop __LINE__
  1861. FMA f107 = ALPHA_I, f98, f107
  1862. nop __LINE__
  1863. }
  1864. { .mfb
  1865. nop __LINE__
  1866. FMA f111 = ALPHA_I, f102, f111
  1867. nop __LINE__
  1868. }
  1869. ;;
  1870. { .mfb
  1871. STFD [C3] = f104, SIZE
  1872. FMPY f120 = ALPHA_R, f112
  1873. nop __LINE__
  1874. }
  1875. { .mfb
  1876. STFD [C7] = f108, SIZE
  1877. FMPY f124 = ALPHA_R, f116
  1878. nop __LINE__
  1879. }
  1880. ;;
  1881. { .mfb
  1882. STFD [C3] = f105, SIZE
  1883. FCALC_C f121 = ALPHA_R, f113, f0
  1884. nop __LINE__
  1885. }
  1886. { .mfb
  1887. STFD [C7] = f109, SIZE
  1888. FCALC_C f125 = ALPHA_R, f117, f0
  1889. nop __LINE__
  1890. }
  1891. ;;
  1892. { .mfb
  1893. STFD [C3] = f106, SIZE
  1894. FMPY f122 = ALPHA_R, f114
  1895. nop __LINE__
  1896. }
  1897. { .mfb
  1898. STFD [C7] = f110, SIZE
  1899. FMPY f126 = ALPHA_R, f118
  1900. nop __LINE__
  1901. }
  1902. ;;
  1903. { .mfb
  1904. STFD [C3] = f107, 5 * SIZE
  1905. FCALC_C f123 = ALPHA_R, f115, f0
  1906. nop __LINE__
  1907. }
  1908. { .mfb
  1909. STFD [C7] = f111, 5 * SIZE
  1910. FCALC_C f127 = ALPHA_R, f119, f0
  1911. nop __LINE__
  1912. }
  1913. ;;
  1914. { .mfb
  1915. nop __LINE__
  1916. FCALC_D f120 = ALPHA_I, f113, f120
  1917. nop __LINE__
  1918. }
  1919. { .mfb
  1920. nop __LINE__
  1921. FCALC_D f124 = ALPHA_I, f117, f124
  1922. nop __LINE__
  1923. }
  1924. ;;
  1925. { .mfi
  1926. nop __LINE__
  1927. FMA f121 = ALPHA_I, f112, f121
  1928. #if defined(TRMMKERNEL) && \
  1929. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  1930. sub L = K, KK
  1931. #else
  1932. nop __LINE__
  1933. #endif
  1934. }
  1935. { .mfb
  1936. nop __LINE__
  1937. FMA f125 = ALPHA_I, f116, f125
  1938. nop __LINE__
  1939. }
  1940. ;;
  1941. { .mfi
  1942. nop __LINE__
  1943. FCALC_D f122 = ALPHA_I, f115, f122
  1944. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  1945. adds L = -4, L
  1946. #else
  1947. nop __LINE__
  1948. #endif
  1949. }
  1950. { .mfi
  1951. nop __LINE__
  1952. FCALC_D f126 = ALPHA_I, f119, f126
  1953. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  1954. adds L = -4, L
  1955. #else
  1956. nop __LINE__
  1957. #endif
  1958. }
  1959. ;;
  1960. { .mfi
  1961. nop __LINE__
  1962. FMA f123 = ALPHA_I, f114, f123
  1963. cmp.ne p6, p0 = 1, I
  1964. }
  1965. { .mfi
  1966. nop __LINE__
  1967. FMA f127 = ALPHA_I, f118, f127
  1968. adds I = -1, I
  1969. }
  1970. ;;
  1971. { .mfi
  1972. STFD [C4] = f120, SIZE
  1973. mov f64 = f0
  1974. #if defined(TRMMKERNEL) && \
  1975. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  1976. shladd KK8 = L, ZBASE_SHIFT, r0
  1977. #else
  1978. nop __LINE__
  1979. #endif
  1980. }
  1981. { .mfi
  1982. STFD [C8] = f124, SIZE
  1983. mov f65 = f0
  1984. and TEMP = 3, M
  1985. }
  1986. ;;
  1987. { .mfi
  1988. STFD [C4] = f121, SIZE
  1989. mov f80 = f0
  1990. #if defined(TRMMKERNEL) && \
  1991. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  1992. shladd AOFFSET = KK8, 2, AOFFSET
  1993. #else
  1994. nop __LINE__
  1995. #endif
  1996. }
  1997. { .mfi
  1998. STFD [C8] = f125, SIZE
  1999. mov f81 = f0
  2000. #if defined(TRMMKERNEL) && \
  2001. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2002. shladd BOFFSET = KK8, 2, BOFFSET
  2003. #else
  2004. nop __LINE__
  2005. #endif
  2006. }
  2007. ;;
  2008. { .mfi
  2009. STFD [C4] = f122, SIZE
  2010. mov f96 = f0
  2011. #if defined(TRMMKERNEL) && defined(LEFT)
  2012. adds KK = 4, KK
  2013. #else
  2014. nop __LINE__
  2015. #endif
  2016. }
  2017. { .mfi
  2018. STFD [C8] = f126, SIZE
  2019. mov f97 = f0
  2020. cmp.ne p8, p9 = r0, TEMP
  2021. }
  2022. ;;
  2023. { .mfi
  2024. STFD [C4] = f123, 5 * SIZE
  2025. mov f112 = f0
  2026. #ifdef TRMMKERNEL
  2027. shladd KK8 = KK, ZBASE_SHIFT, r0
  2028. #else
  2029. nop __LINE__
  2030. #endif
  2031. }
  2032. { .mfb
  2033. STFD [C8] = f127, 5 * SIZE
  2034. mov f113 = f0
  2035. (p6) br.cond.dptk .L011
  2036. }
  2037. ;;
  2038. #endif
  2039. .L020:
  // .L020: set-up for the tile selected by bit 1 of M.  When that bit is
  // clear the whole section is skipped (branch to .L030).  Otherwise this
  // code derives the loop trip count in L, positions BOFFSET (and, in one
  // TRMM configuration, AOFFSET), preloads the first A/B operands into
  // f32..f35 / f48..f55 and clears accumulators f66/f67, f82/f83, f98/f99
  // and f114/f115.  The remaining accumulators used by .L022 (f64/f65,
  // f80/f81, f96/f97, f112/f113) are not initialised here and must already
  // be zero on entry.
  // NOTE(review): presumably this is the two-row (M & 2) remainder of a
  // four-column panel -- confirm against the surrounding loop structure.
  2040. { .mib
  2041. #ifndef TRMMKERNEL
  2042. nop __LINE__
  2043. #else
  2044. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2045. sub L = K, KK // TRMM: L = K - KK
  2046. #elif defined(LEFT)
  2047. adds L = 2, KK // TRMM, LEFT: L = KK + 2
  2048. #else
  2049. adds L = 4, KK // TRMM, otherwise: L = KK + 4
  2050. #endif
  2051. #endif
  2052. tbit.z p6, p7 = M, 1 // p6 = (bit 1 of M is clear), p7 = complement
  2053. (p6) br.cond.dptk .L030 // bit clear: nothing to do in this section
  2054. }
  2055. ;;
  2056. #if !defined(TRMMKERNEL) || \
  2057. defined(TRMMKERNEL) && \
  2058. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Plain GEMM, or the TRMM configurations that start at the top of B.
  2059. { .mfb
  2060. LDFPD f48, f49 = [B] // first B pair, read from the start of B
  2061. mov f66 = f0 // clear accumulator (f0 reads as 0.0)
  2062. nop __LINE__
  2063. }
  2064. { .mfi
  2065. adds BOFFSET = 2 * SIZE, B // BOFFSET = just past the pair loaded above
  2066. mov f67 = f0
  2067. #ifndef TRMMKERNEL
  2068. adds L = 1, K // L = K + 1, so the halving below rounds up
  2069. #else
  2070. adds L = 1, L // same rounding, applied to the TRMM count
  2071. #endif
  2072. }
  2073. ;;
  2074. #else
  // Remaining TRMM configurations: start KK8-scaled offsets into B and A.
  2075. { .mfi
  2076. shladd BOFFSET = KK8, 2, B // BOFFSET = B + (KK8 << 2)
  2077. mov f66 = f0
  2078. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 << 1
  2079. }
  2080. ;;
  2081. { .mfi
  2082. LDFPD f48, f49 = [BOFFSET], 2 * SIZE // first B pair; post-increment BOFFSET
  2083. mov f67 = f0
  2084. #ifndef TRMMKERNEL
  2085. adds L = 1, K // never assembled: this #else arm implies TRMMKERNEL
  2086. #else
  2087. adds L = 1, L
  2088. #endif
  2089. }
  2090. ;;
  2091. #endif
  2092. { .mfi
  2093. LDFPD f32, f33 = [AOFFSET], 2 * SIZE // first A pair
  2094. mov f82 = f0
  2095. tbit.z p12, p0 = L, 0 // p12 = (L is even), i.e. the count before the +1 was odd
  2096. }
  2097. { .mfi
  2098. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2099. mov f83 = f0
  2100. shr L = L, 1 // the loop consumes two k steps per pass
  2101. }
  2102. ;;
  2103. { .mfi
  2104. LDFPD f34, f35 = [AOFFSET], 2 * SIZE // second A pair
  2105. mov f98 = f0
  2106. adds L = -1, L // br.cloop executes ar.lc + 1 passes
  2107. }
  2108. { .mfi
  2109. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2110. mov f99 = f0
  2111. cmp.eq p3, p0 = r0, r0 // p3 = true: second half of each pass enabled by default
  2112. }
  2113. ;;
  2114. { .mfi
  2115. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2116. mov f114 = f0
  2117. mov ar.lc = L // hardware loop counter for br.cloop in .L022
  2118. }
  2119. { .mfi
  2120. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // A prefetch pointer, ahead of AOFFSET
  2121. mov f115 = f0
  2122. nop __LINE__
  2123. }
  2124. ;;
  2125. .align 16
  2126. .L022:
  // .L022: multiply-accumulate loop; each pass covers two k steps.
  //   first half : A = f32..f35, B = f48..f55  (always executed)
  //   second half: A = f40..f43, B = f56..f63  (predicated on p3)
  // A1/A2 products accumulate into f64/f65, f80/f81, f96/f97, f112/f113;
  // A3/A4 products accumulate into f66/f67, f82/f83, f98/f99, f114/f115.
  // Predicates:
  //   p12  set by the set-up code when the k count is odd
  //   p3   second half enabled; cleared on the final pass when p12 is set
  //   p4   L != 0: more passes follow, reload f32..f35 / f48..f55
  //   p5   L == 0: final pass, start loading the C values (only assembled
  //        when neither TRMMKERNEL nor BETAZERO is defined)
  // L is a software copy of the remaining pass count, decremented at the
  // bottom of the loop alongside ar.lc.
  // FMA_A / FMA_B are macros defined outside this view -- presumably they
  // select the add or subtract form for the conjugation variant; confirm
  // at their definition.
  // NOTE(review): a few .mfb bundles below list only two instructions;
  // this appears to rely on the assembler padding the unused M slot.
  2127. { .mfi
  2128. lfetch.nt1 [PREA], 8 * SIZE // prefetch ahead in A
  2129. FMA f64 = f32, f48, f64 // A1 * B1
  2130. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET // PREB is rebuilt from BOFFSET every pass
  2131. }
  2132. { .mfi
  2133. nop __LINE__
  2134. FMA_B f65 = f32, f49, f65 // A1 * B2
  2135. (p12) cmp.ne p3, p0 = 0, L // odd count: p3 = (L != 0), false on the final pass
  2136. }
  2137. ;;
  2138. { .mfi
  2139. lfetch.nt1 [PREB], 16 * SIZE // prefetch ahead in B
  2140. FMA f80 = f32, f50, f80 // A1 * B3
  2141. cmp.ne p4, p5 = 0, L // p4 = more passes remain, p5 = final pass
  2142. }
  2143. { .mfb
  2144. nop __LINE__
  2145. FMA_B f81 = f32, f51, f81 // A1 * B4
  2146. nop __LINE__
  2147. }
  2148. ;;
  2149. { .mfb
  2150. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // second-half A pair
  2151. FMA f96 = f32, f52, f96 // A1 * B5
  2152. nop __LINE__
  2153. }
  2154. { .mfb
  2155. nop __LINE__
  2156. FMA_B f97 = f32, f53, f97 // A1 * B6
  2157. nop __LINE__
  2158. }
  2159. ;;
  2160. { .mfb
  2161. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE // second-half B operands
  2162. FMA f112 = f32, f54, f112 // A1 * B7
  2163. nop __LINE__
  2164. }
  2165. { .mfb
  2166. nop __LINE__
  2167. FMA_B f113 = f32, f55, f113 // A1 * B8
  2168. nop __LINE__
  2169. }
  2170. ;;
  2171. { .mfb
  2172. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2173. FMA f65 = f33, f48, f65 // A2 * B1
  2174. nop __LINE__
  2175. }
  2176. { .mfb
  2177. nop __LINE__
  2178. FMA_A f64 = f33, f49, f64 // A2 * B2
  2179. nop __LINE__
  2180. }
  2181. ;;
  2182. { .mfb
  2183. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2184. FMA f81 = f33, f50, f81 // A2 * B3
  2185. nop __LINE__
  2186. }
  2187. { .mfb
  2188. nop __LINE__
  2189. FMA_A f80 = f33, f51, f80 // A2 * B4
  2190. nop __LINE__
  2191. }
  2192. ;;
  2193. { .mfb
  2194. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2195. FMA f97 = f33, f52, f97 // A2 * B5
  2196. nop __LINE__
  2197. }
  2198. { .mfb
  2199. nop __LINE__
  2200. FMA_A f96 = f33, f53, f96 // A2 * B6
  2201. nop __LINE__
  2202. }
  2203. ;;
  2204. { .mfb
  2205. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE // second-half A pair
  2206. FMA f113 = f33, f54, f113 // A2 * B7
  2207. nop __LINE__
  2208. }
  2209. { .mfb
  2210. nop __LINE__
  2211. FMA_A f112 = f33, f55, f112 // A2 * B8
  2212. nop __LINE__
  2213. }
  2214. ;;
  2215. { .mfb
  2216. nop __LINE__
  2217. FMA f66 = f34, f48, f66 // A3 * B1
  2218. nop __LINE__
  2219. }
  2220. { .mfb
  2221. nop __LINE__
  2222. FMA_B f67 = f34, f49, f67 // A3 * B2
  2223. nop __LINE__
  2224. }
  2225. ;;
  2226. { .mfb
  2227. nop __LINE__
  2228. FMA f82 = f34, f50, f82 // A3 * B3
  2229. nop __LINE__
  2230. }
  2231. { .mfb
  2232. nop __LINE__
  2233. FMA_B f83 = f34, f51, f83 // A3 * B4
  2234. nop __LINE__
  2235. }
  2236. ;;
  2237. { .mfb
  2238. nop __LINE__
  2239. FMA f98 = f34, f52, f98 // A3 * B5
  2240. nop __LINE__
  2241. }
  2242. { .mfb
  2243. nop __LINE__
  2244. FMA_B f99 = f34, f53, f99 // A3 * B6
  2245. nop __LINE__
  2246. }
  2247. ;;
  2248. { .mfb
  2249. nop __LINE__
  2250. FMA f114 = f34, f54, f114 // A3 * B7
  2251. nop __LINE__
  2252. }
  2253. { .mfb
  2254. nop __LINE__
  2255. FMA_B f115 = f34, f55, f115 // A3 * B8
  2256. nop __LINE__
  2257. }
  2258. ;;
  2259. { .mfb
  2260. nop __LINE__
  2261. FMA f67 = f35, f48, f67 // A4 * B1
  2262. nop __LINE__
  2263. }
  2264. { .mfb
  2265. nop __LINE__
  2266. FMA_A f66 = f35, f49, f66 // A4 * B2
  2267. nop __LINE__
  2268. }
  2269. ;;
  2270. { .mfb
  2271. nop __LINE__
  2272. FMA f83 = f35, f50, f83 // A4 * B3
  2273. nop __LINE__
  2274. }
  2275. { .mfb
  2276. nop __LINE__
  2277. FMA_A f82 = f35, f51, f82 // A4 * B4
  2278. nop __LINE__
  2279. }
  2280. ;;
  // From here on the first-half operand registers are reloaded (under p4)
  // for the next pass, each one after its last use in this pass.
  2281. { .mfb
  2282. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // next pass: first A pair
  2283. FMA f99 = f35, f52, f99 // A4 * B5
  2284. nop __LINE__
  2285. }
  2286. { .mfb
  2287. nop __LINE__
  2288. FMA_A f98 = f35, f53, f98 // A4 * B6
  2289. nop __LINE__
  2290. }
  2291. ;;
  2292. { .mfb
  2293. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // next pass: B operands
  2294. FMA f115 = f35, f54, f115 // A4 * B7
  2295. nop __LINE__
  2296. }
  2297. { .mfb
  2298. nop __LINE__
  2299. FMA_A f114 = f35, f55, f114 // A4 * B8
  2300. nop __LINE__
  2301. }
  2302. ;;
  // Second half of the pass: same product pattern with f40..f43 / f56..f63,
  // every FMA predicated on p3.
  2303. { .mfb
  2304. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2305. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2306. nop __LINE__
  2307. }
  2308. { .mfb
  2309. nop __LINE__
  2310. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  2311. nop __LINE__
  2312. }
  2313. ;;
  2314. { .mfb
  2315. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2316. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2317. nop __LINE__
  2318. }
  2319. { .mfb
  2320. nop __LINE__
  2321. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  2322. nop __LINE__
  2323. }
  2324. ;;
  // Final pass only (p5): read four values each from C1..C4 into
  // f72..f75, f88..f91, f104..f107 and f120..f123.
  2325. { .mfb
  2326. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2327. (p5) LDFD f72 = [C1], SIZE
  2328. #else
  2329. nop __LINE__
  2330. #endif
  2331. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2332. nop __LINE__
  2333. }
  2334. { .mfb
  2335. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2336. (p5) LDFD f88 = [C2], SIZE
  2337. #else
  2338. nop __LINE__
  2339. #endif
  2340. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  2341. nop __LINE__
  2342. }
  2343. ;;
  2344. { .mfb
  2345. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2346. (p5) LDFD f73 = [C1], SIZE
  2347. #else
  2348. nop __LINE__
  2349. #endif
  2350. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2351. nop __LINE__
  2352. }
  2353. { .mfb
  2354. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2355. (p5) LDFD f89 = [C2], SIZE
  2356. #else
  2357. nop __LINE__
  2358. #endif
  2359. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  2360. nop __LINE__
  2361. }
  2362. ;;
  2363. { .mfb
  2364. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2365. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2366. nop __LINE__
  2367. }
  2368. { .mfb
  2369. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  2370. nop __LINE__
  2371. }
  2372. { .mfb
  2373. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE // next pass: second A pair
  2374. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2375. nop __LINE__
  2376. }
  2377. { .mfb
  2378. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  2379. nop __LINE__
  2380. }
  2381. ;;
  2382. { .mfb
  2383. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2384. (p5) LDFD f74 = [C1], SIZE
  2385. #else
  2386. nop __LINE__
  2387. #endif
  2388. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2389. nop __LINE__
  2390. }
  2391. { .mfb
  2392. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2393. (p5) LDFD f90 = [C2], SIZE
  2394. #else
  2395. nop __LINE__
  2396. #endif
  2397. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  2398. nop __LINE__
  2399. }
  2400. ;;
  // The fourth load from each C pointer post-increments by -3 * SIZE,
  // which returns the pointer to the first value read.
  2401. { .mfb
  2402. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2403. (p5) LDFD f75 = [C1], -3 * SIZE
  2404. #else
  2405. nop __LINE__
  2406. #endif
  2407. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2408. nop __LINE__
  2409. }
  2410. { .mfb
  2411. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2412. (p5) LDFD f91 = [C2], -3 * SIZE
  2413. #else
  2414. nop __LINE__
  2415. #endif
  2416. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  2417. nop __LINE__
  2418. }
  2419. ;;
  2420. { .mfb
  2421. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2422. (p5) LDFD f104 = [C3], SIZE
  2423. #else
  2424. nop __LINE__
  2425. #endif
  2426. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  2427. nop __LINE__
  2428. }
  2429. { .mfb
  2430. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2431. (p5) LDFD f120 = [C4], SIZE
  2432. #else
  2433. nop __LINE__
  2434. #endif
  2435. (p3) FMA_B f67 = f42, f57, f67 // A3 * B2
  2436. nop __LINE__
  2437. }
  2438. ;;
  2439. { .mfb
  2440. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2441. (p5) LDFD f105 = [C3], SIZE
  2442. #else
  2443. nop __LINE__
  2444. #endif
  2445. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  2446. nop __LINE__
  2447. }
  2448. { .mfb
  2449. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2450. (p5) LDFD f121 = [C4], SIZE
  2451. #else
  2452. nop __LINE__
  2453. #endif
  2454. (p3) FMA_B f83 = f42, f59, f83 // A3 * B4
  2455. nop __LINE__
  2456. }
  2457. ;;
  2458. { .mfb
  2459. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2460. (p5) LDFD f106 = [C3], SIZE
  2461. #else
  2462. nop __LINE__
  2463. #endif
  2464. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  2465. nop __LINE__
  2466. }
  2467. { .mfb
  2468. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2469. (p5) LDFD f122 = [C4], SIZE
  2470. #else
  2471. nop __LINE__
  2472. #endif
  2473. (p3) FMA_B f99 = f42, f61, f99 // A3 * B6
  2474. nop __LINE__
  2475. }
  2476. ;;
  2477. { .mfb
  2478. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2479. (p5) LDFD f107 = [C3], -3 * SIZE
  2480. #else
  2481. nop __LINE__
  2482. #endif
  2483. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  2484. nop __LINE__
  2485. }
  2486. { .mfb
  2487. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2488. (p5) LDFD f123 = [C4], -3 * SIZE
  2489. #else
  2490. nop __LINE__
  2491. #endif
  2492. (p3) FMA_B f115 = f42, f63, f115 // A3 * B8
  2493. nop __LINE__
  2494. }
  2495. ;;
  2496. { .mfb
  2497. nop __LINE__
  2498. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  2499. nop __LINE__
  2500. }
  2501. { .mfb
  2502. nop __LINE__
  2503. (p3) FMA_A f66 = f43, f57, f66 // A4 * B2
  2504. nop __LINE__
  2505. }
  2506. ;;
  2507. { .mfb
  2508. nop __LINE__
  2509. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  2510. nop __LINE__
  2511. }
  2512. { .mfb
  2513. nop __LINE__
  2514. (p3) FMA_A f82 = f43, f59, f82 // A4 * B4
  2515. nop __LINE__
  2516. }
  2517. ;;
  2518. { .mfb
  2519. nop __LINE__
  2520. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  2521. nop __LINE__
  2522. }
  2523. { .mfb
  2524. nop __LINE__
  2525. (p3) FMA_A f98 = f43, f61, f98 // A4 * B6
  2526. nop __LINE__
  2527. }
  2528. ;;
  2529. { .mfi
  2530. nop __LINE__
  2531. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  2532. adds L = -1, L // keep the software pass counter in step with ar.lc
  2533. }
  2534. { .mfb
  2535. nop __LINE__
  2536. (p3) FMA_A f114 = f43, f63, f114 // A4 * B8
  2537. br.cloop.sptk.few .L022 // counted loop: branch while ar.lc != 0, decrementing it
  2538. }
  2539. ;;
  2540. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2541. { .mfb
  2542. nop __LINE__
  2543. FMA f72 = ALPHA_R, f64, f72
  2544. nop __LINE__
  2545. }
  2546. { .mfb
  2547. nop __LINE__
  2548. FMA f88 = ALPHA_R, f80, f88
  2549. nop __LINE__
  2550. }
  2551. ;;
  2552. { .mfb
  2553. nop __LINE__
  2554. FCALC_C f73 = ALPHA_R, f65, f73
  2555. nop __LINE__
  2556. }
  2557. { .mfb
  2558. nop __LINE__
  2559. FCALC_C f89 = ALPHA_R, f81, f89
  2560. nop __LINE__
  2561. }
  2562. ;;
  2563. { .mfb
  2564. nop __LINE__
  2565. FMA f74 = ALPHA_R, f66, f74
  2566. nop __LINE__
  2567. }
  2568. { .mfb
  2569. nop __LINE__
  2570. FMA f90 = ALPHA_R, f82, f90
  2571. nop __LINE__
  2572. }
  2573. ;;
  2574. { .mfb
  2575. nop __LINE__
  2576. FCALC_C f75 = ALPHA_R, f67, f75
  2577. nop __LINE__
  2578. }
  2579. { .mfb
  2580. nop __LINE__
  2581. FCALC_C f91 = ALPHA_R, f83, f91
  2582. nop __LINE__
  2583. }
  2584. ;;
  2585. { .mfb
  2586. nop __LINE__
  2587. FCALC_D f72 = ALPHA_I, f65, f72
  2588. nop __LINE__
  2589. }
  2590. { .mfb
  2591. nop __LINE__
  2592. FCALC_D f88 = ALPHA_I, f81, f88
  2593. nop __LINE__
  2594. }
  2595. { .mfb
  2596. nop __LINE__
  2597. FMA f73 = ALPHA_I, f64, f73
  2598. nop __LINE__
  2599. }
  2600. { .mfb
  2601. FMA f89 = ALPHA_I, f80, f89
  2602. nop __LINE__
  2603. }
  2604. { .mfb
  2605. nop __LINE__
  2606. FCALC_D f74 = ALPHA_I, f67, f74
  2607. nop __LINE__
  2608. }
  2609. { .mfb
  2610. FCALC_D f90 = ALPHA_I, f83, f90
  2611. nop __LINE__
  2612. }
  2613. { .mfb
  2614. nop __LINE__
  2615. FMA f75 = ALPHA_I, f66, f75
  2616. nop __LINE__
  2617. }
  2618. { .mfb
  2619. nop __LINE__
  2620. FMA f91 = ALPHA_I, f82, f91
  2621. nop __LINE__
  2622. }
  2623. ;;
  2624. { .mfb
  2625. STFD [C1] = f72, SIZE
  2626. FMA f104 = ALPHA_R, f96, f104
  2627. nop __LINE__
  2628. }
  2629. { .mfb
  2630. STFD [C2] = f88, SIZE
  2631. FMA f120 = ALPHA_R, f112, f120
  2632. nop __LINE__
  2633. }
  2634. ;;
  2635. { .mfb
  2636. STFD [C1] = f73, SIZE
  2637. FCALC_C f105 = ALPHA_R, f97, f105
  2638. nop __LINE__
  2639. }
  2640. { .mfb
  2641. STFD [C2] = f89, SIZE
  2642. FCALC_C f121 = ALPHA_R, f113, f121
  2643. nop __LINE__
  2644. }
  2645. ;;
  2646. { .mfb
  2647. STFD [C1] = f74, SIZE
  2648. FMA f106 = ALPHA_R, f98, f106
  2649. nop __LINE__
  2650. }
  2651. { .mfb
  2652. STFD [C2] = f90, SIZE
  2653. FMA f122 = ALPHA_R, f114, f122
  2654. nop __LINE__
  2655. }
  2656. ;;
  2657. { .mfb
  2658. STFD [C1] = f75, SIZE
  2659. FCALC_C f107 = ALPHA_R, f99, f107
  2660. nop __LINE__
  2661. }
  2662. { .mfb
  2663. STFD [C2] = f91, SIZE
  2664. FCALC_C f123 = ALPHA_R, f115, f123
  2665. nop __LINE__
  2666. }
  2667. ;;
  2668. { .mfb
  2669. nop __LINE__
  2670. FCALC_D f104 = ALPHA_I, f97, f104
  2671. nop __LINE__
  2672. }
  2673. { .mfb
  2674. nop __LINE__
  2675. FCALC_D f120 = ALPHA_I, f113, f120
  2676. nop __LINE__
  2677. }
  2678. ;;
  2679. { .mfb
  2680. nop __LINE__
  2681. FMA f105 = ALPHA_I, f96, f105
  2682. nop __LINE__
  2683. }
  2684. { .mfb
  2685. nop __LINE__
  2686. FMA f121 = ALPHA_I, f112, f121
  2687. nop __LINE__
  2688. }
  2689. ;;
  2690. { .mfb
  2691. nop __LINE__
  2692. FCALC_D f106 = ALPHA_I, f99, f106
  2693. nop __LINE__
  2694. }
  2695. { .mfb
  2696. nop __LINE__
  2697. FCALC_D f122 = ALPHA_I, f115, f122
  2698. nop __LINE__
  2699. }
  2700. ;;
  2701. { .mfb
  2702. nop __LINE__
  2703. FMA f107 = ALPHA_I, f98, f107
  2704. nop __LINE__
  2705. }
  2706. { .mfb
  2707. nop __LINE__
  2708. FMA f123 = ALPHA_I, f114, f123
  2709. nop __LINE__
  2710. }
  2711. ;;
  2712. { .mfb
  2713. STFD [C3] = f104, SIZE
  2714. mov f64 = f0
  2715. nop __LINE__
  2716. }
  2717. { .mfi
  2718. STFD [C4] = f120, SIZE
  2719. mov f65 = f0
  2720. }
  2721. ;;
  2722. { .mfb
  2723. STFD [C3] = f105, SIZE
  2724. mov f80 = f0
  2725. nop __LINE__
  2726. }
  2727. { .mfi
  2728. STFD [C4] = f121, SIZE
  2729. mov f81 = f0
  2730. }
  2731. ;;
  2732. { .mfb
  2733. STFD [C3] = f106, SIZE
  2734. mov f96 = f0
  2735. nop __LINE__
  2736. }
  2737. { .mfi
  2738. STFD [C4] = f122, SIZE
  2739. mov f97 = f0
  2740. }
  2741. ;;
  2742. { .mfi
  2743. STFD [C3] = f107, SIZE
  2744. mov f112 = f0
  2745. }
  2746. { .mfb
  2747. STFD [C4] = f123, SIZE
  2748. mov f113 = f0
  2749. nop __LINE__
  2750. }
  2751. ;;
  2752. #else
  2753. { .mfb
  2754. nop __LINE__
  2755. FMPY f72 = ALPHA_R, f64
  2756. nop __LINE__
  2757. }
  2758. { .mfb
  2759. nop __LINE__
  2760. FMPY f88 = ALPHA_R, f80
  2761. nop __LINE__
  2762. }
  2763. ;;
  2764. { .mfb
  2765. nop __LINE__
  2766. FCALC_C f73 = ALPHA_R, f65, f0
  2767. nop __LINE__
  2768. }
  2769. { .mfb
  2770. nop __LINE__
  2771. FCALC_C f89 = ALPHA_R, f81, f0
  2772. nop __LINE__
  2773. }
  2774. ;;
  2775. { .mfb
  2776. nop __LINE__
  2777. FMPY f74 = ALPHA_R, f66
  2778. nop __LINE__
  2779. }
  2780. { .mfb
  2781. nop __LINE__
  2782. FMPY f90 = ALPHA_R, f82
  2783. nop __LINE__
  2784. }
  2785. ;;
  2786. { .mfb
  2787. nop __LINE__
  2788. FCALC_C f75 = ALPHA_R, f67, f0
  2789. nop __LINE__
  2790. }
  2791. { .mfb
  2792. nop __LINE__
  2793. FCALC_C f91 = ALPHA_R, f83, f0
  2794. nop __LINE__
  2795. }
  2796. ;;
  2797. { .mfb
  2798. nop __LINE__
  2799. FCALC_D f72 = ALPHA_I, f65, f72
  2800. nop __LINE__
  2801. }
  2802. { .mfb
  2803. nop __LINE__
  2804. FCALC_D f88 = ALPHA_I, f81, f88
  2805. nop __LINE__
  2806. }
  2807. ;;
  2808. { .mfb
  2809. nop __LINE__
  2810. FMA f73 = ALPHA_I, f64, f73
  2811. nop __LINE__
  2812. }
  2813. { .mfb
  2814. FMA f89 = ALPHA_I, f80, f89
  2815. nop __LINE__
  2816. }
  2817. ;;
  2818. { .mfb
  2819. nop __LINE__
  2820. FCALC_D f74 = ALPHA_I, f67, f74
  2821. nop __LINE__
  2822. }
  2823. { .mfb
  2824. FCALC_D f90 = ALPHA_I, f83, f90
  2825. nop __LINE__
  2826. }
  2827. ;;
  2828. { .mfb
  2829. nop __LINE__
  2830. FMA f75 = ALPHA_I, f66, f75
  2831. nop __LINE__
  2832. }
  2833. { .mfb
  2834. nop __LINE__
  2835. FMA f91 = ALPHA_I, f82, f91
  2836. nop __LINE__
  2837. }
  2838. ;;
  2839. { .mfb
  2840. STFD [C1] = f72, SIZE
  2841. FMPY f104 = ALPHA_R, f96
  2842. nop __LINE__
  2843. }
  2844. { .mfb
  2845. STFD [C2] = f88, SIZE
  2846. FMPY f120 = ALPHA_R, f112
  2847. nop __LINE__
  2848. }
  2849. ;;
  2850. { .mfb
  2851. STFD [C1] = f73, SIZE
  2852. FCALC_C f105 = ALPHA_R, f97, f0
  2853. nop __LINE__
  2854. }
  2855. { .mfb
  2856. STFD [C2] = f89, SIZE
  2857. FCALC_C f121 = ALPHA_R, f113, f0
  2858. nop __LINE__
  2859. }
  2860. ;;
  2861. { .mfb
  2862. STFD [C1] = f74, SIZE
  2863. FMPY f106 = ALPHA_R, f98
  2864. nop __LINE__
  2865. }
  2866. { .mfb
  2867. STFD [C2] = f90, SIZE
  2868. FMPY f122 = ALPHA_R, f114
  2869. nop __LINE__
  2870. }
  2871. ;;
  2872. { .mfb
  2873. STFD [C1] = f75, SIZE
  2874. FCALC_C f107 = ALPHA_R, f99, f0
  2875. nop __LINE__
  2876. }
  2877. { .mfb
  2878. STFD [C2] = f91, SIZE
  2879. FCALC_C f123 = ALPHA_R, f115, f0
  2880. nop __LINE__
  2881. }
  2882. ;;
  2883. { .mfb
  2884. nop __LINE__
  2885. FCALC_D f104 = ALPHA_I, f97, f104
  2886. nop __LINE__
  2887. }
  2888. { .mfb
  2889. nop __LINE__
  2890. FCALC_D f120 = ALPHA_I, f113, f120
  2891. nop __LINE__
  2892. }
  2893. ;;
  2894. { .mfb
  2895. nop __LINE__
  2896. FMA f105 = ALPHA_I, f96, f105
  2897. nop __LINE__
  2898. }
  2899. { .mfb
  2900. nop __LINE__
  2901. FMA f121 = ALPHA_I, f112, f121
  2902. nop __LINE__
  2903. }
  2904. ;;
  2905. { .mfi
  2906. nop __LINE__
  2907. FCALC_D f106 = ALPHA_I, f99, f106
  2908. #if defined(TRMMKERNEL) && \
  2909. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2910. sub L = K, KK
  2911. #else
  2912. nop __LINE__
  2913. #endif
  2914. }
  2915. { .mfb
  2916. nop __LINE__
  2917. FCALC_D f122 = ALPHA_I, f115, f122
  2918. nop __LINE__
  2919. }
  2920. ;;
  2921. { .mfi
  2922. nop __LINE__
  2923. FMA f107 = ALPHA_I, f98, f107
  2924. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  2925. adds L = -2, L
  2926. #else
  2927. nop __LINE__
  2928. #endif
  2929. }
  2930. { .mfi
  2931. nop __LINE__
  2932. FMA f123 = ALPHA_I, f114, f123
  2933. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  2934. adds L = -4, L
  2935. #else
  2936. nop __LINE__
  2937. #endif
  2938. }
  2939. ;;
  2940. { .mfi
  2941. STFD [C3] = f104, SIZE
  2942. mov f64 = f0
  2943. #if defined(TRMMKERNEL) && \
  2944. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2945. shladd KK8 = L, ZBASE_SHIFT, r0
  2946. #else
  2947. nop __LINE__
  2948. #endif
  2949. }
  2950. { .mfi
  2951. STFD [C4] = f120, SIZE
  2952. mov f65 = f0
  2953. nop __LINE__
  2954. }
  2955. ;;
  2956. { .mfi
  2957. STFD [C3] = f105, SIZE
  2958. mov f80 = f0
  2959. #if defined(TRMMKERNEL) && \
  2960. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2961. shladd AOFFSET = KK8, 1, AOFFSET
  2962. #else
  2963. nop __LINE__
  2964. #endif
  2965. }
  2966. { .mfi
  2967. STFD [C4] = f121, SIZE
  2968. mov f81 = f0
  2969. #if defined(TRMMKERNEL) && \
  2970. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2971. shladd BOFFSET = KK8, 2, BOFFSET
  2972. #else
  2973. nop __LINE__
  2974. #endif
  2975. }
  2976. ;;
  2977. { .mfi
  2978. STFD [C3] = f106, SIZE
  2979. mov f96 = f0
  2980. #if defined(TRMMKERNEL) && defined(LEFT)
  2981. adds KK = 2, KK
  2982. #else
  2983. nop __LINE__
  2984. #endif
  2985. }
  2986. { .mfi
  2987. STFD [C4] = f122, SIZE
  2988. mov f97 = f0
  2989. nop __LINE__
  2990. }
  2991. ;;
  2992. { .mfi
  2993. STFD [C3] = f107, SIZE
  2994. mov f112 = f0
  2995. #ifdef TRMMKERNEL
  2996. shladd KK8 = KK, ZBASE_SHIFT, r0
  2997. #else
  2998. nop __LINE__
  2999. #endif
  3000. }
  3001. { .mfb
  3002. STFD [C4] = f123, SIZE
  3003. mov f113 = f0
  3004. nop __LINE__
  3005. }
  3006. ;;
  3007. #endif
  3008. .align 16
  /* .L030: prologue of the K loop at .L032.  Runs only when bit 0 of M is set; */
  /* otherwise it branches to .L049.  It positions AOFFSET/BOFFSET, preloads     */
  /* the first A and B operands, clears f72..f121 and programs ar.lc.            */
  3009. .L030:
  3010. { .mib
  3011. #ifndef TRMMKERNEL
  3012. nop __LINE__
  3013. #else
  3014. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3015. sub L = K, KK /* L = K - KK */
  3016. #elif defined(LEFT)
  3017. adds L = 1, KK /* L = KK + 1 */
  3018. #else
  3019. adds L = 4, KK /* L = KK + 4 */
  3020. #endif
  3021. #endif
  3022. tbit.z p6, p7 = M, 0 /* p6 = bit 0 of M is clear, p7 = bit 0 of M is set */
  3023. (p6) br.cond.dptk .L049 /* bit clear: skip this tile */
  3024. }
  3025. ;;
  3026. #if !defined(TRMMKERNEL) || \
  3027. defined(TRMMKERNEL) && \
  3028. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3029. { .mfb
  3030. LDFPD f48, f49 = [B] /* first B pair, read from B itself */
  3031. mov f72 = f0
  3032. nop __LINE__
  3033. }
  3034. { .mfi
  3035. adds BOFFSET = 2 * SIZE, B /* BOFFSET = B + 2 * SIZE, just past that pair */
  3036. mov f73 = f0
  3037. #ifndef TRMMKERNEL
  3038. adds L = 1, K /* L = K + 1 */
  3039. #else
  3040. adds L = 1, L /* L = L + 1 */
  3041. #endif
  3042. }
  3043. ;;
  3044. #else
  /* This arm is only assembled with TRMMKERNEL defined (see the #if above). */
  3045. { .mfi
  3046. shladd BOFFSET = KK8, 2, B /* BOFFSET = B + 4 * KK8 */
  3047. mov f72 = f0
  3048. add AOFFSET = KK8, AOFFSET /* AOFFSET += KK8 */
  3049. }
  3050. ;;
  3051. { .mfi
  3052. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3053. mov f73 = f0
  3054. #ifndef TRMMKERNEL
  3055. adds L = 1, K /* never assembled: this arm implies TRMMKERNEL */
  3056. #else
  3057. adds L = 1, L /* L = L + 1 */
  3058. #endif
  3059. }
  3060. ;;
  3061. #endif
  3062. { .mmi
  3063. nop __LINE__
  3064. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET /* PREB is the lfetch pointer used in .L032 */
  3065. tbit.z p12, p0 = L, 0 /* p12 = bit 0 of L is clear (L already holds count + 1) */
  3066. }
  3067. ;;
  3068. { .mfi
  3069. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3070. mov f88 = f0
  3071. shr L = L, 1 /* L = L >> 1 */
  3072. }
  3073. { .mfi
  3074. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE /* p7 is true on this path (p6 branched away) */
  3075. mov f89 = f0
  3076. nop __LINE__
  3077. }
  3078. ;;
  3079. { .mfi
  3080. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3081. mov f104 = f0
  3082. adds L = -1, L /* L = L - 1; this value goes into ar.lc below */
  3083. }
  3084. { .mfb
  3085. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET /* PREA is the lfetch pointer used in .L032 */
  3086. mov f105 = f0
  3087. nop __LINE__
  3088. }
  3089. ;;
  3090. { .mfi
  3091. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3092. mov f120 = f0
  3093. mov ar.lc = L /* trip counter consumed by br.cloop in .L032 */
  3094. }
  3095. { .mfi
  3096. cmp.eq p3, p0 = r0, r0 /* p3 = true (r0 == r0) */
  3097. mov f121 = f0
  3098. nop __LINE__
  3099. }
  3100. ;;
  3101. .align 16
  /* .L032: K loop followed by the scale-and-store of its results.              */
  /* Each pass runs two multiply-accumulate halves into f64/f65, f80/f81,        */
  /* f96/f97 and f112/f113: the first half (f32/f33 with f48-f55) always runs,   */
  /* the second (f40/f41 with f56-f63) is predicated on p3.                      */
  /* L is a software copy of the trip counter: p4 (L != 0) gates the reloads     */
  /* for the next pass, p5 (L == 0) gates the loads of C on the final pass.      */
  /* FMA_A, FMA_B, FCALC_C and FCALC_D are macros defined outside this chunk.    */
  3102. .L032:
  3103. { .mfb
  3104. lfetch.nt1 [PREA], 4 * SIZE
  3105. FMA f64 = f32, f48, f64 // A1 * B1
  3106. nop __LINE__
  3107. }
  3108. { .mfi
  3109. nop __LINE__
  3110. FMA_B f65 = f32, f49, f65 // A1 * B2
  3111. (p12) cmp.ne p3, p0 = 0, L /* only if p12: p3 = (L != 0), so the second half is dropped on the final pass */
  3112. }
  3113. ;;
  3114. { .mfi
  3115. lfetch.nt1 [PREB], 16 * SIZE
  3116. FMA f80 = f32, f50, f80 // A1 * B3
  3117. cmp.ne p4, p5 = 0, L /* p4 = (L != 0), p5 = (L == 0) */
  3118. }
  3119. { .mfb
  3120. nop __LINE__
  3121. FMA_B f81 = f32, f51, f81 // A1 * B4
  3122. nop __LINE__
  3123. }
  3124. ;;
  3125. { .mfb
  3126. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3127. FMA f96 = f32, f52, f96 // A1 * B5
  3128. nop __LINE__
  3129. }
  3130. { .mfb
  3131. nop __LINE__
  3132. FMA_B f97 = f32, f53, f97 // A1 * B6
  3133. nop __LINE__
  3134. }
  3135. ;;
  3136. { .mfb
  3137. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  3138. FMA f112 = f32, f54, f112 // A1 * B7
  3139. nop __LINE__
  3140. }
  3141. { .mfb
  3142. nop __LINE__
  3143. FMA_B f113 = f32, f55, f113 // A1 * B8
  3144. nop __LINE__
  3145. }
  3146. ;;
  3147. { .mfb
  3148. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3149. FMA f65 = f33, f48, f65 // A2 * B1
  3150. nop __LINE__
  3151. }
  3152. { .mfb
  3153. nop __LINE__
  3154. FMA_A f64 = f33, f49, f64 // A2 * B2
  3155. nop __LINE__
  3156. }
  3157. ;;
  3158. { .mfb
  3159. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3160. FMA f81 = f33, f50, f81 // A2 * B3
  3161. nop __LINE__
  3162. }
  3163. { .mfb
  3164. nop __LINE__
  3165. FMA_A f80 = f33, f51, f80 // A2 * B4
  3166. nop __LINE__
  3167. }
  3168. ;;
  3169. { .mfb
  3170. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3171. FMA f97 = f33, f52, f97 // A2 * B5
  3172. nop __LINE__
  3173. }
  3174. { .mfb
  3175. nop __LINE__
  3176. FMA_A f96 = f33, f53, f96 // A2 * B6
  3177. nop __LINE__
  3178. }
  3179. ;;
  3180. { .mfb
  3181. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3182. FMA f113 = f33, f54, f113 // A2 * B7
  3183. nop __LINE__
  3184. }
  3185. { .mfb
  3186. nop __LINE__
  3187. FMA_A f112 = f33, f55, f112 // A2 * B8
  3188. nop __LINE__
  3189. }
  3190. ;;
  3191. { .mfb
  3192. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3193. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3194. nop __LINE__
  3195. }
  3196. { .mfb
  3197. nop __LINE__
  3198. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  3199. nop __LINE__
  3200. }
  3201. ;;
  3202. { .mfb
  3203. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3204. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3205. nop __LINE__
  3206. }
  3207. { .mfb
  3208. nop __LINE__
  3209. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  3210. nop __LINE__
  3211. }
  3212. ;;
  3213. { .mfb
  3214. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3215. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3216. nop __LINE__
  3217. }
  3218. { .mfb
  3219. nop __LINE__
  3220. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  3221. nop __LINE__
  3222. }
  3223. ;;
  3224. { .mfb
  3225. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3226. (p5) LDFD f72 = [C1], SIZE
  3227. #else
  3228. nop __LINE__
  3229. #endif
  3230. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  3231. nop __LINE__
  3232. }
  3233. { .mfb
  3234. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3235. (p5) LDFD f88 = [C2], SIZE
  3236. #else
  3237. nop __LINE__
  3238. #endif
  3239. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  3240. nop __LINE__
  3241. }
  3242. ;;
  3243. { .mfb
  3244. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3245. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  3246. nop __LINE__
  3247. }
  3248. { .mfb
  3249. nop __LINE__
  3250. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  3251. nop __LINE__
  3252. }
  3253. ;;
  3254. { .mfb
  3255. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3256. (p5) LDFD f73 = [C1], - SIZE /* post-decrement undoes the earlier +SIZE, leaving C1 unchanged */
  3257. #else
  3258. nop __LINE__
  3259. #endif
  3260. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  3261. nop __LINE__
  3262. }
  3263. { .mfb
  3264. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3265. (p5) LDFD f89 = [C2], - SIZE
  3266. #else
  3267. nop __LINE__
  3268. #endif
  3269. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  3270. nop __LINE__
  3271. }
  3272. ;;
  3273. { .mfb
  3274. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3275. (p5) LDFD f104 = [C3], SIZE
  3276. #else
  3277. nop __LINE__
  3278. #endif
  3279. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  3280. nop __LINE__
  3281. }
  3282. { .mfb
  3283. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3284. (p5) LDFD f120 = [C4], SIZE
  3285. #else
  3286. nop __LINE__
  3287. #endif
  3288. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  3289. nop __LINE__
  3290. }
  3291. ;;
  3292. { .mfi
  3293. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3294. (p5) LDFD f105 = [C3], - SIZE
  3295. #else
  3296. nop __LINE__
  3297. #endif
  3298. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  3299. adds L = -1, L /* count down the software copy of the trip counter */
  3300. }
  3301. { .mfb
  3302. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3303. (p5) LDFD f121 = [C4], - SIZE
  3304. #else
  3305. nop __LINE__
  3306. #endif
  3307. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  3308. br.cloop.sptk.few .L032 /* repeat while ar.lc != 0 */
  3309. }
  3310. ;;
  /* Write-back, arm 1: the C values loaded above (f72/f73, f88/f89, f104/f105, */
  /* f120/f121) are updated with the accumulators scaled by ALPHA_R / ALPHA_I,   */
  /* then stored through C1..C4.                                                 */
  3311. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3312. { .mfb
  3313. nop __LINE__
  3314. FMA f72 = ALPHA_R, f64, f72
  3315. nop __LINE__
  3316. }
  3317. { .mfb
  3318. nop __LINE__
  3319. FMA f88 = ALPHA_R, f80, f88
  3320. nop __LINE__
  3321. }
  3322. ;;
  3323. { .mfb
  3324. nop __LINE__
  3325. FCALC_C f73 = ALPHA_R, f65, f73
  3326. nop __LINE__
  3327. }
  3328. { .mfb
  3329. nop __LINE__
  3330. FCALC_C f89 = ALPHA_R, f81, f89
  3331. nop __LINE__
  3332. }
  3333. ;;
  3334. { .mfb
  3335. nop __LINE__
  3336. FMA f104 = ALPHA_R, f96, f104
  3337. nop __LINE__
  3338. }
  3339. { .mfb
  3340. nop __LINE__
  3341. FMA f120 = ALPHA_R, f112, f120
  3342. nop __LINE__
  3343. }
  3344. { .mfb
  3345. nop __LINE__
  3346. FCALC_C f105 = ALPHA_R, f97, f105
  3347. nop __LINE__
  3348. }
  3349. { .mfb
  3350. nop __LINE__
  3351. FCALC_C f121 = ALPHA_R, f113, f121
  3352. nop __LINE__
  3353. }
  3354. ;;
  3355. { .mfb
  3356. nop __LINE__
  3357. FCALC_D f72 = ALPHA_I, f65, f72
  3358. nop __LINE__
  3359. }
  3360. { .mfb
  3361. nop __LINE__
  3362. FCALC_D f88 = ALPHA_I, f81, f88
  3363. nop __LINE__
  3364. }
  3365. ;;
  3366. { .mfb
  3367. nop __LINE__
  3368. FMA f73 = ALPHA_I, f64, f73
  3369. nop __LINE__
  3370. }
  3371. { .mfb
  3372. FMA f89 = ALPHA_I, f80, f89
  3373. nop __LINE__
  3374. }
  3375. ;;
  3376. { .mfb
  3377. nop __LINE__
  3378. FCALC_D f104 = ALPHA_I, f97, f104
  3379. nop __LINE__
  3380. }
  3381. { .mfb
  3382. nop __LINE__
  3383. FCALC_D f120 = ALPHA_I, f113, f120
  3384. nop __LINE__
  3385. }
  3386. ;;
  3387. { .mfb
  3388. nop __LINE__
  3389. FMA f105 = ALPHA_I, f96, f105
  3390. nop __LINE__
  3391. }
  3392. { .mfb
  3393. nop __LINE__
  3394. FMA f121 = ALPHA_I, f112, f121
  3395. nop __LINE__
  3396. }
  3397. ;;
  /* Store the results; the F slots clear the accumulators back to f0. */
  3398. { .mfb
  3399. STFD [C1] = f72, SIZE
  3400. mov f64 = f0
  3401. nop __LINE__
  3402. }
  3403. { .mfb
  3404. STFD [C2] = f88, SIZE
  3405. mov f65 = f0
  3406. nop __LINE__
  3407. }
  3408. ;;
  3409. { .mfb
  3410. STFD [C1] = f73, SIZE
  3411. mov f80 = f0
  3412. nop __LINE__
  3413. }
  3414. { .mfb
  3415. STFD [C2] = f89, SIZE
  3416. mov f81 = f0
  3417. nop __LINE__
  3418. }
  3419. ;;
  3420. { .mfb
  3421. STFD [C3] = f104, SIZE
  3422. mov f96 = f0
  3423. nop __LINE__
  3424. }
  3425. { .mfi
  3426. STFD [C4] = f120, SIZE
  3427. mov f97 = f0
  3428. nop __LINE__
  3429. }
  3430. ;;
  3431. { .mfb
  3432. STFD [C3] = f105, SIZE
  3433. mov f112 = f0
  3434. nop __LINE__
  3435. }
  3436. { .mfi
  3437. STFD [C4] = f121, SIZE
  3438. mov f113 = f0
  3439. nop __LINE__
  3440. }
  3441. ;;
  /* Write-back, arm 2 (TRMMKERNEL or BETAZERO): nothing was loaded from C, so  */
  /* the first products add f0.  The I slots carry the TRMM-only updates of L,   */
  /* KK8, AOFFSET, BOFFSET and KK.                                               */
  3442. #else
  3443. { .mfb
  3444. nop __LINE__
  3445. FMA f72 = ALPHA_R, f64, f0
  3446. nop __LINE__
  3447. }
  3448. { .mfb
  3449. nop __LINE__
  3450. FMA f88 = ALPHA_R, f80, f0
  3451. nop __LINE__
  3452. }
  3453. ;;
  3454. { .mfb
  3455. nop __LINE__
  3456. FCALC_C f73 = ALPHA_R, f65, f0
  3457. nop __LINE__
  3458. }
  3459. { .mfb
  3460. nop __LINE__
  3461. FCALC_C f89 = ALPHA_R, f81, f0
  3462. nop __LINE__
  3463. }
  3464. ;;
  3465. { .mfb
  3466. nop __LINE__
  3467. FMA f104 = ALPHA_R, f96, f0
  3468. nop __LINE__
  3469. }
  3470. { .mfb
  3471. nop __LINE__
  3472. FMA f120 = ALPHA_R, f112, f0
  3473. nop __LINE__
  3474. }
  3475. ;;
  3476. { .mfb
  3477. nop __LINE__
  3478. FCALC_C f105 = ALPHA_R, f97, f0
  3479. nop __LINE__
  3480. }
  3481. { .mfb
  3482. nop __LINE__
  3483. FCALC_C f121 = ALPHA_R, f113, f0
  3484. nop __LINE__
  3485. }
  3486. ;;
  3487. { .mfb
  3488. nop __LINE__
  3489. FCALC_D f72 = ALPHA_I, f65, f72
  3490. nop __LINE__
  3491. }
  3492. { .mfb
  3493. nop __LINE__
  3494. FCALC_D f88 = ALPHA_I, f81, f88
  3495. nop __LINE__
  3496. }
  3497. ;;
  3498. { .mfb
  3499. nop __LINE__
  3500. FMA f73 = ALPHA_I, f64, f73
  3501. nop __LINE__
  3502. }
  3503. { .mfb
  3504. FMA f89 = ALPHA_I, f80, f89
  3505. nop __LINE__
  3506. }
  3507. ;;
  3508. { .mfi
  3509. nop __LINE__
  3510. FCALC_D f104 = ALPHA_I, f97, f104
  3511. #if defined(TRMMKERNEL) && \
  3512. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3513. sub L = K, KK /* L = K - KK */
  3514. #else
  3515. nop __LINE__
  3516. #endif
  3517. }
  3518. { .mfb
  3519. nop __LINE__
  3520. FCALC_D f120 = ALPHA_I, f113, f120
  3521. nop __LINE__
  3522. }
  3523. ;;
  3524. { .mfi
  3525. nop __LINE__
  3526. FMA f105 = ALPHA_I, f96, f105
  3527. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  3528. adds L = -1, L /* L = L - 1 */
  3529. #else
  3530. nop __LINE__
  3531. #endif
  3532. }
  3533. { .mfi
  3534. nop __LINE__
  3535. FMA f121 = ALPHA_I, f112, f121
  3536. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  3537. adds L = -4, L /* L = L - 4 */
  3538. #else
  3539. nop __LINE__
  3540. #endif
  3541. }
  3542. ;;
  3543. { .mfi
  3544. STFD [C1] = f72, SIZE
  3545. mov f64 = f0
  3546. #if defined(TRMMKERNEL) && \
  3547. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3548. shladd KK8 = L, ZBASE_SHIFT, r0 /* KK8 = L << ZBASE_SHIFT */
  3549. #else
  3550. nop __LINE__
  3551. #endif
  3552. }
  3553. { .mfb
  3554. STFD [C2] = f88, SIZE
  3555. mov f65 = f0
  3556. nop __LINE__
  3557. }
  3558. ;;
  3559. { .mfi
  3560. STFD [C1] = f73, SIZE
  3561. mov f80 = f0
  3562. #if defined(TRMMKERNEL) && \
  3563. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3564. add AOFFSET = KK8, AOFFSET /* AOFFSET += KK8 */
  3565. #else
  3566. nop __LINE__
  3567. #endif
  3568. }
  3569. { .mfi
  3570. STFD [C2] = f89, SIZE
  3571. mov f81 = f0
  3572. #if defined(TRMMKERNEL) && \
  3573. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3574. shladd BOFFSET = KK8, 2, BOFFSET /* BOFFSET += 4 * KK8 */
  3575. #else
  3576. nop __LINE__
  3577. #endif
  3578. }
  3579. ;;
  3580. { .mfi
  3581. STFD [C3] = f104, SIZE
  3582. mov f96 = f0
  3583. #if defined(TRMMKERNEL) && defined(LEFT)
  3584. adds KK = 1, KK /* KK = KK + 1 */
  3585. #else
  3586. nop __LINE__
  3587. #endif
  3588. }
  3589. { .mfi
  3590. STFD [C4] = f120, SIZE
  3591. mov f97 = f0
  3592. nop __LINE__
  3593. }
  3594. ;;
  3595. { .mfi
  3596. STFD [C3] = f105, SIZE
  3597. mov f112 = f0
  3598. #ifdef TRMMKERNEL
  3599. shladd KK8 = KK, ZBASE_SHIFT, r0 /* KK8 = KK << ZBASE_SHIFT */
  3600. #else
  3601. nop __LINE__
  3602. #endif
  3603. }
  3604. { .mfi
  3605. STFD [C4] = f121, SIZE
  3606. mov f113 = f0
  3607. nop __LINE__
  3608. }
  3609. ;;
  3610. #endif
  3611. .align 16
  /* .L049: end of one pass over J.  B takes the current value of BOFFSET,      */
  /* AOFFSET is reset to A, and control returns to .L010 while J > 0.            */
  3612. .L049:
  3613. { .mmi
  3614. mov B = BOFFSET /* B = BOFFSET */
  3615. mov AOFFSET = A /* AOFFSET = A */
  3616. #if defined(TRMMKERNEL) && !defined(LEFT)
  3617. adds KK = 4, KK /* KK = KK + 4 */
  3618. #else
  3619. nop __LINE__
  3620. #endif
  3621. }
  3622. { .mmb
  3623. nop __LINE__
  3624. cmp.lt p6, p0 = 0, J /* p6 = (0 < J) */
  3625. (p6) br.cond.dptk .L010 /* back to .L010 (outside this chunk) while J > 0 */
  3626. }
  3627. ;;
  3628. .align 16
  /* .L050: entry of the section that runs only when bit 1 of N is set;         */
  /* otherwise it branches to .L090.  Sets C1/C2, advances C by 2 * LDC and      */
  /* branches to .L060 when I = M >> 2 is zero.                                  */
  3629. .L050:
  3630. { .mmi
  3631. #if defined(TRMMKERNEL) && defined(LEFT)
  3632. mov KK = OFFSET /* KK = OFFSET */
  3633. #else
  3634. nop __LINE__
  3635. #endif
  3636. shr I = M, 2 /* I = M >> 2 */
  3637. }
  3638. { .mib
  3639. mov C1 = C /* C1 = C */
  3640. tbit.z p6, p0 = N, 1 /* p6 = bit 1 of N is clear */
  3641. (p6) br.cond.dpnt .L090 /* bit clear: go to .L090 (outside this chunk) */
  3642. }
  3643. ;;
  3644. { .mmi
  3645. add C2 = LDC, C /* C2 = C + LDC */
  3646. #ifdef TRMMKERNEL
  3647. shladd KK8 = KK, ZBASE_SHIFT, r0 /* KK8 = KK << ZBASE_SHIFT */
  3648. #else
  3649. nop __LINE__
  3650. #endif
  3651. nop __LINE__
  3652. }
  3653. { .mib
  3654. cmp.eq p6, p7 = 0, I /* p6 = (I == 0), p7 = (I != 0) */
  3655. shladd C = LDC, 1, C /* C = C + 2 * LDC */
  3656. (p6) br.cond.dpnt .L060 /* I == 0: go to .L060 (outside this chunk) */
  3657. }
  3658. ;;
  3659. .align 16
  /* .L052: prologue of the K loop at .L053.  Positions AOFFSET/BOFFSET,        */
  /* preloads A into f32-f39 and B into f48-f51, clears f66..f115, sets up the   */
  /* prefetch pointers and C5/C6, and programs ar.lc.                            */
  3660. .L052:
  3661. #if !defined(TRMMKERNEL) || \
  3662. defined(TRMMKERNEL) && \
  3663. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3664. { .mfi
  3665. LDFPD f48, f49 = [B] /* first B pair, read from B itself */
  3666. mov f66 = f0
  3667. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET /* PREA is the lfetch pointer used in .L053 */
  3668. }
  3669. { .mfb
  3670. adds BOFFSET = 2 * SIZE, B /* BOFFSET = B + 2 * SIZE, just past that pair */
  3671. mov f67 = f0
  3672. nop __LINE__
  3673. }
  3674. ;;
  3675. #else
  /* This arm is only assembled with TRMMKERNEL defined (see the #if above). */
  3676. { .mfi
  3677. shladd BOFFSET = KK8, 1, B /* BOFFSET = B + 2 * KK8 */
  3678. mov f66 = f0
  3679. shladd AOFFSET = KK8, 2, AOFFSET /* AOFFSET += 4 * KK8 */
  3680. }
  3681. ;;
  3682. { .mfi
  3683. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3684. mov f67 = f0
  3685. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET /* PREA is the lfetch pointer used in .L053 */
  3686. }
  3687. ;;
  3688. #endif
  3689. { .mfi
  3690. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3691. mov f82 = f0
  3692. adds PREC = CPREFETCHSIZE * SIZE, C1 /* PREC = C1 + CPREFETCHSIZE * SIZE */
  3693. }
  3694. { .mfi
  3695. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3696. mov f83 = f0
  3697. #ifndef TRMMKERNEL
  3698. nop __LINE__
  3699. #else
  3700. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3701. sub L = K, KK /* L = K - KK */
  3702. #elif defined(LEFT)
  3703. adds L = 4, KK /* L = KK + 4 */
  3704. #else
  3705. adds L = 2, KK /* L = KK + 2 */
  3706. #endif
  3707. #endif
  3708. }
  3709. ;;
  3710. { .mfi
  3711. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  3712. mov f98 = f0
  3713. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET /* PREB is the lfetch pointer used in .L053 */
  3714. }
  3715. { .mfi
  3716. cmp.eq p3, p0 = r0, r0 /* p3 = true (r0 == r0) */
  3717. mov f99 = f0
  3718. #ifndef TRMMKERNEL
  3719. adds L = 1, K /* L = K + 1 */
  3720. #else
  3721. adds L = 1, L /* L = L + 1 */
  3722. #endif
  3723. }
  3724. ;;
  3725. { .mfi
  3726. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  3727. mov f114 = f0
  3728. tbit.z p12, p0 = L, 0 /* p12 = bit 0 of L is clear (L already holds count + 1) */
  3729. }
  3730. { .mfi
  3731. CPREFETCH [PREC], LDC /* CPREFETCH is a macro defined outside this chunk */
  3732. mov f115 = f0
  3733. shr L = L, 1 /* L = L >> 1 */
  3734. }
  3735. ;;
  3736. { .mmi
  3737. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  3738. adds C5 = 4 * SIZE, C1 /* C5 = C1 + 4 * SIZE */
  3739. adds L = -1, L /* L = L - 1; this value goes into ar.lc below */
  3740. }
  3741. ;;
  3742. { .mmi
  3743. CPREFETCH [PREC], LDC
  3744. adds C6 = 4 * SIZE, C2 /* C6 = C2 + 4 * SIZE */
  3745. mov ar.lc = L /* trip counter consumed by br.cloop in .L053 */
  3746. }
  3747. ;;
  3748. .align 16
  3749. .L053:
  3750. { .mfb
  3751. lfetch.nt1 [PREA], 16 * SIZE
  3752. FMA f64 = f32, f48, f64 // A1 * B1
  3753. nop __LINE__
  3754. }
  3755. { .mfi
  3756. nop __LINE__
  3757. FMA_B f65 = f32, f49, f65 // A1 * B2
  3758. (p12) cmp.ne p3, p0 = 0, L
  3759. }
  3760. ;;
  3761. { .mfi
  3762. lfetch.nt1 [PREB], 8 * SIZE
  3763. FMA f80 = f32, f50, f80 // A1 * B3
  3764. cmp.ne p4, p5 = 0, L
  3765. }
  3766. { .mfi
  3767. nop __LINE__
  3768. FMA_B f81 = f32, f51, f81 // A1 * B4
  3769. nop __LINE__
  3770. }
  3771. ;;
  3772. { .mfi
  3773. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  3774. FMA f96 = f34, f48, f96 // A3 * B1
  3775. nop __LINE__
  3776. }
  3777. { .mfi
  3778. FMA_B f97 = f34, f49, f97 // A3 * B2
  3779. nop __LINE__
  3780. }
  3781. ;;
  3782. { .mfi
  3783. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3784. FMA f112 = f34, f50, f112 // A3 * B3
  3785. nop __LINE__
  3786. }
  3787. { .mfb
  3788. nop __LINE__
  3789. FMA_B f113 = f34, f51, f113 // A3 * B4
  3790. nop __LINE__
  3791. }
  3792. ;;
  3793. { .mfb
  3794. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3795. FMA f65 = f33, f48, f65 // A2 * B1
  3796. nop __LINE__
  3797. }
  3798. { .mfb
  3799. nop __LINE__
  3800. FMA_A f64 = f33, f49, f64 // A2 * B2
  3801. nop __LINE__
  3802. }
  3803. ;;
  3804. { .mfb
  3805. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  3806. FMA f81 = f33, f50, f81 // A2 * B3
  3807. nop __LINE__
  3808. }
  3809. { .mfb
  3810. nop __LINE__
  3811. FMA_A f80 = f33, f51, f80 // A2 * B4
  3812. nop __LINE__
  3813. }
  3814. ;;
  3815. { .mfb
  3816. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  3817. FMA f97 = f35, f48, f97 // A4 * B1
  3818. nop __LINE__
  3819. }
  3820. { .mfb
  3821. nop __LINE__
  3822. FMA_A f96 = f35, f49, f96 // A4 * B2
  3823. nop __LINE__
  3824. }
  3825. ;;
  3826. { .mfb
  3827. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  3828. FMA f113 = f35, f50, f113 // A4 * B3
  3829. nop __LINE__
  3830. }
  3831. { .mfb
  3832. nop __LINE__
  3833. FMA_A f112 = f35, f51, f112 // A4 * B4
  3834. nop __LINE__
  3835. }
  3836. ;;
  3837. { .mfb
  3838. nop __LINE__
  3839. FMA f66 = f36, f48, f66 // A5 * B1
  3840. nop __LINE__
  3841. }
  3842. { .mfb
  3843. nop __LINE__
  3844. FMA_B f67 = f36, f49, f67 // A5 * B2
  3845. nop __LINE__
  3846. }
  3847. ;;
  3848. { .mfb
  3849. nop __LINE__
  3850. FMA f82 = f36, f50, f82 // A5 * B3
  3851. nop __LINE__
  3852. }
  3853. { .mfb
  3854. nop __LINE__
  3855. FMA_B f83 = f36, f51, f83 // A5 * B4
  3856. nop __LINE__
  3857. }
  3858. ;;
  3859. { .mfb
  3860. nop __LINE__
  3861. FMA f98 = f38, f48, f98 // A7 * B1
  3862. nop __LINE__
  3863. }
  3864. { .mfb
  3865. nop __LINE__
  3866. FMA_B f99 = f38, f49, f99 // A7 * B2
  3867. nop __LINE__
  3868. }
  3869. ;;
  3870. { .mfb
  3871. nop __LINE__
  3872. FMA f114 = f38, f50, f114 // A7 * B3
  3873. nop __LINE__
  3874. }
  3875. { .mfb
  3876. nop __LINE__
  3877. FMA_B f115 = f38, f51, f115 // A7 * B4
  3878. nop __LINE__
  3879. }
  3880. ;;
  3881. { .mfb
  3882. nop __LINE__
  3883. FMA f67 = f37, f48, f67 // A6 * B1
  3884. nop __LINE__
  3885. }
  3886. { .mfb
  3887. nop __LINE__
  3888. FMA_A f66 = f37, f49, f66 // A6 * B2
  3889. nop __LINE__
  3890. }
  3891. ;;
  3892. { .mfb
  3893. nop __LINE__
  3894. FMA f83 = f37, f50, f83 // A6 * B3
  3895. nop __LINE__
  3896. }
  3897. { .mfb
  3898. nop __LINE__
  3899. FMA_A f82 = f37, f51, f82 // A6 * B4
  3900. nop __LINE__
  3901. }
  3902. ;;
  3903. { .mfb
  3904. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3905. FMA f99 = f39, f48, f99 // A8 * B1
  3906. nop __LINE__
  3907. }
  3908. { .mfb
  3909. nop __LINE__
  3910. FMA_A f98 = f39, f49, f98 // A8 * B2
  3911. nop __LINE__
  3912. }
  3913. ;;
  3914. { .mfb
  3915. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3916. FMA f115 = f39, f50, f115 // A8 * B3
  3917. nop __LINE__
  3918. }
  3919. { .mfb
  3920. nop __LINE__
  3921. FMA_A f114 = f39, f51, f114 // A8 * B4
  3922. nop __LINE__
  3923. }
  3924. ;;
  3925. { .mfb
  3926. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3927. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3928. nop __LINE__
  3929. }
  3930. { .mfb
  3931. nop __LINE__
  3932. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  3933. nop __LINE__
  3934. }
  3935. ;;
  3936. { .mfb
  3937. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  3938. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3939. nop __LINE__
  3940. }
  3941. { .mfb
  3942. nop __LINE__
  3943. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  3944. nop __LINE__
  3945. }
  3946. ;;
  3947. { .mfb
  3948. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  3949. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  3950. nop __LINE__
  3951. }
  3952. { .mfb
  3953. nop __LINE__
  3954. (p3) FMA_B f97 = f42, f57, f97 // A3 * B2
  3955. nop __LINE__
  3956. }
  3957. ;;
  3958. { .mfb
  3959. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  3960. (p3) FMA f112 = f42, f58, f112 // A3 * B3
  3961. nop __LINE__
  3962. }
  3963. { .mfb
  3964. nop __LINE__
  3965. (p3) FMA_B f113 = f42, f59, f113 // A3 * B4
  3966. nop __LINE__
  3967. }
  3968. ;;
  3969. { .mfb
  3970. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3971. (p5) LDFD f72 = [C1 ], SIZE
  3972. #else
  3973. nop __LINE__
  3974. #endif
  3975. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  3976. nop __LINE__
  3977. }
  3978. { .mfb
  3979. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3980. (p5) LDFD f76 = [C5 ], SIZE
  3981. #else
  3982. nop __LINE__
  3983. #endif
  3984. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  3985. nop __LINE__
  3986. }
  3987. ;;
  3988. { .mfb
  3989. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3990. (p5) LDFD f73 = [C1 ], SIZE
  3991. #else
  3992. nop __LINE__
  3993. #endif
  3994. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  3995. nop __LINE__
  3996. }
  3997. { .mfb
  3998. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3999. (p5) LDFD f77 = [C5 ], SIZE
  4000. #else
  4001. nop __LINE__
  4002. #endif
  4003. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  4004. nop __LINE__
  4005. }
  4006. ;;
  4007. { .mfb
  4008. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4009. (p5) LDFD f74 = [C1 ], SIZE
  4010. #else
  4011. nop __LINE__
  4012. #endif
  4013. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  4014. nop __LINE__
  4015. }
  4016. { .mfb
  4017. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4018. (p5) LDFD f78 = [C5 ], SIZE
  4019. #else
  4020. nop __LINE__
  4021. #endif
  4022. (p3) FMA_A f96 = f43, f57, f96 // A4 * B2
  4023. nop __LINE__
  4024. }
  4025. ;;
  4026. { .mfb
  4027. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4028. (p5) LDFD f75 = [C1 ], -3 * SIZE
  4029. #else
  4030. nop __LINE__
  4031. #endif
  4032. (p3) FMA f113 = f43, f58, f113 // A4 * B3
  4033. nop __LINE__
  4034. }
  4035. { .mfb
  4036. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4037. (p5) LDFD f79 = [C5 ], -3 * SIZE
  4038. #else
  4039. nop __LINE__
  4040. #endif
  4041. (p3) FMA_A f112 = f43, f59, f112 // A4 * B4
  4042. nop __LINE__
  4043. }
  4044. ;;
  4045. { .mfb
  4046. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4047. (p5) LDFD f88 = [C2 ], SIZE
  4048. #else
  4049. nop __LINE__
  4050. #endif
  4051. (p3) FMA f66 = f44, f56, f66 // A5 * B1
  4052. nop __LINE__
  4053. }
  4054. { .mfb
  4055. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4056. (p5) LDFD f92 = [C6 ], SIZE
  4057. #else
  4058. nop __LINE__
  4059. #endif
  4060. (p3) FMA_B f67 = f44, f57, f67 // A5 * B2
  4061. nop __LINE__
  4062. }
  4063. ;;
  4064. { .mfb
  4065. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4066. (p5) LDFD f89 = [C2 ], SIZE
  4067. #else
  4068. nop __LINE__
  4069. #endif
  4070. (p3) FMA f82 = f44, f58, f82 // A5 * B3
  4071. nop __LINE__
  4072. }
  4073. { .mfb
  4074. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4075. (p5) LDFD f93 = [C6 ], SIZE
  4076. #else
  4077. nop __LINE__
  4078. #endif
  4079. (p3) FMA_B f83 = f44, f59, f83 // A5 * B4
  4080. nop __LINE__
  4081. }
  4082. ;;
  4083. { .mfb
  4084. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4085. (p5) LDFD f90 = [C2 ], SIZE
  4086. #else
  4087. nop __LINE__
  4088. #endif
  4089. (p3) FMA f98 = f46, f56, f98 // A7 * B1
  4090. nop __LINE__
  4091. }
  4092. { .mfb
  4093. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4094. (p5) LDFD f94 = [C6 ], SIZE
  4095. #else
  4096. nop __LINE__
  4097. #endif
  4098. (p3) FMA_B f99 = f46, f57, f99 // A7 * B2
  4099. nop __LINE__
  4100. }
  4101. ;;
  4102. { .mfb
  4103. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4104. (p5) LDFD f91 = [C2 ], -3 * SIZE
  4105. #else
  4106. nop __LINE__
  4107. #endif
  4108. (p3) FMA f114 = f46, f58, f114 // A7 * B3
  4109. nop __LINE__
  4110. }
  4111. { .mfb
  4112. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4113. (p5) LDFD f95 = [C6 ], -3 * SIZE
  4114. #else
  4115. nop __LINE__
  4116. #endif
  4117. (p3) FMA_B f115 = f46, f59, f115 // A7 * B4
  4118. nop __LINE__
  4119. }
  4120. ;;
  4121. { .mfb
  4122. nop __LINE__
  4123. (p3) FMA f67 = f45, f56, f67 // A6 * B1
  4124. nop __LINE__
  4125. }
  4126. { .mfb
  4127. nop __LINE__
  4128. (p3) FMA_A f66 = f45, f57, f66 // A6 * B2
  4129. nop __LINE__
  4130. }
  4131. ;;
  4132. { .mfb
  4133. nop __LINE__
  4134. (p3) FMA f83 = f45, f58, f83 // A6 * B3
  4135. nop __LINE__
  4136. }
  4137. { .mfb
  4138. nop __LINE__
  4139. (p3) FMA_A f82 = f45, f59, f82 // A6 * B4
  4140. nop __LINE__
  4141. }
  4142. ;;
  4143. { .mfb
  4144. nop __LINE__
  4145. (p3) FMA f99 = f47, f56, f99 // A8 * B1
  4146. nop __LINE__
  4147. }
  4148. { .mfb
  4149. nop __LINE__
  4150. (p3) FMA_A f98 = f47, f57, f98 // A8 * B2
  4151. nop __LINE__
  4152. }
  4153. ;;
  4154. { .mfi
  4155. nop __LINE__
  4156. (p3) FMA f115 = f47, f58, f115 // A8 * B3
  4157. adds L = -1, L
  4158. }
  4159. { .mfb
  4160. nop __LINE__
  4161. (p3) FMA_A f114 = f47, f59, f114 // A8 * B4
  4162. br.cloop.sptk.few .L053
  4163. }
  4164. ;;
  4165. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4166. { .mfb
  4167. nop __LINE__
  4168. FMA f72 = ALPHA_R, f64, f72
  4169. nop __LINE__
  4170. }
  4171. { .mfb
  4172. nop __LINE__
  4173. FMA f76 = ALPHA_R, f66, f76
  4174. nop __LINE__
  4175. }
  4176. ;;
  4177. { .mfb
  4178. nop __LINE__
  4179. FCALC_C f73 = ALPHA_R, f65, f73
  4180. nop __LINE__
  4181. }
  4182. { .mfb
  4183. nop __LINE__
  4184. FCALC_C f77 = ALPHA_R, f67, f77
  4185. nop __LINE__
  4186. }
  4187. ;;
  4188. { .mfb
  4189. nop __LINE__
  4190. FMA f74 = ALPHA_R, f96, f74
  4191. nop __LINE__
  4192. }
  4193. { .mfb
  4194. nop __LINE__
  4195. FMA f78 = ALPHA_R, f98, f78
  4196. nop __LINE__
  4197. }
  4198. ;;
  4199. { .mfb
  4200. nop __LINE__
  4201. FCALC_C f75 = ALPHA_R, f97, f75
  4202. nop __LINE__
  4203. }
  4204. { .mfb
  4205. nop __LINE__
  4206. FCALC_C f79 = ALPHA_R, f99, f79
  4207. nop __LINE__
  4208. }
  4209. ;;
  4210. { .mfb
  4211. nop __LINE__
  4212. FCALC_D f72 = ALPHA_I, f65, f72
  4213. nop __LINE__
  4214. }
  4215. { .mfb
  4216. nop __LINE__
  4217. FCALC_D f76 = ALPHA_I, f67, f76
  4218. nop __LINE__
  4219. }
  4220. { .mfb
  4221. nop __LINE__
  4222. FMA f73 = ALPHA_I, f64, f73
  4223. nop __LINE__
  4224. }
  4225. { .mfb
  4226. nop __LINE__
  4227. FMA f77 = ALPHA_I, f66, f77
  4228. nop __LINE__
  4229. }
  4230. { .mfb
  4231. nop __LINE__
  4232. FCALC_D f74 = ALPHA_I, f97, f74
  4233. nop __LINE__
  4234. }
  4235. { .mfb
  4236. nop __LINE__
  4237. FCALC_D f78 = ALPHA_I, f99, f78
  4238. nop __LINE__
  4239. }
  4240. { .mfb
  4241. nop __LINE__
  4242. FMA f75 = ALPHA_I, f96, f75
  4243. nop __LINE__
  4244. }
  4245. { .mfb
  4246. nop __LINE__
  4247. FMA f79 = ALPHA_I, f98, f79
  4248. nop __LINE__
  4249. }
  4250. ;;
  4251. { .mfb
  4252. STFD [C1] = f72, SIZE
  4253. FMA f88 = ALPHA_R, f80, f88
  4254. nop __LINE__
  4255. }
  4256. { .mfb
  4257. STFD [C5] = f76, SIZE
  4258. FMA f92 = ALPHA_R, f82, f92
  4259. nop __LINE__
  4260. }
  4261. ;;
  4262. { .mfb
  4263. STFD [C1] = f73, SIZE
  4264. FCALC_C f89 = ALPHA_R, f81, f89
  4265. nop __LINE__
  4266. }
  4267. { .mfb
  4268. STFD [C5] = f77, SIZE
  4269. FCALC_C f93 = ALPHA_R, f83, f93
  4270. nop __LINE__
  4271. }
  4272. ;;
  4273. { .mfb
  4274. STFD [C1] = f74, SIZE
  4275. FMA f90 = ALPHA_R, f112, f90
  4276. nop __LINE__
  4277. }
  4278. { .mfb
  4279. STFD [C5] = f78, SIZE
  4280. FMA f94 = ALPHA_R, f114, f94
  4281. nop __LINE__
  4282. }
  4283. ;;
  4284. { .mfb
  4285. STFD [C1] = f75, 5 * SIZE
  4286. FCALC_C f91 = ALPHA_R, f113, f91
  4287. nop __LINE__
  4288. }
  4289. { .mfb
  4290. STFD [C5] = f79, 5 * SIZE
  4291. FCALC_C f95 = ALPHA_R, f115, f95
  4292. nop __LINE__
  4293. }
  4294. ;;
  4295. { .mfb
  4296. nop __LINE__
  4297. FCALC_D f88 = ALPHA_I, f81, f88
  4298. nop __LINE__
  4299. }
  4300. { .mfb
  4301. nop __LINE__
  4302. FCALC_D f92 = ALPHA_I, f83, f92
  4303. nop __LINE__
  4304. }
  4305. ;;
  4306. { .mfb
  4307. nop __LINE__
  4308. FMA f89 = ALPHA_I, f80, f89
  4309. nop __LINE__
  4310. }
  4311. { .mfb
  4312. nop __LINE__
  4313. FMA f93 = ALPHA_I, f82, f93
  4314. nop __LINE__
  4315. }
  4316. ;;
  4317. { .mfb
  4318. nop __LINE__
  4319. FCALC_D f90 = ALPHA_I, f113, f90
  4320. nop __LINE__
  4321. }
  4322. { .mfb
  4323. nop __LINE__
  4324. FCALC_D f94 = ALPHA_I, f115, f94
  4325. nop __LINE__
  4326. }
  4327. ;;
  4328. { .mfi
  4329. nop __LINE__
  4330. FMA f91 = ALPHA_I, f112, f91
  4331. cmp.ne p6, p0 = 1, I
  4332. }
  4333. { .mfb
  4334. nop __LINE__
  4335. FMA f95 = ALPHA_I, f114, f95
  4336. nop __LINE__
  4337. }
  4338. ;;
  4339. { .mfb
  4340. STFD [C2] = f88, SIZE
  4341. mov f64 = f0
  4342. nop __LINE__
  4343. }
  4344. { .mfb
  4345. STFD [C6] = f92, SIZE
  4346. mov f65 = f0
  4347. nop __LINE__
  4348. }
  4349. ;;
  4350. { .mfi
  4351. STFD [C2] = f89, SIZE
  4352. mov f80 = f0
  4353. adds I = -1, I
  4354. }
  4355. { .mfb
  4356. STFD [C6] = f93, SIZE
  4357. mov f81 = f0
  4358. nop __LINE__
  4359. }
  4360. ;;
  4361. { .mfb
  4362. STFD [C2] = f90, SIZE
  4363. mov f96 = f0
  4364. nop __LINE__
  4365. }
  4366. { .mfb
  4367. STFD [C6] = f94, SIZE
  4368. mov f97 = f0
  4369. nop __LINE__
  4370. }
  4371. ;;
  4372. { .mfb
  4373. STFD [C2] = f91, 5 * SIZE
  4374. mov f112 = f0
  4375. nop __LINE__
  4376. }
  4377. { .mfb
  4378. STFD [C6] = f95, 5 * SIZE
  4379. mov f113 = f0
  4380. (p6) br.cond.dptk .L052
  4381. }
  4382. ;;
  4383. #else
  4384. { .mfb
  4385. nop __LINE__
  4386. FMA f72 = ALPHA_R, f64, f0
  4387. nop __LINE__
  4388. }
  4389. { .mfb
  4390. nop __LINE__
  4391. FMA f76 = ALPHA_R, f66, f0
  4392. nop __LINE__
  4393. }
  4394. ;;
  4395. { .mfb
  4396. nop __LINE__
  4397. FCALC_C f73 = ALPHA_R, f65, f0
  4398. nop __LINE__
  4399. }
  4400. { .mfb
  4401. nop __LINE__
  4402. FCALC_C f77 = ALPHA_R, f67, f0
  4403. nop __LINE__
  4404. }
  4405. ;;
  4406. { .mfb
  4407. nop __LINE__
  4408. FMA f74 = ALPHA_R, f96, f0
  4409. nop __LINE__
  4410. }
  4411. { .mfb
  4412. nop __LINE__
  4413. FMA f78 = ALPHA_R, f98, f0
  4414. nop __LINE__
  4415. }
  4416. ;;
  4417. { .mfb
  4418. nop __LINE__
  4419. FCALC_C f75 = ALPHA_R, f97, f0
  4420. nop __LINE__
  4421. }
  4422. { .mfb
  4423. nop __LINE__
  4424. FCALC_C f79 = ALPHA_R, f99, f0
  4425. nop __LINE__
  4426. }
  4427. ;;
  4428. { .mfb
  4429. nop __LINE__
  4430. FCALC_D f72 = ALPHA_I, f65, f72
  4431. nop __LINE__
  4432. }
  4433. { .mfb
  4434. nop __LINE__
  4435. FCALC_D f76 = ALPHA_I, f67, f76
  4436. nop __LINE__
  4437. }
  4438. ;;
  4439. { .mfb
  4440. nop __LINE__
  4441. FMA f73 = ALPHA_I, f64, f73
  4442. nop __LINE__
  4443. }
  4444. { .mfb
  4445. nop __LINE__
  4446. FMA f77 = ALPHA_I, f66, f77
  4447. nop __LINE__
  4448. }
  4449. ;;
  4450. { .mfb
  4451. nop __LINE__
  4452. FCALC_D f74 = ALPHA_I, f97, f74
  4453. nop __LINE__
  4454. }
  4455. { .mfb
  4456. nop __LINE__
  4457. FCALC_D f78 = ALPHA_I, f99, f78
  4458. nop __LINE__
  4459. }
  4460. ;;
  4461. { .mfb
  4462. nop __LINE__
  4463. FMA f75 = ALPHA_I, f96, f75
  4464. nop __LINE__
  4465. }
  4466. { .mfb
  4467. nop __LINE__
  4468. FMA f79 = ALPHA_I, f98, f79
  4469. nop __LINE__
  4470. }
  4471. ;;
  4472. { .mfb
  4473. STFD [C1] = f72, SIZE
  4474. FMA f88 = ALPHA_R, f80, f0
  4475. nop __LINE__
  4476. }
  4477. { .mfb
  4478. STFD [C5] = f76, SIZE
  4479. FMA f92 = ALPHA_R, f82, f0
  4480. nop __LINE__
  4481. }
  4482. ;;
  4483. { .mfb
  4484. STFD [C1] = f73, SIZE
  4485. FCALC_C f89 = ALPHA_R, f81, f0
  4486. nop __LINE__
  4487. }
  4488. { .mfb
  4489. STFD [C5] = f77, SIZE
  4490. FCALC_C f93 = ALPHA_R, f83, f0
  4491. nop __LINE__
  4492. }
  4493. ;;
  4494. { .mfb
  4495. STFD [C1] = f74, SIZE
  4496. FMA f90 = ALPHA_R, f112, f0
  4497. nop __LINE__
  4498. }
  4499. { .mfb
  4500. STFD [C5] = f78, SIZE
  4501. FMA f94 = ALPHA_R, f114, f0
  4502. nop __LINE__
  4503. }
  4504. ;;
  4505. { .mfb
  4506. STFD [C1] = f75, 5 * SIZE
  4507. FCALC_C f91 = ALPHA_R, f113, f0
  4508. nop __LINE__
  4509. }
  4510. { .mfb
  4511. STFD [C5] = f79, 5 * SIZE
  4512. FCALC_C f95 = ALPHA_R, f115, f0
  4513. nop __LINE__
  4514. }
  4515. ;;
  4516. { .mfb
  4517. nop __LINE__
  4518. FCALC_D f88 = ALPHA_I, f81, f88
  4519. nop __LINE__
  4520. }
  4521. { .mfb
  4522. nop __LINE__
  4523. FCALC_D f92 = ALPHA_I, f83, f92
  4524. nop __LINE__
  4525. }
  4526. ;;
  4527. { .mfb
  4528. nop __LINE__
  4529. FMA f89 = ALPHA_I, f80, f89
  4530. nop __LINE__
  4531. }
  4532. { .mfb
  4533. nop __LINE__
  4534. FMA f93 = ALPHA_I, f82, f93
  4535. nop __LINE__
  4536. }
  4537. ;;
  4538. { .mfi
  4539. nop __LINE__
  4540. FCALC_D f90 = ALPHA_I, f113, f90
  4541. #if defined(TRMMKERNEL) && \
  4542. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4543. sub L = K, KK
  4544. #else
  4545. nop __LINE__
  4546. #endif
  4547. }
  4548. { .mfi
  4549. nop __LINE__
  4550. FCALC_D f94 = ALPHA_I, f115, f94
  4551. cmp.ne p6, p0 = 1, I
  4552. }
  4553. ;;
  4554. { .mfi
  4555. nop __LINE__
  4556. FMA f91 = ALPHA_I, f112, f91
  4557. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  4558. adds L = -4, L
  4559. #else
  4560. nop __LINE__
  4561. #endif
  4562. }
  4563. { .mfi
  4564. nop __LINE__
  4565. FMA f95 = ALPHA_I, f114, f95
  4566. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  4567. adds L = -2, L
  4568. #else
  4569. nop __LINE__
  4570. #endif
  4571. }
  4572. ;;
  4573. { .mfi
  4574. STFD [C2] = f88, SIZE
  4575. mov f64 = f0
  4576. #if defined(TRMMKERNEL) && \
  4577. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4578. shladd KK8 = L, ZBASE_SHIFT, r0
  4579. #else
  4580. nop __LINE__
  4581. #endif
  4582. }
  4583. { .mfi
  4584. STFD [C6] = f92, SIZE
  4585. mov f65 = f0
  4586. adds I = -1, I
  4587. }
  4588. ;;
  4589. { .mfi
  4590. STFD [C2] = f89, SIZE
  4591. mov f80 = f0
  4592. #if defined(TRMMKERNEL) && \
  4593. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4594. shladd AOFFSET = KK8, 2, AOFFSET
  4595. #else
  4596. nop __LINE__
  4597. #endif
  4598. }
  4599. { .mfi
  4600. STFD [C6] = f93, SIZE
  4601. mov f81 = f0
  4602. #if defined(TRMMKERNEL) && \
  4603. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4604. shladd BOFFSET = KK8, 1, BOFFSET
  4605. #else
  4606. nop __LINE__
  4607. #endif
  4608. }
  4609. ;;
  4610. { .mfi
  4611. STFD [C2] = f90, SIZE
  4612. mov f96 = f0
  4613. #if defined(TRMMKERNEL) && defined(LEFT)
  4614. adds KK = 4, KK
  4615. #else
  4616. nop __LINE__
  4617. #endif
  4618. }
  4619. { .mfb
  4620. STFD [C6] = f94, SIZE
  4621. mov f97 = f0
  4622. nop __LINE__
  4623. }
  4624. ;;
  4625. { .mfi
  4626. STFD [C2] = f91, 5 * SIZE
  4627. mov f112 = f0
  4628. #ifdef TRMMKERNEL
  4629. shladd KK8 = KK, ZBASE_SHIFT, r0
  4630. #else
  4631. nop __LINE__
  4632. #endif
  4633. }
  4634. { .mfb
  4635. STFD [C6] = f95, 5 * SIZE
  4636. mov f113 = f0
  4637. (p6) br.cond.dptk .L052
  4638. }
  4639. ;;
  4640. #endif
  /* .L060: setup for the block that runs when bit 1 of M is set. */
  /* It chooses the k-loop length, positions AOFFSET and BOFFSET, preloads */
  /* the first operand pairs and programs ar.lc for the loop at .L062. */
  4641. .align 16
  4642. .L060:
  4643. { .mib
  4644. #ifndef TRMMKERNEL
  4645. nop __LINE__
  4646. #else
  /* TRMM builds: L = K - KK, or L = KK + 2, selected by LEFT and TRANSA. */
  4647. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4648. sub L = K, KK
  4649. #elif defined(LEFT)
  4650. adds L = 2, KK
  4651. #else
  4652. adds L = 2, KK
  4653. #endif
  4654. #endif
  /* p6 is set when bit 1 of M is clear; in that case skip ahead to .L070. */
  4655. tbit.z p6, p7 = M, 1
  4656. (p6) br.cond.dptk .L070
  4657. }
  4658. ;;
  /* Variant 1: B is read from its start (BOFFSET = B + 2 * SIZE after the */
  /* first pair has been loaded) and AOFFSET is used unchanged. */
  4659. #if !defined(TRMMKERNEL) || \
  4660. defined(TRMMKERNEL) && \
  4661. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4662. { .mmi
  4663. LDFPD f48, f49 = [B]
  4664. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  4665. nop __LINE__
  4666. }
  4667. { .mmi
  4668. adds BOFFSET = 2 * SIZE, B
  /* r0 == r0 always holds, so this sets p3 unconditionally. */
  4669. cmp.eq p3, p0 = r0, r0
  /* L becomes (k count + 1): K + 1 for GEMM, L + 1 for TRMM. */
  4670. #ifndef TRMMKERNEL
  4671. adds L = 1, K
  4672. #else
  4673. adds L = 1, L
  4674. #endif
  4675. }
  4676. ;;
  4677. #else
  /* Variant 2 (remaining TRMM cases): both pointers are advanced by */
  /* KK8 * 2 before the first loads (shladd by 1). */
  4678. { .mmi
  4679. shladd BOFFSET = KK8, 1, B
  4680. shladd AOFFSET = KK8, 1, AOFFSET
  4681. cmp.eq p3, p0 = r0, r0
  4682. }
  4683. ;;
  4684. { .mmi
  4685. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4686. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  4687. #ifndef TRMMKERNEL
  4688. adds L = 1, K
  4689. #else
  4690. adds L = 1, L
  4691. #endif
  4692. }
  4693. ;;
  4694. #endif
  /* Preload the first A pair, and set p12 when bit 0 of L is clear. */
  /* The shr in the next bundle has no stop bit before it, so tbit.z */
  /* still tests the value of L from before the shift. */
  4695. { .mmi
  4696. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4697. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  4698. tbit.z p12, p0 = L, 0
  4699. }
  4700. { .mmi
  4701. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4702. shr L = L, 1
  4703. }
  4704. ;;
  /* The loop counter is (L >> 1) - 1, because br.cloop runs ar.lc + 1 times. */
  4705. { .mmi
  4706. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4707. nop __LINE__
  4708. adds L = -1, L
  4709. }
  4710. ;;
  4711. { .mmi
  4712. nop __LINE__
  4713. nop __LINE__
  4714. mov ar.lc = L
  4715. }
  4716. ;;
  /* .L062: inner k loop for the block set up at .L060, unrolled by two. */
  /* First half (unpredicated): A operands f32-f35, B operands f48-f51. */
  /* Second half (under p3): A operands f40-f43, B operands f56-f59. */
  /* Accumulators are f64/f65, f80/f81, f96/f97 and f112/f113. */
  /* FMA, FMA_A and FMA_B are macros defined outside this chunk. */
  /* NOTE(review): presumably they select add or subtract according to */
  /* the conjugation variant being built - confirm against the macro */
  /* definitions. */
  4717. .align 16
  4718. .L062:
  4719. { .mfi
  4720. lfetch.nt1 [PREA], 8 * SIZE
  4721. FMA f64 = f32, f48, f64 // A1 * B1
  /* p4 = (L != 0): another pass follows, so reload the first-half operands. */
  /* p5 = (L == 0): final pass, so load the C values instead. */
  4722. cmp.ne p4, p5 = 0, L
  4723. }
  4724. { .mfi
  4725. nop __LINE__
  4726. FMA_B f65 = f32, f49, f65 // A1 * B2
  /* When p12 is set, p3 is cleared on the final pass (L == 0), which */
  /* disables the second half of the unrolled body. */
  4727. (p12) cmp.ne p3, p0 = 0, L
  4728. }
  4729. ;;
  4730. { .mfb
  4731. lfetch.nt1 [PREB], 8 * SIZE
  4732. FMA f80 = f32, f50, f80 // A1 * B3
  4733. nop __LINE__
  4734. }
  4735. { .mfb
  4736. nop __LINE__
  4737. FMA_B f81 = f32, f51, f81 // A1 * B4
  4738. nop __LINE__
  4739. }
  4740. ;;
  /* Loads for the second half are interleaved with first-half arithmetic. */
  4741. { .mfb
  4742. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  4743. FMA f96 = f34, f48, f96 // A3 * B1
  4744. nop __LINE__
  4745. }
  4746. { .mfb
  4747. nop __LINE__
  4748. FMA_B f97 = f34, f49, f97 // A3 * B2
  4749. nop __LINE__
  4750. }
  4751. ;;
  4752. { .mfb
  4753. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  4754. FMA f112 = f34, f50, f112 // A3 * B3
  4755. nop __LINE__
  4756. }
  4757. { .mfb
  4758. nop __LINE__
  4759. FMA_B f113 = f34, f51, f113 // A3 * B4
  4760. nop __LINE__
  4761. }
  4762. ;;
  4763. { .mfb
  4764. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4765. FMA f65 = f33, f48, f65 // A2 * B1
  4766. nop __LINE__
  4767. }
  4768. { .mfb
  4769. nop __LINE__
  4770. FMA_A f64 = f33, f49, f64 // A2 * B2
  4771. nop __LINE__
  4772. }
  4773. ;;
  4774. { .mfb
  4775. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  4776. FMA f81 = f33, f50, f81 // A2 * B3
  4777. nop __LINE__
  4778. }
  4779. { .mfb
  4780. nop __LINE__
  4781. FMA_A f80 = f33, f51, f80 // A2 * B4
  4782. nop __LINE__
  4783. }
  4784. ;;
  /* Under p4, reload the first-half operands for the next pass. */
  4785. { .mfb
  4786. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4787. FMA f97 = f35, f48, f97 // A4 * B1
  4788. }
  4789. { .mfb
  4790. FMA_A f96 = f35, f49, f96 // A4 * B2
  4791. nop __LINE__
  4792. }
  4793. { .mfb
  4794. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4795. FMA f113 = f35, f50, f113 // A4 * B3
  4796. nop __LINE__
  4797. }
  4798. { .mfb
  4799. FMA_A f112 = f35, f51, f112 // A4 * B4
  4800. nop __LINE__
  4801. }
  4802. ;;
  /* Second half of the unrolled body, executed only while p3 is set. */
  4803. { .mfb
  4804. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4805. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4806. nop __LINE__
  4807. }
  4808. { .mfb
  4809. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  4810. nop __LINE__
  4811. }
  4812. ;;
  4813. { .mfb
  4814. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4815. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4816. nop __LINE__
  4817. }
  4818. { .mfb
  4819. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  4820. nop __LINE__
  4821. }
  4822. ;;
  /* Final pass only (p5), and only when C is read (not TRMM, not BETAZERO): */
  /* load four values from [C1] into f72-f75 and four from [C2] into f88-f91. */
  4823. { .mfb
  4824. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4825. (p5) LDFD f72 = [C1 ], SIZE
  4826. #else
  4827. nop __LINE__
  4828. #endif
  4829. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  4830. nop __LINE__
  4831. }
  4832. { .mfb
  4833. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4834. (p5) LDFD f88 = [C2 ], SIZE
  4835. #else
  4836. nop __LINE__
  4837. #endif
  4838. (p3) FMA_B f97 = f42, f57, f97 // A3 * B2
  4839. nop __LINE__
  4840. }
  4841. ;;
  4842. { .mfb
  4843. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4844. (p5) LDFD f73 = [C1 ], SIZE
  4845. #else
  4846. nop __LINE__
  4847. #endif
  4848. (p3) FMA f112 = f42, f58, f112 // A3 * B3
  4849. nop __LINE__
  4850. }
  4851. { .mfb
  4852. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4853. (p5) LDFD f89 = [C2 ], SIZE
  4854. #else
  4855. nop __LINE__
  4856. #endif
  4857. (p3) FMA_B f113 = f42, f59, f113 // A3 * B4
  4858. nop __LINE__
  4859. }
  4860. ;;
  4861. { .mfb
  4862. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4863. (p5) LDFD f74 = [C1 ], SIZE
  4864. #else
  4865. nop __LINE__
  4866. #endif
  4867. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4868. nop __LINE__
  4869. }
  4870. { .mfb
  4871. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4872. (p5) LDFD f90 = [C2 ], SIZE
  4873. #else
  4874. nop __LINE__
  4875. #endif
  4876. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  4877. nop __LINE__
  4878. }
  4879. ;;
  /* The fourth C load post-increments by -3 * SIZE, which puts C1 and C2 */
  /* back at the start of the tile for the stores that follow the loop. */
  4880. { .mfb
  4881. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4882. (p5) LDFD f75 = [C1 ], -3 * SIZE
  4883. #else
  4884. nop __LINE__
  4885. #endif
  4886. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4887. nop __LINE__
  4888. }
  4889. { .mfb
  4890. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4891. (p5) LDFD f91 = [C2 ], -3 * SIZE
  4892. #else
  4893. nop __LINE__
  4894. #endif
  4895. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  4896. nop __LINE__
  4897. }
  4898. ;;
  4899. { .mfb
  4900. nop __LINE__
  4901. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  4902. nop __LINE__
  4903. }
  4904. { .mfb
  4905. nop __LINE__
  4906. (p3) FMA_A f96 = f43, f57, f96 // A4 * B2
  4907. nop __LINE__
  4908. }
  4909. ;;
  /* L is a software copy of the remaining pass count and feeds the */
  /* p3/p4/p5 tests above; the branch itself counts down ar.lc. */
  4910. { .mfi
  4911. nop __LINE__
  4912. (p3) FMA f113 = f43, f58, f113 // A4 * B3
  4913. adds L = -1, L
  4914. }
  4915. { .mfb
  4916. nop __LINE__
  4917. (p3) FMA_A f112 = f43, f59, f112 // A4 * B4
  4918. br.cloop.sptk.few .L062
  4919. }
  4920. ;;
  /* Write-back for the .L062 tile: combine the accumulators f64/f65, */
  /* f80/f81, f96/f97 and f112/f113 with ALPHA_R and ALPHA_I, store f72-f75 */
  /* through C1 and f88-f91 through C2, and clear the accumulators. */
  /* FCALC_C and FCALC_D are macros defined outside this chunk. */
  /* NOTE(review): presumably fused multiply-add or multiply-subtract forms */
  /* chosen per conjugation variant - confirm against their definitions. */
  /* First variant: the values loaded from C inside the loop are the addends. */
  4921. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  /* ALPHA_R terms, added onto the loaded C values. */
  4922. { .mfb
  4923. nop __LINE__
  4924. FMA f72 = ALPHA_R, f64, f72
  4925. nop __LINE__
  4926. }
  4927. { .mfb
  4928. nop __LINE__
  4929. FMA f88 = ALPHA_R, f80, f88
  4930. nop __LINE__
  4931. }
  4932. ;;
  4933. { .mfb
  4934. nop __LINE__
  4935. FCALC_C f73 = ALPHA_R, f65, f73
  4936. nop __LINE__
  4937. }
  4938. { .mfb
  4939. nop __LINE__
  4940. FCALC_C f89 = ALPHA_R, f81, f89
  4941. nop __LINE__
  4942. }
  4943. ;;
  4944. { .mfb
  4945. nop __LINE__
  4946. FMA f74 = ALPHA_R, f96, f74
  4947. nop __LINE__
  4948. }
  4949. { .mfb
  4950. nop __LINE__
  4951. FMA f90 = ALPHA_R, f112, f90
  4952. nop __LINE__
  4953. }
  4954. ;;
  4955. { .mfb
  4956. nop __LINE__
  4957. FCALC_C f75 = ALPHA_R, f97, f75
  4958. nop __LINE__
  4959. }
  4960. { .mfb
  4961. nop __LINE__
  4962. FCALC_C f91 = ALPHA_R, f113, f91
  4963. nop __LINE__
  4964. }
  4965. ;;
  /* ALPHA_I terms, accumulated onto the partial results above. */
  4966. { .mfb
  4967. nop __LINE__
  4968. FCALC_D f72 = ALPHA_I, f65, f72
  4969. nop __LINE__
  4970. }
  4971. { .mfb
  4972. nop __LINE__
  4973. FCALC_D f88 = ALPHA_I, f81, f88
  4974. nop __LINE__
  4975. }
  4976. ;;
  4977. { .mfb
  4978. nop __LINE__
  4979. FMA f73 = ALPHA_I, f64, f73
  4980. nop __LINE__
  4981. }
  4982. { .mfb
  4983. nop __LINE__
  4984. FMA f89 = ALPHA_I, f80, f89
  4985. nop __LINE__
  4986. }
  4987. ;;
  4988. { .mfb
  4989. nop __LINE__
  4990. FCALC_D f74 = ALPHA_I, f97, f74
  4991. nop __LINE__
  4992. }
  4993. { .mfb
  4994. nop __LINE__
  4995. FCALC_D f90 = ALPHA_I, f113, f90
  4996. nop __LINE__
  4997. }
  4998. ;;
  4999. { .mfb
  5000. nop __LINE__
  5001. FMA f75 = ALPHA_I, f96, f75
  5002. nop __LINE__
  5003. }
  5004. { .mfb
  5005. nop __LINE__
  5006. FMA f91 = ALPHA_I, f112, f91
  5007. nop __LINE__
  5008. }
  5009. ;;
  /* Store the results and reset each accumulator to f0 for the next tile. */
  5010. { .mfb
  5011. STFD [C1] = f72, SIZE
  5012. mov f64 = f0
  5013. nop __LINE__
  5014. }
  5015. { .mfb
  5016. STFD [C2] = f88, SIZE
  5017. mov f65 = f0
  5018. nop __LINE__
  5019. }
  5020. ;;
  5021. { .mfb
  5022. STFD [C1] = f73, SIZE
  5023. mov f80 = f0
  5024. nop __LINE__
  5025. }
  5026. { .mfb
  5027. STFD [C2] = f89, SIZE
  5028. mov f81 = f0
  5029. nop __LINE__
  5030. }
  5031. ;;
  /* NOTE(review): L is set to (K + 1) >> 1 here and in the bundle two */
  /* groups below, but the non-TRMM path at .L070 assigns L = K + 1 again */
  /* before reading it. This value looks unused on that path - confirm. */
  5032. { .mfi
  5033. STFD [C1] = f74, SIZE
  5034. mov f96 = f0
  5035. adds L = 1, K
  5036. }
  5037. { .mfb
  5038. STFD [C2] = f90, SIZE
  5039. mov f97 = f0
  5040. nop __LINE__
  5041. }
  5042. ;;
  5043. { .mfi
  5044. STFD [C1] = f75, SIZE
  5045. mov f112 = f0
  5046. shr L = L, 1
  5047. }
  5048. { .mfb
  5049. STFD [C2] = f91, SIZE
  5050. mov f113 = f0
  5051. nop __LINE__
  5052. }
  5053. ;;
  5054. #else
  /* Second variant (TRMM or BETAZERO): C is not read, so the addend of */
  /* every ALPHA_R term is f0. */
  5055. { .mfb
  5056. nop __LINE__
  5057. FMA f72 = ALPHA_R, f64, f0
  5058. nop __LINE__
  5059. }
  5060. { .mfb
  5061. nop __LINE__
  5062. FMA f88 = ALPHA_R, f80, f0
  5063. nop __LINE__
  5064. }
  5065. ;;
  5066. { .mfb
  5067. nop __LINE__
  5068. FCALC_C f73 = ALPHA_R, f65, f0
  5069. nop __LINE__
  5070. }
  5071. { .mfb
  5072. nop __LINE__
  5073. FCALC_C f89 = ALPHA_R, f81, f0
  5074. nop __LINE__
  5075. }
  5076. ;;
  5077. { .mfb
  5078. nop __LINE__
  5079. FMA f74 = ALPHA_R, f96, f0
  5080. nop __LINE__
  5081. }
  5082. { .mfb
  5083. nop __LINE__
  5084. FMA f90 = ALPHA_R, f112, f0
  5085. nop __LINE__
  5086. }
  5087. ;;
  5088. { .mfb
  5089. nop __LINE__
  5090. FCALC_C f75 = ALPHA_R, f97, f0
  5091. nop __LINE__
  5092. }
  5093. { .mfb
  5094. nop __LINE__
  5095. FCALC_C f91 = ALPHA_R, f113, f0
  5096. nop __LINE__
  5097. }
  5098. ;;
  5099. { .mfb
  5100. nop __LINE__
  5101. FCALC_D f72 = ALPHA_I, f65, f72
  5102. nop __LINE__
  5103. }
  5104. { .mfb
  5105. nop __LINE__
  5106. FCALC_D f88 = ALPHA_I, f81, f88
  5107. nop __LINE__
  5108. }
  5109. ;;
  5110. { .mfb
  5111. nop __LINE__
  5112. FMA f73 = ALPHA_I, f64, f73
  5113. nop __LINE__
  5114. }
  5115. { .mfb
  5116. nop __LINE__
  5117. FMA f89 = ALPHA_I, f80, f89
  5118. nop __LINE__
  5119. }
  5120. ;;
  /* TRMM bookkeeping shares bundles with the remaining arithmetic. */
  /* For LEFT+TRANSA or !LEFT+!TRANSA: L = K - KK - 2, KK8 = L << ZBASE_SHIFT, */
  /* then AOFFSET and BOFFSET are each advanced by KK8 * 2. */
  5121. { .mfi
  5122. nop __LINE__
  5123. FCALC_D f74 = ALPHA_I, f97, f74
  5124. #if defined(TRMMKERNEL) && \
  5125. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5126. sub L = K, KK
  5127. #else
  5128. nop __LINE__
  5129. #endif
  5130. }
  5131. { .mfb
  5132. nop __LINE__
  5133. FCALC_D f90 = ALPHA_I, f113, f90
  5134. nop __LINE__
  5135. }
  5136. ;;
  5137. { .mfi
  5138. nop __LINE__
  5139. FMA f75 = ALPHA_I, f96, f75
  5140. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5141. adds L = -2, L
  5142. #else
  5143. nop __LINE__
  5144. #endif
  5145. }
  5146. { .mfi
  5147. nop __LINE__
  5148. FMA f91 = ALPHA_I, f112, f91
  5149. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5150. adds L = -2, L
  5151. #else
  5152. nop __LINE__
  5153. #endif
  5154. }
  5155. ;;
  5156. { .mfi
  5157. STFD [C1] = f72, SIZE
  5158. mov f64 = f0
  5159. #if defined(TRMMKERNEL) && \
  5160. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5161. shladd KK8 = L, ZBASE_SHIFT, r0
  5162. #else
  5163. nop __LINE__
  5164. #endif
  5165. }
  5166. { .mfb
  5167. STFD [C2] = f88, SIZE
  5168. mov f65 = f0
  5169. nop __LINE__
  5170. }
  5171. ;;
  5172. { .mfi
  5173. STFD [C1] = f73, SIZE
  5174. mov f80 = f0
  5175. #if defined(TRMMKERNEL) && \
  5176. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5177. shladd AOFFSET = KK8, 1, AOFFSET
  5178. #else
  5179. nop __LINE__
  5180. #endif
  5181. }
  5182. { .mfi
  5183. STFD [C2] = f89, SIZE
  5184. mov f81 = f0
  5185. #if defined(TRMMKERNEL) && \
  5186. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5187. shladd BOFFSET = KK8, 1, BOFFSET
  5188. #else
  5189. nop __LINE__
  5190. #endif
  5191. }
  5192. ;;
  /* TRMM with LEFT: KK advances by 2 once this block is done. */
  5193. { .mfi
  5194. STFD [C1] = f74, SIZE
  5195. mov f96 = f0
  5196. #if defined(TRMMKERNEL) && defined(LEFT)
  5197. adds KK = 2, KK
  5198. #else
  5199. nop __LINE__
  5200. #endif
  5201. }
  5202. { .mfb
  5203. STFD [C2] = f90, SIZE
  5204. mov f97 = f0
  5205. nop __LINE__
  5206. }
  5207. ;;
  /* Recompute KK8 = KK << ZBASE_SHIFT from the updated KK. */
  5208. { .mfi
  5209. STFD [C1] = f75, SIZE
  5210. mov f112 = f0
  5211. #ifdef TRMMKERNEL
  5212. shladd KK8 = KK, ZBASE_SHIFT, r0
  5213. #else
  5214. nop __LINE__
  5215. #endif
  5216. }
  5217. { .mfb
  5218. STFD [C2] = f91, SIZE
  5219. mov f113 = f0
  5220. nop __LINE__
  5221. }
  5222. ;;
  5223. #endif
  /* .L070: setup for the block that runs when bit 0 of M is set. */
  /* It chooses the k-loop length, positions AOFFSET and BOFFSET, preloads */
  /* one A pair and two B pairs, and programs ar.lc for the loop at .L072. */
  5224. .align 16
  5225. .L070:
  5226. { .mib
  5227. #ifndef TRMMKERNEL
  5228. nop __LINE__
  5229. #else
  /* TRMM builds: L = K - KK, KK + 1 (LEFT) or KK + 2 (otherwise). */
  5230. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5231. sub L = K, KK
  5232. #elif defined(LEFT)
  5233. adds L = 1, KK
  5234. #else
  5235. adds L = 2, KK
  5236. #endif
  5237. #endif
  /* p6 is set when bit 0 of M is clear; in that case skip ahead to .L089. */
  5238. tbit.z p6, p7 = M, 0
  5239. (p6) br.cond.dptk .L089
  5240. }
  5241. ;;
  /* Variant 1: B is read from its start and AOFFSET is used unchanged. */
  5242. #if !defined(TRMMKERNEL) || \
  5243. defined(TRMMKERNEL) && \
  5244. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5245. { .mmi
  5246. LDFPD f48, f49 = [B]
  5247. adds BOFFSET = 2 * SIZE, B
  5248. #ifndef TRMMKERNEL
  5249. adds L = 1, K
  5250. #else
  5251. adds L = 1, L
  5252. #endif
  5253. }
  5254. ;;
  5255. #else
  /* Variant 2 (remaining TRMM cases): BOFFSET = B + KK8 * 2 and */
  /* AOFFSET += KK8 before the first loads. */
  5256. { .mmi
  5257. shladd BOFFSET = KK8, 1, B
  5258. add AOFFSET = KK8, AOFFSET
  5259. nop __LINE__
  5260. }
  5261. ;;
  5262. { .mmi
  5263. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5264. nop __LINE__
  5265. #ifndef TRMMKERNEL
  5266. adds L = 1, K
  5267. #else
  5268. adds L = 1, L
  5269. #endif
  5270. }
  5271. ;;
  5272. #endif
  5273. ;;
  /* p12 is set when bit 0 of L is clear. The tbit.z and the shr share an */
  /* instruction group, so tbit.z tests the value of L before the shift. */
  5274. { .mii
  5275. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5276. tbit.z p12, p0 = L, 0
  5277. shr L = L, 1
  5278. }
  5279. ;;
  /* The loop counter is (L >> 1) - 1, because br.cloop runs ar.lc + 1 times. */
  5280. { .mmi
  5281. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5282. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5283. adds L = -1, L
  5284. }
  5285. ;;
  /* r0 == r0 always holds, so p3 starts out set. */
  5286. { .mmi
  5287. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5288. cmp.eq p3, p0 = r0, r0
  5289. mov ar.lc = L
  5290. }
  5291. ;;
  /* .L072: inner k loop for the block set up at .L070, unrolled by two. */
  /* Each half multiplies one A pair by two B pairs using the plain FMA */
  /* macro into eight separate accumulators (f64, f65, f80, f81, f96, f97, */
  /* f112, f113); the pairs are combined after the loop with FCALC_A and */
  /* FCALC_B. */
  /* First half uses f32/f33 with f48-f51; second half, under p3, uses */
  /* f40/f41 with f56-f59. */
  5292. .align 16
  5293. .L072:
  5294. { .mfb
  5295. lfetch.nt1 [PREA], 4 * SIZE
  5296. FMA f64 = f32, f48, f64 // A1 * B1
  5297. nop __LINE__
  5298. }
  5299. { .mfi
  5300. nop __LINE__
  5301. FMA f96 = f32, f49, f96 // A1 * B2
  /* When p12 is set, p3 is cleared on the final pass (L == 0), which */
  /* disables the second half of the unrolled body. */
  5302. (p12) cmp.ne p3, p0 = 0, L
  5303. }
  5304. ;;
  5305. { .mfi
  5306. lfetch.nt1 [PREB], 8 * SIZE
  5307. FMA f80 = f32, f50, f80 // A1 * B3
  /* p4 = (L != 0): reload operands for the next pass. */
  /* p5 = (L == 0): final pass, so load the C values. */
  5308. cmp.ne p4, p5 = 0, L
  5309. }
  5310. { .mfb
  5311. nop __LINE__
  5312. FMA f112 = f32, f51, f112 // A1 * B4
  5313. nop __LINE__
  5314. }
  5315. ;;
  5316. { .mfi
  5317. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5318. FMA f65 = f33, f48, f65 // A2 * B1
  5319. }
  5320. { .mfi
  5321. nop __LINE__
  5322. FMA f97 = f33, f49, f97 // A2 * B2
  5323. }
  5324. ;;
  5325. { .mfi
  5326. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5327. FMA f81 = f33, f50, f81 // A2 * B3
  5328. }
  5329. { .mmf
  5330. nop __LINE__
  5331. nop __LINE__
  5332. FMA f113 = f33, f51, f113 // A2 * B4
  5333. }
  5334. ;;
  /* Second half of the unrolled body, executed only while p3 is set. */
  5335. { .mfb
  5336. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5337. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5338. nop __LINE__
  5339. }
  5340. { .mmf
  5341. nop __LINE__
  5342. nop __LINE__
  5343. (p3) FMA f96 = f40, f57, f96 // A1 * B2
  5344. }
  5345. ;;
  5346. { .mfb
  5347. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5348. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5349. nop __LINE__
  5350. }
  /* Final pass only (p5), and only when C is read: load f72 from [C1] */
  /* and f88 from [C2]. */
  5351. { .mmf
  5352. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5353. (p5) LDFD f72 = [C1 ], SIZE
  5354. (p5) LDFD f88 = [C2 ], SIZE
  5355. #else
  5356. nop __LINE__
  5357. nop __LINE__
  5358. #endif
  5359. (p3) FMA f112 = f40, f59, f112 // A1 * B4
  5360. }
  5361. ;;
  5362. { .mfb
  5363. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5364. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5365. nop __LINE__
  5366. }
  /* The second C load post-increments by -SIZE, which puts C1 back at */
  /* the start of the element for the stores that follow the loop. */
  5367. { .mfb
  5368. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5369. (p5) LDFD f73 = [C1 ], - SIZE
  5370. #else
  5371. nop __LINE__
  5372. #endif
  5373. (p3) FMA f97 = f41, f57, f97 // A2 * B2
  5374. nop __LINE__
  5375. }
  5376. ;;
  /* L is a software copy of the remaining pass count; the branch itself */
  /* counts down ar.lc. */
  5377. { .mfi
  5378. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5379. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  5380. adds L = -1, L
  5381. }
  5382. { .mfb
  5383. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5384. (p5) LDFD f89 = [C2 ], - SIZE
  5385. #else
  5386. nop __LINE__
  5387. #endif
  5388. (p3) FMA f113 = f41, f59, f113 // A2 * B4
  5389. br.cloop.sptk.few .L072
  5390. }
  5391. ;;
  /* Write-back for the .L072 tile. First merge the eight partial sums */
  /* into f64/f65 and f80/f81. FCALC_A and FCALC_B are macros defined */
  /* outside this chunk. */
  /* NOTE(review): presumably an add or subtract chosen per conjugation */
  /* variant - confirm against their definitions. */
  5392. { .mfb
  5393. nop __LINE__
  5394. FCALC_A f64 = f64, f97
  5395. nop __LINE__
  5396. }
  5397. { .mfb
  5398. nop __LINE__
  5399. FCALC_A f80 = f80, f113
  5400. nop __LINE__
  5401. }
  5402. { .mfb
  5403. nop __LINE__
  5404. FCALC_B f65 = f65, f96
  5405. nop __LINE__
  5406. }
  5407. { .mfb
  5408. nop __LINE__
  5409. FCALC_B f81 = f81, f112
  5410. nop __LINE__
  5411. }
  5412. ;;
  /* First variant: the values loaded from C inside the loop (f72, f73, */
  /* f88, f89) are the addends. The setf.d from r0 instructions clear the */
  /* accumulators for the next tile in otherwise idle memory slots. */
  5413. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5414. { .mfb
  5415. setf.d f96 = r0
  5416. FMA f72 = ALPHA_R, f64, f72
  5417. nop __LINE__
  5418. }
  5419. { .mfb
  5420. setf.d f97 = r0
  5421. FMA f88 = ALPHA_R, f80, f88
  5422. nop __LINE__
  5423. }
  5424. ;;
  5425. { .mfb
  5426. setf.d f112 = r0
  5427. FCALC_C f73 = ALPHA_R, f65, f73
  5428. nop __LINE__
  5429. }
  5430. { .mfb
  5431. setf.d f113 = r0
  5432. FCALC_C f89 = ALPHA_R, f81, f89
  5433. nop __LINE__
  5434. }
  5435. ;;
  5436. { .mfb
  5437. nop __LINE__
  5438. FCALC_D f72 = ALPHA_I, f65, f72
  5439. nop __LINE__
  5440. }
  5441. { .mfb
  5442. setf.d f65 = r0
  5443. FCALC_D f88 = ALPHA_I, f81, f88
  5444. nop __LINE__
  5445. }
  5446. ;;
  5447. { .mfb
  5448. setf.d f81 = r0
  5449. FMA f73 = ALPHA_I, f64, f73
  5450. nop __LINE__
  5451. }
  5452. { .mfb
  5453. setf.d f64 = r0
  5454. FMA f89 = ALPHA_I, f80, f89
  5455. nop __LINE__
  5456. }
  5457. ;;
  /* Store two values through each of C1 and C2. */
  5458. { .mmf
  5459. STFD [C1] = f72, SIZE
  5460. STFD [C2] = f88, SIZE
  5461. mov f80 = f0
  5462. }
  5463. ;;
  /* B is set to BOFFSET here; .L089 performs the same move again. */
  5464. { .mmi
  5465. STFD [C1] = f73, SIZE
  5466. STFD [C2] = f89, SIZE
  5467. mov B = BOFFSET
  5468. }
  5469. ;;
  5470. #else
  /* Second variant (TRMM or BETAZERO): C is not read, so the addend of */
  /* every ALPHA_R term is f0. TRMM pointer bookkeeping shares the bundles. */
  /* For LEFT+TRANSA: L = K - KK - 1. For !LEFT+!TRANSA: L = K - KK - 2. */
  /* Then KK8 = L << ZBASE_SHIFT, AOFFSET += KK8 and BOFFSET += KK8 * 2. */
  5471. { .mfi
  5472. setf.d f96 = r0
  5473. FMA f72 = ALPHA_R, f64, f0
  5474. #if defined(TRMMKERNEL) && \
  5475. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5476. sub L = K, KK
  5477. #else
  5478. nop __LINE__
  5479. #endif
  5480. }
  5481. { .mfb
  5482. setf.d f97 = r0
  5483. FMA f88 = ALPHA_R, f80, f0
  5484. nop __LINE__
  5485. }
  5486. ;;
  5487. { .mfi
  5488. setf.d f112 = r0
  5489. FCALC_C f73 = ALPHA_R, f65, f0
  5490. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5491. adds L = -1, L
  5492. #else
  5493. nop __LINE__
  5494. #endif
  5495. }
  5496. { .mfi
  5497. setf.d f113 = r0
  5498. FCALC_C f89 = ALPHA_R, f81, f0
  5499. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5500. adds L = -2, L
  5501. #else
  5502. nop __LINE__
  5503. #endif
  5504. }
  5505. ;;
  5506. { .mfi
  5507. nop __LINE__
  5508. FCALC_D f72 = ALPHA_I, f65, f72
  5509. #if defined(TRMMKERNEL) && \
  5510. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5511. shladd KK8 = L, ZBASE_SHIFT, r0
  5512. #else
  5513. nop __LINE__
  5514. #endif
  5515. }
  5516. { .mfb
  5517. setf.d f65 = r0
  5518. FCALC_D f88 = ALPHA_I, f81, f88
  5519. nop __LINE__
  5520. }
  5521. ;;
  5522. { .mfi
  5523. setf.d f81 = r0
  5524. FMA f73 = ALPHA_I, f64, f73
  5525. #if defined(TRMMKERNEL) && \
  5526. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5527. add AOFFSET = KK8, AOFFSET
  5528. #else
  5529. nop __LINE__
  5530. #endif
  5531. }
  5532. { .mfi
  5533. setf.d f64 = r0
  5534. FMA f89 = ALPHA_I, f80, f89
  5535. #if defined(TRMMKERNEL) && \
  5536. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5537. shladd BOFFSET = KK8, 1, BOFFSET
  5538. #else
  5539. nop __LINE__
  5540. #endif
  5541. }
  5542. ;;
  5543. { .mmf
  5544. nop __LINE__
  5545. nop __LINE__
  5546. mov f80 = f0
  5547. }
  5548. ;;
  /* TRMM with LEFT: KK advances by 1 once this block is done. */
  5549. { .mmi
  5550. STFD [C1] = f72, SIZE
  5551. STFD [C2] = f88, SIZE
  5552. #if defined(TRMMKERNEL) && defined(LEFT)
  5553. adds KK = 1, KK
  5554. #else
  5555. nop __LINE__
  5556. #endif
  5557. }
  5558. ;;
  /* Recompute KK8 = KK << ZBASE_SHIFT from the updated KK. */
  5559. { .mmi
  5560. STFD [C1] = f73, SIZE
  5561. STFD [C2] = f89, SIZE
  5562. #ifdef TRMMKERNEL
  5563. shladd KK8 = KK, ZBASE_SHIFT, r0
  5564. #else
  5565. nop __LINE__
  5566. #endif
  5567. }
  5568. #endif
  5569. ;;
  /* .L089: wrap-up reached from .L070 (branch) or by falling through */
  /* after the .L072 write-back. Sets B to the current BOFFSET, resets */
  /* AOFFSET to A and, for TRMM without LEFT, advances KK by 2. */
  5570. .align 16
  5571. .L089:
  5572. { .mmi
  5573. mov B = BOFFSET
  5574. mov AOFFSET = A
  5575. #if defined(TRMMKERNEL) && !defined(LEFT)
  5576. adds KK = 2, KK
  5577. #else
  5578. nop __LINE__
  5579. #endif
  5580. }
  5581. ;;
  /* .L090: entry for the part handled when bit 0 of N is set. */
  /* Resets C1 to C and AOFFSET to A, clears f64-f67 and f72-f75, computes */
  /* I = M >> 2, and branches to .L999 when bit 0 of N is clear or to */
  /* .L100 when I is zero. */
  5582. .align 16
  5583. .L090:
  5584. { .mfi
  5585. mov C1 = C
  5586. mov f64 = f0
  /* p6 is set when bit 0 of N is clear; it is consumed by the .L999 branch below. */
  5587. tbit.z p6, p0 = N, 0
  5588. }
  5589. { .mfi
  /* TRMM with LEFT: KK restarts from OFFSET. */
  5590. #if defined(TRMMKERNEL) && defined(LEFT)
  5591. mov KK = OFFSET
  5592. #else
  5593. nop __LINE__
  5594. #endif
  5595. mov f72 = f0
  5596. shr I = M, 2
  5597. }
  5598. ;;
  5599. { .mfi
  5600. setf.d f66 = r0
  5601. mov f65 = f0
  /* KK8 = KK << ZBASE_SHIFT. */
  5602. #ifdef TRMMKERNEL
  5603. shladd KK8 = KK, ZBASE_SHIFT, r0
  5604. #else
  5605. nop __LINE__
  5606. #endif
  5607. }
  5608. { .mfb
  5609. mov AOFFSET = A
  5610. mov f73 = f0
  5611. (p6) br.cond.dpnt .L999
  5612. }
  5613. ;;
  5614. { .mfi
  5615. setf.d f74 = r0
  5616. mov f67 = f0
  5617. nop __LINE__
  5618. }
  /* p6 is reused here: set when I == 0, in which case skip to .L100. */
  5619. { .mfb
  5620. cmp.eq p6, p7 = 0, I
  5621. mov f75 = f0
  5622. (p6) br.cond.dpnt .L100
  5623. }
  5624. ;;
  /* .L092: setup for the loop at .L093. Positions AOFFSET and BOFFSET, */
  /* preloads one B pair (f48/f49) and four A pairs (f32-f39), issues a */
  /* prefetch on C, sets C5 = C1 + 4 * SIZE and programs ar.lc. */
  5625. .align 16
  5626. .L092:
  /* Variant 1: B is read from its start and AOFFSET is used unchanged. */
  5627. #if !defined(TRMMKERNEL) || \
  5628. defined(TRMMKERNEL) && \
  5629. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5630. { .mfb
  5631. LDFPD f48, f49 = [B]
  5632. nop __LINE__
  5633. }
  5634. { .mfi
  5635. adds BOFFSET = 2 * SIZE, B
  5636. #ifndef TRMMKERNEL
  5637. nop __LINE__
  5638. #else
  /* TRMM builds: L = K - KK, KK + 4 (LEFT) or KK + 1 (otherwise). */
  5639. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5640. sub L = K, KK
  5641. #elif defined(LEFT)
  5642. adds L = 4, KK
  5643. #else
  5644. adds L = 1, KK
  5645. #endif
  5646. #endif
  5647. }
  5648. ;;
  5649. #else
  /* Variant 2 (remaining TRMM cases): BOFFSET = B + KK8 and */
  /* AOFFSET += KK8 * 4 before the first loads. */
  5650. { .mfi
  5651. add BOFFSET = KK8, B
  5652. shladd AOFFSET = KK8, 2, AOFFSET
  5653. }
  5654. ;;
  5655. { .mfi
  5656. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5657. #ifndef TRMMKERNEL
  5658. nop __LINE__
  5659. #else
  5660. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5661. sub L = K, KK
  5662. #elif defined(LEFT)
  5663. adds L = 4, KK
  5664. #else
  5665. adds L = 1, KK
  5666. #endif
  5667. #endif
  5668. }
  5669. ;;
  5670. #endif
  /* L becomes (k count + 1): K + 1 for GEMM, L + 1 for TRMM. */
  5671. { .mfi
  5672. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5673. #ifndef TRMMKERNEL
  5674. adds L = 1, K
  5675. #else
  5676. adds L = 1, L
  5677. #endif
  5678. }
  5679. ;;
  /* p12 is set when bit 0 of L is clear. The shr in the next bundle has */
  /* no stop bit before it, so tbit.z tests the value of L before the shift. */
  5680. { .mfi
  5681. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5682. tbit.z p12, p0 = L, 0
  5683. }
  5684. { .mfi
  5685. adds PREC = CPREFETCHSIZE * SIZE, C1
  5686. shr L = L, 1
  5687. }
  5688. ;;
  /* The loop counter is (L >> 1) - 1, because br.cloop runs ar.lc + 1 times. */
  5689. { .mfi
  5690. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5691. adds L = -1, L
  5692. }
  5693. { .mmf
  5694. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  5695. CPREFETCH [PREC]
  5696. }
  5697. ;;
  5698. { .mfi
  5699. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  5700. mov ar.lc = L
  5701. }
  /* r0 == r0 always holds, so p3 starts out set. */
  5702. { .mmi
  5703. adds C5 = 4 * SIZE, C1
  5704. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5705. cmp.eq p3, p0 = r0, r0
  5706. }
  5707. ;;
  /*
   * .L093: inner loop counted by ar.lc (br.cloop).  Four A pairs are
   * multiplied by one B pair into the accumulators f64/f65, f80/f81,
   * f96/f97 and f112/f113.  Each trip has two halves:
   *   - first half, unconditional: operands f32-f39 and f48/f49;
   *   - second half, under p3:     operands f40-f47 and f56/f57.
   * Predicates, recomputed from L at the top of every trip:
   *   p4 = (L != 0)  reload the first-half operands for the next trip;
   *   p5 = (L == 0)  last trip; in the !TRMMKERNEL && !BETAZERO build
   *                  the C values are loaded into f72-f75 via C1 and
   *                  f76-f79 via C5 (three +SIZE steps, then -3 * SIZE,
   *                  so both pointers end where they started);
   *   p3 = (L != 0)  only when p12 is set, which drops the second half
   *                  on the last trip; otherwise p3 keeps its value.
   * L is decremented at the bottom of the trip.
   * FMA, FMA_A and FMA_B are macros defined outside this chunk.
   */
  5708. .align 16
  5709. .L093:
  5710. /* 1 */
  5711. { .mfi
  5712. lfetch.nt1 [PREA], 16 * SIZE
  5713. FMA f64 = f32, f48, f64 // A1 * B1
  5714. cmp.ne p4, p5 = 0, L
  5715. }
  5716. { .mfi
  5717. nop __LINE__
  5718. FMA_B f65 = f32, f49, f65 // A1 * B2
  5719. (p12) cmp.ne p3, p0 = 0, L
  5720. }
  5721. ;;
  5722. { .mfi
  5723. lfetch.nt1 [PREB], 4 * SIZE
  5724. FMA f80 = f34, f48, f80 // A3 * B1
  5725. nop __LINE__
  5726. }
  5727. { .mfi
  5728. nop __LINE__
  5729. FMA_B f81 = f34, f49, f81 // A3 * B2
  5730. nop __LINE__
  5731. }
  5732. ;;
  5733. { .mfi
  5734. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5735. FMA f96 = f36, f48, f96 // A5 * B1
  5736. nop __LINE__
  5737. }
  5738. { .mfi
  5739. nop __LINE__
  5740. FMA_B f97 = f36, f49, f97 // A5 * B2
  5741. nop __LINE__
  5742. }
  5743. ;;
  5744. { .mfb
  5745. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5746. FMA f112 = f38, f48, f112 // A7 * B1
  5747. nop __LINE__
  5748. }
  5749. { .mfb
  5750. nop __LINE__
  5751. FMA_B f113 = f38, f49, f113 // A7 * B2
  5752. nop __LINE__
  5753. }
  5754. ;;
  5755. { .mfb
  5756. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5757. FMA f65 = f33, f48, f65 // A2 * B1
  5758. nop __LINE__
  5759. }
  5760. { .mfb
  5761. nop __LINE__
  5762. FMA_A f64 = f33, f49, f64 // A2 * B2
  5763. nop __LINE__
  5764. }
  5765. ;;
  5766. { .mfb
  5767. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  5768. FMA f81 = f35, f48, f81 // A4 * B1
  5769. nop __LINE__
  5770. }
  5771. { .mfb
  5772. nop __LINE__
  5773. FMA_A f80 = f35, f49, f80 // A4 * B2
  5774. nop __LINE__
  5775. }
  5776. ;;
  5777. { .mfb
  5778. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  5779. FMA f97 = f37, f48, f97 // A6 * B1
  5780. nop __LINE__
  5781. }
  5782. { .mfb
  5783. nop __LINE__
  5784. FMA_A f96 = f37, f49, f96 // A6 * B2
  5785. nop __LINE__
  5786. }
  5787. ;;
  5788. { .mfb
  5789. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5790. FMA f113 = f39, f48, f113 // A8 * B1
  5791. nop __LINE__
  5792. }
  5793. { .mfb
  5794. nop __LINE__
  5795. FMA_A f112 = f39, f49, f112 // A8 * B2
  5796. nop __LINE__
  5797. }
  5798. ;;
  /* Second half of the trip: every FMA below is predicated on p3. */
  5799. { .mfb
  5800. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5801. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5802. nop __LINE__
  5803. }
  5804. { .mfb
  5805. nop __LINE__
  5806. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  5807. nop __LINE__
  5808. }
  5809. ;;
  5810. { .mfb
  5811. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5812. (p3) FMA f80 = f42, f56, f80 // A3 * B1
  5813. nop __LINE__
  5814. }
  5815. { .mfb
  5816. nop __LINE__
  5817. (p3) FMA_B f81 = f42, f57, f81 // A3 * B2
  5818. nop __LINE__
  5819. }
  5820. ;;
  5821. { .mfb
  5822. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  5823. (p3) FMA f96 = f44, f56, f96 // A5 * B1
  5824. nop __LINE__
  5825. }
  5826. { .mfb
  5827. nop __LINE__
  5828. (p3) FMA_B f97 = f44, f57, f97 // A5 * B2
  5829. nop __LINE__
  5830. }
  5831. ;;
  5832. { .mfb
  5833. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  5834. (p3) FMA f112 = f46, f56, f112 // A7 * B1
  5835. nop __LINE__
  5836. }
  5837. { .mfb
  5838. nop __LINE__
  5839. (p3) FMA_B f113 = f46, f57, f113 // A7 * B2
  5840. nop __LINE__
  5841. }
  5842. ;;
  5843. { .mfb
  5844. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5845. (p5) LDFD f72 = [C1 ], SIZE
  5846. #else
  5847. nop __LINE__
  5848. #endif
  5849. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5850. nop __LINE__
  5851. }
  5852. { .mfb
  5853. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5854. (p5) LDFD f76 = [C5 ], SIZE
  5855. #else
  5856. nop __LINE__
  5857. #endif
  5858. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  5859. nop __LINE__
  5860. }
  5861. ;;
  5862. { .mfb
  5863. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5864. (p5) LDFD f73 = [C1 ], SIZE
  5865. #else
  5866. nop __LINE__
  5867. #endif
  5868. (p3) FMA f81 = f43, f56, f81 // A4 * B1
  5869. nop __LINE__
  5870. }
  5871. { .mfb
  5872. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5873. (p5) LDFD f77 = [C5 ], SIZE
  5874. #else
  5875. nop __LINE__
  5876. #endif
  5877. (p3) FMA_A f80 = f43, f57, f80 // A4 * B2
  5878. nop __LINE__
  5879. }
  5880. ;;
  5881. { .mfb
  5882. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5883. (p5) LDFD f74 = [C1 ], SIZE
  5884. #else
  5885. nop __LINE__
  5886. #endif
  5887. (p3) FMA f97 = f45, f56, f97 // A6 * B1
  5888. nop __LINE__
  5889. }
  5890. { .mfb
  5891. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5892. (p5) LDFD f78 = [C5 ], SIZE
  5893. #else
  5894. nop __LINE__
  5895. #endif
  5896. (p3) FMA_A f96 = f45, f57, f96 // A6 * B2
  5897. nop __LINE__
  5898. }
  5899. ;;
  5900. { .mfi
  5901. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5902. (p5) LDFD f75 = [C1 ], -3 * SIZE
  5903. #else
  5904. nop __LINE__
  5905. #endif
  5906. (p3) FMA f113 = f47, f56, f113 // A8 * B1
  5907. adds L = -1, L
  5908. }
  5909. { .mfb
  5910. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5911. (p5) LDFD f79 = [C5 ], -3 * SIZE
  5912. #else
  5913. nop __LINE__
  5914. #endif
  5915. (p3) FMA_A f112 = f47, f57, f112 // A8 * B2
  5916. br.cloop.sptk.few .L093
  5917. }
  5918. ;;
  /*
   * Write-back for the tile accumulated by .L093.
   *
   * !TRMMKERNEL && !BETAZERO build: the C values loaded on the last loop
   * trip (f72-f79) are updated by an ALPHA_R step and then an ALPHA_I
   * step, and stored back through C1 (f72-f75) and C5 (f76-f79).
   *
   * Other builds: the same two steps start from f0 instead of the loaded
   * C values, and f6 takes the place of f72.  The integer slots carry the
   * TRMM bookkeeping:
   *   (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *       L = K - KK, less 4 for LEFT && TRANSA or less 1 for
   *       !LEFT && !TRANSA; KK8 = L << ZBASE_SHIFT;
   *       AOFFSET += KK8 << 2; BOFFSET += KK8.
   *   LEFT:          KK += 4.
   *   every TRMM:    KK8 = KK << ZBASE_SHIFT.
   *
   * Both paths clear the eight accumulators only after their last read,
   * decrement I, and branch back to .L092 while the pre-decrement value
   * of I differs from 1.  The stores advance C1 and C5 by 8 * SIZE each.
   * FCALC_C and FCALC_D are macros defined outside this chunk.
   */
  5919. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5920. { .mfb
  5921. nop __LINE__
  5922. FMA f72 = ALPHA_R, f64, f72
  5923. nop __LINE__
  5924. }
  5925. { .mfb
  5926. nop __LINE__
  5927. FMA f76 = ALPHA_R, f96, f76
  5928. nop __LINE__
  5929. }
  5930. { .mfb
  5931. nop __LINE__
  5932. FCALC_C f73 = ALPHA_R, f65, f73
  5933. nop __LINE__
  5934. }
  5935. { .mfb
  5936. nop __LINE__
  5937. FCALC_C f77 = ALPHA_R, f97, f77
  5938. nop __LINE__
  5939. }
  5940. { .mfb
  5941. nop __LINE__
  5942. FMA f74 = ALPHA_R, f80, f74
  5943. nop __LINE__
  5944. }
  5945. { .mfb
  5946. nop __LINE__
  5947. FMA f78 = ALPHA_R, f112, f78
  5948. nop __LINE__
  5949. }
  5950. { .mfb
  5951. nop __LINE__
  5952. FCALC_C f75 = ALPHA_R, f81, f75
  5953. nop __LINE__
  5954. }
  5955. { .mfb
  5956. nop __LINE__
  5957. FCALC_C f79 = ALPHA_R, f113, f79
  5958. nop __LINE__
  5959. }
  5960. ;;
  5961. { .mfb
  5962. nop __LINE__
  5963. FCALC_D f72 = ALPHA_I, f65, f72
  5964. nop __LINE__
  5965. }
  5966. { .mfb
  5967. nop __LINE__
  5968. FCALC_D f76 = ALPHA_I, f97, f76
  5969. nop __LINE__
  5970. }
  5971. { .mfb
  5972. nop __LINE__
  5973. FMA f73 = ALPHA_I, f64, f73
  5974. nop __LINE__
  5975. }
  5976. { .mfb
  5977. nop __LINE__
  5978. FMA f77 = ALPHA_I, f96, f77
  5979. nop __LINE__
  5980. }
  5981. { .mfb
  5982. nop __LINE__
  5983. FCALC_D f74 = ALPHA_I, f81, f74
  5984. nop __LINE__
  5985. }
  5986. { .mfb
  5987. nop __LINE__
  5988. FCALC_D f78 = ALPHA_I, f113, f78
  5989. nop __LINE__
  5990. }
  5991. { .mfb
  5992. nop __LINE__
  5993. FMA f75 = ALPHA_I, f80, f75
  5994. nop __LINE__
  5995. }
  5996. { .mfb
  5997. nop __LINE__
  5998. FMA f79 = ALPHA_I, f112, f79
  5999. nop __LINE__
  6000. }
  6001. ;;
  /* Stores interleaved with the accumulator clears; p6 = (I != 1) is
     taken before I is decremented. */
  6002. { .mfi
  6003. STFD [C1] = f72, SIZE
  6004. mov f64 = f0
  6005. cmp.ne p6, p0 = 1, I
  6006. }
  6007. { .mfb
  6008. STFD [C5] = f76, SIZE
  6009. mov f65 = f0
  6010. nop __LINE__
  6011. }
  6012. ;;
  6013. { .mfi
  6014. STFD [C1] = f73, SIZE
  6015. mov f80 = f0
  6016. adds I = -1, I
  6017. }
  6018. { .mfb
  6019. STFD [C5] = f77, SIZE
  6020. mov f81 = f0
  6021. nop __LINE__
  6022. }
  6023. ;;
  6024. { .mfb
  6025. STFD [C1] = f74, SIZE
  6026. mov f96 = f0
  6027. nop __LINE__
  6028. }
  6029. { .mfb
  6030. STFD [C5] = f78, SIZE
  6031. mov f97 = f0
  6032. nop __LINE__
  6033. }
  6034. ;;
  6035. { .mfi
  6036. STFD [C1] = f75, 5 * SIZE
  6037. mov f112 = f0
  6038. }
  6039. { .mfb
  6040. STFD [C5] = f79, 5 * SIZE
  6041. mov f113 = f0
  6042. (p6) br.cond.dptk .L092
  6043. }
  6044. ;;
  6045. #else
  6046. { .mfb
  6047. nop __LINE__
  6048. FMA f6 = ALPHA_R, f64, f0
  6049. nop __LINE__
  6050. }
  6051. { .mfb
  6052. nop __LINE__
  6053. FMA f76 = ALPHA_R, f96, f0
  6054. nop __LINE__
  6055. }
  6056. { .mfb
  6057. nop __LINE__
  6058. FCALC_C f73 = ALPHA_R, f65, f0
  6059. nop __LINE__
  6060. }
  6061. { .mfb
  6062. nop __LINE__
  6063. FCALC_C f77 = ALPHA_R, f97, f0
  6064. nop __LINE__
  6065. }
  6066. { .mfb
  6067. nop __LINE__
  6068. FMA f74 = ALPHA_R, f80, f0
  6069. nop __LINE__
  6070. }
  6071. { .mfb
  6072. nop __LINE__
  6073. FMA f78 = ALPHA_R, f112, f0
  6074. nop __LINE__
  6075. }
  6076. { .mfb
  6077. nop __LINE__
  6078. FCALC_C f75 = ALPHA_R, f81, f0
  6079. nop __LINE__
  6080. }
  6081. { .mfb
  6082. nop __LINE__
  6083. FCALC_C f79 = ALPHA_R, f113, f0
  6084. nop __LINE__
  6085. }
  6086. ;;
  6087. { .mfb
  6088. nop __LINE__
  6089. FCALC_D f6 = ALPHA_I, f65, f6
  6090. nop __LINE__
  6091. }
  6092. { .mfb
  6093. nop __LINE__
  6094. FCALC_D f76 = ALPHA_I, f97, f76
  6095. nop __LINE__
  6096. }
  6097. { .mfb
  6098. nop __LINE__
  6099. FMA f73 = ALPHA_I, f64, f73
  6100. nop __LINE__
  6101. }
  6102. { .mfb
  6103. nop __LINE__
  6104. FMA f77 = ALPHA_I, f96, f77
  6105. nop __LINE__
  6106. }
  6107. ;;
  6108. { .mfi
  6109. nop __LINE__
  6110. FCALC_D f74 = ALPHA_I, f81, f74
  6111. #if defined(TRMMKERNEL) && \
  6112. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6113. sub L = K, KK
  6114. #else
  6115. nop __LINE__
  6116. #endif
  6117. }
  6118. { .mfb
  6119. nop __LINE__
  6120. FCALC_D f78 = ALPHA_I, f113, f78
  6121. nop __LINE__
  6122. }
  6123. ;;
  6124. { .mfi
  6125. nop __LINE__
  6126. FMA f75 = ALPHA_I, f80, f75
  6127. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6128. adds L = -4, L
  6129. #else
  6130. nop __LINE__
  6131. #endif
  6132. }
  6133. { .mfi
  6134. nop __LINE__
  6135. FMA f79 = ALPHA_I, f112, f79
  6136. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6137. adds L = -1, L
  6138. #else
  6139. nop __LINE__
  6140. #endif
  6141. }
  6142. ;;
  6143. { .mfi
  6144. STFD [C1] = f6, SIZE
  6145. mov f64 = f0
  6146. #if defined(TRMMKERNEL) && \
  6147. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6148. shladd KK8 = L, ZBASE_SHIFT, r0
  6149. #else
  6150. nop __LINE__
  6151. #endif
  6152. }
  6153. { .mfi
  6154. STFD [C5] = f76, SIZE
  6155. mov f65 = f0
  6156. cmp.ne p6, p0 = 1, I
  6157. }
  6158. ;;
  6159. { .mfi
  6160. STFD [C1] = f73, SIZE
  6161. mov f80 = f0
  6162. #if defined(TRMMKERNEL) && \
  6163. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6164. shladd AOFFSET = KK8, 2, AOFFSET
  6165. #else
  6166. nop __LINE__
  6167. #endif
  6168. }
  6169. { .mfi
  6170. STFD [C5] = f77, SIZE
  6171. mov f81 = f0
  6172. #if defined(TRMMKERNEL) && \
  6173. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6174. add BOFFSET = KK8, BOFFSET
  6175. #else
  6176. nop __LINE__
  6177. #endif
  6178. }
  6179. ;;
  6180. { .mfi
  6181. STFD [C1] = f74, SIZE
  6182. mov f96 = f0
  6183. #if defined(TRMMKERNEL) && defined(LEFT)
  6184. adds KK = 4, KK
  6185. #else
  6186. nop __LINE__
  6187. #endif
  6188. }
  6189. { .mfi
  6190. STFD [C5] = f78, SIZE
  6191. mov f97 = f0
  6192. adds I = -1, I
  6193. }
  6194. ;;
  6195. { .mfi
  6196. STFD [C1] = f75, 5 * SIZE
  6197. mov f112 = f0
  6198. #ifdef TRMMKERNEL
  6199. shladd KK8 = KK, ZBASE_SHIFT, r0
  6200. #else
  6201. nop __LINE__
  6202. #endif
  6203. }
  6204. { .mfb
  6205. STFD [C5] = f79, 5 * SIZE
  6206. mov f113 = f0
  6207. (p6) br.cond.dptk .L092
  6208. }
  6209. ;;
  6210. #endif
  /*
   * .L100: pass gated on bit 1 of M.  tbit.z sets p6 when that bit is
   * clear, and control then goes to .L110.  The TRMM start value of L
   * shares the bundle with the test, so it is computed whether or not
   * the branch is taken:
   *   (LEFT && !TRANSA) or (!LEFT && TRANSA): L = K - KK
   *   otherwise LEFT: L = KK + 2;  otherwise: L = KK + 1.
   *
   * Set-up for the .L102 loop:
   *  - plain build and TRMM with (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *    first B pair from [B] into f48/f49, BOFFSET = B + 2 * SIZE;
   *  - remaining TRMM variants: BOFFSET = B + KK8,
   *    AOFFSET += KK8 << 1, then the first B pair through BOFFSET;
   *  - L becomes K + 1 (plain) or L + 1 (TRMM); p12 is set when bit 0 of
   *    that value is clear; ar.lc = (L >> 1) - 1;
   *  - two A pairs are preloaded into f32-f35, PREA is the A prefetch
   *    cursor, and p3 starts out true.
   */
  6211. .align 16
  6212. .L100:
  6213. { .mib
  6214. #ifndef TRMMKERNEL
  6215. nop __LINE__
  6216. #else
  6217. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6218. sub L = K, KK
  6219. #elif defined(LEFT)
  6220. adds L = 2, KK
  6221. #else
  6222. adds L = 1, KK
  6223. #endif
  6224. #endif
  6225. tbit.z p6, p7 = M, 1
  6226. (p6) br.cond.dptk .L110
  6227. }
  6228. ;;
  6229. #if !defined(TRMMKERNEL) || \
  6230. defined(TRMMKERNEL) && \
  6231. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6232. { .mmi
  6233. LDFPD f48, f49 = [B]
  6234. adds BOFFSET = 2 * SIZE, B
  6235. #ifndef TRMMKERNEL
  6236. adds L = 1, K
  6237. #else
  6238. adds L = 1, L
  6239. #endif
  6240. }
  6241. ;;
  6242. #else
  6243. { .mii
  6244. add BOFFSET = KK8, B
  6245. shladd AOFFSET = KK8, 1, AOFFSET
  6246. nop __LINE__
  6247. }
  6248. ;;
  6249. { .mfi
  6250. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6251. #ifndef TRMMKERNEL
  6252. adds L = 1, K
  6253. #else
  6254. adds L = 1, L
  6255. #endif
  6256. }
  6257. ;;
  6258. #endif
  6259. { .mii
  6260. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6261. tbit.z p12, p0 = L, 0
  6262. shr L = L, 1
  6263. }
  6264. ;;
  6265. { .mmi
  6266. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6267. nop __LINE__
  6268. adds L = -1, L
  6269. }
  6270. ;;
  6271. { .mmi
  6272. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6273. cmp.eq p3, p0 = r0, r0
  6274. mov ar.lc = L
  6275. }
  6276. ;;
  /*
   * .L102: inner loop counted by ar.lc (br.cloop).  Two A pairs are
   * multiplied by one B pair.  Every one of the eight products goes into
   * its own accumulator with a plain FMA:
   *   f64 += f32*f48   f80 += f32*f49   f65 += f33*f48   f81 += f33*f49
   *   f96 += f34*f48   f112 += f34*f49  f97 += f35*f48   f113 += f35*f49
   * The partial sums are merged after the loop (FCALC_A / FCALC_B).
   * The second half of the trip repeats this under p3 with f40-f43 and
   * f56/f57.
   * Predicates: p4 = (L != 0) reloads the first-half operands,
   * p5 = (L == 0) loads f72-f75 through C1 on the last trip
   * (!TRMMKERNEL && !BETAZERO build only; C1 ends where it started), and
   * p3 = (L != 0) when p12 is set.
   * PREB is re-derived from BOFFSET on every trip before its lfetch.
   */
  6277. .align 16
  6278. .L102:
  6279. { .mfi
  6280. lfetch.nt1 [PREA], 8 * SIZE
  6281. FMA f64 = f32, f48, f64 // A1 * B1
  6282. cmp.ne p4, p5 = 0, L
  6283. }
  6284. { .mfi
  6285. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6286. FMA f80 = f32, f49, f80 // A1 * B2
  6287. (p12) cmp.ne p3, p0 = 0, L
  6288. }
  6289. ;;
  6290. { .mfb
  6291. lfetch.nt1 [PREB], 4 * SIZE
  6292. FMA f65 = f33, f48, f65 // A2 * B1
  6293. nop __LINE__
  6294. }
  6295. { .mfb
  6296. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6297. (p5) LDFD f72 = [C1 ], SIZE
  6298. #else
  6299. nop __LINE__
  6300. #endif
  6301. FMA f81 = f33, f49, f81 // A2 * B2
  6302. nop __LINE__
  6303. }
  6304. ;;
  6305. { .mfb
  6306. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6307. FMA f96 = f34, f48, f96 // A3 * B1
  6308. nop __LINE__
  6309. }
  6310. { .mfb
  6311. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6312. (p5) LDFD f73 = [C1 ], SIZE
  6313. #else
  6314. nop __LINE__
  6315. #endif
  6316. FMA f112 = f34, f49, f112 // A3 * B2
  6317. nop __LINE__
  6318. }
  6319. ;;
  6320. { .mfb
  6321. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6322. FMA f97 = f35, f48, f97 // A4 * B1
  6323. nop __LINE__
  6324. }
  6325. { .mfb
  6326. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6327. (p5) LDFD f74 = [C1 ], SIZE
  6328. #else
  6329. nop __LINE__
  6330. #endif
  6331. FMA f113 = f35, f49, f113 // A4 * B2
  6332. nop __LINE__
  6333. }
  6334. ;;
  6335. { .mfb
  6336. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  6337. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6338. nop __LINE__
  6339. }
  6340. { .mfb
  6341. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6342. (p5) LDFD f75 = [C1 ], -3 * SIZE
  6343. #else
  6344. nop __LINE__
  6345. #endif
  6346. (p3) FMA f80 = f40, f57, f80 // A1 * B2
  6347. nop __LINE__
  6348. }
  6349. ;;
  6350. { .mfb
  6351. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6352. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6353. nop __LINE__
  6354. }
  6355. { .mfb
  6356. nop __LINE__
  6357. (p3) FMA f81 = f41, f57, f81 // A2 * B2
  6358. nop __LINE__
  6359. }
  6360. ;;
  6361. { .mfb
  6362. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6363. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  6364. nop __LINE__
  6365. }
  6366. { .mfb
  6367. nop __LINE__
  6368. (p3) FMA f112 = f42, f57, f112 // A3 * B2
  6369. nop __LINE__
  6370. }
  6371. ;;
  6372. { .mfi
  6373. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6374. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  6375. adds L = -1, L
  6376. }
  6377. { .mfb
  6378. nop __LINE__
  6379. (p3) FMA f113 = f43, f57, f113 // A4 * B2
  6380. br.cloop.sptk.few .L102
  6381. }
  6382. ;;
  /*
   * Write-back for the tile accumulated by .L102.
   *
   * The eight partial sums are first merged into two result pairs:
   *   f64 = FCALC_A(f64, f81)    f65 = FCALC_B(f65, f80)
   *   f96 = FCALC_A(f96, f113)   f97 = FCALC_B(f97, f112)
   * and then scaled in an ALPHA_R step followed by an ALPHA_I step.
   *
   * !TRMMKERNEL && !BETAZERO build: the steps update the C values loaded
   * on the last loop trip (f72-f75); the result is stored through C1,
   * and L is left at (K + 1) >> 1.
   *
   * Other builds: the steps start from f0.  The integer slots carry the
   * TRMM bookkeeping:
   *   (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *       L = K - KK, less 2 for LEFT && TRANSA or less 1 for
   *       !LEFT && !TRANSA; KK8 = L << ZBASE_SHIFT;
   *       AOFFSET += KK8 << 1; BOFFSET += KK8.
   *   LEFT:          KK += 2.
   *   every TRMM:    KK8 = KK << ZBASE_SHIFT.
   *
   * Both paths leave f64, f65, f80, f81, f96, f97, f112 and f113 cleared.
   * FCALC_A .. FCALC_D are macros defined outside this chunk.
   */
  6383. { .mfb
  6384. nop __LINE__
  6385. FCALC_A f64 = f64, f81
  6386. nop __LINE__
  6387. }
  6388. { .mfb
  6389. nop __LINE__
  6390. FCALC_B f65 = f65, f80
  6391. nop __LINE__
  6392. }
  6393. { .mfb
  6394. nop __LINE__
  6395. FCALC_A f96 = f96, f113
  6396. nop __LINE__
  6397. }
  6398. { .mfb
  6399. nop __LINE__
  6400. FCALC_B f97 = f97, f112
  6401. nop __LINE__
  6402. }
  6403. ;;
  6404. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6405. { .mfb
  6406. nop __LINE__
  6407. FMA f72 = ALPHA_R, f64, f72
  6408. nop __LINE__
  6409. }
  6410. { .mfb
  6411. nop __LINE__
  6412. FCALC_C f73 = ALPHA_R, f65, f73
  6413. nop __LINE__
  6414. }
  6415. { .mfb
  6416. nop __LINE__
  6417. FMA f74 = ALPHA_R, f96, f74
  6418. nop __LINE__
  6419. }
  6420. { .mfb
  6421. nop __LINE__
  6422. FCALC_C f75 = ALPHA_R, f97, f75
  6423. nop __LINE__
  6424. }
  6425. ;;
  6426. { .mfb
  6427. nop __LINE__
  6428. FCALC_D f72 = ALPHA_I, f65, f72
  6429. nop __LINE__
  6430. }
  6431. { .mfb
  6432. nop __LINE__
  6433. FMA f73 = ALPHA_I, f64, f73
  6434. nop __LINE__
  6435. }
  6436. { .mfb
  6437. setf.d f112 = r0
  6438. FCALC_D f74 = ALPHA_I, f97, f74
  6439. nop __LINE__
  6440. }
  6441. { .mfb
  6442. setf.d f113 = r0
  6443. FMA f75 = ALPHA_I, f96, f75
  6444. nop __LINE__
  6445. }
  6446. ;;
  6447. { .mmf
  6448. STFD [C1] = f72, SIZE
  6449. setf.d f97 = r0
  6450. mov f64 = f0
  6451. }
  6452. ;;
  6453. { .mmf
  6454. STFD [C1] = f73, SIZE
  6455. setf.d f96 = r0
  6456. mov f80 = f0
  6457. }
  6458. ;;
  6459. { .mfi
  6460. STFD [C1] = f74, SIZE
  6461. mov f65 = f0
  6462. adds L = 1, K
  6463. }
  6464. ;;
  6465. { .mfi
  6466. STFD [C1] = f75, SIZE
  6467. mov f81 = f0
  6468. shr L = L, 1
  6469. }
  6470. ;;
  6471. #else
  /*
   * Schedule of this path (one line per instruction group):
   *   1. ALPHA_R step for f74 / f75           (reads f96, f97)
   *   2. ALPHA_R step for f72 / f73           (reads f64, f65)
   *   3. ALPHA_I step for f74 / f75           (last read of f96, f97)
   *   4. ALPHA_I step for f72 / f73, and f97 / f96 are cleared
   * f96 and f97 must not be cleared before group 4: clearing them ahead
   * of the ALPHA_I step for f74 / f75 would make that step read zeros
   * and drop the ALPHA_I contribution from the second stored pair.
   * The integer-slot TRMM bookkeeping stays in its original bundles.
   */
  6472. { .mfb
  6473. nop __LINE__
  6474. FMA f74 = ALPHA_R, f96, f0
  6475. nop __LINE__
  6476. }
  6477. { .mfb
  6478. nop __LINE__
  6479. FCALC_C f75 = ALPHA_R, f97, f0
  6480. nop __LINE__
  6481. }
  6482. ;;
  6483. { .mfi
  6484. setf.d f112 = r0
  6485. FMA f72 = ALPHA_R, f64, f0
  6486. #if defined(TRMMKERNEL) && \
  6487. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6488. sub L = K, KK
  6489. #else
  6490. nop __LINE__
  6491. #endif
  6492. }
  6493. { .mfb
  6494. setf.d f113 = r0
  6495. FCALC_C f73 = ALPHA_R, f65, f0
  6496. nop __LINE__
  6497. }
  6498. ;;
  6499. { .mfi
  6500. nop __LINE__
  6501. FCALC_D f74 = ALPHA_I, f97, f74
  6502. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6503. adds L = -2, L
  6504. #else
  6505. nop __LINE__
  6506. #endif
  6507. }
  6508. { .mfi
  6509. nop __LINE__
  6510. FMA f75 = ALPHA_I, f96, f75
  6511. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6512. adds L = -1, L
  6513. #else
  6514. nop __LINE__
  6515. #endif
  6516. }
  6517. ;;
  6518. { .mfi
  6519. setf.d f97 = r0
  6520. FCALC_D f72 = ALPHA_I, f65, f72
  6521. #if defined(TRMMKERNEL) && \
  6522. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6523. shladd KK8 = L, ZBASE_SHIFT, r0
  6524. #else
  6525. nop __LINE__
  6526. #endif
  6527. }
  6528. { .mfb
  6529. setf.d f96 = r0
  6530. FMA f73 = ALPHA_I, f64, f73
  6531. nop __LINE__
  6532. }
  6533. ;;
  6534. { .mfi
  6535. STFD [C1] = f72, SIZE
  6536. mov f64 = f0
  6537. #if defined(TRMMKERNEL) && \
  6538. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6539. shladd AOFFSET = KK8, 1, AOFFSET
  6540. #else
  6541. nop __LINE__
  6542. #endif
  6543. }
  6544. ;;
  6545. { .mfi
  6546. STFD [C1] = f73, SIZE
  6547. mov f80 = f0
  6548. #if defined(TRMMKERNEL) && \
  6549. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6550. add BOFFSET = KK8, BOFFSET
  6551. #else
  6552. nop __LINE__
  6553. #endif
  6554. }
  6555. ;;
  6556. { .mfi
  6557. STFD [C1] = f74, SIZE
  6558. mov f65 = f0
  6559. #if defined(TRMMKERNEL) && defined(LEFT)
  6560. adds KK = 2, KK
  6561. #else
  6562. nop __LINE__
  6563. #endif
  6564. }
  6565. ;;
  6566. { .mfi
  6567. STFD [C1] = f75, SIZE
  6568. mov f81 = f0
  6569. #ifdef TRMMKERNEL
  6570. shladd KK8 = KK, ZBASE_SHIFT, r0
  6571. #else
  6572. nop __LINE__
  6573. #endif
  6574. }
  6575. ;;
  6576. #endif
  /*
   * .L110: pass gated on bit 0 of M.  tbit.z sets p6 when that bit is
   * clear, and control then goes to .L119.  The TRMM start value of L
   * shares the bundle with the test:
   *   (LEFT && !TRANSA) or (!LEFT && TRANSA): L = K - KK
   *   otherwise: L = KK + 1 (the LEFT and non-LEFT arms are identical).
   *
   * Set-up for the .L112 loop:
   *  - plain build and TRMM with (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *    first B pair from [B] into f48/f49, BOFFSET = B + 2 * SIZE;
   *  - remaining TRMM variants: BOFFSET = B + KK8, AOFFSET += KK8,
   *    then the first B pair through BOFFSET;
   *  - L becomes K + 1 (plain) or L + 1 (TRMM); p12 is set when bit 0 of
   *    that value is clear; ar.lc = (L >> 1) - 1;
   *  - one A pair is preloaded into f32/f33, PREA / PREB are the
   *    prefetch cursors, and p3 starts out true.
   */
  6577. .align 16
  6578. .L110:
  6579. { .mib
  6580. #ifndef TRMMKERNEL
  6581. nop __LINE__
  6582. #else
  6583. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6584. sub L = K, KK
  6585. #elif defined(LEFT)
  6586. adds L = 1, KK
  6587. #else
  6588. adds L = 1, KK
  6589. #endif
  6590. #endif
  6591. tbit.z p6, p7 = M, 0
  6592. (p6) br.cond.dptk .L119
  6593. }
  6594. ;;
  6595. #if !defined(TRMMKERNEL) || \
  6596. defined(TRMMKERNEL) && \
  6597. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6598. { .mmi
  6599. LDFPD f48, f49 = [B]
  6600. adds BOFFSET = 2 * SIZE, B
  6601. #ifndef TRMMKERNEL
  6602. adds L = 1, K
  6603. #else
  6604. adds L = 1, L
  6605. #endif
  6606. }
  6607. ;;
  6608. #else
  6609. { .mii
  6610. add BOFFSET = KK8, B
  6611. add AOFFSET = KK8, AOFFSET
  6612. nop __LINE__
  6613. }
  6614. ;;
  6615. { .mfi
  6616. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6617. #ifndef TRMMKERNEL
  6618. adds L = 1, K
  6619. #else
  6620. adds L = 1, L
  6621. #endif
  6622. }
  6623. ;;
  6624. #endif
  6625. ;;
  6626. { .mii
  6627. nop __LINE__
  6628. tbit.z p12, p0 = L, 0
  6629. shr L = L, 1
  6630. }
  6631. ;;
  6632. { .mmi
  6633. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6634. cmp.eq p3, p0 = r0, r0
  6635. adds L = -1, L
  6636. }
  6637. ;;
  6638. { .mmi
  6639. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6640. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6641. mov ar.lc = L
  6642. }
  6643. ;;
  /*
   * .L112: inner loop counted by ar.lc (br.cloop).  One A pair is
   * multiplied by one B pair; the four products go into separate
   * accumulators with plain FMAs:
   *   f64 += f32*f48   f80 += f32*f49   f65 += f33*f48   f81 += f33*f49
   * The second half of the trip repeats this under p3 with f40/f41 and
   * f56/f57.
   * Predicates: p4 = (L != 0) reloads the first-half operands,
   * p5 = (L == 0) loads f72/f73 through C1 on the last trip
   * (!TRMMKERNEL && !BETAZERO build only; +SIZE then -SIZE, so C1 ends
   * where it started), and p3 = (L != 0) when p12 is set.
   */
  6644. .align 16
  6645. .L112:
  6646. { .mfi
  6647. lfetch.nt1 [PREA], 4 * SIZE
  6648. FMA f64 = f32, f48, f64 // A1 * B1
  6649. cmp.ne p4, p5 = 0, L
  6650. }
  6651. { .mfi
  6652. lfetch.nt1 [PREB], 4 * SIZE
  6653. FMA f80 = f32, f49, f80 // A1 * B2
  6654. (p12) cmp.ne p3, p0 = 0, L
  6655. }
  6656. ;;
  6657. { .mmf
  6658. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6659. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6660. FMA f65 = f33, f48, f65 // A2 * B1
  6661. }
  6662. { .mmf
  6663. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6664. (p5) LDFD f72 = [C1 ], SIZE
  6665. #else
  6666. nop __LINE__
  6667. #endif
  6668. nop __LINE__
  6669. FMA f81 = f33, f49, f81 // A2 * B2
  6670. }
  6671. ;;
  6672. { .mfb
  6673. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6674. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6675. nop __LINE__
  6676. }
  6677. { .mfb
  6678. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6679. (p5) LDFD f73 = [C1 ], -1 * SIZE
  6680. #else
  6681. nop __LINE__
  6682. #endif
  6683. (p3) FMA f80 = f40, f57, f80 // A1 * B2
  6684. nop __LINE__
  6685. }
  6686. ;;
  6687. { .mfi
  6688. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6689. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6690. adds L = -1, L
  6691. }
  6692. { .mfb
  6693. (p3) FMA f81 = f41, f57, f81 // A2 * B2
  6694. br.cloop.sptk.few .L112
  6695. }
  6696. ;;
  /*
   * Write-back for the tile accumulated by .L112.
   * The four partial sums are merged into one result pair,
   *   f64 = FCALC_A(f64, f81)    f65 = FCALC_B(f65, f80)
   * which is scaled in an ALPHA_R step followed by an ALPHA_I step and
   * stored through C1 as f72, f73.
   * The two preprocessor arms differ only in the starting value of the
   * ALPHA_R step: the loaded C values f72/f73 in the
   * !TRMMKERNEL && !BETAZERO build, f0 otherwise.
   * f64 and f65 are cleared with setf.d from r0, f80 and f81 with mov
   * from f0.
   * FCALC_A .. FCALC_D are macros defined outside this chunk.
   */
  6697. { .mfb
  6698. nop __LINE__
  6699. FCALC_A f64 = f64, f81
  6700. nop __LINE__
  6701. }
  6702. { .mfb
  6703. nop __LINE__
  6704. FCALC_B f65 = f65, f80
  6705. nop __LINE__
  6706. }
  6707. ;;
  6708. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6709. { .mfb
  6710. nop __LINE__
  6711. FMA f72 = ALPHA_R, f64, f72
  6712. nop __LINE__
  6713. }
  6714. { .mfb
  6715. nop __LINE__
  6716. FCALC_C f73 = ALPHA_R, f65, f73
  6717. nop __LINE__
  6718. }
  6719. ;;
  6720. { .mfb
  6721. nop __LINE__
  6722. FCALC_D f72 = ALPHA_I, f65, f72
  6723. nop __LINE__
  6724. }
  6725. { .mfb
  6726. nop __LINE__
  6727. FMA f73 = ALPHA_I, f64, f73
  6728. nop __LINE__
  6729. }
  6730. ;;
  6731. { .mmf
  6732. STFD [C1] = f72, SIZE
  6733. setf.d f64 = r0
  6734. mov f80 = f0
  6735. }
  6736. ;;
  6737. { .mmf
  6738. STFD [C1] = f73, SIZE
  6739. setf.d f65 = r0
  6740. mov f81 = f0
  6741. }
  6742. ;;
  6743. #else
  6744. { .mfb
  6745. nop __LINE__
  6746. FMA f72 = ALPHA_R, f64, f0
  6747. nop __LINE__
  6748. }
  6749. { .mfb
  6750. nop __LINE__
  6751. FCALC_C f73 = ALPHA_R, f65, f0
  6752. nop __LINE__
  6753. }
  6754. ;;
  6755. { .mfb
  6756. nop __LINE__
  6757. FCALC_D f72 = ALPHA_I, f65, f72
  6758. nop __LINE__
  6759. }
  6760. { .mfb
  6761. nop __LINE__
  6762. FMA f73 = ALPHA_I, f64, f73
  6763. nop __LINE__
  6764. }
  6765. ;;
  6766. { .mmf
  6767. STFD [C1] = f72, SIZE
  6768. setf.d f64 = r0
  6769. mov f80 = f0
  6770. }
  6771. ;;
  6772. { .mmf
  6773. STFD [C1] = f73, SIZE
  6774. setf.d f65 = r0
  6775. mov f81 = f0
  6776. }
  6777. ;;
  6778. #endif
  /*
   * .L119: B takes over the current BOFFSET and AOFFSET is reset to A.
   * In the TRMM build without LEFT the otherwise idle integer slot
   * advances KK by 1.  Control then falls through into .L999.
   */
  6779. .align 16
  6780. .L119:
  6781. { .mmi
  6782. mov B = BOFFSET
  6783. mov AOFFSET = A
  6784. #if defined(TRMMKERNEL) && !defined(LEFT)
  6785. adds KK = 1, KK
  6786. #else
  6787. nop __LINE__
  6788. #endif
  6789. }
  6790. ;;
  /*
   * .L999: common exit.  Restores the loop counter (ar.lc) from ARLC and
   * all predicate registers from PR (mask -1); the TRMM build also
   * restores ar.pfs from ARPFS.  Returns through b0.
   * EPILOGUE is a macro defined outside this chunk.
   */
  6791. .align 16
  6792. .L999:
  6793. { .mii
  6794. nop __LINE__
  6795. mov ar.lc = ARLC
  6796. mov pr = PR, -1
  6797. }
  6798. { .mib
  6799. nop __LINE__
  6800. #ifdef TRMMKERNEL
  6801. mov ar.pfs = ARPFS
  6802. #else
  6803. nop __LINE__
  6804. #endif
  6805. br.ret.sptk.many b0
  6806. }
  6807. EPILOGUE