You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel.S 147 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* set before common.h is included; presumably selects its assembler-side definitions -- TODO confirm */
  39. #include "common.h" /* SIZE, BASE_SHIFT, LDFPD/LDFD/STFD, FMA, PROLOGUE and PROFCODE are not defined in this listing; presumably they come from here */
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE (16 * 8) /* A/B prefetch distance in elements; multiplied by SIZE when PREA/PREB are set up */
  42. #else
  43. #define PREFETCHSIZE (32 * 8) /* same role as above for the non-DOUBLE build */
  44. #endif
  45. #define CPREFETCHSIZE 7 /* element offset from C1 at which PREC starts (PREC = C1 + CPREFETCHSIZE * SIZE) */
  46. #define CPREFETCH lfetch.excl.nt1 /* prefetch instruction issued on [PREC], stepping by LDC, while accumulators are cleared */
  47. #define M r32 /* first of the 8 input registers declared by alloc; I = M >> 3 */
  48. #define N r33 /* input register; J = N >> 3 */
  49. #define K r34 /* input register; the .L012 trip count L is derived from it (r35 is left unnamed in this block) */
  50. #define A r36 /* input register; copied to AOFFSET in the prologue */
  51. #define B r37 /* input register; base for BOFFSET and BB */
  52. #define C r38 /* input register; base of the current 8-column group, advanced by 8 * LDC per .L010 pass */
  53. #define LDC r39 /* input register; shifted left by BASE_SHIFT in the prologue before being used as a pointer stride */
  54. #define I r15 /* M >> 3; tested for zero at .L010, decremented at .L013 */
  55. #define J r16 /* N >> 3; decremented at the top of .L010 */
  56. #define AOFFSET r17 /* read pointer into A, post-incremented by the LDFPD loads */
  57. #define BOFFSET r18 /* read pointer into B, post-incremented by the LDFPD loads */
  58. #define BB r19 /* lfetch.nt1 prefetch pointer: B + ((K >> 3) << BASE_SHIFT), then bumped by 16 * SIZE */
  59. #define L r20 /* trip count for .L012; copied into ar.lc and also decremented inside the loop */
  60. #define C1 r21 /* c + 0 * ldc */
  61. #define C2 r22 /* c + 1 * ldc */
  62. #define C3 r23 /* c + 2 * ldc */
  63. #define C4 r24 /* c + 3 * ldc */
  64. #define C5 r25 /* c + 4 * ldc */
  65. #define C6 r26 /* c + 5 * ldc */
  66. #define C7 r27 /* c + 6 * ldc */
  67. #define C8 r28 /* c + 7 * ldc (computed as the advanced C minus LDC) */
  68. #define C9 loc0 /* C1 + 4 * SIZE */
  69. #define C10 loc1 /* C2 + 4 * SIZE */
  70. #define C11 loc2 /* C3 + 4 * SIZE */
  71. #define C12 loc3 /* C4 + 4 * SIZE */
  72. #define C13 loc4 /* C5 + 4 * SIZE */
  73. #define C14 loc5 /* C6 + 4 * SIZE */
  74. #define C15 loc6 /* C7 + 4 * SIZE */
  75. #define C16 loc7 /* C8 + 4 * SIZE */
  76. #define PREA r8 /* lfetch pointer ahead of AOFFSET; r8 is also used as a spill address in the prologue */
  77. #define PREB r9 /* lfetch pointer ahead of BOFFSET; r9 is also used as a spill address in the prologue */
  78. #define PREC r10 /* pointer used by CPREFETCH */
  79. #define SP r12 /* stack pointer; lowered by 16 * 16 bytes in the prologue for the f16-f31 spill area */
  80. #define ARLC r29 /* holds the entry value of ar.lc */
  81. #define PR r30 /* holds the entry value of the predicate registers */
  82. #define ARPFS r31 /* receives ar.pfs from alloc */
  83. #define ALPHA f8 /* first FMA operand in the .L013 write-back, paired with each accumulator */
  84. #define AORIG loc8 /* not referenced in the visible part of this file; loc8-loc11 exist only with the 16-local alloc used under TRMMKERNEL */
  85. #define KK loc9 /* TRMMKERNEL only: OFFSET when LEFT is defined, 0 - OFFSET otherwise */
  86. #define KK8 loc10 /* TRMMKERNEL only: KK << BASE_SHIFT */
  87. #define OFFSET loc11 /* TRMMKERNEL only: 8-byte value loaded from 16 bytes above the entry SP */
  88. PROLOGUE
  89. .prologue
  90. PROFCODE
  91. { .mmi
  92. .save ar.pfs, ARPFS
  93. #ifdef TRMMKERNEL
  94. alloc ARPFS = ar.pfs, 8, 16, 0, 0
  95. #else
  96. alloc ARPFS = ar.pfs, 8, 8, 0, 0
  97. #endif
  98. adds r14 = 16, SP
  99. mov ARLC = ar.lc
  100. }
  101. { .mmi
  102. adds r8 = -16 * 16, SP
  103. adds r9 = -15 * 16, SP
  104. adds SP = -16 * 16, SP
  105. }
  106. ;;
  107. stf.spill [r8] = f16, 32
  108. stf.spill [r9] = f17, 32
  109. mov PR = pr
  110. ;;
  111. stf.spill [r8] = f18, 32
  112. stf.spill [r9] = f19, 32
  113. shladd LDC = LDC, BASE_SHIFT, r0
  114. ;;
  115. stf.spill [r8] = f20, 32
  116. stf.spill [r9] = f21, 32
  117. shr J = N, 3
  118. ;;
  119. stf.spill [r8] = f22, 32
  120. stf.spill [r9] = f23, 32
  121. mov AOFFSET = A
  122. ;;
  123. stf.spill [r8] = f24, 32
  124. stf.spill [r9] = f25, 32
  125. cmp.ge p6, p0 = 0, J
  126. ;;
  127. stf.spill [r8] = f26, 32
  128. stf.spill [r9] = f27, 32
  129. shr BB = K, 3
  130. ;;
  131. stf.spill [r8] = f28, 32
  132. stf.spill [r9] = f29, 32
  133. ;;
  134. stf.spill [r8] = f30
  135. stf.spill [r9] = f31
  136. #ifndef TRMMKERNEL
  137. (p6) br.cond.dpnt .L050
  138. .body
  139. ;;
  140. #else
  141. .body
  142. ;;
  143. ld8 OFFSET = [r14]
  144. #if defined(TRMMKERNEL) && !defined(LEFT)
  145. ;;
  146. sub KK = r0, OFFSET
  147. #endif
  148. (p6) br.cond.dpnt .L050
  149. ;;
  150. #endif
  151. .align 32
  152. .L010:
  153. { .mfi
  154. adds J = -1, J
  155. mov f64 = f0
  156. shr I = M, 3
  157. }
  158. { .mfi
  159. mov C1 = C // coffset1 = c + 0 * ldc
  160. mov f72 = f0
  161. shladd BB = BB, BASE_SHIFT, B
  162. }
  163. ;;
  164. { .mmf
  165. cmp.eq p6, p7 = 0, I
  166. #if defined(TRMMKERNEL) && defined(LEFT)
  167. mov KK = OFFSET
  168. #else
  169. nop __LINE__
  170. #endif
  171. mov f80 = f0
  172. }
  173. { .mmf
  174. add C2 = LDC, C // coffset2 = c + 1 * ldc
  175. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  176. mov f88 = f0
  177. }
  178. ;;
  179. { .mmf
  180. shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc
  181. shladd C = LDC, 3, C // coffset += 8 * ldc
  182. mov f96 = f0
  183. }
  184. { .mmf
  185. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc
  186. shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc
  187. mov f104 = f0
  188. }
  189. ;;
  190. { .mfi
  191. shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc
  192. mov f112 = f0
  193. #ifdef TRMMKERNEL
  194. shladd KK8 = KK, BASE_SHIFT, r0
  195. #else
  196. nop __LINE__
  197. #endif
  198. }{ .mfb
  199. sub C8 = C, LDC // coffset8 = c + 7 * ldc
  200. mov f120 = f0
  201. (p6) br.cond.dpnt .L020
  202. }
  203. ;;
  204. .align 16
  205. .L011:
  206. #if !defined(TRMMKERNEL) || \
  207. defined(TRMMKERNEL) && \
  208. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  209. { .mfb
  210. LDFPD f48, f49 = [B]
  211. mov f65 = f0
  212. nop __LINE__
  213. }
  214. { .mfb
  215. adds BOFFSET = 2 * SIZE, B
  216. mov f73 = f0
  217. nop __LINE__
  218. }
  219. ;;
  220. #else
  221. { .mfi
  222. shladd BOFFSET = KK8, 3, B
  223. mov f65 = f0
  224. shladd AOFFSET = KK8, 3, AOFFSET
  225. }
  226. ;;
  227. { .mfi
  228. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  229. mov f73 = f0
  230. nop __LINE__
  231. }
  232. ;;
  233. #endif
  234. { .mfb
  235. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  236. mov f81 = f0
  237. nop __LINE__
  238. }
  239. { .mfb
  240. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  241. mov f89 = f0
  242. nop __LINE__
  243. }
  244. ;;
  245. { .mmf
  246. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  247. setf.d f97 = r0
  248. mov f105 = f0
  249. }
  250. { .mmf
  251. lfetch.nt1 [BB]
  252. setf.d f113 = r0
  253. mov f121 = f0
  254. }
  255. ;;
  256. { .mmf
  257. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  258. setf.d f66 = r0
  259. mov f74 = f0
  260. }
  261. { .mfi
  262. setf.d f82 = r0
  263. mov f90 = f0
  264. adds BB = 16 * SIZE, BB
  265. }
  266. ;;
  267. { .mmf
  268. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  269. setf.d f98 = r0
  270. mov f106 = f0
  271. }
  272. { .mfb
  273. setf.d f114 = r0
  274. mov f122 = f0
  275. nop __LINE__
  276. }
  277. ;;
  278. { .mmf
  279. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  280. setf.d f67 = r0
  281. mov f75 = f0
  282. }
  283. { .mfi
  284. setf.d f83 = r0
  285. mov f91 = f0
  286. #ifndef TRMMKERNEL
  287. nop __LINE__
  288. #else
  289. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  290. sub L = K, KK
  291. #elif defined(LEFT)
  292. adds L = 8, KK
  293. #else
  294. adds L = 8, KK
  295. #endif
  296. #endif
  297. }
  298. ;;
  299. { .mmf
  300. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  301. setf.d f99 = r0
  302. mov f107 = f0
  303. }
  304. { .mfi
  305. setf.d f115 = r0
  306. mov f123 = f0
  307. adds PREC = CPREFETCHSIZE * SIZE, C1
  308. }
  309. ;;
  310. { .mmf
  311. CPREFETCH [PREC], LDC
  312. setf.d f68 = r0
  313. mov f76 = f0
  314. }
  315. { .mfi
  316. setf.d f84 = r0
  317. mov f92 = f0
  318. #ifndef TRMMKERNEL
  319. adds L = 1, K
  320. #else
  321. adds L = 1, L
  322. #endif
  323. }
  324. ;;
  325. { .mmf
  326. CPREFETCH [PREC], LDC
  327. setf.d f100 = r0
  328. mov f108 = f0
  329. }
  330. { .mfi
  331. setf.d f116 = r0
  332. mov f124 = f0
  333. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  334. }
  335. ;;
  336. { .mmf
  337. CPREFETCH [PREC], LDC
  338. setf.d f69 = r0
  339. mov f77 = f0
  340. }
  341. { .mfi
  342. setf.d f85 = r0
  343. mov f93 = f0
  344. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
  345. }
  346. ;;
  347. { .mmf
  348. CPREFETCH [PREC], LDC
  349. setf.d f101 = r0
  350. mov f109 = f0
  351. }
  352. { .mfi
  353. setf.d f117 = r0
  354. mov f125 = f0
  355. tbit.z p12, p0 = L, 0
  356. }
  357. ;;
  358. { .mmf
  359. CPREFETCH [PREC], LDC
  360. setf.d f70 = r0
  361. mov f78 = f0
  362. }
  363. { .mfi
  364. setf.d f86 = r0
  365. mov f94 = f0
  366. shr L = L, 1
  367. }
  368. ;;
  369. { .mmf
  370. CPREFETCH [PREC], LDC
  371. setf.d f102 = r0
  372. mov f110 = f0
  373. }
  374. { .mfi
  375. setf.d f118 = r0
  376. mov f126 = f0
  377. adds L = -1, L
  378. }
  379. ;;
  380. { .mmf
  381. CPREFETCH [PREC], LDC
  382. setf.d f71 = r0
  383. mov f79 = f0
  384. }
  385. { .mfi
  386. setf.d f87 = r0
  387. mov f95 = f0
  388. mov ar.lc = L
  389. }
  390. ;;
  391. { .mmf
  392. CPREFETCH [PREC]
  393. setf.d f103 = r0
  394. mov f111 = f0
  395. }
  396. { .mfi
  397. setf.d f119 = r0
  398. mov f127 = f0
  399. cmp.eq p3, p0 = r0, r0
  400. }
  401. ;;
  402. .align 16
  403. .L012:
  404. /* 1 */
  405. { .mfi
  406. lfetch.nt1 [PREA], 16 * SIZE
  407. FMA f64 = f32, f48, f64 // A1 * B1
  408. nop __LINE__
  409. }
  410. { .mfi
  411. (p12) cmp.ne p3, p0 = 0, L
  412. FMA f72 = f32, f49, f72 // A1 * B2
  413. nop __LINE__
  414. }
  415. ;;
  416. /* 2 */
  417. { .mfi
  418. lfetch.nt1 [PREB], 16 * SIZE
  419. FMA f80 = f32, f50, f80 // A1 * B3
  420. nop __LINE__
  421. }
  422. { .mfi
  423. cmp.ne p4, p5 = 0, L
  424. FMA f88 = f32, f51, f88 // A1 * B4
  425. nop __LINE__
  426. }
  427. ;;
  428. /* 3 */
  429. { .mfi
  430. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  431. FMA f96 = f32, f52, f96 // A1 * B5
  432. nop __LINE__
  433. }
  434. { .mfi
  435. adds C9 = 4 * SIZE, C1
  436. FMA f104 = f32, f53, f104 // A1 * B6
  437. nop __LINE__
  438. }
  439. ;;
  440. /* 4 */
  441. { .mfi
  442. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  443. FMA f112 = f32, f54, f112 // A1 * B7
  444. nop __LINE__
  445. }
  446. { .mfi
  447. adds C10 = 4 * SIZE, C2
  448. FMA f120 = f32, f55, f120 // A1 * B8
  449. nop __LINE__
  450. }
  451. ;;
  452. /* 5 */
  453. { .mfi
  454. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  455. FMA f65 = f33, f48, f65 // A2 * B1
  456. nop __LINE__
  457. }
  458. { .mfi
  459. adds C11 = 4 * SIZE, C3
  460. FMA f73 = f33, f49, f73 // A2 * B2
  461. nop __LINE__
  462. }
  463. ;;
  464. /* 6 */
  465. { .mfi
  466. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  467. FMA f81 = f33, f50, f81 // A2 * B3
  468. nop __LINE__
  469. }
  470. { .mfi
  471. adds C12 = 4 * SIZE, C4
  472. FMA f89 = f33, f51, f89 // A2 * B4
  473. nop __LINE__
  474. }
  475. ;;
  476. /* 7 */
  477. { .mfi
  478. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  479. FMA f97 = f33, f52, f97 // A2 * B5
  480. nop __LINE__
  481. }
  482. { .mfi
  483. adds C13 = 4 * SIZE, C5
  484. FMA f105 = f33, f53, f105 // A2 * B6
  485. nop __LINE__
  486. }
  487. ;;
  488. /* 8 */
  489. { .mfi
  490. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  491. FMA f113 = f33, f54, f113 // A2 * B7
  492. nop __LINE__
  493. }
  494. { .mfi
  495. adds C14 = 4 * SIZE, C6
  496. FMA f121 = f33, f55, f121 // A2 * B8
  497. nop __LINE__
  498. }
  499. ;;
  500. /* 9 */
  501. { .mfi
  502. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  503. FMA f66 = f34, f48, f66 // A3 * B1
  504. nop __LINE__
  505. }
  506. { .mfi
  507. adds C15 = 4 * SIZE, C7
  508. FMA f74 = f34, f49, f74 // A3 * B2
  509. nop __LINE__
  510. }
  511. ;;
  512. /* 10 */
  513. { .mfi
  514. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  515. FMA f82 = f34, f50, f82 // A3 * B3
  516. nop __LINE__
  517. }
  518. { .mfi
  519. adds C16 = 4 * SIZE, C8
  520. FMA f90 = f34, f51, f90 // A3 * B4
  521. nop __LINE__
  522. }
  523. ;;
  524. /* 11 */
  525. { .mfi
  526. FMA f98 = f34, f52, f98 // A3 * B5
  527. nop __LINE__
  528. }
  529. { .mfi
  530. nop __LINE__
  531. FMA f106 = f34, f53, f106 // A3 * B6
  532. nop __LINE__
  533. }
  534. ;;
  535. /* 12 */
  536. { .mfi
  537. FMA f114 = f34, f54, f114 // A3 * B7
  538. nop __LINE__
  539. }
  540. { .mfi
  541. nop __LINE__
  542. FMA f122 = f34, f55, f122 // A3 * B8
  543. nop __LINE__
  544. }
  545. ;;
  546. /* 13 */
  547. { .mfi
  548. nop __LINE__
  549. FMA f67 = f35, f48, f67 // A4 * B1
  550. }
  551. { .mfi
  552. nop __LINE__
  553. FMA f75 = f35, f49, f75 // A4 * B2
  554. nop __LINE__
  555. }
  556. ;;
  557. /* 14 */
  558. { .mfi
  559. FMA f83 = f35, f50, f83 // A4 * B3
  560. nop __LINE__
  561. }
  562. { .mfi
  563. nop __LINE__
  564. FMA f91 = f35, f51, f91 // A4 * B4
  565. nop __LINE__
  566. }
  567. ;;
  568. /* 15 */
  569. { .mfi
  570. FMA f99 = f35, f52, f99 // A4 * B5
  571. nop __LINE__
  572. }
  573. { .mfi
  574. nop __LINE__
  575. FMA f107 = f35, f53, f107 // A4 * B6
  576. nop __LINE__
  577. }
  578. ;;
  579. /* 16 */
  580. { .mfi
  581. FMA f115 = f35, f54, f115 // A4 * B7
  582. nop __LINE__
  583. }
  584. { .mfi
  585. nop __LINE__
  586. FMA f123 = f35, f55, f123 // A4 * B8
  587. nop __LINE__
  588. }
  589. ;;
  590. /* 17 */
  591. { .mfi
  592. nop __LINE__
  593. FMA f68 = f36, f48, f68 // A5 * B1
  594. nop __LINE__
  595. }
  596. { .mfi
  597. nop __LINE__
  598. FMA f76 = f36, f49, f76 // A5 * B2
  599. nop __LINE__
  600. }
  601. ;;
  602. /* 18 */
  603. { .mfi
  604. nop __LINE__
  605. FMA f84 = f36, f50, f84 // A5 * B3
  606. nop __LINE__
  607. }
  608. { .mfi
  609. nop __LINE__
  610. FMA f92 = f36, f51, f92 // A5 * B4
  611. nop __LINE__
  612. }
  613. ;;
  614. /* 19 */
  615. { .mfi
  616. nop __LINE__
  617. FMA f100 = f36, f52, f100 // A5 * B5
  618. nop __LINE__
  619. }
  620. { .mfi
  621. nop __LINE__
  622. FMA f108 = f36, f53, f108 // A5 * B6
  623. nop __LINE__
  624. }
  625. ;;
  626. /* 20 */
  627. { .mfi
  628. nop __LINE__
  629. FMA f116 = f36, f54, f116 // A5 * B7
  630. nop __LINE__
  631. }
  632. { .mfi
  633. nop __LINE__
  634. FMA f124 = f36, f55, f124 // A5 * B8
  635. nop __LINE__
  636. }
  637. ;;
  638. /* 21 */
  639. { .mfi
  640. nop __LINE__
  641. FMA f69 = f37, f48, f69 // A6 * B1
  642. nop __LINE__
  643. }
  644. { .mfi
  645. nop __LINE__
  646. FMA f77 = f37, f49, f77 // A6 * B2
  647. nop __LINE__
  648. }
  649. ;;
  650. /* 22 */
  651. { .mfi
  652. nop __LINE__
  653. FMA f85 = f37, f50, f85 // A6 * B3
  654. nop __LINE__
  655. }
  656. { .mfi
  657. nop __LINE__
  658. FMA f93 = f37, f51, f93 // A6 * B4
  659. nop __LINE__
  660. }
  661. ;;
  662. /* 23 */
  663. { .mfi
  664. nop __LINE__
  665. FMA f101 = f37, f52, f101 // A6 * B5
  666. nop __LINE__
  667. }
  668. { .mfi
  669. nop __LINE__
  670. FMA f109 = f37, f53, f109 // A6 * B6
  671. nop __LINE__
  672. }
  673. ;;
  674. /* 24 */
  675. { .mfi
  676. nop __LINE__
  677. FMA f117 = f37, f54, f117 // A6 * B7
  678. nop __LINE__
  679. }
  680. { .mfi
  681. nop __LINE__
  682. FMA f125 = f37, f55, f125 // A6 * B8
  683. nop __LINE__
  684. }
  685. ;;
  686. /* 25 */
  687. { .mfi
  688. nop __LINE__
  689. FMA f70 = f38, f48, f70 // A7 * B1
  690. nop __LINE__
  691. }
  692. { .mfi
  693. nop __LINE__
  694. FMA f78 = f38, f49, f78 // A7 * B2
  695. nop __LINE__
  696. }
  697. ;;
  698. /* 26 */
  699. { .mfi
  700. nop __LINE__
  701. FMA f86 = f38, f50, f86 // A7 * B3
  702. nop __LINE__
  703. }
  704. { .mfi
  705. nop __LINE__
  706. FMA f94 = f38, f51, f94 // A7 * B4
  707. nop __LINE__
  708. }
  709. ;;
  710. /* 27 */
  711. { .mfi
  712. nop __LINE__
  713. FMA f102 = f38, f52, f102 // A7 * B5
  714. nop __LINE__
  715. }
  716. { .mfi
  717. nop __LINE__
  718. FMA f110 = f38, f53, f110 // A7 * B6
  719. nop __LINE__
  720. }
  721. ;;
  722. /* 28 */
  723. { .mfi
  724. nop __LINE__
  725. FMA f118 = f38, f54, f118 // A7 * B7
  726. nop __LINE__
  727. }
  728. { .mfi
  729. nop __LINE__
  730. FMA f126 = f38, f55, f126 // A7 * B8
  731. nop __LINE__
  732. }
  733. ;;
  734. /* 29 */
  735. { .mfi
  736. nop __LINE__
  737. FMA f71 = f39, f48, f71 // A8 * B1
  738. nop __LINE__
  739. }
  740. { .mfi
  741. nop __LINE__
  742. FMA f79 = f39, f49, f79 // A8 * B2
  743. nop __LINE__
  744. }
  745. ;;
  746. /* 30 */
  747. { .mfi
  748. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  749. FMA f87 = f39, f50, f87 // A8 * B3
  750. nop __LINE__
  751. }
  752. { .mfi
  753. nop __LINE__
  754. FMA f95 = f39, f51, f95 // A8 * B4
  755. nop __LINE__
  756. }
  757. ;;
  758. /* 31 */
  759. { .mfi
  760. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  761. FMA f103 = f39, f52, f103 // A8 * B5
  762. nop __LINE__
  763. }
  764. { .mfi
  765. nop __LINE__
  766. FMA f111 = f39, f53, f111 // A8 * B6
  767. nop __LINE__
  768. }
  769. ;;
  770. /* 32 */
  771. { .mfi
  772. nop __LINE__
  773. FMA f119 = f39, f54, f119 // A8 * B7
  774. nop __LINE__
  775. }
  776. { .mfi
  777. nop __LINE__
  778. FMA f127 = f39, f55, f127 // A8 * B8
  779. nop __LINE__
  780. }
  781. ;;
  782. /* 33 */
  783. { .mfi
  784. nop __LINE__
  785. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  786. nop __LINE__
  787. }
  788. { .mfi
  789. nop __LINE__
  790. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  791. nop __LINE__
  792. }
  793. ;;
  794. /* 34 */
  795. { .mfi
  796. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  797. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  798. nop __LINE__
  799. }
  800. { .mfi
  801. nop __LINE__
  802. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  803. nop __LINE__
  804. }
  805. ;;
  806. /* 35 */
  807. { .mfi
  808. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  809. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  810. nop __LINE__
  811. }
  812. { .mfi
  813. nop __LINE__
  814. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  815. nop __LINE__
  816. }
  817. ;;
  818. /* 36 */
  819. { .mfi
  820. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  821. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  822. nop __LINE__
  823. }
  824. { .mfi
  825. nop __LINE__
  826. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  827. nop __LINE__
  828. }
  829. ;;
  830. /* 37 */
  831. { .mfi
  832. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  833. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  834. nop __LINE__
  835. }
  836. { .mfi
  837. nop __LINE__
  838. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  839. nop __LINE__
  840. }
  841. ;;
  842. /* 38 */
  843. { .mfi
  844. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  845. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  846. nop __LINE__
  847. }
  848. { .mfi
  849. nop __LINE__
  850. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  851. nop __LINE__
  852. }
  853. ;;
  854. /* 39 */
  855. { .mfi
  856. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  857. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  858. nop __LINE__
  859. }
  860. { .mfi
  861. nop __LINE__
  862. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  863. nop __LINE__
  864. }
  865. ;;
  866. /* 40 */
  867. { .mfi
  868. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  869. (p5) LDFD f6 = [C1 ], SIZE
  870. #else
  871. nop __LINE__
  872. #endif
  873. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  874. nop __LINE__
  875. }
  876. { .mfi
  877. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  878. (p5) LDFD f7 = [C9 ], SIZE
  879. #else
  880. nop __LINE__
  881. #endif
  882. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  883. nop __LINE__
  884. }
  885. ;;
  886. /* 41 */
  887. { .mfi
  888. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  889. (p5) LDFD f10 = [C1 ], SIZE
  890. #else
  891. nop __LINE__
  892. #endif
  893. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  894. nop __LINE__
  895. }
  896. { .mfi
  897. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  898. (p5) LDFD f11 = [C9 ], SIZE
  899. #else
  900. nop __LINE__
  901. #endif
  902. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  903. nop __LINE__
  904. }
  905. ;;
  906. /* 42 */
  907. { .mfi
  908. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  909. (p5) LDFD f12 = [C1 ], SIZE
  910. #else
  911. nop __LINE__
  912. #endif
  913. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  914. nop __LINE__
  915. }
  916. { .mfi
  917. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  918. (p5) LDFD f13 = [C9 ], SIZE
  919. #else
  920. nop __LINE__
  921. #endif
  922. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  923. nop __LINE__
  924. }
  925. ;;
  926. /* 43 */
  927. { .mfi
  928. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  929. (p5) LDFD f14 = [C1 ], -3 * SIZE
  930. #else
  931. nop __LINE__
  932. #endif
  933. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  934. nop __LINE__
  935. }
  936. { .mfi
  937. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  938. (p5) LDFD f15 = [C9 ], -3 * SIZE
  939. #else
  940. nop __LINE__
  941. #endif
  942. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  943. nop __LINE__
  944. }
  945. ;;
  946. /* 44 */
  947. { .mfi
  948. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  949. (p5) LDFD f16 = [C2 ], SIZE
  950. #else
  951. nop __LINE__
  952. #endif
  953. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  954. nop __LINE__
  955. }
  956. { .mfi
  957. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  958. (p5) LDFD f17 = [C10], SIZE
  959. #else
  960. nop __LINE__
  961. #endif
  962. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  963. nop __LINE__
  964. }
  965. ;;
  966. /* 45 */
  967. { .mfi
  968. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  969. (p5) LDFD f18 = [C2 ], SIZE
  970. #else
  971. nop __LINE__
  972. #endif
  973. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  974. nop __LINE__
  975. }
  976. { .mfi
  977. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  978. (p5) LDFD f19 = [C10], SIZE
  979. #else
  980. nop __LINE__
  981. #endif
  982. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  983. nop __LINE__
  984. }
  985. ;;
  986. /* 46 */
  987. { .mfi
  988. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  989. (p5) LDFD f20 = [C2 ], SIZE
  990. #else
  991. nop __LINE__
  992. #endif
  993. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  994. nop __LINE__
  995. }
  996. { .mfi
  997. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  998. (p5) LDFD f21 = [C10], SIZE
  999. #else
  1000. nop __LINE__
  1001. #endif
  1002. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  1003. nop __LINE__
  1004. }
  1005. ;;
  1006. /* 47 */
  1007. { .mfi
  1008. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1009. (p5) LDFD f22 = [C2 ], -3 * SIZE
  1010. #else
  1011. nop __LINE__
  1012. #endif
  1013. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  1014. nop __LINE__
  1015. }
  1016. { .mfi
  1017. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1018. (p5) LDFD f23 = [C10], -3 * SIZE
  1019. #else
  1020. nop __LINE__
  1021. #endif
  1022. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  1023. nop __LINE__
  1024. }
  1025. ;;
  1026. /* 48 */
  1027. { .mfi
  1028. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1029. (p5) LDFD f24 = [C3 ], SIZE
  1030. #else
  1031. nop __LINE__
  1032. #endif
  1033. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  1034. nop __LINE__
  1035. }
  1036. { .mfi
  1037. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1038. (p5) LDFD f25 = [C11], SIZE
  1039. #else
  1040. nop __LINE__
  1041. #endif
  1042. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  1043. nop __LINE__
  1044. }
  1045. ;;
  1046. /* 49 */
  1047. { .mfi
  1048. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1049. (p5) LDFD f26 = [C3 ], SIZE
  1050. #else
  1051. nop __LINE__
  1052. #endif
  1053. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  1054. nop __LINE__
  1055. }
  1056. { .mfi
  1057. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1058. (p5) LDFD f27 = [C11], SIZE
  1059. #else
  1060. nop __LINE__
  1061. #endif
  1062. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  1063. nop __LINE__
  1064. }
  1065. ;;
  1066. /* 50 */
  1067. { .mfi
  1068. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1069. (p5) LDFD f28 = [C3 ], SIZE
  1070. #else
  1071. nop __LINE__
  1072. #endif
  1073. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  1074. nop __LINE__
  1075. }
  1076. { .mfi
  1077. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1078. (p5) LDFD f29 = [C11], SIZE
  1079. #else
  1080. nop __LINE__
  1081. #endif
  1082. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  1083. nop __LINE__
  1084. }
  1085. ;;
  1086. /* 51 */
  1087. { .mfi
  1088. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1089. (p5) LDFD f30 = [C3 ], -3 * SIZE
  1090. #else
  1091. nop __LINE__
  1092. #endif
  1093. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  1094. nop __LINE__
  1095. }
  1096. { .mfi
  1097. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1098. (p5) LDFD f31 = [C11], -3 * SIZE
  1099. #else
  1100. nop __LINE__
  1101. #endif
  1102. (p3) FMA f108 = f44, f61, f108 // A5 * B6
  1103. nop __LINE__
  1104. }
  1105. ;;
  1106. /* 52 */
  1107. { .mfi
  1108. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1109. (p5) LDFD f32 = [C4 ], SIZE
  1110. #else
  1111. nop __LINE__
  1112. #endif
  1113. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  1114. nop __LINE__
  1115. }
  1116. { .mfi
  1117. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1118. (p5) LDFD f33 = [C12], SIZE
  1119. #else
  1120. nop __LINE__
  1121. #endif
  1122. (p3) FMA f124 = f44, f63, f124 // A5 * B8
  1123. nop __LINE__
  1124. }
  1125. ;;
  1126. /* 53 */
  1127. { .mfi
  1128. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1129. (p5) LDFD f34 = [C4 ], SIZE
  1130. #else
  1131. nop __LINE__
  1132. #endif
  1133. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  1134. nop __LINE__
  1135. }
  1136. { .mfi
  1137. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1138. (p5) LDFD f35 = [C12], SIZE
  1139. #else
  1140. nop __LINE__
  1141. #endif
  1142. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  1143. nop __LINE__
  1144. }
  1145. ;;
  1146. /* 54 */
  1147. { .mfi
  1148. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1149. (p5) LDFD f36 = [C4 ], SIZE
  1150. #else
  1151. nop __LINE__
  1152. #endif
  1153. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  1154. nop __LINE__
  1155. }
  1156. { .mfi
  1157. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1158. (p5) LDFD f37 = [C12], SIZE
  1159. #else
  1160. nop __LINE__
  1161. #endif
  1162. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  1163. nop __LINE__
  1164. }
  1165. ;;
  1166. /* 55 */
  1167. { .mfi
  1168. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1169. (p5) LDFD f38 = [C4 ], -3 * SIZE
  1170. #else
  1171. nop __LINE__
  1172. #endif
  1173. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  1174. nop __LINE__
  1175. }
  1176. { .mfi
  1177. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1178. (p5) LDFD f39 = [C12], -3 * SIZE
  1179. #else
  1180. nop __LINE__
  1181. #endif
  1182. (p3) FMA f109 = f45, f61, f109 // A6 * B6
  1183. nop __LINE__
  1184. }
  1185. ;;
  1186. /* 56 */
  1187. { .mfi
  1188. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1189. (p5) LDFD f48 = [C5 ], SIZE
  1190. #else
  1191. nop __LINE__
  1192. #endif
  1193. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  1194. nop __LINE__
  1195. }
  1196. { .mfi
  1197. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1198. (p5) LDFD f49 = [C13], SIZE
  1199. #else
  1200. nop __LINE__
  1201. #endif
  1202. (p3) FMA f125 = f45, f63, f125 // A6 * B8
  1203. nop __LINE__
  1204. }
  1205. ;;
  1206. /* 57 */
  1207. { .mfi
  1208. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1209. (p5) LDFD f50 = [C5 ], SIZE
  1210. #else
  1211. nop __LINE__
  1212. #endif
  1213. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  1214. nop __LINE__
  1215. }
  1216. { .mfi
  1217. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1218. (p5) LDFD f51 = [C13], SIZE
  1219. #else
  1220. nop __LINE__
  1221. #endif
  1222. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  1223. nop __LINE__
  1224. }
  1225. ;;
  1226. /* 58 */
  1227. { .mfi
  1228. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1229. (p5) LDFD f52 = [C5 ], SIZE
  1230. #else
  1231. nop __LINE__
  1232. #endif
  1233. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  1234. nop __LINE__
  1235. }
  1236. { .mfi
  1237. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1238. (p5) LDFD f53 = [C13], SIZE
  1239. #else
  1240. nop __LINE__
  1241. #endif
  1242. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  1243. nop __LINE__
  1244. }
  1245. ;;
  1246. /* 59 */
  1247. { .mfi
  1248. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1249. (p5) LDFD f54 = [C5 ], -3 * SIZE
  1250. #else
  1251. nop __LINE__
  1252. #endif
  1253. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  1254. nop __LINE__
  1255. }
  1256. { .mfi
  1257. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1258. (p5) LDFD f55 = [C13], -3 * SIZE
  1259. #else
  1260. nop __LINE__
  1261. #endif
  1262. (p3) FMA f110 = f46, f61, f110 // A7 * B6
  1263. nop __LINE__
  1264. }
  1265. ;;
  1266. /* 60 */
  1267. { .mfi
  1268. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1269. (p5) LDFD f40 = [C6 ], SIZE
  1270. #else
  1271. nop __LINE__
  1272. #endif
  1273. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  1274. nop __LINE__
  1275. }
  1276. { .mfi
  1277. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1278. (p5) LDFD f41 = [C14], SIZE
  1279. #else
  1280. nop __LINE__
  1281. #endif
  1282. (p3) FMA f126 = f46, f63, f126 // A7 * B8
  1283. nop __LINE__
  1284. }
  1285. ;;
  1286. /* 61 */
  1287. { .mfi
  1288. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1289. (p5) LDFD f42 = [C6 ], SIZE
  1290. #else
  1291. nop __LINE__
  1292. #endif
  1293. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  1294. nop __LINE__
  1295. }
  1296. { .mfi
  1297. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1298. (p5) LDFD f43 = [C14], SIZE
  1299. #else
  1300. nop __LINE__
  1301. #endif
  1302. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  1303. nop __LINE__
  1304. }
  1305. ;;
  1306. /* 62 */
  1307. { .mfi
  1308. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1309. (p5) LDFD f44 = [C6 ], SIZE
  1310. #else
  1311. nop __LINE__
  1312. #endif
  1313. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  1314. nop __LINE__
  1315. }
  1316. { .mfi
  1317. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1318. (p5) LDFD f45 = [C14], SIZE
  1319. #else
  1320. nop __LINE__
  1321. #endif
  1322. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  1323. nop __LINE__
  1324. }
  1325. ;;
  1326. /* 63 */
  1327. { .mfi
  1328. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1329. (p5) LDFD f59 = [C6 ], -3 * SIZE
  1330. #else
  1331. nop __LINE__
  1332. #endif
  1333. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  1334. nop __LINE__
  1335. }
  1336. { .mfi
  1337. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1338. (p5) LDFD f60 = [C14], -3 * SIZE
  1339. #else
  1340. nop __LINE__
  1341. #endif
  1342. (p3) FMA f111 = f47, f61, f111 // A8 * B6
  1343. nop __LINE__
  1344. }
  1345. ;;
  1346. /* 64 */
  1347. { .mfi
  1348. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1349. (p5) LDFD f61 = [C7 ], SIZE
  1350. #else
  1351. nop __LINE__
  1352. #endif
  1353. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  1354. adds L = -1, L
  1355. }
  1356. { .mfb
  1357. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1358. (p5) LDFD f62 = [C15], SIZE
  1359. #else
  1360. nop __LINE__
  1361. #endif
  1362. (p3) FMA f127 = f47, f63, f127 // A8 * B8
  1363. br.cloop.sptk.few .L012
  1364. }
  1365. ;;
  1366. .L013:
        /*
         * .L013 -- write-back of the accumulator tile once the k-loop at
         * .L012 falls through.  This is the variant assembled when neither
         * TRMMKERNEL nor BETAZERO is defined.
         *
         * Every accumulator f64..f127 is updated with
         *     FMA acc = ALPHA, acc, c
         * (FMA d = a, b, c is used throughout this file as d = a * b + c;
         * the macro itself is defined outside this view) and then stored.
         * The addend c is a value read from C: the tail of .L012 loads
         * f41..f45 and f59..f62 from C14/C6/C7/C15 under p5, and this block
         * loads the remaining ones from C7/C15/C8/C16.  The addends consumed
         * first (f6, f7, f10, ...) are presumably loaded earlier in the loop,
         * outside this view.
         *
         * Register reuse: an addend register is reloaded as soon as its
         * previous value has been consumed (e.g. f6 is read by the first FMA
         * and overwritten by the LDFD in the next bundle), so the statement
         * order in this block must not be changed.
         *
         * Pointer pattern: each of C1..C16 is stored through four times with
         * post-increments SIZE, SIZE, SIZE, 5 * SIZE, i.e. it ends 8 * SIZE
         * past where it started.  Loads use SIZE, SIZE, SIZE, -3 * SIZE, which
         * returns the pointer to its starting value before the stores run.
         * NOTE(review): C9..C16 presumably point 4 * SIZE past C1..C8; that
         * setup is not visible here -- confirm.
         *
         * Loop control: p6 = (I != 1) is computed before I is decremented and
         * drives the branch back to .L011 in the last bundle.
         */
  1367. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1368. { .mfi
  1369. (p5) LDFD f63 = [C7 ], SIZE
  1370. FMA f64 = ALPHA, f64, f6
  1371. cmp.ne p6, p0 = 1, I // p6 = (I != 1); consumed by the final br.cond
  1372. }
  1373. { .mfb
  1374. (p5) LDFD f6 = [C15], SIZE // f6 is free again: its old value was read just above
  1375. FMA f68 = ALPHA, f68, f7
  1376. nop __LINE__
  1377. }
  1378. ;;
  1379. { .mfi
  1380. (p5) LDFD f7 = [C7 ], SIZE
  1381. FMA f65 = ALPHA, f65, f10
  1382. adds I = -1, I // I--
  1383. }
  1384. { .mfb
  1385. (p5) LDFD f10 = [C15], SIZE
  1386. FMA f69 = ALPHA, f69, f11
  1387. nop __LINE__
  1388. }
  1389. ;;
  1390. { .mfb
  1391. (p5) LDFD f11 = [C7 ], -3 * SIZE // 4th load from C7: rewind to its start
  1392. FMA f66 = ALPHA, f66, f12
  1393. nop __LINE__
  1394. }
  1395. { .mfb
  1396. (p5) LDFD f12 = [C15], -3 * SIZE // 4th load from C15: rewind to its start
  1397. FMA f70 = ALPHA, f70, f13
  1398. nop __LINE__
  1399. }
  1400. ;;
        /*
         * The C8/C16 loads below carry no (p5) qualifier, unlike the C7/C15
         * loads above.  NOTE(review): p5 looks to be always set when the loop
         * exits, which would make the two forms equivalent -- confirm.
         */
  1401. { .mfb
  1402. LDFD f13 = [C8 ], SIZE
  1403. FMA f67 = ALPHA, f67, f14
  1404. nop __LINE__
  1405. }
  1406. { .mfb
  1407. LDFD f14 = [C16], SIZE
  1408. FMA f71 = ALPHA, f71, f15
  1409. nop __LINE__
  1410. }
  1411. ;;
        /* From here on, finished results are stored while later ones are still being scaled. */
  1412. { .mmf
  1413. STFD [C1 ] = f64, SIZE
  1414. STFD [C9 ] = f68, SIZE
  1415. FMA f72 = ALPHA, f72, f16
  1416. }
  1417. { .mmf
  1418. LDFD f15 = [C8 ], SIZE
  1419. LDFD f16 = [C16], SIZE
  1420. FMA f76 = ALPHA, f76, f17
  1421. }
  1422. ;;
  1423. { .mmf
  1424. STFD [C1 ] = f65, SIZE
  1425. STFD [C9 ] = f69, SIZE
  1426. FMA f73 = ALPHA, f73, f18
  1427. }
  1428. { .mmf
  1429. LDFD f17 = [C8 ], SIZE
  1430. LDFD f18 = [C16], SIZE
  1431. FMA f77 = ALPHA, f77, f19
  1432. }
  1433. ;;
  1434. { .mmf
  1435. STFD [C1 ] = f66, SIZE
  1436. STFD [C9 ] = f70, SIZE
  1437. FMA f74 = ALPHA, f74, f20
  1438. }
  1439. { .mmf
  1440. LDFD f19 = [C8 ], -3 * SIZE // 4th load from C8: rewind to its start
  1441. LDFD f20 = [C16], -3 * SIZE // 4th load from C16: rewind to its start
  1442. FMA f78 = ALPHA, f78, f21
  1443. }
  1444. ;;
  1445. { .mfb
  1446. STFD [C1 ] = f67, 5 * SIZE // 4th store: C1 ends 8 * SIZE past its start
  1447. FMA f75 = ALPHA, f75, f22
  1448. nop __LINE__
  1449. }
  1450. { .mfb
  1451. STFD [C9 ] = f71, 5 * SIZE
  1452. FMA f79 = ALPHA, f79, f23
  1453. nop __LINE__
  1454. }
  1455. ;;
        /* f72..f79 -> C2 / C10 */
  1456. { .mfb
  1457. STFD [C2 ] = f72, SIZE
  1458. FMA f80 = ALPHA, f80, f24
  1459. nop __LINE__
  1460. }
  1461. { .mfb
  1462. STFD [C10] = f76, SIZE
  1463. FMA f84 = ALPHA, f84, f25
  1464. nop __LINE__
  1465. }
  1466. ;;
  1467. { .mfb
  1468. STFD [C2 ] = f73, SIZE
  1469. FMA f81 = ALPHA, f81, f26
  1470. nop __LINE__
  1471. }
  1472. { .mfb
  1473. STFD [C10] = f77, SIZE
  1474. FMA f85 = ALPHA, f85, f27
  1475. nop __LINE__
  1476. }
  1477. ;;
  1478. { .mfb
  1479. STFD [C2 ] = f74, SIZE
  1480. FMA f82 = ALPHA, f82, f28
  1481. nop __LINE__
  1482. }
  1483. { .mfb
  1484. STFD [C10] = f78, SIZE
  1485. FMA f86 = ALPHA, f86, f29
  1486. nop __LINE__
  1487. }
  1488. ;;
  1489. { .mfb
  1490. STFD [C2 ] = f75, 5 * SIZE
  1491. FMA f83 = ALPHA, f83, f30
  1492. nop __LINE__
  1493. }
  1494. { .mfb
  1495. STFD [C10] = f79, 5 * SIZE
  1496. FMA f87 = ALPHA, f87, f31
  1497. nop __LINE__
  1498. }
  1499. ;;
        /* f80..f87 -> C3 / C11 */
  1500. { .mfb
  1501. STFD [C3 ] = f80, SIZE
  1502. FMA f88 = ALPHA, f88, f32
  1503. nop __LINE__
  1504. }
  1505. { .mfb
  1506. STFD [C11] = f84, SIZE
  1507. FMA f92 = ALPHA, f92, f33
  1508. nop __LINE__
  1509. }
  1510. ;;
  1511. { .mfb
  1512. STFD [C3 ] = f81, SIZE
  1513. FMA f89 = ALPHA, f89, f34
  1514. nop __LINE__
  1515. }
  1516. { .mfb
  1517. STFD [C11] = f85, SIZE
  1518. FMA f93 = ALPHA, f93, f35
  1519. nop __LINE__
  1520. }
  1521. ;;
  1522. { .mfb
  1523. STFD [C3 ] = f82, SIZE
  1524. FMA f90 = ALPHA, f90, f36
  1525. nop __LINE__
  1526. }
  1527. { .mfb
  1528. STFD [C11] = f86, SIZE
  1529. FMA f94 = ALPHA, f94, f37
  1530. nop __LINE__
  1531. }
  1532. ;;
  1533. { .mfb
  1534. STFD [C3 ] = f83, 5 * SIZE
  1535. FMA f91 = ALPHA, f91, f38
  1536. nop __LINE__
  1537. }
  1538. { .mfb
  1539. STFD [C11] = f87, 5 * SIZE
  1540. FMA f95 = ALPHA, f95, f39
  1541. nop __LINE__
  1542. }
  1543. ;;
        /* f88..f95 -> C4 / C12 */
  1544. { .mfb
  1545. STFD [C4 ] = f88, SIZE
  1546. FMA f96 = ALPHA, f96, f48
  1547. nop __LINE__
  1548. }
  1549. { .mfb
  1550. STFD [C12] = f92, SIZE
  1551. FMA f100 = ALPHA, f100, f49
  1552. nop __LINE__
  1553. }
  1554. ;;
  1555. { .mfb
  1556. STFD [C4 ] = f89, SIZE
  1557. FMA f97 = ALPHA, f97, f50
  1558. nop __LINE__
  1559. }
  1560. { .mfb
  1561. STFD [C12] = f93, SIZE
  1562. FMA f101 = ALPHA, f101, f51
  1563. nop __LINE__
  1564. }
  1565. ;;
  1566. { .mfb
  1567. STFD [C4 ] = f90, SIZE
  1568. FMA f98 = ALPHA, f98, f52
  1569. nop __LINE__
  1570. }
  1571. { .mfb
  1572. STFD [C12] = f94, SIZE
  1573. FMA f102 = ALPHA, f102, f53
  1574. nop __LINE__
  1575. }
  1576. ;;
  1577. { .mfb
  1578. STFD [C4 ] = f91, 5 * SIZE
  1579. FMA f99 = ALPHA, f99, f54
  1580. nop __LINE__
  1581. }
  1582. { .mfb
  1583. STFD [C12] = f95, 5 * SIZE
  1584. FMA f103 = ALPHA, f103, f55
  1585. nop __LINE__
  1586. }
  1587. ;;
        /* f96..f103 -> C5 / C13; addends from here on were loaded under p5 in the tail of .L012 */
  1588. { .mfb
  1589. STFD [C5 ] = f96, SIZE
  1590. FMA f104 = ALPHA, f104, f40
  1591. nop __LINE__
  1592. }
  1593. { .mfb
  1594. STFD [C13] = f100, SIZE
  1595. FMA f108 = ALPHA, f108, f41
  1596. nop __LINE__
  1597. }
  1598. ;;
  1599. { .mfb
  1600. STFD [C5 ] = f97, SIZE
  1601. FMA f105 = ALPHA, f105, f42
  1602. nop __LINE__
  1603. }
  1604. { .mfb
  1605. STFD [C13] = f101, SIZE
  1606. FMA f109 = ALPHA, f109, f43
  1607. nop __LINE__
  1608. }
  1609. ;;
  1610. { .mfb
  1611. STFD [C5 ] = f98, SIZE
  1612. FMA f106 = ALPHA, f106, f44
  1613. nop __LINE__
  1614. }
  1615. { .mfb
  1616. STFD [C13] = f102, SIZE
  1617. FMA f110 = ALPHA, f110, f45
  1618. nop __LINE__
  1619. }
  1620. ;;
  1621. { .mfb
  1622. STFD [C5 ] = f99, 5 * SIZE
  1623. FMA f107 = ALPHA, f107, f59
  1624. nop __LINE__
  1625. }
  1626. { .mfb
  1627. STFD [C13] = f103, 5 * SIZE
  1628. FMA f111 = ALPHA, f111, f60
  1629. nop __LINE__
  1630. }
  1631. ;;
        /* f104..f111 -> C6 / C14 */
  1632. { .mfb
  1633. STFD [C6 ] = f104, SIZE
  1634. FMA f112 = ALPHA, f112, f61
  1635. nop __LINE__
  1636. }
  1637. { .mfb
  1638. STFD [C14] = f108, SIZE
  1639. FMA f116 = ALPHA, f116, f62
  1640. nop __LINE__
  1641. }
  1642. ;;
  1643. { .mfb
  1644. STFD [C6 ] = f105, SIZE
  1645. FMA f113 = ALPHA, f113, f63
  1646. nop __LINE__
  1647. }
  1648. { .mfb
  1649. STFD [C14] = f109, SIZE
  1650. FMA f117 = ALPHA, f117, f6
  1651. nop __LINE__
  1652. }
  1653. ;;
  1654. { .mfb
  1655. STFD [C6 ] = f106, SIZE
  1656. FMA f114 = ALPHA, f114, f7
  1657. nop __LINE__
  1658. }
  1659. { .mfb
  1660. STFD [C14] = f110, SIZE
  1661. FMA f118 = ALPHA, f118, f10
  1662. nop __LINE__
  1663. }
  1664. ;;
  1665. { .mfb
  1666. STFD [C6 ] = f107, 5 * SIZE
  1667. FMA f115 = ALPHA, f115, f11
  1668. nop __LINE__
  1669. }
  1670. { .mfb
  1671. STFD [C14] = f111, 5 * SIZE
  1672. FMA f119 = ALPHA, f119, f12
  1673. nop __LINE__
  1674. }
  1675. ;;
        /* f112..f119 -> C7 / C15 */
  1676. { .mfb
  1677. STFD [C7 ] = f112, SIZE
  1678. FMA f120 = ALPHA, f120, f13
  1679. nop __LINE__
  1680. }
  1681. { .mfb
  1682. STFD [C15] = f116, SIZE
  1683. FMA f124 = ALPHA, f124, f14
  1684. nop __LINE__
  1685. }
  1686. ;;
  1687. { .mfb
  1688. STFD [C7 ] = f113, SIZE
  1689. FMA f121 = ALPHA, f121, f15
  1690. nop __LINE__
  1691. }
  1692. { .mfb
  1693. STFD [C15] = f117, SIZE
  1694. FMA f125 = ALPHA, f125, f16
  1695. nop __LINE__
  1696. }
  1697. ;;
  1698. { .mfb
  1699. STFD [C7 ] = f114, SIZE
  1700. FMA f122 = ALPHA, f122, f17
  1701. nop __LINE__
  1702. }
  1703. { .mfb
  1704. STFD [C15] = f118, SIZE
  1705. FMA f126 = ALPHA, f126, f18
  1706. nop __LINE__
  1707. }
  1708. ;;
  1709. { .mfb
  1710. STFD [C7 ] = f115, 5 * SIZE
  1711. FMA f123 = ALPHA, f123, f19
  1712. nop __LINE__
  1713. }
  1714. { .mfb
  1715. STFD [C15] = f119, 5 * SIZE
  1716. FMA f127 = ALPHA, f127, f20
  1717. nop __LINE__
  1718. }
  1719. ;;
        /*
         * f120..f127 -> C8 / C16.  The F slots, no longer needed for scaling,
         * reset f64, f72, ..., f120 to f0 (0.0), so these eight accumulators
         * are zero again when control returns to .L011.
         */
  1720. { .mfb
  1721. STFD [C8 ] = f120, SIZE
  1722. mov f64 = f0
  1723. nop __LINE__
  1724. }
  1725. { .mfb
  1726. STFD [C16] = f124, SIZE
  1727. mov f72 = f0
  1728. nop __LINE__
  1729. }
  1730. ;;
  1731. { .mfi
  1732. STFD [C8 ] = f121, SIZE
  1733. mov f80 = f0
  1734. nop __LINE__
  1735. }
  1736. { .mfb
  1737. STFD [C16] = f125, SIZE
  1738. mov f88 = f0
  1739. nop __LINE__
  1740. }
  1741. ;;
  1742. { .mfi
  1743. STFD [C8 ] = f122, SIZE
  1744. mov f96 = f0
  1745. nop __LINE__
  1746. }
  1747. { .mfb
  1748. STFD [C16] = f126, SIZE
  1749. mov f104 = f0
  1750. nop __LINE__
  1751. }
  1752. ;;
  1753. { .mfi
  1754. STFD [C8 ] = f123, 5 * SIZE
  1755. mov f112 = f0
  1756. nop __LINE__
  1757. }
  1758. { .mfb
  1759. STFD [C16] = f127, 5 * SIZE
  1760. mov f120 = f0 // safe: the store of f120 is in an earlier instruction group
  1761. (p6) br.cond.dptk .L011 // loop back while I was != 1 before the decrement
  1762. }
  1763. ;;
  1764. #else
        /*
         * .L013 write-back, variant assembled when TRMMKERNEL or BETAZERO is
         * defined.  Nothing is read from C: every accumulator f64..f127 is
         * scaled in place with FMPY acc = ALPHA, acc and stored through
         * C1..C16.  The store pattern matches the other variant: four stores
         * per pointer with post-increments SIZE, SIZE, SIZE, 5 * SIZE, so each
         * pointer ends 8 * SIZE past its starting value.
         *
         * The M-slot nops in the first bundles occupy the positions that hold
         * C loads in the other variant.
         *
         * In the TRMM build the I slots near the end also update the panel
         * pointers (see the comments at those bundles).  shladd d = a, n, b
         * computes d = (a << n) + b.
         *
         * Loop control: p6 = (I != 1) is computed before I is decremented and
         * drives the branch back to .L011 in the last bundle.
         */
  1765. { .mfi
  1766. nop __LINE__
  1767. FMPY f64 = ALPHA, f64
  1768. cmp.ne p6, p0 = 1, I // p6 = (I != 1); consumed by the final br.cond
  1769. }
  1770. { .mfb
  1771. nop __LINE__
  1772. FMPY f68 = ALPHA, f68
  1773. nop __LINE__
  1774. }
  1775. ;;
  1776. { .mfi
  1777. nop __LINE__
  1778. FMPY f65 = ALPHA, f65
  1779. adds I = -1, I // I--
  1780. }
  1781. { .mfb
  1782. nop __LINE__
  1783. FMPY f69 = ALPHA, f69
  1784. nop __LINE__
  1785. }
  1786. ;;
  1787. { .mfb
  1788. nop __LINE__
  1789. FMPY f66 = ALPHA, f66
  1790. nop __LINE__
  1791. }
  1792. { .mfb
  1793. nop __LINE__
  1794. FMPY f70 = ALPHA, f70
  1795. nop __LINE__
  1796. }
  1797. ;;
  1798. { .mfb
  1799. nop __LINE__
  1800. FMPY f67 = ALPHA, f67
  1801. nop __LINE__
  1802. }
  1803. { .mfb
  1804. nop __LINE__
  1805. FMPY f71 = ALPHA, f71
  1806. nop __LINE__
  1807. }
  1808. ;;
        /* From here on, finished results are stored while later ones are still being scaled. */
  1809. { .mmf
  1810. STFD [C1 ] = f64, SIZE
  1811. STFD [C9 ] = f68, SIZE
  1812. FMPY f72 = ALPHA, f72
  1813. }
  1814. { .mmf
  1815. nop __LINE__
  1816. nop __LINE__
  1817. FMPY f76 = ALPHA, f76
  1818. }
  1819. ;;
  1820. { .mmf
  1821. STFD [C1 ] = f65, SIZE
  1822. STFD [C9 ] = f69, SIZE
  1823. FMPY f73 = ALPHA, f73
  1824. }
  1825. { .mmf
  1826. nop __LINE__
  1827. nop __LINE__
  1828. FMPY f77 = ALPHA, f77
  1829. }
  1830. ;;
  1831. { .mmf
  1832. STFD [C1 ] = f66, SIZE
  1833. STFD [C9 ] = f70, SIZE
  1834. FMPY f74 = ALPHA, f74
  1835. }
  1836. { .mmf
  1837. nop __LINE__
  1838. nop __LINE__
  1839. FMPY f78 = ALPHA, f78
  1840. }
  1841. ;;
  1842. { .mfb
  1843. STFD [C1 ] = f67, 5 * SIZE // 4th store: C1 ends 8 * SIZE past its start
  1844. FMPY f75 = ALPHA, f75
  1845. nop __LINE__
  1846. }
  1847. { .mfb
  1848. STFD [C9 ] = f71, 5 * SIZE
  1849. FMPY f79 = ALPHA, f79
  1850. nop __LINE__
  1851. }
  1852. ;;
        /* f72..f79 -> C2 / C10 */
  1853. { .mfb
  1854. STFD [C2 ] = f72, SIZE
  1855. FMPY f80 = ALPHA, f80
  1856. nop __LINE__
  1857. }
  1858. { .mfb
  1859. STFD [C10] = f76, SIZE
  1860. FMPY f84 = ALPHA, f84
  1861. nop __LINE__
  1862. }
  1863. ;;
  1864. { .mfb
  1865. STFD [C2 ] = f73, SIZE
  1866. FMPY f81 = ALPHA, f81
  1867. nop __LINE__
  1868. }
  1869. { .mfb
  1870. STFD [C10] = f77, SIZE
  1871. FMPY f85 = ALPHA, f85
  1872. nop __LINE__
  1873. }
  1874. ;;
  1875. { .mfb
  1876. STFD [C2 ] = f74, SIZE
  1877. FMPY f82 = ALPHA, f82
  1878. nop __LINE__
  1879. }
  1880. { .mfb
  1881. STFD [C10] = f78, SIZE
  1882. FMPY f86 = ALPHA, f86
  1883. nop __LINE__
  1884. }
  1885. ;;
  1886. { .mfb
  1887. STFD [C2 ] = f75, 5 * SIZE
  1888. FMPY f83 = ALPHA, f83
  1889. nop __LINE__
  1890. }
  1891. { .mfb
  1892. STFD [C10] = f79, 5 * SIZE
  1893. FMPY f87 = ALPHA, f87
  1894. nop __LINE__
  1895. }
  1896. ;;
        /* f80..f87 -> C3 / C11 */
  1897. { .mfb
  1898. STFD [C3 ] = f80, SIZE
  1899. FMPY f88 = ALPHA, f88
  1900. nop __LINE__
  1901. }
  1902. { .mfb
  1903. STFD [C11] = f84, SIZE
  1904. FMPY f92 = ALPHA, f92
  1905. nop __LINE__
  1906. }
  1907. ;;
  1908. { .mfb
  1909. STFD [C3 ] = f81, SIZE
  1910. FMPY f89 = ALPHA, f89
  1911. nop __LINE__
  1912. }
  1913. { .mfb
  1914. STFD [C11] = f85, SIZE
  1915. FMPY f93 = ALPHA, f93
  1916. nop __LINE__
  1917. }
  1918. ;;
  1919. { .mfb
  1920. STFD [C3 ] = f82, SIZE
  1921. FMPY f90 = ALPHA, f90
  1922. nop __LINE__
  1923. }
  1924. { .mfb
  1925. STFD [C11] = f86, SIZE
  1926. FMPY f94 = ALPHA, f94
  1927. nop __LINE__
  1928. }
  1929. ;;
  1930. { .mfb
  1931. STFD [C3 ] = f83, 5 * SIZE
  1932. FMPY f91 = ALPHA, f91
  1933. nop __LINE__
  1934. }
  1935. { .mfb
  1936. STFD [C11] = f87, 5 * SIZE
  1937. FMPY f95 = ALPHA, f95
  1938. nop __LINE__
  1939. }
  1940. ;;
        /* f88..f95 -> C4 / C12 */
  1941. { .mfb
  1942. STFD [C4 ] = f88, SIZE
  1943. FMPY f96 = ALPHA, f96
  1944. nop __LINE__
  1945. }
  1946. { .mfb
  1947. STFD [C12] = f92, SIZE
  1948. FMPY f100 = ALPHA, f100
  1949. nop __LINE__
  1950. }
  1951. ;;
  1952. { .mfb
  1953. STFD [C4 ] = f89, SIZE
  1954. FMPY f97 = ALPHA, f97
  1955. nop __LINE__
  1956. }
  1957. { .mfb
  1958. STFD [C12] = f93, SIZE
  1959. FMPY f101 = ALPHA, f101
  1960. nop __LINE__
  1961. }
  1962. ;;
  1963. { .mfb
  1964. STFD [C4 ] = f90, SIZE
  1965. FMPY f98 = ALPHA, f98
  1966. nop __LINE__
  1967. }
  1968. { .mfb
  1969. STFD [C12] = f94, SIZE
  1970. FMPY f102 = ALPHA, f102
  1971. nop __LINE__
  1972. }
  1973. ;;
  1974. { .mfb
  1975. STFD [C4 ] = f91, 5 * SIZE
  1976. FMPY f99 = ALPHA, f99
  1977. nop __LINE__
  1978. }
  1979. { .mfb
  1980. STFD [C12] = f95, 5 * SIZE
  1981. FMPY f103 = ALPHA, f103
  1982. nop __LINE__
  1983. }
  1984. ;;
        /* f96..f103 -> C5 / C13 */
  1985. { .mfb
  1986. STFD [C5 ] = f96, SIZE
  1987. FMPY f104 = ALPHA, f104
  1988. nop __LINE__
  1989. }
  1990. { .mfb
  1991. STFD [C13] = f100, SIZE
  1992. FMPY f108 = ALPHA, f108
  1993. nop __LINE__
  1994. }
  1995. ;;
  1996. { .mfb
  1997. STFD [C5 ] = f97, SIZE
  1998. FMPY f105 = ALPHA, f105
  1999. nop __LINE__
  2000. }
  2001. { .mfb
  2002. STFD [C13] = f101, SIZE
  2003. FMPY f109 = ALPHA, f109
  2004. nop __LINE__
  2005. }
  2006. ;;
  2007. { .mfb
  2008. STFD [C5 ] = f98, SIZE
  2009. FMPY f106 = ALPHA, f106
  2010. nop __LINE__
  2011. }
  2012. { .mfb
  2013. STFD [C13] = f102, SIZE
  2014. FMPY f110 = ALPHA, f110
  2015. nop __LINE__
  2016. }
  2017. ;;
  2018. { .mfb
  2019. STFD [C5 ] = f99, 5 * SIZE
  2020. FMPY f107 = ALPHA, f107
  2021. nop __LINE__
  2022. }
  2023. { .mfb
  2024. STFD [C13] = f103, 5 * SIZE
  2025. FMPY f111 = ALPHA, f111
  2026. nop __LINE__
  2027. }
  2028. ;;
        /* f104..f111 -> C6 / C14 */
  2029. { .mfb
  2030. STFD [C6 ] = f104, SIZE
  2031. FMPY f112 = ALPHA, f112
  2032. nop __LINE__
  2033. }
  2034. { .mfb
  2035. STFD [C14] = f108, SIZE
  2036. FMPY f116 = ALPHA, f116
  2037. nop __LINE__
  2038. }
  2039. ;;
  2040. { .mfb
  2041. STFD [C6 ] = f105, SIZE
  2042. FMPY f113 = ALPHA, f113
  2043. nop __LINE__
  2044. }
  2045. { .mfb
  2046. STFD [C14] = f109, SIZE
  2047. FMPY f117 = ALPHA, f117
  2048. nop __LINE__
  2049. }
  2050. ;;
  2051. { .mfb
  2052. STFD [C6 ] = f106, SIZE
  2053. FMPY f114 = ALPHA, f114
  2054. nop __LINE__
  2055. }
  2056. { .mfb
  2057. STFD [C14] = f110, SIZE
  2058. FMPY f118 = ALPHA, f118
  2059. nop __LINE__
  2060. }
  2061. ;;
  2062. { .mfb
  2063. STFD [C6 ] = f107, 5 * SIZE
  2064. FMPY f115 = ALPHA, f115
  2065. nop __LINE__
  2066. }
  2067. { .mfb
  2068. STFD [C14] = f111, 5 * SIZE
  2069. FMPY f119 = ALPHA, f119
  2070. nop __LINE__
  2071. }
  2072. ;;
        /* f112..f119 -> C7 / C15 */
  2073. { .mfb
  2074. STFD [C7 ] = f112, SIZE
  2075. FMPY f120 = ALPHA, f120
  2076. nop __LINE__
  2077. }
  2078. { .mfb
  2079. STFD [C15] = f116, SIZE
  2080. FMPY f124 = ALPHA, f124
  2081. nop __LINE__
  2082. }
  2083. ;;
  2084. { .mfb
  2085. STFD [C7 ] = f113, SIZE
  2086. FMPY f121 = ALPHA, f121
  2087. nop __LINE__
  2088. }
  2089. { .mfb
  2090. STFD [C15] = f117, SIZE
  2091. FMPY f125 = ALPHA, f125
  2092. nop __LINE__
  2093. }
  2094. ;;
        /*
         * TRMM bookkeeping, step 1 (only for LEFT && TRANSA or
         * !LEFT && !TRANSA): L = K - KK.
         */
  2095. { .mfi
  2096. STFD [C7 ] = f114, SIZE
  2097. FMPY f122 = ALPHA, f122
  2098. #if defined(TRMMKERNEL) && \
  2099. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2100. sub L = K, KK
  2101. #else
  2102. nop __LINE__
  2103. #endif
  2104. }
  2105. { .mfb
  2106. STFD [C15] = f118, SIZE
  2107. FMPY f126 = ALPHA, f126
  2108. nop __LINE__
  2109. }
  2110. ;;
        /*
         * Step 2: L -= 8.  The two "adds L = -8, L" below sit under mutually
         * exclusive conditions (LEFT && TRANSA vs. !LEFT && !TRANSA), so at
         * most one of them is assembled into this instruction group.
         */
  2111. { .mfi
  2112. STFD [C7 ] = f115, 5 * SIZE
  2113. FMPY f123 = ALPHA, f123
  2114. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  2115. adds L = -8, L
  2116. #else
  2117. nop __LINE__
  2118. #endif
  2119. }
  2120. { .mfi
  2121. STFD [C15] = f119, 5 * SIZE
  2122. FMPY f127 = ALPHA, f127
  2123. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  2124. adds L = -8, L
  2125. #else
  2126. nop __LINE__
  2127. #endif
  2128. }
  2129. ;;
        /*
         * f120..f127 -> C8 / C16.  The F slots reset f64, f72, ..., f120 to
         * f0 (0.0) for the next pass through .L011.
         * Step 3: KK8 = L << BASE_SHIFT.
         */
  2130. { .mfi
  2131. STFD [C8 ] = f120, SIZE
  2132. mov f64 = f0
  2133. #if defined(TRMMKERNEL) && \
  2134. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2135. shladd KK8 = L, BASE_SHIFT, r0
  2136. #else
  2137. nop __LINE__
  2138. #endif
  2139. }
  2140. { .mfb
  2141. STFD [C16] = f124, SIZE
  2142. mov f72 = f0
  2143. nop __LINE__
  2144. }
  2145. ;;
        /* Step 4: AOFFSET += KK8 * 8 and BOFFSET += KK8 * 8. */
  2146. { .mfi
  2147. STFD [C8 ] = f121, SIZE
  2148. mov f80 = f0
  2149. #if defined(TRMMKERNEL) && \
  2150. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2151. shladd AOFFSET = KK8, 3, AOFFSET
  2152. #else
  2153. nop __LINE__
  2154. #endif
  2155. }
  2156. { .mfi
  2157. STFD [C16] = f125, SIZE
  2158. mov f88 = f0
  2159. #if defined(TRMMKERNEL) && \
  2160. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2161. shladd BOFFSET = KK8, 3, BOFFSET
  2162. #else
  2163. nop __LINE__
  2164. #endif
  2165. }
  2166. ;;
        /* Step 5 (TRMM with LEFT only): KK += 8. */
  2167. { .mfi
  2168. STFD [C8 ] = f122, SIZE
  2169. mov f96 = f0
  2170. #if defined(TRMMKERNEL) && defined(LEFT)
  2171. adds KK = 8, KK
  2172. #else
  2173. nop __LINE__
  2174. #endif
  2175. }
  2176. { .mfb
  2177. STFD [C16] = f126, SIZE
  2178. mov f104 = f0
  2179. nop __LINE__
  2180. }
  2181. ;;
        /* Step 6 (any TRMM build): KK8 = KK << BASE_SHIFT, using the KK value from step 5. */
  2182. { .mfi
  2183. STFD [C8 ] = f123, 5 * SIZE
  2184. mov f112 = f0
  2185. #ifdef TRMMKERNEL
  2186. shladd KK8 = KK, BASE_SHIFT, r0
  2187. #else
  2188. nop __LINE__
  2189. #endif
  2190. }
  2191. { .mfb
  2192. STFD [C16] = f127, 5 * SIZE
  2193. mov f120 = f0 // safe: the store of f120 is in an earlier instruction group
  2194. (p6) br.cond.dptk .L011 // loop back while I was != 1 before the decrement
  2195. }
  2196. ;;
  2197. #endif
  2198. .L020:
        /*
         * .L020 -- setup for the tile handled by the loop at .L022, which
         * multiplies 4 values of A (f32..f35) by 8 values of B (f48..f55).
         *
         * - tbit.z tests bit 2 of M: when that bit is clear, p6 is set and
         *   the whole section is skipped by branching to .L030.
         * - p3 is forced true (r0 == r0); the loop clears it only for a
         *   trailing half iteration.
         * - The trip count is derived from L (TRMM) or K (otherwise):
         *       L   = count + 1
         *       p12 = (bit 0 of L is clear)
         *       L   = (L >> 1) - 1  -> ar.lc
         *   so .L022, which performs two k-steps per pass, runs L + 1 times.
         *   p12 tells the loop when the second k-step of the final pass has
         *   to be suppressed.
         * - Accumulators f65..f67, f73..f75, ..., f121..f123 are zeroed with
         *   either "mov f = f0" (F slot) or "setf.d f = r0" (M slot); both
         *   produce 0.0.  f64, f72, ..., f120 are not written here.
         * - The first B values (f48..f55) and A values (f32..f35) are
         *   preloaded for the first pass of the loop.
         */
  2199. { .mfi
  2200. cmp.eq p3, p0 = r0, r0 // p3 = true
  2201. mov f89 = f0
  2202. tbit.z p6, p7 = M, 2 // p6 = (bit 2 of M is 0), p7 = its complement
  2203. }
  2204. { .mfb
  2205. #ifndef TRMMKERNEL
  2206. nop __LINE__
  2207. #else
  2208. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2209. sub L = K, KK
  2210. #elif defined(LEFT)
  2211. adds L = 4, KK
  2212. #else
  2213. adds L = 8, KK
  2214. #endif
  2215. #endif
  2216. mov f81 = f0
  2217. (p6) br.cond.dptk .L030 // nothing to do for this tile size
  2218. }
  2219. ;;
        /*
         * B/A pointer setup.  In the first variant B is read from its base
         * and BOFFSET is set just past the pair that was loaded.  In the
         * second variant (remaining TRMM configurations) BOFFSET = B + KK8 * 8
         * and AOFFSET += KK8 * 4 are applied first, then the first B pair is
         * loaded with post-increment.
         */
  2220. #if !defined(TRMMKERNEL) || \
  2221. defined(TRMMKERNEL) && \
  2222. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2223. { .mfi
  2224. LDFPD f48, f49 = [B]
  2225. mov f65 = f0
  2226. nop __LINE__
  2227. }
  2228. { .mfi
  2229. adds BOFFSET = 2 * SIZE, B
  2230. mov f73 = f0
  2231. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  2232. }
  2233. ;;
  2234. #else
  2235. { .mfi
  2236. shladd BOFFSET = KK8, 3, B
  2237. mov f65 = f0
  2238. shladd AOFFSET = KK8, 2, AOFFSET
  2239. }
  2240. ;;
  2241. { .mfi
  2242. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2243. mov f73 = f0
  2244. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  2245. }
  2246. ;;
  2247. #endif
  2248. { .mmf
  2249. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2250. setf.d f97 = r0
  2251. mov f105 = f0
  2252. }
  2253. { .mfi
  2254. setf.d f113 = r0
  2255. mov f121 = f0
  2256. #ifndef TRMMKERNEL
  2257. adds L = 1, K
  2258. #else
  2259. adds L = 1, L
  2260. #endif
  2261. }
  2262. ;;
  2263. { .mmf
  2264. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2265. setf.d f66 = r0
  2266. mov f74 = f0
  2267. }
  2268. { .mfi
  2269. setf.d f82 = r0
  2270. mov f90 = f0
  2271. tbit.z p12, p0 = L, 0 // p12 = (count + 1 is even)
  2272. }
  2273. ;;
  2274. { .mmf
  2275. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2276. setf.d f98 = r0
  2277. mov f106 = f0
  2278. }
  2279. { .mfi
  2280. setf.d f114 = r0
  2281. mov f122 = f0
  2282. shr L = L, 1 // two k-steps per loop pass
  2283. }
  2284. ;;
  2285. { .mfi
  2286. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2287. mov f75 = f0
  2288. adds L = -1, L // ar.lc takes (passes - 1)
  2289. }
  2290. { .mmf
  2291. setf.d f67 = r0
  2292. setf.d f83 = r0
  2293. mov f91 = f0
  2294. }
  2295. ;;
  2296. { .mfi
  2297. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2298. mov f107 = f0
  2299. mov ar.lc = L
  2300. }
  2301. { .mmf
  2302. setf.d f99 = r0
  2303. setf.d f115 = r0
  2304. mov f123 = f0
  2305. }
  2306. ;;
  2307. .align 32
  2308. .L022:
        /*
         * .L022 -- k-loop of the 4 x 8 tile, two k-steps per pass.
         *
         *   step 1 (always):    f32..f35 (A1..A4) times f48..f55 (B1..B8)
         *   step 2 (under p3):  f40..f43 (A1..A4) times f56..f63 (B1..B8)
         *
         * Accumulators: f64+8j .. f67+8j hold A1..A4 times B(j+1), j = 0..7
         * (f64..f67, f72..f75, ..., f120..f123).
         *
         * Predicates, recomputed on every pass from the software counter L:
         *   p3  - step 2 is valid.  Set true before the loop; when p12 is set
         *         it becomes (L != 0), i.e. step 2 is dropped on the last pass.
         *   p4  - (L != 0): another pass follows, so preload its step-1
         *         operands into f32..f35 / f48..f55.
         *   p5  - (L == 0): last pass.  C9..C16 are set to C1..C8 + 2 * SIZE
         *         and, unless TRMMKERNEL or BETAZERO is defined, values of C
         *         are loaded into f68..f71, f76..f79, ..., f108..f111.  Those
         *         registers are not accumulators of this tile.  Each pointer
         *         is read with +SIZE then -1 * SIZE, so it is back at its
         *         starting value afterwards.
         *
         * L is decremented in the last instruction group; the hardware loop
         * count lives in ar.lc and is consumed by br.cloop.
         *
         * All bundles fill their three slots explicitly, including the two
         * FMA-only .mfb bundles in step 2.
         */
  2309. { .mfi
  2310. lfetch.nt1 [PREA], 16 * SIZE
  2311. FMA f64 = f32, f48, f64 // A1 * B1
  2312. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  2313. }
  2314. { .mfi
  2315. nop __LINE__
  2316. FMA f72 = f32, f49, f72 // A1 * B2
  2317. (p12) cmp.ne p3, p0 = 0, L // with p12: step 2 only while L != 0
  2318. }
  2319. ;;
  2320. { .mfi
  2321. lfetch.nt1 [PREB], 16 * SIZE
  2322. FMA f80 = f32, f50, f80 // A1 * B3
  2323. cmp.ne p4, p5 = 0, L // p4 = more passes follow, p5 = last pass
  2324. }
  2325. { .mfb
  2326. nop __LINE__
  2327. FMA f88 = f32, f51, f88 // A1 * B4
  2328. nop __LINE__
  2329. }
  2330. ;;
  2331. { .mfi
  2332. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  2333. FMA f96 = f32, f52, f96 // A1 * B5
  2334. (p5) adds C9 = 2 * SIZE, C1
  2335. }
  2336. { .mfi
  2337. nop __LINE__
  2338. FMA f104 = f32, f53, f104 // A1 * B6
  2339. (p5) adds C10 = 2 * SIZE, C2
  2340. }
  2341. ;;
  2342. { .mfi
  2343. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  2344. FMA f112 = f32, f54, f112 // A1 * B7
  2345. (p5) adds C11 = 2 * SIZE, C3
  2346. }
  2347. { .mfi
  2348. nop __LINE__
  2349. FMA f120 = f32, f55, f120 // A1 * B8
  2350. (p5) adds C12 = 2 * SIZE, C4
  2351. }
  2352. ;;
  2353. { .mfi
  2354. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2355. FMA f65 = f33, f48, f65 // A2 * B1
  2356. (p5) adds C13 = 2 * SIZE, C5
  2357. }
  2358. { .mfi
  2359. nop __LINE__
  2360. FMA f73 = f33, f49, f73 // A2 * B2
  2361. (p5) adds C14 = 2 * SIZE, C6
  2362. }
  2363. ;;
  2364. { .mfi
  2365. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2366. FMA f81 = f33, f50, f81 // A2 * B3
  2367. (p5) adds C15 = 2 * SIZE, C7
  2368. }
  2369. { .mfi
  2370. nop __LINE__
  2371. FMA f89 = f33, f51, f89 // A2 * B4
  2372. (p5) adds C16 = 2 * SIZE, C8
  2373. }
  2374. ;;
  2375. { .mfb
  2376. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2377. FMA f97 = f33, f52, f97 // A2 * B5
  2378. nop __LINE__
  2379. }
  2380. { .mfb
  2381. nop __LINE__
  2382. FMA f105 = f33, f53, f105 // A2 * B6
  2383. nop __LINE__
  2384. }
  2385. ;;
  2386. { .mfb
  2387. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  2388. FMA f113 = f33, f54, f113 // A2 * B7
  2389. nop __LINE__
  2390. }
  2391. { .mfb
  2392. nop __LINE__
  2393. FMA f121 = f33, f55, f121 // A2 * B8
  2394. nop __LINE__
  2395. }
  2396. ;;
  2397. { .mfb
  2398. nop __LINE__
  2399. FMA f66 = f34, f48, f66 // A3 * B1
  2400. nop __LINE__
  2401. }
  2402. { .mfb
  2403. nop __LINE__
  2404. FMA f74 = f34, f49, f74 // A3 * B2
  2405. nop __LINE__
  2406. }
  2407. ;;
  2408. { .mfb
  2409. nop __LINE__
  2410. FMA f82 = f34, f50, f82 // A3 * B3
  2411. nop __LINE__
  2412. }
  2413. { .mfb
  2414. nop __LINE__
  2415. FMA f90 = f34, f51, f90 // A3 * B4
  2416. nop __LINE__
  2417. }
  2418. ;;
        /*
         * The next ten bundles form one instruction group (no stop between
         * them).  That is legal as written: the (p4) reloads of f32/f33 and
         * f48/f49 come after every read of those registers in the group, and
         * no register is both written and later read inside it.  Keep this
         * order if the block is ever edited.
         */
  2419. { .mfb
  2420. nop __LINE__
  2421. FMA f98 = f34, f52, f98 // A3 * B5
  2422. nop __LINE__
  2423. }
  2424. { .mfb
  2425. nop __LINE__
  2426. FMA f106 = f34, f53, f106 // A3 * B6
  2427. nop __LINE__
  2428. }
  2429. { .mfb
  2430. nop __LINE__
  2431. FMA f114 = f34, f54, f114 // A3 * B7
  2432. nop __LINE__
  2433. }
  2434. { .mfb
  2435. nop __LINE__
  2436. FMA f122 = f34, f55, f122 // A3 * B8
  2437. nop __LINE__
  2438. }
  2439. { .mfb
  2440. nop __LINE__
  2441. FMA f67 = f35, f48, f67 // A4 * B1
  2442. nop __LINE__
  2443. }
  2444. { .mfb
  2445. nop __LINE__
  2446. FMA f75 = f35, f49, f75 // A4 * B2
  2447. nop __LINE__
  2448. }
  2449. { .mfb
  2450. nop __LINE__
  2451. FMA f83 = f35, f50, f83 // A4 * B3
  2452. nop __LINE__
  2453. }
  2454. { .mfb
  2455. nop __LINE__
  2456. FMA f91 = f35, f51, f91 // A4 * B4
  2457. nop __LINE__
  2458. }
  2459. { .mfb
  2460. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2461. FMA f99 = f35, f52, f99 // A4 * B5
  2462. nop __LINE__
  2463. }
  2464. { .mfb
  2465. nop __LINE__
  2466. FMA f107 = f35, f53, f107 // A4 * B6
  2467. nop __LINE__
  2468. }
  2469. { .mfb
  2470. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2471. FMA f115 = f35, f54, f115 // A4 * B7
  2472. nop __LINE__
  2473. }
  2474. { .mfb
  2475. nop __LINE__
  2476. FMA f123 = f35, f55, f123 // A4 * B8
  2477. nop __LINE__
  2478. }
  2479. ;;
        /* ---- step 2: f40..f43 times f56..f63, all under p3 ---- */
  2480. { .mfb
  2481. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2482. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2483. nop __LINE__
  2484. }
  2485. { .mfb
  2486. nop __LINE__
  2487. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  2488. nop __LINE__
  2489. }
  2490. ;;
  2491. { .mfb
  2492. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2493. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2494. nop __LINE__
  2495. }
  2496. { .mfb
  2497. nop __LINE__
  2498. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  2499. nop __LINE__
  2500. }
  2501. ;;
  2502. { .mfb
  2503. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2504. (p5) LDFD f68 = [C1 ], SIZE
  2505. #else
  2506. nop __LINE__
  2507. #endif
  2508. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2509. nop __LINE__
  2510. }
  2511. { .mfb
  2512. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2513. (p5) LDFD f70 = [C9 ], SIZE
  2514. #else
  2515. nop __LINE__
  2516. #endif
  2517. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  2518. nop __LINE__
  2519. }
  2520. ;;
  2521. { .mfb
  2522. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2523. (p5) LDFD f69 = [C1 ], -1 * SIZE
  2524. #else
  2525. nop __LINE__
  2526. #endif
  2527. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2528. nop __LINE__
  2529. }
  2530. { .mfb
  2531. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2532. (p5) LDFD f71 = [C9 ], -1 * SIZE
  2533. #else
  2534. nop __LINE__
  2535. #endif
  2536. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  2537. nop __LINE__
  2538. }
  2539. ;;
  2540. { .mfb
  2541. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2542. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2543. nop __LINE__
  2544. }
  2545. { .mfb
        nop __LINE__ // explicit M-slot filler, as in every other bundle of this loop
  2546. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  2547. nop __LINE__
  2548. }
  2549. { .mfb
  2550. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2551. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2552. nop __LINE__
  2553. }
  2554. { .mfb
        nop __LINE__ // explicit M-slot filler, as in every other bundle of this loop
  2555. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  2556. nop __LINE__
  2557. }
  2558. ;;
  2559. { .mfb
  2560. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2561. (p5) LDFD f76 = [C2 ], SIZE
  2562. #else
  2563. nop __LINE__
  2564. #endif
  2565. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2566. nop __LINE__
  2567. }
  2568. { .mfb
  2569. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2570. (p5) LDFD f78 = [C10], SIZE
  2571. #else
  2572. nop __LINE__
  2573. #endif
  2574. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  2575. nop __LINE__
  2576. }
  2577. ;;
  2578. { .mfb
  2579. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2580. (p5) LDFD f77 = [C2 ], -1 * SIZE
  2581. #else
  2582. nop __LINE__
  2583. #endif
  2584. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2585. nop __LINE__
  2586. }
  2587. { .mfb
  2588. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2589. (p5) LDFD f79 = [C10], -1 * SIZE
  2590. #else
  2591. nop __LINE__
  2592. #endif
  2593. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  2594. nop __LINE__
  2595. }
  2596. ;;
  2597. { .mfb
  2598. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2599. (p5) LDFD f84 = [C3 ], SIZE
  2600. #else
  2601. nop __LINE__
  2602. #endif
  2603. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  2604. nop __LINE__
  2605. }
  2606. { .mfb
  2607. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2608. (p5) LDFD f86 = [C11], SIZE
  2609. #else
  2610. nop __LINE__
  2611. #endif
  2612. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  2613. nop __LINE__
  2614. }
  2615. ;;
  2616. { .mfb
  2617. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2618. (p5) LDFD f85 = [C3 ], -1 * SIZE
  2619. #else
  2620. nop __LINE__
  2621. #endif
  2622. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  2623. nop __LINE__
  2624. }
  2625. { .mfb
  2626. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2627. (p5) LDFD f87 = [C11], -1 * SIZE
  2628. #else
  2629. nop __LINE__
  2630. #endif
  2631. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  2632. nop __LINE__
  2633. }
  2634. ;;
  2635. { .mfb
  2636. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2637. (p5) LDFD f92 = [C4 ], SIZE
  2638. #else
  2639. nop __LINE__
  2640. #endif
  2641. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  2642. nop __LINE__
  2643. }
  2644. { .mfb
  2645. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2646. (p5) LDFD f94 = [C12], SIZE
  2647. #else
  2648. nop __LINE__
  2649. #endif
  2650. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  2651. nop __LINE__
  2652. }
  2653. ;;
  2654. { .mfb
  2655. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2656. (p5) LDFD f93 = [C4 ], -1 * SIZE
  2657. #else
  2658. nop __LINE__
  2659. #endif
  2660. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  2661. nop __LINE__
  2662. }
  2663. { .mfb
  2664. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2665. (p5) LDFD f95 = [C12], -1 * SIZE
  2666. #else
  2667. nop __LINE__
  2668. #endif
  2669. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  2670. nop __LINE__
  2671. }
  2672. ;;
  2673. { .mfb
  2674. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2675. (p5) LDFD f100 = [C5 ], SIZE
  2676. #else
  2677. nop __LINE__
  2678. #endif
  2679. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  2680. nop __LINE__
  2681. }
  2682. { .mfb
  2683. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2684. (p5) LDFD f102 = [C13], SIZE
  2685. #else
  2686. nop __LINE__
  2687. #endif
  2688. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  2689. nop __LINE__
  2690. }
  2691. ;;
  2692. { .mfb
  2693. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2694. (p5) LDFD f101 = [C5 ], -1 * SIZE
  2695. #else
  2696. nop __LINE__
  2697. #endif
  2698. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  2699. nop __LINE__
  2700. }
  2701. { .mfb
  2702. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2703. (p5) LDFD f103 = [C13], -1 * SIZE
  2704. #else
  2705. nop __LINE__
  2706. #endif
  2707. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  2708. nop __LINE__
  2709. }
  2710. ;;
  2711. { .mfb
  2712. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2713. (p5) LDFD f108 = [C6 ], SIZE
  2714. #else
  2715. nop __LINE__
  2716. #endif
  2717. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  2718. nop __LINE__
  2719. }
  2720. { .mfb
  2721. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2722. (p5) LDFD f110 = [C14], SIZE
  2723. #else
  2724. nop __LINE__
  2725. #endif
  2726. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  2727. nop __LINE__
  2728. }
  2729. ;;
  2730. { .mfi
  2731. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2732. (p5) LDFD f109 = [C6 ], -1 * SIZE
  2733. #else
  2734. nop __LINE__
  2735. #endif
  2736. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  2737. adds L = -1, L // software copy of the remaining pass count
  2738. }
  2739. { .mfb
  2740. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2741. (p5) LDFD f111 = [C14], -1 * SIZE
  2742. #else
  2743. nop __LINE__
  2744. #endif
  2745. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  2746. br.cloop.sptk.few .L022 // counted by ar.lc
  2747. }
  2748. ;;
  2749. .L028:
  /*
   * .L028: write-back of the tile accumulated by the .L022 loop above.
   * The accumulators are f64-f67, f72-f75, f80-f83, f88-f91, f96-f99,
   * f104-f107, f112-f115 and f120-f123 (four registers for each of the
   * eight pointer pairs C1/C9 ... C8/C16).
   *
   * !TRMMKERNEL && !BETAZERO:
   *   the C values not yet fetched (f116-f119, f124-f127) are loaded
   *   through C7/C15/C8/C16, every accumulator is combined with ALPHA and
   *   its loaded C value by FMA, and the results are stored.
   * otherwise:
   *   the accumulators are only multiplied by ALPHA (FMPY) and stored;
   *   TRMMKERNEL builds use the integer slots to advance AOFFSET, BOFFSET,
   *   KK and KK8.
   *
   * Both paths end by copying f0 into f64, f72, ..., f120.
   *
   * NOTE(review): FMA/FMPY/LDFD/STFD, ALPHA, SIZE, BASE_SHIFT and the
   * C1..C16 / AOFFSET / BOFFSET / KK / KK8 / L names are defined outside
   * this chunk. "FMA d = a, b, c" is presumed to compute a * b + c, and
   * C9..C16 are presumed to address the same columns as C1..C8 two
   * elements further down -- confirm against the definitions.
   */
  2750. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  /* Load pairs use +SIZE then -SIZE post-increments, so each C pointer */
  /* is back at its starting value once both elements have been read. */
  2751. { .mfb
  2752. LDFD f116 = [C7 ], SIZE
  2753. FMA f64 = ALPHA, f64, f68
  2754. nop __LINE__
  2755. }
  2756. { .mfb
  2757. LDFD f118 = [C15], SIZE
  2758. FMA f66 = ALPHA, f66, f70
  2759. nop __LINE__
  2760. }
  2761. ;;
  2762. { .mfb
  2763. LDFD f117 = [C7 ], -1 * SIZE
  2764. FMA f65 = ALPHA, f65, f69
  2765. nop __LINE__
  2766. }
  2767. { .mfb
  2768. LDFD f119 = [C15], -1 * SIZE
  2769. FMA f67 = ALPHA, f67, f71
  2770. nop __LINE__
  2771. }
  2772. ;;
  2773. { .mfb
  2774. LDFD f124 = [C8], SIZE
  2775. FMA f72 = ALPHA, f72, f76
  2776. nop __LINE__
  2777. }
  2778. { .mfb
  2779. LDFD f126 = [C16], SIZE
  2780. FMA f74 = ALPHA, f74, f78
  2781. nop __LINE__
  2782. }
  2783. ;;
  2784. { .mfb
  2785. LDFD f125 = [C8], -1 * SIZE
  2786. FMA f73 = ALPHA, f73, f77
  2787. nop __LINE__
  2788. }
  2789. { .mfb
  2790. LDFD f127 = [C16], -1 * SIZE
  2791. FMA f75 = ALPHA, f75, f79
  2792. nop __LINE__
  2793. }
  2794. ;;
  /* Stores start here, interleaved with the remaining FMAs. Every pointer */
  /* is post-incremented by SIZE and then by 3 * SIZE, i.e. by four */
  /* elements in total once its two values have been written. */
  2795. { .mfb
  2796. STFD [C1 ] = f64, SIZE
  2797. FMA f80 = ALPHA, f80, f84
  2798. nop __LINE__
  2799. }
  2800. { .mfb
  2801. STFD [C9 ] = f66, SIZE
  2802. FMA f82 = ALPHA, f82, f86
  2803. nop __LINE__
  2804. }
  2805. ;;
  2806. { .mfb
  2807. STFD [C1 ] = f65, 3 * SIZE
  2808. FMA f81 = ALPHA, f81, f85
  2809. nop __LINE__
  2810. }
  2811. { .mfb
  2812. STFD [C9 ] = f67, 3 * SIZE
  2813. FMA f83 = ALPHA, f83, f87
  2814. nop __LINE__
  2815. }
  2816. ;;
  2817. { .mfb
  2818. STFD [C2 ] = f72, SIZE
  2819. FMA f88 = ALPHA, f88, f92
  2820. nop __LINE__
  2821. }
  2822. { .mfb
  2823. STFD [C10] = f74, SIZE
  2824. FMA f90 = ALPHA, f90, f94
  2825. nop __LINE__
  2826. }
  2827. ;;
  2828. { .mfb
  2829. STFD [C2 ] = f73, 3 * SIZE
  2830. FMA f89 = ALPHA, f89, f93
  2831. nop __LINE__
  2832. }
  2833. { .mfb
  2834. STFD [C10] = f75, 3 * SIZE
  2835. FMA f91 = ALPHA, f91, f95
  2836. nop __LINE__
  2837. }
  2838. ;;
  2839. { .mfb
  2840. STFD [C3 ] = f80, SIZE
  2841. FMA f96 = ALPHA, f96, f100
  2842. nop __LINE__
  2843. }
  2844. { .mfb
  2845. STFD [C11] = f82, SIZE
  2846. FMA f98 = ALPHA, f98, f102
  2847. nop __LINE__
  2848. }
  2849. ;;
  2850. { .mfb
  2851. STFD [C3 ] = f81, 3 * SIZE
  2852. FMA f97 = ALPHA, f97, f101
  2853. nop __LINE__
  2854. }
  2855. { .mfb
  2856. STFD [C11] = f83, 3 * SIZE
  2857. FMA f99 = ALPHA, f99, f103
  2858. nop __LINE__
  2859. }
  2860. ;;
  2861. { .mfb
  2862. STFD [C4 ] = f88, SIZE
  2863. FMA f104 = ALPHA, f104, f108
  2864. nop __LINE__
  2865. }
  2866. { .mfb
  2867. STFD [C12] = f90, SIZE
  2868. FMA f106 = ALPHA, f106, f110
  2869. nop __LINE__
  2870. }
  2871. ;;
  2872. { .mfb
  2873. STFD [C4 ] = f89, 3 * SIZE
  2874. FMA f105 = ALPHA, f105, f109
  2875. nop __LINE__
  2876. }
  2877. { .mfb
  2878. STFD [C12] = f91, 3 * SIZE
  2879. FMA f107 = ALPHA, f107, f111
  2880. nop __LINE__
  2881. }
  2882. ;;
  2883. { .mfb
  2884. STFD [C5 ] = f96, SIZE
  2885. FMA f112 = ALPHA, f112, f116
  2886. nop __LINE__
  2887. }
  2888. { .mfb
  2889. STFD [C13] = f98, SIZE
  2890. FMA f114 = ALPHA, f114, f118
  2891. nop __LINE__
  2892. }
  2893. ;;
  2894. { .mfb
  2895. STFD [C5 ] = f97, 3 * SIZE
  2896. FMA f113 = ALPHA, f113, f117
  2897. nop __LINE__
  2898. }
  2899. { .mfb
  2900. STFD [C13] = f99, 3 * SIZE
  2901. FMA f115 = ALPHA, f115, f119
  2902. nop __LINE__
  2903. }
  2904. ;;
  2905. { .mfb
  2906. STFD [C6 ] = f104, SIZE
  2907. FMA f120 = ALPHA, f120, f124
  2908. nop __LINE__
  2909. }
  2910. { .mfb
  2911. STFD [C14] = f106, SIZE
  2912. FMA f122 = ALPHA, f122, f126
  2913. nop __LINE__
  2914. }
  2915. ;;
  2916. { .mfb
  2917. STFD [C6 ] = f105, 3 * SIZE
  2918. FMA f121 = ALPHA, f121, f125
  2919. nop __LINE__
  2920. }
  2921. { .mfb
  2922. STFD [C14] = f107, 3 * SIZE
  2923. FMA f123 = ALPHA, f123, f127
  2924. nop __LINE__
  2925. }
  2926. ;;
  /* All FMAs are issued; the F slots now reset f64, f72, ..., f120 from */
  /* f0 (the IA-64 constant +0.0 register) while the last stores drain. */
  2927. { .mfb
  2928. STFD [C7 ] = f112, SIZE
  2929. mov f64 = f0
  2930. nop __LINE__
  2931. }
  2932. { .mfb
  2933. STFD [C15] = f114, SIZE
  2934. mov f72 = f0
  2935. nop __LINE__
  2936. }
  2937. ;;
  2938. { .mfb
  2939. STFD [C7 ] = f113, 3 * SIZE
  2940. mov f80 = f0
  2941. nop __LINE__
  2942. }
  2943. { .mfb
  2944. STFD [C15] = f115, 3 * SIZE
  2945. mov f88 = f0
  2946. nop __LINE__
  2947. }
  2948. ;;
  2949. { .mfb
  2950. STFD [C8 ] = f120, SIZE
  2951. mov f96 = f0
  2952. nop __LINE__
  2953. }
  2954. { .mfb
  2955. STFD [C16] = f122, SIZE
  2956. mov f104 = f0
  2957. nop __LINE__
  2958. }
  2959. ;;
  2960. { .mfb
  2961. STFD [C8 ] = f121, 3 * SIZE
  2962. mov f112 = f0
  2963. nop __LINE__
  2964. }
  2965. { .mfb
  2966. STFD [C16] = f123, 3 * SIZE
  2967. mov f120 = f0
  2968. nop __LINE__
  2969. }
  2970. ;;
  2971. #else
  /* TRMMKERNEL or BETAZERO build: no C values are read, the accumulators */
  /* are only multiplied by ALPHA. The first eight bundles list just the */
  /* FMPY and one nop each. */
  2972. { .mfb
  2973. FMPY f64 = ALPHA, f64
  2974. nop __LINE__
  2975. }
  2976. { .mfb
  2977. FMPY f66 = ALPHA, f66
  2978. nop __LINE__
  2979. }
  2980. ;;
  2981. { .mfb
  2982. FMPY f65 = ALPHA, f65
  2983. nop __LINE__
  2984. }
  2985. { .mfb
  2986. FMPY f67 = ALPHA, f67
  2987. nop __LINE__
  2988. }
  2989. ;;
  2990. { .mfb
  2991. FMPY f72 = ALPHA, f72
  2992. nop __LINE__
  2993. }
  2994. { .mfb
  2995. FMPY f74 = ALPHA, f74
  2996. nop __LINE__
  2997. }
  2998. ;;
  2999. { .mfb
  3000. FMPY f73 = ALPHA, f73
  3001. nop __LINE__
  3002. }
  3003. { .mfb
  3004. FMPY f75 = ALPHA, f75
  3005. nop __LINE__
  3006. }
  3007. ;;
  /* Stores begin; same SIZE / 3 * SIZE post-increment pattern as the */
  /* beta path, so each pointer moves on by four elements. */
  3008. { .mfb
  3009. STFD [C1 ] = f64, SIZE
  3010. FMPY f80 = ALPHA, f80
  3011. nop __LINE__
  3012. }
  3013. { .mfb
  3014. STFD [C9 ] = f66, SIZE
  3015. FMPY f82 = ALPHA, f82
  3016. nop __LINE__
  3017. }
  3018. ;;
  3019. { .mfb
  3020. STFD [C1 ] = f65, 3 * SIZE
  3021. FMPY f81 = ALPHA, f81
  3022. nop __LINE__
  3023. }
  3024. { .mfb
  3025. STFD [C9 ] = f67, 3 * SIZE
  3026. FMPY f83 = ALPHA, f83
  3027. nop __LINE__
  3028. }
  3029. ;;
  3030. { .mfb
  3031. STFD [C2 ] = f72, SIZE
  3032. FMPY f88 = ALPHA, f88
  3033. nop __LINE__
  3034. }
  3035. { .mfb
  3036. STFD [C10] = f74, SIZE
  3037. FMPY f90 = ALPHA, f90
  3038. nop __LINE__
  3039. }
  3040. ;;
  3041. { .mfb
  3042. STFD [C2 ] = f73, 3 * SIZE
  3043. FMPY f89 = ALPHA, f89
  3044. nop __LINE__
  3045. }
  3046. { .mfb
  3047. STFD [C10] = f75, 3 * SIZE
  3048. FMPY f91 = ALPHA, f91
  3049. nop __LINE__
  3050. }
  3051. ;;
  3052. { .mfb
  3053. STFD [C3 ] = f80, SIZE
  3054. FMPY f96 = ALPHA, f96
  3055. nop __LINE__
  3056. }
  3057. { .mfb
  3058. STFD [C11] = f82, SIZE
  3059. FMPY f98 = ALPHA, f98
  3060. nop __LINE__
  3061. }
  3062. ;;
  3063. { .mfb
  3064. STFD [C3 ] = f81, 3 * SIZE
  3065. FMPY f97 = ALPHA, f97
  3066. nop __LINE__
  3067. }
  3068. { .mfb
  3069. STFD [C11] = f83, 3 * SIZE
  3070. FMPY f99 = ALPHA, f99
  3071. nop __LINE__
  3072. }
  3073. ;;
  3074. { .mfb
  3075. STFD [C4 ] = f88, SIZE
  3076. FMPY f104 = ALPHA, f104
  3077. nop __LINE__
  3078. }
  3079. { .mfb
  3080. STFD [C12] = f90, SIZE
  3081. FMPY f106 = ALPHA, f106
  3082. nop __LINE__
  3083. }
  3084. ;;
  3085. { .mfb
  3086. STFD [C4 ] = f89, 3 * SIZE
  3087. FMPY f105 = ALPHA, f105
  3088. nop __LINE__
  3089. }
  3090. { .mfb
  3091. STFD [C12] = f91, 3 * SIZE
  3092. FMPY f107 = ALPHA, f107
  3093. nop __LINE__
  3094. }
  3095. ;;
  3096. { .mfb
  3097. STFD [C5 ] = f96, SIZE
  3098. FMPY f112 = ALPHA, f112
  3099. nop __LINE__
  3100. }
  3101. { .mfb
  3102. STFD [C13] = f98, SIZE
  3103. FMPY f114 = ALPHA, f114
  3104. nop __LINE__
  3105. }
  3106. ;;
  3107. { .mfb
  3108. STFD [C5 ] = f97, 3 * SIZE
  3109. FMPY f113 = ALPHA, f113
  3110. nop __LINE__
  3111. }
  3112. { .mfb
  3113. STFD [C13] = f99, 3 * SIZE
  3114. FMPY f115 = ALPHA, f115
  3115. nop __LINE__
  3116. }
  3117. ;;
  /*
   * From here on the integer slot carries the TRMM pointer bookkeeping.
   * For (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *   L       = K - KK, then minus 4 (LEFT && TRANSA) or minus 8
   *             (!LEFT && !TRANSA)
   *   KK8     = L << BASE_SHIFT
   *   AOFFSET = AOFFSET + (KK8 << 2)
   *   BOFFSET = BOFFSET + (KK8 << 3)
   * then, for LEFT, KK is advanced by 4, and for every TRMMKERNEL build
   * KK8 is recomputed as KK << BASE_SHIFT.
   * In non-TRMM builds all of these slots are nops.
   */
  3118. { .mfi
  3119. STFD [C6 ] = f104, SIZE
  3120. FMPY f120 = ALPHA, f120
  3121. #if defined(TRMMKERNEL) && \
  3122. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3123. sub L = K, KK
  3124. #else
  3125. nop __LINE__
  3126. #endif
  3127. }
  3128. { .mfb
  3129. STFD [C14] = f106, SIZE
  3130. FMPY f122 = ALPHA, f122
  3131. nop __LINE__
  3132. }
  3133. ;;
  3134. { .mfi
  3135. STFD [C6 ] = f105, 3 * SIZE
  3136. FMPY f121 = ALPHA, f121
  3137. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  3138. adds L = -4, L
  3139. #else
  3140. nop __LINE__
  3141. #endif
  3142. }
  3143. { .mfi
  3144. STFD [C14] = f107, 3 * SIZE
  3145. FMPY f123 = ALPHA, f123
  3146. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  3147. adds L = -8, L
  3148. #else
  3149. nop __LINE__
  3150. #endif
  3151. }
  3152. ;;
  /* Scaling is finished; the F slots now reset f64, f72, ..., f120 to f0. */
  3153. { .mfi
  3154. STFD [C7 ] = f112, SIZE
  3155. mov f64 = f0
  3156. #if defined(TRMMKERNEL) && \
  3157. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3158. shladd KK8 = L, BASE_SHIFT, r0
  3159. #else
  3160. nop __LINE__
  3161. #endif
  3162. }
  3163. { .mfb
  3164. STFD [C15] = f114, SIZE
  3165. mov f72 = f0
  3166. nop __LINE__
  3167. }
  3168. ;;
  3169. { .mfi
  3170. STFD [C7 ] = f113, 3 * SIZE
  3171. mov f80 = f0
  3172. #if defined(TRMMKERNEL) && \
  3173. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3174. shladd AOFFSET = KK8, 2, AOFFSET
  3175. #else
  3176. nop __LINE__
  3177. #endif
  3178. }
  3179. { .mfi
  3180. STFD [C15] = f115, 3 * SIZE
  3181. mov f88 = f0
  3182. #if defined(TRMMKERNEL) && \
  3183. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3184. shladd BOFFSET = KK8, 3, BOFFSET
  3185. #else
  3186. nop __LINE__
  3187. #endif
  3188. }
  3189. ;;
  3190. { .mfi
  3191. STFD [C8 ] = f120, SIZE
  3192. mov f96 = f0
  3193. #if defined(TRMMKERNEL) && defined(LEFT)
  3194. adds KK = 4, KK
  3195. #else
  3196. nop __LINE__
  3197. #endif
  3198. }
  3199. { .mfb
  3200. STFD [C16] = f122, SIZE
  3201. mov f104 = f0
  3202. nop __LINE__
  3203. }
  3204. ;;
  3205. { .mfi
  3206. STFD [C8 ] = f121, 3 * SIZE
  3207. mov f112 = f0
  3208. #ifdef TRMMKERNEL
  3209. shladd KK8 = KK, BASE_SHIFT, r0
  3210. #else
  3211. nop __LINE__
  3212. #endif
  3213. }
  3214. { .mfb
  3215. STFD [C16] = f123, 3 * SIZE
  3216. mov f120 = f0
  3217. nop __LINE__
  3218. }
  3219. ;;
  3220. #endif
  3221. .align 32
  /*
   * .L030: entry for the two-row remainder of the current 8-column panel.
   * "tbit.z p6, p7 = M, 1" sets p6 when bit 1 of M is clear, in which case
   * the code branches straight to .L040; p7 is the complement and guards
   * the first A load below.
   *
   * TRMMKERNEL builds first derive the loop length L:
   *   (LEFT && !TRANSA) or (!LEFT && TRANSA) : L = K - KK
   *   otherwise, LEFT                        : L = KK + 2
   *   otherwise                              : L = KK + 8
   *
   * The rest of the block preloads B values f48-f55 and the A pair
   * f32/f33, clears f65, f73, ..., f121, sets up the PREA/PREB prefetch
   * pointers and programs ar.lc for the .L032 loop.
   */
  3222. .L030:
  3223. { .mib
  3224. #ifndef TRMMKERNEL
  3225. nop __LINE__
  3226. #else
  3227. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3228. sub L = K, KK
  3229. #elif defined(LEFT)
  3230. adds L = 2, KK
  3231. #else
  3232. adds L = 8, KK
  3233. #endif
  3234. #endif
  3235. tbit.z p6, p7 = M, 1
  3236. (p6) br.cond.dptk .L040
  3237. }
  3238. ;;
  /* Variant 1: B is read from its start (BOFFSET = B + 2 * SIZE after the */
  /* first pair). Variant 2 (remaining TRMM cases): BOFFSET = B + (KK8 << 3) */
  /* and AOFFSET is advanced by KK8 << 1 before the first load. */
  /* In both variants L becomes K + 1 (GEMM) or L + 1 (TRMM). */
  3239. #if !defined(TRMMKERNEL) || \
  3240. defined(TRMMKERNEL) && \
  3241. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3242. { .mfi
  3243. LDFPD f48, f49 = [B]
  3244. mov f65 = f0
  3245. nop __LINE__
  3246. }
  3247. { .mfi
  3248. adds BOFFSET = 2 * SIZE, B
  3249. mov f73 = f0
  3250. #ifndef TRMMKERNEL
  3251. adds L = 1, K
  3252. #else
  3253. adds L = 1, L
  3254. #endif
  3255. }
  3256. #else
  3257. { .mmf
  3258. shladd BOFFSET = KK8, 3, B
  3259. shladd AOFFSET = KK8, 1, AOFFSET
  3260. mov f65 = f0
  3261. }
  3262. ;;
  3263. { .mfi
  3264. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3265. mov f73 = f0
  3266. #ifndef TRMMKERNEL
  3267. adds L = 1, K
  3268. #else
  3269. adds L = 1, L
  3270. #endif
  3271. }
  3272. #endif
  3273. ;;
  /* p12 = (bit 0 of the incremented L is clear); it is consumed inside */
  /* the loop to switch off the second half of the final pass. */
  /* L is then halved and decremented before being copied into ar.lc. */
  3274. { .mfi
  3275. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3276. mov f81 = f0
  3277. tbit.z p12, p0 = L, 0
  3278. }
  3279. { .mfi
  3280. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3281. mov f89 = f0
  3282. shr L = L, 1
  3283. }
  3284. ;;
  3285. { .mfi
  3286. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3287. mov f97 = f0
  3288. adds L = -1, L
  3289. }
  3290. { .mfi
  3291. nop __LINE__
  3292. mov f105 = f0
  3293. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  3294. }
  3295. ;;
  /* cmp.eq r0, r0 is always true, so p3 starts out set. */
  3296. { .mfi
  3297. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  3298. mov f113 = f0
  3299. mov ar.lc = L
  3300. }
  3301. { .mfi
  3302. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3303. mov f121 = f0
  3304. cmp.eq p3, p0 = r0, r0
  3305. }
  3306. ;;
  3307. .align 32
  /*
   * .L032: inner loop for the two-row by eight-column tile, closed by
   * br.cloop on ar.lc. One pass performs two rank-1 updates:
   *
   *   first half  (unpredicated): A pair f32/f33 times B values f48-f55
   *   second half (under p3)    : A pair f40/f41 times B values f56-f63
   *
   * Accumulators: f64, f72, ..., f120 for the first A element and
   * f65, f73, ..., f121 for the second.
   *
   * Predicates used inside the loop:
   *   p3  - initially true; rewritten to (L != 0) only when p12 is set,
   *         so in that case the second half is skipped on the pass where
   *         L has reached 0.
   *   p4  - (L != 0): reload f32/f33 and f48-f55 for the next pass.
   *   p5  - (L == 0): in !TRMMKERNEL && !BETAZERO builds, preload the C
   *         values f68/f69, f76/f77, f84/f85, f92/f93 through C1..C4
   *         (+SIZE then -SIZE, leaving the pointers unchanged).
   * L is decremented once per pass in the integer slot near the end.
   */
  3308. .L032:
  3309. { .mfb
  3310. lfetch.nt1 [PREA], 4 * SIZE
  3311. FMA f64 = f32, f48, f64 // A1 * B1
  3312. nop __LINE__
  3313. }
  3314. { .mfi
  3315. nop __LINE__
  3316. FMA f72 = f32, f49, f72 // A1 * B2
  3317. (p12) cmp.ne p3, p0 = 0, L
  3318. }
  3319. ;;
  3320. { .mfi
  3321. lfetch.nt1 [PREB], 16 * SIZE
  3322. FMA f80 = f32, f50, f80 // A1 * B3
  3323. cmp.ne p4, p5 = 0, L
  3324. }
  3325. { .mfb
  3326. nop __LINE__
  3327. FMA f88 = f32, f51, f88 // A1 * B4
  3328. nop __LINE__
  3329. }
  3330. ;;
  /* Operands for the second half (f56-f63, f40/f41) are fetched under p3. */
  3331. { .mfb
  3332. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3333. FMA f96 = f32, f52, f96 // A1 * B5
  3334. nop __LINE__
  3335. }
  3336. { .mfb
  3337. nop __LINE__
  3338. FMA f104 = f32, f53, f104 // A1 * B6
  3339. nop __LINE__
  3340. }
  3341. ;;
  3342. { .mfb
  3343. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  3344. FMA f112 = f32, f54, f112 // A1 * B7
  3345. nop __LINE__
  3346. }
  3347. { .mfb
  3348. nop __LINE__
  3349. FMA f120 = f32, f55, f120 // A1 * B8
  3350. nop __LINE__
  3351. }
  3352. ;;
  3353. { .mfb
  3354. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3355. FMA f65 = f33, f48, f65 // A2 * B1
  3356. nop __LINE__
  3357. }
  3358. { .mfb
  3359. nop __LINE__
  3360. FMA f73 = f33, f49, f73 // A2 * B2
  3361. nop __LINE__
  3362. }
  3363. ;;
  3364. { .mfb
  3365. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3366. FMA f81 = f33, f50, f81 // A2 * B3
  3367. nop __LINE__
  3368. }
  3369. { .mfb
  3370. nop __LINE__
  3371. FMA f89 = f33, f51, f89 // A2 * B4
  3372. nop __LINE__
  3373. }
  3374. ;;
  3375. { .mfb
  3376. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3377. FMA f97 = f33, f52, f97 // A2 * B5
  3378. nop __LINE__
  3379. }
  3380. { .mfb
  3381. nop __LINE__
  3382. FMA f105 = f33, f53, f105 // A2 * B6
  3383. nop __LINE__
  3384. }
  3385. ;;
  3386. { .mfb
  3387. nop __LINE__
  3388. FMA f113 = f33, f54, f113 // A2 * B7
  3389. nop __LINE__
  3390. }
  3391. { .mfb
  3392. nop __LINE__
  3393. FMA f121 = f33, f55, f121 // A2 * B8
  3394. nop __LINE__
  3395. }
  3396. ;;
  /* Second half: FMAs under p3, next-pass reloads under p4, and the */
  /* final-pass C preloads under p5. */
  3397. { .mfb
  3398. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3399. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3400. nop __LINE__
  3401. }
  3402. { .mfb
  3403. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3404. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3405. nop __LINE__
  3406. }
  3407. ;;
  3408. { .mfb
  3409. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3410. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3411. nop __LINE__
  3412. }
  3413. { .mfb
  3414. nop __LINE__
  3415. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3416. nop __LINE__
  3417. }
  3418. ;;
  3419. { .mfb
  3420. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3421. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3422. nop __LINE__
  3423. }
  3424. { .mfb
  3425. nop __LINE__
  3426. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  3427. nop __LINE__
  3428. }
  3429. ;;
  3430. { .mfb
  3431. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3432. (p5) LDFD f68 = [C1], SIZE
  3433. #else
  3434. nop __LINE__
  3435. #endif
  3436. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  3437. nop __LINE__
  3438. }
  3439. { .mfb
  3440. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3441. (p5) LDFD f76 = [C2], SIZE
  3442. #else
  3443. nop __LINE__
  3444. #endif
  3445. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  3446. nop __LINE__
  3447. }
  3448. ;;
  /* NOTE(review): the second bundle below lists only the FMA and one nop */
  /* for its .mfb template, and no stop (;;) follows it, so the next four */
  /* bundles form a single instruction group. No register written in that */
  /* group is read or rewritten within it, so this looks harmless; it */
  /* differs from the two-bundles-per-group layout used elsewhere. */
  3449. { .mfb
  3450. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3451. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  3452. nop __LINE__
  3453. }
  3454. { .mfb
  3455. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  3456. nop __LINE__
  3457. }
  3458. { .mfb
  3459. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3460. (p5) LDFD f69 = [C1], -1 * SIZE
  3461. #else
  3462. nop __LINE__
  3463. #endif
  3464. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  3465. nop __LINE__
  3466. }
  3467. { .mfb
  3468. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3469. (p5) LDFD f77 = [C2], -1 * SIZE
  3470. #else
  3471. nop __LINE__
  3472. #endif
  3473. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  3474. nop __LINE__
  3475. }
  3476. ;;
  3477. { .mfb
  3478. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3479. (p5) LDFD f84 = [C3], SIZE
  3480. #else
  3481. nop __LINE__
  3482. #endif
  3483. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  3484. nop __LINE__
  3485. }
  3486. { .mfb
  3487. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3488. (p5) LDFD f92 = [C4], SIZE
  3489. #else
  3490. nop __LINE__
  3491. #endif
  3492. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  3493. nop __LINE__
  3494. }
  3495. ;;
  /* L counts down once per pass; br.cloop repeats while ar.lc is non-zero. */
  3496. { .mfi
  3497. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3498. (p5) LDFD f85 = [C3], -1 * SIZE
  3499. #else
  3500. nop __LINE__
  3501. #endif
  3502. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  3503. adds L = -1, L
  3504. }
  3505. { .mfb
  3506. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3507. (p5) LDFD f93 = [C4], -1 * SIZE
  3508. #else
  3509. nop __LINE__
  3510. #endif
  3511. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  3512. br.cloop.sptk.few .L032
  3513. }
  3514. ;;
  3515. .L038:
  /*
   * .L038: write-back for the two-row by eight-column tile produced by
   * .L032 (accumulator pairs f64/f65, f72/f73, ..., f120/f121, stored
   * through C1..C8).
   *
   * !TRMMKERNEL && !BETAZERO:
   *   C values for C5..C8 (f100/f101, f108/f109, f116/f117, f124/f125)
   *   are loaded here; those for C1..C4 were loaded inside the loop.
   *   Each accumulator is combined with ALPHA and its C value by FMA.
   * otherwise:
   *   accumulators are only multiplied by ALPHA (FMPY); TRMMKERNEL builds
   *   additionally update AOFFSET, BOFFSET, KK and KK8 in the integer
   *   slots.
   *
   * Every pointer is stored through twice with a SIZE post-increment, so
   * it ends two elements further on. f64, f72, ..., f120 are reset to f0.
   *
   * NOTE(review): "FMA d = a, b, c" is presumed to compute a * b + c; the
   * macro is defined outside this chunk -- confirm.
   */
  3516. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3517. { .mfb
  3518. LDFD f100 = [C5], SIZE
  3519. FMA f64 = ALPHA, f64, f68
  3520. nop __LINE__
  3521. }
  3522. { .mfb
  3523. LDFD f108 = [C6], SIZE
  3524. FMA f65 = ALPHA, f65, f69
  3525. nop __LINE__
  3526. }
  3527. ;;
  3528. { .mfb
  3529. LDFD f101 = [C5], -1 * SIZE
  3530. FMA f72 = ALPHA, f72, f76
  3531. nop __LINE__
  3532. }
  3533. { .mfb
  3534. LDFD f109 = [C6], -1 * SIZE
  3535. FMA f73 = ALPHA, f73, f77
  3536. nop __LINE__
  3537. }
  3538. ;;
  3539. { .mfb
  3540. LDFD f116 = [C7], SIZE
  3541. FMA f80 = ALPHA, f80, f84
  3542. nop __LINE__
  3543. }
  3544. { .mfb
  3545. LDFD f124 = [C8], SIZE
  3546. FMA f81 = ALPHA, f81, f85
  3547. nop __LINE__
  3548. }
  3549. ;;
  3550. { .mfb
  3551. LDFD f117 = [C7], -1 * SIZE
  3552. FMA f88 = ALPHA, f88, f92
  3553. nop __LINE__
  3554. }
  3555. { .mfb
  3556. LDFD f125 = [C8], -1 * SIZE
  3557. FMA f89 = ALPHA, f89, f93
  3558. nop __LINE__
  3559. }
  3560. ;;
  /* Stores begin, overlapped with the remaining FMAs. */
  3561. { .mfb
  3562. STFD [C1 ] = f64, SIZE
  3563. FMA f96 = ALPHA, f96, f100
  3564. nop __LINE__
  3565. }
  3566. { .mfb
  3567. STFD [C2 ] = f72, SIZE
  3568. FMA f104 = ALPHA, f104, f108
  3569. nop __LINE__
  3570. }
  3571. ;;
  3572. { .mfb
  3573. STFD [C1 ] = f65, SIZE
  3574. FMA f97 = ALPHA, f97, f101
  3575. nop __LINE__
  3576. }
  3577. { .mfb
  3578. STFD [C2 ] = f73, SIZE
  3579. FMA f105 = ALPHA, f105, f109
  3580. nop __LINE__
  3581. }
  3582. ;;
  3583. { .mfb
  3584. STFD [C3 ] = f80, SIZE
  3585. FMA f112 = ALPHA, f112, f116
  3586. nop __LINE__
  3587. }
  3588. { .mfb
  3589. STFD [C4 ] = f88, SIZE
  3590. FMA f120 = ALPHA, f120, f124
  3591. nop __LINE__
  3592. }
  3593. ;;
  3594. { .mfb
  3595. STFD [C3 ] = f81, SIZE
  3596. FMA f113 = ALPHA, f113, f117
  3597. nop __LINE__
  3598. }
  3599. { .mfb
  3600. STFD [C4 ] = f89, SIZE
  3601. FMA f121 = ALPHA, f121, f125
  3602. nop __LINE__
  3603. }
  3604. ;;
  /* Remaining stores; the F slots reset f64, f72, ..., f120 from f0. */
  3605. { .mfb
  3606. STFD [C5 ] = f96, SIZE
  3607. mov f64 = f0
  3608. nop __LINE__
  3609. }
  3610. { .mfb
  3611. STFD [C6 ] = f104, SIZE
  3612. mov f72 = f0
  3613. nop __LINE__
  3614. }
  3615. ;;
  3616. { .mfb
  3617. STFD [C5 ] = f97, SIZE
  3618. mov f80 = f0
  3619. nop __LINE__
  3620. }
  3621. { .mfb
  3622. STFD [C6 ] = f105, SIZE
  3623. mov f88 = f0
  3624. nop __LINE__
  3625. }
  3626. ;;
  3627. { .mfb
  3628. STFD [C7 ] = f112, SIZE
  3629. mov f96 = f0
  3630. nop __LINE__
  3631. }
  3632. { .mfb
  3633. STFD [C8 ] = f120, SIZE
  3634. mov f104 = f0
  3635. nop __LINE__
  3636. }
  3637. ;;
  3638. { .mfb
  3639. STFD [C7 ] = f113, SIZE
  3640. mov f112 = f0
  3641. nop __LINE__
  3642. }
  3643. { .mfb
  3644. STFD [C8 ] = f121, SIZE
  3645. mov f120 = f0
  3646. nop __LINE__
  3647. }
  3648. ;;
  3649. #else
  /* TRMMKERNEL or BETAZERO build: scale by ALPHA only, no C reads. */
  3650. { .mfb
  3651. nop __LINE__
  3652. FMPY f64 = ALPHA, f64
  3653. nop __LINE__
  3654. }
  3655. { .mfb
  3656. nop __LINE__
  3657. FMPY f65 = ALPHA, f65
  3658. nop __LINE__
  3659. }
  3660. ;;
  3661. { .mfb
  3662. nop __LINE__
  3663. FMPY f72 = ALPHA, f72
  3664. nop __LINE__
  3665. }
  3666. { .mfb
  3667. nop __LINE__
  3668. FMPY f73 = ALPHA, f73
  3669. nop __LINE__
  3670. }
  3671. ;;
  3672. { .mfb
  3673. nop __LINE__
  3674. FMPY f80 = ALPHA, f80
  3675. nop __LINE__
  3676. }
  3677. { .mfb
  3678. nop __LINE__
  3679. FMPY f81 = ALPHA, f81
  3680. nop __LINE__
  3681. }
  3682. ;;
  3683. { .mfb
  3684. nop __LINE__
  3685. FMPY f88 = ALPHA, f88
  3686. nop __LINE__
  3687. }
  3688. { .mfb
  3689. nop __LINE__
  3690. FMPY f89 = ALPHA, f89
  3691. nop __LINE__
  3692. }
  3693. ;;
  3694. { .mfb
  3695. STFD [C1 ] = f64, SIZE
  3696. FMPY f96 = ALPHA, f96
  3697. nop __LINE__
  3698. }
  3699. { .mfb
  3700. STFD [C2 ] = f72, SIZE
  3701. FMPY f104 = ALPHA, f104
  3702. nop __LINE__
  3703. }
  3704. ;;
  3705. { .mfb
  3706. STFD [C1 ] = f65, SIZE
  3707. FMPY f97 = ALPHA, f97
  3708. nop __LINE__
  3709. }
  3710. { .mfb
  3711. STFD [C2 ] = f73, SIZE
  3712. FMPY f105 = ALPHA, f105
  3713. nop __LINE__
  3714. }
  3715. ;;
  /*
   * TRMM pointer bookkeeping in the integer slots, for
   * (LEFT && TRANSA) or (!LEFT && !TRANSA):
   *   L       = K - KK, then minus 2 (LEFT && TRANSA) or minus 8
   *             (!LEFT && !TRANSA)
   *   KK8     = L << BASE_SHIFT
   *   AOFFSET = AOFFSET + (KK8 << 1)
   *   BOFFSET = BOFFSET + (KK8 << 3)
   * then KK += 2 for LEFT, and KK8 = KK << BASE_SHIFT for every
   * TRMMKERNEL build. Non-TRMM builds get nops instead.
   */
  3716. { .mfi
  3717. STFD [C3 ] = f80, SIZE
  3718. FMPY f112 = ALPHA, f112
  3719. #if defined(TRMMKERNEL) && \
  3720. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3721. sub L = K, KK
  3722. #else
  3723. nop __LINE__
  3724. #endif
  3725. }
  3726. { .mfb
  3727. STFD [C4 ] = f88, SIZE
  3728. FMPY f120 = ALPHA, f120
  3729. nop __LINE__
  3730. }
  3731. ;;
  3732. { .mfi
  3733. STFD [C3 ] = f81, SIZE
  3734. FMPY f113 = ALPHA, f113
  3735. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  3736. adds L = -2, L
  3737. #else
  3738. nop __LINE__
  3739. #endif
  3740. }
  3741. { .mfi
  3742. STFD [C4 ] = f89, SIZE
  3743. FMPY f121 = ALPHA, f121
  3744. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  3745. adds L = -8, L
  3746. #else
  3747. nop __LINE__
  3748. #endif
  3749. }
  3750. ;;
  3751. { .mfi
  3752. STFD [C5 ] = f96, SIZE
  3753. mov f64 = f0
  3754. #if defined(TRMMKERNEL) && \
  3755. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3756. shladd KK8 = L, BASE_SHIFT, r0
  3757. #else
  3758. nop __LINE__
  3759. #endif
  3760. }
  3761. { .mfb
  3762. STFD [C6 ] = f104, SIZE
  3763. mov f72 = f0
  3764. nop __LINE__
  3765. }
  3766. ;;
  3767. { .mfi
  3768. STFD [C5 ] = f97, SIZE
  3769. mov f80 = f0
  3770. #if defined(TRMMKERNEL) && \
  3771. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3772. shladd AOFFSET = KK8, 1, AOFFSET
  3773. #else
  3774. nop __LINE__
  3775. #endif
  3776. }
  3777. { .mfi
  3778. STFD [C6 ] = f105, SIZE
  3779. mov f88 = f0
  3780. #if defined(TRMMKERNEL) && \
  3781. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3782. shladd BOFFSET = KK8, 3, BOFFSET
  3783. #else
  3784. nop __LINE__
  3785. #endif
  3786. }
  3787. ;;
  3788. { .mfi
  3789. STFD [C7 ] = f112, SIZE
  3790. mov f96 = f0
  3791. #if defined(TRMMKERNEL) && defined(LEFT)
  3792. adds KK = 2, KK
  3793. #else
  3794. nop __LINE__
  3795. #endif
  3796. }
  3797. { .mfb
  3798. STFD [C8 ] = f120, SIZE
  3799. mov f104 = f0
  3800. nop __LINE__
  3801. }
  3802. ;;
  3803. { .mfi
  3804. STFD [C7 ] = f113, SIZE
  3805. mov f112 = f0
  3806. #ifdef TRMMKERNEL
  3807. shladd KK8 = KK, BASE_SHIFT, r0
  3808. #else
  3809. nop __LINE__
  3810. #endif
  3811. }
  3812. { .mfb
  3813. STFD [C8 ] = f121, SIZE
  3814. mov f120 = f0
  3815. nop __LINE__
  3816. }
  3817. ;;
  3818. #endif
  3819. .align 32
  /*
   * .L040: entry for the single-row remainder of the current 8-column
   * panel. "tbit.z p6, p7 = M, 0" sets p6 when bit 0 of M is clear, in
   * which case the code branches to .L049.
   *
   * TRMMKERNEL builds first derive the loop length L:
   *   (LEFT && !TRANSA) or (!LEFT && TRANSA) : L = K - KK
   *   otherwise, LEFT                        : L = KK + 1
   *   otherwise                              : L = KK + 8
   *
   * The rest preloads B values f48-f55 and one A element f32, sets up
   * PREA/PREB and programs ar.lc for .L042. No accumulator is cleared
   * here; the write-back sections visible above leave f64, f72, ..., f120
   * equal to f0.
   */
  3820. .L040:
  3821. { .mib
  3822. #ifndef TRMMKERNEL
  3823. nop __LINE__
  3824. #else
  3825. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3826. sub L = K, KK
  3827. #elif defined(LEFT)
  3828. adds L = 1, KK
  3829. #else
  3830. adds L = 8, KK
  3831. #endif
  3832. #endif
  3833. tbit.z p6, p7 = M, 0
  3834. (p6) br.cond.dptk .L049
  3835. }
  3836. ;;
  /* Variant 1: B is read from its start. Variant 2 (remaining TRMM */
  /* cases): BOFFSET = B + (KK8 << 3) and AOFFSET = AOFFSET + KK8. */
  /* In both variants L becomes K + 1 (GEMM) or L + 1 (TRMM). */
  3837. #if !defined(TRMMKERNEL) || \
  3838. defined(TRMMKERNEL) && \
  3839. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3840. { .mmi
  3841. LDFPD f48, f49 = [B]
  3842. adds BOFFSET = 2 * SIZE, B
  3843. #ifndef TRMMKERNEL
  3844. adds L = 1, K
  3845. #else
  3846. adds L = 1, L
  3847. #endif
  3848. }
  3849. #else
  3850. { .mmi
  3851. shladd BOFFSET = KK8, 3, B
  3852. add AOFFSET = KK8, AOFFSET
  3853. nop __LINE__
  3854. }
  3855. ;;
  3856. { .mmi
  3857. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3858. nop __LINE__
  3859. #ifndef TRMMKERNEL
  3860. adds L = 1, K
  3861. #else
  3862. adds L = 1, L
  3863. #endif
  3864. }
  3865. #endif
  3866. ;;
  /* p12 = (bit 0 of the incremented L is clear); L is then halved and */
  /* decremented before being copied into ar.lc. */
  3867. { .mii
  3868. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3869. tbit.z p12, p0 = L, 0
  3870. shr L = L, 1
  3871. }
  3872. ;;
  3873. { .mmi
  3874. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3875. LDFD f32 = [AOFFSET], 1 * SIZE
  3876. adds L = -1, L
  3877. }
  3878. ;;
  /* cmp.eq r0, r0 is always true, so p3 starts out set. */
  3879. { .mmi
  3880. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  3881. cmp.eq p3, p0 = r0, r0
  3882. mov ar.lc = L
  3883. }
  3884. { .mmi
  3885. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3886. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  3887. nop __LINE__
  3888. }
  3889. ;;
  3890. .align 32
  /*
   * .L042: inner loop and write-back for the one-row by eight-column tile.
   *
   * Loop (closed by br.cloop on ar.lc), two rank-1 updates per pass:
   *   first half  (unpredicated): A element f32 times B values f48-f55
   *   second half (under p3)    : A element f40 times B values f56-f63
   * accumulating into f64, f72, f80, f88, f96, f104, f112 and f120.
   *
   * Predicates:
   *   p3  - initially true; rewritten to (L != 0) only when p12 is set,
   *         which disables the second half on the pass where L is 0.
   *   p4  - (L != 0): reload f32 and f48-f55 for the next pass.
   *   p5  - (L == 0): in !TRMMKERNEL && !BETAZERO builds, load the eight
   *         C values f68, f76, ..., f124 through C1..C8 (no post-increment).
   *
   * After the loop the results are combined with ALPHA (FMA with the
   * loaded C value, or FMPY alone for TRMMKERNEL / BETAZERO), stored with
   * a SIZE post-increment, and the accumulators are reset to f0.
   *
   * NOTE(review): "FMA d = a, b, c" is presumed to compute a * b + c; the
   * macro is defined outside this chunk -- confirm.
   */
  3891. .L042:
  3892. { .mfb
  3893. lfetch.nt1 [PREB], 16 * SIZE
  3894. FMA f64 = f32, f48, f64 // A1 * B1
  3895. nop __LINE__
  3896. }
  3897. { .mfb
  3898. (p12) cmp.ne p3, p0 = 0, L
  3899. FMA f72 = f32, f49, f72 // A1 * B2
  3900. nop __LINE__
  3901. }
  3902. ;;
  3903. { .mfi
  3904. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  3905. FMA f80 = f32, f50, f80 // A1 * B3
  3906. cmp.ne p4, p5 = 0, L
  3907. }
  3908. { .mfb
  3909. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3910. FMA f88 = f32, f51, f88 // A1 * B4
  3911. nop __LINE__
  3912. }
  3913. ;;
  3914. { .mfb
  3915. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3916. FMA f96 = f32, f52, f96 // A1 * B5
  3917. nop __LINE__
  3918. }
  3919. { .mfb
  3920. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3921. (p5) LDFD f68 = [C1]
  3922. #else
  3923. nop __LINE__
  3924. #endif
  3925. FMA f104 = f32, f53, f104 // A1 * B6
  3926. nop __LINE__
  3927. }
  3928. ;;
  3929. { .mfb
  3930. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3931. FMA f112 = f32, f54, f112 // A1 * B7
  3932. nop __LINE__
  3933. }
  3934. { .mfb
  3935. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3936. (p5) LDFD f76 = [C2]
  3937. #else
  3938. nop __LINE__
  3939. #endif
  3940. FMA f120 = f32, f55, f120 // A1 * B8
  3941. nop __LINE__
  3942. }
  3943. ;;
  /* Second half: FMAs under p3, next-pass reloads under p4. */
  3944. { .mfb
  3945. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  3946. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3947. nop __LINE__
  3948. }
  3949. { .mfb
  3950. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3951. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3952. nop __LINE__
  3953. }
  3954. ;;
  3955. { .mfb
  3956. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3957. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3958. nop __LINE__
  3959. }
  3960. { .mfb
  3961. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3962. (p5) LDFD f84 = [C3]
  3963. #else
  3964. nop __LINE__
  3965. #endif
  3966. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3967. nop __LINE__
  3968. }
  3969. ;;
  3970. { .mfb
  3971. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3972. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3973. nop __LINE__
  3974. }
  3975. { .mfb
  3976. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3977. (p5) LDFD f92 = [C4]
  3978. #else
  3979. nop __LINE__
  3980. #endif
  3981. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  3982. nop __LINE__
  3983. }
  3984. ;;
  /* L counts down once per pass. */
  3985. { .mfi
  3986. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3987. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  3988. adds L = -1, L
  3989. }
  3990. { .mmb
  3991. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3992. (p5) LDFD f100 = [C5]
  3993. (p5) LDFD f108 = [C6]
  3994. #else
  3995. nop __LINE__
  3996. nop __LINE__
  3997. #endif
  3998. nop __LINE__
  3999. }
  4000. ;;
  4001. { .mfb
  4002. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  4003. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  4004. nop __LINE__
  4005. }
  4006. { .mmb
  4007. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4008. (p5) LDFD f116 = [C7]
  4009. (p5) LDFD f124 = [C8]
  4010. #else
  4011. nop __LINE__
  4012. nop __LINE__
  4013. #endif
  4014. br.cloop.sptk.few .L042
  4015. }
  4016. ;;
  /* Write-back. In the beta path the instructions are written without */
  /* explicit bundle braces; only the stops (;;) are given. */
  4017. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4018. FMA f64 = ALPHA, f64, f68
  4019. FMA f72 = ALPHA, f72, f76
  4020. FMA f80 = ALPHA, f80, f84
  4021. FMA f88 = ALPHA, f88, f92
  4022. FMA f96 = ALPHA, f96, f100
  4023. FMA f104 = ALPHA, f104, f108
  4024. FMA f112 = ALPHA, f112, f116
  4025. FMA f120 = ALPHA, f120, f124
  4026. ;;
  /* Store one element per pointer and reset its accumulator from f0. */
  4027. STFD [C1 ] = f64, SIZE
  4028. mov f64 = f0
  4029. STFD [C2 ] = f72, SIZE
  4030. mov f72 = f0
  4031. ;;
  4032. STFD [C3 ] = f80, SIZE
  4033. mov f80 = f0
  4034. STFD [C4 ] = f88, SIZE
  4035. mov f88 = f0
  4036. ;;
  4037. STFD [C5 ] = f96, SIZE
  4038. mov f96 = f0
  4039. STFD [C6 ] = f104, SIZE
  4040. mov f104 = f0
  4041. ;;
  4042. STFD [C7 ] = f112, SIZE
  4043. mov f112 = f0
  4044. STFD [C8 ] = f120, SIZE
  4045. mov f120 = f0
  4046. ;;
  4047. #else
  /*
   * TRMMKERNEL or BETAZERO build: scale by ALPHA only. For TRMMKERNEL
   * with (LEFT && TRANSA) or (!LEFT && !TRANSA) the integer slots compute
   *   L       = K - KK, then minus 1 (LEFT && TRANSA) or minus 8
   *             (!LEFT && !TRANSA)
   *   KK8     = L << BASE_SHIFT
   *   AOFFSET = AOFFSET + KK8
   *   BOFFSET = BOFFSET + (KK8 << 3)
   * then KK += 1 for LEFT, and KK8 = KK << BASE_SHIFT for every
   * TRMMKERNEL build.
   */
  4048. FMPY f64 = ALPHA, f64
  4049. FMPY f72 = ALPHA, f72
  4050. FMPY f80 = ALPHA, f80
  4051. FMPY f88 = ALPHA, f88
  4052. { .mfi
  4053. FMPY f96 = ALPHA, f96
  4054. #if defined(TRMMKERNEL) && \
  4055. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4056. sub L = K, KK
  4057. #else
  4058. nop __LINE__
  4059. #endif
  4060. }
  4061. { .mfi
  4062. nop __LINE__
  4063. FMPY f104 = ALPHA, f104
  4064. nop __LINE__
  4065. }
  4066. ;;
  4067. { .mfi
  4068. nop __LINE__
  4069. FMPY f112 = ALPHA, f112
  4070. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  4071. adds L = -1, L
  4072. #else
  4073. nop __LINE__
  4074. #endif
  4075. }
  4076. { .mfi
  4077. nop __LINE__
  4078. FMPY f120 = ALPHA, f120
  4079. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  4080. adds L = -8, L
  4081. #else
  4082. nop __LINE__
  4083. #endif
  4084. }
  4085. ;;
  4086. { .mfi
  4087. STFD [C1 ] = f64, SIZE
  4088. mov f64 = f0
  4089. #if defined(TRMMKERNEL) && \
  4090. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4091. shladd KK8 = L, BASE_SHIFT, r0
  4092. #else
  4093. nop __LINE__
  4094. #endif
  4095. }
  4096. { .mfi
  4097. STFD [C2 ] = f72, SIZE
  4098. mov f72 = f0
  4099. nop __LINE__
  4100. }
  4101. ;;
  4102. { .mfi
  4103. STFD [C3 ] = f80, SIZE
  4104. mov f80 = f0
  4105. #if defined(TRMMKERNEL) && \
  4106. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4107. add AOFFSET = KK8, AOFFSET
  4108. #else
  4109. nop __LINE__
  4110. #endif
  4111. }
  4112. { .mfi
  4113. STFD [C4 ] = f88, SIZE
  4114. mov f88 = f0
  4115. #if defined(TRMMKERNEL) && \
  4116. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4117. shladd BOFFSET = KK8, 3, BOFFSET
  4118. #else
  4119. nop __LINE__
  4120. #endif
  4121. }
  4122. ;;
  4123. { .mfi
  4124. STFD [C5 ] = f96, SIZE
  4125. mov f96 = f0
  4126. #if defined(TRMMKERNEL) && defined(LEFT)
  4127. adds KK = 1, KK
  4128. #else
  4129. nop __LINE__
  4130. #endif
  4131. }
  4132. { .mfi
  4133. STFD [C6 ] = f104, SIZE
  4134. mov f104 = f0
  4135. nop __LINE__
  4136. }
  4137. ;;
  4138. { .mfi
  4139. STFD [C7 ] = f112, SIZE
  4140. mov f112 = f0
  4141. #ifdef TRMMKERNEL
  4142. shladd KK8 = KK, BASE_SHIFT, r0
  4143. #else
  4144. nop __LINE__
  4145. #endif
  4146. }
  4147. { .mfi
  4148. STFD [C8 ] = f120, SIZE
  4149. mov f120 = f0
  4150. nop __LINE__
  4151. }
  4152. ;;
  4153. #endif
  4154. .align 32
  /*
   * .L049: end of one 8-column panel.
   *   B       = BOFFSET   (continue B from where this panel stopped reading)
   *   AOFFSET = A         (rewind A)
   *   KK     += 8         (TRMMKERNEL && !LEFT only)
   * Then p6 = (0 < J) and, if set, control returns to .L010 (outside this
   * chunk) for the next panel. BB = K >> 3 is computed in the same bundle.
   * NOTE(review): J is presumably the remaining panel count maintained at
   * .L010 -- confirm there.
   */
  4155. .L049:
  4156. { .mmi
  4157. mov B = BOFFSET
  4158. mov AOFFSET = A
  4159. #if defined(TRMMKERNEL) && !defined(LEFT)
  4160. adds KK = 8, KK
  4161. #else
  4162. nop __LINE__
  4163. #endif
  4164. }
  4165. ;;
  4166. { .mib
  4167. cmp.lt p6, p0 = 0, J
  4168. shr BB = K, 3
  4169. (p6) br.cond.dptk .L010
  4170. }
  4171. ;;
  4172. .align 32
  4173. .L050:
  // .L050: entry of the section executed when bit 2 of N is set.
  //   - tbit.z on (N, 2) sets p6 when that bit is clear; p6 then skips
  //     the whole section by branching to .L090.
  //   - C1 = C, C2 = C + LDC, C3 = C + 2*LDC, C4 = C2 + 2*LDC, and C itself
  //     is advanced by 4*LDC.
  //   - I = M >> 3; when I == 0 the code jumps straight to .L060.
  //   - f64/f72/f80/f88 and f65/f73/f81/f89 are cleared by copying f0.
  //   - TRMM builds: LEFT reloads KK from OFFSET; KK8 = KK << BASE_SHIFT.
  4174. { .mfi
  4175. mov C1 = C
  4176. mov f64 = f0
  4177. tbit.z p6, p0 = N, 2
  4178. }
  4179. { .mfi
  4180. add C2 = LDC, C
  4181. mov f72 = f0
  4182. shr I = M, 3
  4183. }
  4184. ;;
  4185. { .mfi
  4186. shladd C3 = LDC, 1, C
  4187. mov f80 = f0
  4188. nop __LINE__
  4189. }
  4190. { .mfb
  4191. mov AOFFSET = A
  4192. mov f88 = f0
  4193. (p6) br.cond.dpnt .L090
  4194. }
  4195. ;;
  4196. { .mfi
  // p6 is reused from here on: p6 = (I == 0), p7 = (I != 0).
  4197. cmp.eq p6, p7 = 0, I
  4198. mov f65 = f0
  4199. #if defined(TRMMKERNEL) && defined(LEFT)
  4200. mov KK = OFFSET
  4201. #else
  4202. nop __LINE__
  4203. #endif
  4204. }
  4205. { .mfi
  4206. shladd C4 = LDC, 1, C2
  4207. mov f73 = f0
  4208. nop __LINE__
  4209. }
  4210. ;;
  4211. { .mfi
  4212. nop __LINE__
  4213. mov f81 = f0
  4214. #ifdef TRMMKERNEL
  4215. shladd KK8 = KK, BASE_SHIFT, r0
  4216. #else
  4217. nop __LINE__
  4218. #endif
  4219. }
  4220. { .mfb
  // C += 4 * LDC (shladd computes (LDC << 2) + C).
  4221. shladd C = LDC, 2, C
  4222. mov f89 = f0
  4223. (p6) br.cond.dpnt .L060
  4224. }
  4225. ;;
  4226. .align 32
  4227. .L052:
  // .L052: prologue executed before every run of the .L053 loop.
  //   - Loads the first B pair (f48,f49 then f50,f51) and the first eight
  //     A values (f32..f39) with post-incrementing pair loads.
  //   - Clears the accumulators in f66..f95 that were not already cleared by
  //     the code that jumps here, using either "mov f = f0" or
  //     "setf.d f = r0".
  //   - Builds the loop counter: L = (count + 1), p12 = (bit 0 of L is 0),
  //     L = (L >> 1) - 1, ar.lc = L. count is K for the plain GEMM build and
  //     the TRMM-specific value computed below otherwise.
  //   - p3 starts out true (cmp.eq r0, r0).
  //   - PREA / PREB / PREC are set up as prefetch pointers.
  // NOTE(review): CPREFETCH is a macro defined outside this section;
  // presumably it expands to an lfetch variant -- confirm.
  4228. #if !defined(TRMMKERNEL) || \
  4229. defined(TRMMKERNEL) && \
  4230. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4231. { .mfb
  4232. LDFPD f48, f49 = [B]
  4233. mov f66 = f0
  4234. nop __LINE__
  4235. }
  4236. { .mfb
  4237. adds BOFFSET = 2 * SIZE, B
  4238. mov f74 = f0
  4239. nop __LINE__
  4240. }
  4241. ;;
  4242. #else
  // Remaining TRMM variants: start KK8 units into the packed data,
  // scaled by 4 for B and by 8 for A.
  4243. { .mfi
  4244. shladd BOFFSET = KK8, 2, B
  4245. mov f66 = f0
  4246. shladd AOFFSET = KK8, 3, AOFFSET
  4247. }
  4248. ;;
  4249. { .mfi
  4250. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4251. mov f74 = f0
  4252. nop __LINE__
  4253. }
  4254. ;;
  4255. #endif
  4256. ;;
  4257. { .mfi
  4258. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4259. mov f82 = f0
  // TRMM only: L = K - KK, or KK + 8 (LEFT) / KK + 4 (otherwise).
  4260. #ifndef TRMMKERNEL
  4261. nop __LINE__
  4262. #else
  4263. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4264. sub L = K, KK
  4265. #elif defined(LEFT)
  4266. adds L = 8, KK
  4267. #else
  4268. adds L = 4, KK
  4269. #endif
  4270. #endif
  4271. }
  4272. { .mfi
  4273. setf.d f84 = r0
  4274. mov f90 = f0
  4275. nop __LINE__
  4276. }
  4277. ;;
  4278. { .mfi
  4279. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4280. mov f67 = f0
  4281. adds PREC = CPREFETCHSIZE * SIZE, C1
  4282. }
  4283. { .mfi
  4284. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4285. mov f75 = f0
  // L = count + 1, so that the shift below rounds count / 2 upwards.
  4286. #ifndef TRMMKERNEL
  4287. adds L = 1, K
  4288. #else
  4289. adds L = 1, L
  4290. #endif
  4291. }
  4292. ;;
  4293. { .mfi
  4294. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4295. mov f83 = f0
  // p12 = (bit 0 of count + 1 is clear), i.e. count is odd.
  4296. tbit.z p12, p0 = L, 0
  4297. }
  4298. { .mfi
  4299. setf.d f91 = r0
  4300. mov f68 = f0
  4301. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  4302. }
  4303. ;;
  4304. { .mfi
  4305. CPREFETCH [PREC], LDC
  4306. mov f76 = f0
  4307. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  4308. }
  4309. { .mfi
  4310. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4311. mov f92 = f0
  // r0 == r0 always holds, so p3 is unconditionally set here.
  4312. cmp.eq p3, p0 = r0, r0
  4313. }
  4314. ;;
  4315. { .mfi
  4316. CPREFETCH [PREC], LDC
  4317. mov f69 = f0
  4318. shr L = L, 1
  4319. }
  4320. { .mmf
  4321. setf.d f77 = r0
  4322. setf.d f85 = r0
  4323. mov f93 = f0
  4324. }
  4325. ;;
  4326. { .mfi
  4327. CPREFETCH [PREC], LDC
  4328. mov f70 = f0
  4329. adds L = -1, L
  4330. }
  4331. { .mmf
  4332. setf.d f78 = r0
  4333. setf.d f86 = r0
  4334. mov f94 = f0
  4335. }
  4336. ;;
  4337. { .mfi
  4338. CPREFETCH [PREC]
  4339. mov f71 = f0
  // ar.lc feeds br.cloop at the bottom of .L053; L keeps a software copy.
  4340. mov ar.lc = L
  4341. }
  4342. { .mmf
  4343. setf.d f79 = r0
  4344. setf.d f87 = r0
  4345. mov f95 = f0
  4346. }
  4347. ;;
  4348. .align 32
  4349. .L053:
  // .L053: counted inner loop (br.cloop on ar.lc) for the 8x4 accumulator
  // set f64..f95.
  //   First half  (unconditional): acc += {f32..f39} * {f48..f51}.
  //   Second half (predicated p3): acc += {f40..f47} * {f56..f59}.
  // Predicates, all derived from the software counter L:
  //   p3 : second half is valid; cleared only when p12 is set and L == 0.
  //   p4 : L != 0 -> reload f32..f39 / f48..f51 for the next pass.
  //   p5 : L == 0 -> final pass; in builds that are neither TRMMKERNEL nor
  //        BETAZERO the existing C values are loaded into f96..f119 here so
  //        that .L058 can add them.
  // C9..C12 are set to C1..C4 + 4*SIZE on every pass.
  // The C loads walk each pointer by SIZE, SIZE, SIZE, -3*SIZE, so C1..C3
  // and C9..C11 end up unchanged.
  4350. { .mfb
  4351. lfetch.nt1 [PREA], 16 * SIZE
  4352. FMA f64 = f32, f48, f64 // A1 * B1
  4353. nop __LINE__
  4354. }
  4355. { .mfi
  4356. nop __LINE__
  4357. FMA f72 = f32, f49, f72 // A1 * B2
  4358. (p12) cmp.ne p3, p0 = 0, L
  4359. }
  4360. ;;
  4361. { .mfi
  4362. lfetch.nt1 [PREB], 8 * SIZE
  4363. FMA f80 = f32, f50, f80 // A1 * B3
  4364. cmp.ne p4, p5 = 0, L
  4365. }
  4366. { .mfi
  4367. nop __LINE__
  4368. FMA f88 = f32, f51, f88 // A1 * B4
  4369. adds C9 = 4 * SIZE, C1
  4370. }
  4371. ;;
  4372. { .mfi
  4373. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  4374. FMA f65 = f33, f48, f65 // A2 * B1
  4375. adds C10 = 4 * SIZE, C2
  4376. }
  4377. { .mfi
  4378. nop __LINE__
  4379. FMA f73 = f33, f49, f73 // A2 * B2
  4380. adds C11 = 4 * SIZE, C3
  4381. }
  4382. ;;
  4383. { .mfi
  4384. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  4385. FMA f81 = f33, f50, f81 // A2 * B3
  4386. adds C12 = 4 * SIZE, C4
  4387. }
  4388. { .mfb
  4389. nop __LINE__
  4390. FMA f89 = f33, f51, f89 // A2 * B4
  4391. nop __LINE__
  4392. }
  4393. ;;
  4394. { .mfb
  4395. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4396. FMA f66 = f34, f48, f66 // A3 * B1
  4397. nop __LINE__
  4398. }
  4399. { .mfb
  4400. nop __LINE__
  4401. FMA f74 = f34, f49, f74 // A3 * B2
  4402. nop __LINE__
  4403. }
  4404. ;;
  4405. { .mfb
  4406. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  4407. FMA f82 = f34, f50, f82 // A3 * B3
  4408. nop __LINE__
  4409. }
  4410. { .mfb
  4411. nop __LINE__
  4412. FMA f90 = f34, f51, f90 // A3 * B4
  4413. nop __LINE__
  4414. }
  4415. ;;
  4416. { .mfb
  4417. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  4418. FMA f67 = f35, f48, f67 // A4 * B1
  4419. nop __LINE__
  4420. }
  4421. { .mfb
  4422. nop __LINE__
  4423. FMA f75 = f35, f49, f75 // A4 * B2
  4424. nop __LINE__
  4425. }
  4426. ;;
  4427. { .mfb
  4428. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  4429. FMA f83 = f35, f50, f83 // A4 * B3
  4430. nop __LINE__
  4431. }
  4432. { .mfb
  4433. nop __LINE__
  4434. FMA f91 = f35, f51, f91 // A4 * B4
  4435. nop __LINE__
  4436. }
  4437. ;;
  4438. { .mfb
  4439. nop __LINE__
  4440. FMA f68 = f36, f48, f68 // A5 * B1
  4441. nop __LINE__
  4442. }
  4443. { .mfb
  4444. nop __LINE__
  4445. FMA f76 = f36, f49, f76 // A5 * B2
  4446. nop __LINE__
  4447. }
  4448. ;;
  4449. { .mfb
  4450. nop __LINE__
  4451. FMA f84 = f36, f50, f84 // A5 * B3
  4452. nop __LINE__
  4453. }
  4454. { .mfb
  4455. nop __LINE__
  4456. FMA f92 = f36, f51, f92 // A5 * B4
  4457. nop __LINE__
  4458. }
  4459. ;;
  4460. { .mfb
  4461. nop __LINE__
  4462. FMA f69 = f37, f48, f69 // A6 * B1
  4463. nop __LINE__
  4464. }
  4465. { .mfb
  4466. nop __LINE__
  4467. FMA f77 = f37, f49, f77 // A6 * B2
  4468. nop __LINE__
  4469. }
  4470. ;;
  4471. { .mfb
  4472. nop __LINE__
  4473. FMA f85 = f37, f50, f85 // A6 * B3
  4474. nop __LINE__
  4475. }
  4476. { .mfb
  4477. nop __LINE__
  4478. FMA f93 = f37, f51, f93 // A6 * B4
  4479. nop __LINE__
  4480. }
  4481. ;;
  4482. { .mfb
  4483. nop __LINE__
  4484. FMA f70 = f38, f48, f70 // A7 * B1
  4485. nop __LINE__
  4486. }
  4487. { .mfb
  4488. nop __LINE__
  4489. FMA f78 = f38, f49, f78 // A7 * B2
  4490. nop __LINE__
  4491. }
  4492. ;;
  4493. { .mfb
  4494. nop __LINE__
  4495. FMA f86 = f38, f50, f86 // A7 * B3
  4496. nop __LINE__
  4497. }
  4498. { .mfb
  4499. nop __LINE__
  4500. FMA f94 = f38, f51, f94 // A7 * B4
  4501. nop __LINE__
  4502. }
  4503. ;;
  // From here the p4-predicated loads refill f32..f39 and f48..f51 for the
  // next pass while the last first-half products are still being issued.
  4504. { .mfb
  4505. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4506. FMA f71 = f39, f48, f71 // A8 * B1
  4507. nop __LINE__
  4508. }
  4509. { .mfb
  4510. nop __LINE__
  4511. FMA f79 = f39, f49, f79 // A8 * B2
  4512. nop __LINE__
  4513. }
  4514. ;;
  4515. { .mfb
  4516. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4517. FMA f87 = f39, f50, f87 // A8 * B3
  4518. nop __LINE__
  4519. }
  4520. { .mfb
  4521. nop __LINE__
  4522. FMA f95 = f39, f51, f95 // A8 * B4
  4523. nop __LINE__
  4524. }
  4525. ;;
  // Second half: same 8x4 products using f40..f47 and f56..f59, all under p3.
  4526. { .mfb
  4527. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4528. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4529. nop __LINE__
  4530. }
  4531. { .mfb
  4532. nop __LINE__
  4533. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  4534. nop __LINE__
  4535. }
  4536. ;;
  4537. { .mfb
  4538. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4539. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4540. nop __LINE__
  4541. }
  4542. { .mfb
  4543. nop __LINE__
  4544. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  4545. nop __LINE__
  4546. }
  4547. ;;
  4548. { .mfb
  4549. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4550. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4551. nop __LINE__
  4552. }
  4553. { .mfb
  4554. nop __LINE__
  4555. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  4556. nop __LINE__
  4557. }
  4558. ;;
  4559. { .mfb
  4560. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4561. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4562. nop __LINE__
  4563. }
  4564. { .mfb
  4565. nop __LINE__
  4566. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  4567. nop __LINE__
  4568. }
  4569. ;;
  // Final pass only (p5): fetch the existing C values for .L058.
  4570. { .mfb
  4571. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4572. (p5) LDFD f96 = [C1 ], SIZE
  4573. #else
  4574. nop __LINE__
  4575. #endif
  4576. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  4577. nop __LINE__
  4578. }
  4579. { .mfb
  4580. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4581. (p5) LDFD f97 = [C9 ], SIZE
  4582. #else
  4583. nop __LINE__
  4584. #endif
  4585. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  4586. nop __LINE__
  4587. }
  4588. ;;
  4589. { .mfb
  4590. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4591. (p5) LDFD f98 = [C1 ], SIZE
  4592. #else
  4593. nop __LINE__
  4594. #endif
  4595. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  4596. nop __LINE__
  4597. }
  4598. { .mfb
  4599. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4600. (p5) LDFD f99 = [C9 ], SIZE
  4601. #else
  4602. nop __LINE__
  4603. #endif
  4604. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  4605. nop __LINE__
  4606. }
  4607. ;;
  4608. { .mfb
  4609. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4610. (p5) LDFD f100 = [C1 ], SIZE
  4611. #else
  4612. nop __LINE__
  4613. #endif
  4614. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  4615. nop __LINE__
  4616. }
  4617. { .mfb
  4618. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4619. (p5) LDFD f101 = [C9 ], SIZE
  4620. #else
  4621. nop __LINE__
  4622. #endif
  4623. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  4624. nop __LINE__
  4625. }
  4626. ;;
  4627. { .mfb
  4628. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4629. (p5) LDFD f102 = [C1 ], -3 * SIZE
  4630. #else
  4631. nop __LINE__
  4632. #endif
  4633. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  4634. nop __LINE__
  4635. }
  4636. { .mfb
  4637. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4638. (p5) LDFD f103 = [C9 ], -3 * SIZE
  4639. #else
  4640. nop __LINE__
  4641. #endif
  4642. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  4643. nop __LINE__
  4644. }
  4645. ;;
  4646. { .mfb
  4647. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4648. (p5) LDFD f104 = [C2 ], SIZE
  4649. #else
  4650. nop __LINE__
  4651. #endif
  4652. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  4653. nop __LINE__
  4654. }
  4655. { .mfb
  4656. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4657. (p5) LDFD f105 = [C10], SIZE
  4658. #else
  4659. nop __LINE__
  4660. #endif
  4661. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  4662. nop __LINE__
  4663. }
  4664. ;;
  4665. { .mfb
  4666. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4667. (p5) LDFD f106 = [C2 ], SIZE
  4668. #else
  4669. nop __LINE__
  4670. #endif
  4671. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  4672. nop __LINE__
  4673. }
  4674. { .mfb
  4675. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4676. (p5) LDFD f107 = [C10], SIZE
  4677. #else
  4678. nop __LINE__
  4679. #endif
  4680. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  4681. nop __LINE__
  4682. }
  4683. ;;
  4684. { .mfb
  4685. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4686. (p5) LDFD f108 = [C2 ], SIZE
  4687. #else
  4688. nop __LINE__
  4689. #endif
  4690. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  4691. nop __LINE__
  4692. }
  4693. { .mfb
  4694. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4695. (p5) LDFD f109 = [C10], SIZE
  4696. #else
  4697. nop __LINE__
  4698. #endif
  4699. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  4700. nop __LINE__
  4701. }
  4702. ;;
  4703. { .mfb
  4704. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4705. (p5) LDFD f110 = [C2 ], -3 * SIZE
  4706. #else
  4707. nop __LINE__
  4708. #endif
  4709. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  4710. nop __LINE__
  4711. }
  4712. { .mfb
  4713. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4714. (p5) LDFD f111 = [C10], -3 * SIZE
  4715. #else
  4716. nop __LINE__
  4717. #endif
  4718. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  4719. nop __LINE__
  4720. }
  4721. ;;
  4722. { .mfb
  4723. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4724. (p5) LDFD f112 = [C3 ], SIZE
  4725. #else
  4726. nop __LINE__
  4727. #endif
  4728. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  4729. nop __LINE__
  4730. }
  4731. { .mfb
  4732. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4733. (p5) LDFD f113 = [C11], SIZE
  4734. #else
  4735. nop __LINE__
  4736. #endif
  4737. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  4738. nop __LINE__
  4739. }
  4740. ;;
  4741. { .mfb
  4742. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4743. (p5) LDFD f114 = [C3 ], SIZE
  4744. #else
  4745. nop __LINE__
  4746. #endif
  4747. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  4748. nop __LINE__
  4749. }
  4750. { .mfb
  4751. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4752. (p5) LDFD f115 = [C11], SIZE
  4753. #else
  4754. nop __LINE__
  4755. #endif
  4756. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  4757. nop __LINE__
  4758. }
  4759. ;;
  4760. { .mfb
  4761. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4762. (p5) LDFD f116 = [C3 ], SIZE
  4763. #else
  4764. nop __LINE__
  4765. #endif
  4766. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  4767. nop __LINE__
  4768. }
  4769. { .mfb
  4770. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4771. (p5) LDFD f117 = [C11], SIZE
  4772. #else
  4773. nop __LINE__
  4774. #endif
  4775. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  4776. nop __LINE__
  4777. }
  4778. ;;
  4779. { .mfi
  4780. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4781. (p5) LDFD f118 = [C3 ], -3 * SIZE
  4782. #else
  4783. nop __LINE__
  4784. #endif
  4785. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  // Software copy of the loop counter; ar.lc itself is stepped by br.cloop.
  4786. adds L = -1, L
  4787. }
  4788. { .mfb
  4789. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4790. (p5) LDFD f119 = [C11], -3 * SIZE
  4791. #else
  4792. nop __LINE__
  4793. #endif
  4794. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  4795. br.cloop.sptk.few .L053
  4796. }
  4797. ;;
  4798. .align 32
  4799. .L058:
  // .L058: write-back of the 8x4 accumulator set f64..f95 after .L053.
  // Two mutually exclusive variants are selected by the preprocessor:
  //   (1) neither TRMMKERNEL nor BETAZERO: acc = ALPHA * acc + C, using the
  //       C values .L053 left in f96..f119 plus the C4/C12 values loaded
  //       here into f120..f127.
  //   (2) otherwise: acc = ALPHA * acc (FMPY), C is not read.
  // Both variants store f64..f67 / f68..f71 through C1 / C9 and likewise for
  // C2/C10, C3/C11, C4/C12. The store post-increments SIZE, SIZE, SIZE,
  // 5*SIZE add up to 8*SIZE per pointer.
  // Both variants clear f64/f72/f80/f88 and f65/f73/f81/f89 again, decrement
  // I, and branch back to .L052 when I was not 1 (p6).
  4800. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4801. { .mfi
  4802. LDFD f120 = [C4 ], SIZE
  4803. FMA f64 = ALPHA, f64, f96
  // p6 = (I != 1), evaluated before I is decremented below.
  4804. cmp.ne p6, p0 = 1, I
  4805. }
  4806. { .mfb
  4807. LDFD f121 = [C12], SIZE
  4808. FMA f68 = ALPHA, f68, f97
  4809. nop __LINE__
  4810. }
  4811. ;;
  4812. { .mfi
  4813. LDFD f122 = [C4 ], SIZE
  4814. FMA f65 = ALPHA, f65, f98
  4815. adds I = -1, I
  4816. }
  4817. { .mfb
  4818. LDFD f123 = [C12], SIZE
  4819. FMA f69 = ALPHA, f69, f99
  4820. nop __LINE__
  4821. }
  4822. ;;
  4823. { .mfb
  4824. LDFD f124 = [C4 ], SIZE
  4825. FMA f66 = ALPHA, f66, f100
  4826. nop __LINE__
  4827. }
  4828. { .mfb
  4829. LDFD f125 = [C12], SIZE
  4830. FMA f70 = ALPHA, f70, f101
  4831. nop __LINE__
  4832. }
  4833. ;;
  4834. { .mfb
  4835. LDFD f126 = [C4 ], -3 * SIZE
  4836. FMA f67 = ALPHA, f67, f102
  4837. nop __LINE__
  4838. }
  4839. { .mfb
  4840. LDFD f127 = [C12], -3 * SIZE
  4841. FMA f71 = ALPHA, f71, f103
  4842. nop __LINE__
  4843. }
  4844. ;;
  4845. { .mfb
  4846. STFD [C1 ] = f64, SIZE
  4847. FMA f72 = ALPHA, f72, f104
  4848. nop __LINE__
  4849. }
  4850. { .mfb
  4851. STFD [C9 ] = f68, SIZE
  4852. FMA f76 = ALPHA, f76, f105
  4853. nop __LINE__
  4854. }
  4855. ;;
  4856. { .mfb
  4857. STFD [C1 ] = f65, SIZE
  4858. FMA f73 = ALPHA, f73, f106
  4859. nop __LINE__
  4860. }
  4861. { .mfb
  4862. STFD [C9 ] = f69, SIZE
  4863. FMA f77 = ALPHA, f77, f107
  4864. nop __LINE__
  4865. }
  4866. ;;
  4867. { .mfb
  4868. STFD [C1 ] = f66, SIZE
  4869. FMA f74 = ALPHA, f74, f108
  4870. nop __LINE__
  4871. }
  4872. { .mfb
  4873. STFD [C9 ] = f70, SIZE
  4874. FMA f78 = ALPHA, f78, f109
  4875. nop __LINE__
  4876. }
  4877. ;;
  4878. { .mfb
  4879. STFD [C1 ] = f67, 5 * SIZE
  4880. FMA f75 = ALPHA, f75, f110
  4881. nop __LINE__
  4882. }
  4883. { .mfb
  4884. STFD [C9 ] = f71, 5 * SIZE
  4885. FMA f79 = ALPHA, f79, f111
  4886. nop __LINE__
  4887. }
  4888. ;;
  4889. { .mfb
  4890. STFD [C2 ] = f72, SIZE
  4891. FMA f80 = ALPHA, f80, f112
  4892. nop __LINE__
  4893. }
  4894. { .mfb
  4895. STFD [C10] = f76, SIZE
  4896. FMA f84 = ALPHA, f84, f113
  4897. nop __LINE__
  4898. }
  4899. ;;
  4900. { .mfb
  4901. STFD [C2 ] = f73, SIZE
  4902. FMA f81 = ALPHA, f81, f114
  4903. nop __LINE__
  4904. }
  4905. { .mfb
  4906. STFD [C10] = f77, SIZE
  4907. FMA f85 = ALPHA, f85, f115
  4908. nop __LINE__
  4909. }
  4910. ;;
  4911. { .mfb
  4912. STFD [C2 ] = f74, SIZE
  4913. FMA f82 = ALPHA, f82, f116
  4914. nop __LINE__
  4915. }
  4916. { .mfb
  4917. STFD [C10] = f78, SIZE
  4918. FMA f86 = ALPHA, f86, f117
  4919. nop __LINE__
  4920. }
  4921. ;;
  4922. { .mfb
  4923. STFD [C2 ] = f75, 5 * SIZE
  4924. FMA f83 = ALPHA, f83, f118
  4925. nop __LINE__
  4926. }
  4927. { .mfb
  4928. STFD [C10] = f79, 5 * SIZE
  4929. FMA f87 = ALPHA, f87, f119
  4930. nop __LINE__
  4931. }
  4932. ;;
  4933. { .mfb
  4934. STFD [C3 ] = f80, SIZE
  4935. FMA f88 = ALPHA, f88, f120
  4936. nop __LINE__
  4937. }
  4938. { .mfb
  4939. STFD [C11] = f84, SIZE
  4940. FMA f92 = ALPHA, f92, f121
  4941. nop __LINE__
  4942. }
  4943. ;;
  4944. { .mfb
  4945. STFD [C3 ] = f81, SIZE
  4946. FMA f89 = ALPHA, f89, f122
  4947. nop __LINE__
  4948. }
  4949. { .mfb
  4950. STFD [C11] = f85, SIZE
  4951. FMA f93 = ALPHA, f93, f123
  4952. nop __LINE__
  4953. }
  4954. ;;
  4955. { .mfb
  4956. STFD [C3 ] = f82, SIZE
  4957. FMA f90 = ALPHA, f90, f124
  4958. nop __LINE__
  4959. }
  4960. { .mfb
  4961. STFD [C11] = f86, SIZE
  4962. FMA f94 = ALPHA, f94, f125
  4963. nop __LINE__
  4964. }
  4965. ;;
  4966. { .mfb
  4967. STFD [C3 ] = f83, 5 * SIZE
  4968. FMA f91 = ALPHA, f91, f126
  4969. nop __LINE__
  4970. }
  4971. { .mfb
  4972. STFD [C11] = f87, 5 * SIZE
  4973. FMA f95 = ALPHA, f95, f127
  4974. nop __LINE__
  4975. }
  4976. ;;
  // Last column: store while re-clearing the accumulators that .L052 does
  // not clear itself.
  4977. { .mfb
  4978. STFD [C4 ] = f88, SIZE
  4979. mov f64 = f0
  4980. nop __LINE__
  4981. }
  4982. { .mfb
  4983. STFD [C12] = f92, SIZE
  4984. mov f72 = f0
  4985. nop __LINE__
  4986. }
  4987. ;;
  4988. { .mfb
  4989. STFD [C4 ] = f89, SIZE
  4990. mov f80 = f0
  4991. nop __LINE__
  4992. }
  4993. { .mfb
  4994. STFD [C12] = f93, SIZE
  4995. mov f88 = f0
  4996. nop __LINE__
  4997. }
  4998. ;;
  4999. { .mfb
  5000. STFD [C4 ] = f90, SIZE
  5001. mov f65 = f0
  5002. nop __LINE__
  5003. }
  5004. { .mfb
  5005. STFD [C12] = f94, SIZE
  5006. mov f73 = f0
  5007. nop __LINE__
  5008. }
  5009. ;;
  5010. { .mfb
  5011. STFD [C4 ] = f91, 5 * SIZE
  5012. mov f81 = f0
  5013. nop __LINE__
  5014. }
  5015. { .mfb
  5016. STFD [C12] = f95, 5 * SIZE
  5017. mov f89 = f0
  5018. (p6) br.cond.dptk .L052
  5019. }
  5020. ;;
  5021. #else
  // Variant (2): TRMMKERNEL or BETAZERO build. Scale only, then store.
  // TRMM builds additionally update L, KK, KK8, AOFFSET and BOFFSET in the
  // integer slots of the store bundles further down.
  5022. { .mfi
  5023. nop __LINE__
  5024. FMPY f64 = ALPHA, f64
  5025. cmp.ne p6, p0 = 1, I
  5026. }
  5027. { .mfb
  5028. nop __LINE__
  5029. FMPY f68 = ALPHA, f68
  5030. nop __LINE__
  5031. }
  5032. ;;
  5033. { .mfi
  5034. nop __LINE__
  5035. FMPY f65 = ALPHA, f65
  5036. adds I = -1, I
  5037. }
  5038. { .mfb
  5039. nop __LINE__
  5040. FMPY f69 = ALPHA, f69
  5041. nop __LINE__
  5042. }
  5043. ;;
  5044. { .mfb
  5045. nop __LINE__
  5046. FMPY f66 = ALPHA, f66
  5047. nop __LINE__
  5048. }
  5049. { .mfb
  5050. nop __LINE__
  5051. FMPY f70 = ALPHA, f70
  5052. nop __LINE__
  5053. }
  5054. ;;
  5055. { .mfb
  5056. nop __LINE__
  5057. FMPY f67 = ALPHA, f67
  5058. nop __LINE__
  5059. }
  5060. { .mfb
  5061. nop __LINE__
  5062. FMPY f71 = ALPHA, f71
  5063. nop __LINE__
  5064. }
  5065. ;;
  5066. { .mfb
  5067. STFD [C1 ] = f64, SIZE
  5068. FMPY f72 = ALPHA, f72
  5069. nop __LINE__
  5070. }
  5071. { .mfb
  5072. STFD [C9 ] = f68, SIZE
  5073. FMPY f76 = ALPHA, f76
  5074. nop __LINE__
  5075. }
  5076. ;;
  5077. { .mfb
  5078. STFD [C1 ] = f65, SIZE
  5079. FMPY f73 = ALPHA, f73
  5080. nop __LINE__
  5081. }
  5082. { .mfb
  5083. STFD [C9 ] = f69, SIZE
  5084. FMPY f77 = ALPHA, f77
  5085. nop __LINE__
  5086. }
  5087. ;;
  5088. { .mfb
  5089. STFD [C1 ] = f66, SIZE
  5090. FMPY f74 = ALPHA, f74
  5091. nop __LINE__
  5092. }
  5093. { .mfb
  5094. STFD [C9 ] = f70, SIZE
  5095. FMPY f78 = ALPHA, f78
  5096. nop __LINE__
  5097. }
  5098. ;;
  5099. { .mfb
  5100. STFD [C1 ] = f67, 5 * SIZE
  5101. FMPY f75 = ALPHA, f75
  5102. nop __LINE__
  5103. }
  5104. { .mfb
  5105. STFD [C9 ] = f71, 5 * SIZE
  5106. FMPY f79 = ALPHA, f79
  5107. nop __LINE__
  5108. }
  5109. ;;
  5110. { .mfb
  5111. STFD [C2 ] = f72, SIZE
  5112. FMPY f80 = ALPHA, f80
  5113. nop __LINE__
  5114. }
  5115. { .mfb
  5116. STFD [C10] = f76, SIZE
  5117. FMPY f84 = ALPHA, f84
  5118. nop __LINE__
  5119. }
  5120. ;;
  5121. { .mfb
  5122. STFD [C2 ] = f73, SIZE
  5123. FMPY f81 = ALPHA, f81
  5124. nop __LINE__
  5125. }
  5126. { .mfb
  5127. STFD [C10] = f77, SIZE
  5128. FMPY f85 = ALPHA, f85
  5129. nop __LINE__
  5130. }
  5131. ;;
  5132. { .mfb
  5133. STFD [C2 ] = f74, SIZE
  5134. FMPY f82 = ALPHA, f82
  5135. nop __LINE__
  5136. }
  5137. { .mfb
  5138. STFD [C10] = f78, SIZE
  5139. FMPY f86 = ALPHA, f86
  5140. nop __LINE__
  5141. }
  5142. ;;
  5143. { .mfb
  5144. STFD [C2 ] = f75, 5 * SIZE
  5145. FMPY f83 = ALPHA, f83
  5146. nop __LINE__
  5147. }
  5148. { .mfb
  5149. STFD [C10] = f79, 5 * SIZE
  5150. FMPY f87 = ALPHA, f87
  5151. nop __LINE__
  5152. }
  5153. ;;
  5154. { .mfb
  5155. STFD [C3 ] = f80, SIZE
  5156. FMPY f88 = ALPHA, f88
  5157. nop __LINE__
  5158. }
  5159. { .mfb
  5160. STFD [C11] = f84, SIZE
  5161. FMPY f92 = ALPHA, f92
  5162. nop __LINE__
  5163. }
  5164. ;;
  5165. { .mfb
  5166. STFD [C3 ] = f81, SIZE
  5167. FMPY f89 = ALPHA, f89
  5168. nop __LINE__
  5169. }
  5170. { .mfb
  5171. STFD [C11] = f85, SIZE
  5172. FMPY f93 = ALPHA, f93
  5173. nop __LINE__
  5174. }
  5175. ;;
  // TRMM (LEFT&&TRANSA or !LEFT&&!TRANSA): L = K - KK, then minus 8 or 4
  // below, giving the element count by which AOFFSET/BOFFSET are advanced.
  5176. { .mfi
  5177. STFD [C3 ] = f82, SIZE
  5178. FMPY f90 = ALPHA, f90
  5179. #if defined(TRMMKERNEL) && \
  5180. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5181. sub L = K, KK
  5182. #else
  5183. nop __LINE__
  5184. #endif
  5185. }
  5186. { .mfb
  5187. STFD [C11] = f86, SIZE
  5188. FMPY f94 = ALPHA, f94
  5189. nop __LINE__
  5190. }
  5191. ;;
  5192. { .mfi
  5193. STFD [C3 ] = f83, 5 * SIZE
  5194. FMPY f91 = ALPHA, f91
  5195. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5196. adds L = -8, L
  5197. #else
  5198. nop __LINE__
  5199. #endif
  5200. }
  5201. { .mfi
  5202. STFD [C11] = f87, 5 * SIZE
  5203. FMPY f95 = ALPHA, f95
  5204. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5205. adds L = -4, L
  5206. #else
  5207. nop __LINE__
  5208. #endif
  5209. }
  5210. ;;
  5211. { .mfi
  5212. STFD [C4 ] = f88, SIZE
  5213. mov f64 = f0
  5214. #if defined(TRMMKERNEL) && \
  5215. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5216. shladd KK8 = L, BASE_SHIFT, r0
  5217. #else
  5218. nop __LINE__
  5219. #endif
  5220. }
  5221. { .mfb
  5222. STFD [C12] = f92, SIZE
  5223. mov f72 = f0
  5224. nop __LINE__
  5225. }
  5226. ;;
  // AOFFSET += KK8 * 8 and BOFFSET += KK8 * 4.
  5227. { .mfi
  5228. STFD [C4 ] = f89, SIZE
  5229. mov f80 = f0
  5230. #if defined(TRMMKERNEL) && \
  5231. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5232. shladd AOFFSET = KK8, 3, AOFFSET
  5233. #else
  5234. nop __LINE__
  5235. #endif
  5236. }
  5237. { .mfi
  5238. STFD [C12] = f93, SIZE
  5239. mov f88 = f0
  5240. #if defined(TRMMKERNEL) && \
  5241. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5242. shladd BOFFSET = KK8, 2, BOFFSET
  5243. #else
  5244. nop __LINE__
  5245. #endif
  5246. }
  5247. ;;
  5248. { .mfi
  5249. STFD [C4 ] = f90, SIZE
  5250. mov f65 = f0
  5251. #if defined(TRMMKERNEL) && defined(LEFT)
  5252. adds KK = 8, KK
  5253. #else
  5254. nop __LINE__
  5255. #endif
  5256. }
  5257. { .mfb
  5258. STFD [C12] = f94, SIZE
  5259. mov f73 = f0
  5260. nop __LINE__
  5261. }
  5262. ;;
  5263. { .mfi
  5264. STFD [C4 ] = f91, 5 * SIZE
  5265. mov f81 = f0
  // Refresh KK8 = KK << BASE_SHIFT for the next pass through .L052.
  5266. #ifdef TRMMKERNEL
  5267. shladd KK8 = KK, BASE_SHIFT, r0
  5268. #else
  5269. nop __LINE__
  5270. #endif
  5271. }
  5272. { .mfb
  5273. STFD [C12] = f95, 5 * SIZE
  5274. mov f89 = f0
  5275. (p6) br.cond.dptk .L052
  5276. }
  5277. ;;
  5278. #endif
  5279. .align 32
  5280. .L060:
  // .L060: prologue for the section executed when bit 2 of M is set.
  //   - tbit.z on (M, 2) sets p6 when that bit is clear; p6 skips to .L070.
  //   - Clears f66/f74/f82/f90 and f67/f75/f83/f91; the other accumulators
  //     used by .L062 (f64/f65/f72/f73/f80/f81/f88/f89) are cleared by the
  //     code that reaches this label.
  //   - Loads the first B values (f48..f51) and A values (f32..f35).
  //   - Loop counter, same scheme as .L052: L = count + 1, p12 = (bit 0 of L
  //     is 0), L = (L >> 1) - 1, ar.lc = L; p3 starts out true.
  5281. { .mfi
  5282. nop __LINE__
  5283. mov f66 = f0
  5284. tbit.z p6, p7 = M, 2
  5285. }
  5286. { .mfb
  // NOTE(review): the "#elif defined(LEFT)" and "#else" arms below are
  // identical (both add 4 to KK).
  5287. #ifndef TRMMKERNEL
  5288. nop __LINE__
  5289. #else
  5290. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5291. sub L = K, KK
  5292. #elif defined(LEFT)
  5293. adds L = 4, KK
  5294. #else
  5295. adds L = 4, KK
  5296. #endif
  5297. #endif
  5298. mov f74 = f0
  5299. (p6) br.cond.dptk .L070
  5300. }
  5301. ;;
  5302. #if !defined(TRMMKERNEL) || \
  5303. defined(TRMMKERNEL) && \
  5304. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5305. { .mfb
  5306. LDFPD f48, f49 = [B]
  5307. mov f82 = f0
  5308. nop __LINE__
  5309. }
  5310. { .mfi
  5311. adds BOFFSET = 2 * SIZE, B
  5312. mov f90 = f0
  5313. #ifndef TRMMKERNEL
  5314. adds L = 1, K
  5315. #else
  5316. adds L = 1, L
  5317. #endif
  5318. }
  5319. ;;
  5320. #else
  // Remaining TRMM variants: start KK8 units into the packed data, scaled by
  // 4 for both A and B.
  // NOTE(review): this branch is only compiled when TRMMKERNEL is defined,
  // so the "#ifndef TRMMKERNEL" arm inside it can never be selected.
  5321. { .mfi
  5322. shladd BOFFSET = KK8, 2, B
  5323. mov f82 = f0
  5324. shladd AOFFSET = KK8, 2, AOFFSET
  5325. }
  5326. ;;
  5327. { .mfi
  5328. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5329. mov f90 = f0
  5330. #ifndef TRMMKERNEL
  5331. adds L = 1, K
  5332. #else
  5333. adds L = 1, L
  5334. #endif
  5335. }
  5336. ;;
  5337. #endif
  5338. ;;
  5339. { .mii
  5340. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5341. tbit.z p12, p0 = L, 0
  5342. shr L = L, 1
  5343. }
  5344. ;;
  5345. { .mfi
  5346. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5347. mov f67 = f0
  5348. adds L = -1, L
  5349. }
  5350. { .mfi
  5351. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5352. mov f75 = f0
  5353. nop __LINE__
  5354. }
  5355. ;;
  5356. { .mfi
  5357. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5358. mov f83 = f0
  5359. mov ar.lc = L
  5360. }
  5361. { .mfi
  5362. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5363. mov f91 = f0
  // r0 == r0 always holds, so p3 is unconditionally set here.
  5364. cmp.eq p3, p0 = r0, r0
  5365. }
  5366. ;;
  5367. .align 32
  5368. .L062:
  // .L062: counted inner loop (br.cloop on ar.lc) for the 4x4 accumulator
  // set f64..f67, f72..f75, f80..f83, f88..f91.
  //   First half  (unconditional): acc += {f32..f35} * {f48..f51}.
  //   Second half (predicated p3): acc += {f40..f43} * {f56..f59}.
  // Predicates, all derived from the software counter L:
  //   p3 : second half is valid; cleared only when p12 is set and L == 0.
  //   p4 : L != 0 -> reload f32..f35 / f48..f51 for the next pass.
  //   p5 : L == 0 -> final pass; sets C9..C12 = C1..C4 + 2*SIZE and, in
  //        builds that are neither TRMMKERNEL nor BETAZERO, loads the
  //        existing C values into f68..f71, f76..f79, f84..f87, f92..f95.
  // The C loads step each pointer by SIZE and then -1*SIZE, leaving C1..C4
  // and C9..C12 unchanged.
  5369. { .mfi
  5370. lfetch.nt1 [PREA], 8 * SIZE
  5371. FMA f64 = f32, f48, f64 // A1 * B1
  5372. cmp.ne p4, p5 = 0, L
  5373. }
  5374. { .mfi
  5375. nop __LINE__
  5376. FMA f72 = f32, f49, f72 // A1 * B2
  5377. (p12) cmp.ne p3, p0 = 0, L
  5378. }
  5379. ;;
  5380. { .mfi
  5381. lfetch.nt1 [PREB], 8 * SIZE
  5382. FMA f80 = f32, f50, f80 // A1 * B3
  5383. (p5) adds C9 = 2 * SIZE, C1
  5384. }
  5385. { .mfi
  5386. nop __LINE__
  5387. FMA f88 = f32, f51, f88 // A1 * B4
  5388. (p5) adds C10 = 2 * SIZE, C2
  5389. }
  5390. ;;
  5391. { .mfi
  5392. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5393. FMA f65 = f33, f48, f65 // A2 * B1
  5394. (p5) adds C11 = 2 * SIZE, C3
  5395. }
  5396. { .mfi
  5397. nop __LINE__
  5398. FMA f73 = f33, f49, f73 // A2 * B2
  5399. (p5) adds C12 = 2 * SIZE, C4
  5400. }
  5401. ;;
  5402. { .mfb
  5403. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5404. FMA f81 = f33, f50, f81 // A2 * B3
  5405. nop __LINE__
  5406. }
  5407. { .mfb
  5408. nop __LINE__
  5409. FMA f89 = f33, f51, f89 // A2 * B4
  5410. nop __LINE__
  5411. }
  5412. ;;
  5413. { .mfb
  5414. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5415. FMA f66 = f34, f48, f66 // A3 * B1
  5416. nop __LINE__
  5417. }
  5418. { .mfb
  5419. nop __LINE__
  5420. FMA f74 = f34, f49, f74 // A3 * B2
  5421. nop __LINE__
  5422. }
  5423. ;;
  5424. { .mfb
  5425. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5426. FMA f82 = f34, f50, f82 // A3 * B3
  5427. nop __LINE__
  5428. }
  5429. { .mfb
  5430. nop __LINE__
  5431. FMA f90 = f34, f51, f90 // A3 * B4
  5432. nop __LINE__
  5433. }
  5434. ;;
  // NOTE(review): the next .mfb bundle lists only two instructions (no
  // explicit third-slot nop), and the four bundles up to the next ";;" have
  // no stop between them, unlike the two-bundle groups used elsewhere in
  // this loop. Confirm that this is intentional.
  5435. { .mfb
  5436. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5437. FMA f67 = f35, f48, f67 // A4 * B1
  5438. }
  5439. { .mfb
  5440. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5441. (p5) LDFD f68 = [C1 ], SIZE
  5442. #else
  5443. nop __LINE__
  5444. #endif
  5445. FMA f75 = f35, f49, f75 // A4 * B2
  5446. nop __LINE__
  5447. }
  5448. { .mfb
  5449. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5450. FMA f83 = f35, f50, f83 // A4 * B3
  5451. nop __LINE__
  5452. }
  5453. { .mfb
  5454. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5455. (p5) LDFD f70 = [C9 ], SIZE
  5456. #else
  5457. nop __LINE__
  5458. #endif
  5459. FMA f91 = f35, f51, f91 // A4 * B4
  5460. nop __LINE__
  5461. }
  5462. ;;
  // Second half: same 4x4 products using f40..f43 and f56..f59, all under p3.
  5463. { .mfb
  5464. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5465. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5466. nop __LINE__
  5467. }
  5468. { .mfb
  5469. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5470. (p5) LDFD f69 = [C1 ], -1 * SIZE
  5471. #else
  5472. nop __LINE__
  5473. #endif
  5474. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5475. nop __LINE__
  5476. }
  5477. ;;
  5478. { .mfb
  5479. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5480. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5481. nop __LINE__
  5482. }
  5483. { .mfb
  5484. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5485. (p5) LDFD f71 = [C9 ], -1 * SIZE
  5486. #else
  5487. nop __LINE__
  5488. #endif
  5489. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  5490. nop __LINE__
  5491. }
  5492. ;;
  5493. { .mfb
  5494. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5495. (p5) LDFD f76 = [C2 ], SIZE
  5496. #else
  5497. nop __LINE__
  5498. #endif
  5499. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5500. nop __LINE__
  5501. }
  5502. { .mfb
  5503. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5504. (p5) LDFD f78 = [C10], SIZE
  5505. #else
  5506. nop __LINE__
  5507. #endif
  5508. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5509. nop __LINE__
  5510. }
  5511. ;;
  5512. { .mfb
  5513. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5514. (p5) LDFD f77 = [C2 ], -1 * SIZE
  5515. #else
  5516. nop __LINE__
  5517. #endif
  5518. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  5519. nop __LINE__
  5520. }
  5521. { .mfb
  5522. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5523. (p5) LDFD f79 = [C10], -1 * SIZE
  5524. #else
  5525. nop __LINE__
  5526. #endif
  5527. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  5528. nop __LINE__
  5529. }
  5530. ;;
  5531. { .mfb
  5532. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5533. (p5) LDFD f84 = [C3 ], SIZE
  5534. #else
  5535. nop __LINE__
  5536. #endif
  5537. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  5538. nop __LINE__
  5539. }
  5540. { .mfb
  5541. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5542. (p5) LDFD f86 = [C11], SIZE
  5543. #else
  5544. nop __LINE__
  5545. #endif
  5546. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  5547. nop __LINE__
  5548. }
  5549. ;;
  5550. { .mfb
  5551. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5552. (p5) LDFD f85 = [C3 ], -1 * SIZE
  5553. #else
  5554. nop __LINE__
  5555. #endif
  5556. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  5557. nop __LINE__
  5558. }
  5559. { .mfb
  5560. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5561. (p5) LDFD f87 = [C11], -1 * SIZE
  5562. #else
  5563. nop __LINE__
  5564. #endif
  5565. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  5566. nop __LINE__
  5567. }
  5568. ;;
  5569. { .mfb
  5570. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5571. (p5) LDFD f92 = [C4 ], SIZE
  5572. #else
  5573. nop __LINE__
  5574. #endif
  5575. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  5576. nop __LINE__
  5577. }
  5578. { .mfb
  5579. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5580. (p5) LDFD f94 = [C12], SIZE
  5581. #else
  5582. nop __LINE__
  5583. #endif
  5584. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  5585. nop __LINE__
  5586. }
  5587. ;;
  5588. { .mfi
  5589. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5590. (p5) LDFD f93 = [C4 ], -1 * SIZE
  5591. #else
  5592. nop __LINE__
  5593. #endif
  5594. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  // Software copy of the loop counter; ar.lc itself is stepped by br.cloop.
  5595. adds L = -1, L
  5596. }
  5597. { .mfb
  5598. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5599. (p5) LDFD f95 = [C12], -1 * SIZE
  5600. #else
  5601. nop __LINE__
  5602. #endif
  5603. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  5604. br.cloop.sptk.few .L062
  5605. }
  5606. ;;
  5607. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5608. FMA f64 = ALPHA, f64, f68
  5609. FMA f66 = ALPHA, f66, f70
  5610. FMA f65 = ALPHA, f65, f69
  5611. FMA f67 = ALPHA, f67, f71
  5612. FMA f72 = ALPHA, f72, f76
  5613. FMA f74 = ALPHA, f74, f78
  5614. FMA f73 = ALPHA, f73, f77
  5615. FMA f75 = ALPHA, f75, f79
  5616. ;;
  5617. { .mfb
  5618. STFD [C1 ] = f64, SIZE
  5619. FMA f80 = ALPHA, f80, f84
  5620. nop __LINE__
  5621. }
  5622. { .mfb
  5623. STFD [C9 ] = f66, SIZE
  5624. FMA f82 = ALPHA, f82, f86
  5625. nop __LINE__
  5626. }
  5627. ;;
  5628. { .mfb
  5629. STFD [C1 ] = f65, 3 * SIZE
  5630. FMA f81 = ALPHA, f81, f85
  5631. nop __LINE__
  5632. }
  5633. { .mfb
  5634. STFD [C9 ] = f67, 3 * SIZE
  5635. FMA f83 = ALPHA, f83, f87
  5636. nop __LINE__
  5637. }
  5638. ;;
  5639. { .mfb
  5640. STFD [C2 ] = f72, SIZE
  5641. FMA f88 = ALPHA, f88, f92
  5642. nop __LINE__
  5643. }
  5644. { .mfb
  5645. STFD [C10] = f74, SIZE
  5646. FMA f90 = ALPHA, f90, f94
  5647. nop __LINE__
  5648. }
  5649. ;;
  5650. { .mfb
  5651. STFD [C2 ] = f73, 3 * SIZE
  5652. FMA f89 = ALPHA, f89, f93
  5653. nop __LINE__
  5654. }
  5655. { .mfb
  5656. STFD [C10] = f75, 3 * SIZE
  5657. FMA f91 = ALPHA, f91, f95
  5658. nop __LINE__
  5659. }
  5660. ;;
  5661. { .mfb
  5662. STFD [C3 ] = f80, SIZE
  5663. mov f80 = f0
  5664. nop __LINE__
  5665. }
  5666. { .mfb
  5667. STFD [C11] = f82, SIZE
  5668. mov f64 = f0
  5669. nop __LINE__
  5670. }
  5671. ;;
  5672. { .mfb
  5673. STFD [C3 ] = f81, 3 * SIZE
  5674. mov f81 = f0
  5675. nop __LINE__
  5676. }
  5677. { .mfb
  5678. STFD [C11] = f83, 3 * SIZE
  5679. mov f72 = f0
  5680. nop __LINE__
  5681. }
  5682. ;;
  5683. { .mfi
  5684. STFD [C4 ] = f88, SIZE
  5685. mov f88 = f0
  5686. adds L = 1, K
  5687. }
  5688. { .mfb
  5689. STFD [C12] = f90, SIZE
  5690. mov f65 = f0
  5691. nop __LINE__
  5692. }
  5693. ;;
  5694. { .mfi
  5695. STFD [C4 ] = f89, 3 * SIZE
  5696. mov f89 = f0
  5697. shr L = L, 1
  5698. }
  5699. { .mfb
  5700. STFD [C12] = f91, 3 * SIZE
  5701. mov f73 = f0
  5702. nop __LINE__
  5703. }
  5704. ;;
  5705. #else
  5706. FMPY f64 = ALPHA, f64
  5707. FMPY f66 = ALPHA, f66
  5708. FMPY f65 = ALPHA, f65
  5709. FMPY f67 = ALPHA, f67
  5710. FMPY f72 = ALPHA, f72
  5711. FMPY f74 = ALPHA, f74
  5712. FMPY f73 = ALPHA, f73
  5713. FMPY f75 = ALPHA, f75
  5714. ;;
  5715. { .mfb
  5716. STFD [C1 ] = f64, SIZE
  5717. FMPY f80 = ALPHA, f80
  5718. nop __LINE__
  5719. }
  5720. { .mfb
  5721. STFD [C9 ] = f66, SIZE
  5722. FMPY f82 = ALPHA, f82
  5723. nop __LINE__
  5724. }
  5725. ;;
  5726. { .mfb
  5727. STFD [C1 ] = f65, 3 * SIZE
  5728. FMPY f81 = ALPHA, f81
  5729. nop __LINE__
  5730. }
  5731. { .mfb
  5732. STFD [C9 ] = f67, 3 * SIZE
  5733. FMPY f83 = ALPHA, f83
  5734. nop __LINE__
  5735. }
  5736. ;;
  5737. { .mfi
  5738. STFD [C2 ] = f72, SIZE
  5739. FMPY f88 = ALPHA, f88
  5740. #if defined(TRMMKERNEL) && \
  5741. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5742. sub L = K, KK
  5743. #else
  5744. nop __LINE__
  5745. #endif
  5746. }
  5747. { .mfb
  5748. STFD [C10] = f74, SIZE
  5749. FMPY f90 = ALPHA, f90
  5750. nop __LINE__
  5751. }
  5752. ;;
  5753. { .mfi
  5754. STFD [C2 ] = f73, 3 * SIZE
  5755. FMPY f89 = ALPHA, f89
  5756. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5757. adds L = -4, L
  5758. #else
  5759. nop __LINE__
  5760. #endif
  5761. }
  5762. { .mfi
  5763. STFD [C10] = f75, 3 * SIZE
  5764. FMPY f91 = ALPHA, f91
  5765. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5766. adds L = -4, L
  5767. #else
  5768. nop __LINE__
  5769. #endif
  5770. }
  5771. ;;
  5772. { .mfi
  5773. STFD [C3 ] = f80, SIZE
  5774. mov f80 = f0
  5775. #if defined(TRMMKERNEL) && \
  5776. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5777. shladd KK8 = L, BASE_SHIFT, r0
  5778. #else
  5779. nop __LINE__
  5780. #endif
  5781. }
  5782. { .mfb
  5783. STFD [C11] = f82, SIZE
  5784. mov f64 = f0
  5785. nop __LINE__
  5786. }
  5787. ;;
  5788. { .mfi
  5789. STFD [C3 ] = f81, 3 * SIZE
  5790. mov f81 = f0
  5791. #if defined(TRMMKERNEL) && \
  5792. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5793. shladd AOFFSET = KK8, 2, AOFFSET
  5794. #else
  5795. nop __LINE__
  5796. #endif
  5797. }
  5798. { .mfi
  5799. STFD [C11] = f83, 3 * SIZE
  5800. mov f72 = f0
  5801. #if defined(TRMMKERNEL) && \
  5802. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5803. shladd BOFFSET = KK8, 2, BOFFSET
  5804. #else
  5805. nop __LINE__
  5806. #endif
  5807. }
  5808. ;;
  5809. { .mfi
  5810. STFD [C4 ] = f88, SIZE
  5811. mov f88 = f0
  5812. #if defined(TRMMKERNEL) && defined(LEFT)
  5813. adds KK = 4, KK
  5814. #else
  5815. nop __LINE__
  5816. #endif
  5817. }
  5818. { .mfb
  5819. STFD [C12] = f90, SIZE
  5820. mov f65 = f0
  5821. nop __LINE__
  5822. }
  5823. ;;
  5824. { .mfi
  5825. STFD [C4 ] = f89, 3 * SIZE
  5826. mov f89 = f0
  5827. #ifdef TRMMKERNEL
  5828. shladd KK8 = KK, BASE_SHIFT, r0
  5829. #else
  5830. nop __LINE__
  5831. #endif
  5832. }
  5833. { .mfb
  5834. STFD [C12] = f91, 3 * SIZE
  5835. mov f73 = f0
  5836. nop __LINE__
  5837. }
  5838. ;;
  5839. #endif
  /* .L070: set-up for the 2-row remainder of M (taken when bit 1 of M is set). */
  /* Computes the K trip count in L, points BOFFSET/AOFFSET at the operands,    */
  /* preloads the first A pair (f32,f33) and four B values (f48-f51), and       */
  /* programs ar.lc for the counted loop at .L072.                              */
  /* NOTE(review): LDFPD/SIZE/PREFETCHSIZE are macros defined outside this      */
  /* excerpt; LDFPD is presumably a floating-point pair load -- confirm.        */
  5840. .align 32
  5841. .L070:
  5842. { .mib
  /* TRMM builds: L starts as K - KK, KK + 2 or KK + 4 depending on LEFT/TRANSA. */
  5843. #ifndef TRMMKERNEL
  5844. nop __LINE__
  5845. #else
  5846. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5847. sub L = K, KK
  5848. #elif defined(LEFT)
  5849. adds L = 2, KK
  5850. #else
  5851. adds L = 4, KK
  5852. #endif
  5853. #endif
  /* p6 = (bit 1 of M is 0), p7 = its complement; nothing to do here when p6. */
  5854. tbit.z p6,p7 = M, 1
  5855. (p6) br.cond.dptk .L080
  5856. }
  5857. ;;
  /* Variant 1: B is read from its start; L becomes (count + 1). */
  5858. #if !defined(TRMMKERNEL) || \
  5859. defined(TRMMKERNEL) && \
  5860. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5861. { .mmi
  5862. LDFPD f48, f49 = [B]
  5863. adds BOFFSET = 2 * SIZE, B
  5864. #ifndef TRMMKERNEL
  5865. adds L = 1, K
  5866. #else
  5867. adds L = 1, L
  5868. #endif
  5869. }
  5870. ;;
  5871. #else
  /* Variant 2: skip ahead first: BOFFSET = B + (KK8 << 2), AOFFSET += (KK8 << 1). */
  5872. { .mmi
  5873. shladd BOFFSET = KK8, 2, B
  5874. shladd AOFFSET = KK8, 1, AOFFSET
  5875. nop __LINE__
  5876. }
  5877. ;;
  5878. { .mmi
  5879. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5880. #ifndef TRMMKERNEL
  5881. adds L = 1, K
  5882. #else
  5883. adds L = 1, L
  5884. #endif
  5885. nop __LINE__
  5886. }
  5887. ;;
  5888. #endif
  /* p3 starts true.  p12 = (bit 0 of count + 1 is 0), i.e. the count is odd;   */
  /* .L072 uses p12 to clear p3 on its final pass.  L = (count + 1) >> 1.       */
  5889. { .mii
  5890. cmp.eq p3, p0 = r0, r0
  5891. tbit.z p12, p0 = L, 0
  5892. shr L = L, 1
  5893. }
  5894. ;;
  /* L - 1 goes into ar.lc: br.cloop repeats while LC != 0, so the loop body    */
  /* executes L times in total.                                                 */
  5895. { .mmi
  5896. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5897. adds L = -1, L
  5898. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5899. }
  5900. ;;
  5901. { .mmi
  5902. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5903. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5904. mov ar.lc = L
  5905. }
  5906. ;;
  /* .L072: counted K loop and write-back for the 2-row by 4-column tile.       */
  /* Each pass holds two k-steps:                                               */
  /*   first half  : A = f32,f33  B = f48-f51  (always executed)                */
  /*   second half : A = f40,f41  B = f56-f59  (predicated on p3)               */
  /* Accumulators: f64/f65, f72/f73, f80/f81, f88/f89 (one pair per B value).   */
  /* Predicates inside the loop:                                                */
  /*   p3 - second half enabled; when p12 is set it is cleared once L == 0      */
  /*   p4 - L != 0: another pass follows, so reload the first-half operands     */
  /*   p5 - L == 0: last pass, so fetch the old C values (non-TRMM builds       */
  /*        without BETAZERO only)                                              */
  /* NOTE(review): FMA/FMPY/LDFD/LDFPD/STFD are macros defined outside this     */
  /* excerpt; "FMA d = a, b, c" is presumably d = a * b + c -- confirm.         */
  5907. .align 32
  5908. .L072:
  5909. { .mfb
  5910. lfetch.nt1 [PREA], 4 * SIZE
  5911. FMA f64 = f32, f48, f64 // A1 * B1
  5912. nop __LINE__
  5913. }
  5914. { .mfi
  5915. nop __LINE__
  5916. FMA f72 = f32, f49, f72 // A1 * B2
  5917. (p12) cmp.ne p3, p0 = 0, L
  5918. }
  5919. ;;
  5920. { .mfi
  5921. lfetch.nt1 [PREB], 8 * SIZE
  5922. FMA f80 = f32, f50, f80 // A1 * B3
  5923. cmp.ne p4, p5 = 0, L
  5924. }
  5925. { .mfb
  5926. nop __LINE__
  5927. FMA f88 = f32, f51, f88 // A1 * B4
  5928. nop __LINE__
  5929. }
  5930. ;;
  5931. { .mfi
  5932. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5933. FMA f65 = f33, f48, f65 // A2 * B1
  5934. }
  5935. { .mfi
  5936. nop __LINE__
  5937. FMA f73 = f33, f49, f73 // A2 * B2
  5938. }
  5939. ;;
  5940. { .mfi
  5941. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5942. FMA f81 = f33, f50, f81 // A2 * B3
  5943. }
  /* Old C values are read with +SIZE then -SIZE post-increments, so C1..C4     */
  /* end up back at their starting addresses for the stores below.              */
  5944. { .mmf
  5945. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5946. (p5) LDFD f68 = [C1 ], SIZE
  5947. (p5) LDFD f76 = [C2 ], SIZE
  5948. #else
  5949. nop __LINE__
  5950. nop __LINE__
  5951. #endif
  5952. FMA f89 = f33, f51, f89 // A2 * B4
  5953. }
  5954. ;;
  /* Second k-step of this pass (skipped when p3 is clear). */
  5955. { .mfb
  5956. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5957. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5958. nop __LINE__
  5959. }
  5960. { .mmf
  5961. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5962. (p5) LDFD f69 = [C1 ], -1 * SIZE
  5963. (p5) LDFD f77 = [C2 ], -1 * SIZE
  5964. #else
  5965. nop __LINE__
  5966. nop __LINE__
  5967. #endif
  5968. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5969. }
  5970. ;;
  5971. { .mfb
  5972. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5973. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5974. nop __LINE__
  5975. }
  5976. { .mmf
  5977. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5978. (p5) LDFD f84 = [C3 ], SIZE
  5979. (p5) LDFD f92 = [C4 ], SIZE
  5980. #else
  5981. nop __LINE__
  5982. nop __LINE__
  5983. #endif
  5984. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  5985. }
  5986. ;;
  5987. { .mfb
  5988. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5989. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5990. nop __LINE__
  5991. }
  5992. { .mfb
  5993. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5994. (p5) LDFD f85 = [C3 ], -1 * SIZE
  5995. #else
  5996. nop __LINE__
  5997. #endif
  5998. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5999. nop __LINE__
  6000. }
  6001. ;;
  /* L is decremented alongside ar.lc so the L-based compares above see the     */
  /* remaining pass count.                                                      */
  6002. { .mfi
  6003. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6004. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  6005. adds L = -1, L
  6006. }
  6007. { .mfb
  6008. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6009. (p5) LDFD f93 = [C4 ], -1 * SIZE
  6010. #else
  6011. nop __LINE__
  6012. #endif
  6013. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  6014. br.cloop.sptk.few .L072
  6015. }
  6016. ;;
  /* Write-back, variant 1: combine each accumulator with ALPHA and the old C   */
  /* value loaded above, store two elements through each of C1..C4, and clear   */
  /* f64/f72/f80/f88 from f0 for the next tile.                                 */
  6017. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6018. FMA f64 = ALPHA, f64, f68
  6019. FMA f65 = ALPHA, f65, f69
  6020. FMA f72 = ALPHA, f72, f76
  6021. FMA f73 = ALPHA, f73, f77
  6022. FMA f80 = ALPHA, f80, f84
  6023. FMA f81 = ALPHA, f81, f85
  6024. FMA f88 = ALPHA, f88, f92
  6025. FMA f89 = ALPHA, f89, f93
  6026. ;;
  6027. { .mfb
  6028. STFD [C1 ] = f64, SIZE
  6029. mov f64 = f0
  6030. nop __LINE__
  6031. }
  6032. { .mfb
  6033. STFD [C2 ] = f72, SIZE
  6034. mov f72 = f0
  6035. nop __LINE__
  6036. }
  6037. ;;
  6038. { .mmi
  6039. STFD [C1 ] = f65, SIZE
  6040. STFD [C2 ] = f73, SIZE
  6041. nop __LINE__
  6042. }
  6043. ;;
  6044. { .mfi
  6045. STFD [C3 ] = f80, SIZE
  6046. mov f80 = f0
  6047. adds L = 1, K
  6048. }
  6049. { .mfb
  6050. STFD [C4 ] = f88, SIZE
  6051. mov f88 = f0
  6052. nop __LINE__
  6053. }
  6054. ;;
  6055. { .mmi
  6056. STFD [C3 ] = f81, SIZE
  6057. STFD [C4 ] = f89, SIZE
  6058. shr L = L, 1
  6059. }
  6060. ;;
  6061. #else
  /* Write-back, variant 2 (TRMM or BETAZERO): scale the accumulators by ALPHA  */
  /* only; no old C values are read.  In the TRMM LEFT&&TRANSA and              */
  /* !LEFT&&!TRANSA cases, AOFFSET/BOFFSET are additionally advanced by         */
  /* K - KK - 2 (LEFT) or K - KK - 4 (!LEFT) k-steps.                           */
  6062. FMPY f64 = ALPHA, f64
  6063. FMPY f65 = ALPHA, f65
  6064. ;;
  6065. { .mfi
  6066. nop __LINE__
  6067. FMPY f72 = ALPHA, f72
  6068. #if defined(TRMMKERNEL) && \
  6069. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6070. sub L = K, KK
  6071. #else
  6072. nop __LINE__
  6073. #endif
  6074. }
  6075. { .mfi
  6076. nop __LINE__
  6077. FMPY f73 = ALPHA, f73
  6078. nop __LINE__
  6079. }
  6080. ;;
  6081. { .mfi
  6082. FMPY f80 = ALPHA, f80
  6083. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6084. adds L = -2, L
  6085. #else
  6086. nop __LINE__
  6087. #endif
  6088. }
  6089. { .mfi
  6090. FMPY f81 = ALPHA, f81
  6091. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6092. adds L = -4, L
  6093. #else
  6094. nop __LINE__
  6095. #endif
  6096. }
  6097. ;;
  /* KK8 = L << BASE_SHIFT (NOTE(review): presumably a byte offset for L        */
  /* elements; BASE_SHIFT is defined outside this excerpt).                     */
  6098. { .mfi
  6099. nop __LINE__
  6100. FMPY f88 = ALPHA, f88
  6101. #if defined(TRMMKERNEL) && \
  6102. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6103. shladd KK8 = L, BASE_SHIFT, r0
  6104. #else
  6105. nop __LINE__
  6106. #endif
  6107. }
  6108. { .mfi
  6109. nop __LINE__
  6110. FMPY f89 = ALPHA, f89
  6111. nop __LINE__
  6112. }
  6113. ;;
  /* AOFFSET += KK8 << 1 (two A values per k-step). */
  6114. { .mfi
  6115. STFD [C1 ] = f64, SIZE
  6116. mov f64 = f0
  6117. #if defined(TRMMKERNEL) && \
  6118. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6119. shladd AOFFSET = KK8, 1, AOFFSET
  6120. #else
  6121. nop __LINE__
  6122. #endif
  6123. }
  6124. { .mfb
  6125. STFD [C2 ] = f72, SIZE
  6126. mov f72 = f0
  6127. nop __LINE__
  6128. }
  6129. ;;
  /* BOFFSET += KK8 << 2 (four B values per k-step). */
  6130. { .mmi
  6131. STFD [C1 ] = f65, SIZE
  6132. STFD [C2 ] = f73, SIZE
  6133. #if defined(TRMMKERNEL) && \
  6134. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6135. shladd BOFFSET = KK8, 2, BOFFSET
  6136. #else
  6137. nop __LINE__
  6138. #endif
  6139. }
  6140. ;;
  /* TRMM LEFT: account for the two rows just finished. */
  6141. { .mfi
  6142. STFD [C3 ] = f80, SIZE
  6143. mov f80 = f0
  6144. #if defined(TRMMKERNEL) && defined(LEFT)
  6145. adds KK = 2, KK
  6146. #else
  6147. nop __LINE__
  6148. #endif
  6149. }
  6150. { .mfb
  6151. STFD [C4 ] = f88, SIZE
  6152. mov f88 = f0
  6153. nop __LINE__
  6154. }
  6155. ;;
  /* Refresh KK8 = KK << BASE_SHIFT for the next tile's set-up. */
  6156. { .mmi
  6157. STFD [C3 ] = f81, SIZE
  6158. STFD [C4 ] = f89, SIZE
  6159. #ifdef TRMMKERNEL
  6160. shladd KK8 = KK, BASE_SHIFT, r0
  6161. #else
  6162. nop __LINE__
  6163. #endif
  6164. }
  6165. ;;
  6166. #endif
  /* .L080: set-up for the single-row remainder of M (taken when bit 0 of M is  */
  /* set).  Same structure as the 2-row set-up, but one A value is loaded per   */
  /* k-step (f32) while B still supplies four values (f48-f51).                 */
  /* NOTE(review): LDFD/LDFPD/SIZE are macros defined outside this excerpt.     */
  6167. .align 32
  6168. .L080:
  6169. { .mib
  /* TRMM builds: L starts as K - KK, KK + 1 or KK + 4 depending on LEFT/TRANSA. */
  6170. #ifndef TRMMKERNEL
  6171. nop __LINE__
  6172. #else
  6173. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6174. sub L = K, KK
  6175. #elif defined(LEFT)
  6176. adds L = 1, KK
  6177. #else
  6178. adds L = 4, KK
  6179. #endif
  6180. #endif
  /* p6 = (bit 0 of M is 0): no odd row, go straight to the panel epilogue. */
  6181. tbit.z p6,p7 = M, 0
  6182. (p6) br.cond.dptk .L089
  6183. }
  6184. ;;
  /* Variant 1: B is read from its start; L becomes (count + 1). */
  6185. #if !defined(TRMMKERNEL) || \
  6186. defined(TRMMKERNEL) && \
  6187. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6188. { .mmi
  6189. LDFPD f48, f49 = [B]
  6190. adds BOFFSET = 2 * SIZE, B
  6191. #ifndef TRMMKERNEL
  6192. adds L = 1, K
  6193. #else
  6194. adds L = 1, L
  6195. #endif
  6196. }
  6197. ;;
  6198. #else
  /* Variant 2: skip ahead first: BOFFSET = B + (KK8 << 2), AOFFSET += KK8. */
  6199. { .mmi
  6200. shladd BOFFSET = KK8, 2, B
  6201. add AOFFSET = KK8, AOFFSET
  6202. nop __LINE__
  6203. }
  6204. ;;
  6205. { .mmi
  6206. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6207. #ifndef TRMMKERNEL
  6208. adds L = 1, K
  6209. #else
  6210. adds L = 1, L
  6211. #endif
  6212. nop __LINE__
  6213. }
  6214. ;;
  6215. #endif
  /* p12 = (bit 0 of count + 1 is 0), i.e. the count is odd; L = (count + 1) >> 1. */
  6216. { .mii
  6217. LDFD f32 = [AOFFSET], 1 * SIZE
  6218. tbit.z p12, p0 = L, 0
  6219. shr L = L, 1
  6220. }
  6221. ;;
  6222. { .mmi
  6223. nop __LINE__
  6224. nop __LINE__
  6225. adds L = -1, L
  6226. }
  6227. ;;
  /* p3 starts true; ar.lc = L - 1, so the .L082 body executes L times. */
  6228. { .mmi
  6229. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6230. cmp.eq p3, p0 = r0, r0
  6231. mov ar.lc = L
  6232. }
  6233. ;;
  /* .L082: counted K loop and write-back for the 1-row by 4-column tile.       */
  /* Each pass holds two k-steps:                                               */
  /*   first half  : A = f32  B = f48-f51  (always executed)                    */
  /*   second half : A = f40  B = f56-f59  (predicated on p3)                   */
  /* Accumulators: f64, f72, f80, f88 (one per B value).                        */
  /* Predicates: p4 = (L != 0) reload operands for another pass; p5 = (L == 0)  */
  /* last pass, fetch old C; p3 is cleared on the last pass when p12 is set.    */
  /* NOTE(review): FMA/FMPY/LDFD/LDFPD/STFD are macros defined outside this     */
  /* excerpt; "FMA d = a, b, c" is presumably d = a * b + c -- confirm.         */
  6234. .align 32
  6235. .L082:
  6236. { .mfb
  6237. cmp.ne p4, p5 = 0, L
  6238. FMA f64 = f32, f48, f64 // A1 * B1
  6239. nop __LINE__
  6240. }
  6241. { .mfi
  6242. (p12) cmp.ne p3, p0 = 0, L
  6243. FMA f72 = f32, f49, f72 // A1 * B2
  6244. nop __LINE__
  6245. }
  6246. ;;
  6247. { .mfb
  6248. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6249. FMA f80 = f32, f50, f80 // A1 * B3
  6250. nop __LINE__
  6251. }
  6252. { .mfb
  6253. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  6254. FMA f88 = f32, f51, f88 // A1 * B4
  6255. nop __LINE__
  6256. }
  6257. ;;
  /* Second k-step of this pass (skipped when p3 is clear). */
  6258. { .mfb
  6259. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  6260. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6261. nop __LINE__
  6262. }
  /* Old C values are read without post-increment, so C1..C4 stay in place. */
  6263. { .mfb
  6264. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6265. (p5) LDFD f68 = [C1]
  6266. #else
  6267. nop __LINE__
  6268. #endif
  6269. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6270. nop __LINE__
  6271. }
  6272. ;;
  6273. { .mmf
  6274. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6275. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  6276. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6277. }
  6278. { .mmf
  6279. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6280. (p5) LDFD f76 = [C2]
  6281. (p5) LDFD f84 = [C3]
  6282. #else
  6283. nop __LINE__
  6284. nop __LINE__
  6285. #endif
  6286. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  6287. }
  6288. ;;
  6289. { .mib
  6290. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6291. nop __LINE__
  6292. nop __LINE__
  6293. }
  6294. { .mmb
  6295. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6296. (p5) LDFD f92 = [C4]
  6297. #else
  6298. nop __LINE__
  6299. #endif
  6300. adds L = -1, L
  6301. br.cloop.sptk.few .L082
  6302. }
  6303. ;;
  /* Write-back, variant 1: combine each accumulator with ALPHA and the old C   */
  /* value, then store one element through each of C1..C4.                      */
  6304. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6305. FMA f64 = ALPHA, f64, f68
  6306. FMA f72 = ALPHA, f72, f76
  6307. FMA f80 = ALPHA, f80, f84
  6308. FMA f88 = ALPHA, f88, f92
  6309. ;;
  6310. STFD [C1 ] = f64, SIZE
  6311. STFD [C2 ] = f72, SIZE
  6312. STFD [C3 ] = f80, SIZE
  6313. STFD [C4 ] = f88, SIZE
  6314. ;;
  6315. #else
  /* Write-back, variant 2 (TRMM or BETAZERO): scale by ALPHA only.  In the     */
  /* TRMM LEFT&&TRANSA and !LEFT&&!TRANSA cases, AOFFSET/BOFFSET are also       */
  /* advanced by K - KK - 1 (LEFT) or K - KK - 4 (!LEFT) k-steps.               */
  6316. { .mfi
  6317. nop __LINE__
  6318. FMPY f64 = ALPHA, f64
  6319. #if defined(TRMMKERNEL) && \
  6320. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6321. sub L = K, KK
  6322. #else
  6323. nop __LINE__
  6324. #endif
  6325. }
  6326. { .mfi
  6327. nop __LINE__
  6328. FMPY f72 = ALPHA, f72
  6329. nop __LINE__
  6330. }
  6331. ;;
  6332. { .mfi
  6333. FMPY f80 = ALPHA, f80
  6334. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6335. adds L = -1, L
  6336. #else
  6337. nop __LINE__
  6338. #endif
  6339. }
  6340. { .mfi
  6341. FMPY f88 = ALPHA, f88
  6342. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6343. adds L = -4, L
  6344. #else
  6345. nop __LINE__
  6346. #endif
  6347. }
  6348. ;;
  /* KK8 = L << BASE_SHIFT is computed before the stop inside this bundle;      */
  /* after the stop, AOFFSET += KK8 and BOFFSET += KK8 << 2 consume it.         */
  6349. { .mmi
  6350. #if defined(TRMMKERNEL) && \
  6351. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6352. shladd KK8 = L, BASE_SHIFT, r0
  6353. #else
  6354. nop __LINE__
  6355. #endif
  6356. ;;
  6357. #if defined(TRMMKERNEL) && \
  6358. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6359. add AOFFSET = KK8, AOFFSET
  6360. #else
  6361. nop __LINE__
  6362. #endif
  6363. #if defined(TRMMKERNEL) && \
  6364. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6365. shladd BOFFSET = KK8, 2, BOFFSET
  6366. #else
  6367. nop __LINE__
  6368. #endif
  6369. }
  6370. ;;
  /* TRMM LEFT: account for the single row just finished. */
  6371. { .mmi
  6372. STFD [C1 ] = f64, SIZE
  6373. STFD [C2 ] = f72, SIZE
  6374. #if defined(TRMMKERNEL) && defined(LEFT)
  6375. adds KK = 1, KK
  6376. #else
  6377. nop __LINE__
  6378. #endif
  6379. }
  6380. ;;
  /* Refresh KK8 = KK << BASE_SHIFT. */
  6381. { .mmi
  6382. STFD [C3 ] = f80, SIZE
  6383. STFD [C4 ] = f88, SIZE
  6384. #ifdef TRMMKERNEL
  6385. shladd KK8 = KK, BASE_SHIFT, r0
  6386. #else
  6387. nop __LINE__
  6388. #endif
  6389. }
  6390. ;;
  6391. #endif
  /* .L089: end of the current panel.  B takes the value BOFFSET has reached,   */
  /* AOFFSET is rewound to A, and in TRMM builds without LEFT, KK is advanced   */
  /* by 4.                                                                      */
  6392. .align 32
  6393. .L089:
  6394. { .mmi
  6395. mov B = BOFFSET
  6396. mov AOFFSET = A
  6397. #if defined(TRMMKERNEL) && !defined(LEFT)
  6398. adds KK = 4, KK
  6399. #else
  6400. nop __LINE__
  6401. #endif
  6402. }
  6403. ;;
  /* .L090: entry of the two-column pass.  Skipped (branch to .L130) when bit 1 */
  /* of N is clear.  Otherwise sets C1 = C and C2 = C + LDC, advances C by      */
  /* 2 * LDC, clears the first accumulators, and computes I = M >> 3; when I is */
  /* zero the 8-row loop is bypassed via .L100.                                 */
  6404. .align 16
  6405. .L090:
  6406. { .mfi
  6407. mov C1 = C
  6408. mov f64 = f0
  6409. tbit.z p6, p0 = N, 1
  6410. }
  6411. { .mfi
  6412. add C2 = LDC, C
  6413. mov f72 = f0
  6414. shr I = M, 3
  6415. }
  6416. ;;
  /* f66 is zeroed by moving integer r0 into it; the others are copied from f0. */
  /* TRMM LEFT builds restart KK from OFFSET for this pass.                     */
  6417. { .mfi
  6418. setf.d f66 = r0
  6419. mov f65 = f0
  6420. #if defined(TRMMKERNEL) && defined(LEFT)
  6421. mov KK = OFFSET
  6422. #else
  6423. nop __LINE__
  6424. #endif
  6425. }
  6426. { .mfb
  6427. mov AOFFSET = A
  6428. mov f73 = f0
  6429. (p6) br.cond.dpnt .L130
  6430. }
  6431. ;;
  /* KK8 = KK << BASE_SHIFT (TRMM only); C = (LDC << 1) + C. */
  6432. { .mfi
  6433. #ifdef TRMMKERNEL
  6434. shladd KK8 = KK, BASE_SHIFT, r0
  6435. #else
  6436. nop __LINE__
  6437. #endif
  6438. mov f67 = f0
  6439. shladd C = LDC, 1, C
  6440. }
  /* p6 = (I == 0): no full 8-row block. */
  6441. { .mfb
  6442. cmp.eq p6, p7 = 0, I
  6443. mov f74 = f0
  6444. (p6) br.cond.dpnt .L100
  6445. }
  6446. ;;
  /* .L092: set-up for one 8-row by 2-column tile.  Loads the first B pair      */
  /* (f48,f49) and eight A values (f32-f39), clears the accumulators not        */
  /* already zeroed (f68-f71, f75-f79), computes the K trip count, programs     */
  /* ar.lc, and initialises the PREA/PREB/PREC prefetch pointers.               */
  /* NOTE(review): LDFPD/CPREFETCH/SIZE/PREFETCHSIZE/CPREFETCHSIZE are macros   */
  /* defined outside this excerpt; CPREFETCH is presumably a prefetch of the C  */
  /* rows about to be updated -- confirm.                                       */
  6447. .align 32
  6448. .L092:
  /* Variant 1: B is read from its start. */
  6449. #if !defined(TRMMKERNEL) || \
  6450. defined(TRMMKERNEL) && \
  6451. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6452. { .mfb
  6453. LDFPD f48, f49 = [B]
  6454. mov f68 = f0
  6455. nop __LINE__
  6456. }
  6457. { .mfb
  6458. adds BOFFSET = 2 * SIZE, B
  6459. mov f79 = f0
  6460. nop __LINE__
  6461. }
  6462. ;;
  6463. #else
  /* Variant 2: skip ahead first: BOFFSET = B + (KK8 << 1), AOFFSET += (KK8 << 3). */
  6464. { .mfi
  6465. shladd BOFFSET = KK8, 1, B
  6466. mov f68 = f0
  6467. shladd AOFFSET = KK8, 3, AOFFSET
  6468. }
  6469. ;;
  6470. { .mfi
  6471. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6472. mov f79 = f0
  6473. nop __LINE__
  6474. }
  6475. ;;
  6476. #endif
  /* TRMM builds: L starts as K - KK, KK + 8 or KK + 2 depending on LEFT/TRANSA. */
  6477. { .mfi
  6478. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6479. mov f75 = f0
  6480. #ifndef TRMMKERNEL
  6481. nop __LINE__
  6482. #else
  6483. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6484. sub L = K, KK
  6485. #elif defined(LEFT)
  6486. adds L = 8, KK
  6487. #else
  6488. adds L = 2, KK
  6489. #endif
  6490. #endif
  6491. }
  6492. ;;
  /* L becomes (count + 1). */
  6493. { .mfi
  6494. adds PREC = CPREFETCHSIZE * SIZE, C1
  6495. mov f76 = f0
  6496. #ifndef TRMMKERNEL
  6497. adds L = 1, K
  6498. #else
  6499. adds L = 1, L
  6500. #endif
  6501. }
  6502. ;;
  /* p12 = (bit 0 of count + 1 is 0), i.e. the count is odd; p3 starts true;    */
  /* L = (count + 1) >> 1.                                                      */
  6503. { .mfi
  6504. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6505. mov f69 = f0
  6506. tbit.z p12, p0 = L, 0
  6507. }
  6508. { .mfi
  6509. cmp.eq p3, p0 = r0, r0
  6510. mov f77 = f0
  6511. shr L = L, 1
  6512. }
  6513. ;;
  6514. { .mfi
  6515. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6516. adds L = -1, L
  6517. }
  6518. { .mmf
  6519. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6520. CPREFETCH [PREC], LDC
  6521. mov f70 = f0
  6522. }
  6523. ;;
  /* ar.lc = L - 1, so the counted loop that follows executes L times. */
  6524. { .mfi
  6525. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6526. mov f78 = f0
  6527. mov ar.lc = L
  6528. }
  6529. { .mfi
  6530. CPREFETCH [PREC]
  6531. mov f71 = f0
  6532. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6533. }
  6534. ;;
  6535. .align 32
  6536. .L093:
  6537. /* 1 */
  6538. { .mfi
  6539. lfetch.nt1 [PREA], 16 * SIZE
  6540. FMA f64 = f32, f48, f64 // A1 * B1
  6541. cmp.ne p4, p5 = 0, L
  6542. }
  6543. { .mfi
  6544. nop __LINE__
  6545. FMA f72 = f32, f49, f72 // A1 * B2
  6546. (p12) cmp.ne p3, p0 = 0, L
  6547. }
  6548. ;;
  6549. { .mfi
  6550. lfetch.nt1 [PREB], 4 * SIZE
  6551. FMA f65 = f33, f48, f65 // A2 * B1
  6552. adds C9 = 4 * SIZE, C1
  6553. }
  6554. { .mfi
  6555. nop __LINE__
  6556. FMA f73 = f33, f49, f73 // A2 * B2
  6557. adds C10 = 4 * SIZE, C2
  6558. }
  6559. ;;
  6560. { .mfi
  6561. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6562. FMA f66 = f34, f48, f66 // A3 * B1
  6563. adds C11 = 4 * SIZE, C3
  6564. }
  6565. { .mfi
  6566. nop __LINE__
  6567. FMA f74 = f34, f49, f74 // A3 * B2
  6568. adds C12 = 4 * SIZE, C4
  6569. }
  6570. ;;
  6571. { .mfb
  6572. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6573. FMA f67 = f35, f48, f67 // A4 * B1
  6574. nop __LINE__
  6575. }
  6576. { .mfb
  6577. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6578. (p5) LDFD f96 = [C1 ], SIZE
  6579. #else
  6580. nop __LINE__
  6581. #endif
  6582. FMA f75 = f35, f49, f75 // A4 * B2
  6583. nop __LINE__
  6584. }
  6585. ;;
  6586. { .mfb
  6587. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  6588. FMA f68 = f36, f48, f68 // A5 * B1
  6589. nop __LINE__
  6590. }
  6591. { .mfb
  6592. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6593. (p5) LDFD f97 = [C9 ], SIZE
  6594. #else
  6595. nop __LINE__
  6596. #endif
  6597. FMA f76 = f36, f49, f76 // A5 * B2
  6598. nop __LINE__
  6599. }
  6600. ;;
  6601. { .mfb
  6602. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  6603. FMA f69 = f37, f48, f69 // A6 * B1
  6604. nop __LINE__
  6605. }
  6606. { .mfb
  6607. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6608. (p5) LDFD f98 = [C1 ], SIZE
  6609. #else
  6610. nop __LINE__
  6611. #endif
  6612. FMA f77 = f37, f49, f77 // A6 * B2
  6613. nop __LINE__
  6614. }
  6615. ;;
  6616. { .mfb
  6617. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  6618. FMA f70 = f38, f48, f70 // A7 * B1
  6619. nop __LINE__
  6620. }
  6621. { .mfb
  6622. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6623. (p5) LDFD f99 = [C9 ], SIZE
  6624. #else
  6625. nop __LINE__
  6626. #endif
  6627. FMA f78 = f38, f49, f78 // A7 * B2
  6628. nop __LINE__
  6629. }
  6630. ;;
  6631. { .mfb
  6632. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6633. FMA f71 = f39, f48, f71 // A8 * B1
  6634. nop __LINE__
  6635. }
  6636. { .mfb
  6637. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6638. (p5) LDFD f100 = [C1 ], SIZE
  6639. #else
  6640. nop __LINE__
  6641. #endif
  6642. FMA f79 = f39, f49, f79 // A8 * B2
  6643. nop __LINE__
  6644. }
  6645. ;;
  6646. { .mfb
  6647. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6648. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6649. nop __LINE__
  6650. }
  6651. { .mfb
  6652. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6653. (p5) LDFD f101 = [C9 ], SIZE
  6654. #else
  6655. nop __LINE__
  6656. #endif
  6657. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6658. nop __LINE__
  6659. }
  6660. ;;
  6661. { .mfb
  6662. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6663. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6664. nop __LINE__
  6665. }
  6666. { .mfb
  6667. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6668. (p5) LDFD f102 = [C1 ], -3 * SIZE
  6669. #else
  6670. nop __LINE__
  6671. #endif
  6672. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  6673. nop __LINE__
  6674. }
  6675. ;;
  6676. { .mfb
  6677. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6678. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  6679. nop __LINE__
  6680. }
  6681. { .mfb
  6682. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6683. (p5) LDFD f103 = [C9 ], -3 * SIZE
  6684. #else
  6685. nop __LINE__
  6686. #endif
  6687. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  6688. nop __LINE__
  6689. }
  6690. ;;
  6691. { .mfb
  6692. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6693. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  6694. nop __LINE__
  6695. }
  6696. { .mfb
  6697. nop __LINE__
  6698. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  6699. nop __LINE__
  6700. }
  6701. ;;
  6702. { .mfb
  6703. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6704. (p5) LDFD f104 = [C2 ], SIZE
  6705. #else
  6706. nop __LINE__
  6707. #endif
  6708. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  6709. nop __LINE__
  6710. }
  6711. { .mfb
  6712. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6713. (p5) LDFD f105 = [C10], SIZE
  6714. #else
  6715. nop __LINE__
  6716. #endif
  6717. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  6718. nop __LINE__
  6719. }
  6720. ;;
  6721. { .mfb
  6722. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6723. (p5) LDFD f106 = [C2 ], SIZE
  6724. #else
  6725. nop __LINE__
  6726. #endif
  6727. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  6728. nop __LINE__
  6729. }
  6730. { .mfb
  6731. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6732. (p5) LDFD f107 = [C10], SIZE
  6733. #else
  6734. nop __LINE__
  6735. #endif
  6736. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  6737. nop __LINE__
  6738. }
  6739. ;;
  6740. { .mfb
  6741. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6742. (p5) LDFD f108 = [C2 ], SIZE
  6743. #else
  6744. nop __LINE__
  6745. #endif
  6746. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  6747. nop __LINE__
  6748. }
  6749. { .mfb
  6750. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6751. (p5) LDFD f109 = [C10], SIZE
  6752. #else
  6753. nop __LINE__
  6754. #endif
  6755. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  6756. nop __LINE__
  6757. }
  6758. ;;
  6759. { .mfi
  6760. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6761. (p5) LDFD f110 = [C2 ], -3 * SIZE
  6762. #else
  6763. nop __LINE__
  6764. #endif
  6765. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  6766. adds L = -1, L
  6767. }
  6768. { .mfb
  6769. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6770. (p5) LDFD f111 = [C10], -3 * SIZE
  6771. #else
  6772. nop __LINE__
  6773. #endif
  6774. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  6775. br.cloop.sptk.few .L093
  6776. }
  6777. ;;
  6778. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6779. { .mfi
  6780. nop __LINE__
  6781. FMA f64 = ALPHA, f64, f96
  6782. cmp.ne p6, p0 = 1, I
  6783. }
  6784. { .mfb
  6785. nop __LINE__
  6786. FMA f68 = ALPHA, f68, f97
  6787. nop __LINE__
  6788. }
  6789. ;;
  6790. { .mfi
  6791. nop __LINE__
  6792. FMA f65 = ALPHA, f65, f98
  6793. adds I = -1, I
  6794. }
  6795. { .mfb
  6796. nop __LINE__
  6797. FMA f69 = ALPHA, f69, f99
  6798. nop __LINE__
  6799. }
  6800. ;;
  6801. { .mfi
  6802. nop __LINE__
  6803. FMA f66 = ALPHA, f66, f100
  6804. nop __LINE__
  6805. }
  6806. { .mfb
  6807. nop __LINE__
  6808. FMA f70 = ALPHA, f70, f101
  6809. nop __LINE__
  6810. }
  6811. ;;
  6812. { .mfb
  6813. nop __LINE__
  6814. FMA f67 = ALPHA, f67, f102
  6815. nop __LINE__
  6816. }
  6817. { .mfb
  6818. nop __LINE__
  6819. FMA f71 = ALPHA, f71, f103
  6820. nop __LINE__
  6821. }
  6822. ;;
  6823. { .mfb
  6824. STFD [C1 ] = f64, SIZE
  6825. FMA f72 = ALPHA, f72, f104
  6826. nop __LINE__
  6827. }
  6828. { .mfb
  6829. STFD [C9 ] = f68, SIZE
  6830. FMA f76 = ALPHA, f76, f105
  6831. nop __LINE__
  6832. }
  6833. ;;
  6834. { .mfb
  6835. STFD [C1 ] = f65, SIZE
  6836. FMA f73 = ALPHA, f73, f106
  6837. nop __LINE__
  6838. }
  6839. { .mfb
  6840. STFD [C9 ] = f69, SIZE
  6841. FMA f77 = ALPHA, f77, f107
  6842. nop __LINE__
  6843. }
  6844. ;;
  6845. { .mfb
  6846. STFD [C1 ] = f66, SIZE
  6847. FMA f74 = ALPHA, f74, f108
  6848. nop __LINE__
  6849. }
  6850. { .mfb
  6851. STFD [C9 ] = f70, SIZE
  6852. FMA f78 = ALPHA, f78, f109
  6853. nop __LINE__
  6854. }
  6855. ;;
  6856. { .mfb
  6857. STFD [C1 ] = f67, 5 * SIZE
  6858. FMA f75 = ALPHA, f75, f110
  6859. nop __LINE__
  6860. }
  6861. { .mfb
  6862. STFD [C9 ] = f71, 5 * SIZE
  6863. FMA f79 = ALPHA, f79, f111
  6864. nop __LINE__
  6865. }
  6866. ;;
  6867. { .mfb
  6868. STFD [C2 ] = f72, SIZE
  6869. mov f64 = f0
  6870. nop __LINE__
  6871. }
  6872. { .mfb
  6873. STFD [C10] = f76, SIZE
  6874. mov f72 = f0
  6875. nop __LINE__
  6876. }
  6877. ;;
  6878. { .mfb
  6879. STFD [C2 ] = f73, SIZE
  6880. mov f65 = f0
  6881. nop __LINE__
  6882. }
  6883. { .mfb
  6884. STFD [C10] = f77, SIZE
  6885. mov f73 = f0
  6886. nop __LINE__
  6887. }
  6888. ;;
  6889. { .mfb
  6890. STFD [C2 ] = f74, SIZE
  6891. mov f66 = f0
  6892. nop __LINE__
  6893. }
  6894. { .mfb
  6895. STFD [C10] = f78, SIZE
  6896. mov f74 = f0
  6897. nop __LINE__
  6898. }
  6899. ;;
  6900. { .mfb
  6901. STFD [C2 ] = f75, 5 * SIZE
  6902. mov f67 = f0
  6903. nop __LINE__
  6904. }
  6905. { .mfb
  6906. STFD [C10] = f79, 5 * SIZE
  6907. (p6) br.cond.dptk .L092
  6908. }
  6909. ;;
  6910. #else
  6911. { .mfi
  6912. nop __LINE__
  6913. FMPY f64 = ALPHA, f64
  6914. cmp.ne p6, p0 = 1, I
  6915. }
  6916. { .mfb
  6917. nop __LINE__
  6918. FMPY f68 = ALPHA, f68
  6919. nop __LINE__
  6920. }
  6921. ;;
  6922. { .mfi
  6923. nop __LINE__
  6924. FMPY f65 = ALPHA, f65
  6925. adds I = -1, I
  6926. }
  6927. { .mfb
  6928. nop __LINE__
  6929. FMPY f69 = ALPHA, f69
  6930. nop __LINE__
  6931. }
  6932. ;;
  6933. { .mfi
  6934. nop __LINE__
  6935. FMPY f66 = ALPHA, f66
  6936. nop __LINE__
  6937. }
  6938. { .mfb
  6939. nop __LINE__
  6940. FMPY f70 = ALPHA, f70
  6941. nop __LINE__
  6942. }
  6943. ;;
  6944. { .mfb
  6945. nop __LINE__
  6946. FMPY f67 = ALPHA, f67
  6947. nop __LINE__
  6948. }
  6949. { .mfb
  6950. nop __LINE__
  6951. FMPY f71 = ALPHA, f71
  6952. nop __LINE__
  6953. }
  6954. ;;
  6955. { .mfb
  6956. STFD [C1 ] = f64, SIZE
  6957. FMPY f72 = ALPHA, f72
  6958. nop __LINE__
  6959. }
  6960. { .mfb
  6961. STFD [C9 ] = f68, SIZE
  6962. FMPY f76 = ALPHA, f76
  6963. nop __LINE__
  6964. }
  6965. ;;
  6966. { .mfb
  6967. STFD [C1 ] = f65, SIZE
  6968. FMPY f73 = ALPHA, f73
  6969. nop __LINE__
  6970. }
  6971. { .mfb
  6972. STFD [C9 ] = f69, SIZE
  6973. FMPY f77 = ALPHA, f77
  6974. nop __LINE__
  6975. }
  6976. ;;
  6977. { .mfi
  6978. STFD [C1 ] = f66, SIZE
  6979. FMPY f74 = ALPHA, f74
  6980. #if defined(TRMMKERNEL) && \
  6981. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6982. sub L = K, KK
  6983. #else
  6984. nop __LINE__
  6985. #endif
  6986. }
  6987. { .mfb
  6988. STFD [C9 ] = f70, SIZE
  6989. FMPY f78 = ALPHA, f78
  6990. nop __LINE__
  6991. }
  6992. ;;
  6993. { .mfi
  6994. STFD [C1 ] = f67, 5 * SIZE
  6995. FMPY f75 = ALPHA, f75
  6996. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6997. adds L = -8, L
  6998. #else
  6999. nop __LINE__
  7000. #endif
  7001. }
  7002. { .mfi
  7003. STFD [C9 ] = f71, 5 * SIZE
  7004. FMPY f79 = ALPHA, f79
  7005. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7006. adds L = -2, L
  7007. #else
  7008. nop __LINE__
  7009. #endif
  7010. }
  7011. ;;
  7012. { .mfi
  7013. STFD [C2 ] = f72, SIZE
  7014. mov f64 = f0
  7015. #if defined(TRMMKERNEL) && \
  7016. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7017. shladd KK8 = L, BASE_SHIFT, r0
  7018. #else
  7019. nop __LINE__
  7020. #endif
  7021. }
  7022. { .mfb
  7023. STFD [C10] = f76, SIZE
  7024. mov f72 = f0
  7025. nop __LINE__
  7026. }
  7027. ;;
  7028. { .mfi
  7029. STFD [C2 ] = f73, SIZE
  7030. mov f65 = f0
  7031. #if defined(TRMMKERNEL) && \
  7032. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7033. shladd AOFFSET = KK8, 3, AOFFSET
  7034. #else
  7035. nop __LINE__
  7036. #endif
  7037. }
  7038. { .mfi
  7039. STFD [C10] = f77, SIZE
  7040. mov f73 = f0
  7041. #if defined(TRMMKERNEL) && \
  7042. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7043. shladd BOFFSET = KK8, 1, BOFFSET
  7044. #else
  7045. nop __LINE__
  7046. #endif
  7047. }
  7048. ;;
  7049. { .mfi
  7050. STFD [C2 ] = f74, SIZE
  7051. mov f66 = f0
  7052. #if defined(TRMMKERNEL) && defined(LEFT)
  7053. adds KK = 8, KK
  7054. #else
  7055. nop __LINE__
  7056. #endif
  7057. }
  7058. { .mfb
  7059. STFD [C10] = f78, SIZE
  7060. mov f74 = f0
  7061. nop __LINE__
  7062. }
  7063. ;;
  7064. { .mfi
  7065. STFD [C2 ] = f75, 5 * SIZE
  7066. mov f67 = f0
  7067. #ifdef TRMMKERNEL
  7068. shladd KK8 = KK, BASE_SHIFT, r0
  7069. #else
  7070. nop __LINE__
  7071. #endif
  7072. }
  7073. { .mib
  7074. STFD [C10] = f79, 5 * SIZE
  7075. nop __LINE__
  7076. (p6) br.cond.dptk .L092
  7077. }
  7078. ;;
  7079. #endif
  7080. .align 32
  /*
   * .L100 -- set-up for a 4-row by 2-column tile.
   *
   * Skipped (branch to .L110) when bit 2 of M is clear.  Otherwise:
   *   - TRMMKERNEL builds derive the trip count L from K and KK according
   *     to LEFT/TRANSA; plain GEMM builds use K.
   *   - f48/f49 are preloaded from the B panel, f32..f35 from the A panel.
   *   - f75 is cleared here; the other accumulators used by .L102 are not
   *     cleared in this block.
   *   - L is turned into a loop count: L = ((L + 1) >> 1) - 1 -> ar.lc.
   *     p12 is set when bit 0 of (L + 1) is clear; .L102 uses it to drop
   *     the second half of its last pass.  p3 starts out true.
   */
  7081. .L100:
  7082. { .mib
  7083. #ifndef TRMMKERNEL
  7084. nop __LINE__
  7085. #else
  7086. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7087. sub L = K, KK
  7088. #elif defined(LEFT)
  7089. adds L = 4, KK
  7090. #else
  7091. adds L = 2, KK
  7092. #endif
  7093. #endif
  7094. tbit.z p6, p7 = M, 2
  7095. (p6) br.cond.dptk .L110
  7096. }
  7097. ;;
  /* Path 1: read B from its start.  Path 2 (#else): offset A and B by KK8. */
  7098. #if !defined(TRMMKERNEL) || \
  7099. defined(TRMMKERNEL) && \
  7100. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7101. { .mmf
  7102. LDFPD f48, f49 = [B]
  7103. adds BOFFSET = 2 * SIZE, B
  7104. mov f75 = f0
  7105. }
  7106. { .mii
  7107. nop __LINE__
  7108. #ifndef TRMMKERNEL
  7109. adds L = 1, K
  7110. #else
  7111. adds L = 1, L
  7112. #endif
  7113. }
  7114. ;;
  7115. #else
  7116. { .mfi
  7117. shladd BOFFSET = KK8, 1, B
  7118. mov f75 = f0
  7119. shladd AOFFSET = KK8, 2, AOFFSET
  7120. }
  7121. ;;
  7122. { .mmi
  7123. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7124. nop __LINE__
  7125. adds L = 1, L
  7126. }
  7127. ;;
  7128. #endif
  7129. ;;
  /* p12 <- (bit 0 of L == 0), then halve L: each loop pass covers two k steps. */
  7130. { .mii
  7131. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  7132. tbit.z p12, p0 = L, 0
  7133. shr L = L, 1
  7134. }
  7135. ;;
  7136. { .mmi
  7137. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7138. nop __LINE__
  7139. adds L = -1, L
  7140. }
  7141. ;;
  /* p3 <- true (r0 == r0); ar.lc <- remaining pass count for br.cloop. */
  7142. { .mmi
  7143. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7144. cmp.eq p3, p0 = r0, r0
  7145. mov ar.lc = L
  7146. }
  7147. ;;
  7148. .align 32
  /*
   * .L102 -- inner loop and write-back for the 4x2 tile.
   *
   * Accumulators: f64..f67 (A1..A4 * B1) and f72..f75 (A1..A4 * B2).
   * Each pass performs two k steps:
   *   step 1 uses f32..f35 / f48,f49 and is unconditional;
   *   step 2 uses f40..f43 / f56,f57 and runs under p3.
   * Predicates recomputed at the top of every pass:
   *   p4 = (L != 0)  -> reload step-1 operands for the next pass
   *   p5 = (L == 0)  -> final pass; in the GEMM, non-BETAZERO build the
   *                     existing C values are loaded here
   *   p3 is rewritten only when p12 is set, as (L != 0), so in that case
   *   step 2 is dropped on the final pass.
   * C9 = C1 + 2*SIZE and C10 = C2 + 2*SIZE are (re)derived inside the loop.
   * L is decremented once per pass; br.cloop counts ar.lc down.
   */
  7149. .L102:
  7150. { .mfi
  7151. lfetch.nt1 [PREA], 8 * SIZE
  7152. FMA f64 = f32, f48, f64 // A1 * B1
  7153. cmp.ne p4, p5 = 0, L
  7154. }
  7155. { .mfi
  7156. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7157. FMA f72 = f32, f49, f72 // A1 * B2
  7158. (p12) cmp.ne p3, p0 = 0, L
  7159. }
  7160. ;;
  7161. { .mfi
  7162. lfetch.nt1 [PREB], 4 * SIZE
  7163. FMA f65 = f33, f48, f65 // A2 * B1
  7164. adds C9 = 2 * SIZE, C1
  7165. }
  7166. { .mfi
  7167. nop __LINE__
  7168. FMA f73 = f33, f49, f73 // A2 * B2
  7169. adds C10 = 2 * SIZE, C2
  7170. }
  7171. ;;
  7172. { .mfb
  7173. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7174. FMA f66 = f34, f48, f66 // A3 * B1
  7175. nop __LINE__
  7176. }
  7177. { .mfb
  7178. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7179. (p5) LDFD f68 = [C1 ], SIZE
  7180. #else
  7181. nop __LINE__
  7182. #endif
  7183. FMA f74 = f34, f49, f74 // A3 * B2
  7184. nop __LINE__
  7185. }
  7186. ;;
  7187. { .mfb
  7188. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7189. FMA f67 = f35, f48, f67 // A4 * B1
  7190. nop __LINE__
  7191. }
  7192. { .mfb
  7193. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7194. (p5) LDFD f70 = [C9 ], SIZE
  7195. #else
  7196. nop __LINE__
  7197. #endif
  7198. FMA f75 = f35, f49, f75 // A4 * B2
  7199. nop __LINE__
  7200. }
  7201. ;;
  /* Second k step of the pass: everything below is predicated on p3. */
  7202. { .mfb
  7203. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  7204. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7205. nop __LINE__
  7206. }
  7207. { .mfb
  7208. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7209. (p5) LDFD f69 = [C1 ], -1 * SIZE
  7210. #else
  7211. nop __LINE__
  7212. #endif
  7213. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7214. nop __LINE__
  7215. }
  7216. ;;
  7217. { .mfb
  7218. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7219. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7220. nop __LINE__
  7221. }
  7222. { .mfb
  7223. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7224. (p5) LDFD f71 = [C9 ], -1 * SIZE
  7225. #else
  7226. nop __LINE__
  7227. #endif
  7228. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  7229. nop __LINE__
  7230. }
  7231. ;;
  7232. { .mfb
  7233. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7234. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  7235. nop __LINE__
  7236. }
  7237. { .mfb
  7238. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7239. (p5) LDFD f76 = [C2 ], SIZE
  7240. #else
  7241. nop __LINE__
  7242. #endif
  7243. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  7244. nop __LINE__
  7245. }
  7246. ;;
  7247. { .mfi
  7248. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7249. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  7250. adds L = -1, L
  7251. }
  7252. { .mfb
  7253. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7254. (p5) LDFD f78 = [C10], SIZE
  7255. #else
  7256. nop __LINE__
  7257. #endif
  7258. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  7259. br.cloop.sptk.few .L102
  7260. }
  7261. ;;
  /*
   * Write-back of the 4x2 tile.
   *
   * GEMM, non-BETAZERO build: the last two C values (f77, f79) are loaded,
   * each accumulator is combined with ALPHA and its C value through the FMA
   * macro (taken to be d = a * b + c; the macro is defined outside this
   * chunk), and the results are stored.  C1/C9 and C2/C10 each end up
   * 4*SIZE further on.  f64, f65, f72, f73 are cleared for the next tile.
   * The "adds L = 1, K" / "shr L = L, 1" pair here writes L, which .L110
   * recomputes before use.
   *
   * Other builds (#else): accumulators are scaled by ALPHA with FMPY and
   * stored without reading C.  In TRMMKERNEL builds the integer slots also
   * advance AOFFSET/BOFFSET by the unused tail of the panel (K - KK minus
   * 4 or 2, shifted by BASE_SHIFT), bump KK by 4 when LEFT is defined, and
   * refresh KK8 = KK << BASE_SHIFT.
   */
  7262. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7263. { .mfb
  7264. LDFD f77 = [C2 ], -1 * SIZE
  7265. FMA f64 = ALPHA, f64, f68
  7266. nop __LINE__
  7267. }
  7268. { .mfb
  7269. LDFD f79 = [C10], -1 * SIZE
  7270. FMA f66 = ALPHA, f66, f70
  7271. nop __LINE__
  7272. }
  7273. ;;
  7274. FMA f65 = ALPHA, f65, f69
  7275. adds L = 1, K
  7276. FMA f67 = ALPHA, f67, f71
  7277. ;;
  7278. FMA f72 = ALPHA, f72, f76
  7279. shr L = L, 1
  7280. FMA f74 = ALPHA, f74, f78
  7281. FMA f73 = ALPHA, f73, f77
  7282. FMA f75 = ALPHA, f75, f79
  7283. ;;
  7284. { .mmf
  7285. STFD [C1 ] = f64, SIZE
  7286. STFD [C9 ] = f66, SIZE
  7287. mov f64 = f0
  7288. }
  7289. ;;
  7290. { .mmf
  7291. STFD [C1 ] = f65, 3 * SIZE
  7292. STFD [C9 ] = f67, 3 * SIZE
  7293. mov f65 = f0
  7294. }
  7295. ;;
  7296. { .mmf
  7297. STFD [C2 ] = f72, SIZE
  7298. STFD [C10] = f74, SIZE
  7299. mov f72 = f0
  7300. }
  7301. ;;
  7302. { .mmf
  7303. STFD [C2 ] = f73, 3 * SIZE
  7304. STFD [C10] = f75, 3 * SIZE
  7305. mov f73 = f0
  7306. }
  7307. ;;
  7308. #else
  7309. { .mfb
  7310. nop __LINE__
  7311. FMPY f64 = ALPHA, f64
  7312. nop __LINE__
  7313. }
  7314. { .mfb
  7315. nop __LINE__
  7316. FMPY f66 = ALPHA, f66
  7317. nop __LINE__
  7318. }
  7319. ;;
  7320. FMPY f65 = ALPHA, f65
  7321. FMPY f67 = ALPHA, f67
  7322. ;;
  7323. { .mfi
  7324. nop __LINE__
  7325. FMPY f72 = ALPHA, f72
  7326. #if defined(TRMMKERNEL) && \
  7327. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7328. sub L = K, KK
  7329. #else
  7330. nop __LINE__
  7331. #endif
  7332. }
  7333. { .mfi
  7334. nop __LINE__
  7335. FMPY f74 = ALPHA, f74
  7336. nop __LINE__
  7337. }
  7338. ;;
  7339. { .mfi
  7340. nop __LINE__
  7341. FMPY f73 = ALPHA, f73
  7342. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7343. adds L = -4, L
  7344. #else
  7345. nop __LINE__
  7346. #endif
  7347. }
  7348. { .mfi
  7349. nop __LINE__
  7350. FMPY f75 = ALPHA, f75
  7351. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7352. adds L = -2, L
  7353. #else
  7354. nop __LINE__
  7355. #endif
  7356. }
  7357. ;;
  7358. { .mfi
  7359. STFD [C1 ] = f64, SIZE
  7360. mov f64 = f0
  7361. #if defined(TRMMKERNEL) && \
  7362. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7363. shladd KK8 = L, BASE_SHIFT, r0
  7364. #else
  7365. nop __LINE__
  7366. #endif
  7367. }
  7368. { .mmi
  7369. STFD [C9 ] = f66, SIZE
  7370. nop __LINE__
  7371. nop __LINE__
  7372. }
  7373. ;;
  7374. { .mfi
  7375. STFD [C1 ] = f65, 3 * SIZE
  7376. mov f65 = f0
  7377. #if defined(TRMMKERNEL) && \
  7378. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7379. shladd AOFFSET = KK8, 2, AOFFSET
  7380. #else
  7381. nop __LINE__
  7382. #endif
  7383. }
  7384. { .mmi
  7385. STFD [C9 ] = f67, 3 * SIZE
  7386. nop __LINE__
  7387. #if defined(TRMMKERNEL) && \
  7388. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7389. shladd BOFFSET = KK8, 1, BOFFSET
  7390. #else
  7391. nop __LINE__
  7392. #endif
  7393. }
  7394. ;;
  7395. { .mfi
  7396. STFD [C2 ] = f72, SIZE
  7397. mov f72 = f0
  7398. #if defined(TRMMKERNEL) && defined(LEFT)
  7399. adds KK = 4, KK
  7400. #else
  7401. nop __LINE__
  7402. #endif
  7403. }
  7404. { .mmi
  7405. STFD [C10] = f74, SIZE
  7406. nop __LINE__
  7407. nop __LINE__
  7408. }
  7409. ;;
  7410. { .mfi
  7411. STFD [C2 ] = f73, 3 * SIZE
  7412. mov f73 = f0
  7413. #ifdef TRMMKERNEL
  7414. shladd KK8 = KK, BASE_SHIFT, r0
  7415. #else
  7416. nop __LINE__
  7417. #endif
  7418. }
  7419. { .mib
  7420. STFD [C10] = f75, 3 * SIZE
  7421. nop __LINE__
  7422. nop __LINE__
  7423. }
  7424. ;;
  7425. #endif
  7426. .align 32
  /*
   * .L110 -- set-up for a 2-row by 2-column tile.
   *
   * Skipped (branch to .L120) when bit 1 of M is clear.  TRMMKERNEL builds
   * derive L from K and KK (both the LEFT and the non-LEFT case add 2 to KK
   * here); GEMM builds use K.  f48/f49 are preloaded from the B panel and
   * f32/f33 from the A panel.  As in the wider tiles, L becomes
   * ((L + 1) >> 1) - 1 in ar.lc, p12 records that bit 0 of (L + 1) is clear,
   * and p3 starts out true.  PREA/PREB are the prefetch cursors for the loop.
   */
  7427. .L110:
  7428. { .mib
  7429. #ifndef TRMMKERNEL
  7430. nop __LINE__
  7431. #else
  7432. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7433. sub L = K, KK
  7434. #elif defined(LEFT)
  7435. adds L = 2, KK
  7436. #else
  7437. adds L = 2, KK
  7438. #endif
  7439. #endif
  7440. tbit.z p6, p7 = M, 1
  7441. (p6) br.cond.dptk .L120
  7442. }
  7443. ;;
  /* Path 1: read B from its start.  Path 2 (#else): offset A and B by KK8. */
  7444. #if !defined(TRMMKERNEL) || \
  7445. defined(TRMMKERNEL) && \
  7446. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7447. { .mmi
  7448. LDFPD f48, f49 = [B]
  7449. adds BOFFSET = 2 * SIZE, B
  7450. #ifndef TRMMKERNEL
  7451. adds L = 1, K
  7452. #else
  7453. adds L = 1, L
  7454. #endif
  7455. }
  7456. ;;
  7457. #else
  7458. { .mmi
  7459. shladd BOFFSET = KK8, 1, B
  7460. shladd AOFFSET = KK8, 1, AOFFSET
  7461. }
  7462. ;;
  7463. { .mmi
  7464. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7465. nop __LINE__
  7466. adds L = 1, L
  7467. }
  7468. ;;
  7469. #endif
  7470. ;;
  /* p12 <- (bit 0 of L == 0), then halve L: each loop pass covers two k steps. */
  7471. { .mii
  7472. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  7473. tbit.z p12, p0 = L, 0
  7474. shr L = L, 1
  7475. }
  7476. ;;
  7477. { .mmi
  7478. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7479. nop __LINE__
  7480. adds L = -1, L
  7481. }
  7482. ;;
  7483. { .mmi
  7484. cmp.eq p3, p0 = r0, r0
  7485. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7486. mov ar.lc = L
  7487. }
  7488. ;;
  7489. .align 32
  /*
   * .L112 -- inner loop and write-back for the 2x2 tile.
   *
   * Accumulators: f64, f65 (A1, A2 * B1) and f72, f73 (A1, A2 * B2).
   * Each pass performs two k steps: the first on f32,f33 / f48,f49, the
   * second (under p3) on f40,f41 / f56,f57.
   *   p4 = (L != 0)  -> reload first-step operands for the next pass
   *   p5 = (L == 0)  -> final pass; GEMM non-BETAZERO builds load C here
   *   p3 is rewritten only when p12 is set, as (L != 0).
   * C loads: f68,f69 from C1 and f76,f77 from C2; each pointer is stepped
   * +SIZE then -SIZE, so it is back at its starting value after the loop.
   */
  7490. .L112:
  7491. { .mfi
  7492. lfetch.nt1 [PREA], 4 * SIZE
  7493. FMA f64 = f32, f48, f64 // A1 * B1
  7494. cmp.ne p4, p5 = 0, L
  7495. }
  7496. { .mfi
  7497. lfetch.nt1 [PREB], 4 * SIZE
  7498. FMA f72 = f32, f49, f72 // A1 * B2
  7499. (p12) cmp.ne p3, p0 = 0, L
  7500. }
  7501. ;;
  7502. { .mmf
  7503. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7504. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7505. FMA f65 = f33, f48, f65 // A2 * B1
  7506. }
  7507. { .mmf
  7508. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7509. (p5) LDFD f68 = [C1 ], SIZE
  7510. (p5) LDFD f76 = [C2 ], SIZE
  7511. #else
  7512. nop __LINE__
  7513. nop __LINE__
  7514. #endif
  7515. FMA f73 = f33, f49, f73 // A2 * B2
  7516. }
  7517. ;;
  7518. { .mfb
  7519. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7520. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7521. nop __LINE__
  7522. }
  7523. { .mfb
  7524. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7525. (p5) LDFD f69 = [C1 ], -1 * SIZE
  7526. #else
  7527. nop __LINE__
  7528. #endif
  7529. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7530. nop __LINE__
  7531. }
  7532. ;;
  7533. { .mfi
  7534. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7535. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7536. adds L = -1, L
  7537. }
  7538. { .mfb
  7539. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7540. (p5) LDFD f77 = [C2 ], -1 * SIZE
  7541. #else
  7542. nop __LINE__
  7543. #endif
  7544. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  7545. br.cloop.sptk.few .L112
  7546. }
  7547. ;;
  /*
   * Write-back of the 2x2 tile.
   *
   * GEMM, non-BETAZERO build: each accumulator is combined with ALPHA and
   * its loaded C value via the FMA macro (taken to be d = a * b + c; defined
   * outside this chunk) and stored; C1 and C2 advance by 2*SIZE in total.
   *
   * Other builds (#else): accumulators are scaled by ALPHA with FMPY and
   * stored.  TRMMKERNEL builds also advance AOFFSET/BOFFSET by
   * (K - KK - 2) << BASE_SHIFT times 2 each, add 2 to KK when LEFT is
   * defined, and refresh KK8.  One bundle below carries a stop (;;) after
   * its first slot so that KK8 is written before it is read.
   *
   * All four accumulators are cleared after being stored.
   */
  7548. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7549. FMA f64 = ALPHA, f64, f68
  7550. FMA f65 = ALPHA, f65, f69
  7551. FMA f72 = ALPHA, f72, f76
  7552. FMA f73 = ALPHA, f73, f77
  7553. ;;
  7554. { .mfi
  7555. STFD [C1 ] = f64, SIZE
  7556. mov f64 = f0
  7557. nop __LINE__
  7558. }
  7559. { .mfb
  7560. STFD [C2 ] = f72, SIZE
  7561. mov f72 = f0
  7562. nop __LINE__
  7563. }
  7564. ;;
  7565. { .mfi
  7566. STFD [C1 ] = f65, SIZE
  7567. mov f65 = f0
  7568. nop __LINE__
  7569. }
  7570. { .mfb
  7571. STFD [C2 ] = f73, SIZE
  7572. mov f73 = f0
  7573. nop __LINE__
  7574. }
  7575. ;;
  7576. #else
  7577. { .mfi
  7578. nop __LINE__
  7579. FMPY f64 = ALPHA, f64
  7580. #if defined(TRMMKERNEL) && \
  7581. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7582. sub L = K, KK
  7583. #else
  7584. nop __LINE__
  7585. #endif
  7586. }
  7587. { .mfi
  7588. nop __LINE__
  7589. FMPY f65 = ALPHA, f65
  7590. nop __LINE__
  7591. }
  7592. ;;
  7593. { .mfi
  7594. nop __LINE__
  7595. FMPY f72 = ALPHA, f72
  7596. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7597. adds L = -2, L
  7598. #else
  7599. nop __LINE__
  7600. #endif
  7601. }
  7602. { .mfi
  7603. nop __LINE__
  7604. FMPY f73 = ALPHA, f73
  7605. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7606. adds L = -2, L
  7607. #else
  7608. nop __LINE__
  7609. #endif
  7610. }
  7611. ;;
  7612. { .mmi
  7613. #if defined(TRMMKERNEL) && \
  7614. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7615. shladd KK8 = L, BASE_SHIFT, r0
  7616. #else
  7617. nop __LINE__
  7618. #endif
  7619. ;;
  7620. #if defined(TRMMKERNEL) && \
  7621. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7622. shladd AOFFSET = KK8, 1, AOFFSET
  7623. #else
  7624. nop __LINE__
  7625. #endif
  7626. #if defined(TRMMKERNEL) && \
  7627. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7628. shladd BOFFSET = KK8, 1, BOFFSET
  7629. #else
  7630. nop __LINE__
  7631. #endif
  7632. }
  7633. ;;
  7634. { .mfi
  7635. STFD [C1 ] = f64, SIZE
  7636. mov f64 = f0
  7637. #if defined(TRMMKERNEL) && defined(LEFT)
  7638. adds KK = 2, KK
  7639. #else
  7640. nop __LINE__
  7641. #endif
  7642. }
  7643. { .mfb
  7644. STFD [C2 ] = f72, SIZE
  7645. mov f72 = f0
  7646. nop __LINE__
  7647. }
  7648. ;;
  7649. { .mfi
  7650. STFD [C1 ] = f65, SIZE
  7651. mov f65 = f0
  7652. #ifdef TRMMKERNEL
  7653. shladd KK8 = KK, BASE_SHIFT, r0
  7654. #else
  7655. nop __LINE__
  7656. #endif
  7657. }
  7658. { .mfb
  7659. STFD [C2 ] = f73, SIZE
  7660. mov f73 = f0
  7661. nop __LINE__
  7662. }
  7663. ;;
  7664. #endif
  7665. .align 32
  /*
   * .L120 -- set-up for a 1-row by 2-column tile.
   *
   * Skipped (branch to .L129) when bit 0 of M is clear.  TRMMKERNEL builds
   * derive L from K and KK (KK + 1 for LEFT, KK + 2 otherwise, or K - KK);
   * GEMM builds use K.  f48/f49 are preloaded from the B panel and f32 from
   * the A panel.  L becomes ((L + 1) >> 1) - 1 in ar.lc, p12 records that
   * bit 0 of (L + 1) is clear, and p3 starts out true.
   */
  7666. .L120:
  7667. { .mib
  7668. #ifndef TRMMKERNEL
  7669. nop __LINE__
  7670. #else
  7671. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7672. sub L = K, KK
  7673. #elif defined(LEFT)
  7674. adds L = 1, KK
  7675. #else
  7676. adds L = 2, KK
  7677. #endif
  7678. #endif
  7679. tbit.z p6, p7 = M, 0
  7680. (p6) br.cond.dptk .L129
  7681. }
  7682. ;;
  /* Path 1: read B from its start.  Path 2 (#else): offset A by KK8, B by 2*KK8. */
  7683. #if !defined(TRMMKERNEL) || \
  7684. defined(TRMMKERNEL) && \
  7685. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7686. { .mmi
  7687. LDFPD f48, f49 = [B]
  7688. adds BOFFSET = 2 * SIZE, B
  7689. #ifndef TRMMKERNEL
  7690. adds L = 1, K
  7691. #else
  7692. adds L = 1, L
  7693. #endif
  7694. }
  7695. ;;
  7696. #else
  7697. { .mmi
  7698. shladd BOFFSET = KK8, 1, B
  7699. add AOFFSET = KK8, AOFFSET
  7700. }
  7701. ;;
  7702. { .mmi
  7703. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7704. nop __LINE__
  7705. adds L = 1, L
  7706. }
  7707. ;;
  7708. #endif
  /* p12 <- (bit 0 of L == 0), then halve L: each loop pass covers two k steps. */
  7709. { .mii
  7710. nop __LINE__
  7711. tbit.z p12, p0 = L, 0
  7712. shr L = L, 1
  7713. }
  7714. ;;
  7715. { .mmi
  7716. LDFD f32 = [AOFFSET], 1 * SIZE
  7717. nop __LINE__
  7718. adds L = -1, L
  7719. }
  7720. ;;
  7721. { .mmi
  7722. cmp.eq p3, p0 = r0, r0
  7723. nop __LINE__
  7724. mov ar.lc = L
  7725. }
  7726. ;;
  7727. .align 32
  /*
   * .L122 -- inner loop for the 1x2 tile.
   *
   * Accumulators: f64 (A1 * B1) and f72 (A1 * B2).  Each pass performs two
   * k steps: the first on f32 / f48,f49, the second (under p3) on
   * f40 / f56,f57.
   *   p4 = (L != 0)  -> reload f32 and f48,f49 for the next pass
   *   p5 = (L == 0)  -> final pass; GEMM non-BETAZERO builds load the two
   *                     C values into f68 and f76 (no pointer update)
   *   p3 is rewritten only when p12 is set, as (L != 0).
   */
  7728. .L122:
  7729. { .mfi
  7730. FMA f64 = f32, f48, f64 // A1 * B1
  7731. cmp.ne p4, p5 = 0, L
  7732. }
  7733. { .mfi
  7734. nop __LINE__
  7735. FMA f72 = f32, f49, f72 // A1 * B2
  7736. (p12) cmp.ne p3, p0 = 0, L
  7737. }
  7738. ;;
  7739. { .mmi
  7740. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7741. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  7742. nop __LINE__
  7743. }
  7744. { .mmi
  7745. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7746. (p5) LDFD f68 = [C1]
  7747. (p5) LDFD f76 = [C2]
  7748. #else
  7749. nop __LINE__
  7750. nop __LINE__
  7751. #endif
  7752. nop __LINE__
  7753. }
  7754. ;;
  7755. { .mfi
  7756. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7757. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7758. adds L = -1, L
  7759. }
  7760. { .mfb
  7761. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  7762. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7763. br.cloop.sptk.few .L122
  7764. }
  7765. ;;
  /*
   * .L128 -- write-back of the 1x2 tile.
   *
   * GEMM, non-BETAZERO build: f64 and f72 are combined with ALPHA and the
   * loaded C values (f68, f76) via the FMA macro (taken to be d = a * b + c;
   * defined outside this chunk) and stored to [C1] and [C2] with no pointer
   * update.
   *
   * Other builds (#else): f64 and f72 are scaled by ALPHA with FMPY and
   * stored.  TRMMKERNEL builds also advance AOFFSET by
   * (K - KK - n) << BASE_SHIFT and BOFFSET by twice that (n = 1 for
   * LEFT && TRANSA, n = 2 for !LEFT && !TRANSA), add 1 to KK when LEFT is
   * defined, and refresh KK8.
   *
   * Both accumulators are cleared after being stored.
   */
  7766. .L128:
  7767. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7768. FMA f64 = ALPHA, f64, f68
  7769. FMA f72 = ALPHA, f72, f76
  7770. ;;
  7771. { .mfi
  7772. STFD [C1 ] = f64
  7773. mov f64 = f0
  7774. }
  7775. { .mfb
  7776. STFD [C2 ] = f72
  7777. mov f72 = f0
  7778. }
  7779. ;;
  7780. #else
  7781. { .mfi
  7782. nop __LINE__
  7783. FMPY f64 = ALPHA, f64
  7784. #if defined(TRMMKERNEL) && \
  7785. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7786. sub L = K, KK
  7787. #else
  7788. nop __LINE__
  7789. #endif
  7790. }
  7791. { .mfi
  7792. nop __LINE__
  7793. FMPY f72 = ALPHA, f72
  7794. nop __LINE__
  7795. }
  7796. ;;
  7797. { .mmi
  7798. nop __LINE__
  7799. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7800. adds L = -1, L
  7801. #else
  7802. nop __LINE__
  7803. #endif
  7804. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7805. adds L = -2, L
  7806. #else
  7807. nop __LINE__
  7808. #endif
  7809. }
  7810. ;;
  7811. { .mmi
  7812. #if defined(TRMMKERNEL) && \
  7813. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7814. shladd KK8 = L, BASE_SHIFT, r0
  7815. #else
  7816. nop __LINE__
  7817. #endif
  7818. ;;
  7819. #if defined(TRMMKERNEL) && \
  7820. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7821. add AOFFSET = KK8, AOFFSET
  7822. #else
  7823. nop __LINE__
  7824. #endif
  7825. #if defined(TRMMKERNEL) && \
  7826. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7827. shladd BOFFSET = KK8, 1, BOFFSET
  7828. #else
  7829. nop __LINE__
  7830. #endif
  7831. }
  7832. ;;
  7833. #if defined(TRMMKERNEL) && defined(LEFT)
  7834. adds KK = 1, KK
  7835. #else
  7836. nop __LINE__
  7837. #endif
  7838. ;;
  7839. { .mfi
  7840. STFD [C1 ] = f64
  7841. mov f64 = f0
  7842. #ifdef TRMMKERNEL
  7843. shladd KK8 = KK, BASE_SHIFT, r0
  7844. #else
  7845. nop __LINE__
  7846. #endif
  7847. }
  7848. { .mfb
  7849. STFD [C2 ] = f72
  7850. mov f72 = f0
  7851. }
  7852. ;;
  7853. #endif
  7854. .align 32
  /*
   * .L129 -- end of the 2-column panel.
   *
   * B takes the current value of BOFFSET, AOFFSET is reset to A, and in
   * TRMMKERNEL builds without LEFT, KK is advanced by 2.
   */
  7855. .L129:
  7856. { .mmi
  7857. mov B = BOFFSET
  7858. mov AOFFSET = A
  7859. #if defined(TRMMKERNEL) && !defined(LEFT)
  7860. adds KK = 2, KK
  7861. #else
  7862. nop __LINE__
  7863. #endif
  7864. }
  7865. ;;
  7866. .align 16
  /*
   * .L130 -- entry for the single remaining column.
   *
   * Branches to .L999 when bit 0 of N is clear.  Otherwise:
   *   - TRMMKERNEL && LEFT builds reload KK from OFFSET, and every
   *     TRMMKERNEL build refreshes KK8 = KK << BASE_SHIFT;
   *   - AOFFSET is reset to A and C1 to C;
   *   - f64..f67 are cleared;
   *   - I = M >> 3; when that is zero control goes straight to .L140.
   */
  7867. .L130:
  7868. { .mfi
  7869. #if defined(TRMMKERNEL) && defined(LEFT)
  7870. mov KK = OFFSET
  7871. #else
  7872. nop __LINE__
  7873. #endif
  7874. mov f64 = f0
  7875. tbit.z p6, p0 = N, 0
  7876. }
  7877. { .mib
  7878. mov AOFFSET = A
  7879. shr I = M, 3
  7880. (p6) br.cond.dpnt .L999
  7881. }
  7882. ;;
  7883. { .mfi
  7884. mov C1 = C
  7885. mov f65 = f0
  7886. #ifdef TRMMKERNEL
  7887. shladd KK8 = KK, BASE_SHIFT, r0
  7888. #else
  7889. nop __LINE__
  7890. #endif
  7891. }
  7892. ;;
  7893. { .mfi
  7894. nop __LINE__
  7895. mov f66 = f0
  7896. nop __LINE__
  7897. }
  /* No full 8-row tile (I == 0): go to the 4-row remainder at .L140. */
  7898. { .mfb
  7899. cmp.eq p7, p0 = 0, I
  7900. mov f67 = f0
  7901. (p7) br.cond.dpnt .L140
  7902. }
  7903. ;;
  7904. .align 32
  /*
   * .L132 -- set-up for an 8-row by 1-column tile (re-entered from .L138
   * once per 8-row tile).
   *
   * f48 is preloaded from the B panel and f32..f39 from the A panel.
   * f68..f71 are cleared here; f64..f67 are not cleared in this block.
   * TRMMKERNEL builds derive L from K and KK (KK + 8 for LEFT, KK + 1
   * otherwise, or K - KK); GEMM builds use K.  L then becomes
   * ((L + 1) >> 1) - 1 in ar.lc, p12 records that bit 0 of (L + 1) is clear,
   * and p3 starts out true.  PREC/PREA are prefetch addresses into C and A.
   */
  7905. .L132:
  7906. #if !defined(TRMMKERNEL) || \
  7907. defined(TRMMKERNEL) && \
  7908. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7909. { .mfb
  7910. LDFD f48 = [B]
  7911. mov f68 = f0
  7912. nop __LINE__
  7913. }
  7914. { .mfi
  7915. adds BOFFSET = 1 * SIZE, B
  7916. mov f69 = f0
  7917. #ifndef TRMMKERNEL
  7918. nop __LINE__
  7919. #else
  7920. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7921. sub L = K, KK
  7922. #elif defined(LEFT)
  7923. adds L = 8, KK
  7924. #else
  7925. adds L = 1, KK
  7926. #endif
  7927. #endif
  7928. }
  7929. ;;
  7930. #else
  7931. { .mfi
  7932. add BOFFSET = KK8, B
  7933. mov f68 = f0
  7934. shladd AOFFSET = KK8, 3, AOFFSET
  7935. }
  7936. ;;
  7937. { .mfi
  7938. LDFD f48 = [BOFFSET], 1 * SIZE
  7939. mov f69 = f0
  7940. #ifndef TRMMKERNEL
  7941. nop __LINE__
  7942. #else
  7943. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7944. sub L = K, KK
  7945. #elif defined(LEFT)
  7946. adds L = 8, KK
  7947. #else
  7948. adds L = 1, KK
  7949. #endif
  7950. #endif
  7951. }
  7952. ;;
  7953. #endif
  7954. { .mfi
  7955. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7956. mov f70 = f0
  7957. #ifndef TRMMKERNEL
  7958. adds L = 1, K
  7959. #else
  7960. adds L = 1, L
  7961. #endif
  7962. }
  7963. ;;
  /* p12 <- (bit 0 of L == 0), then halve L: each loop pass covers two k steps. */
  7964. { .mii
  7965. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7966. tbit.z p12, p0 = L, 0
  7967. shr L = L, 1
  7968. }
  7969. ;;
  7970. { .mfi
  7971. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  7972. mov f71 = f0
  7973. adds L = -1, L
  7974. }
  7975. ;;
  7976. { .mmi
  7977. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  7978. adds PREC = CPREFETCHSIZE * SIZE, C1
  7979. cmp.eq p3, p0 = r0, r0
  7980. }
  7981. ;;
  7982. { .mmi
  7983. CPREFETCH [PREC]
  7984. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  7985. mov ar.lc = L
  7986. }
  7987. ;;
  7988. .align 32
  /*
   * .L133 -- inner loop for the 8x1 tile.
   *
   * Accumulators: f64..f71 (A1..A8 * B1).  Each pass performs two k steps:
   * the first on f32..f39 / f48, the second (under p3) on f40..f47 / f56.
   *   p4 = (L != 0)  -> reload first-step operands for the next pass
   *   p5 = (L == 0)  -> final pass; GEMM non-BETAZERO builds load eight C
   *                     values: f6, f10, f12, f14 through C1 and
   *                     f7, f11, f13, f15 through C9 = C1 + 4*SIZE
   *   p3 is rewritten only when p12 is set, as (L != 0).
   * The C pointers are stepped +SIZE three times and then -3*SIZE, so both
   * are back at their starting values after the loads.
   */
  7989. .L133:
  7990. { .mfi
  7991. lfetch.nt1 [PREA], 16 * SIZE
  7992. FMA f64 = f32, f48, f64 // A1 * B1
  7993. cmp.ne p4, p5 = 0, L
  7994. }
  7995. { .mfi
  7996. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7997. FMA f65 = f33, f48, f65 // A2 * B1
  7998. (p12) cmp.ne p3, p0 = 0, L
  7999. }
  8000. ;;
  8001. { .mfi
  8002. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8003. FMA f66 = f34, f48, f66 // A3 * B1
  8004. adds C9 = 4 * SIZE, C1
  8005. }
  8006. { .mmf
  8007. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  8008. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8009. (p5) LDFD f6 = [C1 ], SIZE
  8010. #else
  8011. nop __LINE__
  8012. #endif
  8013. FMA f67 = f35, f48, f67 // A4 * B1
  8014. }
  8015. ;;
  8016. { .mfb
  8017. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8018. FMA f68 = f36, f48, f68 // A5 * B1
  8019. nop __LINE__
  8020. }
  8021. { .mfb
  8022. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8023. (p5) LDFD f7 = [C9 ], SIZE
  8024. #else
  8025. nop __LINE__
  8026. #endif
  8027. FMA f69 = f37, f48, f69 // A6 * B1
  8028. nop __LINE__
  8029. }
  8030. ;;
  8031. { .mfb
  8032. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  8033. FMA f70 = f38, f48, f70 // A7 * B1
  8034. nop __LINE__
  8035. }
  8036. { .mfb
  8037. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8038. (p5) LDFD f10 = [C1 ], SIZE
  8039. #else
  8040. nop __LINE__
  8041. #endif
  8042. FMA f71 = f39, f48, f71 // A8 * B1
  8043. nop __LINE__
  8044. }
  8045. ;;
  /* Second k step of the pass: the FMAs below are predicated on p3. */
  8046. { .mfb
  8047. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  8048. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8049. nop __LINE__
  8050. }
  8051. { .mfb
  8052. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8053. (p5) LDFD f11 = [C9 ], SIZE
  8054. #else
  8055. nop __LINE__
  8056. #endif
  8057. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8058. nop __LINE__
  8059. }
  8060. ;;
  8061. { .mfb
  8062. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8063. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  8064. nop __LINE__
  8065. }
  8066. { .mmf
  8067. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  8068. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8069. (p5) LDFD f12 = [C1 ], SIZE
  8070. #else
  8071. nop __LINE__
  8072. #endif
  8073. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  8074. }
  8075. ;;
  8076. { .mfb
  8077. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8078. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  8079. nop __LINE__
  8080. }
  8081. { .mfb
  8082. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8083. (p5) LDFD f13 = [C9 ], SIZE
  8084. #else
  8085. nop __LINE__
  8086. #endif
  8087. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  8088. nop __LINE__
  8089. }
  8090. ;;
  8091. { .mfi
  8092. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8093. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  8094. adds L = -1, L
  8095. }
  8096. { .mfb
  8097. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8098. (p5) LDFD f14 = [C1 ], -3 * SIZE
  8099. #else
  8100. nop __LINE__
  8101. #endif
  8102. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  8103. nop __LINE__
  8104. }
  8105. ;;
  8106. { .mfb
  8107. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8108. nop __LINE__
  8109. nop __LINE__
  8110. }
  8111. { .mfb
  8112. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8113. (p5) LDFD f15 = [C9 ], -3 * SIZE
  8114. #else
  8115. nop __LINE__
  8116. #endif
  8117. nop __LINE__
  8118. br.cloop.sptk.few .L133
  8119. }
  8120. ;;
  /*
   * .L138 -- write-back of the 8x1 tile and loop over 8-row tiles.
   *
   * p6 = (I != 1) is taken before I is decremented; while it holds, the
   * final bundle branches back to .L132 for the next 8-row tile.
   *
   * GEMM, non-BETAZERO build: f64..f67 are combined with ALPHA and
   * f6/f10/f12/f14, and f68..f71 with f7/f11/f13/f15, via the FMA macro
   * (taken to be d = a * b + c; defined outside this chunk), then stored
   * through C1 and C9.  Both pointers advance by 8*SIZE in total.
   *
   * Other builds (#else): accumulators are scaled by ALPHA with FMPY and
   * stored.  TRMMKERNEL builds also advance AOFFSET by
   * ((K - KK - n) << BASE_SHIFT) * 8 and BOFFSET by (K - KK - n) << BASE_SHIFT
   * (n = 8 for LEFT && TRANSA, n = 1 for !LEFT && !TRANSA), add 8 to KK
   * when LEFT is defined, and refresh KK8.
   *
   * f64..f67 are cleared here; f68..f71 are not cleared in this block.
   * Several bundles below list fewer instructions than their template has
   * slots; this relies on the assembler padding the remaining slots.
   */
  8121. .L138:
  8122. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8123. { .mfi
  8124. FMA f64 = ALPHA, f64, f6
  8125. cmp.ne p6, p0 = 1, I
  8126. }
  8127. { .mfb
  8128. FMA f68 = ALPHA, f68, f7
  8129. }
  8130. ;;
  8131. { .mfi
  8132. FMA f65 = ALPHA, f65, f10
  8133. adds I = -1, I
  8134. }
  8135. { .mfb
  8136. FMA f69 = ALPHA, f69, f11
  8137. }
  8138. ;;
  8139. { .mfi
  8140. FMA f66 = ALPHA, f66, f12
  8141. }
  8142. { .mfb
  8143. FMA f70 = ALPHA, f70, f13
  8144. }
  8145. ;;
  8146. { .mfb
  8147. FMA f67 = ALPHA, f67, f14
  8148. }
  8149. { .mfb
  8150. FMA f71 = ALPHA, f71, f15
  8151. }
  8152. ;;
  8153. { .mmf
  8154. STFD [C1 ] = f64, SIZE
  8155. STFD [C9 ] = f68, SIZE
  8156. mov f64 = f0
  8157. }
  8158. ;;
  8159. { .mmf
  8160. STFD [C1 ] = f65, SIZE
  8161. STFD [C9 ] = f69, SIZE
  8162. mov f65 = f0
  8163. }
  8164. ;;
  8165. { .mmf
  8166. STFD [C1 ] = f66, SIZE
  8167. STFD [C9 ] = f70, SIZE
  8168. mov f66 = f0
  8169. }
  8170. ;;
  8171. { .mmf
  8172. STFD [C1 ] = f67, 5 * SIZE
  8173. nop __LINE__
  8174. mov f67 = f0
  8175. }
  8176. { .mmb
  8177. STFD [C9 ] = f71, 5 * SIZE
  8178. nop __LINE__
  8179. (p6) br.cond.dptk .L132
  8180. }
  8181. ;;
  8182. #else
  8183. { .mfi
  8184. FMPY f64 = ALPHA, f64
  8185. cmp.ne p6, p0 = 1, I
  8186. }
  8187. { .mfb
  8188. FMPY f68 = ALPHA, f68
  8189. }
  8190. ;;
  8191. { .mfi
  8192. FMPY f65 = ALPHA, f65
  8193. adds I = -1, I
  8194. }
  8195. { .mfb
  8196. FMPY f69 = ALPHA, f69
  8197. }
  8198. ;;
  8199. { .mfi
  8200. FMPY f66 = ALPHA, f66
  8201. #if defined(TRMMKERNEL) && \
  8202. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8203. sub L = K, KK
  8204. #else
  8205. nop __LINE__
  8206. #endif
  8207. }
  8208. { .mfb
  8209. FMPY f70 = ALPHA, f70
  8210. }
  8211. ;;
  8212. { .mfi
  8213. FMPY f67 = ALPHA, f67
  8214. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8215. adds L = -8, L
  8216. #else
  8217. nop __LINE__
  8218. #endif
  8219. }
  8220. { .mfi
  8221. FMPY f71 = ALPHA, f71
  8222. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8223. adds L = -1, L
  8224. #else
  8225. nop __LINE__
  8226. #endif
  8227. }
  8228. ;;
  8229. { .mfi
  8230. STFD [C1 ] = f64, SIZE
  8231. mov f64 = f0
  8232. #if defined(TRMMKERNEL) && \
  8233. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8234. shladd KK8 = L, BASE_SHIFT, r0
  8235. #else
  8236. nop __LINE__
  8237. #endif
  8238. }
  8239. { .mmi
  8240. STFD [C9 ] = f68, SIZE
  8241. nop __LINE__
  8242. nop __LINE__
  8243. }
  8244. ;;
  8245. { .mfi
  8246. STFD [C1 ] = f65, SIZE
  8247. mov f65 = f0
  8248. #if defined(TRMMKERNEL) && \
  8249. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8250. shladd AOFFSET = KK8, 3, AOFFSET
  8251. #else
  8252. nop __LINE__
  8253. #endif
  8254. }
  8255. { .mmi
  8256. STFD [C9 ] = f69, SIZE
  8257. nop __LINE__
  8258. #if defined(TRMMKERNEL) && \
  8259. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8260. add BOFFSET = KK8, BOFFSET
  8261. #else
  8262. nop __LINE__
  8263. #endif
  8264. }
  8265. ;;
  8266. { .mfi
  8267. STFD [C1 ] = f66, SIZE
  8268. mov f66 = f0
  8269. #if defined(TRMMKERNEL) && defined(LEFT)
  8270. adds KK = 8, KK
  8271. #else
  8272. nop __LINE__
  8273. #endif
  8274. }
  8275. { .mmi
  8276. STFD [C9 ] = f70, SIZE
  8277. nop __LINE__
  8278. nop __LINE__
  8279. }
  8280. ;;
  8281. { .mfi
  8282. STFD [C1 ] = f67, 5 * SIZE
  8283. mov f67 = f0
  8284. #ifdef TRMMKERNEL
  8285. shladd KK8 = KK, BASE_SHIFT, r0
  8286. #else
  8287. nop __LINE__
  8288. #endif
  8289. }
  8290. { .mmb
  8291. STFD [C9 ] = f71, 5 * SIZE
  8292. nop __LINE__
  8293. (p6) br.cond.dptk .L132
  8294. }
  8295. ;;
  8296. #endif
  // .L140 - four-row step of the M remainder, one B value per k step.
  // Entered by fall-through. If bit 2 of M is clear it branches to .L150.
  // Otherwise .L142 accumulates f64..f67 += (four A values) * (one B value),
  // two k steps per pass, and .L148 scales by ALPHA and stores four
  // consecutive elements through C1 (elements 0 and 1) and C9 (elements 2 and 3).
  // The accumulators are cleared again after the stores.
  // M, K, KK, KK8, ALPHA, SIZE, BASE_SHIFT, PREFETCHSIZE and the LDFD, LDFPD,
  // STFD, FMA and FMPY macros are defined outside this chunk.
  8297. .align 32
  8298. .L140:
  8299. { .mib
  8300. #ifndef TRMMKERNEL
  8301. nop __LINE__
  8302. #else
  8303. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8304. sub L = K, KK  // TRMM trip count: K - KK
  8305. #elif defined(LEFT)
  8306. adds L = 4, KK  // TRMM trip count: KK + 4 (row count of this step)
  8307. #else
  8308. adds L = 1, KK  // TRMM trip count: KK + 1
  8309. #endif
  8310. #endif
  8311. tbit.z p6, p7 = M, 2  // p6 = bit 2 of M clear, p7 = bit 2 of M set
  8312. (p6) br.cond.dptk .L150  // no four-row group: go to the two-row step
  8313. }
  8314. ;;
  8315. #if !defined(TRMMKERNEL) || \
  8316. defined(TRMMKERNEL) && \
  8317. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8318. { .mmi
  8319. LDFD f48 = [B]  // first B value, read from the start of B
  8320. adds BOFFSET = 1 * SIZE, B  // BOFFSET points at the following B value
  8321. #ifndef TRMMKERNEL
  8322. adds L = 1, K  // L = K + 1, halved below
  8323. #else
  8324. adds L = 1, L  // L = trip count + 1, halved below
  8325. #endif
  8326. }
  8327. ;;
  8328. #else
  8329. { .mmi
  8330. add BOFFSET = KK8, B  // start KK8 bytes into B
  8331. shladd AOFFSET = KK8, 2, AOFFSET  // AOFFSET += KK8 * 4 (four A values per k step)
  8332. nop __LINE__
  8333. }
  8334. ;;
  8335. { .mmi
  8336. LDFD f48 = [BOFFSET], 1 * SIZE  // first B value, BOFFSET steps to the next one
  8337. nop __LINE__
  8338. #ifndef TRMMKERNEL
  8339. adds L = 1, K
  8340. #else
  8341. adds L = 1, L  // L = trip count + 1, halved below
  8342. #endif
  8343. }
  8344. ;;
  8345. #endif
  8346. { .mii
  8347. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE  // A values 1 and 2 of the first k step
  8348. tbit.z p12, p0 = L, 0  // p12 = 1 when L is even, i.e. the k count is odd
  8349. shr L = L, 1  // L = number of two-step passes
  8350. }
  8351. ;;
  8352. { .mmi
  8353. LDFPD f34, f35 = [AOFFSET], 2 * SIZE  // A values 3 and 4 of the first k step
  8354. adds L = -1, L  // loop counter holds passes minus one
  8355. nop __LINE__
  8356. }
  8357. ;;
  8358. { .mmi
  8359. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET  // prefetch pointer runs ahead of AOFFSET
  8360. cmp.eq p3, p0 = r0, r0  // p3 = true: second half of each pass enabled by default
  8361. mov ar.lc = L
  8362. }
  8363. ;;
  // .L142 - k loop. First half of a pass uses f32..f35 and f48, second half
  // (predicated on p3) uses f40..f43 and f56. L is decremented alongside ar.lc
  // so the predicates below can test for the final pass.
  8364. .align 32
  8365. .L142:
  8366. { .mfi
  8367. lfetch.nt1 [PREA], 8 * SIZE
  8368. FMA f64 = f32, f48, f64 // A1 * B1
  8369. cmp.ne p4, p5 = 0, L  // p4 = more passes follow, p5 = this is the final pass
  8370. }
  8371. { .mfi
  8372. nop __LINE__
  8373. FMA f65 = f33, f48, f65 // A2 * B1
  8374. (p12) cmp.ne p3, p0 = 0, L  // odd k count: disable the second half on the final pass
  8375. }
  8376. ;;
  8377. { .mfi
  8378. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8379. FMA f66 = f34, f48, f66 // A3 * B1
  8380. (p5) adds C9 = 2 * SIZE, C1  // final pass: C9 = C1 + 2 elements (reads C1 before the post-increment below)
  8381. }
  8382. { .mmf
  8383. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8384. (p5) LDFD f68 = [C1 ], SIZE  // final pass: old C element 0
  8385. #else
  8386. nop __LINE__
  8387. #endif
  8388. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  8389. FMA f67 = f35, f48, f67 // A4 * B1
  8390. }
  8391. ;;
  8392. { .mfi
  8393. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8394. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8395. (p5) adds C10 = 2 * SIZE, C2  // C10 is not read anywhere else in this chunk
  8396. }
  8397. { .mfb
  8398. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8399. (p5) LDFD f70 = [C9 ], SIZE  // final pass: old C element 2
  8400. #else
  8401. nop __LINE__
  8402. #endif
  8403. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8404. nop __LINE__
  8405. }
  8406. ;;
  8407. { .mfb
  8408. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE  // preload A for the next pass
  8409. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  8410. nop __LINE__
  8411. }
  8412. { .mmf
  8413. (p4) LDFD f48 = [BOFFSET], 1 * SIZE  // preload B for the next pass
  8414. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8415. (p5) LDFD f69 = [C1 ], -1 * SIZE  // final pass: old C element 1, C1 steps back to element 0
  8416. #else
  8417. nop __LINE__
  8418. #endif
  8419. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  8420. }
  8421. ;;
  8422. { .mfi
  8423. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8424. nop __LINE__
  8425. adds L = -1, L  // software copy of the remaining pass count
  8426. }
  8427. { .mfb
  8428. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8429. (p5) LDFD f71 = [C9 ], -1 * SIZE  // final pass: old C element 3, C9 steps back to element 2
  8430. #else
  8431. nop __LINE__
  8432. #endif
  8433. nop.f 0
  8434. br.cloop.sptk.few .L142
  8435. }
  8436. ;;
  // .L148 - write-back. With old C values loaded: C = ALPHA * acc + old C.
  // Otherwise (TRMMKERNEL or BETAZERO): C = ALPHA * acc.
  8437. .L148:
  8438. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8439. FMA f64 = ALPHA, f64, f68  // element 0
  8440. FMA f66 = ALPHA, f66, f70  // element 2
  8441. FMA f65 = ALPHA, f65, f69  // element 1
  8442. FMA f67 = ALPHA, f67, f71  // element 3
  8443. ;;
  8444. { .mfi
  8445. STFD [C1 ] = f64, SIZE
  8446. mov f64 = f0  // clear the accumulator (f0 reads as 0.0)
  8447. adds L = 1, K  // L = K + 1, set again at .L150 before it is used
  8448. }
  8449. { .mfb
  8450. STFD [C9 ] = f66, SIZE
  8451. mov f66 = f0
  8452. nop __LINE__
  8453. }
  8454. ;;
  8455. { .mfi
  8456. STFD [C1 ] = f65, 3 * SIZE  // C1 ends four elements past where it started
  8457. mov f65 = f0
  8458. shr L = L, 1
  8459. }
  8460. { .mfb
  8461. STFD [C9 ] = f67, 3 * SIZE
  8462. mov f67 = f0
  8463. nop __LINE__
  8464. }
  8465. ;;
  8466. #else
  // NOTE(review): the TRMM-only arithmetic below looks like it moves AOFFSET and
  // BOFFSET past the k range this variant did not traverse - confirm against the
  // definitions of KK and KK8 outside this chunk.
  8467. { .mfi
  8468. nop __LINE__
  8469. FMPY f64 = ALPHA, f64
  8470. #if defined(TRMMKERNEL) && \
  8471. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8472. sub L = K, KK  // L = K - KK
  8473. #else
  8474. nop __LINE__
  8475. #endif
  8476. }
  8477. { .mfi
  8478. nop __LINE__
  8479. FMPY f66 = ALPHA, f66
  8480. nop __LINE__
  8481. }
  8482. ;;
  8483. { .mfi
  8484. nop __LINE__
  8485. FMPY f65 = ALPHA, f65
  8486. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8487. adds L = -4, L  // minus the four rows of this step
  8488. #else
  8489. nop __LINE__
  8490. #endif
  8491. }
  8492. { .mfi
  8493. nop __LINE__
  8494. FMPY f67 = ALPHA, f67
  8495. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8496. adds L = -1, L
  8497. #else
  8498. nop __LINE__
  8499. #endif
  8500. }
  8501. ;;
  8502. { .mmi
  8503. #if defined(TRMMKERNEL) && \
  8504. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8505. shladd KK8 = L, BASE_SHIFT, r0  // KK8 = L shifted left by BASE_SHIFT
  8506. #else
  8507. nop __LINE__
  8508. #endif
  8509. ;;
  8510. #if defined(TRMMKERNEL) && \
  8511. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8512. shladd AOFFSET = KK8, 2, AOFFSET  // AOFFSET += KK8 * 4
  8513. #else
  8514. nop __LINE__
  8515. #endif
  8516. #if defined(TRMMKERNEL) && \
  8517. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8518. add BOFFSET = KK8, BOFFSET  // BOFFSET += KK8
  8519. #else
  8520. nop __LINE__
  8521. #endif
  8522. }
  8523. ;;
  8524. { .mfi
  8525. STFD [C1 ] = f64, SIZE
  8526. mov f64 = f0  // clear the accumulator (f0 reads as 0.0)
  8527. #if defined(TRMMKERNEL) && defined(LEFT)
  8528. adds KK = 4, KK  // KK advances by the four rows just handled
  8529. #else
  8530. nop __LINE__
  8531. #endif
  8532. }
  8533. { .mfb
  8534. STFD [C9 ] = f66, SIZE
  8535. mov f66 = f0
  8536. nop __LINE__
  8537. }
  8538. ;;
  8539. { .mfi
  8540. STFD [C1 ] = f65, 3 * SIZE  // C1 ends four elements past where it started
  8541. mov f65 = f0
  8542. #ifdef TRMMKERNEL
  8543. shladd KK8 = KK, BASE_SHIFT, r0  // refresh KK8 from the current KK
  8544. #else
  8545. nop __LINE__
  8546. #endif
  8547. }
  8548. { .mfb
  8549. STFD [C9 ] = f67, 3 * SIZE
  8550. mov f67 = f0
  8551. nop __LINE__
  8552. }
  8553. ;;
  8554. #endif
  // .L150 - two-row step of the M remainder, one B value per k step.
  // Entered by fall-through or from .L140. If bit 1 of M is clear it branches
  // to .L160. Otherwise .L152 accumulates f64 and f65 over the k loop (two k
  // steps per pass) and .L158 scales by ALPHA and stores two consecutive
  // elements through C1, clearing the accumulators afterwards.
  // M, K, KK, KK8, ALPHA, SIZE, BASE_SHIFT and the LDFD, LDFPD, STFD, FMA and
  // FMPY macros are defined outside this chunk.
  8555. .align 32
  8556. .L150:
  8557. { .mib
  8558. #ifndef TRMMKERNEL
  8559. nop __LINE__
  8560. #else
  8561. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8562. sub L = K, KK  // TRMM trip count: K - KK
  8563. #elif defined(LEFT)
  8564. adds L = 2, KK  // TRMM trip count: KK + 2 (row count of this step)
  8565. #else
  8566. adds L = 1, KK  // TRMM trip count: KK + 1
  8567. #endif
  8568. #endif
  8569. tbit.z p6, p7 = M, 1  // p6 = bit 1 of M clear, p7 = bit 1 of M set
  8570. (p6) br.cond.dptk .L160  // no two-row group: go to the one-row step
  8571. }
  8572. ;;
  8573. #if !defined(TRMMKERNEL) || \
  8574. defined(TRMMKERNEL) && \
  8575. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8576. { .mmi
  8577. LDFD f48 = [B]  // first B value, read from the start of B
  8578. adds BOFFSET = 1 * SIZE, B  // BOFFSET points at the following B value
  8579. #ifndef TRMMKERNEL
  8580. adds L = 1, K  // L = K + 1, halved below
  8581. #else
  8582. adds L = 1, L  // L = trip count + 1, halved below
  8583. #endif
  8584. }
  8585. ;;
  8586. #else
  8587. { .mmi
  8588. add BOFFSET = KK8, B  // start KK8 bytes into B
  8589. shladd AOFFSET = KK8, 1, AOFFSET  // AOFFSET += KK8 * 2 (two A values per k step)
  8590. nop __LINE__
  8591. }
  8592. ;;
  8593. { .mmi
  8594. LDFD f48 = [BOFFSET], 1 * SIZE  // first B value, BOFFSET steps to the next one
  8595. nop __LINE__
  8596. #ifndef TRMMKERNEL
  8597. adds L = 1, K
  8598. #else
  8599. adds L = 1, L  // L = trip count + 1, halved below
  8600. #endif
  8601. }
  8602. ;;
  8603. #endif
  8604. { .mii
  8605. cmp.eq p3, p0 = r0, r0  // p3 = true: second half of each pass enabled by default
  8606. tbit.z p12, p0 = L, 0  // p12 = 1 when L is even, i.e. the k count is odd
  8607. shr L = L, 1  // L = number of two-step passes
  8608. }
  8609. ;;
  8610. { .mii
  8611. LDFPD f32, f33 = [AOFFSET], 2 * SIZE  // both A values of the first k step
  8612. adds L = -1, L  // loop counter holds passes minus one
  8613. ;;
  8614. mov ar.lc = L
  8615. }
  8616. ;;
  // .L152 - k loop. First half of a pass uses f32, f33 and f48, the second half
  // (predicated on p3) uses f40, f41 and f56.
  8617. .align 32
  8618. .L152:
  8619. { .mfi
  8620. cmp.ne p4, p5 = 0, L  // p4 = more passes follow, p5 = this is the final pass
  8621. FMA f64 = f32, f48, f64 // A1 * B1
  8622. (p12) cmp.ne p3, p0 = 0, L  // odd k count: disable the second half on the final pass
  8623. }
  8624. ;;
  8625. { .mmf
  8626. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  8627. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8628. FMA f65 = f33, f48, f65 // A2 * B1
  8629. }
  8630. ;;
  8631. { .mfi
  8632. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE  // preload A for the next pass
  8633. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8634. adds L = -1, L  // software copy of the remaining pass count
  8635. }
  8636. ;;
  8637. { .mfb
  8638. (p4) LDFD f48 = [BOFFSET], 1 * SIZE  // preload B for the next pass
  8639. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8640. br.cloop.sptk.few .L152
  8641. }
  8642. ;;
  // .L158 - write-back. With old C values: C = ALPHA * acc + old C.
  // Otherwise (TRMMKERNEL or BETAZERO): C = ALPHA * acc.
  8643. .L158:
  8644. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8645. LDFD f68 = [C1 ], SIZE  // old C element 0
  8646. ;;
  8647. LDFD f69 = [C1 ], -1 * SIZE  // old C element 1, C1 steps back to element 0
  8648. ;;
  8649. FMA f64 = ALPHA, f64, f68
  8650. FMA f65 = ALPHA, f65, f69
  8651. ;;
  8652. STFD [C1 ] = f64, SIZE
  8653. mov f64 = f0  // clear the accumulator (f0 reads as 0.0)
  8654. ;;
  8655. { .mfi
  8656. STFD [C1 ] = f65, SIZE  // C1 ends two elements past where it started
  8657. mov f65 = f0
  8658. }
  8659. ;;
  8660. #else
  // NOTE(review): the TRMM-only arithmetic below looks like it moves AOFFSET and
  // BOFFSET past the k range this variant did not traverse - confirm against the
  // definitions of KK and KK8 outside this chunk.
  8661. { .mfi
  8662. nop __LINE__
  8663. FMPY f64 = ALPHA, f64
  8664. #if defined(TRMMKERNEL) && \
  8665. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8666. sub L = K, KK  // L = K - KK
  8667. #else
  8668. nop __LINE__
  8669. #endif
  8670. }
  8671. { .mfi
  8672. nop __LINE__
  8673. FMPY f65 = ALPHA, f65
  8674. nop __LINE__
  8675. }
  8676. ;;
  8677. { .mii
  8678. nop __LINE__
  8679. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8680. adds L = -2, L  // minus the two rows of this step
  8681. #else
  8682. nop __LINE__
  8683. #endif
  8684. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8685. adds L = -1, L
  8686. #else
  8687. nop __LINE__
  8688. #endif
  8689. }
  8690. ;;
  8691. { .mmi
  8692. #if defined(TRMMKERNEL) && \
  8693. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8694. shladd KK8 = L, BASE_SHIFT, r0  // KK8 = L shifted left by BASE_SHIFT
  8695. #else
  8696. nop __LINE__
  8697. #endif
  8698. ;;
  8699. #if defined(TRMMKERNEL) && \
  8700. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8701. shladd AOFFSET = KK8, 1, AOFFSET  // AOFFSET += KK8 * 2
  8702. #else
  8703. nop __LINE__
  8704. #endif
  8705. #if defined(TRMMKERNEL) && \
  8706. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8707. add BOFFSET = KK8, BOFFSET  // BOFFSET += KK8
  8708. #else
  8709. nop __LINE__
  8710. #endif
  8711. }
  8712. ;;
  8713. { .mfi
  8714. STFD [C1 ] = f64, SIZE
  8715. mov f64 = f0  // clear the accumulator (f0 reads as 0.0)
  8716. #if defined(TRMMKERNEL) && defined(LEFT)
  8717. adds KK = 2, KK  // KK advances by the two rows just handled
  8718. #else
  8719. nop __LINE__
  8720. #endif
  8721. }
  8722. ;;
  8723. { .mfi
  8724. STFD [C1 ] = f65, SIZE  // C1 ends two elements past where it started
  8725. mov f65 = f0
  8726. #ifdef TRMMKERNEL
  8727. shladd KK8 = KK, BASE_SHIFT, r0  // refresh KK8 from the current KK
  8728. #else
  8729. nop __LINE__
  8730. #endif
  8731. }
  8732. ;;
  8733. #endif
  // .L160 - one-row step of the M remainder, one B value per k step.
  // Entered by fall-through or from .L150. If bit 0 of M is clear it branches
  // to .L169. Otherwise .L162 accumulates f64 over the k loop (two k steps per
  // pass), then the result is scaled by ALPHA, combined with the old C value
  // unless TRMMKERNEL or BETAZERO is defined, and stored at C1. C1 is not
  // advanced and f64 is not cleared here.
  // M, K, KK, KK8, ALPHA, SIZE and the LDFD, STFD, FMA and FMPY macros are
  // defined outside this chunk.
  8734. .align 32
  8735. .L160:
  8736. { .mib
  8737. #ifndef TRMMKERNEL
  8738. nop __LINE__
  8739. #else
  8740. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8741. sub L = K, KK  // TRMM trip count: K - KK
  8742. #elif defined(LEFT)
  8743. adds L = 1, KK  // TRMM trip count: KK + 1
  8744. #else
  8745. adds L = 1, KK  // same value as the LEFT branch above
  8746. #endif
  8747. #endif
  8748. tbit.z p6, p7 = M, 0  // p6 = bit 0 of M clear, p7 = bit 0 of M set
  8749. (p6) br.cond.dptk .L169  // no single row left: skip to the bookkeeping
  8750. }
  8751. ;;
  8752. #if !defined(TRMMKERNEL) || \
  8753. defined(TRMMKERNEL) && \
  8754. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8755. { .mmi
  8756. LDFD f48 = [B]  // first B value, read from the start of B
  8757. adds BOFFSET = 1 * SIZE, B  // BOFFSET points at the following B value
  8758. #ifndef TRMMKERNEL
  8759. adds L = 1, K  // L = K + 1, halved below
  8760. #else
  8761. adds L = 1, L  // L = trip count + 1, halved below
  8762. #endif
  8763. }
  8764. ;;
  8765. #else
  8766. { .mmi
  8767. add BOFFSET = KK8, B  // start KK8 bytes into B
  8768. add AOFFSET = KK8, AOFFSET  // AOFFSET += KK8 (one A value per k step)
  8769. nop __LINE__
  8770. }
  8771. ;;
  8772. { .mmi
  8773. LDFD f48 = [BOFFSET], 1 * SIZE  // first B value, BOFFSET steps to the next one
  8774. nop __LINE__
  8775. #ifndef TRMMKERNEL
  8776. adds L = 1, K
  8777. #else
  8778. adds L = 1, L  // L = trip count + 1, halved below
  8779. #endif
  8780. }
  8781. ;;
  8782. #endif
  8783. ;;
  8784. { .mii
  8785. LDFD f32 = [AOFFSET], 1 * SIZE  // A value of the first k step
  8786. tbit.z p12, p0 = L, 0  // p12 = 1 when L is even, i.e. the k count is odd
  8787. shr L = L, 1  // L = number of two-step passes
  8788. }
  8789. ;;
  8790. { .mii
  8791. adds L = -1, L  // loop counter holds passes minus one
  8792. cmp.eq p3, p0 = r0, r0  // p3 = true: second half of each pass enabled by default
  8793. ;;
  8794. mov ar.lc = L
  8795. }
  8796. ;;
  // .L162 - k loop. First half of a pass uses f32 and f48, the second half
  // (predicated on p3) uses f40 and f56.
  8797. .align 32
  8798. .L162:
  8799. { .mmf
  8800. cmp.ne p4, p5 = 0, L  // p4 = more passes follow, p5 = this is the final pass
  8801. (p12) cmp.ne p3, p0 = 0, L  // odd k count: disable the second half on the final pass
  8802. FMA f64 = f32, f48, f64 // A1 * B1
  8803. }
  8804. ;;
  8805. { .mmi
  8806. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  8807. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  8808. nop __LINE__
  8809. }
  8810. ;;
  8811. { .mmi
  8812. (p4) LDFD f32 = [AOFFSET], 1 * SIZE  // preload A for the next pass
  8813. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8814. (p5) LDFD f68 = [C1]  // final pass: old C value, C1 left unchanged
  8815. #else
  8816. nop __LINE__
  8817. #endif
  8818. adds L = -1, L  // software copy of the remaining pass count
  8819. }
  8820. { .mfb
  8821. (p4) LDFD f48 = [BOFFSET], 1 * SIZE  // preload B for the next pass
  8822. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8823. br.cloop.sptk.few .L162
  8824. }
  8825. ;;
  8826. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8827. FMA f64 = ALPHA, f64, f68  // C = ALPHA * acc + old C
  8828. #else
  8829. FMPY f64 = ALPHA, f64  // C = ALPHA * acc
  8830. #endif
  8831. ;;
  8832. STFD [C1 ] = f64  // store without advancing C1
  8833. ;;
  // .L169 - bookkeeping after the row-remainder steps: copies BOFFSET into B,
  // resets AOFFSET to A, and for TRMMKERNEL builds without LEFT adds one to KK.
  // Falls through into .L999.
  8834. .align 32
  8835. .L169:
  8836. { .mmi
  8837. mov B = BOFFSET  // B takes over the position BOFFSET reached
  8838. mov AOFFSET = A  // AOFFSET restarts from A
  8839. #if defined(TRMMKERNEL) && !defined(LEFT)
  8840. adds KK = 1, KK
  8841. #else
  8842. nop __LINE__
  8843. #endif
  8844. }
  8845. ;;
  // .L999 - exit path. Sets r8 to zero, refills f16..f31 from sixteen 16-byte
  // slots starting at SP (even registers through SP, odd registers through r9,
  // each pointer stepping 32 bytes), restores ar.lc, pr and ar.pfs from ARLC, PR
  // and ARPFS, and returns through b0. SP ends 256 bytes above its value here.
  // NOTE(review): presumably mirrors spills done in the prologue, which is
  // outside this chunk - confirm the slot layout there.
  8846. .align 16
  8847. .L999:
  8848. mov r8 = r0  // r8 = 0
  8849. adds r9 = 1 * 16, SP  // r9 = SP + 16, second refill pointer
  8850. ;;
  8851. ldf.fill f16 = [SP], 32
  8852. ldf.fill f17 = [r9], 32
  8853. ;;
  8854. ldf.fill f18 = [SP], 32
  8855. ldf.fill f19 = [r9], 32
  8856. ;;
  8857. ldf.fill f20 = [SP], 32
  8858. ldf.fill f21 = [r9], 32
  8859. ;;
  8860. ldf.fill f22 = [SP], 32
  8861. ldf.fill f23 = [r9], 32
  8862. mov ar.lc = ARLC  // restore the loop count register
  8863. ;;
  8864. ldf.fill f24 = [SP], 32
  8865. ldf.fill f25 = [r9], 32
  8866. mov pr = PR, -1  // restore predicate registers, mask -1 selects all of them
  8867. ;;
  8868. ldf.fill f26 = [SP], 32
  8869. ldf.fill f27 = [r9], 32
  8870. mov ar.pfs = ARPFS  // restore ar.pfs before the return
  8871. ;;
  8872. ldf.fill f28 = [SP], 32
  8873. ldf.fill f29 = [r9], 32
  8874. ;;
  8875. ldf.fill f30 = [SP], 32  // eighth post-increment of SP: 8 * 32 = 256 bytes in total
  8876. ldf.fill f31 = [r9]  // last refill, r9 needs no further update
  8877. br.ret.sptk.many b0
  8878. EPILOGUE  // macro defined outside this chunk