You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qgemm_kernel.S 147 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Tuning constants and symbolic register names used by the kernel below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define PREFETCHSIZE (8 * 16) /* prefetch distance in elements: PREA/PREB start (PREFETCHSIZE +/- 8) * SIZE past AOFFSET/BOFFSET */
  41. #define CPREFETCHSIZE 7 /* element offset from C1 at which the C prefetch pointer PREC starts */
  42. #define CPREFETCH lfetch.excl.nt2 /* prefetch instruction issued on [PREC], stepping by LDC */
  43. #define M r32 /* M dimension; the I counter is M >> 3 */
  44. #define N r33 /* N dimension; the J counter is N >> 3 */
  45. #define K r34 /* K dimension; source of the inner-loop count L */
  46. #define A r38 /* base of A; copied into AOFFSET in the prologue */
  47. #define B r39 /* base of B; BOFFSET is re-derived from it at .L011 */
  48. #define C r36 /* loaded from the stack (via r14) in the prologue; advanced by 8 * LDC per .L010 pass */
  49. #define LDC r37 /* loaded from the stack (via r14), then shifted left by BASE_SHIFT -- presumably converts elements to bytes; confirm in common.h */
  50. #define I r15 /* counter initialised to M >> 3 */
  51. #define J r16 /* counter initialised to N >> 3 */
  52. #define AOFFSET r17 /* running read pointer into A, post-incremented by LDFD */
  53. #define BOFFSET r18 /* running read pointer into B, post-incremented by LDFD */
  54. #define TEMP r19 /* not referenced in the visible part of this file */
  55. #define L r20 /* inner (K) loop trip count; written to ar.lc before .L012 */
  56. #define C1 r21 /* c + 0 * ldc */
  57. #define C2 r22 /* c + 1 * ldc */
  58. #define C3 r23 /* c + 2 * ldc */
  59. #define C4 r24 /* c + 3 * ldc */
  60. #define C5 r25 /* c + 4 * ldc */
  61. #define C6 r26 /* c + 5 * ldc */
  62. #define C7 r27 /* c + 6 * ldc */
  63. #define C8 r28 /* c + 7 * ldc */
  64. #define C9 loc0 /* C1 + 4 * SIZE (set inside .L012) */
  65. #define C10 loc1 /* C2 + 4 * SIZE */
  66. #define C11 loc2 /* C3 + 4 * SIZE */
  67. #define C12 loc3 /* C4 + 4 * SIZE */
  68. #define C13 loc4 /* C5 + 4 * SIZE */
  69. #define C14 loc5 /* C6 + 4 * SIZE */
  70. #define C15 loc6 /* C7 + 4 * SIZE */
  71. #define C16 loc7 /* C8 + 4 * SIZE */
  72. #define PREA r8 /* prefetch pointer ahead of AOFFSET; r8 is also used directly as an f-register spill pointer in the prologue */
  73. #define PREB r9 /* prefetch pointer ahead of BOFFSET; r9 is also used directly as an f-register spill pointer in the prologue */
  74. #define PREC r10 /* prefetch pointer into C, used with CPREFETCH */
  75. #define SP r12 /* stack pointer; lowered by 16 * 16 bytes in the prologue for the f16-f31 spill area */
  76. #define ARLC r29 /* holds the caller's ar.lc, saved on entry */
  77. #define PR r30 /* holds the caller's predicate registers, saved on entry */
  78. #define ARPFS r31 /* receives ar.pfs from the alloc instruction */
  79. #define ALPHA f8 /* scalar multiplied into the accumulators at .L013 */
  80. #define AORIG loc8 /* not referenced in the visible part of this file; loc8-loc11 lie beyond the 8 locals allocated when TRMMKERNEL is undefined */
  81. #define KK loc9 /* TRMMKERNEL only: set to OFFSET (LEFT) or 0 - OFFSET (!LEFT) */
  82. #define KK8 loc10 /* TRMMKERNEL only: KK shifted left by BASE_SHIFT */
  83. #define OFFSET loc11 /* TRMMKERNEL only: loaded from the stack (via r14) after LDC */
  84. PROLOGUE
  85. .prologue
  86. PROFCODE
  87. { .mmi
  88. .save ar.pfs, ARPFS
  89. #ifdef TRMMKERNEL
  90. alloc ARPFS = ar.pfs, 8, 16, 0, 0
  91. #else
  92. alloc ARPFS = ar.pfs, 8, 8, 0, 0
  93. #endif
  94. adds r14 = 16, SP
  95. mov ARLC = ar.lc
  96. }
  97. { .mmi
  98. adds r8 = -16 * 16, SP
  99. adds r9 = -15 * 16, SP
  100. adds SP = -16 * 16, SP
  101. }
  102. ;;
  103. stf.spill [r8] = f16, 32
  104. stf.spill [r9] = f17, 32
  105. mov PR = pr
  106. ;;
  107. stf.spill [r8] = f18, 32
  108. stf.spill [r9] = f19, 32
  109. ;;
  110. stf.spill [r8] = f20, 32
  111. stf.spill [r9] = f21, 32
  112. shr J = N, 3
  113. ;;
  114. stf.spill [r8] = f22, 32
  115. stf.spill [r9] = f23, 32
  116. mov AOFFSET = A
  117. ;;
  118. stf.spill [r8] = f24, 32
  119. stf.spill [r9] = f25, 32
  120. cmp.ge p6, p0 = 0, J
  121. ;;
  122. stf.spill [r8] = f26, 32
  123. stf.spill [r9] = f27, 32
  124. ;;
  125. stf.spill [r8] = f28, 32
  126. stf.spill [r9] = f29, 32
  127. ;;
  128. stf.spill [r8] = f30
  129. stf.spill [r9] = f31
  130. ld8 C = [r14], 8
  131. ;;
  132. ld8 LDC = [r14], 8
  133. ;;
  134. shladd LDC = LDC, BASE_SHIFT, r0
  135. ;;
  136. #ifndef TRMMKERNEL
  137. (p6) br.cond.dpnt .L050
  138. .body
  139. ;;
  140. #else
  141. .body
  142. ;;
  143. ld8 OFFSET = [r14], 8
  144. ;;
  145. #if defined(TRMMKERNEL) && !defined(LEFT)
  146. ;;
  147. sub KK = r0, OFFSET
  148. #endif
  149. (p6) br.cond.dpnt .L050
  150. ;;
  151. #endif
  152. .align 32
  153. .L010:
  154. { .mfi
  155. adds J = -1, J
  156. mov f64 = f0
  157. shr I = M, 3
  158. }
  159. { .mfi
  160. mov C1 = C // coffset1 = c + 0 * ldc
  161. mov f72 = f0
  162. }
  163. ;;
  164. { .mmf
  165. cmp.eq p6, p7 = 0, I
  166. #if defined(TRMMKERNEL) && defined(LEFT)
  167. mov KK = OFFSET
  168. #else
  169. nop __LINE__
  170. #endif
  171. mov f80 = f0
  172. }
  173. { .mmf
  174. add C2 = LDC, C // coffset2 = c + 1 * ldc
  175. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  176. mov f88 = f0
  177. }
  178. ;;
  179. { .mmf
  180. shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc
  181. shladd C = LDC, 3, C // coffset += 8 * ldc
  182. mov f96 = f0
  183. }
  184. { .mmf
  185. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc
  186. shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc
  187. mov f104 = f0
  188. }
  189. ;;
  190. { .mfi
  191. shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc
  192. mov f112 = f0
  193. #ifdef TRMMKERNEL
  194. shladd KK8 = KK, BASE_SHIFT, r0
  195. #else
  196. nop __LINE__
  197. #endif
  198. }{ .mfb
  199. sub C8 = C, LDC // coffset8 = c + 7 * ldc
  200. mov f120 = f0
  201. (p6) br.cond.dpnt .L020
  202. }
  203. ;;
  204. .align 16
  205. .L011:
  206. #if !defined(TRMMKERNEL) || \
  207. defined(TRMMKERNEL) && \
  208. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  209. mov BOFFSET = B
  210. ;;
  211. { .mfb
  212. LDFD f48 = [BOFFSET], SIZE
  213. mov f65 = f0
  214. nop __LINE__
  215. }
  216. ;;
  217. { .mfb
  218. LDFD f49 = [BOFFSET], SIZE
  219. mov f73 = f0
  220. nop __LINE__
  221. }
  222. ;;
  223. #else
  224. { .mfi
  225. shladd BOFFSET = KK8, 3, B
  226. mov f65 = f0
  227. shladd AOFFSET = KK8, 3, AOFFSET
  228. }
  229. ;;
  230. LDFD f48 = [BOFFSET], SIZE
  231. ;;
  232. { .mfi
  233. LDFD f49 = [BOFFSET], SIZE
  234. mov f73 = f0
  235. nop __LINE__
  236. }
  237. ;;
  238. #endif
  239. LDFD f32 = [AOFFSET], SIZE
  240. LDFD f50 = [BOFFSET], SIZE
  241. ;;
  242. { .mfb
  243. LDFD f33 = [AOFFSET], SIZE
  244. mov f81 = f0
  245. nop __LINE__
  246. }
  247. { .mfb
  248. LDFD f51 = [BOFFSET], SIZE
  249. mov f89 = f0
  250. nop __LINE__
  251. }
  252. ;;
  253. LDFD f52 = [BOFFSET], SIZE
  254. ;;
  255. { .mmf
  256. LDFD f53 = [BOFFSET], SIZE
  257. setf.d f97 = r0
  258. mov f105 = f0
  259. }
  260. { .mfb
  261. setf.d f113 = r0
  262. mov f121 = f0
  263. nop __LINE__
  264. }
  265. ;;
  266. LDFD f54 = [BOFFSET], SIZE
  267. ;;
  268. { .mmf
  269. LDFD f55 = [BOFFSET], SIZE
  270. setf.d f66 = r0
  271. mov f74 = f0
  272. }
  273. { .mfb
  274. setf.d f82 = r0
  275. mov f90 = f0
  276. nop __LINE__
  277. }
  278. ;;
  279. LDFD f34 = [AOFFSET], SIZE
  280. ;;
  281. { .mmf
  282. LDFD f35 = [AOFFSET], SIZE
  283. setf.d f98 = r0
  284. mov f106 = f0
  285. }
  286. { .mfb
  287. setf.d f114 = r0
  288. mov f122 = f0
  289. nop __LINE__
  290. }
  291. ;;
  292. LDFD f36 = [AOFFSET], SIZE
  293. ;;
  294. { .mmf
  295. LDFD f37 = [AOFFSET], SIZE
  296. setf.d f67 = r0
  297. mov f75 = f0
  298. }
  299. { .mfi
  300. setf.d f83 = r0
  301. mov f91 = f0
  302. #ifndef TRMMKERNEL
  303. nop __LINE__
  304. #else
  305. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  306. sub L = K, KK
  307. #elif defined(LEFT)
  308. adds L = 8, KK
  309. #else
  310. adds L = 8, KK
  311. #endif
  312. #endif
  313. }
  314. ;;
  315. LDFD f38 = [AOFFSET], SIZE
  316. ;;
  317. { .mmf
  318. LDFD f39 = [AOFFSET], SIZE
  319. setf.d f99 = r0
  320. mov f107 = f0
  321. }
  322. { .mfi
  323. setf.d f115 = r0
  324. mov f123 = f0
  325. adds PREC = CPREFETCHSIZE * SIZE, C1
  326. }
  327. ;;
  328. { .mmf
  329. CPREFETCH [PREC], LDC
  330. setf.d f68 = r0
  331. mov f76 = f0
  332. }
  333. { .mfi
  334. setf.d f84 = r0
  335. mov f92 = f0
  336. #ifndef TRMMKERNEL
  337. adds L = 1, K
  338. #else
  339. adds L = 1, L
  340. #endif
  341. }
  342. ;;
  343. { .mmf
  344. CPREFETCH [PREC], LDC
  345. setf.d f100 = r0
  346. mov f108 = f0
  347. }
  348. { .mfi
  349. setf.d f116 = r0
  350. mov f124 = f0
  351. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  352. }
  353. ;;
  354. { .mmf
  355. CPREFETCH [PREC], LDC
  356. setf.d f69 = r0
  357. mov f77 = f0
  358. }
  359. { .mfi
  360. setf.d f85 = r0
  361. mov f93 = f0
  362. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
  363. }
  364. ;;
  365. { .mmf
  366. CPREFETCH [PREC], LDC
  367. setf.d f101 = r0
  368. mov f109 = f0
  369. }
  370. { .mfi
  371. setf.d f117 = r0
  372. mov f125 = f0
  373. tbit.z p12, p0 = L, 0
  374. }
  375. ;;
  376. { .mmf
  377. CPREFETCH [PREC], LDC
  378. setf.d f70 = r0
  379. mov f78 = f0
  380. }
  381. { .mfi
  382. setf.d f86 = r0
  383. mov f94 = f0
  384. shr L = L, 1
  385. }
  386. ;;
  387. { .mmf
  388. CPREFETCH [PREC], LDC
  389. setf.d f102 = r0
  390. mov f110 = f0
  391. }
  392. { .mfi
  393. setf.d f118 = r0
  394. mov f126 = f0
  395. adds L = -1, L
  396. }
  397. ;;
  398. { .mmf
  399. CPREFETCH [PREC], LDC
  400. setf.d f71 = r0
  401. mov f79 = f0
  402. }
  403. { .mfi
  404. setf.d f87 = r0
  405. mov f95 = f0
  406. mov ar.lc = L
  407. }
  408. ;;
  409. { .mmf
  410. CPREFETCH [PREC]
  411. setf.d f103 = r0
  412. mov f111 = f0
  413. }
  414. { .mfi
  415. setf.d f119 = r0
  416. mov f127 = f0
  417. cmp.eq p3, p0 = r0, r0
  418. }
  419. ;;
  420. .align 16
  421. .L012:
  422. /* 1 */
  423. { .mfi
  424. lfetch.fault.nt1 [PREA], 8 * SIZE
  425. FMA f64 = f32, f48, f64 // A1 * B1
  426. nop __LINE__
  427. }
  428. { .mfb
  429. (p12) cmp.ne p3, p0 = 0, L
  430. FMA f72 = f32, f49, f72 // A1 * B2
  431. nop __LINE__
  432. }
  433. ;;
  434. /* 2 */
  435. { .mfi
  436. lfetch.nt1 [PREB], 8 * SIZE
  437. FMA f80 = f32, f50, f80 // A1 * B3
  438. nop __LINE__
  439. }
  440. { .mfb
  441. cmp.ne p4, p5 = 0, L
  442. FMA f88 = f32, f51, f88 // A1 * B4
  443. nop __LINE__
  444. }
  445. ;;
  446. /* 3 */
  447. { .mfb
  448. (p3) LDFD f40 = [AOFFSET], SIZE
  449. FMA f96 = f32, f52, f96 // A1 * B5
  450. nop __LINE__
  451. }
  452. { .mfb
  453. adds C9 = 4 * SIZE, C1
  454. FMA f104 = f32, f53, f104 // A1 * B6
  455. nop __LINE__
  456. }
  457. ;;
  458. /* 4 */
  459. { .mfi
  460. (p3) LDFD f56 = [BOFFSET], SIZE
  461. FMA f112 = f32, f54, f112 // A1 * B7
  462. adds C10 = 4 * SIZE, C2
  463. }
  464. { .mfb
  465. (p3) LDFD f41 = [AOFFSET], SIZE
  466. FMA f120 = f32, f55, f120 // A1 * B8
  467. nop __LINE__
  468. }
  469. ;;
  470. /* 5 */
  471. { .mfi
  472. (p3) LDFD f57 = [BOFFSET], SIZE
  473. FMA f65 = f33, f48, f65 // A2 * B1
  474. adds C11 = 4 * SIZE, C3
  475. }
  476. { .mfb
  477. (p3) LDFD f42 = [AOFFSET], SIZE
  478. FMA f73 = f33, f49, f73 // A2 * B2
  479. nop __LINE__
  480. }
  481. ;;
  482. /* 6 */
  483. { .mfi
  484. (p3) LDFD f58 = [BOFFSET], SIZE
  485. FMA f81 = f33, f50, f81 // A2 * B3
  486. adds C12 = 4 * SIZE, C4
  487. }
  488. { .mfb
  489. (p3) LDFD f43 = [AOFFSET], SIZE
  490. FMA f89 = f33, f51, f89 // A2 * B4
  491. nop __LINE__
  492. }
  493. ;;
  494. /* 7 */
  495. { .mfi
  496. (p3) LDFD f59 = [BOFFSET], SIZE
  497. FMA f97 = f33, f52, f97 // A2 * B5
  498. adds C13 = 4 * SIZE, C5
  499. }
  500. { .mfb
  501. (p3) LDFD f44 = [AOFFSET], SIZE
  502. FMA f105 = f33, f53, f105 // A2 * B6
  503. nop __LINE__
  504. }
  505. ;;
  506. /* 8 */
  507. { .mfi
  508. (p3) LDFD f60 = [BOFFSET], SIZE
  509. FMA f113 = f33, f54, f113 // A2 * B7
  510. adds C14 = 4 * SIZE, C6
  511. }
  512. { .mfb
  513. (p3) LDFD f45 = [AOFFSET], SIZE
  514. FMA f121 = f33, f55, f121 // A2 * B8
  515. nop __LINE__
  516. }
  517. ;;
  518. /* 9 */
  519. { .mfi
  520. (p3) LDFD f61 = [BOFFSET], SIZE
  521. FMA f66 = f34, f48, f66 // A3 * B1
  522. adds C15 = 4 * SIZE, C7
  523. }
  524. { .mfb
  525. (p3) LDFD f46 = [AOFFSET], SIZE
  526. FMA f74 = f34, f49, f74 // A3 * B2
  527. nop __LINE__
  528. }
  529. ;;
  530. /* 10 */
  531. { .mfi
  532. (p3) LDFD f62 = [BOFFSET], SIZE
  533. FMA f82 = f34, f50, f82 // A3 * B3
  534. adds C16 = 4 * SIZE, C8
  535. }
  536. { .mfb
  537. (p3) LDFD f47 = [AOFFSET], SIZE
  538. FMA f90 = f34, f51, f90 // A3 * B4
  539. nop __LINE__
  540. }
  541. ;;
  542. /* 11 */
  543. { .mfb
  544. (p3) LDFD f63 = [BOFFSET], SIZE
  545. FMA f98 = f34, f52, f98 // A3 * B5
  546. nop __LINE__
  547. }
  548. { .mfb
  549. nop __LINE__
  550. FMA f106 = f34, f53, f106 // A3 * B6
  551. nop __LINE__
  552. }
  553. ;;
  554. /* 12 */
  555. { .mfb
  556. nop __LINE__
  557. FMA f114 = f34, f54, f114 // A3 * B7
  558. nop __LINE__
  559. }
  560. { .mfb
  561. nop __LINE__
  562. FMA f122 = f34, f55, f122 // A3 * B8
  563. nop __LINE__
  564. }
  565. ;;
  566. /* 13 */
  567. { .mfb
  568. nop __LINE__
  569. FMA f67 = f35, f48, f67 // A4 * B1
  570. nop __LINE__
  571. }
  572. { .mfb
  573. nop __LINE__
  574. FMA f75 = f35, f49, f75 // A4 * B2
  575. nop __LINE__
  576. }
  577. ;;
  578. /* 14 */
  579. { .mfb
  580. nop __LINE__
  581. FMA f83 = f35, f50, f83 // A4 * B3
  582. nop __LINE__
  583. }
  584. { .mfb
  585. nop __LINE__
  586. FMA f91 = f35, f51, f91 // A4 * B4
  587. nop __LINE__
  588. }
  589. ;;
  590. /* 15 */
  591. { .mfb
  592. nop __LINE__
  593. FMA f99 = f35, f52, f99 // A4 * B5
  594. nop __LINE__
  595. }
  596. { .mfb
  597. nop __LINE__
  598. FMA f107 = f35, f53, f107 // A4 * B6
  599. nop __LINE__
  600. }
  601. ;;
  602. /* 16 */
  603. { .mfb
  604. nop __LINE__
  605. FMA f115 = f35, f54, f115 // A4 * B7
  606. nop __LINE__
  607. }
  608. { .mfb
  609. nop __LINE__
  610. FMA f123 = f35, f55, f123 // A4 * B8
  611. nop __LINE__
  612. }
  613. ;;
  614. /* 17 */
  615. { .mfb
  616. nop __LINE__
  617. FMA f68 = f36, f48, f68 // A5 * B1
  618. nop __LINE__
  619. }
  620. { .mfb
  621. nop __LINE__
  622. FMA f76 = f36, f49, f76 // A5 * B2
  623. nop __LINE__
  624. }
  625. ;;
  626. /* 18 */
  627. { .mfb
  628. nop __LINE__
  629. FMA f84 = f36, f50, f84 // A5 * B3
  630. nop __LINE__
  631. }
  632. { .mfb
  633. nop __LINE__
  634. FMA f92 = f36, f51, f92 // A5 * B4
  635. nop __LINE__
  636. }
  637. ;;
  638. /* 19 */
  639. { .mfb
  640. nop __LINE__
  641. FMA f100 = f36, f52, f100 // A5 * B5
  642. nop __LINE__
  643. }
  644. { .mfb
  645. nop __LINE__
  646. FMA f108 = f36, f53, f108 // A5 * B6
  647. nop __LINE__
  648. }
  649. ;;
  650. /* 20 */
  651. { .mfb
  652. nop __LINE__
  653. FMA f116 = f36, f54, f116 // A5 * B7
  654. nop __LINE__
  655. }
  656. { .mfb
  657. nop __LINE__
  658. FMA f124 = f36, f55, f124 // A5 * B8
  659. nop __LINE__
  660. }
  661. ;;
  662. /* 21 */
  663. { .mfb
  664. nop __LINE__
  665. FMA f69 = f37, f48, f69 // A6 * B1
  666. nop __LINE__
  667. }
  668. { .mfb
  669. nop __LINE__
  670. FMA f77 = f37, f49, f77 // A6 * B2
  671. nop __LINE__
  672. }
  673. ;;
  674. /* 22 */
  675. { .mfb
  676. nop __LINE__
  677. FMA f85 = f37, f50, f85 // A6 * B3
  678. nop __LINE__
  679. }
  680. { .mfb
  681. nop __LINE__
  682. FMA f93 = f37, f51, f93 // A6 * B4
  683. nop __LINE__
  684. }
  685. ;;
  686. /* 23 */
  687. { .mfb
  688. nop __LINE__
  689. FMA f101 = f37, f52, f101 // A6 * B5
  690. nop __LINE__
  691. }
  692. { .mfb
  693. nop __LINE__
  694. FMA f109 = f37, f53, f109 // A6 * B6
  695. nop __LINE__
  696. }
  697. ;;
  698. /* 24 */
  699. { .mfb
  700. nop __LINE__
  701. FMA f117 = f37, f54, f117 // A6 * B7
  702. nop __LINE__
  703. }
  704. { .mfb
  705. nop __LINE__
  706. FMA f125 = f37, f55, f125 // A6 * B8
  707. nop __LINE__
  708. }
  709. ;;
  710. /* 25 */
  711. { .mfb
  712. nop __LINE__
  713. FMA f70 = f38, f48, f70 // A7 * B1
  714. nop __LINE__
  715. }
  716. { .mfb
  717. nop __LINE__
  718. FMA f78 = f38, f49, f78 // A7 * B2
  719. nop __LINE__
  720. }
  721. ;;
  722. /* 26 */
  723. { .mfb
  724. nop __LINE__
  725. FMA f86 = f38, f50, f86 // A7 * B3
  726. nop __LINE__
  727. }
  728. { .mfb
  729. nop __LINE__
  730. FMA f94 = f38, f51, f94 // A7 * B4
  731. nop __LINE__
  732. }
  733. ;;
  734. /* 27 */
  735. { .mfb
  736. nop __LINE__
  737. FMA f102 = f38, f52, f102 // A7 * B5
  738. nop __LINE__
  739. }
  740. { .mfb
  741. nop __LINE__
  742. FMA f110 = f38, f53, f110 // A7 * B6
  743. nop __LINE__
  744. }
  745. ;;
  746. /* 28 */
  747. { .mfb
  748. nop __LINE__
  749. FMA f118 = f38, f54, f118 // A7 * B7
  750. nop __LINE__
  751. }
  752. { .mfb
  753. nop __LINE__
  754. FMA f126 = f38, f55, f126 // A7 * B8
  755. nop __LINE__
  756. }
  757. ;;
  758. /* 29 */
  759. { .mfb
  760. (p4) LDFD f32 = [AOFFSET], SIZE
  761. FMA f71 = f39, f48, f71 // A8 * B1
  762. nop __LINE__
  763. }
  764. { .mfb
  765. nop __LINE__
  766. FMA f79 = f39, f49, f79 // A8 * B2
  767. nop __LINE__
  768. }
  769. ;;
  770. /* 30 */
  771. { .mfb
  772. (p4) LDFD f33 = [AOFFSET], SIZE
  773. FMA f87 = f39, f50, f87 // A8 * B3
  774. nop __LINE__
  775. }
  776. { .mfb
  777. (p4) LDFD f48 = [BOFFSET], SIZE
  778. FMA f95 = f39, f51, f95 // A8 * B4
  779. nop __LINE__
  780. }
  781. ;;
  782. /* 31 */
  783. { .mfb
  784. (p4) LDFD f34 = [AOFFSET], SIZE
  785. FMA f103 = f39, f52, f103 // A8 * B5
  786. nop __LINE__
  787. }
  788. { .mfb
  789. (p4) LDFD f49 = [BOFFSET], SIZE
  790. FMA f111 = f39, f53, f111 // A8 * B6
  791. nop __LINE__
  792. }
  793. ;;
  794. /* 32 */
  795. { .mfb
  796. lfetch.fault.nt1 [PREA], 8 * SIZE
  797. FMA f119 = f39, f54, f119 // A8 * B7
  798. nop __LINE__
  799. }
  800. { .mfb
  801. nop __LINE__
  802. FMA f127 = f39, f55, f127 // A8 * B8
  803. nop __LINE__
  804. }
  805. ;;
  806. /* 33 */
  807. { .mfb
  808. lfetch.nt1 [PREB], 8 * SIZE
  809. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  810. nop __LINE__
  811. }
  812. { .mfb
  813. nop __LINE__
  814. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  815. nop __LINE__
  816. }
  817. ;;
  818. /* 34 */
  819. { .mfb
  820. (p4) LDFD f35 = [AOFFSET], SIZE
  821. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  822. nop __LINE__
  823. }
  824. { .mfb
  825. (p4) LDFD f50 = [BOFFSET], SIZE
  826. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  827. nop __LINE__
  828. }
  829. ;;
  830. /* 35 */
  831. { .mfb
  832. (p4) LDFD f36 = [AOFFSET], SIZE
  833. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  834. nop __LINE__
  835. }
  836. { .mfb
  837. (p4) LDFD f51 = [BOFFSET], SIZE
  838. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  839. nop __LINE__
  840. }
  841. ;;
  842. /* 36 */
  843. { .mfb
  844. (p4) LDFD f37 = [AOFFSET], SIZE
  845. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  846. nop __LINE__
  847. }
  848. { .mfb
  849. (p4) LDFD f52 = [BOFFSET], SIZE
  850. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  851. nop __LINE__
  852. }
  853. ;;
  854. /* 37 */
  855. { .mfb
  856. (p4) LDFD f38 = [AOFFSET], SIZE
  857. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  858. nop __LINE__
  859. }
  860. { .mfb
  861. (p4) LDFD f53 = [BOFFSET], SIZE
  862. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  863. nop __LINE__
  864. }
  865. ;;
  866. /* 38 */
  867. { .mfb
  868. (p4) LDFD f39 = [AOFFSET], SIZE
  869. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  870. nop __LINE__
  871. }
  872. { .mfb
  873. (p4) LDFD f54 = [BOFFSET], SIZE
  874. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  875. nop __LINE__
  876. }
  877. ;;
  878. /* 39 */
  879. { .mfb
  880. (p4) LDFD f55 = [BOFFSET], SIZE
  881. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  882. nop __LINE__
  883. }
  884. { .mfb
  885. nop __LINE__
  886. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  887. nop __LINE__
  888. }
  889. ;;
  890. /* 40 */
  891. { .mfb
  892. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  893. (p5) LDFD f6 = [C1 ], SIZE
  894. #else
  895. nop __LINE__
  896. #endif
  897. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  898. nop __LINE__
  899. }
  900. { .mfb
  901. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  902. (p5) LDFD f7 = [C9 ], SIZE
  903. #else
  904. nop __LINE__
  905. #endif
  906. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  907. nop __LINE__
  908. }
  909. ;;
  910. /* 41 */
  911. { .mfb
  912. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  913. (p5) LDFD f10 = [C1 ], SIZE
  914. #else
  915. nop __LINE__
  916. #endif
  917. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  918. nop __LINE__
  919. }
  920. { .mfb
  921. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  922. (p5) LDFD f11 = [C9 ], SIZE
  923. #else
  924. nop __LINE__
  925. #endif
  926. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  927. nop __LINE__
  928. }
  929. ;;
  930. /* 42 */
  931. { .mfb
  932. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  933. (p5) LDFD f12 = [C1 ], SIZE
  934. #else
  935. nop __LINE__
  936. #endif
  937. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  938. nop __LINE__
  939. }
  940. { .mfb
  941. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  942. (p5) LDFD f13 = [C9 ], SIZE
  943. #else
  944. nop __LINE__
  945. #endif
  946. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  947. nop __LINE__
  948. }
  949. ;;
  950. /* 43 */
  951. { .mfb
  952. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  953. (p5) LDFD f14 = [C1 ], -3 * SIZE
  954. #else
  955. nop __LINE__
  956. #endif
  957. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  958. nop __LINE__
  959. }
  960. { .mfb
  961. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  962. (p5) LDFD f15 = [C9 ], -3 * SIZE
  963. #else
  964. nop __LINE__
  965. #endif
  966. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  967. nop __LINE__
  968. }
  969. ;;
  970. /* 44 */
  971. { .mfb
  972. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  973. (p5) LDFD f16 = [C2 ], SIZE
  974. #else
  975. nop __LINE__
  976. #endif
  977. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  978. nop __LINE__
  979. }
  980. { .mfb
  981. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  982. (p5) LDFD f17 = [C10], SIZE
  983. #else
  984. nop __LINE__
  985. #endif
  986. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  987. nop __LINE__
  988. }
  989. ;;
  990. /* 45 */
  991. { .mfb
  992. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  993. (p5) LDFD f18 = [C2 ], SIZE
  994. #else
  995. nop __LINE__
  996. #endif
  997. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  998. nop __LINE__
  999. }
  1000. { .mfb
  1001. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1002. (p5) LDFD f19 = [C10], SIZE
  1003. #else
  1004. nop __LINE__
  1005. #endif
  1006. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  1007. nop __LINE__
  1008. }
  1009. ;;
  1010. /* 46 */
  1011. { .mfb
  1012. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1013. (p5) LDFD f20 = [C2 ], SIZE
  1014. #else
  1015. nop __LINE__
  1016. #endif
  1017. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  1018. nop __LINE__
  1019. }
  1020. { .mfb
  1021. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1022. (p5) LDFD f21 = [C10], SIZE
  1023. #else
  1024. nop __LINE__
  1025. #endif
  1026. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  1027. nop __LINE__
  1028. }
  1029. ;;
  1030. /* 47 */
  1031. { .mfb
  1032. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1033. (p5) LDFD f22 = [C2 ], -3 * SIZE
  1034. #else
  1035. nop __LINE__
  1036. #endif
  1037. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  1038. nop __LINE__
  1039. }
  1040. { .mfb
  1041. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1042. (p5) LDFD f23 = [C10], -3 * SIZE
  1043. #else
  1044. nop __LINE__
  1045. #endif
  1046. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  1047. nop __LINE__
  1048. }
  1049. ;;
  1050. /* 48 */
  1051. { .mfb
  1052. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1053. (p5) LDFD f24 = [C3 ], SIZE
  1054. #else
  1055. nop __LINE__
  1056. #endif
  1057. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  1058. nop __LINE__
  1059. }
  1060. { .mfb
  1061. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1062. (p5) LDFD f25 = [C11], SIZE
  1063. #else
  1064. nop __LINE__
  1065. #endif
  1066. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  1067. nop __LINE__
  1068. }
  1069. ;;
  1070. /* 49 */
  1071. { .mfb
  1072. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1073. (p5) LDFD f26 = [C3 ], SIZE
  1074. #else
  1075. nop __LINE__
  1076. #endif
  1077. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  1078. nop __LINE__
  1079. }
  1080. { .mfb
  1081. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1082. (p5) LDFD f27 = [C11], SIZE
  1083. #else
  1084. nop __LINE__
  1085. #endif
  1086. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  1087. nop __LINE__
  1088. }
  1089. ;;
  1090. /* 50 */
  1091. { .mfb
  1092. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1093. (p5) LDFD f28 = [C3 ], SIZE
  1094. #else
  1095. nop __LINE__
  1096. #endif
  1097. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  1098. nop __LINE__
  1099. }
  1100. { .mfb
  1101. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1102. (p5) LDFD f29 = [C11], SIZE
  1103. #else
  1104. nop __LINE__
  1105. #endif
  1106. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  1107. nop __LINE__
  1108. }
  1109. ;;
  1110. /* 51 */
  1111. { .mfb
  1112. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1113. (p5) LDFD f30 = [C3 ], -3 * SIZE
  1114. #else
  1115. nop __LINE__
  1116. #endif
  1117. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  1118. nop __LINE__
  1119. }
  1120. { .mfb
  1121. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1122. (p5) LDFD f31 = [C11], -3 * SIZE
  1123. #else
  1124. nop __LINE__
  1125. #endif
  1126. (p3) FMA f108 = f44, f61, f108 // A5 * B6
  1127. nop __LINE__
  1128. }
  1129. ;;
  1130. /* 52 */
  1131. { .mfb
  1132. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1133. (p5) LDFD f32 = [C4 ], SIZE
  1134. #else
  1135. nop __LINE__
  1136. #endif
  1137. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  1138. nop __LINE__
  1139. }
  1140. { .mfb
  1141. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1142. (p5) LDFD f33 = [C12], SIZE
  1143. #else
  1144. nop __LINE__
  1145. #endif
  1146. (p3) FMA f124 = f44, f63, f124 // A5 * B8
  1147. nop __LINE__
  1148. }
  1149. ;;
  1150. /* 53 */
  1151. { .mfb
  1152. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1153. (p5) LDFD f34 = [C4 ], SIZE
  1154. #else
  1155. nop __LINE__
  1156. #endif
  1157. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  1158. nop __LINE__
  1159. }
  1160. { .mfb
  1161. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1162. (p5) LDFD f35 = [C12], SIZE
  1163. #else
  1164. nop __LINE__
  1165. #endif
  1166. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  1167. nop __LINE__
  1168. }
  1169. ;;
  1170. /* 54 */
  1171. { .mfb
  1172. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1173. (p5) LDFD f36 = [C4 ], SIZE
  1174. #else
  1175. nop __LINE__
  1176. #endif
  1177. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  1178. nop __LINE__
  1179. }
  1180. { .mfb
  1181. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1182. (p5) LDFD f37 = [C12], SIZE
  1183. #else
  1184. nop __LINE__
  1185. #endif
  1186. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  1187. nop __LINE__
  1188. }
  1189. ;;
  1190. /* 55 */
  1191. { .mfb
  1192. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1193. (p5) LDFD f38 = [C4 ], -3 * SIZE
  1194. #else
  1195. nop __LINE__
  1196. #endif
  1197. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  1198. nop __LINE__
  1199. }
  1200. { .mfb
  1201. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1202. (p5) LDFD f39 = [C12], -3 * SIZE
  1203. #else
  1204. nop __LINE__
  1205. #endif
  1206. (p3) FMA f109 = f45, f61, f109 // A6 * B6
  1207. nop __LINE__
  1208. }
  1209. ;;
  1210. /* 56 */
  1211. { .mfb
  1212. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1213. (p5) LDFD f48 = [C5 ], SIZE
  1214. #else
  1215. nop __LINE__
  1216. #endif
  1217. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  1218. nop __LINE__
  1219. }
  1220. { .mfb
  1221. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1222. (p5) LDFD f49 = [C13], SIZE
  1223. #else
  1224. nop __LINE__
  1225. #endif
  1226. (p3) FMA f125 = f45, f63, f125 // A6 * B8
  1227. nop __LINE__
  1228. }
  1229. ;;
  1230. /* 57 */
  1231. { .mfb
  1232. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1233. (p5) LDFD f50 = [C5 ], SIZE
  1234. #else
  1235. nop __LINE__
  1236. #endif
  1237. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  1238. nop __LINE__
  1239. }
  1240. { .mfb
  1241. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1242. (p5) LDFD f51 = [C13], SIZE
  1243. #else
  1244. nop __LINE__
  1245. #endif
  1246. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  1247. nop __LINE__
  1248. }
  1249. ;;
  1250. /* 58 */
  1251. { .mfb
  1252. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1253. (p5) LDFD f52 = [C5 ], SIZE
  1254. #else
  1255. nop __LINE__
  1256. #endif
  1257. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  1258. nop __LINE__
  1259. }
  1260. { .mfb
  1261. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1262. (p5) LDFD f53 = [C13], SIZE
  1263. #else
  1264. nop __LINE__
  1265. #endif
  1266. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  1267. nop __LINE__
  1268. }
  1269. ;;
  1270. /* 59 */
  1271. { .mfb
  1272. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1273. (p5) LDFD f54 = [C5 ], -3 * SIZE
  1274. #else
  1275. nop __LINE__
  1276. #endif
  1277. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  1278. nop __LINE__
  1279. }
  1280. { .mfb
  1281. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1282. (p5) LDFD f55 = [C13], -3 * SIZE
  1283. #else
  1284. nop __LINE__
  1285. #endif
  1286. (p3) FMA f110 = f46, f61, f110 // A7 * B6
  1287. nop __LINE__
  1288. }
  1289. ;;
  1290. /* 60 */
  1291. { .mfb
  1292. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1293. (p5) LDFD f40 = [C6 ], SIZE
  1294. #else
  1295. nop __LINE__
  1296. #endif
  1297. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  1298. nop __LINE__
  1299. }
  1300. { .mfb
  1301. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1302. (p5) LDFD f41 = [C14], SIZE
  1303. #else
  1304. nop __LINE__
  1305. #endif
  1306. (p3) FMA f126 = f46, f63, f126 // A7 * B8
  1307. nop __LINE__
  1308. }
  1309. ;;
  1310. /* 61 */
  1311. { .mfb
  1312. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1313. (p5) LDFD f42 = [C6 ], SIZE
  1314. #else
  1315. nop __LINE__
  1316. #endif
  1317. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  1318. nop __LINE__
  1319. }
  1320. { .mfb
  1321. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1322. (p5) LDFD f43 = [C14], SIZE
  1323. #else
  1324. nop __LINE__
  1325. #endif
  1326. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  1327. nop __LINE__
  1328. }
  1329. ;;
  1330. /* 62 */
  1331. { .mfb
  1332. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1333. (p5) LDFD f44 = [C6 ], SIZE
  1334. #else
  1335. nop __LINE__
  1336. #endif
  1337. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  1338. nop __LINE__
  1339. }
  1340. { .mfb
  1341. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1342. (p5) LDFD f45 = [C14], SIZE
  1343. #else
  1344. nop __LINE__
  1345. #endif
  1346. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  1347. nop __LINE__
  1348. }
  1349. ;;
  1350. /* 63 */
  1351. { .mfb
  1352. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1353. (p5) LDFD f59 = [C6 ], -3 * SIZE
  1354. #else
  1355. nop __LINE__
  1356. #endif
  1357. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  1358. nop __LINE__
  1359. }
  1360. { .mfb
  1361. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1362. (p5) LDFD f60 = [C14], -3 * SIZE
  1363. #else
  1364. nop __LINE__
  1365. #endif
  1366. (p3) FMA f111 = f47, f61, f111 // A8 * B6
  1367. nop __LINE__
  1368. }
  1369. ;;
  1370. /* 64 */
  1371. { .mfi
  1372. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1373. (p5) LDFD f61 = [C7 ], SIZE
  1374. #else
  1375. nop __LINE__
  1376. #endif
  1377. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  1378. adds L = -1, L
  1379. }
  1380. { .mfb
  1381. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1382. (p5) LDFD f62 = [C15], SIZE
  1383. #else
  1384. nop __LINE__
  1385. #endif
  1386. (p3) FMA f127 = f47, f63, f127 // A8 * B8
  1387. br.cloop.sptk.few .L012
  1388. }
  1389. ;;
  /*
   * .L013: write-back phase that follows the .L012 loop.
   * This arm is compiled only when neither TRMMKERNEL nor BETAZERO is defined.
   * Each accumulator is updated as acc = ALPHA * acc + c, assuming the FMA macro
   * (defined outside this excerpt) expands to the IA-64 fma d = a * b + c.
   * The third FMA operands (f6, f7, f10, ...) are expected to hold values of C
   * loaded earlier; those loads are not all visible from here.
   * While the FMAs run, the loads below fetch further C values into registers
   * whose old contents have just been consumed.
   */
  1390. .L013:
  1391. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  1392. { .mfi
  1393. (p5) LDFD f63 = [C7 ], SIZE
  1394. FMA f64 = ALPHA, f64, f6
  1395. cmp.ne p6, p0 = 1, I // p6 = (I != 1); p6 later guards the branch back to .L011
  1396. }
  1397. { .mfb
  1398. (p5) LDFD f6 = [C15], SIZE // f6 is reloaded here; the FMA above in the same group reads f6 too (no stop in between)
  1399. FMA f68 = ALPHA, f68, f7
  1400. nop __LINE__
  1401. }
  1402. ;;
  1403. { .mfi
  1404. (p5) LDFD f7 = [C7 ], SIZE
  1405. FMA f65 = ALPHA, f65, f10
  1406. adds I = -1, I // I = I - 1
  1407. }
  1408. { .mfb
  1409. (p5) LDFD f10 = [C15], SIZE
  1410. FMA f69 = ALPHA, f69, f11
  1411. nop __LINE__
  1412. }
  1413. ;;
  1414. { .mfb
  1415. (p5) LDFD f11 = [C7 ], -3 * SIZE // post-increment of -3 * SIZE steps C7 back by three elements
  1416. FMA f66 = ALPHA, f66, f12
  1417. nop __LINE__
  1418. }
  1419. { .mfb
  1420. (p5) LDFD f12 = [C15], -3 * SIZE // same rewind for C15
  1421. FMA f70 = ALPHA, f70, f13
  1422. nop __LINE__
  1423. }
  1424. ;;
  /* The C8 / C16 loads from here on carry no qualifying predicate. */
  1425. { .mfb
  1426. LDFD f13 = [C8 ], SIZE
  1427. FMA f67 = ALPHA, f67, f14
  1428. nop __LINE__
  1429. }
  1430. { .mfb
  1431. LDFD f14 = [C16], SIZE
  1432. FMA f71 = ALPHA, f71, f15
  1433. nop __LINE__
  1434. }
  1435. ;;
  1436. { .mmf
  1437. STFD [C1 ] = f64, SIZE
  1438. STFD [C9 ] = f68, SIZE
  1439. FMA f72 = ALPHA, f72, f16
  1440. }
  1441. { .mmf
  1442. LDFD f15 = [C8 ], SIZE
  1443. LDFD f16 = [C16], SIZE
  1444. FMA f76 = ALPHA, f76, f17
  1445. }
  1446. ;;
  1447. { .mmf
  1448. STFD [C1 ] = f65, SIZE
  1449. STFD [C9 ] = f69, SIZE
  1450. FMA f73 = ALPHA, f73, f18
  1451. }
  1452. { .mmf
  1453. LDFD f17 = [C8 ], SIZE
  1454. LDFD f18 = [C16], SIZE
  1455. FMA f77 = ALPHA, f77, f19
  1456. }
  1457. ;;
  1458. { .mmf
  1459. STFD [C1 ] = f66, SIZE
  1460. STFD [C9 ] = f70, SIZE
  1461. FMA f74 = ALPHA, f74, f20
  1462. }
  1463. { .mmf
  1464. LDFD f19 = [C8 ], -3 * SIZE
  1465. LDFD f20 = [C16], -3 * SIZE
  1466. FMA f78 = ALPHA, f78, f21
  1467. }
  1468. ;;
  1469. { .mfb
  1470. STFD [C1 ] = f67, 5 * SIZE
  1471. FMA f75 = ALPHA, f75, f22
  1472. nop __LINE__
  1473. }
  1474. { .mfb
  1475. STFD [C9 ] = f71, 5 * SIZE
  1476. FMA f79 = ALPHA, f79, f23
  1477. nop __LINE__
  1478. }
  1479. ;;
  1480. { .mfb
  1481. STFD [C2 ] = f72, SIZE
  1482. FMA f80 = ALPHA, f80, f24
  1483. nop __LINE__
  1484. }
  1485. { .mfb
  1486. STFD [C10] = f76, SIZE
  1487. FMA f84 = ALPHA, f84, f25
  1488. nop __LINE__
  1489. }
  1490. ;;
  1491. { .mfb
  1492. STFD [C2 ] = f73, SIZE
  1493. FMA f81 = ALPHA, f81, f26
  1494. nop __LINE__
  1495. }
  1496. { .mfb
  1497. STFD [C10] = f77, SIZE
  1498. FMA f85 = ALPHA, f85, f27
  1499. nop __LINE__
  1500. }
  1501. ;;
  1502. { .mfb
  1503. STFD [C2 ] = f74, SIZE
  1504. FMA f82 = ALPHA, f82, f28
  1505. nop __LINE__
  1506. }
  1507. { .mfb
  1508. STFD [C10] = f78, SIZE
  1509. FMA f86 = ALPHA, f86, f29
  1510. nop __LINE__
  1511. }
  1512. ;;
  1513. { .mfb
  1514. STFD [C2 ] = f75, 5 * SIZE
  1515. FMA f83 = ALPHA, f83, f30
  1516. nop __LINE__
  1517. }
  1518. { .mfb
  1519. STFD [C10] = f79, 5 * SIZE
  1520. FMA f87 = ALPHA, f87, f31
  1521. nop __LINE__
  1522. }
  1523. ;;
  1524. { .mfb
  1525. STFD [C3 ] = f80, SIZE
  1526. FMA f88 = ALPHA, f88, f32
  1527. nop __LINE__
  1528. }
  1529. { .mfb
  1530. STFD [C11] = f84, SIZE
  1531. FMA f92 = ALPHA, f92, f33
  1532. nop __LINE__
  1533. }
  1534. ;;
  1535. { .mfb
  1536. STFD [C3 ] = f81, SIZE
  1537. FMA f89 = ALPHA, f89, f34
  1538. nop __LINE__
  1539. }
  1540. { .mfb
  1541. STFD [C11] = f85, SIZE
  1542. FMA f93 = ALPHA, f93, f35
  1543. nop __LINE__
  1544. }
  1545. ;;
  1546. { .mfb
  1547. STFD [C3 ] = f82, SIZE
  1548. FMA f90 = ALPHA, f90, f36
  1549. nop __LINE__
  1550. }
  1551. { .mfb
  1552. STFD [C11] = f86, SIZE
  1553. FMA f94 = ALPHA, f94, f37
  1554. nop __LINE__
  1555. }
  1556. ;;
  1557. { .mfb
  1558. STFD [C3 ] = f83, 5 * SIZE
  1559. FMA f91 = ALPHA, f91, f38
  1560. nop __LINE__
  1561. }
  1562. { .mfb
  1563. STFD [C11] = f87, 5 * SIZE
  1564. FMA f95 = ALPHA, f95, f39
  1565. nop __LINE__
  1566. }
  1567. ;;
  1568. { .mfb
  1569. STFD [C4 ] = f88, SIZE
  1570. FMA f96 = ALPHA, f96, f48
  1571. nop __LINE__
  1572. }
  1573. { .mfb
  1574. STFD [C12] = f92, SIZE
  1575. FMA f100 = ALPHA, f100, f49
  1576. nop __LINE__
  1577. }
  1578. ;;
  1579. { .mfb
  1580. STFD [C4 ] = f89, SIZE
  1581. FMA f97 = ALPHA, f97, f50
  1582. nop __LINE__
  1583. }
  1584. { .mfb
  1585. STFD [C12] = f93, SIZE
  1586. FMA f101 = ALPHA, f101, f51
  1587. nop __LINE__
  1588. }
  1589. ;;
  1590. { .mfb
  1591. STFD [C4 ] = f90, SIZE
  1592. FMA f98 = ALPHA, f98, f52
  1593. nop __LINE__
  1594. }
  1595. { .mfb
  1596. STFD [C12] = f94, SIZE
  1597. FMA f102 = ALPHA, f102, f53
  1598. nop __LINE__
  1599. }
  1600. ;;
  1601. { .mfb
  1602. STFD [C4 ] = f91, 5 * SIZE
  1603. FMA f99 = ALPHA, f99, f54
  1604. nop __LINE__
  1605. }
  1606. { .mfb
  1607. STFD [C12] = f95, 5 * SIZE
  1608. FMA f103 = ALPHA, f103, f55
  1609. nop __LINE__
  1610. }
  1611. ;;
  1612. { .mfb
  1613. STFD [C5 ] = f96, SIZE
  1614. FMA f104 = ALPHA, f104, f40
  1615. nop __LINE__
  1616. }
  1617. { .mfb
  1618. STFD [C13] = f100, SIZE
  1619. FMA f108 = ALPHA, f108, f41
  1620. nop __LINE__
  1621. }
  1622. ;;
  1623. { .mfb
  1624. STFD [C5 ] = f97, SIZE
  1625. FMA f105 = ALPHA, f105, f42
  1626. nop __LINE__
  1627. }
  1628. { .mfb
  1629. STFD [C13] = f101, SIZE
  1630. FMA f109 = ALPHA, f109, f43
  1631. nop __LINE__
  1632. }
  1633. ;;
  1634. { .mfb
  1635. STFD [C5 ] = f98, SIZE
  1636. FMA f106 = ALPHA, f106, f44
  1637. nop __LINE__
  1638. }
  1639. { .mfb
  1640. STFD [C13] = f102, SIZE
  1641. FMA f110 = ALPHA, f110, f45
  1642. nop __LINE__
  1643. }
  1644. ;;
  1645. { .mfb
  1646. STFD [C5 ] = f99, 5 * SIZE
  1647. FMA f107 = ALPHA, f107, f59
  1648. nop __LINE__
  1649. }
  1650. { .mfb
  1651. STFD [C13] = f103, 5 * SIZE
  1652. FMA f111 = ALPHA, f111, f60
  1653. nop __LINE__
  1654. }
  1655. ;;
  1656. { .mfb
  1657. STFD [C6 ] = f104, SIZE
  1658. FMA f112 = ALPHA, f112, f61
  1659. nop __LINE__
  1660. }
  1661. { .mfb
  1662. STFD [C14] = f108, SIZE
  1663. FMA f116 = ALPHA, f116, f62
  1664. nop __LINE__
  1665. }
  1666. ;;
  1667. { .mfb
  1668. STFD [C6 ] = f105, SIZE
  1669. FMA f113 = ALPHA, f113, f63
  1670. nop __LINE__
  1671. }
  1672. { .mfb
  1673. STFD [C14] = f109, SIZE
  1674. FMA f117 = ALPHA, f117, f6
  1675. nop __LINE__
  1676. }
  1677. ;;
  1678. { .mfb
  1679. STFD [C6 ] = f106, SIZE
  1680. FMA f114 = ALPHA, f114, f7
  1681. nop __LINE__
  1682. }
  1683. { .mfb
  1684. STFD [C14] = f110, SIZE
  1685. FMA f118 = ALPHA, f118, f10
  1686. nop __LINE__
  1687. }
  1688. ;;
  1689. { .mfb
  1690. STFD [C6 ] = f107, 5 * SIZE
  1691. FMA f115 = ALPHA, f115, f11
  1692. nop __LINE__
  1693. }
  1694. { .mfb
  1695. STFD [C14] = f111, 5 * SIZE
  1696. FMA f119 = ALPHA, f119, f12
  1697. nop __LINE__
  1698. }
  1699. ;;
  1700. { .mfb
  1701. STFD [C7 ] = f112, SIZE
  1702. FMA f120 = ALPHA, f120, f13
  1703. nop __LINE__
  1704. }
  1705. { .mfb
  1706. STFD [C15] = f116, SIZE
  1707. FMA f124 = ALPHA, f124, f14
  1708. nop __LINE__
  1709. }
  1710. ;;
  1711. { .mfb
  1712. STFD [C7 ] = f113, SIZE
  1713. FMA f121 = ALPHA, f121, f15
  1714. nop __LINE__
  1715. }
  1716. { .mfb
  1717. STFD [C15] = f117, SIZE
  1718. FMA f125 = ALPHA, f125, f16
  1719. nop __LINE__
  1720. }
  1721. ;;
  1722. { .mfb
  1723. STFD [C7 ] = f114, SIZE
  1724. FMA f122 = ALPHA, f122, f17
  1725. nop __LINE__
  1726. }
  1727. { .mfb
  1728. STFD [C15] = f118, SIZE
  1729. FMA f126 = ALPHA, f126, f18
  1730. nop __LINE__
  1731. }
  1732. ;;
  1733. { .mfb
  1734. STFD [C7 ] = f115, 5 * SIZE
  1735. FMA f123 = ALPHA, f123, f19
  1736. nop __LINE__
  1737. }
  1738. { .mfb
  1739. STFD [C15] = f119, 5 * SIZE
  1740. FMA f127 = ALPHA, f127, f20
  1741. nop __LINE__
  1742. }
  1743. ;;
  /*
   * End of the write-back arm: store the last eight results (f120-f123 through
   * C8, f124-f127 through C16) and, in the F slot of the same bundles, reset
   * f64, f72, ..., f120 to zero (f0 always reads as +0.0 on IA-64).
   * The final bundle branches back to .L011 when p6 is set.
   */
  1744. { .mfb
  1745. STFD [C8 ] = f120, SIZE
  1746. mov f64 = f0
  1747. nop __LINE__
  1748. }
  1749. { .mfb
  1750. STFD [C16] = f124, SIZE
  1751. mov f72 = f0
  1752. nop __LINE__
  1753. }
  1754. ;;
  1755. { .mfi
  1756. STFD [C8 ] = f121, SIZE
  1757. mov f80 = f0
  1758. nop __LINE__
  1759. }
  1760. { .mfb
  1761. STFD [C16] = f125, SIZE
  1762. mov f88 = f0
  1763. nop __LINE__
  1764. }
  1765. ;;
  1766. { .mfi
  1767. STFD [C8 ] = f122, SIZE
  1768. mov f96 = f0
  1769. nop __LINE__
  1770. }
  1771. { .mfb
  1772. STFD [C16] = f126, SIZE
  1773. mov f104 = f0
  1774. nop __LINE__
  1775. }
  1776. ;;
  /* Three SIZE steps plus this 5 * SIZE step advance C8 and C16 by 8 elements in total. */
  1777. { .mfi
  1778. STFD [C8 ] = f123, 5 * SIZE
  1779. mov f112 = f0
  1780. nop __LINE__
  1781. }
  1782. { .mfb
  1783. STFD [C16] = f127, 5 * SIZE
  1784. mov f120 = f0 // f120 was already stored in the first bundle of this block
  1785. (p6) br.cond.dptk .L011 // taken when p6 is set (p6 = (I != 1), computed at the top of .L013)
  1786. }
  1787. ;;
  1788. #else
  1789. { .mfi
  1790. nop __LINE__
  1791. FMPY f64 = ALPHA, f64
  1792. cmp.ne p6, p0 = 1, I
  1793. }
  1794. { .mfb
  1795. nop __LINE__
  1796. FMPY f68 = ALPHA, f68
  1797. nop __LINE__
  1798. }
  1799. ;;
  1800. { .mfi
  1801. nop __LINE__
  1802. FMPY f65 = ALPHA, f65
  1803. adds I = -1, I
  1804. }
  1805. { .mfb
  1806. nop __LINE__
  1807. FMPY f69 = ALPHA, f69
  1808. nop __LINE__
  1809. }
  1810. ;;
  1811. { .mfb
  1812. nop __LINE__
  1813. FMPY f66 = ALPHA, f66
  1814. nop __LINE__
  1815. }
  1816. { .mfb
  1817. nop __LINE__
  1818. FMPY f70 = ALPHA, f70
  1819. nop __LINE__
  1820. }
  1821. ;;
  1822. { .mfb
  1823. nop __LINE__
  1824. FMPY f67 = ALPHA, f67
  1825. nop __LINE__
  1826. }
  1827. { .mfb
  1828. nop __LINE__
  1829. FMPY f71 = ALPHA, f71
  1830. nop __LINE__
  1831. }
  1832. ;;
  1833. { .mmf
  1834. STFD [C1 ] = f64, SIZE
  1835. STFD [C9 ] = f68, SIZE
  1836. FMPY f72 = ALPHA, f72
  1837. }
  1838. { .mmf
  1839. nop __LINE__
  1840. nop __LINE__
  1841. FMPY f76 = ALPHA, f76
  1842. }
  1843. ;;
  1844. { .mmf
  1845. STFD [C1 ] = f65, SIZE
  1846. STFD [C9 ] = f69, SIZE
  1847. FMPY f73 = ALPHA, f73
  1848. }
  1849. { .mmf
  1850. nop __LINE__
  1851. nop __LINE__
  1852. FMPY f77 = ALPHA, f77
  1853. }
  1854. ;;
  1855. { .mmf
  1856. STFD [C1 ] = f66, SIZE
  1857. STFD [C9 ] = f70, SIZE
  1858. FMPY f74 = ALPHA, f74
  1859. }
  1860. { .mmf
  1861. nop __LINE__
  1862. nop __LINE__
  1863. FMPY f78 = ALPHA, f78
  1864. }
  1865. ;;
  1866. { .mfb
  1867. STFD [C1 ] = f67, 5 * SIZE
  1868. FMPY f75 = ALPHA, f75
  1869. nop __LINE__
  1870. }
  1871. { .mfb
  1872. STFD [C9 ] = f71, 5 * SIZE
  1873. FMPY f79 = ALPHA, f79
  1874. nop __LINE__
  1875. }
  1876. ;;
  1877. { .mfb
  1878. STFD [C2 ] = f72, SIZE
  1879. FMPY f80 = ALPHA, f80
  1880. nop __LINE__
  1881. }
  1882. { .mfb
  1883. STFD [C10] = f76, SIZE
  1884. FMPY f84 = ALPHA, f84
  1885. nop __LINE__
  1886. }
  1887. ;;
  1888. { .mfb
  1889. STFD [C2 ] = f73, SIZE
  1890. FMPY f81 = ALPHA, f81
  1891. nop __LINE__
  1892. }
  1893. { .mfb
  1894. STFD [C10] = f77, SIZE
  1895. FMPY f85 = ALPHA, f85
  1896. nop __LINE__
  1897. }
  1898. ;;
  1899. { .mfb
  1900. STFD [C2 ] = f74, SIZE
  1901. FMPY f82 = ALPHA, f82
  1902. nop __LINE__
  1903. }
  1904. { .mfb
  1905. STFD [C10] = f78, SIZE
  1906. FMPY f86 = ALPHA, f86
  1907. nop __LINE__
  1908. }
  1909. ;;
  1910. { .mfb
  1911. STFD [C2 ] = f75, 5 * SIZE
  1912. FMPY f83 = ALPHA, f83
  1913. nop __LINE__
  1914. }
  1915. { .mfb
  1916. STFD [C10] = f79, 5 * SIZE
  1917. FMPY f87 = ALPHA, f87
  1918. nop __LINE__
  1919. }
  1920. ;;
  1921. { .mfb
  1922. STFD [C3 ] = f80, SIZE
  1923. FMPY f88 = ALPHA, f88
  1924. nop __LINE__
  1925. }
  1926. { .mfb
  1927. STFD [C11] = f84, SIZE
  1928. FMPY f92 = ALPHA, f92
  1929. nop __LINE__
  1930. }
  1931. ;;
  1932. { .mfb
  1933. STFD [C3 ] = f81, SIZE
  1934. FMPY f89 = ALPHA, f89
  1935. nop __LINE__
  1936. }
  1937. { .mfb
  1938. STFD [C11] = f85, SIZE
  1939. FMPY f93 = ALPHA, f93
  1940. nop __LINE__
  1941. }
  1942. ;;
  1943. { .mfb
  1944. STFD [C3 ] = f82, SIZE
  1945. FMPY f90 = ALPHA, f90
  1946. nop __LINE__
  1947. }
  1948. { .mfb
  1949. STFD [C11] = f86, SIZE
  1950. FMPY f94 = ALPHA, f94
  1951. nop __LINE__
  1952. }
  1953. ;;
  1954. { .mfb
  1955. STFD [C3 ] = f83, 5 * SIZE
  1956. FMPY f91 = ALPHA, f91
  1957. nop __LINE__
  1958. }
  1959. { .mfb
  1960. STFD [C11] = f87, 5 * SIZE
  1961. FMPY f95 = ALPHA, f95
  1962. nop __LINE__
  1963. }
  1964. ;;
  1965. { .mfb
  1966. STFD [C4 ] = f88, SIZE
  1967. FMPY f96 = ALPHA, f96
  1968. nop __LINE__
  1969. }
  1970. { .mfb
  1971. STFD [C12] = f92, SIZE
  1972. FMPY f100 = ALPHA, f100
  1973. nop __LINE__
  1974. }
  1975. ;;
  1976. { .mfb
  1977. STFD [C4 ] = f89, SIZE
  1978. FMPY f97 = ALPHA, f97
  1979. nop __LINE__
  1980. }
  1981. { .mfb
  1982. STFD [C12] = f93, SIZE
  1983. FMPY f101 = ALPHA, f101
  1984. nop __LINE__
  1985. }
  1986. ;;
  1987. { .mfb
  1988. STFD [C4 ] = f90, SIZE
  1989. FMPY f98 = ALPHA, f98
  1990. nop __LINE__
  1991. }
  1992. { .mfb
  1993. STFD [C12] = f94, SIZE
  1994. FMPY f102 = ALPHA, f102
  1995. nop __LINE__
  1996. }
  1997. ;;
  1998. { .mfb
  1999. STFD [C4 ] = f91, 5 * SIZE
  2000. FMPY f99 = ALPHA, f99
  2001. nop __LINE__
  2002. }
  2003. { .mfb
  2004. STFD [C12] = f95, 5 * SIZE
  2005. FMPY f103 = ALPHA, f103
  2006. nop __LINE__
  2007. }
  2008. ;;
  2009. { .mfb
  2010. STFD [C5 ] = f96, SIZE
  2011. FMPY f104 = ALPHA, f104
  2012. nop __LINE__
  2013. }
  2014. { .mfb
  2015. STFD [C13] = f100, SIZE
  2016. FMPY f108 = ALPHA, f108
  2017. nop __LINE__
  2018. }
  2019. ;;
  2020. { .mfb
  2021. STFD [C5 ] = f97, SIZE
  2022. FMPY f105 = ALPHA, f105
  2023. nop __LINE__
  2024. }
  2025. { .mfb
  2026. STFD [C13] = f101, SIZE
  2027. FMPY f109 = ALPHA, f109
  2028. nop __LINE__
  2029. }
  2030. ;;
  2031. { .mfb
  2032. STFD [C5 ] = f98, SIZE
  2033. FMPY f106 = ALPHA, f106
  2034. nop __LINE__
  2035. }
  2036. { .mfb
  2037. STFD [C13] = f102, SIZE
  2038. FMPY f110 = ALPHA, f110
  2039. nop __LINE__
  2040. }
  2041. ;;
  2042. { .mfb
  2043. STFD [C5 ] = f99, 5 * SIZE
  2044. FMPY f107 = ALPHA, f107
  2045. nop __LINE__
  2046. }
  2047. { .mfb
  2048. STFD [C13] = f103, 5 * SIZE
  2049. FMPY f111 = ALPHA, f111
  2050. nop __LINE__
  2051. }
  2052. ;;
  2053. { .mfb
  2054. STFD [C6 ] = f104, SIZE
  2055. FMPY f112 = ALPHA, f112
  2056. nop __LINE__
  2057. }
  2058. { .mfb
  2059. STFD [C14] = f108, SIZE
  2060. FMPY f116 = ALPHA, f116
  2061. nop __LINE__
  2062. }
  2063. ;;
  2064. { .mfb
  2065. STFD [C6 ] = f105, SIZE
  2066. FMPY f113 = ALPHA, f113
  2067. nop __LINE__
  2068. }
  2069. { .mfb
  2070. STFD [C14] = f109, SIZE
  2071. FMPY f117 = ALPHA, f117
  2072. nop __LINE__
  2073. }
  2074. ;;
  2075. { .mfb
  2076. STFD [C6 ] = f106, SIZE
  2077. FMPY f114 = ALPHA, f114
  2078. nop __LINE__
  2079. }
  2080. { .mfb
  2081. STFD [C14] = f110, SIZE
  2082. FMPY f118 = ALPHA, f118
  2083. nop __LINE__
  2084. }
  2085. ;;
  2086. { .mfb
  2087. STFD [C6 ] = f107, 5 * SIZE
  2088. FMPY f115 = ALPHA, f115
  2089. nop __LINE__
  2090. }
  2091. { .mfb
  2092. STFD [C14] = f111, 5 * SIZE
  2093. FMPY f119 = ALPHA, f119
  2094. nop __LINE__
  2095. }
  2096. ;;
  2097. { .mfb
  2098. STFD [C7 ] = f112, SIZE
  2099. FMPY f120 = ALPHA, f120
  2100. nop __LINE__
  2101. }
  2102. { .mfb
  2103. STFD [C15] = f116, SIZE
  2104. FMPY f124 = ALPHA, f124
  2105. nop __LINE__
  2106. }
  2107. ;;
  2108. { .mfb
  2109. STFD [C7 ] = f113, SIZE
  2110. FMPY f121 = ALPHA, f121
  2111. nop __LINE__
  2112. }
  2113. { .mfb
  2114. STFD [C15] = f117, SIZE
  2115. FMPY f125 = ALPHA, f125
  2116. nop __LINE__
  2117. }
  2118. ;;
  /*
   * Tail of the #else write-back arm (TRMMKERNEL or BETAZERO defined): results
   * are ALPHA * acc (FMPY) and no value of C is added.
   * The I slots of these bundles carry the TRMM bookkeeping; when the relevant
   * macros are not defined they hold a nop instead. In order:
   *   L       = K - KK
   *   L       = L - 8
   *   KK8     = L << BASE_SHIFT
   *   AOFFSET = AOFFSET + (KK8 << 3)
   *   BOFFSET = BOFFSET + (KK8 << 3)
   *   KK      = KK + 8                (LEFT only)
   *   KK8     = KK << BASE_SHIFT      (any TRMMKERNEL build)
   * The F slots of the last four groups reset f64, f72, ..., f120 to zero.
   */
  2119. { .mfi
  2120. STFD [C7 ] = f114, SIZE
  2121. FMPY f122 = ALPHA, f122
  2122. #if defined(TRMMKERNEL) && \
  2123. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2124. sub L = K, KK // L = K - KK
  2125. #else
  2126. nop __LINE__
  2127. #endif
  2128. }
  2129. { .mfb
  2130. STFD [C15] = f118, SIZE
  2131. FMPY f126 = ALPHA, f126
  2132. nop __LINE__
  2133. }
  2134. ;;
  /* Exactly one of the two "adds L = -8, L" below is assembled: LEFT+TRANSA or neither. */
  2135. { .mfi
  2136. STFD [C7 ] = f115, 5 * SIZE
  2137. FMPY f123 = ALPHA, f123
  2138. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  2139. adds L = -8, L // L = L - 8
  2140. #else
  2141. nop __LINE__
  2142. #endif
  2143. }
  2144. { .mfi
  2145. STFD [C15] = f119, 5 * SIZE
  2146. FMPY f127 = ALPHA, f127
  2147. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  2148. adds L = -8, L // L = L - 8
  2149. #else
  2150. nop __LINE__
  2151. #endif
  2152. }
  2153. ;;
  2154. { .mfi
  2155. STFD [C8 ] = f120, SIZE
  2156. mov f64 = f0
  2157. #if defined(TRMMKERNEL) && \
  2158. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2159. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = (L << BASE_SHIFT) + 0
  2160. #else
  2161. nop __LINE__
  2162. #endif
  2163. }
  2164. { .mfb
  2165. STFD [C16] = f124, SIZE
  2166. mov f72 = f0
  2167. nop __LINE__
  2168. }
  2169. ;;
  2170. { .mfi
  2171. STFD [C8 ] = f121, SIZE
  2172. mov f80 = f0
  2173. #if defined(TRMMKERNEL) && \
  2174. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2175. shladd AOFFSET = KK8, 3, AOFFSET // AOFFSET += KK8 * 8
  2176. #else
  2177. nop __LINE__
  2178. #endif
  2179. }
  2180. { .mfi
  2181. STFD [C16] = f125, SIZE
  2182. mov f88 = f0
  2183. #if defined(TRMMKERNEL) && \
  2184. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2185. shladd BOFFSET = KK8, 3, BOFFSET // BOFFSET += KK8 * 8
  2186. #else
  2187. nop __LINE__
  2188. #endif
  2189. }
  2190. ;;
  2191. { .mfi
  2192. STFD [C8 ] = f122, SIZE
  2193. mov f96 = f0
  2194. #if defined(TRMMKERNEL) && defined(LEFT)
  2195. adds KK = 8, KK // KK = KK + 8
  2196. #else
  2197. nop __LINE__
  2198. #endif
  2199. }
  2200. { .mfb
  2201. STFD [C16] = f126, SIZE
  2202. mov f104 = f0
  2203. nop __LINE__
  2204. }
  2205. ;;
  2206. { .mfi
  2207. STFD [C8 ] = f123, 5 * SIZE
  2208. mov f112 = f0
  2209. #ifdef TRMMKERNEL
  2210. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT, recomputed from the current KK
  2211. #else
  2212. nop __LINE__
  2213. #endif
  2214. }
  2215. { .mfb
  2216. STFD [C16] = f127, 5 * SIZE
  2217. mov f120 = f0
  2218. (p6) br.cond.dptk .L011 // taken when p6 is set (p6 = (I != 1), computed at the top of .L013)
  2219. }
  2220. ;;
  2221. #endif
  /*
   * .L020: setup for the .L022 loop.
   * NOTE: the "#if 0" directly below compiles this code out; its matching #endif
   * is not inside this block.
   * Steps performed here:
   *   - p3 is forced true; p6/p7 come from bit 2 of M, and the code branches to
   *     .L030 when that bit is clear.
   *   - B (and, in one TRMM variant, A) pointers are positioned and the first
   *     values are loaded into f48-f55 (from B) and f32-f35 (from A).
   *   - accumulators in f64-f123 are zeroed, partly with "mov fX = f0" (F slot)
   *     and partly with "setf.d fX = r0" (M slot).
   *   - the loop counter is derived as ar.lc = ((L + 1) >> 1) - 1, with p12
   *     recording that bit 0 of (L + 1) is clear.
   */
  2222. .L020:
  2223. #if 0
  2224. { .mfi
  2225. cmp.eq p3, p0 = r0, r0 // r0 == r0 always holds, so p3 = 1
  2226. mov f89 = f0
  2227. tbit.z p6, p7 = M, 2 // p6 = (bit 2 of M is 0), p7 = the complement
  2228. }
  2229. { .mfb
  2230. #ifndef TRMMKERNEL
  2231. nop __LINE__
  2232. #else
  2233. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2234. sub L = K, KK // L = K - KK
  2235. #elif defined(LEFT)
  2236. adds L = 4, KK // L = KK + 4
  2237. #else
  2238. adds L = 8, KK // L = KK + 8
  2239. #endif
  2240. #endif
  2241. mov f81 = f0
  2242. (p6) br.cond.dptk .L030 // bit 2 of M clear: skip to .L030
  2243. }
  2244. ;;
  /* First arm: B is read from its start. Second arm: B and A are first offset by KK8. */
  2245. #if !defined(TRMMKERNEL) || \
  2246. defined(TRMMKERNEL) && \
  2247. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  2248. { .mfi
  2249. LDFPD f48, f49 = [B]
  2250. mov f65 = f0
  2251. nop __LINE__
  2252. }
  2253. { .mfi
  2254. adds BOFFSET = 2 * SIZE, B // BOFFSET = B + 2 elements (past the pair just loaded)
  2255. mov f73 = f0
  2256. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // prefetch pointer PREFETCHSIZE elements beyond AOFFSET
  2257. }
  2258. ;;
  2259. #else
  2260. { .mfi
  2261. shladd BOFFSET = KK8, 3, B // BOFFSET = B + KK8 * 8
  2262. mov f65 = f0
  2263. shladd AOFFSET = KK8, 2, AOFFSET // AOFFSET += KK8 * 4
  2264. }
  2265. ;;
  2266. { .mfi
  2267. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2268. mov f73 = f0
  2269. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // prefetch pointer PREFETCHSIZE elements beyond AOFFSET
  2270. }
  2271. ;;
  2272. #endif
  2273. { .mmf
  2274. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2275. setf.d f97 = r0 // r0 is the constant zero register, so this writes 0.0
  2276. mov f105 = f0
  2277. }
  2278. { .mfi
  2279. setf.d f113 = r0
  2280. mov f121 = f0
  /* L = (K or the TRMM length computed above) + 1, so the shift by one below rounds up. */
  2281. #ifndef TRMMKERNEL
  2282. adds L = 1, K
  2283. #else
  2284. adds L = 1, L
  2285. #endif
  2286. }
  2287. ;;
  2288. { .mmf
  2289. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2290. setf.d f66 = r0
  2291. mov f74 = f0
  2292. }
  2293. { .mfi
  2294. setf.d f82 = r0
  2295. mov f90 = f0
  2296. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L is 0), tested before the shift below
  2297. }
  2298. ;;
  2299. { .mmf
  2300. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2301. setf.d f98 = r0
  2302. mov f106 = f0
  2303. }
  2304. { .mfi
  2305. setf.d f114 = r0
  2306. mov f122 = f0
  2307. shr L = L, 1 // L = L >> 1
  2308. }
  2309. ;;
  2310. { .mfi
  2311. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2312. mov f75 = f0
  2313. adds L = -1, L // L = L - 1; this value goes into ar.lc below
  2314. }
  2315. { .mmf
  2316. setf.d f67 = r0
  2317. setf.d f83 = r0
  2318. mov f91 = f0
  2319. }
  2320. ;;
  2321. { .mfi
  2322. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2323. mov f107 = f0
  2324. mov ar.lc = L // loop count register consumed by the br.cloop that closes .L022
  2325. }
  2326. { .mmf
  2327. setf.d f99 = r0
  2328. setf.d f115 = r0
  2329. mov f123 = f0
  2330. }
  2331. ;;
  2332. .align 32
  /*
   * .L022: entry of the inner loop (closed further down by "br.cloop ... .L022").
   * Every FMA adds one product of an A value and a B value into its accumulator;
   * the trailing "// An * Bm" tags name the operands.
   * The first two groups also derive the predicates used by the rest of the body:
   *   p3 - updated only when p12 is set: p3 = (L != 0)
   *   p4 - L != 0
   *   p5 - L == 0
   */
  2333. .L022:
  2334. { .mfi
  2335. lfetch.nt1 [PREA], 16 * SIZE
  2336. FMA f64 = f32, f48, f64 // A1 * B1
  2337. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  2338. }
  2339. { .mfi
  2340. nop __LINE__
  2341. FMA f72 = f32, f49, f72 // A1 * B2
  2342. (p12) cmp.ne p3, p0 = 0, L
  2343. }
  2344. ;;
  2345. { .mfi
  2346. lfetch.nt1 [PREB], 16 * SIZE
  2347. FMA f80 = f32, f50, f80 // A1 * B3
  2348. cmp.ne p4, p5 = 0, L
  2349. }
  2350. { .mfb
  2351. nop __LINE__
  2352. FMA f88 = f32, f51, f88 // A1 * B4
  2353. nop __LINE__
  2354. }
  2355. ;;
  /* Under p3: fetch the next two A values. Under p5: C9 = C1 + 2 elements, C10 = C2 + 2 elements. */
  2356. { .mfi
  2357. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  2358. FMA f96 = f32, f52, f96 // A1 * B5
  2359. (p5) adds C9 = 2 * SIZE, C1
  2360. }
  2361. { .mfi
  2362. nop __LINE__
  2363. FMA f104 = f32, f53, f104 // A1 * B6
  2364. (p5) adds C10 = 2 * SIZE, C2
  2365. }
  2366. ;;
  2367. { .mfi
  2368. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  2369. FMA f112 = f32, f54, f112 // A1 * B7
  2370. (p5) adds C11 = 2 * SIZE, C3
  2371. }
  2372. { .mfi
  2373. nop __LINE__
  2374. FMA f120 = f32, f55, f120 // A1 * B8
  2375. (p5) adds C12 = 2 * SIZE, C4
  2376. }
  2377. ;;
  2378. { .mfi
  2379. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2380. FMA f65 = f33, f48, f65 // A2 * B1
  2381. (p5) adds C13 = 2 * SIZE, C5
  2382. }
  2383. { .mfi
  2384. nop __LINE__
  2385. FMA f73 = f33, f49, f73 // A2 * B2
  2386. (p5) adds C14 = 2 * SIZE, C6
  2387. }
  2388. ;;
  2389. { .mfi
  2390. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2391. FMA f81 = f33, f50, f81 // A2 * B3
  2392. (p5) adds C15 = 2 * SIZE, C7
  2393. }
  2394. { .mfi
  2395. nop __LINE__
  2396. FMA f89 = f33, f51, f89 // A2 * B4
  2397. (p5) adds C16 = 2 * SIZE, C8
  2398. }
  2399. ;;
  2400. { .mfb
  2401. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2402. FMA f97 = f33, f52, f97 // A2 * B5
  2403. nop __LINE__
  2404. }
  2405. { .mfb
  2406. nop __LINE__
  2407. FMA f105 = f33, f53, f105 // A2 * B6
  2408. nop __LINE__
  2409. }
  2410. ;;
  2411. { .mfb
  2412. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  2413. FMA f113 = f33, f54, f113 // A2 * B7
  2414. nop __LINE__
  2415. }
  2416. { .mfb
  2417. nop __LINE__
  2418. FMA f121 = f33, f55, f121 // A2 * B8
  2419. nop __LINE__
  2420. }
  2421. ;;
  2422. { .mfb
  2423. nop __LINE__
  2424. FMA f66 = f34, f48, f66 // A3 * B1
  2425. nop __LINE__
  2426. }
  2427. { .mfb
  2428. nop __LINE__
  2429. FMA f74 = f34, f49, f74 // A3 * B2
  2430. nop __LINE__
  2431. }
  2432. ;;
  2433. { .mfb
  2434. nop __LINE__
  2435. FMA f82 = f34, f50, f82 // A3 * B3
  2436. nop __LINE__
  2437. }
  2438. { .mfb
  2439. nop __LINE__
  2440. FMA f90 = f34, f51, f90 // A3 * B4
  2441. nop __LINE__
  2442. }
  2443. ;;
  2444. { .mfb
  2445. nop __LINE__
  2446. FMA f98 = f34, f52, f98 // A3 * B5
  2447. nop __LINE__
  2448. }
  2449. { .mfb
  2450. nop __LINE__
  2451. FMA f106 = f34, f53, f106 // A3 * B6
  2452. nop __LINE__
  2453. }
  2454. { .mfb
  2455. nop __LINE__
  2456. FMA f114 = f34, f54, f114 // A3 * B7
  2457. nop __LINE__
  2458. }
  2459. { .mfb
  2460. nop __LINE__
  2461. FMA f122 = f34, f55, f122 // A3 * B8
  2462. nop __LINE__
  2463. }
  2464. { .mfb
  2465. nop __LINE__
  2466. FMA f67 = f35, f48, f67 // A4 * B1
  2467. nop __LINE__
  2468. }
  2469. { .mfb
  2470. nop __LINE__
  2471. FMA f75 = f35, f49, f75 // A4 * B2
  2472. nop __LINE__
  2473. }
  2474. { .mfb
  2475. nop __LINE__
  2476. FMA f83 = f35, f50, f83 // A4 * B3
  2477. nop __LINE__
  2478. }
  2479. { .mfb
  2480. nop __LINE__
  2481. FMA f91 = f35, f51, f91 // A4 * B4
  2482. nop __LINE__
  2483. }
  2484. { .mfb
  2485. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2486. FMA f99 = f35, f52, f99 // A4 * B5
  2487. nop __LINE__
  2488. }
  2489. { .mfb
  2490. nop __LINE__
  2491. FMA f107 = f35, f53, f107 // A4 * B6
  2492. nop __LINE__
  2493. }
  2494. { .mfb
  2495. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2496. FMA f115 = f35, f54, f115 // A4 * B7
  2497. nop __LINE__
  2498. }
  2499. { .mfb
  2500. nop __LINE__
  2501. FMA f123 = f35, f55, f123 // A4 * B8
  2502. nop __LINE__
  2503. }
  2504. ;;
  2505. { .mfb
  2506. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2507. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2508. nop __LINE__
  2509. }
  2510. { .mfb
  2511. nop __LINE__
  2512. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  2513. nop __LINE__
  2514. }
  2515. ;;
  2516. { .mfb
  2517. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2518. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2519. nop __LINE__
  2520. }
  2521. { .mfb
  2522. nop __LINE__
  2523. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  2524. nop __LINE__
  2525. }
  2526. ;;
  2527. { .mfb
  2528. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2529. (p5) LDFD f68 = [C1 ], SIZE
  2530. #else
  2531. nop __LINE__
  2532. #endif
  2533. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2534. nop __LINE__
  2535. }
  2536. { .mfb
  2537. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2538. (p5) LDFD f70 = [C9 ], SIZE
  2539. #else
  2540. nop __LINE__
  2541. #endif
  2542. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  2543. nop __LINE__
  2544. }
  2545. ;;
  2546. { .mfb
  2547. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2548. (p5) LDFD f69 = [C1 ], -1 * SIZE
  2549. #else
  2550. nop __LINE__
  2551. #endif
  2552. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2553. nop __LINE__
  2554. }
  2555. { .mfb
  2556. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2557. (p5) LDFD f71 = [C9 ], -1 * SIZE
  2558. #else
  2559. nop __LINE__
  2560. #endif
  2561. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  2562. nop __LINE__
  2563. }
  2564. ;;
  2565. { .mfb
  2566. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2567. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2568. nop __LINE__
  2569. }
  2570. { .mfb
  2571. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  2572. nop __LINE__
  2573. }
  2574. { .mfb
  2575. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2576. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2577. nop __LINE__
  2578. }
  2579. { .mfb
  2580. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  2581. nop __LINE__
  2582. }
  2583. ;;
  2584. { .mfb
  2585. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2586. (p5) LDFD f76 = [C2 ], SIZE
  2587. #else
  2588. nop __LINE__
  2589. #endif
  2590. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2591. nop __LINE__
  2592. }
  2593. { .mfb
  2594. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2595. (p5) LDFD f78 = [C10], SIZE
  2596. #else
  2597. nop __LINE__
  2598. #endif
  2599. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  2600. nop __LINE__
  2601. }
  2602. ;;
  2603. { .mfb
  2604. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2605. (p5) LDFD f77 = [C2 ], -1 * SIZE
  2606. #else
  2607. nop __LINE__
  2608. #endif
  2609. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2610. nop __LINE__
  2611. }
  2612. { .mfb
  2613. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2614. (p5) LDFD f79 = [C10], -1 * SIZE
  2615. #else
  2616. nop __LINE__
  2617. #endif
  2618. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  2619. nop __LINE__
  2620. }
  2621. ;;
  2622. { .mfb
  2623. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2624. (p5) LDFD f84 = [C3 ], SIZE
  2625. #else
  2626. nop __LINE__
  2627. #endif
  2628. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  2629. nop __LINE__
  2630. }
  2631. { .mfb
  2632. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2633. (p5) LDFD f86 = [C11], SIZE
  2634. #else
  2635. nop __LINE__
  2636. #endif
  2637. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  2638. nop __LINE__
  2639. }
  2640. ;;
  2641. { .mfb
  2642. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2643. (p5) LDFD f85 = [C3 ], -1 * SIZE
  2644. #else
  2645. nop __LINE__
  2646. #endif
  2647. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  2648. nop __LINE__
  2649. }
  2650. { .mfb
  2651. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2652. (p5) LDFD f87 = [C11], -1 * SIZE
  2653. #else
  2654. nop __LINE__
  2655. #endif
  2656. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  2657. nop __LINE__
  2658. }
  2659. ;;
  2660. { .mfb
  2661. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2662. (p5) LDFD f92 = [C4 ], SIZE
  2663. #else
  2664. nop __LINE__
  2665. #endif
  2666. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  2667. nop __LINE__
  2668. }
  2669. { .mfb
  2670. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2671. (p5) LDFD f94 = [C12], SIZE
  2672. #else
  2673. nop __LINE__
  2674. #endif
  2675. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  2676. nop __LINE__
  2677. }
  2678. ;;
  2679. { .mfb
  2680. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2681. (p5) LDFD f93 = [C4 ], -1 * SIZE
  2682. #else
  2683. nop __LINE__
  2684. #endif
  2685. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  2686. nop __LINE__
  2687. }
  2688. { .mfb
  2689. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2690. (p5) LDFD f95 = [C12], -1 * SIZE
  2691. #else
  2692. nop __LINE__
  2693. #endif
  2694. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  2695. nop __LINE__
  2696. }
  2697. ;;
  2698. { .mfb
  2699. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2700. (p5) LDFD f100 = [C5 ], SIZE
  2701. #else
  2702. nop __LINE__
  2703. #endif
  2704. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  2705. nop __LINE__
  2706. }
  2707. { .mfb
  2708. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2709. (p5) LDFD f102 = [C13], SIZE
  2710. #else
  2711. nop __LINE__
  2712. #endif
  2713. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  2714. nop __LINE__
  2715. }
  2716. ;;
  2717. { .mfb
  2718. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2719. (p5) LDFD f101 = [C5 ], -1 * SIZE
  2720. #else
  2721. nop __LINE__
  2722. #endif
  2723. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  2724. nop __LINE__
  2725. }
  2726. { .mfb
  2727. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2728. (p5) LDFD f103 = [C13], -1 * SIZE
  2729. #else
  2730. nop __LINE__
  2731. #endif
  2732. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  2733. nop __LINE__
  2734. }
  2735. ;;
  2736. { .mfb
  2737. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2738. (p5) LDFD f108 = [C6 ], SIZE
  2739. #else
  2740. nop __LINE__
  2741. #endif
  2742. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  2743. nop __LINE__
  2744. }
  2745. { .mfb
  2746. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2747. (p5) LDFD f110 = [C14], SIZE
  2748. #else
  2749. nop __LINE__
  2750. #endif
  2751. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  2752. nop __LINE__
  2753. }
  2754. ;;
  2755. { .mfi
  2756. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2757. (p5) LDFD f109 = [C6 ], -1 * SIZE
  2758. #else
  2759. nop __LINE__
  2760. #endif
  2761. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  2762. adds L = -1, L
  2763. }
  2764. { .mfb
  2765. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  2766. (p5) LDFD f111 = [C14], -1 * SIZE
  2767. #else
  2768. nop __LINE__
  2769. #endif
  2770. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  2771. br.cloop.sptk.few .L022
  2772. }
  2773. ;;
  2774. .L028:
  // Write-back of the 4 x 8 accumulator tile (A1..A4 times B1..B8 in the
  // loop comments): f64-f67, f72-f75, ..., f120-f123.
  // Each of the pointers C1..C8 and C9..C16 receives two values, stored with
  // post-increments of SIZE and then 3 * SIZE (4 * SIZE in total).
  // One stop-delimited group holds two bundles, so two C pointers are
  // serviced per group while the F slots scale the next results.
  2775. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Path 1: C is read. The last C loads (f116-f119, f124-f127) are issued
  // here; f68-f111 are not loaded in this block and are presumably filled by
  // the preceding loop. Assuming FMA d = a, b, c computes a * b + c, every
  // result becomes ALPHA * acc + C.
  2776. { .mfb
  2777. LDFD f116 = [C7 ], SIZE
  2778. FMA f64 = ALPHA, f64, f68
  2779. nop __LINE__
  2780. }
  2781. { .mfb
  2782. LDFD f118 = [C15], SIZE
  2783. FMA f66 = ALPHA, f66, f70
  2784. nop __LINE__
  2785. }
  2786. ;;
  2787. { .mfb
  2788. LDFD f117 = [C7 ], -1 * SIZE // step back so C7 points at its first element again
  2789. FMA f65 = ALPHA, f65, f69
  2790. nop __LINE__
  2791. }
  2792. { .mfb
  2793. LDFD f119 = [C15], -1 * SIZE
  2794. FMA f67 = ALPHA, f67, f71
  2795. nop __LINE__
  2796. }
  2797. ;;
  2798. { .mfb
  2799. LDFD f124 = [C8], SIZE
  2800. FMA f72 = ALPHA, f72, f76
  2801. nop __LINE__
  2802. }
  2803. { .mfb
  2804. LDFD f126 = [C16], SIZE
  2805. FMA f74 = ALPHA, f74, f78
  2806. nop __LINE__
  2807. }
  2808. ;;
  2809. { .mfb
  2810. LDFD f125 = [C8], -1 * SIZE
  2811. FMA f73 = ALPHA, f73, f77
  2812. nop __LINE__
  2813. }
  2814. { .mfb
  2815. LDFD f127 = [C16], -1 * SIZE
  2816. FMA f75 = ALPHA, f75, f79
  2817. nop __LINE__
  2818. }
  2819. ;;
  2820. { .mfb
  2821. STFD [C1 ] = f64, SIZE // stores start here, one stage behind the FMAs that produce them
  2822. FMA f80 = ALPHA, f80, f84
  2823. nop __LINE__
  2824. }
  2825. { .mfb
  2826. STFD [C9 ] = f66, SIZE
  2827. FMA f82 = ALPHA, f82, f86
  2828. nop __LINE__
  2829. }
  2830. ;;
  2831. { .mfb
  2832. STFD [C1 ] = f65, 3 * SIZE // second value for C1; pointer ends 4 * SIZE past its start
  2833. FMA f81 = ALPHA, f81, f85
  2834. nop __LINE__
  2835. }
  2836. { .mfb
  2837. STFD [C9 ] = f67, 3 * SIZE
  2838. FMA f83 = ALPHA, f83, f87
  2839. nop __LINE__
  2840. }
  2841. ;;
  2842. { .mfb
  2843. STFD [C2 ] = f72, SIZE
  2844. FMA f88 = ALPHA, f88, f92
  2845. nop __LINE__
  2846. }
  2847. { .mfb
  2848. STFD [C10] = f74, SIZE
  2849. FMA f90 = ALPHA, f90, f94
  2850. nop __LINE__
  2851. }
  2852. ;;
  2853. { .mfb
  2854. STFD [C2 ] = f73, 3 * SIZE
  2855. FMA f89 = ALPHA, f89, f93
  2856. nop __LINE__
  2857. }
  2858. { .mfb
  2859. STFD [C10] = f75, 3 * SIZE
  2860. FMA f91 = ALPHA, f91, f95
  2861. nop __LINE__
  2862. }
  2863. ;;
  2864. { .mfb
  2865. STFD [C3 ] = f80, SIZE
  2866. FMA f96 = ALPHA, f96, f100
  2867. nop __LINE__
  2868. }
  2869. { .mfb
  2870. STFD [C11] = f82, SIZE
  2871. FMA f98 = ALPHA, f98, f102
  2872. nop __LINE__
  2873. }
  2874. ;;
  2875. { .mfb
  2876. STFD [C3 ] = f81, 3 * SIZE
  2877. FMA f97 = ALPHA, f97, f101
  2878. nop __LINE__
  2879. }
  2880. { .mfb
  2881. STFD [C11] = f83, 3 * SIZE
  2882. FMA f99 = ALPHA, f99, f103
  2883. nop __LINE__
  2884. }
  2885. ;;
  2886. { .mfb
  2887. STFD [C4 ] = f88, SIZE
  2888. FMA f104 = ALPHA, f104, f108
  2889. nop __LINE__
  2890. }
  2891. { .mfb
  2892. STFD [C12] = f90, SIZE
  2893. FMA f106 = ALPHA, f106, f110
  2894. nop __LINE__
  2895. }
  2896. ;;
  2897. { .mfb
  2898. STFD [C4 ] = f89, 3 * SIZE
  2899. FMA f105 = ALPHA, f105, f109
  2900. nop __LINE__
  2901. }
  2902. { .mfb
  2903. STFD [C12] = f91, 3 * SIZE
  2904. FMA f107 = ALPHA, f107, f111
  2905. nop __LINE__
  2906. }
  2907. ;;
  2908. { .mfb
  2909. STFD [C5 ] = f96, SIZE
  2910. FMA f112 = ALPHA, f112, f116 // f116-f119 were loaded at the top of this block
  2911. nop __LINE__
  2912. }
  2913. { .mfb
  2914. STFD [C13] = f98, SIZE
  2915. FMA f114 = ALPHA, f114, f118
  2916. nop __LINE__
  2917. }
  2918. ;;
  2919. { .mfb
  2920. STFD [C5 ] = f97, 3 * SIZE
  2921. FMA f113 = ALPHA, f113, f117
  2922. nop __LINE__
  2923. }
  2924. { .mfb
  2925. STFD [C13] = f99, 3 * SIZE
  2926. FMA f115 = ALPHA, f115, f119
  2927. nop __LINE__
  2928. }
  2929. ;;
  2930. { .mfb
  2931. STFD [C6 ] = f104, SIZE
  2932. FMA f120 = ALPHA, f120, f124
  2933. nop __LINE__
  2934. }
  2935. { .mfb
  2936. STFD [C14] = f106, SIZE
  2937. FMA f122 = ALPHA, f122, f126
  2938. nop __LINE__
  2939. }
  2940. ;;
  2941. { .mfb
  2942. STFD [C6 ] = f105, 3 * SIZE
  2943. FMA f121 = ALPHA, f121, f125
  2944. nop __LINE__
  2945. }
  2946. { .mfb
  2947. STFD [C14] = f107, 3 * SIZE
  2948. FMA f123 = ALPHA, f123, f127
  2949. nop __LINE__
  2950. }
  2951. ;;
  2952. { .mfb
  2953. STFD [C7 ] = f112, SIZE
  2954. mov f64 = f0 // F slots now clear f64, f72, ..., f120 (f0 reads as 0.0 on IA-64)
  2955. nop __LINE__
  2956. }
  2957. { .mfb
  2958. STFD [C15] = f114, SIZE
  2959. mov f72 = f0
  2960. nop __LINE__
  2961. }
  2962. ;;
  2963. { .mfb
  2964. STFD [C7 ] = f113, 3 * SIZE
  2965. mov f80 = f0
  2966. nop __LINE__
  2967. }
  2968. { .mfb
  2969. STFD [C15] = f115, 3 * SIZE
  2970. mov f88 = f0
  2971. nop __LINE__
  2972. }
  2973. ;;
  2974. { .mfb
  2975. STFD [C8 ] = f120, SIZE
  2976. mov f96 = f0
  2977. nop __LINE__
  2978. }
  2979. { .mfb
  2980. STFD [C16] = f122, SIZE
  2981. mov f104 = f0
  2982. nop __LINE__
  2983. }
  2984. ;;
  2985. { .mfb
  2986. STFD [C8 ] = f121, 3 * SIZE
  2987. mov f112 = f0
  2988. nop __LINE__
  2989. }
  2990. { .mfb
  2991. STFD [C16] = f123, 3 * SIZE
  2992. mov f120 = f0
  2993. nop __LINE__
  2994. }
  2995. ;;
  2996. #else
  // Path 2 (TRMMKERNEL or BETAZERO defined): C is never read; each result is
  // ALPHA * acc (assuming FMPY d = a, b is a plain multiply). The I slots
  // carry the TRMM offset bookkeeping for KK, KK8, AOFFSET and BOFFSET.
  // NOTE(review): the first eight bundles below list only two instructions
  // for a three-slot .mfb template - confirm the assembler pads the M slot.
  2997. { .mfb
  2998. FMPY f64 = ALPHA, f64
  2999. nop __LINE__
  3000. }
  3001. { .mfb
  3002. FMPY f66 = ALPHA, f66
  3003. nop __LINE__
  3004. }
  3005. ;;
  3006. { .mfb
  3007. FMPY f65 = ALPHA, f65
  3008. nop __LINE__
  3009. }
  3010. { .mfb
  3011. FMPY f67 = ALPHA, f67
  3012. nop __LINE__
  3013. }
  3014. ;;
  3015. { .mfb
  3016. FMPY f72 = ALPHA, f72
  3017. nop __LINE__
  3018. }
  3019. { .mfb
  3020. FMPY f74 = ALPHA, f74
  3021. nop __LINE__
  3022. }
  3023. ;;
  3024. { .mfb
  3025. FMPY f73 = ALPHA, f73
  3026. nop __LINE__
  3027. }
  3028. { .mfb
  3029. FMPY f75 = ALPHA, f75
  3030. nop __LINE__
  3031. }
  3032. ;;
  3033. { .mfb
  3034. STFD [C1 ] = f64, SIZE
  3035. FMPY f80 = ALPHA, f80
  3036. nop __LINE__
  3037. }
  3038. { .mfb
  3039. STFD [C9 ] = f66, SIZE
  3040. FMPY f82 = ALPHA, f82
  3041. nop __LINE__
  3042. }
  3043. ;;
  3044. { .mfb
  3045. STFD [C1 ] = f65, 3 * SIZE
  3046. FMPY f81 = ALPHA, f81
  3047. nop __LINE__
  3048. }
  3049. { .mfb
  3050. STFD [C9 ] = f67, 3 * SIZE
  3051. FMPY f83 = ALPHA, f83
  3052. nop __LINE__
  3053. }
  3054. ;;
  3055. { .mfb
  3056. STFD [C2 ] = f72, SIZE
  3057. FMPY f88 = ALPHA, f88
  3058. nop __LINE__
  3059. }
  3060. { .mfb
  3061. STFD [C10] = f74, SIZE
  3062. FMPY f90 = ALPHA, f90
  3063. nop __LINE__
  3064. }
  3065. ;;
  3066. { .mfb
  3067. STFD [C2 ] = f73, 3 * SIZE
  3068. FMPY f89 = ALPHA, f89
  3069. nop __LINE__
  3070. }
  3071. { .mfb
  3072. STFD [C10] = f75, 3 * SIZE
  3073. FMPY f91 = ALPHA, f91
  3074. nop __LINE__
  3075. }
  3076. ;;
  3077. { .mfb
  3078. STFD [C3 ] = f80, SIZE
  3079. FMPY f96 = ALPHA, f96
  3080. nop __LINE__
  3081. }
  3082. { .mfb
  3083. STFD [C11] = f82, SIZE
  3084. FMPY f98 = ALPHA, f98
  3085. nop __LINE__
  3086. }
  3087. ;;
  3088. { .mfb
  3089. STFD [C3 ] = f81, 3 * SIZE
  3090. FMPY f97 = ALPHA, f97
  3091. nop __LINE__
  3092. }
  3093. { .mfb
  3094. STFD [C11] = f83, 3 * SIZE
  3095. FMPY f99 = ALPHA, f99
  3096. nop __LINE__
  3097. }
  3098. ;;
  3099. { .mfb
  3100. STFD [C4 ] = f88, SIZE
  3101. FMPY f104 = ALPHA, f104
  3102. nop __LINE__
  3103. }
  3104. { .mfb
  3105. STFD [C12] = f90, SIZE
  3106. FMPY f106 = ALPHA, f106
  3107. nop __LINE__
  3108. }
  3109. ;;
  3110. { .mfb
  3111. STFD [C4 ] = f89, 3 * SIZE
  3112. FMPY f105 = ALPHA, f105
  3113. nop __LINE__
  3114. }
  3115. { .mfb
  3116. STFD [C12] = f91, 3 * SIZE
  3117. FMPY f107 = ALPHA, f107
  3118. nop __LINE__
  3119. }
  3120. ;;
  3121. { .mfb
  3122. STFD [C5 ] = f96, SIZE
  3123. FMPY f112 = ALPHA, f112
  3124. nop __LINE__
  3125. }
  3126. { .mfb
  3127. STFD [C13] = f98, SIZE
  3128. FMPY f114 = ALPHA, f114
  3129. nop __LINE__
  3130. }
  3131. ;;
  3132. { .mfb
  3133. STFD [C5 ] = f97, 3 * SIZE
  3134. FMPY f113 = ALPHA, f113
  3135. nop __LINE__
  3136. }
  3137. { .mfb
  3138. STFD [C13] = f99, 3 * SIZE
  3139. FMPY f115 = ALPHA, f115
  3140. nop __LINE__
  3141. }
  3142. ;;
  3143. { .mfi
  3144. STFD [C6 ] = f104, SIZE
  3145. FMPY f120 = ALPHA, f120
  3146. #if defined(TRMMKERNEL) && \
  3147. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3148. sub L = K, KK // L = K - KK
  3149. #else
  3150. nop __LINE__
  3151. #endif
  3152. }
  3153. { .mfb
  3154. STFD [C14] = f106, SIZE
  3155. FMPY f122 = ALPHA, f122
  3156. nop __LINE__
  3157. }
  3158. ;;
  3159. { .mfi
  3160. STFD [C6 ] = f105, 3 * SIZE
  3161. FMPY f121 = ALPHA, f121
  3162. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  3163. adds L = -4, L // LEFT case: subtract 4, the A extent of this tile
  3164. #else
  3165. nop __LINE__
  3166. #endif
  3167. }
  3168. { .mfi
  3169. STFD [C14] = f107, 3 * SIZE
  3170. FMPY f123 = ALPHA, f123
  3171. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  3172. adds L = -8, L // right-side case: subtract 8, the B extent of this tile
  3173. #else
  3174. nop __LINE__
  3175. #endif
  3176. }
  3177. ;;
  3178. { .mfi
  3179. STFD [C7 ] = f112, SIZE
  3180. mov f64 = f0 // begin clearing f64, f72, ..., f120 (f0 reads as 0.0 on IA-64)
  3181. #if defined(TRMMKERNEL) && \
  3182. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3183. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT (r0 reads as 0)
  3184. #else
  3185. nop __LINE__
  3186. #endif
  3187. }
  3188. { .mfb
  3189. STFD [C15] = f114, SIZE
  3190. mov f72 = f0
  3191. nop __LINE__
  3192. }
  3193. ;;
  3194. { .mfi
  3195. STFD [C7 ] = f113, 3 * SIZE
  3196. mov f80 = f0
  3197. #if defined(TRMMKERNEL) && \
  3198. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3199. shladd AOFFSET = KK8, 2, AOFFSET // AOFFSET += KK8 * 4
  3200. #else
  3201. nop __LINE__
  3202. #endif
  3203. }
  3204. { .mfi
  3205. STFD [C15] = f115, 3 * SIZE
  3206. mov f88 = f0
  3207. #if defined(TRMMKERNEL) && \
  3208. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3209. shladd BOFFSET = KK8, 3, BOFFSET // BOFFSET += KK8 * 8
  3210. #else
  3211. nop __LINE__
  3212. #endif
  3213. }
  3214. ;;
  3215. { .mfi
  3216. STFD [C8 ] = f120, SIZE
  3217. mov f96 = f0
  3218. #if defined(TRMMKERNEL) && defined(LEFT)
  3219. adds KK = 4, KK // KK += 4 for the LEFT variants
  3220. #else
  3221. nop __LINE__
  3222. #endif
  3223. }
  3224. { .mfb
  3225. STFD [C16] = f122, SIZE
  3226. mov f104 = f0
  3227. nop __LINE__
  3228. }
  3229. ;;
  3230. { .mfi
  3231. STFD [C8 ] = f121, 3 * SIZE
  3232. mov f112 = f0
  3233. #ifdef TRMMKERNEL
  3234. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT, read by the next tile's set-up
  3235. #else
  3236. nop __LINE__
  3237. #endif
  3238. }
  3239. { .mfb
  3240. STFD [C16] = f123, 3 * SIZE
  3241. mov f120 = f0
  3242. nop __LINE__
  3243. }
  3244. ;;
  3245. #endif
  3246. .align 32
  3247. .L030:
  // Set-up for the 2 x 8 tile (two A values times eight B values per k).
  // Skipped entirely (branch to .L040) when bit 1 of M is clear.
  // Otherwise: preload B into f48-f55 and A into f32/f33, clear the second
  // accumulator set f65, f73, ..., f121, and program ar.lc for the loop at
  // .L032. f64, f72, ..., f120 are not cleared here; they are expected to be
  // zero on entry.
  3248. { .mib
  3249. #ifndef TRMMKERNEL
  3250. nop __LINE__
  3251. #else
  3252. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3253. sub L = K, KK // TRMM: L = K - KK
  3254. #elif defined(LEFT)
  3255. adds L = 2, KK // TRMM: L = KK + 2
  3256. #else
  3257. adds L = 8, KK // TRMM: L = KK + 8
  3258. #endif
  3259. #endif
  3260. tbit.z p6, p7 = M, 1 // p6 = bit 1 of M is clear, p7 = bit 1 is set
  3261. (p6) br.cond.dptk .L040
  3262. }
  3263. ;;
  3264. #if !defined(TRMMKERNEL) || \
  3265. defined(TRMMKERNEL) && \
  3266. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Variant A: B is read from its start and AOFFSET is used unchanged.
  3267. { .mfi
  3268. LDFPD f48, f49 = [B]
  3269. mov f65 = f0
  3270. nop __LINE__
  3271. }
  3272. { .mfi
  3273. adds BOFFSET = 2 * SIZE, B // BOFFSET continues after the pair just loaded
  3274. mov f73 = f0
  3275. #ifndef TRMMKERNEL
  3276. adds L = 1, K // L = K + 1 (rounded up when halved below)
  3277. #else
  3278. adds L = 1, L
  3279. #endif
  3280. }
  3281. #else
  // Variant B (remaining TRMM cases): start KK8-scaled offsets into B and A.
  3282. { .mmf
  3283. shladd BOFFSET = KK8, 3, B // BOFFSET = B + KK8 * 8
  3284. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 * 2
  3285. mov f65 = f0
  3286. }
  3287. ;;
  3288. { .mfi
  3289. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3290. mov f73 = f0
  3291. #ifndef TRMMKERNEL
  3292. adds L = 1, K
  3293. #else
  3294. adds L = 1, L
  3295. #endif
  3296. }
  3297. #endif
  3298. ;;
  3299. { .mfi
  3300. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3301. mov f81 = f0
  3302. tbit.z p12, p0 = L, 0 // p12 = bit 0 of the incremented count is clear
  3303. }
  3304. { .mfi
  3305. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3306. mov f89 = f0
  3307. shr L = L, 1 // two k steps per loop pass
  3308. }
  3309. ;;
  3310. { .mfi
  3311. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3312. mov f97 = f0
  3313. adds L = -1, L // ar.lc takes (passes - 1)
  3314. }
  3315. { .mfi
  3316. nop __LINE__
  3317. mov f105 = f0
  3318. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // prefetch pointer ahead of A
  3319. }
  3320. ;;
  3321. { .mfi
  3322. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET // prefetch pointer ahead of B
  3323. mov f113 = f0
  3324. mov ar.lc = L
  3325. }
  3326. { .mfi
  3327. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3328. mov f121 = f0
  3329. cmp.eq p3, p0 = r0, r0 // p3 = true: second half of the unrolled body enabled by default
  3330. }
  3331. ;;
  3332. .align 32
  3333. .L032:
  // Inner loop of the 2 x 8 tile, unrolled over two k steps.
  //   First half:  A in f32/f33, B in f48-f55 (always executed).
  //   Second half: A in f40/f41, B in f56-f63 (executed under p3).
  // Accumulators: f64, f72, ..., f120 for A1 and f65, f73, ..., f121 for A2.
  // L is a software copy of the remaining pass count; ar.lc drives the branch.
  3334. { .mfb
  3335. lfetch.nt1 [PREA], 4 * SIZE
  3336. FMA f64 = f32, f48, f64 // A1 * B1
  3337. nop __LINE__
  3338. }
  3339. { .mfi
  3340. nop __LINE__
  3341. FMA f72 = f32, f49, f72 // A1 * B2
  3342. (p12) cmp.ne p3, p0 = 0, L // only when p12: p3 = (L != 0), so the final pass drops the second half
  3343. }
  3344. ;;
  3345. { .mfi
  3346. lfetch.nt1 [PREB], 16 * SIZE
  3347. FMA f80 = f32, f50, f80 // A1 * B3
  3348. cmp.ne p4, p5 = 0, L // p4 = (L != 0): reload for next pass; p5 = (L == 0): load C on the final pass
  3349. }
  3350. { .mfb
  3351. nop __LINE__
  3352. FMA f88 = f32, f51, f88 // A1 * B4
  3353. nop __LINE__
  3354. }
  3355. ;;
  3356. { .mfb
  3357. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3358. FMA f96 = f32, f52, f96 // A1 * B5
  3359. nop __LINE__
  3360. }
  3361. { .mfb
  3362. nop __LINE__
  3363. FMA f104 = f32, f53, f104 // A1 * B6
  3364. nop __LINE__
  3365. }
  3366. ;;
  3367. { .mfb
  3368. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  3369. FMA f112 = f32, f54, f112 // A1 * B7
  3370. nop __LINE__
  3371. }
  3372. { .mfb
  3373. nop __LINE__
  3374. FMA f120 = f32, f55, f120 // A1 * B8
  3375. nop __LINE__
  3376. }
  3377. ;;
  3378. { .mfb
  3379. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3380. FMA f65 = f33, f48, f65 // A2 * B1
  3381. nop __LINE__
  3382. }
  3383. { .mfb
  3384. nop __LINE__
  3385. FMA f73 = f33, f49, f73 // A2 * B2
  3386. nop __LINE__
  3387. }
  3388. ;;
  3389. { .mfb
  3390. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3391. FMA f81 = f33, f50, f81 // A2 * B3
  3392. nop __LINE__
  3393. }
  3394. { .mfb
  3395. nop __LINE__
  3396. FMA f89 = f33, f51, f89 // A2 * B4
  3397. nop __LINE__
  3398. }
  3399. ;;
  3400. { .mfb
  3401. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3402. FMA f97 = f33, f52, f97 // A2 * B5
  3403. nop __LINE__
  3404. }
  3405. { .mfb
  3406. nop __LINE__
  3407. FMA f105 = f33, f53, f105 // A2 * B6
  3408. nop __LINE__
  3409. }
  3410. ;;
  3411. { .mfb
  3412. nop __LINE__
  3413. FMA f113 = f33, f54, f113 // A2 * B7
  3414. nop __LINE__
  3415. }
  3416. { .mfb
  3417. nop __LINE__
  3418. FMA f121 = f33, f55, f121 // A2 * B8
  3419. nop __LINE__
  3420. }
  3421. ;;
  // Second half: uses the operands loaded under p3 above; the (p4) loads
  // refill f32/f33 and f48-f55 for the next pass.
  3422. { .mfb
  3423. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3424. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3425. nop __LINE__
  3426. }
  3427. { .mfb
  3428. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3429. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3430. nop __LINE__
  3431. }
  3432. ;;
  3433. { .mfb
  3434. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3435. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3436. nop __LINE__
  3437. }
  3438. { .mfb
  3439. nop __LINE__
  3440. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3441. nop __LINE__
  3442. }
  3443. ;;
  3444. { .mfb
  3445. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3446. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3447. nop __LINE__
  3448. }
  3449. { .mfb
  3450. nop __LINE__
  3451. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  3452. nop __LINE__
  3453. }
  3454. ;;
  // On the final pass (p5) the first half of the C tile is fetched early:
  // each pointer is read at +0 and +SIZE, then stepped back to its start.
  3455. { .mfb
  3456. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3457. (p5) LDFD f68 = [C1], SIZE
  3458. #else
  3459. nop __LINE__
  3460. #endif
  3461. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  3462. nop __LINE__
  3463. }
  3464. { .mfb
  3465. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3466. (p5) LDFD f76 = [C2], SIZE
  3467. #else
  3468. nop __LINE__
  3469. #endif
  3470. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  3471. nop __LINE__
  3472. }
  3473. ;;
  3474. { .mfb
  3475. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3476. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  3477. nop __LINE__
  3478. }
  // NOTE(review): the next .mfb bundle lists only two instructions (no M-slot
  // entry) and is not followed by a stop - confirm the assembler pads it as
  // intended.
  3479. { .mfb
  3480. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  3481. nop __LINE__
  3482. }
  3483. { .mfb
  3484. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3485. (p5) LDFD f69 = [C1], -1 * SIZE
  3486. #else
  3487. nop __LINE__
  3488. #endif
  3489. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  3490. nop __LINE__
  3491. }
  3492. { .mfb
  3493. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3494. (p5) LDFD f77 = [C2], -1 * SIZE
  3495. #else
  3496. nop __LINE__
  3497. #endif
  3498. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  3499. nop __LINE__
  3500. }
  3501. ;;
  3502. { .mfb
  3503. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3504. (p5) LDFD f84 = [C3], SIZE
  3505. #else
  3506. nop __LINE__
  3507. #endif
  3508. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  3509. nop __LINE__
  3510. }
  3511. { .mfb
  3512. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3513. (p5) LDFD f92 = [C4], SIZE
  3514. #else
  3515. nop __LINE__
  3516. #endif
  3517. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  3518. nop __LINE__
  3519. }
  3520. ;;
  3521. { .mfi
  3522. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3523. (p5) LDFD f85 = [C3], -1 * SIZE
  3524. #else
  3525. nop __LINE__
  3526. #endif
  3527. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  3528. adds L = -1, L // keep the software pass counter in step with ar.lc
  3529. }
  3530. { .mfb
  3531. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3532. (p5) LDFD f93 = [C4], -1 * SIZE
  3533. #else
  3534. nop __LINE__
  3535. #endif
  3536. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  3537. br.cloop.sptk.few .L032
  3538. }
  3539. ;;
  3540. .L038:
  // Write-back of the 2 x 8 accumulator tile: f64/f65 go to C1, f72/f73 to
  // C2, ..., f120/f121 to C8. Each pointer receives two values, each stored
  // with a post-increment of SIZE.
  3541. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Path 1: C is read. f68/f69, f76/f77, f84/f85 and f92/f93 were loaded on
  // the final pass of the .L032 loop; the loads for C5-C8 are issued here.
  // Assuming FMA d = a, b, c computes a * b + c, each result is
  // ALPHA * acc + C.
  3542. { .mfb
  3543. LDFD f100 = [C5], SIZE
  3544. FMA f64 = ALPHA, f64, f68
  3545. nop __LINE__
  3546. }
  3547. { .mfb
  3548. LDFD f108 = [C6], SIZE
  3549. FMA f65 = ALPHA, f65, f69
  3550. nop __LINE__
  3551. }
  3552. ;;
  3553. { .mfb
  3554. LDFD f101 = [C5], -1 * SIZE // step back so the later stores begin at the first element
  3555. FMA f72 = ALPHA, f72, f76
  3556. nop __LINE__
  3557. }
  3558. { .mfb
  3559. LDFD f109 = [C6], -1 * SIZE
  3560. FMA f73 = ALPHA, f73, f77
  3561. nop __LINE__
  3562. }
  3563. ;;
  3564. { .mfb
  3565. LDFD f116 = [C7], SIZE
  3566. FMA f80 = ALPHA, f80, f84
  3567. nop __LINE__
  3568. }
  3569. { .mfb
  3570. LDFD f124 = [C8], SIZE
  3571. FMA f81 = ALPHA, f81, f85
  3572. nop __LINE__
  3573. }
  3574. ;;
  3575. { .mfb
  3576. LDFD f117 = [C7], -1 * SIZE
  3577. FMA f88 = ALPHA, f88, f92
  3578. nop __LINE__
  3579. }
  3580. { .mfb
  3581. LDFD f125 = [C8], -1 * SIZE
  3582. FMA f89 = ALPHA, f89, f93
  3583. nop __LINE__
  3584. }
  3585. ;;
  3586. { .mfb
  3587. STFD [C1 ] = f64, SIZE
  3588. FMA f96 = ALPHA, f96, f100
  3589. nop __LINE__
  3590. }
  3591. { .mfb
  3592. STFD [C2 ] = f72, SIZE
  3593. FMA f104 = ALPHA, f104, f108
  3594. nop __LINE__
  3595. }
  3596. ;;
  3597. { .mfb
  3598. STFD [C1 ] = f65, SIZE
  3599. FMA f97 = ALPHA, f97, f101
  3600. nop __LINE__
  3601. }
  3602. { .mfb
  3603. STFD [C2 ] = f73, SIZE
  3604. FMA f105 = ALPHA, f105, f109
  3605. nop __LINE__
  3606. }
  3607. ;;
  3608. { .mfb
  3609. STFD [C3 ] = f80, SIZE
  3610. FMA f112 = ALPHA, f112, f116
  3611. nop __LINE__
  3612. }
  3613. { .mfb
  3614. STFD [C4 ] = f88, SIZE
  3615. FMA f120 = ALPHA, f120, f124
  3616. nop __LINE__
  3617. }
  3618. ;;
  3619. { .mfb
  3620. STFD [C3 ] = f81, SIZE
  3621. FMA f113 = ALPHA, f113, f117
  3622. nop __LINE__
  3623. }
  3624. { .mfb
  3625. STFD [C4 ] = f89, SIZE
  3626. FMA f121 = ALPHA, f121, f125
  3627. nop __LINE__
  3628. }
  3629. ;;
  3630. { .mfb
  3631. STFD [C5 ] = f96, SIZE
  3632. mov f64 = f0 // F slots now clear f64, f72, ..., f120 (f0 reads as 0.0 on IA-64)
  3633. nop __LINE__
  3634. }
  3635. { .mfb
  3636. STFD [C6 ] = f104, SIZE
  3637. mov f72 = f0
  3638. nop __LINE__
  3639. }
  3640. ;;
  3641. { .mfb
  3642. STFD [C5 ] = f97, SIZE
  3643. mov f80 = f0
  3644. nop __LINE__
  3645. }
  3646. { .mfb
  3647. STFD [C6 ] = f105, SIZE
  3648. mov f88 = f0
  3649. nop __LINE__
  3650. }
  3651. ;;
  3652. { .mfb
  3653. STFD [C7 ] = f112, SIZE
  3654. mov f96 = f0
  3655. nop __LINE__
  3656. }
  3657. { .mfb
  3658. STFD [C8 ] = f120, SIZE
  3659. mov f104 = f0
  3660. nop __LINE__
  3661. }
  3662. ;;
  3663. { .mfb
  3664. STFD [C7 ] = f113, SIZE
  3665. mov f112 = f0
  3666. nop __LINE__
  3667. }
  3668. { .mfb
  3669. STFD [C8 ] = f121, SIZE
  3670. mov f120 = f0
  3671. nop __LINE__
  3672. }
  3673. ;;
  3674. #else
  // Path 2 (TRMMKERNEL or BETAZERO defined): C is never read; each result is
  // ALPHA * acc (assuming FMPY d = a, b is a plain multiply). The I slots
  // carry the TRMM offset bookkeeping for KK, KK8, AOFFSET and BOFFSET.
  3675. { .mfb
  3676. nop __LINE__
  3677. FMPY f64 = ALPHA, f64
  3678. nop __LINE__
  3679. }
  3680. { .mfb
  3681. nop __LINE__
  3682. FMPY f65 = ALPHA, f65
  3683. nop __LINE__
  3684. }
  3685. ;;
  3686. { .mfb
  3687. nop __LINE__
  3688. FMPY f72 = ALPHA, f72
  3689. nop __LINE__
  3690. }
  3691. { .mfb
  3692. nop __LINE__
  3693. FMPY f73 = ALPHA, f73
  3694. nop __LINE__
  3695. }
  3696. ;;
  3697. { .mfb
  3698. nop __LINE__
  3699. FMPY f80 = ALPHA, f80
  3700. nop __LINE__
  3701. }
  3702. { .mfb
  3703. nop __LINE__
  3704. FMPY f81 = ALPHA, f81
  3705. nop __LINE__
  3706. }
  3707. ;;
  3708. { .mfb
  3709. nop __LINE__
  3710. FMPY f88 = ALPHA, f88
  3711. nop __LINE__
  3712. }
  3713. { .mfb
  3714. nop __LINE__
  3715. FMPY f89 = ALPHA, f89
  3716. nop __LINE__
  3717. }
  3718. ;;
  3719. { .mfb
  3720. STFD [C1 ] = f64, SIZE
  3721. FMPY f96 = ALPHA, f96
  3722. nop __LINE__
  3723. }
  3724. { .mfb
  3725. STFD [C2 ] = f72, SIZE
  3726. FMPY f104 = ALPHA, f104
  3727. nop __LINE__
  3728. }
  3729. ;;
  3730. { .mfb
  3731. STFD [C1 ] = f65, SIZE
  3732. FMPY f97 = ALPHA, f97
  3733. nop __LINE__
  3734. }
  3735. { .mfb
  3736. STFD [C2 ] = f73, SIZE
  3737. FMPY f105 = ALPHA, f105
  3738. nop __LINE__
  3739. }
  3740. ;;
  3741. { .mfi
  3742. STFD [C3 ] = f80, SIZE
  3743. FMPY f112 = ALPHA, f112
  3744. #if defined(TRMMKERNEL) && \
  3745. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3746. sub L = K, KK // L = K - KK
  3747. #else
  3748. nop __LINE__
  3749. #endif
  3750. }
  3751. { .mfb
  3752. STFD [C4 ] = f88, SIZE
  3753. FMPY f120 = ALPHA, f120
  3754. nop __LINE__
  3755. }
  3756. ;;
  3757. { .mfi
  3758. STFD [C3 ] = f81, SIZE
  3759. FMPY f113 = ALPHA, f113
  3760. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  3761. adds L = -2, L // LEFT case: subtract 2, the A extent of this tile
  3762. #else
  3763. nop __LINE__
  3764. #endif
  3765. }
  3766. { .mfi
  3767. STFD [C4 ] = f89, SIZE
  3768. FMPY f121 = ALPHA, f121
  3769. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  3770. adds L = -8, L // right-side case: subtract 8, the B extent of this tile
  3771. #else
  3772. nop __LINE__
  3773. #endif
  3774. }
  3775. ;;
  3776. { .mfi
  3777. STFD [C5 ] = f96, SIZE
  3778. mov f64 = f0 // begin clearing f64, f72, ..., f120 (f0 reads as 0.0 on IA-64)
  3779. #if defined(TRMMKERNEL) && \
  3780. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3781. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT (r0 reads as 0)
  3782. #else
  3783. nop __LINE__
  3784. #endif
  3785. }
  3786. { .mfb
  3787. STFD [C6 ] = f104, SIZE
  3788. mov f72 = f0
  3789. nop __LINE__
  3790. }
  3791. ;;
  3792. { .mfi
  3793. STFD [C5 ] = f97, SIZE
  3794. mov f80 = f0
  3795. #if defined(TRMMKERNEL) && \
  3796. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3797. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 * 2
  3798. #else
  3799. nop __LINE__
  3800. #endif
  3801. }
  3802. { .mfi
  3803. STFD [C6 ] = f105, SIZE
  3804. mov f88 = f0
  3805. #if defined(TRMMKERNEL) && \
  3806. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  3807. shladd BOFFSET = KK8, 3, BOFFSET // BOFFSET += KK8 * 8
  3808. #else
  3809. nop __LINE__
  3810. #endif
  3811. }
  3812. ;;
  3813. { .mfi
  3814. STFD [C7 ] = f112, SIZE
  3815. mov f96 = f0
  3816. #if defined(TRMMKERNEL) && defined(LEFT)
  3817. adds KK = 2, KK // KK += 2 for the LEFT variants
  3818. #else
  3819. nop __LINE__
  3820. #endif
  3821. }
  3822. { .mfb
  3823. STFD [C8 ] = f120, SIZE
  3824. mov f104 = f0
  3825. nop __LINE__
  3826. }
  3827. ;;
  3828. { .mfi
  3829. STFD [C7 ] = f113, SIZE
  3830. mov f112 = f0
  3831. #ifdef TRMMKERNEL
  3832. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT, read by the next tile's set-up
  3833. #else
  3834. nop __LINE__
  3835. #endif
  3836. }
  3837. { .mfb
  3838. STFD [C8 ] = f121, SIZE
  3839. mov f120 = f0
  3840. nop __LINE__
  3841. }
  3842. ;;
  3843. #endif
  3844. .align 32
  3845. .L040:
  // Set-up for the 1 x 8 tile (one A value times eight B values per k).
  // Skipped entirely (branch to .L049) when bit 0 of M is clear.
  // Otherwise: preload B into f48-f55 and A into f32, and program ar.lc for
  // the loop at .L042. No accumulator is cleared here; f64, f72, ..., f120
  // are expected to be zero on entry.
  3846. { .mib
  3847. #ifndef TRMMKERNEL
  3848. nop __LINE__
  3849. #else
  3850. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3851. sub L = K, KK // TRMM: L = K - KK
  3852. #elif defined(LEFT)
  3853. adds L = 1, KK // TRMM: L = KK + 1
  3854. #else
  3855. adds L = 8, KK // TRMM: L = KK + 8
  3856. #endif
  3857. #endif
  3858. tbit.z p6, p7 = M, 0 // p6 = bit 0 of M is clear, p7 = bit 0 is set
  3859. (p6) br.cond.dptk .L049
  3860. }
  3861. ;;
  3862. #if !defined(TRMMKERNEL) || \
  3863. defined(TRMMKERNEL) && \
  3864. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Variant A: B is read from its start and AOFFSET is used unchanged.
  3865. { .mmi
  3866. LDFPD f48, f49 = [B]
  3867. adds BOFFSET = 2 * SIZE, B // BOFFSET continues after the pair just loaded
  3868. #ifndef TRMMKERNEL
  3869. adds L = 1, K // L = K + 1 (rounded up when halved below)
  3870. #else
  3871. adds L = 1, L
  3872. #endif
  3873. }
  3874. #else
  // Variant B (remaining TRMM cases): start KK8-scaled offsets into B and A.
  3875. { .mmi
  3876. shladd BOFFSET = KK8, 3, B // BOFFSET = B + KK8 * 8
  3877. add AOFFSET = KK8, AOFFSET // AOFFSET += KK8
  3878. nop __LINE__
  3879. }
  3880. ;;
  3881. { .mmi
  3882. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3883. nop __LINE__
  3884. #ifndef TRMMKERNEL
  3885. adds L = 1, K
  3886. #else
  3887. adds L = 1, L
  3888. #endif
  3889. }
  3890. #endif
  3891. ;;
  3892. { .mii
  3893. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3894. tbit.z p12, p0 = L, 0 // p12 = bit 0 of the incremented count is clear
  3895. shr L = L, 1 // two k steps per loop pass
  3896. }
  3897. ;;
  3898. { .mmi
  3899. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3900. LDFD f32 = [AOFFSET], 1 * SIZE
  3901. adds L = -1, L // ar.lc takes (passes - 1)
  3902. }
  3903. ;;
  3904. { .mmi
  3905. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET // prefetch pointer ahead of B
  3906. cmp.eq p3, p0 = r0, r0 // p3 = true: second half of the unrolled body enabled by default
  3907. mov ar.lc = L
  3908. }
  3909. { .mmi
  3910. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3911. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // computed here; no use is visible in the loop below
  3912. nop __LINE__
  3913. }
  3914. ;;
  3915. .align 32
  3916. .L042:
  // Inner loop of the 1 x 8 tile, unrolled over two k steps, followed by its
  // write-back.
  //   First half:  A in f32, B in f48-f55 (always executed).
  //   Second half: A in f40, B in f56-f63 (executed under p3).
  // Accumulators: f64, f72, ..., f120. On the final pass (p5) one C value per
  // pointer C1..C8 is loaded, without post-increment.
  3917. { .mfb
  3918. lfetch.nt1 [PREB], 16 * SIZE
  3919. FMA f64 = f32, f48, f64 // A1 * B1
  3920. nop __LINE__
  3921. }
  3922. { .mfb
  3923. (p12) cmp.ne p3, p0 = 0, L // only when p12: p3 = (L != 0), so the final pass drops the second half
  3924. FMA f72 = f32, f49, f72 // A1 * B2
  3925. nop __LINE__
  3926. }
  3927. ;;
  3928. { .mfi
  3929. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  3930. FMA f80 = f32, f50, f80 // A1 * B3
  3931. cmp.ne p4, p5 = 0, L // p4 = (L != 0): reload for next pass; p5 = (L == 0): load C on the final pass
  3932. }
  3933. { .mfb
  3934. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3935. FMA f88 = f32, f51, f88 // A1 * B4
  3936. nop __LINE__
  3937. }
  3938. ;;
  3939. { .mfb
  3940. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3941. FMA f96 = f32, f52, f96 // A1 * B5
  3942. nop __LINE__
  3943. }
  3944. { .mfb
  3945. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3946. (p5) LDFD f68 = [C1]
  3947. #else
  3948. nop __LINE__
  3949. #endif
  3950. FMA f104 = f32, f53, f104 // A1 * B6
  3951. nop __LINE__
  3952. }
  3953. ;;
  3954. { .mfb
  3955. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3956. FMA f112 = f32, f54, f112 // A1 * B7
  3957. nop __LINE__
  3958. }
  3959. { .mfb
  3960. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3961. (p5) LDFD f76 = [C2]
  3962. #else
  3963. nop __LINE__
  3964. #endif
  3965. FMA f120 = f32, f55, f120 // A1 * B8
  3966. nop __LINE__
  3967. }
  3968. ;;
  // Second half: uses the operands loaded under p3 above; the (p4) loads
  // refill f32 and f48-f55 for the next pass.
  3969. { .mfb
  3970. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  3971. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3972. nop __LINE__
  3973. }
  3974. { .mfb
  3975. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3976. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3977. nop __LINE__
  3978. }
  3979. ;;
  3980. { .mfb
  3981. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3982. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3983. nop __LINE__
  3984. }
  3985. { .mfb
  3986. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  3987. (p5) LDFD f84 = [C3]
  3988. #else
  3989. nop __LINE__
  3990. #endif
  3991. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3992. nop __LINE__
  3993. }
  3994. ;;
  3995. { .mfb
  3996. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3997. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3998. nop __LINE__
  3999. }
  4000. { .mfb
  4001. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4002. (p5) LDFD f92 = [C4]
  4003. #else
  4004. nop __LINE__
  4005. #endif
  4006. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  4007. nop __LINE__
  4008. }
  4009. ;;
  4010. { .mfi
  4011. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  4012. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  4013. adds L = -1, L // keep the software pass counter in step with ar.lc
  4014. }
  4015. { .mmb
  4016. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4017. (p5) LDFD f100 = [C5]
  4018. (p5) LDFD f108 = [C6]
  4019. #else
  4020. nop __LINE__
  4021. nop __LINE__
  4022. #endif
  4023. nop __LINE__
  4024. }
  4025. ;;
  4026. { .mfb
  4027. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  4028. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  4029. nop __LINE__
  4030. }
  4031. { .mmb
  4032. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4033. (p5) LDFD f116 = [C7]
  4034. (p5) LDFD f124 = [C8]
  4035. #else
  4036. nop __LINE__
  4037. nop __LINE__
  4038. #endif
  4039. br.cloop.sptk.few .L042
  4040. }
  4041. ;;
  // Write-back of the 1 x 8 tile. The statements below are outside explicit
  // bundles, so bundling is left to the assembler.
  4042. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Path 1: each result is ALPHA * acc + C (assuming FMA d = a, b, c computes
  // a * b + c); every store advances its pointer by SIZE and the accumulator
  // is cleared from f0 (reads as 0.0 on IA-64).
  4043. FMA f64 = ALPHA, f64, f68
  4044. FMA f72 = ALPHA, f72, f76
  4045. FMA f80 = ALPHA, f80, f84
  4046. FMA f88 = ALPHA, f88, f92
  4047. FMA f96 = ALPHA, f96, f100
  4048. FMA f104 = ALPHA, f104, f108
  4049. FMA f112 = ALPHA, f112, f116
  4050. FMA f120 = ALPHA, f120, f124
  4051. ;;
  4052. STFD [C1 ] = f64, SIZE
  4053. mov f64 = f0
  4054. STFD [C2 ] = f72, SIZE
  4055. mov f72 = f0
  4056. ;;
  4057. STFD [C3 ] = f80, SIZE
  4058. mov f80 = f0
  4059. STFD [C4 ] = f88, SIZE
  4060. mov f88 = f0
  4061. ;;
  4062. STFD [C5 ] = f96, SIZE
  4063. mov f96 = f0
  4064. STFD [C6 ] = f104, SIZE
  4065. mov f104 = f0
  4066. ;;
  4067. STFD [C7 ] = f112, SIZE
  4068. mov f112 = f0
  4069. STFD [C8 ] = f120, SIZE
  4070. mov f120 = f0
  4071. ;;
  4072. #else
  // Path 2 (TRMMKERNEL or BETAZERO defined): C is never read; each result is
  // ALPHA * acc (assuming FMPY d = a, b is a plain multiply). The I slots
  // carry the TRMM offset bookkeeping for KK, KK8, AOFFSET and BOFFSET.
  4073. FMPY f64 = ALPHA, f64
  4074. FMPY f72 = ALPHA, f72
  4075. FMPY f80 = ALPHA, f80
  4076. FMPY f88 = ALPHA, f88
  // NOTE(review): the next .mfi bundle lists only two instructions (no M-slot
  // entry) - confirm the assembler pads it as intended.
  4077. { .mfi
  4078. FMPY f96 = ALPHA, f96
  4079. #if defined(TRMMKERNEL) && \
  4080. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4081. sub L = K, KK // L = K - KK
  4082. #else
  4083. nop __LINE__
  4084. #endif
  4085. }
  4086. { .mfi
  4087. nop __LINE__
  4088. FMPY f104 = ALPHA, f104
  4089. nop __LINE__
  4090. }
  4091. ;;
  4092. { .mfi
  4093. nop __LINE__
  4094. FMPY f112 = ALPHA, f112
  4095. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  4096. adds L = -1, L // LEFT case: subtract 1, the A extent of this tile
  4097. #else
  4098. nop __LINE__
  4099. #endif
  4100. }
  4101. { .mfi
  4102. nop __LINE__
  4103. FMPY f120 = ALPHA, f120
  4104. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  4105. adds L = -8, L // right-side case: subtract 8, the B extent of this tile
  4106. #else
  4107. nop __LINE__
  4108. #endif
  4109. }
  4110. ;;
  4111. { .mfi
  4112. STFD [C1 ] = f64, SIZE
  4113. mov f64 = f0 // begin clearing f64, f72, ..., f120 (f0 reads as 0.0 on IA-64)
  4114. #if defined(TRMMKERNEL) && \
  4115. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4116. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT (r0 reads as 0)
  4117. #else
  4118. nop __LINE__
  4119. #endif
  4120. }
  4121. { .mfi
  4122. STFD [C2 ] = f72, SIZE
  4123. mov f72 = f0
  4124. nop __LINE__
  4125. }
  4126. ;;
  4127. { .mfi
  4128. STFD [C3 ] = f80, SIZE
  4129. mov f80 = f0
  4130. #if defined(TRMMKERNEL) && \
  4131. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4132. add AOFFSET = KK8, AOFFSET // AOFFSET += KK8
  4133. #else
  4134. nop __LINE__
  4135. #endif
  4136. }
  4137. { .mfi
  4138. STFD [C4 ] = f88, SIZE
  4139. mov f88 = f0
  4140. #if defined(TRMMKERNEL) && \
  4141. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  4142. shladd BOFFSET = KK8, 3, BOFFSET // BOFFSET += KK8 * 8
  4143. #else
  4144. nop __LINE__
  4145. #endif
  4146. }
  4147. ;;
  4148. { .mfi
  4149. STFD [C5 ] = f96, SIZE
  4150. mov f96 = f0
  4151. #if defined(TRMMKERNEL) && defined(LEFT)
  4152. adds KK = 1, KK // KK += 1 for the LEFT variants
  4153. #else
  4154. nop __LINE__
  4155. #endif
  4156. }
  4157. { .mfi
  4158. STFD [C6 ] = f104, SIZE
  4159. mov f104 = f0
  4160. nop __LINE__
  4161. }
  4162. ;;
  4163. { .mfi
  4164. STFD [C7 ] = f112, SIZE
  4165. mov f112 = f0
  4166. #ifdef TRMMKERNEL
  4167. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT
  4168. #else
  4169. nop __LINE__
  4170. #endif
  4171. }
  4172. { .mfi
  4173. STFD [C8 ] = f120, SIZE
  4174. mov f120 = f0
  4175. nop __LINE__
  4176. }
  4177. ;;
  4178. #endif
  4179. .align 32
  4180. #endif
  // .L049: tail of one pass over the column panel handled by the code above.
  // Copies BOFFSET into B, resets AOFFSET to A, and (TRMM kernel without LEFT)
  // advances KK by 8. Then jumps back to .L010 while J is still positive;
  // J itself is updated outside this view. Otherwise falls through to .L050.
  4181. .L049:
  4182. { .mmi
  4183. mov B = BOFFSET
  4184. mov AOFFSET = A
  4185. #if defined(TRMMKERNEL) && !defined(LEFT)
  4186. adds KK = 8, KK
  4187. #else
  4188. nop __LINE__
  4189. #endif
  4190. }
  4191. ;;
  4192. { .mmb
  4193. nop __LINE__
  // p6 = (0 < J)
  4194. cmp.lt p6, p0 = 0, J
  4195. (p6) br.cond.dptk .L010
  4196. }
  4197. ;;
  4198. .align 32
  // .L050: prologue for the four-column case (taken when bit 2 of N is set).
  // Sets C1 = C, C2 = C + LDC, C3 = C + 2*LDC, clears f64/f72/f80/f88 by
  // copying f0, computes I = M >> 3 and resets AOFFSET to A.
  // If bit 2 of N is clear, control skips to .L090.
  4199. .L050:
  4200. { .mfi
  4201. mov C1 = C
  4202. mov f64 = f0
  // p6 = (bit 2 of N is zero)
  4203. tbit.z p6, p0 = N, 2
  4204. }
  4205. { .mfi
  4206. add C2 = LDC, C
  4207. mov f72 = f0
  4208. shr I = M, 3
  4209. }
  4210. ;;
  4211. { .mfi
  4212. shladd C3 = LDC, 1, C
  4213. mov f80 = f0
  4214. nop __LINE__
  4215. }
  4216. { .mfb
  4217. mov AOFFSET = A
  4218. mov f88 = f0
  4219. (p6) br.cond.dpnt .L090
  4220. }
  4221. ;;
  // NOTE(review): the #if 0 below has no matching #endif within the visible
  // part of the file, so everything from here on appears to be compiled out
  // until that #endif -- confirm where it closes and whether that is intended.
  4222. #if 0
  // Remaining prologue: p6 = (I == 0), clear f65/f73/f81/f89, C4 = C2 + 2*LDC,
  // advance C by 4*LDC, and for TRMM seed KK from OFFSET (LEFT only) and
  // KK8 = KK << BASE_SHIFT. With no 8-row tiles (I == 0) skip to .L060.
  4223. { .mfi
  4224. cmp.eq p6, p7 = 0, I
  4225. mov f65 = f0
  4226. #if defined(TRMMKERNEL) && defined(LEFT)
  4227. mov KK = OFFSET
  4228. #else
  4229. nop __LINE__
  4230. #endif
  4231. }
  4232. { .mfi
  4233. shladd C4 = LDC, 1, C2
  4234. mov f73 = f0
  4235. nop __LINE__
  4236. }
  4237. ;;
  4238. { .mfi
  4239. nop __LINE__
  4240. mov f81 = f0
  4241. #ifdef TRMMKERNEL
  4242. shladd KK8 = KK, BASE_SHIFT, r0
  4243. #else
  4244. nop __LINE__
  4245. #endif
  4246. }
  4247. { .mfb
  4248. shladd C = LDC, 2, C
  4249. mov f89 = f0
  4250. (p6) br.cond.dpnt .L060
  4251. }
  4252. ;;
  4253. .align 32
  // .L052: per-tile setup for the 8-row by 4-column inner loop at .L053.
  //  - positions BOFFSET/AOFFSET and preloads the first operands:
  //    B values into f48..f51, A values into f32..f39;
  //  - clears the accumulators f66..f71, f74..f79, f82..f87, f90..f95
  //    (f64/f65, f72/f73, f80/f81, f88/f89 are cleared before entry);
  //  - issues prefetches on PREC (starting CPREFETCHSIZE elements past C1,
  //    stepping by LDC) and seeds the PREA/PREB prefetch pointers;
  //  - derives the loop count: with T = K (or the TRMM-adjusted length in L),
  //    p12 = (bit 0 of T+1 is zero), ar.lc = ((T + 1) >> 1) - 1, p3 = true.
  4254. .L052:
  4255. #if !defined(TRMMKERNEL) || \
  4256. defined(TRMMKERNEL) && \
  4257. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Plain GEMM, or TRMM variants that start at the beginning of the panel:
  // read the first B pair straight from B and point BOFFSET just past it.
  4258. { .mfb
  4259. LDFPD f48, f49 = [B]
  4260. mov f66 = f0
  4261. nop __LINE__
  4262. }
  4263. { .mfb
  4264. adds BOFFSET = 2 * SIZE, B
  4265. mov f74 = f0
  4266. nop __LINE__
  4267. }
  4268. ;;
  4269. #else
  // Other TRMM variants: skip ahead by KK8 scaled by 4 in B and by 8 in A
  // before loading the first B pair.
  4270. { .mfi
  4271. shladd BOFFSET = KK8, 2, B
  4272. mov f66 = f0
  4273. shladd AOFFSET = KK8, 3, AOFFSET
  4274. }
  4275. ;;
  4276. { .mfi
  4277. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4278. mov f74 = f0
  4279. nop __LINE__
  4280. }
  4281. ;;
  4282. #endif
  4283. ;;
  4284. { .mfi
  4285. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4286. mov f82 = f0
  // TRMM only: effective inner length L is K - KK, or KK + 8 (LEFT),
  // or KK + 4 (otherwise), depending on the variant.
  4287. #ifndef TRMMKERNEL
  4288. nop __LINE__
  4289. #else
  4290. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4291. sub L = K, KK
  4292. #elif defined(LEFT)
  4293. adds L = 8, KK
  4294. #else
  4295. adds L = 4, KK
  4296. #endif
  4297. #endif
  4298. }
  // Some accumulators are cleared with setf.d from r0 instead of mov from f0;
  // presumably this spreads the clears over memory-unit slots -- TODO confirm.
  4299. { .mfi
  4300. setf.d f84 = r0
  4301. mov f90 = f0
  4302. nop __LINE__
  4303. }
  4304. ;;
  4305. { .mfi
  4306. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4307. mov f67 = f0
  4308. adds PREC = CPREFETCHSIZE * SIZE, C1
  4309. }
  4310. { .mfi
  4311. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4312. mov f75 = f0
  // L = length + 1, so the halving below rounds up.
  4313. #ifndef TRMMKERNEL
  4314. adds L = 1, K
  4315. #else
  4316. adds L = 1, L
  4317. #endif
  4318. }
  4319. ;;
  4320. { .mfi
  4321. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4322. mov f83 = f0
  // p12 = (length + 1 is even), i.e. the length is odd.
  4323. tbit.z p12, p0 = L, 0
  4324. }
  4325. { .mfi
  4326. setf.d f91 = r0
  4327. mov f68 = f0
  4328. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  4329. }
  4330. ;;
  4331. { .mfi
  4332. CPREFETCH [PREC], LDC
  4333. mov f76 = f0
  4334. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  4335. }
  4336. { .mfi
  4337. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4338. mov f92 = f0
  // r0 == r0 always holds, so this sets p3 unconditionally.
  4339. cmp.eq p3, p0 = r0, r0
  4340. }
  4341. ;;
  4342. { .mfi
  4343. CPREFETCH [PREC], LDC
  4344. mov f69 = f0
  4345. shr L = L, 1
  4346. }
  4347. { .mmf
  4348. setf.d f77 = r0
  4349. setf.d f85 = r0
  4350. mov f93 = f0
  4351. }
  4352. ;;
  4353. { .mfi
  4354. CPREFETCH [PREC], LDC
  4355. mov f70 = f0
  4356. adds L = -1, L
  4357. }
  4358. { .mmf
  4359. setf.d f78 = r0
  4360. setf.d f86 = r0
  4361. mov f94 = f0
  4362. }
  4363. ;;
  4364. { .mfi
  4365. CPREFETCH [PREC]
  4366. mov f71 = f0
  // Loop counter for br.cloop in .L053.
  4367. mov ar.lc = L
  4368. }
  4369. { .mmf
  4370. setf.d f79 = r0
  4371. setf.d f87 = r0
  4372. mov f95 = f0
  4373. }
  4374. ;;
  4375. .align 32
  // .L053: inner loop of the 8x4 tile. Each pass covers up to two k-steps.
  //  First half (always): f32..f39 (A1..A8) times f48..f51 (B1..B4) are
  //    accumulated into f64..f71 (B1), f72..f79 (B2), f80..f87 (B3),
  //    f88..f95 (B4).
  //  Second half (under p3): same accumulators, operands f40..f47 and f56..f59.
  //  Predicates:
  //    p4 = (L != 0): more passes follow, so reload the first-half operands.
  //    p5 = (L == 0): last pass; for plain GEMM without BETAZERO the existing
  //         C values for rows of C1/C9, C2/C10, C3/C11 are loaded into
  //         f96..f119, with each pointer stepped back to its start.
  //    p3 : stays true unless p12 is set (set before the loop from bit 0 of
  //         the incremented length) and L == 0, in which case the second half
  //         of the final pass is skipped.
  //  C9..C12 are set to C1..C4 plus 4 elements. L is decremented alongside
  //  ar.lc, which drives br.cloop.
  4376. .L053:
  4377. { .mfb
  4378. lfetch.nt1 [PREA], 16 * SIZE
  4379. FMA f64 = f32, f48, f64 // A1 * B1
  4380. nop __LINE__
  4381. }
  4382. { .mfi
  4383. nop __LINE__
  4384. FMA f72 = f32, f49, f72 // A1 * B2
  4385. (p12) cmp.ne p3, p0 = 0, L
  4386. }
  4387. ;;
  4388. { .mfi
  4389. lfetch.nt1 [PREB], 8 * SIZE
  4390. FMA f80 = f32, f50, f80 // A1 * B3
  4391. cmp.ne p4, p5 = 0, L
  4392. }
  4393. { .mfi
  4394. nop __LINE__
  4395. FMA f88 = f32, f51, f88 // A1 * B4
  4396. adds C9 = 4 * SIZE, C1
  4397. }
  4398. ;;
  // Loads for the second half are interleaved with first-half arithmetic.
  4399. { .mfi
  4400. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  4401. FMA f65 = f33, f48, f65 // A2 * B1
  4402. adds C10 = 4 * SIZE, C2
  4403. }
  4404. { .mfi
  4405. nop __LINE__
  4406. FMA f73 = f33, f49, f73 // A2 * B2
  4407. adds C11 = 4 * SIZE, C3
  4408. }
  4409. ;;
  4410. { .mfi
  4411. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  4412. FMA f81 = f33, f50, f81 // A2 * B3
  4413. adds C12 = 4 * SIZE, C4
  4414. }
  4415. { .mfb
  4416. nop __LINE__
  4417. FMA f89 = f33, f51, f89 // A2 * B4
  4418. nop __LINE__
  4419. }
  4420. ;;
  4421. { .mfb
  4422. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4423. FMA f66 = f34, f48, f66 // A3 * B1
  4424. nop __LINE__
  4425. }
  4426. { .mfb
  4427. nop __LINE__
  4428. FMA f74 = f34, f49, f74 // A3 * B2
  4429. nop __LINE__
  4430. }
  4431. ;;
  4432. { .mfb
  4433. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  4434. FMA f82 = f34, f50, f82 // A3 * B3
  4435. nop __LINE__
  4436. }
  4437. { .mfb
  4438. nop __LINE__
  4439. FMA f90 = f34, f51, f90 // A3 * B4
  4440. nop __LINE__
  4441. }
  4442. ;;
  4443. { .mfb
  4444. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  4445. FMA f67 = f35, f48, f67 // A4 * B1
  4446. nop __LINE__
  4447. }
  4448. { .mfb
  4449. nop __LINE__
  4450. FMA f75 = f35, f49, f75 // A4 * B2
  4451. nop __LINE__
  4452. }
  4453. ;;
  4454. { .mfb
  4455. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  4456. FMA f83 = f35, f50, f83 // A4 * B3
  4457. nop __LINE__
  4458. }
  4459. { .mfb
  4460. nop __LINE__
  4461. FMA f91 = f35, f51, f91 // A4 * B4
  4462. nop __LINE__
  4463. }
  4464. ;;
  4465. { .mfb
  4466. nop __LINE__
  4467. FMA f68 = f36, f48, f68 // A5 * B1
  4468. nop __LINE__
  4469. }
  4470. { .mfb
  4471. nop __LINE__
  4472. FMA f76 = f36, f49, f76 // A5 * B2
  4473. nop __LINE__
  4474. }
  4475. ;;
  4476. { .mfb
  4477. nop __LINE__
  4478. FMA f84 = f36, f50, f84 // A5 * B3
  4479. nop __LINE__
  4480. }
  4481. { .mfb
  4482. nop __LINE__
  4483. FMA f92 = f36, f51, f92 // A5 * B4
  4484. nop __LINE__
  4485. }
  4486. ;;
  4487. { .mfb
  4488. nop __LINE__
  4489. FMA f69 = f37, f48, f69 // A6 * B1
  4490. nop __LINE__
  4491. }
  4492. { .mfb
  4493. nop __LINE__
  4494. FMA f77 = f37, f49, f77 // A6 * B2
  4495. nop __LINE__
  4496. }
  4497. ;;
  4498. { .mfb
  4499. nop __LINE__
  4500. FMA f85 = f37, f50, f85 // A6 * B3
  4501. nop __LINE__
  4502. }
  4503. { .mfb
  4504. nop __LINE__
  4505. FMA f93 = f37, f51, f93 // A6 * B4
  4506. nop __LINE__
  4507. }
  4508. ;;
  4509. { .mfb
  4510. nop __LINE__
  4511. FMA f70 = f38, f48, f70 // A7 * B1
  4512. nop __LINE__
  4513. }
  4514. { .mfb
  4515. nop __LINE__
  4516. FMA f78 = f38, f49, f78 // A7 * B2
  4517. nop __LINE__
  4518. }
  4519. ;;
  4520. { .mfb
  4521. nop __LINE__
  4522. FMA f86 = f38, f50, f86 // A7 * B3
  4523. nop __LINE__
  4524. }
  4525. { .mfb
  4526. nop __LINE__
  4527. FMA f94 = f38, f51, f94 // A7 * B4
  4528. nop __LINE__
  4529. }
  4530. ;;
  // From here the first-half operands are reloaded (under p4) for the next
  // pass as soon as their last use in this pass has been issued.
  4531. { .mfb
  4532. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4533. FMA f71 = f39, f48, f71 // A8 * B1
  4534. nop __LINE__
  4535. }
  4536. { .mfb
  4537. nop __LINE__
  4538. FMA f79 = f39, f49, f79 // A8 * B2
  4539. nop __LINE__
  4540. }
  4541. ;;
  4542. { .mfb
  4543. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4544. FMA f87 = f39, f50, f87 // A8 * B3
  4545. nop __LINE__
  4546. }
  4547. { .mfb
  4548. nop __LINE__
  4549. FMA f95 = f39, f51, f95 // A8 * B4
  4550. nop __LINE__
  4551. }
  4552. ;;
  // Second half of the pass: predicated on p3.
  4553. { .mfb
  4554. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4555. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4556. nop __LINE__
  4557. }
  4558. { .mfb
  4559. nop __LINE__
  4560. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  4561. nop __LINE__
  4562. }
  4563. ;;
  4564. { .mfb
  4565. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4566. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4567. nop __LINE__
  4568. }
  4569. { .mfb
  4570. nop __LINE__
  4571. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  4572. nop __LINE__
  4573. }
  4574. ;;
  4575. { .mfb
  4576. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4577. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4578. nop __LINE__
  4579. }
  4580. { .mfb
  4581. nop __LINE__
  4582. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  4583. nop __LINE__
  4584. }
  4585. ;;
  4586. { .mfb
  4587. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4588. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4589. nop __LINE__
  4590. }
  4591. { .mfb
  4592. nop __LINE__
  4593. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  4594. nop __LINE__
  4595. }
  4596. ;;
  // Last pass only (p5), plain GEMM without BETAZERO: fetch the current C
  // values. C1/C9 walk four elements each and are then stepped back by 3.
  4597. { .mfb
  4598. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4599. (p5) LDFD f96 = [C1 ], SIZE
  4600. #else
  4601. nop __LINE__
  4602. #endif
  4603. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  4604. nop __LINE__
  4605. }
  4606. { .mfb
  4607. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4608. (p5) LDFD f97 = [C9 ], SIZE
  4609. #else
  4610. nop __LINE__
  4611. #endif
  4612. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  4613. nop __LINE__
  4614. }
  4615. ;;
  4616. { .mfb
  4617. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4618. (p5) LDFD f98 = [C1 ], SIZE
  4619. #else
  4620. nop __LINE__
  4621. #endif
  4622. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  4623. nop __LINE__
  4624. }
  4625. { .mfb
  4626. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4627. (p5) LDFD f99 = [C9 ], SIZE
  4628. #else
  4629. nop __LINE__
  4630. #endif
  4631. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  4632. nop __LINE__
  4633. }
  4634. ;;
  4635. { .mfb
  4636. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4637. (p5) LDFD f100 = [C1 ], SIZE
  4638. #else
  4639. nop __LINE__
  4640. #endif
  4641. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  4642. nop __LINE__
  4643. }
  4644. { .mfb
  4645. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4646. (p5) LDFD f101 = [C9 ], SIZE
  4647. #else
  4648. nop __LINE__
  4649. #endif
  4650. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  4651. nop __LINE__
  4652. }
  4653. ;;
  4654. { .mfb
  4655. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4656. (p5) LDFD f102 = [C1 ], -3 * SIZE
  4657. #else
  4658. nop __LINE__
  4659. #endif
  4660. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  4661. nop __LINE__
  4662. }
  4663. { .mfb
  4664. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4665. (p5) LDFD f103 = [C9 ], -3 * SIZE
  4666. #else
  4667. nop __LINE__
  4668. #endif
  4669. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  4670. nop __LINE__
  4671. }
  4672. ;;
  // Same pattern for C2/C10 into f104..f111.
  4673. { .mfb
  4674. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4675. (p5) LDFD f104 = [C2 ], SIZE
  4676. #else
  4677. nop __LINE__
  4678. #endif
  4679. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  4680. nop __LINE__
  4681. }
  4682. { .mfb
  4683. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4684. (p5) LDFD f105 = [C10], SIZE
  4685. #else
  4686. nop __LINE__
  4687. #endif
  4688. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  4689. nop __LINE__
  4690. }
  4691. ;;
  4692. { .mfb
  4693. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4694. (p5) LDFD f106 = [C2 ], SIZE
  4695. #else
  4696. nop __LINE__
  4697. #endif
  4698. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  4699. nop __LINE__
  4700. }
  4701. { .mfb
  4702. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4703. (p5) LDFD f107 = [C10], SIZE
  4704. #else
  4705. nop __LINE__
  4706. #endif
  4707. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  4708. nop __LINE__
  4709. }
  4710. ;;
  4711. { .mfb
  4712. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4713. (p5) LDFD f108 = [C2 ], SIZE
  4714. #else
  4715. nop __LINE__
  4716. #endif
  4717. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  4718. nop __LINE__
  4719. }
  4720. { .mfb
  4721. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4722. (p5) LDFD f109 = [C10], SIZE
  4723. #else
  4724. nop __LINE__
  4725. #endif
  4726. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  4727. nop __LINE__
  4728. }
  4729. ;;
  4730. { .mfb
  4731. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4732. (p5) LDFD f110 = [C2 ], -3 * SIZE
  4733. #else
  4734. nop __LINE__
  4735. #endif
  4736. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  4737. nop __LINE__
  4738. }
  4739. { .mfb
  4740. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4741. (p5) LDFD f111 = [C10], -3 * SIZE
  4742. #else
  4743. nop __LINE__
  4744. #endif
  4745. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  4746. nop __LINE__
  4747. }
  4748. ;;
  // Same pattern for C3/C11 into f112..f119. The C4/C12 values are loaded
  // after the loop, at .L058.
  4749. { .mfb
  4750. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4751. (p5) LDFD f112 = [C3 ], SIZE
  4752. #else
  4753. nop __LINE__
  4754. #endif
  4755. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  4756. nop __LINE__
  4757. }
  4758. { .mfb
  4759. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4760. (p5) LDFD f113 = [C11], SIZE
  4761. #else
  4762. nop __LINE__
  4763. #endif
  4764. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  4765. nop __LINE__
  4766. }
  4767. ;;
  4768. { .mfb
  4769. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4770. (p5) LDFD f114 = [C3 ], SIZE
  4771. #else
  4772. nop __LINE__
  4773. #endif
  4774. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  4775. nop __LINE__
  4776. }
  4777. { .mfb
  4778. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4779. (p5) LDFD f115 = [C11], SIZE
  4780. #else
  4781. nop __LINE__
  4782. #endif
  4783. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  4784. nop __LINE__
  4785. }
  4786. ;;
  4787. { .mfb
  4788. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4789. (p5) LDFD f116 = [C3 ], SIZE
  4790. #else
  4791. nop __LINE__
  4792. #endif
  4793. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  4794. nop __LINE__
  4795. }
  4796. { .mfb
  4797. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4798. (p5) LDFD f117 = [C11], SIZE
  4799. #else
  4800. nop __LINE__
  4801. #endif
  4802. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  4803. nop __LINE__
  4804. }
  4805. ;;
  4806. { .mfi
  4807. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4808. (p5) LDFD f118 = [C3 ], -3 * SIZE
  4809. #else
  4810. nop __LINE__
  4811. #endif
  4812. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  // Software copy of the remaining-pass count used by the predicates above.
  4813. adds L = -1, L
  4814. }
  4815. { .mfb
  4816. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4817. (p5) LDFD f119 = [C11], -3 * SIZE
  4818. #else
  4819. nop __LINE__
  4820. #endif
  4821. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  4822. br.cloop.sptk.few .L053
  4823. }
  4824. ;;
  4825. .align 32
  // .L058: write-back of the finished 8x4 tile.
  //  Plain GEMM without BETAZERO: each accumulator becomes
  //    ALPHA * acc + (value previously read from C); the C4/C12 values are
  //    loaded here into f120..f127, the others were loaded in the last pass
  //    of .L053.
  //  Otherwise (TRMM or BETAZERO): each accumulator is just scaled by ALPHA.
  //  Rows 1-4 of each column go through C1..C4, rows 5-8 through C9..C12;
  //  the last store on each pointer steps by 5 elements, so C1..C4 end up
  //  8 elements past where they started.
  //  Loop control: p6 = (I != 1), then I is decremented; f64/f65, f72/f73,
  //  f80/f81, f88/f89 are cleared again and, if p6, control returns to .L052
  //  for the next 8-row tile.
  4826. .L058:
  4827. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  4828. { .mfi
  4829. LDFD f120 = [C4 ], SIZE
  4830. FMA f64 = ALPHA, f64, f96
  4831. cmp.ne p6, p0 = 1, I
  4832. }
  4833. { .mfb
  4834. LDFD f121 = [C12], SIZE
  4835. FMA f68 = ALPHA, f68, f97
  4836. nop __LINE__
  4837. }
  4838. ;;
  4839. { .mfi
  4840. LDFD f122 = [C4 ], SIZE
  4841. FMA f65 = ALPHA, f65, f98
  4842. adds I = -1, I
  4843. }
  4844. { .mfb
  4845. LDFD f123 = [C12], SIZE
  4846. FMA f69 = ALPHA, f69, f99
  4847. nop __LINE__
  4848. }
  4849. ;;
  4850. { .mfb
  4851. LDFD f124 = [C4 ], SIZE
  4852. FMA f66 = ALPHA, f66, f100
  4853. nop __LINE__
  4854. }
  4855. { .mfb
  4856. LDFD f125 = [C12], SIZE
  4857. FMA f70 = ALPHA, f70, f101
  4858. nop __LINE__
  4859. }
  4860. ;;
  // Final C4/C12 loads step back by 3 so the pointers return to their start.
  4861. { .mfb
  4862. LDFD f126 = [C4 ], -3 * SIZE
  4863. FMA f67 = ALPHA, f67, f102
  4864. nop __LINE__
  4865. }
  4866. { .mfb
  4867. LDFD f127 = [C12], -3 * SIZE
  4868. FMA f71 = ALPHA, f71, f103
  4869. nop __LINE__
  4870. }
  4871. ;;
  // Column 1 stores, overlapped with the column 2 updates.
  4872. { .mfb
  4873. STFD [C1 ] = f64, SIZE
  4874. FMA f72 = ALPHA, f72, f104
  4875. nop __LINE__
  4876. }
  4877. { .mfb
  4878. STFD [C9 ] = f68, SIZE
  4879. FMA f76 = ALPHA, f76, f105
  4880. nop __LINE__
  4881. }
  4882. ;;
  4883. { .mfb
  4884. STFD [C1 ] = f65, SIZE
  4885. FMA f73 = ALPHA, f73, f106
  4886. nop __LINE__
  4887. }
  4888. { .mfb
  4889. STFD [C9 ] = f69, SIZE
  4890. FMA f77 = ALPHA, f77, f107
  4891. nop __LINE__
  4892. }
  4893. ;;
  4894. { .mfb
  4895. STFD [C1 ] = f66, SIZE
  4896. FMA f74 = ALPHA, f74, f108
  4897. nop __LINE__
  4898. }
  4899. { .mfb
  4900. STFD [C9 ] = f70, SIZE
  4901. FMA f78 = ALPHA, f78, f109
  4902. nop __LINE__
  4903. }
  4904. ;;
  4905. { .mfb
  4906. STFD [C1 ] = f67, 5 * SIZE
  4907. FMA f75 = ALPHA, f75, f110
  4908. nop __LINE__
  4909. }
  4910. { .mfb
  4911. STFD [C9 ] = f71, 5 * SIZE
  4912. FMA f79 = ALPHA, f79, f111
  4913. nop __LINE__
  4914. }
  4915. ;;
  // Column 2 stores, overlapped with the column 3 updates.
  4916. { .mfb
  4917. STFD [C2 ] = f72, SIZE
  4918. FMA f80 = ALPHA, f80, f112
  4919. nop __LINE__
  4920. }
  4921. { .mfb
  4922. STFD [C10] = f76, SIZE
  4923. FMA f84 = ALPHA, f84, f113
  4924. nop __LINE__
  4925. }
  4926. ;;
  4927. { .mfb
  4928. STFD [C2 ] = f73, SIZE
  4929. FMA f81 = ALPHA, f81, f114
  4930. nop __LINE__
  4931. }
  4932. { .mfb
  4933. STFD [C10] = f77, SIZE
  4934. FMA f85 = ALPHA, f85, f115
  4935. nop __LINE__
  4936. }
  4937. ;;
  4938. { .mfb
  4939. STFD [C2 ] = f74, SIZE
  4940. FMA f82 = ALPHA, f82, f116
  4941. nop __LINE__
  4942. }
  4943. { .mfb
  4944. STFD [C10] = f78, SIZE
  4945. FMA f86 = ALPHA, f86, f117
  4946. nop __LINE__
  4947. }
  4948. ;;
  4949. { .mfb
  4950. STFD [C2 ] = f75, 5 * SIZE
  4951. FMA f83 = ALPHA, f83, f118
  4952. nop __LINE__
  4953. }
  4954. { .mfb
  4955. STFD [C10] = f79, 5 * SIZE
  4956. FMA f87 = ALPHA, f87, f119
  4957. nop __LINE__
  4958. }
  4959. ;;
  // Column 3 stores, overlapped with the column 4 updates.
  4960. { .mfb
  4961. STFD [C3 ] = f80, SIZE
  4962. FMA f88 = ALPHA, f88, f120
  4963. nop __LINE__
  4964. }
  4965. { .mfb
  4966. STFD [C11] = f84, SIZE
  4967. FMA f92 = ALPHA, f92, f121
  4968. nop __LINE__
  4969. }
  4970. ;;
  4971. { .mfb
  4972. STFD [C3 ] = f81, SIZE
  4973. FMA f89 = ALPHA, f89, f122
  4974. nop __LINE__
  4975. }
  4976. { .mfb
  4977. STFD [C11] = f85, SIZE
  4978. FMA f93 = ALPHA, f93, f123
  4979. nop __LINE__
  4980. }
  4981. ;;
  4982. { .mfb
  4983. STFD [C3 ] = f82, SIZE
  4984. FMA f90 = ALPHA, f90, f124
  4985. nop __LINE__
  4986. }
  4987. { .mfb
  4988. STFD [C11] = f86, SIZE
  4989. FMA f94 = ALPHA, f94, f125
  4990. nop __LINE__
  4991. }
  4992. ;;
  4993. { .mfb
  4994. STFD [C3 ] = f83, 5 * SIZE
  4995. FMA f91 = ALPHA, f91, f126
  4996. nop __LINE__
  4997. }
  4998. { .mfb
  4999. STFD [C11] = f87, 5 * SIZE
  5000. FMA f95 = ALPHA, f95, f127
  5001. nop __LINE__
  5002. }
  5003. ;;
  // Column 4 stores, overlapped with clearing accumulators for the next tile.
  5004. { .mfb
  5005. STFD [C4 ] = f88, SIZE
  5006. mov f64 = f0
  5007. nop __LINE__
  5008. }
  5009. { .mfb
  5010. STFD [C12] = f92, SIZE
  5011. mov f72 = f0
  5012. nop __LINE__
  5013. }
  5014. ;;
  5015. { .mfb
  5016. STFD [C4 ] = f89, SIZE
  5017. mov f80 = f0
  5018. nop __LINE__
  5019. }
  5020. { .mfb
  5021. STFD [C12] = f93, SIZE
  5022. mov f88 = f0
  5023. nop __LINE__
  5024. }
  5025. ;;
  5026. { .mfb
  5027. STFD [C4 ] = f90, SIZE
  5028. mov f65 = f0
  5029. nop __LINE__
  5030. }
  5031. { .mfb
  5032. STFD [C12] = f94, SIZE
  5033. mov f73 = f0
  5034. nop __LINE__
  5035. }
  5036. ;;
  5037. { .mfb
  5038. STFD [C4 ] = f91, 5 * SIZE
  5039. mov f81 = f0
  5040. nop __LINE__
  5041. }
  5042. { .mfb
  5043. STFD [C12] = f95, 5 * SIZE
  5044. mov f89 = f0
  5045. (p6) br.cond.dptk .L052
  5046. }
  5047. ;;
  5048. #else
  // TRMM or BETAZERO build: no read of C, results are ALPHA * acc.
  5049. { .mfi
  5050. nop __LINE__
  5051. FMPY f64 = ALPHA, f64
  5052. cmp.ne p6, p0 = 1, I
  5053. }
  5054. { .mfb
  5055. nop __LINE__
  5056. FMPY f68 = ALPHA, f68
  5057. nop __LINE__
  5058. }
  5059. ;;
  5060. { .mfi
  5061. nop __LINE__
  5062. FMPY f65 = ALPHA, f65
  5063. adds I = -1, I
  5064. }
  5065. { .mfb
  5066. nop __LINE__
  5067. FMPY f69 = ALPHA, f69
  5068. nop __LINE__
  5069. }
  5070. ;;
  5071. { .mfb
  5072. nop __LINE__
  5073. FMPY f66 = ALPHA, f66
  5074. nop __LINE__
  5075. }
  5076. { .mfb
  5077. nop __LINE__
  5078. FMPY f70 = ALPHA, f70
  5079. nop __LINE__
  5080. }
  5081. ;;
  5082. { .mfb
  5083. nop __LINE__
  5084. FMPY f67 = ALPHA, f67
  5085. nop __LINE__
  5086. }
  5087. { .mfb
  5088. nop __LINE__
  5089. FMPY f71 = ALPHA, f71
  5090. nop __LINE__
  5091. }
  5092. ;;
  5093. { .mfb
  5094. STFD [C1 ] = f64, SIZE
  5095. FMPY f72 = ALPHA, f72
  5096. nop __LINE__
  5097. }
  5098. { .mfb
  5099. STFD [C9 ] = f68, SIZE
  5100. FMPY f76 = ALPHA, f76
  5101. nop __LINE__
  5102. }
  5103. ;;
  5104. { .mfb
  5105. STFD [C1 ] = f65, SIZE
  5106. FMPY f73 = ALPHA, f73
  5107. nop __LINE__
  5108. }
  5109. { .mfb
  5110. STFD [C9 ] = f69, SIZE
  5111. FMPY f77 = ALPHA, f77
  5112. nop __LINE__
  5113. }
  5114. ;;
  5115. { .mfb
  5116. STFD [C1 ] = f66, SIZE
  5117. FMPY f74 = ALPHA, f74
  5118. nop __LINE__
  5119. }
  5120. { .mfb
  5121. STFD [C9 ] = f70, SIZE
  5122. FMPY f78 = ALPHA, f78
  5123. nop __LINE__
  5124. }
  5125. ;;
  5126. { .mfb
  5127. STFD [C1 ] = f67, 5 * SIZE
  5128. FMPY f75 = ALPHA, f75
  5129. nop __LINE__
  5130. }
  5131. { .mfb
  5132. STFD [C9 ] = f71, 5 * SIZE
  5133. FMPY f79 = ALPHA, f79
  5134. nop __LINE__
  5135. }
  5136. ;;
  5137. { .mfb
  5138. STFD [C2 ] = f72, SIZE
  5139. FMPY f80 = ALPHA, f80
  5140. nop __LINE__
  5141. }
  5142. { .mfb
  5143. STFD [C10] = f76, SIZE
  5144. FMPY f84 = ALPHA, f84
  5145. nop __LINE__
  5146. }
  5147. ;;
  5148. { .mfb
  5149. STFD [C2 ] = f73, SIZE
  5150. FMPY f81 = ALPHA, f81
  5151. nop __LINE__
  5152. }
  5153. { .mfb
  5154. STFD [C10] = f77, SIZE
  5155. FMPY f85 = ALPHA, f85
  5156. nop __LINE__
  5157. }
  5158. ;;
  5159. { .mfb
  5160. STFD [C2 ] = f74, SIZE
  5161. FMPY f82 = ALPHA, f82
  5162. nop __LINE__
  5163. }
  5164. { .mfb
  5165. STFD [C10] = f78, SIZE
  5166. FMPY f86 = ALPHA, f86
  5167. nop __LINE__
  5168. }
  5169. ;;
  5170. { .mfb
  5171. STFD [C2 ] = f75, 5 * SIZE
  5172. FMPY f83 = ALPHA, f83
  5173. nop __LINE__
  5174. }
  5175. { .mfb
  5176. STFD [C10] = f79, 5 * SIZE
  5177. FMPY f87 = ALPHA, f87
  5178. nop __LINE__
  5179. }
  5180. ;;
  5181. { .mfb
  5182. STFD [C3 ] = f80, SIZE
  5183. FMPY f88 = ALPHA, f88
  5184. nop __LINE__
  5185. }
  5186. { .mfb
  5187. STFD [C11] = f84, SIZE
  5188. FMPY f92 = ALPHA, f92
  5189. nop __LINE__
  5190. }
  5191. ;;
  5192. { .mfb
  5193. STFD [C3 ] = f81, SIZE
  5194. FMPY f89 = ALPHA, f89
  5195. nop __LINE__
  5196. }
  5197. { .mfb
  5198. STFD [C11] = f85, SIZE
  5199. FMPY f93 = ALPHA, f93
  5200. nop __LINE__
  5201. }
  5202. ;;
  // TRMM pointer bookkeeping is folded into the otherwise idle integer slots.
  // For (LEFT && TRANSA) or (!LEFT && !TRANSA): L = K - KK, minus 8 (LEFT)
  // or minus 4 (not LEFT); KK8 = L << BASE_SHIFT; AOFFSET += KK8 * 8 and
  // BOFFSET += KK8 * 4.
  5203. { .mfi
  5204. STFD [C3 ] = f82, SIZE
  5205. FMPY f90 = ALPHA, f90
  5206. #if defined(TRMMKERNEL) && \
  5207. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5208. sub L = K, KK
  5209. #else
  5210. nop __LINE__
  5211. #endif
  5212. }
  5213. { .mfb
  5214. STFD [C11] = f86, SIZE
  5215. FMPY f94 = ALPHA, f94
  5216. nop __LINE__
  5217. }
  5218. ;;
  5219. { .mfi
  5220. STFD [C3 ] = f83, 5 * SIZE
  5221. FMPY f91 = ALPHA, f91
  5222. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5223. adds L = -8, L
  5224. #else
  5225. nop __LINE__
  5226. #endif
  5227. }
  5228. { .mfi
  5229. STFD [C11] = f87, 5 * SIZE
  5230. FMPY f95 = ALPHA, f95
  5231. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5232. adds L = -4, L
  5233. #else
  5234. nop __LINE__
  5235. #endif
  5236. }
  5237. ;;
  5238. { .mfi
  5239. STFD [C4 ] = f88, SIZE
  5240. mov f64 = f0
  5241. #if defined(TRMMKERNEL) && \
  5242. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5243. shladd KK8 = L, BASE_SHIFT, r0
  5244. #else
  5245. nop __LINE__
  5246. #endif
  5247. }
  5248. { .mfb
  5249. STFD [C12] = f92, SIZE
  5250. mov f72 = f0
  5251. nop __LINE__
  5252. }
  5253. ;;
  5254. { .mfi
  5255. STFD [C4 ] = f89, SIZE
  5256. mov f80 = f0
  5257. #if defined(TRMMKERNEL) && \
  5258. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5259. shladd AOFFSET = KK8, 3, AOFFSET
  5260. #else
  5261. nop __LINE__
  5262. #endif
  5263. }
  5264. { .mfi
  5265. STFD [C12] = f93, SIZE
  5266. mov f88 = f0
  5267. #if defined(TRMMKERNEL) && \
  5268. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5269. shladd BOFFSET = KK8, 2, BOFFSET
  5270. #else
  5271. nop __LINE__
  5272. #endif
  5273. }
  5274. ;;
  // TRMM with LEFT: KK advances by the 8 rows just written; KK8 is then
  // refreshed as KK << BASE_SHIFT for every TRMM variant.
  5275. { .mfi
  5276. STFD [C4 ] = f90, SIZE
  5277. mov f65 = f0
  5278. #if defined(TRMMKERNEL) && defined(LEFT)
  5279. adds KK = 8, KK
  5280. #else
  5281. nop __LINE__
  5282. #endif
  5283. }
  5284. { .mfb
  5285. STFD [C12] = f94, SIZE
  5286. mov f73 = f0
  5287. nop __LINE__
  5288. }
  5289. ;;
  5290. { .mfi
  5291. STFD [C4 ] = f91, 5 * SIZE
  5292. mov f81 = f0
  5293. #ifdef TRMMKERNEL
  5294. shladd KK8 = KK, BASE_SHIFT, r0
  5295. #else
  5296. nop __LINE__
  5297. #endif
  5298. }
  5299. { .mfb
  5300. STFD [C12] = f95, 5 * SIZE
  5301. mov f89 = f0
  5302. (p6) br.cond.dptk .L052
  5303. }
  5304. ;;
  5305. #endif
  5306. .align 32
  // .L060: setup for a 4-row by 4-column tile, taken when bit 2 of M is set;
  // otherwise control jumps to .L070.
  //  - clears f66/f67, f74/f75, f82/f83, f90/f91 (f64/f65, f72/f73, f80/f81,
  //    f88/f89 are cleared by the code that runs before this label);
  //  - positions BOFFSET/AOFFSET and preloads B into f48..f51 and A into
  //    f32..f35;
  //  - derives the loop count exactly as for the 8x4 tile:
  //    p12 = (bit 0 of length+1 is zero), ar.lc = ((length + 1) >> 1) - 1,
  //    p3 = true.
  5307. .L060:
  5308. { .mfi
  5309. nop __LINE__
  5310. mov f66 = f0
  // p6 = (bit 2 of M is zero), p7 = its complement
  5311. tbit.z p6, p7 = M, 2
  5312. }
  5313. { .mfb
  // NOTE(review): the #elif defined(LEFT) arm and the #else arm below are
  // identical (both KK + 4), so the #elif is redundant here.
  5314. #ifndef TRMMKERNEL
  5315. nop __LINE__
  5316. #else
  5317. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5318. sub L = K, KK
  5319. #elif defined(LEFT)
  5320. adds L = 4, KK
  5321. #else
  5322. adds L = 4, KK
  5323. #endif
  5324. #endif
  5325. mov f74 = f0
  5326. (p6) br.cond.dptk .L070
  5327. }
  5328. ;;
  5329. #if !defined(TRMMKERNEL) || \
  5330. defined(TRMMKERNEL) && \
  5331. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Start of the panel: first B pair comes straight from B.
  5332. { .mfb
  5333. LDFPD f48, f49 = [B]
  5334. mov f82 = f0
  5335. nop __LINE__
  5336. }
  5337. { .mfi
  5338. adds BOFFSET = 2 * SIZE, B
  5339. mov f90 = f0
  5340. #ifndef TRMMKERNEL
  5341. adds L = 1, K
  5342. #else
  5343. adds L = 1, L
  5344. #endif
  5345. }
  5346. ;;
  5347. #else
  // Other TRMM variants: skip ahead by KK8 scaled by 4 in both B and A.
  5348. { .mfi
  5349. shladd BOFFSET = KK8, 2, B
  5350. mov f82 = f0
  5351. shladd AOFFSET = KK8, 2, AOFFSET
  5352. }
  5353. ;;
  5354. { .mfi
  5355. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5356. mov f90 = f0
  5357. #ifndef TRMMKERNEL
  5358. adds L = 1, K
  5359. #else
  5360. adds L = 1, L
  5361. #endif
  5362. }
  5363. ;;
  5364. #endif
  5365. ;;
  5366. { .mii
  5367. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5368. tbit.z p12, p0 = L, 0
  5369. shr L = L, 1
  5370. }
  5371. ;;
  5372. { .mfi
  5373. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5374. mov f67 = f0
  5375. adds L = -1, L
  5376. }
  5377. { .mfi
  5378. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5379. mov f75 = f0
  5380. nop __LINE__
  5381. }
  5382. ;;
  5383. { .mfi
  5384. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5385. mov f83 = f0
  // Loop counter for br.cloop in .L062.
  5386. mov ar.lc = L
  5387. }
  5388. { .mfi
  5389. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5390. mov f91 = f0
  // r0 == r0 always holds, so this sets p3 unconditionally.
  5391. cmp.eq p3, p0 = r0, r0
  5392. }
  5393. ;;
  5394. .align 32
  // .L062: inner loop of the 4x4 tile. Each pass covers up to two k-steps.
  //  First half (always): f32..f35 (A1..A4) times f48..f51 (B1..B4) are
  //    accumulated into f64..f67 (B1), f72..f75 (B2), f80..f83 (B3),
  //    f88..f91 (B4).
  //  Second half (under p3): same accumulators, operands f40..f43, f56..f59.
  //  Predicates:
  //    p4 = (L != 0): reload the first-half operands for the next pass.
  //    p5 = (L == 0): last pass; C9..C12 are set to C1..C4 plus 2 elements
  //         and, for plain GEMM without BETAZERO, the current C values are
  //         loaded into f68..f71, f76..f79, f84..f87, f92..f95 (each pointer
  //         is stepped back to where it started).
  //    p3 : cleared only when p12 is set and L == 0, which drops the second
  //         half of the final pass.
  5395. .L062:
  5396. { .mfi
  5397. lfetch.nt1 [PREA], 8 * SIZE
  5398. FMA f64 = f32, f48, f64 // A1 * B1
  5399. cmp.ne p4, p5 = 0, L
  5400. }
  5401. { .mfi
  5402. nop __LINE__
  5403. FMA f72 = f32, f49, f72 // A1 * B2
  5404. (p12) cmp.ne p3, p0 = 0, L
  5405. }
  5406. ;;
  5407. { .mfi
  5408. lfetch.nt1 [PREB], 8 * SIZE
  5409. FMA f80 = f32, f50, f80 // A1 * B3
  5410. (p5) adds C9 = 2 * SIZE, C1
  5411. }
  5412. { .mfi
  5413. nop __LINE__
  5414. FMA f88 = f32, f51, f88 // A1 * B4
  5415. (p5) adds C10 = 2 * SIZE, C2
  5416. }
  5417. ;;
  5418. { .mfi
  5419. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5420. FMA f65 = f33, f48, f65 // A2 * B1
  5421. (p5) adds C11 = 2 * SIZE, C3
  5422. }
  5423. { .mfi
  5424. nop __LINE__
  5425. FMA f73 = f33, f49, f73 // A2 * B2
  5426. (p5) adds C12 = 2 * SIZE, C4
  5427. }
  5428. ;;
  5429. { .mfb
  5430. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5431. FMA f81 = f33, f50, f81 // A2 * B3
  5432. nop __LINE__
  5433. }
  5434. { .mfb
  5435. nop __LINE__
  5436. FMA f89 = f33, f51, f89 // A2 * B4
  5437. nop __LINE__
  5438. }
  5439. ;;
  5440. { .mfb
  5441. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5442. FMA f66 = f34, f48, f66 // A3 * B1
  5443. nop __LINE__
  5444. }
  5445. { .mfb
  5446. nop __LINE__
  5447. FMA f74 = f34, f49, f74 // A3 * B2
  5448. nop __LINE__
  5449. }
  5450. ;;
  5451. { .mfb
  5452. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5453. FMA f82 = f34, f50, f82 // A3 * B3
  5454. nop __LINE__
  5455. }
  5456. { .mfb
  5457. nop __LINE__
  5458. FMA f90 = f34, f51, f90 // A3 * B4
  5459. nop __LINE__
  5460. }
  5461. ;;
  // NOTE(review): the next .mfb bundle lists only two instructions, while
  // every sibling fills the third slot with nop __LINE__. It is also followed
  // by three more bundles with no stop before the next ;; -- confirm the
  // assembler pads the slot and that this grouping is intended.
  5462. { .mfb
  5463. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5464. FMA f67 = f35, f48, f67 // A4 * B1
  5465. }
  5466. { .mfb
  5467. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5468. (p5) LDFD f68 = [C1 ], SIZE
  5469. #else
  5470. nop __LINE__
  5471. #endif
  5472. FMA f75 = f35, f49, f75 // A4 * B2
  5473. nop __LINE__
  5474. }
  5475. { .mfb
  5476. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5477. FMA f83 = f35, f50, f83 // A4 * B3
  5478. nop __LINE__
  5479. }
  5480. { .mfb
  5481. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5482. (p5) LDFD f70 = [C9 ], SIZE
  5483. #else
  5484. nop __LINE__
  5485. #endif
  5486. FMA f91 = f35, f51, f91 // A4 * B4
  5487. nop __LINE__
  5488. }
  5489. ;;
  // Second half of the pass: predicated on p3.
  5490. { .mfb
  5491. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5492. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5493. nop __LINE__
  5494. }
  5495. { .mfb
  5496. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5497. (p5) LDFD f69 = [C1 ], -1 * SIZE
  5498. #else
  5499. nop __LINE__
  5500. #endif
  5501. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5502. nop __LINE__
  5503. }
  5504. ;;
  5505. { .mfb
  5506. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5507. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5508. nop __LINE__
  5509. }
  5510. { .mfb
  5511. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5512. (p5) LDFD f71 = [C9 ], -1 * SIZE
  5513. #else
  5514. nop __LINE__
  5515. #endif
  5516. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  5517. nop __LINE__
  5518. }
  5519. ;;
  5520. { .mfb
  5521. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5522. (p5) LDFD f76 = [C2 ], SIZE
  5523. #else
  5524. nop __LINE__
  5525. #endif
  5526. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5527. nop __LINE__
  5528. }
  5529. { .mfb
  5530. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5531. (p5) LDFD f78 = [C10], SIZE
  5532. #else
  5533. nop __LINE__
  5534. #endif
  5535. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5536. nop __LINE__
  5537. }
  5538. ;;
  5539. { .mfb
  5540. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5541. (p5) LDFD f77 = [C2 ], -1 * SIZE
  5542. #else
  5543. nop __LINE__
  5544. #endif
  5545. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  5546. nop __LINE__
  5547. }
  5548. { .mfb
  5549. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5550. (p5) LDFD f79 = [C10], -1 * SIZE
  5551. #else
  5552. nop __LINE__
  5553. #endif
  5554. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  5555. nop __LINE__
  5556. }
  5557. ;;
  5558. { .mfb
  5559. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5560. (p5) LDFD f84 = [C3 ], SIZE
  5561. #else
  5562. nop __LINE__
  5563. #endif
  5564. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  5565. nop __LINE__
  5566. }
  5567. { .mfb
  5568. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5569. (p5) LDFD f86 = [C11], SIZE
  5570. #else
  5571. nop __LINE__
  5572. #endif
  5573. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  5574. nop __LINE__
  5575. }
  5576. ;;
  5577. { .mfb
  5578. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5579. (p5) LDFD f85 = [C3 ], -1 * SIZE
  5580. #else
  5581. nop __LINE__
  5582. #endif
  5583. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  5584. nop __LINE__
  5585. }
  5586. { .mfb
  5587. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5588. (p5) LDFD f87 = [C11], -1 * SIZE
  5589. #else
  5590. nop __LINE__
  5591. #endif
  5592. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  5593. nop __LINE__
  5594. }
  5595. ;;
  5596. { .mfb
  5597. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5598. (p5) LDFD f92 = [C4 ], SIZE
  5599. #else
  5600. nop __LINE__
  5601. #endif
  5602. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  5603. nop __LINE__
  5604. }
  5605. { .mfb
  5606. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5607. (p5) LDFD f94 = [C12], SIZE
  5608. #else
  5609. nop __LINE__
  5610. #endif
  5611. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  5612. nop __LINE__
  5613. }
  5614. ;;
  5615. { .mfi
  5616. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5617. (p5) LDFD f93 = [C4 ], -1 * SIZE
  5618. #else
  5619. nop __LINE__
  5620. #endif
  5621. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  // Software copy of the remaining-pass count used by the predicates above.
  5622. adds L = -1, L
  5623. }
  5624. { .mfb
  5625. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5626. (p5) LDFD f95 = [C12], -1 * SIZE
  5627. #else
  5628. nop __LINE__
  5629. #endif
  5630. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  5631. br.cloop.sptk.few .L062
  5632. }
  5633. ;;
  5634. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5635. FMA f64 = ALPHA, f64, f68
  5636. FMA f66 = ALPHA, f66, f70
  5637. FMA f65 = ALPHA, f65, f69
  5638. FMA f67 = ALPHA, f67, f71
  5639. FMA f72 = ALPHA, f72, f76
  5640. FMA f74 = ALPHA, f74, f78
  5641. FMA f73 = ALPHA, f73, f77
  5642. FMA f75 = ALPHA, f75, f79
  5643. ;;
  5644. { .mfb
  5645. STFD [C1 ] = f64, SIZE
  5646. FMA f80 = ALPHA, f80, f84
  5647. nop __LINE__
  5648. }
  5649. { .mfb
  5650. STFD [C9 ] = f66, SIZE
  5651. FMA f82 = ALPHA, f82, f86
  5652. nop __LINE__
  5653. }
  5654. ;;
  5655. { .mfb
  5656. STFD [C1 ] = f65, 3 * SIZE
  5657. FMA f81 = ALPHA, f81, f85
  5658. nop __LINE__
  5659. }
  5660. { .mfb
  5661. STFD [C9 ] = f67, 3 * SIZE
  5662. FMA f83 = ALPHA, f83, f87
  5663. nop __LINE__
  5664. }
  5665. ;;
  5666. { .mfb
  5667. STFD [C2 ] = f72, SIZE
  5668. FMA f88 = ALPHA, f88, f92
  5669. nop __LINE__
  5670. }
  5671. { .mfb
  5672. STFD [C10] = f74, SIZE
  5673. FMA f90 = ALPHA, f90, f94
  5674. nop __LINE__
  5675. }
  5676. ;;
  5677. { .mfb
  5678. STFD [C2 ] = f73, 3 * SIZE
  5679. FMA f89 = ALPHA, f89, f93
  5680. nop __LINE__
  5681. }
  5682. { .mfb
  5683. STFD [C10] = f75, 3 * SIZE
  5684. FMA f91 = ALPHA, f91, f95
  5685. nop __LINE__
  5686. }
  5687. ;;
  5688. { .mfb
  5689. STFD [C3 ] = f80, SIZE
  5690. mov f80 = f0
  5691. nop __LINE__
  5692. }
  5693. { .mfb
  5694. STFD [C11] = f82, SIZE
  5695. mov f64 = f0
  5696. nop __LINE__
  5697. }
  5698. ;;
  5699. { .mfb
  5700. STFD [C3 ] = f81, 3 * SIZE
  5701. mov f81 = f0
  5702. nop __LINE__
  5703. }
  5704. { .mfb
  5705. STFD [C11] = f83, 3 * SIZE
  5706. mov f72 = f0
  5707. nop __LINE__
  5708. }
  5709. ;;
  5710. { .mfi
  5711. STFD [C4 ] = f88, SIZE
  5712. mov f88 = f0
  5713. adds L = 1, K
  5714. }
  5715. { .mfb
  5716. STFD [C12] = f90, SIZE
  5717. mov f65 = f0
  5718. nop __LINE__
  5719. }
  5720. ;;
  5721. { .mfi
  5722. STFD [C4 ] = f89, 3 * SIZE
  5723. mov f89 = f0
  5724. shr L = L, 1
  5725. }
  5726. { .mfb
  5727. STFD [C12] = f91, 3 * SIZE
  5728. mov f73 = f0
  5729. nop __LINE__
  5730. }
  5731. ;;
  5732. #else
  5733. FMPY f64 = ALPHA, f64
  5734. FMPY f66 = ALPHA, f66
  5735. FMPY f65 = ALPHA, f65
  5736. FMPY f67 = ALPHA, f67
  5737. FMPY f72 = ALPHA, f72
  5738. FMPY f74 = ALPHA, f74
  5739. FMPY f73 = ALPHA, f73
  5740. FMPY f75 = ALPHA, f75
  5741. ;;
  5742. { .mfb
  5743. STFD [C1 ] = f64, SIZE
  5744. FMPY f80 = ALPHA, f80
  5745. nop __LINE__
  5746. }
  5747. { .mfb
  5748. STFD [C9 ] = f66, SIZE
  5749. FMPY f82 = ALPHA, f82
  5750. nop __LINE__
  5751. }
  5752. ;;
  5753. { .mfb
  5754. STFD [C1 ] = f65, 3 * SIZE
  5755. FMPY f81 = ALPHA, f81
  5756. nop __LINE__
  5757. }
  5758. { .mfb
  5759. STFD [C9 ] = f67, 3 * SIZE
  5760. FMPY f83 = ALPHA, f83
  5761. nop __LINE__
  5762. }
  5763. ;;
  5764. { .mfi
  5765. STFD [C2 ] = f72, SIZE
  5766. FMPY f88 = ALPHA, f88
  5767. #if defined(TRMMKERNEL) && \
  5768. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5769. sub L = K, KK
  5770. #else
  5771. nop __LINE__
  5772. #endif
  5773. }
  5774. { .mfb
  5775. STFD [C10] = f74, SIZE
  5776. FMPY f90 = ALPHA, f90
  5777. nop __LINE__
  5778. }
  5779. ;;
  5780. { .mfi
  5781. STFD [C2 ] = f73, 3 * SIZE
  5782. FMPY f89 = ALPHA, f89
  5783. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  5784. adds L = -4, L
  5785. #else
  5786. nop __LINE__
  5787. #endif
  5788. }
  5789. { .mfi
  5790. STFD [C10] = f75, 3 * SIZE
  5791. FMPY f91 = ALPHA, f91
  5792. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  5793. adds L = -4, L
  5794. #else
  5795. nop __LINE__
  5796. #endif
  5797. }
  5798. ;;
  5799. { .mfi
  5800. STFD [C3 ] = f80, SIZE
  5801. mov f80 = f0
  5802. #if defined(TRMMKERNEL) && \
  5803. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5804. shladd KK8 = L, BASE_SHIFT, r0
  5805. #else
  5806. nop __LINE__
  5807. #endif
  5808. }
  5809. { .mfb
  5810. STFD [C11] = f82, SIZE
  5811. mov f64 = f0
  5812. nop __LINE__
  5813. }
  5814. ;;
  5815. { .mfi
  5816. STFD [C3 ] = f81, 3 * SIZE
  5817. mov f81 = f0
  5818. #if defined(TRMMKERNEL) && \
  5819. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5820. shladd AOFFSET = KK8, 2, AOFFSET
  5821. #else
  5822. nop __LINE__
  5823. #endif
  5824. }
  5825. { .mfi
  5826. STFD [C11] = f83, 3 * SIZE
  5827. mov f72 = f0
  5828. #if defined(TRMMKERNEL) && \
  5829. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  5830. shladd BOFFSET = KK8, 2, BOFFSET
  5831. #else
  5832. nop __LINE__
  5833. #endif
  5834. }
  5835. ;;
  5836. { .mfi
  5837. STFD [C4 ] = f88, SIZE
  5838. mov f88 = f0
  5839. #if defined(TRMMKERNEL) && defined(LEFT)
  5840. adds KK = 4, KK
  5841. #else
  5842. nop __LINE__
  5843. #endif
  5844. }
  5845. { .mfb
  5846. STFD [C12] = f90, SIZE
  5847. mov f65 = f0
  5848. nop __LINE__
  5849. }
  5850. ;;
  5851. { .mfi
  5852. STFD [C4 ] = f89, 3 * SIZE
  5853. mov f89 = f0
  5854. #ifdef TRMMKERNEL
  5855. shladd KK8 = KK, BASE_SHIFT, r0
  5856. #else
  5857. nop __LINE__
  5858. #endif
  5859. }
  5860. { .mfb
  5861. STFD [C12] = f91, 3 * SIZE
  5862. mov f73 = f0
  5863. nop __LINE__
  5864. }
  5865. ;;
  5866. #endif
  // .L070 -- set-up for the tile handled when bit 1 of M is set: two A values
  // against four B values per k-step (see the A1/A2 x B1..B4 FMAs in .L072).
  // Computes the loop count, preloads the first A/B operands and the prefetch
  // pointers, or jumps to .L080 when bit 1 of M is clear.
  5867. .align 32
  5868. .L070:
  5869. { .mib
  5870. #ifndef TRMMKERNEL
  5871. nop __LINE__
  5872. #else
  5873. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  5874. sub L = K, KK // TRMM: L = K - KK
  5875. #elif defined(LEFT)
  5876. adds L = 2, KK // TRMM: L = KK + 2
  5877. #else
  5878. adds L = 4, KK // TRMM: L = KK + 4
  5879. #endif
  5880. #endif
  5881. tbit.z p6,p7 = M, 1 // p6 = (bit 1 of M == 0), p7 = its complement
  5882. (p6) br.cond.dptk .L080 // bit clear: skip this tile
  5883. }
  5884. ;;
  5885. #if !defined(TRMMKERNEL) || \
  5886. defined(TRMMKERNEL) && \
  5887. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // This configuration reads B from its start: preload B1/B2 and point
  // BOFFSET just past them.
  5888. { .mmi
  5889. LDFPD f48, f49 = [B]
  5890. adds BOFFSET = 2 * SIZE, B
  5891. #ifndef TRMMKERNEL
  5892. adds L = 1, K // L = K + 1 (rounded down to pairs below)
  5893. #else
  5894. adds L = 1, L
  5895. #endif
  5896. }
  5897. ;;
  5898. #else
  // Remaining TRMM configurations: start KK8 scaled bytes into the panels
  // (BOFFSET = B + (KK8 << 2), AOFFSET += KK8 << 1) before preloading B1/B2.
  // KK8 is set to KK << BASE_SHIFT elsewhere in this file.
  5899. { .mmi
  5900. shladd BOFFSET = KK8, 2, B
  5901. shladd AOFFSET = KK8, 1, AOFFSET
  5902. nop __LINE__
  5903. }
  5904. ;;
  5905. { .mmi
  5906. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5907. #ifndef TRMMKERNEL
  5908. adds L = 1, K
  5909. #else
  5910. adds L = 1, L
  5911. #endif
  5912. nop __LINE__
  5913. }
  5914. ;;
  5915. #endif
  // The loop at .L072 performs two k-steps per pass. p12 remembers whether the
  // incremented count was even, which is when the second k-step of the final
  // pass must be dropped; L becomes the pass count minus one for ar.lc.
  5916. { .mii
  5917. cmp.eq p3, p0 = r0, r0 // p3 = true: second k-step enabled by default
  5918. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L == 0)
  5919. shr L = L, 1 // L = L >> 1
  5920. }
  5921. ;;
  5922. { .mmi
  5923. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // A1/A2; p7 holds here since the p6 branch fell through
  5924. adds L = -1, L // br.cloop executes ar.lc + 1 passes
  5925. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5926. }
  5927. ;;
  5928. { .mmi
  5929. LDFPD f50, f51 = [BOFFSET], 2 * SIZE // B3/B4
  5930. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5931. mov ar.lc = L // hardware loop counter for br.cloop in .L072
  5932. }
  5933. ;;
  // .L072 -- k-loop and write-back for the 2 x 4 tile.
  // Each pass performs two k-steps. The first uses f32/f33 (A) and f48..f51
  // (B); the second, predicated on p3, uses f40/f41 and f56..f59. Operands for
  // the second step and for the next pass are loaded while the FMAs issue.
  // Accumulators: f64/f65, f72/f73, f80/f81, f88/f89.
  // NOTE(review): assumes the FMA macro (defined outside this view) computes
  // d = a * b + c, as the existing "A * B" comments suggest -- confirm.
  5934. .align 32
  5935. .L072:
  5936. { .mfb
  5937. lfetch.nt1 [PREA], 4 * SIZE
  5938. FMA f64 = f32, f48, f64 // A1 * B1
  5939. nop __LINE__
  5940. }
  5941. { .mfi
  5942. nop __LINE__
  5943. FMA f72 = f32, f49, f72 // A1 * B2
  // Only when p12 is set: p3 = (L != 0), so the second k-step is dropped on
  // the final pass.
  5944. (p12) cmp.ne p3, p0 = 0, L
  5945. }
  5946. ;;
  5947. { .mfi
  5948. lfetch.nt1 [PREB], 8 * SIZE
  5949. FMA f80 = f32, f50, f80 // A1 * B3
  // p4 = more passes follow (reload operands); p5 = final pass (fetch C).
  5950. cmp.ne p4, p5 = 0, L
  5951. }
  5952. { .mfb
  5953. nop __LINE__
  5954. FMA f88 = f32, f51, f88 // A1 * B4
  5955. nop __LINE__
  5956. }
  5957. ;;
  5958. { .mfi
  5959. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5960. FMA f65 = f33, f48, f65 // A2 * B1
  5961. }
  5962. { .mfi
  5963. nop __LINE__
  5964. FMA f73 = f33, f49, f73 // A2 * B2
  5965. }
  5966. ;;
  5967. { .mfi
  5968. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5969. FMA f81 = f33, f50, f81 // A2 * B3
  5970. }
  5971. { .mmf
  // Final pass only (p5), and only when existing C values are needed: read
  // them now. Each pointer is post-incremented by SIZE here and by -1 * SIZE
  // on its second load, so it ends where it started.
  5972. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5973. (p5) LDFD f68 = [C1 ], SIZE
  5974. (p5) LDFD f76 = [C2 ], SIZE
  5975. #else
  5976. nop __LINE__
  5977. nop __LINE__
  5978. #endif
  5979. FMA f89 = f33, f51, f89 // A2 * B4
  5980. }
  5981. ;;
  // Second k-step of the pass (all FMAs predicated on p3).
  5982. { .mfb
  5983. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5984. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5985. nop __LINE__
  5986. }
  5987. { .mmf
  5988. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  5989. (p5) LDFD f69 = [C1 ], -1 * SIZE
  5990. (p5) LDFD f77 = [C2 ], -1 * SIZE
  5991. #else
  5992. nop __LINE__
  5993. nop __LINE__
  5994. #endif
  5995. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5996. }
  5997. ;;
  5998. { .mfb
  5999. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6000. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6001. nop __LINE__
  6002. }
  6003. { .mmf
  6004. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6005. (p5) LDFD f84 = [C3 ], SIZE
  6006. (p5) LDFD f92 = [C4 ], SIZE
  6007. #else
  6008. nop __LINE__
  6009. nop __LINE__
  6010. #endif
  6011. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  6012. }
  6013. ;;
  6014. { .mfb
  6015. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6016. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6017. nop __LINE__
  6018. }
  6019. { .mfb
  6020. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6021. (p5) LDFD f85 = [C3 ], -1 * SIZE
  6022. #else
  6023. nop __LINE__
  6024. #endif
  6025. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  6026. nop __LINE__
  6027. }
  6028. ;;
  6029. { .mfi
  6030. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6031. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  // Software copy of the remaining-pass count; feeds the p3/p4/p5 compares.
  6032. adds L = -1, L
  6033. }
  6034. { .mfb
  6035. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6036. (p5) LDFD f93 = [C4 ], -1 * SIZE
  6037. #else
  6038. nop __LINE__
  6039. #endif
  6040. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  6041. br.cloop.sptk.few .L072 // loop while ar.lc != 0
  6042. }
  6043. ;;
  6044. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Write-back using the old C values loaded on the final pass:
  // result = ALPHA * accumulator + C_old.
  6045. FMA f64 = ALPHA, f64, f68
  6046. FMA f65 = ALPHA, f65, f69
  6047. FMA f72 = ALPHA, f72, f76
  6048. FMA f73 = ALPHA, f73, f77
  6049. FMA f80 = ALPHA, f80, f84
  6050. FMA f81 = ALPHA, f81, f85
  6051. FMA f88 = ALPHA, f88, f92
  6052. FMA f89 = ALPHA, f89, f93
  6053. ;;
  // Store two consecutive elements through each of C1..C4 (post-increment by
  // SIZE), clearing f64/f72/f80/f88 from f0 as they are stored.
  6054. { .mfb
  6055. STFD [C1 ] = f64, SIZE
  6056. mov f64 = f0
  6057. nop __LINE__
  6058. }
  6059. { .mfb
  6060. STFD [C2 ] = f72, SIZE
  6061. mov f72 = f0
  6062. nop __LINE__
  6063. }
  6064. ;;
  6065. { .mmi
  6066. STFD [C1 ] = f65, SIZE
  6067. STFD [C2 ] = f73, SIZE
  6068. nop __LINE__
  6069. }
  6070. ;;
  6071. { .mfi
  6072. STFD [C3 ] = f80, SIZE
  6073. mov f80 = f0
  6074. adds L = 1, K // L = K + 1
  6075. }
  6076. { .mfb
  6077. STFD [C4 ] = f88, SIZE
  6078. mov f88 = f0
  6079. nop __LINE__
  6080. }
  6081. ;;
  6082. { .mmi
  6083. STFD [C3 ] = f81, SIZE
  6084. STFD [C4 ] = f89, SIZE
  6085. shr L = L, 1 // L = (K + 1) >> 1
  6086. }
  6087. ;;
  6088. #else
  // TRMM or BETAZERO build: old C is not read, result = ALPHA * accumulator.
  // In TRMM builds the integer slots also advance AOFFSET/BOFFSET/KK for the
  // next tile.
  6089. FMPY f64 = ALPHA, f64
  6090. FMPY f65 = ALPHA, f65
  6091. ;;
  6092. { .mfi
  6093. nop __LINE__
  6094. FMPY f72 = ALPHA, f72
  6095. #if defined(TRMMKERNEL) && \
  6096. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6097. sub L = K, KK // L = K - KK
  6098. #else
  6099. nop __LINE__
  6100. #endif
  6101. }
  6102. { .mfi
  6103. nop __LINE__
  6104. FMPY f73 = ALPHA, f73
  6105. nop __LINE__
  6106. }
  6107. ;;
  6108. { .mfi
  6109. FMPY f80 = ALPHA, f80
  6110. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6111. adds L = -2, L // L = K - KK - 2
  6112. #else
  6113. nop __LINE__
  6114. #endif
  6115. }
  6116. { .mfi
  6117. FMPY f81 = ALPHA, f81
  6118. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6119. adds L = -4, L // L = K - KK - 4
  6120. #else
  6121. nop __LINE__
  6122. #endif
  6123. }
  6124. ;;
  6125. { .mfi
  6126. nop __LINE__
  6127. FMPY f88 = ALPHA, f88
  6128. #if defined(TRMMKERNEL) && \
  6129. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6130. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT
  6131. #else
  6132. nop __LINE__
  6133. #endif
  6134. }
  6135. { .mfi
  6136. nop __LINE__
  6137. FMPY f89 = ALPHA, f89
  6138. nop __LINE__
  6139. }
  6140. ;;
  6141. { .mfi
  6142. STFD [C1 ] = f64, SIZE
  6143. mov f64 = f0
  6144. #if defined(TRMMKERNEL) && \
  6145. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6146. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 << 1
  6147. #else
  6148. nop __LINE__
  6149. #endif
  6150. }
  6151. { .mfb
  6152. STFD [C2 ] = f72, SIZE
  6153. mov f72 = f0
  6154. nop __LINE__
  6155. }
  6156. ;;
  6157. { .mmi
  6158. STFD [C1 ] = f65, SIZE
  6159. STFD [C2 ] = f73, SIZE
  6160. #if defined(TRMMKERNEL) && \
  6161. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6162. shladd BOFFSET = KK8, 2, BOFFSET // BOFFSET += KK8 << 2
  6163. #else
  6164. nop __LINE__
  6165. #endif
  6166. }
  6167. ;;
  6168. { .mfi
  6169. STFD [C3 ] = f80, SIZE
  6170. mov f80 = f0
  6171. #if defined(TRMMKERNEL) && defined(LEFT)
  6172. adds KK = 2, KK // KK += 2
  6173. #else
  6174. nop __LINE__
  6175. #endif
  6176. }
  6177. { .mfb
  6178. STFD [C4 ] = f88, SIZE
  6179. mov f88 = f0
  6180. nop __LINE__
  6181. }
  6182. ;;
  6183. { .mmi
  6184. STFD [C3 ] = f81, SIZE
  6185. STFD [C4 ] = f89, SIZE
  6186. #ifdef TRMMKERNEL
  6187. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT
  6188. #else
  6189. nop __LINE__
  6190. #endif
  6191. }
  6192. ;;
  6193. #endif
  // .L080 -- set-up for the tile handled when bit 0 of M is set: one A value
  // against four B values per k-step (see the FMAs in .L082).
  // Computes the loop count and preloads the first A/B operands, or jumps to
  // .L089 when bit 0 of M is clear.
  6194. .align 32
  6195. .L080:
  6196. { .mib
  6197. #ifndef TRMMKERNEL
  6198. nop __LINE__
  6199. #else
  6200. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6201. sub L = K, KK // TRMM: L = K - KK
  6202. #elif defined(LEFT)
  6203. adds L = 1, KK // TRMM: L = KK + 1
  6204. #else
  6205. adds L = 4, KK // TRMM: L = KK + 4
  6206. #endif
  6207. #endif
  6208. tbit.z p6,p7 = M, 0 // p6 = (bit 0 of M == 0), p7 = its complement
  6209. (p6) br.cond.dptk .L089 // bit clear: skip this tile
  6210. }
  6211. ;;
  6212. #if !defined(TRMMKERNEL) || \
  6213. defined(TRMMKERNEL) && \
  6214. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // This configuration reads B from its start: preload B1/B2 and point
  // BOFFSET just past them.
  6215. { .mmi
  6216. LDFPD f48, f49 = [B]
  6217. adds BOFFSET = 2 * SIZE, B
  6218. #ifndef TRMMKERNEL
  6219. adds L = 1, K // L = K + 1 (rounded down to pairs below)
  6220. #else
  6221. adds L = 1, L
  6222. #endif
  6223. }
  6224. ;;
  6225. #else
  // Remaining TRMM configurations: BOFFSET = B + (KK8 << 2), AOFFSET += KK8,
  // then preload B1/B2. KK8 is set to KK << BASE_SHIFT elsewhere in this file.
  6226. { .mmi
  6227. shladd BOFFSET = KK8, 2, B
  6228. add AOFFSET = KK8, AOFFSET
  6229. nop __LINE__
  6230. }
  6231. ;;
  6232. { .mmi
  6233. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6234. #ifndef TRMMKERNEL
  6235. adds L = 1, K
  6236. #else
  6237. adds L = 1, L
  6238. #endif
  6239. nop __LINE__
  6240. }
  6241. ;;
  6242. #endif
  // Two k-steps per loop pass: p12 records whether the incremented count was
  // even (final pass must drop its second k-step); L becomes passes - 1.
  6243. { .mii
  6244. LDFD f32 = [AOFFSET], 1 * SIZE // A1
  6245. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L == 0)
  6246. shr L = L, 1 // L = L >> 1
  6247. }
  6248. ;;
  6249. { .mmi
  6250. nop __LINE__
  6251. nop __LINE__
  6252. adds L = -1, L // br.cloop executes ar.lc + 1 passes
  6253. }
  6254. ;;
  6255. { .mmi
  6256. LDFPD f50, f51 = [BOFFSET], 2 * SIZE // B3/B4
  6257. cmp.eq p3, p0 = r0, r0 // p3 = true: second k-step enabled by default
  6258. mov ar.lc = L // hardware loop counter for br.cloop in .L082
  6259. }
  6260. ;;
  // .L082 -- k-loop and write-back for the 1 x 4 tile.
  // Each pass performs two k-steps. The first uses f32 (A) and f48..f51 (B);
  // the second, predicated on p3, uses f40 and f56..f59.
  // Accumulators: f64, f72, f80, f88, stored through C1, C2, C3, C4.
  // NOTE(review): assumes the FMA macro (defined outside this view) computes
  // d = a * b + c, as the existing "A * B" comments suggest -- confirm.
  6261. .align 32
  6262. .L082:
  6263. { .mfb
  // p4 = more passes follow (reload operands); p5 = final pass (fetch C).
  6264. cmp.ne p4, p5 = 0, L
  6265. FMA f64 = f32, f48, f64 // A1 * B1
  6266. nop __LINE__
  6267. }
  6268. { .mfi
  // Only when p12 is set: p3 = (L != 0), so the second k-step is dropped on
  // the final pass.
  6269. (p12) cmp.ne p3, p0 = 0, L
  6270. FMA f72 = f32, f49, f72 // A1 * B2
  6271. nop __LINE__
  6272. }
  6273. ;;
  6274. { .mfb
  6275. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6276. FMA f80 = f32, f50, f80 // A1 * B3
  6277. nop __LINE__
  6278. }
  6279. { .mfb
  6280. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  6281. FMA f88 = f32, f51, f88 // A1 * B4
  6282. nop __LINE__
  6283. }
  6284. ;;
  // Second k-step of the pass (all FMAs predicated on p3).
  6285. { .mfb
  6286. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  6287. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6288. nop __LINE__
  6289. }
  6290. { .mfb
  // Final pass only (p5), and only when existing C values are needed: read
  // one element through each of C1..C4 without moving the pointers.
  6291. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6292. (p5) LDFD f68 = [C1]
  6293. #else
  6294. nop __LINE__
  6295. #endif
  6296. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6297. nop __LINE__
  6298. }
  6299. ;;
  6300. { .mmf
  6301. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6302. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  6303. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6304. }
  6305. { .mmf
  6306. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6307. (p5) LDFD f76 = [C2]
  6308. (p5) LDFD f84 = [C3]
  6309. #else
  6310. nop __LINE__
  6311. nop __LINE__
  6312. #endif
  6313. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  6314. }
  6315. ;;
  6316. { .mib
  6317. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6318. nop __LINE__
  6319. nop __LINE__
  6320. }
  6321. { .mmb
  6322. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6323. (p5) LDFD f92 = [C4]
  6324. #else
  6325. nop __LINE__
  6326. #endif
  // Software copy of the remaining-pass count; feeds the p3/p4/p5 compares.
  6327. adds L = -1, L
  6328. br.cloop.sptk.few .L082 // loop while ar.lc != 0
  6329. }
  6330. ;;
  6331. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Write-back using the old C values loaded on the final pass:
  // result = ALPHA * accumulator + C_old, one element per C pointer.
  6332. FMA f64 = ALPHA, f64, f68
  6333. FMA f72 = ALPHA, f72, f76
  6334. FMA f80 = ALPHA, f80, f84
  6335. FMA f88 = ALPHA, f88, f92
  6336. ;;
  6337. STFD [C1 ] = f64, SIZE
  6338. STFD [C2 ] = f72, SIZE
  6339. STFD [C3 ] = f80, SIZE
  6340. STFD [C4 ] = f88, SIZE
  6341. ;;
  6342. #else
  // TRMM or BETAZERO build: old C is not read, result = ALPHA * accumulator.
  // In TRMM builds the integer slots also advance AOFFSET/BOFFSET/KK.
  6343. { .mfi
  6344. nop __LINE__
  6345. FMPY f64 = ALPHA, f64
  6346. #if defined(TRMMKERNEL) && \
  6347. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6348. sub L = K, KK // L = K - KK
  6349. #else
  6350. nop __LINE__
  6351. #endif
  6352. }
  6353. { .mfi
  6354. nop __LINE__
  6355. FMPY f72 = ALPHA, f72
  6356. nop __LINE__
  6357. }
  6358. ;;
  6359. { .mfi
  6360. FMPY f80 = ALPHA, f80
  6361. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  6362. adds L = -1, L // L = K - KK - 1
  6363. #else
  6364. nop __LINE__
  6365. #endif
  6366. }
  6367. { .mfi
  6368. FMPY f88 = ALPHA, f88
  6369. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  6370. adds L = -4, L // L = K - KK - 4
  6371. #else
  6372. nop __LINE__
  6373. #endif
  6374. }
  6375. ;;
  // The stop inside this bundle separates the write of KK8 from the two
  // instructions that read it.
  6376. { .mmi
  6377. #if defined(TRMMKERNEL) && \
  6378. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6379. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT
  6380. #else
  6381. nop __LINE__
  6382. #endif
  6383. ;;
  6384. #if defined(TRMMKERNEL) && \
  6385. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6386. add AOFFSET = KK8, AOFFSET // AOFFSET += KK8
  6387. #else
  6388. nop __LINE__
  6389. #endif
  6390. #if defined(TRMMKERNEL) && \
  6391. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6392. shladd BOFFSET = KK8, 2, BOFFSET // BOFFSET += KK8 << 2
  6393. #else
  6394. nop __LINE__
  6395. #endif
  6396. }
  6397. ;;
  6398. { .mmi
  6399. STFD [C1 ] = f64, SIZE
  6400. STFD [C2 ] = f72, SIZE
  6401. #if defined(TRMMKERNEL) && defined(LEFT)
  6402. adds KK = 1, KK // KK += 1
  6403. #else
  6404. nop __LINE__
  6405. #endif
  6406. }
  6407. ;;
  6408. { .mmi
  6409. STFD [C3 ] = f80, SIZE
  6410. STFD [C4 ] = f88, SIZE
  6411. #ifdef TRMMKERNEL
  6412. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT
  6413. #else
  6414. nop __LINE__
  6415. #endif
  6416. }
  6417. ;;
  6418. #endif
  // .L089 -- end of this section: B takes the current BOFFSET, AOFFSET is
  // reset to A, and in TRMM builds without LEFT, KK is advanced by 4.
  // NOTE(review): presumably BOFFSET sits just past the B data consumed here,
  // so B now addresses the next B panel -- confirm against the packing layout.
  6419. .align 32
  6420. .L089:
  6421. { .mmi
  6422. mov B = BOFFSET
  6423. mov AOFFSET = A
  6424. #if defined(TRMMKERNEL) && !defined(LEFT)
  6425. adds KK = 4, KK // KK += 4
  6426. #else
  6427. nop __LINE__
  6428. #endif
  6429. }
  6430. ;;
  6431. .align 16
  6432. #endif
  // .L090 -- entry of the section gated on bit 1 of N. Sets up two C pointers
  // (C1 = C, C2 = C + LDC), clears accumulators f64, f65, f66, f72 and f73,
  // resets AOFFSET to A, and branches to .L130 when bit 1 of N is clear.
  6433. .L090:
  6434. { .mfi
  6435. mov C1 = C
  6436. mov f64 = f0
  6437. tbit.z p6, p0 = N, 1 // p6 = (bit 1 of N == 0)
  6438. }
  6439. { .mfi
  6440. add C2 = LDC, C
  6441. mov f72 = f0
  6442. shr I = M, 3 // I = M >> 3
  6443. }
  6444. ;;
  6445. { .mfi
  6446. setf.d f66 = r0 // f66 = 0.0, written from r0 through the memory slot
  6447. mov f65 = f0
  6448. #if defined(TRMMKERNEL) && defined(LEFT)
  6449. mov KK = OFFSET // TRMM with LEFT: restart KK from OFFSET
  6450. #else
  6451. nop __LINE__
  6452. #endif
  6453. }
  6454. { .mfb
  6455. mov AOFFSET = A
  6456. mov f73 = f0
  6457. (p6) br.cond.dpnt .L130 // bit clear: skip this section
  6458. }
  6459. ;;
  6460. #if 0
  6461. { .mfi
  6462. #ifdef TRMMKERNEL
  6463. shladd KK8 = KK, BASE_SHIFT, r0
  6464. #else
  6465. nop __LINE__
  6466. #endif
  6467. mov f67 = f0
  6468. shladd C = LDC, 1, C
  6469. }
  6470. { .mfb
  6471. cmp.eq p6, p7 = 0, I
  6472. mov f74 = f0
  6473. (p6) br.cond.dpnt .L100
  6474. }
  6475. ;;
  6476. .align 32
  6477. .L092:
  6478. #if !defined(TRMMKERNEL) || \
  6479. defined(TRMMKERNEL) && \
  6480. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  6481. { .mfb
  6482. LDFPD f48, f49 = [B]
  6483. mov f68 = f0
  6484. nop __LINE__
  6485. }
  6486. { .mfb
  6487. adds BOFFSET = 2 * SIZE, B
  6488. mov f79 = f0
  6489. nop __LINE__
  6490. }
  6491. ;;
  6492. #else
  6493. { .mfi
  6494. shladd BOFFSET = KK8, 1, B
  6495. mov f68 = f0
  6496. shladd AOFFSET = KK8, 3, AOFFSET
  6497. }
  6498. ;;
  6499. { .mfi
  6500. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6501. mov f79 = f0
  6502. nop __LINE__
  6503. }
  6504. ;;
  6505. #endif
  6506. { .mfi
  6507. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6508. mov f75 = f0
  6509. #ifndef TRMMKERNEL
  6510. nop __LINE__
  6511. #else
  6512. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  6513. sub L = K, KK
  6514. #elif defined(LEFT)
  6515. adds L = 8, KK
  6516. #else
  6517. adds L = 2, KK
  6518. #endif
  6519. #endif
  6520. }
  6521. ;;
  6522. { .mfi
  6523. adds PREC = CPREFETCHSIZE * SIZE, C1
  6524. mov f76 = f0
  6525. #ifndef TRMMKERNEL
  6526. adds L = 1, K
  6527. #else
  6528. adds L = 1, L
  6529. #endif
  6530. }
  6531. ;;
  6532. { .mfi
  6533. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6534. mov f69 = f0
  6535. tbit.z p12, p0 = L, 0
  6536. }
  6537. { .mfi
  6538. cmp.eq p3, p0 = r0, r0
  6539. mov f77 = f0
  6540. shr L = L, 1
  6541. }
  6542. ;;
  6543. { .mfi
  6544. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6545. adds L = -1, L
  6546. }
  6547. { .mmf
  6548. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6549. CPREFETCH [PREC], LDC
  6550. mov f70 = f0
  6551. }
  6552. ;;
  6553. { .mfi
  6554. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6555. mov f78 = f0
  6556. mov ar.lc = L
  6557. }
  6558. { .mfi
  6559. CPREFETCH [PREC]
  6560. mov f71 = f0
  6561. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6562. }
  6563. ;;
  6564. .align 32
  6565. .L093:
  6566. /* 1 */
  6567. { .mfi
  6568. lfetch.nt1 [PREA], 16 * SIZE
  6569. FMA f64 = f32, f48, f64 // A1 * B1
  6570. cmp.ne p4, p5 = 0, L
  6571. }
  6572. { .mfi
  6573. nop __LINE__
  6574. FMA f72 = f32, f49, f72 // A1 * B2
  6575. (p12) cmp.ne p3, p0 = 0, L
  6576. }
  6577. ;;
  6578. { .mfi
  6579. lfetch.nt1 [PREB], 4 * SIZE
  6580. FMA f65 = f33, f48, f65 // A2 * B1
  6581. adds C9 = 4 * SIZE, C1
  6582. }
  6583. { .mfi
  6584. nop __LINE__
  6585. FMA f73 = f33, f49, f73 // A2 * B2
  6586. adds C10 = 4 * SIZE, C2
  6587. }
  6588. ;;
  6589. { .mfi
  6590. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6591. FMA f66 = f34, f48, f66 // A3 * B1
  6592. adds C11 = 4 * SIZE, C3
  6593. }
  6594. { .mfi
  6595. nop __LINE__
  6596. FMA f74 = f34, f49, f74 // A3 * B2
  6597. adds C12 = 4 * SIZE, C4
  6598. }
  6599. ;;
  6600. { .mfb
  6601. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6602. FMA f67 = f35, f48, f67 // A4 * B1
  6603. nop __LINE__
  6604. }
  6605. { .mfb
  6606. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6607. (p5) LDFD f96 = [C1 ], SIZE
  6608. #else
  6609. nop __LINE__
  6610. #endif
  6611. FMA f75 = f35, f49, f75 // A4 * B2
  6612. nop __LINE__
  6613. }
  6614. ;;
  6615. { .mfb
  6616. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  6617. FMA f68 = f36, f48, f68 // A5 * B1
  6618. nop __LINE__
  6619. }
  6620. { .mfb
  6621. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6622. (p5) LDFD f97 = [C9 ], SIZE
  6623. #else
  6624. nop __LINE__
  6625. #endif
  6626. FMA f76 = f36, f49, f76 // A5 * B2
  6627. nop __LINE__
  6628. }
  6629. ;;
  6630. { .mfb
  6631. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  6632. FMA f69 = f37, f48, f69 // A6 * B1
  6633. nop __LINE__
  6634. }
  6635. { .mfb
  6636. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6637. (p5) LDFD f98 = [C1 ], SIZE
  6638. #else
  6639. nop __LINE__
  6640. #endif
  6641. FMA f77 = f37, f49, f77 // A6 * B2
  6642. nop __LINE__
  6643. }
  6644. ;;
  6645. { .mfb
  6646. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  6647. FMA f70 = f38, f48, f70 // A7 * B1
  6648. nop __LINE__
  6649. }
  6650. { .mfb
  6651. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6652. (p5) LDFD f99 = [C9 ], SIZE
  6653. #else
  6654. nop __LINE__
  6655. #endif
  6656. FMA f78 = f38, f49, f78 // A7 * B2
  6657. nop __LINE__
  6658. }
  6659. ;;
  6660. { .mfb
  6661. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6662. FMA f71 = f39, f48, f71 // A8 * B1
  6663. nop __LINE__
  6664. }
  6665. { .mfb
  6666. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6667. (p5) LDFD f100 = [C1 ], SIZE
  6668. #else
  6669. nop __LINE__
  6670. #endif
  6671. FMA f79 = f39, f49, f79 // A8 * B2
  6672. nop __LINE__
  6673. }
  6674. ;;
  6675. { .mfb
  6676. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6677. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6678. nop __LINE__
  6679. }
  6680. { .mfb
  6681. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6682. (p5) LDFD f101 = [C9 ], SIZE
  6683. #else
  6684. nop __LINE__
  6685. #endif
  6686. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6687. nop __LINE__
  6688. }
  6689. ;;
  6690. { .mfb
  6691. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6692. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6693. nop __LINE__
  6694. }
  6695. { .mfb
  6696. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6697. (p5) LDFD f102 = [C1 ], -3 * SIZE
  6698. #else
  6699. nop __LINE__
  6700. #endif
  6701. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  6702. nop __LINE__
  6703. }
  6704. ;;
  6705. { .mfb
  6706. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6707. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  6708. nop __LINE__
  6709. }
  6710. { .mfb
  6711. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6712. (p5) LDFD f103 = [C9 ], -3 * SIZE
  6713. #else
  6714. nop __LINE__
  6715. #endif
  6716. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  6717. nop __LINE__
  6718. }
  6719. ;;
  6720. { .mfb
  6721. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6722. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  6723. nop __LINE__
  6724. }
  6725. { .mfb
  6726. nop __LINE__
  6727. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  6728. nop __LINE__
  6729. }
  6730. ;;
  6731. { .mfb
  6732. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6733. (p5) LDFD f104 = [C2 ], SIZE
  6734. #else
  6735. nop __LINE__
  6736. #endif
  6737. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  6738. nop __LINE__
  6739. }
  6740. { .mfb
  6741. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6742. (p5) LDFD f105 = [C10], SIZE
  6743. #else
  6744. nop __LINE__
  6745. #endif
  6746. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  6747. nop __LINE__
  6748. }
  6749. ;;
  6750. { .mfb
  6751. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6752. (p5) LDFD f106 = [C2 ], SIZE
  6753. #else
  6754. nop __LINE__
  6755. #endif
  6756. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  6757. nop __LINE__
  6758. }
  6759. { .mfb
  6760. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6761. (p5) LDFD f107 = [C10], SIZE
  6762. #else
  6763. nop __LINE__
  6764. #endif
  6765. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  6766. nop __LINE__
  6767. }
  6768. ;;
  6769. { .mfb
  6770. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6771. (p5) LDFD f108 = [C2 ], SIZE
  6772. #else
  6773. nop __LINE__
  6774. #endif
  6775. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  6776. nop __LINE__
  6777. }
  6778. { .mfb
  6779. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6780. (p5) LDFD f109 = [C10], SIZE
  6781. #else
  6782. nop __LINE__
  6783. #endif
  6784. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  6785. nop __LINE__
  6786. }
  6787. ;;
  6788. { .mfi
  6789. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6790. (p5) LDFD f110 = [C2 ], -3 * SIZE
  6791. #else
  6792. nop __LINE__
  6793. #endif
  6794. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  6795. adds L = -1, L
  6796. }
  6797. { .mfb
  6798. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6799. (p5) LDFD f111 = [C10], -3 * SIZE
  6800. #else
  6801. nop __LINE__
  6802. #endif
  6803. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  6804. br.cloop.sptk.few .L093
  6805. }
  6806. ;;
  6807. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  6808. { .mfi
  6809. nop __LINE__
  6810. FMA f64 = ALPHA, f64, f96
  6811. cmp.ne p6, p0 = 1, I
  6812. }
  6813. { .mfb
  6814. nop __LINE__
  6815. FMA f68 = ALPHA, f68, f97
  6816. nop __LINE__
  6817. }
  6818. ;;
  6819. { .mfi
  6820. nop __LINE__
  6821. FMA f65 = ALPHA, f65, f98
  6822. adds I = -1, I
  6823. }
  6824. { .mfb
  6825. nop __LINE__
  6826. FMA f69 = ALPHA, f69, f99
  6827. nop __LINE__
  6828. }
  6829. ;;
  6830. { .mfi
  6831. nop __LINE__
  6832. FMA f66 = ALPHA, f66, f100
  6833. nop __LINE__
  6834. }
  6835. { .mfb
  6836. nop __LINE__
  6837. FMA f70 = ALPHA, f70, f101
  6838. nop __LINE__
  6839. }
  6840. ;;
  6841. { .mfb
  6842. nop __LINE__
  6843. FMA f67 = ALPHA, f67, f102
  6844. nop __LINE__
  6845. }
  6846. { .mfb
  6847. nop __LINE__
  6848. FMA f71 = ALPHA, f71, f103
  6849. nop __LINE__
  6850. }
  6851. ;;
  6852. { .mfb
  6853. STFD [C1 ] = f64, SIZE
  6854. FMA f72 = ALPHA, f72, f104
  6855. nop __LINE__
  6856. }
  6857. { .mfb
  6858. STFD [C9 ] = f68, SIZE
  6859. FMA f76 = ALPHA, f76, f105
  6860. nop __LINE__
  6861. }
  6862. ;;
  6863. { .mfb
  6864. STFD [C1 ] = f65, SIZE
  6865. FMA f73 = ALPHA, f73, f106
  6866. nop __LINE__
  6867. }
  6868. { .mfb
  6869. STFD [C9 ] = f69, SIZE
  6870. FMA f77 = ALPHA, f77, f107
  6871. nop __LINE__
  6872. }
  6873. ;;
  6874. { .mfb
  6875. STFD [C1 ] = f66, SIZE
  6876. FMA f74 = ALPHA, f74, f108
  6877. nop __LINE__
  6878. }
  6879. { .mfb
  6880. STFD [C9 ] = f70, SIZE
  6881. FMA f78 = ALPHA, f78, f109
  6882. nop __LINE__
  6883. }
  6884. ;;
  6885. { .mfb
  6886. STFD [C1 ] = f67, 5 * SIZE
  6887. FMA f75 = ALPHA, f75, f110
  6888. nop __LINE__
  6889. }
  6890. { .mfb
  6891. STFD [C9 ] = f71, 5 * SIZE
  6892. FMA f79 = ALPHA, f79, f111
  6893. nop __LINE__
  6894. }
  6895. ;;
  6896. { .mfb
  6897. STFD [C2 ] = f72, SIZE
  6898. mov f64 = f0
  6899. nop __LINE__
  6900. }
  6901. { .mfb
  6902. STFD [C10] = f76, SIZE
  6903. mov f72 = f0
  6904. nop __LINE__
  6905. }
  6906. ;;
  6907. { .mfb
  6908. STFD [C2 ] = f73, SIZE
  6909. mov f65 = f0
  6910. nop __LINE__
  6911. }
  6912. { .mfb
  6913. STFD [C10] = f77, SIZE
  6914. mov f73 = f0
  6915. nop __LINE__
  6916. }
  6917. ;;
  6918. { .mfb
  6919. STFD [C2 ] = f74, SIZE
  6920. mov f66 = f0
  6921. nop __LINE__
  6922. }
  6923. { .mfb
  6924. STFD [C10] = f78, SIZE
  6925. mov f74 = f0
  6926. nop __LINE__
  6927. }
  6928. ;;
  6929. { .mfb
  6930. STFD [C2 ] = f75, 5 * SIZE
  6931. mov f67 = f0
  6932. nop __LINE__
  6933. }
  6934. { .mfb
  6935. STFD [C10] = f79, 5 * SIZE
  6936. (p6) br.cond.dptk .L092
  6937. }
  6938. ;;
  6939. #else
  6940. { .mfi
  6941. nop __LINE__
  6942. FMPY f64 = ALPHA, f64
  6943. cmp.ne p6, p0 = 1, I
  6944. }
  6945. { .mfb
  6946. nop __LINE__
  6947. FMPY f68 = ALPHA, f68
  6948. nop __LINE__
  6949. }
  6950. ;;
  6951. { .mfi
  6952. nop __LINE__
  6953. FMPY f65 = ALPHA, f65
  6954. adds I = -1, I
  6955. }
  6956. { .mfb
  6957. nop __LINE__
  6958. FMPY f69 = ALPHA, f69
  6959. nop __LINE__
  6960. }
  6961. ;;
  6962. { .mfi
  6963. nop __LINE__
  6964. FMPY f66 = ALPHA, f66
  6965. nop __LINE__
  6966. }
  6967. { .mfb
  6968. nop __LINE__
  6969. FMPY f70 = ALPHA, f70
  6970. nop __LINE__
  6971. }
  6972. ;;
  6973. { .mfb
  6974. nop __LINE__
  6975. FMPY f67 = ALPHA, f67
  6976. nop __LINE__
  6977. }
  6978. { .mfb
  6979. nop __LINE__
  6980. FMPY f71 = ALPHA, f71
  6981. nop __LINE__
  6982. }
  6983. ;;
  6984. { .mfb
  6985. STFD [C1 ] = f64, SIZE
  6986. FMPY f72 = ALPHA, f72
  6987. nop __LINE__
  6988. }
  6989. { .mfb
  6990. STFD [C9 ] = f68, SIZE
  6991. FMPY f76 = ALPHA, f76
  6992. nop __LINE__
  6993. }
  6994. ;;
  6995. { .mfb
  6996. STFD [C1 ] = f65, SIZE
  6997. FMPY f73 = ALPHA, f73
  6998. nop __LINE__
  6999. }
  7000. { .mfb
  7001. STFD [C9 ] = f69, SIZE
  7002. FMPY f77 = ALPHA, f77
  7003. nop __LINE__
  7004. }
  7005. ;;
  7006. { .mfi
  7007. STFD [C1 ] = f66, SIZE
  7008. FMPY f74 = ALPHA, f74
  7009. #if defined(TRMMKERNEL) && \
  7010. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7011. sub L = K, KK
  7012. #else
  7013. nop __LINE__
  7014. #endif
  7015. }
  7016. { .mfb
  7017. STFD [C9 ] = f70, SIZE
  7018. FMPY f78 = ALPHA, f78
  7019. nop __LINE__
  7020. }
  7021. ;;
  7022. { .mfi
  7023. STFD [C1 ] = f67, 5 * SIZE
  7024. FMPY f75 = ALPHA, f75
  7025. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7026. adds L = -8, L
  7027. #else
  7028. nop __LINE__
  7029. #endif
  7030. }
  7031. { .mfi
  7032. STFD [C9 ] = f71, 5 * SIZE
  7033. FMPY f79 = ALPHA, f79
  7034. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7035. adds L = -2, L
  7036. #else
  7037. nop __LINE__
  7038. #endif
  7039. }
  7040. ;;
  7041. { .mfi
  7042. STFD [C2 ] = f72, SIZE
  7043. mov f64 = f0
  7044. #if defined(TRMMKERNEL) && \
  7045. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7046. shladd KK8 = L, BASE_SHIFT, r0
  7047. #else
  7048. nop __LINE__
  7049. #endif
  7050. }
  7051. { .mfb
  7052. STFD [C10] = f76, SIZE
  7053. mov f72 = f0
  7054. nop __LINE__
  7055. }
  7056. ;;
  7057. { .mfi
  7058. STFD [C2 ] = f73, SIZE
  7059. mov f65 = f0
  7060. #if defined(TRMMKERNEL) && \
  7061. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7062. shladd AOFFSET = KK8, 3, AOFFSET
  7063. #else
  7064. nop __LINE__
  7065. #endif
  7066. }
  7067. { .mfi
  7068. STFD [C10] = f77, SIZE
  7069. mov f73 = f0
  7070. #if defined(TRMMKERNEL) && \
  7071. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7072. shladd BOFFSET = KK8, 1, BOFFSET
  7073. #else
  7074. nop __LINE__
  7075. #endif
  7076. }
  7077. ;;
  7078. { .mfi
  7079. STFD [C2 ] = f74, SIZE
  7080. mov f66 = f0
  7081. #if defined(TRMMKERNEL) && defined(LEFT)
  7082. adds KK = 8, KK
  7083. #else
  7084. nop __LINE__
  7085. #endif
  7086. }
  7087. { .mfb
  7088. STFD [C10] = f78, SIZE
  7089. mov f74 = f0
  7090. nop __LINE__
  7091. }
  7092. ;;
  7093. { .mfi
  7094. STFD [C2 ] = f75, 5 * SIZE
  7095. mov f67 = f0
  7096. #ifdef TRMMKERNEL
  7097. shladd KK8 = KK, BASE_SHIFT, r0
  7098. #else
  7099. nop __LINE__
  7100. #endif
  7101. }
  7102. { .mib
  7103. STFD [C10] = f79, 5 * SIZE
  7104. nop __LINE__
  7105. (p6) br.cond.dptk .L092
  7106. }
  7107. ;;
  7108. #endif
  // .L100: setup for the section that runs only when bit 2 of M is set.
  // NOTE(review): the registers used below (A values f32-f35, B values f48-f49,
  // accumulators f64-f67 / f72-f75) suggest a 4-row by 2-column tile -- confirm.
  // LDFPD / LDFD / FMA etc. are macros defined outside this section.
  7109. .align 32
  7110. .L100:
  7111. { .mib
  7112. #ifndef TRMMKERNEL
  7113. nop __LINE__
  7114. #else
  // TRMM build: preliminary L derived from K and/or KK, selected by LEFT/TRANSA.
  7115. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7116. sub L = K, KK
  7117. #elif defined(LEFT)
  7118. adds L = 4, KK
  7119. #else
  7120. adds L = 2, KK
  7121. #endif
  7122. #endif
  // p6 = (bit 2 of M is clear): nothing to do here, continue at .L110.
  7123. tbit.z p6, p7 = M, 2
  7124. (p6) br.cond.dptk .L110
  7125. }
  7126. ;;
  7127. #if !defined(TRMMKERNEL) || \
  7128. defined(TRMMKERNEL) && \
  7129. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Start reading B at its base: load the first pair, BOFFSET = B + 2 * SIZE.
  7130. { .mmf
  7131. LDFPD f48, f49 = [B]
  7132. adds BOFFSET = 2 * SIZE, B
  7133. mov f75 = f0
  7134. }
  7135. { .mii
  7136. nop __LINE__
  // L = K + 1 (non-TRMM) or L + 1 (TRMM); halved below to form the loop count.
  7137. #ifndef TRMMKERNEL
  7138. adds L = 1, K
  7139. #else
  7140. adds L = 1, L
  7141. #endif
  7142. }
  7143. ;;
  7144. #else
  // Remaining TRMM variants: skip ahead by KK8, BOFFSET = B + 2 * KK8 and
  // AOFFSET += 4 * KK8, then load the first B pair from the adjusted pointer.
  7145. { .mfi
  7146. shladd BOFFSET = KK8, 1, B
  7147. mov f75 = f0
  7148. shladd AOFFSET = KK8, 2, AOFFSET
  7149. }
  7150. ;;
  7151. { .mmi
  7152. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7153. nop __LINE__
  7154. adds L = 1, L
  7155. }
  7156. ;;
  7157. #endif
  7158. ;;
  // p12 = (bit 0 of L is clear), tested before L is halved; the loop uses p12
  // to decide whether the second half of its last pass must be suppressed.
  7159. { .mii
  7160. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  7161. tbit.z p12, p0 = L, 0
  7162. shr L = L, 1
  7163. }
  7164. ;;
  // Preload the first four A values; ar.lc = L - 1, so the loop body runs L times.
  7165. { .mmi
  7166. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7167. nop __LINE__
  7168. adds L = -1, L
  7169. }
  7170. ;;
  7171. { .mmi
  7172. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  // r0 == r0 always holds, so p3 starts out true.
  7173. cmp.eq p3, p0 = r0, r0
  7174. mov ar.lc = L
  7175. }
  7176. ;;
  // .L102: counted loop (br.cloop on ar.lc) accumulating into f64-f67 / f72-f75.
  // Each pass covers up to two k-steps: the first half uses f32-f35 with
  // f48/f49, the second half (predicated on p3) uses f40-f43 with f56/f57.
  //   p4 = (L != 0): not the last pass, operands for the next pass are loaded.
  //   p5 = (L == 0): last pass; in the non-TRMM, non-BETAZERO build the C values
  //        are loaded here for the epilogue.
  //   p3 : when p12 is set it is recomputed as (L != 0), which disables the
  //        second half on the last pass; otherwise it keeps its earlier value.
  // Comments below assume FMA d = a, b, c computes a * b + c and FMPY d = a, b
  // computes a * b (macros defined outside this section) -- TODO confirm.
  7177. .align 32
  7178. .L102:
  7179. { .mfi
  7180. lfetch.nt1 [PREA], 8 * SIZE
  7181. FMA f64 = f32, f48, f64 // A1 * B1
  7182. cmp.ne p4, p5 = 0, L
  7183. }
  7184. { .mfi
  7185. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7186. FMA f72 = f32, f49, f72 // A1 * B2
  7187. (p12) cmp.ne p3, p0 = 0, L
  7188. }
  7189. ;;
  // C9 / C10 are set two elements past C1 / C2 on every pass.
  7190. { .mfi
  7191. lfetch.nt1 [PREB], 4 * SIZE
  7192. FMA f65 = f33, f48, f65 // A2 * B1
  7193. adds C9 = 2 * SIZE, C1
  7194. }
  7195. { .mfi
  7196. nop __LINE__
  7197. FMA f73 = f33, f49, f73 // A2 * B2
  7198. adds C10 = 2 * SIZE, C2
  7199. }
  7200. ;;
  7201. { .mfb
  7202. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7203. FMA f66 = f34, f48, f66 // A3 * B1
  7204. nop __LINE__
  7205. }
  7206. { .mfb
  7207. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7208. (p5) LDFD f68 = [C1 ], SIZE
  7209. #else
  7210. nop __LINE__
  7211. #endif
  7212. FMA f74 = f34, f49, f74 // A3 * B2
  7213. nop __LINE__
  7214. }
  7215. ;;
  7216. { .mfb
  7217. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7218. FMA f67 = f35, f48, f67 // A4 * B1
  7219. nop __LINE__
  7220. }
  7221. { .mfb
  7222. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7223. (p5) LDFD f70 = [C9 ], SIZE
  7224. #else
  7225. nop __LINE__
  7226. #endif
  7227. FMA f75 = f35, f49, f75 // A4 * B2
  7228. nop __LINE__
  7229. }
  7230. ;;
  // Second k-step of the pass, predicated on p3.
  7231. { .mfb
  7232. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  7233. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7234. nop __LINE__
  7235. }
  7236. { .mfb
  7237. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // The -1 * SIZE post-increment puts C1 back where it was before the two loads.
  7238. (p5) LDFD f69 = [C1 ], -1 * SIZE
  7239. #else
  7240. nop __LINE__
  7241. #endif
  7242. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7243. nop __LINE__
  7244. }
  7245. ;;
  7246. { .mfb
  7247. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7248. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7249. nop __LINE__
  7250. }
  7251. { .mfb
  7252. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7253. (p5) LDFD f71 = [C9 ], -1 * SIZE
  7254. #else
  7255. nop __LINE__
  7256. #endif
  7257. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  7258. nop __LINE__
  7259. }
  7260. ;;
  7261. { .mfb
  7262. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7263. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  7264. nop __LINE__
  7265. }
  7266. { .mfb
  7267. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7268. (p5) LDFD f76 = [C2 ], SIZE
  7269. #else
  7270. nop __LINE__
  7271. #endif
  7272. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  7273. nop __LINE__
  7274. }
  7275. ;;
  // L counts down alongside ar.lc so that p4 / p5 / p3 can test for the last pass.
  7276. { .mfi
  7277. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7278. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  7279. adds L = -1, L
  7280. }
  7281. { .mfb
  7282. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7283. (p5) LDFD f78 = [C10], SIZE
  7284. #else
  7285. nop __LINE__
  7286. #endif
  7287. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  7288. br.cloop.sptk.few .L102
  7289. }
  7290. ;;
  7291. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Non-TRMM, non-BETAZERO epilogue: load the last two C values (f77, f79),
  // combine every accumulator with ALPHA and its C value, then store.
  7292. { .mfb
  7293. LDFD f77 = [C2 ], -1 * SIZE
  7294. FMA f64 = ALPHA, f64, f68
  7295. nop __LINE__
  7296. }
  7297. { .mfb
  7298. LDFD f79 = [C10], -1 * SIZE
  7299. FMA f66 = ALPHA, f66, f70
  7300. nop __LINE__
  7301. }
  7302. ;;
  7303. FMA f65 = ALPHA, f65, f69
  // NOTE(review): L is rebuilt from K here and halved below; the sections that
  // follow (.L110 / .L120) set L again before using it -- confirm this is unused.
  7304. adds L = 1, K
  7305. FMA f67 = ALPHA, f67, f71
  7306. ;;
  7307. FMA f72 = ALPHA, f72, f76
  7308. shr L = L, 1
  7309. FMA f74 = ALPHA, f74, f78
  7310. FMA f73 = ALPHA, f73, f77
  7311. FMA f75 = ALPHA, f75, f79
  7312. ;;
  // Stores: C1 takes f64, f65 and C9 takes f66, f67; the 3 * SIZE step leaves
  // C1 four elements past its starting value. C2 / C10 are handled the same way.
  // Only f64, f65, f72 and f73 are reset to zero (f0) on the way out.
  7313. { .mmf
  7314. STFD [C1 ] = f64, SIZE
  7315. STFD [C9 ] = f66, SIZE
  7316. mov f64 = f0
  7317. }
  7318. ;;
  7319. { .mmf
  7320. STFD [C1 ] = f65, 3 * SIZE
  7321. STFD [C9 ] = f67, 3 * SIZE
  7322. mov f65 = f0
  7323. }
  7324. ;;
  7325. { .mmf
  7326. STFD [C2 ] = f72, SIZE
  7327. STFD [C10] = f74, SIZE
  7328. mov f72 = f0
  7329. }
  7330. ;;
  7331. { .mmf
  7332. STFD [C2 ] = f73, 3 * SIZE
  7333. STFD [C10] = f75, 3 * SIZE
  7334. mov f73 = f0
  7335. }
  7336. ;;
  7337. #else
  // TRMM or BETAZERO epilogue: accumulators are only multiplied by ALPHA (no C
  // values are read). The integer slots carry the TRMM pointer bookkeeping.
  7338. { .mfb
  7339. nop __LINE__
  7340. FMPY f64 = ALPHA, f64
  7341. nop __LINE__
  7342. }
  7343. { .mfb
  7344. nop __LINE__
  7345. FMPY f66 = ALPHA, f66
  7346. nop __LINE__
  7347. }
  7348. ;;
  7349. FMPY f65 = ALPHA, f65
  7350. FMPY f67 = ALPHA, f67
  7351. ;;
  7352. { .mfi
  7353. nop __LINE__
  7354. FMPY f72 = ALPHA, f72
  7355. #if defined(TRMMKERNEL) && \
  7356. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7357. sub L = K, KK
  7358. #else
  7359. nop __LINE__
  7360. #endif
  7361. }
  7362. { .mfi
  7363. nop __LINE__
  7364. FMPY f74 = ALPHA, f74
  7365. nop __LINE__
  7366. }
  7367. ;;
  // L = K - KK - 4 (LEFT && TRANSA) or K - KK - 2 (!LEFT && !TRANSA).
  7368. { .mfi
  7369. nop __LINE__
  7370. FMPY f73 = ALPHA, f73
  7371. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7372. adds L = -4, L
  7373. #else
  7374. nop __LINE__
  7375. #endif
  7376. }
  7377. { .mfi
  7378. nop __LINE__
  7379. FMPY f75 = ALPHA, f75
  7380. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7381. adds L = -2, L
  7382. #else
  7383. nop __LINE__
  7384. #endif
  7385. }
  7386. ;;
  7387. { .mfi
  7388. STFD [C1 ] = f64, SIZE
  7389. mov f64 = f0
  7390. #if defined(TRMMKERNEL) && \
  7391. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // KK8 = L << BASE_SHIFT, then AOFFSET += 4 * KK8 and BOFFSET += 2 * KK8 below.
  7392. shladd KK8 = L, BASE_SHIFT, r0
  7393. #else
  7394. nop __LINE__
  7395. #endif
  7396. }
  7397. { .mmi
  7398. STFD [C9 ] = f66, SIZE
  7399. nop __LINE__
  7400. nop __LINE__
  7401. }
  7402. ;;
  7403. { .mfi
  7404. STFD [C1 ] = f65, 3 * SIZE
  7405. mov f65 = f0
  7406. #if defined(TRMMKERNEL) && \
  7407. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7408. shladd AOFFSET = KK8, 2, AOFFSET
  7409. #else
  7410. nop __LINE__
  7411. #endif
  7412. }
  7413. { .mmi
  7414. STFD [C9 ] = f67, 3 * SIZE
  7415. nop __LINE__
  7416. #if defined(TRMMKERNEL) && \
  7417. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7418. shladd BOFFSET = KK8, 1, BOFFSET
  7419. #else
  7420. nop __LINE__
  7421. #endif
  7422. }
  7423. ;;
  7424. { .mfi
  7425. STFD [C2 ] = f72, SIZE
  7426. mov f72 = f0
  7427. #if defined(TRMMKERNEL) && defined(LEFT)
  7428. adds KK = 4, KK
  7429. #else
  7430. nop __LINE__
  7431. #endif
  7432. }
  7433. { .mmi
  7434. STFD [C10] = f74, SIZE
  7435. nop __LINE__
  7436. nop __LINE__
  7437. }
  7438. ;;
  7439. { .mfi
  7440. STFD [C2 ] = f73, 3 * SIZE
  7441. mov f73 = f0
  7442. #ifdef TRMMKERNEL
  // KK8 is refreshed from the (possibly advanced) KK for the next section.
  7443. shladd KK8 = KK, BASE_SHIFT, r0
  7444. #else
  7445. nop __LINE__
  7446. #endif
  7447. }
  7448. { .mib
  7449. STFD [C10] = f75, 3 * SIZE
  7450. nop __LINE__
  7451. nop __LINE__
  7452. }
  7453. ;;
  7454. #endif
  // .L110: setup for the section that runs only when bit 1 of M is set.
  // NOTE(review): registers used below (A values f32-f33, B values f48-f49,
  // accumulators f64-f65 / f72-f73) suggest a 2-row by 2-column tile -- confirm.
  7455. .align 32
  7456. .L110:
  7457. { .mib
  7458. #ifndef TRMMKERNEL
  7459. nop __LINE__
  7460. #else
  // TRMM build: preliminary L; note the LEFT and non-LEFT fallbacks both add 2.
  7461. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7462. sub L = K, KK
  7463. #elif defined(LEFT)
  7464. adds L = 2, KK
  7465. #else
  7466. adds L = 2, KK
  7467. #endif
  7468. #endif
  // p6 = (bit 1 of M is clear): skip to .L120.
  7469. tbit.z p6, p7 = M, 1
  7470. (p6) br.cond.dptk .L120
  7471. }
  7472. ;;
  7473. #if !defined(TRMMKERNEL) || \
  7474. defined(TRMMKERNEL) && \
  7475. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Read B from its base: first pair into f48/f49, BOFFSET = B + 2 * SIZE.
  7476. { .mmi
  7477. LDFPD f48, f49 = [B]
  7478. adds BOFFSET = 2 * SIZE, B
  7479. #ifndef TRMMKERNEL
  7480. adds L = 1, K
  7481. #else
  7482. adds L = 1, L
  7483. #endif
  7484. }
  7485. ;;
  7486. #else
  // Remaining TRMM variants: BOFFSET = B + 2 * KK8, AOFFSET += 2 * KK8.
  7487. { .mmi
  7488. shladd BOFFSET = KK8, 1, B
  7489. shladd AOFFSET = KK8, 1, AOFFSET
  7490. }
  7491. ;;
  7492. { .mmi
  7493. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7494. nop __LINE__
  7495. adds L = 1, L
  7496. }
  7497. ;;
  7498. #endif
  7499. ;;
  // p12 = (bit 0 of L is clear) before halving; then L = (L >> 1) - 1 -> ar.lc.
  7500. { .mii
  7501. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  7502. tbit.z p12, p0 = L, 0
  7503. shr L = L, 1
  7504. }
  7505. ;;
  7506. { .mmi
  7507. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7508. nop __LINE__
  7509. adds L = -1, L
  7510. }
  7511. ;;
  7512. { .mmi
  // r0 == r0 always holds, so p3 starts out true.
  7513. cmp.eq p3, p0 = r0, r0
  7514. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7515. mov ar.lc = L
  7516. }
  7517. ;;
  // .L112: counted loop (br.cloop on ar.lc) accumulating into f64, f65, f72, f73.
  // Each pass covers up to two k-steps: first half with f32/f33 and f48/f49,
  // second half (predicated on p3) with f40/f41 and f56/f57.
  //   p4 = (L != 0): operands for the next pass are loaded.
  //   p5 = (L == 0): last pass; C values are loaded in the non-TRMM,
  //        non-BETAZERO build.
  //   p3 : recomputed as (L != 0) only when p12 is set.
  // Comments assume FMA d = a, b, c is a * b + c and FMPY d = a, b is a * b
  // (macros defined outside this section) -- TODO confirm.
  7518. .align 32
  7519. .L112:
  7520. { .mfi
  7521. lfetch.nt1 [PREA], 4 * SIZE
  7522. FMA f64 = f32, f48, f64 // A1 * B1
  7523. cmp.ne p4, p5 = 0, L
  7524. }
  7525. { .mfi
  7526. lfetch.nt1 [PREB], 4 * SIZE
  7527. FMA f72 = f32, f49, f72 // A1 * B2
  7528. (p12) cmp.ne p3, p0 = 0, L
  7529. }
  7530. ;;
  7531. { .mmf
  7532. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7533. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7534. FMA f65 = f33, f48, f65 // A2 * B1
  7535. }
  7536. { .mmf
  7537. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7538. (p5) LDFD f68 = [C1 ], SIZE
  7539. (p5) LDFD f76 = [C2 ], SIZE
  7540. #else
  7541. nop __LINE__
  7542. nop __LINE__
  7543. #endif
  7544. FMA f73 = f33, f49, f73 // A2 * B2
  7545. }
  7546. ;;
  7547. { .mfb
  7548. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7549. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7550. nop __LINE__
  7551. }
  7552. { .mfb
  7553. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // The -1 * SIZE post-increment returns C1 to its value before the two loads.
  7554. (p5) LDFD f69 = [C1 ], -1 * SIZE
  7555. #else
  7556. nop __LINE__
  7557. #endif
  7558. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7559. nop __LINE__
  7560. }
  7561. ;;
  7562. { .mfi
  7563. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7564. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7565. adds L = -1, L
  7566. }
  7567. { .mfb
  7568. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7569. (p5) LDFD f77 = [C2 ], -1 * SIZE
  7570. #else
  7571. nop __LINE__
  7572. #endif
  7573. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  7574. br.cloop.sptk.few .L112
  7575. }
  7576. ;;
  7577. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Non-TRMM, non-BETAZERO epilogue: combine each accumulator with ALPHA and
  // the C value loaded on the last pass, store two values through each of C1 and
  // C2, and reset the accumulators to zero (f0).
  7578. FMA f64 = ALPHA, f64, f68
  7579. FMA f65 = ALPHA, f65, f69
  7580. FMA f72 = ALPHA, f72, f76
  7581. FMA f73 = ALPHA, f73, f77
  7582. ;;
  7583. { .mfi
  7584. STFD [C1 ] = f64, SIZE
  7585. mov f64 = f0
  7586. nop __LINE__
  7587. }
  7588. { .mfb
  7589. STFD [C2 ] = f72, SIZE
  7590. mov f72 = f0
  7591. nop __LINE__
  7592. }
  7593. ;;
  7594. { .mfi
  7595. STFD [C1 ] = f65, SIZE
  7596. mov f65 = f0
  7597. nop __LINE__
  7598. }
  7599. { .mfb
  7600. STFD [C2 ] = f73, SIZE
  7601. mov f73 = f0
  7602. nop __LINE__
  7603. }
  7604. ;;
  7605. #else
  // TRMM or BETAZERO epilogue: accumulators are only multiplied by ALPHA. The
  // integer slots carry the TRMM pointer bookkeeping.
  7606. { .mfi
  7607. nop __LINE__
  7608. FMPY f64 = ALPHA, f64
  7609. #if defined(TRMMKERNEL) && \
  7610. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7611. sub L = K, KK
  7612. #else
  7613. nop __LINE__
  7614. #endif
  7615. }
  7616. { .mfi
  7617. nop __LINE__
  7618. FMPY f65 = ALPHA, f65
  7619. nop __LINE__
  7620. }
  7621. ;;
  // L = K - KK - 2 in both TRMM variants handled here.
  7622. { .mfi
  7623. nop __LINE__
  7624. FMPY f72 = ALPHA, f72
  7625. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7626. adds L = -2, L
  7627. #else
  7628. nop __LINE__
  7629. #endif
  7630. }
  7631. { .mfi
  7632. nop __LINE__
  7633. FMPY f73 = ALPHA, f73
  7634. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7635. adds L = -2, L
  7636. #else
  7637. nop __LINE__
  7638. #endif
  7639. }
  7640. ;;
  // This bundle has a stop after its first slot: KK8 = L << BASE_SHIFT must be
  // complete before AOFFSET += 2 * KK8 and BOFFSET += 2 * KK8 read it.
  7641. { .mmi
  7642. #if defined(TRMMKERNEL) && \
  7643. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7644. shladd KK8 = L, BASE_SHIFT, r0
  7645. #else
  7646. nop __LINE__
  7647. #endif
  7648. ;;
  7649. #if defined(TRMMKERNEL) && \
  7650. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7651. shladd AOFFSET = KK8, 1, AOFFSET
  7652. #else
  7653. nop __LINE__
  7654. #endif
  7655. #if defined(TRMMKERNEL) && \
  7656. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7657. shladd BOFFSET = KK8, 1, BOFFSET
  7658. #else
  7659. nop __LINE__
  7660. #endif
  7661. }
  7662. ;;
  7663. { .mfi
  7664. STFD [C1 ] = f64, SIZE
  7665. mov f64 = f0
  7666. #if defined(TRMMKERNEL) && defined(LEFT)
  7667. adds KK = 2, KK
  7668. #else
  7669. nop __LINE__
  7670. #endif
  7671. }
  7672. { .mfb
  7673. STFD [C2 ] = f72, SIZE
  7674. mov f72 = f0
  7675. nop __LINE__
  7676. }
  7677. ;;
  7678. { .mfi
  7679. STFD [C1 ] = f65, SIZE
  7680. mov f65 = f0
  7681. #ifdef TRMMKERNEL
  // KK8 is refreshed from the (possibly advanced) KK for the next section.
  7682. shladd KK8 = KK, BASE_SHIFT, r0
  7683. #else
  7684. nop __LINE__
  7685. #endif
  7686. }
  7687. { .mfb
  7688. STFD [C2 ] = f73, SIZE
  7689. mov f73 = f0
  7690. nop __LINE__
  7691. }
  7692. ;;
  7693. #endif
  // .L120: setup for the section that runs only when bit 0 of M is set.
  // NOTE(review): registers used below (one A value f32, B values f48-f49,
  // accumulators f64 / f72) suggest a 1-row by 2-column tile -- confirm.
  7694. .align 32
  7695. .L120:
  7696. { .mib
  7697. #ifndef TRMMKERNEL
  7698. nop __LINE__
  7699. #else
  // TRMM build: preliminary L derived from K and/or KK, selected by LEFT/TRANSA.
  7700. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7701. sub L = K, KK
  7702. #elif defined(LEFT)
  7703. adds L = 1, KK
  7704. #else
  7705. adds L = 2, KK
  7706. #endif
  7707. #endif
  // p6 = (bit 0 of M is clear): skip to .L129.
  7708. tbit.z p6, p7 = M, 0
  7709. (p6) br.cond.dptk .L129
  7710. }
  7711. ;;
  7712. #if !defined(TRMMKERNEL) || \
  7713. defined(TRMMKERNEL) && \
  7714. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Read B from its base: first pair into f48/f49, BOFFSET = B + 2 * SIZE.
  7715. { .mmi
  7716. LDFPD f48, f49 = [B]
  7717. adds BOFFSET = 2 * SIZE, B
  7718. #ifndef TRMMKERNEL
  7719. adds L = 1, K
  7720. #else
  7721. adds L = 1, L
  7722. #endif
  7723. }
  7724. ;;
  7725. #else
  // Remaining TRMM variants: BOFFSET = B + 2 * KK8, AOFFSET += KK8.
  7726. { .mmi
  7727. shladd BOFFSET = KK8, 1, B
  7728. add AOFFSET = KK8, AOFFSET
  7729. }
  7730. ;;
  7731. { .mmi
  7732. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7733. nop __LINE__
  7734. adds L = 1, L
  7735. }
  7736. ;;
  7737. #endif
  // p12 = (bit 0 of L is clear) before halving; then L = (L >> 1) - 1 -> ar.lc.
  7738. { .mii
  7739. nop __LINE__
  7740. tbit.z p12, p0 = L, 0
  7741. shr L = L, 1
  7742. }
  7743. ;;
  7744. { .mmi
  7745. LDFD f32 = [AOFFSET], 1 * SIZE
  7746. nop __LINE__
  7747. adds L = -1, L
  7748. }
  7749. ;;
  7750. { .mmi
  // r0 == r0 always holds, so p3 starts out true.
  7751. cmp.eq p3, p0 = r0, r0
  7752. nop __LINE__
  7753. mov ar.lc = L
  7754. }
  7755. ;;
  // .L122: counted loop (br.cloop on ar.lc) accumulating into f64 and f72.
  // Each pass covers up to two k-steps: first half with f32 and f48/f49, second
  // half (predicated on p3) with f40 and f56/f57.
  //   p4 = (L != 0): operands for the next pass are loaded.
  //   p5 = (L == 0): last pass; C values are loaded in the non-TRMM,
  //        non-BETAZERO build (C1 / C2 are not advanced by these loads).
  //   p3 : recomputed as (L != 0) only when p12 is set.
  7756. .align 32
  7757. .L122:
  7758. { .mfi
  7759. FMA f64 = f32, f48, f64 // A1 * B1
  7760. cmp.ne p4, p5 = 0, L
  7761. }
  7762. { .mfi
  7763. nop __LINE__
  7764. FMA f72 = f32, f49, f72 // A1 * B2
  7765. (p12) cmp.ne p3, p0 = 0, L
  7766. }
  7767. ;;
  7768. { .mmi
  7769. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7770. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  7771. nop __LINE__
  7772. }
  7773. { .mmi
  7774. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  7775. (p5) LDFD f68 = [C1]
  7776. (p5) LDFD f76 = [C2]
  7777. #else
  7778. nop __LINE__
  7779. nop __LINE__
  7780. #endif
  7781. nop __LINE__
  7782. }
  7783. ;;
  // L counts down alongside ar.lc so the predicates above can spot the last pass.
  7784. { .mfi
  7785. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7786. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7787. adds L = -1, L
  7788. }
  7789. { .mfb
  7790. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  7791. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7792. br.cloop.sptk.few .L122
  7793. }
  7794. ;;
  // .L128: epilogue for the f64 / f72 accumulators: scale by ALPHA, store one
  // value through each of C1 and C2 (no post-increment), and reset to zero (f0).
  // Comments assume FMA d = a, b, c is a * b + c and FMPY d = a, b is a * b
  // (macros defined outside this section) -- TODO confirm.
  7795. .L128:
  7796. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Non-TRMM, non-BETAZERO build: f68 / f76 hold the C values loaded in the loop.
  7797. FMA f64 = ALPHA, f64, f68
  7798. FMA f72 = ALPHA, f72, f76
  7799. ;;
  7800. { .mfi
  7801. STFD [C1 ] = f64
  7802. mov f64 = f0
  7803. }
  7804. { .mfb
  7805. STFD [C2 ] = f72
  7806. mov f72 = f0
  7807. }
  7808. ;;
  7809. #else
  // TRMM or BETAZERO build: multiply by ALPHA only, plus TRMM pointer bookkeeping.
  7810. { .mfi
  7811. nop __LINE__
  7812. FMPY f64 = ALPHA, f64
  7813. #if defined(TRMMKERNEL) && \
  7814. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7815. sub L = K, KK
  7816. #else
  7817. nop __LINE__
  7818. #endif
  7819. }
  7820. { .mfi
  7821. nop __LINE__
  7822. FMPY f72 = ALPHA, f72
  7823. nop __LINE__
  7824. }
  7825. ;;
  // L = K - KK - 1 (LEFT && TRANSA) or K - KK - 2 (!LEFT && !TRANSA).
  7826. { .mmi
  7827. nop __LINE__
  7828. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  7829. adds L = -1, L
  7830. #else
  7831. nop __LINE__
  7832. #endif
  7833. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  7834. adds L = -2, L
  7835. #else
  7836. nop __LINE__
  7837. #endif
  7838. }
  7839. ;;
  // This bundle has a stop after its first slot: KK8 = L << BASE_SHIFT must be
  // complete before AOFFSET += KK8 and BOFFSET += 2 * KK8 read it.
  7840. { .mmi
  7841. #if defined(TRMMKERNEL) && \
  7842. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7843. shladd KK8 = L, BASE_SHIFT, r0
  7844. #else
  7845. nop __LINE__
  7846. #endif
  7847. ;;
  7848. #if defined(TRMMKERNEL) && \
  7849. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7850. add AOFFSET = KK8, AOFFSET
  7851. #else
  7852. nop __LINE__
  7853. #endif
  7854. #if defined(TRMMKERNEL) && \
  7855. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  7856. shladd BOFFSET = KK8, 1, BOFFSET
  7857. #else
  7858. nop __LINE__
  7859. #endif
  7860. }
  7861. ;;
  7862. #if defined(TRMMKERNEL) && defined(LEFT)
  7863. adds KK = 1, KK
  7864. #else
  7865. nop __LINE__
  7866. #endif
  7867. ;;
  7868. { .mfi
  7869. STFD [C1 ] = f64
  7870. mov f64 = f0
  7871. #ifdef TRMMKERNEL
  // KK8 is refreshed from the (possibly advanced) KK.
  7872. shladd KK8 = KK, BASE_SHIFT, r0
  7873. #else
  7874. nop __LINE__
  7875. #endif
  7876. }
  7877. { .mfb
  7878. STFD [C2 ] = f72
  7879. mov f72 = f0
  7880. }
  7881. ;;
  7882. #endif
  // .L129: common exit of the preceding sections. B takes the current BOFFSET
  // and AOFFSET is reset to A; in the TRMM build without LEFT, KK advances by 2.
  7883. .align 32
  7884. .L129:
  7885. { .mmi
  7886. mov B = BOFFSET
  7887. mov AOFFSET = A
  7888. #if defined(TRMMKERNEL) && !defined(LEFT)
  7889. adds KK = 2, KK
  7890. #else
  7891. nop __LINE__
  7892. #endif
  7893. }
  7894. ;;
  7895. .align 16
  7896. #endif
  // .L130: entry for the part executed only when bit 0 of N is set.
  // f64 is zeroed, AOFFSET is reset to A, and I = M >> 3; in the TRMM LEFT build
  // KK is reloaded from OFFSET.
  7897. .L130:
  7898. { .mfi
  7899. #if defined(TRMMKERNEL) && defined(LEFT)
  7900. mov KK = OFFSET
  7901. #else
  7902. nop __LINE__
  7903. #endif
  7904. mov f64 = f0
  // p6 = (bit 0 of N is clear): nothing left to do, branch to .L999.
  7905. tbit.z p6, p0 = N, 0
  7906. }
  7907. { .mib
  7908. mov AOFFSET = A
  7909. shr I = M, 3
  7910. (p6) br.cond.dpnt .L999
  7911. }
  7912. ;;
  // The code from here on is excluded from the build by the preprocessor (#if 0).
  7913. #if 0
  7914. { .mfi
  7915. mov C1 = C
  7916. mov f65 = f0
  7917. #ifdef TRMMKERNEL
  7918. shladd KK8 = KK, BASE_SHIFT, r0
  7919. #else
  7920. nop __LINE__
  7921. #endif
  7922. }
  7923. ;;
  7924. { .mfi
  7925. nop __LINE__
  7926. mov f66 = f0
  7927. nop __LINE__
  7928. }
  // p7 = (I == 0): no iterations of the .L132 loop, go straight to .L140.
  7929. { .mfb
  7930. cmp.eq p7, p0 = 0, I
  7931. mov f67 = f0
  7932. (p7) br.cond.dpnt .L140
  7933. }
  7934. ;;
  // .L132: per-iteration setup of the loop counted by I (this code sits inside
  // an #if 0 region, so it is not assembled).
  // NOTE(review): registers used (A values f32-f39, one B value f48,
  // accumulators f64-f71) suggest an 8-row by 1-column tile -- confirm.
  7935. .align 32
  7936. .L132:
  7937. #if !defined(TRMMKERNEL) || \
  7938. defined(TRMMKERNEL) && \
  7939. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // Read B from its base: first value into f48, BOFFSET = B + 1 * SIZE.
  7940. { .mfb
  7941. LDFD f48 = [B]
  7942. mov f68 = f0
  7943. nop __LINE__
  7944. }
  7945. { .mfi
  7946. adds BOFFSET = 1 * SIZE, B
  7947. mov f69 = f0
  7948. #ifndef TRMMKERNEL
  7949. nop __LINE__
  7950. #else
  7951. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7952. sub L = K, KK
  7953. #elif defined(LEFT)
  7954. adds L = 8, KK
  7955. #else
  7956. adds L = 1, KK
  7957. #endif
  7958. #endif
  7959. }
  7960. ;;
  7961. #else
  // Remaining TRMM variants: BOFFSET = B + KK8, AOFFSET += 8 * KK8.
  7962. { .mfi
  7963. add BOFFSET = KK8, B
  7964. mov f68 = f0
  7965. shladd AOFFSET = KK8, 3, AOFFSET
  7966. }
  7967. ;;
  7968. { .mfi
  7969. LDFD f48 = [BOFFSET], 1 * SIZE
  7970. mov f69 = f0
  7971. #ifndef TRMMKERNEL
  7972. nop __LINE__
  7973. #else
  7974. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  7975. sub L = K, KK
  7976. #elif defined(LEFT)
  7977. adds L = 8, KK
  7978. #else
  7979. adds L = 1, KK
  7980. #endif
  7981. #endif
  7982. }
  7983. ;;
  7984. #endif
  // Preload eight A values (f32-f39) while forming the loop count:
  // L = K + 1 (non-TRMM) or L + 1 (TRMM); p12 = (bit 0 of L is clear) before
  // halving; ar.lc = (L >> 1) - 1.
  7985. { .mfi
  7986. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7987. mov f70 = f0
  7988. #ifndef TRMMKERNEL
  7989. adds L = 1, K
  7990. #else
  7991. adds L = 1, L
  7992. #endif
  7993. }
  7994. ;;
  7995. { .mii
  7996. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7997. tbit.z p12, p0 = L, 0
  7998. shr L = L, 1
  7999. }
  8000. ;;
  8001. { .mfi
  8002. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8003. mov f71 = f0
  8004. adds L = -1, L
  8005. }
  8006. ;;
  8007. { .mmi
  8008. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8009. adds PREC = CPREFETCHSIZE * SIZE, C1
  // r0 == r0 always holds, so p3 starts out true.
  8010. cmp.eq p3, p0 = r0, r0
  8011. }
  8012. ;;
  8013. { .mmi
  8014. CPREFETCH [PREC]
  8015. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  8016. mov ar.lc = L
  8017. }
  8018. ;;
  // .L133: counted loop (br.cloop on ar.lc) accumulating into f64-f71 (inside
  // an #if 0 region, so it is not assembled).
  // Each pass covers up to two k-steps: first half with f32-f39 and f48, second
  // half (predicated on p3) with f40-f47 and f56.
  //   p4 = (L != 0): operands for the next pass are loaded.
  //   p5 = (L == 0): last pass; in the non-TRMM, non-BETAZERO build eight C
  //        values are loaded, alternating between C1 (f6, f10, f12, f14) and
  //        C9 = C1 + 4 * SIZE (f7, f11, f13, f15).
  //   p3 : recomputed as (L != 0) only when p12 is set.
  8019. .align 32
  8020. .L133:
  8021. { .mfi
  8022. lfetch.nt1 [PREA], 16 * SIZE
  8023. FMA f64 = f32, f48, f64 // A1 * B1
  8024. cmp.ne p4, p5 = 0, L
  8025. }
  8026. { .mfi
  8027. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  8028. FMA f65 = f33, f48, f65 // A2 * B1
  8029. (p12) cmp.ne p3, p0 = 0, L
  8030. }
  8031. ;;
  8032. { .mfi
  8033. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8034. FMA f66 = f34, f48, f66 // A3 * B1
  8035. adds C9 = 4 * SIZE, C1
  8036. }
  8037. { .mmf
  8038. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  8039. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8040. (p5) LDFD f6 = [C1 ], SIZE
  8041. #else
  8042. nop __LINE__
  8043. #endif
  8044. FMA f67 = f35, f48, f67 // A4 * B1
  8045. }
  8046. ;;
  8047. { .mfb
  8048. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8049. FMA f68 = f36, f48, f68 // A5 * B1
  8050. nop __LINE__
  8051. }
  8052. { .mfb
  8053. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8054. (p5) LDFD f7 = [C9 ], SIZE
  8055. #else
  8056. nop __LINE__
  8057. #endif
  8058. FMA f69 = f37, f48, f69 // A6 * B1
  8059. nop __LINE__
  8060. }
  8061. ;;
  8062. { .mfb
  8063. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  8064. FMA f70 = f38, f48, f70 // A7 * B1
  8065. nop __LINE__
  8066. }
  8067. { .mfb
  8068. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8069. (p5) LDFD f10 = [C1 ], SIZE
  8070. #else
  8071. nop __LINE__
  8072. #endif
  8073. FMA f71 = f39, f48, f71 // A8 * B1
  8074. nop __LINE__
  8075. }
  8076. ;;
  // Second k-step of the pass, predicated on p3.
  8077. { .mfb
  8078. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  8079. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8080. nop __LINE__
  8081. }
  8082. { .mfb
  8083. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8084. (p5) LDFD f11 = [C9 ], SIZE
  8085. #else
  8086. nop __LINE__
  8087. #endif
  8088. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8089. nop __LINE__
  8090. }
  8091. ;;
  8092. { .mfb
  8093. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8094. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  8095. nop __LINE__
  8096. }
  8097. { .mmf
  8098. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  8099. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8100. (p5) LDFD f12 = [C1 ], SIZE
  8101. #else
  8102. nop __LINE__
  8103. #endif
  8104. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  8105. }
  8106. ;;
  8107. { .mfb
  8108. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8109. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  8110. nop __LINE__
  8111. }
  8112. { .mfb
  8113. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8114. (p5) LDFD f13 = [C9 ], SIZE
  8115. #else
  8116. nop __LINE__
  8117. #endif
  8118. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  8119. nop __LINE__
  8120. }
  8121. ;;
  8122. { .mfi
  8123. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8124. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  8125. adds L = -1, L
  8126. }
  8127. { .mfb
  8128. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // The -3 * SIZE post-increment returns C1 to its value before the four loads.
  8129. (p5) LDFD f14 = [C1 ], -3 * SIZE
  8130. #else
  8131. nop __LINE__
  8132. #endif
  8133. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  8134. nop __LINE__
  8135. }
  8136. ;;
  8137. { .mfb
  8138. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8139. nop __LINE__
  8140. nop __LINE__
  8141. }
  8142. { .mfb
  8143. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8144. (p5) LDFD f15 = [C9 ], -3 * SIZE
  8145. #else
  8146. nop __LINE__
  8147. #endif
  8148. nop __LINE__
  8149. br.cloop.sptk.few .L133
  8150. }
  8151. ;;
  // .L138: epilogue for the f64-f71 accumulators (inside an #if 0 region, so it
  // is not assembled). Scales by ALPHA, stores four values through C1 and four
  // through C9, zeroes f64-f67, and loops back to .L132 while I has not run out:
  // p6 = (I != 1) is taken before I is decremented.
  // Comments assume FMA d = a, b, c is a * b + c and FMPY d = a, b is a * b
  // (macros defined outside this section) -- TODO confirm.
  // NOTE(review): several bundles below list fewer than three instructions for
  // their template; the assembler would have to fill the missing slots -- confirm.
  8152. .L138:
  8153. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  // Non-TRMM, non-BETAZERO build: f6, f7, f10-f15 hold the C values loaded on
  // the last pass of .L133.
  8154. { .mfi
  8155. FMA f64 = ALPHA, f64, f6
  8156. cmp.ne p6, p0 = 1, I
  8157. }
  8158. { .mfb
  8159. FMA f68 = ALPHA, f68, f7
  8160. }
  8161. ;;
  8162. { .mfi
  8163. FMA f65 = ALPHA, f65, f10
  8164. adds I = -1, I
  8165. }
  8166. { .mfb
  8167. FMA f69 = ALPHA, f69, f11
  8168. }
  8169. ;;
  8170. { .mfi
  8171. FMA f66 = ALPHA, f66, f12
  8172. }
  8173. { .mfb
  8174. FMA f70 = ALPHA, f70, f13
  8175. }
  8176. ;;
  8177. { .mfb
  8178. FMA f67 = ALPHA, f67, f14
  8179. }
  8180. { .mfb
  8181. FMA f71 = ALPHA, f71, f15
  8182. }
  8183. ;;
  // The final 5 * SIZE step leaves C1 and C9 eight elements past their start.
  8184. { .mmf
  8185. STFD [C1 ] = f64, SIZE
  8186. STFD [C9 ] = f68, SIZE
  8187. mov f64 = f0
  8188. }
  8189. ;;
  8190. { .mmf
  8191. STFD [C1 ] = f65, SIZE
  8192. STFD [C9 ] = f69, SIZE
  8193. mov f65 = f0
  8194. }
  8195. ;;
  8196. { .mmf
  8197. STFD [C1 ] = f66, SIZE
  8198. STFD [C9 ] = f70, SIZE
  8199. mov f66 = f0
  8200. }
  8201. ;;
  8202. { .mmf
  8203. STFD [C1 ] = f67, 5 * SIZE
  8204. nop __LINE__
  8205. mov f67 = f0
  8206. }
  8207. { .mmb
  8208. STFD [C9 ] = f71, 5 * SIZE
  8209. nop __LINE__
  8210. (p6) br.cond.dptk .L132
  8211. }
  8212. ;;
  8213. #else
  // TRMM or BETAZERO build: multiply by ALPHA only; the integer slots carry the
  // TRMM pointer bookkeeping.
  8214. { .mfi
  8215. FMPY f64 = ALPHA, f64
  8216. cmp.ne p6, p0 = 1, I
  8217. }
  8218. { .mfb
  8219. FMPY f68 = ALPHA, f68
  8220. }
  8221. ;;
  8222. { .mfi
  8223. FMPY f65 = ALPHA, f65
  8224. adds I = -1, I
  8225. }
  8226. { .mfb
  8227. FMPY f69 = ALPHA, f69
  8228. }
  8229. ;;
  8230. { .mfi
  8231. FMPY f66 = ALPHA, f66
  8232. #if defined(TRMMKERNEL) && \
  8233. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8234. sub L = K, KK
  8235. #else
  8236. nop __LINE__
  8237. #endif
  8238. }
  8239. { .mfb
  8240. FMPY f70 = ALPHA, f70
  8241. }
  8242. ;;
  // L = K - KK - 8 (LEFT && TRANSA) or K - KK - 1 (!LEFT && !TRANSA).
  8243. { .mfi
  8244. FMPY f67 = ALPHA, f67
  8245. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8246. adds L = -8, L
  8247. #else
  8248. nop __LINE__
  8249. #endif
  8250. }
  8251. { .mfi
  8252. FMPY f71 = ALPHA, f71
  8253. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8254. adds L = -1, L
  8255. #else
  8256. nop __LINE__
  8257. #endif
  8258. }
  8259. ;;
  8260. { .mfi
  8261. STFD [C1 ] = f64, SIZE
  8262. mov f64 = f0
  8263. #if defined(TRMMKERNEL) && \
  8264. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  // KK8 = L << BASE_SHIFT, then AOFFSET += 8 * KK8 and BOFFSET += KK8 below.
  8265. shladd KK8 = L, BASE_SHIFT, r0
  8266. #else
  8267. nop __LINE__
  8268. #endif
  8269. }
  8270. { .mmi
  8271. STFD [C9 ] = f68, SIZE
  8272. nop __LINE__
  8273. nop __LINE__
  8274. }
  8275. ;;
  8276. { .mfi
  8277. STFD [C1 ] = f65, SIZE
  8278. mov f65 = f0
  8279. #if defined(TRMMKERNEL) && \
  8280. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8281. shladd AOFFSET = KK8, 3, AOFFSET
  8282. #else
  8283. nop __LINE__
  8284. #endif
  8285. }
  8286. { .mmi
  8287. STFD [C9 ] = f69, SIZE
  8288. nop __LINE__
  8289. #if defined(TRMMKERNEL) && \
  8290. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8291. add BOFFSET = KK8, BOFFSET
  8292. #else
  8293. nop __LINE__
  8294. #endif
  8295. }
  8296. ;;
  8297. { .mfi
  8298. STFD [C1 ] = f66, SIZE
  8299. mov f66 = f0
  8300. #if defined(TRMMKERNEL) && defined(LEFT)
  8301. adds KK = 8, KK
  8302. #else
  8303. nop __LINE__
  8304. #endif
  8305. }
  8306. { .mmi
  8307. STFD [C9 ] = f70, SIZE
  8308. nop __LINE__
  8309. nop __LINE__
  8310. }
  8311. ;;
  8312. { .mfi
  8313. STFD [C1 ] = f67, 5 * SIZE
  8314. mov f67 = f0
  8315. #ifdef TRMMKERNEL
  // KK8 is refreshed from the (possibly advanced) KK for the next iteration.
  8316. shladd KK8 = KK, BASE_SHIFT, r0
  8317. #else
  8318. nop __LINE__
  8319. #endif
  8320. }
  8321. { .mmb
  8322. STFD [C9 ] = f71, 5 * SIZE
  8323. nop __LINE__
  8324. (p6) br.cond.dptk .L132
  8325. }
  8326. ;;
  8327. #endif
  8328. .align 32
  8329. .L140: // tail block entered only when bit 2 of M is set: loads the first B value and four A values, then sets up the counted loop at .L142
  8330. { .mib
  8331. #ifndef TRMMKERNEL
  8332. nop __LINE__
  8333. #else
  8334. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8335. sub L = K, KK // TRMM: L = K - KK
  8336. #elif defined(LEFT)
  8337. adds L = 4, KK // TRMM, LEFT: L = KK + 4
  8338. #else
  8339. adds L = 1, KK // TRMM, remaining case: L = KK + 1
  8340. #endif
  8341. #endif
  8342. tbit.z p6, p7 = M, 2 // p6 = (bit 2 of M is clear), p7 = (bit 2 of M is set)
  8343. (p6) br.cond.dptk .L150 // bit clear: skip this block entirely
  8344. }
  8345. ;;
  8346. #if !defined(TRMMKERNEL) || \
  8347. defined(TRMMKERNEL) && \
  8348. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8349. { .mmi
  8350. LDFD f48 = [B] // first B value, read directly through B
  8351. adds BOFFSET = 1 * SIZE, B // BOFFSET = B + SIZE (element after the one just loaded)
  8352. #ifndef TRMMKERNEL
  8353. adds L = 1, K // L = K + 1
  8354. #else
  8355. adds L = 1, L // L = L + 1
  8356. #endif
  8357. }
  8358. ;;
  8359. #else
  8360. { .mmi
  8361. add BOFFSET = KK8, B // BOFFSET = B + KK8
  8362. shladd AOFFSET = KK8, 2, AOFFSET // AOFFSET += KK8 << 2
  8363. nop __LINE__
  8364. }
  8365. ;;
  8366. { .mmi
  8367. LDFD f48 = [BOFFSET], 1 * SIZE // first B value; BOFFSET post-incremented by SIZE
  8368. nop __LINE__
  8369. #ifndef TRMMKERNEL
  8370. adds L = 1, K // L = K + 1
  8371. #else
  8372. adds L = 1, L // L = L + 1
  8373. #endif
  8374. }
  8375. ;;
  8376. #endif
  8377. { .mii
  8378. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // first pair of A values; AOFFSET += 2 * SIZE
  8379. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L is clear); read before the shift below, gates the p3 update in .L142
  8380. shr L = L, 1 // L >>= 1: each pass of .L142 covers up to two k steps
  8381. }
  8382. ;;
  8383. { .mmi
  8384. LDFPD f34, f35 = [AOFFSET], 2 * SIZE // second pair of A values; AOFFSET += 2 * SIZE
  8385. adds L = -1, L // L - 1 is the value placed in ar.lc below
  8386. nop __LINE__
  8387. }
  8388. ;;
  8389. { .mmi
  8390. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET // PREA = AOFFSET + PREFETCHSIZE * SIZE, used by lfetch in .L142
  8391. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0 always holds)
  8392. mov ar.lc = L // loop counter consumed by br.cloop in .L142
  8393. }
  8394. ;;
  8395. .align 32
  8396. .L142: // counted k-loop: accumulates four A values times one B value into f64..f67, up to two k steps per pass (f32-f35/f48, then f40-f43/f56 under p3)
  8397. { .mfi
  8398. lfetch.nt1 [PREA], 8 * SIZE // prefetch at PREA, then PREA += 8 * SIZE
  8399. FMA f64 = f32, f48, f64 // A1 * B1
  8400. cmp.ne p4, p5 = 0, L // p4 = (L != 0), p5 = (L == 0)
  8401. }
  8402. { .mfi
  8403. nop __LINE__
  8404. FMA f65 = f33, f48, f65 // A2 * B1
  8405. (p12) cmp.ne p3, p0 = 0, L // only when p12: p3 = (L != 0), so the second k step is dropped on the pass where L == 0
  8406. }
  8407. ;;
  8408. { .mfi
  8409. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // A1, A2 for the second k step
  8410. FMA f66 = f34, f48, f66 // A3 * B1
  8411. (p5) adds C9 = 2 * SIZE, C1 // when L == 0: C9 = C1 + 2 * SIZE
  8412. }
  8413. { .mmf
  8414. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8415. (p5) LDFD f68 = [C1 ], SIZE // when L == 0: f68 = [C1], C1 += SIZE
  8416. #else
  8417. nop __LINE__
  8418. #endif
  8419. (p3) LDFD f56 = [BOFFSET], 1 * SIZE // B value for the second k step
  8420. FMA f67 = f35, f48, f67 // A4 * B1
  8421. }
  8422. ;;
  8423. { .mfi
  8424. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE // A3, A4 for the second k step
  8425. (p3) FMA f64 = f40, f56, f64 // A1 * B1 (second k step)
  8426. (p5) adds C10 = 2 * SIZE, C2 // when L == 0: C10 = C2 + 2 * SIZE
  8427. }
  8428. { .mfb
  8429. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8430. (p5) LDFD f70 = [C9 ], SIZE // when L == 0: f70 = [C9], C9 += SIZE
  8431. #else
  8432. nop __LINE__
  8433. #endif
  8434. (p3) FMA f65 = f41, f56, f65 // A2 * B1 (second k step)
  8435. nop __LINE__
  8436. }
  8437. ;;
  8438. { .mfb
  8439. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // when L != 0: A1, A2 for the next pass
  8440. (p3) FMA f66 = f42, f56, f66 // A3 * B1 (second k step)
  8441. nop __LINE__
  8442. }
  8443. { .mmf
  8444. (p4) LDFD f48 = [BOFFSET], 1 * SIZE // when L != 0: B value for the next pass
  8445. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8446. (p5) LDFD f69 = [C1 ], -1 * SIZE // when L == 0: f69 = [C1], C1 -= SIZE (back to its starting value)
  8447. #else
  8448. nop __LINE__
  8449. #endif
  8450. (p3) FMA f67 = f43, f56, f67 // A4 * B1 (second k step)
  8451. }
  8452. ;;
  8453. { .mfi
  8454. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE // when L != 0: A3, A4 for the next pass
  8455. nop __LINE__
  8456. adds L = -1, L // L--
  8457. }
  8458. { .mfb
  8459. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8460. (p5) LDFD f71 = [C9 ], -1 * SIZE // when L == 0: f71 = [C9], C9 -= SIZE
  8461. #else
  8462. nop __LINE__
  8463. #endif
  8464. nop.f 0
  8465. br.cloop.sptk.few .L142 // repeat while ar.lc != 0 (the branch decrements ar.lc)
  8466. }
  8467. ;;
  8468. .L148: // write-back of accumulators f64..f67 through C1 and C9, then clears them; FMA/FMPY/STFD are macros defined outside this section (presumably multiply-add, multiply, store)
  8469. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8470. FMA f64 = ALPHA, f64, f68 // f64 = ALPHA * f64 + f68 (f68 was loaded from C1 in .L142)
  8471. FMA f66 = ALPHA, f66, f70 // f66 = ALPHA * f66 + f70 (f70 was loaded from C9)
  8472. FMA f65 = ALPHA, f65, f69 // f65 = ALPHA * f65 + f69
  8473. FMA f67 = ALPHA, f67, f71 // f67 = ALPHA * f67 + f71
  8474. ;;
  8475. { .mfi
  8476. STFD [C1 ] = f64, SIZE // store f64 at C1, C1 += SIZE
  8477. mov f64 = f0 // clear accumulator (f0 reads as 0.0)
  8478. adds L = 1, K // L = K + 1
  8479. }
  8480. { .mfb
  8481. STFD [C9 ] = f66, SIZE // store f66 at C9, C9 += SIZE
  8482. mov f66 = f0 // clear accumulator
  8483. nop __LINE__
  8484. }
  8485. ;;
  8486. { .mfi
  8487. STFD [C1 ] = f65, 3 * SIZE // store f65 at C1, C1 += 3 * SIZE (C1 has now moved 4 * SIZE in total)
  8488. mov f65 = f0 // clear accumulator
  8489. shr L = L, 1 // L >>= 1
  8490. }
  8491. { .mfb
  8492. STFD [C9 ] = f67, 3 * SIZE // store f67 at C9, C9 += 3 * SIZE
  8493. mov f67 = f0 // clear accumulator
  8494. nop __LINE__
  8495. }
  8496. ;;
  8497. #else
  8498. { .mfi
  8499. nop __LINE__
  8500. FMPY f64 = ALPHA, f64 // f64 = ALPHA * f64 (no previous C value is added on this path)
  8501. #if defined(TRMMKERNEL) && \
  8502. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8503. sub L = K, KK // L = K - KK
  8504. #else
  8505. nop __LINE__
  8506. #endif
  8507. }
  8508. { .mfi
  8509. nop __LINE__
  8510. FMPY f66 = ALPHA, f66 // f66 = ALPHA * f66
  8511. nop __LINE__
  8512. }
  8513. ;;
  8514. { .mfi
  8515. nop __LINE__
  8516. FMPY f65 = ALPHA, f65 // f65 = ALPHA * f65
  8517. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8518. adds L = -4, L // LEFT && TRANSA: L -= 4
  8519. #else
  8520. nop __LINE__
  8521. #endif
  8522. }
  8523. { .mfi
  8524. nop __LINE__
  8525. FMPY f67 = ALPHA, f67 // f67 = ALPHA * f67
  8526. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8527. adds L = -1, L // !LEFT && !TRANSA: L -= 1
  8528. #else
  8529. nop __LINE__
  8530. #endif
  8531. }
  8532. ;;
  8533. { .mmi
  8534. #if defined(TRMMKERNEL) && \
  8535. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8536. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT
  8537. #else
  8538. nop __LINE__
  8539. #endif
  8540. ;;
  8541. #if defined(TRMMKERNEL) && \
  8542. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8543. shladd AOFFSET = KK8, 2, AOFFSET // AOFFSET += KK8 << 2
  8544. #else
  8545. nop __LINE__
  8546. #endif
  8547. #if defined(TRMMKERNEL) && \
  8548. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8549. add BOFFSET = KK8, BOFFSET // BOFFSET += KK8
  8550. #else
  8551. nop __LINE__
  8552. #endif
  8553. }
  8554. ;;
  8555. { .mfi
  8556. STFD [C1 ] = f64, SIZE // store f64 at C1, C1 += SIZE
  8557. mov f64 = f0 // clear accumulator (f0 reads as 0.0)
  8558. #if defined(TRMMKERNEL) && defined(LEFT)
  8559. adds KK = 4, KK // TRMM, LEFT: KK += 4
  8560. #else
  8561. nop __LINE__
  8562. #endif
  8563. }
  8564. { .mfb
  8565. STFD [C9 ] = f66, SIZE // store f66 at C9, C9 += SIZE
  8566. mov f66 = f0 // clear accumulator
  8567. nop __LINE__
  8568. }
  8569. ;;
  8570. { .mfi
  8571. STFD [C1 ] = f65, 3 * SIZE // store f65 at C1, C1 += 3 * SIZE
  8572. mov f65 = f0 // clear accumulator
  8573. #ifdef TRMMKERNEL
  8574. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT
  8575. #else
  8576. nop __LINE__
  8577. #endif
  8578. }
  8579. { .mfb
  8580. STFD [C9 ] = f67, 3 * SIZE // store f67 at C9, C9 += 3 * SIZE
  8581. mov f67 = f0 // clear accumulator
  8582. nop __LINE__
  8583. }
  8584. ;;
  8585. #endif
  8586. .align 32
  8587. .L150: // tail block entered only when bit 1 of M is set: loads the first B value and two A values, then sets up the counted loop at .L152
  8588. { .mib
  8589. #ifndef TRMMKERNEL
  8590. nop __LINE__
  8591. #else
  8592. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8593. sub L = K, KK // TRMM: L = K - KK
  8594. #elif defined(LEFT)
  8595. adds L = 2, KK // TRMM, LEFT: L = KK + 2
  8596. #else
  8597. adds L = 1, KK // TRMM, remaining case: L = KK + 1
  8598. #endif
  8599. #endif
  8600. tbit.z p6, p7 = M, 1 // p6 = (bit 1 of M is clear), p7 = (bit 1 of M is set)
  8601. (p6) br.cond.dptk .L160 // bit clear: skip this block entirely
  8602. }
  8603. ;;
  8604. #if !defined(TRMMKERNEL) || \
  8605. defined(TRMMKERNEL) && \
  8606. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8607. { .mmi
  8608. LDFD f48 = [B] // first B value, read directly through B
  8609. adds BOFFSET = 1 * SIZE, B // BOFFSET = B + SIZE
  8610. #ifndef TRMMKERNEL
  8611. adds L = 1, K // L = K + 1
  8612. #else
  8613. adds L = 1, L // L = L + 1
  8614. #endif
  8615. }
  8616. ;;
  8617. #else
  8618. { .mmi
  8619. add BOFFSET = KK8, B // BOFFSET = B + KK8
  8620. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 << 1
  8621. nop __LINE__
  8622. }
  8623. ;;
  8624. { .mmi
  8625. LDFD f48 = [BOFFSET], 1 * SIZE // first B value; BOFFSET post-incremented by SIZE
  8626. nop __LINE__
  8627. #ifndef TRMMKERNEL
  8628. adds L = 1, K // L = K + 1
  8629. #else
  8630. adds L = 1, L // L = L + 1
  8631. #endif
  8632. }
  8633. ;;
  8634. #endif
  8635. { .mii
  8636. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0 always holds)
  8637. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L is clear); read before the shift below, gates the p3 update in .L152
  8638. shr L = L, 1 // L >>= 1: each pass of .L152 covers up to two k steps
  8639. }
  8640. ;;
  8641. { .mii
  8642. LDFPD f32, f33 = [AOFFSET], 2 * SIZE // first pair of A values; AOFFSET += 2 * SIZE
  8643. adds L = -1, L // L - 1 is the value placed in ar.lc below
  8644. ;;
  8645. mov ar.lc = L // loop counter consumed by br.cloop in .L152
  8646. }
  8647. ;;
  8648. .align 32
  8649. .L152: // counted k-loop: accumulates two A values times one B value into f64/f65, up to two k steps per pass (f32/f33/f48, then f40/f41/f56 under p3)
  8650. { .mfi
  8651. cmp.ne p4, p5 = 0, L // p4 = (L != 0), p5 = (L == 0)
  8652. FMA f64 = f32, f48, f64 // A1 * B1
  8653. (p12) cmp.ne p3, p0 = 0, L // only when p12: p3 = (L != 0), so the second k step is dropped on the pass where L == 0
  8654. }
  8655. ;;
  8656. { .mmf
  8657. (p3) LDFD f56 = [BOFFSET], 1 * SIZE // B value for the second k step
  8658. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // A1, A2 for the second k step
  8659. FMA f65 = f33, f48, f65 // A2 * B1
  8660. }
  8661. ;;
  8662. { .mfi
  8663. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // when L != 0: A1, A2 for the next pass
  8664. (p3) FMA f64 = f40, f56, f64 // A1 * B1 (second k step)
  8665. adds L = -1, L // L--
  8666. }
  8667. ;;
  8668. { .mfb
  8669. (p4) LDFD f48 = [BOFFSET], 1 * SIZE // when L != 0: B value for the next pass
  8670. (p3) FMA f65 = f41, f56, f65 // A2 * B1 (second k step)
  8671. br.cloop.sptk.few .L152 // repeat while ar.lc != 0 (the branch decrements ar.lc)
  8672. }
  8673. ;;
  8674. .L158: // write-back of accumulators f64/f65 through C1, then clears them; FMA/FMPY/LDFD/STFD are macros defined outside this section (presumably multiply-add, multiply, load, store)
  8675. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8676. LDFD f68 = [C1 ], SIZE // f68 = [C1], C1 += SIZE
  8677. ;;
  8678. LDFD f69 = [C1 ], -1 * SIZE // f69 = [C1], C1 -= SIZE (back to its starting value)
  8679. ;;
  8680. FMA f64 = ALPHA, f64, f68 // f64 = ALPHA * f64 + f68
  8681. FMA f65 = ALPHA, f65, f69 // f65 = ALPHA * f65 + f69
  8682. ;;
  8683. STFD [C1 ] = f64, SIZE // store f64 at C1, C1 += SIZE
  8684. mov f64 = f0 // clear accumulator (f0 reads as 0.0)
  8685. ;;
  8686. { .mfi
  8687. STFD [C1 ] = f65, SIZE // store f65 at C1, C1 += SIZE
  8688. mov f65 = f0 // clear accumulator
  8689. }
  8690. ;;
  8691. #else
  8692. { .mfi
  8693. nop __LINE__
  8694. FMPY f64 = ALPHA, f64 // f64 = ALPHA * f64 (no previous C value is added on this path)
  8695. #if defined(TRMMKERNEL) && \
  8696. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8697. sub L = K, KK // L = K - KK
  8698. #else
  8699. nop __LINE__
  8700. #endif
  8701. }
  8702. { .mfi
  8703. nop __LINE__
  8704. FMPY f65 = ALPHA, f65 // f65 = ALPHA * f65
  8705. nop __LINE__
  8706. }
  8707. ;;
  8708. { .mii
  8709. nop __LINE__
  8710. #if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA))
  8711. adds L = -2, L // LEFT && TRANSA: L -= 2
  8712. #else
  8713. nop __LINE__
  8714. #endif
  8715. #if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA))
  8716. adds L = -1, L // !LEFT && !TRANSA: L -= 1
  8717. #else
  8718. nop __LINE__
  8719. #endif
  8720. }
  8721. ;;
  8722. { .mmi
  8723. #if defined(TRMMKERNEL) && \
  8724. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8725. shladd KK8 = L, BASE_SHIFT, r0 // KK8 = L << BASE_SHIFT
  8726. #else
  8727. nop __LINE__
  8728. #endif
  8729. ;;
  8730. #if defined(TRMMKERNEL) && \
  8731. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8732. shladd AOFFSET = KK8, 1, AOFFSET // AOFFSET += KK8 << 1
  8733. #else
  8734. nop __LINE__
  8735. #endif
  8736. #if defined(TRMMKERNEL) && \
  8737. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8738. add BOFFSET = KK8, BOFFSET // BOFFSET += KK8
  8739. #else
  8740. nop __LINE__
  8741. #endif
  8742. }
  8743. ;;
  8744. { .mfi
  8745. STFD [C1 ] = f64, SIZE // store f64 at C1, C1 += SIZE
  8746. mov f64 = f0 // clear accumulator (f0 reads as 0.0)
  8747. #if defined(TRMMKERNEL) && defined(LEFT)
  8748. adds KK = 2, KK // TRMM, LEFT: KK += 2
  8749. #else
  8750. nop __LINE__
  8751. #endif
  8752. }
  8753. ;;
  8754. { .mfi
  8755. STFD [C1 ] = f65, SIZE // store f65 at C1, C1 += SIZE
  8756. mov f65 = f0 // clear accumulator
  8757. #ifdef TRMMKERNEL
  8758. shladd KK8 = KK, BASE_SHIFT, r0 // KK8 = KK << BASE_SHIFT
  8759. #else
  8760. nop __LINE__
  8761. #endif
  8762. }
  8763. ;;
  8764. #endif
  8765. .align 32
  8766. .L160: // tail block entered only when bit 0 of M is set: loads the first B value and one A value, then sets up the counted loop at .L162
  8767. { .mib
  8768. #ifndef TRMMKERNEL
  8769. nop __LINE__
  8770. #else
  8771. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  8772. sub L = K, KK // TRMM: L = K - KK
  8773. #elif defined(LEFT)
  8774. adds L = 1, KK // TRMM, LEFT: L = KK + 1
  8775. #else
  8776. adds L = 1, KK // TRMM, remaining case: L = KK + 1 (same value as the LEFT case)
  8777. #endif
  8778. #endif
  8779. tbit.z p6, p7 = M, 0 // p6 = (bit 0 of M is clear), p7 = (bit 0 of M is set)
  8780. (p6) br.cond.dptk .L169 // bit clear: skip this block entirely
  8781. }
  8782. ;;
  8783. #if !defined(TRMMKERNEL) || \
  8784. defined(TRMMKERNEL) && \
  8785. ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))
  8786. { .mmi
  8787. LDFD f48 = [B] // first B value, read directly through B
  8788. adds BOFFSET = 1 * SIZE, B // BOFFSET = B + SIZE
  8789. #ifndef TRMMKERNEL
  8790. adds L = 1, K // L = K + 1
  8791. #else
  8792. adds L = 1, L // L = L + 1
  8793. #endif
  8794. }
  8795. ;;
  8796. #else
  8797. { .mmi
  8798. add BOFFSET = KK8, B // BOFFSET = B + KK8
  8799. add AOFFSET = KK8, AOFFSET // AOFFSET += KK8
  8800. nop __LINE__
  8801. }
  8802. ;;
  8803. { .mmi
  8804. LDFD f48 = [BOFFSET], 1 * SIZE // first B value; BOFFSET post-incremented by SIZE
  8805. nop __LINE__
  8806. #ifndef TRMMKERNEL
  8807. adds L = 1, K // L = K + 1
  8808. #else
  8809. adds L = 1, L // L = L + 1
  8810. #endif
  8811. }
  8812. ;;
  8813. #endif
  8814. ;;
  8815. { .mii
  8816. LDFD f32 = [AOFFSET], 1 * SIZE // first A value; AOFFSET += SIZE
  8817. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L is clear); read before the shift below, gates the p3 update in .L162
  8818. shr L = L, 1 // L >>= 1: each pass of .L162 covers up to two k steps
  8819. }
  8820. ;;
  8821. { .mii
  8822. adds L = -1, L // L - 1 is the value placed in ar.lc below
  8823. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0 always holds)
  8824. ;;
  8825. mov ar.lc = L // loop counter consumed by br.cloop in .L162
  8826. }
  8827. ;;
  8828. .align 32
  8829. .L162: // counted k-loop: accumulates one A value times one B value into f64, up to two k steps per pass; the scaled result is stored at C1 after the loop
  8830. { .mmf
  8831. cmp.ne p4, p5 = 0, L // p4 = (L != 0), p5 = (L == 0)
  8832. (p12) cmp.ne p3, p0 = 0, L // only when p12: p3 = (L != 0), so the second k step is dropped on the pass where L == 0
  8833. FMA f64 = f32, f48, f64 // A1 * B1
  8834. }
  8835. ;;
  8836. { .mmi
  8837. (p3) LDFD f56 = [BOFFSET], 1 * SIZE // B value for the second k step
  8838. (p3) LDFD f40 = [AOFFSET], 1 * SIZE // A value for the second k step
  8839. nop __LINE__
  8840. }
  8841. ;;
  8842. { .mmi
  8843. (p4) LDFD f32 = [AOFFSET], 1 * SIZE // when L != 0: A value for the next pass
  8844. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8845. (p5) LDFD f68 = [C1] // when L == 0: f68 = [C1] (C1 is not advanced)
  8846. #else
  8847. nop __LINE__
  8848. #endif
  8849. adds L = -1, L // L--
  8850. }
  8851. { .mfb
  8852. (p4) LDFD f48 = [BOFFSET], 1 * SIZE // when L != 0: B value for the next pass
  8853. (p3) FMA f64 = f40, f56, f64 // A1 * B1 (second k step)
  8854. br.cloop.sptk.few .L162 // repeat while ar.lc != 0 (the branch decrements ar.lc)
  8855. }
  8856. ;;
  8857. #if! defined(TRMMKERNEL) && !defined(BETAZERO)
  8858. FMA f64 = ALPHA, f64, f68 // f64 = ALPHA * f64 + f68 (FMA is a macro defined outside this section; presumably multiply-add)
  8859. #else
  8860. FMPY f64 = ALPHA, f64 // f64 = ALPHA * f64 (no previous C value is added on this path)
  8861. #endif
  8862. ;;
  8863. STFD [C1 ] = f64 // store the result at C1; C1 is not advanced
  8864. ;;
  8865. .align 32
  8866. .L169: // reached after the M tail blocks: updates B and AOFFSET before falling through to .L999
  8867. { .mmi
  8868. mov B = BOFFSET // B = BOFFSET (current position in the B data)
  8869. mov AOFFSET = A // AOFFSET = A (back to the base of the A data)
  8870. #if defined(TRMMKERNEL) && !defined(LEFT)
  8871. adds KK = 1, KK // TRMM, !LEFT: KK += 1
  8872. #else
  8873. nop __LINE__
  8874. #endif
  8875. }
  8876. ;;
  8877. .align 16
  8878. #endif
  8879. .L999: // common exit: reloads f16-f31 from the stack area at SP, copies ARLC/PR/ARPFS into ar.lc/pr/ar.pfs (presumably values saved by the prologue, which is not visible here), and returns
  8880. mov r8 = r0 // r8 = 0 (r0 always reads as zero); r8 is the integer return register in the IA-64 software conventions
  8881. adds r9 = 1 * 16, SP // r9 = SP + 16; SP and r9 walk the area in alternating 16-byte slots
  8882. ;;
  8883. ldf.fill f16 = [SP], 32 // reload f16 from [SP], SP += 32
  8884. ldf.fill f17 = [r9], 32 // reload f17 from [r9], r9 += 32
  8885. ;;
  8886. ldf.fill f18 = [SP], 32
  8887. ldf.fill f19 = [r9], 32
  8888. ;;
  8889. ldf.fill f20 = [SP], 32
  8890. ldf.fill f21 = [r9], 32
  8891. ;;
  8892. ldf.fill f22 = [SP], 32
  8893. ldf.fill f23 = [r9], 32
  8894. mov ar.lc = ARLC // ar.lc = ARLC (the k-loops above overwrote ar.lc)
  8895. ;;
  8896. ldf.fill f24 = [SP], 32
  8897. ldf.fill f25 = [r9], 32
  8898. mov pr = PR, -1 // write the predicate registers from PR; mask -1 selects all of them
  8899. ;;
  8900. ldf.fill f26 = [SP], 32
  8901. ldf.fill f27 = [r9], 32
  8902. mov ar.pfs = ARPFS // ar.pfs = ARPFS
  8903. ;;
  8904. ldf.fill f28 = [SP], 32
  8905. ldf.fill f29 = [r9], 32
  8906. ;;
  8907. ldf.fill f30 = [SP], 32 // eighth post-increment: SP has now advanced by 8 * 32 = 256 bytes
  8908. ldf.fill f31 = [r9] // last reload; r9 is not advanced
  8909. br.ret.sptk.many b0 // return to the address held in b0
  8910. EPILOGUE // macro defined outside this section