You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm3m_kernel.S 110 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE (16 * 8) /* A/B prefetch distance in elements; multiplied by SIZE where PREA/PREB are set up */
  42. #else
  43. #define PREFETCHSIZE (32 * 8) /* same role as above for the non-DOUBLE build */
  44. #endif
  45. #define CPREFETCHSIZE 15 /* element offset (times SIZE) added to C1 to form the C prefetch pointer PREC */
  46. #define CPREFETCH lfetch.excl.nt1 /* prefetch instruction issued on [PREC] for the C columns */
  47. #define M r32 /* r32-r39 are the 8 input registers requested by the alloc in the prologue; I = M >> 3 (presumably the row count) */
  48. #define N r33 /* J = N >> 3 is derived from this (presumably the column count) */
  49. #define K r34 /* inner-loop trip count is derived from K + 1 (presumably the shared dimension) */
  50. #define A r37 /* base pointer copied into AOFFSET */
  51. #define B r38 /* base pointer from which BOFFSET is derived */
  52. #define C r39 /* output pointer; advanced by 8 * LDC per .L010 pass */
  53. #define LDC r35 /* loaded from [SP + 16] in the prologue, then shifted left by ZBASE_SHIFT */
  54. #define I r15 /* counter initialised to M >> 3 */
  55. #define J r16 /* counter initialised to N >> 3; decremented at .L010 */
  56. #define AOFFSET r17 /* running load pointer into A (post-incremented by LDFPD) */
  57. #define BOFFSET r18 /* running load pointer into B (post-incremented by LDFPD) */
  58. #define L r20 /* inner-loop counter; ((K + 1) >> 1) - 1 is moved into ar.lc */
  59. #define C1 r21 /* C1..C8 = c + 0..7 * ldc, set up at .L010 */
  60. #define C2 r22
  61. #define C3 r23
  62. #define C4 r24
  63. #define C5 r25
  64. #define C6 r26
  65. #define C7 r27
  66. #define C8 r28
  67. #define C9 loc0 /* C9..C16 = C1..C8 + 4 * SIZE, computed inside .L012; held in local stacked registers */
  68. #define C10 loc1
  69. #define C11 loc2
  70. #define C12 loc3
  71. #define C13 loc4
  72. #define C14 loc5
  73. #define C15 loc6
  74. #define C16 loc7
  75. #define PREA r8 /* prefetch pointer ahead of AOFFSET; NOTE: r8 is also used as a spill address in the prologue */
  76. #define PREB r9 /* prefetch pointer ahead of BOFFSET; NOTE: r9 is also used as a spill address in the prologue */
  77. #define PREC r10 /* prefetch pointer into C, stepped by LDC */
  78. #define SP r12 /* stack pointer; lowered by 16 * 16 bytes in the prologue for the f16-f31 spill area */
  79. #define ARLC r29 /* holds the entry value of ar.lc */
  80. #define PR r30 /* holds the entry value of the predicate registers */
  81. #define ARPFS r31 /* holds the ar.pfs value returned by alloc */
  82. #define ALPHA_R f8 /* scale factor multiplied into accumulators before the stores to C (presumably real part of alpha) */
  83. #define ALPHA_I f9 /* scale factor multiplied into accumulators before the stores to C (presumably imaginary part of alpha) */
  84. PROLOGUE
  85. .prologue
  86. PROFCODE
  87. { .mmi
  88. .save ar.pfs, ARPFS
  89. alloc ARPFS = ar.pfs, 8, 16, 0, 0
  90. adds r14 = 16, SP
  91. mov ARLC = ar.lc
  92. }
  93. { .mmi
  94. adds r8 = -16 * 16, SP
  95. adds r9 = -15 * 16, SP
  96. adds SP = -16 * 16, SP
  97. }
  98. ;;
  99. { .mmi
  100. stf.spill [r8] = f16, 32
  101. stf.spill [r9] = f17, 32
  102. mov PR = pr
  103. }
  104. { .mmi
  105. ld8 LDC = [r14], 8
  106. nop __LINE__
  107. nop __LINE__
  108. }
  109. ;;
  110. stf.spill [r8] = f18, 32
  111. stf.spill [r9] = f19, 32
  112. shr J = N, 3
  113. ;;
  114. stf.spill [r8] = f20, 32
  115. stf.spill [r9] = f21, 32
  116. shladd LDC = LDC, ZBASE_SHIFT, r0
  117. ;;
  118. stf.spill [r8] = f22, 32
  119. stf.spill [r9] = f23, 32
  120. mov AOFFSET = A
  121. ;;
  122. stf.spill [r8] = f24, 32
  123. stf.spill [r9] = f25, 32
  124. cmp.ge p6, p0 = 0, J
  125. ;;
  126. stf.spill [r8] = f26, 32
  127. stf.spill [r9] = f27, 32
  128. ;;
  129. stf.spill [r8] = f28, 32
  130. stf.spill [r9] = f29, 32
  131. ;;
  132. stf.spill [r8] = f30
  133. stf.spill [r9] = f31
  134. (p6) br.cond.dpnt .L050
  135. .body
  136. ;;
  137. .align 32
  138. .L010:
  139. { .mfi
  140. adds J = -1, J
  141. mov f64 = f0
  142. shr I = M, 3
  143. }
  144. { .mfi
  145. mov C1 = C // coffset1 = c + 0 * ldc
  146. mov f72 = f0
  147. }
  148. ;;
  149. { .mmf
  150. cmp.eq p6, p7 = 0, I
  151. nop __LINE__
  152. mov f80 = f0
  153. }
  154. { .mmf
  155. add C2 = LDC, C // coffset2 = c + 1 * ldc
  156. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  157. mov f88 = f0
  158. }
  159. ;;
  160. { .mmf
  161. shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc
  162. shladd C = LDC, 3, C // coffset += 8 * ldc
  163. mov f96 = f0
  164. }
  165. { .mmf
  166. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc
  167. shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc
  168. mov f104 = f0
  169. }
  170. ;;
  171. { .mfi
  172. shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc
  173. mov f112 = f0
  174. nop __LINE__
  175. }
  176. { .mfb
  177. sub C8 = C, LDC // coffset8 = c + 7 * ldc
  178. mov f120 = f0
  179. (p6) br.cond.dpnt .L020
  180. }
  181. ;;
  182. .align 16
  183. .L011:
  184. { .mfb
  185. LDFPD f48, f49 = [B]
  186. mov f65 = f0
  187. nop __LINE__
  188. }
  189. { .mfb
  190. adds BOFFSET = 2 * SIZE, B
  191. mov f73 = f0
  192. nop __LINE__
  193. }
  194. ;;
  195. { .mfb
  196. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  197. mov f81 = f0
  198. nop __LINE__
  199. }
  200. { .mfb
  201. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  202. mov f89 = f0
  203. nop __LINE__
  204. }
  205. ;;
  206. { .mmf
  207. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  208. setf.d f97 = r0
  209. mov f105 = f0
  210. }
  211. { .mfb
  212. setf.d f113 = r0
  213. mov f121 = f0
  214. nop __LINE__
  215. }
  216. ;;
  217. { .mmf
  218. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  219. setf.d f66 = r0
  220. mov f74 = f0
  221. }
  222. { .mfb
  223. setf.d f82 = r0
  224. mov f90 = f0
  225. nop __LINE__
  226. }
  227. ;;
  228. { .mmf
  229. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  230. setf.d f98 = r0
  231. mov f106 = f0
  232. }
  233. { .mfb
  234. setf.d f114 = r0
  235. mov f122 = f0
  236. nop __LINE__
  237. }
  238. ;;
  239. { .mmf
  240. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  241. setf.d f67 = r0
  242. mov f75 = f0
  243. }
  244. { .mfi
  245. setf.d f83 = r0
  246. mov f91 = f0
  247. nop __LINE__
  248. }
  249. ;;
  250. { .mmf
  251. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  252. setf.d f99 = r0
  253. mov f107 = f0
  254. }
  255. { .mfi
  256. setf.d f115 = r0
  257. mov f123 = f0
  258. adds PREC = CPREFETCHSIZE * SIZE, C1
  259. }
  260. ;;
  261. { .mmf
  262. CPREFETCH [PREC], LDC
  263. setf.d f68 = r0
  264. mov f76 = f0
  265. }
  266. { .mfi
  267. setf.d f84 = r0
  268. mov f92 = f0
  269. adds L = 1, K
  270. }
  271. ;;
  272. { .mmf
  273. CPREFETCH [PREC], LDC
  274. setf.d f100 = r0
  275. mov f108 = f0
  276. }
  277. { .mfi
  278. setf.d f116 = r0
  279. mov f124 = f0
  280. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  281. }
  282. ;;
  283. { .mmf
  284. CPREFETCH [PREC], LDC
  285. setf.d f69 = r0
  286. mov f77 = f0
  287. }
  288. { .mfi
  289. setf.d f85 = r0
  290. mov f93 = f0
  291. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
  292. }
  293. ;;
  294. { .mmf
  295. CPREFETCH [PREC], LDC
  296. setf.d f101 = r0
  297. mov f109 = f0
  298. }
  299. { .mfi
  300. setf.d f117 = r0
  301. mov f125 = f0
  302. tbit.z p12, p0 = L, 0
  303. }
  304. ;;
  305. { .mmf
  306. CPREFETCH [PREC], LDC
  307. setf.d f70 = r0
  308. mov f78 = f0
  309. }
  310. { .mfi
  311. setf.d f86 = r0
  312. mov f94 = f0
  313. shr L = L, 1
  314. }
  315. ;;
  316. { .mmf
  317. CPREFETCH [PREC], LDC
  318. setf.d f102 = r0
  319. mov f110 = f0
  320. }
  321. { .mfi
  322. setf.d f118 = r0
  323. mov f126 = f0
  324. adds L = -1, L
  325. }
  326. ;;
  327. { .mmf
  328. CPREFETCH [PREC], LDC
  329. setf.d f71 = r0
  330. mov f79 = f0
  331. }
  332. { .mfi
  333. setf.d f87 = r0
  334. mov f95 = f0
  335. mov ar.lc = L
  336. }
  337. ;;
  338. { .mmf
  339. CPREFETCH [PREC]
  340. setf.d f103 = r0
  341. mov f111 = f0
  342. }
  343. { .mfi
  344. setf.d f119 = r0
  345. mov f127 = f0
  346. cmp.eq p3, p0 = r0, r0
  347. }
  348. ;;
  349. .align 16
  350. .L012:
  351. /* 1 */
  352. { .mfi
  353. lfetch.nt1 [PREA], 16 * SIZE
  354. FMA f64 = f32, f48, f64 // A1 * B1
  355. nop __LINE__
  356. }
  357. { .mfi
  358. (p12) cmp.ne p3, p0 = 0, L
  359. FMA f72 = f32, f49, f72 // A1 * B2
  360. nop __LINE__
  361. }
  362. ;;
  363. /* 2 */
  364. { .mfi
  365. lfetch.nt1 [PREB], 16 * SIZE
  366. FMA f80 = f32, f50, f80 // A1 * B3
  367. nop __LINE__
  368. }
  369. { .mfi
  370. cmp.ne p4, p5 = 0, L
  371. FMA f88 = f32, f51, f88 // A1 * B4
  372. nop __LINE__
  373. }
  374. ;;
  375. /* 3 */
  376. { .mfi
  377. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  378. FMA f96 = f32, f52, f96 // A1 * B5
  379. nop __LINE__
  380. }
  381. { .mfi
  382. adds C9 = 4 * SIZE, C1
  383. FMA f104 = f32, f53, f104 // A1 * B6
  384. nop __LINE__
  385. }
  386. ;;
  387. /* 4 */
  388. { .mfi
  389. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  390. FMA f112 = f32, f54, f112 // A1 * B7
  391. nop __LINE__
  392. }
  393. { .mfi
  394. adds C10 = 4 * SIZE, C2
  395. FMA f120 = f32, f55, f120 // A1 * B8
  396. nop __LINE__
  397. }
  398. ;;
  399. /* 5 */
  400. { .mfi
  401. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  402. FMA f65 = f33, f48, f65 // A2 * B1
  403. nop __LINE__
  404. }
  405. { .mfi
  406. adds C11 = 4 * SIZE, C3
  407. FMA f73 = f33, f49, f73 // A2 * B2
  408. nop __LINE__
  409. }
  410. ;;
  411. /* 6 */
  412. { .mfi
  413. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  414. FMA f81 = f33, f50, f81 // A2 * B3
  415. nop __LINE__
  416. }
  417. { .mfi
  418. adds C12 = 4 * SIZE, C4
  419. FMA f89 = f33, f51, f89 // A2 * B4
  420. nop __LINE__
  421. }
  422. ;;
  423. /* 7 */
  424. { .mfi
  425. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  426. FMA f97 = f33, f52, f97 // A2 * B5
  427. nop __LINE__
  428. }
  429. { .mfi
  430. adds C13 = 4 * SIZE, C5
  431. FMA f105 = f33, f53, f105 // A2 * B6
  432. nop __LINE__
  433. }
  434. ;;
  435. /* 8 */
  436. { .mfi
  437. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  438. FMA f113 = f33, f54, f113 // A2 * B7
  439. nop __LINE__
  440. }
  441. { .mfi
  442. adds C14 = 4 * SIZE, C6
  443. FMA f121 = f33, f55, f121 // A2 * B8
  444. nop __LINE__
  445. }
  446. ;;
  447. /* 9 */
  448. { .mfi
  449. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  450. FMA f66 = f34, f48, f66 // A3 * B1
  451. nop __LINE__
  452. }
  453. { .mfi
  454. adds C15 = 4 * SIZE, C7
  455. FMA f74 = f34, f49, f74 // A3 * B2
  456. nop __LINE__
  457. }
  458. ;;
  459. /* 10 */
  460. { .mfi
  461. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  462. FMA f82 = f34, f50, f82 // A3 * B3
  463. nop __LINE__
  464. }
  465. { .mfi
  466. adds C16 = 4 * SIZE, C8
  467. FMA f90 = f34, f51, f90 // A3 * B4
  468. nop __LINE__
  469. }
  470. ;;
  471. /* 11 */
  472. { .mfi
  473. FMA f98 = f34, f52, f98 // A3 * B5
  474. nop __LINE__
  475. }
  476. { .mfi
  477. nop __LINE__
  478. FMA f106 = f34, f53, f106 // A3 * B6
  479. nop __LINE__
  480. }
  481. ;;
  482. /* 12 */
  483. { .mfi
  484. FMA f114 = f34, f54, f114 // A3 * B7
  485. nop __LINE__
  486. }
  487. { .mfi
  488. nop __LINE__
  489. FMA f122 = f34, f55, f122 // A3 * B8
  490. nop __LINE__
  491. }
  492. ;;
  493. /* 13 */
  494. { .mfi
  495. nop __LINE__
  496. FMA f67 = f35, f48, f67 // A4 * B1
  497. }
  498. { .mfi
  499. nop __LINE__
  500. FMA f75 = f35, f49, f75 // A4 * B2
  501. nop __LINE__
  502. }
  503. ;;
  504. /* 14 */
  505. { .mfi
  506. FMA f83 = f35, f50, f83 // A4 * B3
  507. nop __LINE__
  508. }
  509. { .mfi
  510. nop __LINE__
  511. FMA f91 = f35, f51, f91 // A4 * B4
  512. nop __LINE__
  513. }
  514. ;;
  515. /* 15 */
  516. { .mfi
  517. FMA f99 = f35, f52, f99 // A4 * B5
  518. nop __LINE__
  519. }
  520. { .mfi
  521. nop __LINE__
  522. FMA f107 = f35, f53, f107 // A4 * B6
  523. nop __LINE__
  524. }
  525. ;;
  526. /* 16 */
  527. { .mfi
  528. FMA f115 = f35, f54, f115 // A4 * B7
  529. nop __LINE__
  530. }
  531. { .mfi
  532. nop __LINE__
  533. FMA f123 = f35, f55, f123 // A4 * B8
  534. nop __LINE__
  535. }
  536. ;;
  537. /* 17 */
  538. { .mfi
  539. nop __LINE__
  540. FMA f68 = f36, f48, f68 // A5 * B1
  541. nop __LINE__
  542. }
  543. { .mfi
  544. nop __LINE__
  545. FMA f76 = f36, f49, f76 // A5 * B2
  546. nop __LINE__
  547. }
  548. ;;
  549. /* 18 */
  550. { .mfi
  551. nop __LINE__
  552. FMA f84 = f36, f50, f84 // A5 * B3
  553. nop __LINE__
  554. }
  555. { .mfi
  556. nop __LINE__
  557. FMA f92 = f36, f51, f92 // A5 * B4
  558. nop __LINE__
  559. }
  560. ;;
  561. /* 19 */
  562. { .mfi
  563. nop __LINE__
  564. FMA f100 = f36, f52, f100 // A5 * B5
  565. nop __LINE__
  566. }
  567. { .mfi
  568. nop __LINE__
  569. FMA f108 = f36, f53, f108 // A5 * B6
  570. nop __LINE__
  571. }
  572. ;;
  573. /* 20 */
  574. { .mfi
  575. nop __LINE__
  576. FMA f116 = f36, f54, f116 // A5 * B7
  577. nop __LINE__
  578. }
  579. { .mfi
  580. nop __LINE__
  581. FMA f124 = f36, f55, f124 // A5 * B8
  582. nop __LINE__
  583. }
  584. ;;
  585. /* 21 */
  586. { .mfi
  587. nop __LINE__
  588. FMA f69 = f37, f48, f69 // A6 * B1
  589. nop __LINE__
  590. }
  591. { .mfi
  592. nop __LINE__
  593. FMA f77 = f37, f49, f77 // A6 * B2
  594. nop __LINE__
  595. }
  596. ;;
  597. /* 22 */
  598. { .mfi
  599. nop __LINE__
  600. FMA f85 = f37, f50, f85 // A6 * B3
  601. nop __LINE__
  602. }
  603. { .mfi
  604. nop __LINE__
  605. FMA f93 = f37, f51, f93 // A6 * B4
  606. nop __LINE__
  607. }
  608. ;;
  609. /* 23 */
  610. { .mfi
  611. nop __LINE__
  612. FMA f101 = f37, f52, f101 // A6 * B5
  613. nop __LINE__
  614. }
  615. { .mfi
  616. nop __LINE__
  617. FMA f109 = f37, f53, f109 // A6 * B6
  618. nop __LINE__
  619. }
  620. ;;
  621. /* 24 */
  622. { .mfi
  623. nop __LINE__
  624. FMA f117 = f37, f54, f117 // A6 * B7
  625. nop __LINE__
  626. }
  627. { .mfi
  628. nop __LINE__
  629. FMA f125 = f37, f55, f125 // A6 * B8
  630. nop __LINE__
  631. }
  632. ;;
  633. /* 25 */
  634. { .mfi
  635. nop __LINE__
  636. FMA f70 = f38, f48, f70 // A7 * B1
  637. nop __LINE__
  638. }
  639. { .mfi
  640. nop __LINE__
  641. FMA f78 = f38, f49, f78 // A7 * B2
  642. nop __LINE__
  643. }
  644. ;;
  645. /* 26 */
  646. { .mfi
  647. nop __LINE__
  648. FMA f86 = f38, f50, f86 // A7 * B3
  649. nop __LINE__
  650. }
  651. { .mfi
  652. nop __LINE__
  653. FMA f94 = f38, f51, f94 // A7 * B4
  654. nop __LINE__
  655. }
  656. ;;
  657. /* 27 */
  658. { .mfi
  659. nop __LINE__
  660. FMA f102 = f38, f52, f102 // A7 * B5
  661. nop __LINE__
  662. }
  663. { .mfi
  664. nop __LINE__
  665. FMA f110 = f38, f53, f110 // A7 * B6
  666. nop __LINE__
  667. }
  668. ;;
  669. /* 28 */
  670. { .mfi
  671. nop __LINE__
  672. FMA f118 = f38, f54, f118 // A7 * B7
  673. nop __LINE__
  674. }
  675. { .mfi
  676. nop __LINE__
  677. FMA f126 = f38, f55, f126 // A7 * B8
  678. nop __LINE__
  679. }
  680. ;;
  681. /* 29 */
  682. { .mfi
  683. nop __LINE__
  684. FMA f71 = f39, f48, f71 // A8 * B1
  685. nop __LINE__
  686. }
  687. { .mfi
  688. nop __LINE__
  689. FMA f79 = f39, f49, f79 // A8 * B2
  690. nop __LINE__
  691. }
  692. ;;
  693. /* 30 */
  694. { .mfi
  695. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  696. FMA f87 = f39, f50, f87 // A8 * B3
  697. nop __LINE__
  698. }
  699. { .mfi
  700. nop __LINE__
  701. FMA f95 = f39, f51, f95 // A8 * B4
  702. nop __LINE__
  703. }
  704. ;;
  705. /* 31 */
  706. { .mfi
  707. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  708. FMA f103 = f39, f52, f103 // A8 * B5
  709. nop __LINE__
  710. }
  711. { .mfi
  712. nop __LINE__
  713. FMA f111 = f39, f53, f111 // A8 * B6
  714. nop __LINE__
  715. }
  716. ;;
  717. /* 32 */
  718. { .mfi
  719. nop __LINE__
  720. FMA f119 = f39, f54, f119 // A8 * B7
  721. nop __LINE__
  722. }
  723. { .mfi
  724. nop __LINE__
  725. FMA f127 = f39, f55, f127 // A8 * B8
  726. nop __LINE__
  727. }
  728. ;;
  729. /* 33 */
  730. { .mfi
  731. nop __LINE__
  732. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  733. nop __LINE__
  734. }
  735. { .mfi
  736. nop __LINE__
  737. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  738. nop __LINE__
  739. }
  740. ;;
  741. /* 34 */
  742. { .mfi
  743. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  744. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  745. nop __LINE__
  746. }
  747. { .mfi
  748. nop __LINE__
  749. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  750. nop __LINE__
  751. }
  752. ;;
  753. /* 35 */
  754. { .mfi
  755. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  756. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  757. nop __LINE__
  758. }
  759. { .mfi
  760. nop __LINE__
  761. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  762. nop __LINE__
  763. }
  764. ;;
  765. /* 36 */
  766. { .mfi
  767. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  768. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  769. nop __LINE__
  770. }
  771. { .mfi
  772. nop __LINE__
  773. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  774. nop __LINE__
  775. }
  776. ;;
  777. /* 37 */
  778. { .mfi
  779. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  780. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  781. nop __LINE__
  782. }
  783. { .mfi
  784. nop __LINE__
  785. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  786. nop __LINE__
  787. }
  788. ;;
  789. /* 38 */
  790. { .mfi
  791. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  792. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  793. nop __LINE__
  794. }
  795. { .mfi
  796. nop __LINE__
  797. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  798. nop __LINE__
  799. }
  800. ;;
  801. /* 39 */
  802. { .mfi
  803. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  804. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  805. nop __LINE__
  806. }
  807. { .mfi
  808. nop __LINE__
  809. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  810. nop __LINE__
  811. }
  812. ;;
  813. /* 40 */
  814. { .mfi
  815. (p5) LDFD f6 = [C1 ], SIZE
  816. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  817. nop __LINE__
  818. }
  819. { .mfi
  820. (p5) LDFD f7 = [C9 ], SIZE
  821. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  822. nop __LINE__
  823. }
  824. ;;
  825. /* 41 */
  826. { .mfi
  827. (p5) LDFD f10 = [C1 ], SIZE
  828. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  829. nop __LINE__
  830. }
  831. { .mfi
  832. (p5) LDFD f11 = [C9 ], SIZE
  833. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  834. nop __LINE__
  835. }
  836. ;;
  837. /* 42 */
  838. { .mfi
  839. (p5) LDFD f12 = [C1 ], SIZE
  840. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  841. nop __LINE__
  842. }
  843. { .mfi
  844. (p5) LDFD f13 = [C9 ], SIZE
  845. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  846. nop __LINE__
  847. }
  848. ;;
  849. /* 43 */
  850. { .mfi
  851. (p5) LDFD f14 = [C1 ], 5 * SIZE
  852. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  853. nop __LINE__
  854. }
  855. { .mfi
  856. (p5) LDFD f15 = [C9 ], 5 * SIZE
  857. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  858. nop __LINE__
  859. }
  860. ;;
  861. /* 44 */
  862. { .mfi
  863. (p5) LDFD f16 = [C1 ], SIZE
  864. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  865. nop __LINE__
  866. }
  867. { .mfi
  868. (p5) LDFD f17 = [C9 ], SIZE
  869. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  870. nop __LINE__
  871. }
  872. ;;
  873. /* 45 */
  874. { .mfi
  875. (p5) LDFD f18 = [C1 ], SIZE
  876. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  877. nop __LINE__
  878. }
  879. { .mfi
  880. (p5) LDFD f19 = [C9 ], SIZE
  881. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  882. nop __LINE__
  883. }
  884. ;;
  885. /* 46 */
  886. { .mfi
  887. (p5) LDFD f20 = [C1 ], SIZE
  888. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  889. nop __LINE__
  890. }
  891. { .mfi
  892. (p5) LDFD f21 = [C9 ], SIZE
  893. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  894. nop __LINE__
  895. }
  896. ;;
  897. /* 47 */
  898. { .mfi
  899. (p5) LDFD f22 = [C1 ], - 11 * SIZE
  900. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  901. nop __LINE__
  902. }
  903. { .mfi
  904. (p5) LDFD f23 = [C9 ], - 11 * SIZE
  905. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  906. nop __LINE__
  907. }
  908. ;;
  909. /* 48 */
  910. { .mfi
  911. (p5) LDFD f24 = [C2 ], SIZE
  912. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  913. nop __LINE__
  914. }
  915. { .mfi
  916. (p5) LDFD f25 = [C10], SIZE
  917. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  918. nop __LINE__
  919. }
  920. ;;
  921. /* 49 */
  922. { .mfi
  923. (p5) LDFD f26 = [C2 ], SIZE
  924. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  925. nop __LINE__
  926. }
  927. { .mfi
  928. (p5) LDFD f27 = [C10], SIZE
  929. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  930. nop __LINE__
  931. }
  932. ;;
  933. /* 50 */
  934. { .mfi
  935. (p5) LDFD f28 = [C2 ], SIZE
  936. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  937. nop __LINE__
  938. }
  939. { .mfi
  940. (p5) LDFD f29 = [C10], SIZE
  941. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  942. nop __LINE__
  943. }
  944. ;;
  945. /* 51 */
  946. { .mfi
  947. (p5) LDFD f30 = [C2 ], 5 * SIZE
  948. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  949. nop __LINE__
  950. }
  951. { .mfi
  952. (p5) LDFD f31 = [C10], 5 * SIZE
  953. (p3) FMA f108 = f44, f61, f108 // A5 * B6
  954. nop __LINE__
  955. }
  956. ;;
  957. /* 52 */
  958. { .mfi
  959. (p5) LDFD f32 = [C2 ], SIZE
  960. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  961. nop __LINE__
  962. }
  963. { .mfi
  964. (p5) LDFD f33 = [C10], SIZE
  965. (p3) FMA f124 = f44, f63, f124 // A5 * B8
  966. nop __LINE__
  967. }
  968. ;;
  969. /* 53 */
  970. { .mfi
  971. (p5) LDFD f34 = [C2 ], SIZE
  972. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  973. nop __LINE__
  974. }
  975. { .mfi
  976. (p5) LDFD f35 = [C10], SIZE
  977. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  978. nop __LINE__
  979. }
  980. ;;
  981. /* 54 */
  982. { .mfi
  983. (p5) LDFD f36 = [C2 ], SIZE
  984. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  985. nop __LINE__
  986. }
  987. { .mfi
  988. (p5) LDFD f37 = [C10], SIZE
  989. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  990. nop __LINE__
  991. }
  992. ;;
  993. /* 55 */
  994. { .mfi
  995. (p5) LDFD f38 = [C2 ], - 11 * SIZE
  996. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  997. nop __LINE__
  998. }
  999. { .mfi
  1000. (p5) LDFD f39 = [C10], - 11 * SIZE
  1001. (p3) FMA f109 = f45, f61, f109 // A6 * B6
  1002. nop __LINE__
  1003. }
  1004. ;;
  1005. /* 56 */
  1006. { .mfi
  1007. (p5) LDFD f48 = [C3 ], SIZE
  1008. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  1009. nop __LINE__
  1010. }
  1011. { .mfi
  1012. (p5) LDFD f49 = [C11], SIZE
  1013. (p3) FMA f125 = f45, f63, f125 // A6 * B8
  1014. nop __LINE__
  1015. }
  1016. ;;
  1017. /* 57 */
  1018. { .mfi
  1019. (p5) LDFD f50 = [C3 ], SIZE
  1020. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  1021. nop __LINE__
  1022. }
  1023. { .mfi
  1024. (p5) LDFD f51 = [C11], SIZE
  1025. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  1026. nop __LINE__
  1027. }
  1028. ;;
  1029. /* 58 */
  1030. { .mfi
  1031. (p5) LDFD f52 = [C3 ], SIZE
  1032. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  1033. nop __LINE__
  1034. }
  1035. { .mfi
  1036. (p5) LDFD f53 = [C11], SIZE
  1037. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  1038. nop __LINE__
  1039. }
  1040. ;;
  1041. /* 59 */
  1042. { .mfi
  1043. (p5) LDFD f54 = [C3 ], 5 * SIZE
  1044. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  1045. nop __LINE__
  1046. }
  1047. { .mfi
  1048. (p5) LDFD f55 = [C11], 5 * SIZE
  1049. (p3) FMA f110 = f46, f61, f110 // A7 * B6
  1050. nop __LINE__
  1051. }
  1052. ;;
  1053. /* 60 */
  1054. { .mfi
  1055. (p5) LDFD f40 = [C3 ], SIZE
  1056. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  1057. nop __LINE__
  1058. }
  1059. { .mfi
  1060. (p5) LDFD f41 = [C11], SIZE
  1061. (p3) FMA f126 = f46, f63, f126 // A7 * B8
  1062. nop __LINE__
  1063. }
  1064. ;;
  1065. /* 61 */
  1066. { .mfi
  1067. (p5) LDFD f42 = [C3 ], SIZE
  1068. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  1069. nop __LINE__
  1070. }
  1071. { .mfi
  1072. (p5) LDFD f43 = [C11], SIZE
  1073. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  1074. nop __LINE__
  1075. }
  1076. ;;
  1077. /* 62 */
  1078. { .mfi
  1079. (p5) LDFD f44 = [C3 ], SIZE
  1080. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  1081. nop __LINE__
  1082. }
  1083. { .mfi
  1084. (p5) LDFD f45 = [C11], SIZE
  1085. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  1086. nop __LINE__
  1087. }
  1088. ;;
  1089. /* 63 */
  1090. { .mfi
  1091. (p5) LDFD f46 = [C3 ], - 11 * SIZE
  1092. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  1093. nop __LINE__
  1094. }
  1095. { .mfi
  1096. (p5) LDFD f56 = [C11], - 11 * SIZE
  1097. (p3) FMA f111 = f47, f61, f111 // A8 * B6
  1098. nop __LINE__
  1099. }
  1100. ;;
  1101. /* 64 */
  1102. { .mfi
  1103. (p5) LDFD f57 = [C4 ], SIZE
  1104. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  1105. adds L = -1, L
  1106. }
  1107. { .mfb
  1108. (p5) LDFD f58 = [C12], SIZE
  1109. (p3) FMA f127 = f47, f63, f127 // A8 * B8
  1110. br.cloop.sptk.few .L012
  1111. }
  1112. ;;
  1113. .L013:
  1114. { .mmf
  1115. (p5) LDFD f59 = [C4 ], SIZE
  1116. (p5) LDFD f60 = [C12], SIZE
  1117. FMA f6 = ALPHA_R, f64, f6
  1118. }
  1119. { .mmf
  1120. cmp.ne p6, p0 = 1, I
  1121. nop __LINE__
  1122. FMA f7 = ALPHA_R, f66, f7
  1123. }
  1124. ;;
  1125. { .mmf
  1126. (p5) LDFD f61 = [C4 ], SIZE
  1127. (p5) LDFD f62 = [C12], SIZE
  1128. FMA f10 = ALPHA_I, f64, f10
  1129. }
  1130. { .mmf
  1131. nop __LINE__
  1132. nop __LINE__
  1133. FMA f11 = ALPHA_I, f66, f11
  1134. }
  1135. ;;
  1136. { .mmf
  1137. (p5) LDFD f63 = [C4 ], 5 * SIZE
  1138. (p5) LDFD f47 = [C12], 5 * SIZE
  1139. FMA f12 = ALPHA_R, f65, f12
  1140. }
  1141. { .mmf
  1142. nop __LINE__
  1143. nop __LINE__
  1144. FMA f13 = ALPHA_R, f67, f13
  1145. }
  1146. ;;
  1147. { .mfi
  1148. (p5) LDFD f64 = [C4 ], SIZE
  1149. FMA f14 = ALPHA_I, f65, f14
  1150. nop __LINE__
  1151. }
  1152. { .mfi
  1153. (p5) LDFD f65 = [C12], SIZE
  1154. FMA f15 = ALPHA_I, f67, f15
  1155. nop __LINE__
  1156. }
  1157. ;;
  1158. { .mmf
  1159. STFD [C1 ] = f6, SIZE
  1160. STFD [C9 ] = f7, SIZE
  1161. FMA f16 = ALPHA_R, f68, f16
  1162. }
  1163. { .mmf
  1164. (p5) LDFD f6 = [C4 ], SIZE
  1165. (p5) LDFD f7 = [C12], SIZE
  1166. FMA f17 = ALPHA_R, f70, f17
  1167. }
  1168. ;;
  1169. { .mmf
  1170. STFD [C1 ] = f10, SIZE
  1171. STFD [C9 ] = f11, SIZE
  1172. FMA f18 = ALPHA_I, f68, f18
  1173. }
  1174. { .mmf
  1175. (p5) LDFD f10 = [C4 ], SIZE
  1176. (p5) LDFD f11 = [C12], SIZE
  1177. FMA f19 = ALPHA_I, f70, f19
  1178. }
  1179. ;;
  1180. { .mmf
  1181. STFD [C1 ] = f12, SIZE
  1182. STFD [C9 ] = f13, SIZE
  1183. FMA f20 = ALPHA_R, f69, f20
  1184. }
  1185. { .mmf
  1186. (p5) LDFD f12 = [C4 ], - 11 * SIZE
  1187. (p5) LDFD f13 = [C12], - 11 * SIZE
  1188. FMA f21 = ALPHA_R, f71, f21
  1189. }
  1190. ;;
  1191. { .mmf
  1192. STFD [C1 ] = f14, 5 * SIZE
  1193. STFD [C9 ] = f15, 5 * SIZE
  1194. FMA f22 = ALPHA_I, f69, f22
  1195. }
  1196. { .mmf
  1197. (p5) LDFD f14 = [C5 ], SIZE
  1198. (p5) LDFD f15 = [C13], SIZE
  1199. FMA f23 = ALPHA_I, f71, f23
  1200. }
  1201. ;;
  1202. { .mmf
  1203. STFD [C1 ] = f16, SIZE
  1204. STFD [C9 ] = f17, SIZE
  1205. FMA f24 = ALPHA_R, f72, f24
  1206. }
  1207. { .mmf
  1208. (p5) LDFD f16 = [C5 ], SIZE
  1209. (p5) LDFD f17 = [C13], SIZE
  1210. FMA f25 = ALPHA_R, f74, f25
  1211. }
  1212. ;;
  1213. { .mmf
  1214. STFD [C1 ] = f18, SIZE
  1215. STFD [C9 ] = f19, SIZE
  1216. FMA f26 = ALPHA_I, f72, f26
  1217. }
  1218. { .mmf
  1219. (p5) LDFD f18 = [C5 ], SIZE
  1220. (p5) LDFD f19 = [C13], SIZE
  1221. FMA f27 = ALPHA_I, f74, f27
  1222. }
  1223. ;;
  1224. { .mmf
  1225. STFD [C1 ] = f20, SIZE
  1226. STFD [C9 ] = f21, SIZE
  1227. FMA f28 = ALPHA_R, f73, f28
  1228. }
  1229. { .mmf
  1230. (p5) LDFD f20 = [C5 ], 5 * SIZE
  1231. (p5) LDFD f21 = [C13], 5 * SIZE
  1232. FMA f29 = ALPHA_R, f75, f29
  1233. }
  1234. ;;
  1235. { .mmf
  1236. STFD [C1 ] = f22, 5 * SIZE
  1237. STFD [C9 ] = f23, 5 * SIZE
  1238. FMA f30 = ALPHA_I, f73, f30
  1239. }
  1240. { .mmf
  1241. (p5) LDFD f22 = [C5 ], SIZE
  1242. (p5) LDFD f23 = [C13], SIZE
  1243. FMA f31 = ALPHA_I, f75, f31
  1244. }
  1245. ;;
  1246. { .mmf
  1247. STFD [C2 ] = f24, SIZE
  1248. STFD [C10] = f25, SIZE
  1249. FMA f32 = ALPHA_R, f76, f32
  1250. }
  1251. { .mmf
  1252. (p5) LDFD f24 = [C5 ], SIZE
  1253. (p5) LDFD f25 = [C13], SIZE
  1254. FMA f33 = ALPHA_R, f78, f33
  1255. }
  1256. ;;
  1257. { .mmf
  1258. STFD [C2 ] = f26, SIZE
  1259. STFD [C10] = f27, SIZE
  1260. FMA f34 = ALPHA_I, f76, f34
  1261. }
  1262. { .mmf
  1263. (p5) LDFD f26 = [C5 ], SIZE
  1264. (p5) LDFD f27 = [C13], SIZE
  1265. FMA f35 = ALPHA_I, f78, f35
  1266. }
  1267. ;;
  1268. { .mmf
  1269. STFD [C2 ] = f28, SIZE
  1270. STFD [C10] = f29, SIZE
  1271. FMA f36 = ALPHA_R, f77, f36
  1272. }
  1273. { .mmf
  1274. (p5) LDFD f28 = [C5 ], - 11 * SIZE
  1275. (p5) LDFD f29 = [C13], - 11 * SIZE
  1276. FMA f37 = ALPHA_R, f79, f37
  1277. }
  1278. ;;
  1279. { .mmf
  1280. STFD [C2 ] = f30, 5 * SIZE
  1281. STFD [C10] = f31, 5 * SIZE
  1282. FMA f38 = ALPHA_I, f77, f38
  1283. }
  1284. { .mmf
  1285. (p5) LDFD f30 = [C6 ], SIZE
  1286. (p5) LDFD f31 = [C14], SIZE
  1287. FMA f39 = ALPHA_I, f79, f39
  1288. }
  1289. ;;
  1290. { .mmf
  1291. STFD [C2 ] = f32, SIZE
  1292. STFD [C10] = f33, SIZE
  1293. FMA f48 = ALPHA_R, f80, f48
  1294. }
  1295. { .mmf
  1296. (p5) LDFD f32 = [C6 ], SIZE
  1297. (p5) LDFD f33 = [C14], SIZE
  1298. FMA f49 = ALPHA_R, f82, f49
  1299. }
  1300. ;;
  1301. { .mmf
  1302. STFD [C2 ] = f34, SIZE
  1303. STFD [C10] = f35, SIZE
  1304. FMA f50 = ALPHA_I, f80, f50
  1305. }
  1306. { .mmf
  1307. (p5) LDFD f34 = [C6 ], SIZE
  1308. (p5) LDFD f35 = [C14], SIZE
  1309. FMA f51 = ALPHA_I, f82, f51
  1310. }
  1311. ;;
  1312. { .mmf
  1313. STFD [C2 ] = f36, SIZE
  1314. STFD [C10] = f37, SIZE
  1315. FMA f52 = ALPHA_R, f81, f52
  1316. }
  1317. { .mmf
  1318. (p5) LDFD f36 = [C6 ], 5 * SIZE
  1319. (p5) LDFD f37 = [C14], 5 * SIZE
  1320. FMA f53 = ALPHA_R, f83, f53
  1321. }
  1322. ;;
  1323. { .mmf
  1324. STFD [C2 ] = f38, 5 * SIZE
  1325. STFD [C10] = f39, 5 * SIZE
  1326. FMA f54 = ALPHA_I, f81, f54
  1327. }
  1328. { .mmf
  1329. (p5) LDFD f38 = [C6 ], SIZE
  1330. (p5) LDFD f39 = [C14], SIZE
  1331. FMA f55 = ALPHA_I, f83, f55
  1332. }
  1333. ;;
  1334. { .mmf
  1335. STFD [C3 ] = f48, SIZE
  1336. STFD [C11] = f49, SIZE
  1337. FMA f40 = ALPHA_R, f84, f40
  1338. }
  1339. { .mmf
  1340. (p5) LDFD f48 = [C6 ], SIZE
  1341. (p5) LDFD f49 = [C14], SIZE
  1342. FMA f41 = ALPHA_R, f86, f41
  1343. }
  1344. ;;
  1345. { .mmf
  1346. STFD [C3 ] = f50, SIZE
  1347. STFD [C11] = f51, SIZE
  1348. FMA f42 = ALPHA_I, f84, f42
  1349. }
  1350. { .mmf
  1351. (p5) LDFD f50 = [C6 ], SIZE
  1352. (p5) LDFD f51 = [C14], SIZE
  1353. FMA f43 = ALPHA_I, f86, f43
  1354. }
  1355. ;;
  1356. { .mmf
  1357. STFD [C3 ] = f52, SIZE
  1358. STFD [C11] = f53, SIZE
  1359. FMA f44 = ALPHA_R, f85, f44
  1360. }
  1361. { .mmf
  1362. (p5) LDFD f52 = [C6 ], - 11 * SIZE
  1363. (p5) LDFD f53 = [C14], - 11 * SIZE
  1364. FMA f45 = ALPHA_R, f87, f45
  1365. }
  1366. ;;
  1367. { .mmf
  1368. STFD [C3 ] = f54, 5 * SIZE
  1369. STFD [C11] = f55, 5 * SIZE
  1370. FMA f46 = ALPHA_I, f85, f46
  1371. }
  1372. { .mmf
  1373. (p5) LDFD f54 = [C7 ], SIZE
  1374. (p5) LDFD f55 = [C15], SIZE
  1375. FMA f56 = ALPHA_I, f87, f56
  1376. }
  1377. ;;
  1378. { .mmf
  1379. STFD [C3 ] = f40, SIZE
  1380. STFD [C11] = f41, SIZE
  1381. FMA f57 = ALPHA_R, f88, f57
  1382. }
  1383. { .mmf
  1384. (p5) LDFD f40 = [C7 ], SIZE
  1385. (p5) LDFD f41 = [C15], SIZE
  1386. FMA f58 = ALPHA_R, f90, f58
  1387. }
  1388. ;;
  1389. { .mmf
  1390. STFD [C3 ] = f42, SIZE
  1391. STFD [C11] = f43, SIZE
  1392. FMA f59 = ALPHA_I, f88, f59
  1393. }
  1394. { .mmf
  1395. (p5) LDFD f42 = [C7 ], SIZE
  1396. (p5) LDFD f43 = [C15], SIZE
  1397. FMA f60 = ALPHA_I, f90, f60
  1398. }
  1399. ;;
  1400. { .mmf
  1401. STFD [C3 ] = f44, SIZE
  1402. STFD [C11] = f45, SIZE
  1403. FMA f61 = ALPHA_R, f89, f61
  1404. }
  1405. { .mmf
  1406. (p5) LDFD f44 = [C7 ], 5 * SIZE
  1407. (p5) LDFD f45 = [C15], 5 * SIZE
  1408. FMA f62 = ALPHA_R, f91, f62
  1409. }
  1410. ;;
  1411. { .mmf
  1412. STFD [C3 ] = f46, 5 * SIZE
  1413. STFD [C11] = f56, 5 * SIZE
  1414. FMA f63 = ALPHA_I, f89, f63
  1415. }
  1416. { .mmf
  1417. (p5) LDFD f46 = [C7 ], SIZE
  1418. (p5) LDFD f56 = [C15], SIZE
  1419. FMA f47 = ALPHA_I, f91, f47
  1420. }
  1421. ;;
  1422. { .mmf
  1423. STFD [C4 ] = f57, SIZE
  1424. STFD [C12] = f58, SIZE
  1425. FMA f64 = ALPHA_R, f92, f64
  1426. }
  1427. { .mmf
  1428. (p5) LDFD f57 = [C7 ], SIZE
  1429. (p5) LDFD f58 = [C15], SIZE
  1430. FMA f65 = ALPHA_R, f94, f65
  1431. }
  1432. ;;
  1433. { .mmf
  1434. STFD [C4 ] = f59, SIZE
  1435. STFD [C12] = f60, SIZE
  1436. FMA f6 = ALPHA_I, f92, f6
  1437. }
  1438. { .mmf
  1439. (p5) LDFD f59 = [C7 ], SIZE
  1440. (p5) LDFD f60 = [C15], SIZE
  1441. FMA f7 = ALPHA_I, f94, f7
  1442. }
  1443. ;;
  1444. { .mmf
  1445. STFD [C4 ] = f61, SIZE
  1446. STFD [C12] = f62, SIZE
  1447. FMA f10 = ALPHA_R, f93, f10
  1448. }
  1449. { .mmf
  1450. (p5) LDFD f61 = [C7 ], - 11 * SIZE
  1451. (p5) LDFD f62 = [C15], - 11 * SIZE
  1452. FMA f11 = ALPHA_R, f95, f11
  1453. }
  1454. ;;
  1455. { .mmf
  1456. STFD [C4 ] = f63, 5 * SIZE
  1457. STFD [C12] = f47, 5 * SIZE
  1458. FMA f12 = ALPHA_I, f93, f12
  1459. }
  1460. { .mmf
  1461. (p5) LDFD f63 = [C8 ], SIZE
  1462. (p5) LDFD f47 = [C16], SIZE
  1463. FMA f13 = ALPHA_I, f95, f13
  1464. }
  1465. ;;
  1466. { .mmf
  1467. STFD [C4 ] = f64, SIZE
  1468. STFD [C12] = f65, SIZE
  1469. FMA f14 = ALPHA_R, f96, f14
  1470. }
  1471. { .mmf
  1472. (p5) LDFD f64 = [C8 ], SIZE
  1473. (p5) LDFD f65 = [C16], SIZE
  1474. FMA f15 = ALPHA_R, f98, f15
  1475. }
  1476. ;;
  1477. { .mmf
  1478. STFD [C4 ] = f6, SIZE
  1479. STFD [C12] = f7, SIZE
  1480. FMA f16 = ALPHA_I, f96, f16
  1481. }
  1482. { .mmf
  1483. (p5) LDFD f6 = [C8 ], SIZE
  1484. (p5) LDFD f7 = [C16], SIZE
  1485. FMA f17 = ALPHA_I, f98, f17
  1486. }
  1487. ;;
  1488. { .mmf
  1489. STFD [C4 ] = f10, SIZE
  1490. STFD [C12] = f11, SIZE
  1491. FMA f18 = ALPHA_R, f97, f18
  1492. }
  1493. { .mmf
  1494. (p5) LDFD f10 = [C8 ], 5 * SIZE
  1495. (p5) LDFD f11 = [C16], 5 * SIZE
  1496. FMA f19 = ALPHA_R, f99, f19
  1497. }
  1498. ;;
  1499. { .mmf
  1500. STFD [C4 ] = f12, 5 * SIZE
  1501. STFD [C12] = f13, 5 * SIZE
  1502. FMA f20 = ALPHA_I, f97, f20
  1503. }
  1504. { .mmf
  1505. (p5) LDFD f12 = [C8 ], SIZE
  1506. (p5) LDFD f13 = [C16], SIZE
  1507. FMA f21 = ALPHA_I, f99, f21
  1508. }
  1509. ;;
  1510. { .mmf
  1511. STFD [C5 ] = f14, SIZE
  1512. STFD [C13] = f15, SIZE
  1513. FMA f22 = ALPHA_R, f100, f22
  1514. }
  1515. { .mmf
  1516. (p5) LDFD f14 = [C8 ], SIZE
  1517. (p5) LDFD f15 = [C16], SIZE
  1518. FMA f23 = ALPHA_R, f102, f23
  1519. }
  1520. ;;
  1521. { .mmf
  1522. STFD [C5 ] = f16, SIZE
  1523. STFD [C13] = f17, SIZE
  1524. FMA f24 = ALPHA_I, f100, f24
  1525. }
  1526. { .mmf
  1527. (p5) LDFD f16 = [C8 ], SIZE
  1528. (p5) LDFD f17 = [C16], SIZE
  1529. FMA f25 = ALPHA_I, f102, f25
  1530. }
  1531. ;;
  1532. { .mmf
  1533. STFD [C5 ] = f18, SIZE
  1534. STFD [C13] = f19, SIZE
  1535. FMA f26 = ALPHA_R, f101, f26
  1536. }
  1537. { .mmf
  1538. (p5) LDFD f18 = [C8 ], - 11 * SIZE
  1539. (p5) LDFD f19 = [C16], - 11 * SIZE
  1540. FMA f27 = ALPHA_R, f103, f27
  1541. }
  1542. ;;
  1543. { .mmf
  1544. STFD [C5 ] = f20, 5 * SIZE
  1545. STFD [C13] = f21, 5 * SIZE
  1546. FMA f28 = ALPHA_I, f101, f28
  1547. }
  1548. { .mmf
  1549. nop __LINE__
  1550. nop __LINE__
  1551. FMA f29 = ALPHA_I, f103, f29
  1552. }
  1553. ;;
  1554. { .mmf
  1555. STFD [C5 ] = f22, SIZE
  1556. STFD [C13] = f23, SIZE
  1557. FMA f30 = ALPHA_R, f104, f30
  1558. }
  1559. { .mmf
  1560. nop __LINE__
  1561. nop __LINE__
  1562. FMA f31 = ALPHA_R, f106, f31
  1563. }
  1564. ;;
  1565. { .mmf
  1566. STFD [C5 ] = f24, SIZE
  1567. STFD [C13] = f25, SIZE
  1568. FMA f32 = ALPHA_I, f104, f32
  1569. }
  1570. { .mmf
  1571. nop __LINE__
  1572. nop __LINE__
  1573. FMA f33 = ALPHA_I, f106, f33
  1574. }
  1575. ;;
  1576. { .mmf
  1577. STFD [C5 ] = f26, SIZE
  1578. STFD [C13] = f27, SIZE
  1579. FMA f34 = ALPHA_R, f105, f34
  1580. }
  1581. { .mmf
  1582. nop __LINE__
  1583. nop __LINE__
  1584. FMA f35 = ALPHA_R, f107, f35
  1585. }
  1586. ;;
  1587. { .mmf
  1588. STFD [C5 ] = f28, 5 * SIZE
  1589. STFD [C13] = f29, 5 * SIZE
  1590. FMA f36 = ALPHA_I, f105, f36
  1591. }
  1592. { .mmf
  1593. nop __LINE__
  1594. nop __LINE__
  1595. FMA f37 = ALPHA_I, f107, f37
  1596. }
  1597. ;;
  1598. { .mmf
  1599. STFD [C6 ] = f30, SIZE
  1600. STFD [C14] = f31, SIZE
  1601. FMA f38 = ALPHA_R, f108, f38
  1602. }
  1603. { .mmf
  1604. nop __LINE__
  1605. nop __LINE__
  1606. FMA f39 = ALPHA_R, f110, f39
  1607. }
  1608. ;;
  1609. { .mmf
  1610. STFD [C6 ] = f32, SIZE
  1611. STFD [C14] = f33, SIZE
  1612. FMA f48 = ALPHA_I, f108, f48
  1613. }
  1614. { .mmf
  1615. nop __LINE__
  1616. nop __LINE__
  1617. FMA f49 = ALPHA_I, f110, f49
  1618. }
  1619. ;;
  1620. { .mmf
  1621. STFD [C6 ] = f34, SIZE
  1622. STFD [C14] = f35, SIZE
  1623. FMA f50 = ALPHA_R, f109, f50
  1624. }
  1625. { .mmf
  1626. nop __LINE__
  1627. nop __LINE__
  1628. FMA f51 = ALPHA_R, f111, f51
  1629. }
  1630. ;;
  1631. { .mmf
  1632. STFD [C6 ] = f36, 5 * SIZE
  1633. STFD [C14] = f37, 5 * SIZE
  1634. FMA f52 = ALPHA_I, f109, f52
  1635. }
  1636. { .mmf
  1637. nop __LINE__
  1638. nop __LINE__
  1639. FMA f53 = ALPHA_I, f111, f53
  1640. }
  1641. ;;
  1642. { .mmf
  1643. STFD [C6 ] = f38, SIZE
  1644. STFD [C14] = f39, SIZE
  1645. FMA f54 = ALPHA_R, f112, f54
  1646. }
  1647. { .mmf
  1648. nop __LINE__
  1649. nop __LINE__
  1650. FMA f55 = ALPHA_R, f114, f55
  1651. }
  1652. ;;
  1653. { .mmf
  1654. STFD [C6 ] = f48, SIZE
  1655. STFD [C14] = f49, SIZE
  1656. FMA f40 = ALPHA_I, f112, f40
  1657. }
  1658. { .mmf
  1659. nop __LINE__
  1660. nop __LINE__
  1661. FMA f41 = ALPHA_I, f114, f41
  1662. }
  1663. ;;
  1664. { .mmf
  1665. STFD [C6 ] = f50, SIZE
  1666. STFD [C14] = f51, SIZE
  1667. FMA f42 = ALPHA_R, f113, f42
  1668. }
  1669. { .mmf
  1670. nop __LINE__
  1671. nop __LINE__
  1672. FMA f43 = ALPHA_R, f115, f43
  1673. }
  1674. ;;
  1675. { .mmf
  1676. STFD [C6 ] = f52, 5 * SIZE
  1677. STFD [C14] = f53, 5 * SIZE
  1678. FMA f44 = ALPHA_I, f113, f44
  1679. }
  1680. { .mmf
  1681. nop __LINE__
  1682. nop __LINE__
  1683. FMA f45 = ALPHA_I, f115, f45
  1684. }
  1685. ;;
  1686. { .mmf
  1687. STFD [C7 ] = f54, SIZE
  1688. STFD [C15] = f55, SIZE
  1689. FMA f46 = ALPHA_R, f116, f46
  1690. }
  1691. { .mmf
  1692. nop __LINE__
  1693. nop __LINE__
  1694. FMA f56 = ALPHA_R, f118, f56
  1695. }
  1696. ;;
  1697. { .mmf
  1698. STFD [C7 ] = f40, SIZE
  1699. STFD [C15] = f41, SIZE
  1700. FMA f57 = ALPHA_I, f116, f57
  1701. }
  1702. { .mmf
  1703. nop __LINE__
  1704. nop __LINE__
  1705. FMA f58 = ALPHA_I, f118, f58
  1706. }
  1707. ;;
  1708. { .mmf
  1709. STFD [C7 ] = f42, SIZE
  1710. STFD [C15] = f43, SIZE
  1711. FMA f59 = ALPHA_R, f117, f59
  1712. }
  1713. { .mmf
  1714. nop __LINE__
  1715. nop __LINE__
  1716. FMA f60 = ALPHA_R, f119, f60
  1717. }
  1718. ;;
  1719. { .mmf
  1720. STFD [C7 ] = f44, 5 * SIZE
  1721. STFD [C15] = f45, 5 * SIZE
  1722. FMA f61 = ALPHA_I, f117, f61
  1723. }
  1724. { .mmf
  1725. nop __LINE__
  1726. nop __LINE__
  1727. FMA f62 = ALPHA_I, f119, f62
  1728. }
  1729. ;;
  1730. { .mmf
  1731. STFD [C7 ] = f46, SIZE
  1732. STFD [C15] = f56, SIZE
  1733. FMA f63 = ALPHA_R, f120, f63
  1734. }
  1735. { .mmf
  1736. nop __LINE__
  1737. nop __LINE__
  1738. FMA f47 = ALPHA_R, f122, f47
  1739. }
  1740. ;;
  1741. { .mmf
  1742. STFD [C7 ] = f57, SIZE
  1743. STFD [C15] = f58, SIZE
  1744. FMA f64 = ALPHA_I, f120, f64
  1745. }
  1746. { .mmf
  1747. nop __LINE__
  1748. nop __LINE__
  1749. FMA f65 = ALPHA_I, f122, f65
  1750. }
  1751. ;;
  1752. { .mmf
  1753. STFD [C7 ] = f59, SIZE
  1754. STFD [C15] = f60, SIZE
  1755. FMA f6 = ALPHA_R, f121, f6
  1756. }
  1757. { .mmf
  1758. nop __LINE__
  1759. nop __LINE__
  1760. FMA f7 = ALPHA_R, f123, f7
  1761. }
  1762. ;;
  1763. { .mmf
  1764. STFD [C7 ] = f61, 5 * SIZE
  1765. STFD [C15] = f62, 5 * SIZE
  1766. FMA f10 = ALPHA_I, f121, f10
  1767. }
  1768. { .mmf
  1769. nop __LINE__
  1770. nop __LINE__
  1771. FMA f11 = ALPHA_I, f123, f11
  1772. }
  1773. ;;
  1774. { .mmf
  1775. STFD [C8 ] = f63, SIZE
  1776. STFD [C16] = f47, SIZE
  1777. FMA f12 = ALPHA_R, f124, f12
  1778. }
  1779. { .mmf
  1780. nop __LINE__
  1781. nop __LINE__
  1782. FMA f13 = ALPHA_R, f126, f13
  1783. }
  1784. ;;
  1785. { .mmf
  1786. STFD [C8 ] = f64, SIZE
  1787. STFD [C16] = f65, SIZE
  1788. FMA f14 = ALPHA_I, f124, f14
  1789. }
  1790. { .mmf
  1791. nop __LINE__
  1792. nop __LINE__
  1793. FMA f15 = ALPHA_I, f126, f15
  1794. }
  1795. ;;
  1796. { .mmf
  1797. STFD [C8 ] = f6, SIZE
  1798. STFD [C16] = f7, SIZE
  1799. FMA f16 = ALPHA_R, f125, f16
  1800. }
  1801. { .mmf
  1802. nop __LINE__
  1803. nop __LINE__
  1804. FMA f17 = ALPHA_R, f127, f17
  1805. }
  1806. ;;
  1807. { .mmf
  1808. STFD [C8 ] = f10, 5 * SIZE
  1809. STFD [C16] = f11, 5 * SIZE
  1810. FMA f18 = ALPHA_I, f125, f18
  1811. }
  1812. { .mmf
  1813. nop __LINE__
  1814. nop __LINE__
  1815. FMA f19 = ALPHA_I, f127, f19
  1816. }
  1817. ;;
  1818. { .mmf
  1819. STFD [C8 ] = f12, SIZE
  1820. STFD [C16] = f13, SIZE
  1821. mov f64 = f0
  1822. }
  1823. { .mmf
  1824. nop __LINE__
  1825. nop __LINE__
  1826. mov f72 = f0
  1827. }
  1828. ;;
  1829. { .mmf
  1830. STFD [C8 ] = f14, SIZE
  1831. STFD [C16] = f15, SIZE
  1832. mov f80 = f0
  1833. }
  1834. { .mmf
  1835. nop __LINE__
  1836. nop __LINE__
  1837. mov f88 = f0
  1838. }
  1839. ;;
  1840. { .mmf
  1841. STFD [C8 ] = f16, SIZE
  1842. STFD [C16] = f17, SIZE
  1843. mov f96 = f0
  1844. }
  1845. { .mmf
  1846. nop __LINE__
  1847. nop __LINE__
  1848. mov f104 = f0
  1849. }
  1850. ;;
  1851. { .mmf
  1852. STFD [C8 ] = f18, 5 * SIZE
  1853. STFD [C16] = f19, 5 * SIZE
  1854. mov f112 = f0
  1855. }
  1856. { .mfb
  1857. adds I = -1, I
  1858. mov f120 = f0
  1859. (p6) br.cond.dptk .L011
  1860. }
  1861. ;;
  // .L020: set-up for the path taken when bit 2 of M is set.
  // If bit 2 of M is clear (p6), control goes straight to .L030.
  // Otherwise this block:
  //   - forces p3 true (r0 == r0 always holds),
  //   - loads the first B values (f48-f55) and A values (f32-f35),
  //   - clears accumulators f65-f67, f73-f75, ..., f121-f123,
  //   - programs ar.lc with ((K + 1) >> 1) - 1 for the loop at .L022.
  // f64, f72, ..., f120 are not cleared here; the visible write-back code
  // that precedes this label clears them before falling through.
  // Zeroing uses both "mov fX = f0" and "setf.d fX = r0"; presumably the mix
  // is there to fill otherwise idle M and F slots - confirm before changing.
  1862. .L020:
  1863. { .mfi
  1864. cmp.eq p3, p0 = r0, r0
  1865. mov f89 = f0
  1866. tbit.z p6, p7 = M, 2
  1867. }
  1868. { .mfb
  1869. nop __LINE__
  1870. mov f81 = f0
  1871. (p6) br.cond.dptk .L030
  1872. }
  1873. ;;
  // First pair of B values; BOFFSET then walks B starting two elements in.
  1874. { .mfi
  1875. LDFPD f48, f49 = [B]
  1876. mov f65 = f0
  1877. nop __LINE__
  1878. }
  1879. { .mfi
  1880. adds BOFFSET = 2 * SIZE, B
  1881. mov f73 = f0
  1882. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  1883. }
  1884. ;;
  // L = K + 1; the low bit of this value is tested below to derive p12.
  1885. { .mmf
  1886. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  1887. setf.d f97 = r0
  1888. mov f105 = f0
  1889. }
  1890. { .mfi
  1891. setf.d f113 = r0
  1892. mov f121 = f0
  1893. adds L = 1, K
  1894. }
  1895. ;;
  // p12 is set when bit 0 of (K + 1) is zero, i.e. K is odd.  .L022 uses
  // p12 to switch off its second K step on the final pass.
  1896. { .mmf
  1897. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  1898. setf.d f66 = r0
  1899. mov f74 = f0
  1900. }
  1901. { .mfi
  1902. setf.d f82 = r0
  1903. mov f90 = f0
  1904. tbit.z p12, p0 = L, 0
  1905. }
  1906. ;;
  // L = (K + 1) >> 1: number of passes of the two-step loop.
  1907. { .mmf
  1908. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  1909. setf.d f98 = r0
  1910. mov f106 = f0
  1911. }
  1912. { .mfi
  1913. setf.d f114 = r0
  1914. mov f122 = f0
  1915. shr L = L, 1
  1916. }
  1917. ;;
  // br.cloop executes ar.lc + 1 passes, hence the extra decrement.
  1918. { .mfi
  1919. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  1920. mov f75 = f0
  1921. adds L = -1, L
  1922. }
  1923. { .mmf
  1924. setf.d f67 = r0
  1925. setf.d f83 = r0
  1926. mov f91 = f0
  1927. }
  1928. ;;
  1929. { .mfi
  1930. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  1931. mov f107 = f0
  1932. mov ar.lc = L
  1933. }
  1934. { .mmf
  1935. setf.d f99 = r0
  1936. setf.d f115 = r0
  1937. mov f123 = f0
  1938. }
  1939. ;;
  // .L022: multiply-accumulate loop for the tile prepared at .L020.
  // Each pass performs up to two steps along K:
  //   step 1 (unconditional): A in f32-f35, B in f48-f55
  //   step 2 (under p3):      A in f40-f43, B in f56-f63
  // Accumulators, indexed by A element (row) and B element (column):
  //   f64-f67 (B1)  f72-f75 (B2)  f80-f83 (B3)   f88-f91 (B4)
  //   f96-f99 (B5)  f104-f107 (B6) f112-f115 (B7) f120-f123 (B8)
  // Predicates:
  //   p3  - step 2 enabled; cleared on the final pass when p12 is set.
  //   p4  - L != 0: more passes follow, so reload step-1 operands.
  //   p5  - L == 0: final pass; derive C9..C16 and start loading C values
  //         (f6-f31) that the code after the loop consumes.
  // L is decremented once per pass, alongside the ar.lc countdown performed
  // by br.cloop.
  1940. .align 32
  1941. .L022:
  1942. { .mfi
  1943. lfetch.nt1 [PREA], 16 * SIZE
  1944. FMA f64 = f32, f48, f64 // A1 * B1
  1945. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  1946. }
  1947. { .mfi
  1948. nop __LINE__
  1949. FMA f72 = f32, f49, f72 // A1 * B2
  1950. (p12) cmp.ne p3, p0 = 0, L
  1951. }
  1952. ;;
  1953. { .mfi
  1954. lfetch.nt1 [PREB], 16 * SIZE
  1955. FMA f80 = f32, f50, f80 // A1 * B3
  1956. cmp.ne p4, p5 = 0, L
  1957. }
  1958. { .mfb
  1959. nop __LINE__
  1960. FMA f88 = f32, f51, f88 // A1 * B4
  1961. nop __LINE__
  1962. }
  1963. ;;
  // Step-2 operands are fetched under p3 while step 1 is still computing.
  // On the final pass (p5) the second-half C pointers are derived:
  // C9..C16 = C1..C8 + 4 * SIZE.
  1964. { .mfi
  1965. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  1966. FMA f96 = f32, f52, f96 // A1 * B5
  1967. (p5) adds C9 = 4 * SIZE, C1
  1968. }
  1969. { .mfi
  1970. nop __LINE__
  1971. FMA f104 = f32, f53, f104 // A1 * B6
  1972. (p5) adds C10 = 4 * SIZE, C2
  1973. }
  1974. ;;
  1975. { .mfi
  1976. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  1977. FMA f112 = f32, f54, f112 // A1 * B7
  1978. (p5) adds C11 = 4 * SIZE, C3
  1979. }
  1980. { .mfi
  1981. nop __LINE__
  1982. FMA f120 = f32, f55, f120 // A1 * B8
  1983. (p5) adds C12 = 4 * SIZE, C4
  1984. }
  1985. ;;
  1986. { .mfi
  1987. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  1988. FMA f65 = f33, f48, f65 // A2 * B1
  1989. (p5) adds C13 = 4 * SIZE, C5
  1990. }
  1991. { .mfi
  1992. nop __LINE__
  1993. FMA f73 = f33, f49, f73 // A2 * B2
  1994. (p5) adds C14 = 4 * SIZE, C6
  1995. }
  1996. ;;
  1997. { .mfi
  1998. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  1999. FMA f81 = f33, f50, f81 // A2 * B3
  2000. (p5) adds C15 = 4 * SIZE, C7
  2001. }
  2002. { .mfi
  2003. nop __LINE__
  2004. FMA f89 = f33, f51, f89 // A2 * B4
  2005. (p5) adds C16 = 4 * SIZE, C8
  2006. }
  2007. ;;
  2008. { .mfb
  2009. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2010. FMA f97 = f33, f52, f97 // A2 * B5
  2011. nop __LINE__
  2012. }
  2013. { .mfb
  2014. nop __LINE__
  2015. FMA f105 = f33, f53, f105 // A2 * B6
  2016. nop __LINE__
  2017. }
  2018. ;;
  2019. { .mfb
  2020. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  2021. FMA f113 = f33, f54, f113 // A2 * B7
  2022. nop __LINE__
  2023. }
  2024. { .mfb
  2025. nop __LINE__
  2026. FMA f121 = f33, f55, f121 // A2 * B8
  2027. nop __LINE__
  2028. }
  2029. ;;
  2030. { .mfb
  2031. nop __LINE__
  2032. FMA f66 = f34, f48, f66 // A3 * B1
  2033. nop __LINE__
  2034. }
  2035. { .mfb
  2036. nop __LINE__
  2037. FMA f74 = f34, f49, f74 // A3 * B2
  2038. nop __LINE__
  2039. }
  2040. ;;
  2041. { .mfb
  2042. nop __LINE__
  2043. FMA f82 = f34, f50, f82 // A3 * B3
  2044. nop __LINE__
  2045. }
  2046. { .mfb
  2047. nop __LINE__
  2048. FMA f90 = f34, f51, f90 // A3 * B4
  2049. nop __LINE__
  2050. }
  2051. ;;
  2052. { .mfb
  2053. nop __LINE__
  2054. FMA f98 = f34, f52, f98 // A3 * B5
  2055. nop __LINE__
  2056. }
  2057. { .mfb
  2058. nop __LINE__
  2059. FMA f106 = f34, f53, f106 // A3 * B6
  2060. nop __LINE__
  2061. }
  2062. ;;
  2063. { .mfb
  2064. nop __LINE__
  2065. FMA f114 = f34, f54, f114 // A3 * B7
  2066. nop __LINE__
  2067. }
  2068. { .mfb
  2069. nop __LINE__
  2070. FMA f122 = f34, f55, f122 // A3 * B8
  2071. nop __LINE__
  2072. }
  2073. ;;
  2074. { .mfb
  2075. nop __LINE__
  2076. FMA f67 = f35, f48, f67 // A4 * B1
  2077. nop __LINE__
  2078. }
  2079. { .mfb
  2080. nop __LINE__
  2081. FMA f75 = f35, f49, f75 // A4 * B2
  2082. nop __LINE__
  2083. }
  2084. ;;
  2085. { .mfb
  2086. nop __LINE__
  2087. FMA f83 = f35, f50, f83 // A4 * B3
  2088. nop __LINE__
  2089. }
  2090. { .mfb
  2091. nop __LINE__
  2092. FMA f91 = f35, f51, f91 // A4 * B4
  2093. nop __LINE__
  2094. }
  2095. ;;
  // From here on, p4 reloads the step-1 operands (f32-f35, f48-f55) for the
  // next pass as soon as their last step-1 readers have been issued.
  2096. { .mfb
  2097. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2098. FMA f99 = f35, f52, f99 // A4 * B5
  2099. nop __LINE__
  2100. }
  2101. { .mfb
  2102. nop __LINE__
  2103. FMA f107 = f35, f53, f107 // A4 * B6
  2104. nop __LINE__
  2105. }
  2106. ;;
  2107. { .mfb
  2108. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2109. FMA f115 = f35, f54, f115 // A4 * B7
  2110. nop __LINE__
  2111. }
  2112. { .mfb
  2113. nop __LINE__
  2114. FMA f123 = f35, f55, f123 // A4 * B8
  2115. nop __LINE__
  2116. }
  2117. ;;
  // Step 2 (under p3): same tile update using f40-f43 and f56-f63.  The
  // trailing "A1 * B1" style tags name the tile position, not the register.
  2118. { .mfb
  2119. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2120. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2121. nop __LINE__
  2122. }
  2123. { .mfb
  2124. nop __LINE__
  2125. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  2126. nop __LINE__
  2127. }
  2128. ;;
  2129. { .mfb
  2130. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2131. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2132. nop __LINE__
  2133. }
  2134. { .mfb
  2135. nop __LINE__
  2136. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  2137. nop __LINE__
  2138. }
  2139. ;;
  // Final pass only (p5): begin loading C through C1/C9.  Each pointer is
  // post-incremented by SIZE three times and then by -3 * SIZE, so it ends
  // up back at its starting address for the stores after the loop.
  2140. { .mfb
  2141. (p5) LDFD f6 = [C1 ], SIZE
  2142. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2143. nop __LINE__
  2144. }
  2145. { .mfb
  2146. (p5) LDFD f7 = [C9 ], SIZE
  2147. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  2148. nop __LINE__
  2149. }
  2150. ;;
  2151. { .mfb
  2152. (p5) LDFD f10 = [C1 ], SIZE
  2153. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2154. nop __LINE__
  2155. }
  2156. { .mfb
  2157. (p5) LDFD f11 = [C9 ], SIZE
  2158. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  2159. nop __LINE__
  2160. }
  2161. ;;
  // NOTE(review): the next instruction group spans four bundles with no
  // stop between them, and two of those bundles list only two instructions
  // under a three-slot .mfb template.  Presumably the assembler pads the
  // unused M slot with a nop - confirm against the assembler's behaviour
  // before reformatting this region.
  2162. { .mfb
  2163. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2164. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2165. nop __LINE__
  2166. }
  2167. { .mfb
  2168. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  2169. nop __LINE__
  2170. }
  2171. { .mfb
  2172. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2173. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2174. nop __LINE__
  2175. }
  2176. { .mfb
  2177. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  2178. nop __LINE__
  2179. }
  2180. ;;
  2181. { .mfb
  2182. (p5) LDFD f12 = [C1 ], SIZE
  2183. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2184. nop __LINE__
  2185. }
  2186. { .mfb
  2187. (p5) LDFD f13 = [C9 ], SIZE
  2188. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  2189. nop __LINE__
  2190. }
  2191. ;;
  2192. { .mfb
  2193. (p5) LDFD f14 = [C1 ], - 3 * SIZE
  2194. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2195. nop __LINE__
  2196. }
  2197. { .mfb
  2198. (p5) LDFD f15 = [C9 ], - 3 * SIZE
  2199. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  2200. nop __LINE__
  2201. }
  2202. ;;
  // Final pass only (p5): C values reached through C2/C10 -> f16-f23.
  2203. { .mfb
  2204. (p5) LDFD f16 = [C2 ], SIZE
  2205. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  2206. nop __LINE__
  2207. }
  2208. { .mfb
  2209. (p5) LDFD f17 = [C10], SIZE
  2210. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  2211. nop __LINE__
  2212. }
  2213. ;;
  2214. { .mfb
  2215. (p5) LDFD f18 = [C2 ], SIZE
  2216. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  2217. nop __LINE__
  2218. }
  2219. { .mfb
  2220. (p5) LDFD f19 = [C10], SIZE
  2221. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  2222. nop __LINE__
  2223. }
  2224. ;;
  2225. { .mfb
  2226. (p5) LDFD f20 = [C2 ], SIZE
  2227. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  2228. nop __LINE__
  2229. }
  2230. { .mfb
  2231. (p5) LDFD f21 = [C10], SIZE
  2232. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  2233. nop __LINE__
  2234. }
  2235. ;;
  2236. { .mfb
  2237. (p5) LDFD f22 = [C2 ], - 3 * SIZE
  2238. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  2239. nop __LINE__
  2240. }
  2241. { .mfb
  2242. (p5) LDFD f23 = [C10], - 3 * SIZE
  2243. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  2244. nop __LINE__
  2245. }
  2246. ;;
  // Final pass only (p5): C values reached through C3/C11 -> f24-f31.
  2247. { .mfb
  2248. (p5) LDFD f24 = [C3 ], SIZE
  2249. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  2250. nop __LINE__
  2251. }
  2252. { .mfb
  2253. (p5) LDFD f25 = [C11], SIZE
  2254. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  2255. nop __LINE__
  2256. }
  2257. ;;
  2258. { .mfb
  2259. (p5) LDFD f26 = [C3 ], SIZE
  2260. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  2261. nop __LINE__
  2262. }
  2263. { .mfb
  2264. (p5) LDFD f27 = [C11], SIZE
  2265. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  2266. nop __LINE__
  2267. }
  2268. ;;
  2269. { .mfb
  2270. (p5) LDFD f28 = [C3 ], SIZE
  2271. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  2272. nop __LINE__
  2273. }
  2274. { .mfb
  2275. (p5) LDFD f29 = [C11], SIZE
  2276. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  2277. nop __LINE__
  2278. }
  2279. ;;
  // L counts down with ar.lc so that the p4/p5 test at the top of the next
  // pass recognises the final pass.
  2280. { .mfi
  2281. (p5) LDFD f30 = [C3 ], - 3 * SIZE
  2282. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  2283. adds L = -1, L
  2284. }
  2285. { .mfb
  2286. (p5) LDFD f31 = [C11], - 3 * SIZE
  2287. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  2288. br.cloop.sptk.few .L022
  2289. }
  2290. ;;
  // .L028: write-back for the tile accumulated by the loop at .L022.
  // C values reached through C1-C3 / C9-C11 are already in f6-f31 (loaded
  // on the loop's final pass).  The values reached through C4-C8 / C12-C16
  // are loaded here, interleaved with the scaling FMAs and the stores.
  // Every accumulator is consumed twice: once scaled by ALPHA_R into one C
  // slot and once scaled by ALPHA_I into the following slot (presumably the
  // real and imaginary halves of one complex entry - confirm against the
  // caller).
  // Pointer bookkeeping per C pointer:
  //   loads  : +SIZE, +SIZE, +SIZE, -3 * SIZE  (net 0)
  //   stores : +SIZE, +SIZE, +SIZE, +5 * SIZE  (net 8 * SIZE)
  // On exit f64, f72, f80, f88, f96, f104, f112 and f120 are reset to zero.
  2291. .L028:
  // Load C via C4/C12 while scaling the B1 accumulators into f6-f15.
  2292. { .mmf
  2293. LDFD f68 = [C4 ], SIZE
  2294. LDFD f69 = [C12], SIZE
  2295. FMA f6 = ALPHA_R, f64, f6
  2296. }
  2297. { .mmf
  2298. nop __LINE__
  2299. nop __LINE__
  2300. FMA f7 = ALPHA_R, f66, f7
  2301. }
  2302. ;;
  2303. { .mmf
  2304. LDFD f70 = [C4 ], SIZE
  2305. LDFD f71 = [C12], SIZE
  2306. FMA f10 = ALPHA_I, f64, f10
  2307. }
  2308. { .mmf
  2309. nop __LINE__
  2310. nop __LINE__
  2311. FMA f11 = ALPHA_I, f66, f11
  2312. }
  2313. ;;
  2314. { .mmf
  2315. LDFD f76 = [C4 ], SIZE
  2316. LDFD f77 = [C12], SIZE
  2317. FMA f12 = ALPHA_R, f65, f12
  2318. }
  2319. { .mmf
  2320. nop __LINE__
  2321. nop __LINE__
  2322. FMA f13 = ALPHA_R, f67, f13
  2323. }
  2324. ;;
  2325. { .mmf
  2326. LDFD f78 = [C4 ], -3 * SIZE
  2327. LDFD f79 = [C12], -3 * SIZE
  2328. FMA f14 = ALPHA_I, f65, f14
  2329. }
  2330. { .mmf
  2331. nop __LINE__
  2332. nop __LINE__
  2333. FMA f15 = ALPHA_I, f67, f15
  2334. }
  2335. ;;
  // Store via C1/C9, load via C5/C13, scale the B2 accumulators.
  2336. { .mmf
  2337. STFD [C1 ] = f6, SIZE
  2338. STFD [C9 ] = f7, SIZE
  2339. FMA f16 = ALPHA_R, f72, f16
  2340. }
  2341. { .mmf
  2342. LDFD f84 = [C5 ], SIZE
  2343. LDFD f85 = [C13], SIZE
  2344. FMA f17 = ALPHA_R, f74, f17
  2345. }
  2346. ;;
  2347. { .mmf
  2348. STFD [C1 ] = f10, SIZE
  2349. STFD [C9 ] = f11, SIZE
  2350. FMA f18 = ALPHA_I, f72, f18
  2351. }
  2352. { .mmf
  2353. LDFD f86 = [C5 ], SIZE
  2354. LDFD f87 = [C13], SIZE
  2355. FMA f19 = ALPHA_I, f74, f19
  2356. }
  2357. ;;
  2358. { .mmf
  2359. STFD [C1 ] = f12, SIZE
  2360. STFD [C9 ] = f13, SIZE
  2361. FMA f20 = ALPHA_R, f73, f20
  2362. }
  2363. { .mmf
  2364. LDFD f92 = [C5 ], SIZE
  2365. LDFD f93 = [C13], SIZE
  2366. FMA f21 = ALPHA_R, f75, f21
  2367. }
  2368. ;;
  2369. { .mmf
  2370. STFD [C1 ] = f14, 5 * SIZE
  2371. STFD [C9 ] = f15, 5 * SIZE
  2372. FMA f22 = ALPHA_I, f73, f22
  2373. }
  2374. { .mmf
  2375. LDFD f94 = [C5 ], -3 * SIZE
  2376. LDFD f95 = [C13], -3 * SIZE
  2377. FMA f23 = ALPHA_I, f75, f23
  2378. }
  2379. ;;
  // Store via C2/C10, load via C6/C14, scale the B3 accumulators.
  2380. { .mmf
  2381. STFD [C2 ] = f16, SIZE
  2382. STFD [C10] = f17, SIZE
  2383. FMA f24 = ALPHA_R, f80, f24
  2384. }
  2385. { .mmf
  2386. LDFD f100 = [C6 ], SIZE
  2387. LDFD f101 = [C14], SIZE
  2388. FMA f25 = ALPHA_R, f82, f25
  2389. }
  2390. ;;
  2391. { .mmf
  2392. STFD [C2 ] = f18, SIZE
  2393. STFD [C10] = f19, SIZE
  2394. FMA f26 = ALPHA_I, f80, f26
  2395. }
  2396. { .mmf
  2397. LDFD f102 = [C6 ], SIZE
  2398. LDFD f103 = [C14], SIZE
  2399. FMA f27 = ALPHA_I, f82, f27
  2400. }
  2401. ;;
  2402. { .mmf
  2403. STFD [C2 ] = f20, SIZE
  2404. STFD [C10] = f21, SIZE
  2405. FMA f28 = ALPHA_R, f81, f28
  2406. }
  2407. { .mmf
  2408. LDFD f108 = [C6 ], SIZE
  2409. LDFD f109 = [C14], SIZE
  2410. FMA f29 = ALPHA_R, f83, f29
  2411. }
  2412. ;;
  2413. { .mmf
  2414. STFD [C2 ] = f22, 5 * SIZE
  2415. STFD [C10] = f23, 5 * SIZE
  2416. FMA f30 = ALPHA_I, f81, f30
  2417. }
  2418. { .mmf
  2419. LDFD f110 = [C6 ], -3 * SIZE
  2420. LDFD f111 = [C14], -3 * SIZE
  2421. FMA f31 = ALPHA_I, f83, f31
  2422. }
  2423. ;;
  // Store via C3/C11, load via C7/C15, scale the B4 accumulators.
  2424. { .mmf
  2425. STFD [C3 ] = f24, SIZE
  2426. STFD [C11] = f25, SIZE
  2427. FMA f68 = ALPHA_R, f88, f68
  2428. }
  2429. { .mmf
  2430. LDFD f116 = [C7 ], SIZE
  2431. LDFD f117 = [C15], SIZE
  2432. FMA f69 = ALPHA_R, f90, f69
  2433. }
  2434. ;;
  2435. { .mmf
  2436. STFD [C3 ] = f26, SIZE
  2437. STFD [C11] = f27, SIZE
  2438. FMA f70 = ALPHA_I, f88, f70
  2439. }
  2440. { .mmf
  2441. LDFD f118 = [C7 ], SIZE
  2442. LDFD f119 = [C15], SIZE
  2443. FMA f71 = ALPHA_I, f90, f71
  2444. }
  2445. ;;
  2446. { .mmf
  2447. STFD [C3 ] = f28, SIZE
  2448. STFD [C11] = f29, SIZE
  2449. FMA f76 = ALPHA_R, f89, f76
  2450. }
  2451. { .mmf
  2452. LDFD f124 = [C7 ], SIZE
  2453. LDFD f125 = [C15], SIZE
  2454. FMA f77 = ALPHA_R, f91, f77
  2455. }
  2456. ;;
  2457. { .mmf
  2458. STFD [C3 ] = f30, 5 * SIZE
  2459. STFD [C11] = f31, 5 * SIZE
  2460. FMA f78 = ALPHA_I, f89, f78
  2461. }
  2462. { .mmf
  2463. LDFD f126 = [C7 ], -3 * SIZE
  2464. LDFD f127 = [C15], -3 * SIZE
  2465. FMA f79 = ALPHA_I, f91, f79
  2466. }
  2467. ;;
  // Store via C4/C12, load via C8/C16 into f32-f39, scale the B5
  // accumulators.  f32-f35 held A operands in the loop; they are free now.
  2468. { .mmf
  2469. STFD [C4 ] = f68, SIZE
  2470. STFD [C12] = f69, SIZE
  2471. FMA f84 = ALPHA_R, f96, f84
  2472. }
  2473. { .mmf
  2474. LDFD f32 = [C8 ], SIZE
  2475. LDFD f33 = [C16], SIZE
  2476. FMA f85 = ALPHA_R, f98, f85
  2477. }
  2478. ;;
  2479. { .mmf
  2480. STFD [C4 ] = f70, SIZE
  2481. STFD [C12] = f71, SIZE
  2482. FMA f86 = ALPHA_I, f96, f86
  2483. }
  2484. { .mmf
  2485. LDFD f34 = [C8 ], SIZE
  2486. LDFD f35 = [C16], SIZE
  2487. FMA f87 = ALPHA_I, f98, f87
  2488. }
  2489. ;;
  2490. { .mmf
  2491. STFD [C4 ] = f76, SIZE
  2492. STFD [C12] = f77, SIZE
  2493. FMA f92 = ALPHA_R, f97, f92
  2494. }
  2495. { .mmf
  2496. LDFD f36 = [C8 ], SIZE
  2497. LDFD f37 = [C16], SIZE
  2498. FMA f93 = ALPHA_R, f99, f93
  2499. }
  2500. ;;
  2501. { .mmf
  2502. STFD [C4 ] = f78, 5 * SIZE
  2503. STFD [C12] = f79, 5 * SIZE
  2504. FMA f94 = ALPHA_I, f97, f94
  2505. }
  2506. { .mmf
  2507. LDFD f38 = [C8 ], -3 * SIZE
  2508. LDFD f39 = [C16], -3 * SIZE
  2509. FMA f95 = ALPHA_I, f99, f95
  2510. }
  2511. ;;
  // All C loads are issued; the remaining groups only store and scale.
  // Store via C5/C13, scale the B6 accumulators.
  2512. { .mmf
  2513. STFD [C5 ] = f84, SIZE
  2514. STFD [C13] = f85, SIZE
  2515. FMA f100 = ALPHA_R, f104, f100
  2516. }
  2517. { .mmf
  2518. nop __LINE__
  2519. nop __LINE__
  2520. FMA f101 = ALPHA_R, f106, f101
  2521. }
  2522. ;;
  2523. { .mmf
  2524. STFD [C5 ] = f86, SIZE
  2525. STFD [C13] = f87, SIZE
  2526. FMA f102 = ALPHA_I, f104, f102
  2527. }
  2528. { .mmf
  2529. nop __LINE__
  2530. nop __LINE__
  2531. FMA f103 = ALPHA_I, f106, f103
  2532. }
  2533. ;;
  2534. { .mmf
  2535. STFD [C5 ] = f92, SIZE
  2536. STFD [C13] = f93, SIZE
  2537. FMA f108 = ALPHA_R, f105, f108
  2538. }
  2539. { .mmf
  2540. nop __LINE__
  2541. nop __LINE__
  2542. FMA f109 = ALPHA_R, f107, f109
  2543. }
  2544. ;;
  2545. { .mmf
  2546. STFD [C5 ] = f94, 5 * SIZE
  2547. STFD [C13] = f95, 5 * SIZE
  2548. FMA f110 = ALPHA_I, f105, f110
  2549. }
  2550. { .mmf
  2551. nop __LINE__
  2552. nop __LINE__
  2553. FMA f111 = ALPHA_I, f107, f111
  2554. }
  2555. ;;
  // Store via C6/C14, scale the B7 accumulators.
  2556. { .mmf
  2557. STFD [C6 ] = f100, SIZE
  2558. STFD [C14] = f101, SIZE
  2559. FMA f116 = ALPHA_R, f112, f116
  2560. }
  2561. { .mmf
  2562. nop __LINE__
  2563. nop __LINE__
  2564. FMA f117 = ALPHA_R, f114, f117
  2565. }
  2566. ;;
  2567. { .mmf
  2568. STFD [C6 ] = f102, SIZE
  2569. STFD [C14] = f103, SIZE
  2570. FMA f118 = ALPHA_I, f112, f118
  2571. }
  2572. { .mmf
  2573. nop __LINE__
  2574. nop __LINE__
  2575. FMA f119 = ALPHA_I, f114, f119
  2576. }
  2577. ;;
  2578. { .mmf
  2579. STFD [C6 ] = f108, SIZE
  2580. STFD [C14] = f109, SIZE
  2581. FMA f124 = ALPHA_R, f113, f124
  2582. }
  2583. { .mmf
  2584. nop __LINE__
  2585. nop __LINE__
  2586. FMA f125 = ALPHA_R, f115, f125
  2587. }
  2588. ;;
  2589. { .mmf
  2590. STFD [C6 ] = f110, 5 * SIZE
  2591. STFD [C14] = f111, 5 * SIZE
  2592. FMA f126 = ALPHA_I, f113, f126
  2593. }
  2594. { .mmf
  2595. nop __LINE__
  2596. nop __LINE__
  2597. FMA f127 = ALPHA_I, f115, f127
  2598. }
  2599. ;;
  // Store via C7/C15, scale the B8 accumulators.
  2600. { .mmf
  2601. STFD [C7 ] = f116, SIZE
  2602. STFD [C15] = f117, SIZE
  2603. FMA f32 = ALPHA_R, f120, f32
  2604. }
  2605. { .mmf
  2606. nop __LINE__
  2607. nop __LINE__
  2608. FMA f33 = ALPHA_R, f122, f33
  2609. }
  2610. ;;
  2611. { .mmf
  2612. STFD [C7 ] = f118, SIZE
  2613. STFD [C15] = f119, SIZE
  2614. FMA f34 = ALPHA_I, f120, f34
  2615. }
  2616. { .mmf
  2617. nop __LINE__
  2618. nop __LINE__
  2619. FMA f35 = ALPHA_I, f122, f35
  2620. }
  2621. ;;
  2622. { .mmf
  2623. STFD [C7 ] = f124, SIZE
  2624. STFD [C15] = f125, SIZE
  2625. FMA f36 = ALPHA_R, f121, f36
  2626. }
  2627. { .mmf
  2628. nop __LINE__
  2629. nop __LINE__
  2630. FMA f37 = ALPHA_R, f123, f37
  2631. }
  2632. ;;
  2633. { .mmf
  2634. STFD [C7 ] = f126, 5 * SIZE
  2635. STFD [C15] = f127, 5 * SIZE
  2636. FMA f38 = ALPHA_I, f121, f38
  2637. }
  2638. { .mmf
  2639. nop __LINE__
  2640. nop __LINE__
  2641. FMA f39 = ALPHA_I, f123, f39
  2642. }
  2643. ;;
  // Store via C8/C16; the free F slots reset accumulators f64, f72, ...,
  // f120 to zero for whichever tile is processed next.
  2644. { .mmf
  2645. STFD [C8 ] = f32, SIZE
  2646. STFD [C16] = f33, SIZE
  2647. mov f64 = f0
  2648. }
  2649. { .mmf
  2650. nop __LINE__
  2651. nop __LINE__
  2652. mov f72 = f0
  2653. }
  2654. ;;
  2655. { .mmf
  2656. STFD [C8 ] = f34, SIZE
  2657. STFD [C16] = f35, SIZE
  2658. mov f80 = f0
  2659. }
  2660. { .mmf
  2661. nop __LINE__
  2662. nop __LINE__
  2663. mov f88 = f0
  2664. }
  2665. ;;
  2666. { .mmf
  2667. STFD [C8 ] = f36, SIZE
  2668. STFD [C16] = f37, SIZE
  2669. mov f96 = f0
  2670. }
  2671. { .mmf
  2672. nop __LINE__
  2673. nop __LINE__
  2674. mov f104 = f0
  2675. }
  2676. ;;
  2677. { .mmf
  2678. STFD [C8 ] = f38, 5 * SIZE
  2679. STFD [C16] = f39, 5 * SIZE
  2680. mov f112 = f0
  2681. }
  2682. { .mmf
  2683. nop __LINE__
  2684. nop __LINE__
  2685. mov f120 = f0
  2686. }
  2687. ;;
  // .L030: set-up for the path taken when bit 1 of M is set.
  // If bit 1 of M is clear (p6), control goes to .L040 (outside this view).
  // Otherwise this block:
  //   - loads the first B values (f48-f55) and two A values (f32, f33),
  //   - clears accumulators f65, f73, f81, f89, f97, f105, f113, f121,
  //   - programs ar.lc with ((K + 1) >> 1) - 1 for the loop that follows,
  //   - sets p12 when K is odd and forces p3 true.
  // f64, f72, ..., f120 are not cleared here; both visible paths into this
  // label clear them at the end of the preceding write-back code.
  2688. .align 32
  2689. .L030:
  2690. { .mib
  2691. nop __LINE__
  2692. tbit.z p6, p7 = M, 1
  2693. (p6) br.cond.dptk .L040
  2694. }
  2695. ;;
  2696. { .mfi
  2697. LDFPD f48, f49 = [B]
  2698. mov f65 = f0
  2699. nop __LINE__
  2700. }
  2701. { .mfi
  2702. adds BOFFSET = 2 * SIZE, B
  2703. mov f73 = f0
  2704. adds L = 1, K
  2705. }
  2706. ;;
  // p12: bit 0 of (K + 1) is zero, i.e. K is odd.  The A load is predicated
  // on p7, which the tbit.z above set to the complement of p6.
  2707. { .mfi
  2708. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2709. mov f81 = f0
  2710. tbit.z p12, p0 = L, 0
  2711. }
  2712. { .mfi
  2713. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2714. mov f89 = f0
  2715. shr L = L, 1
  2716. }
  2717. ;;
  // br.cloop executes ar.lc + 1 passes, hence the extra decrement of L.
  2718. { .mfi
  2719. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2720. mov f97 = f0
  2721. adds L = -1, L
  2722. }
  2723. { .mfi
  2724. nop __LINE__
  2725. mov f105 = f0
  2726. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  2727. }
  2728. ;;
  // p3 starts out true (r0 == r0 always holds).
  2729. { .mfi
  2730. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  2731. mov f113 = f0
  2732. mov ar.lc = L
  2733. }
  2734. { .mfi
  2735. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2736. mov f121 = f0
  2737. cmp.eq p3, p0 = r0, r0
  2738. }
  2739. ;;
  2740. .align 32
  // .L032: K loop of the 2 x 8 tile (two A values times eight B values).
  // One pass covers two K steps:
  //   step 1 uses A in f32/f33 and B in f48-f55 (always executed),
  //   step 2 uses A in f40/f41 and B in f56-f63 (predicated on p3).
  // Accumulators: f64, f72, ..., f120 for the first A value and
  // f65, f73, ..., f121 for the second.
  // Predicates set at the top of each pass:
  //   p3 - when p12 is set it is recomputed as (L != 0), which turns step 2
  //        off on the final pass; otherwise it keeps its previous value,
  //   p4 - (L != 0): reload the step-1 operands for the next pass,
  //   p5 - (L == 0): final pass, start loading the current C1/C2 values
  //        (f6, f7, f10, f11 and f12-f15) needed by the write-back.
  // L is decremented by hand at the bottom; br.cloop counts ar.lc down.
  2741. .L032:
  2742. { .mfb
  2743. lfetch.nt1 [PREA], 4 * SIZE
  2744. FMA f64 = f32, f48, f64 // A1 * B1
  2745. nop __LINE__
  2746. }
  2747. { .mfi
  2748. nop __LINE__
  2749. FMA f72 = f32, f49, f72 // A1 * B2
  2750. (p12) cmp.ne p3, p0 = 0, L
  2751. }
  2752. ;;
  2753. { .mfi
  2754. lfetch.nt1 [PREB], 16 * SIZE
  2755. FMA f80 = f32, f50, f80 // A1 * B3
  2756. cmp.ne p4, p5 = 0, L
  2757. }
  2758. { .mfb
  2759. nop __LINE__
  2760. FMA f88 = f32, f51, f88 // A1 * B4
  2761. nop __LINE__
  2762. }
  2763. ;;
  2764. { .mfb
  2765. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  2766. FMA f96 = f32, f52, f96 // A1 * B5
  2767. nop __LINE__
  2768. }
  2769. { .mfb
  2770. nop __LINE__
  2771. FMA f104 = f32, f53, f104 // A1 * B6
  2772. nop __LINE__
  2773. }
  2774. ;;
  2775. { .mfb
  2776. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  2777. FMA f112 = f32, f54, f112 // A1 * B7
  2778. nop __LINE__
  2779. }
  2780. { .mfb
  2781. nop __LINE__
  2782. FMA f120 = f32, f55, f120 // A1 * B8
  2783. nop __LINE__
  2784. }
  2785. ;;
  2786. { .mfb
  2787. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2788. FMA f65 = f33, f48, f65 // A2 * B1
  2789. nop __LINE__
  2790. }
  2791. { .mfb
  2792. nop __LINE__
  2793. FMA f73 = f33, f49, f73 // A2 * B2
  2794. nop __LINE__
  2795. }
  2796. ;;
  2797. { .mfb
  2798. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2799. FMA f81 = f33, f50, f81 // A2 * B3
  2800. nop __LINE__
  2801. }
  2802. { .mfb
  2803. nop __LINE__
  2804. FMA f89 = f33, f51, f89 // A2 * B4
  2805. nop __LINE__
  2806. }
  2807. ;;
  2808. { .mfb
  2809. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2810. FMA f97 = f33, f52, f97 // A2 * B5
  2811. nop __LINE__
  2812. }
  2813. { .mfb
  2814. nop __LINE__
  2815. FMA f105 = f33, f53, f105 // A2 * B6
  2816. nop __LINE__
  2817. }
  2818. ;;
  2819. { .mfb
  2820. nop __LINE__
  2821. FMA f113 = f33, f54, f113 // A2 * B7
  2822. nop __LINE__
  2823. }
  2824. { .mfb
  2825. nop __LINE__
  2826. FMA f121 = f33, f55, f121 // A2 * B8
  2827. nop __LINE__
  2828. }
  2829. ;;
  // Second K step of the pass (all FMAs predicated on p3), overlapped with
  // the p4 reloads of f32/f33 and f48-f55 for the next pass.
  2830. { .mfb
  2831. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2832. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2833. nop __LINE__
  2834. }
  2835. { .mfb
  2836. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2837. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  2838. nop __LINE__
  2839. }
  2840. ;;
  2841. { .mfb
  2842. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2843. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2844. nop __LINE__
  2845. }
  2846. { .mfb
  2847. nop __LINE__
  2848. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  2849. nop __LINE__
  2850. }
  2851. ;;
  2852. { .mfb
  2853. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2854. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2855. nop __LINE__
  2856. }
  2857. { .mfb
  2858. nop __LINE__
  2859. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  2860. nop __LINE__
  2861. }
  2862. ;;
  2863. { .mfb
  2864. (p5) LDFD f6 = [C1], SIZE
  2865. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2866. nop __LINE__
  2867. }
  2868. { .mfb
  2869. (p5) LDFD f12 = [C2], SIZE
  2870. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  2871. nop __LINE__
  2872. }
  2873. ;;
  2874. { .mfb
  2875. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2876. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2877. nop __LINE__
  2878. }
  // NOTE(review): the next bundle lists only two instructions under an .mfb
  // template (no explicit M-slot nop) and, unlike the neighbouring pairs, is
  // not followed by a ;; stop, so four bundles share one instruction group.
  // No register written in that group is also read or written again in it as
  // far as the visible code shows, but confirm the assembler pads the M slot
  // as intended.
  2879. { .mfb
  2880. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  2881. nop __LINE__
  2882. }
  2883. { .mfb
  2884. (p5) LDFD f7 = [C1], SIZE
  2885. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2886. nop __LINE__
  2887. }
  2888. { .mfb
  2889. (p5) LDFD f13 = [C2], SIZE
  2890. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  2891. nop __LINE__
  2892. }
  2893. ;;
  2894. { .mfb
  2895. (p5) LDFD f10 = [C1], SIZE
  2896. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2897. nop __LINE__
  2898. }
  2899. { .mfb
  2900. (p5) LDFD f14 = [C2], SIZE
  2901. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  2902. nop __LINE__
  2903. }
  2904. ;;
  2905. { .mfi
  2906. (p5) LDFD f11 = [C1], -3 * SIZE  // rewind C1 to where the p5 loads began
  2907. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2908. adds L = -1, L
  2909. }
  2910. { .mfb
  2911. (p5) LDFD f15 = [C2], -3 * SIZE  // rewind C2 likewise
  2912. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  2913. br.cloop.sptk.few .L032
  2914. }
  2915. ;;
  2916. .L038:
  // .L038: write-back of the 2 x 8 tile.
  // For each accumulator acc two results are produced from two consecutive
  // SIZE-strided values c0, c1 read through the matching C pointer:
  //   c0 + ALPHA_R * acc   and   c1 + ALPHA_I * acc
  // and both are stored back through the same pointer.
  // The C1/C2 inputs (f6, f7, f10, f11, f12-f15) were loaded on the last
  // pass of .L032; the C3-C8 inputs are loaded here, interleaved with the
  // FMAs and the stores.  f32-f39 are reused as temporaries for the C7/C8
  // values.  Each load sequence ends with a -3 * SIZE post-increment so the
  // pointer is back at its starting position before the stores advance it by
  // 4 * SIZE in total.
  // The accumulators f64, f72, ..., f120 are cleared in the final bundles.
  2917. { .mmf
  2918. LDFD f16 = [C3], SIZE
  2919. LDFD f20 = [C4], SIZE
  2920. FMA f6 = ALPHA_R, f64, f6
  2921. }
  2922. { .mmf
  2923. nop __LINE__
  2924. nop __LINE__
  2925. FMA f12 = ALPHA_R, f72, f12
  2926. }
  2927. ;;
  2928. { .mmf
  2929. LDFD f17 = [C3], SIZE
  2930. LDFD f21 = [C4], SIZE
  2931. FMA f7 = ALPHA_I, f64, f7
  2932. }
  2933. { .mmf
  2934. nop __LINE__
  2935. nop __LINE__
  2936. FMA f13 = ALPHA_I, f72, f13
  2937. }
  2938. ;;
  2939. { .mmf
  2940. LDFD f18 = [C3], SIZE
  2941. LDFD f22 = [C4], SIZE
  2942. FMA f10 = ALPHA_R, f65, f10
  2943. }
  2944. { .mmf
  2945. nop __LINE__
  2946. nop __LINE__
  2947. FMA f14 = ALPHA_R, f73, f14
  2948. }
  2949. ;;
  2950. { .mmf
  2951. LDFD f19 = [C3], - 3 * SIZE
  2952. LDFD f23 = [C4], - 3 * SIZE
  2953. FMA f11 = ALPHA_I, f65, f11
  2954. }
  2955. { .mmf
  2956. nop __LINE__
  2957. nop __LINE__
  2958. FMA f15 = ALPHA_I, f73, f15
  2959. }
  2960. ;;
  2961. { .mmf
  2962. STFD [C1] = f6, SIZE
  2963. STFD [C2] = f12, SIZE
  2964. FMA f16 = ALPHA_R, f80, f16
  2965. }
  2966. { .mmf
  2967. LDFD f24 = [C5], SIZE
  2968. LDFD f28 = [C6], SIZE
  2969. FMA f20 = ALPHA_R, f88, f20
  2970. }
  2971. ;;
  2972. { .mmf
  2973. STFD [C1] = f7, SIZE
  2974. STFD [C2] = f13, SIZE
  2975. FMA f17 = ALPHA_I, f80, f17
  2976. }
  2977. { .mmf
  2978. LDFD f25 = [C5], SIZE
  2979. LDFD f29 = [C6], SIZE
  2980. FMA f21 = ALPHA_I, f88, f21
  2981. }
  2982. ;;
  2983. { .mmf
  2984. STFD [C1] = f10, SIZE
  2985. STFD [C2] = f14, SIZE
  2986. FMA f18 = ALPHA_R, f81, f18
  2987. }
  2988. { .mmf
  2989. LDFD f26 = [C5], SIZE
  2990. LDFD f30 = [C6], SIZE
  2991. FMA f22 = ALPHA_R, f89, f22
  2992. }
  2993. ;;
  2994. { .mmf
  2995. STFD [C1] = f11, SIZE
  2996. STFD [C2] = f15, SIZE
  2997. FMA f19 = ALPHA_I, f81, f19
  2998. }
  2999. { .mmf
  3000. LDFD f27 = [C5], - 3 * SIZE
  3001. LDFD f31 = [C6], - 3 * SIZE
  3002. FMA f23 = ALPHA_I, f89, f23
  3003. }
  3004. ;;
  3005. { .mmf
  3006. STFD [C3] = f16, SIZE
  3007. STFD [C4] = f20, SIZE
  3008. FMA f24 = ALPHA_R, f96, f24
  3009. }
  3010. { .mmf
  3011. LDFD f32 = [C7], SIZE
  3012. LDFD f36 = [C8], SIZE
  3013. FMA f28 = ALPHA_R, f104, f28
  3014. }
  3015. ;;
  3016. { .mmf
  3017. STFD [C3] = f17, SIZE
  3018. STFD [C4] = f21, SIZE
  3019. FMA f25 = ALPHA_I, f96, f25
  3020. }
  3021. { .mmf
  3022. LDFD f33 = [C7], SIZE
  3023. LDFD f37 = [C8], SIZE
  3024. FMA f29 = ALPHA_I, f104, f29
  3025. }
  3026. ;;
  3027. { .mmf
  3028. STFD [C3] = f18, SIZE
  3029. STFD [C4] = f22, SIZE
  3030. FMA f26 = ALPHA_R, f97, f26
  3031. }
  3032. { .mmf
  3033. LDFD f34 = [C7], SIZE
  3034. LDFD f38 = [C8], SIZE
  3035. FMA f30 = ALPHA_R, f105, f30
  3036. }
  3037. ;;
  3038. { .mmf
  3039. STFD [C3] = f19, SIZE
  3040. STFD [C4] = f23, SIZE
  3041. FMA f27 = ALPHA_I, f97, f27
  3042. }
  3043. { .mmf
  3044. LDFD f35 = [C7], - 3 * SIZE
  3045. LDFD f39 = [C8], - 3 * SIZE
  3046. FMA f31 = ALPHA_I, f105, f31
  3047. }
  3048. ;;
  3049. { .mmf
  3050. STFD [C5] = f24, SIZE
  3051. STFD [C6] = f28, SIZE
  3052. FMA f32 = ALPHA_R, f112, f32
  3053. }
  3054. { .mmf
  3055. nop __LINE__
  3056. nop __LINE__
  3057. FMA f36 = ALPHA_R, f120, f36
  3058. }
  3059. ;;
  3060. { .mmf
  3061. STFD [C5] = f25, SIZE
  3062. STFD [C6] = f29, SIZE
  3063. FMA f33 = ALPHA_I, f112, f33
  3064. }
  3065. { .mmf
  3066. nop __LINE__
  3067. nop __LINE__
  3068. FMA f37 = ALPHA_I, f120, f37
  3069. }
  3070. ;;
  3071. { .mmf
  3072. STFD [C5] = f26, SIZE
  3073. STFD [C6] = f30, SIZE
  3074. FMA f34 = ALPHA_R, f113, f34
  3075. }
  3076. { .mmf
  3077. nop __LINE__
  3078. nop __LINE__
  3079. FMA f38 = ALPHA_R, f121, f38
  3080. }
  3081. ;;
  3082. { .mmf
  3083. STFD [C5] = f27, SIZE
  3084. STFD [C6] = f31, SIZE
  3085. FMA f35 = ALPHA_I, f113, f35
  3086. }
  3087. { .mmf
  3088. nop __LINE__
  3089. nop __LINE__
  3090. FMA f39 = ALPHA_I, f121, f39
  3091. }
  3092. ;;
  // Final C7/C8 stores, with the F slots used to clear f64, f72, ..., f120.
  3093. { .mmf
  3094. STFD [C7] = f32, SIZE
  3095. STFD [C8] = f36, SIZE
  3096. mov f64 = f0
  3097. }
  3098. { .mmf
  3099. nop __LINE__
  3100. nop __LINE__
  3101. mov f72 = f0
  3102. }
  3103. ;;
  3104. { .mmf
  3105. STFD [C7] = f33, SIZE
  3106. STFD [C8] = f37, SIZE
  3107. mov f80 = f0
  3108. }
  3109. { .mmf
  3110. nop __LINE__
  3111. nop __LINE__
  3112. mov f88 = f0
  3113. }
  3114. ;;
  3115. { .mmf
  3116. STFD [C7] = f34, SIZE
  3117. STFD [C8] = f38, SIZE
  3118. mov f96 = f0
  3119. }
  3120. { .mmf
  3121. nop __LINE__
  3122. nop __LINE__
  3123. mov f104 = f0
  3124. }
  3125. ;;
  3126. { .mmf
  3127. STFD [C7] = f35, SIZE
  3128. STFD [C8] = f39, SIZE
  3129. mov f112 = f0
  3130. }
  3131. { .mmf
  3132. nop __LINE__
  3133. nop __LINE__
  3134. mov f120 = f0
  3135. }
  3136. ;;
  3137. .align 32
  // .L040: set-up for the single-row tile selected by bit 0 of M.
  // When that bit is clear the tile is skipped (branch to .L049).
  // Otherwise it preloads B values into f48-f55 and one A value into f32,
  // sets PREA/PREB, forces p3 true and programs the loop count
  // ar.lc = ((K + 1) >> 1) - 1.  p12 = (bit 0 of K + 1 == 0).
  // No accumulator is cleared in this block; f64, f72, ..., f120 are read
  // by .L042 with whatever value they hold on entry.
  3138. .L040:
  3139. { .mib
  3140. nop __LINE__
  3141. tbit.z p6, p7 = M, 0  // p6 = (bit 0 of M == 0)
  3142. (p6) br.cond.dptk .L049
  3143. }
  3144. ;;
  3145. { .mmi
  3146. LDFPD f48, f49 = [B]
  3147. adds BOFFSET = 2 * SIZE, B
  3148. adds L = 1, K  // L = K + 1
  3149. }
  3150. ;;
  3151. { .mii
  3152. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3153. tbit.z p12, p0 = L, 0
  3154. shr L = L, 1  // L = (K + 1) >> 1
  3155. }
  3156. ;;
  3157. { .mmi
  3158. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3159. LDFD f32 = [AOFFSET], 1 * SIZE
  3160. adds L = -1, L  // L = ((K + 1) >> 1) - 1
  3161. }
  3162. ;;
  3163. { .mmi
  3164. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  3165. cmp.eq p3, p0 = r0, r0  // p3 = true
  3166. mov ar.lc = L
  3167. }
  3168. { .mmi
  3169. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3170. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  3171. nop __LINE__
  3172. }
  3173. ;;
  3174. .align 32
  // .L042: K loop and write-back of the 1 x 8 tile (one A value times eight
  // B values).
  // Loop: each pass covers two K steps.  Step 1 multiplies f32 by f48-f55;
  // step 2 (predicated on p3) multiplies f40 by f56-f63.  Results accumulate
  // in f64, f72, f80, f88, f96, f104, f112, f120.
  //   p3 - when p12 is set it is recomputed as (L != 0); otherwise unchanged,
  //   p4 - (L != 0): reload f32 and f48-f55 for the next pass,
  //   p5 - (L == 0): final pass, load the current values behind C1-C8.
  // Write-back (after br.cloop falls through): for each accumulator acc and
  // the two consecutive values c0, c1 behind its C pointer, store
  //   c0 + ALPHA_R * acc   and   c1 + ALPHA_I * acc.
  // Each C pointer ends up advanced by 2 * SIZE.
  3175. .L042:
  3176. { .mfb
  3177. lfetch.nt1 [PREB], 16 * SIZE
  3178. FMA f64 = f32, f48, f64 // A1 * B1
  3179. nop __LINE__
  3180. }
  3181. { .mfb
  3182. (p12) cmp.ne p3, p0 = 0, L
  3183. FMA f72 = f32, f49, f72 // A1 * B2
  3184. nop __LINE__
  3185. }
  3186. ;;
  3187. { .mfi
  3188. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  3189. FMA f80 = f32, f50, f80 // A1 * B3
  3190. cmp.ne p4, p5 = 0, L
  3191. }
  3192. { .mfb
  3193. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3194. FMA f88 = f32, f51, f88 // A1 * B4
  3195. nop __LINE__
  3196. }
  3197. ;;
  3198. { .mfi
  3199. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3200. FMA f96 = f32, f52, f96 // A1 * B5
  3201. nop __LINE__
  3202. }
  3203. { .mmf
  3204. (p5) LDFD f6 = [C1], SIZE
  3205. (p5) LDFD f10 = [C2], SIZE
  3206. FMA f104 = f32, f53, f104 // A1 * B6
  3207. }
  3208. ;;
  3209. { .mfi
  3210. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  3211. FMA f112 = f32, f54, f112 // A1 * B7
  3212. nop __LINE__
  3213. }
  3214. { .mmf
  3215. (p5) LDFD f7 = [C1], -SIZE  // second value, then rewind C1
  3216. (p5) LDFD f11 = [C2], -SIZE
  3217. FMA f120 = f32, f55, f120 // A1 * B8
  3218. }
  3219. ;;
  3220. { .mmf
  3221. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  3222. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  3223. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3224. }
  3225. { .mmf
  3226. (p5) LDFD f12 = [C3], SIZE
  3227. (p5) LDFD f14 = [C4], SIZE
  3228. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3229. }
  3230. ;;
  3231. { .mfi
  3232. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3233. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3234. nop __LINE__
  3235. }
  3236. { .mmf
  3237. (p5) LDFD f13 = [C3], -SIZE
  3238. (p5) LDFD f15 = [C4], -SIZE
  3239. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3240. }
  3241. ;;
  3242. { .mfi
  3243. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3244. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  3245. nop __LINE__
  3246. }
  3247. { .mmf
  3248. (p5) LDFD f16 = [C5], SIZE
  3249. (p5) LDFD f18 = [C6], SIZE
  3250. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  3251. }
  3252. ;;
  3253. { .mfi
  3254. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  3255. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  3256. adds L = -1, L
  3257. }
  3258. { .mmb
  3259. (p5) LDFD f17 = [C5], -SIZE
  3260. (p5) LDFD f19 = [C6], -SIZE
  3261. nop __LINE__
  3262. }
  3263. ;;
  3264. { .mfb
  3265. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3266. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  3267. nop __LINE__
  3268. }
  3269. { .mmb
  3270. (p5) LDFD f20 = [C7], SIZE
  3271. (p5) LDFD f22 = [C8], SIZE
  3272. br.cloop.sptk.few .L042
  3273. }
  3274. ;;
  // Loop exit.  The second C7/C8 values are loaded unconditionally here;
  // their partners f20/f22 were loaded under p5 on the last loop pass.
  3275. { .mmf
  3276. LDFD f21 = [C7], -SIZE
  3277. LDFD f23 = [C8], -SIZE
  3278. FMA f6 = ALPHA_R, f64, f6
  3279. }
  3280. { .mmf
  3281. nop __LINE__
  3282. nop __LINE__
  3283. FMA f10 = ALPHA_R, f72, f10
  3284. }
  3285. ;;
  3286. { .mmf
  3287. nop __LINE__
  3288. nop __LINE__
  3289. FMA f7 = ALPHA_I, f64, f7
  3290. }
  3291. { .mmf
  3292. nop __LINE__
  3293. nop __LINE__
  3294. FMA f11 = ALPHA_I, f72, f11
  3295. }
  3296. ;;
  3297. { .mmf
  3298. nop __LINE__
  3299. nop __LINE__
  3300. FMA f12 = ALPHA_R, f80, f12
  3301. }
  3302. { .mmf
  3303. nop __LINE__
  3304. nop __LINE__
  3305. FMA f14 = ALPHA_R, f88, f14
  3306. }
  3307. ;;
  3308. { .mmf
  3309. nop __LINE__
  3310. nop __LINE__
  3311. FMA f13 = ALPHA_I, f80, f13
  3312. }
  3313. { .mmf
  3314. nop __LINE__
  3315. nop __LINE__
  3316. FMA f15 = ALPHA_I, f88, f15
  3317. }
  3318. ;;
  3319. { .mmf
  3320. STFD [C1 ] = f6, SIZE
  3321. STFD [C2 ] = f10, SIZE
  3322. FMA f16 = ALPHA_R, f96, f16
  3323. }
  3324. { .mmf
  3325. nop __LINE__
  3326. nop __LINE__
  3327. FMA f18 = ALPHA_R, f104, f18
  3328. }
  3329. ;;
  3330. { .mmf
  3331. STFD [C1 ] = f7, SIZE
  3332. STFD [C2 ] = f11, SIZE
  3333. FMA f17 = ALPHA_I, f96, f17
  3334. }
  3335. { .mmf
  3336. nop __LINE__
  3337. nop __LINE__
  3338. FMA f19 = ALPHA_I, f104, f19
  3339. }
  3340. ;;
  3341. { .mmf
  3342. STFD [C3 ] = f12, SIZE
  3343. STFD [C4 ] = f14, SIZE
  3344. FMA f20 = ALPHA_R, f112, f20
  3345. }
  3346. { .mmf
  3347. nop __LINE__
  3348. nop __LINE__
  3349. FMA f22 = ALPHA_R, f120, f22
  3350. }
  3351. ;;
  3352. { .mmf
  3353. STFD [C3 ] = f13, SIZE
  3354. STFD [C4 ] = f15, SIZE
  3355. FMA f21 = ALPHA_I, f112, f21
  3356. }
  3357. { .mmf
  3358. nop __LINE__
  3359. nop __LINE__
  3360. FMA f23 = ALPHA_I, f120, f23
  3361. }
  3362. ;;
  3363. { .mmi
  3364. STFD [C5 ] = f16, SIZE
  3365. STFD [C6 ] = f18, SIZE
  3366. nop __LINE__
  3367. }
  3368. ;;
  3369. { .mmi
  3370. STFD [C5 ] = f17, SIZE
  3371. STFD [C6 ] = f19, SIZE
  3372. nop __LINE__
  3373. }
  3374. ;;
  3375. { .mmi
  3376. STFD [C7 ] = f20, SIZE
  3377. STFD [C8 ] = f22, SIZE
  3378. nop __LINE__
  3379. }
  3380. ;;
  3381. { .mmi
  3382. STFD [C7 ] = f21, SIZE
  3383. STFD [C8 ] = f23, SIZE
  3384. nop __LINE__
  3385. }
  3386. ;;
  3387. .align 32
  // .L049: tail of one trip through the M-direction tiles.
  // Commits the advanced B pointer (B = BOFFSET), resets AOFFSET to the
  // start of A, and branches to .L010 while 0 < J.  Falls through to .L050
  // once J is no longer positive.
  // NOTE(review): J is not modified in this block; presumably it is counted
  // down at .L010 - confirm there.
  3388. .L049:
  3389. { .mmi
  3390. mov B = BOFFSET
  3391. mov AOFFSET = A
  3392. nop __LINE__
  3393. }
  3394. ;;
  3395. { .mmb
  3396. nop __LINE__
  3397. cmp.lt p6, p0 = 0, J  // p6 = (0 < J)
  3398. (p6) br.cond.dptk .L010
  3399. }
  3400. ;;
  3401. .align 32
  // .L050: entry for the four-column case, selected by bit 2 of N.
  // When that bit is clear control goes to .L090.  Otherwise this block
  //   - derives the four column pointers C1 = C, C2 = C + LDC,
  //     C3 = C + 2 * LDC, C4 = C2 + 2 * LDC and advances C by 4 * LDC,
  //   - resets AOFFSET to A and sets I = M >> 3 (count of 8-row tiles),
  //   - clears accumulators f64, f72, f80, f88 and f65, f73, f81, f89,
  //   - branches to .L060 when I == 0, else falls through to .L052.
  // Note that C1-C3, AOFFSET and four accumulators are written before the
  // branch to .L090 is taken, so .L090 sees those values.
  3402. .L050:
  3403. { .mfi
  3404. mov C1 = C
  3405. mov f64 = f0
  3406. tbit.z p6, p0 = N, 2  // p6 = (bit 2 of N == 0)
  3407. }
  3408. { .mfi
  3409. add C2 = LDC, C
  3410. mov f72 = f0
  3411. shr I = M, 3
  3412. }
  3413. ;;
  3414. { .mfi
  3415. shladd C3 = LDC, 1, C  // C3 = (LDC << 1) + C
  3416. mov f80 = f0
  3417. nop __LINE__
  3418. }
  3419. { .mfb
  3420. mov AOFFSET = A
  3421. mov f88 = f0
  3422. (p6) br.cond.dpnt .L090
  3423. }
  3424. ;;
  3425. { .mfi
  3426. cmp.eq p6, p7 = 0, I  // p6 = (I == 0)
  3427. mov f65 = f0
  3428. nop __LINE__
  3429. }
  3430. { .mfi
  3431. shladd C4 = LDC, 1, C2  // C4 = (LDC << 1) + C2
  3432. mov f73 = f0
  3433. nop __LINE__
  3434. }
  3435. ;;
  3436. { .mfi
  3437. nop __LINE__
  3438. mov f81 = f0
  3439. nop __LINE__
  3440. }
  3441. { .mfb
  3442. shladd C = LDC, 2, C  // C += 4 * LDC
  3443. mov f89 = f0
  3444. (p6) br.cond.dpnt .L060
  3445. }
  3446. ;;
  3447. .align 32
  // .L052: set-up for one 8 x 4 tile (eight A values times four B values).
  //   - preloads B into f48-f51 and A into f32-f39,
  //   - clears the accumulators not already cleared on entry: f66-f71,
  //     f74-f79, f82-f87 and f90-f95.  Most use "mov f = f0" in an F slot;
  //     f77, f78, f79, f84, f85, f86, f87 and f91 use "setf.d f = r0" in an
  //     M slot instead,
  //   - issues four CPREFETCH requests starting CPREFETCHSIZE * SIZE past
  //     C1, stepping by LDC between them,
  //   - sets PREA/PREB, forces p3 true, sets p12 = (bit 0 of K + 1 == 0) and
  //     programs ar.lc = ((K + 1) >> 1) - 1 for the .L053 loop.
  // f64, f65, f72, f73, f80, f81, f88 and f89 are not written here.
  3448. .L052:
  3449. { .mfb
  3450. LDFPD f48, f49 = [B]
  3451. mov f66 = f0
  3452. nop __LINE__
  3453. }
  3454. { .mfb
  3455. adds BOFFSET = 2 * SIZE, B
  3456. mov f74 = f0
  3457. nop __LINE__
  3458. }
  3459. ;;
  3460. { .mfi
  3461. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3462. mov f82 = f0
  3463. nop __LINE__
  3464. }
  3465. { .mfi
  3466. setf.d f84 = r0
  3467. mov f90 = f0
  3468. nop __LINE__
  3469. }
  3470. ;;
  3471. { .mfi
  3472. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3473. mov f67 = f0
  3474. adds PREC = CPREFETCHSIZE * SIZE, C1
  3475. }
  3476. { .mfi
  3477. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  3478. mov f75 = f0
  3479. adds L = 1, K  // L = K + 1
  3480. }
  3481. ;;
  3482. { .mfi
  3483. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  3484. mov f83 = f0
  3485. tbit.z p12, p0 = L, 0
  3486. }
  3487. { .mfi
  3488. setf.d f91 = r0
  3489. mov f68 = f0
  3490. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  3491. }
  3492. ;;
  3493. { .mfi
  3494. CPREFETCH [PREC], LDC
  3495. mov f76 = f0
  3496. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  3497. }
  3498. { .mfi
  3499. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  3500. mov f92 = f0
  3501. cmp.eq p3, p0 = r0, r0  // p3 = true
  3502. }
  3503. ;;
  3504. { .mfi
  3505. CPREFETCH [PREC], LDC
  3506. mov f69 = f0
  3507. shr L = L, 1  // L = (K + 1) >> 1
  3508. }
  3509. { .mmf
  3510. setf.d f77 = r0
  3511. setf.d f85 = r0
  3512. mov f93 = f0
  3513. }
  3514. ;;
  3515. { .mfi
  3516. CPREFETCH [PREC], LDC
  3517. mov f70 = f0
  3518. adds L = -1, L  // L = ((K + 1) >> 1) - 1
  3519. }
  3520. { .mmf
  3521. setf.d f78 = r0
  3522. setf.d f86 = r0
  3523. mov f94 = f0
  3524. }
  3525. ;;
  3526. { .mfi
  3527. CPREFETCH [PREC]
  3528. mov f71 = f0
  3529. mov ar.lc = L
  3530. }
  3531. { .mmf
  3532. setf.d f79 = r0
  3533. setf.d f87 = r0
  3534. mov f95 = f0
  3535. }
  3536. ;;
  3537. .align 32
  // .L053: K loop of the 8 x 4 tile.
  // One pass covers two K steps:
  //   step 1 uses A in f32-f39 and B in f48-f51 (always executed),
  //   step 2 uses A in f40-f47 and B in f56-f59 (predicated on p3).
  // Accumulator for A value i (1..8) and B value j (1..4) is
  // f(64 + 8 * (j - 1) + (i - 1)), i.e. f64-f71 for B1, f72-f79 for B2,
  // f80-f87 for B3 and f88-f95 for B4.
  // Predicates set at the top of each pass:
  //   p3 - when p12 is set it is recomputed as (L != 0); otherwise unchanged,
  //   p4 - (L != 0): reload the step-1 operands for the next pass,
  //   p5 - (L == 0): final pass, load current C values for the write-back.
  // C9-C12 are set to C1-C4 plus 4 * SIZE on every pass.  On the final pass
  // C1 and C9 are each read in two runs of four values (post-increments
  // SIZE, SIZE, SIZE, 5 * SIZE, then SIZE, SIZE, SIZE, -11 * SIZE), which
  // returns both pointers to their starting positions; the first run through
  // C2/C10 (f24-f31) is also issued here and left 8 * SIZE ahead.
  3538. .L053:
  3539. { .mfb
  3540. lfetch.nt1 [PREA], 16 * SIZE
  3541. FMA f64 = f32, f48, f64 // A1 * B1
  3542. nop __LINE__
  3543. }
  3544. { .mfi
  3545. nop __LINE__
  3546. FMA f72 = f32, f49, f72 // A1 * B2
  3547. (p12) cmp.ne p3, p0 = 0, L
  3548. }
  3549. ;;
  3550. { .mfi
  3551. lfetch.nt1 [PREB], 8 * SIZE
  3552. FMA f80 = f32, f50, f80 // A1 * B3
  3553. cmp.ne p4, p5 = 0, L
  3554. }
  3555. { .mfi
  3556. nop __LINE__
  3557. FMA f88 = f32, f51, f88 // A1 * B4
  3558. adds C9 = 4 * SIZE, C1
  3559. }
  3560. ;;
  3561. { .mfi
  3562. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  3563. FMA f65 = f33, f48, f65 // A2 * B1
  3564. adds C10 = 4 * SIZE, C2
  3565. }
  3566. { .mfi
  3567. nop __LINE__
  3568. FMA f73 = f33, f49, f73 // A2 * B2
  3569. adds C11 = 4 * SIZE, C3
  3570. }
  3571. ;;
  3572. { .mfi
  3573. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  3574. FMA f81 = f33, f50, f81 // A2 * B3
  3575. adds C12 = 4 * SIZE, C4
  3576. }
  3577. { .mfb
  3578. nop __LINE__
  3579. FMA f89 = f33, f51, f89 // A2 * B4
  3580. nop __LINE__
  3581. }
  3582. ;;
  3583. { .mfb
  3584. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3585. FMA f66 = f34, f48, f66 // A3 * B1
  3586. nop __LINE__
  3587. }
  3588. { .mfb
  3589. nop __LINE__
  3590. FMA f74 = f34, f49, f74 // A3 * B2
  3591. nop __LINE__
  3592. }
  3593. ;;
  3594. { .mfb
  3595. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  3596. FMA f82 = f34, f50, f82 // A3 * B3
  3597. nop __LINE__
  3598. }
  3599. { .mfb
  3600. nop __LINE__
  3601. FMA f90 = f34, f51, f90 // A3 * B4
  3602. nop __LINE__
  3603. }
  3604. ;;
  3605. { .mfb
  3606. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  3607. FMA f67 = f35, f48, f67 // A4 * B1
  3608. nop __LINE__
  3609. }
  3610. { .mfb
  3611. nop __LINE__
  3612. FMA f75 = f35, f49, f75 // A4 * B2
  3613. nop __LINE__
  3614. }
  3615. ;;
  3616. { .mfb
  3617. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  3618. FMA f83 = f35, f50, f83 // A4 * B3
  3619. nop __LINE__
  3620. }
  3621. { .mfb
  3622. nop __LINE__
  3623. FMA f91 = f35, f51, f91 // A4 * B4
  3624. nop __LINE__
  3625. }
  3626. ;;
  3627. { .mfb
  3628. nop __LINE__
  3629. FMA f68 = f36, f48, f68 // A5 * B1
  3630. nop __LINE__
  3631. }
  3632. { .mfb
  3633. nop __LINE__
  3634. FMA f76 = f36, f49, f76 // A5 * B2
  3635. nop __LINE__
  3636. }
  3637. ;;
  3638. { .mfb
  3639. nop __LINE__
  3640. FMA f84 = f36, f50, f84 // A5 * B3
  3641. nop __LINE__
  3642. }
  3643. { .mfb
  3644. nop __LINE__
  3645. FMA f92 = f36, f51, f92 // A5 * B4
  3646. nop __LINE__
  3647. }
  3648. ;;
  3649. { .mfb
  3650. nop __LINE__
  3651. FMA f69 = f37, f48, f69 // A6 * B1
  3652. nop __LINE__
  3653. }
  3654. { .mfb
  3655. nop __LINE__
  3656. FMA f77 = f37, f49, f77 // A6 * B2
  3657. nop __LINE__
  3658. }
  3659. ;;
  3660. { .mfb
  3661. nop __LINE__
  3662. FMA f85 = f37, f50, f85 // A6 * B3
  3663. nop __LINE__
  3664. }
  3665. { .mfb
  3666. nop __LINE__
  3667. FMA f93 = f37, f51, f93 // A6 * B4
  3668. nop __LINE__
  3669. }
  3670. ;;
  3671. { .mfb
  3672. nop __LINE__
  3673. FMA f70 = f38, f48, f70 // A7 * B1
  3674. nop __LINE__
  3675. }
  3676. { .mfb
  3677. nop __LINE__
  3678. FMA f78 = f38, f49, f78 // A7 * B2
  3679. nop __LINE__
  3680. }
  3681. ;;
  3682. { .mfb
  3683. nop __LINE__
  3684. FMA f86 = f38, f50, f86 // A7 * B3
  3685. nop __LINE__
  3686. }
  3687. { .mfb
  3688. nop __LINE__
  3689. FMA f94 = f38, f51, f94 // A7 * B4
  3690. nop __LINE__
  3691. }
  3692. ;;
  3693. { .mfb
  3694. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  3695. FMA f71 = f39, f48, f71 // A8 * B1
  3696. nop __LINE__
  3697. }
  3698. { .mfb
  3699. nop __LINE__
  3700. FMA f79 = f39, f49, f79 // A8 * B2
  3701. nop __LINE__
  3702. }
  3703. ;;
  3704. { .mfb
  3705. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  3706. FMA f87 = f39, f50, f87 // A8 * B3
  3707. nop __LINE__
  3708. }
  3709. { .mfb
  3710. nop __LINE__
  3711. FMA f95 = f39, f51, f95 // A8 * B4
  3712. nop __LINE__
  3713. }
  3714. ;;
  // Second K step of the pass (all FMAs predicated on p3), overlapped with
  // the remaining p4 reloads and, on the final pass, the p5 loads of C.
  3715. { .mfb
  3716. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  3717. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  3718. nop __LINE__
  3719. }
  3720. { .mfb
  3721. nop __LINE__
  3722. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  3723. nop __LINE__
  3724. }
  3725. ;;
  3726. { .mfb
  3727. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  3728. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  3729. nop __LINE__
  3730. }
  3731. { .mfb
  3732. nop __LINE__
  3733. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  3734. nop __LINE__
  3735. }
  3736. ;;
  3737. { .mfb
  3738. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  3739. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  3740. nop __LINE__
  3741. }
  3742. { .mfb
  3743. nop __LINE__
  3744. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  3745. nop __LINE__
  3746. }
  3747. ;;
  3748. { .mfb
  3749. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  3750. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  3751. nop __LINE__
  3752. }
  3753. { .mfb
  3754. nop __LINE__
  3755. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  3756. nop __LINE__
  3757. }
  3758. ;;
  3759. { .mfb
  3760. (p5) LDFD f6 = [C1 ], SIZE
  3761. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  3762. nop __LINE__
  3763. }
  3764. { .mfb
  3765. (p5) LDFD f7 = [C9 ], SIZE
  3766. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  3767. nop __LINE__
  3768. }
  3769. ;;
  3770. { .mfb
  3771. (p5) LDFD f10 = [C1 ], SIZE
  3772. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  3773. nop __LINE__
  3774. }
  3775. { .mfb
  3776. (p5) LDFD f11 = [C9 ], SIZE
  3777. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  3778. nop __LINE__
  3779. }
  3780. ;;
  3781. { .mfb
  3782. (p5) LDFD f12 = [C1 ], SIZE
  3783. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  3784. nop __LINE__
  3785. }
  3786. { .mfb
  3787. (p5) LDFD f13 = [C9 ], SIZE
  3788. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  3789. nop __LINE__
  3790. }
  3791. ;;
  3792. { .mfb
  3793. (p5) LDFD f14 = [C1 ], 5 * SIZE  // skip the four values covered by C9
  3794. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  3795. nop __LINE__
  3796. }
  3797. { .mfb
  3798. (p5) LDFD f15 = [C9 ], 5 * SIZE
  3799. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  3800. nop __LINE__
  3801. }
  3802. ;;
  3803. { .mfb
  3804. (p5) LDFD f16 = [C1 ], SIZE
  3805. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  3806. nop __LINE__
  3807. }
  3808. { .mfb
  3809. (p5) LDFD f17 = [C9], SIZE
  3810. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  3811. nop __LINE__
  3812. }
  3813. ;;
  3814. { .mfb
  3815. (p5) LDFD f18 = [C1 ], SIZE
  3816. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  3817. nop __LINE__
  3818. }
  3819. { .mfb
  3820. (p5) LDFD f19 = [C9], SIZE
  3821. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  3822. nop __LINE__
  3823. }
  3824. ;;
  3825. { .mfb
  3826. (p5) LDFD f20 = [C1 ], SIZE
  3827. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  3828. nop __LINE__
  3829. }
  3830. { .mfb
  3831. (p5) LDFD f21 = [C9], SIZE
  3832. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  3833. nop __LINE__
  3834. }
  3835. ;;
  3836. { .mfb
  3837. (p5) LDFD f22 = [C1 ], -11 * SIZE  // rewind C1 to its starting position
  3838. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  3839. nop __LINE__
  3840. }
  3841. { .mfb
  3842. (p5) LDFD f23 = [C9 ], -11 * SIZE  // rewind C9 likewise
  3843. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  3844. nop __LINE__
  3845. }
  3846. ;;
  3847. { .mfb
  3848. (p5) LDFD f24 = [C2 ], SIZE
  3849. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  3850. nop __LINE__
  3851. }
  3852. { .mfb
  3853. (p5) LDFD f25 = [C10], SIZE
  3854. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  3855. nop __LINE__
  3856. }
  3857. ;;
  3858. { .mfb
  3859. (p5) LDFD f26 = [C2 ], SIZE
  3860. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  3861. nop __LINE__
  3862. }
  3863. { .mfb
  3864. (p5) LDFD f27 = [C10], SIZE
  3865. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  3866. nop __LINE__
  3867. }
  3868. ;;
  3869. { .mfb
  3870. (p5) LDFD f28 = [C2 ], SIZE
  3871. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  3872. nop __LINE__
  3873. }
  3874. { .mfb
  3875. (p5) LDFD f29 = [C10], SIZE
  3876. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  3877. nop __LINE__
  3878. }
  3879. ;;
  3880. { .mfi
  3881. (p5) LDFD f30 = [C2 ], 5 * SIZE
  3882. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  3883. adds L = -1, L
  3884. }
  3885. { .mfb
  3886. (p5) LDFD f31 = [C10], 5 * SIZE
  3887. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  3888. br.cloop.sptk.few .L053
  3889. }
  3890. ;;
  3891. .align 32
  3892. .L058:
  3893. { .mmf
  3894. LDFD f32 = [C2 ], SIZE
  3895. LDFD f33 = [C10], SIZE
  3896. FMA f6 = ALPHA_R, f64, f6
  3897. }
  3898. { .mmf
  3899. nop __LINE__
  3900. nop __LINE__
  3901. FMA f7 = ALPHA_R, f66, f7
  3902. }
  3903. ;;
  3904. { .mmf
  3905. LDFD f34 = [C2 ], SIZE
  3906. LDFD f35 = [C10], SIZE
  3907. FMA f10 = ALPHA_I, f64, f10
  3908. }
  3909. { .mmf
  3910. nop __LINE__
  3911. nop __LINE__
  3912. FMA f11 = ALPHA_I, f66, f11
  3913. }
  3914. ;;
  3915. { .mmf
  3916. LDFD f36 = [C2 ], SIZE
  3917. LDFD f37 = [C10], SIZE
  3918. FMA f12 = ALPHA_R, f65, f12
  3919. }
  3920. { .mmf
  3921. nop __LINE__
  3922. nop __LINE__
  3923. FMA f13 = ALPHA_R, f67, f13
  3924. }
  3925. ;;
  3926. { .mmf
  3927. LDFD f38 = [C2 ], - 11 * SIZE
  3928. LDFD f39 = [C10], - 11 * SIZE
  3929. FMA f14 = ALPHA_I, f65, f14
  3930. }
  3931. { .mmf
  3932. nop __LINE__
  3933. nop __LINE__
  3934. FMA f15 = ALPHA_I, f67, f15
  3935. }
  3936. ;;
  3937. { .mmf
  3938. STFD [C1 ] = f6, SIZE
  3939. STFD [C9 ] = f7, SIZE
  3940. FMA f16 = ALPHA_R, f68, f16
  3941. }
  3942. { .mmf
  3943. LDFD f48 = [C3 ], SIZE
  3944. LDFD f49 = [C11], SIZE
  3945. FMA f17 = ALPHA_R, f70, f17
  3946. }
  3947. ;;
  3948. { .mmf
  3949. STFD [C1 ] = f10, SIZE
  3950. STFD [C9 ] = f11, SIZE
  3951. FMA f18 = ALPHA_I, f68, f18
  3952. }
  3953. { .mmf
  3954. LDFD f50 = [C3 ], SIZE
  3955. LDFD f51 = [C11], SIZE
  3956. FMA f19 = ALPHA_I, f70, f19
  3957. }
  3958. ;;
  3959. { .mmf
  3960. STFD [C1 ] = f12, SIZE
  3961. STFD [C9 ] = f13, SIZE
  3962. FMA f20 = ALPHA_R, f69, f20
  3963. }
  3964. { .mmf
  3965. LDFD f52 = [C3 ], SIZE
  3966. LDFD f53 = [C11], SIZE
  3967. FMA f21 = ALPHA_R, f71, f21
  3968. }
  3969. ;;
  3970. { .mmf
  3971. STFD [C1 ] = f14, 5 * SIZE
  3972. STFD [C9 ] = f15, 5 * SIZE
  3973. FMA f22 = ALPHA_I, f69, f22
  3974. }
  3975. { .mmf
  3976. LDFD f54 = [C3 ], 5 * SIZE
  3977. LDFD f55 = [C11], 5 * SIZE
  3978. FMA f23 = ALPHA_I, f71, f23
  3979. }
  3980. ;;
  3981. { .mmf
  3982. STFD [C1 ] = f16, SIZE
  3983. STFD [C9 ] = f17, SIZE
  3984. FMA f24 = ALPHA_R, f72, f24
  3985. }
  3986. { .mmf
  3987. LDFD f40 = [C3 ], SIZE
  3988. LDFD f41 = [C11], SIZE
  3989. FMA f25 = ALPHA_R, f74, f25
  3990. }
  3991. ;;
  3992. { .mmf
  3993. STFD [C1 ] = f18, SIZE
  3994. STFD [C9 ] = f19, SIZE
  3995. FMA f26 = ALPHA_I, f72, f26
  3996. }
  3997. { .mmf
  3998. LDFD f42 = [C3 ], SIZE
  3999. LDFD f43 = [C11], SIZE
  4000. FMA f27 = ALPHA_I, f74, f27
  4001. }
  4002. ;;
  4003. { .mmf
  4004. STFD [C1 ] = f20, SIZE
  4005. STFD [C9 ] = f21, SIZE
  4006. FMA f28 = ALPHA_R, f73, f28
  4007. }
  4008. { .mmf
  4009. LDFD f44 = [C3 ], SIZE
  4010. LDFD f45 = [C11], SIZE
  4011. FMA f29 = ALPHA_R, f75, f29
  4012. }
  4013. ;;
  4014. { .mmf
  4015. STFD [C1 ] = f22, 5 * SIZE
  4016. STFD [C9 ] = f23, 5 * SIZE
  4017. FMA f30 = ALPHA_I, f73, f30
  4018. }
  4019. { .mmf
  4020. LDFD f46 = [C3 ], - 11 * SIZE
  4021. LDFD f56 = [C11], - 11 * SIZE
  4022. FMA f31 = ALPHA_I, f75, f31
  4023. }
  4024. ;;
  4025. { .mmf
  4026. STFD [C2 ] = f24, SIZE
  4027. STFD [C10] = f25, SIZE
  4028. FMA f32 = ALPHA_R, f76, f32
  4029. }
  4030. { .mmf
  4031. LDFD f57 = [C4 ], SIZE
  4032. LDFD f58 = [C12], SIZE
  4033. FMA f33 = ALPHA_R, f78, f33
  4034. }
  4035. ;;
  4036. { .mmf
  4037. STFD [C2 ] = f26, SIZE
  4038. STFD [C10] = f27, SIZE
  4039. FMA f34 = ALPHA_I, f76, f34
  4040. }
  4041. { .mmf
  4042. LDFD f59 = [C4 ], SIZE
  4043. LDFD f60 = [C12], SIZE
  4044. FMA f35 = ALPHA_I, f78, f35
  4045. }
  4046. ;;
  4047. { .mmf
  4048. STFD [C2 ] = f28, SIZE
  4049. STFD [C10] = f29, SIZE
  4050. FMA f36 = ALPHA_R, f77, f36
  4051. }
  4052. { .mmf
  4053. LDFD f61 = [C4 ], SIZE
  4054. LDFD f62 = [C12], SIZE
  4055. FMA f37 = ALPHA_R, f79, f37
  4056. }
  4057. ;;
  4058. { .mmf
  4059. STFD [C2 ] = f30, 5 * SIZE
  4060. STFD [C10] = f31, 5 * SIZE
  4061. FMA f38 = ALPHA_I, f77, f38
  4062. }
  4063. { .mmf
  4064. LDFD f63 = [C4 ], 5 * SIZE
  4065. LDFD f47 = [C12], 5 * SIZE
  4066. FMA f39 = ALPHA_I, f79, f39
  4067. }
  4068. ;;
  4069. { .mmf
  4070. STFD [C2 ] = f32, SIZE
  4071. STFD [C10] = f33, SIZE
  4072. FMA f48 = ALPHA_R, f80, f48
  4073. }
  4074. { .mmf
  4075. LDFD f64 = [C4 ], SIZE
  4076. LDFD f65 = [C12], SIZE
  4077. FMA f49 = ALPHA_R, f82, f49
  4078. }
  4079. ;;
  4080. { .mmf
  4081. STFD [C2 ] = f34, SIZE
  4082. STFD [C10] = f35, SIZE
  4083. FMA f50 = ALPHA_I, f80, f50
  4084. }
  4085. { .mmf
  4086. LDFD f6 = [C4 ], SIZE
  4087. LDFD f7 = [C12], SIZE
  4088. FMA f51 = ALPHA_I, f82, f51
  4089. }
  4090. ;;
  4091. { .mmf
  4092. STFD [C2 ] = f36, SIZE
  4093. STFD [C10] = f37, SIZE
  4094. FMA f52 = ALPHA_R, f81, f52
  4095. }
  4096. { .mmf
  4097. LDFD f10 = [C4 ], SIZE
  4098. LDFD f11 = [C12], SIZE
  4099. FMA f53 = ALPHA_R, f83, f53
  4100. }
  4101. ;;
  4102. { .mmf
  4103. STFD [C2 ] = f38, 5 * SIZE
  4104. STFD [C10] = f39, 5 * SIZE
  4105. FMA f54 = ALPHA_I, f81, f54
  4106. }
  4107. { .mmf
  4108. LDFD f12 = [C4 ], - 11 * SIZE
  4109. LDFD f13 = [C12], - 11 * SIZE
  4110. FMA f55 = ALPHA_I, f83, f55
  4111. }
  4112. ;;
  4113. { .mmf
  4114. STFD [C3 ] = f48, SIZE
  4115. STFD [C11] = f49, SIZE
  4116. FMA f40 = ALPHA_R, f84, f40
  4117. }
  4118. { .mmf
  4119. nop __LINE__
  4120. nop __LINE__
  4121. FMA f41 = ALPHA_R, f86, f41
  4122. }
  4123. ;;
  4124. { .mmf
  4125. STFD [C3 ] = f50, SIZE
  4126. STFD [C11] = f51, SIZE
  4127. FMA f42 = ALPHA_I, f84, f42
  4128. }
  4129. { .mmf
  4130. nop __LINE__
  4131. nop __LINE__
  4132. FMA f43 = ALPHA_I, f86, f43
  4133. }
  4134. ;;
  4135. { .mmf
  4136. STFD [C3 ] = f52, SIZE
  4137. STFD [C11] = f53, SIZE
  4138. FMA f44 = ALPHA_R, f85, f44
  4139. }
  4140. { .mmf
  4141. nop __LINE__
  4142. nop __LINE__
  4143. FMA f45 = ALPHA_R, f87, f45
  4144. }
  4145. ;;
  4146. { .mmf
  4147. STFD [C3 ] = f54, 5 * SIZE
  4148. STFD [C11] = f55, 5 * SIZE
  4149. FMA f46 = ALPHA_I, f85, f46
  4150. }
  4151. { .mmf
  4152. nop __LINE__
  4153. nop __LINE__
  4154. FMA f56 = ALPHA_I, f87, f56
  4155. }
  4156. ;;
  4157. { .mmf
  4158. STFD [C3 ] = f40, SIZE
  4159. STFD [C11] = f41, SIZE
  4160. FMA f57 = ALPHA_R, f88, f57
  4161. }
  4162. { .mmf
  4163. nop __LINE__
  4164. nop __LINE__
  4165. FMA f58 = ALPHA_R, f90, f58
  4166. }
  4167. ;;
  4168. { .mmf
  4169. STFD [C3 ] = f42, SIZE
  4170. STFD [C11] = f43, SIZE
  4171. FMA f59 = ALPHA_I, f88, f59
  4172. }
  4173. { .mmf
  4174. nop __LINE__
  4175. nop __LINE__
  4176. FMA f60 = ALPHA_I, f90, f60
  4177. }
  4178. ;;
  4179. { .mmf
  4180. STFD [C3 ] = f44, SIZE
  4181. STFD [C11] = f45, SIZE
  4182. FMA f61 = ALPHA_R, f89, f61
  4183. }
  4184. { .mmf
  4185. nop __LINE__
  4186. nop __LINE__
  4187. FMA f62 = ALPHA_R, f91, f62
  4188. }
  4189. ;;
  4190. { .mmf
  4191. STFD [C3 ] = f46, 5 * SIZE
  4192. STFD [C11] = f56, 5 * SIZE
  4193. FMA f63 = ALPHA_I, f89, f63
  4194. }
  4195. { .mmf
  4196. nop __LINE__
  4197. nop __LINE__
  4198. FMA f47 = ALPHA_I, f91, f47
  4199. }
  4200. ;;
  4201. { .mmf
  4202. STFD [C4 ] = f57, SIZE
  4203. STFD [C12] = f58, SIZE
  4204. FMA f64 = ALPHA_R, f92, f64
  4205. }
  4206. { .mmf
  4207. nop __LINE__
  4208. nop __LINE__
  4209. FMA f65 = ALPHA_R, f94, f65
  4210. }
  4211. ;;
  4212. { .mmf
  4213. STFD [C4 ] = f59, SIZE
  4214. STFD [C12] = f60, SIZE
  4215. FMA f6 = ALPHA_I, f92, f6
  4216. }
  4217. { .mmf
  4218. nop __LINE__
  4219. nop __LINE__
  4220. FMA f7 = ALPHA_I, f94, f7
  4221. }
  4222. ;;
  4223. { .mmf
  4224. STFD [C4 ] = f61, SIZE
  4225. STFD [C12] = f62, SIZE
  4226. FMA f10 = ALPHA_R, f93, f10
  4227. }
  4228. { .mmf
  4229. nop __LINE__
  4230. nop __LINE__
  4231. FMA f11 = ALPHA_R, f95, f11
  4232. }
  4233. ;;
  4234. { .mmf
  4235. STFD [C4 ] = f63, 5 * SIZE
  4236. STFD [C12] = f47, 5 * SIZE
  4237. FMA f12 = ALPHA_I, f93, f12
  4238. }
  4239. { .mmf
  4240. nop __LINE__
  4241. nop __LINE__
  4242. FMA f13 = ALPHA_I, f95, f13
  4243. }
  4244. ;;
  4245. { .mmf
  4246. STFD [C4 ] = f64, SIZE
  4247. STFD [C12] = f65, SIZE
  4248. mov f64 = f0
  4249. }
  4250. { .mmf
  4251. cmp.ne p6, p0 = 1, I
  4252. nop __LINE__
  4253. mov f72 = f0
  4254. }
  4255. ;;
  4256. { .mmf
  4257. STFD [C4 ] = f6, SIZE
  4258. STFD [C12] = f7, SIZE
  4259. mov f80 = f0
  4260. }
  4261. { .mmf
  4262. nop __LINE__
  4263. nop __LINE__
  4264. mov f88 = f0
  4265. }
  4266. ;;
  4267. { .mmf
  4268. STFD [C4 ] = f10, SIZE
  4269. STFD [C12] = f11, SIZE
  4270. mov f65 = f0
  4271. }
  4272. { .mmf
  4273. nop __LINE__
  4274. nop __LINE__
  4275. mov f73 = f0
  4276. }
  4277. ;;
  4278. { .mmf
  4279. STFD [C4 ] = f12, 5 * SIZE
  4280. STFD [C12] = f13, 5 * SIZE
  4281. mov f81 = f0
  4282. }
  4283. { .mfb
  4284. adds I = -1, I
  4285. mov f89 = f0
  4286. (p6) br.cond.dptk .L052
  4287. }
  4288. ;;
  /*
   * .L060: 4-row remainder tile against four B values per k-step.
   *
   * Skipped (branch to .L070) when bit 2 of M is clear.  Otherwise a 4x4
   * tile of real products is accumulated in
   *     f64-f67 (B1), f72-f75 (B2), f80-f83 (B3), f88-f91 (B4)
   * and then folded into C: for every accumulator, two consecutive
   * SIZE-spaced C slots receive  slot0 += ALPHA_R * acc  and
   * slot1 += ALPHA_I * acc  (presumably the real/imaginary halves of one
   * complex element -- confirm against the caller's C layout).
   *
   * Only f66, f67, f74, f75, f82, f83, f90 and f91 are cleared here; the
   * other eight accumulators are assumed to be zero on entry.
   * On exit C1-C4 have each advanced by 8 * SIZE.
   */
  4289. .align 32
  4290. .L060:
  4291. { .mfi
  4292. nop __LINE__
  4293. mov f66 = f0
  4294. tbit.z p6, p7 = M, 2
  4295. }
  4296. { .mfb
  4297. nop __LINE__
  4298. mov f74 = f0
  4299. (p6) br.cond.dptk .L070
  4300. }
  4301. ;;
  /*
   * Prologue: preload B1..B4 (f48-f51) and A1..A4 (f32-f35), set up the
   * prefetch pointers, and compute the trip count.
   *   L   = ((K + 1) >> 1) - 1  -> ar.lc   (loop body runs (K + 1) >> 1 times)
   *   p12 = 1 when K + 1 is even, i.e. K is odd
   *   p3  = 1 initially (enables the second k-step of each trip)
   */
  4302. { .mfb
  4303. LDFPD f48, f49 = [B]
  4304. mov f82 = f0
  4305. nop __LINE__
  4306. }
  4307. { .mfi
  4308. adds BOFFSET = 2 * SIZE, B
  4309. mov f90 = f0
  4310. adds L = 1, K
  4311. }
  4312. ;;
  4313. { .mii
  4314. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4315. tbit.z p12, p0 = L, 0
  4316. shr L = L, 1
  4317. }
  4318. ;;
  4319. { .mfi
  4320. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4321. mov f67 = f0
  4322. adds L = -1, L
  4323. }
  4324. { .mfi
  4325. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  4326. mov f75 = f0
  4327. nop __LINE__
  4328. }
  4329. ;;
  4330. { .mfi
  4331. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4332. mov f83 = f0
  4333. mov ar.lc = L
  4334. }
  4335. { .mfi
  4336. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  4337. mov f91 = f0
  4338. cmp.eq p3, p0 = r0, r0
  4339. }
  4340. ;;
  /*
   * .L062: main loop, two k-steps per trip.
   *   step 1 (unconditional): operands f32-f35 (A) and f48-f51 (B)
   *   step 2 (under p3)     : operands f40-f43 (A) and f56-f59 (B)
   * Predicates recomputed at the top of every trip from the counter L:
   *   p4 = (L != 0)  another trip follows: preload its step-1 operands
   *   p5 = (L == 0)  final trip: derive C9-C12 = C1-C4 + 4 * SIZE and start
   *                  loading C from C1/C9 and C2/C10 into f6-f23
   *   p3 = (L != 0) when p12 is set, so for odd K the second k-step is
   *        dropped on the final trip
   * The C1/C9/C2/C10 loads end with a -3 * SIZE post-increment, which
   * returns each pointer to where it started for the later stores.
   */
  4341. .align 32
  4342. .L062:
  4343. { .mfi
  4344. lfetch.nt1 [PREA], 8 * SIZE
  4345. FMA f64 = f32, f48, f64 // A1 * B1
  4346. cmp.ne p4, p5 = 0, L
  4347. }
  4348. { .mfi
  4349. nop __LINE__
  4350. FMA f72 = f32, f49, f72 // A1 * B2
  4351. (p12) cmp.ne p3, p0 = 0, L
  4352. }
  4353. ;;
  4354. { .mfi
  4355. lfetch.nt1 [PREB], 8 * SIZE
  4356. FMA f80 = f32, f50, f80 // A1 * B3
  4357. (p5) adds C9 = 4 * SIZE, C1
  4358. }
  4359. { .mfi
  4360. nop __LINE__
  4361. FMA f88 = f32, f51, f88 // A1 * B4
  4362. (p5) adds C10 = 4 * SIZE, C2
  4363. }
  4364. ;;
  4365. { .mfi
  4366. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  4367. FMA f65 = f33, f48, f65 // A2 * B1
  4368. (p5) adds C11 = 4 * SIZE, C3
  4369. }
  4370. { .mfi
  4371. nop __LINE__
  4372. FMA f73 = f33, f49, f73 // A2 * B2
  4373. (p5) adds C12 = 4 * SIZE, C4
  4374. }
  4375. ;;
  4376. { .mfb
  4377. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  4378. FMA f81 = f33, f50, f81 // A2 * B3
  4379. nop __LINE__
  4380. }
  4381. { .mfb
  4382. nop __LINE__
  4383. FMA f89 = f33, f51, f89 // A2 * B4
  4384. nop __LINE__
  4385. }
  4386. ;;
  4387. { .mfb
  4388. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4389. FMA f66 = f34, f48, f66 // A3 * B1
  4390. nop __LINE__
  4391. }
  4392. { .mfb
  4393. nop __LINE__
  4394. FMA f74 = f34, f49, f74 // A3 * B2
  4395. nop __LINE__
  4396. }
  4397. ;;
  4398. { .mfb
  4399. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  4400. FMA f82 = f34, f50, f82 // A3 * B3
  4401. nop __LINE__
  4402. }
  4403. { .mfb
  4404. nop __LINE__
  4405. FMA f90 = f34, f51, f90 // A3 * B4
  4406. nop __LINE__
  4407. }
  4408. ;;
  /*
   * The next four bundles form a single instruction group (no stop between
   * them): the reloads of f32/f33 and f48/f49 share the group with the last
   * step-1 FMAs that still read f48/f49.
   */
  4409. { .mfb
  4410. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4411. FMA f67 = f35, f48, f67 // A4 * B1
  4412. }
  4413. { .mfb
  4414. (p5) LDFD f6 = [C1 ], SIZE
  4415. FMA f75 = f35, f49, f75 // A4 * B2
  4416. nop __LINE__
  4417. }
  4418. { .mfb
  4419. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4420. FMA f83 = f35, f50, f83 // A4 * B3
  4421. nop __LINE__
  4422. }
  4423. { .mfb
  4424. (p5) LDFD f7 = [C9 ], SIZE
  4425. FMA f91 = f35, f51, f91 // A4 * B4
  4426. nop __LINE__
  4427. }
  4428. ;;
  /* Second k-step of the trip (all FMAs predicated on p3). */
  4429. { .mfb
  4430. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4431. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4432. nop __LINE__
  4433. }
  4434. { .mfb
  4435. (p5) LDFD f10 = [C1 ], SIZE
  4436. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  4437. nop __LINE__
  4438. }
  4439. ;;
  4440. { .mfb
  4441. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4442. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4443. nop __LINE__
  4444. }
  4445. { .mfb
  4446. (p5) LDFD f11 = [C9 ], SIZE
  4447. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  4448. nop __LINE__
  4449. }
  4450. ;;
  4451. { .mfb
  4452. (p5) LDFD f12 = [C1 ], SIZE
  4453. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4454. nop __LINE__
  4455. }
  4456. { .mfb
  4457. (p5) LDFD f13 = [C9], SIZE
  4458. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  4459. nop __LINE__
  4460. }
  4461. ;;
  4462. { .mfb
  4463. (p5) LDFD f14 = [C1 ], - 3 * SIZE
  4464. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4465. nop __LINE__
  4466. }
  4467. { .mfb
  4468. (p5) LDFD f15 = [C9], - 3 * SIZE
  4469. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  4470. nop __LINE__
  4471. }
  4472. ;;
  4473. { .mfb
  4474. (p5) LDFD f16 = [C2 ], SIZE
  4475. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  4476. nop __LINE__
  4477. }
  4478. { .mfb
  4479. (p5) LDFD f17 = [C10], SIZE
  4480. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  4481. nop __LINE__
  4482. }
  4483. ;;
  4484. { .mfb
  4485. (p5) LDFD f18 = [C2 ], SIZE
  4486. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  4487. nop __LINE__
  4488. }
  4489. { .mfb
  4490. (p5) LDFD f19 = [C10], SIZE
  4491. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  4492. nop __LINE__
  4493. }
  4494. ;;
  4495. { .mfb
  4496. (p5) LDFD f20 = [C2 ], SIZE
  4497. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  4498. nop __LINE__
  4499. }
  4500. { .mfb
  4501. (p5) LDFD f21 = [C10], SIZE
  4502. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  4503. nop __LINE__
  4504. }
  4505. ;;
  4506. { .mfi
  4507. (p5) LDFD f22 = [C2 ], -3 * SIZE
  4508. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  4509. adds L = -1, L
  4510. }
  4511. { .mfb
  4512. (p5) LDFD f23 = [C10], -3 * SIZE
  4513. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  4514. br.cloop.sptk.few .L062
  4515. }
  4516. ;;
  /*
   * Write-back.  The remaining C values are loaded from C3/C11 (f24-f31)
   * and C4/C12 (f32-f39; the A operand registers are free by now) while the
   * already-loaded values are scaled and stored.  Per accumulator:
   *     C slot 0 += ALPHA_R * acc ;  C slot 1 += ALPHA_I * acc
   * Accumulators for A1/A2 go through C1..C4, those for A3/A4 through
   * C9..C12.  Each pointer's four stores use post-increments
   * SIZE, SIZE, SIZE, 5 * SIZE, i.e. 8 * SIZE in total.
   */
  4517. { .mmf
  4518. LDFD f24 = [C3 ], SIZE
  4519. LDFD f25 = [C11], SIZE
  4520. FMA f6 = ALPHA_R, f64, f6
  4521. }
  4522. { .mmf
  4523. nop __LINE__
  4524. nop __LINE__
  4525. FMA f7 = ALPHA_R, f66, f7
  4526. }
  4527. ;;
  4528. { .mmf
  4529. LDFD f26 = [C3 ], SIZE
  4530. LDFD f27 = [C11], SIZE
  4531. FMA f10 = ALPHA_I, f64, f10
  4532. }
  4533. { .mmf
  4534. nop __LINE__
  4535. nop __LINE__
  4536. FMA f11 = ALPHA_I, f66, f11
  4537. }
  4538. ;;
  4539. { .mmf
  4540. LDFD f28 = [C3 ], SIZE
  4541. LDFD f29 = [C11], SIZE
  4542. FMA f12 = ALPHA_R, f65, f12
  4543. }
  4544. { .mmf
  4545. nop __LINE__
  4546. nop __LINE__
  4547. FMA f13 = ALPHA_R, f67, f13
  4548. }
  4549. ;;
  4550. { .mmf
  4551. LDFD f30 = [C3 ], - 3 * SIZE
  4552. LDFD f31 = [C11], - 3 * SIZE
  4553. FMA f14 = ALPHA_I, f65, f14
  4554. }
  4555. { .mmf
  4556. nop __LINE__
  4557. nop __LINE__
  4558. FMA f15 = ALPHA_I, f67, f15
  4559. }
  4560. ;;
  4561. { .mmf
  4562. STFD [C1 ] = f6, SIZE
  4563. STFD [C9 ] = f7, SIZE
  4564. FMA f16 = ALPHA_R, f72, f16
  4565. }
  4566. { .mmf
  4567. LDFD f32 = [C4 ], SIZE
  4568. LDFD f33 = [C12], SIZE
  4569. FMA f17 = ALPHA_R, f74, f17
  4570. }
  4571. ;;
  4572. { .mmf
  4573. STFD [C1 ] = f10, SIZE
  4574. STFD [C9 ] = f11, SIZE
  4575. FMA f18 = ALPHA_I, f72, f18
  4576. }
  4577. { .mmf
  4578. LDFD f34 = [C4 ], SIZE
  4579. LDFD f35 = [C12], SIZE
  4580. FMA f19 = ALPHA_I, f74, f19
  4581. }
  4582. ;;
  4583. { .mmf
  4584. STFD [C1 ] = f12, SIZE
  4585. STFD [C9 ] = f13, SIZE
  4586. FMA f20 = ALPHA_R, f73, f20
  4587. }
  4588. { .mmf
  4589. LDFD f36 = [C4 ], SIZE
  4590. LDFD f37 = [C12], SIZE
  4591. FMA f21 = ALPHA_R, f75, f21
  4592. }
  4593. ;;
  4594. { .mmf
  4595. STFD [C1 ] = f14, 5 * SIZE
  4596. STFD [C9 ] = f15, 5 * SIZE
  4597. FMA f22 = ALPHA_I, f73, f22
  4598. }
  4599. { .mmf
  4600. LDFD f38 = [C4 ], - 3 * SIZE
  4601. LDFD f39 = [C12], - 3 * SIZE
  4602. FMA f23 = ALPHA_I, f75, f23
  4603. }
  4604. ;;
  4605. { .mmf
  4606. STFD [C2 ] = f16, SIZE
  4607. STFD [C10] = f17, SIZE
  4608. FMA f24 = ALPHA_R, f80, f24
  4609. }
  4610. { .mmf
  4611. nop __LINE__
  4612. nop __LINE__
  4613. FMA f25 = ALPHA_R, f82, f25
  4614. }
  4615. ;;
  4616. { .mmf
  4617. STFD [C2 ] = f18, SIZE
  4618. STFD [C10] = f19, SIZE
  4619. FMA f26 = ALPHA_I, f80, f26
  4620. }
  4621. { .mmf
  4622. nop __LINE__
  4623. nop __LINE__
  4624. FMA f27 = ALPHA_I, f82, f27
  4625. }
  4626. ;;
  4627. { .mmf
  4628. STFD [C2 ] = f20, SIZE
  4629. STFD [C10] = f21, SIZE
  4630. FMA f28 = ALPHA_R, f81, f28
  4631. }
  4632. { .mmf
  4633. nop __LINE__
  4634. nop __LINE__
  4635. FMA f29 = ALPHA_R, f83, f29
  4636. }
  4637. ;;
  4638. { .mmf
  4639. STFD [C2 ] = f22, 5 * SIZE
  4640. STFD [C10] = f23, 5 * SIZE
  4641. FMA f30 = ALPHA_I, f81, f30
  4642. }
  4643. { .mmf
  4644. nop __LINE__
  4645. nop __LINE__
  4646. FMA f31 = ALPHA_I, f83, f31
  4647. }
  4648. ;;
  4649. { .mmf
  4650. STFD [C3 ] = f24, SIZE
  4651. STFD [C11] = f25, SIZE
  4652. FMA f32 = ALPHA_R, f88, f32
  4653. }
  4654. { .mmf
  4655. nop __LINE__
  4656. nop __LINE__
  4657. FMA f33 = ALPHA_R, f90, f33
  4658. }
  4659. ;;
  4660. { .mmf
  4661. STFD [C3 ] = f26, SIZE
  4662. STFD [C11] = f27, SIZE
  4663. FMA f34 = ALPHA_I, f88, f34
  4664. }
  4665. { .mmf
  4666. nop __LINE__
  4667. nop __LINE__
  4668. FMA f35 = ALPHA_I, f90, f35
  4669. }
  4670. ;;
  4671. { .mmf
  4672. STFD [C3 ] = f28, SIZE
  4673. STFD [C11] = f29, SIZE
  4674. FMA f36 = ALPHA_R, f89, f36
  4675. }
  4676. { .mmf
  4677. nop __LINE__
  4678. nop __LINE__
  4679. FMA f37 = ALPHA_R, f91, f37
  4680. }
  4681. ;;
  4682. { .mmf
  4683. STFD [C3 ] = f30, 5 * SIZE
  4684. STFD [C11] = f31, 5 * SIZE
  4685. FMA f38 = ALPHA_I, f89, f38
  4686. }
  4687. { .mmf
  4688. nop __LINE__
  4689. nop __LINE__
  4690. FMA f39 = ALPHA_I, f91, f39
  4691. }
  4692. ;;
  /*
   * Last column (C4/C12) stores, with the otherwise idle F slots used to
   * clear f64, f65, f72, f73, f80, f81, f88 and f89 -- the accumulators the
   * 2-row section at .L070 reads.
   */
  4693. { .mmf
  4694. STFD [C4 ] = f32, SIZE
  4695. STFD [C12] = f33, SIZE
  4696. mov f64 = f0
  4697. }
  4698. { .mmf
  4699. nop __LINE__
  4700. nop __LINE__
  4701. mov f72 = f0
  4702. }
  4703. ;;
  4704. { .mmf
  4705. STFD [C4 ] = f34, SIZE
  4706. STFD [C12] = f35, SIZE
  4707. mov f80 = f0
  4708. }
  4709. { .mmf
  4710. nop __LINE__
  4711. nop __LINE__
  4712. mov f88 = f0
  4713. }
  4714. ;;
  4715. { .mmf
  4716. STFD [C4 ] = f36, SIZE
  4717. STFD [C12] = f37, SIZE
  4718. mov f81 = f0
  4719. }
  4720. { .mmf
  4721. nop __LINE__
  4722. nop __LINE__
  4723. mov f65 = f0
  4724. }
  4725. ;;
  4726. { .mmf
  4727. STFD [C4 ] = f38, 5 * SIZE
  4728. STFD [C12] = f39, 5 * SIZE
  4729. mov f89 = f0
  4730. }
  4731. { .mmf
  4732. nop __LINE__
  4733. nop __LINE__
  4734. mov f73 = f0
  4735. }
  4736. ;;
  /*
   * .L070: 2-row remainder tile against four B values per k-step.
   *
   * Skipped (branch to .L080) when bit 1 of M is clear.  Accumulators:
   *     f64/f65 (B1), f72/f73 (B2), f80/f81 (B3), f88/f89 (B4)
   * None of them is cleared in this block; they are assumed to be zero on
   * entry.  Each accumulator updates two consecutive C slots:
   *     slot 0 += ALPHA_R * acc ;  slot 1 += ALPHA_I * acc
   * On exit C1-C4 have each advanced by 4 * SIZE, and only f64, f72, f80
   * and f88 have been re-zeroed.
   */
  4737. .align 32
  4738. .L070:
  4739. { .mib
  4740. nop __LINE__
  4741. tbit.z p6,p7 = M, 1
  4742. (p6) br.cond.dptk .L080
  4743. }
  4744. ;;
  /*
   * Prologue: preload B1..B4 into f48-f51 and A1/A2 into f32/f33.
   *   L   = ((K + 1) >> 1) - 1  -> ar.lc
   *   p12 = 1 when K + 1 is even (K odd);  p3 = 1 initially
   * The A load is predicated on p7, which is always set on this path
   * (p6 was clear, otherwise the branch above would have been taken).
   */
  4745. { .mmi
  4746. LDFPD f48, f49 = [B]
  4747. adds BOFFSET = 2 * SIZE, B
  4748. adds L = 1, K
  4749. }
  4750. ;;
  4751. { .mii
  4752. cmp.eq p3, p0 = r0, r0
  4753. tbit.z p12, p0 = L, 0
  4754. shr L = L, 1
  4755. }
  4756. ;;
  4757. { .mmi
  4758. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4759. adds L = -1, L
  4760. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  4761. }
  4762. ;;
  4763. { .mmi
  4764. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4765. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  4766. mov ar.lc = L
  4767. }
  4768. ;;
  /*
   * .L072: main loop, two k-steps per trip.
   *   step 1 (unconditional): A in f32/f33, B in f48-f51
   *   step 2 (under p3)     : A in f40/f41, B in f56-f59
   *   p4 = (L != 0): preload operands for the next trip
   *   p5 = (L == 0): final trip, load C from C1 (f6, f7, f10, f11) and
   *                  C2 (f12-f15); the last load of each uses -3 * SIZE so
   *                  the pointer ends where it started
   *   p3 = (L != 0) when p12 is set: for odd K the second k-step is
   *        dropped on the final trip
   */
  4769. .align 32
  4770. .L072:
  4771. { .mfb
  4772. lfetch.nt1 [PREA], 4 * SIZE
  4773. FMA f64 = f32, f48, f64 // A1 * B1
  4774. nop __LINE__
  4775. }
  4776. { .mfi
  4777. nop __LINE__
  4778. FMA f72 = f32, f49, f72 // A1 * B2
  4779. (p12) cmp.ne p3, p0 = 0, L
  4780. }
  4781. ;;
  4782. { .mfi
  4783. lfetch.nt1 [PREB], 8 * SIZE
  4784. FMA f80 = f32, f50, f80 // A1 * B3
  4785. cmp.ne p4, p5 = 0, L
  4786. }
  4787. { .mfb
  4788. nop __LINE__
  4789. FMA f88 = f32, f51, f88 // A1 * B4
  4790. nop __LINE__
  4791. }
  4792. ;;
  4793. { .mfi
  4794. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  4795. FMA f65 = f33, f48, f65 // A2 * B1
  4796. }
  4797. { .mfi
  4798. nop __LINE__
  4799. FMA f73 = f33, f49, f73 // A2 * B2
  4800. }
  4801. ;;
  4802. { .mfi
  4803. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  4804. FMA f81 = f33, f50, f81 // A2 * B3
  4805. }
  4806. { .mmf
  4807. (p5) LDFD f6 = [C1 ], SIZE
  4808. (p5) LDFD f12 = [C2 ], SIZE
  4809. FMA f89 = f33, f51, f89 // A2 * B4
  4810. }
  4811. ;;
  4812. { .mfb
  4813. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4814. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4815. nop __LINE__
  4816. }
  4817. { .mmf
  4818. (p5) LDFD f7 = [C1 ], SIZE
  4819. (p5) LDFD f13 = [C2 ], SIZE
  4820. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  4821. }
  4822. ;;
  4823. { .mfb
  4824. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  4825. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4826. nop __LINE__
  4827. }
  4828. { .mmf
  4829. (p5) LDFD f10 = [C1 ], SIZE
  4830. (p5) LDFD f14 = [C2 ], SIZE
  4831. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  4832. }
  4833. ;;
  4834. { .mfb
  4835. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  4836. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4837. nop __LINE__
  4838. }
  4839. { .mfb
  4840. (p5) LDFD f11 = [C1 ], - 3 * SIZE
  4841. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  4842. nop __LINE__
  4843. }
  4844. ;;
  4845. { .mfi
  4846. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4847. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4848. adds L = -1, L
  4849. }
  4850. { .mfb
  4851. (p5) LDFD f15 = [C2 ], - 3 * SIZE
  4852. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  4853. br.cloop.sptk.few .L072
  4854. }
  4855. ;;
  /*
   * Write-back.  C3 (f16-f19) and C4 (f20-f23) are loaded while the C1/C2
   * values are scaled:  even slots get ALPHA_R * acc, odd slots get
   * ALPHA_I * acc.  Each pointer then receives four stores with a SIZE
   * post-increment.
   */
  4856. { .mmf
  4857. LDFD f16 = [C3], SIZE
  4858. LDFD f20 = [C4], SIZE
  4859. FMA f6 = ALPHA_R, f64, f6
  4860. }
  4861. { .mmf
  4862. nop __LINE__
  4863. nop __LINE__
  4864. FMA f12 = ALPHA_R, f72, f12
  4865. }
  4866. ;;
  4867. { .mmf
  4868. LDFD f17 = [C3], SIZE
  4869. LDFD f21 = [C4], SIZE
  4870. FMA f7 = ALPHA_I, f64, f7
  4871. }
  4872. { .mmf
  4873. nop __LINE__
  4874. nop __LINE__
  4875. FMA f13 = ALPHA_I, f72, f13
  4876. }
  4877. ;;
  4878. { .mmf
  4879. LDFD f18 = [C3], SIZE
  4880. LDFD f22 = [C4], SIZE
  4881. FMA f10 = ALPHA_R, f65, f10
  4882. }
  4883. { .mmf
  4884. nop __LINE__
  4885. nop __LINE__
  4886. FMA f14 = ALPHA_R, f73, f14
  4887. }
  4888. ;;
  4889. { .mmf
  4890. LDFD f19 = [C3], - 3 * SIZE
  4891. LDFD f23 = [C4], - 3 * SIZE
  4892. FMA f11 = ALPHA_I, f65, f11
  4893. }
  4894. { .mmf
  4895. nop __LINE__
  4896. nop __LINE__
  4897. FMA f15 = ALPHA_I, f73, f15
  4898. }
  4899. ;;
  4900. { .mmf
  4901. STFD [C1] = f6, SIZE
  4902. STFD [C2] = f12, SIZE
  4903. FMA f16 = ALPHA_R, f80, f16
  4904. }
  4905. { .mmf
  4906. nop __LINE__
  4907. nop __LINE__
  4908. FMA f20 = ALPHA_R, f88, f20
  4909. }
  4910. ;;
  4911. { .mmf
  4912. STFD [C1] = f7, SIZE
  4913. STFD [C2] = f13, SIZE
  4914. FMA f17 = ALPHA_I, f80, f17
  4915. }
  4916. { .mmf
  4917. nop __LINE__
  4918. nop __LINE__
  4919. FMA f21 = ALPHA_I, f88, f21
  4920. }
  4921. ;;
  4922. { .mmf
  4923. STFD [C1] = f10, SIZE
  4924. STFD [C2] = f14, SIZE
  4925. FMA f18 = ALPHA_R, f81, f18
  4926. }
  4927. { .mmf
  4928. nop __LINE__
  4929. nop __LINE__
  4930. FMA f22 = ALPHA_R, f89, f22
  4931. }
  4932. ;;
  4933. { .mmf
  4934. STFD [C1] = f11, SIZE
  4935. STFD [C2] = f15, SIZE
  4936. FMA f19 = ALPHA_I, f81, f19
  4937. }
  4938. { .mmf
  4939. nop __LINE__
  4940. nop __LINE__
  4941. FMA f23 = ALPHA_I, f89, f23
  4942. }
  4943. ;;
  /*
   * C3/C4 stores; the F slots clear f64, f72, f80 and f88, which are the
   * only accumulators the 1-row section at .L080 uses.
   */
  4944. { .mmf
  4945. STFD [C3] = f16, SIZE
  4946. STFD [C4] = f20, SIZE
  4947. mov f64 = f0
  4948. }
  4949. ;;
  4950. { .mmf
  4951. STFD [C3] = f17, SIZE
  4952. STFD [C4] = f21, SIZE
  4953. mov f72 = f0
  4954. }
  4955. ;;
  4956. { .mmf
  4957. STFD [C3] = f18, SIZE
  4958. STFD [C4] = f22, SIZE
  4959. mov f80 = f0
  4960. }
  4961. ;;
  4962. { .mmf
  4963. STFD [C3] = f19, SIZE
  4964. STFD [C4] = f23, SIZE
  4965. mov f88 = f0
  4966. }
  4967. ;;
  /*
   * .L080: single-row remainder against four B values per k-step.
   *
   * Skipped (branch to .L089) when bit 0 of M is clear.  Accumulators are
   * f64 (B1), f72 (B2), f80 (B3) and f88 (B4); they are not cleared here
   * and are assumed to be zero on entry.  Each accumulator updates two
   * consecutive slots of its C pointer (C1..C4):
   *     slot 0 += ALPHA_R * acc ;  slot 1 += ALPHA_I * acc
   * On exit C1-C4 have each advanced by 2 * SIZE.
   */
  4968. .align 32
  4969. .L080:
  4970. { .mib
  4971. nop __LINE__
  4972. tbit.z p6,p7 = M, 0
  4973. (p6) br.cond.dptk .L089
  4974. }
  4975. ;;
  /*
   * Prologue: preload B1..B4 (f48-f51) and one A value (f32).
   *   L   = ((K + 1) >> 1) - 1  -> ar.lc
   *   p12 = 1 when K + 1 is even (K odd);  p3 = 1 initially
   */
  4976. { .mmi
  4977. LDFPD f48, f49 = [B]
  4978. adds BOFFSET = 2 * SIZE, B
  4979. adds L = 1, K
  4980. }
  4981. ;;
  4982. { .mii
  4983. LDFD f32 = [AOFFSET], 1 * SIZE
  4984. tbit.z p12, p0 = L, 0
  4985. shr L = L, 1
  4986. }
  4987. ;;
  4988. { .mmi
  4989. nop __LINE__
  4990. nop __LINE__
  4991. adds L = -1, L
  4992. }
  4993. ;;
  4994. { .mmi
  4995. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4996. cmp.eq p3, p0 = r0, r0
  4997. mov ar.lc = L
  4998. }
  4999. ;;
  /*
   * .L082: main loop, two k-steps per trip.
   *   step 1 (unconditional): A in f32, B in f48-f51
   *   step 2 (under p3)     : A in f40, B in f56-f59
   *   p4 = (L != 0): preload operands for the next trip
   *   p5 = (L == 0): final trip, load C1 (f6, f7) and C2 (f10, f11); the
   *                  second load of each pair steps the pointer back by SIZE
   *   p3 = (L != 0) when p12 is set: for odd K the second k-step is
   *        dropped on the final trip
   */
  5000. .align 32
  5001. .L082:
  5002. { .mfb
  5003. cmp.ne p4, p5 = 0, L
  5004. FMA f64 = f32, f48, f64 // A1 * B1
  5005. nop __LINE__
  5006. }
  5007. { .mfi
  5008. (p12) cmp.ne p3, p0 = 0, L
  5009. FMA f72 = f32, f49, f72 // A1 * B2
  5010. nop __LINE__
  5011. }
  5012. ;;
  5013. { .mfb
  5014. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5015. FMA f80 = f32, f50, f80 // A1 * B3
  5016. nop __LINE__
  5017. }
  5018. { .mfb
  5019. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  5020. FMA f88 = f32, f51, f88 // A1 * B4
  5021. nop __LINE__
  5022. }
  5023. ;;
  5024. { .mfb
  5025. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5026. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5027. nop __LINE__
  5028. }
  5029. { .mmf
  5030. (p5) LDFD f6 = [C1], SIZE
  5031. (p5) LDFD f10 = [C2], SIZE
  5032. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5033. }
  5034. ;;
  5035. { .mmf
  5036. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5037. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  5038. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5039. }
  5040. { .mmf
  5041. (p5) LDFD f7 = [C1], -SIZE
  5042. (p5) LDFD f11 = [C2], -SIZE
  5043. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  5044. }
  5045. ;;
  5046. { .mib
  5047. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5048. adds L = -1, L
  5049. br.cloop.sptk.few .L082
  5050. }
  5051. ;;
  /*
   * Write-back: load C3 (f12, f13) and C4 (f14, f15), apply
   * ALPHA_R / ALPHA_I to each accumulator, then store two values per
   * C pointer.
   */
  5052. { .mmf
  5053. LDFD f12 = [C3], SIZE
  5054. LDFD f14 = [C4], SIZE
  5055. FMA f6 = ALPHA_R, f64, f6
  5056. }
  5057. { .mmf
  5058. nop __LINE__
  5059. nop __LINE__
  5060. FMA f10 = ALPHA_R, f72, f10
  5061. }
  5062. ;;
  5063. { .mmf
  5064. LDFD f13 = [C3], -SIZE
  5065. LDFD f15 = [C4], -SIZE
  5066. FMA f7 = ALPHA_I, f64, f7
  5067. }
  5068. { .mmf
  5069. nop __LINE__
  5070. nop __LINE__
  5071. FMA f11 = ALPHA_I, f72, f11
  5072. }
  5073. ;;
  5074. { .mmf
  5075. nop __LINE__
  5076. nop __LINE__
  5077. FMA f12 = ALPHA_R, f80, f12
  5078. }
  5079. { .mmf
  5080. nop __LINE__
  5081. nop __LINE__
  5082. FMA f14 = ALPHA_R, f88, f14
  5083. }
  5084. ;;
  5085. { .mmf
  5086. nop __LINE__
  5087. nop __LINE__
  5088. FMA f13 = ALPHA_I, f80, f13
  5089. }
  5090. { .mmf
  5091. nop __LINE__
  5092. nop __LINE__
  5093. FMA f15 = ALPHA_I, f88, f15
  5094. }
  5095. ;;
  5096. { .mmi
  5097. STFD [C1] = f6, SIZE
  5098. STFD [C2] = f10, SIZE
  5099. nop __LINE__
  5100. }
  5101. ;;
  5102. { .mmi
  5103. STFD [C1] = f7, SIZE
  5104. STFD [C2] = f11, SIZE
  5105. nop __LINE__
  5106. }
  5107. ;;
  5108. { .mmi
  5109. STFD [C3] = f12, SIZE
  5110. STFD [C4] = f14, SIZE
  5111. nop __LINE__
  5112. }
  5113. ;;
  5114. { .mmi
  5115. STFD [C3] = f13, SIZE
  5116. STFD [C4] = f15, SIZE
  5117. nop __LINE__
  5118. }
  5119. ;;
  /*
   * .L089: hand-off at the end of the preceding section.
   * B is moved up to the current BOFFSET (the position reached by the
   * post-incrementing B loads) and AOFFSET is reset to A, so the section
   * that follows starts again from the beginning of A with the next part
   * of B.  Execution falls through to .L090.
   */
  5120. .align 32
  5121. .L089:
  5122. { .mmi
  5123. mov B = BOFFSET
  5124. mov AOFFSET = A
  5125. nop __LINE__
  5126. }
  5127. ;;
  /*
   * .L090: setup and dispatch for the two-column section.
   *
   *   - If bit 1 of N is clear the whole section is skipped (branch to
   *     .L130).
   *   - C1 = C and C2 = C + LDC become the two column pointers; C itself is
   *     then advanced by 2 * LDC (shladd) for whatever follows.
   *   - I = M >> 3 is the number of 8-row tiles; when it is zero control
   *     goes straight to the remainder handling at .L100.
   *   - AOFFSET is reset to A and accumulators f64-f67 and f72-f74 are
   *     cleared (f66 via setf.d from r0, the others by copying f0).
   */
  5128. .align 16
  5129. .L090:
  5130. { .mfi
  5131. mov C1 = C
  5132. mov f64 = f0
  5133. tbit.z p6, p0 = N, 1
  5134. }
  5135. { .mfi
  5136. add C2 = LDC, C
  5137. mov f72 = f0
  5138. shr I = M, 3
  5139. }
  5140. ;;
  5141. { .mfi
  5142. setf.d f66 = r0
  5143. mov f65 = f0
  5144. nop __LINE__
  5145. }
  5146. { .mfb
  5147. mov AOFFSET = A
  5148. mov f73 = f0
  5149. (p6) br.cond.dpnt .L130
  5150. }
  5151. ;;
  5152. { .mfi
  5153. nop __LINE__
  5154. mov f67 = f0
  5155. shladd C = LDC, 1, C
  5156. }
  5157. { .mfb
  5158. cmp.eq p6, p7 = 0, I
  5159. mov f74 = f0
  5160. (p6) br.cond.dpnt .L100
  5161. }
  5162. ;;
  /*
   * .L092: one 8-row tile against two B values per k-step; repeated I times.
   *
   * Accumulators: f64-f71 (A1..A8 * B1) and f72-f79 (A1..A8 * B2).
   * f68-f71 and f75-f79 are cleared in the prologue below; f64-f67 and
   * f72-f74 are expected to be zero on entry (the tail of this block clears
   * them again before looping back).
   *
   * After the k-loop every accumulator updates two consecutive C slots:
   *     slot 0 += ALPHA_R * acc ;  slot 1 += ALPHA_I * acc
   * Column 1 is written through C1/C9 and column 2 through C2/C10, where
   * C9/C10 point 4 * SIZE past C1/C2.  C1 and C2 advance by 16 * SIZE per
   * tile.
   */
  5163. .align 32
  5164. .L092:
  /*
   * Prologue: preload B1/B2 (f48/f49) and A1..A8 (f32-f39), issue two
   * CPREFETCHes starting at C1 + CPREFETCHSIZE * SIZE (stepping by LDC),
   * and compute the trip count:
   *   L   = ((K + 1) >> 1) - 1  -> ar.lc
   *   p12 = 1 when K + 1 is even (K odd);  p3 = 1 initially
   */
  5165. { .mfb
  5166. LDFPD f48, f49 = [B]
  5167. mov f68 = f0
  5168. nop __LINE__
  5169. }
  5170. { .mfb
  5171. adds BOFFSET = 2 * SIZE, B
  5172. mov f79 = f0
  5173. nop __LINE__
  5174. }
  5175. ;;
  5176. { .mfi
  5177. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5178. mov f75 = f0
  5179. nop __LINE__
  5180. }
  5181. ;;
  5182. { .mfi
  5183. adds PREC = CPREFETCHSIZE * SIZE, C1
  5184. mov f76 = f0
  5185. adds L = 1, K
  5186. }
  5187. ;;
  5188. { .mfi
  5189. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5190. mov f69 = f0
  5191. tbit.z p12, p0 = L, 0
  5192. }
  5193. { .mfi
  5194. cmp.eq p3, p0 = r0, r0
  5195. mov f77 = f0
  5196. shr L = L, 1
  5197. }
  5198. ;;
  5199. { .mfi
  5200. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5201. adds L = -1, L
  5202. }
  5203. { .mmf
  5204. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  5205. CPREFETCH [PREC], LDC
  5206. mov f70 = f0
  5207. }
  5208. ;;
  5209. { .mfi
  5210. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  5211. mov f78 = f0
  5212. mov ar.lc = L
  5213. }
  5214. { .mfi
  5215. CPREFETCH [PREC]
  5216. mov f71 = f0
  5217. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5218. }
  5219. ;;
  /*
   * .L093: main loop, two k-steps per trip.
   *   step 1 (unconditional): A in f32-f39, B in f48/f49
   *   step 2 (under p3)     : A in f40-f47, B in f56/f57
   *   p4 = (L != 0): preload operands for the next trip
   *   p5 = (L == 0): final trip, load the 16 column-1 values of C through
   *                  C1/C9 into f6-f23; the closing -11 * SIZE
   *                  post-increments return C1/C9 to their starting points
   *   p3 = (L != 0) when p12 is set: for odd K the second k-step is
   *        dropped on the final trip
   * C9 and C10 are recomputed from C1/C2 on every trip.
   * NOTE(review): C11 and C12 are also recomputed here from C3/C4, but no
   * load or store in this block uses them -- confirm whether later code
   * depends on these values before touching those two instructions.
   */
  5220. .align 32
  5221. .L093:
  5222. /* 1 */
  5223. { .mfi
  5224. lfetch.nt1 [PREA], 16 * SIZE
  5225. FMA f64 = f32, f48, f64 // A1 * B1
  5226. cmp.ne p4, p5 = 0, L
  5227. }
  5228. { .mfi
  5229. nop __LINE__
  5230. FMA f72 = f32, f49, f72 // A1 * B2
  5231. (p12) cmp.ne p3, p0 = 0, L
  5232. }
  5233. ;;
  5234. { .mfi
  5235. lfetch.nt1 [PREB], 4 * SIZE
  5236. FMA f65 = f33, f48, f65 // A2 * B1
  5237. adds C9 = 4 * SIZE, C1
  5238. }
  5239. { .mfi
  5240. nop __LINE__
  5241. FMA f73 = f33, f49, f73 // A2 * B2
  5242. adds C10 = 4 * SIZE, C2
  5243. }
  5244. ;;
  5245. { .mfi
  5246. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5247. FMA f66 = f34, f48, f66 // A3 * B1
  5248. adds C11 = 4 * SIZE, C3
  5249. }
  5250. { .mfi
  5251. nop __LINE__
  5252. FMA f74 = f34, f49, f74 // A3 * B2
  5253. adds C12 = 4 * SIZE, C4
  5254. }
  5255. ;;
  5256. { .mfb
  5257. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5258. FMA f67 = f35, f48, f67 // A4 * B1
  5259. nop __LINE__
  5260. }
  5261. { .mfb
  5262. (p5) LDFD f6 = [C1 ], SIZE
  5263. FMA f75 = f35, f49, f75 // A4 * B2
  5264. nop __LINE__
  5265. }
  5266. ;;
  5267. { .mfb
  5268. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5269. FMA f68 = f36, f48, f68 // A5 * B1
  5270. nop __LINE__
  5271. }
  5272. { .mfb
  5273. (p5) LDFD f7 = [C9 ], SIZE
  5274. FMA f76 = f36, f49, f76 // A5 * B2
  5275. nop __LINE__
  5276. }
  5277. ;;
  5278. { .mfb
  5279. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  5280. FMA f69 = f37, f48, f69 // A6 * B1
  5281. nop __LINE__
  5282. }
  5283. { .mfb
  5284. (p5) LDFD f10 = [C1 ], SIZE
  5285. FMA f77 = f37, f49, f77 // A6 * B2
  5286. nop __LINE__
  5287. }
  5288. ;;
  5289. { .mfb
  5290. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  5291. FMA f70 = f38, f48, f70 // A7 * B1
  5292. nop __LINE__
  5293. }
  5294. { .mfb
  5295. (p5) LDFD f11 = [C9 ], SIZE
  5296. FMA f78 = f38, f49, f78 // A7 * B2
  5297. nop __LINE__
  5298. }
  5299. ;;
  5300. { .mfb
  5301. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5302. FMA f71 = f39, f48, f71 // A8 * B1
  5303. nop __LINE__
  5304. }
  5305. { .mfb
  5306. (p5) LDFD f12 = [C1 ], SIZE
  5307. FMA f79 = f39, f49, f79 // A8 * B2
  5308. nop __LINE__
  5309. }
  5310. ;;
  /* Second k-step of the trip (all FMAs predicated on p3). */
  5311. { .mfb
  5312. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5313. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5314. nop __LINE__
  5315. }
  5316. { .mfb
  5317. (p5) LDFD f13 = [C9 ], SIZE
  5318. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5319. nop __LINE__
  5320. }
  5321. ;;
  5322. { .mfb
  5323. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5324. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5325. nop __LINE__
  5326. }
  5327. { .mfb
  5328. (p5) LDFD f14 = [C1 ], 5 * SIZE
  5329. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5330. nop __LINE__
  5331. }
  5332. ;;
  5333. { .mfb
  5334. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  5335. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  5336. nop __LINE__
  5337. }
  5338. { .mfb
  5339. (p5) LDFD f15 = [C9 ], 5 * SIZE
  5340. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  5341. nop __LINE__
  5342. }
  5343. ;;
  5344. { .mfb
  5345. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  5346. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  5347. nop __LINE__
  5348. }
  5349. { .mfb
  5350. nop __LINE__
  5351. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  5352. nop __LINE__
  5353. }
  5354. ;;
  5355. { .mfb
  5356. (p5) LDFD f16 = [C1 ], SIZE
  5357. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  5358. nop __LINE__
  5359. }
  5360. { .mfb
  5361. (p5) LDFD f17 = [C9 ], SIZE
  5362. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  5363. nop __LINE__
  5364. }
  5365. ;;
  5366. { .mfb
  5367. (p5) LDFD f18 = [C1 ], SIZE
  5368. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  5369. nop __LINE__
  5370. }
  5371. { .mfb
  5372. (p5) LDFD f19 = [C9 ], SIZE
  5373. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  5374. nop __LINE__
  5375. }
  5376. ;;
  5377. { .mfb
  5378. (p5) LDFD f20 = [C1 ], SIZE
  5379. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  5380. nop __LINE__
  5381. }
  5382. { .mfb
  5383. (p5) LDFD f21 = [C9 ], SIZE
  5384. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  5385. nop __LINE__
  5386. }
  5387. ;;
  5388. { .mfi
  5389. (p5) LDFD f22 = [C1 ], -11 * SIZE
  5390. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  5391. adds L = -1, L
  5392. }
  5393. { .mfb
  5394. (p5) LDFD f23 = [C9 ], -11 * SIZE
  5395. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  5396. br.cloop.sptk.few .L093
  5397. }
  5398. ;;
  /*
   * Write-back.  Column-2 values are loaded through C2/C10 into f24-f39
   * (the A operand registers f32-f39 are free by now) while column 1 is
   * scaled and stored.  Register pairing used below:
   *   C1 : f64, f65 then f68, f69      C9 : f66, f67 then f70, f71
   *   C2 : f72, f73 then f76, f77      C10: f74, f75 then f78, f79
   * Each group of four stores uses post-increments SIZE, SIZE, SIZE,
   * 5 * SIZE.
   */
  5399. { .mmf
  5400. LDFD f24 = [C2 ], SIZE
  5401. LDFD f25 = [C10], SIZE
  5402. FMA f6 = ALPHA_R, f64, f6
  5403. }
  5404. { .mmf
  5405. nop __LINE__
  5406. nop __LINE__
  5407. FMA f7 = ALPHA_R, f66, f7
  5408. }
  5409. ;;
  5410. { .mmf
  5411. LDFD f26 = [C2 ], SIZE
  5412. LDFD f27 = [C10], SIZE
  5413. FMA f10 = ALPHA_I, f64, f10
  5414. }
  5415. { .mmf
  5416. nop __LINE__
  5417. nop __LINE__
  5418. FMA f11 = ALPHA_I, f66, f11
  5419. }
  5420. ;;
  5421. { .mmf
  5422. LDFD f28 = [C2 ], SIZE
  5423. LDFD f29 = [C10], SIZE
  5424. FMA f12 = ALPHA_R, f65, f12
  5425. }
  5426. { .mmf
  5427. nop __LINE__
  5428. nop __LINE__
  5429. FMA f13 = ALPHA_R, f67, f13
  5430. }
  5431. ;;
  5432. { .mmf
  5433. LDFD f30 = [C2 ], 5 * SIZE
  5434. LDFD f31 = [C10], 5 * SIZE
  5435. FMA f14 = ALPHA_I, f65, f14
  5436. }
  5437. { .mmf
  5438. nop __LINE__
  5439. nop __LINE__
  5440. FMA f15 = ALPHA_I, f67, f15
  5441. }
  5442. ;;
  5443. { .mmf
  5444. STFD [C1 ] = f6, SIZE
  5445. STFD [C9 ] = f7, SIZE
  5446. FMA f16 = ALPHA_R, f68, f16
  5447. }
  5448. { .mmf
  5449. LDFD f32 = [C2 ], SIZE
  5450. LDFD f33 = [C10], SIZE
  5451. FMA f17 = ALPHA_R, f70, f17
  5452. }
  5453. ;;
  5454. { .mmf
  5455. STFD [C1 ] = f10, SIZE
  5456. STFD [C9 ] = f11, SIZE
  5457. FMA f18 = ALPHA_I, f68, f18
  5458. }
  5459. { .mmf
  5460. LDFD f34 = [C2 ], SIZE
  5461. LDFD f35 = [C10], SIZE
  5462. FMA f19 = ALPHA_I, f70, f19
  5463. }
  5464. ;;
  5465. { .mmf
  5466. STFD [C1 ] = f12, SIZE
  5467. STFD [C9 ] = f13, SIZE
  5468. FMA f20 = ALPHA_R, f69, f20
  5469. }
  5470. { .mmf
  5471. LDFD f36 = [C2 ], SIZE
  5472. LDFD f37 = [C10], SIZE
  5473. FMA f21 = ALPHA_R, f71, f21
  5474. }
  5475. ;;
  5476. { .mmf
  5477. STFD [C1 ] = f14, 5 * SIZE
  5478. STFD [C9 ] = f15, 5 * SIZE
  5479. FMA f22 = ALPHA_I, f69, f22
  5480. }
  5481. { .mmf
  5482. LDFD f38 = [C2 ], - 11 * SIZE
  5483. LDFD f39 = [C10], - 11 * SIZE
  5484. FMA f23 = ALPHA_I, f71, f23
  5485. }
  5486. ;;
  5487. { .mmf
  5488. STFD [C1 ] = f16, SIZE
  5489. STFD [C9 ] = f17, SIZE
  5490. FMA f24 = ALPHA_R, f72, f24
  5491. }
  5492. { .mmf
  5493. nop __LINE__
  5494. nop __LINE__
  5495. FMA f25 = ALPHA_R, f74, f25
  5496. }
  5497. ;;
  5498. { .mmf
  5499. STFD [C1 ] = f18, SIZE
  5500. STFD [C9 ] = f19, SIZE
  5501. FMA f26 = ALPHA_I, f72, f26
  5502. }
  5503. { .mmf
  5504. nop __LINE__
  5505. nop __LINE__
  5506. FMA f27 = ALPHA_I, f74, f27
  5507. }
  5508. ;;
  5509. { .mmf
  5510. STFD [C1 ] = f20, SIZE
  5511. STFD [C9 ] = f21, SIZE
  5512. FMA f28 = ALPHA_R, f73, f28
  5513. }
  5514. { .mmf
  5515. nop __LINE__
  5516. nop __LINE__
  5517. FMA f29 = ALPHA_R, f75, f29
  5518. }
  5519. ;;
  5520. { .mmf
  5521. STFD [C1 ] = f22, 5 * SIZE
  5522. STFD [C9 ] = f23, 5 * SIZE
  5523. FMA f30 = ALPHA_I, f73, f30
  5524. }
  5525. { .mmf
  5526. nop __LINE__
  5527. nop __LINE__
  5528. FMA f31 = ALPHA_I, f75, f31
  5529. }
  5530. ;;
  5531. { .mmf
  5532. STFD [C2 ] = f24, SIZE
  5533. STFD [C10] = f25, SIZE
  5534. FMA f32 = ALPHA_R, f76, f32
  5535. }
  5536. { .mmf
  5537. nop __LINE__
  5538. nop __LINE__
  5539. FMA f33 = ALPHA_R, f78, f33
  5540. }
  5541. ;;
  5542. { .mmf
  5543. STFD [C2 ] = f26, SIZE
  5544. STFD [C10] = f27, SIZE
  5545. FMA f34 = ALPHA_I, f76, f34
  5546. }
  5547. { .mmf
  5548. nop __LINE__
  5549. nop __LINE__
  5550. FMA f35 = ALPHA_I, f78, f35
  5551. }
  5552. ;;
  5553. { .mmf
  5554. STFD [C2 ] = f28, SIZE
  5555. STFD [C10] = f29, SIZE
  5556. FMA f36 = ALPHA_R, f77, f36
  5557. }
  5558. { .mmf
  5559. nop __LINE__
  5560. nop __LINE__
  5561. FMA f37 = ALPHA_R, f79, f37
  5562. }
  5563. ;;
  5564. { .mmf
  5565. STFD [C2 ] = f30, 5 * SIZE
  5566. STFD [C10] = f31, 5 * SIZE
  5567. FMA f38 = ALPHA_I, f77, f38
  5568. }
  5569. { .mmf
  5570. nop __LINE__
  5571. nop __LINE__
  5572. FMA f39 = ALPHA_I, f79, f39
  5573. }
  5574. ;;
  /*
   * Final column-2 stores.  The free F slots clear f64-f67 and f72-f75 for
   * the next tile, and p6 = (I != 1) is evaluated before I is decremented,
   * so the branch back to .L092 is taken while tiles remain.
   */
  5575. { .mmf
  5576. STFD [C2 ] = f32, SIZE
  5577. STFD [C10] = f33, SIZE
  5578. mov f64 = f0
  5579. }
  5580. { .mmf
  5581. cmp.ne p6, p0 = 1, I
  5582. nop __LINE__
  5583. mov f72 = f0
  5584. }
  5585. ;;
  5586. { .mmf
  5587. STFD [C2 ] = f34, SIZE
  5588. STFD [C10] = f35, SIZE
  5589. mov f65 = f0
  5590. }
  5591. { .mmf
  5592. nop __LINE__
  5593. nop __LINE__
  5594. mov f73 = f0
  5595. }
  5596. ;;
  5597. { .mmf
  5598. STFD [C2 ] = f36, SIZE
  5599. STFD [C10] = f37, SIZE
  5600. mov f66 = f0
  5601. }
  5602. { .mmf
  5603. nop __LINE__
  5604. nop __LINE__
  5605. mov f74 = f0
  5606. }
  5607. ;;
  5608. { .mmf
  5609. STFD [C2 ] = f38, 5 * SIZE
  5610. STFD [C10] = f39, 5 * SIZE
  5611. mov f67 = f0
  5612. }
  5613. { .mfb
  5614. adds I = -1, I
  5615. mov f75 = f0
  5616. (p6) br.cond.dptk .L092
  5617. }
  5618. ;;
  // .L100: handles the case where bit 2 of M is set, using two B values per
  // k step. Each k step multiplies four A values (f32-f35) by the B pair
  // (f48, f49) and accumulates into f64-f67 (first B value) and f72-f75
  // (second B value). After the loop every accumulator is folded into two
  // consecutive elements of C, once scaled by ALPHA_R and once by ALPHA_I;
  // f64-f67 go out through C1/C9 and f72-f75 through C2/C10.
  // FMA, LDFD, LDFPD, STFD, SIZE and PREFETCHSIZE are macros defined outside
  // this chunk; presumably "FMA d = a, b, c" computes a * b + c -- confirm.
  5619. .align 32
  5620. .L100:
  5621. { .mib
  5622. nop __LINE__
  5623. tbit.z p6, p7 = M, 2 // p6 = bit 2 of M is clear, p7 = it is set
  5624. (p6) br.cond.dptk .L110 // nothing to do here when the bit is clear
  5625. }
  5626. ;;
  5627. { .mmf
  5628. LDFPD f48, f49 = [B] // first pair of B values
  5629. adds BOFFSET = 2 * SIZE, B
  5630. mov f75 = f0
  5631. }
  5632. { .mii
  5633. nop __LINE__
  5634. adds L = 1, K
  5635. }
  5636. ;;
  5637. { .mii
  5638. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5639. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  5640. shr L = L, 1 // L = (K + 1) / 2 loop passes, two k steps per pass
  5641. }
  5642. ;;
  5643. { .mmi
  5644. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5645. nop __LINE__
  5646. adds L = -1, L // L now counts the passes remaining after the current one
  5647. }
  5648. ;;
  5649. { .mmi
  5650. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5651. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  5652. mov ar.lc = L
  5653. }
  5654. ;;
  5655. .align 32
  // .L102: inner loop. First half uses f32-f35 / f48-f49, second half (under
  // p3) uses f40-f43 / f56-f57. p4 guards the preload for the next pass, p5
  // (last pass only) starts loading the C values needed after the loop.
  5656. .L102:
  5657. { .mfi
  5658. lfetch.nt1 [PREA], 8 * SIZE
  5659. FMA f64 = f32, f48, f64 // A1 * B1
  5660. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = this is the last pass
  5661. }
  5662. { .mfi
  5663. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5664. FMA f72 = f32, f49, f72 // A1 * B2
  5665. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  5666. }
  5667. ;;
  5668. { .mfi
  5669. lfetch.nt1 [PREB], 4 * SIZE
  5670. FMA f65 = f33, f48, f65 // A2 * B1
  5671. adds C9 = 4 * SIZE, C1 // C9 runs 4 elements ahead of C1
  5672. }
  5673. { .mfi
  5674. nop __LINE__
  5675. FMA f73 = f33, f49, f73 // A2 * B2
  5676. adds C10 = 4 * SIZE, C2 // C10 runs 4 elements ahead of C2
  5677. }
  5678. ;;
  5679. { .mfb
  5680. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5681. FMA f66 = f34, f48, f66 // A3 * B1
  5682. nop __LINE__
  5683. }
  5684. { .mfb
  5685. (p5) LDFD f6 = [C1 ], SIZE
  5686. FMA f74 = f34, f49, f74 // A3 * B2
  5687. nop __LINE__
  5688. }
  5689. ;;
  5690. { .mfb
  5691. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5692. FMA f67 = f35, f48, f67 // A4 * B1
  5693. nop __LINE__
  5694. }
  5695. { .mfb
  5696. (p5) LDFD f7 = [C9 ], SIZE
  5697. FMA f75 = f35, f49, f75 // A4 * B2
  5698. nop __LINE__
  5699. }
  5700. ;;
  5701. { .mfb
  5702. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5703. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5704. nop __LINE__
  5705. }
  5706. { .mfb
  5707. (p5) LDFD f10 = [C1 ], SIZE
  5708. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5709. nop __LINE__
  5710. }
  5711. ;;
  5712. { .mfb
  5713. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5714. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5715. nop __LINE__
  5716. }
  5717. { .mfb
  5718. (p5) LDFD f11 = [C9 ], SIZE
  5719. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5720. nop __LINE__
  5721. }
  5722. ;;
  5723. { .mfb
  5724. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5725. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  5726. nop __LINE__
  5727. }
  5728. { .mfb
  5729. (p5) LDFD f12 = [C1], SIZE
  5730. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  5731. nop __LINE__
  5732. }
  5733. ;;
  5734. { .mfi
  5735. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5736. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  5737. adds L = -1, L
  5738. }
  5739. { .mfb
  5740. (p5) LDFD f13 = [C9], SIZE
  5741. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  5742. br.cloop.sptk.few .L102
  5743. }
  5744. ;;
  // Update phase: finish loading C, apply ALPHA_R / ALPHA_I, store back.
  // The "- 3 * SIZE" post-increments rewind C1/C9 (and later C2/C10) to the
  // first element they loaded so the stores overwrite the same locations.
  5745. { .mmf
  5746. LDFD f14 = [C1], - 3 * SIZE
  5747. LDFD f15 = [C9], - 3 * SIZE
  5748. FMA f6 = ALPHA_R, f64, f6
  5749. }
  5750. { .mmf
  5751. nop __LINE__
  5752. nop __LINE__
  5753. FMA f7 = ALPHA_R, f66, f7
  5754. }
  5755. ;;
  5756. { .mmf
  5757. LDFD f16 = [C2 ], SIZE
  5758. LDFD f17 = [C10], SIZE
  5759. FMA f10 = ALPHA_I, f64, f10
  5760. }
  5761. { .mmf
  5762. nop __LINE__
  5763. nop __LINE__
  5764. FMA f11 = ALPHA_I, f66, f11
  5765. }
  5766. ;;
  5767. { .mmf
  5768. LDFD f18 = [C2 ], SIZE
  5769. LDFD f19 = [C10], SIZE
  5770. FMA f12 = ALPHA_R, f65, f12
  5771. }
  5772. { .mmf
  5773. nop __LINE__
  5774. nop __LINE__
  5775. FMA f13 = ALPHA_R, f67, f13
  5776. }
  5777. ;;
  5778. { .mmf
  5779. LDFD f20 = [C2 ], SIZE
  5780. LDFD f21 = [C10], SIZE
  5781. FMA f14 = ALPHA_I, f65, f14
  5782. }
  5783. { .mmf
  5784. nop __LINE__
  5785. nop __LINE__
  5786. FMA f15 = ALPHA_I, f67, f15
  5787. }
  5788. ;;
  5789. { .mmf
  5790. STFD [C1 ] = f6, SIZE
  5791. STFD [C9 ] = f7, SIZE
  5792. FMA f16 = ALPHA_R, f72, f16
  5793. }
  5794. { .mmf
  5795. LDFD f22 = [C2 ], - 3 * SIZE
  5796. LDFD f23 = [C10], - 3 * SIZE
  5797. FMA f17 = ALPHA_R, f74, f17
  5798. }
  5799. ;;
  5800. { .mmf
  5801. STFD [C1 ] = f10, SIZE
  5802. STFD [C9 ] = f11, SIZE
  5803. FMA f18 = ALPHA_I, f72, f18
  5804. }
  5805. { .mmf
  5806. nop __LINE__
  5807. nop __LINE__
  5808. FMA f19 = ALPHA_I, f74, f19
  5809. }
  5810. ;;
  5811. { .mmf
  5812. STFD [C1 ] = f12, SIZE
  5813. STFD [C9 ] = f13, SIZE
  5814. FMA f20 = ALPHA_R, f73, f20
  5815. }
  5816. { .mmf
  5817. nop __LINE__
  5818. nop __LINE__
  5819. FMA f21 = ALPHA_R, f75, f21
  5820. }
  5821. ;;
  5822. { .mmf
  5823. STFD [C1 ] = f14, 5 * SIZE // 3 + 5: C1 ends 8 elements past its start
  5824. STFD [C9 ] = f15, 5 * SIZE
  5825. FMA f22 = ALPHA_I, f73, f22
  5826. }
  5827. { .mmf
  5828. nop __LINE__
  5829. nop __LINE__
  5830. FMA f23 = ALPHA_I, f75, f23
  5831. }
  5832. ;;
  // Stores to the second C pointer pair; f64, f65, f72 and f73 are cleared in
  // the free F slots for the blocks that follow.
  5833. { .mmf
  5834. STFD [C2 ] = f16, SIZE
  5835. STFD [C10] = f17, SIZE
  5836. mov f64 = f0
  5837. }
  5838. ;;
  5839. { .mmf
  5840. STFD [C2 ] = f18, SIZE
  5841. STFD [C10] = f19, SIZE
  5842. mov f65 = f0
  5843. }
  5844. ;;
  5845. { .mmf
  5846. STFD [C2 ] = f20, SIZE
  5847. STFD [C10] = f21, SIZE
  5848. mov f72 = f0
  5849. }
  5850. ;;
  5851. { .mmf
  5852. STFD [C2 ] = f22, 5 * SIZE
  5853. STFD [C10] = f23, 5 * SIZE
  5854. mov f73 = f0
  5855. }
  5856. ;;
  // .L110: handles the case where bit 1 of M is set, using two B values per
  // k step. Each k step multiplies two A values (f32, f33) by the B pair
  // (f48, f49), accumulating into f64/f65 (first B value) and f72/f73
  // (second B value). f64/f65 are then folded into four consecutive elements
  // at C1 and f72/f73 into four consecutive elements at C2, each accumulator
  // once scaled by ALPHA_R and once by ALPHA_I.
  5857. .align 32
  5858. .L110:
  5859. { .mib
  5860. nop __LINE__
  5861. tbit.z p6, p7 = M, 1 // p6 = bit 1 of M is clear
  5862. (p6) br.cond.dptk .L120
  5863. }
  5864. ;;
  5865. { .mmi
  5866. LDFPD f48, f49 = [B]
  5867. adds BOFFSET = 2 * SIZE, B
  5868. adds L = 1, K
  5869. }
  5870. ;;
  5871. { .mii
  5872. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5873. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  5874. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  5875. }
  5876. ;;
  5877. { .mmi
  5878. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5879. nop __LINE__
  5880. adds L = -1, L
  5881. }
  5882. ;;
  5883. { .mmi
  5884. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  5885. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5886. mov ar.lc = L
  5887. }
  5888. ;;
  5889. .align 32
  // .L112: inner loop; same p3 / p4 / p5 scheme as the wider kernels: p3
  // enables the second k step, p4 preloads for the next pass, p5 (last pass)
  // starts fetching C.
  5890. .L112:
  5891. { .mfi
  5892. lfetch.nt1 [PREA], 4 * SIZE
  5893. FMA f64 = f32, f48, f64 // A1 * B1
  5894. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  5895. }
  5896. { .mfi
  5897. lfetch.nt1 [PREB], 4 * SIZE
  5898. FMA f72 = f32, f49, f72 // A1 * B2
  5899. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  5900. }
  5901. ;;
  5902. { .mmf
  5903. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5904. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5905. FMA f65 = f33, f48, f65 // A2 * B1
  5906. }
  5907. { .mmf
  5908. (p5) LDFD f6 = [C1 ], SIZE
  5909. (p5) LDFD f7 = [C2 ], SIZE
  5910. FMA f73 = f33, f49, f73 // A2 * B2
  5911. }
  5912. ;;
  5913. { .mfb
  5914. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5915. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5916. nop __LINE__
  5917. }
  5918. { .mfb
  5919. (p5) LDFD f10 = [C1 ], SIZE
  5920. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  5921. nop __LINE__
  5922. }
  5923. ;;
  5924. { .mfi
  5925. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5926. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5927. adds L = -1, L
  5928. }
  5929. { .mfb
  5930. (p5) LDFD f11 = [C2 ], SIZE
  5931. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  5932. br.cloop.sptk.few .L112
  5933. }
  5934. ;;
  // Update phase: load the remaining C elements, scale and accumulate.
  5935. { .mmf
  5936. LDFD f12 = [C1], SIZE
  5937. LDFD f13 = [C2], SIZE
  5938. FMA f6 = ALPHA_R, f64, f6
  5939. }
  5940. { .mmf
  5941. nop __LINE__
  5942. nop __LINE__
  5943. FMA f7 = ALPHA_R, f72, f7
  5944. }
  5945. ;;
  5946. { .mmf
  5947. LDFD f14 = [C1], - 3 * SIZE // rewind C1 to the first element loaded
  5948. LDFD f15 = [C2], - 3 * SIZE // rewind C2 likewise
  5949. FMA f10 = ALPHA_I, f64, f10
  5950. }
  5951. { .mmf
  5952. nop __LINE__
  5953. nop __LINE__
  5954. FMA f11 = ALPHA_I, f72, f11
  5955. }
  5956. ;;
  5957. { .mmf
  5958. nop __LINE__
  5959. nop __LINE__
  5960. FMA f12 = ALPHA_R, f65, f12
  5961. }
  5962. { .mmf
  5963. nop __LINE__
  5964. nop __LINE__
  5965. FMA f13 = ALPHA_R, f73, f13
  5966. }
  5967. ;;
  5968. { .mmf
  5969. nop __LINE__
  5970. nop __LINE__
  5971. FMA f14 = ALPHA_I, f65, f14
  5972. }
  5973. { .mmf
  5974. nop __LINE__
  5975. nop __LINE__
  5976. FMA f15 = ALPHA_I, f73, f15
  5977. }
  5978. ;;
  // Store four elements through each pointer (C1 and C2 both end 4 * SIZE
  // past their starting position) and clear the accumulators used here.
  5979. { .mmf
  5980. STFD [C1] = f6, SIZE
  5981. STFD [C2] = f7, SIZE
  5982. mov f64 = f0
  5983. }
  5984. ;;
  5985. { .mmf
  5986. STFD [C1] = f10, SIZE
  5987. STFD [C2] = f11, SIZE
  5988. mov f72 = f0
  5989. }
  5990. ;;
  5991. { .mmf
  5992. STFD [C1] = f12, SIZE
  5993. STFD [C2] = f13, SIZE
  5994. mov f65 = f0
  5995. }
  5996. ;;
  5997. { .mmf
  5998. STFD [C1] = f14, SIZE
  5999. STFD [C2] = f15, SIZE
  6000. mov f73 = f0
  6001. }
  6002. ;;
  // .L120: handles the case where bit 0 of M is set, using two B values per
  // k step. Each k step multiplies one A value (f32) by the B pair
  // (f48, f49), accumulating into f64 and f72. f64 is then folded into two
  // consecutive elements at C1 and f72 into two consecutive elements at C2,
  // scaled by ALPHA_R and ALPHA_I respectively.
  6003. .align 32
  6004. .L120:
  6005. { .mib
  6006. nop __LINE__
  6007. tbit.z p6, p7 = M, 0 // p6 = bit 0 of M is clear
  6008. (p6) br.cond.dptk .L129
  6009. }
  6010. ;;
  6011. { .mmi
  6012. LDFPD f48, f49 = [B]
  6013. adds BOFFSET = 2 * SIZE, B
  6014. adds L = 1, K
  6015. }
  6016. ;;
  6017. { .mii
  6018. nop __LINE__
  6019. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  6020. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  6021. }
  6022. ;;
  6023. { .mmi
  6024. LDFD f32 = [AOFFSET], 1 * SIZE
  6025. nop __LINE__
  6026. adds L = -1, L
  6027. }
  6028. ;;
  6029. { .mmi
  6030. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  6031. nop __LINE__
  6032. mov ar.lc = L
  6033. }
  6034. ;;
  6035. .align 32
  // .L122: inner loop. p3 enables the second k step (f40 with f56/f57), p4
  // preloads A/B for the next pass, p5 (last pass) loads the first C values.
  6036. .L122:
  6037. { .mfi
  6038. FMA f64 = f32, f48, f64 // A1 * B1
  6039. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  6040. }
  6041. { .mfi
  6042. nop __LINE__
  6043. FMA f72 = f32, f49, f72 // A1 * B2
  6044. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  6045. }
  6046. ;;
  6047. { .mmi
  6048. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6049. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  6050. nop __LINE__
  6051. }
  6052. { .mmi
  6053. (p5) LDFD f6 = [C1], SIZE
  6054. (p5) LDFD f7 = [C2], SIZE
  6055. }
  6056. ;;
  6057. { .mfi
  6058. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6059. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6060. adds L = -1, L
  6061. }
  6062. { .mfb
  6063. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  6064. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6065. br.cloop.sptk.few .L122
  6066. }
  6067. ;;
  // .L128: update phase. p5 still holds the value set on the final loop pass;
  // the -SIZE post-increment rewinds C1/C2 to the element loaded into f6/f7.
  6068. .L128:
  6069. { .mmf
  6070. (p5) LDFD f10 = [C1], -SIZE
  6071. (p5) LDFD f11 = [C2], -SIZE
  6072. FMA f6 = ALPHA_R, f64, f6
  6073. }
  6074. { .mmf
  6075. nop __LINE__
  6076. nop __LINE__
  6077. FMA f7 = ALPHA_R, f72, f7
  6078. }
  6079. ;;
  6080. { .mmf
  6081. nop __LINE__
  6082. nop __LINE__
  6083. FMA f10 = ALPHA_I, f64, f10
  6084. }
  6085. { .mmf
  6086. nop __LINE__
  6087. nop __LINE__
  6088. FMA f11 = ALPHA_I, f72, f11
  6089. }
  6090. ;;
  6091. { .mmi
  6092. STFD [C1 ] = f6, SIZE
  6093. STFD [C2 ] = f7, SIZE
  6094. nop __LINE__
  6095. }
  6096. ;;
  6097. { .mmi
  6098. STFD [C1 ] = f10, SIZE
  6099. STFD [C2 ] = f11, SIZE
  6100. nop __LINE__
  6101. }
  6102. ;;
  // .L129: end of the two-B-value pass. B takes over the current BOFFSET and
  // AOFFSET is reset to A before falling through to .L130.
  6103. .align 32
  6104. .L129:
  6105. { .mmi
  6106. mov B = BOFFSET
  6107. mov AOFFSET = A
  6108. nop __LINE__
  6109. }
  6110. ;;
  // .L130: entry of the single-B-value pass. Runs only when bit 0 of N is
  // set; otherwise control goes straight to the exit code at .L999.
  // Sets AOFFSET = A and C1 = C, clears accumulators f64-f67, and computes
  // I = M >> 3, the trip count of the eight-A-value loop at .L132.
  6111. .align 16
  6112. .L130:
  6113. { .mfi
  6114. nop __LINE__
  6115. mov f64 = f0
  6116. tbit.z p6, p0 = N, 0 // p6 = bit 0 of N is clear
  6117. }
  6118. { .mib
  6119. mov AOFFSET = A
  6120. shr I = M, 3
  6121. (p6) br.cond.dpnt .L999 // no work left in this pass: leave
  6122. }
  6123. ;;
  6124. { .mfi
  6125. mov C1 = C
  6126. mov f65 = f0
  6127. nop __LINE__
  6128. }
  6129. ;;
  6130. { .mfi
  6131. nop __LINE__
  6132. mov f66 = f0
  6133. nop __LINE__
  6134. }
  6135. { .mfb
  6136. cmp.eq p7, p0 = 0, I // p7 = M >> 3 is zero
  6137. mov f67 = f0
  6138. (p7) br.cond.dpnt .L140 // skip the eight-wide loop, go to the remainders
  6139. }
  6140. ;;
  // .L132: eight-A-value kernel of the single-B pass, executed I = M >> 3
  // times. Each k step multiplies eight A values (f32-f39) by one B value
  // (f48), accumulating into f64-f71. After the inner loop each accumulator is
  // folded into two consecutive elements of C (scaled by ALPHA_R and by
  // ALPHA_I); C1 covers f64/f65 and f68/f69, C9 = C1 + 4 * SIZE covers
  // f66/f67 and f70/f71. C1 ends 16 * SIZE past its starting position.
  // CPREFETCH and CPREFETCHSIZE are macros defined outside this chunk.
  6141. .align 32
  6142. .L132:
  6143. { .mfb
  6144. LDFD f48 = [B]
  6145. mov f68 = f0
  6146. nop __LINE__
  6147. }
  6148. { .mfi
  6149. adds BOFFSET = 1 * SIZE, B
  6150. mov f69 = f0
  6151. nop __LINE__
  6152. }
  6153. ;;
  6154. { .mfi
  6155. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6156. mov f70 = f0
  6157. adds L = 1, K
  6158. }
  6159. ;;
  6160. { .mii
  6161. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6162. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  6163. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  6164. }
  6165. ;;
  6166. { .mfi
  6167. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6168. mov f71 = f0
  6169. adds L = -1, L
  6170. }
  6171. ;;
  6172. { .mmi
  6173. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6174. adds PREC = CPREFETCHSIZE * SIZE, C1
  6175. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  6176. }
  6177. ;;
  6178. { .mmi
  6179. CPREFETCH [PREC]
  6180. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6181. mov ar.lc = L
  6182. }
  6183. ;;
  6184. .align 32
  // .L133: inner loop. First half uses f32-f39 with f48, second half (under
  // p3) uses f40-f47 with f56. p4 preloads for the next pass; p5 (last pass)
  // loads the first eight C elements into f6, f7, f10-f15.
  6185. .L133:
  6186. { .mfi
  6187. lfetch.nt1 [PREA], 16 * SIZE
  6188. FMA f64 = f32, f48, f64 // A1 * B1
  6189. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  6190. }
  6191. { .mfi
  6192. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6193. FMA f65 = f33, f48, f65 // A2 * B1
  6194. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  6195. }
  6196. ;;
  6197. { .mfi
  6198. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6199. FMA f66 = f34, f48, f66 // A3 * B1
  6200. adds C9 = 4 * SIZE, C1 // C9 runs 4 elements ahead of C1
  6201. }
  6202. { .mmf
  6203. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  6204. (p5) LDFD f6 = [C1 ], SIZE
  6205. FMA f67 = f35, f48, f67 // A4 * B1
  6206. }
  6207. ;;
  6208. { .mfb
  6209. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  6210. FMA f68 = f36, f48, f68 // A5 * B1
  6211. nop __LINE__
  6212. }
  6213. { .mfb
  6214. (p5) LDFD f7 = [C9 ], SIZE
  6215. FMA f69 = f37, f48, f69 // A6 * B1
  6216. nop __LINE__
  6217. }
  6218. ;;
  6219. { .mfb
  6220. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  6221. FMA f70 = f38, f48, f70 // A7 * B1
  6222. nop __LINE__
  6223. }
  6224. { .mfb
  6225. (p5) LDFD f10 = [C1 ], SIZE
  6226. FMA f71 = f39, f48, f71 // A8 * B1
  6227. nop __LINE__
  6228. }
  6229. ;;
  6230. { .mfb
  6231. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  6232. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6233. nop __LINE__
  6234. }
  6235. { .mfb
  6236. (p5) LDFD f11 = [C9 ], SIZE
  6237. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6238. nop __LINE__
  6239. }
  6240. ;;
  6241. { .mfb
  6242. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6243. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  6244. nop __LINE__
  6245. }
  6246. { .mmf
  6247. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  6248. (p5) LDFD f12 = [C1 ], SIZE
  6249. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  6250. }
  6251. ;;
  6252. { .mfb
  6253. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6254. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  6255. nop __LINE__
  6256. }
  6257. { .mfb
  6258. (p5) LDFD f13 = [C9 ], SIZE
  6259. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  6260. nop __LINE__
  6261. }
  6262. ;;
  6263. { .mfi
  6264. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6265. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  6266. adds L = -1, L
  6267. }
  6268. { .mfb
  6269. (p5) LDFD f14 = [C1 ], 5 * SIZE // skip over the four elements owned by C9
  6270. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  6271. nop __LINE__
  6272. }
  6273. ;;
  6274. { .mfb
  6275. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  6276. nop __LINE__
  6277. nop __LINE__
  6278. }
  6279. { .mfb
  6280. (p5) LDFD f15 = [C9 ], 5 * SIZE
  6281. nop __LINE__
  6282. br.cloop.sptk.few .L133
  6283. }
  6284. ;;
  // .L138: update phase. Loads the second group of eight C elements
  // (f16-f23), then rewinds C1/C9 by 11 * SIZE back to where they started
  // before storing all sixteen results.
  6285. .L138:
  6286. { .mmf
  6287. LDFD f16 = [C1 ], SIZE
  6288. LDFD f17 = [C9 ], SIZE
  6289. FMA f6 = ALPHA_R, f64, f6
  6290. }
  6291. { .mmf
  6292. nop __LINE__
  6293. nop __LINE__
  6294. FMA f7 = ALPHA_R, f66, f7
  6295. }
  6296. ;;
  6297. { .mmf
  6298. LDFD f18 = [C1 ], SIZE
  6299. LDFD f19 = [C9 ], SIZE
  6300. FMA f10 = ALPHA_I, f64, f10
  6301. }
  6302. { .mmf
  6303. nop __LINE__
  6304. nop __LINE__
  6305. FMA f11 = ALPHA_I, f66, f11
  6306. }
  6307. ;;
  6308. { .mmf
  6309. LDFD f20 = [C1 ], SIZE
  6310. LDFD f21 = [C9 ], SIZE
  6311. FMA f12 = ALPHA_R, f65, f12
  6312. }
  6313. { .mmf
  6314. nop __LINE__
  6315. nop __LINE__
  6316. FMA f13 = ALPHA_R, f67, f13
  6317. }
  6318. ;;
  6319. { .mmf
  6320. LDFD f22 = [C1 ], - 11 * SIZE
  6321. LDFD f23 = [C9 ], - 11 * SIZE
  6322. FMA f14 = ALPHA_I, f65, f14
  6323. }
  6324. { .mmf
  6325. nop __LINE__
  6326. nop __LINE__
  6327. FMA f15 = ALPHA_I, f67, f15
  6328. }
  6329. ;;
  6330. { .mmf
  6331. STFD [C1 ] = f6, SIZE
  6332. STFD [C9 ] = f7, SIZE
  6333. FMA f16 = ALPHA_R, f68, f16
  6334. }
  6335. { .mmf
  6336. nop __LINE__
  6337. nop __LINE__
  6338. FMA f17 = ALPHA_R, f70, f17
  6339. }
  6340. ;;
  6341. { .mmf
  6342. STFD [C1 ] = f10, SIZE
  6343. STFD [C9 ] = f11, SIZE
  6344. FMA f18 = ALPHA_I, f68, f18
  6345. }
  6346. { .mmf
  6347. nop __LINE__
  6348. nop __LINE__
  6349. FMA f19 = ALPHA_I, f70, f19
  6350. }
  6351. ;;
  6352. { .mmf
  6353. STFD [C1 ] = f12, SIZE
  6354. STFD [C9 ] = f13, SIZE
  6355. FMA f20 = ALPHA_R, f69, f20
  6356. }
  6357. { .mmf
  6358. cmp.ne p6, p0 = 1, I // p6 = more eight-wide blocks remain (tested on I before the decrement)
  6359. adds I = -1, I
  6360. FMA f21 = ALPHA_R, f71, f21
  6361. }
  6362. ;;
  6363. { .mmf
  6364. STFD [C1 ] = f14, 5 * SIZE
  6365. STFD [C9 ] = f15, 5 * SIZE
  6366. FMA f22 = ALPHA_I, f69, f22
  6367. }
  6368. { .mmf
  6369. nop __LINE__
  6370. nop __LINE__
  6371. FMA f23 = ALPHA_I, f71, f23
  6372. }
  6373. ;;
  // Second group of stores; f64-f67 are cleared in the free F slots (f68-f71
  // are cleared at the top of .L132 instead).
  6374. { .mmf
  6375. STFD [C1 ] = f16, SIZE
  6376. STFD [C9 ] = f17, SIZE
  6377. mov f64 = f0
  6378. }
  6379. ;;
  6380. { .mmf
  6381. STFD [C1 ] = f18, SIZE
  6382. STFD [C9 ] = f19, SIZE
  6383. mov f65 = f0
  6384. }
  6385. ;;
  6386. { .mmf
  6387. STFD [C1 ] = f20, SIZE
  6388. STFD [C9 ] = f21, SIZE
  6389. mov f66 = f0
  6390. }
  6391. ;;
  6392. { .mmf
  6393. STFD [C1 ] = f22, 5 * SIZE
  6394. STFD [C9 ] = f23, 5 * SIZE
  6395. mov f67 = f0
  6396. }
  6397. { .mmb
  6398. nop __LINE__
  6399. nop __LINE__
  6400. (p6) br.cond.dptk .L132
  6401. }
  6402. ;;
  // .L140: single-B pass, case where bit 2 of M is set. Each k step
  // multiplies four A values (f32-f35) by one B value (f48), accumulating
  // into f64-f67. Afterwards f64/f65 are folded into four consecutive
  // elements at C1 and f66/f67 into four at C9 = C1 + 4 * SIZE, each
  // accumulator once scaled by ALPHA_R and once by ALPHA_I.
  6403. .align 32
  6404. .L140:
  6405. { .mib
  6406. nop __LINE__
  6407. tbit.z p6, p7 = M, 2 // p6 = bit 2 of M is clear, p7 = it is set
  6408. (p6) br.cond.dptk .L150
  6409. }
  6410. ;;
  6411. { .mmi
  6412. LDFD f48 = [B]
  6413. adds BOFFSET = 1 * SIZE, B
  6414. adds L = 1, K
  6415. }
  6416. ;;
  6417. { .mii
  6418. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // p7 is always true on this path
  6419. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  6420. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  6421. }
  6422. ;;
  6423. { .mmi
  6424. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6425. adds L = -1, L
  6426. nop __LINE__
  6427. }
  6428. ;;
  6429. { .mmi
  6430. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6431. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  6432. mov ar.lc = L
  6433. }
  6434. ;;
  6435. .align 32
  // .L142: inner loop. p3 enables the second k step (f40-f43 with f56), p4
  // preloads for the next pass, p5 (last pass) sets up C9 and loads C.
  6436. .L142:
  6437. { .mfi
  6438. lfetch.nt1 [PREA], 8 * SIZE
  6439. FMA f64 = f32, f48, f64 // A1 * B1
  6440. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  6441. }
  6442. { .mfi
  6443. nop __LINE__
  6444. FMA f65 = f33, f48, f65 // A2 * B1
  6445. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  6446. }
  6447. ;;
  6448. { .mfi
  6449. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6450. FMA f66 = f34, f48, f66 // A3 * B1
  6451. (p5) adds C9 = 4 * SIZE, C1
  6452. }
  6453. { .mmf
  6454. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  6455. FMA f67 = f35, f48, f67 // A4 * B1
  6456. }
  6457. ;;
  6458. { .mfi
  6459. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  6460. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6461. (p5) adds C10 = 2 * SIZE, C2 // NOTE(review): C10 is never read in .L140-.L148; looks like a leftover -- confirm
  6462. }
  6463. { .mmf
  6464. (p5) LDFD f6 = [C1 ], SIZE
  6465. (p5) LDFD f7 = [C9 ], SIZE
  6466. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6467. }
  6468. ;;
  6469. { .mmf
  6470. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6471. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  6472. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  6473. }
  6474. { .mmf
  6475. (p5) LDFD f10 = [C1 ], SIZE
  6476. (p5) LDFD f11 = [C9 ], SIZE
  6477. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  6478. }
  6479. ;;
  6480. { .mfi
  6481. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6482. nop __LINE__
  6483. adds L = -1, L
  6484. }
  6485. { .mmb
  6486. (p5) LDFD f12 = [C1 ], SIZE
  6487. (p5) LDFD f13 = [C9 ], SIZE
  6488. br.cloop.sptk.few .L142
  6489. }
  6490. ;;
  // .L148: update phase. The "- 3 * SIZE" post-increments rewind C1/C9 to
  // the first element each of them loaded.
  6491. .L148:
  6492. { .mmf
  6493. LDFD f14 = [C1 ], - 3 * SIZE
  6494. LDFD f15 = [C9 ], - 3 * SIZE
  6495. FMA f6 = ALPHA_R, f64, f6
  6496. }
  6497. { .mmf
  6498. nop __LINE__
  6499. nop __LINE__
  6500. FMA f7 = ALPHA_R, f66, f7
  6501. }
  6502. ;;
  6503. { .mmf
  6504. nop __LINE__
  6505. nop __LINE__
  6506. FMA f10 = ALPHA_I, f64, f10
  6507. }
  6508. { .mmf
  6509. nop __LINE__
  6510. nop __LINE__
  6511. FMA f11 = ALPHA_I, f66, f11
  6512. }
  6513. ;;
  6514. { .mmf
  6515. nop __LINE__
  6516. nop __LINE__
  6517. FMA f12 = ALPHA_R, f65, f12
  6518. }
  6519. { .mmf
  6520. nop __LINE__
  6521. nop __LINE__
  6522. FMA f13 = ALPHA_R, f67, f13
  6523. }
  6524. ;;
  6525. { .mmf
  6526. nop __LINE__
  6527. nop __LINE__
  6528. FMA f14 = ALPHA_I, f65, f14
  6529. }
  6530. { .mmf
  6531. nop __LINE__
  6532. nop __LINE__
  6533. FMA f15 = ALPHA_I, f67, f15
  6534. }
  6535. ;;
  // Stores; C1 ends 8 * SIZE past its starting position. f64-f67 are cleared
  // in the free F slots.
  6536. { .mmf
  6537. STFD [C1 ] = f6, SIZE
  6538. STFD [C9 ] = f7, SIZE
  6539. mov f64 = f0
  6540. }
  6541. ;;
  6542. { .mmf
  6543. STFD [C1 ] = f10, SIZE
  6544. STFD [C9 ] = f11, SIZE
  6545. mov f65 = f0
  6546. }
  6547. ;;
  6548. { .mmf
  6549. STFD [C1 ] = f12, SIZE
  6550. STFD [C9 ] = f13, SIZE
  6551. mov f66 = f0
  6552. }
  6553. ;;
  6554. { .mmf
  6555. STFD [C1 ] = f14, 5 * SIZE
  6556. STFD [C9 ] = f15, 5 * SIZE
  6557. mov f67 = f0
  6558. }
  6559. ;;
  // .L150: single-B pass, case where bit 1 of M is set. Each k step
  // multiplies two A values (f32, f33) by one B value (f48), accumulating
  // into f64 and f65. The update phase at .L158 reads four consecutive C
  // elements through C1, adds the ALPHA_R / ALPHA_I scaled accumulators and
  // writes them back; C1 ends 4 * SIZE past its starting position.
  6560. .align 32
  6561. .L150:
  6562. { .mib
  6563. nop __LINE__
  6564. tbit.z p6, p7 = M, 1 // p6 = bit 1 of M is clear
  6565. (p6) br.cond.dptk .L160
  6566. }
  6567. ;;
  6568. { .mmi
  6569. LDFD f48 = [B]
  6570. adds BOFFSET = 1 * SIZE, B
  6571. adds L = 1, K
  6572. }
  6573. ;;
  6574. { .mii
  6575. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  6576. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  6577. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  6578. }
  6579. ;;
  6580. { .mii
  6581. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6582. adds L = -1, L
  6583. ;;
  6584. mov ar.lc = L
  6585. }
  6586. ;;
  6587. .align 32
  // .L152: inner loop. p3 enables the second k step (f40/f41 with f56); p4
  // preloads A/B for the next pass. C is not touched inside this loop.
  6588. .L152:
  6589. { .mfi
  6590. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  6591. FMA f64 = f32, f48, f64 // A1 * B1
  6592. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  6593. }
  6594. ;;
  6595. { .mmf
  6596. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  6597. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6598. FMA f65 = f33, f48, f65 // A2 * B1
  6599. }
  6600. ;;
  6601. { .mfi
  6602. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6603. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6604. adds L = -1, L
  6605. }
  6606. ;;
  6607. { .mfb
  6608. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  6609. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6610. br.cloop.sptk.few .L152
  6611. }
  6612. ;;
  // .L158: update phase, written without explicit bundles. f68-f71 are used
  // as temporaries for the four C elements.
  6613. .L158:
  6614. LDFD f68 = [C1 ], 1 * SIZE
  6615. ;;
  6616. LDFD f69 = [C1 ], 1 * SIZE
  6617. ;;
  6618. LDFD f70 = [C1 ], 1 * SIZE
  6619. ;;
  6620. LDFD f71 = [C1 ], - 3 * SIZE // rewind C1 to the first element loaded
  6621. ;;
  6622. FMA f68 = ALPHA_R, f64, f68
  6623. FMA f69 = ALPHA_I, f64, f69
  6624. FMA f70 = ALPHA_R, f65, f70
  6625. FMA f71 = ALPHA_I, f65, f71
  6626. ;;
  6627. STFD [C1 ] = f68, SIZE
  6628. ;;
  6629. STFD [C1 ] = f69, SIZE
  6630. ;;
  6631. STFD [C1 ] = f70, SIZE
  6632. mov f64 = f0
  6633. ;;
  6634. STFD [C1 ] = f71, SIZE
  6635. mov f65 = f0
  6636. ;;
  // .L160: single-B pass, case where bit 0 of M is set. Each k step
  // multiplies one A value (f32) by one B value (f48), accumulating into f64.
  // The result is folded into two consecutive C elements at C1, scaled by
  // ALPHA_R and ALPHA_I respectively.
  6637. .align 32
  6638. .L160:
  6639. { .mib
  6640. nop __LINE__
  6641. tbit.z p6, p7 = M, 0 // p6 = bit 0 of M is clear
  6642. (p6) br.cond.dptk .L169
  6643. }
  6644. ;;
  6645. { .mmi
  6646. LDFD f48 = [B]
  6647. adds BOFFSET = 1 * SIZE, B
  6648. adds L = 1, K
  6649. }
  6650. ;;
  6651. { .mii
  6652. LDFD f32 = [AOFFSET], 1 * SIZE
  6653. tbit.z p12, p0 = L, 0 // p12 = (K + 1) is even, i.e. K is odd
  6654. shr L = L, 1 // (K + 1) / 2 passes, two k steps per pass
  6655. }
  6656. ;;
  6657. { .mii
  6658. adds L = -1, L
  6659. cmp.eq p3, p0 = r0, r0 // p3 = true: second k step enabled by default
  6660. ;;
  6661. mov ar.lc = L
  6662. }
  6663. ;;
  6664. .align 32
  // .L162: inner loop. p3 enables the second k step (f40 with f56), p4
  // preloads for the next pass, p5 (last pass) loads the two C elements into
  // f68/f69 and leaves C1 where it started.
  6665. .L162:
  6666. { .mmf
  6667. cmp.ne p4, p5 = 0, L // p4 = another pass follows, p5 = last pass
  6668. (p12) cmp.ne p3, p0 = 0, L // K odd: drop the second k step on the last pass
  6669. FMA f64 = f32, f48, f64 // A1 * B1
  6670. }
  6671. ;;
  6672. { .mmi
  6673. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  6674. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  6675. nop __LINE__
  6676. }
  6677. ;;
  6678. { .mmi
  6679. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  6680. (p5) LDFD f68 = [C1], 1 * SIZE
  6681. adds L = -1, L
  6682. }
  6683. ;;
  6684. { .mmf
  6685. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  6686. (p5) LDFD f69 = [C1], - 1 * SIZE
  6687. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6688. }
  6689. { .mib
  6690. nop __LINE__
  6691. nop __LINE__
  6692. br.cloop.sptk.few .L162
  6693. }
  6694. ;;
  // Update phase: scale the accumulator into both C elements and store them.
  6695. FMA f68 = ALPHA_R, f64, f68
  6696. FMA f69 = ALPHA_I, f64, f69
  6697. ;;
  6698. STFD [C1 ] = f68, SIZE
  6699. ;;
  6700. STFD [C1 ] = f69, SIZE
  6701. ;;
  // .L169: end of the single-B pass. B takes over the current BOFFSET and
  // AOFFSET is reset to A before falling through to the exit code at .L999.
  6702. .align 32
  6703. .L169:
  6704. { .mmi
  6705. mov B = BOFFSET
  6706. mov AOFFSET = A
  6707. nop __LINE__
  6708. }
  6709. ;;
  // .L999: common exit. Sets r8 to zero (presumably the function's return
  // value -- confirm against the caller), reloads f16-f31 from the save area
  // addressed by SP, and restores ar.lc, the predicate registers and ar.pfs
  // from ARLC, PR and ARPFS before returning through b0.
  // The fills alternate between two pointers (SP and r9 = SP + 16), each
  // stepping by 32 bytes, so consecutive registers sit 16 bytes apart. The
  // eight post-increments leave SP 256 bytes above its value on entry here;
  // presumably this undoes the prologue's allocation (prologue not visible).
  // SP, ARLC, PR, ARPFS and EPILOGUE are macros defined outside this chunk.
  6710. .align 16
  6711. .L999:
  6712. mov r8 = r0
  6713. adds r9 = 1 * 16, SP
  6714. ;;
  6715. ldf.fill f16 = [SP], 32
  6716. ldf.fill f17 = [r9], 32
  6717. ;;
  6718. ldf.fill f18 = [SP], 32
  6719. ldf.fill f19 = [r9], 32
  6720. ;;
  6721. ldf.fill f20 = [SP], 32
  6722. ldf.fill f21 = [r9], 32
  6723. ;;
  6724. ldf.fill f22 = [SP], 32
  6725. ldf.fill f23 = [r9], 32
  6726. mov ar.lc = ARLC // restore the loop counter used by the br.cloop loops
  6727. ;;
  6728. ldf.fill f24 = [SP], 32
  6729. ldf.fill f25 = [r9], 32
  6730. mov pr = PR, -1 // mask -1: restore every predicate register
  6731. ;;
  6732. ldf.fill f26 = [SP], 32
  6733. ldf.fill f27 = [r9], 32
  6734. mov ar.pfs = ARPFS
  6735. ;;
  6736. ldf.fill f28 = [SP], 32
  6737. ldf.fill f29 = [r9], 32
  6738. ;;
  6739. ldf.fill f30 = [SP], 32
  6740. ldf.fill f31 = [r9] // last fill: no post-increment needed on r9
  6741. br.ret.sptk.many b0
  6742. EPILOGUE