You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ztrsm_kernel_LT.S 171 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build-time configuration, register map and sign-selection macros for
   * this kernel.
   *
   * ASSEMBLER is defined before common.h is pulled in; presumably it selects
   * the assembler-side definitions of that header (confirm in common.h).
   * Names used in this file but not defined in it (SIZE, ZBASE_SHIFT, LDFPD,
   * FMA, FNMA, FMS, FADD, FSUB, PROLOGUE, PROFCODE) are expected to come
   * from there.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance for the A and B read pointers, in units of SIZE:
     PREA / PREB are set to (PREFETCHSIZE + 0) * SIZE past AOFFSET / BOFFSET
     before the .L012 loop. 16 * 8 when DOUBLE is defined, 32 * 8 otherwise. */
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE (16 * 8)
  42. #else
  43. #define PREFETCHSIZE (32 * 8)
  44. #endif
  /* Offset, in units of SIZE, from C1 at which the C prefetch pointer PREC
     starts: 7 normally, -8 when LN is defined. */
  45. #ifndef LN
  46. #define CPREFETCHSIZE 7
  47. #else
  48. #define CPREFETCHSIZE -8
  49. #endif
  /* Instruction used to prefetch through PREC. */
  50. #define CPREFETCH lfetch.excl.nt1
  /* Stacked registers r32-r39. The prologue's alloc declares 8 inputs and
     8 locals, so these are the input registers; r36 gets no name here.
     LDC (r35) is overwritten in the prologue with a value loaded from
     [SP + 16] and then scaled by ZBASE_SHIFT. */
  51. #define M r32
  52. #define N r33
  53. #define K r34
  54. #define A r37
  55. #define B r38
  56. #define C r39
  57. #define LDC r35
  /* Static general registers used as working state:
       I, J              I = M >> 2 (set at .L010), J = N >> 2 (set in the
                         prologue); r15 also serves as a scratch address in
                         the prologue before it becomes I
       AOFFSET, BOFFSET  current read pointers into A and B, advanced by
                         post-incrementing loads
       L                 trip count copied into ar.lc for the .L012 loop
       C1..C4            C + 0..3 * LDC
       C5..C8            C1..C4 + 4 * SIZE */
  58. #define I r15
  59. #define J r16
  60. #define AOFFSET r17
  61. #define BOFFSET r18
  62. #define TEMP r19
  63. #define L r20
  64. #define C1 r21
  65. #define C2 r22
  66. #define C3 r23
  67. #define C4 r24
  68. #define C5 r25
  69. #define C6 r26
  70. #define C7 r27
  71. #define C8 r28
  /* Addresses handed to lfetch (PREA, PREB) and CPREFETCH (PREC). */
  72. #define PREA r8
  73. #define PREB r9
  74. #define PREC r10
  /* r12 is the stack pointer; the prologue reads LDC and OFFSET from
     [SP + 16] and [SP + 24]. */
  75. #define SP r12
  /* Homes for the saved ar.lc, pr and ar.pfs values. */
  76. #define ARLC r29
  77. #define PR r30
  78. #define ARPFS r31
  /* NOTE(review): ALPHA_R / ALPHA_I are not referenced in the prologue or in
     the .L012 loop; confirm whether they are used at all. */
  79. #define ALPHA_R f8
  80. #define ALPHA_I f9
  /* Local stacked registers (loc0..loc5):
       AORIG   copy of A kept when LN or RT is defined
       KK      offset counter seeded from OFFSET; the seed differs for each
               of the LN / LT / RN / RT variants
       OFFSET  value loaded from [SP + 24] in the prologue
     KK8, AOFFSET2 and BOFFSET2 are not referenced in the prologue or in the
     .L012 loop; see their use sites for their meaning. */
  81. #define AORIG loc0
  82. #define KK loc1
  83. #define KK8 loc2
  84. #define OFFSET loc3
  85. #define AOFFSET2 loc4
  86. #define BOFFSET2 loc5
  /* Sign selection for the complex arithmetic. Each pair below is an
     add-style and a subtract-style operation; defining CONJ exchanges the
     two. In the .L012 loop FMA_A updates the first accumulator of each pair
     (e.g. f64) and FMA_B the second (e.g. f65). */
  87. #ifndef CONJ
  88. #define FCALC_A FSUB
  89. #define FCALC_B FADD
  90. #define FMA_A FNMA
  91. #define FMA_B FMA
  92. #else
  93. #define FCALC_A FADD
  94. #define FCALC_B FSUB
  95. #define FMA_A FMA
  96. #define FMA_B FNMA
  97. #endif
  /* FCALC_C / FCALC_D: another pair exchanged by CONJ. */
  98. #ifndef CONJ
  99. #define FCALC_C FMA
  100. #define FCALC_D FNMA
  101. #else
  102. #define FCALC_C FNMA
  103. #define FCALC_D FMA
  104. #endif
  /* FMA_C / FMA_D / FSUB_A. Note that the CONJ branch is not a plain swap:
     FMA_D becomes FMS rather than FNMA. FSUB_A is applied to the second
     element of each loaded pair in the .L018 subtraction on the LN / LT
     path, where the first element always uses plain FSUB. */
  105. #ifndef CONJ
  106. #define FMA_C FNMA
  107. #define FMA_D FMA
  108. #define FSUB_A FSUB
  109. #else
  110. #define FMA_C FMA
  111. #define FMA_D FMS
  112. #define FSUB_A FADD
  113. #endif
  114. PROLOGUE
  115. .prologue
  116. PROFCODE
  117. { .mfi
  118. .save ar.pfs, ARPFS
  119. alloc ARPFS = ar.pfs, 8, 8, 0, 0
  120. mov f64 = f0
  121. adds r14 = 16, SP
  122. }
  123. { .mfi
  124. nop __LINE__
  125. mov f65 = f0
  126. adds r15 = 24, SP
  127. }
  128. ;;
  129. { .mfi
  130. ld8 LDC = [r14]
  131. mov f81 = f0
  132. mov PR = pr
  133. }
  134. { .mfi
  135. ld8 OFFSET = [r15]
  136. mov f96 = f0
  137. shr J = N, 2
  138. }
  139. ;;
  140. { .mfi
  141. shladd LDC = LDC, ZBASE_SHIFT, r0
  142. mov f97 = f0
  143. }
  144. { .mfi
  145. nop __LINE__
  146. mov f113 = f0
  147. }
  148. ;;
  149. #ifdef LN
  150. { .mmi
  151. setf.sig f32 = M
  152. setf.sig f33 = K
  153. shladd C = M, ZBASE_SHIFT, C
  154. }
  155. ;;
  156. {.mmf
  157. nop __LINE__
  158. nop __LINE__
  159. xmpy.l f32 = f32, f33
  160. }
  161. ;;
  162. { .mmi
  163. getf.sig r2 = f32
  164. ;;
  165. nop __LINE__
  166. shladd A = r2, ZBASE_SHIFT, A
  167. }
  168. ;;
  169. #endif
  170. #ifdef RN
  171. sub KK = r0, OFFSET
  172. #endif
  173. #ifdef RT
  174. { .mmi
  175. setf.sig f32 = N
  176. setf.sig f33 = K
  177. nop __LINE__
  178. }
  179. ;;
  180. { .mmi
  181. setf.sig f34 = LDC
  182. nop __LINE__
  183. nop __LINE__
  184. }
  185. ;;
  186. { .mmf
  187. nop __LINE__
  188. nop __LINE__
  189. xmpy.l f33 = f32, f33
  190. }
  191. { .mmf
  192. nop __LINE__
  193. sub KK = N, OFFSET
  194. xmpy.l f34 = f32, f34
  195. }
  196. ;;
  197. { .mmi
  198. getf.sig r2 = f33
  199. getf.sig r3 = f34
  200. }
  201. ;;
  202. shladd B = r2, ZBASE_SHIFT, B
  203. add C = r3, C
  204. #endif
  205. ;;
  206. .body
  207. { .mfi
  208. nop __LINE__
  209. mov f80 = f0
  210. mov ARLC = ar.lc
  211. }
  212. { .mfb
  213. cmp.ge p6, p0 = 0, J
  214. mov f112 = f0
  215. (p6) br.cond.dpnt .L050
  216. }
  217. ;;
  218. .align 16
  219. .L010:
  220. #ifdef RT
  221. { .mmi
  222. shladd r3 = LDC, 2, r0
  223. nop __LINE__
  224. shl r2 = K, 2 + ZBASE_SHIFT
  225. }
  226. ;;
  227. { .mmi
  228. sub B = B, r2
  229. sub C = C, r3
  230. nop __LINE__
  231. }
  232. ;;
  233. #endif
  234. { .mmi
  235. mov C1 = C // coffset1 = c + 0 * ldc
  236. add C2 = LDC, C // coffset2 = c + 1 * ldc
  237. shr I = M, 2
  238. }
  239. { .mmi
  240. adds J = -1, J
  241. #ifdef LN
  242. add KK = M, OFFSET
  243. #elif defined LT
  244. mov KK = OFFSET
  245. #else
  246. nop __LINE__
  247. #endif
  248. #if defined(LN) || defined(RT)
  249. mov AORIG = A
  250. #else
  251. mov AOFFSET = A
  252. #endif
  253. }
  254. ;;
  255. ;;
  256. { .mmi
  257. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  258. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc
  259. #if defined(LT) || defined(RN)
  260. mov L = KK
  261. #else
  262. sub L = K, KK
  263. #endif
  264. }
  265. { .mib
  266. cmp.eq p6, p7 = 0, I
  267. #ifndef RT
  268. shladd C = LDC, 2, C // coffset += 4 * ldc
  269. #else
  270. nop __LINE__
  271. #endif
  272. (p6) br.cond.dpnt .L020
  273. }
  274. ;;
  275. .align 16
  276. .L011:
  277. { .mmi
  278. cmp.ne p7, p0 = r0, L
  279. adds BOFFSET = 0 * SIZE, B
  280. shl r2 = K, 2 + ZBASE_SHIFT
  281. }
  282. { .mfi
  283. shladd r3 = KK, ZBASE_SHIFT, r0
  284. mov f118 = f0
  285. nop __LINE__
  286. }
  287. ;;
  288. #if defined(LT) || defined(RN)
  289. { .mfb
  290. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  291. mov f66 = f0
  292. nop __LINE__
  293. }
  294. { .mmf
  295. nop __LINE__
  296. nop __LINE__
  297. mov f67 = f0
  298. }
  299. ;;
  300. #else
  301. { .mfi
  302. shladd BOFFSET = r3, 2, B
  303. mov f66 = f0
  304. #ifdef LN
  305. sub AORIG = AORIG, r2
  306. #else
  307. nop __LINE__
  308. #endif
  309. }
  310. ;;
  311. { .mfi
  312. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  313. mov f67 = f0
  314. shladd AOFFSET = r3, 2, AORIG
  315. }
  316. ;;
  317. #endif
  318. ;;
  319. { .mfi
  320. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  321. mov f82 = f0
  322. nop __LINE__
  323. }
  324. { .mfi
  325. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  326. mov f83 = f0
  327. adds PREC = CPREFETCHSIZE * SIZE, C1
  328. }
  329. ;;
  330. { .mfi
  331. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  332. mov f98 = f0
  333. adds L = 1, L
  334. }
  335. { .mfi
  336. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  337. mov f99 = f0
  338. adds C5 = 4 * SIZE, C1
  339. }
  340. ;;
  341. { .mfi
  342. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  343. mov f114 = f0
  344. tbit.z p12, p0 = L, 0
  345. }
  346. { .mfi
  347. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  348. mov f115 = f0
  349. adds C6 = 4 * SIZE, C2
  350. }
  351. ;;
  352. { .mfi
  353. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  354. mov f68 = f0
  355. shr L = L, 1
  356. }
  357. { .mfi
  358. setf.d f86 = r0
  359. mov f69 = f0
  360. adds C7 = 4 * SIZE, C3
  361. }
  362. ;;
  363. { .mfi
  364. CPREFETCH [PREC], LDC
  365. mov f84 = f0
  366. adds L = -1, L
  367. }
  368. { .mfi
  369. setf.d f87 = r0
  370. mov f85 = f0
  371. adds C8 = 4 * SIZE, C4
  372. }
  373. ;;
  374. { .mfi
  375. CPREFETCH [PREC], LDC
  376. mov f100 = f0
  377. mov ar.lc = L
  378. }
  379. { .mfi
  380. setf.d f102 = r0
  381. mov f101 = f0
  382. cmp.eq p3, p0 = r0, r0
  383. }
  384. ;;
  385. { .mfi
  386. CPREFETCH [PREC], LDC
  387. mov f116 = f0
  388. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  389. }
  390. { .mfi
  391. setf.d f103 = r0
  392. mov f117 = f0
  393. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  394. }
  395. ;;
  396. { .mfi
  397. CPREFETCH [PREC]
  398. mov f70 = f0
  399. cmp.eq p6, p0 = -1, L
  400. }
  401. { .mfb
  402. setf.d f119 = r0
  403. mov f71 = f0
  404. (p6) br.cond.dpnt .L018
  405. }
  406. ;;
  407. .align 16
  408. .L012:
  409. /* 1 */
  410. { .mfi
  411. lfetch.nt1 [PREA], 16 * SIZE
  412. FMA f64 = f32, f48, f64 // A1 * B1
  413. nop __LINE__
  414. }
  415. { .mfb
  416. (p12) cmp.ne p3, p0 = 0, L
  417. FMA_B f65 = f32, f49, f65 // A1 * B2
  418. nop __LINE__
  419. }
  420. ;;
  421. /* 2 */
  422. { .mfi
  423. lfetch.nt1 [PREB], 16 * SIZE
  424. FMA f80 = f32, f50, f80 // A1 * B3
  425. nop __LINE__
  426. }
  427. { .mfb
  428. cmp.ne p4, p5 = 0, L
  429. FMA_B f81 = f32, f51, f81 // A1 * B4
  430. nop __LINE__
  431. }
  432. ;;
  433. /* 3 */
  434. { .mfb
  435. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  436. FMA f96 = f32, f52, f96 // A1 * B5
  437. nop __LINE__
  438. }
  439. { .mfb
  440. FMA_B f97 = f32, f53, f97 // A1 * B6
  441. nop __LINE__
  442. }
  443. ;;
  444. /* 4 */
  445. { .mfb
  446. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  447. FMA f112 = f32, f54, f112 // A1 * B7
  448. nop __LINE__
  449. }
  450. { .mfb
  451. FMA_B f113 = f32, f55, f113 // A1 * B8
  452. nop __LINE__
  453. }
  454. ;;
  455. /* 5 */
  456. { .mfb
  457. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  458. FMA f65 = f33, f48, f65 // A2 * B1
  459. nop __LINE__
  460. }
  461. { .mfb
  462. FMA_A f64 = f33, f49, f64 // A2 * B2
  463. nop __LINE__
  464. }
  465. ;;
  466. /* 6 */
  467. { .mfb
  468. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  469. FMA f81 = f33, f50, f81 // A2 * B3
  470. nop __LINE__
  471. }
  472. { .mfb
  473. FMA_A f80 = f33, f51, f80 // A2 * B4
  474. nop __LINE__
  475. }
  476. ;;
  477. /* 7 */
  478. { .mfb
  479. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  480. FMA f97 = f33, f52, f97 // A2 * B5
  481. nop __LINE__
  482. }
  483. { .mfb
  484. FMA_A f96 = f33, f53, f96 // A2 * B6
  485. nop __LINE__
  486. }
  487. ;;
  488. /* 8 */
  489. { .mfb
  490. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  491. FMA f113 = f33, f54, f113 // A2 * B7
  492. nop __LINE__
  493. }
  494. { .mfb
  495. FMA_A f112 = f33, f55, f112 // A2 * B8
  496. nop __LINE__
  497. }
  498. ;;
  499. /* 9 */
  500. { .mfb
  501. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  502. FMA f66 = f34, f48, f66 // A3 * B1
  503. nop __LINE__
  504. }
  505. { .mfb
  506. FMA_B f67 = f34, f49, f67 // A3 * B2
  507. nop __LINE__
  508. }
  509. ;;
  510. /* 10 */
  511. { .mfb
  512. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  513. FMA f82 = f34, f50, f82 // A3 * B3
  514. nop __LINE__
  515. }
  516. { .mfb
  517. FMA_B f83 = f34, f51, f83 // A3 * B4
  518. nop __LINE__
  519. }
  520. ;;
  521. /* 11 */
  522. { .mfb
  523. FMA f98 = f34, f52, f98 // A3 * B5
  524. nop __LINE__
  525. }
  526. { .mfb
  527. nop __LINE__
  528. FMA_B f99 = f34, f53, f99 // A3 * B6
  529. nop __LINE__
  530. }
  531. ;;
  532. /* 12 */
  533. { .mfb
  534. FMA f114 = f34, f54, f114 // A3 * B7
  535. nop __LINE__
  536. }
  537. { .mfb
  538. nop __LINE__
  539. FMA_B f115 = f34, f55, f115 // A3 * B8
  540. nop __LINE__
  541. }
  542. ;;
  543. /* 13 */
  544. { .mfb
  545. nop __LINE__
  546. FMA f67 = f35, f48, f67 // A4 * B1
  547. }
  548. { .mfb
  549. nop __LINE__
  550. FMA_A f66 = f35, f49, f66 // A4 * B2
  551. nop __LINE__
  552. }
  553. ;;
  554. /* 14 */
  555. { .mfb
  556. FMA f83 = f35, f50, f83 // A4 * B3
  557. nop __LINE__
  558. }
  559. { .mfb
  560. nop __LINE__
  561. FMA_A f82 = f35, f51, f82 // A4 * B4
  562. nop __LINE__
  563. }
  564. ;;
  565. /* 15 */
  566. { .mfb
  567. FMA f99 = f35, f52, f99 // A4 * B5
  568. nop __LINE__
  569. }
  570. { .mfb
  571. nop __LINE__
  572. FMA_A f98 = f35, f53, f98 // A4 * B6
  573. nop __LINE__
  574. }
  575. ;;
  576. /* 16 */
  577. { .mfb
  578. FMA f115 = f35, f54, f115 // A4 * B7
  579. nop __LINE__
  580. }
  581. { .mfb
  582. nop __LINE__
  583. FMA_A f114 = f35, f55, f114 // A4 * B8
  584. nop __LINE__
  585. }
  586. ;;
  587. /* 17 */
  588. { .mfb
  589. nop __LINE__
  590. FMA f68 = f36, f48, f68 // A5 * B1
  591. nop __LINE__
  592. }
  593. { .mfb
  594. nop __LINE__
  595. FMA_B f69 = f36, f49, f69 // A5 * B2
  596. nop __LINE__
  597. }
  598. ;;
  599. /* 18 */
  600. { .mfb
  601. nop __LINE__
  602. FMA f84 = f36, f50, f84 // A5 * B3
  603. nop __LINE__
  604. }
  605. { .mfb
  606. nop __LINE__
  607. FMA_B f85 = f36, f51, f85 // A5 * B4
  608. nop __LINE__
  609. }
  610. ;;
  611. /* 19 */
  612. { .mfb
  613. nop __LINE__
  614. FMA f100 = f36, f52, f100 // A5 * B5
  615. nop __LINE__
  616. }
  617. { .mfb
  618. nop __LINE__
  619. FMA_B f101 = f36, f53, f101 // A5 * B6
  620. nop __LINE__
  621. }
  622. ;;
  623. /* 20 */
  624. { .mfb
  625. nop __LINE__
  626. FMA f116 = f36, f54, f116 // A5 * B7
  627. nop __LINE__
  628. }
  629. { .mfb
  630. nop __LINE__
  631. FMA_B f117 = f36, f55, f117 // A5 * B8
  632. nop __LINE__
  633. }
  634. ;;
  635. /* 21 */
  636. { .mfb
  637. nop __LINE__
  638. FMA f69 = f37, f48, f69 // A6 * B1
  639. nop __LINE__
  640. }
  641. { .mfb
  642. nop __LINE__
  643. FMA_A f68 = f37, f49, f68 // A6 * B2
  644. nop __LINE__
  645. }
  646. ;;
  647. /* 22 */
  648. { .mfb
  649. nop __LINE__
  650. FMA f85 = f37, f50, f85 // A6 * B3
  651. nop __LINE__
  652. }
  653. { .mfb
  654. nop __LINE__
  655. FMA_A f84 = f37, f51, f84 // A6 * B4
  656. nop __LINE__
  657. }
  658. ;;
  659. /* 23 */
  660. { .mfb
  661. nop __LINE__
  662. FMA f101 = f37, f52, f101 // A6 * B5
  663. nop __LINE__
  664. }
  665. { .mfb
  666. nop __LINE__
  667. FMA_A f100 = f37, f53, f100 // A6 * B6
  668. nop __LINE__
  669. }
  670. ;;
  671. /* 24 */
  672. { .mfb
  673. nop __LINE__
  674. FMA f117 = f37, f54, f117 // A6 * B7
  675. nop __LINE__
  676. }
  677. { .mfb
  678. nop __LINE__
  679. FMA_A f116 = f37, f55, f116 // A6 * B8
  680. nop __LINE__
  681. }
  682. ;;
  683. /* 25 */
  684. { .mfb
  685. nop __LINE__
  686. FMA f70 = f38, f48, f70 // A7 * B1
  687. nop __LINE__
  688. }
  689. { .mfb
  690. nop __LINE__
  691. FMA_B f71 = f38, f49, f71 // A7 * B2
  692. nop __LINE__
  693. }
  694. ;;
  695. /* 26 */
  696. { .mfb
  697. nop __LINE__
  698. FMA f86 = f38, f50, f86 // A7 * B3
  699. nop __LINE__
  700. }
  701. { .mfb
  702. nop __LINE__
  703. FMA_B f87 = f38, f51, f87 // A7 * B4
  704. nop __LINE__
  705. }
  706. ;;
  707. /* 27 */
  708. { .mfb
  709. nop __LINE__
  710. FMA f102 = f38, f52, f102 // A7 * B5
  711. nop __LINE__
  712. }
  713. { .mfb
  714. nop __LINE__
  715. FMA_B f103 = f38, f53, f103 // A7 * B6
  716. nop __LINE__
  717. }
  718. ;;
  719. /* 28 */
  720. { .mfb
  721. nop __LINE__
  722. FMA f118 = f38, f54, f118 // A7 * B7
  723. nop __LINE__
  724. }
  725. { .mfb
  726. nop __LINE__
  727. FMA_B f119 = f38, f55, f119 // A7 * B8
  728. nop __LINE__
  729. }
  730. ;;
  731. /* 29 */
  732. { .mfb
  733. nop __LINE__
  734. FMA f71 = f39, f48, f71 // A8 * B1
  735. nop __LINE__
  736. }
  737. { .mfb
  738. nop __LINE__
  739. FMA_A f70 = f39, f49, f70 // A8 * B2
  740. nop __LINE__
  741. }
  742. ;;
  743. /* 30 */
  744. { .mfb
  745. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  746. FMA f87 = f39, f50, f87 // A8 * B3
  747. nop __LINE__
  748. }
  749. { .mfb
  750. nop __LINE__
  751. FMA_A f86 = f39, f51, f86 // A8 * B4
  752. nop __LINE__
  753. }
  754. ;;
  755. /* 31 */
  756. { .mfb
  757. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  758. FMA f103 = f39, f52, f103 // A8 * B5
  759. nop __LINE__
  760. }
  761. { .mfb
  762. nop __LINE__
  763. FMA_A f102 = f39, f53, f102 // A8 * B6
  764. nop __LINE__
  765. }
  766. ;;
  767. /* 32 */
  768. { .mfb
  769. nop __LINE__
  770. FMA f119 = f39, f54, f119 // A8 * B7
  771. nop __LINE__
  772. }
  773. { .mfb
  774. nop __LINE__
  775. FMA_A f118 = f39, f55, f118 // A8 * B8
  776. nop __LINE__
  777. }
  778. ;;
  779. /* 33 */
  780. { .mfb
  781. nop __LINE__
  782. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  783. nop __LINE__
  784. }
  785. { .mfb
  786. nop __LINE__
  787. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  788. nop __LINE__
  789. }
  790. ;;
  791. /* 34 */
  792. { .mfb
  793. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  794. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  795. nop __LINE__
  796. }
  797. { .mfb
  798. nop __LINE__
  799. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  800. nop __LINE__
  801. }
  802. ;;
  803. /* 35 */
  804. { .mfb
  805. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  806. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  807. nop __LINE__
  808. }
  809. { .mfb
  810. nop __LINE__
  811. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  812. nop __LINE__
  813. }
  814. ;;
  815. /* 36 */
  816. { .mfb
  817. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  818. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  819. nop __LINE__
  820. }
  821. { .mfb
  822. nop __LINE__
  823. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  824. nop __LINE__
  825. }
  826. ;;
  827. /* 37 */
  828. { .mfb
  829. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  830. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  831. nop __LINE__
  832. }
  833. { .mfb
  834. nop __LINE__
  835. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  836. nop __LINE__
  837. }
  838. ;;
  839. /* 38 */
  840. { .mfb
  841. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  842. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  843. nop __LINE__
  844. }
  845. { .mfb
  846. nop __LINE__
  847. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  848. nop __LINE__
  849. }
  850. ;;
  851. /* 39 */
  852. { .mfb
  853. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  854. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  855. nop __LINE__
  856. }
  857. { .mfb
  858. nop __LINE__
  859. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  860. nop __LINE__
  861. }
  862. ;;
  863. /* 40 */
  864. { .mfb
  865. nop __LINE__
  866. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  867. nop __LINE__
  868. }
  869. { .mfb
  870. nop __LINE__
  871. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  872. nop __LINE__
  873. }
  874. ;;
  875. /* 41 */
  876. { .mfb
  877. nop __LINE__
  878. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  879. nop __LINE__
  880. }
  881. { .mfb
  882. nop __LINE__
  883. (p3) FMA_B f67 = f42, f57, f67 // A3 * B2
  884. nop __LINE__
  885. }
  886. ;;
  887. /* 42 */
  888. { .mfb
  889. nop __LINE__
  890. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  891. nop __LINE__
  892. }
  893. { .mfb
  894. nop __LINE__
  895. (p3) FMA_B f83 = f42, f59, f83 // A3 * B4
  896. nop __LINE__
  897. }
  898. ;;
  899. /* 43 */
  900. { .mfb
  901. nop __LINE__
  902. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  903. nop __LINE__
  904. }
  905. { .mfb
  906. nop __LINE__
  907. (p3) FMA_B f99 = f42, f61, f99 // A3 * B6
  908. nop __LINE__
  909. }
  910. ;;
  911. /* 44 */
  912. { .mfb
  913. nop __LINE__
  914. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  915. nop __LINE__
  916. }
  917. { .mfb
  918. nop __LINE__
  919. (p3) FMA_B f115 = f42, f63, f115 // A3 * B8
  920. nop __LINE__
  921. }
  922. ;;
  923. /* 45 */
  924. { .mfb
  925. nop __LINE__
  926. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  927. nop __LINE__
  928. }
  929. { .mfb
  930. nop __LINE__
  931. (p3) FMA_A f66 = f43, f57, f66 // A4 * B2
  932. nop __LINE__
  933. }
  934. ;;
  935. /* 46 */
  936. { .mfb
  937. nop __LINE__
  938. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  939. nop __LINE__
  940. }
  941. { .mfb
  942. nop __LINE__
  943. (p3) FMA_A f82 = f43, f59, f82 // A4 * B4
  944. nop __LINE__
  945. }
  946. ;;
  947. /* 47 */
  948. { .mfb
  949. nop __LINE__
  950. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  951. nop __LINE__
  952. }
  953. { .mfb
  954. nop __LINE__
  955. (p3) FMA_A f98 = f43, f61, f98 // A4 * B6
  956. nop __LINE__
  957. }
  958. ;;
  959. /* 48 */
  960. { .mfb
  961. nop __LINE__
  962. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  963. nop __LINE__
  964. }
  965. { .mfb
  966. nop __LINE__
  967. (p3) FMA_A f114 = f43, f63, f114 // A4 * B8
  968. nop __LINE__
  969. }
  970. ;;
  971. /* 49 */
  972. { .mfb
  973. nop __LINE__
  974. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  975. nop __LINE__
  976. }
  977. { .mfb
  978. nop __LINE__
  979. (p3) FMA_B f69 = f44, f57, f69 // A5 * B2
  980. nop __LINE__
  981. }
  982. ;;
  983. /* 50 */
  984. { .mfb
  985. nop __LINE__
  986. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  987. nop __LINE__
  988. }
  989. { .mfb
  990. nop __LINE__
  991. (p3) FMA_B f85 = f44, f59, f85 // A5 * B4
  992. nop __LINE__
  993. }
  994. ;;
  995. /* 51 */
  996. { .mfb
  997. nop __LINE__
  998. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  999. nop __LINE__
  1000. }
  1001. { .mfb
  1002. nop __LINE__
  1003. (p3) FMA_B f101 = f44, f61, f101 // A5 * B6
  1004. nop __LINE__
  1005. }
  1006. ;;
  1007. /* 52 */
  1008. { .mfb
  1009. nop __LINE__
  1010. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  1011. nop __LINE__
  1012. }
  1013. { .mfb
  1014. nop __LINE__
  1015. (p3) FMA_B f117 = f44, f63, f117 // A5 * B8
  1016. nop __LINE__
  1017. }
  1018. ;;
  1019. /* 53 */
  1020. { .mfb
  1021. nop __LINE__
  1022. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  1023. nop __LINE__
  1024. }
  1025. { .mfb
  1026. nop __LINE__
  1027. (p3) FMA_A f68 = f45, f57, f68 // A6 * B2
  1028. nop __LINE__
  1029. }
  1030. ;;
  1031. /* 54 */
  1032. { .mfb
  1033. nop __LINE__
  1034. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  1035. nop __LINE__
  1036. }
  1037. { .mfb
  1038. nop __LINE__
  1039. (p3) FMA_A f84 = f45, f59, f84 // A6 * B4
  1040. nop __LINE__
  1041. }
  1042. ;;
  1043. /* 55 */
  1044. { .mfb
  1045. nop __LINE__
  1046. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  1047. nop __LINE__
  1048. }
  1049. { .mfb
  1050. nop __LINE__
  1051. (p3) FMA_A f100 = f45, f61, f100 // A6 * B6
  1052. nop __LINE__
  1053. }
  1054. ;;
  1055. /* 56 */
  1056. { .mfb
  1057. nop __LINE__
  1058. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  1059. nop __LINE__
  1060. }
  1061. { .mfb
  1062. nop __LINE__
  1063. (p3) FMA_A f116 = f45, f63, f116 // A6 * B8
  1064. nop __LINE__
  1065. }
  1066. ;;
  1067. /* 57 */
  1068. { .mfb
  1069. nop __LINE__
  1070. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  1071. nop __LINE__
  1072. }
  1073. { .mfb
  1074. nop __LINE__
  1075. (p3) FMA_B f71 = f46, f57, f71 // A7 * B2
  1076. nop __LINE__
  1077. }
  1078. ;;
  1079. /* 58 */
  1080. { .mfb
  1081. nop __LINE__
  1082. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  1083. nop __LINE__
  1084. }
  1085. { .mfb
  1086. nop __LINE__
  1087. (p3) FMA_B f87 = f46, f59, f87 // A7 * B4
  1088. nop __LINE__
  1089. }
  1090. ;;
  1091. /* 59 */
  1092. { .mfb
  1093. nop __LINE__
  1094. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  1095. nop __LINE__
  1096. }
  1097. { .mfb
  1098. nop __LINE__
  1099. (p3) FMA_B f103 = f46, f61, f103 // A7 * B6
  1100. nop __LINE__
  1101. }
  1102. ;;
  1103. /* 60 */
  1104. { .mfb
  1105. nop __LINE__
  1106. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  1107. nop __LINE__
  1108. }
  1109. { .mfb
  1110. nop __LINE__
  1111. (p3) FMA_B f119 = f46, f63, f119 // A7 * B8
  1112. nop __LINE__
  1113. }
  1114. ;;
  1115. /* 61 */
  1116. { .mfb
  1117. nop __LINE__
  1118. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  1119. nop __LINE__
  1120. }
  1121. { .mfb
  1122. nop __LINE__
  1123. (p3) FMA_A f70 = f47, f57, f70 // A8 * B2
  1124. nop __LINE__
  1125. }
  1126. ;;
  1127. /* 62 */
  1128. { .mfb
  1129. nop __LINE__
  1130. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  1131. nop __LINE__
  1132. }
  1133. { .mfb
  1134. nop __LINE__
  1135. (p3) FMA_A f86 = f47, f59, f86 // A8 * B4
  1136. nop __LINE__
  1137. }
  1138. ;;
  1139. /* 63 */
  1140. { .mfb
  1141. nop __LINE__
  1142. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  1143. nop __LINE__
  1144. }
  1145. { .mfb
  1146. nop __LINE__
  1147. (p3) FMA_A f102 = f47, f61, f102 // A8 * B6
  1148. nop __LINE__
  1149. }
  1150. ;;
  1151. /* 64 */
  1152. { .mfi
  1153. nop __LINE__
  1154. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  1155. adds L = -1, L
  1156. }
  1157. { .mfb
  1158. nop __LINE__
  1159. (p3) FMA_A f118 = f47, f63, f118 // A8 * B8
  1160. br.cloop.sptk.few .L012
  1161. }
  1162. ;;
  1163. .L018:
  1164. #if defined(LN) || defined(RT)
  1165. #ifdef LN
  1166. adds r2 = -4, KK
  1167. #else
  1168. adds r2 = -4, KK
  1169. #endif
  1170. ;;
  1171. shladd r2 = r2, ZBASE_SHIFT, r0
  1172. ;;
  1173. shladd AOFFSET = r2, 2, AORIG
  1174. shladd BOFFSET = r2, 2, B
  1175. ;;
  1176. #endif
  1177. #if defined(LN) || defined(LT)
  1178. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  1179. ;;
  1180. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  1181. ;;
  1182. LDFPD f76, f77 = [BOFFSET], 2 * SIZE
  1183. ;;
  1184. LDFPD f78, f79 = [BOFFSET], 2 * SIZE
  1185. ;;
  1186. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  1187. ;;
  1188. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  1189. ;;
  1190. LDFPD f92, f93 = [BOFFSET], 2 * SIZE
  1191. ;;
  1192. { .mfi
  1193. LDFPD f94, f95 = [BOFFSET], 2 * SIZE
  1194. FSUB f64 = f72, f64
  1195. nop __LINE__
  1196. }
  1197. { .mfi
  1198. nop __LINE__
  1199. FSUB_A f65 = f73, f65
  1200. nop __LINE__
  1201. }
  1202. ;;
  1203. { .mfi
  1204. LDFPD f104, f105 = [BOFFSET], 2 * SIZE
  1205. FSUB f80 = f74, f80
  1206. nop __LINE__
  1207. }
  1208. { .mfi
  1209. nop __LINE__
  1210. FSUB_A f81 = f75, f81
  1211. nop __LINE__
  1212. }
  1213. ;;
  1214. { .mfi
  1215. LDFPD f106, f107 = [BOFFSET], 2 * SIZE
  1216. FSUB f96 = f76, f96
  1217. nop __LINE__
  1218. }
  1219. { .mfi
  1220. nop __LINE__
  1221. FSUB_A f97 = f77, f97
  1222. nop __LINE__
  1223. }
  1224. ;;
  1225. { .mfi
  1226. LDFPD f108, f109 = [BOFFSET], 2 * SIZE
  1227. FSUB f112 = f78, f112
  1228. nop __LINE__
  1229. }
  1230. { .mfi
  1231. nop __LINE__
  1232. FSUB_A f113 = f79, f113
  1233. nop __LINE__
  1234. }
  1235. ;;
  1236. { .mfi
  1237. LDFPD f110, f111 = [BOFFSET], 2 * SIZE
  1238. FSUB f66 = f88, f66
  1239. nop __LINE__
  1240. }
  1241. { .mfi
  1242. nop __LINE__
  1243. FSUB_A f67 = f89, f67
  1244. nop __LINE__
  1245. }
  1246. ;;
  1247. { .mfi
  1248. LDFPD f120, f121 = [BOFFSET], 2 * SIZE
  1249. FSUB f82 = f90, f82
  1250. nop __LINE__
  1251. }
  1252. { .mfi
  1253. nop __LINE__
  1254. FSUB_A f83 = f91, f83
  1255. nop __LINE__
  1256. }
  1257. ;;
  1258. { .mfi
  1259. LDFPD f122, f123 = [BOFFSET], 2 * SIZE
  1260. FSUB f98 = f92, f98
  1261. nop __LINE__
  1262. }
  1263. { .mfi
  1264. nop __LINE__
  1265. FSUB_A f99 = f93, f99
  1266. nop __LINE__
  1267. }
  1268. ;;
  1269. { .mfi
  1270. LDFPD f124, f125 = [BOFFSET], 2 * SIZE
  1271. FSUB f114 = f94, f114
  1272. nop __LINE__
  1273. }
  1274. { .mfi
  1275. nop __LINE__
  1276. FSUB_A f115 = f95, f115
  1277. nop __LINE__
  1278. }
  1279. ;;
  1280. { .mfi
  1281. LDFPD f126, f127 = [BOFFSET]
  1282. FSUB f68 = f104, f68
  1283. adds BOFFSET = -30 * SIZE, BOFFSET
  1284. }
  1285. { .mfi
  1286. nop __LINE__
  1287. FSUB_A f69 = f105, f69
  1288. #ifdef LN
  1289. adds AOFFSET = 30 * SIZE, AOFFSET
  1290. #else
  1291. nop __LINE__
  1292. #endif
  1293. }
  1294. ;;
  1295. { .mfi
  1296. LDFPD f72, f73 = [AOFFSET]
  1297. FSUB f84 = f106, f84
  1298. #ifdef LN
  1299. adds AOFFSET = - 2 * SIZE, AOFFSET
  1300. #else
  1301. adds AOFFSET = 2 * SIZE, AOFFSET
  1302. #endif
  1303. }
  1304. { .mfi
  1305. nop __LINE__
  1306. FSUB_A f85 = f107, f85
  1307. nop __LINE__
  1308. }
  1309. ;;
  1310. { .mfi
  1311. LDFPD f74, f75 = [AOFFSET]
  1312. FSUB f100 = f108, f100
  1313. #ifdef LN
  1314. adds AOFFSET = - 2 * SIZE, AOFFSET
  1315. #else
  1316. adds AOFFSET = 2 * SIZE, AOFFSET
  1317. #endif
  1318. }
  1319. { .mfi
  1320. nop __LINE__
  1321. FSUB_A f101 = f109, f101
  1322. nop __LINE__
  1323. }
  1324. ;;
  1325. { .mfi
  1326. nop __LINE__
  1327. FSUB f116 = f110, f116
  1328. nop __LINE__
  1329. }
  1330. { .mfi
  1331. nop __LINE__
  1332. FSUB_A f117 = f111, f117
  1333. nop __LINE__
  1334. }
  1335. ;;
  1336. { .mfi
  1337. nop __LINE__
  1338. FSUB f70 = f120, f70
  1339. nop __LINE__
  1340. }
  1341. { .mfi
  1342. nop __LINE__
  1343. FSUB_A f71 = f121, f71
  1344. nop __LINE__
  1345. }
  1346. ;;
  1347. { .mfi
  1348. nop __LINE__
  1349. FSUB f86 = f122, f86
  1350. nop __LINE__
  1351. }
  1352. { .mfi
  1353. nop __LINE__
  1354. FSUB_A f87 = f123, f87
  1355. nop __LINE__
  1356. }
  1357. ;;
  1358. { .mfi
  1359. nop __LINE__
  1360. FSUB f102 = f124, f102
  1361. nop __LINE__
  1362. }
  1363. { .mfi
  1364. nop __LINE__
  1365. FSUB_A f103 = f125, f103
  1366. nop __LINE__
  1367. }
  1368. ;;
  1369. { .mfi
  1370. nop __LINE__
  1371. FSUB f118 = f126, f118
  1372. nop __LINE__
  1373. }
  1374. { .mfi
  1375. nop __LINE__
  1376. FSUB_A f119 = f127, f119
  1377. nop __LINE__
  1378. }
  1379. ;;
  1380. #else /* second arm of a conditional whose opening test is above this view; NOTE(review): the nested RT/RN tests below suggest the right-side variants - confirm */
  1381. LDFPD f72, f73 = [AOFFSET], 2 * SIZE /* first of 16 register pairs read through AOFFSET; 15 of the loads post-increment by 2*SIZE */
  1382. ;;
  1383. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  1384. ;;
  1385. LDFPD f76, f77 = [AOFFSET], 2 * SIZE
  1386. ;;
  1387. LDFPD f78, f79 = [AOFFSET], 2 * SIZE
  1388. ;;
  1389. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  1390. ;;
  1391. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  1392. ;;
  1393. { .mfi
  1394. LDFPD f92, f93 = [AOFFSET], 2 * SIZE
  1395. FSUB f64 = f72, f64 /* loaded f72-f79 are combined into accumulators f64-f71; NOTE(review): FSUB is defined outside this view, presumably dest = src1 - src2 - confirm */
  1396. nop __LINE__
  1397. }
  1398. { .mfi
  1399. nop __LINE__
  1400. FSUB f65 = f73, f65
  1401. nop __LINE__
  1402. }
  1403. ;;
  1404. { .mfi
  1405. LDFPD f94, f95 = [AOFFSET], 2 * SIZE
  1406. FSUB f66 = f74, f66
  1407. nop __LINE__
  1408. }
  1409. { .mfi
  1410. nop __LINE__
  1411. FSUB f67 = f75, f67
  1412. nop __LINE__
  1413. }
  1414. ;;
  1415. { .mfi
  1416. LDFPD f104, f105 = [AOFFSET], 2 * SIZE
  1417. FSUB f68 = f76, f68
  1418. nop __LINE__
  1419. }
  1420. { .mfi
  1421. nop __LINE__
  1422. FSUB f69 = f77, f69
  1423. nop __LINE__
  1424. }
  1425. ;;
  1426. { .mfi
  1427. LDFPD f106, f107 = [AOFFSET], 2 * SIZE
  1428. FSUB f70 = f78, f70
  1429. nop __LINE__
  1430. }
  1431. { .mfi
  1432. nop __LINE__
  1433. FSUB f71 = f79, f71
  1434. nop __LINE__
  1435. }
  1436. ;;
  1437. { .mfi
  1438. LDFPD f108, f109 = [AOFFSET], 2 * SIZE
  1439. FSUB f80 = f88, f80 /* loaded f88-f95 are combined into accumulators f80-f87 */
  1440. nop __LINE__
  1441. }
  1442. { .mfi
  1443. nop __LINE__
  1444. FSUB f81 = f89, f81
  1445. nop __LINE__
  1446. }
  1447. ;;
  1448. { .mfi
  1449. LDFPD f110, f111 = [AOFFSET], 2 * SIZE
  1450. FSUB f82 = f90, f82
  1451. nop __LINE__
  1452. }
  1453. { .mfi
  1454. nop __LINE__
  1455. FSUB f83 = f91, f83
  1456. nop __LINE__
  1457. }
  1458. ;;
  1459. { .mfi
  1460. LDFPD f120, f121 = [AOFFSET], 2 * SIZE
  1461. FSUB f84 = f92, f84
  1462. nop __LINE__
  1463. }
  1464. { .mfi
  1465. nop __LINE__
  1466. FSUB f85 = f93, f85
  1467. nop __LINE__
  1468. }
  1469. ;;
  1470. { .mfi
  1471. LDFPD f122, f123 = [AOFFSET], 2 * SIZE
  1472. FSUB f86 = f94, f86
  1473. nop __LINE__
  1474. }
  1475. { .mfi
  1476. nop __LINE__
  1477. FSUB f87 = f95, f87
  1478. nop __LINE__
  1479. }
  1480. ;;
  1481. { .mfi
  1482. LDFPD f124, f125 = [AOFFSET], 2 * SIZE
  1483. FSUB f96 = f104, f96 /* loaded f104-f111 are combined into accumulators f96-f103 */
  1484. nop __LINE__
  1485. }
  1486. { .mfi
  1487. nop __LINE__
  1488. FSUB f97 = f105, f97
  1489. nop __LINE__
  1490. }
  1491. ;;
  1492. { .mfi
  1493. LDFPD f126, f127 = [AOFFSET] /* 16th and last pair: no post-increment on this load */
  1494. FSUB f98 = f106, f98
  1495. adds AOFFSET = -30 * SIZE, AOFFSET /* cancels the 15 post-increments of 2*SIZE, returning AOFFSET to its value at the #else */
  1496. }
  1497. { .mfi
  1498. nop __LINE__
  1499. FSUB f99 = f107, f99
  1500. #ifdef RT
  1501. adds BOFFSET = 30 * SIZE, BOFFSET /* RT only: move BOFFSET forward 30*SIZE so the loads below start there and walk backwards */
  1502. #else
  1503. nop __LINE__
  1504. #endif
  1505. }
  1506. ;;
  1507. { .mfi
  1508. LDFPD f72, f73 = [BOFFSET] /* f72/f73 were consumed by the first two FSUBs above, so they are reused for the first pair read from BOFFSET */
  1509. FSUB f100 = f108, f100
  1510. #ifdef RN
  1511. adds BOFFSET = 2 * SIZE, BOFFSET /* RN: step forward one pair */
  1512. #else
  1513. adds BOFFSET = - 2 * SIZE, BOFFSET /* not RN: step backward one pair */
  1514. #endif
  1515. }
  1516. { .mfi
  1517. nop __LINE__
  1518. FSUB f101 = f109, f101
  1519. nop __LINE__
  1520. }
  1521. ;;
  1522. { .mfi
  1523. LDFPD f74, f75 = [BOFFSET] /* second pair from BOFFSET, followed by the same RN-dependent step */
  1524. FSUB f102 = f110, f102
  1525. #ifdef RN
  1526. adds BOFFSET = 2 * SIZE, BOFFSET
  1527. #else
  1528. adds BOFFSET = - 2 * SIZE, BOFFSET
  1529. #endif
  1530. }
  1531. { .mfi
  1532. nop __LINE__
  1533. FSUB f103 = f111, f103
  1534. nop __LINE__
  1535. }
  1536. ;;
  1537. { .mfi
  1538. nop __LINE__
  1539. FSUB f112 = f120, f112 /* loaded f120-f127 are combined into accumulators f112-f119; no memory traffic remains in this arm */
  1540. nop __LINE__
  1541. }
  1542. { .mfi
  1543. nop __LINE__
  1544. FSUB f113 = f121, f113
  1545. nop __LINE__
  1546. }
  1547. ;;
  1548. { .mfi
  1549. nop __LINE__
  1550. FSUB f114 = f122, f114
  1551. nop __LINE__
  1552. }
  1553. { .mfi
  1554. nop __LINE__
  1555. FSUB f115 = f123, f115
  1556. nop __LINE__
  1557. }
  1558. ;;
  1559. { .mfi
  1560. nop __LINE__
  1561. FSUB f116 = f124, f116
  1562. nop __LINE__
  1563. }
  1564. { .mfi
  1565. nop __LINE__
  1566. FSUB f117 = f125, f117
  1567. nop __LINE__
  1568. }
  1569. ;;
  1570. { .mfi
  1571. nop __LINE__
  1572. FSUB f118 = f126, f118
  1573. nop __LINE__
  1574. }
  1575. { .mfi
  1576. nop __LINE__
  1577. FSUB f119 = f127, f119
  1578. nop __LINE__
  1579. }
  1580. ;;
  1581. #endif /* closes the conditional whose #else arm starts at the reload from AOFFSET above */
  1582. #ifdef LN /* LN-only section; it contains no nested conditionals up to its #endif */
  1583. { .mfi
  1584. LDFPD f76, f77 = [AOFFSET] /* LN reads AOFFSET with plain loads and moves it backwards with explicit adds (-2, -4, -2, -2, -6, -2, -8 * SIZE in turn) */
  1585. FMPY f32 = f72, f70 /* f72-f75 are never written inside this section, so they must be loaded by earlier code; f32-f39 are scratch products */
  1586. adds AOFFSET = - 2 * SIZE, AOFFSET
  1587. }
  1588. { .mfi
  1589. nop __LINE__
  1590. FMPY f36 = f72, f102
  1591. nop __LINE__
  1592. }
  1593. ;;
  1594. { .mfi
  1595. LDFPD f78, f79 = [AOFFSET]
  1596. FMPY f33 = f73, f70
  1597. adds AOFFSET = - 4 * SIZE, AOFFSET
  1598. }
  1599. { .mfi
  1600. nop __LINE__
  1601. FMPY f37 = f73, f102
  1602. nop __LINE__
  1603. }
  1604. ;;
  1605. { .mfi
  1606. LDFPD f88, f89 = [AOFFSET]
  1607. FMPY f34 = f72, f86
  1608. adds AOFFSET = - 2 * SIZE, AOFFSET
  1609. }
  1610. { .mfi
  1611. nop __LINE__
  1612. FMPY f38 = f72, f118
  1613. nop __LINE__
  1614. }
  1615. ;;
  1616. { .mfi
  1617. LDFPD f90, f91 = [AOFFSET]
  1618. FMPY f35 = f73, f86
  1619. adds AOFFSET = - 2 * SIZE, AOFFSET
  1620. }
  1621. { .mfi
  1622. nop __LINE__
  1623. FMPY f39 = f73, f118
  1624. nop __LINE__
  1625. }
  1626. ;;
  1627. { .mfi
  1628. LDFPD f92, f93 = [AOFFSET]
  1629. FMA_C f70 = f73, f71, f32 /* NOTE(review): FMA_C/FMA_D are defined outside this view; each FMPY+FMA pair has the shape of a complex multiply of (f70,f71) by (f72,f73) - confirm signs against the macros */
  1630. adds AOFFSET = - 6 * SIZE, AOFFSET
  1631. }
  1632. { .mfi
  1633. nop __LINE__
  1634. FMA_C f102 = f73, f103, f36
  1635. adds C1 = -2 * SIZE, C1 /* C1 moves back 2*SIZE before it is stored through later in this section */
  1636. }
  1637. ;;
  1638. { .mfi
  1639. LDFPD f104, f105 = [AOFFSET]
  1640. FMA_D f71 = f72, f71, f33
  1641. adds AOFFSET = - 2 * SIZE, AOFFSET
  1642. }
  1643. { .mfi
  1644. nop __LINE__
  1645. FMA_D f103 = f72, f103, f37
  1646. adds C2 = -2 * SIZE, C2
  1647. }
  1648. ;;
  1649. { .mfi
  1650. LDFPD f106, f107 = [AOFFSET]
  1651. FMA_C f86 = f73, f87, f34
  1652. adds AOFFSET = - 8 * SIZE, AOFFSET
  1653. }
  1654. { .mfi
  1655. nop __LINE__
  1656. FMA_C f118 = f73, f119, f38
  1657. adds C3 = -2 * SIZE, C3
  1658. }
  1659. ;;
  1660. { .mfi
  1661. LDFPD f120, f121 = [AOFFSET] /* last load of this section; AOFFSET is not adjusted after it */
  1662. FMA_D f87 = f72, f87, f35
  1663. adds BOFFSET2 = 28 * SIZE, BOFFSET /* no stop separates this from the next adds, so both read the incoming BOFFSET: BOFFSET2 ends 4*SIZE above the new BOFFSET */
  1664. }
  1665. { .mfi
  1666. nop __LINE__
  1667. FMA_D f119 = f72, f119, f39
  1668. adds BOFFSET = 24 * SIZE, BOFFSET
  1669. }
  1670. ;;
  1671. { .mfi
  1672. STFD [BOFFSET] = f70, SIZE /* the freshly computed values are written out through BOFFSET and BOFFSET2 in parallel */
  1673. FNMA f68 = f74, f70, f68 /* f74/f75 and the new f70/f102 start updating the next accumulator pairs; FNMA is defined outside this view */
  1674. adds C4 = -2 * SIZE, C4 /* same -2*SIZE step already applied to C1-C3 */
  1675. }
  1676. { .mfi
  1677. STFD [BOFFSET2] = f102, SIZE
  1678. FNMA f100 = f74, f102, f100
  1679. nop __LINE__
  1680. }
  1681. ;;
  1682. { .mfi
  1683. STFD [BOFFSET] = f71, SIZE
  1684. FMA_A f69 = f75, f70, f69
  1685. nop __LINE__
  1686. }
  1687. { .mfi
  1688. STFD [BOFFSET2] = f103, SIZE
  1689. FMA_A f101 = f75, f102, f101
  1690. nop __LINE__
  1691. }
  1692. ;;
  1693. { .mfi
  1694. STFD [BOFFSET] = f86, SIZE
  1695. FNMA f84 = f74, f86, f84
  1696. nop __LINE__
  1697. }
  1698. { .mfi
  1699. STFD [BOFFSET2] = f118, SIZE
  1700. FNMA f116 = f74, f118, f116
  1701. nop __LINE__
  1702. }
  1703. ;;
  1704. { .mfi
  1705. STFD [BOFFSET] = f87, -11 * SIZE
  1706. FMA_A f85 = f75, f86, f85
  1707. nop __LINE__
  1708. }
  1709. { .mfi
  1710. STFD [BOFFSET2] = f119, -11 * SIZE
  1711. FMA_A f117 = f75, f118, f117
  1712. nop __LINE__
  1713. }
  1714. ;;
  1715. { .mfi
  1716. STFD [C1 ] = f70, SIZE
  1717. FMA_B f68 = f75, f71, f68
  1718. nop __LINE__
  1719. }
  1720. { .mfi
  1721. STFD [C3 ] = f102, SIZE
  1722. FMA_B f100 = f75, f103, f100
  1723. nop __LINE__
  1724. }
  1725. ;;
  1726. { .mfi
  1727. STFD [C1 ] = f71, -3 * SIZE
  1728. FNMA f69 = f74, f71, f69
  1729. nop __LINE__
  1730. }
  1731. { .mfi
  1732. STFD [C3 ] = f103, -3 * SIZE
  1733. FNMA f101 = f74, f103, f101
  1734. nop __LINE__
  1735. }
  1736. ;;
  1737. { .mfi
  1738. STFD [C2 ] = f86, SIZE
  1739. FMA_B f84 = f75, f87, f84
  1740. nop __LINE__
  1741. }
  1742. { .mfi
  1743. STFD [C4 ] = f118, SIZE
  1744. FMA_B f116 = f75, f119, f116
  1745. nop __LINE__
  1746. }
  1747. ;;
  1748. { .mfi
  1749. STFD [C2 ] = f87, -3 * SIZE
  1750. FNMA f85 = f74, f87, f85
  1751. nop __LINE__
  1752. }
  1753. { .mfi
  1754. STFD [C4 ] = f119, -3 * SIZE
  1755. FNMA f117 = f74, f119, f117
  1756. nop __LINE__
  1757. }
  1758. ;;
  1759. { .mfi
  1760. nop __LINE__
  1761. FNMA f66 = f76, f70, f66
  1762. nop __LINE__
  1763. }
  1764. { .mfi
  1765. nop __LINE__
  1766. FNMA f98 = f76, f102, f98
  1767. nop __LINE__
  1768. }
  1769. ;;
  1770. { .mfi
  1771. nop __LINE__
  1772. FMA_A f67 = f77, f70, f67
  1773. nop __LINE__
  1774. }
  1775. { .mfi
  1776. nop __LINE__
  1777. FMA_A f99 = f77, f102, f99
  1778. nop __LINE__
  1779. }
  1780. ;;
  1781. { .mfi
  1782. nop __LINE__
  1783. FNMA f82 = f76, f86, f82
  1784. nop __LINE__
  1785. }
  1786. { .mfi
  1787. nop __LINE__
  1788. FNMA f114 = f76, f118, f114
  1789. nop __LINE__
  1790. }
  1791. ;;
  1792. { .mfi
  1793. nop __LINE__
  1794. FMA_A f83 = f77, f86, f83
  1795. nop __LINE__
  1796. }
  1797. { .mfi
  1798. nop __LINE__
  1799. FMA_A f115 = f77, f118, f115
  1800. nop __LINE__
  1801. }
  1802. ;;
  1803. { .mfi
  1804. nop __LINE__
  1805. FMA_B f66 = f77, f71, f66
  1806. nop __LINE__
  1807. }
  1808. { .mfi
  1809. nop __LINE__
  1810. FMA_B f98 = f77, f103, f98
  1811. nop __LINE__
  1812. }
  1813. ;;
  1814. { .mfi
  1815. nop __LINE__
  1816. FNMA f67 = f76, f71, f67
  1817. nop __LINE__
  1818. }
  1819. { .mfi
  1820. nop __LINE__
  1821. FNMA f99 = f76, f103, f99
  1822. nop __LINE__
  1823. }
  1824. ;;
  1825. { .mfi
  1826. nop __LINE__
  1827. FMA_B f82 = f77, f87, f82
  1828. nop __LINE__
  1829. }
  1830. { .mfi
  1831. nop __LINE__
  1832. FMA_B f114 = f77, f119, f114
  1833. nop __LINE__
  1834. }
  1835. ;;
  1836. { .mfi
  1837. nop __LINE__
  1838. FNMA f83 = f76, f87, f83
  1839. nop __LINE__
  1840. }
  1841. { .mfi
  1842. nop __LINE__
  1843. FNMA f115 = f76, f119, f115
  1844. nop __LINE__
  1845. }
  1846. ;;
  1847. { .mfi
  1848. nop __LINE__
  1849. FNMA f64 = f78, f70, f64
  1850. nop __LINE__
  1851. }
  1852. { .mfi
  1853. nop __LINE__
  1854. FNMA f96 = f78, f102, f96
  1855. nop __LINE__
  1856. }
  1857. ;;
  1858. { .mfi
  1859. nop __LINE__
  1860. FMA_A f65 = f79, f70, f65
  1861. nop __LINE__
  1862. }
  1863. { .mfi
  1864. nop __LINE__
  1865. FMA_A f97 = f79, f102, f97
  1866. nop __LINE__
  1867. }
  1868. ;;
  1869. { .mfi
  1870. nop __LINE__
  1871. FNMA f80 = f78, f86, f80
  1872. nop __LINE__
  1873. }
  1874. { .mfi
  1875. nop __LINE__
  1876. FNMA f112 = f78, f118, f112
  1877. nop __LINE__
  1878. }
  1879. ;;
  1880. { .mfi
  1881. nop __LINE__
  1882. FMA_A f81 = f79, f86, f81
  1883. nop __LINE__
  1884. }
  1885. { .mfi
  1886. nop __LINE__
  1887. FMA_A f113 = f79, f118, f113
  1888. nop __LINE__
  1889. }
  1890. ;;
  1891. { .mfi
  1892. nop __LINE__
  1893. FMA_B f64 = f79, f71, f64
  1894. nop __LINE__
  1895. }
  1896. { .mfi
  1897. nop __LINE__
  1898. FMA_B f96 = f79, f103, f96
  1899. nop __LINE__
  1900. }
  1901. ;;
  1902. { .mfi
  1903. nop __LINE__
  1904. FNMA f65 = f78, f71, f65
  1905. nop __LINE__
  1906. }
  1907. { .mfi
  1908. nop __LINE__
  1909. FNMA f97 = f78, f103, f97
  1910. nop __LINE__
  1911. }
  1912. ;;
  1913. { .mfi
  1914. nop __LINE__
  1915. FMA_B f80 = f79, f87, f80
  1916. nop __LINE__
  1917. }
  1918. { .mfi
  1919. nop __LINE__
  1920. FMA_B f112 = f79, f119, f112
  1921. nop __LINE__
  1922. }
  1923. ;;
  1924. { .mfi
  1925. nop __LINE__
  1926. FNMA f81 = f78, f87, f81
  1927. nop __LINE__
  1928. }
  1929. { .mfi
  1930. nop __LINE__
  1931. FNMA f113 = f78, f119, f113
  1932. nop __LINE__
  1933. }
  1934. ;;
  1935. { .mfi
  1936. nop __LINE__
  1937. FMPY f32 = f88, f68
  1938. nop __LINE__
  1939. }
  1940. { .mfi
  1941. nop __LINE__
  1942. FMPY f36 = f88, f100
  1943. nop __LINE__
  1944. }
  1945. ;;
  1946. { .mfi
  1947. nop __LINE__
  1948. FMPY f33 = f89, f68
  1949. nop __LINE__
  1950. }
  1951. { .mfi
  1952. nop __LINE__
  1953. FMPY f37 = f89, f100
  1954. nop __LINE__
  1955. }
  1956. ;;
  1957. { .mfi
  1958. nop __LINE__
  1959. FMPY f34 = f88, f84
  1960. nop __LINE__
  1961. }
  1962. { .mfi
  1963. nop __LINE__
  1964. FMPY f38 = f88, f116
  1965. nop __LINE__
  1966. }
  1967. ;;
  1968. { .mfi
  1969. nop __LINE__
  1970. FMPY f35 = f89, f84
  1971. nop __LINE__
  1972. }
  1973. { .mfi
  1974. nop __LINE__
  1975. FMPY f39 = f89, f116
  1976. nop __LINE__
  1977. }
  1978. ;;
  1979. { .mfi
  1980. nop __LINE__
  1981. FMA_C f68 = f89, f69, f32
  1982. nop __LINE__
  1983. }
  1984. { .mfi
  1985. nop __LINE__
  1986. FMA_C f100 = f89, f101, f36
  1987. nop __LINE__
  1988. }
  1989. ;;
  1990. { .mfi
  1991. nop __LINE__
  1992. FMA_D f69 = f88, f69, f33
  1993. nop __LINE__
  1994. }
  1995. { .mfi
  1996. nop __LINE__
  1997. FMA_D f101 = f88, f101, f37
  1998. nop __LINE__
  1999. }
  2000. ;;
  2001. { .mfi
  2002. nop __LINE__
  2003. FMA_C f84 = f89, f85, f34
  2004. nop __LINE__
  2005. }
  2006. { .mfi
  2007. nop __LINE__
  2008. FMA_C f116 = f89, f117, f38
  2009. nop __LINE__
  2010. }
  2011. ;;
  2012. { .mfi
  2013. nop __LINE__
  2014. FMA_D f85 = f88, f85, f35
  2015. nop __LINE__
  2016. }
  2017. { .mfi
  2018. nop __LINE__
  2019. FMA_D f117 = f88, f117, f39
  2020. nop __LINE__
  2021. }
  2022. ;;
  2023. { .mfi
  2024. STFD [BOFFSET] = f68, SIZE
  2025. FNMA f66 = f90, f68, f66
  2026. nop __LINE__
  2027. }
  2028. { .mfi
  2029. STFD [BOFFSET2] = f100, SIZE
  2030. FNMA f98 = f90, f100, f98
  2031. nop __LINE__
  2032. }
  2033. ;;
  2034. { .mfi
  2035. STFD [BOFFSET] = f69, SIZE
  2036. FMA_A f67 = f91, f68, f67
  2037. nop __LINE__
  2038. }
  2039. { .mfi
  2040. STFD [BOFFSET2] = f101, SIZE
  2041. FMA_A f99 = f91, f100, f99
  2042. nop __LINE__
  2043. }
  2044. ;;
  2045. { .mfi
  2046. STFD [BOFFSET] = f84, SIZE
  2047. FNMA f82 = f90, f84, f82
  2048. nop __LINE__
  2049. }
  2050. { .mfi
  2051. STFD [BOFFSET2] = f116, SIZE
  2052. FNMA f114 = f90, f116, f114
  2053. nop __LINE__
  2054. }
  2055. ;;
  2056. { .mfi
  2057. STFD [BOFFSET] = f85, -11 * SIZE
  2058. FMA_A f83 = f91, f84, f83
  2059. nop __LINE__
  2060. }
  2061. { .mfi
  2062. STFD [BOFFSET2] = f117, -11 * SIZE
  2063. FMA_A f115 = f91, f116, f115
  2064. nop __LINE__
  2065. }
  2066. ;;
  2067. { .mfi
  2068. STFD [C1 ] = f68, SIZE
  2069. FMA_B f66 = f91, f69, f66
  2070. nop __LINE__
  2071. }
  2072. { .mfi
  2073. STFD [C3 ] = f100, SIZE
  2074. FMA_B f98 = f91, f101, f98
  2075. nop __LINE__
  2076. }
  2077. ;;
  2078. { .mfi
  2079. STFD [C1 ] = f69, -3 * SIZE
  2080. FNMA f67 = f90, f69, f67
  2081. nop __LINE__
  2082. }
  2083. { .mfi
  2084. STFD [C3 ] = f101, -3 * SIZE
  2085. FNMA f99 = f90, f101, f99
  2086. nop __LINE__
  2087. }
  2088. ;;
  2089. { .mfi
  2090. STFD [C2 ] = f84, SIZE
  2091. FMA_B f82 = f91, f85, f82
  2092. nop __LINE__
  2093. }
  2094. { .mfi
  2095. STFD [C4 ] = f116, SIZE
  2096. FMA_B f114 = f91, f117, f114
  2097. nop __LINE__
  2098. }
  2099. ;;
  2100. { .mfi
  2101. STFD [C2 ] = f85, -3 * SIZE
  2102. FNMA f83 = f90, f85, f83
  2103. nop __LINE__
  2104. }
  2105. { .mfi
  2106. STFD [C4 ] = f117, -3 * SIZE
  2107. FNMA f115 = f90, f117, f115
  2108. nop __LINE__
  2109. }
  2110. ;;
  2111. { .mfi
  2112. nop __LINE__
  2113. FNMA f64 = f92, f68, f64
  2114. nop __LINE__
  2115. }
  2116. { .mfi
  2117. nop __LINE__
  2118. FNMA f96 = f92, f100, f96
  2119. nop __LINE__
  2120. }
  2121. ;;
  2122. { .mfi
  2123. nop __LINE__
  2124. FMA_A f65 = f93, f68, f65
  2125. nop __LINE__
  2126. }
  2127. { .mfi
  2128. nop __LINE__
  2129. FMA_A f97 = f93, f100, f97
  2130. nop __LINE__
  2131. }
  2132. ;;
  2133. { .mfi
  2134. nop __LINE__
  2135. FNMA f80 = f92, f84, f80
  2136. nop __LINE__
  2137. }
  2138. { .mfi
  2139. nop __LINE__
  2140. FNMA f112 = f92, f116, f112
  2141. nop __LINE__
  2142. }
  2143. ;;
  2144. { .mfi
  2145. nop __LINE__
  2146. FMA_A f81 = f93, f84, f81
  2147. nop __LINE__
  2148. }
  2149. { .mfi
  2150. nop __LINE__
  2151. FMA_A f113 = f93, f116, f113
  2152. nop __LINE__
  2153. }
  2154. ;;
  2155. { .mfi
  2156. nop __LINE__
  2157. FMA_B f64 = f93, f69, f64
  2158. nop __LINE__
  2159. }
  2160. { .mfi
  2161. nop __LINE__
  2162. FMA_B f96 = f93, f101, f96
  2163. nop __LINE__
  2164. }
  2165. ;;
  2166. { .mfi
  2167. nop __LINE__
  2168. FNMA f65 = f92, f69, f65
  2169. nop __LINE__
  2170. }
  2171. { .mfi
  2172. nop __LINE__
  2173. FNMA f97 = f92, f101, f97
  2174. nop __LINE__
  2175. }
  2176. ;;
  2177. { .mfi
  2178. nop __LINE__
  2179. FMA_B f80 = f93, f85, f80
  2180. nop __LINE__
  2181. }
  2182. { .mfi
  2183. nop __LINE__
  2184. FMA_B f112 = f93, f117, f112
  2185. nop __LINE__
  2186. }
  2187. ;;
  2188. { .mfi
  2189. nop __LINE__
  2190. FNMA f81 = f92, f85, f81
  2191. nop __LINE__
  2192. }
  2193. { .mfi
  2194. nop __LINE__
  2195. FNMA f113 = f92, f117, f113
  2196. nop __LINE__
  2197. }
  2198. ;;
  2199. { .mfi
  2200. nop __LINE__
  2201. FMPY f32 = f104, f66
  2202. nop __LINE__
  2203. }
  2204. { .mfi
  2205. nop __LINE__
  2206. FMPY f36 = f104, f98
  2207. nop __LINE__
  2208. }
  2209. ;;
  2210. { .mfi
  2211. nop __LINE__
  2212. FMPY f33 = f105, f66
  2213. nop __LINE__
  2214. }
  2215. { .mfi
  2216. nop __LINE__
  2217. FMPY f37 = f105, f98
  2218. nop __LINE__
  2219. }
  2220. ;;
  2221. { .mfi
  2222. nop __LINE__
  2223. FMPY f34 = f104, f82
  2224. nop __LINE__
  2225. }
  2226. { .mfi
  2227. nop __LINE__
  2228. FMPY f38 = f104, f114
  2229. nop __LINE__
  2230. }
  2231. ;;
  2232. { .mfi
  2233. nop __LINE__
  2234. FMPY f35 = f105, f82
  2235. nop __LINE__
  2236. }
  2237. { .mfi
  2238. nop __LINE__
  2239. FMPY f39 = f105, f114
  2240. nop __LINE__
  2241. }
  2242. ;;
  2243. { .mfi
  2244. nop __LINE__
  2245. FMA_C f66 = f105, f67, f32
  2246. nop __LINE__
  2247. }
  2248. { .mfi
  2249. nop __LINE__
  2250. FMA_C f98 = f105, f99, f36
  2251. nop __LINE__
  2252. }
  2253. ;;
  2254. { .mfi
  2255. nop __LINE__
  2256. FMA_D f67 = f104, f67, f33
  2257. nop __LINE__
  2258. }
  2259. { .mfi
  2260. nop __LINE__
  2261. FMA_D f99 = f104, f99, f37
  2262. nop __LINE__
  2263. }
  2264. ;;
  2265. { .mfi
  2266. nop __LINE__
  2267. FMA_C f82 = f105, f83, f34
  2268. nop __LINE__
  2269. }
  2270. { .mfi
  2271. nop __LINE__
  2272. FMA_C f114 = f105, f115, f38
  2273. nop __LINE__
  2274. }
  2275. ;;
  2276. { .mfi
  2277. nop __LINE__
  2278. FMA_D f83 = f104, f83, f35
  2279. nop __LINE__
  2280. }
  2281. { .mfi
  2282. nop __LINE__
  2283. FMA_D f115 = f104, f115, f39
  2284. nop __LINE__
  2285. }
  2286. ;;
  2287. { .mfi
  2288. STFD [BOFFSET] = f66, SIZE
  2289. FNMA f64 = f106, f66, f64
  2290. nop __LINE__
  2291. }
  2292. { .mfi
  2293. STFD [BOFFSET2] = f98, SIZE
  2294. FNMA f96 = f106, f98, f96
  2295. nop __LINE__
  2296. }
  2297. ;;
  2298. { .mfi
  2299. STFD [BOFFSET] = f67, SIZE
  2300. FMA_A f65 = f107, f66, f65
  2301. nop __LINE__
  2302. }
  2303. { .mfi
  2304. STFD [BOFFSET2] = f99, SIZE
  2305. FMA_A f97 = f107, f98, f97
  2306. nop __LINE__
  2307. }
  2308. ;;
  2309. { .mfi
  2310. STFD [BOFFSET] = f82, SIZE
  2311. FNMA f80 = f106, f82, f80
  2312. nop __LINE__
  2313. }
  2314. { .mfi
  2315. STFD [BOFFSET2] = f114, SIZE
  2316. FNMA f112 = f106, f114, f112
  2317. nop __LINE__
  2318. }
  2319. ;;
  2320. { .mfi
  2321. STFD [BOFFSET] = f83, -11 * SIZE
  2322. FMA_A f81 = f107, f82, f81
  2323. nop __LINE__
  2324. }
  2325. { .mfi
  2326. STFD [BOFFSET2] = f115, -11 * SIZE
  2327. FMA_A f113 = f107, f114, f113
  2328. nop __LINE__
  2329. }
  2330. ;;
  2331. { .mfi
  2332. STFD [C1 ] = f66, SIZE
  2333. FMA_B f64 = f107, f67, f64
  2334. nop __LINE__
  2335. }
  2336. { .mfi
  2337. STFD [C3 ] = f98, SIZE
  2338. FMA_B f96 = f107, f99, f96
  2339. nop __LINE__
  2340. }
  2341. ;;
  2342. { .mfi
  2343. STFD [C1 ] = f67, -3 * SIZE
  2344. FNMA f65 = f106, f67, f65
  2345. nop __LINE__
  2346. }
  2347. { .mfi
  2348. STFD [C3 ] = f99, -3 * SIZE
  2349. FNMA f97 = f106, f99, f97
  2350. nop __LINE__
  2351. }
  2352. ;;
  2353. { .mfi
  2354. STFD [C2 ] = f82, SIZE
  2355. FMA_B f80 = f107, f83, f80
  2356. nop __LINE__
  2357. }
  2358. { .mfi
  2359. STFD [C4 ] = f114, SIZE
  2360. FMA_B f112 = f107, f115, f112
  2361. nop __LINE__
  2362. }
  2363. ;;
  2364. { .mfi
  2365. STFD [C2 ] = f83, -3 * SIZE
  2366. FNMA f81 = f106, f83, f81
  2367. nop __LINE__
  2368. }
  2369. { .mfi
  2370. STFD [C4 ] = f115, -3 * SIZE
  2371. FNMA f113 = f106, f115, f113
  2372. nop __LINE__
  2373. }
  2374. ;;
  2375. { .mfi
  2376. nop __LINE__
  2377. FMPY f32 = f120, f64
  2378. nop __LINE__
  2379. }
  2380. { .mfi
  2381. nop __LINE__
  2382. FMPY f36 = f120, f96
  2383. nop __LINE__
  2384. }
  2385. ;;
  2386. { .mfi
  2387. nop __LINE__
  2388. FMPY f33 = f121, f64
  2389. nop __LINE__
  2390. }
  2391. { .mfi
  2392. nop __LINE__
  2393. FMPY f37 = f121, f96
  2394. nop __LINE__
  2395. }
  2396. ;;
  2397. { .mfi
  2398. nop __LINE__
  2399. FMPY f34 = f120, f80
  2400. nop __LINE__
  2401. }
  2402. { .mfi
  2403. nop __LINE__
  2404. FMPY f38 = f120, f112
  2405. nop __LINE__
  2406. }
  2407. ;;
  2408. { .mfi
  2409. nop __LINE__
  2410. FMPY f35 = f121, f80
  2411. nop __LINE__
  2412. }
  2413. { .mfi
  2414. nop __LINE__
  2415. FMPY f39 = f121, f112
  2416. nop __LINE__
  2417. }
  2418. ;;
  2419. { .mfi
  2420. nop __LINE__
  2421. FMA_C f64 = f121, f65, f32
  2422. nop __LINE__
  2423. }
  2424. { .mfi
  2425. nop __LINE__
  2426. FMA_C f96 = f121, f97, f36
  2427. nop __LINE__
  2428. }
  2429. ;;
  2430. { .mfi
  2431. nop __LINE__
  2432. FMA_D f65 = f120, f65, f33
  2433. nop __LINE__
  2434. }
  2435. { .mfi
  2436. nop __LINE__
  2437. FMA_D f97 = f120, f97, f37
  2438. nop __LINE__
  2439. }
  2440. ;;
  2441. { .mfi
  2442. nop __LINE__
  2443. FMA_C f80 = f121, f81, f34
  2444. nop __LINE__
  2445. }
  2446. { .mfi
  2447. nop __LINE__
  2448. FMA_C f112 = f121, f113, f38
  2449. nop __LINE__
  2450. }
  2451. ;;
  2452. { .mfi
  2453. nop __LINE__
  2454. FMA_D f81 = f120, f81, f35
  2455. nop __LINE__
  2456. }
  2457. { .mfi
  2458. nop __LINE__
  2459. FMA_D f113 = f120, f113, f39
  2460. nop __LINE__
  2461. }
  2462. ;;
  2463. { .mmi
  2464. STFD [BOFFSET] = f64, SIZE /* final group of the LN section: f64,f65,f80,f81 go out through BOFFSET, f96,f97,f112,f113 through BOFFSET2 */
  2465. STFD [BOFFSET2] = f96, SIZE
  2466. nop __LINE__
  2467. }
  2468. ;;
  2469. { .mmi
  2470. STFD [BOFFSET] = f65, SIZE
  2471. STFD [BOFFSET2] = f97, SIZE
  2472. nop __LINE__
  2473. }
  2474. ;;
  2475. { .mmi
  2476. STFD [BOFFSET] = f80, SIZE
  2477. STFD [BOFFSET2] = f112, SIZE
  2478. nop __LINE__
  2479. }
  2480. ;;
  2481. { .mmi
  2482. STFD [BOFFSET] = f81, -3 * SIZE /* three +SIZE steps then -3*SIZE: both pointers end where this four-store group started */
  2483. STFD [BOFFSET2] = f113, -3 * SIZE
  2484. nop __LINE__
  2485. }
  2486. ;;
  2487. { .mfi
  2488. STFD [C1 ] = f64, SIZE /* the same eight values are also stored through C1-C4 */
  2489. mov f64 = f0 /* clear the accumulator in the same group as its store (f0 is the IA-64 constant-zero FP register) */
  2490. nop __LINE__
  2491. }
  2492. { .mfi
  2493. STFD [C3 ] = f96, SIZE
  2494. mov f96 = f0
  2495. nop __LINE__
  2496. }
  2497. ;;
  2498. { .mfi
  2499. STFD [C1 ] = f65, -1 * SIZE /* +SIZE then -SIZE: C1 (and C3 below) end this pair of stores unchanged */
  2500. mov f65 = f0
  2501. adds KK = -4, KK /* KK drops by 4 per pass through this section */
  2502. }
  2503. { .mfi
  2504. STFD [C3 ] = f97, -1 * SIZE
  2505. mov f97 = f0
  2506. nop __LINE__
  2507. }
  2508. ;;
  2509. { .mfi
  2510. STFD [C2 ] = f80, SIZE
  2511. mov f80 = f0
  2512. cmp.ne p6, p0 = 1, I /* p6 = (I != 1), evaluated before I is decremented in the next group */
  2513. }
  2514. { .mfi
  2515. STFD [C4 ] = f112, SIZE
  2516. mov f112 = f0
  2517. sub L = K, KK /* uses the KK already reduced by 4 (a stop separates the two instructions) */
  2518. }
  2519. ;;
  2520. { .mfi
  2521. STFD [C2 ] = f81, -1 * SIZE
  2522. mov f81 = f0
  2523. adds I = -1, I
  2524. }
  2525. { .mfb
  2526. STFD [C4 ] = f113, -1 * SIZE
  2527. mov f113 = f0 /* only f64,f65,f80,f81,f96,f97,f112,f113 are cleared in this section */
  2528. (p6) br.cond.dptk .L011 /* loop back while I was not 1 before the decrement; .L011 is defined outside this view */
  2529. }
  2530. ;;
  2531. #endif /* end of the LN-only section */
  2532. #ifdef LT
  2533. { .mfi
  2534. LDFPD f76, f77 = [AOFFSET], 2 * SIZE
  2535. FMPY f32 = f72, f64
  2536. nop __LINE__
  2537. }
  2538. { .mfi
  2539. nop __LINE__
  2540. FMPY f36 = f72, f96
  2541. nop __LINE__
  2542. }
  2543. ;;
  2544. { .mfi
  2545. LDFPD f78, f79 = [AOFFSET]
  2546. FMPY f33 = f73, f64
  2547. adds AOFFSET = 4 * SIZE, AOFFSET
  2548. }
  2549. { .mfi
  2550. nop __LINE__
  2551. FMPY f37 = f73, f96
  2552. nop __LINE__
  2553. }
  2554. ;;
  2555. { .mfi
  2556. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  2557. FMPY f34 = f72, f80
  2558. nop __LINE__
  2559. }
  2560. { .mfi
  2561. nop __LINE__
  2562. FMPY f38 = f72, f112
  2563. nop __LINE__
  2564. }
  2565. ;;
  2566. { .mfi
  2567. LDFPD f92, f93 = [AOFFSET], 2 * SIZE
  2568. FMPY f35 = f73, f80
  2569. nop __LINE__
  2570. }
  2571. { .mfi
  2572. nop __LINE__
  2573. FMPY f39 = f73, f112
  2574. nop __LINE__
  2575. }
  2576. ;;
  2577. { .mfi
  2578. LDFPD f94, f95 = [AOFFSET]
  2579. FMA_C f64 = f73, f65, f32
  2580. adds AOFFSET = 6 * SIZE, AOFFSET
  2581. }
  2582. { .mfi
  2583. nop __LINE__
  2584. FMA_C f96 = f73, f97, f36
  2585. nop __LINE__
  2586. }
  2587. ;;
  2588. { .mfi
  2589. LDFPD f108, f109 = [AOFFSET], 2 * SIZE
  2590. FMA_D f65 = f72, f65, f33
  2591. nop __LINE__
  2592. }
  2593. { .mfi
  2594. nop __LINE__
  2595. FMA_D f97 = f72, f97, f37
  2596. nop __LINE__
  2597. }
  2598. ;;
  2599. { .mfi
  2600. LDFPD f110, f111 = [AOFFSET]
  2601. FMA_C f80 = f73, f81, f34
  2602. adds AOFFSET = 8 * SIZE, AOFFSET
  2603. }
  2604. { .mfi
  2605. nop __LINE__
  2606. FMA_C f112 = f73, f113, f38
  2607. nop __LINE__
  2608. }
  2609. ;;
  2610. { .mfi
  2611. LDFPD f126, f127 = [AOFFSET]
  2612. FMA_D f81 = f72, f81, f35
  2613. adds AOFFSET = - 30 * SIZE, AOFFSET
  2614. }
  2615. { .mfi
  2616. nop __LINE__
  2617. FMA_D f113 = f72, f113, f39
  2618. adds BOFFSET2 = 4 * SIZE, BOFFSET
  2619. }
  2620. ;;
  2621. { .mfi
  2622. STFD [BOFFSET] = f64, SIZE
  2623. FNMA f66 = f74, f64, f66
  2624. nop __LINE__
  2625. }
  2626. { .mfi
  2627. STFD [BOFFSET2] = f96, SIZE
  2628. FNMA f98 = f74, f96, f98
  2629. nop __LINE__
  2630. }
  2631. ;;
  2632. { .mfi
  2633. STFD [BOFFSET] = f65, SIZE
  2634. FMA_A f67 = f75, f64, f67
  2635. nop __LINE__
  2636. }
  2637. { .mfi
  2638. STFD [BOFFSET2] = f97, SIZE
  2639. FMA_A f99 = f75, f96, f99
  2640. nop __LINE__
  2641. }
  2642. ;;
  2643. { .mfi
  2644. STFD [BOFFSET] = f80, SIZE
  2645. FNMA f82 = f74, f80, f82
  2646. nop __LINE__
  2647. }
  2648. { .mfi
  2649. STFD [BOFFSET2] = f112, SIZE
  2650. FNMA f114 = f74, f112, f114
  2651. nop __LINE__
  2652. }
  2653. ;;
  2654. { .mfi
  2655. STFD [BOFFSET] = f81, 5 * SIZE
  2656. FMA_A f83 = f75, f80, f83
  2657. nop __LINE__
  2658. }
  2659. { .mfi
  2660. STFD [BOFFSET2] = f113, 5 * SIZE
  2661. FMA_A f115 = f75, f112, f115
  2662. nop __LINE__
  2663. }
  2664. ;;
  2665. { .mfi
  2666. STFD [C1 ] = f64, SIZE
  2667. FMA_B f66 = f75, f65, f66
  2668. nop __LINE__
  2669. }
  2670. { .mfi
  2671. STFD [C3 ] = f96, SIZE
  2672. FMA_B f98 = f75, f97, f98
  2673. nop __LINE__
  2674. }
  2675. ;;
  2676. { .mfi
  2677. STFD [C1 ] = f65, SIZE
  2678. FNMA f67 = f74, f65, f67
  2679. nop __LINE__
  2680. }
  2681. { .mfi
  2682. STFD [C3 ] = f97, SIZE
  2683. FNMA f99 = f74, f97, f99
  2684. nop __LINE__
  2685. }
  2686. ;;
  2687. { .mfi
  2688. STFD [C2 ] = f80, SIZE
  2689. FMA_B f82 = f75, f81, f82
  2690. nop __LINE__
  2691. }
  2692. { .mfi
  2693. STFD [C4 ] = f112, SIZE
  2694. FMA_B f114 = f75, f113, f114
  2695. nop __LINE__
  2696. }
  2697. ;;
  2698. { .mfi
  2699. STFD [C2 ] = f81, SIZE
  2700. FNMA f83 = f74, f81, f83
  2701. nop __LINE__
  2702. }
  2703. { .mfi
  2704. STFD [C4 ] = f113, SIZE
  2705. FNMA f115 = f74, f113, f115
  2706. nop __LINE__
  2707. }
  2708. ;;
  2709. { .mfi
  2710. nop __LINE__
  2711. FNMA f68 = f76, f64, f68
  2712. nop __LINE__
  2713. }
  2714. { .mfi
  2715. nop __LINE__
  2716. FNMA f100 = f76, f96, f100
  2717. nop __LINE__
  2718. }
  2719. ;;
  2720. { .mfi
  2721. nop __LINE__
  2722. FMA_A f69 = f77, f64, f69
  2723. nop __LINE__
  2724. }
  2725. { .mfi
  2726. nop __LINE__
  2727. FMA_A f101 = f77, f96, f101
  2728. nop __LINE__
  2729. }
  2730. ;;
  2731. { .mfi
  2732. nop __LINE__
  2733. FNMA f84 = f76, f80, f84
  2734. nop __LINE__
  2735. }
  2736. { .mfi
  2737. nop __LINE__
  2738. FNMA f116 = f76, f112, f116
  2739. nop __LINE__
  2740. }
  2741. ;;
  2742. { .mfi
  2743. nop __LINE__
  2744. FMA_A f85 = f77, f80, f85
  2745. nop __LINE__
  2746. }
  2747. { .mfi
  2748. nop __LINE__
  2749. FMA_A f117 = f77, f112, f117
  2750. nop __LINE__
  2751. }
  2752. ;;
  2753. { .mfi
  2754. nop __LINE__
  2755. FMA_B f68 = f77, f65, f68
  2756. nop __LINE__
  2757. }
  2758. { .mfi
  2759. nop __LINE__
  2760. FMA_B f100 = f77, f97, f100
  2761. nop __LINE__
  2762. }
  2763. ;;
  2764. { .mfi
  2765. nop __LINE__
  2766. FNMA f69 = f76, f65, f69
  2767. nop __LINE__
  2768. }
  2769. { .mfi
  2770. nop __LINE__
  2771. FNMA f101 = f76, f97, f101
  2772. nop __LINE__
  2773. }
  2774. ;;
  2775. { .mfi
  2776. nop __LINE__
  2777. FMA_B f84 = f77, f81, f84
  2778. nop __LINE__
  2779. }
  2780. { .mfi
  2781. nop __LINE__
  2782. FMA_B f116 = f77, f113, f116
  2783. nop __LINE__
  2784. }
  2785. ;;
  2786. { .mfi
  2787. nop __LINE__
  2788. FNMA f85 = f76, f81, f85
  2789. nop __LINE__
  2790. }
  2791. { .mfi
  2792. nop __LINE__
  2793. FNMA f117 = f76, f113, f117
  2794. nop __LINE__
  2795. }
  2796. ;;
  2797. { .mfi
  2798. nop __LINE__
  2799. FNMA f70 = f78, f64, f70
  2800. nop __LINE__
  2801. }
  2802. { .mfi
  2803. nop __LINE__
  2804. FNMA f102 = f78, f96, f102
  2805. nop __LINE__
  2806. }
  2807. ;;
  2808. { .mfi
  2809. nop __LINE__
  2810. FMA_A f71 = f79, f64, f71
  2811. nop __LINE__
  2812. }
  2813. { .mfi
  2814. nop __LINE__
  2815. FMA_A f103 = f79, f96, f103
  2816. nop __LINE__
  2817. }
  2818. ;;
  2819. { .mfi
  2820. nop __LINE__
  2821. FNMA f86 = f78, f80, f86
  2822. nop __LINE__
  2823. }
  2824. { .mfi
  2825. nop __LINE__
  2826. FNMA f118 = f78, f112, f118
  2827. nop __LINE__
  2828. }
  2829. ;;
  2830. { .mfi
  2831. nop __LINE__
  2832. FMA_A f87 = f79, f80, f87
  2833. nop __LINE__
  2834. }
  2835. { .mfi
  2836. nop __LINE__
  2837. FMA_A f119 = f79, f112, f119
  2838. nop __LINE__
  2839. }
  2840. ;;
  2841. { .mfi
  2842. nop __LINE__
  2843. FMA_B f70 = f79, f65, f70
  2844. nop __LINE__
  2845. }
  2846. { .mfi
  2847. nop __LINE__
  2848. FMA_B f102 = f79, f97, f102
  2849. nop __LINE__
  2850. }
  2851. ;;
  2852. { .mfi
  2853. nop __LINE__
  2854. FNMA f71 = f78, f65, f71
  2855. nop __LINE__
  2856. }
  2857. { .mfi
  2858. nop __LINE__
  2859. FNMA f103 = f78, f97, f103
  2860. nop __LINE__
  2861. }
  2862. ;;
  2863. { .mfi
  2864. nop __LINE__
  2865. FMA_B f86 = f79, f81, f86
  2866. nop __LINE__
  2867. }
  2868. { .mfi
  2869. nop __LINE__
  2870. FMA_B f118 = f79, f113, f118
  2871. nop __LINE__
  2872. }
  2873. ;;
  2874. { .mfi
  2875. nop __LINE__
  2876. FNMA f87 = f78, f81, f87
  2877. nop __LINE__
  2878. }
  2879. { .mfi
  2880. nop __LINE__
  2881. FNMA f119 = f78, f113, f119
  2882. nop __LINE__
  2883. }
  2884. ;;
  2885. { .mfi
  2886. nop __LINE__
  2887. FMPY f32 = f90, f66
  2888. nop __LINE__
  2889. }
  2890. { .mfi
  2891. nop __LINE__
  2892. FMPY f36 = f90, f98
  2893. nop __LINE__
  2894. }
  2895. ;;
  2896. { .mfi
  2897. nop __LINE__
  2898. FMPY f33 = f91, f66
  2899. nop __LINE__
  2900. }
  2901. { .mfi
  2902. nop __LINE__
  2903. FMPY f37 = f91, f98
  2904. nop __LINE__
  2905. }
  2906. ;;
  2907. { .mfi
  2908. nop __LINE__
  2909. FMPY f34 = f90, f82
  2910. nop __LINE__
  2911. }
  2912. { .mfi
  2913. nop __LINE__
  2914. FMPY f38 = f90, f114
  2915. nop __LINE__
  2916. }
  2917. ;;
  2918. { .mfi
  2919. nop __LINE__
  2920. FMPY f35 = f91, f82
  2921. nop __LINE__
  2922. }
  2923. { .mfi
  2924. nop __LINE__
  2925. FMPY f39 = f91, f114
  2926. nop __LINE__
  2927. }
  2928. ;;
  2929. { .mfi
  2930. nop __LINE__
  2931. FMA_C f66 = f91, f67, f32
  2932. nop __LINE__
  2933. }
  2934. { .mfi
  2935. nop __LINE__
  2936. FMA_C f98 = f91, f99, f36
  2937. nop __LINE__
  2938. }
  2939. ;;
  2940. { .mfi
  2941. nop __LINE__
  2942. FMA_D f67 = f90, f67, f33
  2943. nop __LINE__
  2944. }
  2945. { .mfi
  2946. nop __LINE__
  2947. FMA_D f99 = f90, f99, f37
  2948. nop __LINE__
  2949. }
  2950. ;;
  2951. { .mfi
  2952. nop __LINE__
  2953. FMA_C f82 = f91, f83, f34
  2954. nop __LINE__
  2955. }
  2956. { .mfi
  2957. nop __LINE__
  2958. FMA_C f114 = f91, f115, f38
  2959. nop __LINE__
  2960. }
  2961. ;;
  2962. { .mfi
  2963. nop __LINE__
  2964. FMA_D f83 = f90, f83, f35
  2965. nop __LINE__
  2966. }
  2967. { .mfi
  2968. nop __LINE__
  2969. FMA_D f115 = f90, f115, f39
  2970. nop __LINE__
  2971. }
  2972. ;;
  2973. { .mfi
  2974. STFD [BOFFSET] = f66, SIZE
  2975. FNMA f68 = f92, f66, f68
  2976. nop __LINE__
  2977. }
  2978. { .mfi
  2979. STFD [BOFFSET2] = f98, SIZE
  2980. FNMA f100 = f92, f98, f100
  2981. nop __LINE__
  2982. }
  2983. ;;
  2984. { .mfi
  2985. STFD [BOFFSET] = f67, SIZE
  2986. FMA_A f69 = f93, f66, f69
  2987. nop __LINE__
  2988. }
  2989. { .mfi
  2990. STFD [BOFFSET2] = f99, SIZE
  2991. FMA_A f101 = f93, f98, f101
  2992. nop __LINE__
  2993. }
  2994. ;;
  2995. { .mfi
  2996. STFD [BOFFSET] = f82, SIZE
  2997. FNMA f84 = f92, f82, f84
  2998. nop __LINE__
  2999. }
  3000. { .mfi
  3001. STFD [BOFFSET2] = f114, SIZE
  3002. FNMA f116 = f92, f114, f116
  3003. nop __LINE__
  3004. }
  3005. ;;
  3006. { .mfi
  3007. STFD [BOFFSET] = f83, 5 * SIZE
  3008. FMA_A f85 = f93, f82, f85
  3009. nop __LINE__
  3010. }
  3011. { .mfi
  3012. STFD [BOFFSET2] = f115, 5 * SIZE
  3013. FMA_A f117 = f93, f114, f117
  3014. nop __LINE__
  3015. }
  3016. ;;
  3017. { .mfi
  3018. STFD [C1 ] = f66, SIZE
  3019. FMA_B f68 = f93, f67, f68
  3020. nop __LINE__
  3021. }
  3022. { .mfi
  3023. STFD [C3 ] = f98, SIZE
  3024. FMA_B f100 = f93, f99, f100
  3025. nop __LINE__
  3026. }
  3027. ;;
  3028. { .mfi
  3029. STFD [C1 ] = f67, SIZE
  3030. FNMA f69 = f92, f67, f69
  3031. nop __LINE__
  3032. }
  3033. { .mfi
  3034. STFD [C3 ] = f99, SIZE
  3035. FNMA f101 = f92, f99, f101
  3036. nop __LINE__
  3037. }
  3038. ;;
  3039. { .mfi
  3040. STFD [C2 ] = f82, SIZE
  3041. FMA_B f84 = f93, f83, f84
  3042. nop __LINE__
  3043. }
  3044. { .mfi
  3045. STFD [C4 ] = f114, SIZE
  3046. FMA_B f116 = f93, f115, f116
  3047. nop __LINE__
  3048. }
  3049. ;;
  3050. { .mfi
  3051. STFD [C2 ] = f83, SIZE
  3052. FNMA f85 = f92, f83, f85
  3053. nop __LINE__
  3054. }
  3055. { .mfi
  3056. STFD [C4 ] = f115, SIZE
  3057. FNMA f117 = f92, f115, f117
  3058. nop __LINE__
  3059. }
  3060. ;;
  3061. { .mfi
  3062. nop __LINE__
  3063. FNMA f70 = f94, f66, f70
  3064. nop __LINE__
  3065. }
  3066. { .mfi
  3067. nop __LINE__
  3068. FNMA f102 = f94, f98, f102
  3069. nop __LINE__
  3070. }
  3071. ;;
  3072. { .mfi
  3073. nop __LINE__
  3074. FMA_A f71 = f95, f66, f71
  3075. nop __LINE__
  3076. }
  3077. { .mfi
  3078. nop __LINE__
  3079. FMA_A f103 = f95, f98, f103
  3080. nop __LINE__
  3081. }
  3082. ;;
  3083. { .mfi
  3084. nop __LINE__
  3085. FNMA f86 = f94, f82, f86
  3086. nop __LINE__
  3087. }
  3088. { .mfi
  3089. nop __LINE__
  3090. FNMA f118 = f94, f114, f118
  3091. nop __LINE__
  3092. }
  3093. ;;
  3094. { .mfi
  3095. nop __LINE__
  3096. FMA_A f87 = f95, f82, f87
  3097. nop __LINE__
  3098. }
  3099. { .mfi
  3100. nop __LINE__
  3101. FMA_A f119 = f95, f114, f119
  3102. nop __LINE__
  3103. }
  3104. ;;
  3105. { .mfi
  3106. nop __LINE__
  3107. FMA_B f70 = f95, f67, f70
  3108. nop __LINE__
  3109. }
  3110. { .mfi
  3111. nop __LINE__
  3112. FMA_B f102 = f95, f99, f102
  3113. nop __LINE__
  3114. }
  3115. ;;
  3116. { .mfi
  3117. nop __LINE__
  3118. FNMA f71 = f94, f67, f71
  3119. nop __LINE__
  3120. }
  3121. { .mfi
  3122. nop __LINE__
  3123. FNMA f103 = f94, f99, f103
  3124. nop __LINE__
  3125. }
  3126. ;;
  3127. { .mfi
  3128. nop __LINE__
  3129. FMA_B f86 = f95, f83, f86
  3130. nop __LINE__
  3131. }
  3132. { .mfi
  3133. nop __LINE__
  3134. FMA_B f118 = f95, f115, f118
  3135. nop __LINE__
  3136. }
  3137. ;;
  3138. { .mfi
  3139. nop __LINE__
  3140. FNMA f87 = f94, f83, f87
  3141. nop __LINE__
  3142. }
  3143. { .mfi
  3144. nop __LINE__
  3145. FNMA f119 = f94, f115, f119
  3146. nop __LINE__
  3147. }
  3148. ;;
  3149. { .mfi
  3150. nop __LINE__
  3151. FMPY f32 = f108, f68
  3152. nop __LINE__
  3153. }
  3154. { .mfi
  3155. nop __LINE__
  3156. FMPY f36 = f108, f100
  3157. nop __LINE__
  3158. }
  3159. { .mfi
  3160. nop __LINE__
  3161. FMPY f33 = f109, f68
  3162. nop __LINE__
  3163. }
  3164. { .mfi
  3165. nop __LINE__
  3166. FMPY f37 = f109, f100
  3167. nop __LINE__
  3168. }
  3169. { .mfi
  3170. nop __LINE__
  3171. FMPY f34 = f108, f84
  3172. nop __LINE__
  3173. }
  3174. { .mfi
  3175. nop __LINE__
  3176. FMPY f38 = f108, f116
  3177. nop __LINE__
  3178. }
  3179. { .mfi
  3180. nop __LINE__
  3181. FMPY f35 = f109, f84
  3182. nop __LINE__
  3183. }
  3184. { .mfi
  3185. nop __LINE__
  3186. FMPY f39 = f109, f116
  3187. nop __LINE__
  3188. }
  3189. ;;
  3190. { .mfi
  3191. nop __LINE__
  3192. FMA_C f68 = f109, f69, f32
  3193. nop __LINE__
  3194. }
  3195. { .mfi
  3196. nop __LINE__
  3197. FMA_C f100 = f109, f101, f36
  3198. nop __LINE__
  3199. }
  3200. { .mfi
  3201. nop __LINE__
  3202. FMA_D f69 = f108, f69, f33
  3203. nop __LINE__
  3204. }
  3205. { .mfi
  3206. nop __LINE__
  3207. FMA_D f101 = f108, f101, f37
  3208. nop __LINE__
  3209. }
  3210. { .mfi
  3211. nop __LINE__
  3212. FMA_C f84 = f109, f85, f34
  3213. nop __LINE__
  3214. }
  3215. { .mfi
  3216. nop __LINE__
  3217. FMA_C f116 = f109, f117, f38
  3218. nop __LINE__
  3219. }
  3220. { .mfi
  3221. nop __LINE__
  3222. FMA_D f85 = f108, f85, f35
  3223. nop __LINE__
  3224. }
  3225. { .mfi
  3226. nop __LINE__
  3227. FMA_D f117 = f108, f117, f39
  3228. nop __LINE__
  3229. }
  3230. ;;
  3231. { .mfi
  3232. STFD [BOFFSET] = f68, SIZE
  3233. FNMA f70 = f110, f68, f70
  3234. nop __LINE__
  3235. }
  3236. { .mfi
  3237. STFD [BOFFSET2] = f100, SIZE
  3238. FNMA f102 = f110, f100, f102
  3239. nop __LINE__
  3240. }
  3241. ;;
  3242. { .mfi
  3243. STFD [BOFFSET] = f69, SIZE
  3244. FMA_A f71 = f111, f68, f71
  3245. nop __LINE__
  3246. }
  3247. { .mfi
  3248. STFD [BOFFSET2] = f101, SIZE
  3249. FMA_A f103 = f111, f100, f103
  3250. nop __LINE__
  3251. }
  3252. ;;
  3253. { .mfi
  3254. STFD [BOFFSET] = f84, SIZE
  3255. FNMA f86 = f110, f84, f86
  3256. nop __LINE__
  3257. }
  3258. { .mfi
  3259. STFD [BOFFSET2] = f116, SIZE
  3260. FNMA f118 = f110, f116, f118
  3261. nop __LINE__
  3262. }
  3263. ;;
  3264. { .mfi
  3265. STFD [BOFFSET] = f85, 5 * SIZE
  3266. FMA_A f87 = f111, f84, f87
  3267. nop __LINE__
  3268. }
  3269. { .mfi
  3270. STFD [BOFFSET2] = f117, 5 * SIZE
  3271. FMA_A f119 = f111, f116, f119
  3272. nop __LINE__
  3273. }
  3274. ;;
  3275. { .mfi
  3276. STFD [C1 ] = f68, SIZE
  3277. FMA_B f70 = f111, f69, f70
  3278. nop __LINE__
  3279. }
  3280. { .mfi
  3281. STFD [C3 ] = f100, SIZE
  3282. FMA_B f102 = f111, f101, f102
  3283. nop __LINE__
  3284. }
  3285. ;;
  3286. { .mfi
  3287. STFD [C1 ] = f69, SIZE
  3288. FNMA f71 = f110, f69, f71
  3289. nop __LINE__
  3290. }
  3291. { .mfi
  3292. STFD [C3 ] = f101, SIZE
  3293. FNMA f103 = f110, f101, f103
  3294. nop __LINE__
  3295. }
  3296. ;;
  3297. { .mfi
  3298. STFD [C2 ] = f84, SIZE
  3299. FMA_B f86 = f111, f85, f86
  3300. nop __LINE__
  3301. }
  3302. { .mfi
  3303. STFD [C4 ] = f116, SIZE
  3304. FMA_B f118 = f111, f117, f118
  3305. nop __LINE__
  3306. }
  3307. ;;
  3308. { .mfi
  3309. STFD [C2 ] = f85, SIZE
  3310. FNMA f87 = f110, f85, f87
  3311. nop __LINE__
  3312. }
  3313. { .mfi
  3314. STFD [C4 ] = f117, SIZE
  3315. FNMA f119 = f110, f117, f119
  3316. nop __LINE__
  3317. }
  3318. ;;
  3319. { .mfi
  3320. nop __LINE__
  3321. FMPY f32 = f126, f70
  3322. nop __LINE__
  3323. }
  3324. { .mfi
  3325. nop __LINE__
  3326. FMPY f36 = f126, f102
  3327. nop __LINE__
  3328. }
  3329. ;;
  3330. { .mfi
  3331. nop __LINE__
  3332. FMPY f33 = f127, f70
  3333. nop __LINE__
  3334. }
  3335. { .mfi
  3336. nop __LINE__
  3337. FMPY f37 = f127, f102
  3338. nop __LINE__
  3339. }
  3340. ;;
  3341. { .mfi
  3342. nop __LINE__
  3343. FMPY f34 = f126, f86
  3344. nop __LINE__
  3345. }
  3346. { .mfi
  3347. nop __LINE__
  3348. FMPY f38 = f126, f118
  3349. nop __LINE__
  3350. }
  3351. ;;
  3352. { .mfi
  3353. nop __LINE__
  3354. FMPY f35 = f127, f86
  3355. nop __LINE__
  3356. }
  3357. { .mfi
  3358. nop __LINE__
  3359. FMPY f39 = f127, f118
  3360. nop __LINE__
  3361. }
  3362. ;;
  3363. { .mfi
  3364. nop __LINE__
  3365. FMA_C f70 = f127, f71, f32
  3366. nop __LINE__
  3367. }
  3368. { .mfi
  3369. nop __LINE__
  3370. FMA_C f102 = f127, f103, f36
  3371. nop __LINE__
  3372. }
  3373. ;;
  3374. { .mfi
  3375. nop __LINE__
  3376. FMA_D f71 = f126, f71, f33
  3377. nop __LINE__
  3378. }
  3379. { .mfi
  3380. nop __LINE__
  3381. FMA_D f103 = f126, f103, f37
  3382. nop __LINE__
  3383. }
  3384. ;;
  3385. { .mfi
  3386. nop __LINE__
  3387. FMA_C f86 = f127, f87, f34
  3388. nop __LINE__
  3389. }
  3390. { .mfi
  3391. nop __LINE__
  3392. FMA_C f118 = f127, f119, f38
  3393. nop __LINE__
  3394. }
  3395. ;;
  3396. { .mfi
  3397. nop __LINE__
  3398. FMA_D f87 = f126, f87, f35
  3399. nop __LINE__
  3400. }
  3401. { .mfi
  3402. nop __LINE__
  3403. FMA_D f119 = f126, f119, f39
  3404. nop __LINE__
  3405. }
  3406. ;;
  3407. { .mmi
  3408. STFD [BOFFSET] = f70, SIZE
  3409. STFD [BOFFSET2] = f102, SIZE
  3410. nop __LINE__
  3411. }
  3412. ;;
  3413. { .mmi
  3414. STFD [BOFFSET] = f71, SIZE
  3415. STFD [BOFFSET2] = f103, SIZE
  3416. sub r2 = K, KK
  3417. }
  3418. ;;
  3419. { .mmi
  3420. STFD [BOFFSET] = f86, SIZE
  3421. STFD [BOFFSET2] = f118, SIZE
  3422. adds KK = 4, KK
  3423. }
  3424. ;;
  3425. { .mmi
  3426. STFD [BOFFSET] = f87, -27 * SIZE
  3427. STFD [BOFFSET2] = f119
  3428. shladd r2 = r2, ZBASE_SHIFT, r0
  3429. }
  3430. ;;
  3431. { .mfi
  3432. STFD [C1 ] = f70, SIZE
  3433. mov f64 = f0
  3434. shladd AOFFSET = r2, 2, AOFFSET
  3435. }
  3436. { .mfi
  3437. STFD [C3 ] = f102, SIZE
  3438. mov f65 = f0
  3439. shladd BOFFSET = r2, 2, BOFFSET
  3440. }
  3441. ;;
  3442. { .mfi
  3443. STFD [C1 ] = f71, SIZE
  3444. mov f80 = f0
  3445. mov L = KK
  3446. }
  3447. { .mfi
  3448. STFD [C3 ] = f103, SIZE
  3449. mov f81 = f0
  3450. nop __LINE__
  3451. }
  3452. ;;
  3453. { .mfi
  3454. STFD [C2 ] = f86, SIZE
  3455. mov f96 = f0
  3456. cmp.ne p6, p0 = 1, I
  3457. }
  3458. { .mfi
  3459. STFD [C4 ] = f118, SIZE
  3460. mov f97 = f0
  3461. nop __LINE__
  3462. }
  3463. ;;
  3464. { .mfi
  3465. STFD [C2 ] = f87, SIZE
  3466. mov f112 = f0
  3467. adds I = -1, I
  3468. }
  3469. { .mfb
  3470. STFD [C4 ] = f119, SIZE
  3471. mov f113 = f0
  3472. (p6) br.cond.dptk .L011
  3473. }
  3474. ;;
  3475. #endif
  3476. #ifdef RN
  /*
   * RN variant: finish a tile whose results live in f64-f71, f80-f87,
   * f96-f103 and f112-f119 (four groups of eight values).
   *
   * The groups are processed in order.  Each group is first combined with
   * a coefficient pair (FMPY followed by FMA_C / FMA_D), the result is
   * stored through AOFFSET/AOFFSET2 and through the C pointers, and is
   * then folded into every later group (FNMA / FMA_A / FMA_B).
   *
   * Coefficient pairs used:
   *   group f64-f71   : (f72,f73) to combine; (f74,f75) (f76,f77) (f78,f79)
   *                     to update the 2nd, 3rd and 4th groups
   *   group f80-f87   : (f90,f91) to combine; (f92,f93) (f94,f95) to update
   *   group f96-f103  : (f108,f109) to combine; (f110,f111) to update
   *   group f112-f119 : (f126,f127) to combine
   *
   * f72-f75 are not loaded in this section; they are expected to be valid
   * on entry (loaded by code outside this chunk -- TODO confirm).
   *
   * NOTE(review): FMPY, FMA_A..FMA_D, FNMA, LDFPD, STFD, SIZE, ZBASE_SHIFT
   * and the register aliases (AOFFSET, BOFFSET, C1..C8, K, KK, L, I) are
   * defined outside this chunk.  The FMPY + FMA_C/FMA_D pattern looks like
   * a complex multiply by the (presumably pre-inverted) diagonal entry,
   * and the FNMA/FMA_A/FMA_B pattern like a complex multiply-subtract;
   * confirm against the macro definitions.
   */
  /*
   * Load the remaining coefficient pairs from BOFFSET while the first
   * group is being combined with (f72,f73).  The explicit "adds" skips
   * (4, 6 and 8 * SIZE) step over entries that are not read here.  The
   * net BOFFSET change over this load sequence is -4 * SIZE
   * (2+4+2+2+6+2+8-30), presumably undoing the advance made when f72-f75
   * were loaded -- TODO confirm.
   */
  3477. { .mfi
  3478. LDFPD f76, f77 = [BOFFSET], 2 * SIZE
  3479. FMPY f32 = f72, f64
  3480. nop __LINE__
  3481. }
  3482. { .mfi
  3483. nop __LINE__
  3484. FMPY f36 = f72, f68
  3485. nop __LINE__
  3486. }
  3487. ;;
  3488. { .mfi
  3489. LDFPD f78, f79 = [BOFFSET]
  3490. FMPY f33 = f73, f64
  3491. adds BOFFSET = 4 * SIZE, BOFFSET
  3492. }
  3493. { .mfi
  3494. nop __LINE__
  3495. FMPY f37 = f73, f68
  3496. nop __LINE__
  3497. }
  3498. ;;
  3499. { .mfi
  3500. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  3501. FMPY f34 = f72, f66
  3502. nop __LINE__
  3503. }
  3504. { .mfi
  3505. nop __LINE__
  3506. FMPY f38 = f72, f70
  3507. nop __LINE__
  3508. }
  3509. ;;
  3510. { .mfi
  3511. LDFPD f92, f93 = [BOFFSET], 2 * SIZE
  3512. FMPY f35 = f73, f66
  3513. nop __LINE__
  3514. }
  3515. { .mfi
  3516. nop __LINE__
  3517. FMPY f39 = f73, f70
  3518. nop __LINE__
  3519. }
  3520. ;;
  3521. { .mfi
  3522. LDFPD f94, f95 = [BOFFSET]
  3523. FMA_C f64 = f73, f65, f32
  3524. adds BOFFSET = 6 * SIZE, BOFFSET
  3525. }
  3526. { .mfi
  3527. nop __LINE__
  3528. FMA_C f68 = f73, f69, f36
  3529. nop __LINE__
  3530. }
  3531. ;;
  3532. { .mfi
  3533. LDFPD f108, f109 = [BOFFSET], 2 * SIZE
  3534. FMA_D f65 = f72, f65, f33
  3535. nop __LINE__
  3536. }
  3537. { .mfi
  3538. nop __LINE__
  3539. FMA_D f69 = f72, f69, f37
  3540. nop __LINE__
  3541. }
  3542. ;;
  3543. { .mfi
  3544. LDFPD f110, f111 = [BOFFSET]
  3545. FMA_C f66 = f73, f67, f34
  3546. adds BOFFSET = 8 * SIZE, BOFFSET
  3547. }
  3548. { .mfi
  3549. nop __LINE__
  3550. FMA_C f70 = f73, f71, f38
  3551. nop __LINE__
  3552. }
  3553. ;;
  /* Last coefficient load; BOFFSET is rewound by 30 * SIZE and AOFFSET2
     is set up as a second store pointer 4 * SIZE past AOFFSET. */
  3554. { .mfi
  3555. LDFPD f126, f127 = [BOFFSET]
  3556. FMA_D f67 = f72, f67, f35
  3557. adds BOFFSET = - 30 * SIZE, BOFFSET
  3558. }
  3559. { .mfi
  3560. nop __LINE__
  3561. FMA_D f71 = f72, f71, f39
  3562. adds AOFFSET2 = 4 * SIZE, AOFFSET
  3563. }
  3564. ;;
  /*
   * First group (f64-f71) is final.  Store it through AOFFSET (f64-f67)
   * and AOFFSET2 (f68-f71); the 5 * SIZE step on the fourth store moves
   * each pointer 8 * SIZE past where it started.  In the same bundles,
   * fold the group into the second group (f80-f87) using (f74,f75).
   */
  3565. { .mfi
  3566. STFD [AOFFSET] = f64, SIZE
  3567. FNMA f80 = f74, f64, f80
  3568. nop __LINE__
  3569. }
  3570. { .mfi
  3571. STFD [AOFFSET2] = f68, SIZE
  3572. FNMA f84 = f74, f68, f84
  3573. nop __LINE__
  3574. }
  3575. ;;
  3576. { .mfi
  3577. STFD [AOFFSET] = f65, SIZE
  3578. FMA_A f81 = f75, f64, f81
  3579. nop __LINE__
  3580. }
  3581. { .mfi
  3582. STFD [AOFFSET2] = f69, SIZE
  3583. FMA_A f85 = f75, f68, f85
  3584. nop __LINE__
  3585. }
  3586. ;;
  3587. { .mfi
  3588. STFD [AOFFSET] = f66, SIZE
  3589. FNMA f82 = f74, f66, f82
  3590. nop __LINE__
  3591. }
  3592. { .mfi
  3593. STFD [AOFFSET2] = f70, SIZE
  3594. FNMA f86 = f74, f70, f86
  3595. nop __LINE__
  3596. }
  3597. ;;
  3598. { .mfi
  3599. STFD [AOFFSET] = f67, 5 * SIZE
  3600. FMA_A f83 = f75, f66, f83
  3601. nop __LINE__
  3602. }
  3603. { .mfi
  3604. STFD [AOFFSET2] = f71, 5 * SIZE
  3605. FMA_A f87 = f75, f70, f87
  3606. nop __LINE__
  3607. }
  3608. ;;
  /* Same first-group values are also written through C1 (f64-f67) and
     C5 (f68-f71) while the second-group update is completed. */
  3609. { .mfi
  3610. STFD [C1 ] = f64, SIZE
  3611. FMA_B f80 = f75, f65, f80
  3612. nop __LINE__
  3613. }
  3614. { .mfi
  3615. STFD [C5 ] = f68, SIZE
  3616. FMA_B f84 = f75, f69, f84
  3617. nop __LINE__
  3618. }
  3619. ;;
  3620. { .mfi
  3621. STFD [C1 ] = f65, SIZE
  3622. FNMA f81 = f74, f65, f81
  3623. nop __LINE__
  3624. }
  3625. { .mfi
  3626. STFD [C5 ] = f69, SIZE
  3627. FNMA f85 = f74, f69, f85
  3628. nop __LINE__
  3629. }
  3630. ;;
  3631. { .mfi
  3632. STFD [C1 ] = f66, SIZE
  3633. FMA_B f82 = f75, f67, f82
  3634. nop __LINE__
  3635. }
  3636. { .mfi
  3637. STFD [C5 ] = f70, SIZE
  3638. FMA_B f86 = f75, f71, f86
  3639. nop __LINE__
  3640. }
  3641. ;;
  3642. { .mfi
  3643. STFD [C1 ] = f67, 5 * SIZE
  3644. FNMA f83 = f74, f67, f83
  3645. nop __LINE__
  3646. }
  3647. { .mfi
  3648. STFD [C5 ] = f71, 5 * SIZE
  3649. FNMA f87 = f74, f71, f87
  3650. nop __LINE__
  3651. }
  3652. ;;
  /* Fold the first group into the third group (f96-f103) using (f76,f77). */
  3653. { .mfi
  3654. nop __LINE__
  3655. FNMA f96 = f76, f64, f96
  3656. nop __LINE__
  3657. }
  3658. { .mfi
  3659. nop __LINE__
  3660. FNMA f100 = f76, f68, f100
  3661. nop __LINE__
  3662. }
  3663. ;;
  3664. { .mfi
  3665. nop __LINE__
  3666. FMA_A f97 = f77, f64, f97
  3667. nop __LINE__
  3668. }
  3669. { .mfi
  3670. nop __LINE__
  3671. FMA_A f101 = f77, f68, f101
  3672. nop __LINE__
  3673. }
  3674. ;;
  3675. { .mfi
  3676. nop __LINE__
  3677. FNMA f98 = f76, f66, f98
  3678. nop __LINE__
  3679. }
  3680. { .mfi
  3681. nop __LINE__
  3682. FNMA f102 = f76, f70, f102
  3683. nop __LINE__
  3684. }
  3685. ;;
  3686. { .mfi
  3687. nop __LINE__
  3688. FMA_A f99 = f77, f66, f99
  3689. nop __LINE__
  3690. }
  3691. { .mfi
  3692. nop __LINE__
  3693. FMA_A f103 = f77, f70, f103
  3694. nop __LINE__
  3695. }
  3696. ;;
  3697. { .mfi
  3698. nop __LINE__
  3699. FMA_B f96 = f77, f65, f96
  3700. nop __LINE__
  3701. }
  3702. { .mfi
  3703. nop __LINE__
  3704. FMA_B f100 = f77, f69, f100
  3705. nop __LINE__
  3706. }
  3707. ;;
  3708. { .mfi
  3709. nop __LINE__
  3710. FNMA f97 = f76, f65, f97
  3711. nop __LINE__
  3712. }
  3713. { .mfi
  3714. nop __LINE__
  3715. FNMA f101 = f76, f69, f101
  3716. nop __LINE__
  3717. }
  3718. ;;
  3719. { .mfi
  3720. nop __LINE__
  3721. FMA_B f98 = f77, f67, f98
  3722. nop __LINE__
  3723. }
  3724. { .mfi
  3725. nop __LINE__
  3726. FMA_B f102 = f77, f71, f102
  3727. nop __LINE__
  3728. }
  3729. ;;
  3730. { .mfi
  3731. nop __LINE__
  3732. FNMA f99 = f76, f67, f99
  3733. nop __LINE__
  3734. }
  3735. { .mfi
  3736. nop __LINE__
  3737. FNMA f103 = f76, f71, f103
  3738. nop __LINE__
  3739. }
  3740. ;;
  /* Fold the first group into the fourth group (f112-f119) using (f78,f79). */
  3741. { .mfi
  3742. nop __LINE__
  3743. FNMA f112 = f78, f64, f112
  3744. nop __LINE__
  3745. }
  3746. { .mfi
  3747. nop __LINE__
  3748. FNMA f116 = f78, f68, f116
  3749. nop __LINE__
  3750. }
  3751. ;;
  3752. { .mfi
  3753. nop __LINE__
  3754. FMA_A f113 = f79, f64, f113
  3755. nop __LINE__
  3756. }
  3757. { .mfi
  3758. nop __LINE__
  3759. FMA_A f117 = f79, f68, f117
  3760. nop __LINE__
  3761. }
  3762. ;;
  3763. { .mfi
  3764. nop __LINE__
  3765. FNMA f114 = f78, f66, f114
  3766. nop __LINE__
  3767. }
  3768. { .mfi
  3769. nop __LINE__
  3770. FNMA f118 = f78, f70, f118
  3771. nop __LINE__
  3772. }
  3773. ;;
  3774. { .mfi
  3775. nop __LINE__
  3776. FMA_A f115 = f79, f66, f115
  3777. nop __LINE__
  3778. }
  3779. { .mfi
  3780. nop __LINE__
  3781. FMA_A f119 = f79, f70, f119
  3782. nop __LINE__
  3783. }
  3784. ;;
  3785. { .mfi
  3786. nop __LINE__
  3787. FMA_B f112 = f79, f65, f112
  3788. nop __LINE__
  3789. }
  3790. { .mfi
  3791. nop __LINE__
  3792. FMA_B f116 = f79, f69, f116
  3793. nop __LINE__
  3794. }
  3795. ;;
  3796. { .mfi
  3797. nop __LINE__
  3798. FNMA f113 = f78, f65, f113
  3799. nop __LINE__
  3800. }
  3801. { .mfi
  3802. nop __LINE__
  3803. FNMA f117 = f78, f69, f117
  3804. nop __LINE__
  3805. }
  3806. ;;
  3807. { .mfi
  3808. nop __LINE__
  3809. FMA_B f114 = f79, f67, f114
  3810. nop __LINE__
  3811. }
  3812. { .mfi
  3813. nop __LINE__
  3814. FMA_B f118 = f79, f71, f118
  3815. nop __LINE__
  3816. }
  3817. ;;
  3818. { .mfi
  3819. nop __LINE__
  3820. FNMA f115 = f78, f67, f115
  3821. nop __LINE__
  3822. }
  3823. { .mfi
  3824. nop __LINE__
  3825. FNMA f119 = f78, f71, f119
  3826. nop __LINE__
  3827. }
  3828. ;;
  /*
   * Second group (f80-f87): combine with (f90,f91).  The eight FMPYs have
   * no stop bit between them (they only write the scratch registers
   * f32-f39), and likewise the eight FMA_C/FMA_D that follow.
   */
  3829. { .mfi
  3830. nop __LINE__
  3831. FMPY f32 = f90, f80
  3832. nop __LINE__
  3833. }
  3834. { .mfi
  3835. nop __LINE__
  3836. FMPY f36 = f90, f84
  3837. nop __LINE__
  3838. }
  3839. { .mfi
  3840. nop __LINE__
  3841. FMPY f33 = f91, f80
  3842. nop __LINE__
  3843. }
  3844. { .mfi
  3845. nop __LINE__
  3846. FMPY f37 = f91, f84
  3847. nop __LINE__
  3848. }
  3849. { .mfi
  3850. nop __LINE__
  3851. FMPY f34 = f90, f82
  3852. nop __LINE__
  3853. }
  3854. { .mfi
  3855. nop __LINE__
  3856. FMPY f38 = f90, f86
  3857. nop __LINE__
  3858. }
  3859. { .mfi
  3860. nop __LINE__
  3861. FMPY f35 = f91, f82
  3862. nop __LINE__
  3863. }
  3864. { .mfi
  3865. nop __LINE__
  3866. FMPY f39 = f91, f86
  3867. nop __LINE__
  3868. }
  3869. ;;
  3870. { .mfi
  3871. nop __LINE__
  3872. FMA_C f80 = f91, f81, f32
  3873. nop __LINE__
  3874. }
  3875. { .mfi
  3876. nop __LINE__
  3877. FMA_C f84 = f91, f85, f36
  3878. nop __LINE__
  3879. }
  3880. { .mfi
  3881. nop __LINE__
  3882. FMA_D f81 = f90, f81, f33
  3883. nop __LINE__
  3884. }
  3885. { .mfi
  3886. nop __LINE__
  3887. FMA_D f85 = f90, f85, f37
  3888. nop __LINE__
  3889. }
  3890. { .mfi
  3891. nop __LINE__
  3892. FMA_C f82 = f91, f83, f34
  3893. nop __LINE__
  3894. }
  3895. { .mfi
  3896. nop __LINE__
  3897. FMA_C f86 = f91, f87, f38
  3898. nop __LINE__
  3899. }
  3900. { .mfi
  3901. nop __LINE__
  3902. FMA_D f83 = f90, f83, f35
  3903. nop __LINE__
  3904. }
  3905. { .mfi
  3906. nop __LINE__
  3907. FMA_D f87 = f90, f87, f39
  3908. nop __LINE__
  3909. }
  3910. ;;
  /* Store the second group through AOFFSET/AOFFSET2 (another 8 * SIZE
     each) and fold it into the third group using (f92,f93). */
  3911. { .mfi
  3912. STFD [AOFFSET] = f80, SIZE
  3913. FNMA f96 = f92, f80, f96
  3914. nop __LINE__
  3915. }
  3916. { .mfi
  3917. STFD [AOFFSET2] = f84, SIZE
  3918. FNMA f100 = f92, f84, f100
  3919. nop __LINE__
  3920. }
  3921. ;;
  3922. { .mfi
  3923. STFD [AOFFSET] = f81, SIZE
  3924. FMA_A f97 = f93, f80, f97
  3925. nop __LINE__
  3926. }
  3927. { .mfi
  3928. STFD [AOFFSET2] = f85, SIZE
  3929. FMA_A f101 = f93, f84, f101
  3930. nop __LINE__
  3931. }
  3932. ;;
  3933. { .mfi
  3934. STFD [AOFFSET] = f82, SIZE
  3935. FNMA f98 = f92, f82, f98
  3936. nop __LINE__
  3937. }
  3938. { .mfi
  3939. STFD [AOFFSET2] = f86, SIZE
  3940. FNMA f102 = f92, f86, f102
  3941. nop __LINE__
  3942. }
  3943. ;;
  3944. { .mfi
  3945. STFD [AOFFSET] = f83, 5 * SIZE
  3946. FMA_A f99 = f93, f82, f99
  3947. nop __LINE__
  3948. }
  3949. { .mfi
  3950. STFD [AOFFSET2] = f87, 5 * SIZE
  3951. FMA_A f103 = f93, f86, f103
  3952. nop __LINE__
  3953. }
  3954. ;;
  /* Second group also goes out through C2 (f80-f83) and C6 (f84-f87). */
  3955. { .mfi
  3956. STFD [C2 ] = f80, SIZE
  3957. FMA_B f96 = f93, f81, f96
  3958. nop __LINE__
  3959. }
  3960. { .mfi
  3961. STFD [C6 ] = f84, SIZE
  3962. FMA_B f100 = f93, f85, f100
  3963. nop __LINE__
  3964. }
  3965. ;;
  3966. { .mfi
  3967. STFD [C2 ] = f81, SIZE
  3968. FNMA f97 = f92, f81, f97
  3969. nop __LINE__
  3970. }
  3971. { .mfi
  3972. STFD [C6 ] = f85, SIZE
  3973. FNMA f101 = f92, f85, f101
  3974. nop __LINE__
  3975. }
  3976. ;;
  3977. { .mfi
  3978. STFD [C2 ] = f82, SIZE
  3979. FMA_B f98 = f93, f83, f98
  3980. nop __LINE__
  3981. }
  3982. { .mfi
  3983. STFD [C6 ] = f86, SIZE
  3984. FMA_B f102 = f93, f87, f102
  3985. nop __LINE__
  3986. }
  3987. ;;
  3988. { .mfi
  3989. STFD [C2 ] = f83, 5 * SIZE
  3990. FNMA f99 = f92, f83, f99
  3991. nop __LINE__
  3992. }
  3993. { .mfi
  3994. STFD [C6 ] = f87, 5 * SIZE
  3995. FNMA f103 = f92, f87, f103
  3996. nop __LINE__
  3997. }
  3998. ;;
  /* Fold the second group into the fourth group using (f94,f95). */
  3999. { .mfi
  4000. nop __LINE__
  4001. FNMA f112 = f94, f80, f112
  4002. nop __LINE__
  4003. }
  4004. { .mfi
  4005. nop __LINE__
  4006. FNMA f116 = f94, f84, f116
  4007. nop __LINE__
  4008. }
  4009. ;;
  4010. { .mfi
  4011. nop __LINE__
  4012. FMA_A f113 = f95, f80, f113
  4013. nop __LINE__
  4014. }
  4015. { .mfi
  4016. nop __LINE__
  4017. FMA_A f117 = f95, f84, f117
  4018. nop __LINE__
  4019. }
  4020. ;;
  4021. { .mfi
  4022. nop __LINE__
  4023. FNMA f114 = f94, f82, f114
  4024. nop __LINE__
  4025. }
  4026. { .mfi
  4027. nop __LINE__
  4028. FNMA f118 = f94, f86, f118
  4029. nop __LINE__
  4030. }
  4031. ;;
  4032. { .mfi
  4033. nop __LINE__
  4034. FMA_A f115 = f95, f82, f115
  4035. nop __LINE__
  4036. }
  4037. { .mfi
  4038. nop __LINE__
  4039. FMA_A f119 = f95, f86, f119
  4040. nop __LINE__
  4041. }
  4042. ;;
  4043. { .mfi
  4044. nop __LINE__
  4045. FMA_B f112 = f95, f81, f112
  4046. nop __LINE__
  4047. }
  4048. { .mfi
  4049. nop __LINE__
  4050. FMA_B f116 = f95, f85, f116
  4051. nop __LINE__
  4052. }
  4053. ;;
  4054. { .mfi
  4055. nop __LINE__
  4056. FNMA f113 = f94, f81, f113
  4057. nop __LINE__
  4058. }
  4059. { .mfi
  4060. nop __LINE__
  4061. FNMA f117 = f94, f85, f117
  4062. nop __LINE__
  4063. }
  4064. ;;
  4065. { .mfi
  4066. nop __LINE__
  4067. FMA_B f114 = f95, f83, f114
  4068. nop __LINE__
  4069. }
  4070. { .mfi
  4071. nop __LINE__
  4072. FMA_B f118 = f95, f87, f118
  4073. nop __LINE__
  4074. }
  4075. ;;
  4076. { .mfi
  4077. nop __LINE__
  4078. FNMA f115 = f94, f83, f115
  4079. nop __LINE__
  4080. }
  4081. { .mfi
  4082. nop __LINE__
  4083. FNMA f119 = f94, f87, f119
  4084. nop __LINE__
  4085. }
  4086. ;;
  /* Third group (f96-f103): combine with (f108,f109). */
  4087. { .mfi
  4088. nop __LINE__
  4089. FMPY f32 = f108, f96
  4090. nop __LINE__
  4091. }
  4092. { .mfi
  4093. nop __LINE__
  4094. FMPY f36 = f108, f100
  4095. nop __LINE__
  4096. }
  4097. ;;
  4098. { .mfi
  4099. nop __LINE__
  4100. FMPY f33 = f109, f96
  4101. nop __LINE__
  4102. }
  4103. { .mfi
  4104. nop __LINE__
  4105. FMPY f37 = f109, f100
  4106. nop __LINE__
  4107. }
  4108. ;;
  4109. { .mfi
  4110. nop __LINE__
  4111. FMPY f34 = f108, f98
  4112. nop __LINE__
  4113. }
  4114. { .mfi
  4115. nop __LINE__
  4116. FMPY f38 = f108, f102
  4117. nop __LINE__
  4118. }
  4119. ;;
  4120. { .mfi
  4121. nop __LINE__
  4122. FMPY f35 = f109, f98
  4123. nop __LINE__
  4124. }
  4125. { .mfi
  4126. nop __LINE__
  4127. FMPY f39 = f109, f102
  4128. nop __LINE__
  4129. }
  4130. ;;
  4131. { .mfi
  4132. nop __LINE__
  4133. FMA_C f96 = f109, f97, f32
  4134. nop __LINE__
  4135. }
  4136. { .mfi
  4137. nop __LINE__
  4138. FMA_C f100 = f109, f101, f36
  4139. nop __LINE__
  4140. }
  4141. ;;
  4142. { .mfi
  4143. nop __LINE__
  4144. FMA_D f97 = f108, f97, f33
  4145. nop __LINE__
  4146. }
  4147. { .mfi
  4148. nop __LINE__
  4149. FMA_D f101 = f108, f101, f37
  4150. nop __LINE__
  4151. }
  4152. ;;
  4153. { .mfi
  4154. nop __LINE__
  4155. FMA_C f98 = f109, f99, f34
  4156. nop __LINE__
  4157. }
  4158. { .mfi
  4159. nop __LINE__
  4160. FMA_C f102 = f109, f103, f38
  4161. nop __LINE__
  4162. }
  4163. ;;
  4164. { .mfi
  4165. nop __LINE__
  4166. FMA_D f99 = f108, f99, f35
  4167. nop __LINE__
  4168. }
  4169. { .mfi
  4170. nop __LINE__
  4171. FMA_D f103 = f108, f103, f39
  4172. nop __LINE__
  4173. }
  4174. ;;
  /* Store the third group through AOFFSET/AOFFSET2 (another 8 * SIZE
     each) and fold it into the fourth group using (f110,f111). */
  4175. { .mfi
  4176. STFD [AOFFSET] = f96, SIZE
  4177. FNMA f112 = f110, f96, f112
  4178. nop __LINE__
  4179. }
  4180. { .mfi
  4181. STFD [AOFFSET2] = f100, SIZE
  4182. FNMA f116 = f110, f100, f116
  4183. nop __LINE__
  4184. }
  4185. ;;
  4186. { .mfi
  4187. STFD [AOFFSET] = f97, SIZE
  4188. FMA_A f113 = f111, f96, f113
  4189. nop __LINE__
  4190. }
  4191. { .mfi
  4192. STFD [AOFFSET2] = f101, SIZE
  4193. FMA_A f117 = f111, f100, f117
  4194. nop __LINE__
  4195. }
  4196. ;;
  4197. { .mfi
  4198. STFD [AOFFSET] = f98, SIZE
  4199. FNMA f114 = f110, f98, f114
  4200. nop __LINE__
  4201. }
  4202. { .mfi
  4203. STFD [AOFFSET2] = f102, SIZE
  4204. FNMA f118 = f110, f102, f118
  4205. nop __LINE__
  4206. }
  4207. ;;
  4208. { .mfi
  4209. STFD [AOFFSET] = f99, 5 * SIZE
  4210. FMA_A f115 = f111, f98, f115
  4211. nop __LINE__
  4212. }
  4213. { .mfi
  4214. STFD [AOFFSET2] = f103, 5 * SIZE
  4215. FMA_A f119 = f111, f102, f119
  4216. nop __LINE__
  4217. }
  4218. ;;
  /* Third group also goes out through C3 (f96-f99) and C7 (f100-f103). */
  4219. { .mfi
  4220. STFD [C3 ] = f96, SIZE
  4221. FMA_B f112 = f111, f97, f112
  4222. nop __LINE__
  4223. }
  4224. { .mfi
  4225. STFD [C7 ] = f100, SIZE
  4226. FMA_B f116 = f111, f101, f116
  4227. nop __LINE__
  4228. }
  4229. ;;
  4230. { .mfi
  4231. STFD [C3 ] = f97, SIZE
  4232. FNMA f113 = f110, f97, f113
  4233. nop __LINE__
  4234. }
  4235. { .mfi
  4236. STFD [C7 ] = f101, SIZE
  4237. FNMA f117 = f110, f101, f117
  4238. nop __LINE__
  4239. }
  4240. ;;
  4241. { .mfi
  4242. STFD [C3 ] = f98, SIZE
  4243. FMA_B f114 = f111, f99, f114
  4244. nop __LINE__
  4245. }
  4246. { .mfi
  4247. STFD [C7 ] = f102, SIZE
  4248. FMA_B f118 = f111, f103, f118
  4249. nop __LINE__
  4250. }
  4251. ;;
  4252. { .mfi
  4253. STFD [C3 ] = f99, 5 * SIZE
  4254. FNMA f115 = f110, f99, f115
  4255. nop __LINE__
  4256. }
  4257. { .mfi
  4258. STFD [C7 ] = f103, 5 * SIZE
  4259. FNMA f119 = f110, f103, f119
  4260. nop __LINE__
  4261. }
  4262. ;;
  /* Fourth group (f112-f119): combine with (f126,f127). */
  4263. { .mfi
  4264. nop __LINE__
  4265. FMPY f32 = f126, f112
  4266. nop __LINE__
  4267. }
  4268. { .mfi
  4269. nop __LINE__
  4270. FMPY f36 = f126, f116
  4271. nop __LINE__
  4272. }
  4273. ;;
  4274. { .mfi
  4275. nop __LINE__
  4276. FMPY f33 = f127, f112
  4277. nop __LINE__
  4278. }
  4279. { .mfi
  4280. nop __LINE__
  4281. FMPY f37 = f127, f116
  4282. nop __LINE__
  4283. }
  4284. ;;
  4285. { .mfi
  4286. nop __LINE__
  4287. FMPY f34 = f126, f114
  4288. nop __LINE__
  4289. }
  4290. { .mfi
  4291. nop __LINE__
  4292. FMPY f38 = f126, f118
  4293. nop __LINE__
  4294. }
  4295. ;;
  4296. { .mfi
  4297. nop __LINE__
  4298. FMPY f35 = f127, f114
  4299. nop __LINE__
  4300. }
  4301. { .mfi
  4302. nop __LINE__
  4303. FMPY f39 = f127, f118
  4304. nop __LINE__
  4305. }
  4306. ;;
  4307. { .mfi
  4308. nop __LINE__
  4309. FMA_C f112 = f127, f113, f32
  4310. nop __LINE__
  4311. }
  4312. { .mfi
  4313. nop __LINE__
  4314. FMA_C f116 = f127, f117, f36
  4315. nop __LINE__
  4316. }
  4317. ;;
  4318. { .mfi
  4319. nop __LINE__
  4320. FMA_D f113 = f126, f113, f33
  4321. nop __LINE__
  4322. }
  4323. { .mfi
  4324. nop __LINE__
  4325. FMA_D f117 = f126, f117, f37
  4326. nop __LINE__
  4327. }
  4328. ;;
  4329. { .mfi
  4330. nop __LINE__
  4331. FMA_C f114 = f127, f115, f34
  4332. nop __LINE__
  4333. }
  4334. { .mfi
  4335. nop __LINE__
  4336. FMA_C f118 = f127, f119, f38
  4337. nop __LINE__
  4338. }
  4339. ;;
  4340. { .mfi
  4341. nop __LINE__
  4342. FMA_D f115 = f126, f115, f35
  4343. nop __LINE__
  4344. }
  4345. { .mfi
  4346. nop __LINE__
  4347. FMA_D f119 = f126, f119, f39
  4348. nop __LINE__
  4349. }
  4350. ;;
  /*
   * Store the fourth group through AOFFSET/AOFFSET2.  The -27 * SIZE step
   * on the last AOFFSET store brings AOFFSET back to its value at the top
   * of this section (8 + 8 + 8 + 3 - 27 = 0).  The integer slots compute
   * r2 = (K - KK) << ZBASE_SHIFT and set L = KK.
   */
  4351. { .mmi
  4352. STFD [AOFFSET] = f112, SIZE
  4353. STFD [AOFFSET2] = f116, SIZE
  4354. sub r2 = K, KK
  4355. }
  4356. ;;
  4357. { .mmi
  4358. STFD [AOFFSET] = f113, SIZE
  4359. STFD [AOFFSET2] = f117, SIZE
  4360. mov L = KK
  4361. }
  4362. ;;
  4363. { .mmi
  4364. STFD [AOFFSET] = f114, SIZE
  4365. STFD [AOFFSET2] = f118, SIZE
  4366. shladd r2 = r2, ZBASE_SHIFT, r0
  4367. }
  4368. ;;
  4369. { .mmi
  4370. STFD [AOFFSET] = f115, -27 * SIZE
  4371. STFD [AOFFSET2] = f119
  4372. nop __LINE__
  4373. }
  4374. ;;
  /*
   * Write the fourth group through C4 (f112-f115) and C8 (f116-f119), and
   * prepare the next iteration:
   *   - BOFFSET and AOFFSET are each advanced by r2 << 2 bytes;
   *   - f64, f65, f80, f81, f96, f97, f112, f113 are cleared from f0
   *     (f0 always reads as +0.0 on IA-64);
   *   - p6 = (I != 1) is taken before I is decremented, so the branch back
   *     to .L011 is taken while iterations remain.
   */
  4375. { .mfi
  4376. STFD [C4 ] = f112, SIZE
  4377. mov f64 = f0
  4378. shladd BOFFSET = r2, 2, BOFFSET
  4379. }
  4380. { .mfi
  4381. STFD [C8 ] = f116, SIZE
  4382. mov f65 = f0
  4383. shladd AOFFSET = r2, 2, AOFFSET
  4384. }
  4385. ;;
  4386. { .mfi
  4387. STFD [C4 ] = f113, SIZE
  4388. mov f80 = f0
  4389. cmp.ne p6, p0 = 1, I
  4390. }
  4391. { .mfi
  4392. STFD [C8 ] = f117, SIZE
  4393. mov f81 = f0
  4394. nop __LINE__
  4395. }
  4396. ;;
  4397. { .mfi
  4398. STFD [C4 ] = f114, SIZE
  4399. mov f96 = f0
  4400. adds I = -1, I
  4401. }
  4402. { .mfi
  4403. STFD [C8 ] = f118, SIZE
  4404. mov f97 = f0
  4405. nop __LINE__
  4406. }
  4407. ;;
  4408. { .mfi
  4409. STFD [C4 ] = f115, 5 * SIZE
  4410. mov f112 = f0
  4411. nop __LINE__
  4412. }
  4413. { .mfb
  4414. STFD [C8 ] = f119, 5 * SIZE
  4415. mov f113 = f0
  4416. (p6) br.cond.dptk .L011
  4417. }
  4418. #endif
  4419. #ifdef RT
  4420. { .mfi
  4421. LDFPD f76, f77 = [BOFFSET]
  4422. FMPY f32 = f72, f112
  4423. adds BOFFSET = - 2 * SIZE, BOFFSET
  4424. }
  4425. { .mfi
  4426. nop __LINE__
  4427. FMPY f36 = f72, f116
  4428. nop __LINE__
  4429. }
  4430. ;;
  4431. { .mfi
  4432. LDFPD f78, f79 = [BOFFSET]
  4433. FMPY f33 = f73, f112
  4434. adds BOFFSET = - 4 * SIZE, BOFFSET
  4435. }
  4436. { .mfi
  4437. nop __LINE__
  4438. FMPY f37 = f73, f116
  4439. nop __LINE__
  4440. }
  4441. ;;
  4442. { .mfi
  4443. LDFPD f88, f89 = [BOFFSET]
  4444. FMPY f34 = f72, f114
  4445. adds BOFFSET = - 2 * SIZE, BOFFSET
  4446. }
  4447. { .mfi
  4448. nop __LINE__
  4449. FMPY f38 = f72, f118
  4450. nop __LINE__
  4451. }
  4452. ;;
  4453. { .mfi
  4454. LDFPD f90, f91 = [BOFFSET]
  4455. FMPY f35 = f73, f114
  4456. adds BOFFSET = - 2 * SIZE, BOFFSET
  4457. }
  4458. { .mfi
  4459. nop __LINE__
  4460. FMPY f39 = f73, f118
  4461. nop __LINE__
  4462. }
  4463. ;;
  4464. { .mfi
  4465. LDFPD f92, f93 = [BOFFSET]
  4466. FMA_C f112 = f73, f113, f32
  4467. adds BOFFSET = - 6 * SIZE, BOFFSET
  4468. }
  4469. { .mfi
  4470. nop __LINE__
  4471. FMA_C f116 = f73, f117, f36
  4472. nop __LINE__
  4473. }
  4474. ;;
  4475. { .mfi
  4476. LDFPD f104, f105 = [BOFFSET]
  4477. FMA_D f113 = f72, f113, f33
  4478. adds BOFFSET = - 2 * SIZE, BOFFSET
  4479. }
  4480. { .mfi
  4481. nop __LINE__
  4482. FMA_D f117 = f72, f117, f37
  4483. nop __LINE__
  4484. }
  4485. ;;
  4486. { .mfi
  4487. LDFPD f106, f107 = [BOFFSET]
  4488. FMA_C f114 = f73, f115, f34
  4489. adds BOFFSET = - 8 * SIZE, BOFFSET
  4490. }
  4491. { .mfi
  4492. nop __LINE__
  4493. FMA_C f118 = f73, f119, f38
  4494. nop __LINE__
  4495. }
  4496. ;;
  4497. { .mfi
  4498. LDFPD f120, f121 = [BOFFSET]
  4499. FMA_D f115 = f72, f115, f35
  4500. adds AOFFSET2 = 28 * SIZE, AOFFSET
  4501. }
  4502. { .mfi
  4503. nop __LINE__
  4504. FMA_D f119 = f72, f119, f39
  4505. adds AOFFSET = 24 * SIZE, AOFFSET
  4506. }
  4507. ;;
  4508. { .mfi
  4509. STFD [AOFFSET] = f112, SIZE
  4510. FNMA f96 = f74, f112, f96
  4511. nop __LINE__
  4512. }
  4513. { .mfi
  4514. STFD [AOFFSET2] = f116, SIZE
  4515. FNMA f100 = f74, f116, f100
  4516. nop __LINE__
  4517. }
  4518. ;;
  4519. { .mfi
  4520. STFD [AOFFSET] = f113, SIZE
  4521. FMA_A f97 = f75, f112, f97
  4522. nop __LINE__
  4523. }
  4524. { .mfi
  4525. STFD [AOFFSET2] = f117, SIZE
  4526. FMA_A f101 = f75, f116, f101
  4527. nop __LINE__
  4528. }
  4529. ;;
  4530. { .mfi
  4531. STFD [AOFFSET] = f114, SIZE
  4532. FNMA f98 = f74, f114, f98
  4533. nop __LINE__
  4534. }
  4535. { .mfi
  4536. STFD [AOFFSET2] = f118, SIZE
  4537. FNMA f102 = f74, f118, f102
  4538. nop __LINE__
  4539. }
  4540. ;;
  4541. { .mfi
  4542. STFD [AOFFSET] = f115, -11 * SIZE
  4543. FMA_A f99 = f75, f114, f99
  4544. nop __LINE__
  4545. }
  4546. { .mfi
  4547. STFD [AOFFSET2] = f119, -11 * SIZE
  4548. FMA_A f103 = f75, f118, f103
  4549. nop __LINE__
  4550. }
  4551. ;;
  4552. { .mfi
  4553. STFD [C4 ] = f112, SIZE
  4554. FMA_B f96 = f75, f113, f96
  4555. nop __LINE__
  4556. }
  4557. { .mfi
  4558. STFD [C8 ] = f116, SIZE
  4559. FMA_B f100 = f75, f117, f100
  4560. nop __LINE__
  4561. }
  4562. ;;
  4563. { .mfi
  4564. STFD [C4 ] = f113, SIZE
  4565. FNMA f97 = f74, f113, f97
  4566. nop __LINE__
  4567. }
  4568. { .mfi
  4569. STFD [C8 ] = f117, SIZE
  4570. FNMA f101 = f74, f117, f101
  4571. nop __LINE__
  4572. }
  4573. ;;
  4574. { .mfi
  4575. STFD [C4 ] = f114, SIZE
  4576. FMA_B f98 = f75, f115, f98
  4577. nop __LINE__
  4578. }
  4579. { .mfi
  4580. STFD [C8 ] = f118, SIZE
  4581. FMA_B f102 = f75, f119, f102
  4582. nop __LINE__
  4583. }
  4584. ;;
  4585. { .mfi
  4586. STFD [C4 ] = f115, 5 * SIZE
  4587. FNMA f99 = f74, f115, f99
  4588. nop __LINE__
  4589. }
  4590. { .mfi
  4591. STFD [C8 ] = f119, 5 * SIZE
  4592. FNMA f103 = f74, f119, f103
  4593. nop __LINE__
  4594. }
  4595. ;;
  4596. { .mfi
  4597. nop __LINE__
  4598. FNMA f80 = f76, f112, f80
  4599. nop __LINE__
  4600. }
  4601. { .mfi
  4602. nop __LINE__
  4603. FNMA f84 = f76, f116, f84
  4604. nop __LINE__
  4605. }
  4606. ;;
  4607. { .mfi
  4608. nop __LINE__
  4609. FMA_A f81 = f77, f112, f81
  4610. nop __LINE__
  4611. }
  4612. { .mfi
  4613. nop __LINE__
  4614. FMA_A f85 = f77, f116, f85
  4615. nop __LINE__
  4616. }
  4617. ;;
  4618. { .mfi
  4619. nop __LINE__
  4620. FNMA f82 = f76, f114, f82
  4621. nop __LINE__
  4622. }
  4623. { .mfi
  4624. nop __LINE__
  4625. FNMA f86 = f76, f118, f86
  4626. nop __LINE__
  4627. }
  4628. ;;
  4629. { .mfi
  4630. nop __LINE__
  4631. FMA_A f83 = f77, f114, f83
  4632. nop __LINE__
  4633. }
  4634. { .mfi
  4635. nop __LINE__
  4636. FMA_A f87 = f77, f118, f87
  4637. nop __LINE__
  4638. }
  4639. ;;
  4640. { .mfi
  4641. nop __LINE__
  4642. FMA_B f80 = f77, f113, f80
  4643. nop __LINE__
  4644. }
  4645. { .mfi
  4646. nop __LINE__
  4647. FMA_B f84 = f77, f117, f84
  4648. nop __LINE__
  4649. }
  4650. ;;
  4651. { .mfi
  4652. nop __LINE__
  4653. FNMA f81 = f76, f113, f81
  4654. nop __LINE__
  4655. }
  4656. { .mfi
  4657. nop __LINE__
  4658. FNMA f85 = f76, f117, f85
  4659. nop __LINE__
  4660. }
  4661. ;;
  4662. { .mfi
  4663. nop __LINE__
  4664. FMA_B f82 = f77, f115, f82
  4665. nop __LINE__
  4666. }
  4667. { .mfi
  4668. nop __LINE__
  4669. FMA_B f86 = f77, f119, f86
  4670. nop __LINE__
  4671. }
  4672. ;;
  4673. { .mfi
  4674. nop __LINE__
  4675. FNMA f83 = f76, f115, f83
  4676. nop __LINE__
  4677. }
  4678. { .mfi
  4679. nop __LINE__
  4680. FNMA f87 = f76, f119, f87
  4681. nop __LINE__
  4682. }
  4683. ;;
  4684. { .mfi
  4685. nop __LINE__
  4686. FNMA f64 = f78, f112, f64
  4687. nop __LINE__
  4688. }
  4689. { .mfi
  4690. nop __LINE__
  4691. FNMA f68 = f78, f116, f68
  4692. nop __LINE__
  4693. }
  4694. ;;
  4695. { .mfi
  4696. nop __LINE__
  4697. FMA_A f65 = f79, f112, f65
  4698. nop __LINE__
  4699. }
  4700. { .mfi
  4701. nop __LINE__
  4702. FMA_A f69 = f79, f116, f69
  4703. nop __LINE__
  4704. }
  4705. ;;
  4706. { .mfi
  4707. nop __LINE__
  4708. FNMA f66 = f78, f114, f66
  4709. nop __LINE__
  4710. }
  4711. { .mfi
  4712. nop __LINE__
  4713. FNMA f70 = f78, f118, f70
  4714. nop __LINE__
  4715. }
  4716. ;;
  4717. { .mfi
  4718. nop __LINE__
  4719. FMA_A f67 = f79, f114, f67
  4720. nop __LINE__
  4721. }
  4722. { .mfi
  4723. nop __LINE__
  4724. FMA_A f71 = f79, f118, f71
  4725. nop __LINE__
  4726. }
  4727. ;;
  4728. { .mfi
  4729. nop __LINE__
  4730. FMA_B f64 = f79, f113, f64
  4731. nop __LINE__
  4732. }
  4733. { .mfi
  4734. nop __LINE__
  4735. FMA_B f68 = f79, f117, f68
  4736. nop __LINE__
  4737. }
  4738. ;;
  4739. { .mfi
  4740. nop __LINE__
  4741. FNMA f65 = f78, f113, f65
  4742. nop __LINE__
  4743. }
  4744. { .mfi
  4745. nop __LINE__
  4746. FNMA f69 = f78, f117, f69
  4747. nop __LINE__
  4748. }
  4749. ;;
  4750. { .mfi
  4751. nop __LINE__
  4752. FMA_B f66 = f79, f115, f66
  4753. nop __LINE__
  4754. }
  4755. { .mfi
  4756. nop __LINE__
  4757. FMA_B f70 = f79, f119, f70
  4758. nop __LINE__
  4759. }
  4760. ;;
  4761. { .mfi
  4762. nop __LINE__
  4763. FNMA f67 = f78, f115, f67
  4764. nop __LINE__
  4765. }
  4766. { .mfi
  4767. nop __LINE__
  4768. FNMA f71 = f78, f119, f71
  4769. nop __LINE__
  4770. }
  4771. ;;
  4772. { .mfi
  4773. nop __LINE__
  4774. FMPY f32 = f88, f96
  4775. nop __LINE__
  4776. }
  4777. { .mfi
  4778. nop __LINE__
  4779. FMPY f36 = f88, f100
  4780. nop __LINE__
  4781. }
  4782. ;;
  4783. { .mfi
  4784. nop __LINE__
  4785. FMPY f33 = f89, f96
  4786. nop __LINE__
  4787. }
  4788. { .mfi
  4789. nop __LINE__
  4790. FMPY f37 = f89, f100
  4791. nop __LINE__
  4792. }
  4793. ;;
  4794. { .mfi
  4795. nop __LINE__
  4796. FMPY f34 = f88, f98
  4797. nop __LINE__
  4798. }
  4799. { .mfi
  4800. nop __LINE__
  4801. FMPY f38 = f88, f102
  4802. nop __LINE__
  4803. }
  4804. ;;
  4805. { .mfi
  4806. nop __LINE__
  4807. FMPY f35 = f89, f98
  4808. nop __LINE__
  4809. }
  4810. { .mfi
  4811. nop __LINE__
  4812. FMPY f39 = f89, f102
  4813. nop __LINE__
  4814. }
  4815. ;;
  4816. { .mfi
  4817. nop __LINE__
  4818. FMA_C f96 = f89, f97, f32
  4819. nop __LINE__
  4820. }
  4821. { .mfi
  4822. nop __LINE__
  4823. FMA_C f100 = f89, f101, f36
  4824. nop __LINE__
  4825. }
  4826. ;;
  4827. { .mfi
  4828. nop __LINE__
  4829. FMA_D f97 = f88, f97, f33
  4830. nop __LINE__
  4831. }
  4832. { .mfi
  4833. nop __LINE__
  4834. FMA_D f101 = f88, f101, f37
  4835. nop __LINE__
  4836. }
  4837. ;;
  4838. { .mfi
  4839. nop __LINE__
  4840. FMA_C f98 = f89, f99, f34
  4841. nop __LINE__
  4842. }
  4843. { .mfi
  4844. nop __LINE__
  4845. FMA_C f102 = f89, f103, f38
  4846. nop __LINE__
  4847. }
  4848. ;;
  4849. { .mfi
  4850. nop __LINE__
  4851. FMA_D f99 = f88, f99, f35
  4852. nop __LINE__
  4853. }
  4854. { .mfi
  4855. nop __LINE__
  4856. FMA_D f103 = f88, f103, f39
  4857. nop __LINE__
  4858. }
  4859. ;;
  4860. { .mfi
  4861. STFD [AOFFSET] = f96, SIZE
  4862. FNMA f80 = f90, f96, f80
  4863. nop __LINE__
  4864. }
  4865. { .mfi
  4866. STFD [AOFFSET2] = f100, SIZE
  4867. FNMA f84 = f90, f100, f84
  4868. nop __LINE__
  4869. }
  4870. ;;
  4871. { .mfi
  4872. STFD [AOFFSET] = f97, SIZE
  4873. FMA_A f81 = f91, f96, f81
  4874. nop __LINE__
  4875. }
  4876. { .mfi
  4877. STFD [AOFFSET2] = f101, SIZE
  4878. FMA_A f85 = f91, f100, f85
  4879. nop __LINE__
  4880. }
  4881. ;;
  4882. { .mfi
  4883. STFD [AOFFSET] = f98, SIZE
  4884. FNMA f82 = f90, f98, f82
  4885. nop __LINE__
  4886. }
  4887. { .mfi
  4888. STFD [AOFFSET2] = f102, SIZE
  4889. FNMA f86 = f90, f102, f86
  4890. nop __LINE__
  4891. }
  4892. ;;
  4893. { .mfi
  4894. STFD [AOFFSET] = f99, -11 * SIZE
  4895. FMA_A f83 = f91, f98, f83
  4896. nop __LINE__
  4897. }
  4898. { .mfi
  4899. STFD [AOFFSET2] = f103, -11 * SIZE
  4900. FMA_A f87 = f91, f102, f87
  4901. nop __LINE__
  4902. }
  4903. ;;
  4904. { .mfi
  4905. STFD [C3 ] = f96, SIZE
  4906. FMA_B f80 = f91, f97, f80
  4907. nop __LINE__
  4908. }
  4909. { .mfi
  4910. STFD [C7 ] = f100, SIZE
  4911. FMA_B f84 = f91, f101, f84
  4912. nop __LINE__
  4913. }
  4914. ;;
  4915. { .mfi
  4916. STFD [C3 ] = f97, SIZE
  4917. FNMA f81 = f90, f97, f81
  4918. nop __LINE__
  4919. }
  4920. { .mfi
  4921. STFD [C7 ] = f101, SIZE
  4922. FNMA f85 = f90, f101, f85
  4923. nop __LINE__
  4924. }
  4925. ;;
  4926. { .mfi
  4927. STFD [C3 ] = f98, SIZE
  4928. FMA_B f82 = f91, f99, f82
  4929. nop __LINE__
  4930. }
  4931. { .mfi
  4932. STFD [C7 ] = f102, SIZE
  4933. FMA_B f86 = f91, f103, f86
  4934. nop __LINE__
  4935. }
  4936. ;;
  4937. { .mfi
  4938. STFD [C3 ] = f99, 5 * SIZE
  4939. FNMA f83 = f90, f99, f83
  4940. nop __LINE__
  4941. }
  4942. { .mfi
  4943. STFD [C7 ] = f103, 5 * SIZE
  4944. FNMA f87 = f90, f103, f87
  4945. nop __LINE__
  4946. }
  4947. ;;
  4948. { .mfi
  4949. nop __LINE__
  4950. FNMA f64 = f92, f96, f64
  4951. nop __LINE__
  4952. }
  4953. { .mfi
  4954. nop __LINE__
  4955. FNMA f68 = f92, f100, f68
  4956. nop __LINE__
  4957. }
  4958. ;;
  4959. { .mfi
  4960. nop __LINE__
  4961. FMA_A f65 = f93, f96, f65
  4962. nop __LINE__
  4963. }
  4964. { .mfi
  4965. nop __LINE__
  4966. FMA_A f69 = f93, f100, f69
  4967. nop __LINE__
  4968. }
  4969. ;;
  4970. { .mfi
  4971. nop __LINE__
  4972. FNMA f66 = f92, f98, f66
  4973. nop __LINE__
  4974. }
  4975. { .mfi
  4976. nop __LINE__
  4977. FNMA f70 = f92, f102, f70
  4978. nop __LINE__
  4979. }
  4980. ;;
  4981. { .mfi
  4982. nop __LINE__
  4983. FMA_A f67 = f93, f98, f67
  4984. nop __LINE__
  4985. }
  4986. { .mfi
  4987. nop __LINE__
  4988. FMA_A f71 = f93, f102, f71
  4989. nop __LINE__
  4990. }
  4991. ;;
  4992. { .mfi
  4993. nop __LINE__
  4994. FMA_B f64 = f93, f97, f64
  4995. nop __LINE__
  4996. }
  4997. { .mfi
  4998. nop __LINE__
  4999. FMA_B f68 = f93, f101, f68
  5000. nop __LINE__
  5001. }
  5002. ;;
  5003. { .mfi
  5004. nop __LINE__
  5005. FNMA f65 = f92, f97, f65
  5006. nop __LINE__
  5007. }
  5008. { .mfi
  5009. nop __LINE__
  5010. FNMA f69 = f92, f101, f69
  5011. nop __LINE__
  5012. }
  5013. ;;
  5014. { .mfi
  5015. nop __LINE__
  5016. FMA_B f66 = f93, f99, f66
  5017. nop __LINE__
  5018. }
  5019. { .mfi
  5020. nop __LINE__
  5021. FMA_B f70 = f93, f103, f70
  5022. nop __LINE__
  5023. }
  5024. ;;
  5025. { .mfi
  5026. nop __LINE__
  5027. FNMA f67 = f92, f99, f67
  5028. nop __LINE__
  5029. }
  5030. { .mfi
  5031. nop __LINE__
  5032. FNMA f71 = f92, f103, f71
  5033. nop __LINE__
  5034. }
  5035. ;;
  5036. { .mfi
  5037. nop __LINE__
  5038. FMPY f32 = f104, f80
  5039. nop __LINE__
  5040. }
  5041. { .mfi
  5042. nop __LINE__
  5043. FMPY f36 = f104, f84
  5044. nop __LINE__
  5045. }
  5046. ;;
  5047. { .mfi
  5048. nop __LINE__
  5049. FMPY f33 = f105, f80
  5050. nop __LINE__
  5051. }
  5052. { .mfi
  5053. nop __LINE__
  5054. FMPY f37 = f105, f84
  5055. nop __LINE__
  5056. }
  5057. ;;
  5058. { .mfi
  5059. nop __LINE__
  5060. FMPY f34 = f104, f82
  5061. nop __LINE__
  5062. }
  5063. { .mfi
  5064. nop __LINE__
  5065. FMPY f38 = f104, f86
  5066. nop __LINE__
  5067. }
  5068. ;;
  5069. { .mfi
  5070. nop __LINE__
  5071. FMPY f35 = f105, f82
  5072. nop __LINE__
  5073. }
  5074. { .mfi
  5075. nop __LINE__
  5076. FMPY f39 = f105, f86
  5077. nop __LINE__
  5078. }
  5079. ;;
  5080. { .mfi
  5081. nop __LINE__
  5082. FMA_C f80 = f105, f81, f32
  5083. nop __LINE__
  5084. }
  5085. { .mfi
  5086. nop __LINE__
  5087. FMA_C f84 = f105, f85, f36
  5088. nop __LINE__
  5089. }
  5090. ;;
  5091. { .mfi
  5092. nop __LINE__
  5093. FMA_D f81 = f104, f81, f33
  5094. nop __LINE__
  5095. }
  5096. { .mfi
  5097. nop __LINE__
  5098. FMA_D f85 = f104, f85, f37
  5099. nop __LINE__
  5100. }
  5101. ;;
  5102. { .mfi
  5103. nop __LINE__
  5104. FMA_C f82 = f105, f83, f34
  5105. nop __LINE__
  5106. }
  5107. { .mfi
  5108. nop __LINE__
  5109. FMA_C f86 = f105, f87, f38
  5110. nop __LINE__
  5111. }
  5112. ;;
  5113. { .mfi
  5114. nop __LINE__
  5115. FMA_D f83 = f104, f83, f35
  5116. nop __LINE__
  5117. }
  5118. { .mfi
  5119. nop __LINE__
  5120. FMA_D f87 = f104, f87, f39
  5121. nop __LINE__
  5122. }
  5123. ;;
  5124. { .mfi
  5125. STFD [AOFFSET] = f80, SIZE
  5126. FNMA f64 = f106, f80, f64
  5127. nop __LINE__
  5128. }
  5129. { .mfi
  5130. STFD [AOFFSET2] = f84, SIZE
  5131. FNMA f68 = f106, f84, f68
  5132. nop __LINE__
  5133. }
  5134. ;;
  5135. { .mfi
  5136. STFD [AOFFSET] = f81, SIZE
  5137. FMA_A f65 = f107, f80, f65
  5138. nop __LINE__
  5139. }
  5140. { .mfi
  5141. STFD [AOFFSET2] = f85, SIZE
  5142. FMA_A f69 = f107, f84, f69
  5143. nop __LINE__
  5144. }
  5145. ;;
  5146. { .mfi
  5147. STFD [AOFFSET] = f82, SIZE
  5148. FNMA f66 = f106, f82, f66
  5149. nop __LINE__
  5150. }
  5151. { .mfi
  5152. STFD [AOFFSET2] = f86, SIZE
  5153. FNMA f70 = f106, f86, f70
  5154. nop __LINE__
  5155. }
  5156. ;;
  5157. { .mfi
  5158. STFD [AOFFSET] = f83, -11 * SIZE
  5159. FMA_A f67 = f107, f82, f67
  5160. nop __LINE__
  5161. }
  5162. { .mfi
  5163. STFD [AOFFSET2] = f87, -11 * SIZE
  5164. FMA_A f71 = f107, f86, f71
  5165. nop __LINE__
  5166. }
  5167. ;;
  5168. { .mfi
  5169. STFD [C2 ] = f80, SIZE
  5170. FMA_B f64 = f107, f81, f64
  5171. nop __LINE__
  5172. }
  5173. { .mfi
  5174. STFD [C6 ] = f84, SIZE
  5175. FMA_B f68 = f107, f85, f68
  5176. nop __LINE__
  5177. }
  5178. ;;
  5179. { .mfi
  5180. STFD [C2 ] = f81, SIZE
  5181. FNMA f65 = f106, f81, f65
  5182. nop __LINE__
  5183. }
  5184. { .mfi
  5185. STFD [C6 ] = f85, SIZE
  5186. FNMA f69 = f106, f85, f69
  5187. nop __LINE__
  5188. }
  5189. ;;
  5190. { .mfi
  5191. STFD [C2 ] = f82, SIZE
  5192. FMA_B f66 = f107, f83, f66
  5193. nop __LINE__
  5194. }
  5195. { .mfi
  5196. STFD [C6 ] = f86, SIZE
  5197. FMA_B f70 = f107, f87, f70
  5198. nop __LINE__
  5199. }
  5200. ;;
  5201. { .mfi
  5202. STFD [C2 ] = f83, 5 * SIZE
  5203. FNMA f67 = f106, f83, f67
  5204. nop __LINE__
  5205. }
  5206. { .mfi
  5207. STFD [C6 ] = f87, 5 * SIZE
  5208. FNMA f71 = f106, f87, f71
  5209. nop __LINE__
  5210. }
  5211. ;;
  5212. { .mfi
  5213. nop __LINE__
  5214. FMPY f32 = f120, f64
  5215. nop __LINE__
  5216. }
  5217. { .mfi
  5218. nop __LINE__
  5219. FMPY f36 = f120, f68
  5220. nop __LINE__
  5221. }
  5222. ;;
  5223. { .mfi
  5224. nop __LINE__
  5225. FMPY f33 = f121, f64
  5226. nop __LINE__
  5227. }
  5228. { .mfi
  5229. nop __LINE__
  5230. FMPY f37 = f121, f68
  5231. nop __LINE__
  5232. }
  5233. ;;
  5234. { .mfi
  5235. nop __LINE__
  5236. FMPY f34 = f120, f66
  5237. nop __LINE__
  5238. }
  5239. { .mfi
  5240. nop __LINE__
  5241. FMPY f38 = f120, f70
  5242. nop __LINE__
  5243. }
  5244. ;;
  5245. { .mfi
  5246. nop __LINE__
  5247. FMPY f35 = f121, f66
  5248. nop __LINE__
  5249. }
  5250. { .mfi
  5251. nop __LINE__
  5252. FMPY f39 = f121, f70
  5253. nop __LINE__
  5254. }
  5255. ;;
  5256. { .mfi
  5257. nop __LINE__
  5258. FMA_C f64 = f121, f65, f32
  5259. nop __LINE__
  5260. }
  5261. { .mfi
  5262. nop __LINE__
  5263. FMA_C f68 = f121, f69, f36
  5264. nop __LINE__
  5265. }
  5266. ;;
  5267. { .mfi
  5268. nop __LINE__
  5269. FMA_D f65 = f120, f65, f33
  5270. nop __LINE__
  5271. }
  5272. { .mfi
  5273. nop __LINE__
  5274. FMA_D f69 = f120, f69, f37
  5275. nop __LINE__
  5276. }
  5277. ;;
  5278. { .mfi
  5279. nop __LINE__
  5280. FMA_C f66 = f121, f67, f34
  5281. nop __LINE__
  5282. }
  5283. { .mfi
  5284. nop __LINE__
  5285. FMA_C f70 = f121, f71, f38
  5286. nop __LINE__
  5287. }
  5288. ;;
  5289. { .mfi
  5290. nop __LINE__
  5291. FMA_D f67 = f120, f67, f35
  5292. nop __LINE__
  5293. }
  5294. { .mfi
  5295. nop __LINE__
  5296. FMA_D f71 = f120, f71, f39
  5297. nop __LINE__
  5298. }
  5299. ;;
  5300. { .mmi
  5301. STFD [AOFFSET] = f64, SIZE
  5302. STFD [AOFFSET2] = f68, SIZE
  5303. shladd r2 = K, ZBASE_SHIFT, r0
  5304. }
  5305. ;;
  5306. { .mmi
  5307. STFD [AOFFSET] = f65, SIZE
  5308. STFD [AOFFSET2] = f69, SIZE
  5309. shladd AORIG = r2, 2, AORIG
  5310. }
  5311. ;;
  5312. { .mmi
  5313. STFD [AOFFSET] = f66, SIZE
  5314. STFD [AOFFSET2] = f70, SIZE
  5315. nop __LINE__
  5316. }
  5317. ;;
  5318. { .mmi
  5319. STFD [AOFFSET] = f67, -3 * SIZE
  5320. STFD [AOFFSET2] = f71
  5321. nop __LINE__
  5322. }
  5323. ;;
  5324. { .mfi
  5325. STFD [C1 ] = f64, SIZE
  5326. mov f64 = f0
  5327. cmp.ne p6, p0 = 1, I
  5328. }
  5329. { .mfi
  5330. STFD [C5 ] = f68, SIZE
  5331. mov f81 = f0
  5332. nop __LINE__
  5333. }
  5334. ;;
  5335. { .mfi
  5336. STFD [C1 ] = f65, SIZE
  5337. mov f65 = f0
  5338. nop __LINE__
  5339. }
  5340. { .mfi
  5341. STFD [C5 ] = f69, SIZE
  5342. mov f96 = f0
  5343. nop __LINE__
  5344. }
  5345. ;;
  5346. { .mfi
  5347. STFD [C1 ] = f66, SIZE
  5348. mov f80 = f0
  5349. sub L = K, KK
  5350. }
  5351. { .mfi
  5352. STFD [C5 ] = f70, SIZE
  5353. mov f97 = f0
  5354. nop __LINE__
  5355. }
  5356. ;;
  5357. { .mfi
  5358. STFD [C1 ] = f67, 5 * SIZE
  5359. mov f112 = f0
  5360. adds I = -1, I
  5361. }
  5362. { .mfb
  5363. STFD [C5 ] = f71, 5 * SIZE
  5364. mov f113 = f0
  5365. (p6) br.cond.dptk .L011
  5366. }
  5367. ;;
  5368. #endif
  // .L020: setup for the tile processed when bit 1 of M is set.
  // Chooses the inner-loop trip count L, positions AOFFSET/BOFFSET, preloads
  // the first operands, clears eight accumulators and programs ar.lc.
  // Only f66/f67, f82/f83, f98/f99 and f114/f115 are cleared here; the other
  // eight accumulators used by .L022 are presumably already zero on entry --
  // NOTE(review): confirm against the code that falls through or branches here.
  5369. .L020:
  5370. { .mib
  // L = KK for LT/RN builds, K - KK otherwise.
  5371. #if defined(LT) || defined(RN)
  5372. mov L = KK
  5373. #else
  5374. sub L = K, KK
  5375. #endif
  // p6 is set when bit 1 of M is clear: nothing to do here, go to .L030.
  5376. tbit.z p6, p7 = M, 1
  5377. (p6) br.cond.dptk .L030
  5378. }
  5379. ;;
  5380. { .mmi
  // p7 is redefined here as (L != 0); it guards every preload below.
  5381. cmp.ne p7, p0 = r0, L
  5382. adds BOFFSET = 0 * SIZE, B
  // r2 = K << (1 + ZBASE_SHIFT); only consumed by the LN path below.
  5383. shl r2 = K, 1 + ZBASE_SHIFT
  5384. }
  5385. { .mmi
  // r3 = KK << ZBASE_SHIFT; only consumed by the non-LT/RN path below.
  5386. shladd r3 = KK, ZBASE_SHIFT, r0
  5387. nop __LINE__
  5388. nop __LINE__
  5389. }
  5390. ;;
  5391. #if defined(LT) || defined(RN)
  // LT/RN: BOFFSET already equals B; AOFFSET is not modified in this path.
  5392. { .mfb
  5393. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5394. mov f66 = f0
  5395. nop __LINE__
  5396. }
  5397. { .mmf
  5398. nop __LINE__
  5399. nop __LINE__
  5400. mov f67 = f0
  5401. }
  5402. ;;
  5403. #else
  // Other variants: BOFFSET = B + (r3 << 2); for LN, AORIG is first moved
  // back by r2; then AOFFSET = AORIG + (r3 << 1).
  5404. { .mfi
  5405. shladd BOFFSET = r3, 2, B
  5406. mov f66 = f0
  5407. #ifdef LN
  5408. sub AORIG = AORIG, r2
  5409. #else
  5410. nop __LINE__
  5411. #endif
  5412. }
  5413. ;;
  5414. { .mfi
  5415. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5416. mov f67 = f0
  5417. shladd AOFFSET = r3, 1, AORIG
  5418. }
  5419. ;;
  5420. #endif
  5421. ;;
  // Convert the step count into a count for the 2x-unrolled loop:
  //   p12   = bit 0 of (L + 1) is clear, i.e. the original L was odd
  //   ar.lc = ((L + 1) >> 1) - 1
  5422. adds L = 1, L
  5423. ;;
  5424. { .mfi
  5425. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5426. mov f82 = f0
  5427. tbit.z p12, p0 = L, 0
  5428. }
  5429. { .mfi
  5430. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5431. mov f83 = f0
  5432. shr L = L, 1
  5433. }
  5434. ;;
  5435. { .mfi
  5436. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5437. mov f98 = f0
  5438. adds L = -1, L
  5439. }
  5440. { .mfi
  5441. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  5442. mov f99 = f0
  // p3 starts out true (r0 == r0); .L022 may clear it on its last pass.
  5443. cmp.eq p3, p0 = r0, r0
  5444. }
  5445. ;;
  5446. { .mfi
  5447. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  5448. mov f114 = f0
  5449. mov ar.lc = L
  5450. }
  5451. { .mfi
  // Prefetch pointer for A starts PREFETCHSIZE elements ahead of AOFFSET.
  5452. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  5453. mov f115 = f0
  5454. nop __LINE__
  5455. }
  5456. ;;
  // L == -1 means the original step count was zero: skip the loop entirely.
  5457. cmp.eq p6, p0 = -1, L
  5458. (p6) br.cond.dpnt .L028
  5459. ;;
  // .L022: multiply-accumulate loop, unrolled by two, counted by ar.lc.
  //   first half  : operands f32-f35 (A1..A4) and f48-f55 (B1..B8), always run
  //   second half : operands f40-f43 and f56-f63, run only under p3
  // Accumulators: f64/f65, f80/f81, f96/f97, f112/f113 take the A1/A2 products;
  //               f66/f67, f82/f83, f98/f99, f114/f115 take the A3/A4 products.
  // FMA_A / FMA_B are macros defined outside this view; presumably they pick
  // the sign needed for the complex (conjugated or not) product -- TODO confirm.
  5460. .align 16
  5461. .L022:
  5462. { .mfi
  5463. lfetch.nt1 [PREA], 8 * SIZE
  5464. FMA f64 = f32, f48, f64 // A1 * B1
  // PREB is recomputed from the current BOFFSET on every iteration.
  5465. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  5466. }
  5467. { .mfi
  5468. nop __LINE__
  5469. FMA_B f65 = f32, f49, f65 // A1 * B2
  // With p12 set, p3 becomes false once L has reached 0, which disables the
  // second half (and its loads) on that pass.
  5470. (p12) cmp.ne p3, p0 = 0, L
  5471. }
  5472. ;;
  5473. { .mfi
  5474. lfetch.nt1 [PREB], 16 * SIZE
  5475. FMA f80 = f32, f50, f80 // A1 * B3
  // p4 = (L != 0): guards the reload of the first-half operands further down.
  5476. cmp.ne p4, p5 = 0, L
  5477. }
  5478. { .mfb
  5479. nop __LINE__
  5480. FMA_B f81 = f32, f51, f81 // A1 * B4
  5481. nop __LINE__
  5482. }
  5483. ;;
  // Loads for the second half (f40-f43, f56-f63) are interleaved with the
  // first-half arithmetic from here on.
  5484. { .mfb
  5485. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  5486. FMA f96 = f32, f52, f96 // A1 * B5
  5487. nop __LINE__
  5488. }
  5489. { .mfb
  5490. nop __LINE__
  5491. FMA_B f97 = f32, f53, f97 // A1 * B6
  5492. nop __LINE__
  5493. }
  5494. ;;
  5495. { .mfb
  5496. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5497. FMA f112 = f32, f54, f112 // A1 * B7
  5498. nop __LINE__
  5499. }
  5500. { .mfb
  5501. nop __LINE__
  5502. FMA_B f113 = f32, f55, f113 // A1 * B8
  5503. nop __LINE__
  5504. }
  5505. ;;
  // A2 (f33) against B1..B8.
  5506. { .mfb
  5507. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5508. FMA f65 = f33, f48, f65 // A2 * B1
  5509. nop __LINE__
  5510. }
  5511. { .mfb
  5512. nop __LINE__
  5513. FMA_A f64 = f33, f49, f64 // A2 * B2
  5514. nop __LINE__
  5515. }
  5516. ;;
  5517. { .mfb
  5518. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  5519. FMA f81 = f33, f50, f81 // A2 * B3
  5520. nop __LINE__
  5521. }
  5522. { .mfb
  5523. nop __LINE__
  5524. FMA_A f80 = f33, f51, f80 // A2 * B4
  5525. nop __LINE__
  5526. }
  5527. ;;
  5528. { .mfb
  5529. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  5530. FMA f97 = f33, f52, f97 // A2 * B5
  5531. nop __LINE__
  5532. }
  5533. { .mfb
  5534. nop __LINE__
  5535. FMA_A f96 = f33, f53, f96 // A2 * B6
  5536. nop __LINE__
  5537. }
  5538. ;;
  5539. { .mfb
  5540. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  5541. FMA f113 = f33, f54, f113 // A2 * B7
  5542. nop __LINE__
  5543. }
  5544. { .mfb
  5545. nop __LINE__
  5546. FMA_A f112 = f33, f55, f112 // A2 * B8
  5547. nop __LINE__
  5548. }
  5549. ;;
  // A3 (f34) against B1..B8.
  5550. { .mfb
  5551. nop __LINE__
  5552. FMA f66 = f34, f48, f66 // A3 * B1
  5553. nop __LINE__
  5554. }
  5555. { .mfb
  5556. nop __LINE__
  5557. FMA_B f67 = f34, f49, f67 // A3 * B2
  5558. nop __LINE__
  5559. }
  5560. ;;
  5561. { .mfb
  5562. nop __LINE__
  5563. FMA f82 = f34, f50, f82 // A3 * B3
  5564. nop __LINE__
  5565. }
  5566. { .mfb
  5567. nop __LINE__
  5568. FMA_B f83 = f34, f51, f83 // A3 * B4
  5569. nop __LINE__
  5570. }
  5571. ;;
  5572. { .mfb
  5573. nop __LINE__
  5574. FMA f98 = f34, f52, f98 // A3 * B5
  5575. nop __LINE__
  5576. }
  5577. { .mfb
  5578. nop __LINE__
  5579. FMA_B f99 = f34, f53, f99 // A3 * B6
  5580. nop __LINE__
  5581. }
  5582. ;;
  5583. { .mfb
  5584. nop __LINE__
  5585. FMA f114 = f34, f54, f114 // A3 * B7
  5586. nop __LINE__
  5587. }
  5588. { .mfb
  5589. nop __LINE__
  5590. FMA_B f115 = f34, f55, f115 // A3 * B8
  5591. nop __LINE__
  5592. }
  5593. ;;
  // A4 (f35) against B1..B8.
  5594. { .mfb
  5595. nop __LINE__
  5596. FMA f67 = f35, f48, f67 // A4 * B1
  5597. nop __LINE__
  5598. }
  5599. { .mfb
  5600. nop __LINE__
  5601. FMA_A f66 = f35, f49, f66 // A4 * B2
  5602. nop __LINE__
  5603. }
  5604. ;;
  5605. { .mfb
  5606. nop __LINE__
  5607. FMA f83 = f35, f50, f83 // A4 * B3
  5608. nop __LINE__
  5609. }
  5610. { .mfb
  5611. nop __LINE__
  5612. FMA_A f82 = f35, f51, f82 // A4 * B4
  5613. nop __LINE__
  5614. }
  5615. ;;
  // From here the first-half operands are reloaded (under p4) for the next
  // iteration; each register is reloaded only after its last use above.
  5616. { .mfb
  5617. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  5618. FMA f99 = f35, f52, f99 // A4 * B5
  5619. nop __LINE__
  5620. }
  5621. { .mfb
  5622. nop __LINE__
  5623. FMA_A f98 = f35, f53, f98 // A4 * B6
  5624. nop __LINE__
  5625. }
  5626. ;;
  5627. { .mfb
  5628. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5629. FMA f115 = f35, f54, f115 // A4 * B7
  5630. nop __LINE__
  5631. }
  5632. { .mfb
  5633. nop __LINE__
  5634. FMA_A f114 = f35, f55, f114 // A4 * B8
  5635. nop __LINE__
  5636. }
  5637. ;;
  // Second half: same product pattern with f40-f43 and f56-f63, all under p3.
  5638. { .mfb
  5639. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5640. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  5641. nop __LINE__
  5642. }
  5643. { .mfb
  5644. nop __LINE__
  5645. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  5646. nop __LINE__
  5647. }
  5648. ;;
  5649. { .mfb
  5650. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  5651. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  5652. nop __LINE__
  5653. }
  5654. { .mfb
  5655. nop __LINE__
  5656. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  5657. nop __LINE__
  5658. }
  5659. ;;
  5660. { .mfb
  5661. nop __LINE__
  5662. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  5663. nop __LINE__
  5664. }
  5665. { .mfb
  5666. nop __LINE__
  5667. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  5668. nop __LINE__
  5669. }
  5670. ;;
  5671. { .mfb
  5672. nop __LINE__
  5673. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  5674. nop __LINE__
  5675. }
  5676. { .mfb
  5677. nop __LINE__
  5678. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  5679. nop __LINE__
  5680. }
  5681. ;;
  // NOTE(review): the second and fourth bundles below list only two
  // instructions for a three-slot .mfb template, and there is no stop between
  // the A2*B1/B2 pair and the A2*B3/B4 pair, unlike everywhere else in this
  // loop. No register written in this group is also read in it, so no hazard
  // is visible here; confirm the assembler pads the missing slot as intended.
  5682. { .mfb
  5683. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  5684. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  5685. nop __LINE__
  5686. }
  5687. { .mfb
  5688. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  5689. nop __LINE__
  5690. }
  5691. { .mfb
  5692. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  5693. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  5694. nop __LINE__
  5695. }
  5696. { .mfb
  5697. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  5698. nop __LINE__
  5699. }
  5700. ;;
  5701. { .mfb
  5702. nop __LINE__
  5703. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  5704. nop __LINE__
  5705. }
  5706. { .mfb
  5707. nop __LINE__
  5708. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  5709. nop __LINE__
  5710. }
  5711. ;;
  5712. { .mfb
  5713. nop __LINE__
  5714. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  5715. nop __LINE__
  5716. }
  5717. { .mfb
  5718. nop __LINE__
  5719. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  5720. nop __LINE__
  5721. }
  5722. ;;
  5723. { .mfb
  5724. nop __LINE__
  5725. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  5726. nop __LINE__
  5727. }
  5728. { .mfb
  5729. nop __LINE__
  5730. (p3) FMA_B f67 = f42, f57, f67 // A3 * B2
  5731. nop __LINE__
  5732. }
  5733. ;;
  5734. { .mfb
  5735. nop __LINE__
  5736. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  5737. nop __LINE__
  5738. }
  5739. { .mfb
  5740. nop __LINE__
  5741. (p3) FMA_B f83 = f42, f59, f83 // A3 * B4
  5742. nop __LINE__
  5743. }
  5744. ;;
  5745. { .mfb
  5746. nop __LINE__
  5747. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  5748. nop __LINE__
  5749. }
  5750. { .mfb
  5751. nop __LINE__
  5752. (p3) FMA_B f99 = f42, f61, f99 // A3 * B6
  5753. nop __LINE__
  5754. }
  5755. ;;
  5756. { .mfb
  5757. nop __LINE__
  5758. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  5759. nop __LINE__
  5760. }
  5761. { .mfb
  5762. nop __LINE__
  5763. (p3) FMA_B f115 = f42, f63, f115 // A3 * B8
  5764. nop __LINE__
  5765. }
  5766. ;;
  5767. { .mfb
  5768. nop __LINE__
  5769. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  5770. nop __LINE__
  5771. }
  5772. { .mfb
  5773. nop __LINE__
  5774. (p3) FMA_A f66 = f43, f57, f66 // A4 * B2
  5775. nop __LINE__
  5776. }
  5777. ;;
  5778. { .mfb
  5779. nop __LINE__
  5780. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  5781. nop __LINE__
  5782. }
  5783. { .mfb
  5784. nop __LINE__
  5785. (p3) FMA_A f82 = f43, f59, f82 // A4 * B4
  5786. nop __LINE__
  5787. }
  5788. ;;
  5789. { .mfb
  5790. nop __LINE__
  5791. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  5792. nop __LINE__
  5793. }
  5794. { .mfb
  5795. nop __LINE__
  5796. (p3) FMA_A f98 = f43, f61, f98 // A4 * B6
  5797. nop __LINE__
  5798. }
  5799. ;;
  // L is decremented alongside ar.lc so the p3/p4 tests at the loop top see
  // the remaining count; br.cloop repeats while ar.lc is non-zero.
  5800. { .mfi
  5801. nop __LINE__
  5802. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  5803. adds L = -1, L
  5804. }
  5805. { .mfb
  5806. nop __LINE__
  5807. (p3) FMA_A f114 = f43, f63, f114 // A4 * B8
  5808. br.cloop.sptk.few .L022
  5809. }
  5810. ;;
  // .L028: reached after the .L022 loop, or directly when its trip count was
  // zero. For LN/RT builds, reposition both operand pointers before the solve:
  //   r2      = (KK - 2) for LN, (KK - 4) otherwise, then r2 <<= ZBASE_SHIFT
  //   AOFFSET = AORIG + (r2 << 1)
  //   BOFFSET = B     + (r2 << 2)
  // Other build variants leave AOFFSET/BOFFSET untouched here.
  5811. .L028:
  5812. #if defined(LN) || defined(RT)
  5813. #ifdef LN
  5814. adds r2 = -2, KK
  5815. #else
  5816. adds r2 = -4, KK
  5817. #endif
  5818. ;;
  5819. shladd r2 = r2, ZBASE_SHIFT, r0
  5820. ;;
  5821. shladd AOFFSET = r2, 1, AORIG
  5822. shladd BOFFSET = r2, 2, B
  5823. ;;
  5824. #endif
  // Load the sixteen stored values for this tile and replace each accumulator
  // with (stored value - accumulator).
  //   LN/LT : values come from BOFFSET. They pair with the accumulators in the
  //           order f64/65, f80/81, f96/97, f112/113, f66/67, f82/83, f98/99,
  //           f114/115; the second register of each pair uses FSUB_A.
  //   else  : values come from AOFFSET. They pair in the order f64..f67,
  //           f80..f83, f96..f99, f112..f115; every subtraction uses FSUB.
  // In both paths seven post-incrementing loads (2 * SIZE each) are followed by
  // one plain load and a -14 * SIZE adjustment, so the pointer ends where it
  // started. FSUB / FSUB_A are macros defined outside this view.
  5825. #if defined(LN) || defined(LT)
  5826. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  5827. ;;
  5828. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  5829. ;;
  5830. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  5831. ;;
  5832. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  5833. ;;
  5834. LDFPD f104, f105 = [BOFFSET], 2 * SIZE
  5835. ;;
  5836. LDFPD f106, f107 = [BOFFSET], 2 * SIZE
  5837. ;;
  5838. { .mfi
  5839. LDFPD f120, f121 = [BOFFSET], 2 * SIZE
  5840. FSUB f64 = f72, f64
  5841. nop __LINE__
  5842. }
  5843. { .mfi
  5844. nop __LINE__
  5845. FSUB_A f65 = f73, f65
  5846. nop __LINE__
  5847. }
  5848. ;;
  5849. { .mfi
  // Last load has no post-increment; the adds in the same bundle rewinds
  // BOFFSET by the 14 * SIZE accumulated by the seven loads above.
  5850. LDFPD f122, f123 = [BOFFSET]
  5851. FSUB f80 = f74, f80
  5852. adds BOFFSET = -14 * SIZE, BOFFSET
  5853. }
  5854. { .mfi
  5855. nop __LINE__
  5856. FSUB_A f81 = f75, f81
  5857. nop __LINE__
  5858. }
  5859. ;;
  5860. { .mfi
  5861. nop __LINE__
  5862. FSUB f96 = f88, f96
  5863. nop __LINE__
  5864. }
  5865. { .mfi
  5866. nop __LINE__
  5867. FSUB_A f97 = f89, f97
  5868. nop __LINE__
  5869. }
  5870. ;;
  5871. { .mfi
  5872. nop __LINE__
  5873. FSUB f112 = f90, f112
  5874. nop __LINE__
  5875. }
  5876. { .mfi
  5877. nop __LINE__
  5878. FSUB_A f113 = f91, f113
  5879. nop __LINE__
  5880. }
  5881. ;;
  5882. { .mfi
  5883. nop __LINE__
  5884. FSUB f66 = f104, f66
  5885. nop __LINE__
  5886. }
  5887. { .mfi
  5888. nop __LINE__
  5889. FSUB_A f67 = f105, f67
  5890. nop __LINE__
  5891. }
  5892. ;;
  5893. { .mfi
  5894. nop __LINE__
  5895. FSUB f82 = f106, f82
  5896. nop __LINE__
  5897. }
  5898. { .mfi
  5899. nop __LINE__
  5900. FSUB_A f83 = f107, f83
  5901. nop __LINE__
  5902. }
  5903. ;;
  5904. { .mfi
  5905. nop __LINE__
  5906. FSUB f98 = f120, f98
  5907. nop __LINE__
  5908. }
  5909. { .mfi
  5910. nop __LINE__
  5911. FSUB_A f99 = f121, f99
  5912. nop __LINE__
  5913. }
  5914. ;;
  5915. { .mfi
  5916. nop __LINE__
  5917. FSUB f114 = f122, f114
  5918. nop __LINE__
  5919. }
  5920. { .mfi
  5921. nop __LINE__
  5922. FSUB_A f115 = f123, f115
  5923. nop __LINE__
  5924. }
  5925. ;;
  5926. #else
  // Non-LN/LT variants: same load pattern, but reading through AOFFSET.
  5927. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  5928. ;;
  5929. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  5930. ;;
  5931. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  5932. ;;
  5933. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  5934. ;;
  5935. LDFPD f104, f105 = [AOFFSET], 2 * SIZE
  5936. ;;
  5937. LDFPD f106, f107 = [AOFFSET], 2 * SIZE
  5938. ;;
  5939. { .mfi
  5940. LDFPD f120, f121 = [AOFFSET], 2 * SIZE
  5941. FSUB f64 = f72, f64
  5942. nop __LINE__
  5943. }
  5944. { .mfi
  5945. nop __LINE__
  5946. FSUB f65 = f73, f65
  5947. nop __LINE__
  5948. }
  5949. ;;
  5950. { .mfi
  // Last load has no post-increment; AOFFSET is rewound by 14 * SIZE.
  5951. LDFPD f122, f123 = [AOFFSET]
  5952. FSUB f66 = f74, f66
  5953. adds AOFFSET = -14 * SIZE, AOFFSET
  5954. }
  5955. { .mfi
  5956. nop __LINE__
  5957. FSUB f67 = f75, f67
  5958. nop __LINE__
  5959. }
  5960. ;;
  5961. { .mfi
  5962. nop __LINE__
  5963. FSUB f80 = f88, f80
  5964. nop __LINE__
  5965. }
  5966. { .mfi
  5967. nop __LINE__
  5968. FSUB f81 = f89, f81
  5969. nop __LINE__
  5970. }
  5971. ;;
  5972. { .mfi
  5973. nop __LINE__
  5974. FSUB f82 = f90, f82
  5975. nop __LINE__
  5976. }
  5977. { .mfi
  5978. nop __LINE__
  5979. FSUB f83 = f91, f83
  5980. nop __LINE__
  5981. }
  5982. ;;
  5983. { .mfi
  5984. nop __LINE__
  5985. FSUB f96 = f104, f96
  5986. nop __LINE__
  5987. }
  5988. { .mfi
  5989. nop __LINE__
  5990. FSUB f97 = f105, f97
  5991. nop __LINE__
  5992. }
  5993. ;;
  5994. { .mfi
  5995. nop __LINE__
  5996. FSUB f98 = f106, f98
  5997. nop __LINE__
  5998. }
  5999. { .mfi
  6000. nop __LINE__
  6001. FSUB f99 = f107, f99
  6002. nop __LINE__
  6003. }
  6004. ;;
  6005. { .mfi
  6006. nop __LINE__
  6007. FSUB f112 = f120, f112
  6008. nop __LINE__
  6009. }
  6010. { .mfi
  6011. nop __LINE__
  6012. FSUB f113 = f121, f113
  6013. nop __LINE__
  6014. }
  6015. ;;
  6016. { .mfi
  6017. nop __LINE__
  6018. FSUB f114 = f122, f114
  6019. nop __LINE__
  6020. }
  6021. { .mfi
  6022. nop __LINE__
  6023. FSUB f115 = f123, f115
  6024. nop __LINE__
  6025. }
  6026. ;;
  6027. #endif
  // LN solve for this tile: two-step back-substitution over four column pairs.
  // Coefficients are read from the A tile at AOFFSET:
  //   (f104,f105) from offset 6 * SIZE, (f106,f107) from offset 4 * SIZE,
  //   (f120,f121) from offset 0; AOFFSET ends back at its entry value.
  // Only multiplies appear here, so the two scaling coefficients are
  // presumably stored already inverted -- TODO confirm against the packing code.
  // FMA_A..FMA_D and FNMA are macros defined outside this view.
  6028. #ifdef LN
  6029. adds AOFFSET = 6 * SIZE, AOFFSET
  6030. ;;
  6031. LDFPD f104, f105 = [AOFFSET]
  6032. adds AOFFSET = - 2 * SIZE, AOFFSET
  6033. ;;
  6034. LDFPD f106, f107 = [AOFFSET]
  6035. adds AOFFSET = - 4 * SIZE, AOFFSET
  6036. ;;
  6037. LDFPD f120, f121 = [AOFFSET]
  6038. ;;
  // Step 1: complex-scale the pairs (f66,f67), (f82,f83), (f98,f99),
  // (f114,f115) by (f104,f105). f32-f39 hold the partial products.
  6039. FMPY f32 = f104, f66
  6040. FMPY f33 = f105, f66
  6041. FMPY f34 = f104, f82
  6042. FMPY f35 = f105, f82
  6043. FMPY f36 = f104, f98
  6044. FMPY f37 = f105, f98
  6045. FMPY f38 = f104, f114
  6046. FMPY f39 = f105, f114
  6047. ;;
  6048. FMA_C f66 = f105, f67, f32
  6049. FMA_D f67 = f104, f67, f33
  6050. FMA_C f82 = f105, f83, f34
  6051. FMA_D f83 = f104, f83, f35
  6052. FMA_C f98 = f105, f99, f36
  6053. FMA_D f99 = f104, f99, f37
  6054. FMA_C f114 = f105, f115, f38
  6055. FMA_D f115 = f104, f115, f39
  6056. ;;
  // Step 2: update the pairs (f64,f65), (f80,f81), (f96,f97), (f112,f113)
  // with the products of (f106,f107) and the values just computed.
  6057. FNMA f64 = f106, f66, f64
  6058. FMA_A f65 = f107, f66, f65
  6059. FNMA f80 = f106, f82, f80
  6060. FMA_A f81 = f107, f82, f81
  6061. FNMA f96 = f106, f98, f96
  6062. FMA_A f97 = f107, f98, f97
  6063. FNMA f112 = f106, f114, f112
  6064. FMA_A f113 = f107, f114, f113
  6065. ;;
  6066. FMA_B f64 = f107, f67, f64
  6067. FNMA f65 = f106, f67, f65
  6068. FMA_B f80 = f107, f83, f80
  6069. FNMA f81 = f106, f83, f81
  6070. FMA_B f96 = f107, f99, f96
  6071. FNMA f97 = f106, f99, f97
  6072. FMA_B f112 = f107, f115, f112
  6073. FNMA f113 = f106, f115, f113
  6074. ;;
  // Step 3: complex-scale the updated pairs by (f120,f121).
  6075. FMPY f32 = f120, f64
  6076. FMPY f33 = f121, f64
  6077. FMPY f34 = f120, f80
  6078. FMPY f35 = f121, f80
  6079. FMPY f36 = f120, f96
  6080. FMPY f37 = f121, f96
  6081. FMPY f38 = f120, f112
  6082. FMPY f39 = f121, f112
  6083. ;;
  6084. FMA_C f64 = f121, f65, f32
  6085. FMA_D f65 = f120, f65, f33
  6086. FMA_C f80 = f121, f81, f34
  6087. FMA_D f81 = f120, f81, f35
  6088. FMA_C f96 = f121, f97, f36
  6089. FMA_D f97 = f120, f97, f37
  6090. FMA_C f112 = f121, f113, f38
  6091. FMA_D f113 = f120, f113, f39
  6092. ;;
  6093. #endif
  6094. #ifdef LT
  6095. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  6096. ;;
  6097. LDFPD f74, f75 = [AOFFSET]
  6098. adds AOFFSET = 4 * SIZE, AOFFSET
  6099. ;;
  6100. LDFPD f90, f91 = [AOFFSET]
  6101. adds AOFFSET = - 6 * SIZE, AOFFSET
  6102. ;;
  6103. FMPY f32 = f72, f64
  6104. FMPY f33 = f73, f64
  6105. FMPY f34 = f72, f80
  6106. FMPY f35 = f73, f80
  6107. FMPY f36 = f72, f96
  6108. FMPY f37 = f73, f96
  6109. FMPY f38 = f72, f112
  6110. FMPY f39 = f73, f112
  6111. ;;
  6112. FMA_C f64 = f73, f65, f32
  6113. FMA_D f65 = f72, f65, f33
  6114. FMA_C f80 = f73, f81, f34
  6115. FMA_D f81 = f72, f81, f35
  6116. FMA_C f96 = f73, f97, f36
  6117. FMA_D f97 = f72, f97, f37
  6118. FMA_C f112 = f73, f113, f38
  6119. FMA_D f113 = f72, f113, f39
  6120. ;;
  6121. FNMA f66 = f74, f64, f66
  6122. FMA_A f67 = f75, f64, f67
  6123. FNMA f82 = f74, f80, f82
  6124. FMA_A f83 = f75, f80, f83
  6125. FNMA f98 = f74, f96, f98
  6126. FMA_A f99 = f75, f96, f99
  6127. FNMA f114 = f74, f112, f114
  6128. FMA_A f115 = f75, f112, f115
  6129. ;;
  6130. FMA_B f66 = f75, f65, f66
  6131. FNMA f67 = f74, f65, f67
  6132. FMA_B f82 = f75, f81, f82
  6133. FNMA f83 = f74, f81, f83
  6134. FMA_B f98 = f75, f97, f98
  6135. FNMA f99 = f74, f97, f99
  6136. FMA_B f114 = f75, f113, f114
  6137. FNMA f115 = f74, f113, f115
  6138. ;;
  6139. FMPY f32 = f90, f66
  6140. FMPY f33 = f91, f66
  6141. FMPY f34 = f90, f82
  6142. FMPY f35 = f91, f82
  6143. FMPY f36 = f90, f98
  6144. FMPY f37 = f91, f98
  6145. FMPY f38 = f90, f114
  6146. FMPY f39 = f91, f114
  6147. ;;
  6148. FMA_C f66 = f91, f67, f32
  6149. FMA_D f67 = f90, f67, f33
  6150. FMA_C f82 = f91, f83, f34
  6151. FMA_D f83 = f90, f83, f35
  6152. FMA_C f98 = f91, f99, f36
  6153. FMA_D f99 = f90, f99, f37
  6154. FMA_C f114 = f91, f115, f38
  6155. FMA_D f115 = f90, f115, f39
  6156. ;;
  6157. #endif
  6158. #ifdef RN
  6159. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  6160. ;;
  6161. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  6162. ;;
  6163. LDFPD f76, f77 = [BOFFSET], 2 * SIZE
  6164. ;;
  6165. LDFPD f78, f79 = [BOFFSET]
  6166. adds BOFFSET = 4 * SIZE, BOFFSET
  6167. ;;
  6168. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  6169. ;;
  6170. LDFPD f92, f93 = [BOFFSET], 2 * SIZE
  6171. ;;
  6172. LDFPD f94, f95 = [BOFFSET]
  6173. adds BOFFSET = 6 * SIZE, BOFFSET
  6174. ;;
  6175. LDFPD f108, f109 = [BOFFSET], 2 * SIZE
  6176. ;;
  6177. LDFPD f110, f111 = [BOFFSET]
  6178. adds BOFFSET = 8 * SIZE, BOFFSET
  6179. ;;
  6180. LDFPD f126, f127 = [BOFFSET]
  6181. adds BOFFSET = - 30 * SIZE, BOFFSET
  6182. ;;
  6183. FMPY f32 = f72, f64
  6184. FMPY f33 = f73, f64
  6185. FMPY f34 = f72, f66
  6186. FMPY f35 = f73, f66
  6187. ;;
  6188. FMA_C f64 = f73, f65, f32
  6189. FMA_D f65 = f72, f65, f33
  6190. FMA_C f66 = f73, f67, f34
  6191. FMA_D f67 = f72, f67, f35
  6192. ;;
  6193. FNMA f80 = f74, f64, f80
  6194. FMA_A f81 = f75, f64, f81
  6195. FNMA f82 = f74, f66, f82
  6196. FMA_A f83 = f75, f66, f83
  6197. ;;
  6198. FMA_B f80 = f75, f65, f80
  6199. FNMA f81 = f74, f65, f81
  6200. FMA_B f82 = f75, f67, f82
  6201. FNMA f83 = f74, f67, f83
  6202. ;;
  6203. FNMA f96 = f76, f64, f96
  6204. FMA_A f97 = f77, f64, f97
  6205. FNMA f98 = f76, f66, f98
  6206. FMA_A f99 = f77, f66, f99
  6207. ;;
  6208. FMA_B f96 = f77, f65, f96
  6209. FNMA f97 = f76, f65, f97
  6210. FMA_B f98 = f77, f67, f98
  6211. FNMA f99 = f76, f67, f99
  6212. ;;
  6213. FNMA f112 = f78, f64, f112
  6214. FMA_A f113 = f79, f64, f113
  6215. FNMA f114 = f78, f66, f114
  6216. FMA_A f115 = f79, f66, f115
  6217. ;;
  6218. FMA_B f112 = f79, f65, f112
  6219. FNMA f113 = f78, f65, f113
  6220. FMA_B f114 = f79, f67, f114
  6221. FNMA f115 = f78, f67, f115
  6222. ;;
  6223. FMPY f32 = f90, f80
  6224. FMPY f33 = f91, f80
  6225. FMPY f34 = f90, f82
  6226. FMPY f35 = f91, f82
  6227. ;;
  6228. FMA_C f80 = f91, f81, f32
  6229. FMA_D f81 = f90, f81, f33
  6230. FMA_C f82 = f91, f83, f34
  6231. FMA_D f83 = f90, f83, f35
  6232. ;;
  6233. FNMA f96 = f92, f80, f96
  6234. FMA_A f97 = f93, f80, f97
  6235. FNMA f98 = f92, f82, f98
  6236. FMA_A f99 = f93, f82, f99
  6237. ;;
  6238. FMA_B f96 = f93, f81, f96
  6239. FNMA f97 = f92, f81, f97
  6240. FMA_B f98 = f93, f83, f98
  6241. FNMA f99 = f92, f83, f99
  6242. ;;
  6243. FNMA f112 = f94, f80, f112
  6244. FMA_A f113 = f95, f80, f113
  6245. FNMA f114 = f94, f82, f114
  6246. FMA_A f115 = f95, f82, f115
  6247. ;;
  6248. FMA_B f112 = f95, f81, f112
  6249. FNMA f113 = f94, f81, f113
  6250. FMA_B f114 = f95, f83, f114
  6251. FNMA f115 = f94, f83, f115
  6252. ;;
  6253. FMPY f32 = f108, f96
  6254. FMPY f33 = f109, f96
  6255. FMPY f34 = f108, f98
  6256. FMPY f35 = f109, f98
  6257. ;;
  6258. FMA_C f96 = f109, f97, f32
  6259. FMA_D f97 = f108, f97, f33
  6260. FMA_C f98 = f109, f99, f34
  6261. FMA_D f99 = f108, f99, f35
  6262. ;;
  6263. FNMA f112 = f110, f96, f112
  6264. FMA_A f113 = f111, f96, f113
  6265. FNMA f114 = f110, f98, f114
  6266. FMA_A f115 = f111, f98, f115
  6267. ;;
  6268. FMA_B f112 = f111, f97, f112
  6269. FNMA f113 = f110, f97, f113
  6270. FMA_B f114 = f111, f99, f114
  6271. FNMA f115 = f110, f99, f115
  6272. ;;
  6273. FMPY f32 = f126, f112
  6274. FMPY f33 = f127, f112
  6275. FMPY f34 = f126, f114
  6276. FMPY f35 = f127, f114
  6277. ;;
  6278. FMA_C f112 = f127, f113, f32
  6279. FMA_D f113 = f126, f113, f33
  6280. FMA_C f114 = f127, f115, f34
  6281. FMA_D f115 = f126, f115, f35
  6282. ;;
  6283. #endif
  6284. #ifdef RT
  6285. adds BOFFSET = 30 * SIZE, BOFFSET
  6286. ;;
  6287. LDFPD f72, f73 = [BOFFSET]
  6288. adds BOFFSET = - 2 * SIZE, BOFFSET
  6289. ;;
  6290. LDFPD f74, f75 = [BOFFSET]
  6291. adds BOFFSET = - 2 * SIZE, BOFFSET
  6292. ;;
  6293. LDFPD f76, f77 = [BOFFSET]
  6294. adds BOFFSET = - 2 * SIZE, BOFFSET
  6295. ;;
  6296. LDFPD f78, f79 = [BOFFSET]
  6297. adds BOFFSET = - 4 * SIZE, BOFFSET
  6298. ;;
  6299. LDFPD f88, f89 = [BOFFSET]
  6300. adds BOFFSET = - 2 * SIZE, BOFFSET
  6301. ;;
  6302. LDFPD f90, f91 = [BOFFSET]
  6303. adds BOFFSET = - 2 * SIZE, BOFFSET
  6304. ;;
  6305. LDFPD f92, f93 = [BOFFSET]
  6306. adds BOFFSET = - 6 * SIZE, BOFFSET
  6307. ;;
  6308. LDFPD f104, f105 = [BOFFSET]
  6309. adds BOFFSET = - 2 * SIZE, BOFFSET
  6310. ;;
  6311. LDFPD f106, f107 = [BOFFSET]
  6312. adds BOFFSET = - 8 * SIZE, BOFFSET
  6313. ;;
  6314. LDFPD f120, f121 = [BOFFSET]
  6315. ;;
  6316. FMPY f32 = f72, f112
  6317. FMPY f33 = f73, f112
  6318. FMPY f34 = f72, f114
  6319. FMPY f35 = f73, f114
  6320. ;;
  6321. FMA_C f112 = f73, f113, f32
  6322. FMA_D f113 = f72, f113, f33
  6323. FMA_C f114 = f73, f115, f34
  6324. FMA_D f115 = f72, f115, f35
  6325. ;;
  6326. FNMA f96 = f74, f112, f96
  6327. FMA_A f97 = f75, f112, f97
  6328. FNMA f98 = f74, f114, f98
  6329. FMA_A f99 = f75, f114, f99
  6330. ;;
  6331. FMA_B f96 = f75, f113, f96
  6332. FNMA f97 = f74, f113, f97
  6333. FMA_B f98 = f75, f115, f98
  6334. FNMA f99 = f74, f115, f99
  6335. ;;
  6336. FNMA f80 = f76, f112, f80
  6337. FMA_A f81 = f77, f112, f81
  6338. FNMA f82 = f76, f114, f82
  6339. FMA_A f83 = f77, f114, f83
  6340. ;;
  6341. FMA_B f80 = f77, f113, f80
  6342. FNMA f81 = f76, f113, f81
  6343. FMA_B f82 = f77, f115, f82
  6344. FNMA f83 = f76, f115, f83
  6345. ;;
  6346. FNMA f64 = f78, f112, f64
  6347. FMA_A f65 = f79, f112, f65
  6348. FNMA f66 = f78, f114, f66
  6349. FMA_A f67 = f79, f114, f67
  6350. ;;
  6351. FMA_B f64 = f79, f113, f64
  6352. FNMA f65 = f78, f113, f65
  6353. FMA_B f66 = f79, f115, f66
  6354. FNMA f67 = f78, f115, f67
  6355. ;;
  6356. FMPY f32 = f88, f96
  6357. FMPY f33 = f89, f96
  6358. FMPY f34 = f88, f98
  6359. FMPY f35 = f89, f98
  6360. ;;
  6361. FMA_C f96 = f89, f97, f32
  6362. FMA_D f97 = f88, f97, f33
  6363. FMA_C f98 = f89, f99, f34
  6364. FMA_D f99 = f88, f99, f35
  6365. ;;
  6366. FNMA f80 = f90, f96, f80
  6367. FMA_A f81 = f91, f96, f81
  6368. FNMA f82 = f90, f98, f82
  6369. FMA_A f83 = f91, f98, f83
  6370. ;;
  6371. FMA_B f80 = f91, f97, f80
  6372. FNMA f81 = f90, f97, f81
  6373. FMA_B f82 = f91, f99, f82
  6374. FNMA f83 = f90, f99, f83
  6375. ;;
  6376. FNMA f64 = f92, f96, f64
  6377. FMA_A f65 = f93, f96, f65
  6378. FNMA f66 = f92, f98, f66
  6379. FMA_A f67 = f93, f98, f67
  6380. ;;
  6381. FMA_B f64 = f93, f97, f64
  6382. FNMA f65 = f92, f97, f65
  6383. FMA_B f66 = f93, f99, f66
  6384. FNMA f67 = f92, f99, f67
  6385. ;;
  6386. FMPY f32 = f104, f80
  6387. FMPY f33 = f105, f80
  6388. FMPY f34 = f104, f82
  6389. FMPY f35 = f105, f82
  6390. ;;
  6391. FMA_C f80 = f105, f81, f32
  6392. FMA_D f81 = f104, f81, f33
  6393. FMA_C f82 = f105, f83, f34
  6394. FMA_D f83 = f104, f83, f35
  6395. ;;
  6396. FNMA f64 = f106, f80, f64
  6397. FMA_A f65 = f107, f80, f65
  6398. FNMA f66 = f106, f82, f66
  6399. FMA_A f67 = f107, f82, f67
  6400. ;;
  6401. FMA_B f64 = f107, f81, f64
  6402. FNMA f65 = f106, f81, f65
  6403. FMA_B f66 = f107, f83, f66
  6404. FNMA f67 = f106, f83, f67
  6405. ;;
  6406. FMPY f32 = f120, f64
  6407. FMPY f33 = f121, f64
  6408. FMPY f34 = f120, f66
  6409. FMPY f35 = f121, f66
  6410. ;;
  6411. FMA_C f64 = f121, f65, f32
  6412. FMA_D f65 = f120, f65, f33
  6413. FMA_C f66 = f121, f67, f34
  6414. FMA_D f67 = f120, f67, f35
  6415. ;;
  6416. #endif
  6417. #if defined(LN) || defined(LT)
  6418. adds BOFFSET2 = 4 * SIZE, BOFFSET
  6419. ;;
  6420. STFD [BOFFSET] = f64, SIZE
  6421. STFD [BOFFSET2] = f96, SIZE
  6422. ;;
  6423. STFD [BOFFSET] = f65, SIZE
  6424. STFD [BOFFSET2] = f97, SIZE
  6425. ;;
  6426. STFD [BOFFSET] = f80, SIZE
  6427. STFD [BOFFSET2] = f112, SIZE
  6428. ;;
  6429. STFD [BOFFSET] = f81, 5 * SIZE
  6430. STFD [BOFFSET2] = f113, 5 * SIZE
  6431. ;;
  6432. STFD [BOFFSET] = f66, SIZE
  6433. STFD [BOFFSET2] = f98, SIZE
  6434. ;;
  6435. STFD [BOFFSET] = f67, SIZE
  6436. STFD [BOFFSET2] = f99, SIZE
  6437. ;;
  6438. STFD [BOFFSET] = f82, SIZE
  6439. STFD [BOFFSET2] = f114, SIZE
  6440. ;;
  6441. STFD [BOFFSET] = f83, 5 * SIZE
  6442. STFD [BOFFSET2] = f115, 5 * SIZE
  6443. ;;
  6444. adds BOFFSET = - 16 * SIZE, BOFFSET
  6445. ;;
  6446. #else
  6447. adds AOFFSET2 = 4 * SIZE, AOFFSET
  6448. ;;
  6449. STFD [AOFFSET] = f64, SIZE
  6450. STFD [AOFFSET2] = f80, SIZE
  6451. ;;
  6452. STFD [AOFFSET] = f65, SIZE
  6453. STFD [AOFFSET2] = f81, SIZE
  6454. ;;
  6455. STFD [AOFFSET] = f66, SIZE
  6456. STFD [AOFFSET2] = f82, SIZE
  6457. ;;
  6458. STFD [AOFFSET] = f67, 5 * SIZE
  6459. STFD [AOFFSET2] = f83, 5 * SIZE
  6460. ;;
  6461. STFD [AOFFSET] = f96, SIZE
  6462. STFD [AOFFSET2] = f112, SIZE
  6463. ;;
  6464. STFD [AOFFSET] = f97, SIZE
  6465. STFD [AOFFSET2] = f113, SIZE
  6466. ;;
  6467. STFD [AOFFSET] = f98, SIZE
  6468. STFD [AOFFSET2] = f114, SIZE
  6469. ;;
  6470. STFD [AOFFSET] = f99, 5 * SIZE
  6471. STFD [AOFFSET2] = f115, 5 * SIZE
  6472. ;;
  6473. adds AOFFSET = - 16 * SIZE, AOFFSET
  6474. ;;
  6475. #endif
  6476. #ifdef LN
  6477. adds C1 = -4 * SIZE, C1
  6478. adds C2 = -4 * SIZE, C2
  6479. adds C3 = -4 * SIZE, C3
  6480. adds C4 = -4 * SIZE, C4
  6481. #endif
  6482. ;;
  6483. STFD [C1 ] = f64, SIZE
  6484. ;;
  6485. STFD [C1 ] = f65, SIZE
  6486. ;;
  6487. STFD [C1 ] = f66, SIZE
  6488. ;;
  6489. STFD [C1 ] = f67, SIZE
  6490. ;;
  6491. STFD [C2 ] = f80, SIZE
  6492. ;;
  6493. STFD [C2 ] = f81, SIZE
  6494. ;;
  6495. STFD [C2 ] = f82, SIZE
  6496. ;;
  6497. STFD [C2 ] = f83, SIZE
  6498. ;;
  6499. STFD [C3 ] = f96, SIZE
  6500. ;;
  6501. STFD [C3 ] = f97, SIZE
  6502. ;;
  6503. STFD [C3 ] = f98, SIZE
  6504. ;;
  6505. STFD [C3 ] = f99, SIZE
  6506. ;;
  6507. STFD [C4 ] = f112, SIZE
  6508. ;;
  6509. STFD [C4 ] = f113, SIZE
  6510. ;;
  6511. STFD [C4 ] = f114, SIZE
  6512. ;;
  6513. STFD [C4 ] = f115, SIZE
  6514. ;;
  6515. mov f64 = f0
  6516. mov f65 = f0
  6517. mov f80 = f0
  6518. mov f81 = f0
  6519. mov f96 = f0
  6520. mov f97 = f0
  6521. mov f112 = f0
  6522. mov f113 = f0
  6523. ;;
  6524. #ifdef LN
  6525. adds C1 = -4 * SIZE, C1
  6526. adds C2 = -4 * SIZE, C2
  6527. adds C3 = -4 * SIZE, C3
  6528. adds C4 = -4 * SIZE, C4
  6529. #endif
  6530. ;;
  6531. cmp.ne p6, p0 = 1, I
  6532. ;;
  6533. adds I = -1, I
  6534. ;;
  6535. shladd r2 = K, ZBASE_SHIFT, r0
  6536. ;;
  6537. sub L = K, KK
  6538. ;;
  6539. #ifdef RT
  6540. shladd AORIG = r2, 1, AORIG
  6541. #endif
  6542. ;;
  6543. #if defined(LT) || defined(RN)
  6544. shladd L = L, ZBASE_SHIFT, r0
  6545. ;;
  6546. shladd AOFFSET = L, 1, AOFFSET
  6547. shladd BOFFSET = L, 2, BOFFSET
  6548. #endif
  6549. ;;
  6550. #ifdef LT
  6551. adds KK = 2, KK
  6552. #elif defined LN
  6553. adds KK = -2, KK
  6554. #else
  6555. nop __LINE__
  6556. #endif
  6557. ;;
  6558. #if defined(LT) || defined(RN)
  6559. mov L = KK
  6560. #else
  6561. sub L = K, KK
  6562. #endif
  6563. ;;
  /*
   * .L030 -- set-up for the tile selected by bit 0 of M inside the
   * current 4-column panel (one A pair against four B pairs per step).
   *
   * Branches to .L049 when bit 0 of M is clear.  Otherwise it
   *   - sets L to the inner-product length: KK for LT/RN, K - KK for
   *     the other variants;
   *   - sets p7 = (L != 0), which predicates every initial operand load;
   *   - points BOFFSET (and, for LN/RT, AOFFSET) at the operands;
   *   - clears f72/f73, f88/f89, f104/f105 and f120/f121;
   *   - loads ar.lc with ((L + 1) >> 1) - 1 and jumps to .L038 when
   *     that value is -1, i.e. when L was 0.
   * p12 records that bit 0 of (L + 1) is clear; the loop at .L032 uses
   * it to drop the second half of its final trip.
   * ZBASE_SHIFT is defined outside this excerpt; presumably log2 of the
   * byte size of one complex element -- confirm.
   */
  6564. .align 16
  6565. .L030:
  6566. { .mib
  6567. #if defined(LT) || defined(RN)
  6568. mov L = KK
  6569. #else
  6570. sub L = K, KK
  6571. #endif
  6572. tbit.z p6, p7 = M, 0
  6573. (p6) br.cond.dptk .L049
  6574. }
  6575. ;;
  /* p7 = (L != 0); r2 = K << ZBASE_SHIFT; r3 = KK << ZBASE_SHIFT. */
  6576. { .mmi
  6577. cmp.ne p7, p0 = r0, L
  6578. adds BOFFSET = 0 * SIZE, B
  6579. shl r2 = K, ZBASE_SHIFT
  6580. }
  6581. { .mmi
  6582. shladd r3 = KK, ZBASE_SHIFT, r0
  6583. nop __LINE__
  6584. nop __LINE__
  6585. }
  6586. ;;
  6587. #if defined(LT) || defined(RN)
  /* LT/RN: BOFFSET stays at B; AOFFSET is whatever earlier code left. */
  6588. { .mfb
  6589. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6590. mov f72 = f0
  6591. nop __LINE__
  6592. }
  6593. { .mmf
  6594. nop __LINE__
  6595. nop __LINE__
  6596. mov f73 = f0
  6597. }
  6598. ;;
  6599. #else
  /* LN/RT: BOFFSET = B + 4 * r3; LN additionally steps AORIG back by r2;
     then AOFFSET = AORIG + r3. */
  6600. { .mfi
  6601. shladd BOFFSET = r3, 2, B
  6602. mov f72 = f0
  6603. #ifdef LN
  6604. sub AORIG = AORIG, r2
  6605. #else
  6606. nop __LINE__
  6607. #endif
  6608. }
  6609. ;;
  6610. { .mfi
  6611. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6612. mov f73 = f0
  6613. add AOFFSET = r3, AORIG
  6614. }
  6615. ;;
  6616. #endif
  6617. ;;
  /* Trip count: L = ((L + 1) >> 1) - 1, with p12 = bit 0 of (L + 1) clear. */
  6618. adds L = 1, L
  6619. ;;
  6620. { .mmi
  6621. nop __LINE__
  6622. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  6623. tbit.z p12, p0 = L, 0
  6624. }
  6625. ;;
  6626. { .mfi
  6627. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6628. mov f88 = f0
  6629. shr L = L, 1
  6630. }
  6631. { .mfi
  6632. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6633. mov f89 = f0
  6634. nop __LINE__
  6635. }
  6636. ;;
  6637. { .mfi
  6638. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  6639. mov f104 = f0
  6640. adds L = -1, L
  6641. }
  6642. { .mfb
  6643. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  6644. mov f105 = f0
  6645. nop __LINE__
  6646. }
  6647. ;;
  6648. { .mfi
  6649. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  6650. mov f120 = f0
  6651. mov ar.lc = L
  6652. }
  /* p3 starts out true so the second half of each loop trip is enabled. */
  6653. { .mfi
  6654. cmp.eq p3, p0 = r0, r0
  6655. mov f121 = f0
  6656. nop __LINE__
  6657. }
  6658. ;;
  /* Nothing to accumulate when the adjusted count is -1: go solve. */
  6659. cmp.eq p6, p0 = -1, L
  6660. (p6) br.cond.dpnt .L038
  6661. ;;
  /*
   * .L032 -- inner-product loop for the tile set up at .L030; each trip
   * performs up to two accumulation steps.
   *
   * First half (unpredicated): the A pair f32/f33 is combined with the
   * four B pairs f48..f55 into the accumulators f64/f65, f80/f81,
   * f96/f97 and f112/f113.
   * Second half (predicated on p3): the same pattern with the A pair
   * f40/f41 and the B pairs f56..f63.
   * When p12 is set, p3 is recomputed as (L != 0), so the second half is
   * skipped on the trip where L has reached 0.  p4 = (L != 0) guards the
   * reload of f32/f33 and f48..f55 for the following trip.
   * L is decremented once per trip; br.cloop loops on ar.lc.
   * FMA_A and FMA_B are macros defined outside this excerpt; presumably
   * they pick add or subtract according to the conjugation variant --
   * confirm against their definitions.
   */
  6662. .align 16
  6663. .L032:
  /* First half, f32 (first value of the A pair) times each B value. */
  6664. { .mfb
  6665. lfetch.nt1 [PREA], 4 * SIZE
  6666. FMA f64 = f32, f48, f64 // A1 * B1
  6667. nop __LINE__
  6668. }
  6669. { .mfi
  6670. nop __LINE__
  6671. FMA_B f65 = f32, f49, f65 // A1 * B2
  6672. (p12) cmp.ne p3, p0 = 0, L
  6673. }
  6674. ;;
  6675. { .mfi
  6676. lfetch.nt1 [PREB], 16 * SIZE
  6677. FMA f80 = f32, f50, f80 // A1 * B3
  6678. cmp.ne p4, p5 = 0, L
  6679. }
  6680. { .mfb
  6681. nop __LINE__
  6682. FMA_B f81 = f32, f51, f81 // A1 * B4
  6683. nop __LINE__
  6684. }
  6685. ;;
  6686. { .mfb
  6687. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6688. FMA f96 = f32, f52, f96 // A1 * B5
  6689. nop __LINE__
  6690. }
  6691. { .mfb
  6692. nop __LINE__
  6693. FMA_B f97 = f32, f53, f97 // A1 * B6
  6694. nop __LINE__
  6695. }
  6696. ;;
  6697. { .mfb
  6698. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6699. FMA f112 = f32, f54, f112 // A1 * B7
  6700. nop __LINE__
  6701. }
  6702. { .mfb
  6703. nop __LINE__
  6704. FMA_B f113 = f32, f55, f113 // A1 * B8
  6705. nop __LINE__
  6706. }
  6707. ;;
  /* First half, f33 (second value of the A pair) times each B value. */
  6708. { .mfb
  6709. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  6710. FMA f65 = f33, f48, f65 // A2 * B1
  6711. nop __LINE__
  6712. }
  6713. { .mfb
  6714. nop __LINE__
  6715. FMA_A f64 = f33, f49, f64 // A2 * B2
  6716. nop __LINE__
  6717. }
  6718. ;;
  6719. { .mfb
  6720. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  6721. FMA f81 = f33, f50, f81 // A2 * B3
  6722. nop __LINE__
  6723. }
  6724. { .mfb
  6725. nop __LINE__
  6726. FMA_A f80 = f33, f51, f80 // A2 * B4
  6727. nop __LINE__
  6728. }
  6729. ;;
  6730. { .mfb
  6731. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  6732. FMA f97 = f33, f52, f97 // A2 * B5
  6733. nop __LINE__
  6734. }
  6735. { .mfb
  6736. nop __LINE__
  6737. FMA_A f96 = f33, f53, f96 // A2 * B6
  6738. nop __LINE__
  6739. }
  6740. ;;
  /* From here on the p4 loads refill f32/f33 and f48..f55 for the next trip. */
  6741. { .mfb
  6742. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6743. FMA f113 = f33, f54, f113 // A2 * B7
  6744. nop __LINE__
  6745. }
  6746. { .mfb
  6747. nop __LINE__
  6748. FMA_A f112 = f33, f55, f112 // A2 * B8
  6749. nop __LINE__
  6750. }
  6751. ;;
  /* Second half (only when p3): f40 times the B values f56..f63. */
  6752. { .mfb
  6753. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6754. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6755. nop __LINE__
  6756. }
  6757. { .mfb
  6758. nop __LINE__
  6759. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  6760. nop __LINE__
  6761. }
  6762. ;;
  6763. { .mfb
  6764. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6765. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6766. nop __LINE__
  6767. }
  6768. { .mfb
  6769. nop __LINE__
  6770. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  6771. nop __LINE__
  6772. }
  6773. ;;
  6774. { .mfb
  6775. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  6776. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  6777. nop __LINE__
  6778. }
  6779. { .mfb
  6780. nop __LINE__
  6781. (p3) FMA_B f97 = f40, f61, f97 // A1 * B6
  6782. nop __LINE__
  6783. }
  6784. ;;
  6785. { .mfb
  6786. nop __LINE__
  6787. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  6788. nop __LINE__
  6789. }
  6790. { .mfb
  6791. nop __LINE__
  6792. (p3) FMA_B f113 = f40, f63, f113 // A1 * B8
  6793. nop __LINE__
  6794. }
  6795. ;;
  /* Second half (only when p3): f41 times the B values f56..f63. */
  6796. { .mfb
  6797. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  6798. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6799. nop __LINE__
  6800. }
  6801. { .mfb
  6802. nop __LINE__
  6803. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  6804. nop __LINE__
  6805. }
  6806. ;;
  6807. { .mfb
  6808. nop __LINE__
  6809. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  6810. nop __LINE__
  6811. }
  6812. { .mfb
  6813. nop __LINE__
  6814. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  6815. nop __LINE__
  6816. }
  6817. ;;
  6818. { .mfb
  6819. nop __LINE__
  6820. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  6821. nop __LINE__
  6822. }
  6823. { .mfb
  6824. nop __LINE__
  6825. (p3) FMA_A f96 = f41, f61, f96 // A2 * B6
  6826. nop __LINE__
  6827. }
  6828. ;;
  /* Count the trip down in L and loop while ar.lc allows. */
  6829. { .mfi
  6830. nop __LINE__
  6831. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  6832. adds L = -1, L
  6833. }
  6834. { .mfb
  6835. nop __LINE__
  6836. (p3) FMA_A f112 = f41, f63, f112 // A2 * B8
  6837. br.cloop.sptk.few .L032
  6838. }
  6839. ;;
  /*
   * .L038 -- solve and write-back for the tile accumulated at .L032.
   *
   * Accumulator pairs and the C pointer each is finally stored through:
   *   f64/f65 -> C1, f80/f81 -> C2, f96/f97 -> C3, f112/f113 -> C4.
   * Steps:
   *   1. LN/RT only: re-derive AOFFSET and BOFFSET from AORIG, B and KK.
   *   2. Load the eight right-hand-side values (from BOFFSET for LN/LT,
   *      from AOFFSET for RN/RT) and subtract the accumulators.
   *   3. Apply the triangular coefficients for the selected variant.
   *   4. Store the results back to the buffer read in step 2 and to C.
   *   5. Clear the accumulators, then advance the pointers and KK.
   * The diagonal coefficients are applied by multiplication, not
   * division; presumably the packed triangular data already holds
   * inverted diagonals -- confirm against the packing routine.
   * FSUB_A, FMA_A, FMA_B, FMA_C and FMA_D are macros defined outside
   * this excerpt; their signs presumably depend on the conjugation
   * variant -- confirm.
   */
  6840. .L038:
  6841. #if defined(LN) || defined(RT)
  /* r2 = (KK - 1) for LN or (KK - 4) for RT, shifted left by ZBASE_SHIFT;
     AOFFSET = AORIG + r2 and BOFFSET = B + 4 * r2. */
  6842. #ifdef LN
  6843. adds r2 = -1, KK
  6844. #else
  6845. adds r2 = -4, KK
  6846. #endif
  6847. ;;
  6848. shladd r2 = r2, ZBASE_SHIFT, r0
  6849. ;;
  6850. add AOFFSET = r2, AORIG
  6851. shladd BOFFSET = r2, 2, B
  6852. ;;
  6853. #endif
  6854. #if defined(LN) || defined(LT)
  /* LN/LT: right-hand side comes from BOFFSET, which is restored afterwards
     (three post-increments of 2 * SIZE undone by -6 * SIZE). */
  6855. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  6856. ;;
  6857. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  6858. ;;
  6859. LDFPD f104, f105 = [BOFFSET], 2 * SIZE
  6860. ;;
  6861. LDFPD f120, f121 = [BOFFSET]
  6862. adds BOFFSET = -6 * SIZE, BOFFSET
  6863. ;;
  6864. FSUB f64 = f72, f64
  6865. FSUB_A f65 = f73, f65
  6866. FSUB f80 = f88, f80
  6867. FSUB_A f81 = f89, f81
  6868. FSUB f96 = f104, f96
  6869. FSUB_A f97 = f105, f97
  6870. FSUB f112 = f120, f112
  6871. FSUB_A f113 = f121, f113
  6872. ;;
  6873. #else
  /* RN/RT: right-hand side comes from AOFFSET, which is restored afterwards. */
  6874. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  6875. ;;
  6876. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  6877. ;;
  6878. LDFPD f104, f105 = [AOFFSET], 2 * SIZE
  6879. ;;
  6880. LDFPD f120, f121 = [AOFFSET]
  6881. adds AOFFSET = -6 * SIZE, AOFFSET
  6882. ;;
  6883. FSUB f64 = f72, f64
  6884. FSUB f65 = f73, f65
  6885. FSUB f80 = f88, f80
  6886. FSUB f81 = f89, f81
  6887. FSUB f96 = f104, f96
  6888. FSUB f97 = f105, f97
  6889. FSUB f112 = f120, f112
  6890. FSUB f113 = f121, f113
  6891. ;;
  6892. #endif
  6893. #ifdef LN
  /* LN: multiply each of the four pairs by the single coefficient pair
     f120/f121 loaded from AOFFSET (FMPY products, then FMA_C / FMA_D). */
  6894. LDFPD f120, f121 = [AOFFSET]
  6895. ;;
  6896. FMPY f32 = f120, f64
  6897. FMPY f33 = f121, f64
  6898. FMPY f34 = f120, f80
  6899. FMPY f35 = f121, f80
  6900. FMPY f36 = f120, f96
  6901. FMPY f37 = f121, f96
  6902. FMPY f38 = f120, f112
  6903. FMPY f39 = f121, f112
  6904. ;;
  6905. FMA_C f64 = f121, f65, f32
  6906. FMA_D f65 = f120, f65, f33
  6907. FMA_C f80 = f121, f81, f34
  6908. FMA_D f81 = f120, f81, f35
  6909. FMA_C f96 = f121, f97, f36
  6910. FMA_D f97 = f120, f97, f37
  6911. FMA_C f112 = f121, f113, f38
  6912. FMA_D f113 = f120, f113, f39
  6913. ;;
  6914. #endif
  6915. #ifdef LT
  /* LT: same scaling as LN, with the coefficient pair held in f90/f91. */
  6916. LDFPD f90, f91 = [AOFFSET]
  6917. ;;
  6918. FMPY f32 = f90, f64
  6919. FMPY f33 = f91, f64
  6920. FMPY f34 = f90, f80
  6921. FMPY f35 = f91, f80
  6922. FMPY f36 = f90, f96
  6923. FMPY f37 = f91, f96
  6924. FMPY f38 = f90, f112
  6925. FMPY f39 = f91, f112
  6926. ;;
  6927. FMA_C f64 = f91, f65, f32
  6928. FMA_D f65 = f90, f65, f33
  6929. FMA_C f80 = f91, f81, f34
  6930. FMA_D f81 = f90, f81, f35
  6931. FMA_C f96 = f91, f97, f36
  6932. FMA_D f97 = f90, f97, f37
  6933. FMA_C f112 = f91, f113, f38
  6934. FMA_D f113 = f90, f113, f39
  6935. ;;
  6936. #endif
  6937. #ifdef RN
  /* RN: ten coefficient pairs are read from BOFFSET at SIZE offsets
     0,2,4,6 / 10,12,14 / 20,22 / 30; BOFFSET then returns to its start. */
  6938. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  6939. ;;
  6940. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  6941. ;;
  6942. LDFPD f76, f77 = [BOFFSET], 2 * SIZE
  6943. ;;
  6944. LDFPD f78, f79 = [BOFFSET]
  6945. adds BOFFSET = 4 * SIZE, BOFFSET
  6946. ;;
  6947. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  6948. ;;
  6949. LDFPD f92, f93 = [BOFFSET], 2 * SIZE
  6950. ;;
  6951. LDFPD f94, f95 = [BOFFSET]
  6952. adds BOFFSET = 6 * SIZE, BOFFSET
  6953. ;;
  6954. LDFPD f108, f109 = [BOFFSET], 2 * SIZE
  6955. ;;
  6956. LDFPD f110, f111 = [BOFFSET]
  6957. adds BOFFSET = 8 * SIZE, BOFFSET
  6958. ;;
  6959. LDFPD f126, f127 = [BOFFSET]
  6960. adds BOFFSET = - 30 * SIZE, BOFFSET
  6961. ;;
  /* Stage 1: scale f64/f65 by f72/f73, then update the other three pairs
     with f74/f75, f76/f77 and f78/f79. */
  6962. FMPY f32 = f72, f64
  6963. FMPY f33 = f73, f64
  6964. ;;
  6965. FMA_C f64 = f73, f65, f32
  6966. FMA_D f65 = f72, f65, f33
  6967. ;;
  6968. FNMA f80 = f74, f64, f80
  6969. FMA_A f81 = f75, f64, f81
  6970. ;;
  6971. FMA_B f80 = f75, f65, f80
  6972. FNMA f81 = f74, f65, f81
  6973. ;;
  6974. FNMA f96 = f76, f64, f96
  6975. FMA_A f97 = f77, f64, f97
  6976. ;;
  6977. FMA_B f96 = f77, f65, f96
  6978. FNMA f97 = f76, f65, f97
  6979. ;;
  6980. FNMA f112 = f78, f64, f112
  6981. FMA_A f113 = f79, f64, f113
  6982. ;;
  6983. FMA_B f112 = f79, f65, f112
  6984. FNMA f113 = f78, f65, f113
  6985. ;;
  /* Stage 2: scale f80/f81 by f90/f91, then update f96/f97 and f112/f113. */
  6986. FMPY f32 = f90, f80
  6987. FMPY f33 = f91, f80
  6988. ;;
  6989. FMA_C f80 = f91, f81, f32
  6990. FMA_D f81 = f90, f81, f33
  6991. ;;
  6992. FNMA f96 = f92, f80, f96
  6993. FMA_A f97 = f93, f80, f97
  6994. ;;
  6995. FMA_B f96 = f93, f81, f96
  6996. FNMA f97 = f92, f81, f97
  6997. ;;
  6998. FNMA f112 = f94, f80, f112
  6999. FMA_A f113 = f95, f80, f113
  7000. ;;
  7001. FMA_B f112 = f95, f81, f112
  7002. FNMA f113 = f94, f81, f113
  7003. ;;
  /* Stage 3: scale f96/f97 by f108/f109, then update f112/f113. */
  7004. FMPY f32 = f108, f96
  7005. FMPY f33 = f109, f96
  7006. ;;
  7007. FMA_C f96 = f109, f97, f32
  7008. FMA_D f97 = f108, f97, f33
  7009. ;;
  7010. FNMA f112 = f110, f96, f112
  7011. FMA_A f113 = f111, f96, f113
  7012. ;;
  7013. FMA_B f112 = f111, f97, f112
  7014. FNMA f113 = f110, f97, f113
  7015. ;;
  /* Stage 4: scale f112/f113 by f126/f127. */
  7016. FMPY f32 = f126, f112
  7017. FMPY f33 = f127, f112
  7018. ;;
  7019. FMA_C f112 = f127, f113, f32
  7020. FMA_D f113 = f126, f113, f33
  7021. ;;
  7022. #endif
  7023. #ifdef RT
  /* RT: the same ten positions are read in descending order, starting at
     SIZE offset 30 and ending with BOFFSET back at its start. */
  7024. adds BOFFSET = 30 * SIZE, BOFFSET
  7025. ;;
  7026. LDFPD f72, f73 = [BOFFSET]
  7027. adds BOFFSET = - 2 * SIZE, BOFFSET
  7028. ;;
  7029. LDFPD f74, f75 = [BOFFSET]
  7030. adds BOFFSET = - 2 * SIZE, BOFFSET
  7031. ;;
  7032. LDFPD f76, f77 = [BOFFSET]
  7033. adds BOFFSET = - 2 * SIZE, BOFFSET
  7034. ;;
  7035. LDFPD f78, f79 = [BOFFSET]
  7036. adds BOFFSET = - 4 * SIZE, BOFFSET
  7037. ;;
  7038. LDFPD f88, f89 = [BOFFSET]
  7039. adds BOFFSET = - 2 * SIZE, BOFFSET
  7040. ;;
  7041. LDFPD f90, f91 = [BOFFSET]
  7042. adds BOFFSET = - 2 * SIZE, BOFFSET
  7043. ;;
  7044. LDFPD f92, f93 = [BOFFSET]
  7045. adds BOFFSET = - 6 * SIZE, BOFFSET
  7046. ;;
  7047. LDFPD f104, f105 = [BOFFSET]
  7048. adds BOFFSET = - 2 * SIZE, BOFFSET
  7049. ;;
  7050. LDFPD f106, f107 = [BOFFSET]
  7051. adds BOFFSET = - 8 * SIZE, BOFFSET
  7052. ;;
  7053. LDFPD f120, f121 = [BOFFSET]
  7054. ;;
  /* Stage 1: scale f112/f113 by f72/f73, then update the other three pairs
     with f74/f75, f76/f77 and f78/f79. */
  7055. FMPY f32 = f72, f112
  7056. FMPY f33 = f73, f112
  7057. ;;
  7058. FMA_C f112 = f73, f113, f32
  7059. FMA_D f113 = f72, f113, f33
  7060. ;;
  7061. FNMA f96 = f74, f112, f96
  7062. FMA_A f97 = f75, f112, f97
  7063. ;;
  7064. FMA_B f96 = f75, f113, f96
  7065. FNMA f97 = f74, f113, f97
  7066. ;;
  7067. FNMA f80 = f76, f112, f80
  7068. FMA_A f81 = f77, f112, f81
  7069. ;;
  7070. FMA_B f80 = f77, f113, f80
  7071. FNMA f81 = f76, f113, f81
  7072. ;;
  7073. FNMA f64 = f78, f112, f64
  7074. FMA_A f65 = f79, f112, f65
  7075. ;;
  7076. FMA_B f64 = f79, f113, f64
  7077. FNMA f65 = f78, f113, f65
  7078. ;;
  /* Stage 2: scale f96/f97 by f88/f89, then update f80/f81 and f64/f65. */
  7079. FMPY f32 = f88, f96
  7080. FMPY f33 = f89, f96
  7081. ;;
  7082. FMA_C f96 = f89, f97, f32
  7083. FMA_D f97 = f88, f97, f33
  7084. ;;
  7085. FNMA f80 = f90, f96, f80
  7086. FMA_A f81 = f91, f96, f81
  7087. ;;
  7088. FMA_B f80 = f91, f97, f80
  7089. FNMA f81 = f90, f97, f81
  7090. ;;
  7091. FNMA f64 = f92, f96, f64
  7092. FMA_A f65 = f93, f96, f65
  7093. ;;
  7094. FMA_B f64 = f93, f97, f64
  7095. FNMA f65 = f92, f97, f65
  7096. ;;
  /* Stage 3: scale f80/f81 by f104/f105, then update f64/f65. */
  7097. FMPY f32 = f104, f80
  7098. FMPY f33 = f105, f80
  7099. ;;
  7100. FMA_C f80 = f105, f81, f32
  7101. FMA_D f81 = f104, f81, f33
  7102. ;;
  7103. FNMA f64 = f106, f80, f64
  7104. FMA_A f65 = f107, f80, f65
  7105. ;;
  7106. FMA_B f64 = f107, f81, f64
  7107. FNMA f65 = f106, f81, f65
  7108. ;;
  /* Stage 4: scale f64/f65 by f120/f121. */
  7109. FMPY f32 = f120, f64
  7110. FMPY f33 = f121, f64
  7111. ;;
  7112. FMA_C f64 = f121, f65, f32
  7113. FMA_D f65 = f120, f65, f33
  7114. ;;
  7115. #endif
  /* Write the eight solved values back to the buffer they were read from.
     Two pointers 4 * SIZE apart are used; the final post-increment of
     5 * SIZE is undone by the -8 * SIZE adjustment. */
  7116. #if defined(LN) || defined(LT)
  7117. adds BOFFSET2 = 4 * SIZE, BOFFSET
  7118. ;;
  7119. STFD [BOFFSET] = f64, SIZE
  7120. STFD [BOFFSET2] = f96, SIZE
  7121. ;;
  7122. STFD [BOFFSET] = f65, SIZE
  7123. STFD [BOFFSET2] = f97, SIZE
  7124. ;;
  7125. STFD [BOFFSET] = f80, SIZE
  7126. STFD [BOFFSET2] = f112, SIZE
  7127. ;;
  7128. STFD [BOFFSET] = f81, 5 * SIZE
  7129. STFD [BOFFSET2] = f113, 5 * SIZE
  7130. ;;
  7131. adds BOFFSET = - 8 * SIZE, BOFFSET
  7132. ;;
  7133. #else
  7134. adds AOFFSET2 = 4 * SIZE, AOFFSET
  7135. ;;
  7136. STFD [AOFFSET] = f64, SIZE
  7137. STFD [AOFFSET2] = f96, SIZE
  7138. ;;
  7139. STFD [AOFFSET] = f65, SIZE
  7140. STFD [AOFFSET2] = f97, SIZE
  7141. ;;
  7142. STFD [AOFFSET] = f80, SIZE
  7143. STFD [AOFFSET2] = f112, SIZE
  7144. ;;
  7145. STFD [AOFFSET] = f81, 5 * SIZE
  7146. STFD [AOFFSET2] = f113, 5 * SIZE
  7147. ;;
  7148. adds AOFFSET = - 8 * SIZE, AOFFSET
  7149. ;;
  7150. #endif
  /* LN steps each C pointer back by 2 * SIZE before the stores below. */
  7151. #ifdef LN
  7152. adds C1 = -2 * SIZE, C1
  7153. adds C2 = -2 * SIZE, C2
  7154. adds C3 = -2 * SIZE, C3
  7155. adds C4 = -2 * SIZE, C4
  7156. #endif
  7157. ;;
  /* One solved pair goes to each of C1..C4; every store post-increments
     its pointer by SIZE. */
  7158. STFD [C1 ] = f64, SIZE
  7159. ;;
  7160. STFD [C1 ] = f65, SIZE
  7161. ;;
  7162. STFD [C2 ] = f80, SIZE
  7163. ;;
  7164. STFD [C2 ] = f81, SIZE
  7165. ;;
  7166. STFD [C3 ] = f96, SIZE
  7167. ;;
  7168. STFD [C3 ] = f97, SIZE
  7169. ;;
  7170. STFD [C4 ] = f112, SIZE
  7171. ;;
  7172. STFD [C4 ] = f113, SIZE
  7173. ;;
  /* Clear the accumulators so the next accumulation starts from zero. */
  7174. mov f64 = f0
  7175. mov f65 = f0
  7176. mov f80 = f0
  7177. mov f81 = f0
  7178. mov f96 = f0
  7179. mov f97 = f0
  7180. mov f112 = f0
  7181. mov f113 = f0
  7182. ;;
  /* LN undoes the 2 * SIZE advance made by the stores above. */
  7183. #ifdef LN
  7184. adds C1 = -2 * SIZE, C1
  7185. adds C2 = -2 * SIZE, C2
  7186. adds C3 = -2 * SIZE, C3
  7187. adds C4 = -2 * SIZE, C4
  7188. #endif
  7189. ;;
  /* NOTE(review): p6 set here is not read before .L049 overwrites it, and
     this tile runs at most once per panel; the compare and the decrement
     of I look like leftovers from a looping tile -- confirm. */
  7190. cmp.ne p6, p0 = 1, I
  7191. ;;
  7192. adds I = -1, I
  7193. ;;
  /* r2 = K << ZBASE_SHIFT; L = K - KK. */
  7194. shladd r2 = K, ZBASE_SHIFT, r0
  7195. ;;
  7196. sub L = K, KK
  7197. ;;
  7198. #ifdef RT
  7199. add AORIG = r2, AORIG
  7200. #endif
  7201. ;;
  /* LT/RN: advance AOFFSET by L << ZBASE_SHIFT and BOFFSET by four times that. */
  7202. #if defined(LT) || defined(RN)
  7203. shladd L = L, ZBASE_SHIFT, r0
  7204. ;;
  7205. add AOFFSET = L, AOFFSET
  7206. shladd BOFFSET = L, 2, BOFFSET
  7207. #endif
  7208. ;;
  /* KK moves by one for the left-side variants: +1 for LT, -1 for LN. */
  7209. #ifdef LT
  7210. adds KK = 1, KK
  7211. #elif defined LN
  7212. adds KK = -1, KK
  7213. #else
  7214. nop __LINE__
  7215. #endif
  7216. ;;
  /* Recompute L from the updated KK. */
  7217. #if defined(LT) || defined(RN)
  7218. mov L = KK
  7219. #else
  7220. sub L = K, KK
  7221. #endif
  7222. ;;
  /*
   * .L049 -- end of one 4-column panel.
   *
   * Updates B and KK for the variant being built, resets AOFFSET to A,
   * and branches back to .L010 while 0 < J.  J is maintained outside
   * this excerpt.
   *   LN    : B += 4 * (K << ZBASE_SHIFT)
   *   LT/RN : B  = BOFFSET
   *   RN    : KK += 4
   *   RT    : KK -= 4
   */
  7223. .align 16
  7224. .L049:
  7225. #ifdef LN
  7226. shladd KK8 = K, ZBASE_SHIFT, r0
  7227. ;;
  7228. shladd B = KK8, 2, B
  7229. #endif
  7230. #if defined(LT) || defined(RN)
  7231. mov B = BOFFSET
  7232. #endif
  7233. #ifdef RN
  7234. adds KK = 4, KK
  7235. #endif
  7236. #ifdef RT
  7237. adds KK = -4, KK
  7238. #endif
  7239. ;;
  /* The compare and the branch that consumes p6 share one bundle. */
  7240. { .mmb
  7241. mov AOFFSET = A
  7242. cmp.lt p6, p0 = 0, J
  7243. (p6) br.cond.dptk .L010
  7244. }
  7245. ;;
  /*
   * .L050 -- entry to the 2-column panel, taken when bit 1 of N is set.
   *
   * Sets I = M >> 2 and branches to .L090 when bit 1 of N is clear.
   * Otherwise:
   *   - RT only: B -= K << (1 + ZBASE_SHIFT) and C -= 2 * LDC;
   *   - C1 = C and C2 = C + LDC;
   *   - KK = M + OFFSET for LN, KK = OFFSET for LT;
   *   - AORIG = A for LN/RT, AOFFSET = A otherwise;
   *   - L = KK for LT/RN, K - KK otherwise;
   *   - unless RT, C += 2 * LDC;
   *   - branches to .L060 when I == 0.
   */
  7246. .align 16
  7247. .L050:
  /* NOTE(review): this bundle lists a single instruction for a three-slot
     template; presumably the assembler pads it -- confirm. */
  7248. { .mmi
  7249. shr I = M, 2
  7250. }
  7251. { .mib
  7252. tbit.z p6, p0 = N, 1
  7253. (p6) br.cond.dpnt .L090
  7254. }
  7255. ;;
  7256. #ifdef RT
  /* RT: r3 = 2 * LDC and r2 = K << (1 + ZBASE_SHIFT); step B and C back. */
  7257. { .mmi
  7258. shladd r3 = LDC, 1, r0
  7259. nop __LINE__
  7260. shl r2 = K, 1 + ZBASE_SHIFT
  7261. }
  7262. ;;
  7263. { .mmi
  7264. sub B = B, r2
  7265. sub C = C, r3
  7266. nop __LINE__
  7267. }
  7268. ;;
  7269. #endif
  7270. mov C1 = C
  7271. add C2 = LDC, C
  7272. ;;
  7273. #ifdef LN
  7274. add KK = M, OFFSET
  7275. #elif defined LT
  7276. mov KK = OFFSET
  7277. #else
  7278. nop __LINE__
  7279. #endif
  7280. ;;
  7281. #if defined(LN) || defined(RT)
  7282. mov AORIG = A
  7283. #else
  7284. mov AOFFSET = A
  7285. #endif
  7286. ;;
  7287. #if defined(LT) || defined(RN)
  7288. mov L = KK
  7289. #else
  7290. sub L = K, KK
  7291. #endif
  7292. ;;
  /* p6 = (I == 0): no tile of this size to process, so skip to .L060. */
  7293. { .mib
  7294. cmp.eq p6, p7 = 0, I
  7295. #ifndef RT
  7296. shladd C = LDC, 1, C
  7297. #else
  7298. nop __LINE__
  7299. #endif
  7300. (p6) br.cond.dpnt .L060
  7301. }
  7302. ;;
  /*
   * .L052 -- set-up for one pass of the 2-column tile loop at .L053.
   *
   *   - p7 = (L != 0) predicates every initial operand load;
   *   - r2 = K << (2 + ZBASE_SHIFT) and r3 = KK << ZBASE_SHIFT;
   *   - LT/RN: BOFFSET = B, AOFFSET is left as it was;
   *     LN/RT: BOFFSET = B + 2 * r3, LN steps AORIG back by r2, then
   *     AOFFSET = AORIG + 4 * r3;
   *   - loads four A pairs (f32..f39) and two B pairs (f48..f51);
   *   - clears f66/f67, f82/f83, f98/f99 and f114/f115;
   *   - sets PREB and PREC and issues two CPREFETCH steps of stride LDC;
   *   - sets C5 = C1 + 4 * SIZE and C6 = C2 + 4 * SIZE;
   *   - loads ar.lc with ((L + 1) >> 1) - 1 and branches to .L058 when
   *     that value is -1, i.e. when L was 0.
   * p12 records that bit 0 of (L + 1) is clear; p3 starts out true.
   * NOTE(review): PREA is not assigned anywhere in this set-up, yet the
   * loop at .L053 prefetches through it; confirm that reusing the value
   * left by earlier code is intentional.
   */
  7303. .align 16
  7304. .L052:
  7305. { .mmi
  7306. cmp.ne p7, p0 = r0, L
  7307. adds BOFFSET = 0 * SIZE, B
  7308. shl r2 = K, 2 + ZBASE_SHIFT
  7309. }
  7310. { .mmi
  7311. shladd r3 = KK, ZBASE_SHIFT, r0
  7312. nop __LINE__
  7313. nop __LINE__
  7314. }
  7315. ;;
  7316. #if defined(LT) || defined(RN)
  7317. { .mfb
  7318. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7319. mov f66 = f0
  7320. nop __LINE__
  7321. }
  7322. { .mmf
  7323. nop __LINE__
  7324. nop __LINE__
  7325. mov f67 = f0
  7326. }
  7327. ;;
  7328. #else
  /* LN/RT: derive the operand pointers from B, AORIG and KK. */
  7329. { .mfi
  7330. shladd BOFFSET = r3, 1, B
  7331. mov f66 = f0
  7332. #ifdef LN
  7333. sub AORIG = AORIG, r2
  7334. #else
  7335. nop __LINE__
  7336. #endif
  7337. }
  7338. ;;
  7339. { .mfi
  7340. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7341. mov f67 = f0
  7342. shladd AOFFSET = r3, 2, AORIG
  7343. }
  7344. ;;
  7345. #endif
  /* Initial operand loads interleaved with clearing and prefetch set-up. */
  7346. { .mfi
  7347. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7348. mov f82 = f0
  7349. adds PREC = CPREFETCHSIZE * SIZE, C1
  7350. }
  7351. { .mfi
  7352. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  7353. mov f83 = f0
  7354. nop __LINE__
  7355. }
  7356. ;;
  7357. { .mfi
  7358. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7359. mov f98 = f0
  7360. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  7361. }
  7362. { .mfi
  7363. cmp.eq p3, p0 = r0, r0
  7364. mov f99 = f0
  7365. adds L = 1, L
  7366. }
  7367. ;;
  /* Trip count: L = ((L + 1) >> 1) - 1, with p12 = bit 0 of (L + 1) clear. */
  7368. { .mfi
  7369. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  7370. mov f114 = f0
  7371. tbit.z p12, p0 = L, 0
  7372. }
  7373. { .mfi
  7374. CPREFETCH [PREC], LDC
  7375. mov f115 = f0
  7376. shr L = L, 1
  7377. }
  7378. ;;
  7379. { .mmi
  7380. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  7381. adds C5 = 4 * SIZE, C1
  7382. adds L = -1, L
  7383. }
  7384. ;;
  7385. { .mmi
  7386. CPREFETCH [PREC], LDC
  7387. adds C6 = 4 * SIZE, C2
  7388. mov ar.lc = L
  7389. }
  7390. ;;
  /* Nothing to accumulate when the adjusted count is -1: go solve. */
  7391. cmp.eq p6, p0 = -1, L
  7392. (p6) br.cond.dpnt .L058
  7393. ;;
  7394. .align 16
  7395. .L053: // Multiply-accumulate loop, two k-steps per pass: step 1 uses f32-f39 with f48-f51 unconditionally, step 2 uses f40-f47 with f56-f59 under p3
  7396. { .mfb
  7397. lfetch.nt1 [PREA], 16 * SIZE
  7398. FMA f64 = f32, f48, f64 // A1 * B1
  7399. nop __LINE__
  7400. }
  7401. { .mfi
  7402. nop __LINE__
  7403. FMA_B f65 = f32, f49, f65 // A1 * B2
  7404. (p12) cmp.ne p3, p0 = 0, L // when p12 is set, p3 = (L != 0): drops step 2 on the pass where L == 0
  7405. }
  7406. ;;
  7407. { .mfi
  7408. lfetch.nt1 [PREB], 8 * SIZE
  7409. FMA f80 = f32, f50, f80 // A1 * B3
  7410. cmp.ne p4, p5 = 0, L // p4 = (L != 0): reload step-1 operands for another pass
  7411. }
  7412. { .mfi
  7413. nop __LINE__
  7414. FMA_B f81 = f32, f51, f81 // A1 * B4
  7415. nop __LINE__
  7416. }
  7417. ;;
  7418. { .mfi
  7419. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7420. FMA f96 = f34, f48, f96 // A3 * B1
  7421. nop __LINE__
  7422. }
  7423. { .mfi
  7424. FMA_B f97 = f34, f49, f97 // A3 * B2
  7425. nop __LINE__
  7426. }
  7427. ;;
  7428. { .mfi
  7429. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7430. FMA f112 = f34, f50, f112 // A3 * B3
  7431. nop __LINE__
  7432. }
  7433. { .mfb
  7434. nop __LINE__
  7435. FMA_B f113 = f34, f51, f113 // A3 * B4
  7436. nop __LINE__
  7437. }
  7438. ;;
  7439. { .mfb
  7440. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  7441. FMA f65 = f33, f48, f65 // A2 * B1
  7442. nop __LINE__
  7443. }
  7444. { .mfb
  7445. nop __LINE__
  7446. FMA_A f64 = f33, f49, f64 // A2 * B2
  7447. nop __LINE__
  7448. }
  7449. ;;
  7450. { .mfb
  7451. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  7452. FMA f81 = f33, f50, f81 // A2 * B3
  7453. nop __LINE__
  7454. }
  7455. { .mfb
  7456. nop __LINE__
  7457. FMA_A f80 = f33, f51, f80 // A2 * B4
  7458. nop __LINE__
  7459. }
  7460. ;;
  7461. { .mfb
  7462. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  7463. FMA f97 = f35, f48, f97 // A4 * B1
  7464. nop __LINE__
  7465. }
  7466. { .mfb
  7467. nop __LINE__
  7468. FMA_A f96 = f35, f49, f96 // A4 * B2
  7469. nop __LINE__
  7470. }
  7471. ;;
  7472. { .mfb
  7473. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  7474. FMA f113 = f35, f50, f113 // A4 * B3
  7475. nop __LINE__
  7476. }
  7477. { .mfb
  7478. nop __LINE__
  7479. FMA_A f112 = f35, f51, f112 // A4 * B4
  7480. nop __LINE__
  7481. }
  7482. ;;
  7483. { .mfb
  7484. nop __LINE__
  7485. FMA f66 = f36, f48, f66 // A5 * B1
  7486. nop __LINE__
  7487. }
  7488. { .mfb
  7489. nop __LINE__
  7490. FMA_B f67 = f36, f49, f67 // A5 * B2
  7491. nop __LINE__
  7492. }
  7493. ;;
  7494. { .mfb
  7495. nop __LINE__
  7496. FMA f82 = f36, f50, f82 // A5 * B3
  7497. nop __LINE__
  7498. }
  7499. { .mfb
  7500. nop __LINE__
  7501. FMA_B f83 = f36, f51, f83 // A5 * B4
  7502. nop __LINE__
  7503. }
  7504. ;;
  7505. { .mfb
  7506. nop __LINE__
  7507. FMA f98 = f38, f48, f98 // A7 * B1
  7508. nop __LINE__
  7509. }
  7510. { .mfb
  7511. nop __LINE__
  7512. FMA_B f99 = f38, f49, f99 // A7 * B2
  7513. nop __LINE__
  7514. }
  7515. ;;
  7516. { .mfb
  7517. nop __LINE__
  7518. FMA f114 = f38, f50, f114 // A7 * B3
  7519. nop __LINE__
  7520. }
  7521. { .mfb
  7522. nop __LINE__
  7523. FMA_B f115 = f38, f51, f115 // A7 * B4
  7524. nop __LINE__
  7525. }
  7526. ;;
  7527. { .mfb
  7528. nop __LINE__
  7529. FMA f67 = f37, f48, f67 // A6 * B1
  7530. nop __LINE__
  7531. }
  7532. { .mfb
  7533. nop __LINE__
  7534. FMA_A f66 = f37, f49, f66 // A6 * B2
  7535. nop __LINE__
  7536. }
  7537. ;;
  7538. { .mfb
  7539. nop __LINE__
  7540. FMA f83 = f37, f50, f83 // A6 * B3
  7541. nop __LINE__
  7542. }
  7543. { .mfb
  7544. nop __LINE__
  7545. FMA_A f82 = f37, f51, f82 // A6 * B4
  7546. nop __LINE__
  7547. }
  7548. ;;
  7549. { .mfb
  7550. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7551. FMA f99 = f39, f48, f99 // A8 * B1
  7552. nop __LINE__
  7553. }
  7554. { .mfb
  7555. nop __LINE__
  7556. FMA_A f98 = f39, f49, f98 // A8 * B2
  7557. nop __LINE__
  7558. }
  7559. ;;
  7560. { .mfb
  7561. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7562. FMA f115 = f39, f50, f115 // A8 * B3
  7563. nop __LINE__
  7564. }
  7565. { .mfb
  7566. nop __LINE__
  7567. FMA_A f114 = f39, f51, f114 // A8 * B4
  7568. nop __LINE__
  7569. }
  7570. ;;
  7571. { .mfb
  7572. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  7573. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7574. nop __LINE__
  7575. }
  7576. { .mfb
  7577. nop __LINE__
  7578. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  7579. nop __LINE__
  7580. }
  7581. ;;
  7582. { .mfb
  7583. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7584. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  7585. nop __LINE__
  7586. }
  7587. { .mfb
  7588. nop __LINE__
  7589. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  7590. nop __LINE__
  7591. }
  7592. ;;
  7593. { .mfb
  7594. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  7595. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  7596. nop __LINE__
  7597. }
  7598. { .mfb
  7599. nop __LINE__
  7600. (p3) FMA_B f97 = f42, f57, f97 // A3 * B2
  7601. nop __LINE__
  7602. }
  7603. ;;
  7604. { .mfb
  7605. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  7606. (p3) FMA f112 = f42, f58, f112 // A3 * B3
  7607. nop __LINE__
  7608. }
  7609. { .mfb
  7610. nop __LINE__
  7611. (p3) FMA_B f113 = f42, f59, f113 // A3 * B4
  7612. nop __LINE__
  7613. }
  7614. ;;
  7615. { .mfb
  7616. nop __LINE__
  7617. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7618. nop __LINE__
  7619. }
  7620. { .mfb
  7621. nop __LINE__
  7622. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  7623. nop __LINE__
  7624. }
  7625. ;;
  7626. { .mfb
  7627. nop __LINE__
  7628. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  7629. nop __LINE__
  7630. }
  7631. { .mfb
  7632. nop __LINE__
  7633. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  7634. nop __LINE__
  7635. }
  7636. ;;
  7637. { .mfb
  7638. nop __LINE__
  7639. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  7640. nop __LINE__
  7641. }
  7642. { .mfb
  7643. nop __LINE__
  7644. (p3) FMA_A f96 = f43, f57, f96 // A4 * B2
  7645. nop __LINE__
  7646. }
  7647. ;;
  7648. { .mfb
  7649. nop __LINE__
  7650. (p3) FMA f113 = f43, f58, f113 // A4 * B3
  7651. nop __LINE__
  7652. }
  7653. { .mfb
  7654. nop __LINE__
  7655. (p3) FMA_A f112 = f43, f59, f112 // A4 * B4
  7656. nop __LINE__
  7657. }
  7658. ;;
  7659. { .mfb
  7660. nop __LINE__
  7661. (p3) FMA f66 = f44, f56, f66 // A5 * B1
  7662. nop __LINE__
  7663. }
  7664. { .mfb
  7665. nop __LINE__
  7666. (p3) FMA_B f67 = f44, f57, f67 // A5 * B2
  7667. nop __LINE__
  7668. }
  7669. ;;
  7670. { .mfb
  7671. nop __LINE__
  7672. (p3) FMA f82 = f44, f58, f82 // A5 * B3
  7673. nop __LINE__
  7674. }
  7675. { .mfb
  7676. nop __LINE__
  7677. (p3) FMA_B f83 = f44, f59, f83 // A5 * B4
  7678. nop __LINE__
  7679. }
  7680. ;;
  7681. { .mfb
  7682. nop __LINE__
  7683. (p3) FMA f98 = f46, f56, f98 // A7 * B1
  7684. nop __LINE__
  7685. }
  7686. { .mfb
  7687. nop __LINE__
  7688. (p3) FMA_B f99 = f46, f57, f99 // A7 * B2
  7689. nop __LINE__
  7690. }
  7691. ;;
  7692. { .mfb
  7693. nop __LINE__
  7694. (p3) FMA f114 = f46, f58, f114 // A7 * B3
  7695. nop __LINE__
  7696. }
  7697. { .mfb
  7698. nop __LINE__
  7699. (p3) FMA_B f115 = f46, f59, f115 // A7 * B4
  7700. nop __LINE__
  7701. }
  7702. ;;
  7703. { .mfb
  7704. nop __LINE__
  7705. (p3) FMA f67 = f45, f56, f67 // A6 * B1
  7706. nop __LINE__
  7707. }
  7708. { .mfb
  7709. nop __LINE__
  7710. (p3) FMA_A f66 = f45, f57, f66 // A6 * B2
  7711. nop __LINE__
  7712. }
  7713. ;;
  7714. { .mfb
  7715. nop __LINE__
  7716. (p3) FMA f83 = f45, f58, f83 // A6 * B3
  7717. nop __LINE__
  7718. }
  7719. { .mfb
  7720. nop __LINE__
  7721. (p3) FMA_A f82 = f45, f59, f82 // A6 * B4
  7722. nop __LINE__
  7723. }
  7724. ;;
  7725. { .mfb
  7726. nop __LINE__
  7727. (p3) FMA f99 = f47, f56, f99 // A8 * B1
  7728. nop __LINE__
  7729. }
  7730. { .mfb
  7731. nop __LINE__
  7732. (p3) FMA_A f98 = f47, f57, f98 // A8 * B2
  7733. nop __LINE__
  7734. }
  7735. ;;
  7736. { .mfi
  7737. nop __LINE__
  7738. (p3) FMA f115 = f47, f58, f115 // A8 * B3
  7739. adds L = -1, L // L mirrors the remaining pass count tested by p3/p4 above
  7740. }
  7741. { .mfb
  7742. nop __LINE__
  7743. (p3) FMA_A f114 = f47, f59, f114 // A8 * B4
  7744. br.cloop.sptk.few .L053 // counted loop on ar.lc (set in .L052)
  7745. }
  7746. ;;
  7747. .L058: // After the accumulate loop: subtract the accumulators from the stored values, run the LN/LT/RN/RT solve, write results to the packed buffer and to C, then advance to the next tile
  7748. #if defined(LN) || defined(RT)
  7749. #ifdef LN
  7750. adds r2 = -4, KK // LN: r2 = KK - 4
  7751. #else
  7752. adds r2 = -2, KK // RT: r2 = KK - 2
  7753. #endif
  7754. ;;
  7755. shladd r2 = r2, ZBASE_SHIFT, r0 // r2 <<= ZBASE_SHIFT
  7756. ;;
  7757. shladd AOFFSET = r2, 2, AORIG // AOFFSET = AORIG + 4 * r2
  7758. shladd BOFFSET = r2, 1, B // BOFFSET = B + 2 * r2
  7759. ;;
  7760. #endif
  7761. #if defined(LN) || defined(LT)
  7762. LDFPD f72, f73 = [BOFFSET], 2 * SIZE // LN/LT: fetch the 16 stored values from BOFFSET
  7763. ;;
  7764. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  7765. ;;
  7766. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  7767. ;;
  7768. LDFPD f90, f91 = [BOFFSET], 2 * SIZE
  7769. ;;
  7770. LDFPD f104, f105 = [BOFFSET], 2 * SIZE
  7771. ;;
  7772. LDFPD f106, f107 = [BOFFSET], 2 * SIZE
  7773. ;;
  7774. LDFPD f120, f121 = [BOFFSET], 2 * SIZE
  7775. ;;
  7776. LDFPD f122, f123 = [BOFFSET]
  7777. adds BOFFSET = -14 * SIZE, BOFFSET // rewind BOFFSET to where the loads started
  7778. ;;
  7779. FSUB f64 = f72, f64 // combine each loaded value with its accumulator (FSUB/FSUB_A are macros defined outside this chunk)
  7780. FSUB_A f65 = f73, f65
  7781. FSUB f80 = f74, f80
  7782. FSUB_A f81 = f75, f81
  7783. FSUB f96 = f88, f96
  7784. FSUB_A f97 = f89, f97
  7785. FSUB f112 = f90, f112
  7786. FSUB_A f113 = f91, f113
  7787. FSUB f66 = f104, f66
  7788. FSUB_A f67 = f105, f67
  7789. FSUB f82 = f106, f82
  7790. FSUB_A f83 = f107, f83
  7791. FSUB f98 = f120, f98
  7792. FSUB_A f99 = f121, f99
  7793. FSUB f114 = f122, f114
  7794. FSUB_A f115 = f123, f115
  7795. ;;
  7796. #else
  7797. LDFPD f72, f73 = [AOFFSET], 2 * SIZE // RN/RT: fetch the 16 stored values from AOFFSET instead
  7798. ;;
  7799. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  7800. ;;
  7801. LDFPD f76, f77 = [AOFFSET], 2 * SIZE
  7802. ;;
  7803. LDFPD f78, f79 = [AOFFSET], 2 * SIZE
  7804. ;;
  7805. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  7806. ;;
  7807. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  7808. ;;
  7809. LDFPD f92, f93 = [AOFFSET], 2 * SIZE
  7810. ;;
  7811. LDFPD f94, f95 = [AOFFSET]
  7812. adds AOFFSET = -14 * SIZE, AOFFSET // rewind AOFFSET to where the loads started
  7813. ;;
  7814. FSUB f64 = f72, f64 // this path uses plain FSUB for both registers of each pair
  7815. FSUB f65 = f73, f65
  7816. FSUB f96 = f74, f96
  7817. FSUB f97 = f75, f97
  7818. FSUB f66 = f76, f66
  7819. FSUB f67 = f77, f67
  7820. FSUB f98 = f78, f98
  7821. FSUB f99 = f79, f99
  7822. FSUB f80 = f88, f80
  7823. FSUB f81 = f89, f81
  7824. FSUB f112 = f90, f112
  7825. FSUB f113 = f91, f113
  7826. FSUB f82 = f92, f82
  7827. FSUB f83 = f93, f83
  7828. FSUB f114 = f94, f114
  7829. FSUB f115 = f95, f115
  7830. ;;
  7831. #endif
  7832. #ifdef LN
  7833. adds AOFFSET = 30 * SIZE, AOFFSET // LN: read coefficients from AOFFSET + 30 * SIZE walking backward; AOFFSET ends at its starting value
  7834. ;;
  7835. LDFPD f72, f73 = [AOFFSET]
  7836. adds AOFFSET = - 2 * SIZE, AOFFSET
  7837. ;;
  7838. LDFPD f74, f75 = [AOFFSET]
  7839. adds AOFFSET = - 2 * SIZE, AOFFSET
  7840. ;;
  7841. LDFPD f76, f77 = [AOFFSET]
  7842. adds AOFFSET = - 2 * SIZE, AOFFSET
  7843. ;;
  7844. LDFPD f78, f79 = [AOFFSET]
  7845. adds AOFFSET = - 4 * SIZE, AOFFSET
  7846. ;;
  7847. LDFPD f88, f89 = [AOFFSET]
  7848. adds AOFFSET = - 2 * SIZE, AOFFSET
  7849. ;;
  7850. LDFPD f90, f91 = [AOFFSET]
  7851. adds AOFFSET = - 2 * SIZE, AOFFSET
  7852. ;;
  7853. LDFPD f92, f93 = [AOFFSET]
  7854. adds AOFFSET = - 6 * SIZE, AOFFSET
  7855. ;;
  7856. LDFPD f104, f105 = [AOFFSET]
  7857. adds AOFFSET = - 2 * SIZE, AOFFSET
  7858. ;;
  7859. LDFPD f106, f107 = [AOFFSET]
  7860. adds AOFFSET = - 8 * SIZE, AOFFSET
  7861. ;;
  7862. LDFPD f120, f121 = [AOFFSET]
  7863. ;;
  7864. FMPY f32 = f72, f98 // scale pairs (f98,f99) and (f114,f115) by (f72,f73); looks like a complex multiply, exact signs come from FMA_C/FMA_D - confirm against their definitions
  7865. FMPY f33 = f73, f98
  7866. FMPY f34 = f72, f114
  7867. FMPY f35 = f73, f114
  7868. ;;
  7869. FMA_C f98 = f73, f99, f32
  7870. FMA_D f99 = f72, f99, f33
  7871. FMA_C f114 = f73, f115, f34
  7872. FMA_D f115 = f72, f115, f35
  7873. ;;
  7874. FNMA f66 = f74, f98, f66 // eliminate the just-solved pair from (f66,f67)/(f82,f83) using (f74,f75)
  7875. FMA_A f67 = f75, f98, f67
  7876. FNMA f82 = f74, f114, f82
  7877. FMA_A f83 = f75, f114, f83
  7878. ;;
  7879. FMA_B f66 = f75, f99, f66
  7880. FNMA f67 = f74, f99, f67
  7881. FMA_B f82 = f75, f115, f82
  7882. FNMA f83 = f74, f115, f83
  7883. ;;
  7884. FNMA f96 = f76, f98, f96 // same elimination into (f96,f97)/(f112,f113) using (f76,f77)
  7885. FMA_A f97 = f77, f98, f97
  7886. FNMA f112 = f76, f114, f112
  7887. FMA_A f113 = f77, f114, f113
  7888. ;;
  7889. FMA_B f96 = f77, f99, f96
  7890. FNMA f97 = f76, f99, f97
  7891. FMA_B f112 = f77, f115, f112
  7892. FNMA f113 = f76, f115, f113
  7893. ;;
  7894. FNMA f64 = f78, f98, f64 // same elimination into (f64,f65)/(f80,f81) using (f78,f79)
  7895. FMA_A f65 = f79, f98, f65
  7896. FNMA f80 = f78, f114, f80
  7897. FMA_A f81 = f79, f114, f81
  7898. ;;
  7899. FMA_B f64 = f79, f99, f64
  7900. FNMA f65 = f78, f99, f65
  7901. FMA_B f80 = f79, f115, f80
  7902. FNMA f81 = f78, f115, f81
  7903. ;;
  7904. FMPY f32 = f88, f66 // next step: scale (f66,f67)/(f82,f83) by (f88,f89)
  7905. FMPY f33 = f89, f66
  7906. FMPY f34 = f88, f82
  7907. FMPY f35 = f89, f82
  7908. ;;
  7909. FMA_C f66 = f89, f67, f32
  7910. FMA_D f67 = f88, f67, f33
  7911. FMA_C f82 = f89, f83, f34
  7912. FMA_D f83 = f88, f83, f35
  7913. ;;
  7914. FNMA f96 = f90, f66, f96
  7915. FMA_A f97 = f91, f66, f97
  7916. FNMA f112 = f90, f82, f112
  7917. FMA_A f113 = f91, f82, f113
  7918. ;;
  7919. FMA_B f96 = f91, f67, f96
  7920. FNMA f97 = f90, f67, f97
  7921. FMA_B f112 = f91, f83, f112
  7922. FNMA f113 = f90, f83, f113
  7923. ;;
  7924. FNMA f64 = f92, f66, f64
  7925. FMA_A f65 = f93, f66, f65
  7926. FNMA f80 = f92, f82, f80
  7927. FMA_A f81 = f93, f82, f81
  7928. ;;
  7929. FMA_B f64 = f93, f67, f64
  7930. FNMA f65 = f92, f67, f65
  7931. FMA_B f80 = f93, f83, f80
  7932. FNMA f81 = f92, f83, f81
  7933. ;;
  7934. FMPY f32 = f104, f96 // next step: scale (f96,f97)/(f112,f113) by (f104,f105)
  7935. FMPY f33 = f105, f96
  7936. FMPY f34 = f104, f112
  7937. FMPY f35 = f105, f112
  7938. ;;
  7939. FMA_C f96 = f105, f97, f32
  7940. FMA_D f97 = f104, f97, f33
  7941. FMA_C f112 = f105, f113, f34
  7942. FMA_D f113 = f104, f113, f35
  7943. ;;
  7944. FNMA f64 = f106, f96, f64
  7945. FMA_A f65 = f107, f96, f65
  7946. FNMA f80 = f106, f112, f80
  7947. FMA_A f81 = f107, f112, f81
  7948. ;;
  7949. FMA_B f64 = f107, f97, f64
  7950. FNMA f65 = f106, f97, f65
  7951. FMA_B f80 = f107, f113, f80
  7952. FNMA f81 = f106, f113, f81
  7953. ;;
  7954. FMPY f32 = f120, f64 // last step: scale (f64,f65)/(f80,f81) by (f120,f121)
  7955. FMPY f33 = f121, f64
  7956. FMPY f34 = f120, f80
  7957. FMPY f35 = f121, f80
  7958. ;;
  7959. FMA_C f64 = f121, f65, f32
  7960. FMA_D f65 = f120, f65, f33
  7961. FMA_C f80 = f121, f81, f34
  7962. FMA_D f81 = f120, f81, f35
  7963. ;;
  7964. #endif
  7965. #ifdef LT
  7966. LDFPD f72, f73 = [AOFFSET], 2 * SIZE // LT: read coefficients at SIZE offsets 0,2,4,6, 10,12,14, 20,22, 30 from AOFFSET, then restore AOFFSET
  7967. ;;
  7968. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  7969. ;;
  7970. LDFPD f76, f77 = [AOFFSET], 2 * SIZE
  7971. ;;
  7972. LDFPD f78, f79 = [AOFFSET]
  7973. adds AOFFSET = 4 * SIZE, AOFFSET
  7974. ;;
  7975. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  7976. ;;
  7977. LDFPD f92, f93 = [AOFFSET], 2 * SIZE
  7978. ;;
  7979. LDFPD f94, f95 = [AOFFSET]
  7980. adds AOFFSET = 6 * SIZE, AOFFSET
  7981. ;;
  7982. LDFPD f108, f109 = [AOFFSET], 2 * SIZE
  7983. ;;
  7984. LDFPD f110, f111 = [AOFFSET]
  7985. adds AOFFSET = 8 * SIZE, AOFFSET
  7986. ;;
  7987. LDFPD f126, f127 = [AOFFSET]
  7988. adds AOFFSET = - 30 * SIZE, AOFFSET // back to the starting AOFFSET
  7989. ;;
  7990. FMPY f32 = f72, f64 // first step: scale (f64,f65)/(f80,f81) by (f72,f73)
  7991. FMPY f33 = f73, f64
  7992. FMPY f34 = f72, f80
  7993. FMPY f35 = f73, f80
  7994. ;;
  7995. FMA_C f64 = f73, f65, f32
  7996. FMA_D f65 = f72, f65, f33
  7997. FMA_C f80 = f73, f81, f34
  7998. FMA_D f81 = f72, f81, f35
  7999. ;;
  8000. FNMA f96 = f74, f64, f96 // eliminate the solved pair from the three remaining pairs
  8001. FMA_A f97 = f75, f64, f97
  8002. FNMA f112 = f74, f80, f112
  8003. FMA_A f113 = f75, f80, f113
  8004. ;;
  8005. FMA_B f96 = f75, f65, f96
  8006. FNMA f97 = f74, f65, f97
  8007. FMA_B f112 = f75, f81, f112
  8008. FNMA f113 = f74, f81, f113
  8009. ;;
  8010. FNMA f66 = f76, f64, f66
  8011. FMA_A f67 = f77, f64, f67
  8012. FNMA f82 = f76, f80, f82
  8013. FMA_A f83 = f77, f80, f83
  8014. ;;
  8015. FMA_B f66 = f77, f65, f66
  8016. FNMA f67 = f76, f65, f67
  8017. FMA_B f82 = f77, f81, f82
  8018. FNMA f83 = f76, f81, f83
  8019. ;;
  8020. FNMA f98 = f78, f64, f98
  8021. FMA_A f99 = f79, f64, f99
  8022. FNMA f114 = f78, f80, f114
  8023. FMA_A f115 = f79, f80, f115
  8024. ;;
  8025. FMA_B f98 = f79, f65, f98
  8026. FNMA f99 = f78, f65, f99
  8027. FMA_B f114 = f79, f81, f114
  8028. FNMA f115 = f78, f81, f115
  8029. ;;
  8030. FMPY f32 = f90, f96 // second step: scale (f96,f97)/(f112,f113) by (f90,f91)
  8031. FMPY f33 = f91, f96
  8032. FMPY f34 = f90, f112
  8033. FMPY f35 = f91, f112
  8034. ;;
  8035. FMA_C f96 = f91, f97, f32
  8036. FMA_D f97 = f90, f97, f33
  8037. FMA_C f112 = f91, f113, f34
  8038. FMA_D f113 = f90, f113, f35
  8039. ;;
  8040. FNMA f66 = f92, f96, f66
  8041. FMA_A f67 = f93, f96, f67
  8042. FNMA f82 = f92, f112, f82
  8043. FMA_A f83 = f93, f112, f83
  8044. ;;
  8045. FMA_B f66 = f93, f97, f66
  8046. FNMA f67 = f92, f97, f67
  8047. FMA_B f82 = f93, f113, f82
  8048. FNMA f83 = f92, f113, f83
  8049. ;;
  8050. FNMA f98 = f94, f96, f98
  8051. FMA_A f99 = f95, f96, f99
  8052. FNMA f114 = f94, f112, f114
  8053. FMA_A f115 = f95, f112, f115
  8054. ;;
  8055. FMA_B f98 = f95, f97, f98
  8056. FNMA f99 = f94, f97, f99
  8057. FMA_B f114 = f95, f113, f114
  8058. FNMA f115 = f94, f113, f115
  8059. ;;
  8060. FMPY f32 = f108, f66 // third step: scale (f66,f67)/(f82,f83) by (f108,f109)
  8061. FMPY f33 = f109, f66
  8062. FMPY f34 = f108, f82
  8063. FMPY f35 = f109, f82
  8064. ;;
  8065. FMA_C f66 = f109, f67, f32
  8066. FMA_D f67 = f108, f67, f33
  8067. FMA_C f82 = f109, f83, f34
  8068. FMA_D f83 = f108, f83, f35
  8069. ;;
  8070. FNMA f98 = f110, f66, f98
  8071. FMA_A f99 = f111, f66, f99
  8072. FNMA f114 = f110, f82, f114
  8073. FMA_A f115 = f111, f82, f115
  8074. ;;
  8075. FMA_B f98 = f111, f67, f98
  8076. FNMA f99 = f110, f67, f99
  8077. FMA_B f114 = f111, f83, f114
  8078. FNMA f115 = f110, f83, f115
  8079. ;;
  8080. FMPY f32 = f126, f98 // fourth step: scale (f98,f99)/(f114,f115) by (f126,f127)
  8081. FMPY f33 = f127, f98
  8082. FMPY f34 = f126, f114
  8083. FMPY f35 = f127, f114
  8084. ;;
  8085. FMA_C f98 = f127, f99, f32
  8086. FMA_D f99 = f126, f99, f33
  8087. FMA_C f114 = f127, f115, f34
  8088. FMA_D f115 = f126, f115, f35
  8089. ;;
  8090. #endif
  8091. #ifdef RN
  8092. LDFPD f72, f73 = [BOFFSET], 2 * SIZE // RN: read coefficients at SIZE offsets 0, 2 and 6 from BOFFSET, then restore BOFFSET
  8093. ;;
  8094. LDFPD f74, f75 = [BOFFSET]
  8095. adds BOFFSET = 4 * SIZE, BOFFSET
  8096. ;;
  8097. LDFPD f90, f91 = [BOFFSET]
  8098. adds BOFFSET = - 6 * SIZE, BOFFSET
  8099. ;;
  8100. FMPY f32 = f72, f64 // first step: scale pairs f64/f96/f66/f98 by (f72,f73)
  8101. FMPY f33 = f73, f64
  8102. FMPY f34 = f72, f96
  8103. FMPY f35 = f73, f96
  8104. FMPY f36 = f72, f66
  8105. FMPY f37 = f73, f66
  8106. FMPY f38 = f72, f98
  8107. FMPY f39 = f73, f98
  8108. ;;
  8109. FMA_C f64 = f73, f65, f32
  8110. FMA_D f65 = f72, f65, f33
  8111. FMA_C f96 = f73, f97, f34
  8112. FMA_D f97 = f72, f97, f35
  8113. FMA_C f66 = f73, f67, f36
  8114. FMA_D f67 = f72, f67, f37
  8115. FMA_C f98 = f73, f99, f38
  8116. FMA_D f99 = f72, f99, f39
  8117. ;;
  8118. FNMA f80 = f74, f64, f80 // eliminate them from pairs f80/f112/f82/f114 using (f74,f75)
  8119. FMA_A f81 = f75, f64, f81
  8120. FNMA f112 = f74, f96, f112
  8121. FMA_A f113 = f75, f96, f113
  8122. FNMA f82 = f74, f66, f82
  8123. FMA_A f83 = f75, f66, f83
  8124. FNMA f114 = f74, f98, f114
  8125. FMA_A f115 = f75, f98, f115
  8126. ;;
  8127. FMA_B f80 = f75, f65, f80
  8128. FNMA f81 = f74, f65, f81
  8129. FMA_B f112 = f75, f97, f112
  8130. FNMA f113 = f74, f97, f113
  8131. FMA_B f82 = f75, f67, f82
  8132. FNMA f83 = f74, f67, f83
  8133. FMA_B f114 = f75, f99, f114
  8134. FNMA f115 = f74, f99, f115
  8135. ;;
  8136. FMPY f32 = f90, f80 // second step: scale pairs f80/f112/f82/f114 by (f90,f91)
  8137. FMPY f33 = f91, f80
  8138. FMPY f34 = f90, f112
  8139. FMPY f35 = f91, f112
  8140. FMPY f36 = f90, f82
  8141. FMPY f37 = f91, f82
  8142. FMPY f38 = f90, f114
  8143. FMPY f39 = f91, f114
  8144. ;;
  8145. FMA_C f80 = f91, f81, f32
  8146. FMA_D f81 = f90, f81, f33
  8147. FMA_C f112 = f91, f113, f34
  8148. FMA_D f113 = f90, f113, f35
  8149. FMA_C f82 = f91, f83, f36
  8150. FMA_D f83 = f90, f83, f37
  8151. FMA_C f114 = f91, f115, f38
  8152. FMA_D f115 = f90, f115, f39
  8153. ;;
  8154. #endif
  8155. #ifdef RT
  8156. adds BOFFSET = 6 * SIZE, BOFFSET // RT: read coefficients at SIZE offsets 6, 4 and 0 from BOFFSET (reverse order); BOFFSET ends at its starting value
  8157. ;;
  8158. LDFPD f104, f105 = [BOFFSET]
  8159. adds BOFFSET = - 2 * SIZE, BOFFSET
  8160. ;;
  8161. LDFPD f106, f107 = [BOFFSET]
  8162. adds BOFFSET = - 4 * SIZE, BOFFSET
  8163. ;;
  8164. LDFPD f120, f121 = [BOFFSET]
  8165. ;;
  8166. FMPY f32 = f104, f80 // first step: scale pairs f80/f112/f82/f114 by (f104,f105)
  8167. FMPY f33 = f105, f80
  8168. FMPY f34 = f104, f112
  8169. FMPY f35 = f105, f112
  8170. FMPY f36 = f104, f82
  8171. FMPY f37 = f105, f82
  8172. FMPY f38 = f104, f114
  8173. FMPY f39 = f105, f114
  8174. ;;
  8175. FMA_C f80 = f105, f81, f32
  8176. FMA_D f81 = f104, f81, f33
  8177. FMA_C f112 = f105, f113, f34
  8178. FMA_D f113 = f104, f113, f35
  8179. FMA_C f82 = f105, f83, f36
  8180. FMA_D f83 = f104, f83, f37
  8181. FMA_C f114 = f105, f115, f38
  8182. FMA_D f115 = f104, f115, f39
  8183. ;;
  8184. FNMA f64 = f106, f80, f64 // eliminate them from pairs f64/f96/f66/f98 using (f106,f107)
  8185. FMA_A f65 = f107, f80, f65
  8186. FNMA f96 = f106, f112, f96
  8187. FMA_A f97 = f107, f112, f97
  8188. FNMA f66 = f106, f82, f66
  8189. FMA_A f67 = f107, f82, f67
  8190. FNMA f98 = f106, f114, f98
  8191. FMA_A f99 = f107, f114, f99
  8192. ;;
  8193. FMA_B f64 = f107, f81, f64
  8194. FNMA f65 = f106, f81, f65
  8195. FMA_B f96 = f107, f113, f96
  8196. FNMA f97 = f106, f113, f97
  8197. FMA_B f66 = f107, f83, f66
  8198. FNMA f67 = f106, f83, f67
  8199. FMA_B f98 = f107, f115, f98
  8200. FNMA f99 = f106, f115, f99
  8201. ;;
  8202. FMPY f32 = f120, f64 // second step: scale pairs f64/f96/f66/f98 by (f120,f121)
  8203. FMPY f33 = f121, f64
  8204. FMPY f34 = f120, f96
  8205. FMPY f35 = f121, f96
  8206. FMPY f36 = f120, f66
  8207. FMPY f37 = f121, f66
  8208. FMPY f38 = f120, f98
  8209. FMPY f39 = f121, f98
  8210. ;;
  8211. FMA_C f64 = f121, f65, f32
  8212. FMA_D f65 = f120, f65, f33
  8213. FMA_C f96 = f121, f97, f34
  8214. FMA_D f97 = f120, f97, f35
  8215. FMA_C f66 = f121, f67, f36
  8216. FMA_D f67 = f120, f67, f37
  8217. FMA_C f98 = f121, f99, f38
  8218. FMA_D f99 = f120, f99, f39
  8219. ;;
  8220. #endif
  8221. #if defined(LN) || defined(LT)
  8222. adds BOFFSET2 = 4 * SIZE, BOFFSET // LN/LT: write results back through two store streams, BOFFSET and BOFFSET + 4 * SIZE
  8223. ;;
  8224. STFD [BOFFSET] = f64, SIZE
  8225. STFD [BOFFSET2] = f96, SIZE
  8226. ;;
  8227. STFD [BOFFSET] = f65, SIZE
  8228. STFD [BOFFSET2] = f97, SIZE
  8229. ;;
  8230. STFD [BOFFSET] = f80, SIZE
  8231. STFD [BOFFSET2] = f112, SIZE
  8232. ;;
  8233. STFD [BOFFSET] = f81, 5 * SIZE // post-increment by 5 * SIZE skips the four slots written by the other stream
  8234. STFD [BOFFSET2] = f113, 5 * SIZE
  8235. ;;
  8236. STFD [BOFFSET] = f66, SIZE
  8237. STFD [BOFFSET2] = f98, SIZE
  8238. ;;
  8239. STFD [BOFFSET] = f67, SIZE
  8240. STFD [BOFFSET2] = f99, SIZE
  8241. ;;
  8242. STFD [BOFFSET] = f82, SIZE
  8243. STFD [BOFFSET2] = f114, SIZE
  8244. ;;
  8245. STFD [BOFFSET] = f83, 5 * SIZE
  8246. STFD [BOFFSET2] = f115, 5 * SIZE
  8247. ;;
  8248. adds BOFFSET = - 16 * SIZE, BOFFSET // undo the 16 * SIZE total advance of BOFFSET
  8249. ;;
  8250. #else
  8251. adds AOFFSET2 = 4 * SIZE, AOFFSET // RN/RT: same two-stream write-back, through AOFFSET and AOFFSET + 4 * SIZE
  8252. ;;
  8253. STFD [AOFFSET] = f64, SIZE
  8254. STFD [AOFFSET2] = f66, SIZE
  8255. ;;
  8256. STFD [AOFFSET] = f65, SIZE
  8257. STFD [AOFFSET2] = f67, SIZE
  8258. ;;
  8259. STFD [AOFFSET] = f96, SIZE
  8260. STFD [AOFFSET2] = f98, SIZE
  8261. ;;
  8262. STFD [AOFFSET] = f97, 5 * SIZE
  8263. STFD [AOFFSET2] = f99, 5 * SIZE
  8264. ;;
  8265. STFD [AOFFSET] = f80, SIZE
  8266. STFD [AOFFSET2] = f82, SIZE
  8267. ;;
  8268. STFD [AOFFSET] = f81, SIZE
  8269. STFD [AOFFSET2] = f83, SIZE
  8270. ;;
  8271. STFD [AOFFSET] = f112, SIZE
  8272. STFD [AOFFSET2] = f114, SIZE
  8273. ;;
  8274. STFD [AOFFSET] = f113, 5 * SIZE
  8275. STFD [AOFFSET2] = f115, 5 * SIZE
  8276. ;;
  8277. adds AOFFSET = - 16 * SIZE, AOFFSET // undo the 16 * SIZE total advance of AOFFSET
  8278. ;;
  8279. #endif
  8280. #ifdef LN
  8281. adds C1 = -8 * SIZE, C1 // LN: step the four C pointers back by 8 * SIZE before storing
  8282. adds C2 = -8 * SIZE, C2
  8283. adds C5 = -8 * SIZE, C5
  8284. adds C6 = -8 * SIZE, C6
  8285. #endif
  8286. ;;
  8287. STFD [C1 ] = f64, SIZE // store results to C: C1/C5 take f64-f67 and f96-f99, C2/C6 take f80-f83 and f112-f115
  8288. STFD [C5 ] = f66, SIZE
  8289. ;;
  8290. STFD [C1 ] = f65, SIZE
  8291. STFD [C5 ] = f67, SIZE
  8292. ;;
  8293. STFD [C1 ] = f96, SIZE
  8294. STFD [C5 ] = f98, SIZE
  8295. ;;
  8296. STFD [C1 ] = f97, 5 * SIZE // each C pointer ends 8 * SIZE past where it started
  8297. STFD [C5 ] = f99, 5 * SIZE
  8298. ;;
  8299. STFD [C2 ] = f80, SIZE
  8300. STFD [C6 ] = f82, SIZE
  8301. ;;
  8302. STFD [C2 ] = f81, SIZE
  8303. STFD [C6 ] = f83, SIZE
  8304. ;;
  8305. STFD [C2 ] = f112, SIZE
  8306. STFD [C6 ] = f114, SIZE
  8307. ;;
  8308. STFD [C2 ] = f113, 5 * SIZE
  8309. STFD [C6 ] = f115, 5 * SIZE
  8310. ;;
  8311. mov f64 = f0 // clear these eight accumulators here; the other eight are cleared at .L052
  8312. mov f65 = f0
  8313. mov f80 = f0
  8314. mov f81 = f0
  8315. mov f96 = f0
  8316. mov f97 = f0
  8317. mov f112 = f0
  8318. mov f113 = f0
  8319. ;;
  8320. #ifdef LN
  8321. adds C1 = -8 * SIZE, C1 // LN: cancel the post-increments so the C pointers stay 8 * SIZE lower than before this tile
  8322. adds C2 = -8 * SIZE, C2
  8323. adds C5 = -8 * SIZE, C5
  8324. adds C6 = -8 * SIZE, C6
  8325. #endif
  8326. ;;
  8327. cmp.ne p6, p0 = 1, I // p6 = (I != 1), taken before the decrement: another tile remains
  8328. ;;
  8329. adds I = -1, I
  8330. ;;
  8331. shladd r2 = K, ZBASE_SHIFT, r0 // r2 = K << ZBASE_SHIFT
  8332. ;;
  8333. sub L = K, KK
  8334. ;;
  8335. #ifdef RT
  8336. shladd AORIG = r2, 2, AORIG // RT: AORIG += 4 * r2
  8337. #endif
  8338. ;;
  8339. #if defined(LT) || defined(RN)
  8340. shladd L = L, ZBASE_SHIFT, r0 // L = (K - KK) << ZBASE_SHIFT
  8341. ;;
  8342. shladd AOFFSET = L, 2, AOFFSET // LT/RN: AOFFSET += 4 * L
  8343. shladd BOFFSET = L, 1, BOFFSET // LT/RN: BOFFSET += 2 * L
  8344. #endif
  8345. ;;
  8346. #ifdef LT
  8347. adds KK = 4, KK // LT: KK += 4
  8348. #elif defined LN
  8349. adds KK = -4, KK // LN: KK -= 4
  8350. #else
  8351. nop __LINE__
  8352. #endif
  8353. ;;
  8354. #if defined(LT) || defined(RN)
  8355. mov L = KK // recompute L for the next tile, same rule as in .L050
  8356. #else
  8357. sub L = K, KK
  8358. #endif
  8359. ;;
  8360. (p6) br.cond.dptk .L052 // loop back while tiles remain
  8361. ;;
  8362. .align 16
  8363. .L060: // Prologue for the pass taken when bit 1 of M is set: set pointers, preload operands and program ar.lc for .L062
  8364. { .mib
  8365. #if defined(LT) || defined(RN)
  8366. mov L = KK // LT/RN: L = KK
  8367. #else
  8368. sub L = K, KK // LN/RT: L = K - KK
  8369. #endif
  8370. tbit.z p6, p7 = M, 1 // p6 = (bit 1 of M is clear)
  8371. (p6) br.cond.dptk .L070 // bit clear: skip this pass
  8372. }
  8373. ;;
  8374. { .mmi
  8375. cmp.ne p7, p0 = r0, L // p7 = (L != 0); guards the preloads below
  8376. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  8377. shl r2 = K, 1 + ZBASE_SHIFT // r2 = K << (1 + ZBASE_SHIFT)
  8378. }
  8379. { .mmi
  8380. shladd r3 = KK, ZBASE_SHIFT, r0 // r3 = KK << ZBASE_SHIFT
  8381. nop __LINE__
  8382. nop __LINE__
  8383. }
  8384. ;;
  8385. #if defined(LT) || defined(RN)
  8386. { .mfb
  8387. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8388. }
  8389. ;;
  8390. #else
  8391. { .mfi
  8392. shladd BOFFSET = r3, 1, B // LN/RT: BOFFSET = B + 2 * r3
  8393. #ifdef LN
  8394. sub AORIG = AORIG, r2 // LN: move AORIG back by r2
  8395. #else
  8396. nop __LINE__
  8397. #endif
  8398. }
  8399. ;;
  8400. { .mfi
  8401. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8402. shladd AOFFSET = r3, 1, AORIG // LN/RT: AOFFSET = AORIG + 2 * r3
  8403. }
  8404. ;;
  8405. #endif
  8406. ;;
  8407. adds L = 1, L // L = L + 1 so the shift below rounds up
  8408. ;;
  8409. { .mmi
  8410. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8411. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  8412. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L + 1 is clear)
  8413. }
  8414. { .mmi
  8415. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8416. shr L = L, 1 // L = (L + 1) >> 1: number of two-step passes
  8417. }
  8418. ;;
  8419. { .mmi
  8420. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8421. nop __LINE__
  8422. adds L = -1, L // ar.lc takes (passes - 1)
  8423. }
  8424. ;;
  8425. { .mmi
  8426. nop __LINE__
  8427. nop __LINE__
  8428. mov ar.lc = L
  8429. }
  8430. ;;
  8431. cmp.eq p6, p0 = -1, L // L == -1 means zero passes
  8432. (p6) br.cond.dpnt .L068 // no multiply-accumulate work: go straight to the solve
  8433. ;;
  8434. .align 16
  8435. .L062: // Multiply-accumulate loop, two k-steps per pass: step 1 uses f32-f35 with f48-f51 unconditionally, step 2 uses f40-f43 with f56-f59 under p3
  8436. { .mfi
  8437. lfetch.nt1 [PREA], 8 * SIZE
  8438. FMA f64 = f32, f48, f64 // A1 * B1
  8439. cmp.ne p4, p5 = 0, L // p4 = (L != 0): reload step-1 operands for another pass
  8440. }
  8441. { .mfi
  8442. nop __LINE__
  8443. FMA_B f65 = f32, f49, f65 // A1 * B2
  8444. (p12) cmp.ne p3, p0 = 0, L // when p12 is set, p3 = (L != 0): drops step 2 on the pass where L == 0
  8445. }
  8446. ;;
  8447. { .mfb
  8448. lfetch.nt1 [PREB], 8 * SIZE
  8449. FMA f80 = f32, f50, f80 // A1 * B3
  8450. nop __LINE__
  8451. }
  8452. { .mfb
  8453. nop __LINE__
  8454. FMA_B f81 = f32, f51, f81 // A1 * B4
  8455. nop __LINE__
  8456. }
  8457. ;;
  8458. { .mfb
  8459. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  8460. FMA f96 = f34, f48, f96 // A3 * B1
  8461. nop __LINE__
  8462. }
  8463. { .mfb
  8464. nop __LINE__
  8465. FMA_B f97 = f34, f49, f97 // A3 * B2
  8466. nop __LINE__
  8467. }
  8468. ;;
  8469. { .mfb
  8470. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8471. FMA f112 = f34, f50, f112 // A3 * B3
  8472. nop __LINE__
  8473. }
  8474. { .mfb
  8475. nop __LINE__
  8476. FMA_B f113 = f34, f51, f113 // A3 * B4
  8477. nop __LINE__
  8478. }
  8479. ;;
  8480. { .mfb
  8481. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  8482. FMA f65 = f33, f48, f65 // A2 * B1
  8483. nop __LINE__
  8484. }
  8485. { .mfb
  8486. nop __LINE__
  8487. FMA_A f64 = f33, f49, f64 // A2 * B2
  8488. nop __LINE__
  8489. }
  8490. ;;
  8491. { .mfb
  8492. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8493. FMA f81 = f33, f50, f81 // A2 * B3
  8494. nop __LINE__
  8495. }
  8496. { .mfb
  8497. nop __LINE__
  8498. FMA_A f80 = f33, f51, f80 // A2 * B4
  8499. nop __LINE__
  8500. }
  8501. ;;
  8502. { .mfb
  8503. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8504. FMA f97 = f35, f48, f97 // A4 * B1
  8505. }
  8506. { .mfb
  8507. FMA_A f96 = f35, f49, f96 // A4 * B2
  8508. nop __LINE__
  8509. }
  8510. { .mfb
  8511. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8512. FMA f113 = f35, f50, f113 // A4 * B3
  8513. nop __LINE__
  8514. }
  8515. { .mfb
  8516. FMA_A f112 = f35, f51, f112 // A4 * B4
  8517. nop __LINE__
  8518. }
  8519. ;;
  8520. { .mfb
  8521. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8522. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8523. nop __LINE__
  8524. }
  8525. { .mfb
  8526. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  8527. nop __LINE__
  8528. }
  8529. ;;
  8530. { .mfb
  8531. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8532. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  8533. nop __LINE__
  8534. }
  8535. { .mfb
  8536. (p3) FMA_B f81 = f40, f59, f81 // A1 * B4
  8537. nop __LINE__
  8538. }
  8539. ;;
  8540. { .mfb
  8541. nop __LINE__
  8542. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  8543. nop __LINE__
  8544. }
  8545. { .mfb
  8546. nop __LINE__
  8547. (p3) FMA_B f97 = f42, f57, f97 // A3 * B2
  8548. nop __LINE__
  8549. }
  8550. ;;
  8551. { .mfb
  8552. nop __LINE__
  8553. (p3) FMA f112 = f42, f58, f112 // A3 * B3
  8554. nop __LINE__
  8555. }
  8556. { .mfb
  8557. nop __LINE__
  8558. (p3) FMA_B f113 = f42, f59, f113 // A3 * B4
  8559. nop __LINE__
  8560. }
  8561. ;;
  8562. { .mfb
  8563. nop __LINE__
  8564. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8565. nop __LINE__
  8566. }
  8567. { .mfb
  8568. nop __LINE__
  8569. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  8570. nop __LINE__
  8571. }
  8572. ;;
  8573. { .mfb
  8574. nop __LINE__
  8575. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  8576. nop __LINE__
  8577. }
  8578. { .mfb
  8579. nop __LINE__
  8580. (p3) FMA_A f80 = f41, f59, f80 // A2 * B4
  8581. nop __LINE__
  8582. }
  8583. ;;
  8584. { .mfb
  8585. nop __LINE__
  8586. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  8587. nop __LINE__
  8588. }
  8589. { .mfb
  8590. nop __LINE__
  8591. (p3) FMA_A f96 = f43, f57, f96 // A4 * B2
  8592. nop __LINE__
  8593. }
  8594. ;;
  8595. { .mfi
  8596. nop __LINE__
  8597. (p3) FMA f113 = f43, f58, f113 // A4 * B3
  8598. adds L = -1, L // L mirrors the remaining pass count tested by p3/p4 above
  8599. }
  8600. { .mfb
  8601. nop __LINE__
  8602. (p3) FMA_A f112 = f43, f59, f112 // A4 * B4
  8603. br.cloop.sptk.few .L062 // counted loop on ar.lc (set in .L060)
  8604. }
  8605. ;;
  /* .L068: solve-and-store stage that follows the .L062 multiply loop of this
     ztrsm kernel.  It consumes the eight accumulators f64/f65, f80/f81,
     f96/f97 and f112/f113 (presumably the real/imaginary parts of a 2x2
     complex tile -- confirm against the loop setup above this chunk),
     combines them with the packed right-hand side, applies the LN/LT/RN/RT
     specific substitution, writes the result back to the packed buffer and
     to C1/C2, clears the accumulators and advances the pointers and KK.
     FSUB*, FMPY, FMA_A..FMA_D and FNMA are macros defined outside this
     chunk.  */
  8606. .L068:
  /* LN/RT only: rebuild AOFFSET and BOFFSET from AORIG and B using
     r2 = (KK - 2) << ZBASE_SHIFT, doubled by the shladd ..., 1, ... forms.
     NOTE(review): both arms of the inner #ifdef LN are identical.  */
  8607. #if defined(LN) || defined(RT)
  8608. #ifdef LN
  8609. adds r2 = -2, KK
  8610. #else
  8611. adds r2 = -2, KK
  8612. #endif
  8613. ;;
  8614. shladd r2 = r2, ZBASE_SHIFT, r0
  8615. ;;
  8616. shladd AOFFSET = r2, 1, AORIG
  8617. shladd BOFFSET = r2, 1, B
  8618. ;;
  8619. #endif
  /* LN/LT: read eight values from BOFFSET (three post-increments of 2 * SIZE
     undone by the -6 * SIZE adjust) and combine them with the accumulators.
     Otherwise the same is done from AOFFSET; note the different pairing of
     loaded registers with f80/f81 and f96/f97 in that branch.  */
  8620. #if defined(LN) || defined(LT)
  8621. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  8622. ;;
  8623. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  8624. ;;
  8625. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  8626. ;;
  8627. LDFPD f90, f91 = [BOFFSET]
  8628. adds BOFFSET = -6 * SIZE, BOFFSET
  8629. ;;
  8630. FSUB f64 = f72, f64
  8631. FSUB_A f65 = f73, f65
  8632. FSUB f80 = f74, f80
  8633. FSUB_A f81 = f75, f81
  8634. FSUB f96 = f88, f96
  8635. FSUB_A f97 = f89, f97
  8636. FSUB f112 = f90, f112
  8637. FSUB_A f113 = f91, f113
  8638. ;;
  8639. #else
  8640. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  8641. ;;
  8642. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  8643. ;;
  8644. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  8645. ;;
  8646. LDFPD f90, f91 = [AOFFSET]
  8647. adds AOFFSET = -6 * SIZE, AOFFSET
  8648. ;;
  8649. FSUB f64 = f72, f64
  8650. FSUB f65 = f73, f65
  8651. FSUB f96 = f74, f96
  8652. FSUB f97 = f75, f97
  8653. FSUB f80 = f88, f80
  8654. FSUB f81 = f89, f81
  8655. FSUB f112 = f90, f112
  8656. FSUB f113 = f91, f113
  8657. ;;
  8658. #endif
  /* LN: coefficient pairs are read from AOFFSET + 6, + 4 and + 0 (in SIZE
     units); AOFFSET ends where it started.  (f96,f97) and (f112,f113) are
     scaled by (f104,f105), their contribution is removed from (f64,f65) and
     (f80,f81) using (f106,f107), and those are then scaled by (f120,f121).
     Each FMPY + FMA_C/FMA_D group looks like a complex multiply by the
     loaded pair -- confirm against the macro definitions.  */
  8659. #ifdef LN
  8660. adds AOFFSET = 6 * SIZE, AOFFSET
  8661. ;;
  8662. LDFPD f104, f105 = [AOFFSET]
  8663. adds AOFFSET = - 2 * SIZE, AOFFSET
  8664. ;;
  8665. LDFPD f106, f107 = [AOFFSET]
  8666. adds AOFFSET = - 4 * SIZE, AOFFSET
  8667. ;;
  8668. LDFPD f120, f121 = [AOFFSET]
  8669. ;;
  8670. FMPY f32 = f104, f96
  8671. FMPY f33 = f105, f96
  8672. FMPY f34 = f104, f112
  8673. FMPY f35 = f105, f112
  8674. ;;
  8675. FMA_C f96 = f105, f97, f32
  8676. FMA_D f97 = f104, f97, f33
  8677. FMA_C f112 = f105, f113, f34
  8678. FMA_D f113 = f104, f113, f35
  8679. ;;
  8680. FNMA f64 = f106, f96, f64
  8681. FMA_A f65 = f107, f96, f65
  8682. FNMA f80 = f106, f112, f80
  8683. FMA_A f81 = f107, f112, f81
  8684. ;;
  8685. FMA_B f64 = f107, f97, f64
  8686. FNMA f65 = f106, f97, f65
  8687. FMA_B f80 = f107, f113, f80
  8688. FNMA f81 = f106, f113, f81
  8689. ;;
  8690. FMPY f32 = f120, f64
  8691. FMPY f33 = f121, f64
  8692. FMPY f34 = f120, f80
  8693. FMPY f35 = f121, f80
  8694. ;;
  8695. FMA_C f64 = f121, f65, f32
  8696. FMA_D f65 = f120, f65, f33
  8697. FMA_C f80 = f121, f81, f34
  8698. FMA_D f81 = f120, f81, f35
  8699. ;;
  8700. #endif
  /* LT: same three-step pattern in the opposite order.  Coefficient pairs
     come from AOFFSET + 0, + 2 and + 6; (f64,f65)/(f80,f81) are scaled first,
     then (f96,f97)/(f112,f113) are updated and scaled.  */
  8701. #ifdef LT
  8702. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  8703. ;;
  8704. LDFPD f74, f75 = [AOFFSET]
  8705. adds AOFFSET = 4 * SIZE, AOFFSET
  8706. ;;
  8707. LDFPD f90, f91 = [AOFFSET]
  8708. adds AOFFSET = - 6 * SIZE, AOFFSET
  8709. ;;
  8710. FMPY f32 = f72, f64
  8711. FMPY f33 = f73, f64
  8712. FMPY f34 = f72, f80
  8713. FMPY f35 = f73, f80
  8714. ;;
  8715. FMA_C f64 = f73, f65, f32
  8716. FMA_D f65 = f72, f65, f33
  8717. FMA_C f80 = f73, f81, f34
  8718. FMA_D f81 = f72, f81, f35
  8719. ;;
  8720. FNMA f96 = f74, f64, f96
  8721. FMA_A f97 = f75, f64, f97
  8722. FNMA f112 = f74, f80, f112
  8723. FMA_A f113 = f75, f80, f113
  8724. ;;
  8725. FMA_B f96 = f75, f65, f96
  8726. FNMA f97 = f74, f65, f97
  8727. FMA_B f112 = f75, f81, f112
  8728. FNMA f113 = f74, f81, f113
  8729. ;;
  8730. FMPY f32 = f90, f96
  8731. FMPY f33 = f91, f96
  8732. FMPY f34 = f90, f112
  8733. FMPY f35 = f91, f112
  8734. ;;
  8735. FMA_C f96 = f91, f97, f32
  8736. FMA_D f97 = f90, f97, f33
  8737. FMA_C f112 = f91, f113, f34
  8738. FMA_D f113 = f90, f113, f35
  8739. ;;
  8740. #endif
  /* RN: coefficients come from BOFFSET + 0, + 2 and + 6.  Here the pairs
     (f64,f65)/(f96,f97) are scaled first and (f80,f81)/(f112,f113) second.  */
  8741. #ifdef RN
  8742. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  8743. ;;
  8744. LDFPD f74, f75 = [BOFFSET]
  8745. adds BOFFSET = 4 * SIZE, BOFFSET
  8746. ;;
  8747. LDFPD f90, f91 = [BOFFSET]
  8748. adds BOFFSET = - 6 * SIZE, BOFFSET
  8749. ;;
  8750. FMPY f32 = f72, f64
  8751. FMPY f33 = f73, f64
  8752. FMPY f34 = f72, f96
  8753. FMPY f35 = f73, f96
  8754. ;;
  8755. FMA_C f64 = f73, f65, f32
  8756. FMA_D f65 = f72, f65, f33
  8757. FMA_C f96 = f73, f97, f34
  8758. FMA_D f97 = f72, f97, f35
  8759. ;;
  8760. FNMA f80 = f74, f64, f80
  8761. FMA_A f81 = f75, f64, f81
  8762. FNMA f112 = f74, f96, f112
  8763. FMA_A f113 = f75, f96, f113
  8764. ;;
  8765. FMA_B f80 = f75, f65, f80
  8766. FNMA f81 = f74, f65, f81
  8767. FMA_B f112 = f75, f97, f112
  8768. FNMA f113 = f74, f97, f113
  8769. ;;
  8770. FMPY f32 = f90, f80
  8771. FMPY f33 = f91, f80
  8772. FMPY f34 = f90, f112
  8773. FMPY f35 = f91, f112
  8774. ;;
  8775. FMA_C f80 = f91, f81, f32
  8776. FMA_D f81 = f90, f81, f33
  8777. FMA_C f112 = f91, f113, f34
  8778. FMA_D f113 = f90, f113, f35
  8779. ;;
  8780. #endif
  /* RT: coefficients come from BOFFSET + 6, + 4 and + 0; (f80,f81) and
     (f112,f113) are scaled first, then (f64,f65)/(f96,f97).  */
  8781. #ifdef RT
  8782. adds BOFFSET = 6 * SIZE, BOFFSET
  8783. ;;
  8784. LDFPD f104, f105 = [BOFFSET]
  8785. adds BOFFSET = - 2 * SIZE, BOFFSET
  8786. ;;
  8787. LDFPD f106, f107 = [BOFFSET]
  8788. adds BOFFSET = - 4 * SIZE, BOFFSET
  8789. ;;
  8790. LDFPD f120, f121 = [BOFFSET]
  8791. ;;
  8792. FMPY f32 = f104, f80
  8793. FMPY f33 = f105, f80
  8794. FMPY f34 = f104, f112
  8795. FMPY f35 = f105, f112
  8796. ;;
  8797. FMA_C f80 = f105, f81, f32
  8798. FMA_D f81 = f104, f81, f33
  8799. FMA_C f112 = f105, f113, f34
  8800. FMA_D f113 = f104, f113, f35
  8801. ;;
  8802. FNMA f64 = f106, f80, f64
  8803. FMA_A f65 = f107, f80, f65
  8804. FNMA f96 = f106, f112, f96
  8805. FMA_A f97 = f107, f112, f97
  8806. ;;
  8807. FMA_B f64 = f107, f81, f64
  8808. FNMA f65 = f106, f81, f65
  8809. FMA_B f96 = f107, f113, f96
  8810. FNMA f97 = f106, f113, f97
  8811. ;;
  8812. FMPY f32 = f120, f64
  8813. FMPY f33 = f121, f64
  8814. FMPY f34 = f120, f96
  8815. FMPY f35 = f121, f96
  8816. ;;
  8817. FMA_C f64 = f121, f65, f32
  8818. FMA_D f65 = f120, f65, f33
  8819. FMA_C f96 = f121, f97, f34
  8820. FMA_D f97 = f120, f97, f35
  8821. ;;
  8822. #endif
  /* Write the eight results back to the packed buffer through two pointers
     4 * SIZE apart.  The store order mirrors the load order used above for
     the same variant.  The post-increments add up to 8 * SIZE, which the
     final adds undoes.  */
  8823. #if defined(LN) || defined(LT)
  8824. adds BOFFSET2 = 4 * SIZE, BOFFSET
  8825. ;;
  8826. STFD [BOFFSET] = f64, SIZE
  8827. STFD [BOFFSET2] = f96, SIZE
  8828. ;;
  8829. STFD [BOFFSET] = f65, SIZE
  8830. STFD [BOFFSET2] = f97, SIZE
  8831. ;;
  8832. STFD [BOFFSET] = f80, SIZE
  8833. STFD [BOFFSET2] = f112, SIZE
  8834. ;;
  8835. STFD [BOFFSET] = f81, 5 * SIZE
  8836. STFD [BOFFSET2] = f113, 5 * SIZE
  8837. ;;
  8838. adds BOFFSET = - 8 * SIZE, BOFFSET
  8839. ;;
  8840. #else
  8841. adds AOFFSET2 = 4 * SIZE, AOFFSET
  8842. ;;
  8843. STFD [AOFFSET] = f64, SIZE
  8844. STFD [AOFFSET2] = f80, SIZE
  8845. ;;
  8846. STFD [AOFFSET] = f65, SIZE
  8847. STFD [AOFFSET2] = f81, SIZE
  8848. ;;
  8849. STFD [AOFFSET] = f96, SIZE
  8850. STFD [AOFFSET2] = f112, SIZE
  8851. ;;
  8852. STFD [AOFFSET] = f97, 5 * SIZE
  8853. STFD [AOFFSET2] = f113, 5 * SIZE
  8854. ;;
  8855. adds AOFFSET = - 8 * SIZE, AOFFSET
  8856. ;;
  8857. #endif
  /* LN steps C1/C2 back by 4 * SIZE before storing; the stores below then
     advance each pointer by 4 * SIZE again.  */
  8858. #ifdef LN
  8859. adds C1 = -4 * SIZE, C1
  8860. adds C2 = -4 * SIZE, C2
  8861. #endif
  8862. ;;
  /* C1 receives f64, f65, f96, f97; C2 receives f80, f81, f112, f113.  */
  8863. STFD [C1 ] = f64, SIZE
  8864. ;;
  8865. STFD [C1 ] = f65, SIZE
  8866. ;;
  8867. STFD [C1 ] = f96, SIZE
  8868. ;;
  8869. STFD [C1 ] = f97, SIZE
  8870. ;;
  8871. STFD [C2 ] = f80, SIZE
  8872. ;;
  8873. STFD [C2 ] = f81, SIZE
  8874. ;;
  8875. STFD [C2 ] = f112, SIZE
  8876. ;;
  8877. STFD [C2 ] = f113, SIZE
  8878. ;;
  /* Clear the accumulators (f0 is the architectural constant +0.0).  */
  8879. mov f64 = f0
  8880. mov f65 = f0
  8881. mov f80 = f0
  8882. mov f81 = f0
  8883. mov f96 = f0
  8884. mov f97 = f0
  8885. mov f112 = f0
  8886. mov f113 = f0
  8887. ;;
  /* LN: undo the advance made by the C stores so C1/C2 point at the tile
     that was just written.  */
  8888. #ifdef LN
  8889. adds C1 = -4 * SIZE, C1
  8890. adds C2 = -4 * SIZE, C2
  8891. #endif
  8892. ;;
  /* NOTE(review): p6 set here is not used by any predicated instruction
     before .L070 overwrites it with tbit.z, and I is only decremented;
     this looks like a leftover from a looping variant -- confirm.  */
  8893. cmp.ne p6, p0 = 1, I
  8894. ;;
  8895. adds I = -1, I
  8896. ;;
  /* r2 = K << ZBASE_SHIFT, L = K - KK.  RT adds 2 * r2 to AORIG; LT/RN add
     2 * (L << ZBASE_SHIFT) to both AOFFSET and BOFFSET.  */
  8897. shladd r2 = K, ZBASE_SHIFT, r0
  8898. ;;
  8899. sub L = K, KK
  8900. ;;
  8901. #ifdef RT
  8902. shladd AORIG = r2, 1, AORIG
  8903. #endif
  8904. ;;
  8905. #if defined(LT) || defined(RN)
  8906. shladd L = L, ZBASE_SHIFT, r0
  8907. ;;
  8908. shladd AOFFSET = L, 1, AOFFSET
  8909. shladd BOFFSET = L, 1, BOFFSET
  8910. #endif
  8911. ;;
  /* KK moves by the tile height handled here: +2 for LT, -2 for LN.  */
  8912. #ifdef LT
  8913. adds KK = 2, KK
  8914. #elif defined LN
  8915. adds KK = -2, KK
  8916. #else
  8917. nop __LINE__
  8918. #endif
  8919. ;;
  /* Reload L for the next stage: KK for LT/RN, K - KK otherwise.  */
  8920. #if defined(LT) || defined(RN)
  8921. mov L = KK
  8922. #else
  8923. sub L = K, KK
  8924. #endif
  8925. ;;
  /* .L070: entry for the case where bit 0 of M is set.  If that bit is clear
     control goes straight to .L089.  Otherwise this block sets up BOFFSET /
     AOFFSET, preloads the first operands and programs ar.lc for the .L072
     loop, or jumps to .L078 when there is nothing to accumulate.  */
  8926. .align 16
  8927. .L070:
  8928. { .mib
  8929. #if defined(LT) || defined(RN)
  8930. mov L = KK
  8931. #else
  8932. sub L = K, KK
  8933. #endif
  8934. tbit.z p6, p7 = M, 0
  8935. (p6) br.cond.dptk .L089
  8936. }
  8937. ;;
  /* p7 = (L != 0) guards the preloads below.  r2 = K << ZBASE_SHIFT and
     r3 = KK << ZBASE_SHIFT are byte offsets used by the LN/RT path.  */
  8938. { .mmi
  8939. cmp.ne p7, p0 = r0, L
  8940. adds BOFFSET = 0 * SIZE, B
  8941. shl r2 = K, ZBASE_SHIFT
  8942. }
  8943. { .mmi
  8944. shladd r3 = KK, ZBASE_SHIFT, r0
  8945. nop __LINE__
  8946. nop __LINE__
  8947. }
  8948. ;;
  8949. #if defined(LT) || defined(RN)
  8950. { .mfb
  8951. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8952. }
  8953. ;;
  8954. #else
  /* LN/RT: BOFFSET = B + 2 * r3; LN additionally moves AORIG back by r2;
     AOFFSET = AORIG + r3.  */
  8955. { .mfi
  8956. shladd BOFFSET = r3, 1, B
  8957. #ifdef LN
  8958. sub AORIG = AORIG, r2
  8959. #else
  8960. nop __LINE__
  8961. #endif
  8962. }
  8963. ;;
  8964. { .mfi
  8965. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8966. add AOFFSET = r3, AORIG
  8967. }
  8968. ;;
  8969. #endif
  8970. ;;
  /* Loop count: ar.lc = ((L + 1) >> 1) - 1.  p12 records that bit 0 of
     (L + 1) is zero; .L072 uses it to skip the second half of its last
     iteration.  */
  8971. adds L = 1, L
  8972. ;;
  8973. { .mii
  8974. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8975. tbit.z p12, p0 = L, 0
  8976. shr L = L, 1
  8977. }
  8978. ;;
  8979. { .mmi
  8980. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8981. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  8982. adds L = -1, L
  8983. }
  8984. ;;
  /* p3 starts out true (cmp.eq r0, r0).  */
  8985. { .mmi
  8986. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  8987. cmp.eq p3, p0 = r0, r0
  8988. mov ar.lc = L
  8989. }
  8990. ;;
  /* L == -1 means zero iterations: skip the loop entirely.  */
  8991. cmp.eq p6, p0 = -1, L
  8992. (p6) br.cond.dpnt .L078
  8993. ;;
  /* .L072: counted multiply-accumulate loop (br.cloop on ar.lc).  Each pass
     handles two steps: the first multiplies f32/f33 by f48..f51, the second
     (under p3) multiplies f40/f41 by f56..f59.  Operands for the second step
     are loaded under p3, operands for the next pass under p4 = (L != 0).
     When p12 is set, p3 is cleared on the pass where L has reached 0, so the
     second step of the final pass is skipped.  After the loop the partial
     sums in f96, f97, f112 and f113 are folded into f64, f65, f80 and f81 by
     FCALC_A / FCALC_B (macros defined outside this chunk).  */
  8994. .align 16
  8995. .L072:
  8996. { .mfb
  8997. lfetch.nt1 [PREA], 4 * SIZE
  8998. FMA f64 = f32, f48, f64 // A1 * B1
  8999. nop __LINE__
  9000. }
  9001. { .mfi
  9002. nop __LINE__
  9003. FMA f96 = f32, f49, f96 // A1 * B2
  9004. (p12) cmp.ne p3, p0 = 0, L
  9005. }
  9006. ;;
  9007. { .mfi
  9008. lfetch.nt1 [PREB], 8 * SIZE
  9009. FMA f80 = f32, f50, f80 // A1 * B3
  9010. cmp.ne p4, p5 = 0, L
  9011. }
  9012. { .mfb
  9013. nop __LINE__
  9014. FMA f112 = f32, f51, f112 // A1 * B4
  9015. nop __LINE__
  9016. }
  9017. ;;
  9018. { .mfi
  9019. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  9020. FMA f65 = f33, f48, f65 // A2 * B1
  9021. }
  9022. { .mfi
  9023. nop __LINE__
  9024. FMA f97 = f33, f49, f97 // A2 * B2
  9025. }
  9026. ;;
  9027. { .mfi
  9028. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  9029. FMA f81 = f33, f50, f81 // A2 * B3
  9030. }
  9031. { .mmf
  9032. nop __LINE__
  9033. nop __LINE__
  9034. FMA f113 = f33, f51, f113 // A2 * B4
  9035. }
  9036. ;;
  /* Second step of the pass, predicated on p3.  */
  9037. { .mfb
  9038. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  9039. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  9040. nop __LINE__
  9041. }
  9042. { .mmf
  9043. nop __LINE__
  9044. nop __LINE__
  9045. (p3) FMA f96 = f40, f57, f96 // A1 * B2
  9046. }
  9047. ;;
  9048. { .mfb
  9049. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  9050. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  9051. nop __LINE__
  9052. }
  9053. { .mmf
  9054. nop __LINE__
  9055. nop __LINE__
  9056. (p3) FMA f112 = f40, f59, f112 // A1 * B4
  9057. }
  9058. ;;
  9059. { .mfb
  9060. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  9061. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  9062. nop __LINE__
  9063. }
  9064. { .mfb
  9065. nop __LINE__
  9066. (p3) FMA f97 = f41, f57, f97 // A2 * B2
  9067. nop __LINE__
  9068. }
  9069. ;;
  /* L is decremented in step with ar.lc so the p3/p4 tests see the remaining
     pass count.  */
  9070. { .mfi
  9071. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  9072. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  9073. adds L = -1, L
  9074. }
  9075. { .mfb
  9076. nop __LINE__
  9077. (p3) FMA f113 = f41, f59, f113 // A2 * B4
  9078. br.cloop.sptk.few .L072
  9079. }
  9080. ;;
  /* Fold the cross-term accumulators into the four result registers.  */
  9081. { .mfb
  9082. nop __LINE__
  9083. FCALC_A f64 = f64, f97
  9084. nop __LINE__
  9085. }
  9086. { .mfb
  9087. nop __LINE__
  9088. FCALC_A f80 = f80, f113
  9089. nop __LINE__
  9090. }
  9091. { .mfb
  9092. nop __LINE__
  9093. FCALC_B f65 = f65, f96
  9094. nop __LINE__
  9095. }
  9096. { .mfb
  9097. nop __LINE__
  9098. FCALC_B f81 = f81, f112
  9099. nop __LINE__
  9100. }
  9101. ;;
  /* .L078: solve-and-store stage for the .L070/.L072 path.  It works on the
     four accumulators f64/f65 and f80/f81 (presumably two complex values,
     one per column C1 and C2 -- confirm), combines them with the packed
     right-hand side, applies the LN/LT/RN/RT specific substitution, writes
     the result to the packed buffer and to C1/C2, clears all eight
     accumulator registers and advances the pointers and KK.  FSUB*, FMPY,
     FMA_A..FMA_D and FNMA are macros defined outside this chunk.  */
  9102. .L078:
  /* LN/RT only: r2 = (KK - 1) for LN or (KK - 2) for RT, scaled by
     ZBASE_SHIFT; AOFFSET = AORIG + r2 and BOFFSET = B + 2 * r2.  */
  9103. #if defined(LN) || defined(RT)
  9104. #ifdef LN
  9105. adds r2 = -1, KK
  9106. #else
  9107. adds r2 = -2, KK
  9108. #endif
  9109. ;;
  9110. shladd r2 = r2, ZBASE_SHIFT, r0
  9111. ;;
  9112. add AOFFSET = r2, AORIG
  9113. shladd BOFFSET = r2, 1, B
  9114. ;;
  9115. #endif
  /* Load four values (from BOFFSET for LN/LT, from AOFFSET otherwise),
     restore the pointer, and combine with the accumulators.  */
  9116. #if defined(LN) || defined(LT)
  9117. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  9118. ;;
  9119. LDFPD f74, f75 = [BOFFSET]
  9120. adds BOFFSET = -2 * SIZE, BOFFSET
  9121. ;;
  9122. FSUB f64 = f72, f64
  9123. FSUB_A f65 = f73, f65
  9124. FSUB f80 = f74, f80
  9125. FSUB_A f81 = f75, f81
  9126. ;;
  9127. #else
  9128. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  9129. ;;
  9130. LDFPD f88, f89 = [AOFFSET]
  9131. adds AOFFSET = -2 * SIZE, AOFFSET
  9132. ;;
  9133. FSUB f64 = f72, f64
  9134. FSUB f65 = f73, f65
  9135. FSUB f80 = f88, f80
  9136. FSUB f81 = f89, f81
  9137. ;;
  9138. #endif
  /* LN and LT: a single coefficient pair is read from AOFFSET and applied to
     both (f64,f65) and (f80,f81).  The FMPY + FMA_C/FMA_D group looks like a
     complex multiply by that pair -- confirm against the macros.  */
  9139. #ifdef LN
  9140. LDFPD f120, f121 = [AOFFSET]
  9141. ;;
  9142. FMPY f32 = f120, f64
  9143. FMPY f33 = f121, f64
  9144. FMPY f34 = f120, f80
  9145. FMPY f35 = f121, f80
  9146. ;;
  9147. FMA_C f64 = f121, f65, f32
  9148. FMA_D f65 = f120, f65, f33
  9149. FMA_C f80 = f121, f81, f34
  9150. FMA_D f81 = f120, f81, f35
  9151. ;;
  9152. #endif
  9153. #ifdef LT
  9154. LDFPD f72, f73 = [AOFFSET]
  9155. ;;
  9156. FMPY f32 = f72, f64
  9157. FMPY f33 = f73, f64
  9158. FMPY f34 = f72, f80
  9159. FMPY f35 = f73, f80
  9160. ;;
  9161. FMA_C f64 = f73, f65, f32
  9162. FMA_D f65 = f72, f65, f33
  9163. FMA_C f80 = f73, f81, f34
  9164. FMA_D f81 = f72, f81, f35
  9165. ;;
  9166. #endif
  /* RN: coefficients from BOFFSET + 0, + 2 and + 6 (SIZE units).  (f64,f65)
     is scaled, its contribution removed from (f80,f81), which is then
     scaled.  */
  9167. #ifdef RN
  9168. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  9169. ;;
  9170. LDFPD f74, f75 = [BOFFSET]
  9171. adds BOFFSET = 4 * SIZE, BOFFSET
  9172. ;;
  9173. LDFPD f90, f91 = [BOFFSET]
  9174. adds BOFFSET = - 6 * SIZE, BOFFSET
  9175. ;;
  9176. FMPY f32 = f72, f64
  9177. FMPY f33 = f73, f64
  9178. ;;
  9179. FMA_C f64 = f73, f65, f32
  9180. FMA_D f65 = f72, f65, f33
  9181. ;;
  9182. FNMA f80 = f74, f64, f80
  9183. FMA_A f81 = f75, f64, f81
  9184. ;;
  9185. FMA_B f80 = f75, f65, f80
  9186. FNMA f81 = f74, f65, f81
  9187. ;;
  9188. FMPY f32 = f90, f80
  9189. FMPY f33 = f91, f80
  9190. ;;
  9191. FMA_C f80 = f91, f81, f32
  9192. FMA_D f81 = f90, f81, f33
  9193. ;;
  9194. #endif
  /* RT: coefficients from BOFFSET + 6, + 4 and + 0; (f80,f81) is handled
     first, then (f64,f65).  */
  9195. #ifdef RT
  9196. adds BOFFSET = 6 * SIZE, BOFFSET
  9197. ;;
  9198. LDFPD f104, f105 = [BOFFSET]
  9199. adds BOFFSET = - 2 * SIZE, BOFFSET
  9200. ;;
  9201. LDFPD f106, f107 = [BOFFSET]
  9202. adds BOFFSET = - 4 * SIZE, BOFFSET
  9203. ;;
  9204. LDFPD f120, f121 = [BOFFSET]
  9205. ;;
  9206. FMPY f32 = f104, f80
  9207. FMPY f33 = f105, f80
  9208. ;;
  9209. FMA_C f80 = f105, f81, f32
  9210. FMA_D f81 = f104, f81, f33
  9211. ;;
  9212. FNMA f64 = f106, f80, f64
  9213. FMA_A f65 = f107, f80, f65
  9214. ;;
  9215. FMA_B f64 = f107, f81, f64
  9216. FNMA f65 = f106, f81, f65
  9217. ;;
  9218. FMPY f32 = f120, f64
  9219. FMPY f33 = f121, f64
  9220. ;;
  9221. FMA_C f64 = f121, f65, f32
  9222. FMA_D f65 = f120, f65, f33
  9223. ;;
  9224. #endif
  /* Write the four results back to the packed buffer and rewind the pointer
     by the 4 * SIZE the stores advanced it.  */
  9225. #if defined(LN) || defined(LT)
  9226. STFD [BOFFSET] = f64, SIZE
  9227. ;;
  9228. STFD [BOFFSET] = f65, SIZE
  9229. ;;
  9230. STFD [BOFFSET] = f80, SIZE
  9231. ;;
  9232. STFD [BOFFSET] = f81, SIZE
  9233. ;;
  9234. adds BOFFSET = - 4 * SIZE, BOFFSET
  9235. ;;
  9236. #else
  9237. STFD [AOFFSET] = f64, SIZE
  9238. ;;
  9239. STFD [AOFFSET] = f65, SIZE
  9240. ;;
  9241. STFD [AOFFSET] = f80, SIZE
  9242. ;;
  9243. STFD [AOFFSET] = f81, SIZE
  9244. ;;
  9245. adds AOFFSET = - 4 * SIZE, AOFFSET
  9246. ;;
  9247. #endif
  /* LN steps C1/C2 back by 2 * SIZE before the stores and again afterwards,
     cancelling the 2 * SIZE advance of the stores.  */
  9248. #ifdef LN
  9249. adds C1 = -2 * SIZE, C1
  9250. adds C2 = -2 * SIZE, C2
  9251. #endif
  9252. ;;
  9253. STFD [C1 ] = f64, SIZE
  9254. ;;
  9255. STFD [C1 ] = f65, SIZE
  9256. ;;
  9257. STFD [C2 ] = f80, SIZE
  9258. ;;
  9259. STFD [C2 ] = f81, SIZE
  9260. ;;
  /* Clear all eight accumulator registers (f0 is the constant +0.0).  */
  9261. mov f64 = f0
  9262. mov f65 = f0
  9263. mov f80 = f0
  9264. mov f81 = f0
  9265. mov f96 = f0
  9266. mov f97 = f0
  9267. mov f112 = f0
  9268. mov f113 = f0
  9269. ;;
  9270. #ifdef LN
  9271. adds C1 = -2 * SIZE, C1
  9272. adds C2 = -2 * SIZE, C2
  9273. #endif
  9274. ;;
  /* NOTE(review): p6 set here is not used by any predicated instruction
     before .L090 overwrites it with tbit.z; I is reassigned at .L090 as
     well.  Looks like a leftover -- confirm.  */
  9275. cmp.ne p6, p0 = 1, I
  9276. ;;
  9277. adds I = -1, I
  9278. ;;
  /* r2 = K << ZBASE_SHIFT, L = K - KK.  RT adds r2 to AORIG; LT/RN add
     L << ZBASE_SHIFT to AOFFSET and twice that to BOFFSET.  */
  9279. shladd r2 = K, ZBASE_SHIFT, r0
  9280. ;;
  9281. sub L = K, KK
  9282. ;;
  9283. #ifdef RT
  9284. add AORIG = r2, AORIG
  9285. #endif
  9286. ;;
  9287. #if defined(LT) || defined(RN)
  9288. shladd L = L, ZBASE_SHIFT, r0
  9289. ;;
  9290. add AOFFSET = L, AOFFSET
  9291. shladd BOFFSET = L, 1, BOFFSET
  9292. #endif
  9293. ;;
  /* KK moves by one: +1 for LT, -1 for LN.  */
  9294. #ifdef LT
  9295. adds KK = 1, KK
  9296. #elif defined LN
  9297. adds KK = -1, KK
  9298. #else
  9299. nop __LINE__
  9300. #endif
  9301. ;;
  9302. #if defined(LT) || defined(RN)
  9303. mov L = KK
  9304. #else
  9305. sub L = K, KK
  9306. #endif
  9307. ;;
  /* .L089: bookkeeping performed before falling through to .L090.
     LN advances B by 2 * (K << ZBASE_SHIFT); LT/RN take B from BOFFSET;
     RN adds 2 to KK and RT subtracts 2; AOFFSET is reset to A.  */
  9308. .align 16
  9309. .L089:
  9310. #ifdef LN
  9311. shladd KK8 = K, ZBASE_SHIFT, r0
  9312. ;;
  9313. shladd B = KK8, 1, B
  9314. #endif
  9315. #if defined(LT) || defined(RN)
  9316. mov B = BOFFSET
  9317. #endif
  9318. #ifdef RN
  9319. adds KK = 2, KK
  9320. #endif
  9321. #ifdef RT
  9322. adds KK = -2, KK
  9323. #endif
  9324. ;;
  9325. { .mmi
  9326. mov AOFFSET = A
  9327. nop __LINE__
  9328. }
  9329. ;;
  /* .L090: entry for the case where bit 0 of N is set; if that bit is clear
     control leaves for .L999.  Sets I = M >> 2, initialises C1, KK, AORIG or
     AOFFSET and L for this pass, then branches to .L100 when I is zero or
     falls through to .L092 otherwise.  */
  9330. .align 16
  9331. .L090:
  9332. shr I = M, 2
  9333. tbit.z p6, p0 = N, 0
  9334. (p6) br.cond.dpnt .L999
  9335. ;;
  /* RT: step B back by K << ZBASE_SHIFT and C back by LDC before use.  */
  9336. #ifdef RT
  9337. { .mmi
  9338. shl r2 = K, ZBASE_SHIFT
  9339. }
  9340. ;;
  9341. { .mmi
  9342. sub B = B, r2
  9343. sub C = C, LDC
  9344. nop __LINE__
  9345. }
  9346. ;;
  9347. #endif
  /* C1 is the output pointer for this pass.  KK starts at M + OFFSET for LN
     and at OFFSET for LT; the other variants leave KK unchanged here.  */
  9348. mov C1 = C
  9349. #ifdef LN
  9350. add KK = M, OFFSET
  9351. #elif defined LT
  9352. mov KK = OFFSET
  9353. #else
  9354. nop __LINE__
  9355. #endif
  9356. ;;
  9357. #if defined(LN) || defined(RT)
  9358. mov AORIG = A
  9359. #else
  9360. mov AOFFSET = A
  9361. #endif
  9362. ;;
  9363. #if defined(LT) || defined(RN)
  9364. mov L = KK
  9365. #else
  9366. sub L = K, KK
  9367. #endif
  9368. ;;
  /* p6 = (I == 0): nothing for .L092 to do, go to .L100.  Every variant
     except RT advances C by LDC here.  */
  9369. { .mib
  9370. cmp.eq p6, p7 = 0, I
  9371. #ifndef RT
  9372. add C = LDC, C
  9373. #else
  9374. nop __LINE__
  9375. #endif
  9376. (p6) br.cond.dpnt .L100
  9377. }
  9378. ;;
  /* .L092: setup for the .L093 loop.  Positions BOFFSET / AOFFSET, preloads
     f48/f49 from BOFFSET and f32..f39 from AOFFSET (all under p7 = L != 0),
     issues the C prefetch, sets C5 = C1 + 4 * SIZE, programs
     ar.lc = ((L + 1) >> 1) - 1 and jumps to .L098 when that count is -1.  */
  9379. .align 16
  9380. .L092:
  /* r2 = K << (2 + ZBASE_SHIFT) and r3 = KK << ZBASE_SHIFT are byte offsets
     used by the LN/RT path below.  */
  9381. { .mmi
  9382. cmp.ne p7, p0 = r0, L
  9383. adds BOFFSET = 0 * SIZE, B
  9384. shl r2 = K, 2 + ZBASE_SHIFT
  9385. }
  9386. { .mmi
  9387. shladd r3 = KK, ZBASE_SHIFT, r0
  9388. nop __LINE__
  9389. nop __LINE__
  9390. }
  9391. ;;
  9392. #if defined(LT) || defined(RN)
  9393. { .mfb
  9394. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  9395. mov f66 = f0
  9396. nop __LINE__
  9397. }
  9398. { .mmf
  9399. nop __LINE__
  9400. nop __LINE__
  9401. mov f67 = f0
  9402. }
  9403. ;;
  9404. #else
  /* LN/RT: BOFFSET = B + r3; LN moves AORIG back by r2;
     AOFFSET = AORIG + 4 * r3.  */
  9405. { .mfi
  9406. add BOFFSET = r3, B
  9407. mov f66 = f0
  9408. #ifdef LN
  9409. sub AORIG = AORIG, r2
  9410. #else
  9411. nop __LINE__
  9412. #endif
  9413. }
  9414. ;;
  9415. { .mfi
  9416. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  9417. mov f67 = f0
  9418. shladd AOFFSET = r3, 2, AORIG
  9419. }
  9420. ;;
  9421. #endif
  9422. ;;
  /* p12 records that bit 0 of (L + 1) is zero; .L093 uses it to skip the
     second half of its last pass.  */
  9423. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  9424. adds L = 1, L
  9425. ;;
  9426. { .mfi
  9427. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  9428. tbit.z p12, p0 = L, 0
  9429. }
  9430. { .mfi
  9431. adds PREC = CPREFETCHSIZE * SIZE, C1
  9432. shr L = L, 1
  9433. }
  9434. ;;
  9435. { .mfi
  9436. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  9437. adds L = -1, L
  9438. }
  9439. { .mmf
  9440. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  9441. CPREFETCH [PREC]
  9442. }
  9443. ;;
  9444. { .mfi
  9445. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  9446. mov ar.lc = L
  9447. }
  /* p3 starts out true (cmp.eq r0, r0).  */
  9448. { .mmi
  9449. adds C5 = 4 * SIZE, C1
  9450. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  9451. cmp.eq p3, p0 = r0, r0
  9452. }
  9453. ;;
  /* L == -1 means zero iterations: skip the loop entirely.  */
  9454. cmp.eq p6, p0 = -1, L
  9455. (p6) br.cond.dpnt .L098
  9456. ;;
  /* .L093: counted multiply-accumulate loop (br.cloop on ar.lc).  Each pass
     handles two steps.  The first multiplies the A values f32..f39 by the B
     pair f48/f49; the second (under p3) multiplies f40..f47 by f56/f57.
     Results accumulate in f64/f65, f80/f81, f96/f97 and f112/f113.
     Operands for the second step are loaded under p3, operands for the next
     pass under p4 = (L != 0).  When p12 is set, p3 is cleared on the pass
     where L has reached 0, so the second step of the final pass is skipped.
     FMA_A / FMA_B are macros defined outside this chunk.  */
  9457. .align 16
  9458. .L093:
  9459. /* 1 */
  9460. { .mfi
  9461. lfetch.nt1 [PREA], 16 * SIZE
  9462. FMA f64 = f32, f48, f64 // A1 * B1
  9463. cmp.ne p4, p5 = 0, L
  9464. }
  9465. { .mfi
  9466. nop __LINE__
  9467. FMA_B f65 = f32, f49, f65 // A1 * B2
  9468. (p12) cmp.ne p3, p0 = 0, L
  9469. }
  9470. ;;
  9471. { .mfi
  9472. lfetch.nt1 [PREB], 4 * SIZE
  9473. FMA f80 = f34, f48, f80 // A3 * B1
  9474. nop __LINE__
  9475. }
  9476. { .mfi
  9477. nop __LINE__
  9478. FMA_B f81 = f34, f49, f81 // A3 * B2
  9479. nop __LINE__
  9480. }
  9481. ;;
  9482. { .mfi
  9483. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  9484. FMA f96 = f36, f48, f96 // A5 * B1
  9485. nop __LINE__
  9486. }
  9487. { .mfi
  9488. nop __LINE__
  9489. FMA_B f97 = f36, f49, f97 // A5 * B2
  9490. nop __LINE__
  9491. }
  9492. ;;
  9493. { .mfb
  9494. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  9495. FMA f112 = f38, f48, f112 // A7 * B1
  9496. nop __LINE__
  9497. }
  9498. { .mfb
  9499. nop __LINE__
  9500. FMA_B f113 = f38, f49, f113 // A7 * B2
  9501. nop __LINE__
  9502. }
  9503. ;;
  9504. { .mfb
  9505. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  9506. FMA f65 = f33, f48, f65 // A2 * B1
  9507. nop __LINE__
  9508. }
  9509. { .mfb
  9510. nop __LINE__
  9511. FMA_A f64 = f33, f49, f64 // A2 * B2
  9512. nop __LINE__
  9513. }
  9514. ;;
  9515. { .mfb
  9516. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  9517. FMA f81 = f35, f48, f81 // A4 * B1
  9518. nop __LINE__
  9519. }
  9520. { .mfb
  9521. nop __LINE__
  9522. FMA_A f80 = f35, f49, f80 // A4 * B2
  9523. nop __LINE__
  9524. }
  9525. ;;
  9526. { .mfb
  9527. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  9528. FMA f97 = f37, f48, f97 // A6 * B1
  9529. nop __LINE__
  9530. }
  9531. { .mfb
  9532. nop __LINE__
  9533. FMA_A f96 = f37, f49, f96 // A6 * B2
  9534. nop __LINE__
  9535. }
  9536. ;;
  9537. { .mfb
  9538. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  9539. FMA f113 = f39, f48, f113 // A8 * B1
  9540. nop __LINE__
  9541. }
  9542. { .mfb
  9543. nop __LINE__
  9544. FMA_A f112 = f39, f49, f112 // A8 * B2
  9545. nop __LINE__
  9546. }
  9547. ;;
  /* Second step of the pass, predicated on p3; the p4 loads refill the
     first-step operands for the next pass.  */
  9548. { .mfb
  9549. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  9550. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  9551. nop __LINE__
  9552. }
  9553. { .mfb
  9554. nop __LINE__
  9555. (p3) FMA_B f65 = f40, f57, f65 // A1 * B2
  9556. nop __LINE__
  9557. }
  9558. ;;
  9559. { .mfb
  9560. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  9561. (p3) FMA f80 = f42, f56, f80 // A3 * B1
  9562. nop __LINE__
  9563. }
  9564. { .mfb
  9565. nop __LINE__
  9566. (p3) FMA_B f81 = f42, f57, f81 // A3 * B2
  9567. nop __LINE__
  9568. }
  9569. ;;
  9570. { .mfb
  9571. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  9572. (p3) FMA f96 = f44, f56, f96 // A5 * B1
  9573. nop __LINE__
  9574. }
  9575. { .mfb
  9576. nop __LINE__
  9577. (p3) FMA_B f97 = f44, f57, f97 // A5 * B2
  9578. nop __LINE__
  9579. }
  9580. ;;
  9581. { .mfb
  9582. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  9583. (p3) FMA f112 = f46, f56, f112 // A7 * B1
  9584. nop __LINE__
  9585. }
  9586. { .mfb
  9587. nop __LINE__
  9588. (p3) FMA_B f113 = f46, f57, f113 // A7 * B2
  9589. nop __LINE__
  9590. }
  9591. ;;
  9592. { .mfb
  9593. nop __LINE__
  9594. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  9595. nop __LINE__
  9596. }
  9597. { .mfb
  9598. nop __LINE__
  9599. (p3) FMA_A f64 = f41, f57, f64 // A2 * B2
  9600. nop __LINE__
  9601. }
  9602. ;;
  9603. { .mfb
  9604. nop __LINE__
  9605. (p3) FMA f81 = f43, f56, f81 // A4 * B1
  9606. nop __LINE__
  9607. }
  9608. { .mfb
  9609. nop __LINE__
  9610. (p3) FMA_A f80 = f43, f57, f80 // A4 * B2
  9611. nop __LINE__
  9612. }
  9613. ;;
  9614. { .mfb
  9615. nop __LINE__
  9616. (p3) FMA f97 = f45, f56, f97 // A6 * B1
  9617. nop __LINE__
  9618. }
  9619. { .mfb
  9620. nop __LINE__
  9621. (p3) FMA_A f96 = f45, f57, f96 // A6 * B2
  9622. nop __LINE__
  9623. }
  9624. ;;
  /* L is decremented in step with ar.lc so the p3/p4 tests see the remaining
     pass count.  */
  9625. { .mfi
  9626. nop __LINE__
  9627. (p3) FMA f113 = f47, f56, f113 // A8 * B1
  9628. adds L = -1, L
  9629. }
  9630. { .mfb
  9631. nop __LINE__
  9632. (p3) FMA_A f112 = f47, f57, f112 // A8 * B2
  9633. br.cloop.sptk.few .L093
  9634. }
  9635. ;;
  9636. .L098:
  9637. #if defined(LN) || defined(RT)
  9638. #ifdef LN
  9639. adds r2 = -4, KK
  9640. #else
  9641. adds r2 = -1, KK
  9642. #endif
  9643. ;;
  9644. shladd r2 = r2, ZBASE_SHIFT, r0
  9645. ;;
  9646. shladd AOFFSET = r2, 2, AORIG
  9647. add BOFFSET = r2, B
  9648. ;;
  9649. #endif
  9650. #if defined(LN) || defined(LT)
  9651. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  9652. ;;
  9653. LDFPD f74, f75 = [BOFFSET], 2 * SIZE
  9654. ;;
  9655. LDFPD f88, f89 = [BOFFSET], 2 * SIZE
  9656. ;;
  9657. LDFPD f90, f91 = [BOFFSET]
  9658. adds BOFFSET = -6 * SIZE, BOFFSET
  9659. ;;
  9660. FSUB f64 = f72, f64
  9661. FSUB_A f65 = f73, f65
  9662. FSUB f80 = f74, f80
  9663. FSUB_A f81 = f75, f81
  9664. FSUB f96 = f88, f96
  9665. FSUB_A f97 = f89, f97
  9666. FSUB f112 = f90, f112
  9667. FSUB_A f113 = f91, f113
  9668. ;;
  9669. #else
  9670. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  9671. ;;
  9672. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  9673. ;;
  9674. LDFPD f88, f89 = [AOFFSET], 2 * SIZE
  9675. ;;
  9676. LDFPD f90, f91 = [AOFFSET]
  9677. adds AOFFSET = -6 * SIZE, AOFFSET
  9678. ;;
  9679. FSUB f64 = f72, f64
  9680. FSUB f65 = f73, f65
  9681. FSUB f80 = f74, f80
  9682. FSUB f81 = f75, f81
  9683. FSUB f96 = f88, f96
  9684. FSUB f97 = f89, f97
  9685. FSUB f112 = f90, f112
  9686. FSUB f113 = f91, f113
  9687. ;;
  9688. #endif
  9689. #ifdef LN
  9690. adds AOFFSET = 30 * SIZE, AOFFSET
  9691. ;;
  9692. LDFPD f72, f73 = [AOFFSET]
  9693. adds AOFFSET = - 2 * SIZE, AOFFSET
  9694. ;;
  9695. LDFPD f74, f75 = [AOFFSET]
  9696. adds AOFFSET = - 2 * SIZE, AOFFSET
  9697. ;;
  9698. LDFPD f76, f77 = [AOFFSET]
  9699. adds AOFFSET = - 2 * SIZE, AOFFSET
  9700. ;;
  9701. LDFPD f78, f79 = [AOFFSET]
  9702. adds AOFFSET = - 4 * SIZE, AOFFSET
  9703. ;;
  9704. LDFPD f88, f89 = [AOFFSET]
  9705. adds AOFFSET = - 2 * SIZE, AOFFSET
  9706. ;;
  9707. LDFPD f90, f91 = [AOFFSET]
  9708. adds AOFFSET = - 2 * SIZE, AOFFSET
  9709. ;;
  9710. LDFPD f92, f93 = [AOFFSET]
  9711. adds AOFFSET = - 6 * SIZE, AOFFSET
  9712. ;;
  9713. LDFPD f104, f105 = [AOFFSET]
  9714. adds AOFFSET = - 2 * SIZE, AOFFSET
  9715. ;;
  9716. LDFPD f106, f107 = [AOFFSET]
  9717. adds AOFFSET = - 8 * SIZE, AOFFSET
  9718. ;;
  9719. LDFPD f120, f121 = [AOFFSET]
  9720. ;;
  9721. FMPY f32 = f72, f112
  9722. FMPY f33 = f73, f112
  9723. ;;
  9724. FMA_C f112 = f73, f113, f32
  9725. FMA_D f113 = f72, f113, f33
  9726. ;;
  9727. FNMA f96 = f74, f112, f96
  9728. FMA_A f97 = f75, f112, f97
  9729. FNMA f80 = f76, f112, f80
  9730. FMA_A f81 = f77, f112, f81
  9731. FNMA f64 = f78, f112, f64
  9732. FMA_A f65 = f79, f112, f65
  9733. ;;
  9734. FMA_B f96 = f75, f113, f96
  9735. FNMA f97 = f74, f113, f97
  9736. FMA_B f80 = f77, f113, f80
  9737. FNMA f81 = f76, f113, f81
  9738. FMA_B f64 = f79, f113, f64
  9739. FNMA f65 = f78, f113, f65
  9740. ;;
  9741. FMPY f32 = f88, f96
  9742. FMPY f33 = f89, f96
  9743. ;;
  9744. FMA_C f96 = f89, f97, f32
  9745. FMA_D f97 = f88, f97, f33
  9746. ;;
  9747. FNMA f80 = f90, f96, f80
  9748. FMA_A f81 = f91, f96, f81
  9749. FNMA f64 = f92, f96, f64
  9750. FMA_A f65 = f93, f96, f65
  9751. ;;
  9752. FMA_B f80 = f91, f97, f80
  9753. FNMA f81 = f90, f97, f81
  9754. FMA_B f64 = f93, f97, f64
  9755. FNMA f65 = f92, f97, f65
  9756. ;;
  9757. FMPY f32 = f104, f80
  9758. FMPY f33 = f105, f80
  9759. ;;
  9760. FMA_C f80 = f105, f81, f32
  9761. FMA_D f81 = f104, f81, f33
  9762. ;;
  9763. FNMA f64 = f106, f80, f64
  9764. FMA_A f65 = f107, f80, f65
  9765. ;;
  9766. FMA_B f64 = f107, f81, f64
  9767. FNMA f65 = f106, f81, f65
  9768. ;;
  9769. FMPY f32 = f120, f64
  9770. FMPY f33 = f121, f64
  9771. ;;
  9772. FMA_C f64 = f121, f65, f32
  9773. FMA_D f65 = f120, f65, f33
  9774. ;;
  9775. #endif
  9776. #ifdef LT
  9777. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  9778. ;;
  9779. LDFPD f74, f75 = [AOFFSET], 2 * SIZE
  9780. ;;
  9781. LDFPD f76, f77 = [AOFFSET], 2 * SIZE
  9782. ;;
  9783. LDFPD f78, f79 = [AOFFSET]
  9784. adds AOFFSET = 4 * SIZE, AOFFSET
  9785. ;;
  9786. LDFPD f90, f91 = [AOFFSET], 2 * SIZE
  9787. ;;
  9788. LDFPD f92, f93 = [AOFFSET], 2 * SIZE
  9789. ;;
  9790. LDFPD f94, f95 = [AOFFSET]
  9791. adds AOFFSET = 6 * SIZE, AOFFSET
  9792. ;;
  9793. LDFPD f108, f109 = [AOFFSET], 2 * SIZE
  9794. ;;
  9795. LDFPD f110, f111 = [AOFFSET]
  9796. adds AOFFSET = 8 * SIZE, AOFFSET
  9797. ;;
  9798. LDFPD f126, f127 = [AOFFSET]
  9799. adds AOFFSET = - 30 * SIZE, AOFFSET
  9800. ;;
  9801. FMPY f32 = f72, f64
  9802. FMPY f33 = f73, f64
  9803. ;;
  9804. FMA_C f64 = f73, f65, f32
  9805. FMA_D f65 = f72, f65, f33
  9806. ;;
  9807. FNMA f80 = f74, f64, f80
  9808. FMA_A f81 = f75, f64, f81
  9809. FNMA f96 = f76, f64, f96
  9810. FMA_A f97 = f77, f64, f97
  9811. FNMA f112 = f78, f64, f112
  9812. FMA_A f113 = f79, f64, f113
  9813. ;;
  9814. FMA_B f80 = f75, f65, f80
  9815. FNMA f81 = f74, f65, f81
  9816. FMA_B f96 = f77, f65, f96
  9817. FNMA f97 = f76, f65, f97
  9818. FMA_B f112 = f79, f65, f112
  9819. FNMA f113 = f78, f65, f113
  9820. ;;
  9821. FMPY f32 = f90, f80
  9822. FMPY f33 = f91, f80
  9823. ;;
  9824. FMA_C f80 = f91, f81, f32
  9825. FMA_D f81 = f90, f81, f33
  9826. ;;
  9827. FNMA f96 = f92, f80, f96
  9828. FMA_A f97 = f93, f80, f97
  9829. FNMA f112 = f94, f80, f112
  9830. FMA_A f113 = f95, f80, f113
  9831. ;;
  9832. FMA_B f96 = f93, f81, f96
  9833. FNMA f97 = f92, f81, f97
  9834. FMA_B f112 = f95, f81, f112
  9835. FNMA f113 = f94, f81, f113
  9836. ;;
  9837. FMPY f32 = f108, f96
  9838. FMPY f33 = f109, f96
  9839. ;;
  9840. FMA_C f96 = f109, f97, f32
  9841. FMA_D f97 = f108, f97, f33
  9842. ;;
  9843. FNMA f112 = f110, f96, f112
  9844. FMA_A f113 = f111, f96, f113
  9845. ;;
  9846. FMA_B f112 = f111, f97, f112
  9847. FNMA f113 = f110, f97, f113
  9848. ;;
  9849. FMPY f32 = f126, f112
  9850. FMPY f33 = f127, f112
  9851. ;;
  9852. FMA_C f112 = f127, f113, f32
  9853. FMA_D f113 = f126, f113, f33
  9854. ;;
  9855. #endif
  9856. #ifdef RN
  9857. LDFPD f72, f73 = [BOFFSET]
  9858. ;;
  9859. FMPY f32 = f72, f64
  9860. FMPY f33 = f73, f64
  9861. FMPY f34 = f72, f80
  9862. FMPY f35 = f73, f80
  9863. FMPY f36 = f72, f96
  9864. FMPY f37 = f73, f96
  9865. FMPY f38 = f72, f112
  9866. FMPY f39 = f73, f112
  9867. ;;
  9868. FMA_C f64 = f73, f65, f32
  9869. FMA_D f65 = f72, f65, f33
  9870. FMA_C f80 = f73, f81, f34
  9871. FMA_D f81 = f72, f81, f35
  9872. FMA_C f96 = f73, f97, f36
  9873. FMA_D f97 = f72, f97, f37
  9874. FMA_C f112 = f73, f113, f38
  9875. FMA_D f113 = f72, f113, f39
  9876. ;;
  9877. #endif
  9878. #ifdef RT
  9879. LDFPD f72, f73 = [BOFFSET]
  9880. ;;
  9881. FMPY f32 = f72, f64
  9882. FMPY f33 = f73, f64
  9883. FMPY f34 = f72, f80
  9884. FMPY f35 = f73, f80
  9885. FMPY f36 = f72, f96
  9886. FMPY f37 = f73, f96
  9887. FMPY f38 = f72, f112
  9888. FMPY f39 = f73, f112
  9889. ;;
  9890. FMA_C f64 = f73, f65, f32
  9891. FMA_D f65 = f72, f65, f33
  9892. FMA_C f80 = f73, f81, f34
  9893. FMA_D f81 = f72, f81, f35
  9894. FMA_C f96 = f73, f97, f36
  9895. FMA_D f97 = f72, f97, f37
  9896. FMA_C f112 = f73, f113, f38
  9897. FMA_D f113 = f72, f113, f39
  9898. ;;
  9899. #endif
  9900. #if defined(LN) || defined(LT)
  9901. adds BOFFSET2 = 4 * SIZE, BOFFSET
  9902. ;;
  9903. STFD [BOFFSET] = f64, SIZE
  9904. STFD [BOFFSET2] = f96, SIZE
  9905. ;;
  9906. STFD [BOFFSET] = f65, SIZE
  9907. STFD [BOFFSET2] = f97, SIZE
  9908. ;;
  9909. STFD [BOFFSET] = f80, SIZE
  9910. STFD [BOFFSET2] = f112, SIZE
  9911. ;;
  9912. STFD [BOFFSET] = f81, 5 * SIZE
  9913. STFD [BOFFSET2] = f113, 5 * SIZE
  9914. ;;
  9915. adds BOFFSET = - 8 * SIZE, BOFFSET
  9916. ;;
  9917. #else
  9918. adds AOFFSET2 = 4 * SIZE, AOFFSET
  9919. ;;
  9920. STFD [AOFFSET] = f64, SIZE
  9921. STFD [AOFFSET2] = f96, SIZE
  9922. ;;
  9923. STFD [AOFFSET] = f65, SIZE
  9924. STFD [AOFFSET2] = f97, SIZE
  9925. ;;
  9926. STFD [AOFFSET] = f80, SIZE
  9927. STFD [AOFFSET2] = f112, SIZE
  9928. ;;
  9929. STFD [AOFFSET] = f81, 5 * SIZE
  9930. STFD [AOFFSET2] = f113, 5 * SIZE
  9931. ;;
  9932. adds AOFFSET = - 8 * SIZE, AOFFSET
  9933. ;;
  9934. #endif
  9935. #ifdef LN
  9936. adds C1 = -8 * SIZE, C1
  9937. adds C5 = -8 * SIZE, C5
  9938. #endif
  9939. ;;
  9940. STFD [C1 ] = f64, SIZE
  9941. STFD [C5 ] = f96, SIZE
  9942. ;;
  9943. STFD [C1 ] = f65, SIZE
  9944. STFD [C5 ] = f97, SIZE
  9945. ;;
  9946. STFD [C1 ] = f80, SIZE
  9947. STFD [C5 ] = f112, SIZE
  9948. ;;
  9949. STFD [C1 ] = f81, 5 * SIZE
  9950. STFD [C5 ] = f113, 5 * SIZE
  9951. ;;
  9952. mov f64 = f0
  9953. mov f65 = f0
  9954. mov f80 = f0
  9955. mov f81 = f0
  9956. mov f96 = f0
  9957. mov f97 = f0
  9958. mov f112 = f0
  9959. mov f113 = f0
  9960. ;;
  9961. #ifdef LN
  9962. adds C1 = -8 * SIZE, C1
  9963. adds C5 = -8 * SIZE, C5
  9964. #endif
  9965. ;;
  9966. cmp.ne p6, p0 = 1, I
  9967. ;;
  9968. adds I = -1, I
  9969. ;;
  9970. shladd r2 = K, ZBASE_SHIFT, r0
  9971. ;;
  9972. sub L = K, KK
  9973. ;;
  9974. #ifdef RT
  9975. shladd AORIG = r2, 2, AORIG
  9976. #endif
  9977. ;;
  9978. #if defined(LT) || defined(RN)
  9979. shladd L = L, ZBASE_SHIFT, r0
  9980. ;;
  9981. shladd AOFFSET = L, 2, AOFFSET
  9982. add BOFFSET = L, BOFFSET
  9983. #endif
  9984. ;;
  9985. #ifdef LT
  9986. adds KK = 4, KK
  9987. #elif defined LN
  9988. adds KK = -4, KK
  9989. #else
  9990. nop __LINE__
  9991. #endif
  9992. ;;
  9993. #if defined(LT) || defined(RN)
  9994. mov L = KK
  9995. #else
  9996. sub L = K, KK
  9997. #endif
  9998. ;;
  9999. (p6) br.cond.dptk .L092
  10000. ;;
  10001. .align 16
  // .L100: setup for the section that runs when bit 1 of M is set
  // (presumably the two-row remainder of M; confirm against the earlier
  // part of the file). When the bit is clear, control goes to .L110.
  // Derives the loop trip count from L, positions AOFFSET/BOFFSET,
  // preloads the first A and B operands and clears f66/f67.
  10002. .L100:
  10003. { .mib
  10004. #if defined(LT) || defined(RN)
  10005. mov L = KK
  10006. #else
  10007. sub L = K, KK
  10008. #endif
  // p6 = (bit 1 of M == 0), p7 = complement; skip this section if p6.
  10009. tbit.z p6, p7 = M, 1
  10010. (p6) br.cond.dptk .L110
  10011. }
  10012. ;;
  10013. { .mmi
  // p7 is redefined here: true when L is non-zero; it guards the preloads.
  10014. cmp.ne p7, p0 = r0, L
  10015. adds BOFFSET = 0 * SIZE, B
  // r2 = K << (1 + ZBASE_SHIFT); only consumed by the LN variant below.
  10016. shl r2 = K, 1 + ZBASE_SHIFT
  10017. }
  10018. { .mmi
  // r3 = KK << ZBASE_SHIFT
  10019. shladd r3 = KK, ZBASE_SHIFT, r0
  10020. nop __LINE__
  10021. nop __LINE__
  10022. }
  10023. ;;
  10024. #if defined(LT) || defined(RN)
  // LT/RN: B is read from its start; AOFFSET is left as it is.
  10025. { .mfb
  10026. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10027. mov f66 = f0
  10028. nop __LINE__
  10029. }
  10030. { .mmf
  10031. nop __LINE__
  10032. nop __LINE__
  10033. mov f67 = f0
  10034. }
  10035. ;;
  10036. #else
  // LN/RT: BOFFSET = B + r3; LN also moves AORIG back by r2;
  // AOFFSET = AORIG + 2 * r3.
  10037. { .mfi
  10038. add BOFFSET = r3, B
  10039. mov f66 = f0
  10040. #ifdef LN
  10041. sub AORIG = AORIG, r2
  10042. #else
  10043. nop __LINE__
  10044. #endif
  10045. }
  10046. ;;
  10047. { .mfi
  10048. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10049. mov f67 = f0
  10050. shladd AOFFSET = r3, 1, AORIG
  10051. }
  10052. ;;
  10053. #endif
  10054. ;;
  // Trip count: the loop at .L102 consumes two k-steps per pass, so
  // ar.lc = ((L + 1) >> 1) - 1. p12 is set when (L + 1) is even, i.e. the
  // original L was odd; the loop uses it to drop the second half of the
  // final pass.
  10055. adds L = 1, L
  10056. ;;
  10057. { .mii
  10058. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10059. tbit.z p12, p0 = L, 0
  10060. shr L = L, 1
  10061. }
  10062. ;;
  10063. { .mmi
  10064. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  10065. nop __LINE__
  10066. adds L = -1, L
  10067. }
  10068. ;;
  10069. { .mmi
  10070. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  // p3 starts out true (r0 == r0).
  10071. cmp.eq p3, p0 = r0, r0
  10072. mov ar.lc = L
  10073. }
  10074. ;;
  // L == -1 means there is nothing to accumulate: go straight to .L108.
  10075. cmp.eq p6, p0 = -1, L
  10076. (p6) br.cond.dpnt .L108
  10077. ;;
  10078. .align 16
  // .L102: counted accumulation loop (br.cloop on ar.lc).
  // Each pass performs up to two k-steps:
  //   step 1 (always):   A in f32..f35, B in f48/f49
  //   step 2 (under p3): A in f40..f43, B in f56/f57
  // Products are summed into f64/f65/f80/f81 and f96/f97/f112/f113.
  // p4 = (L != 0) guards the reload of the step-1 operands for the next
  // pass. When p12 is set, p3 is recomputed as (L != 0), which disables
  // step 2 on the pass where L has reached zero.
  10079. .L102:
  10080. { .mfi
  10081. lfetch.nt1 [PREA], 8 * SIZE
  10082. FMA f64 = f32, f48, f64 // A1 * B1
  10083. cmp.ne p4, p5 = 0, L
  10084. }
  10085. { .mfi
  // PREB is recomputed from BOFFSET on every pass.
  10086. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  10087. FMA f80 = f32, f49, f80 // A1 * B2
  10088. (p12) cmp.ne p3, p0 = 0, L
  10089. }
  10090. ;;
  10091. { .mfb
  10092. lfetch.nt1 [PREB], 4 * SIZE
  10093. FMA f65 = f33, f48, f65 // A2 * B1
  10094. nop __LINE__
  10095. }
  10096. { .mfb
  10097. nop __LINE__
  10098. FMA f81 = f33, f49, f81 // A2 * B2
  10099. nop __LINE__
  10100. }
  10101. ;;
  10102. { .mfb
  // Step-2 operands are loaded while step 1 is still being computed.
  10103. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  10104. FMA f96 = f34, f48, f96 // A3 * B1
  10105. nop __LINE__
  10106. }
  10107. { .mfb
  10108. nop __LINE__
  10109. FMA f112 = f34, f49, f112 // A3 * B2
  10110. nop __LINE__
  10111. }
  10112. ;;
  10113. { .mfb
  10114. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  10115. FMA f97 = f35, f48, f97 // A4 * B1
  10116. nop __LINE__
  10117. }
  10118. { .mfb
  10119. nop __LINE__
  10120. FMA f113 = f35, f49, f113 // A4 * B2
  10121. nop __LINE__
  10122. }
  10123. ;;
  10124. { .mfb
  10125. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  10126. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  10127. nop __LINE__
  10128. }
  10129. { .mfb
  10130. nop __LINE__
  10131. (p3) FMA f80 = f40, f57, f80 // A1 * B2
  10132. nop __LINE__
  10133. }
  10134. ;;
  10135. { .mfb
  // Reload step-1 operands for the next pass (only when L != 0).
  10136. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10137. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  10138. nop __LINE__
  10139. }
  10140. { .mfb
  10141. nop __LINE__
  10142. (p3) FMA f81 = f41, f57, f81 // A2 * B2
  10143. nop __LINE__
  10144. }
  10145. ;;
  10146. { .mfb
  10147. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10148. (p3) FMA f96 = f42, f56, f96 // A3 * B1
  10149. nop __LINE__
  10150. }
  10151. { .mfb
  10152. nop __LINE__
  10153. (p3) FMA f112 = f42, f57, f112 // A3 * B2
  10154. nop __LINE__
  10155. }
  10156. ;;
  10157. { .mfi
  10158. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  10159. (p3) FMA f97 = f43, f56, f97 // A4 * B1
  // L is decremented in software alongside ar.lc so p3/p4 can test it.
  10160. adds L = -1, L
  10161. }
  10162. { .mfb
  10163. nop __LINE__
  10164. (p3) FMA f113 = f43, f57, f113 // A4 * B2
  10165. br.cloop.sptk.few .L102
  10166. }
  10167. ;;
  // Loop exit: fold the cross-term accumulators into the result pairs
  // (f64 with f81, f65 with f80, f96 with f113, f97 with f112).
  // FCALC_A / FCALC_B are macros defined outside this excerpt; presumably
  // they pick add or subtract to match the conjugation mode - confirm.
  10168. { .mfb
  10169. nop __LINE__
  10170. FCALC_A f64 = f64, f81
  10171. nop __LINE__
  10172. }
  10173. { .mfb
  10174. nop __LINE__
  10175. FCALC_B f65 = f65, f80
  10176. nop __LINE__
  10177. }
  10178. { .mfb
  10179. nop __LINE__
  10180. FCALC_A f96 = f96, f113
  10181. nop __LINE__
  10182. }
  10183. { .mfb
  10184. nop __LINE__
  10185. FCALC_B f97 = f97, f112
  10186. nop __LINE__
  10187. }
  10188. ;;
  10189. .L108:
  // .L108: solve and write-back for the values accumulated by .L102.
  // Works on two register pairs, (f64,f65) and (f96,f97). Steps:
  //   1. LN/RT only: re-derive AOFFSET/BOFFSET from AORIG/B and KK.
  //   2. Load the stored values and combine them with the accumulated sums.
  //   3. Apply the variant-specific coefficients (LN, LT, RN or RT).
  //   4. Store the results to the packed buffer and to C1.
  //   5. Clear the accumulators and advance pointers/KK for the next section.
  // FSUB_A, FNMA, FMA_A..FMA_D are macros defined outside this excerpt.
  10190. #if defined(LN) || defined(RT)
  // r2 = (KK - 2) for LN or (KK - 1) for RT, then shifted by ZBASE_SHIFT.
  10191. #ifdef LN
  10192. adds r2 = -2, KK
  10193. #else
  10194. adds r2 = -1, KK
  10195. #endif
  10196. ;;
  10197. shladd r2 = r2, ZBASE_SHIFT, r0
  10198. ;;
  // AOFFSET = AORIG + 2 * r2, BOFFSET = B + r2
  10199. shladd AOFFSET = r2, 1, AORIG
  10200. add BOFFSET = r2, B
  10201. ;;
  10202. #endif
  10203. #if defined(LN) || defined(LT)
  // LN/LT: the four input values come from BOFFSET; the pointer is put
  // back to where it started (+2 * SIZE post-increment, then -2 * SIZE).
  10204. LDFPD f72, f73 = [BOFFSET], 2 * SIZE
  10205. ;;
  10206. LDFPD f88, f89 = [BOFFSET]
  10207. adds BOFFSET = -2 * SIZE, BOFFSET
  10208. ;;
  10209. FSUB f64 = f72, f64
  10210. FSUB_A f65 = f73, f65
  10211. FSUB f96 = f88, f96
  10212. FSUB_A f97 = f89, f97
  10213. ;;
  10214. #else
  // RN/RT: the four input values come from AOFFSET instead.
  10215. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  10216. ;;
  10217. LDFPD f88, f89 = [AOFFSET]
  10218. adds AOFFSET = -2 * SIZE, AOFFSET
  10219. ;;
  10220. FSUB f64 = f72, f64
  10221. FSUB f65 = f73, f65
  10222. FSUB f96 = f88, f96
  10223. FSUB f97 = f89, f97
  10224. ;;
  10225. #endif
  10226. #ifdef LN
  // LN: coefficient pairs are read at AOFFSET + 6, + 4 and + 0 (in SIZE
  // units); AOFFSET ends at its starting value. (f96,f97) is scaled by
  // (f104,f105) first, then its contribution is removed from (f64,f65)
  // using (f106,f107), and finally (f64,f65) is scaled by (f120,f121).
  10227. adds AOFFSET = 6 * SIZE, AOFFSET
  10228. ;;
  10229. LDFPD f104, f105 = [AOFFSET]
  10230. adds AOFFSET = - 2 * SIZE, AOFFSET
  10231. ;;
  10232. LDFPD f106, f107 = [AOFFSET]
  10233. adds AOFFSET = - 4 * SIZE, AOFFSET
  10234. ;;
  10235. LDFPD f120, f121 = [AOFFSET]
  10236. ;;
  10237. FMPY f32 = f104, f96
  10238. FMPY f33 = f105, f96
  10239. ;;
  10240. FMA_C f96 = f105, f97, f32
  10241. FMA_D f97 = f104, f97, f33
  10242. ;;
  10243. FNMA f64 = f106, f96, f64
  10244. FMA_A f65 = f107, f96, f65
  10245. ;;
  10246. FMA_B f64 = f107, f97, f64
  10247. FNMA f65 = f106, f97, f65
  10248. ;;
  10249. FMPY f32 = f120, f64
  10250. FMPY f33 = f121, f64
  10251. ;;
  10252. FMA_C f64 = f121, f65, f32
  10253. FMA_D f65 = f120, f65, f33
  10254. ;;
  10255. #endif
  10256. #ifdef LT
  // LT: coefficient pairs are read at AOFFSET + 0, + 2 and + 6 (in SIZE
  // units); AOFFSET ends at its starting value. The order mirrors LN:
  // (f64,f65) is scaled first, then removed from (f96,f97), which is
  // scaled last.
  10257. LDFPD f72, f73 = [AOFFSET], 2 * SIZE
  10258. ;;
  10259. LDFPD f74, f75 = [AOFFSET]
  10260. adds AOFFSET = 4 * SIZE, AOFFSET
  10261. ;;
  10262. LDFPD f90, f91 = [AOFFSET]
  10263. adds AOFFSET = - 6 * SIZE, AOFFSET
  10264. ;;
  10265. FMPY f32 = f72, f64
  10266. FMPY f33 = f73, f64
  10267. ;;
  10268. FMA_C f64 = f73, f65, f32
  10269. FMA_D f65 = f72, f65, f33
  10270. ;;
  10271. FNMA f96 = f74, f64, f96
  10272. FMA_A f97 = f75, f64, f97
  10273. ;;
  10274. FMA_B f96 = f75, f65, f96
  10275. FNMA f97 = f74, f65, f97
  10276. ;;
  10277. FMPY f32 = f90, f96
  10278. FMPY f33 = f91, f96
  10279. ;;
  10280. FMA_C f96 = f91, f97, f32
  10281. FMA_D f97 = f90, f97, f33
  10282. ;;
  10283. #endif
  10284. #ifdef RN
  // RN: both pairs are scaled by the single coefficient pair at [BOFFSET].
  10285. LDFPD f72, f73 = [BOFFSET]
  10286. ;;
  10287. FMPY f32 = f72, f64
  10288. FMPY f33 = f73, f64
  10289. FMPY f36 = f72, f96
  10290. FMPY f37 = f73, f96
  10291. ;;
  10292. FMA_C f64 = f73, f65, f32
  10293. FMA_D f65 = f72, f65, f33
  10294. FMA_C f96 = f73, f97, f36
  10295. FMA_D f97 = f72, f97, f37
  10296. ;;
  10297. #endif
  10298. #ifdef RT
  // RT: same instruction sequence as the RN case above.
  10299. LDFPD f72, f73 = [BOFFSET]
  10300. ;;
  10301. FMPY f32 = f72, f64
  10302. FMPY f33 = f73, f64
  10303. FMPY f36 = f72, f96
  10304. FMPY f37 = f73, f96
  10305. ;;
  10306. FMA_C f64 = f73, f65, f32
  10307. FMA_D f65 = f72, f65, f33
  10308. FMA_C f96 = f73, f97, f36
  10309. FMA_D f97 = f72, f97, f37
  10310. ;;
  10311. #endif
  10312. #if defined(LN) || defined(LT)
  // LN/LT: write the four results back through BOFFSET, then rewind it.
  10313. STFD [BOFFSET] = f64, SIZE
  10314. ;;
  10315. STFD [BOFFSET] = f65, SIZE
  10316. ;;
  10317. STFD [BOFFSET] = f96, SIZE
  10318. ;;
  10319. STFD [BOFFSET] = f97, SIZE
  10320. ;;
  10321. adds BOFFSET = - 4 * SIZE, BOFFSET
  10322. ;;
  10323. #else
  // RN/RT: write the four results back through AOFFSET, then rewind it.
  // NOTE(review): AOFFSET2 is computed here but never read in this block.
  10324. adds AOFFSET2 = 4 * SIZE, AOFFSET
  10325. ;;
  10326. STFD [AOFFSET] = f64, SIZE
  10327. ;;
  10328. STFD [AOFFSET] = f65, SIZE
  10329. ;;
  10330. STFD [AOFFSET] = f96, SIZE
  10331. ;;
  10332. STFD [AOFFSET] = f97, SIZE
  10333. ;;
  10334. adds AOFFSET = - 4 * SIZE, AOFFSET
  10335. ;;
  10336. #endif
  10337. #ifdef LN
  // LN: step C1 back by 4 * SIZE before the post-incrementing stores.
  // NOTE(review): C5 is adjusted too, but nothing in this block stores
  // through C5.
  10338. adds C1 = -4 * SIZE, C1
  10339. adds C5 = -4 * SIZE, C5
  10340. #endif
  10341. ;;
  10342. STFD [C1 ] = f64, SIZE
  10343. ;;
  10344. STFD [C1 ] = f65, SIZE
  10345. ;;
  10346. STFD [C1 ] = f96, SIZE
  10347. ;;
  10348. STFD [C1 ] = f97, SIZE
  10349. ;;
  // Clear the accumulators for the next section.
  10350. mov f64 = f0
  10351. mov f65 = f0
  10352. mov f80 = f0
  10353. mov f81 = f0
  10354. mov f96 = f0
  10355. mov f97 = f0
  10356. mov f112 = f0
  10357. mov f113 = f0
  10358. ;;
  10359. #ifdef LN
  // LN: undo the post-increments so C1 ends 4 * SIZE below its entry value.
  10360. adds C1 = -4 * SIZE, C1
  10361. adds C5 = -4 * SIZE, C5
  10362. #endif
  10363. ;;
  // NOTE(review): p6 set here is not consumed by any branch before .L110
  // overwrites it.
  10364. cmp.ne p6, p0 = 1, I
  10365. ;;
  10366. adds I = -1, I
  10367. ;;
  // r2 = K << ZBASE_SHIFT
  10368. shladd r2 = K, ZBASE_SHIFT, r0
  10369. ;;
  10370. sub L = K, KK
  10371. ;;
  10372. #ifdef RT
  // RT: AORIG += 2 * r2
  10373. shladd AORIG = r2, 1, AORIG
  10374. #endif
  10375. ;;
  10376. #if defined(LT) || defined(RN)
  // LT/RN: advance by the remaining (K - KK) steps:
  // AOFFSET += 2 * L, BOFFSET += L, with L = (K - KK) << ZBASE_SHIFT.
  10377. shladd L = L, ZBASE_SHIFT, r0
  10378. ;;
  10379. shladd AOFFSET = L, 1, AOFFSET
  10380. add BOFFSET = L, BOFFSET
  10381. #endif
  10382. ;;
  // KK moves by 2 for the left-side variants (up for LT, down for LN).
  10383. #ifdef LT
  10384. adds KK = 2, KK
  10385. #elif defined LN
  10386. adds KK = -2, KK
  10387. #else
  10388. nop __LINE__
  10389. #endif
  10390. ;;
  // L is recomputed here; .L110 computes the same value again on entry.
  10391. #if defined(LT) || defined(RN)
  10392. mov L = KK
  10393. #else
  10394. sub L = K, KK
  10395. #endif
  10396. ;;
  10397. .align 16
  // .L110: setup for the section that runs when bit 0 of M is set
  // (presumably the final single row of M; confirm against the earlier
  // part of the file). When the bit is clear, control goes to .L119.
  // Same structure as .L100, but only one A pair (f32,f33) is preloaded.
  10398. .L110:
  10399. { .mib
  10400. #if defined(LT) || defined(RN)
  10401. mov L = KK
  10402. #else
  10403. sub L = K, KK
  10404. #endif
  // p6 = (bit 0 of M == 0), p7 = complement; skip this section if p6.
  10405. tbit.z p6, p7 = M, 0
  10406. (p6) br.cond.dptk .L119
  10407. }
  10408. ;;
  10409. { .mmi
  // p7 is redefined here: true when L is non-zero; it guards the preloads.
  10410. cmp.ne p7, p0 = r0, L
  10411. adds BOFFSET = 0 * SIZE, B
  // r2 = K << ZBASE_SHIFT; only consumed by the LN variant below.
  10412. shl r2 = K, ZBASE_SHIFT
  10413. }
  10414. { .mmi
  // r3 = KK << ZBASE_SHIFT
  10415. shladd r3 = KK, ZBASE_SHIFT, r0
  10416. nop __LINE__
  10417. nop __LINE__
  10418. }
  10419. ;;
  10420. #if defined(LT) || defined(RN)
  // LT/RN: B is read from its start; AOFFSET is left as it is.
  10421. { .mfb
  10422. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10423. mov f66 = f0
  10424. nop __LINE__
  10425. }
  10426. { .mmf
  10427. nop __LINE__
  10428. nop __LINE__
  10429. mov f67 = f0
  10430. }
  10431. ;;
  10432. #else
  // LN/RT: BOFFSET = B + r3; LN also moves AORIG back by r2;
  // AOFFSET = AORIG + r3.
  10433. { .mfi
  10434. add BOFFSET = r3, B
  10435. mov f66 = f0
  10436. #ifdef LN
  10437. sub AORIG = AORIG, r2
  10438. #else
  10439. nop __LINE__
  10440. #endif
  10441. }
  10442. ;;
  10443. { .mfi
  10444. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10445. mov f67 = f0
  10446. add AOFFSET = r3, AORIG
  10447. }
  10448. ;;
  10449. #endif
  10450. ;;
  // Trip count: ar.lc = ((L + 1) >> 1) - 1 because .L112 consumes two
  // k-steps per pass. p12 is set when (L + 1) is even, i.e. the original
  // L was odd.
  10451. adds L = 1, L
  10452. ;;
  10453. { .mii
  10454. nop __LINE__
  10455. tbit.z p12, p0 = L, 0
  10456. shr L = L, 1
  10457. }
  10458. ;;
  10459. { .mmi
  10460. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  // p3 starts out true (r0 == r0).
  10461. cmp.eq p3, p0 = r0, r0
  10462. adds L = -1, L
  10463. }
  10464. ;;
  10465. { .mmi
  10466. adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET
  10467. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  10468. mov ar.lc = L
  10469. }
  10470. ;;
  // L == -1 means there is nothing to accumulate: go straight to .L118.
  10471. cmp.eq p6, p0 = -1, L
  10472. (p6) br.cond.dpnt .L118
  10473. ;;
  10474. .align 16
  // .L112: counted accumulation loop (br.cloop on ar.lc).
  // Each pass performs up to two k-steps:
  //   step 1 (always):   A in f32/f33, B in f48/f49
  //   step 2 (under p3): A in f40/f41, B in f56/f57
  // Products are summed into f64, f65, f80 and f81.
  // p4 = (L != 0) guards the reload of the step-1 operands for the next
  // pass. When p12 is set, p3 is recomputed as (L != 0), which disables
  // step 2 on the pass where L has reached zero.
  10475. .L112:
  10476. { .mfi
  10477. lfetch.nt1 [PREA], 4 * SIZE
  10478. FMA f64 = f32, f48, f64 // A1 * B1
  10479. cmp.ne p4, p5 = 0, L
  10480. }
  10481. { .mfi
  10482. lfetch.nt1 [PREB], 4 * SIZE
  10483. FMA f80 = f32, f49, f80 // A1 * B2
  10484. (p12) cmp.ne p3, p0 = 0, L
  10485. }
  10486. ;;
  10487. { .mmf
  // Step-2 operands are loaded while step 1 is still being computed.
  10488. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  10489. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  10490. FMA f65 = f33, f48, f65 // A2 * B1
  10491. }
  10492. { .mmf
  10493. nop __LINE__
  10494. nop __LINE__
  10495. FMA f81 = f33, f49, f81 // A2 * B2
  10496. }
  10497. ;;
  10498. { .mfb
  // Reload step-1 operands for the next pass (only when L != 0).
  10499. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10500. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  10501. nop __LINE__
  10502. }
  10503. { .mfb
  10504. nop __LINE__
  10505. (p3) FMA f80 = f40, f57, f80 // A1 * B2
  10506. nop __LINE__
  10507. }
  10508. ;;
  10509. { .mfi
  10510. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  10511. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  // L is decremented in software alongside ar.lc so p3/p4 can test it.
  10512. adds L = -1, L
  10513. }
  // NOTE(review): this .mfb bundle lists only two instructions, unlike the
  // matching bundle in .L102, which spells out the M-slot nop. Presumably
  // the assembler pads the missing slot - confirm.
  10514. { .mfb
  10515. (p3) FMA f81 = f41, f57, f81 // A2 * B2
  10516. br.cloop.sptk.few .L112
  10517. }
  10518. ;;
  // Loop exit: fold the cross-term accumulators into the result pair
  // (f64 with f81, f65 with f80). FCALC_A / FCALC_B are macros defined
  // outside this excerpt.
  10519. { .mfb
  10520. nop __LINE__
  10521. FCALC_A f64 = f64, f81
  10522. nop __LINE__
  10523. }
  10524. { .mfb
  10525. nop __LINE__
  10526. FCALC_B f65 = f65, f80
  10527. nop __LINE__
  10528. }
  10529. ;;
  10530. .L118:
  // .L118: solve and write-back for the single pair (f64,f65) accumulated
  // by .L112. Steps:
  //   1. LN/RT only: re-derive AOFFSET/BOFFSET from AORIG/B and KK.
  //   2. Load the stored pair and combine it with the accumulated sums.
  //   3. Scale by the variant-specific coefficient pair.
  //   4. Store the result to the packed buffer and to C1.
  //   5. Clear the accumulators and advance pointers/KK.
  // FSUB_A, FMA_C and FMA_D are macros defined outside this excerpt.
  10531. #if defined(LN) || defined(RT)
  // NOTE(review): both arms of this #ifdef compute the same r2 = KK - 1.
  10532. #ifdef LN
  10533. adds r2 = -1, KK
  10534. #else
  10535. adds r2 = -1, KK
  10536. #endif
  10537. ;;
  10538. shladd r2 = r2, ZBASE_SHIFT, r0
  10539. ;;
  // AOFFSET = AORIG + r2, BOFFSET = B + r2
  10540. add AOFFSET = r2, AORIG
  10541. add BOFFSET = r2, B
  10542. ;;
  10543. #endif
  10544. #if defined(LN) || defined(LT)
  // LN/LT: the input pair comes from BOFFSET.
  10545. LDFPD f72, f73 = [BOFFSET]
  10546. ;;
  10547. FSUB f64 = f72, f64
  10548. FSUB_A f65 = f73, f65
  10549. ;;
  10550. #else
  // RN/RT: the input pair comes from AOFFSET.
  10551. LDFPD f72, f73 = [AOFFSET]
  10552. ;;
  10553. FSUB f64 = f72, f64
  10554. FSUB f65 = f73, f65
  10555. ;;
  10556. #endif
  10557. #ifdef LN
  // LN: scale (f64,f65) by the coefficient pair at [AOFFSET].
  10558. LDFPD f120, f121 = [AOFFSET]
  10559. ;;
  10560. FMPY f32 = f120, f64
  10561. FMPY f33 = f121, f64
  10562. ;;
  10563. FMA_C f64 = f121, f65, f32
  10564. FMA_D f65 = f120, f65, f33
  10565. ;;
  10566. #endif
  10567. #ifdef LT
  // LT: scale (f64,f65) by the coefficient pair at [AOFFSET].
  10568. LDFPD f72, f73 = [AOFFSET]
  10569. ;;
  10570. FMPY f32 = f72, f64
  10571. FMPY f33 = f73, f64
  10572. ;;
  10573. FMA_C f64 = f73, f65, f32
  10574. FMA_D f65 = f72, f65, f33
  10575. ;;
  10576. #endif
  10577. #ifdef RN
  // RN: scale (f64,f65) by the coefficient pair at [BOFFSET].
  10578. LDFPD f72, f73 = [BOFFSET]
  10579. ;;
  10580. FMPY f32 = f72, f64
  10581. FMPY f33 = f73, f64
  10582. ;;
  10583. FMA_C f64 = f73, f65, f32
  10584. FMA_D f65 = f72, f65, f33
  10585. ;;
  10586. #endif
  10587. #ifdef RT
  // RT: same instruction sequence as the RN case above.
  10588. LDFPD f72, f73 = [BOFFSET]
  10589. ;;
  10590. FMPY f32 = f72, f64
  10591. FMPY f33 = f73, f64
  10592. ;;
  10593. FMA_C f64 = f73, f65, f32
  10594. FMA_D f65 = f72, f65, f33
  10595. ;;
  10596. #endif
  10597. #if defined(LN) || defined(LT)
  // LN/LT: write the result back through BOFFSET, then rewind it.
  10598. STFD [BOFFSET] = f64, SIZE
  10599. ;;
  10600. STFD [BOFFSET] = f65, SIZE
  10601. ;;
  10602. adds BOFFSET = - 2 * SIZE, BOFFSET
  10603. ;;
  10604. #else
  // RN/RT: write the result back through AOFFSET, then rewind it.
  10605. STFD [AOFFSET] = f64, SIZE
  10606. ;;
  10607. STFD [AOFFSET] = f65, SIZE
  10608. ;;
  10609. adds AOFFSET = - 2 * SIZE, AOFFSET
  10610. ;;
  10611. #endif
  10612. #ifdef LN
  // LN: step C1 back by 2 * SIZE before the post-incrementing stores.
  10613. adds C1 = -2 * SIZE, C1
  10614. #endif
  10615. ;;
  10616. STFD [C1 ] = f64, SIZE
  10617. ;;
  10618. STFD [C1 ] = f65, SIZE
  10619. ;;
  // Clear the accumulators used by .L112.
  10620. mov f64 = f0
  10621. mov f65 = f0
  10622. mov f80 = f0
  10623. mov f81 = f0
  10624. ;;
  10625. #ifdef LN
  // LN: undo the post-increments so C1 ends 2 * SIZE below its entry value.
  10626. adds C1 = -2 * SIZE, C1
  10627. #endif
  10628. ;;
  // NOTE(review): p6 set here is not consumed by any branch in the rest of
  // this excerpt.
  10629. cmp.ne p6, p0 = 1, I
  10630. ;;
  10631. adds I = -1, I
  10632. ;;
  // r2 = K << ZBASE_SHIFT
  10633. shladd r2 = K, ZBASE_SHIFT, r0
  10634. ;;
  10635. sub L = K, KK
  10636. ;;
  10637. #ifdef RT
  // RT: AORIG += r2
  10638. add AORIG = r2, AORIG
  10639. #endif
  10640. ;;
  10641. #if defined(LT) || defined(RN)
  // LT/RN: advance AOFFSET and BOFFSET by (K - KK) << ZBASE_SHIFT.
  // .L119 then copies BOFFSET into B.
  10642. shladd L = L, ZBASE_SHIFT, r0
  10643. ;;
  10644. add AOFFSET = L, AOFFSET
  10645. add BOFFSET = L, BOFFSET
  10646. #endif
  10647. ;;
  // KK moves by 1 for the left-side variants (up for LT, down for LN).
  10648. #ifdef LT
  10649. adds KK = 1, KK
  10650. #elif defined LN
  10651. adds KK = -1, KK
  10652. #else
  10653. nop __LINE__
  10654. #endif
  10655. ;;
  10656. #if defined(LT) || defined(RN)
  10657. mov L = KK
  10658. #else
  10659. sub L = K, KK
  10660. #endif
  10661. .align 16
  // .L119: bookkeeping after the M sections above. Updates B and KK
  // according to the build variant and resets AOFFSET to A.
  10662. .L119:
  10663. #ifdef LN
  // LN: B += K << ZBASE_SHIFT
  10664. shladd KK8 = K, ZBASE_SHIFT, r0
  10665. ;;
  10666. add B = KK8, B
  10667. #endif
  10668. #if defined(LT) || defined(RN)
  // LT/RN: B takes over the position BOFFSET was advanced to.
  10669. mov B = BOFFSET
  10670. #endif
  // KK moves by 1 for the right-side variants (up for RN, down for RT).
  10671. #ifdef RN
  10672. adds KK = 1, KK
  10673. #endif
  10674. #ifdef RT
  10675. adds KK = -1, KK
  10676. #endif
  10677. ;;
  // NOTE(review): this .mmi bundle lists only two instructions; presumably
  // the assembler pads the remaining slot - confirm.
  10678. { .mmi
  10679. mov AOFFSET = A
  10680. nop __LINE__
  10681. }
  10682. ;;
  10683. .align 16
  // .L999: common exit. Restores the loop-count register from ARLC and the
  // predicate registers from PR (the mask operand is -1), restores ar.pfs
  // from ARPFS only in TRMMKERNEL builds, then returns through b0.
  // ARLC, PR and ARPFS are presumably saved at function entry, outside
  // this excerpt - confirm.
  10684. .L999:
  10685. { .mii
  10686. nop __LINE__
  10687. mov ar.lc = ARLC
  10688. mov pr = PR, -1
  10689. }
  10690. { .mib
  10691. nop __LINE__
  10692. #ifdef TRMMKERNEL
  10693. mov ar.pfs = ARPFS
  10694. #else
  10695. nop __LINE__
  10696. #endif
  10697. br.ret.sptk.many b0
  10698. }
  // EPILOGUE is a macro defined outside this excerpt.
  10699. EPILOGUE