You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN.S 235 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the constants and registers used by this kernel. */
  /* PREFETCHSIZE: prefetch distance; the code below sets                */
  /* PREA = AOFFSET + (PREFETCHSIZE + 8) * SIZE before each inner loop.   */
  /* The value depends on whether DOUBLE is defined.                     */
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE (16 * 8)
  42. #else
  43. #define PREFETCHSIZE (32 * 8)
  44. #endif
  /* CPREFETCHSIZE / CPREFETCH: not referenced in the visible part of    */
  /* this file; presumably the offset and lfetch variant used when       */
  /* prefetching C -- TODO confirm against the rest of the kernel.       */
  45. #define CPREFETCHSIZE -7
  46. #define CPREFETCH lfetch.excl.nt1
  /* Stacked input registers (the alloc in the prologue declares 8       */
  /* inputs, r32-r39). r35 is not given a name in this block.            */
  /* LDC is shifted left by BASE_SHIFT in the prologue before use.       */
  47. #define M r32
  48. #define N r33
  49. #define K r34
  50. #define A r36
  51. #define B r37
  52. #define C r38
  53. #define LDC r39
  /* Loop counters and working pointers. J is initialised to N >> 3 in   */
  /* the prologue; I and TEMP are not referenced in the visible part.    */
  /* AOFFSET / BOFFSET are the post-incremented load/store addresses     */
  /* into A and B. L holds the inner-loop trip count copied to ar.lc.    */
  54. #define I r15
  55. #define J r16
  56. #define AOFFSET r17
  57. #define BOFFSET r18
  58. #define TEMP r19
  59. #define L r20
  /* C1..C8: set in .L000 to C + (0..7) * LDC, one pointer per column of */
  /* the 8-column block being solved.                                    */
  60. #define C1 r21
  61. #define C2 r22
  62. #define C3 r23
  63. #define C4 r24
  64. #define C5 r25
  65. #define C6 r26
  66. #define C7 r27
  67. #define C8 r28
  /* C9..C16 live in loc0..loc7 (the loc* names are not defined in this  */
  /* file; presumably they come from common.h and map to the stacked     */
  /* local registers -- TODO confirm). Not referenced in the visible     */
  /* part of the kernel.                                                 */
  68. #define C9 loc0
  69. #define C10 loc1
  70. #define C11 loc2
  71. #define C12 loc3
  72. #define C13 loc4
  73. #define C14 loc5
  74. #define C15 loc6
  75. #define C16 loc7
  /* Prefetch address registers. PREA is derived from AOFFSET; PREB is   */
  /* used as an lfetch address in the inner loops; PREC is not           */
  /* referenced in the visible part. Note r8/r9 are also used as         */
  /* temporary spill addresses in the prologue.                          */
  76. #define PREA r8
  77. #define PREB r9
  78. #define PREC r10
  /* SP is the stack pointer. ARLC, PR and ARPFS receive copies of       */
  /* ar.lc, pr and ar.pfs in the prologue.                               */
  79. #define SP r12
  80. #define ARLC r29
  81. #define PR r30
  82. #define ARPFS r31
  /* ALPHA is not referenced in the visible part of this file.           */
  83. #define ALPHA f8
  /* Values kept in loc8..loc13:                                         */
  /*   AORIG    - copy of A taken at .L000, adjusted per block           */
  /*   KK       - set to M + OFFSET at .L000, stepped after each solve   */
  /*   KK8      - not referenced in the visible part                     */
  /*   OFFSET   - loaded from memory at (entry SP + 16) in the prologue  */
  /*   AOFFSET2 - AOFFSET + 4 * SIZE, second store stream                */
  /*   BOFFSET2 - BOFFSET + 4 * SIZE, second store stream                */
  84. #define AORIG loc8
  85. #define KK loc9
  86. #define KK8 loc10
  87. #define OFFSET loc11
  88. #define AOFFSET2 loc12
  89. #define BOFFSET2 loc13
  90. PROLOGUE
  91. .prologue
  92. PROFCODE
  93. { .mmi
  94. .save ar.pfs, ARPFS
  95. alloc ARPFS = ar.pfs, 8, 16, 0, 0
  96. adds r14 = 16, SP
  97. mov ARLC = ar.lc
  98. }
  99. { .mmi
  100. adds r8 = -6 * 16, SP
  101. adds r9 = -5 * 16, SP
  102. adds SP = -6 * 16, SP
  103. }
  104. ;;
  105. { .mmi
  106. setf.sig f32 = M
  107. setf.sig f33 = K
  108. mov PR = pr
  109. }
  110. ;;
  111. { .mmi
  112. stf.spill [r8] = f16, 32
  113. stf.spill [r9] = f17, 32
  114. shr J = N, 3
  115. }
  116. ;;
  117. { .mmi
  118. stf.spill [r8] = f18, 32
  119. stf.spill [r9] = f19, 32
  120. shladd LDC = LDC, BASE_SHIFT, r0
  121. }
  122. ;;
  123. { .mmi
  124. stf.spill [r8] = f20
  125. stf.spill [r9] = f21
  126. mov AOFFSET = A
  127. }
  128. ;;
  129. .body
  130. { .mmf
  131. ld8 OFFSET = [r14]
  132. cmp.ge p6, p0 = 0, J
  133. xmpy.l f32 = f32, f33
  134. }
  135. ;;
  136. { .mmi
  137. getf.sig r2 = f32
  138. shladd C = M, BASE_SHIFT, C
  139. nop __LINE__
  140. }
  141. ;;
  142. { .mmb
  143. shladd A = r2, BASE_SHIFT, A
  144. nop __LINE__
  145. (p6) br.cond.dpnt .L050
  146. }
  147. ;;
  148. .align 8
  149. .L000:
  150. { .mmf
  151. mov C1 = C
  152. add KK = M, OFFSET
  153. }
  154. { .mmi
  155. mov AORIG = A
  156. add C2 = LDC, C
  157. shladd C3 = LDC, 1, C
  158. }
  159. ;;
  160. { .mmf
  161. shladd C5 = LDC, 2, C
  162. shladd C = LDC, 3, C
  163. }
  164. { .mmf
  165. shladd C4 = LDC, 1, C2
  166. shladd C6 = LDC, 2, C2
  167. }
  168. ;;
  169. { .mfi
  170. shladd C7 = LDC, 2, C3
  171. shladd C8 = LDC, 2, C4
  172. }
  173. ;;
  174. ;;
  175. mov f64 = f0
  176. mov f72 = f0
  177. mov f80 = f0
  178. mov f88 = f0
  179. mov f96 = f0
  180. mov f104 = f0
  181. mov f112 = f0
  182. mov f120 = f0
  183. .L040:
  184. { .mib
  185. sub L = K, KK
  186. tbit.z p6, p0 = M, 0
  187. (p6) br.cond.dptk .L030
  188. }
  189. ;;
  190. { .mmi
  191. cmp.ne p7, p0 = r0, L
  192. adds BOFFSET = 0 * SIZE, B
  193. shl r2 = K, 0 + BASE_SHIFT
  194. }
  195. { .mmi
  196. shladd r3 = KK, BASE_SHIFT, r0
  197. nop __LINE__
  198. nop __LINE__
  199. }
  200. ;;
  201. { .mfi
  202. shladd BOFFSET = r3, 3, B
  203. sub AORIG = AORIG, r2
  204. }
  205. ;;
  206. { .mfi
  207. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  208. add AOFFSET = r3, AORIG
  209. }
  210. ;;
  211. { .mmi
  212. adds L = 1, L
  213. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  214. cmp.eq p3, p0 = r0, r0
  215. }
  216. ;;
  217. { .mii
  218. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  219. tbit.z p12, p0 = L, 0
  220. shr L = L, 1
  221. }
  222. ;;
  223. { .mmi
  224. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  225. adds L = -1, L
  226. }
  227. ;;
  228. { .mmi
  229. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  230. cmp.eq p6, p0 = -1, L
  231. }
  232. ;;
  233. { .mib
  234. (p7) LDFD f32 = [AOFFSET], 1 * SIZE
  235. mov ar.lc = L
  236. (p6) br.cond.dpnt .L048
  237. }
  238. ;;
  239. .L042:
  240. { .mfb
  241. lfetch.nt1 [PREB], 16 * SIZE
  242. FMA f64 = f32, f48, f64 // A1 * B1
  243. nop __LINE__
  244. }
  245. { .mfb
  246. (p12) cmp.ne p3, p0 = 0, L
  247. FMA f72 = f32, f49, f72 // A1 * B2
  248. nop __LINE__
  249. }
  250. ;;
  251. { .mfi
  252. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  253. FMA f80 = f32, f50, f80 // A1 * B3
  254. cmp.ne p4, p5 = 0, L
  255. }
  256. { .mfb
  257. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  258. FMA f88 = f32, f51, f88 // A1 * B4
  259. nop __LINE__
  260. }
  261. ;;
  262. { .mfb
  263. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  264. FMA f96 = f32, f52, f96 // A1 * B5
  265. nop __LINE__
  266. }
  267. { .mfb
  268. nop __LINE__
  269. FMA f104 = f32, f53, f104 // A1 * B6
  270. nop __LINE__
  271. }
  272. ;;
  273. { .mfb
  274. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  275. FMA f112 = f32, f54, f112 // A1 * B7
  276. nop __LINE__
  277. }
  278. { .mfb
  279. nop __LINE__
  280. FMA f120 = f32, f55, f120 // A1 * B8
  281. nop __LINE__
  282. }
  283. ;;
  284. { .mfb
  285. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  286. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  287. nop __LINE__
  288. }
  289. { .mfb
  290. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  291. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  292. nop __LINE__
  293. }
  294. ;;
  295. { .mfb
  296. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  297. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  298. nop __LINE__
  299. }
  300. { .mfb
  301. nop __LINE__
  302. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  303. nop __LINE__
  304. }
  305. ;;
  306. { .mfb
  307. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  308. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  309. nop __LINE__
  310. }
  311. { .mfb
  312. nop __LINE__
  313. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  314. nop __LINE__
  315. }
  316. ;;
  317. { .mfi
  318. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  319. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  320. adds L = -1, L
  321. }
  322. { .mmb
  323. nop __LINE__
  324. nop __LINE__
  325. nop __LINE__
  326. }
  327. ;;
  328. { .mfb
  329. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  330. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  331. nop __LINE__
  332. }
  333. { .mmb
  334. nop __LINE__
  335. nop __LINE__
  336. br.cloop.sptk.few .L042
  337. }
  338. ;;
  339. .L048:
  340. #if defined(LN) || defined(RT)
  341. #ifdef LN
  342. adds r2 = -1, KK
  343. #else
  344. adds r2 = -8, KK
  345. #endif
  346. ;;
  347. shladd r2 = r2, BASE_SHIFT, r0
  348. ;;
  349. add AOFFSET = r2, AORIG
  350. shladd BOFFSET = r2, 3, B
  351. ;;
  352. #endif
  353. adds AOFFSET2 = 4 * SIZE, AOFFSET
  354. adds BOFFSET2 = 4 * SIZE, BOFFSET
  355. ;;
  356. #if defined(LN) || defined(LT)
  357. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  358. ;;
  359. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  360. ;;
  361. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  362. ;;
  363. LDFPD f38, f39 = [BOFFSET]
  364. adds BOFFSET = -6 * SIZE, BOFFSET
  365. ;;
  366. { .mfi
  367. FSUB f64 = f32, f64
  368. nop __LINE__
  369. }
  370. { .mfi
  371. nop __LINE__
  372. FSUB f72 = f33, f72
  373. nop __LINE__
  374. }
  375. ;;
  376. { .mfi
  377. FSUB f80 = f34, f80
  378. nop __LINE__
  379. }
  380. { .mfi
  381. nop __LINE__
  382. FSUB f88 = f35, f88
  383. nop __LINE__
  384. }
  385. ;;
  386. { .mfi
  387. FSUB f96 = f36, f96
  388. nop __LINE__
  389. }
  390. { .mfi
  391. nop __LINE__
  392. FSUB f104 = f37, f104
  393. nop __LINE__
  394. }
  395. ;;
  396. { .mfi
  397. FSUB f112 = f38, f112
  398. nop __LINE__
  399. }
  400. { .mfi
  401. nop __LINE__
  402. FSUB f120 = f39, f120
  403. nop __LINE__
  404. }
  405. ;;
  406. #else
  407. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  408. ;;
  409. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  410. ;;
  411. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  412. ;;
  413. LDFPD f38, f39 = [AOFFSET]
  414. adds AOFFSET = -6 * SIZE, AOFFSET
  415. ;;
  416. FSUB f64 = f32, f64
  417. FSUB f72 = f33, f72
  418. FSUB f80 = f34, f80
  419. FSUB f88 = f35, f88
  420. FSUB f96 = f36, f96
  421. FSUB f104 = f37, f104
  422. FSUB f112 = f38, f112
  423. FSUB f120 = f39, f120
  424. ;;
  425. #endif
  426. #ifdef LN
  427. LDFD f32 = [AOFFSET]
  428. ;;
  429. FMPY f64 = f64, f32
  430. FMPY f96 = f96, f32
  431. FMPY f72 = f72, f32
  432. FMPY f104 = f104, f32
  433. FMPY f80 = f80, f32
  434. FMPY f112 = f112, f32
  435. FMPY f88 = f88, f32
  436. FMPY f120 = f120, f32
  437. ;;
  438. { .mmi
  439. STFD [BOFFSET] = f64, SIZE
  440. STFD [BOFFSET2] = f96, SIZE
  441. adds C1 = -1 * SIZE, C1
  442. }
  443. ;;
  444. { .mmi
  445. STFD [BOFFSET] = f72, SIZE
  446. STFD [BOFFSET2] = f104, SIZE
  447. adds C2 = -1 * SIZE, C2
  448. }
  449. ;;
  450. { .mmi
  451. STFD [BOFFSET] = f80, SIZE
  452. STFD [BOFFSET2] = f112, SIZE
  453. nop __LINE__
  454. }
  455. ;;
  456. { .mmi
  457. STFD [BOFFSET] = f88, - 3 * SIZE
  458. STFD [BOFFSET2] = f120, - 3 * SIZE
  459. }
  460. ;;
  461. adds C3 = -1 * SIZE, C3
  462. adds C4 = -1 * SIZE, C4
  463. adds C5 = -1 * SIZE, C5
  464. adds C6 = -1 * SIZE, C6
  465. adds C7 = -1 * SIZE, C7
  466. adds C8 = -1 * SIZE, C8
  467. ;;
  468. #endif
  469. #ifdef LT
  470. LDFD f32 = [AOFFSET]
  471. ;;
  472. { .mfi
  473. FMPY f64 = f64, f32
  474. nop __LINE__
  475. }
  476. { .mfi
  477. nop __LINE__
  478. FMPY f96 = f96, f32
  479. nop __LINE__
  480. }
  481. ;;
  482. { .mfi
  483. FMPY f72 = f72, f32
  484. nop __LINE__
  485. }
  486. { .mfi
  487. nop __LINE__
  488. FMPY f104 = f104, f32
  489. nop __LINE__
  490. }
  491. ;;
  492. { .mfi
  493. FMPY f80 = f80, f32
  494. }
  495. { .mfi
  496. nop __LINE__
  497. FMPY f112 = f112, f32
  498. nop __LINE__
  499. }
  500. ;;
  501. { .mfi
  502. FMPY f88 = f88, f32
  503. nop __LINE__
  504. }
  505. { .mfi
  506. nop __LINE__
  507. FMPY f120 = f120, f32
  508. nop __LINE__
  509. }
  510. ;;
  511. { .mfi
  512. STFD [BOFFSET] = f64, SIZE
  513. }
  514. { .mfi
  515. STFD [BOFFSET2] = f96, SIZE
  516. }
  517. ;;
  518. { .mfi
  519. STFD [BOFFSET] = f72, SIZE
  520. }
  521. { .mfi
  522. STFD [BOFFSET2] = f104, SIZE
  523. }
  524. ;;
  525. { .mfi
  526. STFD [BOFFSET] = f80, SIZE
  527. }
  528. { .mfi
  529. STFD [BOFFSET2] = f112, SIZE
  530. }
  531. ;;
  532. { .mfi
  533. STFD [BOFFSET] = f88, -3 * SIZE
  534. }
  535. { .mfi
  536. STFD [BOFFSET2] = f120, -3 * SIZE
  537. }
  538. ;;
  539. #endif
  540. #ifdef RN
  541. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  542. ;;
  543. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  544. ;;
  545. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  546. ;;
  547. LDFPD f38, f39 = [BOFFSET]
  548. adds BOFFSET = 3 * SIZE, BOFFSET
  549. ;;
  550. LDFD f40 = [BOFFSET], 1 * SIZE
  551. ;;
  552. LDFPD f41, f42 = [BOFFSET], 2 * SIZE
  553. ;;
  554. LDFPD f43, f44 = [BOFFSET], 2 * SIZE
  555. ;;
  556. LDFPD f45, f46 = [BOFFSET]
  557. adds BOFFSET = 4 * SIZE, BOFFSET
  558. ;;
  559. LDFPD f47, f48 = [BOFFSET], 2 * SIZE
  560. ;;
  561. LDFPD f49, f50 = [BOFFSET], 2 * SIZE
  562. ;;
  563. LDFPD f51, f52 = [BOFFSET]
  564. adds BOFFSET = 5 * SIZE, BOFFSET
  565. ;;
  566. LDFD f53 = [BOFFSET], 1 * SIZE
  567. ;;
  568. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  569. ;;
  570. LDFPD f56, f57 = [BOFFSET]
  571. adds BOFFSET = 6 * SIZE, BOFFSET
  572. ;;
  573. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  574. ;;
  575. LDFPD f60, f61 = [BOFFSET]
  576. adds BOFFSET = 7 * SIZE, BOFFSET
  577. ;;
  578. LDFD f16 = [BOFFSET], 1 * SIZE
  579. ;;
  580. LDFPD f17, f18 = [BOFFSET]
  581. adds BOFFSET = 8 * SIZE, BOFFSET
  582. ;;
  583. LDFPD f19, f20 = [BOFFSET]
  584. adds BOFFSET = 9 * SIZE, BOFFSET
  585. ;;
  586. LDFD f21 = [BOFFSET]
  587. adds BOFFSET = -63 * SIZE, BOFFSET
  588. ;;
  589. FMPY f64 = f64, f32
  590. ;;
  591. FNMA f72 = f64, f33, f72
  592. ;;
  593. FNMA f80 = f64, f34, f80
  594. ;;
  595. FNMA f88 = f64, f35, f88
  596. ;;
  597. FNMA f96 = f64, f36, f96
  598. ;;
  599. FNMA f104 = f64, f37, f104
  600. ;;
  601. FNMA f112 = f64, f38, f112
  602. ;;
  603. FNMA f120 = f64, f39, f120
  604. ;;
  605. FMPY f72 = f72, f40
  606. ;;
  607. FNMA f80 = f72, f41, f80
  608. ;;
  609. FNMA f88 = f72, f42, f88
  610. ;;
  611. FNMA f96 = f72, f43, f96
  612. ;;
  613. FNMA f104 = f72, f44, f104
  614. ;;
  615. FNMA f112 = f72, f45, f112
  616. ;;
  617. FNMA f120 = f72, f46, f120
  618. ;;
  619. FMPY f80 = f80, f47
  620. ;;
  621. FNMA f88 = f80, f48, f88
  622. ;;
  623. FNMA f96 = f80, f49, f96
  624. ;;
  625. FNMA f104 = f80, f50, f104
  626. ;;
  627. FNMA f112 = f80, f51, f112
  628. ;;
  629. FNMA f120 = f80, f52, f120
  630. ;;
  631. FMPY f88 = f88, f53
  632. ;;
  633. FNMA f96 = f88, f54, f96
  634. ;;
  635. FNMA f104 = f88, f55, f104
  636. ;;
  637. FNMA f112 = f88, f56, f112
  638. ;;
  639. FNMA f120 = f88, f57, f120
  640. ;;
  641. FMPY f96 = f96, f58
  642. ;;
  643. FNMA f104 = f96, f59, f104
  644. ;;
  645. FNMA f112 = f96, f60, f112
  646. ;;
  647. FNMA f120 = f96, f61, f120
  648. ;;
  649. FMPY f104 = f104, f16
  650. ;;
  651. FNMA f112 = f104, f17, f112
  652. ;;
  653. FNMA f120 = f104, f18, f120
  654. ;;
  655. FMPY f112 = f112, f19
  656. ;;
  657. FNMA f120 = f112, f20, f120
  658. ;;
  659. FMPY f120 = f120, f21
  660. ;;
  661. STFD [AOFFSET] = f64, SIZE
  662. STFD [AOFFSET2] = f96, SIZE
  663. ;;
  664. STFD [AOFFSET] = f72, SIZE
  665. STFD [AOFFSET2] = f104, SIZE
  666. ;;
  667. STFD [AOFFSET] = f80, SIZE
  668. STFD [AOFFSET2] = f112, SIZE
  669. ;;
  670. STFD [AOFFSET] = f88, -3 * SIZE
  671. STFD [AOFFSET2] = f120, - 3 * SIZE
  672. ;;
  673. #endif
  674. #ifdef RT
  675. adds BOFFSET = 62 * SIZE, BOFFSET
  676. ;;
  677. LDFPD f33, f32 = [BOFFSET]
  678. adds BOFFSET = - 2 * SIZE, BOFFSET
  679. ;;
  680. LDFPD f35, f34 = [BOFFSET]
  681. adds BOFFSET = - 2 * SIZE, BOFFSET
  682. ;;
  683. LDFPD f37, f36 = [BOFFSET]
  684. adds BOFFSET = - 2 * SIZE, BOFFSET
  685. ;;
  686. LDFPD f39, f38 = [BOFFSET]
  687. adds BOFFSET = - 2 * SIZE, BOFFSET
  688. ;;
  689. LDFD f40 = [BOFFSET], -2 * SIZE
  690. ;;
  691. LDFPD f42, f41 = [BOFFSET]
  692. adds BOFFSET = - 2 * SIZE, BOFFSET
  693. ;;
  694. LDFPD f44, f43 = [BOFFSET]
  695. adds BOFFSET = - 2 * SIZE, BOFFSET
  696. ;;
  697. LDFPD f46, f45 = [BOFFSET]
  698. adds BOFFSET = - 4 * SIZE, BOFFSET
  699. ;;
  700. LDFPD f48, f47 = [BOFFSET]
  701. adds BOFFSET = - 2 * SIZE, BOFFSET
  702. ;;
  703. LDFPD f50, f49 = [BOFFSET]
  704. adds BOFFSET = - 2 * SIZE, BOFFSET
  705. ;;
  706. LDFPD f52, f51 = [BOFFSET]
  707. adds BOFFSET = - 4 * SIZE, BOFFSET
  708. ;;
  709. LDFD f53 = [BOFFSET], -2 * SIZE
  710. ;;
  711. LDFPD f55, f54 = [BOFFSET]
  712. adds BOFFSET = - 2 * SIZE, BOFFSET
  713. ;;
  714. LDFPD f57, f56 = [BOFFSET]
  715. adds BOFFSET = - 6 * SIZE, BOFFSET
  716. ;;
  717. LDFPD f59, f58 = [BOFFSET]
  718. adds BOFFSET = - 2 * SIZE, BOFFSET
  719. ;;
  720. LDFPD f61, f60 = [BOFFSET]
  721. adds BOFFSET = - 6 * SIZE, BOFFSET
  722. ;;
  723. LDFD f16 = [BOFFSET], -2 * SIZE
  724. ;;
  725. LDFPD f18, f17 = [BOFFSET]
  726. adds BOFFSET = - 8 * SIZE, BOFFSET
  727. ;;
  728. LDFPD f20, f19 = [BOFFSET]
  729. adds BOFFSET = - 8 * SIZE, BOFFSET
  730. ;;
  731. LDFD f21 = [BOFFSET]
  732. ;;
  733. FMPY f120 = f120, f32
  734. ;;
  735. FNMA f112 = f120, f33, f112
  736. ;;
  737. FNMA f104 = f120, f34, f104
  738. ;;
  739. FNMA f96 = f120, f35, f96
  740. ;;
  741. FNMA f88 = f120, f36, f88
  742. ;;
  743. FNMA f80 = f120, f37, f80
  744. ;;
  745. FNMA f72 = f120, f38, f72
  746. ;;
  747. FNMA f64 = f120, f39, f64
  748. ;;
  749. FMPY f112 = f112, f40
  750. ;;
  751. FNMA f104 = f112, f41, f104
  752. ;;
  753. FNMA f96 = f112, f42, f96
  754. ;;
  755. FNMA f88 = f112, f43, f88
  756. ;;
  757. FNMA f80 = f112, f44, f80
  758. ;;
  759. FNMA f72 = f112, f45, f72
  760. ;;
  761. FNMA f64 = f112, f46, f64
  762. ;;
  763. FMPY f104 = f104, f47
  764. ;;
  765. FNMA f96 = f104, f48, f96
  766. ;;
  767. FNMA f88 = f104, f49, f88
  768. ;;
  769. FNMA f80 = f104, f50, f80
  770. ;;
  771. FNMA f72 = f104, f51, f72
  772. ;;
  773. FNMA f64 = f104, f52, f64
  774. ;;
  775. FMPY f96 = f96, f53
  776. ;;
  777. FNMA f88 = f96, f54, f88
  778. ;;
  779. FNMA f80 = f96, f55, f80
  780. ;;
  781. FNMA f72 = f96, f56, f72
  782. ;;
  783. FNMA f64 = f96, f57, f64
  784. ;;
  785. FMPY f88 = f88, f58
  786. ;;
  787. FNMA f80 = f88, f59, f80
  788. ;;
  789. FNMA f72 = f88, f60, f72
  790. ;;
  791. FNMA f64 = f88, f61, f64
  792. ;;
  793. FMPY f80 = f80, f16
  794. ;;
  795. FNMA f72 = f80, f17, f72
  796. ;;
  797. FNMA f64 = f80, f18, f64
  798. ;;
  799. FMPY f72 = f72, f19
  800. ;;
  801. FNMA f64 = f72, f20, f64
  802. ;;
  803. FMPY f64 = f64, f21
  804. ;;
  805. STFD [AOFFSET] = f64, SIZE
  806. STFD [AOFFSET2] = f96, SIZE
  807. ;;
  808. STFD [AOFFSET] = f72, SIZE
  809. STFD [AOFFSET2] = f104, SIZE
  810. ;;
  811. STFD [AOFFSET] = f80, SIZE
  812. STFD [AOFFSET2] = f112, SIZE
  813. ;;
  814. STFD [AOFFSET] = f88, - 3 * SIZE
  815. STFD [AOFFSET2] = f120, - 3 * SIZE
  816. ;;
  817. #endif
  818. #ifndef LN
  819. STFD [C1 ] = f64, SIZE
  820. #else
  821. STFD [C1 ] = f64
  822. #endif
  823. #ifndef LN
  824. STFD [C2 ] = f72, SIZE
  825. #else
  826. STFD [C2 ] = f72
  827. #endif
  828. #ifndef LN
  829. STFD [C3 ] = f80, SIZE
  830. #else
  831. STFD [C3 ] = f80
  832. #endif
  833. #ifndef LN
  834. STFD [C4 ] = f88, SIZE
  835. #else
  836. STFD [C4 ] = f88
  837. #endif
  838. #ifndef LN
  839. STFD [C5 ] = f96, SIZE
  840. #else
  841. STFD [C5 ] = f96
  842. #endif
  843. #ifndef LN
  844. STFD [C6 ] = f104, SIZE
  845. #else
  846. STFD [C6 ] = f104
  847. #endif
  848. #ifndef LN
  849. STFD [C7 ] = f112, SIZE
  850. #else
  851. STFD [C7 ] = f112
  852. #endif
  853. #ifndef LN
  854. STFD [C8 ] = f120, SIZE
  855. #else
  856. STFD [C8 ] = f120
  857. #endif
  858. ;;
  859. mov f64 = f0
  860. mov f72 = f0
  861. mov f80 = f0
  862. mov f88 = f0
  863. mov f96 = f0
  864. mov f104 = f0
  865. mov f112 = f0
  866. mov f120 = f0
  867. ;;
  868. shladd r2 = K, BASE_SHIFT, r0
  869. ;;
  870. sub L = K, KK
  871. ;;
  872. #ifdef RT
  873. add AORIG = r2, AORIG
  874. #else
  875. nop __LINE__
  876. #endif
  877. ;;
  878. #if defined(LT) || defined(RN)
  879. shladd L = L, BASE_SHIFT, r0
  880. #else
  881. nop __LINE__
  882. #endif
  883. ;;
  884. #if defined(LT) || defined(RN)
  885. add AOFFSET = L, AOFFSET
  886. #else
  887. nop __LINE__
  888. #endif
  889. ;;
  890. #if defined(LT) || defined(RN)
  891. shladd BOFFSET = L, 3, BOFFSET
  892. #else
  893. nop __LINE__
  894. #endif
  895. ;;
  896. #ifdef LT
  897. adds KK = 1, KK
  898. #elif defined LN
  899. adds KK = -1, KK
  900. #else
  901. nop __LINE__
  902. #endif
  903. ;;
  904. #if defined(LT) || defined(RN)
  905. mov L = KK
  906. #else
  907. sub L = K, KK
  908. #endif
  909. ;;
  910. .align 8
  911. .L030:
  912. { .mib
  913. sub L = K, KK
  914. tbit.z p6, p0 = M, 1
  915. (p6) br.cond.dptk .L020
  916. }
  917. ;;
  918. ;;
  919. { .mmi
  920. cmp.ne p7, p0 = r0, L
  921. adds BOFFSET = 0 * SIZE, B
  922. shl r2 = K, 1 + BASE_SHIFT
  923. }
  924. { .mmi
  925. shladd r3 = KK, BASE_SHIFT, r0
  926. nop __LINE__
  927. nop __LINE__
  928. }
  929. ;;
  930. #if defined(LT) || defined(RN)
  931. { .mmf
  932. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  933. setf.d f73 = r0
  934. mov f65 = f0
  935. }
  936. ;;
  937. #else
  938. { .mfi
  939. shladd BOFFSET = r3, 3, B
  940. mov f65 = f0
  941. #ifdef LN
  942. sub AORIG = AORIG, r2
  943. #else
  944. nop __LINE__
  945. #endif
  946. }
  947. ;;
  948. { .mfi
  949. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  950. mov f73 = f0
  951. shladd AOFFSET = r3, 1, AORIG
  952. }
  953. ;;
  954. #endif
  955. { .mfi
  956. setf.d f105 = r0
  957. mov f81 = f0
  958. adds L = 1, L
  959. }
  960. { .mfi
  961. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  962. mov f89 = f0
  963. cmp.eq p3, p0 = r0, r0
  964. }
  965. ;;
  966. { .mfi
  967. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  968. mov f113 = f0
  969. tbit.z p12, p0 = L, 0
  970. }
  971. { .mfi
  972. setf.d f97 = r0
  973. mov f121 = f0
  974. shr L = L, 1
  975. }
  976. ;;
  977. { .mmf
  978. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  979. adds L = -1, L
  980. }
  981. ;;
  982. { .mmf
  983. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  984. cmp.eq p6, p0 = -1, L
  985. }
  986. ;;
  987. { .mib
  988. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  989. mov ar.lc = L
  990. (p6) br.cond.dpnt .L038
  991. }
  992. ;;
  993. .L032:
  994. { .mfb
  995. lfetch.nt1 [PREA], 4 * SIZE
  996. FMA f64 = f32, f48, f64 // A1 * B1
  997. nop __LINE__
  998. }
  999. { .mfi
  1000. nop __LINE__
  1001. FMA f72 = f32, f49, f72 // A1 * B2
  1002. (p12) cmp.ne p3, p0 = 0, L
  1003. }
  1004. ;;
  1005. { .mfi
  1006. lfetch.nt1 [PREB], 16 * SIZE
  1007. FMA f80 = f32, f50, f80 // A1 * B3
  1008. cmp.ne p4, p5 = 0, L
  1009. }
  1010. { .mfb
  1011. nop __LINE__
  1012. FMA f88 = f32, f51, f88 // A1 * B4
  1013. nop __LINE__
  1014. }
  1015. ;;
  1016. { .mfb
  1017. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  1018. FMA f96 = f32, f52, f96 // A1 * B5
  1019. nop __LINE__
  1020. }
  1021. { .mfb
  1022. nop __LINE__
  1023. FMA f104 = f32, f53, f104 // A1 * B6
  1024. nop __LINE__
  1025. }
  1026. ;;
  1027. { .mfb
  1028. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  1029. FMA f112 = f32, f54, f112 // A1 * B7
  1030. nop __LINE__
  1031. }
  1032. { .mfb
  1033. nop __LINE__
  1034. FMA f120 = f32, f55, f120 // A1 * B8
  1035. nop __LINE__
  1036. }
  1037. ;;
  1038. { .mfb
  1039. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  1040. FMA f65 = f33, f48, f65 // A2 * B1
  1041. nop __LINE__
  1042. }
  1043. { .mfb
  1044. nop __LINE__
  1045. FMA f73 = f33, f49, f73 // A2 * B2
  1046. nop __LINE__
  1047. }
  1048. ;;
  1049. { .mfb
  1050. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  1051. FMA f81 = f33, f50, f81 // A2 * B3
  1052. nop __LINE__
  1053. }
  1054. { .mfb
  1055. nop __LINE__
  1056. FMA f89 = f33, f51, f89 // A2 * B4
  1057. nop __LINE__
  1058. }
  1059. ;;
  1060. { .mfb
  1061. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  1062. FMA f97 = f33, f52, f97 // A2 * B5
  1063. nop __LINE__
  1064. }
  1065. { .mfb
  1066. nop __LINE__
  1067. FMA f105 = f33, f53, f105 // A2 * B6
  1068. nop __LINE__
  1069. }
  1070. ;;
  1071. { .mfb
  1072. nop __LINE__
  1073. FMA f113 = f33, f54, f113 // A2 * B7
  1074. nop __LINE__
  1075. }
  1076. { .mfb
  1077. nop __LINE__
  1078. FMA f121 = f33, f55, f121 // A2 * B8
  1079. nop __LINE__
  1080. }
  1081. ;;
  1082. { .mfb
  1083. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  1084. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  1085. nop __LINE__
  1086. }
  1087. { .mfb
  1088. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  1089. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  1090. nop __LINE__
  1091. }
  1092. ;;
  1093. { .mfb
  1094. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  1095. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  1096. nop __LINE__
  1097. }
  1098. { .mfb
  1099. nop __LINE__
  1100. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  1101. nop __LINE__
  1102. }
  1103. ;;
  1104. { .mfb
  1105. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  1106. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  1107. nop __LINE__
  1108. }
  1109. { .mfb
  1110. nop __LINE__
  1111. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  1112. nop __LINE__
  1113. }
  1114. ;;
  1115. { .mfb
  1116. nop __LINE__
  1117. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  1118. nop __LINE__
  1119. }
  1120. { .mfb
  1121. nop __LINE__
  1122. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  1123. nop __LINE__
  1124. }
  1125. ;;
  1126. { .mfb
  1127. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  1128. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  1129. nop __LINE__
  1130. }
  1131. { .mfb
  1132. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  1133. nop __LINE__
  1134. }
  1135. { .mfb
  1136. nop __LINE__
  1137. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  1138. nop __LINE__
  1139. }
  1140. { .mfb
  1141. nop __LINE__
  1142. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  1143. nop __LINE__
  1144. }
  1145. ;;
  1146. { .mfb
  1147. nop __LINE__
  1148. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  1149. nop __LINE__
  1150. }
  1151. { .mfb
  1152. nop __LINE__
  1153. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  1154. nop __LINE__
  1155. }
  1156. ;;
  1157. { .mfi
  1158. nop __LINE__
  1159. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  1160. adds L = -1, L
  1161. }
  1162. { .mfb
  1163. nop __LINE__
  1164. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  1165. br.cloop.sptk.few .L032
  1166. }
  1167. ;;
  1168. .L038:
  1169. #if defined(LN) || defined(RT)
  1170. #ifdef LN
  1171. adds r2 = -2, KK
  1172. #else
  1173. adds r2 = -8, KK
  1174. #endif
  1175. ;;
  1176. shladd r2 = r2, BASE_SHIFT, r0
  1177. ;;
  1178. shladd AOFFSET = r2, 1, AORIG
  1179. shladd BOFFSET = r2, 3, B
  1180. ;;
  1181. #endif
  1182. adds AOFFSET2 = 4 * SIZE, AOFFSET
  1183. adds BOFFSET2 = 4 * SIZE, BOFFSET
  1184. ;;
  1185. #if defined(LN) || defined(LT)
  1186. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  1187. ;;
  1188. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  1189. ;;
  1190. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  1191. ;;
  1192. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  1193. ;;
  1194. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  1195. ;;
  1196. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  1197. ;;
  1198. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  1199. ;;
  1200. LDFPD f46, f47 = [BOFFSET]
  1201. adds BOFFSET = -14 * SIZE, BOFFSET
  1202. ;;
  1203. { .mfi
  1204. FSUB f64 = f32, f64
  1205. nop __LINE__
  1206. }
  1207. { .mfi
  1208. nop __LINE__
  1209. FSUB f72 = f33, f72
  1210. nop __LINE__
  1211. }
  1212. ;;
  1213. { .mfi
  1214. FSUB f80 = f34, f80
  1215. nop __LINE__
  1216. }
  1217. { .mfi
  1218. nop __LINE__
  1219. FSUB f88 = f35, f88
  1220. nop __LINE__
  1221. }
  1222. ;;
  1223. { .mfi
  1224. FSUB f96 = f36, f96
  1225. nop __LINE__
  1226. }
  1227. { .mfi
  1228. nop __LINE__
  1229. FSUB f104 = f37, f104
  1230. nop __LINE__
  1231. }
  1232. ;;
  1233. { .mfi
  1234. FSUB f112 = f38, f112
  1235. nop __LINE__
  1236. }
  1237. { .mfi
  1238. nop __LINE__
  1239. FSUB f120 = f39, f120
  1240. nop __LINE__
  1241. }
  1242. ;;
  1243. { .mfi
  1244. FSUB f65 = f40, f65
  1245. nop __LINE__
  1246. }
  1247. { .mfi
  1248. nop __LINE__
  1249. FSUB f73 = f41, f73
  1250. nop __LINE__
  1251. }
  1252. ;;
  1253. { .mfi
  1254. FSUB f81 = f42, f81
  1255. nop __LINE__
  1256. }
  1257. { .mfi
  1258. nop __LINE__
  1259. FSUB f89 = f43, f89
  1260. nop __LINE__
  1261. }
  1262. ;;
  1263. { .mfi
  1264. FSUB f97 = f44, f97
  1265. nop __LINE__
  1266. }
  1267. { .mfi
  1268. nop __LINE__
  1269. FSUB f105 = f45, f105
  1270. nop __LINE__
  1271. }
  1272. ;;
  1273. { .mfi
  1274. FSUB f113 = f46, f113
  1275. }
  1276. { .mfi
  1277. nop __LINE__
  1278. FSUB f121 = f47, f121
  1279. nop __LINE__
  1280. }
  1281. ;;
  1282. #else
  1283. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  1284. ;;
  1285. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  1286. ;;
  1287. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  1288. ;;
  1289. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  1290. ;;
  1291. LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  1292. ;;
  1293. LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  1294. ;;
  1295. LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  1296. ;;
  1297. LDFPD f46, f47 = [AOFFSET]
  1298. adds AOFFSET = -14 * SIZE, AOFFSET
  1299. ;;
  1300. FSUB f64 = f32, f64
  1301. FSUB f65 = f33, f65
  1302. FSUB f72 = f34, f72
  1303. FSUB f73 = f35, f73
  1304. FSUB f80 = f36, f80
  1305. FSUB f81 = f37, f81
  1306. FSUB f88 = f38, f88
  1307. FSUB f89 = f39, f89
  1308. ;;
  1309. FSUB f96 = f40, f96
  1310. FSUB f97 = f41, f97
  1311. ;;
  1312. FSUB f104 = f42, f104
  1313. FSUB f105 = f43, f105
  1314. ;;
  1315. FSUB f112 = f44, f112
  1316. FSUB f113 = f45, f113
  1317. ;;
  1318. FSUB f120 = f46, f120
  1319. FSUB f121 = f47, f121
  1320. ;;
  1321. #endif
  1322. #ifdef LN
  1323. adds AOFFSET = 2 * SIZE, AOFFSET
  1324. ;;
  1325. LDFPD f33, f32 = [AOFFSET]
  1326. adds AOFFSET = - 2 * SIZE, AOFFSET
  1327. ;;
  1328. LDFD f34 = [AOFFSET]
  1329. ;;
  1330. FMPY f65 = f65, f32
  1331. FMPY f97 = f97, f32
  1332. FMPY f73 = f73, f32
  1333. FMPY f105 = f105, f32
  1334. FMPY f81 = f81, f32
  1335. FMPY f113 = f113, f32
  1336. FMPY f89 = f89, f32
  1337. FMPY f121 = f121, f32
  1338. ;;
  1339. FNMA f64 = f65, f33, f64
  1340. FNMA f96 = f97, f33, f96
  1341. FNMA f72 = f73, f33, f72
  1342. FNMA f104 = f105, f33, f104
  1343. FNMA f80 = f81, f33, f80
  1344. FNMA f112 = f113, f33, f112
  1345. FNMA f88 = f89, f33, f88
  1346. FNMA f120 = f121, f33, f120
  1347. ;;
  1348. FMPY f64 = f64, f34
  1349. FMPY f96 = f96, f34
  1350. FMPY f72 = f72, f34
  1351. FMPY f104 = f104, f34
  1352. FMPY f80 = f80, f34
  1353. FMPY f112 = f112, f34
  1354. FMPY f88 = f88, f34
  1355. FMPY f120 = f120, f34
  1356. ;;
  1357. adds BOFFSET = 8 * SIZE, BOFFSET
  1358. adds BOFFSET2 = 8 * SIZE, BOFFSET2
  1359. ;;
  1360. { .mfi
  1361. STFD [BOFFSET] = f65, SIZE
  1362. }
  1363. { .mfi
  1364. STFD [BOFFSET2] = f97, SIZE
  1365. }
  1366. ;;
  1367. { .mfi
  1368. STFD [BOFFSET] = f73, SIZE
  1369. }
  1370. { .mfi
  1371. STFD [BOFFSET2] = f105, SIZE
  1372. }
  1373. ;;
  1374. { .mfi
  1375. STFD [BOFFSET] = f81, SIZE
  1376. }
  1377. { .mfi
  1378. STFD [BOFFSET2] = f113, SIZE
  1379. }
  1380. ;;
  1381. { .mfi
  1382. STFD [BOFFSET] = f89, - 11 * SIZE
  1383. }
  1384. { .mfi
  1385. STFD [BOFFSET2] = f121, - 11 * SIZE
  1386. }
  1387. ;;
  1388. { .mmi
  1389. STFD [BOFFSET] = f64, SIZE
  1390. STFD [BOFFSET2] = f96, SIZE
  1391. adds C1 = -2 * SIZE, C1
  1392. }
  1393. ;;
  1394. { .mmi
  1395. STFD [BOFFSET] = f72, SIZE
  1396. STFD [BOFFSET2] = f104, SIZE
  1397. adds C2 = -2 * SIZE, C2
  1398. }
  1399. ;;
  1400. { .mmi
  1401. STFD [BOFFSET] = f80, SIZE
  1402. STFD [BOFFSET2] = f112, SIZE
  1403. nop __LINE__
  1404. }
  1405. ;;
  1406. { .mmi
  1407. STFD [BOFFSET] = f88, - 3 * SIZE
  1408. STFD [BOFFSET2] = f120, - 3 * SIZE
  1409. }
  1410. ;;
  1411. adds C3 = -2 * SIZE, C3
  1412. adds C4 = -2 * SIZE, C4
  1413. adds C5 = -2 * SIZE, C5
  1414. adds C6 = -2 * SIZE, C6
  1415. adds C7 = -2 * SIZE, C7
  1416. adds C8 = -2 * SIZE, C8
  1417. ;;
  1418. #endif
  1419. #ifdef LT
  1420. LDFPD f32, f33 = [AOFFSET]
  1421. adds AOFFSET = 3 * SIZE, AOFFSET
  1422. ;;
  1423. LDFD f34 = [AOFFSET], - 3 * SIZE
  1424. ;;
  1425. { .mfi
  1426. FMPY f64 = f64, f32
  1427. nop __LINE__
  1428. }
  1429. { .mfi
  1430. nop __LINE__
  1431. FMPY f96 = f96, f32
  1432. nop __LINE__
  1433. }
  1434. ;;
  1435. { .mfi
  1436. FMPY f72 = f72, f32
  1437. nop __LINE__
  1438. }
  1439. { .mfi
  1440. nop __LINE__
  1441. FMPY f104 = f104, f32
  1442. nop __LINE__
  1443. }
  1444. ;;
  1445. { .mfi
  1446. FMPY f80 = f80, f32
  1447. }
  1448. { .mfi
  1449. nop __LINE__
  1450. FMPY f112 = f112, f32
  1451. nop __LINE__
  1452. }
  1453. ;;
  1454. { .mfi
  1455. FMPY f88 = f88, f32
  1456. nop __LINE__
  1457. }
  1458. { .mfi
  1459. nop __LINE__
  1460. FMPY f120 = f120, f32
  1461. nop __LINE__
  1462. }
  1463. ;;
  1464. { .mfi
  1465. FNMA f65 = f64, f33, f65
  1466. nop __LINE__
  1467. }
  1468. { .mfi
  1469. nop __LINE__
  1470. FNMA f97 = f96, f33, f97
  1471. nop __LINE__
  1472. }
  1473. ;;
  1474. { .mfi
  1475. FNMA f73 = f72, f33, f73
  1476. nop __LINE__
  1477. }
  1478. { .mfi
  1479. nop __LINE__
  1480. FNMA f105 = f104, f33, f105
  1481. nop __LINE__
  1482. }
  1483. ;;
  1484. { .mfi
  1485. FNMA f81 = f80, f33, f81
  1486. }
  1487. { .mfi
  1488. nop __LINE__
  1489. FNMA f113 = f112, f33, f113
  1490. nop __LINE__
  1491. }
  1492. ;;
  1493. { .mfi
  1494. FNMA f89 = f88, f33, f89
  1495. nop __LINE__
  1496. }
  1497. { .mfi
  1498. nop __LINE__
  1499. FNMA f121 = f120, f33, f121
  1500. nop __LINE__
  1501. }
  1502. ;;
  1503. FMPY f65 = f65, f34
  1504. FMPY f97 = f97, f34
  1505. FMPY f73 = f73, f34
  1506. FMPY f105 = f105, f34
  1507. FMPY f81 = f81, f34
  1508. FMPY f113 = f113, f34
  1509. FMPY f89 = f89, f34
  1510. FMPY f121 = f121, f34
  1511. ;;
  1512. { .mfi
  1513. STFD [BOFFSET] = f64, SIZE
  1514. }
  1515. { .mfi
  1516. STFD [BOFFSET2] = f96, SIZE
  1517. }
  1518. ;;
  1519. { .mfi
  1520. STFD [BOFFSET] = f72, SIZE
  1521. }
  1522. { .mfi
  1523. STFD [BOFFSET2] = f104, SIZE
  1524. }
  1525. ;;
  1526. { .mfi
  1527. STFD [BOFFSET] = f80, SIZE
  1528. }
  1529. { .mfi
  1530. STFD [BOFFSET2] = f112, SIZE
  1531. }
  1532. ;;
  1533. { .mfi
  1534. STFD [BOFFSET] = f88, 5 * SIZE
  1535. }
  1536. { .mfi
  1537. STFD [BOFFSET2] = f120, 5 * SIZE
  1538. }
  1539. ;;
  1540. { .mfi
  1541. STFD [BOFFSET] = f65, SIZE
  1542. }
  1543. { .mfi
  1544. STFD [BOFFSET2] = f97, SIZE
  1545. }
  1546. ;;
  1547. { .mfi
  1548. STFD [BOFFSET] = f73, SIZE
  1549. }
  1550. { .mfi
  1551. STFD [BOFFSET2] = f105, SIZE
  1552. }
  1553. ;;
  1554. { .mfi
  1555. STFD [BOFFSET] = f81, SIZE
  1556. }
  1557. { .mfi
  1558. STFD [BOFFSET2] = f113, SIZE
  1559. }
  1560. ;;
  1561. { .mfi
  1562. STFD [BOFFSET] = f89, -11 * SIZE
  1563. }
  1564. { .mfi
  1565. STFD [BOFFSET2] = f121, -11 * SIZE
  1566. }
  1567. #endif
  1568. #ifdef RN
  1569. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  1570. ;;
  1571. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  1572. ;;
  1573. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  1574. ;;
  1575. LDFPD f38, f39 = [BOFFSET]
  1576. adds BOFFSET = 3 * SIZE, BOFFSET
  1577. ;;
  1578. LDFD f40 = [BOFFSET], 1 * SIZE
  1579. ;;
  1580. LDFPD f41, f42 = [BOFFSET], 2 * SIZE
  1581. ;;
  1582. LDFPD f43, f44 = [BOFFSET], 2 * SIZE
  1583. ;;
  1584. LDFPD f45, f46 = [BOFFSET]
  1585. adds BOFFSET = 4 * SIZE, BOFFSET
  1586. ;;
  1587. LDFPD f47, f48 = [BOFFSET], 2 * SIZE
  1588. ;;
  1589. LDFPD f49, f50 = [BOFFSET], 2 * SIZE
  1590. ;;
  1591. LDFPD f51, f52 = [BOFFSET]
  1592. adds BOFFSET = 5 * SIZE, BOFFSET
  1593. ;;
  1594. LDFD f53 = [BOFFSET], 1 * SIZE
  1595. ;;
  1596. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  1597. ;;
  1598. LDFPD f56, f57 = [BOFFSET]
  1599. adds BOFFSET = 6 * SIZE, BOFFSET
  1600. ;;
  1601. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  1602. ;;
  1603. LDFPD f60, f61 = [BOFFSET]
  1604. adds BOFFSET = 7 * SIZE, BOFFSET
  1605. ;;
  1606. LDFD f16 = [BOFFSET], 1 * SIZE
  1607. ;;
  1608. LDFPD f17, f18 = [BOFFSET]
  1609. adds BOFFSET = 8 * SIZE, BOFFSET
  1610. ;;
  1611. LDFPD f19, f20 = [BOFFSET]
  1612. adds BOFFSET = 9 * SIZE, BOFFSET
  1613. ;;
  1614. LDFD f21 = [BOFFSET]
  1615. adds BOFFSET = -63 * SIZE, BOFFSET
  1616. ;;
  1617. FMPY f64 = f64, f32
  1618. FMPY f65 = f65, f32
  1619. ;;
  1620. FNMA f72 = f64, f33, f72
  1621. FNMA f73 = f65, f33, f73
  1622. ;;
  1623. FNMA f80 = f64, f34, f80
  1624. FNMA f81 = f65, f34, f81
  1625. ;;
  1626. FNMA f88 = f64, f35, f88
  1627. FNMA f89 = f65, f35, f89
  1628. ;;
  1629. FNMA f96 = f64, f36, f96
  1630. FNMA f97 = f65, f36, f97
  1631. ;;
  1632. FNMA f104 = f64, f37, f104
  1633. FNMA f105 = f65, f37, f105
  1634. ;;
  1635. FNMA f112 = f64, f38, f112
  1636. FNMA f113 = f65, f38, f113
  1637. ;;
  1638. FNMA f120 = f64, f39, f120
  1639. FNMA f121 = f65, f39, f121
  1640. ;;
  1641. FMPY f72 = f72, f40
  1642. FMPY f73 = f73, f40
  1643. ;;
  1644. FNMA f80 = f72, f41, f80
  1645. FNMA f81 = f73, f41, f81
  1646. ;;
  1647. FNMA f88 = f72, f42, f88
  1648. FNMA f89 = f73, f42, f89
  1649. ;;
  1650. FNMA f96 = f72, f43, f96
  1651. FNMA f97 = f73, f43, f97
  1652. ;;
  1653. FNMA f104 = f72, f44, f104
  1654. FNMA f105 = f73, f44, f105
  1655. ;;
  1656. FNMA f112 = f72, f45, f112
  1657. FNMA f113 = f73, f45, f113
  1658. ;;
  1659. FNMA f120 = f72, f46, f120
  1660. FNMA f121 = f73, f46, f121
  1661. ;;
  1662. FMPY f80 = f80, f47
  1663. FMPY f81 = f81, f47
  1664. ;;
  1665. FNMA f88 = f80, f48, f88
  1666. FNMA f89 = f81, f48, f89
  1667. ;;
  1668. FNMA f96 = f80, f49, f96
  1669. FNMA f97 = f81, f49, f97
  1670. ;;
  1671. FNMA f104 = f80, f50, f104
  1672. FNMA f105 = f81, f50, f105
  1673. ;;
  1674. FNMA f112 = f80, f51, f112
  1675. FNMA f113 = f81, f51, f113
  1676. ;;
  1677. FNMA f120 = f80, f52, f120
  1678. FNMA f121 = f81, f52, f121
  1679. ;;
  1680. FMPY f88 = f88, f53
  1681. FMPY f89 = f89, f53
  1682. ;;
  1683. FNMA f96 = f88, f54, f96
  1684. FNMA f97 = f89, f54, f97
  1685. ;;
  1686. FNMA f104 = f88, f55, f104
  1687. FNMA f105 = f89, f55, f105
  1688. ;;
  1689. FNMA f112 = f88, f56, f112
  1690. FNMA f113 = f89, f56, f113
  1691. ;;
  1692. FNMA f120 = f88, f57, f120
  1693. FNMA f121 = f89, f57, f121
  1694. ;;
  1695. FMPY f96 = f96, f58
  1696. FMPY f97 = f97, f58
  1697. ;;
  1698. FNMA f104 = f96, f59, f104
  1699. FNMA f105 = f97, f59, f105
  1700. ;;
  1701. FNMA f112 = f96, f60, f112
  1702. FNMA f113 = f97, f60, f113
  1703. ;;
  1704. FNMA f120 = f96, f61, f120
  1705. FNMA f121 = f97, f61, f121
  1706. ;;
  1707. FMPY f104 = f104, f16
  1708. FMPY f105 = f105, f16
  1709. ;;
  1710. FNMA f112 = f104, f17, f112
  1711. FNMA f113 = f105, f17, f113
  1712. ;;
  1713. FNMA f120 = f104, f18, f120
  1714. FNMA f121 = f105, f18, f121
  1715. ;;
  1716. FMPY f112 = f112, f19
  1717. FMPY f113 = f113, f19
  1718. ;;
  1719. FNMA f120 = f112, f20, f120
  1720. FNMA f121 = f113, f20, f121
  1721. ;;
  1722. FMPY f120 = f120, f21
  1723. FMPY f121 = f121, f21
  1724. ;;
  1725. STFD [AOFFSET] = f64, SIZE
  1726. STFD [AOFFSET2] = f80, SIZE
  1727. ;;
  1728. STFD [AOFFSET] = f65, SIZE
  1729. STFD [AOFFSET2] = f81, SIZE
  1730. ;;
  1731. STFD [AOFFSET] = f72, SIZE
  1732. STFD [AOFFSET2] = f88, SIZE
  1733. ;;
  1734. STFD [AOFFSET] = f73, 5 * SIZE
  1735. STFD [AOFFSET2] = f89, 5 * SIZE
  1736. ;;
  1737. STFD [AOFFSET] = f96, SIZE
  1738. STFD [AOFFSET2] = f112, SIZE
  1739. ;;
  1740. STFD [AOFFSET] = f97, SIZE
  1741. STFD [AOFFSET2] = f113, SIZE
  1742. ;;
  1743. STFD [AOFFSET] = f104, SIZE
  1744. STFD [AOFFSET2] = f120, SIZE
  1745. ;;
  1746. STFD [AOFFSET] = f105, -11 * SIZE
  1747. STFD [AOFFSET2] = f121, - 11 * SIZE
  1748. ;;
  1749. #endif
  1750. #ifdef RT
  1751. adds BOFFSET = 62 * SIZE, BOFFSET
  1752. ;;
  1753. LDFPD f33, f32 = [BOFFSET]
  1754. adds BOFFSET = - 2 * SIZE, BOFFSET
  1755. ;;
  1756. LDFPD f35, f34 = [BOFFSET]
  1757. adds BOFFSET = - 2 * SIZE, BOFFSET
  1758. ;;
  1759. LDFPD f37, f36 = [BOFFSET]
  1760. adds BOFFSET = - 2 * SIZE, BOFFSET
  1761. ;;
  1762. LDFPD f39, f38 = [BOFFSET]
  1763. adds BOFFSET = - 2 * SIZE, BOFFSET
  1764. ;;
  1765. LDFD f40 = [BOFFSET], -2 * SIZE
  1766. ;;
  1767. LDFPD f42, f41 = [BOFFSET]
  1768. adds BOFFSET = - 2 * SIZE, BOFFSET
  1769. ;;
  1770. LDFPD f44, f43 = [BOFFSET]
  1771. adds BOFFSET = - 2 * SIZE, BOFFSET
  1772. ;;
  1773. LDFPD f46, f45 = [BOFFSET]
  1774. adds BOFFSET = - 4 * SIZE, BOFFSET
  1775. ;;
  1776. LDFPD f48, f47 = [BOFFSET]
  1777. adds BOFFSET = - 2 * SIZE, BOFFSET
  1778. ;;
  1779. LDFPD f50, f49 = [BOFFSET]
  1780. adds BOFFSET = - 2 * SIZE, BOFFSET
  1781. ;;
  1782. LDFPD f52, f51 = [BOFFSET]
  1783. adds BOFFSET = - 4 * SIZE, BOFFSET
  1784. ;;
  1785. LDFD f53 = [BOFFSET], -2 * SIZE
  1786. ;;
  1787. LDFPD f55, f54 = [BOFFSET]
  1788. adds BOFFSET = - 2 * SIZE, BOFFSET
  1789. ;;
  1790. LDFPD f57, f56 = [BOFFSET]
  1791. adds BOFFSET = - 6 * SIZE, BOFFSET
  1792. ;;
  1793. LDFPD f59, f58 = [BOFFSET]
  1794. adds BOFFSET = - 2 * SIZE, BOFFSET
  1795. ;;
  1796. LDFPD f61, f60 = [BOFFSET]
  1797. adds BOFFSET = - 6 * SIZE, BOFFSET
  1798. ;;
  1799. LDFD f16 = [BOFFSET], -2 * SIZE
  1800. ;;
  1801. LDFPD f18, f17 = [BOFFSET]
  1802. adds BOFFSET = - 8 * SIZE, BOFFSET
  1803. ;;
  1804. LDFPD f20, f19 = [BOFFSET]
  1805. adds BOFFSET = - 8 * SIZE, BOFFSET
  1806. ;;
  1807. LDFD f21 = [BOFFSET]
  1808. ;;
  1809. FMPY f120 = f120, f32
  1810. FMPY f121 = f121, f32
  1811. ;;
  1812. FNMA f112 = f120, f33, f112
  1813. FNMA f113 = f121, f33, f113
  1814. ;;
  1815. FNMA f104 = f120, f34, f104
  1816. FNMA f105 = f121, f34, f105
  1817. ;;
  1818. FNMA f96 = f120, f35, f96
  1819. FNMA f97 = f121, f35, f97
  1820. ;;
  1821. FNMA f88 = f120, f36, f88
  1822. FNMA f89 = f121, f36, f89
  1823. ;;
  1824. FNMA f80 = f120, f37, f80
  1825. FNMA f81 = f121, f37, f81
  1826. ;;
  1827. FNMA f72 = f120, f38, f72
  1828. FNMA f73 = f121, f38, f73
  1829. ;;
  1830. FNMA f64 = f120, f39, f64
  1831. FNMA f65 = f121, f39, f65
  1832. ;;
  1833. FMPY f112 = f112, f40
  1834. FMPY f113 = f113, f40
  1835. ;;
  1836. FNMA f104 = f112, f41, f104
  1837. FNMA f105 = f113, f41, f105
  1838. ;;
  1839. FNMA f96 = f112, f42, f96
  1840. FNMA f97 = f113, f42, f97
  1841. ;;
  1842. FNMA f88 = f112, f43, f88
  1843. FNMA f89 = f113, f43, f89
  1844. ;;
  1845. FNMA f80 = f112, f44, f80
  1846. FNMA f81 = f113, f44, f81
  1847. ;;
  1848. FNMA f72 = f112, f45, f72
  1849. FNMA f73 = f113, f45, f73
  1850. ;;
  1851. FNMA f64 = f112, f46, f64
  1852. FNMA f65 = f113, f46, f65
  1853. ;;
  1854. FMPY f104 = f104, f47
  1855. FMPY f105 = f105, f47
  1856. ;;
  1857. FNMA f96 = f104, f48, f96
  1858. FNMA f97 = f105, f48, f97
  1859. ;;
  1860. FNMA f88 = f104, f49, f88
  1861. FNMA f89 = f105, f49, f89
  1862. ;;
  1863. FNMA f80 = f104, f50, f80
  1864. FNMA f81 = f105, f50, f81
  1865. ;;
  1866. FNMA f72 = f104, f51, f72
  1867. FNMA f73 = f105, f51, f73
  1868. ;;
  1869. FNMA f64 = f104, f52, f64
  1870. FNMA f65 = f105, f52, f65
  1871. ;;
  1872. FMPY f96 = f96, f53
  1873. FMPY f97 = f97, f53
  1874. ;;
  1875. FNMA f88 = f96, f54, f88
  1876. FNMA f89 = f97, f54, f89
  1877. ;;
  1878. FNMA f80 = f96, f55, f80
  1879. FNMA f81 = f97, f55, f81
  1880. ;;
  1881. FNMA f72 = f96, f56, f72
  1882. FNMA f73 = f97, f56, f73
  1883. ;;
  1884. FNMA f64 = f96, f57, f64
  1885. FNMA f65 = f97, f57, f65
  1886. ;;
  1887. FMPY f88 = f88, f58
  1888. FMPY f89 = f89, f58
  1889. ;;
  1890. FNMA f80 = f88, f59, f80
  1891. FNMA f81 = f89, f59, f81
  1892. ;;
  1893. FNMA f72 = f88, f60, f72
  1894. FNMA f73 = f89, f60, f73
  1895. ;;
  1896. FNMA f64 = f88, f61, f64
  1897. FNMA f65 = f89, f61, f65
  1898. ;;
  1899. FMPY f80 = f80, f16
  1900. FMPY f81 = f81, f16
  1901. ;;
  1902. FNMA f72 = f80, f17, f72
  1903. FNMA f73 = f81, f17, f73
  1904. ;;
  1905. FNMA f64 = f80, f18, f64
  1906. FNMA f65 = f81, f18, f65
  1907. ;;
  1908. FMPY f72 = f72, f19
  1909. FMPY f73 = f73, f19
  1910. ;;
  1911. FNMA f64 = f72, f20, f64
  1912. FNMA f65 = f73, f20, f65
  1913. ;;
  1914. FMPY f64 = f64, f21
  1915. FMPY f65 = f65, f21
  1916. ;;
  1917. adds AOFFSET = 8 * SIZE, AOFFSET
  1918. adds AOFFSET2 = 8 * SIZE, AOFFSET2
  1919. ;;
  1920. STFD [AOFFSET] = f96, SIZE
  1921. STFD [AOFFSET2] = f112, SIZE
  1922. ;;
  1923. STFD [AOFFSET] = f97, SIZE
  1924. STFD [AOFFSET2] = f113, SIZE
  1925. ;;
  1926. STFD [AOFFSET] = f104, SIZE
  1927. STFD [AOFFSET2] = f120, SIZE
  1928. ;;
  1929. STFD [AOFFSET] = f105, - 11 * SIZE
  1930. STFD [AOFFSET2] = f121, - 11 * SIZE
  1931. ;;
  1932. STFD [AOFFSET] = f64, SIZE
  1933. STFD [AOFFSET2] = f80, SIZE
  1934. ;;
  1935. STFD [AOFFSET] = f65, SIZE
  1936. STFD [AOFFSET2] = f81, SIZE
  1937. ;;
  1938. STFD [AOFFSET] = f72, SIZE
  1939. STFD [AOFFSET2] = f88, SIZE
  1940. ;;
  1941. STFD [AOFFSET] = f73, - 3 * SIZE
  1942. STFD [AOFFSET2] = f89, - 3 * SIZE
  1943. ;;
  1944. #endif
  1945. STFD [C1 ] = f64, SIZE
  1946. mov f64 = f0
  1947. ;;
  1948. #ifndef LN
  1949. STFD [C1 ] = f65, SIZE
  1950. #else
  1951. STFD [C1 ] = f65, -SIZE
  1952. #endif
  1953. ;;
  1954. STFD [C2 ] = f72, SIZE
  1955. mov f72 = f0
  1956. ;;
  1957. #ifndef LN
  1958. STFD [C2 ] = f73, SIZE
  1959. #else
  1960. STFD [C2 ] = f73, -SIZE
  1961. #endif
  1962. ;;
  1963. STFD [C3 ] = f80, SIZE
  1964. mov f80 = f0
  1965. ;;
  1966. #ifndef LN
  1967. STFD [C3 ] = f81, SIZE
  1968. #else
  1969. STFD [C3 ] = f81, - SIZE
  1970. #endif
  1971. ;;
  1972. STFD [C4 ] = f88, SIZE
  1973. mov f88 = f0
  1974. ;;
  1975. #ifndef LN
  1976. STFD [C4 ] = f89, SIZE
  1977. #else
  1978. STFD [C4 ] = f89, -SIZE
  1979. #endif
  1980. ;;
  1981. STFD [C5 ] = f96, SIZE
  1982. mov f96 = f0
  1983. ;;
  1984. #ifndef LN
  1985. STFD [C5 ] = f97, SIZE
  1986. #else
  1987. STFD [C5 ] = f97, -SIZE
  1988. #endif
  1989. ;;
  1990. STFD [C6 ] = f104, SIZE
  1991. mov f104 = f0
  1992. ;;
  1993. #ifndef LN
  1994. STFD [C6 ] = f105, SIZE
  1995. #else
  1996. STFD [C6 ] = f105, -SIZE
  1997. #endif
  1998. ;;
  1999. shladd r2 = K, BASE_SHIFT, r0
  2000. ;;
  2001. sub L = K, KK
  2002. ;;
  2003. #ifdef RT
  2004. shladd AORIG = r2, 1, AORIG
  2005. #else
  2006. nop __LINE__
  2007. #endif
  2008. ;;
  2009. STFD [C7 ] = f112, SIZE
  2010. mov f112 = f0
  2011. ;;
  2012. { .mmi
  2013. #ifndef LN
  2014. STFD [C7 ] = f113, SIZE
  2015. #else
  2016. STFD [C7 ] = f113, -SIZE
  2017. #endif
  2018. #if defined(LT) || defined(RN)
  2019. shladd L = L, BASE_SHIFT, r0
  2020. #else
  2021. nop __LINE__
  2022. #endif
  2023. }
  2024. ;;
  2025. { .mmi
  2026. #if defined(LT) || defined(RN)
  2027. shladd AOFFSET = L, 1, AOFFSET
  2028. #else
  2029. nop __LINE__
  2030. #endif
  2031. }
  2032. ;;
  2033. { .mmi
  2034. #if defined(LT) || defined(RN)
  2035. shladd BOFFSET = L, 3, BOFFSET
  2036. #else
  2037. nop __LINE__
  2038. #endif
  2039. }
  2040. ;;
  2041. { .mmf
  2042. STFD [C8 ] = f120, SIZE
  2043. mov f120 = f0
  2044. }
  2045. ;;
  2046. { .mmi
  2047. #ifndef LN
  2048. STFD [C8 ] = f121, SIZE
  2049. #else
  2050. STFD [C8 ] = f121, -SIZE
  2051. #endif
  2052. #ifdef LT
  2053. adds KK = 2, KK
  2054. #elif defined LN
  2055. adds KK = -2, KK
  2056. #else
  2057. nop __LINE__
  2058. #endif
  2059. }
  2060. ;;
  2061. { .mmi
  2062. #if defined(LT) || defined(RN)
  2063. mov L = KK
  2064. #else
  2065. sub L = K, KK
  2066. #endif
  2067. }
  2068. ;;
  2069. .align 8
  // .L020: set-up for the tile whose inner loop (.L022) combines four A
  // operands (A1-A4) with eight B operands (B1-B8) per k step.
  // Work done here:
  //   - branch to .L010 when bit 2 of M is clear,
  //   - derive the loop trip count from K - KK,
  //   - point BOFFSET / AOFFSET at the operands and preload the first ones,
  //   - clear the accumulators f65-f67, f73-f75, ..., f121-f123.
  // NOTE(review): f64, f72, f80, f88, f96, f104, f112 and f120 are not
  // cleared in this block; presumably every path reaching .L020 leaves them
  // at zero - confirm against the callers.
  // NOTE(review): L is computed as K - KK for every variant here, although
  // the lines below still carry LT / RN conditionals - confirm that only the
  // LN build uses this file.
  2070. .L020:
  2071. { .mib
  2072. sub L = K, KK // L = K - KK
  2073. tbit.z p6, p0 = M, 2 // p6 = 1 when bit 2 of M is zero
  2074. (p6) br.cond.dptk .L010 // nothing to do for this tile size
  2075. }
  2076. ;;
  2077. ;;
  2078. { .mmi
  2079. cmp.ne p7, p0 = r0, L // p7 = (L != 0); gates every preload below
  2080. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  2081. shl r2 = K, 2 + BASE_SHIFT // r2 = K << (2 + BASE_SHIFT)
  2082. }
  2083. { .mmi
  2084. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  2085. nop __LINE__
  2086. nop __LINE__
  2087. }
  2088. ;;
  2089. #if defined(LT) || defined(RN)
  // LT / RN: BOFFSET stays at B; preload the first B pair and clear f73, f65.
  2090. { .mmf
  2091. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2092. setf.d f73 = r0 // r0 reads as zero, so f73 = +0.0
  2093. mov f65 = f0
  2094. }
  2095. ;;
  2096. #else
  // Other variants: operands start KK steps into the packed panels.
  2097. { .mfi
  2098. shladd BOFFSET = r3, 3, B // BOFFSET = B + 8 * r3
  2099. mov f65 = f0
  2100. #ifdef LN
  2101. sub AORIG = AORIG, r2 // LN only: move AORIG back by r2 bytes
  2102. #else
  2103. nop __LINE__
  2104. #endif
  2105. }
  2106. ;;
  2107. { .mfi
  2108. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2109. mov f73 = f0
  2110. shladd AOFFSET = r3, 2, AORIG // AOFFSET = AORIG + 4 * r3
  2111. }
  2112. ;;
  2113. #endif
  // From here on the integer slots turn L into the loop counter while the
  // memory and floating-point slots clear accumulators and preload operands.
  // Counter: L = ((K - KK + 1) >> 1) - 1, two k steps per loop pass.
  2114. { .mfi
  2115. setf.d f105 = r0
  2116. mov f81 = f0
  2117. adds L = 1, L // round up before halving
  2118. }
  2119. { .mfi
  2120. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET // prefetch pointer for A
  2121. mov f89 = f0
  2122. cmp.eq p3, p0 = r0, r0 // p3 = 1 (r0 == r0 always holds)
  2123. }
  2124. ;;
  2125. { .mfi
  2126. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2127. mov f113 = f0
  2128. tbit.z p12, p0 = L, 0 // p12 = 1 when K - KK + 1 is even
  2129. }
  2130. { .mfi
  2131. setf.d f97 = r0
  2132. mov f121 = f0
  2133. shr L = L, 1 // number of two-step passes
  2134. }
  2135. ;;
  2136. { .mmf
  2137. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2138. setf.d f66 = r0
  2139. mov f67 = f0
  2140. }
  2141. { .mfi
  2142. setf.d f74 = r0
  2143. mov f75 = f0
  2144. adds L = -1, L // ar.lc holds passes minus one
  2145. }
  2146. ;;
  2147. { .mmf
  2148. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2149. setf.d f82 = r0
  2150. mov f83 = f0
  2151. }
  2152. { .mfi
  2153. setf.d f90 = r0
  2154. mov f91 = f0
  2155. cmp.eq p6, p0 = -1, L // p6 = 1 when there are no passes at all
  2156. }
  2157. ;;
  2158. { .mmf
  2159. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // first A pair
  2160. setf.d f98 = r0
  2161. mov f99 = f0
  2162. }
  2163. { .mfi
  2164. setf.d f106 = r0
  2165. mov f107 = f0
  2166. mov ar.lc = L // loop counter used by br.cloop in .L022
  2167. }
  2168. ;;
  2169. { .mmf
  2170. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE // second A pair
  2171. setf.d f114 = r0
  2172. mov f115 = f0
  2173. }
  2174. { .mfb
  2175. setf.d f122 = r0
  2176. mov f123 = f0
  2177. (p6) br.cond.dpnt .L028 // empty product: skip the loop
  2178. }
  2179. ;;
  // .L022: inner product loop, two k steps per pass.
  //   step 1: A operands in f32-f35, B operands in f48-f55 (always executed)
  //   step 2: A operands in f40-f43, B operands in f56-f63 (predicated on p3)
  // Every FMA adds A(i) * B(j) into one of the 32 accumulators
  // f64-f67, f72-f75, f80-f83, f88-f91, f96-f99, f104-f107, f112-f115,
  // f120-f123. The loop is closed by br.cloop on ar.lc; L is decremented
  // alongside so that p4 / p5 can tell whether another pass follows.
  2180. .L022:
  2181. { .mfi
  2182. lfetch.nt1 [PREA], 8 * SIZE // prefetch A, then PREA += 8 * SIZE
  2183. FMA f64 = f32, f48, f64 // A1 * B1
  2184. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET // PREB follows the current BOFFSET
  2185. }
  2186. { .mfi
  2187. nop __LINE__
  2188. FMA f72 = f32, f49, f72 // A1 * B2
  2189. (p12) cmp.ne p3, p0 = 0, L // under p12: p3 = (L != 0), dropping step 2 when L == 0
  2190. }
  2191. ;;
  2192. { .mfi
  2193. lfetch.nt1 [PREB], 16 * SIZE // prefetch B, then PREB += 16 * SIZE
  2194. FMA f80 = f32, f50, f80 // A1 * B3
  2195. cmp.ne p4, p5 = 0, L // p4 = (L != 0), p5 = (L == 0)
  2196. }
  2197. { .mfb
  2198. nop __LINE__
  2199. FMA f88 = f32, f51, f88 // A1 * B4
  2200. nop __LINE__
  2201. }
  2202. ;;
  // The memory slots below load the step 2 operands (under p3) while the
  // integer slots, under p5, set C9-C16 to C1-C8 plus 2 * SIZE.
  2203. { .mfi
  2204. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  2205. FMA f96 = f32, f52, f96 // A1 * B5
  2206. (p5) adds C9 = 2 * SIZE, C1
  2207. }
  2208. { .mfi
  2209. nop __LINE__
  2210. FMA f104 = f32, f53, f104 // A1 * B6
  2211. (p5) adds C10 = 2 * SIZE, C2
  2212. }
  2213. ;;
  2214. { .mfi
  2215. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  2216. FMA f112 = f32, f54, f112 // A1 * B7
  2217. (p5) adds C11 = 2 * SIZE, C3
  2218. }
  2219. { .mfi
  2220. nop __LINE__
  2221. FMA f120 = f32, f55, f120 // A1 * B8
  2222. (p5) adds C12 = 2 * SIZE, C4
  2223. }
  2224. ;;
  2225. { .mfi
  2226. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2227. FMA f65 = f33, f48, f65 // A2 * B1
  2228. (p5) adds C13 = 2 * SIZE, C5
  2229. }
  2230. { .mfi
  2231. nop __LINE__
  2232. FMA f73 = f33, f49, f73 // A2 * B2
  2233. (p5) adds C14 = 2 * SIZE, C6
  2234. }
  2235. ;;
  2236. { .mfi
  2237. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2238. FMA f81 = f33, f50, f81 // A2 * B3
  2239. (p5) adds C15 = 2 * SIZE, C7
  2240. }
  2241. { .mfi
  2242. nop __LINE__
  2243. FMA f89 = f33, f51, f89 // A2 * B4
  2244. (p5) adds C16 = 2 * SIZE, C8
  2245. }
  2246. ;;
  2247. { .mfb
  2248. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  2249. FMA f97 = f33, f52, f97 // A2 * B5
  2250. nop __LINE__
  2251. }
  2252. { .mfb
  2253. nop __LINE__
  2254. FMA f105 = f33, f53, f105 // A2 * B6
  2255. nop __LINE__
  2256. }
  2257. ;;
  2258. { .mfb
  2259. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  2260. FMA f113 = f33, f54, f113 // A2 * B7
  2261. nop __LINE__
  2262. }
  2263. { .mfb
  2264. nop __LINE__
  2265. FMA f121 = f33, f55, f121 // A2 * B8
  2266. nop __LINE__
  2267. }
  2268. ;;
  2269. { .mfb
  2270. nop __LINE__
  2271. FMA f66 = f34, f48, f66 // A3 * B1
  2272. nop __LINE__
  2273. }
  2274. { .mfb
  2275. nop __LINE__
  2276. FMA f74 = f34, f49, f74 // A3 * B2
  2277. nop __LINE__
  2278. }
  2279. ;;
  2280. { .mfb
  2281. nop __LINE__
  2282. FMA f82 = f34, f50, f82 // A3 * B3
  2283. nop __LINE__
  2284. }
  2285. { .mfb
  2286. nop __LINE__
  2287. FMA f90 = f34, f51, f90 // A3 * B4
  2288. nop __LINE__
  2289. }
  2290. ;;
  // No stop bits appear between the following bundles up to the next ;; so
  // they form one instruction group. The (p4) reloads of f48 / f49 come after
  // the last FMAs that read the old values and rely on that ordering.
  2291. { .mfb
  2292. nop __LINE__
  2293. FMA f98 = f34, f52, f98 // A3 * B5
  2294. nop __LINE__
  2295. }
  2296. { .mfb
  2297. nop __LINE__
  2298. FMA f106 = f34, f53, f106 // A3 * B6
  2299. nop __LINE__
  2300. }
  2301. { .mfb
  2302. nop __LINE__
  2303. FMA f114 = f34, f54, f114 // A3 * B7
  2304. nop __LINE__
  2305. }
  2306. { .mfb
  2307. nop __LINE__
  2308. FMA f122 = f34, f55, f122 // A3 * B8
  2309. nop __LINE__
  2310. }
  2311. { .mfb
  2312. nop __LINE__
  2313. FMA f67 = f35, f48, f67 // A4 * B1
  2314. nop __LINE__
  2315. }
  2316. { .mfb
  2317. nop __LINE__
  2318. FMA f75 = f35, f49, f75 // A4 * B2
  2319. nop __LINE__
  2320. }
  2321. { .mfb
  2322. nop __LINE__
  2323. FMA f83 = f35, f50, f83 // A4 * B3
  2324. nop __LINE__
  2325. }
  2326. { .mfb
  2327. nop __LINE__
  2328. FMA f91 = f35, f51, f91 // A4 * B4
  2329. nop __LINE__
  2330. }
  // Under p4 (another pass follows) the step 1 operands are reloaded for the
  // next pass, interleaved with the remaining FMAs.
  2331. { .mfb
  2332. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2333. FMA f99 = f35, f52, f99 // A4 * B5
  2334. nop __LINE__
  2335. }
  2336. { .mfb
  2337. nop __LINE__
  2338. FMA f107 = f35, f53, f107 // A4 * B6
  2339. nop __LINE__
  2340. }
  2341. { .mfb
  2342. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2343. FMA f115 = f35, f54, f115 // A4 * B7
  2344. nop __LINE__
  2345. }
  2346. { .mfb
  2347. nop __LINE__
  2348. FMA f123 = f35, f55, f123 // A4 * B8
  2349. nop __LINE__
  2350. }
  2351. ;;
  // Step 2: same 4 x 8 update pattern with f40-f43 and f56-f63; every FMA is
  // predicated on p3 so the whole step is skipped when p3 is clear.
  2352. { .mfb
  2353. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2354. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  2355. nop __LINE__
  2356. }
  2357. { .mfb
  2358. nop __LINE__
  2359. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  2360. nop __LINE__
  2361. }
  2362. ;;
  2363. { .mfb
  2364. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2365. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  2366. nop __LINE__
  2367. }
  2368. { .mfb
  2369. nop __LINE__
  2370. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  2371. nop __LINE__
  2372. }
  2373. ;;
  2374. { .mfb
  2375. nop __LINE__
  2376. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  2377. nop __LINE__
  2378. }
  2379. { .mfb
  2380. nop __LINE__
  2381. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  2382. nop __LINE__
  2383. }
  2384. ;;
  2385. { .mfb
  2386. nop __LINE__
  2387. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  2388. nop __LINE__
  2389. }
  2390. { .mfb
  2391. nop __LINE__
  2392. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  2393. nop __LINE__
  2394. }
  2395. ;;
  2396. { .mfb
  2397. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2398. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  2399. nop __LINE__
  2400. }
  2401. { .mfb
  2402. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  2403. nop __LINE__
  2404. }
  2405. { .mfb
  2406. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2407. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  2408. nop __LINE__
  2409. }
  2410. { .mfb
  2411. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  2412. nop __LINE__
  2413. }
  2414. ;;
  2415. { .mfb
  2416. nop __LINE__
  2417. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  2418. nop __LINE__
  2419. }
  2420. { .mfb
  2421. nop __LINE__
  2422. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  2423. nop __LINE__
  2424. }
  2425. ;;
  2426. { .mfb
  2427. nop __LINE__
  2428. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  2429. nop __LINE__
  2430. }
  2431. { .mfb
  2432. nop __LINE__
  2433. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  2434. nop __LINE__
  2435. }
  2436. ;;
  2437. { .mfb
  2438. nop __LINE__
  2439. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  2440. nop __LINE__
  2441. }
  2442. { .mfb
  2443. nop __LINE__
  2444. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  2445. nop __LINE__
  2446. }
  2447. ;;
  2448. { .mfb
  2449. nop __LINE__
  2450. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  2451. nop __LINE__
  2452. }
  2453. { .mfb
  2454. nop __LINE__
  2455. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  2456. nop __LINE__
  2457. }
  2458. ;;
  2459. { .mfb
  2460. nop __LINE__
  2461. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  2462. nop __LINE__
  2463. }
  2464. { .mfb
  2465. nop __LINE__
  2466. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  2467. nop __LINE__
  2468. }
  2469. ;;
  2470. { .mfb
  2471. nop __LINE__
  2472. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  2473. nop __LINE__
  2474. }
  2475. { .mfb
  2476. nop __LINE__
  2477. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  2478. nop __LINE__
  2479. }
  2480. ;;
  2481. { .mfb
  2482. nop __LINE__
  2483. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  2484. nop __LINE__
  2485. }
  2486. { .mfb
  2487. nop __LINE__
  2488. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  2489. nop __LINE__
  2490. }
  2491. ;;
  2492. { .mfb
  2493. nop __LINE__
  2494. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  2495. nop __LINE__
  2496. }
  2497. { .mfb
  2498. nop __LINE__
  2499. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  2500. nop __LINE__
  2501. }
  2502. ;;
  2503. { .mfb
  2504. nop __LINE__
  2505. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  2506. nop __LINE__
  2507. }
  2508. { .mfb
  2509. nop __LINE__
  2510. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  2511. nop __LINE__
  2512. }
  2513. ;;
  2514. { .mfi
  2515. nop __LINE__
  2516. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  2517. adds L = -1, L // keep L in step with ar.lc for the p3 / p4 / p5 tests
  2518. }
  2519. { .mfb
  2520. nop __LINE__
  2521. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  2522. br.cloop.sptk.few .L022 // repeat while ar.lc != 0 (ar.lc is decremented)
  2523. }
  2524. ;;
  2525. .L028:
  2526. #if defined(LN) || defined(RT)
  2527. #ifdef LN
  2528. adds r2 = -4, KK
  2529. #else
  2530. adds r2 = -8, KK
  2531. #endif
  2532. ;;
  2533. shladd r2 = r2, BASE_SHIFT, r0
  2534. ;;
  2535. shladd AOFFSET = r2, 2, AORIG
  2536. shladd BOFFSET = r2, 3, B
  2537. ;;
  2538. #endif
  2539. adds AOFFSET2 = 4 * SIZE, AOFFSET
  2540. adds BOFFSET2 = 4 * SIZE, BOFFSET
  2541. ;;
  2542. #if defined(LN) || defined(LT)
  2543. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  2544. ;;
  2545. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  2546. ;;
  2547. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  2548. ;;
  2549. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  2550. ;;
  2551. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  2552. ;;
  2553. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  2554. ;;
  2555. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  2556. ;;
  2557. LDFPD f46, f47 = [BOFFSET], 2 * SIZE
  2558. ;;
  2559. { .mfi
  2560. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  2561. FSUB f64 = f32, f64
  2562. nop __LINE__
  2563. }
  2564. { .mfi
  2565. nop __LINE__
  2566. FSUB f72 = f33, f72
  2567. nop __LINE__
  2568. }
  2569. ;;
  2570. { .mfi
  2571. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  2572. FSUB f80 = f34, f80
  2573. nop __LINE__
  2574. }
  2575. { .mfi
  2576. nop __LINE__
  2577. FSUB f88 = f35, f88
  2578. nop __LINE__
  2579. }
  2580. ;;
  2581. { .mfi
  2582. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  2583. FSUB f96 = f36, f96
  2584. nop __LINE__
  2585. }
  2586. { .mfi
  2587. nop __LINE__
  2588. FSUB f104 = f37, f104
  2589. nop __LINE__
  2590. }
  2591. ;;
  2592. { .mfi
  2593. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  2594. FSUB f112 = f38, f112
  2595. nop __LINE__
  2596. }
  2597. { .mfi
  2598. nop __LINE__
  2599. FSUB f120 = f39, f120
  2600. nop __LINE__
  2601. }
  2602. ;;
  2603. { .mfi
  2604. LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  2605. FSUB f65 = f40, f65
  2606. nop __LINE__
  2607. }
  2608. { .mfi
  2609. nop __LINE__
  2610. FSUB f73 = f41, f73
  2611. nop __LINE__
  2612. }
  2613. ;;
  2614. { .mfi
  2615. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  2616. FSUB f81 = f42, f81
  2617. nop __LINE__
  2618. }
  2619. { .mfi
  2620. nop __LINE__
  2621. FSUB f89 = f43, f89
  2622. nop __LINE__
  2623. }
  2624. ;;
  2625. { .mfi
  2626. LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  2627. FSUB f97 = f44, f97
  2628. nop __LINE__
  2629. }
  2630. { .mfi
  2631. nop __LINE__
  2632. FSUB f105 = f45, f105
  2633. nop __LINE__
  2634. }
  2635. ;;
  2636. { .mfi
  2637. LDFPD f62, f63 = [BOFFSET]
  2638. FSUB f113 = f46, f113
  2639. adds BOFFSET = -30 * SIZE, BOFFSET
  2640. }
  2641. { .mfi
  2642. nop __LINE__
  2643. FSUB f121 = f47, f121
  2644. nop __LINE__
  2645. }
  2646. ;;
  2647. FSUB f66 = f48, f66
  2648. FSUB f74 = f49, f74
  2649. FSUB f82 = f50, f82
  2650. FSUB f90 = f51, f90
  2651. FSUB f98 = f52, f98
  2652. FSUB f106 = f53, f106
  2653. FSUB f114 = f54, f114
  2654. FSUB f122 = f55, f122
  2655. ;;
  2656. FSUB f67 = f56, f67
  2657. FSUB f75 = f57, f75
  2658. FSUB f83 = f58, f83
  2659. FSUB f91 = f59, f91
  2660. FSUB f99 = f60, f99
  2661. FSUB f107 = f61, f107
  2662. FSUB f115 = f62, f115
  2663. FSUB f123 = f63, f123
  2664. ;;
  2665. #else
  2666. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2667. ;;
  2668. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  2669. ;;
  2670. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  2671. ;;
  2672. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  2673. ;;
  2674. LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  2675. ;;
  2676. LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  2677. ;;
  2678. LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  2679. ;;
  2680. LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  2681. ;;
  2682. LDFPD f48, f49 = [AOFFSET], 2 * SIZE
  2683. ;;
  2684. LDFPD f50, f51 = [AOFFSET], 2 * SIZE
  2685. ;;
  2686. LDFPD f52, f53 = [AOFFSET], 2 * SIZE
  2687. ;;
  2688. LDFPD f54, f55 = [AOFFSET], 2 * SIZE
  2689. ;;
  2690. LDFPD f56, f57 = [AOFFSET], 2 * SIZE
  2691. ;;
  2692. LDFPD f58, f59 = [AOFFSET], 2 * SIZE
  2693. ;;
  2694. LDFPD f60, f61 = [AOFFSET], 2 * SIZE
  2695. ;;
  2696. LDFPD f62, f63 = [AOFFSET]
  2697. adds AOFFSET = -30 * SIZE, AOFFSET
  2698. ;;
  2699. FSUB f64 = f32, f64
  2700. FSUB f65 = f33, f65
  2701. FSUB f66 = f34, f66
  2702. FSUB f67 = f35, f67
  2703. FSUB f72 = f36, f72
  2704. FSUB f73 = f37, f73
  2705. FSUB f74 = f38, f74
  2706. FSUB f75 = f39, f75
  2707. FSUB f80 = f40, f80
  2708. FSUB f81 = f41, f81
  2709. FSUB f82 = f42, f82
  2710. FSUB f83 = f43, f83
  2711. FSUB f88 = f44, f88
  2712. FSUB f89 = f45, f89
  2713. FSUB f90 = f46, f90
  2714. FSUB f91 = f47, f91
  2715. ;;
  2716. FSUB f96 = f48, f96
  2717. FSUB f97 = f49, f97
  2718. FSUB f98 = f50, f98
  2719. FSUB f99 = f51, f99
  2720. ;;
  2721. FSUB f104 = f52, f104
  2722. FSUB f105 = f53, f105
  2723. FSUB f106 = f54, f106
  2724. FSUB f107 = f55, f107
  2725. ;;
  2726. FSUB f112 = f56, f112
  2727. FSUB f113 = f57, f113
  2728. FSUB f114 = f58, f114
  2729. FSUB f115 = f59, f115
  2730. ;;
  2731. FSUB f120 = f60, f120
  2732. FSUB f121 = f61, f121
  2733. FSUB f122 = f62, f122
  2734. FSUB f123 = f63, f123
  2735. ;;
  2736. #endif
  2737. #ifdef LN
  /* LN variant of the 4-unknown solve step (file: trsm_kernel_LN.S).
   * Ten coefficients are read through AOFFSET while walking it backwards:
   *   f32,f33,f34,f35 <- element offsets 15,14,13,12
   *   f36,f37,f38     <- element offsets 10, 9, 8
   *   f39,f40         <- element offsets  5, 4
   *   f41             <- element offset   0
   * AOFFSET finishes at the value it had on entry to this block.
   * NOTE(review): LDFPD/LDFD/FMPY/FNMA/STFD are macros defined outside this
   * chunk.  The comments assume LDFPD x, y = [p] loads p[0] into x and p[1]
   * into y, and that FNMA d = a, b, c computes c - a * b - confirm.
   * NOTE(review): the "diagonal" values f32/f36/f39/f41 are multiplied, never
   * divided by, so they are presumably stored pre-inverted - verify against
   * the packing routine. */
  2738. adds AOFFSET = 14 * SIZE, AOFFSET
  2739. ;;
  2740. LDFPD f33, f32 = [AOFFSET]
  2741. adds AOFFSET = - 2 * SIZE, AOFFSET
  2742. ;;
  2743. LDFPD f35, f34 = [AOFFSET]
  2744. adds AOFFSET = - 2 * SIZE, AOFFSET
  2745. ;;
  2746. LDFD f36 = [AOFFSET], - 2 * SIZE
  2747. ;;
  2748. LDFPD f38, f37 = [AOFFSET]
  2749. adds AOFFSET = - 4 * SIZE, AOFFSET
  2750. ;;
  2751. LDFPD f40, f39 = [AOFFSET]
  2752. adds AOFFSET = - 4 * SIZE, AOFFSET
  2753. ;;
  2754. LDFD f41 = [AOFFSET]
  2755. ;;
  /* Fourth unknown of all eight register groups (f67, f75, ..., f123):
   * scale by f32. */
  2756. FMPY f67 = f67, f32
  2757. FMPY f99 = f99, f32
  2758. FMPY f75 = f75, f32
  2759. FMPY f107 = f107, f32
  2760. FMPY f83 = f83, f32
  2761. FMPY f115 = f115, f32
  2762. FMPY f91 = f91, f32
  2763. FMPY f123 = f123, f32
  2764. ;;
  /* Remove the fourth unknown from the third (coefficient f33). */
  2765. FNMA f66 = f67, f33, f66
  2766. FNMA f98 = f99, f33, f98
  2767. FNMA f74 = f75, f33, f74
  2768. FNMA f106 = f107, f33, f106
  2769. FNMA f82 = f83, f33, f82
  2770. FNMA f114 = f115, f33, f114
  2771. FNMA f90 = f91, f33, f90
  2772. FNMA f122 = f123, f33, f122
  2773. ;;
  /* ... from the second (coefficient f34). */
  2774. FNMA f65 = f67, f34, f65
  2775. FNMA f97 = f99, f34, f97
  2776. FNMA f73 = f75, f34, f73
  2777. FNMA f105 = f107, f34, f105
  2778. FNMA f81 = f83, f34, f81
  2779. FNMA f113 = f115, f34, f113
  2780. FNMA f89 = f91, f34, f89
  2781. FNMA f121 = f123, f34, f121
  2782. ;;
  /* ... and from the first (coefficient f35). */
  2783. FNMA f64 = f67, f35, f64
  2784. FNMA f96 = f99, f35, f96
  2785. FNMA f72 = f75, f35, f72
  2786. FNMA f104 = f107, f35, f104
  2787. FNMA f80 = f83, f35, f80
  2788. FNMA f112 = f115, f35, f112
  2789. FNMA f88 = f91, f35, f88
  2790. FNMA f120 = f123, f35, f120
  2791. ;;
  /* Third unknown (f66, f74, ..., f122): scale by f36. */
  2792. FMPY f66 = f66, f36
  2793. FMPY f98 = f98, f36
  2794. FMPY f74 = f74, f36
  2795. FMPY f106 = f106, f36
  2796. FMPY f82 = f82, f36
  2797. FMPY f114 = f114, f36
  2798. FMPY f90 = f90, f36
  2799. FMPY f122 = f122, f36
  2800. ;;
  /* Remove the third unknown from the second (coefficient f37). */
  2801. FNMA f65 = f66, f37, f65
  2802. FNMA f97 = f98, f37, f97
  2803. FNMA f73 = f74, f37, f73
  2804. FNMA f105 = f106, f37, f105
  2805. FNMA f81 = f82, f37, f81
  2806. FNMA f113 = f114, f37, f113
  2807. FNMA f89 = f90, f37, f89
  2808. FNMA f121 = f122, f37, f121
  2809. ;;
  /* ... and from the first (coefficient f38). */
  2810. FNMA f64 = f66, f38, f64
  2811. FNMA f96 = f98, f38, f96
  2812. FNMA f72 = f74, f38, f72
  2813. FNMA f104 = f106, f38, f104
  2814. FNMA f80 = f82, f38, f80
  2815. FNMA f112 = f114, f38, f112
  2816. FNMA f88 = f90, f38, f88
  2817. FNMA f120 = f122, f38, f120
  2818. ;;
  /* Write-back through BOFFSET/BOFFSET2, interleaved with the arithmetic for
   * the second and first unknowns.  Both pointers are first advanced by 24
   * elements.  Each pointer then stores four values and steps back 11, which
   * places it 8 elements below the start of the group just written.  The
   * final group steps back 3 instead, so both pointers end at the values
   * they had before the +24 adjustment. */
  2819. adds BOFFSET = 24 * SIZE, BOFFSET
  2820. adds BOFFSET2 = 24 * SIZE, BOFFSET2
  2821. ;;
  /* Store the fourth unknowns; meanwhile scale the second unknowns by f39. */
  2822. { .mfi
  2823. STFD [BOFFSET] = f67, SIZE
  2824. FMPY f65 = f65, f39
  2825. }
  2826. { .mfi
  2827. STFD [BOFFSET2] = f99, SIZE
  2828. FMPY f97 = f97, f39
  2829. }
  2830. ;;
  2831. { .mfi
  2832. STFD [BOFFSET] = f75, SIZE
  2833. FMPY f73 = f73, f39
  2834. }
  2835. { .mfi
  2836. STFD [BOFFSET2] = f107, SIZE
  2837. FMPY f105 = f105, f39
  2838. }
  2839. ;;
  2840. { .mfi
  2841. STFD [BOFFSET] = f83, SIZE
  2842. FMPY f81 = f81, f39
  2843. }
  2844. { .mfi
  2845. STFD [BOFFSET2] = f115, SIZE
  2846. FMPY f113 = f113, f39
  2847. }
  2848. ;;
  2849. { .mfi
  2850. STFD [BOFFSET] = f91, - 11 * SIZE
  2851. FMPY f89 = f89, f39
  2852. }
  2853. { .mfi
  2854. STFD [BOFFSET2] = f123, - 11 * SIZE
  2855. FMPY f121 = f121, f39
  2856. }
  2857. ;;
  /* Store the third unknowns; meanwhile remove the second unknown from the
   * first (coefficient f40). */
  2858. { .mfi
  2859. STFD [BOFFSET] = f66, SIZE
  2860. FNMA f64 = f65, f40, f64
  2861. }
  2862. { .mfi
  2863. STFD [BOFFSET2] = f98, SIZE
  2864. FNMA f96 = f97, f40, f96
  2865. }
  2866. ;;
  2867. { .mfi
  2868. STFD [BOFFSET] = f74, SIZE
  2869. FNMA f72 = f73, f40, f72
  2870. }
  2871. { .mfi
  2872. STFD [BOFFSET2] = f106, SIZE
  2873. FNMA f104 = f105, f40, f104
  2874. }
  2875. ;;
  2876. { .mfi
  2877. STFD [BOFFSET] = f82, SIZE
  2878. FNMA f80 = f81, f40, f80
  2879. }
  2880. { .mfi
  2881. STFD [BOFFSET2] = f114, SIZE
  2882. FNMA f112 = f113, f40, f112
  2883. }
  2884. ;;
  2885. { .mfi
  2886. STFD [BOFFSET] = f90, -11 * SIZE
  2887. FNMA f88 = f89, f40, f88
  2888. }
  2889. { .mfi
  2890. STFD [BOFFSET2] = f122, -11 * SIZE
  2891. FNMA f120 = f121, f40, f120
  2892. }
  2893. ;;
  /* Store the second unknowns; meanwhile scale the first unknowns by f41. */
  2894. { .mfi
  2895. STFD [BOFFSET] = f65, SIZE
  2896. FMPY f64 = f64, f41
  2897. }
  2898. { .mfi
  2899. STFD [BOFFSET2] = f97, SIZE
  2900. FMPY f96 = f96, f41
  2901. }
  2902. ;;
  2903. { .mfi
  2904. STFD [BOFFSET] = f73, SIZE
  2905. FMPY f72 = f72, f41
  2906. }
  2907. { .mfi
  2908. STFD [BOFFSET2] = f105, SIZE
  2909. FMPY f104 = f104, f41
  2910. }
  2911. ;;
  2912. { .mfi
  2913. STFD [BOFFSET] = f81, SIZE
  2914. FMPY f80 = f80, f41
  2915. }
  2916. { .mfi
  2917. STFD [BOFFSET2] = f113, SIZE
  2918. FMPY f112 = f112, f41
  2919. }
  2920. ;;
  2921. { .mfi
  2922. STFD [BOFFSET] = f89, - 11 * SIZE
  2923. FMPY f88 = f88, f41
  2924. }
  2925. { .mfi
  2926. STFD [BOFFSET2] = f121, - 11 * SIZE
  2927. FMPY f120 = f120, f41
  2928. }
  2929. ;;
  /* Store the first unknowns.  The third slot of the first two bundles is
   * used to move C1 and C2 back by four elements. */
  2930. { .mmi
  2931. STFD [BOFFSET] = f64, SIZE
  2932. STFD [BOFFSET2] = f96, SIZE
  2933. adds C1 = -4 * SIZE, C1
  2934. }
  2935. ;;
  2936. { .mmi
  2937. STFD [BOFFSET] = f72, SIZE
  2938. STFD [BOFFSET2] = f104, SIZE
  2939. adds C2 = -4 * SIZE, C2
  2940. }
  2941. ;;
  2942. { .mmi
  2943. STFD [BOFFSET] = f80, SIZE
  2944. STFD [BOFFSET2] = f112, SIZE
  2945. nop __LINE__
  2946. }
  2947. ;;
  2948. { .mmi
  2949. STFD [BOFFSET] = f88, - 3 * SIZE
  2950. STFD [BOFFSET2] = f120, - 3 * SIZE
  2951. }
  2952. ;;
  2953. #endif
  2954. #ifdef LT
  /* LT variant of the 4-unknown solve step: same structure as a forward
   * substitution, processing the first unknown first.
   * Ten coefficients are read through AOFFSET while walking it forwards:
   *   f32,f33,f34,f35 <- element offsets  0, 1, 2, 3
   *   f36,f37,f38     <- element offsets  5, 6, 7
   *   f39,f40         <- element offsets 10,11
   *   f41             <- element offset  15
   * The last load post-increments by -15, so AOFFSET finishes at the value
   * it had on entry to this block.
   * NOTE(review): LDFPD/LDFD/FMPY/FNMA/STFD are macros defined outside this
   * chunk.  The comments assume LDFPD x, y = [p] loads p[0] into x and p[1]
   * into y, and that FNMA d = a, b, c computes c - a * b - confirm.
   * NOTE(review): f32/f36/f39/f41 are multiplied, never divided by, so they
   * are presumably pre-inverted diagonal entries - verify. */
  2955. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  2956. ;;
  2957. LDFPD f34, f35 = [AOFFSET]
  2958. adds AOFFSET = 3 * SIZE, AOFFSET
  2959. ;;
  2960. LDFD f36 = [AOFFSET], 1 * SIZE
  2961. ;;
  2962. LDFPD f37, f38 = [AOFFSET]
  2963. adds AOFFSET = 4 * SIZE, AOFFSET
  2964. ;;
  2965. LDFPD f39, f40 = [AOFFSET]
  2966. adds AOFFSET = 5 * SIZE, AOFFSET
  2967. ;;
  2968. LDFD f41 = [AOFFSET], -15 * SIZE
  2969. ;;
  /* First unknown of all eight register groups (f64, f72, ..., f120):
   * scale by f32. */
  2970. { .mfi
  2971. FMPY f64 = f64, f32
  2972. nop __LINE__
  2973. }
  2974. { .mfi
  2975. nop __LINE__
  2976. FMPY f96 = f96, f32
  2977. nop __LINE__
  2978. }
  2979. ;;
  2980. { .mfi
  2981. FMPY f72 = f72, f32
  2982. nop __LINE__
  2983. }
  2984. { .mfi
  2985. nop __LINE__
  2986. FMPY f104 = f104, f32
  2987. nop __LINE__
  2988. }
  2989. ;;
  2990. { .mfi
  2991. FMPY f80 = f80, f32
  2992. }
  2993. { .mfi
  2994. nop __LINE__
  2995. FMPY f112 = f112, f32
  2996. nop __LINE__
  2997. }
  2998. ;;
  2999. { .mfi
  3000. FMPY f88 = f88, f32
  3001. nop __LINE__
  3002. }
  3003. { .mfi
  3004. nop __LINE__
  3005. FMPY f120 = f120, f32
  3006. nop __LINE__
  3007. }
  3008. ;;
  /* Remove the first unknown from the second (coefficient f33). */
  3009. { .mfi
  3010. FNMA f65 = f64, f33, f65
  3011. nop __LINE__
  3012. }
  3013. { .mfi
  3014. nop __LINE__
  3015. FNMA f97 = f96, f33, f97
  3016. nop __LINE__
  3017. }
  3018. ;;
  3019. { .mfi
  3020. FNMA f73 = f72, f33, f73
  3021. nop __LINE__
  3022. }
  3023. { .mfi
  3024. nop __LINE__
  3025. FNMA f105 = f104, f33, f105
  3026. nop __LINE__
  3027. }
  3028. ;;
  3029. { .mfi
  3030. FNMA f81 = f80, f33, f81
  3031. }
  3032. { .mfi
  3033. nop __LINE__
  3034. FNMA f113 = f112, f33, f113
  3035. nop __LINE__
  3036. }
  3037. ;;
  3038. { .mfi
  3039. FNMA f89 = f88, f33, f89
  3040. nop __LINE__
  3041. }
  3042. { .mfi
  3043. nop __LINE__
  3044. FNMA f121 = f120, f33, f121
  3045. nop __LINE__
  3046. }
  3047. ;;
  /* ... from the third (coefficient f34). */
  3048. { .mfi
  3049. FNMA f66 = f64, f34, f66
  3050. nop __LINE__
  3051. }
  3052. { .mfi
  3053. nop __LINE__
  3054. FNMA f98 = f96, f34, f98
  3055. nop __LINE__
  3056. }
  3057. ;;
  3058. { .mfi
  3059. FNMA f74 = f72, f34, f74
  3060. }
  3061. { .mfi
  3062. nop __LINE__
  3063. FNMA f106 = f104, f34, f106
  3064. nop __LINE__
  3065. }
  3066. ;;
  3067. { .mfi
  3068. FNMA f82 = f80, f34, f82
  3069. nop __LINE__
  3070. }
  3071. { .mfi
  3072. nop __LINE__
  3073. FNMA f114 = f112, f34, f114
  3074. nop __LINE__
  3075. }
  3076. ;;
  3077. { .mfi
  3078. FNMA f90 = f88, f34, f90
  3079. nop __LINE__
  3080. }
  3081. { .mfi
  3082. nop __LINE__
  3083. FNMA f122 = f120, f34, f122
  3084. nop __LINE__
  3085. }
  3086. ;;
  /* ... and from the fourth (coefficient f35). */
  3087. { .mfi
  3088. FNMA f67 = f64, f35, f67
  3089. }
  3090. { .mfi
  3091. nop __LINE__
  3092. FNMA f99 = f96, f35, f99
  3093. nop __LINE__
  3094. }
  3095. ;;
  3096. { .mfi
  3097. FNMA f75 = f72, f35, f75
  3098. nop __LINE__
  3099. }
  3100. { .mfi
  3101. nop __LINE__
  3102. FNMA f107 = f104, f35, f107
  3103. nop __LINE__
  3104. }
  3105. ;;
  3106. { .mfi
  3107. FNMA f83 = f80, f35, f83
  3108. }
  3109. { .mfi
  3110. nop __LINE__
  3111. FNMA f115 = f112, f35, f115
  3112. nop __LINE__
  3113. }
  3114. ;;
  3115. { .mfi
  3116. FNMA f91 = f88, f35, f91
  3117. nop __LINE__
  3118. }
  3119. { .mfi
  3120. nop __LINE__
  3121. FNMA f123 = f120, f35, f123
  3122. nop __LINE__
  3123. }
  3124. ;;
  /* Second unknown (f65, f73, ..., f121): scale by f36. */
  3125. FMPY f65 = f65, f36
  3126. FMPY f97 = f97, f36
  3127. FMPY f73 = f73, f36
  3128. FMPY f105 = f105, f36
  3129. FMPY f81 = f81, f36
  3130. FMPY f113 = f113, f36
  3131. FMPY f89 = f89, f36
  3132. FMPY f121 = f121, f36
  3133. ;;
  /* Remove the second unknown from the third (coefficient f37). */
  3134. FNMA f66 = f65, f37, f66
  3135. FNMA f98 = f97, f37, f98
  3136. FNMA f74 = f73, f37, f74
  3137. FNMA f106 = f105, f37, f106
  3138. FNMA f82 = f81, f37, f82
  3139. FNMA f114 = f113, f37, f114
  3140. FNMA f90 = f89, f37, f90
  3141. FNMA f122 = f121, f37, f122
  3142. ;;
  /* ... and from the fourth (coefficient f38). */
  3143. FNMA f67 = f65, f38, f67
  3144. FNMA f99 = f97, f38, f99
  3145. FNMA f75 = f73, f38, f75
  3146. FNMA f107 = f105, f38, f107
  3147. FNMA f83 = f81, f38, f83
  3148. FNMA f115 = f113, f38, f115
  3149. FNMA f91 = f89, f38, f91
  3150. FNMA f123 = f121, f38, f123
  3151. ;;
  /* Third unknown (f66, f74, ..., f122): scale by f39. */
  3152. FMPY f66 = f66, f39
  3153. FMPY f98 = f98, f39
  3154. FMPY f74 = f74, f39
  3155. FMPY f106 = f106, f39
  3156. FMPY f82 = f82, f39
  3157. FMPY f114 = f114, f39
  3158. FMPY f90 = f90, f39
  3159. FMPY f122 = f122, f39
  3160. ;;
  /* Remove the third unknown from the fourth (coefficient f40). */
  3161. FNMA f67 = f66, f40, f67
  3162. FNMA f99 = f98, f40, f99
  3163. FNMA f75 = f74, f40, f75
  3164. FNMA f107 = f106, f40, f107
  3165. FNMA f83 = f82, f40, f83
  3166. FNMA f115 = f114, f40, f115
  3167. FNMA f91 = f90, f40, f91
  3168. FNMA f123 = f122, f40, f123
  3169. ;;
  /* Fourth unknown (f67, f75, ..., f123): scale by f41. */
  3170. FMPY f67 = f67, f41
  3171. FMPY f99 = f99, f41
  3172. FMPY f75 = f75, f41
  3173. FMPY f107 = f107, f41
  3174. FMPY f83 = f83, f41
  3175. FMPY f115 = f115, f41
  3176. FMPY f91 = f91, f41
  3177. FMPY f123 = f123, f41
  3178. ;;
  /* Write-back through BOFFSET/BOFFSET2.  Each pointer stores four values
   * and then steps forward 5, so consecutive groups start 8 elements apart.
   * The last store steps back 27, returning both pointers to the values
   * they had on entry to this block. */
  /* First unknowns. */
  3179. { .mfi
  3180. STFD [BOFFSET] = f64, SIZE
  3181. }
  3182. { .mfi
  3183. STFD [BOFFSET2] = f96, SIZE
  3184. }
  3185. ;;
  3186. { .mfi
  3187. STFD [BOFFSET] = f72, SIZE
  3188. }
  3189. { .mfi
  3190. STFD [BOFFSET2] = f104, SIZE
  3191. }
  3192. ;;
  3193. { .mfi
  3194. STFD [BOFFSET] = f80, SIZE
  3195. }
  3196. { .mfi
  3197. STFD [BOFFSET2] = f112, SIZE
  3198. }
  3199. ;;
  3200. { .mfi
  3201. STFD [BOFFSET] = f88, 5 * SIZE
  3202. }
  3203. { .mfi
  3204. STFD [BOFFSET2] = f120, 5 * SIZE
  3205. }
  3206. ;;
  /* Second unknowns. */
  3207. { .mfi
  3208. STFD [BOFFSET] = f65, SIZE
  3209. }
  3210. { .mfi
  3211. STFD [BOFFSET2] = f97, SIZE
  3212. }
  3213. ;;
  3214. { .mfi
  3215. STFD [BOFFSET] = f73, SIZE
  3216. }
  3217. { .mfi
  3218. STFD [BOFFSET2] = f105, SIZE
  3219. }
  3220. ;;
  3221. { .mfi
  3222. STFD [BOFFSET] = f81, SIZE
  3223. }
  3224. { .mfi
  3225. STFD [BOFFSET2] = f113, SIZE
  3226. }
  3227. ;;
  3228. { .mfi
  3229. STFD [BOFFSET] = f89, 5 * SIZE
  3230. }
  3231. { .mfi
  3232. STFD [BOFFSET2] = f121, 5 * SIZE
  3233. }
  3234. ;;
  /* Third unknowns. */
  3235. { .mfi
  3236. STFD [BOFFSET] = f66, SIZE
  3237. }
  3238. { .mfi
  3239. STFD [BOFFSET2] = f98, SIZE
  3240. }
  3241. ;;
  3242. { .mfi
  3243. STFD [BOFFSET] = f74, SIZE
  3244. }
  3245. { .mfi
  3246. STFD [BOFFSET2] = f106, SIZE
  3247. }
  3248. ;;
  3249. { .mfi
  3250. STFD [BOFFSET] = f82, SIZE
  3251. }
  3252. { .mfi
  3253. STFD [BOFFSET2] = f114, SIZE
  3254. }
  3255. ;;
  3256. { .mfi
  3257. STFD [BOFFSET] = f90, 5 * SIZE
  3258. }
  3259. { .mfi
  3260. STFD [BOFFSET2] = f122, 5 * SIZE
  3261. }
  3262. ;;
  /* Fourth unknowns; the final post-increment rewinds the pointers. */
  3263. { .mfi
  3264. STFD [BOFFSET] = f67, SIZE
  3265. }
  3266. { .mfi
  3267. STFD [BOFFSET2] = f99, SIZE
  3268. }
  3269. ;;
  3270. { .mfi
  3271. STFD [BOFFSET] = f75, SIZE
  3272. }
  3273. { .mfi
  3274. STFD [BOFFSET2] = f107, SIZE
  3275. }
  3276. ;;
  3277. { .mfi
  3278. STFD [BOFFSET] = f83, SIZE
  3279. }
  3280. { .mfi
  3281. STFD [BOFFSET2] = f115, SIZE
  3282. }
  3283. ;;
  3284. { .mfi
  3285. STFD [BOFFSET] = f91, -27 * SIZE
  3286. }
  3287. { .mfi
  3288. STFD [BOFFSET2] = f123, -27 * SIZE
  3289. }
  3290. ;;
  3291. #endif
  3292. #ifdef RN
  /* RN variant: solve across the eight register groups
   *   group k = f(64+8k) .. f(67+8k),  k = 0..7   (f64.., f72.., ..., f120..)
   * in ascending group order, using 36 coefficients read through BOFFSET:
   *   f32..f39 <- element offsets  0.. 7   (group 0)
   *   f40..f46 <- element offsets  9..15   (group 1)
   *   f47..f52 <- element offsets 18..23   (group 2)
   *   f53..f57 <- element offsets 27..31   (group 3)
   *   f58..f61 <- element offsets 36..39   (group 4)
   *   f16..f18 <- element offsets 45..47   (group 5)
   *   f19,f20  <- element offsets 54,55    (group 6)
   *   f21      <- element offset  63       (group 7)
   * The closing "adds BOFFSET = -63 * SIZE" returns BOFFSET to the value it
   * had on entry to this block.
   * NOTE(review): LDFPD/LDFD/FMPY/FNMA/STFD are macros defined outside this
   * chunk.  The comments assume LDFPD x, y = [p] loads p[0] into x and p[1]
   * into y, and that FNMA d = a, b, c computes c - a * b - confirm.
   * NOTE(review): the first coefficient of each row is multiplied, never
   * divided by, so it is presumably a pre-inverted diagonal entry - verify. */
  3293. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  3294. ;;
  3295. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  3296. ;;
  3297. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  3298. ;;
  3299. LDFPD f38, f39 = [BOFFSET]
  3300. adds BOFFSET = 3 * SIZE, BOFFSET
  3301. ;;
  3302. LDFD f40 = [BOFFSET], 1 * SIZE
  3303. ;;
  3304. LDFPD f41, f42 = [BOFFSET], 2 * SIZE
  3305. ;;
  3306. LDFPD f43, f44 = [BOFFSET], 2 * SIZE
  3307. ;;
  3308. LDFPD f45, f46 = [BOFFSET]
  3309. adds BOFFSET = 4 * SIZE, BOFFSET
  3310. ;;
  3311. LDFPD f47, f48 = [BOFFSET], 2 * SIZE
  3312. ;;
  3313. LDFPD f49, f50 = [BOFFSET], 2 * SIZE
  3314. ;;
  3315. LDFPD f51, f52 = [BOFFSET]
  3316. adds BOFFSET = 5 * SIZE, BOFFSET
  3317. ;;
  3318. LDFD f53 = [BOFFSET], 1 * SIZE
  3319. ;;
  3320. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  3321. ;;
  3322. LDFPD f56, f57 = [BOFFSET]
  3323. adds BOFFSET = 6 * SIZE, BOFFSET
  3324. ;;
  3325. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  3326. ;;
  3327. LDFPD f60, f61 = [BOFFSET]
  3328. adds BOFFSET = 7 * SIZE, BOFFSET
  3329. ;;
  3330. LDFD f16 = [BOFFSET], 1 * SIZE
  3331. ;;
  3332. LDFPD f17, f18 = [BOFFSET]
  3333. adds BOFFSET = 8 * SIZE, BOFFSET
  3334. ;;
  3335. LDFPD f19, f20 = [BOFFSET]
  3336. adds BOFFSET = 9 * SIZE, BOFFSET
  3337. ;;
  3338. LDFD f21 = [BOFFSET]
  3339. adds BOFFSET = -63 * SIZE, BOFFSET
  3340. ;;
  /* Group 0 (f64..f67): scale by f32, then remove it from groups 1..7
   * using f33..f39. */
  3341. FMPY f64 = f64, f32
  3342. FMPY f65 = f65, f32
  3343. FMPY f66 = f66, f32
  3344. FMPY f67 = f67, f32
  3345. ;;
  3346. FNMA f72 = f64, f33, f72
  3347. FNMA f73 = f65, f33, f73
  3348. FNMA f74 = f66, f33, f74
  3349. FNMA f75 = f67, f33, f75
  3350. ;;
  3351. FNMA f80 = f64, f34, f80
  3352. FNMA f81 = f65, f34, f81
  3353. FNMA f82 = f66, f34, f82
  3354. FNMA f83 = f67, f34, f83
  3355. ;;
  3356. FNMA f88 = f64, f35, f88
  3357. FNMA f89 = f65, f35, f89
  3358. FNMA f90 = f66, f35, f90
  3359. FNMA f91 = f67, f35, f91
  3360. ;;
  3361. FNMA f96 = f64, f36, f96
  3362. FNMA f97 = f65, f36, f97
  3363. FNMA f98 = f66, f36, f98
  3364. FNMA f99 = f67, f36, f99
  3365. ;;
  3366. FNMA f104 = f64, f37, f104
  3367. FNMA f105 = f65, f37, f105
  3368. FNMA f106 = f66, f37, f106
  3369. FNMA f107 = f67, f37, f107
  3370. ;;
  3371. FNMA f112 = f64, f38, f112
  3372. FNMA f113 = f65, f38, f113
  3373. FNMA f114 = f66, f38, f114
  3374. FNMA f115 = f67, f38, f115
  3375. ;;
  3376. FNMA f120 = f64, f39, f120
  3377. FNMA f121 = f65, f39, f121
  3378. FNMA f122 = f66, f39, f122
  3379. FNMA f123 = f67, f39, f123
  3380. ;;
  /* Group 1 (f72..f75): scale by f40, then remove it from groups 2..7
   * using f41..f46. */
  3381. FMPY f72 = f72, f40
  3382. FMPY f73 = f73, f40
  3383. FMPY f74 = f74, f40
  3384. FMPY f75 = f75, f40
  3385. ;;
  3386. FNMA f80 = f72, f41, f80
  3387. FNMA f81 = f73, f41, f81
  3388. FNMA f82 = f74, f41, f82
  3389. FNMA f83 = f75, f41, f83
  3390. ;;
  3391. FNMA f88 = f72, f42, f88
  3392. FNMA f89 = f73, f42, f89
  3393. FNMA f90 = f74, f42, f90
  3394. FNMA f91 = f75, f42, f91
  3395. ;;
  3396. FNMA f96 = f72, f43, f96
  3397. FNMA f97 = f73, f43, f97
  3398. FNMA f98 = f74, f43, f98
  3399. FNMA f99 = f75, f43, f99
  3400. ;;
  3401. FNMA f104 = f72, f44, f104
  3402. FNMA f105 = f73, f44, f105
  3403. FNMA f106 = f74, f44, f106
  3404. FNMA f107 = f75, f44, f107
  3405. ;;
  3406. FNMA f112 = f72, f45, f112
  3407. FNMA f113 = f73, f45, f113
  3408. FNMA f114 = f74, f45, f114
  3409. FNMA f115 = f75, f45, f115
  3410. ;;
  3411. FNMA f120 = f72, f46, f120
  3412. FNMA f121 = f73, f46, f121
  3413. FNMA f122 = f74, f46, f122
  3414. FNMA f123 = f75, f46, f123
  3415. ;;
  /* Group 2 (f80..f83): scale by f47, then remove it from groups 3..7
   * using f48..f52. */
  3416. FMPY f80 = f80, f47
  3417. FMPY f81 = f81, f47
  3418. FMPY f82 = f82, f47
  3419. FMPY f83 = f83, f47
  3420. ;;
  3421. FNMA f88 = f80, f48, f88
  3422. FNMA f89 = f81, f48, f89
  3423. FNMA f90 = f82, f48, f90
  3424. FNMA f91 = f83, f48, f91
  3425. ;;
  3426. FNMA f96 = f80, f49, f96
  3427. FNMA f97 = f81, f49, f97
  3428. FNMA f98 = f82, f49, f98
  3429. FNMA f99 = f83, f49, f99
  3430. ;;
  3431. FNMA f104 = f80, f50, f104
  3432. FNMA f105 = f81, f50, f105
  3433. FNMA f106 = f82, f50, f106
  3434. FNMA f107 = f83, f50, f107
  3435. ;;
  3436. FNMA f112 = f80, f51, f112
  3437. FNMA f113 = f81, f51, f113
  3438. FNMA f114 = f82, f51, f114
  3439. FNMA f115 = f83, f51, f115
  3440. ;;
  3441. FNMA f120 = f80, f52, f120
  3442. FNMA f121 = f81, f52, f121
  3443. FNMA f122 = f82, f52, f122
  3444. FNMA f123 = f83, f52, f123
  3445. ;;
  /* Group 3 (f88..f91): scale by f53, then remove it from groups 4..7
   * using f54..f57. */
  3446. FMPY f88 = f88, f53
  3447. FMPY f89 = f89, f53
  3448. FMPY f90 = f90, f53
  3449. FMPY f91 = f91, f53
  3450. ;;
  3451. FNMA f96 = f88, f54, f96
  3452. FNMA f97 = f89, f54, f97
  3453. FNMA f98 = f90, f54, f98
  3454. FNMA f99 = f91, f54, f99
  3455. ;;
  3456. FNMA f104 = f88, f55, f104
  3457. FNMA f105 = f89, f55, f105
  3458. FNMA f106 = f90, f55, f106
  3459. FNMA f107 = f91, f55, f107
  3460. ;;
  3461. FNMA f112 = f88, f56, f112
  3462. FNMA f113 = f89, f56, f113
  3463. FNMA f114 = f90, f56, f114
  3464. FNMA f115 = f91, f56, f115
  3465. ;;
  3466. FNMA f120 = f88, f57, f120
  3467. FNMA f121 = f89, f57, f121
  3468. FNMA f122 = f90, f57, f122
  3469. FNMA f123 = f91, f57, f123
  3470. ;;
  /* Group 4 (f96..f99): scale by f58, then remove it from groups 5..7
   * using f59..f61. */
  3471. FMPY f96 = f96, f58
  3472. FMPY f97 = f97, f58
  3473. FMPY f98 = f98, f58
  3474. FMPY f99 = f99, f58
  3475. ;;
  3476. FNMA f104 = f96, f59, f104
  3477. FNMA f105 = f97, f59, f105
  3478. FNMA f106 = f98, f59, f106
  3479. FNMA f107 = f99, f59, f107
  3480. ;;
  3481. FNMA f112 = f96, f60, f112
  3482. FNMA f113 = f97, f60, f113
  3483. FNMA f114 = f98, f60, f114
  3484. FNMA f115 = f99, f60, f115
  3485. ;;
  3486. FNMA f120 = f96, f61, f120
  3487. FNMA f121 = f97, f61, f121
  3488. FNMA f122 = f98, f61, f122
  3489. FNMA f123 = f99, f61, f123
  3490. ;;
  /* Group 5 (f104..f107): scale by f16, then remove it from groups 6..7
   * using f17, f18. */
  3491. FMPY f104 = f104, f16
  3492. FMPY f105 = f105, f16
  3493. FMPY f106 = f106, f16
  3494. FMPY f107 = f107, f16
  3495. ;;
  3496. FNMA f112 = f104, f17, f112
  3497. FNMA f113 = f105, f17, f113
  3498. FNMA f114 = f106, f17, f114
  3499. FNMA f115 = f107, f17, f115
  3500. ;;
  3501. FNMA f120 = f104, f18, f120
  3502. FNMA f121 = f105, f18, f121
  3503. FNMA f122 = f106, f18, f122
  3504. FNMA f123 = f107, f18, f123
  3505. ;;
  /* Group 6 (f112..f115): scale by f19, then remove it from group 7
   * using f20. */
  3506. FMPY f112 = f112, f19
  3507. FMPY f113 = f113, f19
  3508. FMPY f114 = f114, f19
  3509. FMPY f115 = f115, f19
  3510. ;;
  3511. FNMA f120 = f112, f20, f120
  3512. FNMA f121 = f113, f20, f121
  3513. FNMA f122 = f114, f20, f122
  3514. FNMA f123 = f115, f20, f123
  3515. ;;
  /* Group 7 (f120..f123): scale by f21. */
  3516. FMPY f120 = f120, f21
  3517. FMPY f121 = f121, f21
  3518. FMPY f122 = f122, f21
  3519. FMPY f123 = f123, f21
  3520. ;;
  /* Write-back through AOFFSET/AOFFSET2.  AOFFSET receives groups 0, 2, 4, 6
   * and AOFFSET2 receives groups 1, 3, 5, 7.  Each pointer stores four values
   * and then steps forward 5, so consecutive groups start 8 elements apart.
   * The last store steps back 27, returning both pointers to the values
   * they had on entry to this block. */
  3521. STFD [AOFFSET] = f64, SIZE
  3522. STFD [AOFFSET2] = f72, SIZE
  3523. ;;
  3524. STFD [AOFFSET] = f65, SIZE
  3525. STFD [AOFFSET2] = f73, SIZE
  3526. ;;
  3527. STFD [AOFFSET] = f66, SIZE
  3528. STFD [AOFFSET2] = f74, SIZE
  3529. ;;
  3530. STFD [AOFFSET] = f67, 5 * SIZE
  3531. STFD [AOFFSET2] = f75, 5 * SIZE
  3532. ;;
  3533. STFD [AOFFSET] = f80, SIZE
  3534. STFD [AOFFSET2] = f88, SIZE
  3535. ;;
  3536. STFD [AOFFSET] = f81, SIZE
  3537. STFD [AOFFSET2] = f89, SIZE
  3538. ;;
  3539. STFD [AOFFSET] = f82, SIZE
  3540. STFD [AOFFSET2] = f90, SIZE
  3541. ;;
  3542. STFD [AOFFSET] = f83, 5 * SIZE
  3543. STFD [AOFFSET2] = f91, 5 * SIZE
  3544. ;;
  3545. STFD [AOFFSET] = f96, SIZE
  3546. STFD [AOFFSET2] = f104, SIZE
  3547. ;;
  3548. STFD [AOFFSET] = f97, SIZE
  3549. STFD [AOFFSET2] = f105, SIZE
  3550. ;;
  3551. STFD [AOFFSET] = f98, SIZE
  3552. STFD [AOFFSET2] = f106, SIZE
  3553. ;;
  3554. STFD [AOFFSET] = f99, 5 * SIZE
  3555. STFD [AOFFSET2] = f107, 5 * SIZE
  3556. ;;
  3557. STFD [AOFFSET] = f112, SIZE
  3558. STFD [AOFFSET2] = f120, SIZE
  3559. ;;
  3560. STFD [AOFFSET] = f113, SIZE
  3561. STFD [AOFFSET2] = f121, SIZE
  3562. ;;
  3563. STFD [AOFFSET] = f114, SIZE
  3564. STFD [AOFFSET2] = f122, SIZE
  3565. ;;
  3566. STFD [AOFFSET] = f115, -27 * SIZE
  3567. STFD [AOFFSET2] = f123, - 27 * SIZE
  3568. ;;
  3569. #endif
  3570. #ifdef RT
  /* RT variant: solve across the eight register groups
   *   group k = f(64+8k) .. f(67+8k),  k = 0..7   (f64.., f72.., ..., f120..)
   * in descending group order (group 7 first), using 36 coefficients read
   * through BOFFSET while walking it backwards from element offset 62:
   *   f32..f39 <- element offsets 63 down to 56   (group 7)
   *   f40..f46 <- element offsets 54 down to 48   (group 6)
   *   f47..f52 <- element offsets 45 down to 40   (group 5)
   *   f53..f57 <- element offsets 36 down to 32   (group 4)
   *   f58..f61 <- element offsets 27 down to 24   (group 3)
   *   f16..f18 <- element offsets 18 down to 16   (group 2)
   *   f19,f20  <- element offsets  9, 8           (group 1)
   *   f21      <- element offset   0              (group 0)
   * BOFFSET finishes at the value it had on entry to this block.
   * NOTE(review): LDFPD/LDFD/FMPY/FNMA/STFD are macros defined outside this
   * chunk.  The comments assume LDFPD x, y = [p] loads p[0] into x and p[1]
   * into y, and that FNMA d = a, b, c computes c - a * b - confirm.
   * NOTE(review): the first coefficient of each row is multiplied, never
   * divided by, so it is presumably a pre-inverted diagonal entry - verify. */
  3571. adds BOFFSET = 62 * SIZE, BOFFSET
  3572. ;;
  3573. LDFPD f33, f32 = [BOFFSET]
  3574. adds BOFFSET = - 2 * SIZE, BOFFSET
  3575. ;;
  3576. LDFPD f35, f34 = [BOFFSET]
  3577. adds BOFFSET = - 2 * SIZE, BOFFSET
  3578. ;;
  3579. LDFPD f37, f36 = [BOFFSET]
  3580. adds BOFFSET = - 2 * SIZE, BOFFSET
  3581. ;;
  3582. LDFPD f39, f38 = [BOFFSET]
  3583. adds BOFFSET = - 2 * SIZE, BOFFSET
  3584. ;;
  3585. LDFD f40 = [BOFFSET], -2 * SIZE
  3586. ;;
  3587. LDFPD f42, f41 = [BOFFSET]
  3588. adds BOFFSET = - 2 * SIZE, BOFFSET
  3589. ;;
  3590. LDFPD f44, f43 = [BOFFSET]
  3591. adds BOFFSET = - 2 * SIZE, BOFFSET
  3592. ;;
  3593. LDFPD f46, f45 = [BOFFSET]
  3594. adds BOFFSET = - 4 * SIZE, BOFFSET
  3595. ;;
  3596. LDFPD f48, f47 = [BOFFSET]
  3597. adds BOFFSET = - 2 * SIZE, BOFFSET
  3598. ;;
  3599. LDFPD f50, f49 = [BOFFSET]
  3600. adds BOFFSET = - 2 * SIZE, BOFFSET
  3601. ;;
  3602. LDFPD f52, f51 = [BOFFSET]
  3603. adds BOFFSET = - 4 * SIZE, BOFFSET
  3604. ;;
  3605. LDFD f53 = [BOFFSET], -2 * SIZE
  3606. ;;
  3607. LDFPD f55, f54 = [BOFFSET]
  3608. adds BOFFSET = - 2 * SIZE, BOFFSET
  3609. ;;
  3610. LDFPD f57, f56 = [BOFFSET]
  3611. adds BOFFSET = - 6 * SIZE, BOFFSET
  3612. ;;
  3613. LDFPD f59, f58 = [BOFFSET]
  3614. adds BOFFSET = - 2 * SIZE, BOFFSET
  3615. ;;
  3616. LDFPD f61, f60 = [BOFFSET]
  3617. adds BOFFSET = - 6 * SIZE, BOFFSET
  3618. ;;
  3619. LDFD f16 = [BOFFSET], -2 * SIZE
  3620. ;;
  3621. LDFPD f18, f17 = [BOFFSET]
  3622. adds BOFFSET = - 8 * SIZE, BOFFSET
  3623. ;;
  3624. LDFPD f20, f19 = [BOFFSET]
  3625. adds BOFFSET = - 8 * SIZE, BOFFSET
  3626. ;;
  3627. LDFD f21 = [BOFFSET]
  3628. ;;
  /* Group 7 (f120..f123): scale by f32, then remove it from groups 6..0
   * using f33..f39. */
  3629. FMPY f120 = f120, f32
  3630. FMPY f121 = f121, f32
  3631. FMPY f122 = f122, f32
  3632. FMPY f123 = f123, f32
  3633. ;;
  3634. FNMA f112 = f120, f33, f112
  3635. FNMA f113 = f121, f33, f113
  3636. FNMA f114 = f122, f33, f114
  3637. FNMA f115 = f123, f33, f115
  3638. ;;
  3639. FNMA f104 = f120, f34, f104
  3640. FNMA f105 = f121, f34, f105
  3641. FNMA f106 = f122, f34, f106
  3642. FNMA f107 = f123, f34, f107
  3643. ;;
  3644. FNMA f96 = f120, f35, f96
  3645. FNMA f97 = f121, f35, f97
  3646. FNMA f98 = f122, f35, f98
  3647. FNMA f99 = f123, f35, f99
  3648. ;;
  3649. FNMA f88 = f120, f36, f88
  3650. FNMA f89 = f121, f36, f89
  3651. FNMA f90 = f122, f36, f90
  3652. FNMA f91 = f123, f36, f91
  3653. ;;
  3654. FNMA f80 = f120, f37, f80
  3655. FNMA f81 = f121, f37, f81
  3656. FNMA f82 = f122, f37, f82
  3657. FNMA f83 = f123, f37, f83
  3658. ;;
  3659. FNMA f72 = f120, f38, f72
  3660. FNMA f73 = f121, f38, f73
  3661. FNMA f74 = f122, f38, f74
  3662. FNMA f75 = f123, f38, f75
  3663. ;;
  3664. FNMA f64 = f120, f39, f64
  3665. FNMA f65 = f121, f39, f65
  3666. FNMA f66 = f122, f39, f66
  3667. FNMA f67 = f123, f39, f67
  3668. ;;
  /* Group 6 (f112..f115): scale by f40, then remove it from groups 5..0
   * using f41..f46. */
  3669. FMPY f112 = f112, f40
  3670. FMPY f113 = f113, f40
  3671. FMPY f114 = f114, f40
  3672. FMPY f115 = f115, f40
  3673. ;;
  3674. FNMA f104 = f112, f41, f104
  3675. FNMA f105 = f113, f41, f105
  3676. FNMA f106 = f114, f41, f106
  3677. FNMA f107 = f115, f41, f107
  3678. ;;
  3679. FNMA f96 = f112, f42, f96
  3680. FNMA f97 = f113, f42, f97
  3681. FNMA f98 = f114, f42, f98
  3682. FNMA f99 = f115, f42, f99
  3683. ;;
  3684. FNMA f88 = f112, f43, f88
  3685. FNMA f89 = f113, f43, f89
  3686. FNMA f90 = f114, f43, f90
  3687. FNMA f91 = f115, f43, f91
  3688. ;;
  3689. FNMA f80 = f112, f44, f80
  3690. FNMA f81 = f113, f44, f81
  3691. FNMA f82 = f114, f44, f82
  3692. FNMA f83 = f115, f44, f83
  3693. ;;
  3694. FNMA f72 = f112, f45, f72
  3695. FNMA f73 = f113, f45, f73
  3696. FNMA f74 = f114, f45, f74
  3697. FNMA f75 = f115, f45, f75
  3698. ;;
  3699. FNMA f64 = f112, f46, f64
  3700. FNMA f65 = f113, f46, f65
  3701. FNMA f66 = f114, f46, f66
  3702. FNMA f67 = f115, f46, f67
  3703. ;;
  /* Group 5 (f104..f107): scale by f47, then remove it from groups 4..0
   * using f48..f52. */
  3704. FMPY f104 = f104, f47
  3705. FMPY f105 = f105, f47
  3706. FMPY f106 = f106, f47
  3707. FMPY f107 = f107, f47
  3708. ;;
  3709. FNMA f96 = f104, f48, f96
  3710. FNMA f97 = f105, f48, f97
  3711. FNMA f98 = f106, f48, f98
  3712. FNMA f99 = f107, f48, f99
  3713. ;;
  3714. FNMA f88 = f104, f49, f88
  3715. FNMA f89 = f105, f49, f89
  3716. FNMA f90 = f106, f49, f90
  3717. FNMA f91 = f107, f49, f91
  3718. ;;
  3719. FNMA f80 = f104, f50, f80
  3720. FNMA f81 = f105, f50, f81
  3721. FNMA f82 = f106, f50, f82
  3722. FNMA f83 = f107, f50, f83
  3723. ;;
  3724. FNMA f72 = f104, f51, f72
  3725. FNMA f73 = f105, f51, f73
  3726. FNMA f74 = f106, f51, f74
  3727. FNMA f75 = f107, f51, f75
  3728. ;;
  3729. FNMA f64 = f104, f52, f64
  3730. FNMA f65 = f105, f52, f65
  3731. FNMA f66 = f106, f52, f66
  3732. FNMA f67 = f107, f52, f67
  3733. ;;
  /* Group 4 (f96..f99): scale by f53, then remove it from groups 3..0
   * using f54..f57. */
  3734. FMPY f96 = f96, f53
  3735. FMPY f97 = f97, f53
  3736. FMPY f98 = f98, f53
  3737. FMPY f99 = f99, f53
  3738. ;;
  3739. FNMA f88 = f96, f54, f88
  3740. FNMA f89 = f97, f54, f89
  3741. FNMA f90 = f98, f54, f90
  3742. FNMA f91 = f99, f54, f91
  3743. ;;
  3744. FNMA f80 = f96, f55, f80
  3745. FNMA f81 = f97, f55, f81
  3746. FNMA f82 = f98, f55, f82
  3747. FNMA f83 = f99, f55, f83
  3748. ;;
  3749. FNMA f72 = f96, f56, f72
  3750. FNMA f73 = f97, f56, f73
  3751. FNMA f74 = f98, f56, f74
  3752. FNMA f75 = f99, f56, f75
  3753. ;;
  3754. FNMA f64 = f96, f57, f64
  3755. FNMA f65 = f97, f57, f65
  3756. FNMA f66 = f98, f57, f66
  3757. FNMA f67 = f99, f57, f67
  3758. ;;
  /* Group 3 (f88..f91): scale by f58, then remove it from groups 2..0
   * using f59..f61. */
  3759. FMPY f88 = f88, f58
  3760. FMPY f89 = f89, f58
  3761. FMPY f90 = f90, f58
  3762. FMPY f91 = f91, f58
  3763. ;;
  3764. FNMA f80 = f88, f59, f80
  3765. FNMA f81 = f89, f59, f81
  3766. FNMA f82 = f90, f59, f82
  3767. FNMA f83 = f91, f59, f83
  3768. ;;
  3769. FNMA f72 = f88, f60, f72
  3770. FNMA f73 = f89, f60, f73
  3771. FNMA f74 = f90, f60, f74
  3772. FNMA f75 = f91, f60, f75
  3773. ;;
  3774. FNMA f64 = f88, f61, f64
  3775. FNMA f65 = f89, f61, f65
  3776. FNMA f66 = f90, f61, f66
  3777. FNMA f67 = f91, f61, f67
  3778. ;;
  /* Group 2 (f80..f83): scale by f16, then remove it from groups 1..0
   * using f17, f18. */
  3779. FMPY f80 = f80, f16
  3780. FMPY f81 = f81, f16
  3781. FMPY f82 = f82, f16
  3782. FMPY f83 = f83, f16
  3783. ;;
  3784. FNMA f72 = f80, f17, f72
  3785. FNMA f73 = f81, f17, f73
  3786. FNMA f74 = f82, f17, f74
  3787. FNMA f75 = f83, f17, f75
  3788. ;;
  3789. FNMA f64 = f80, f18, f64
  3790. FNMA f65 = f81, f18, f65
  3791. FNMA f66 = f82, f18, f66
  3792. FNMA f67 = f83, f18, f67
  3793. ;;
  /* Group 1 (f72..f75): scale by f19, then remove it from group 0
   * using f20. */
  3794. FMPY f72 = f72, f19
  3795. FMPY f73 = f73, f19
  3796. FMPY f74 = f74, f19
  3797. FMPY f75 = f75, f19
  3798. ;;
  3799. FNMA f64 = f72, f20, f64
  3800. FNMA f65 = f73, f20, f65
  3801. FNMA f66 = f74, f20, f66
  3802. FNMA f67 = f75, f20, f67
  3803. ;;
  /* Group 0 (f64..f67): scale by f21. */
  3804. FMPY f64 = f64, f21
  3805. FMPY f65 = f65, f21
  3806. FMPY f66 = f66, f21
  3807. FMPY f67 = f67, f21
  3808. ;;
  /* Write-back through AOFFSET/AOFFSET2, highest groups first.  Both
   * pointers are advanced by 24 elements; AOFFSET then receives groups
   * 6, 4, 2, 0 and AOFFSET2 receives groups 7, 5, 3, 1.  Each pointer stores
   * four values and steps back 11, which places it 8 elements below the
   * start of the group just written.  The final group steps back 3 instead,
   * so both pointers end at the values they had before the +24 adjustment. */
  3809. adds AOFFSET = 24 * SIZE, AOFFSET
  3810. adds AOFFSET2 = 24 * SIZE, AOFFSET2
  3811. ;;
  3812. STFD [AOFFSET] = f112, SIZE
  3813. STFD [AOFFSET2] = f120, SIZE
  3814. ;;
  3815. STFD [AOFFSET] = f113, SIZE
  3816. STFD [AOFFSET2] = f121, SIZE
  3817. ;;
  3818. STFD [AOFFSET] = f114, SIZE
  3819. STFD [AOFFSET2] = f122, SIZE
  3820. ;;
  3821. STFD [AOFFSET] = f115, - 11 * SIZE
  3822. STFD [AOFFSET2] = f123, - 11 * SIZE
  3823. ;;
  3824. STFD [AOFFSET] = f96, SIZE
  3825. STFD [AOFFSET2] = f104, SIZE
  3826. ;;
  3827. STFD [AOFFSET] = f97, SIZE
  3828. STFD [AOFFSET2] = f105, SIZE
  3829. ;;
  3830. STFD [AOFFSET] = f98, SIZE
  3831. STFD [AOFFSET2] = f106, SIZE
  3832. ;;
  3833. STFD [AOFFSET] = f99, - 11 * SIZE
  3834. STFD [AOFFSET2] = f107, - 11 * SIZE
  3835. ;;
  3836. STFD [AOFFSET] = f80, SIZE
  3837. STFD [AOFFSET2] = f88, SIZE
  3838. ;;
  3839. STFD [AOFFSET] = f81, SIZE
  3840. STFD [AOFFSET2] = f89, SIZE
  3841. ;;
  3842. STFD [AOFFSET] = f82, SIZE
  3843. STFD [AOFFSET2] = f90, SIZE
  3844. ;;
  3845. STFD [AOFFSET] = f83, - 11 * SIZE
  3846. STFD [AOFFSET2] = f91, - 11 * SIZE
  3847. ;;
  3848. STFD [AOFFSET] = f64, SIZE
  3849. STFD [AOFFSET2] = f72, SIZE
  3850. ;;
  3851. STFD [AOFFSET] = f65, SIZE
  3852. STFD [AOFFSET2] = f73, SIZE
  3853. ;;
  3854. STFD [AOFFSET] = f66, SIZE
  3855. STFD [AOFFSET2] = f74, SIZE
  3856. ;;
  3857. STFD [AOFFSET] = f67, - 3 * SIZE
  3858. STFD [AOFFSET2] = f75, - 3 * SIZE
  3859. ;;
  3860. #endif
  /* Write the 4 x 8 result tile to memory through C1..C8 and update the
   * panel pointers and KK.
   * Register group k (f(64+8k)..f(67+8k)) goes to pointer C(k+1): four
   * consecutive stores per pointer.
   *   - Unless LN is defined, every store post-increments by SIZE, leaving
   *     the pointer four elements past where it started.
   *   - With LN defined, C3..C8 are first moved back four elements inside
   *     this block (C1 and C2 are not adjusted here) and the fourth store
   *     post-increments by -3 * SIZE, leaving the pointer on the first
   *     element it just wrote.
   * The first register of each group (f64, f72, ..., f120) is reset to f0
   * in the same bundle as the store that reads it.
   * The otherwise idle slots carry integer bookkeeping, annotated below.
   * NOTE(review): STFD, SIZE and BASE_SHIFT are defined outside this chunk;
   * BASE_SHIFT is presumably log2 of the element size - confirm. */
  3861. { .mmf
  3862. STFD [C1 ] = f64, SIZE
  3863. mov f64 = f0
  3864. }
  3865. ;;
  3866. { .mmi
  3867. STFD [C1 ] = f65, SIZE
  3868. }
  3869. ;;
  3870. { .mmi
  3871. STFD [C1 ] = f66, SIZE
  3872. #ifdef LN
  3873. adds C3 = -4 * SIZE, C3
  3874. #else
  3875. nop __LINE__
  3876. #endif
  3877. }
  3878. ;;
  3879. { .mmi
  3880. #ifndef LN
  3881. STFD [C1 ] = f67, SIZE
  3882. #else
  3883. STFD [C1 ] = f67, - 3 * SIZE
  3884. #endif
  3885. }
  3886. ;;
  3887. { .mmf
  3888. STFD [C2 ] = f72, SIZE
  3889. mov f72 = f0
  3890. }
  3891. ;;
  3892. { .mmi
  3893. STFD [C2 ] = f73, SIZE
  3894. #ifdef LN
  3895. adds C4 = -4 * SIZE, C4
  3896. #else
  3897. nop __LINE__
  3898. #endif
  3899. }
  3900. ;;
  3901. { .mmi
  3902. STFD [C2 ] = f74, SIZE
  3903. }
  3904. ;;
  3905. { .mmi
  3906. #ifndef LN
  3907. STFD [C2 ] = f75, SIZE
  3908. #else
  3909. STFD [C2 ] = f75, - 3 * SIZE
  3910. #endif
  3911. #ifdef LN
  3912. adds C5 = -4 * SIZE, C5
  3913. #else
  3914. nop __LINE__
  3915. #endif
  3916. }
  3917. ;;
  3918. { .mmf
  3919. STFD [C3 ] = f80, SIZE
  3920. mov f80 = f0
  3921. }
  3922. ;;
  3923. { .mmi
  3924. STFD [C3 ] = f81, SIZE
  3925. }
  3926. ;;
  3927. { .mmi
  3928. STFD [C3 ] = f82, SIZE
  3929. #ifdef LN
  3930. adds C6 = -4 * SIZE, C6
  3931. #else
  3932. nop __LINE__
  3933. #endif
  3934. }
  3935. ;;
  3936. { .mmi
  3937. #ifndef LN
  3938. STFD [C3 ] = f83, SIZE
  3939. #else
  3940. STFD [C3 ] = f83, - 3 * SIZE
  3941. #endif
  3942. }
  3943. ;;
  3944. { .mmf
  3945. STFD [C4 ] = f88, SIZE
  3946. mov f88 = f0
  3947. }
  3948. ;;
  /* Note the order: C8 is moved back here, C7 a few bundles further down. */
  3949. { .mmi
  3950. STFD [C4 ] = f89, SIZE
  3951. #ifdef LN
  3952. adds C8 = -4 * SIZE, C8
  3953. #else
  3954. nop __LINE__
  3955. #endif
  3956. }
  3957. ;;
  3958. { .mmi
  3959. STFD [C4 ] = f90, SIZE
  3960. }
  3961. ;;
  3962. { .mmi
  3963. #ifndef LN
  3964. STFD [C4 ] = f91, SIZE
  3965. #else
  3966. STFD [C4 ] = f91, - 3 * SIZE
  3967. #endif
  3968. nop __LINE__
  3969. }
  3970. ;;
  3971. { .mmf
  3972. STFD [C5 ] = f96, SIZE
  3973. mov f96 = f0
  3974. }
  3975. ;;
  3976. { .mmi
  3977. STFD [C5 ] = f97, SIZE
  3978. nop __LINE__
  3979. }
  3980. ;;
  3981. { .mmi
  3982. STFD [C5 ] = f98, SIZE
  3983. #ifdef LN
  3984. adds C7 = -4 * SIZE, C7
  3985. #else
  3986. nop __LINE__
  3987. #endif
  3988. }
  3989. ;;
  3990. { .mmi
  3991. #ifndef LN
  3992. STFD [C5 ] = f99, SIZE
  3993. #else
  3994. STFD [C5 ] = f99, - 3 * SIZE
  3995. #endif
  3996. }
  3997. ;;
  3998. { .mmf
  3999. STFD [C6 ] = f104, SIZE
  4000. mov f104 = f0
  4001. }
  4002. ;;
  /* r2 = K << BASE_SHIFT */
  4003. { .mmi
  4004. STFD [C6 ] = f105, SIZE
  4005. shladd r2 = K, BASE_SHIFT, r0
  4006. }
  4007. ;;
  /* L = K - KK */
  4008. { .mmi
  4009. STFD [C6 ] = f106, SIZE
  4010. sub L = K, KK
  4011. }
  4012. ;;
  /* RT only: AORIG += r2 * 4 */
  4013. { .mmi
  4014. #ifndef LN
  4015. STFD [C6 ] = f107, SIZE
  4016. #else
  4017. STFD [C6 ] = f107, - 3 * SIZE
  4018. #endif
  4019. #ifdef RT
  4020. shladd AORIG = r2, 2, AORIG
  4021. #else
  4022. nop __LINE__
  4023. #endif
  4024. }
  4025. ;;
  4026. { .mmf
  4027. STFD [C7 ] = f112, SIZE
  4028. mov f112 = f0
  4029. }
  4030. ;;
  /* LT/RN only: L = (K - KK) << BASE_SHIFT */
  4031. { .mmi
  4032. STFD [C7 ] = f113, SIZE
  4033. #if defined(LT) || defined(RN)
  4034. shladd L = L, BASE_SHIFT, r0
  4035. #else
  4036. nop __LINE__
  4037. #endif
  4038. }
  4039. ;;
  /* LT/RN only: AOFFSET += L * 4 */
  4040. { .mmi
  4041. STFD [C7 ] = f114, SIZE
  4042. #if defined(LT) || defined(RN)
  4043. shladd AOFFSET = L, 2, AOFFSET
  4044. #else
  4045. nop __LINE__
  4046. #endif
  4047. }
  4048. ;;
  /* LT/RN only: BOFFSET += L * 8 */
  4049. { .mmi
  4050. #ifndef LN
  4051. STFD [C7 ] = f115, SIZE
  4052. #else
  4053. STFD [C7 ] = f115, - 3 * SIZE
  4054. #endif
  4055. #if defined(LT) || defined(RN)
  4056. shladd BOFFSET = L, 3, BOFFSET
  4057. #else
  4058. nop __LINE__
  4059. #endif
  4060. }
  4061. ;;
  4062. { .mmf
  4063. STFD [C8 ] = f120, SIZE
  4064. mov f120 = f0
  4065. }
  4066. ;;
  /* KK += 4 for LT, KK -= 4 for LN, unchanged for RN/RT. */
  4067. { .mmi
  4068. STFD [C8 ] = f121, SIZE
  4069. #ifdef LT
  4070. adds KK = 4, KK
  4071. #elif defined LN
  4072. adds KK = -4, KK
  4073. #else
  4074. nop __LINE__
  4075. #endif
  4076. }
  4077. ;;
  /* Recompute L from the updated KK: L = KK for LT/RN, else L = K - KK. */
  4078. { .mmi
  4079. STFD [C8 ] = f122, SIZE
  4080. #if defined(LT) || defined(RN)
  4081. mov L = KK
  4082. #else
  4083. sub L = K, KK
  4084. #endif
  4085. }
  4086. ;;
  /* Last store of the tile; this .mmb bundle lists only the store. */
  4087. { .mmb
  4088. #ifndef LN
  4089. STFD [C8 ] = f123, SIZE
  4090. #else
  4091. STFD [C8 ] = f123, - 3 * SIZE
  4092. #endif
  4093. }
  4094. ;;
  4095. .align 8
  4096. .L010: // Dispatch on M: continue into the 8-wide code that follows only when M >= 8
  4097. { .mib
  4098. cmp.gt p6, p0 = 8, M // p6 = (8 > M)
  4099. shr I = M, 3 // I = M >> 3
  4100. (p6) br.cond.dpnt .L049 // M < 8: branch to .L049 (hinted as not taken)
  4101. }
  4102. ;;
  4103. .align 8
  4104. .L011: // Tile setup: position AOFFSET/BOFFSET, clear accumulators f64-f127, preload first operands, derive the loop count
  4105. { .mmi
  4106. cmp.ne p7, p0 = r0, L // p7 = (L != 0); guards every operand preload below. L presumably holds the k-step count on entry - confirm
  4107. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  4108. shl r2 = K, 3 + BASE_SHIFT // r2 = K << (3 + BASE_SHIFT)
  4109. }
  4110. ;;
  4111. { .mmi
  4112. shladd BOFFSET = r3, 3, B // BOFFSET = B + (r3 << 3)
  4113. sub AORIG = AORIG, r2 // AORIG -= r2
  4114. nop __LINE__
  4115. }
  4116. ;;
  4117. { .mmf
  4118. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // B operands preload into f48-f55; each load post-increments BOFFSET by 2 * SIZE (LDFPD is a macro defined outside this chunk)
  4119. setf.d f64 = r0 // r0 and f0 read as zero: this and every setf.d/mov below clear f64-f127
  4120. mov f72 = f0
  4121. }
  4122. { .mfi
  4123. setf.d f80 = r0
  4124. mov f88 = f0
  4125. shladd AOFFSET = r3, 3, AORIG // AOFFSET = AORIG + (r3 << 3), using the AORIG updated above
  4126. }
  4127. ;;
  4128. { .mmf
  4129. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // A operands preload into f32-f39; each load post-increments AOFFSET by 2 * SIZE
  4130. setf.d f96 = r0
  4131. mov f104 = f0
  4132. }
  4133. { .mfb
  4134. setf.d f112 = r0
  4135. mov f120 = f0
  4136. nop __LINE__
  4137. }
  4138. ;;
  4139. { .mmf
  4140. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4141. setf.d f65 = r0
  4142. mov f73 = f0
  4143. }
  4144. { .mfb
  4145. setf.d f89 = r0
  4146. mov f81 = f0
  4147. nop __LINE__
  4148. }
  4149. ;;
  4150. { .mmf
  4151. (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  4152. setf.d f97 = r0
  4153. mov f105 = f0
  4154. }
  4155. { .mfb
  4156. setf.d f113 = r0
  4157. mov f121 = f0
  4158. nop __LINE__
  4159. }
  4160. ;;
  4161. { .mmf
  4162. (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  4163. setf.d f66 = r0
  4164. mov f74 = f0
  4165. }
  4166. { .mfb
  4167. setf.d f82 = r0
  4168. mov f90 = f0
  4169. nop __LINE__
  4170. }
  4171. ;;
  4172. { .mmf
  4173. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4174. setf.d f98 = r0
  4175. mov f106 = f0
  4176. }
  4177. { .mfi
  4178. setf.d f114 = r0
  4179. mov f122 = f0
  4180. adds PREC = CPREFETCHSIZE * SIZE, C1 // PREC = C1 + CPREFETCHSIZE * SIZE
  4181. }
  4182. ;;
  4183. { .mmf
  4184. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4185. setf.d f67 = r0
  4186. mov f75 = f0
  4187. }
  4188. { .mfi
  4189. setf.d f83 = r0
  4190. mov f91 = f0
  4191. cmp.eq p3, p0 = r0, r0 // p3 = 1 (r0 == r0 always holds)
  4192. }
  4193. ;;
  4194. { .mmf
  4195. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4196. setf.d f99 = r0
  4197. mov f107 = f0
  4198. }
  4199. { .mfi
  4200. setf.d f115 = r0
  4201. mov f123 = f0
  4202. adds L = 1, L // L = L + 1
  4203. }
  4204. ;;
  4205. { .mmf
  4206. CPREFETCH [PREC], LDC // CPREFETCH is a macro defined outside this chunk; presumably prefetches at PREC and then advances PREC by LDC - confirm
  4207. setf.d f68 = r0
  4208. mov f76 = f0
  4209. }
  4210. { .mfi
  4211. setf.d f84 = r0
  4212. mov f92 = f0
  4213. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET // PREA = AOFFSET + (PREFETCHSIZE + 8) * SIZE
  4214. }
  4215. ;;
  4216. { .mmf
  4217. CPREFETCH [PREC], LDC
  4218. setf.d f100 = r0
  4219. mov f108 = f0
  4220. }
  4221. { .mfi
  4222. setf.d f116 = r0
  4223. mov f124 = f0
  4224. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET // PREB = BOFFSET + (PREFETCHSIZE - 8) * SIZE
  4225. }
  4226. ;;
  4227. { .mmf
  4228. CPREFETCH [PREC], LDC
  4229. setf.d f69 = r0
  4230. mov f77 = f0
  4231. }
  4232. { .mfi
  4233. setf.d f85 = r0
  4234. mov f93 = f0
  4235. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of the incremented L is clear), i.e. the entry value of L was odd
  4236. }
  4237. ;;
  4238. { .mmf
  4239. CPREFETCH [PREC], LDC
  4240. setf.d f101 = r0
  4241. mov f109 = f0
  4242. }
  4243. { .mfi
  4244. setf.d f117 = r0
  4245. mov f125 = f0
  4246. shr L = L, 1 // L = (entry L + 1) >> 1
  4247. }
  4248. ;;
  4249. { .mmf
  4250. CPREFETCH [PREC], LDC
  4251. setf.d f70 = r0
  4252. mov f78 = f0
  4253. }
  4254. { .mfi
  4255. setf.d f86 = r0
  4256. mov f94 = f0
  4257. adds L = -1, L // L = ((entry L + 1) >> 1) - 1
  4258. }
  4259. ;;
  4260. { .mmf
  4261. CPREFETCH [PREC], LDC
  4262. setf.d f102 = r0
  4263. mov f110 = f0
  4264. }
  4265. { .mfi
  4266. setf.d f118 = r0
  4267. mov f126 = f0
  4268. mov ar.lc = L // loop-count register consumed by br.cloop in .L012
  4269. }
  4270. ;;
  4271. { .mmf
  4272. CPREFETCH [PREC], LDC
  4273. setf.d f71 = r0
  4274. mov f79 = f0
  4275. }
  4276. { .mfi
  4277. setf.d f87 = r0
  4278. mov f95 = f0
  4279. cmp.eq p6, p0 = -1, L // p6 = (L == -1), which is the case when the entry value of L was 0
  4280. }
  4281. ;;
  4282. { .mmf
  4283. CPREFETCH [PREC]
  4284. setf.d f103 = r0
  4285. mov f111 = f0
  4286. }
  4287. { .mfb
  4288. setf.d f119 = r0
  4289. mov f127 = f0
  4290. (p6) br.cond.dpnt .L018 // no k-steps to run: skip the loop and go straight to .L018
  4291. }
  4292. ;;
  4293. .align 8
  4294. .L012: // Unrolled inner loop: each pass applies two k-steps of an 8x8 multiply-accumulate into f64-f127
  4295. /* 1 */
  4296. { .mfi // Layout seen below: A_i * B_j accumulates in f(64 + 8*(j-1) + (i-1)); FMA is a macro defined outside this chunk
  4297. lfetch.fault.nt1 [PREA], 16 * SIZE // prefetch through PREA, then PREA += 16 * SIZE
  4298. FMA f64 = f32, f48, f64 // A1 * B1
  4299. nop __LINE__
  4300. }
  4301. { .mfi
  4302. (p12) cmp.ne p3, p0 = 0, L // only when p12 is set: p3 = (L != 0), so the second k-step is skipped on the pass where L == 0
  4303. FMA f72 = f32, f49, f72 // A1 * B2
  4304. nop __LINE__
  4305. }
  4306. ;;
  4307. /* 2 */
  4308. { .mfb
  4309. lfetch.nt1 [PREB], 16 * SIZE // prefetch through PREB, then PREB += 16 * SIZE
  4310. FMA f80 = f32, f50, f80 // A1 * B3
  4311. nop __LINE__
  4312. }
  4313. { .mfb
  4314. cmp.ne p4, p5 = 0, L // p4 = (L != 0): guards the reload of the first-step operands; p5 is not read inside this loop
  4315. FMA f88 = f32, f51, f88 // A1 * B4
  4316. nop __LINE__
  4317. }
  4318. ;;
  4319. /* 3 */
  4320. { .mfb
  4321. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE // second k-step B operands are loaded into f56-f63
  4322. FMA f96 = f32, f52, f96 // A1 * B5
  4323. nop __LINE__
  4324. }
  4325. { .mfb
  4326. adds C9 = 4 * SIZE, C1 // C9..C16 = C1..C8 + 4 * SIZE (set here and in the next cycles); not read inside this loop
  4327. FMA f104 = f32, f53, f104 // A1 * B6
  4328. nop __LINE__
  4329. }
  4330. ;;
  4331. /* 4 */
  4332. { .mfb
  4333. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // second k-step A operands are loaded into f40-f47
  4334. FMA f112 = f32, f54, f112 // A1 * B7
  4335. nop __LINE__
  4336. }
  4337. { .mfb
  4338. adds C10 = 4 * SIZE, C2
  4339. FMA f120 = f32, f55, f120 // A1 * B8
  4340. nop __LINE__
  4341. }
  4342. ;;
  4343. /* 5 */
  4344. { .mfb
  4345. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  4346. FMA f65 = f33, f48, f65 // A2 * B1
  4347. nop __LINE__
  4348. }
  4349. { .mfb
  4350. adds C11 = 4 * SIZE, C3
  4351. FMA f73 = f33, f49, f73 // A2 * B2
  4352. nop __LINE__
  4353. }
  4354. ;;
  4355. /* 6 */
  4356. { .mfb
  4357. (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  4358. FMA f81 = f33, f50, f81 // A2 * B3
  4359. nop __LINE__
  4360. }
  4361. { .mfb
  4362. adds C12 = 4 * SIZE, C4
  4363. FMA f89 = f33, f51, f89 // A2 * B4
  4364. nop __LINE__
  4365. }
  4366. ;;
  4367. /* 7 */
  4368. { .mfb
  4369. (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  4370. FMA f97 = f33, f52, f97 // A2 * B5
  4371. nop __LINE__
  4372. }
  4373. { .mfb
  4374. adds C13 = 4 * SIZE, C5
  4375. FMA f105 = f33, f53, f105 // A2 * B6
  4376. nop __LINE__
  4377. }
  4378. ;;
  4379. /* 8 */
  4380. { .mfb
  4381. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  4382. FMA f113 = f33, f54, f113 // A2 * B7
  4383. nop __LINE__
  4384. }
  4385. { .mfb
  4386. adds C14 = 4 * SIZE, C6
  4387. FMA f121 = f33, f55, f121 // A2 * B8
  4388. nop __LINE__
  4389. }
  4390. ;;
  4391. /* 9 */
  4392. { .mfb
  4393. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  4394. FMA f66 = f34, f48, f66 // A3 * B1
  4395. nop __LINE__
  4396. }
  4397. { .mfb
  4398. adds C15 = 4 * SIZE, C7
  4399. FMA f74 = f34, f49, f74 // A3 * B2
  4400. nop __LINE__
  4401. }
  4402. ;;
  4403. /* 10 */
  4404. { .mfb
  4405. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  4406. FMA f82 = f34, f50, f82 // A3 * B3
  4407. nop __LINE__
  4408. }
  4409. { .mfb
  4410. adds C16 = 4 * SIZE, C8
  4411. FMA f90 = f34, f51, f90 // A3 * B4
  4412. nop __LINE__
  4413. }
  4414. ;;
  4415. /* 11 */
  4416. { .mfb
  4417. FMA f98 = f34, f52, f98 // A3 * B5
  4418. nop __LINE__ // NOTE(review): this bundle (and a few below) lists two instructions for a three-slot template; presumably the assembler pads the missing slot - confirm
  4419. }
  4420. { .mfb
  4421. nop __LINE__
  4422. FMA f106 = f34, f53, f106 // A3 * B6
  4423. nop __LINE__
  4424. }
  4425. ;;
  4426. /* 12 */
  4427. { .mfb
  4428. FMA f114 = f34, f54, f114 // A3 * B7
  4429. nop __LINE__
  4430. }
  4431. { .mfb
  4432. nop __LINE__
  4433. FMA f122 = f34, f55, f122 // A3 * B8
  4434. nop __LINE__
  4435. }
  4436. ;;
  4437. /* 13 */
  4438. { .mfb
  4439. nop __LINE__
  4440. FMA f67 = f35, f48, f67 // A4 * B1
  4441. }
  4442. { .mfb
  4443. nop __LINE__
  4444. FMA f75 = f35, f49, f75 // A4 * B2
  4445. nop __LINE__
  4446. }
  4447. ;;
  4448. /* 14 */
  4449. { .mfb
  4450. FMA f83 = f35, f50, f83 // A4 * B3
  4451. nop __LINE__
  4452. }
  4453. { .mfb
  4454. nop __LINE__
  4455. FMA f91 = f35, f51, f91 // A4 * B4
  4456. nop __LINE__
  4457. }
  4458. ;;
  4459. /* 15 */
  4460. { .mfb
  4461. FMA f99 = f35, f52, f99 // A4 * B5
  4462. nop __LINE__
  4463. }
  4464. { .mfb
  4465. nop __LINE__
  4466. FMA f107 = f35, f53, f107 // A4 * B6
  4467. nop __LINE__
  4468. }
  4469. ;;
  4470. /* 16 */
  4471. { .mfb
  4472. FMA f115 = f35, f54, f115 // A4 * B7
  4473. nop __LINE__
  4474. }
  4475. { .mfb
  4476. nop __LINE__
  4477. FMA f123 = f35, f55, f123 // A4 * B8
  4478. nop __LINE__
  4479. }
  4480. ;;
  4481. /* 17 */
  4482. { .mfb
  4483. nop __LINE__
  4484. FMA f68 = f36, f48, f68 // A5 * B1
  4485. nop __LINE__
  4486. }
  4487. { .mfb
  4488. nop __LINE__
  4489. FMA f76 = f36, f49, f76 // A5 * B2
  4490. nop __LINE__
  4491. }
  4492. ;;
  4493. /* 18 */
  4494. { .mfb
  4495. nop __LINE__
  4496. FMA f84 = f36, f50, f84 // A5 * B3
  4497. nop __LINE__
  4498. }
  4499. { .mfb
  4500. nop __LINE__
  4501. FMA f92 = f36, f51, f92 // A5 * B4
  4502. nop __LINE__
  4503. }
  4504. ;;
  4505. /* 19 */
  4506. { .mfb
  4507. nop __LINE__
  4508. FMA f100 = f36, f52, f100 // A5 * B5
  4509. nop __LINE__
  4510. }
  4511. { .mfb
  4512. nop __LINE__
  4513. FMA f108 = f36, f53, f108 // A5 * B6
  4514. nop __LINE__
  4515. }
  4516. ;;
  4517. /* 20 */
  4518. { .mfb
  4519. nop __LINE__
  4520. FMA f116 = f36, f54, f116 // A5 * B7
  4521. nop __LINE__
  4522. }
  4523. { .mfb
  4524. nop __LINE__
  4525. FMA f124 = f36, f55, f124 // A5 * B8
  4526. nop __LINE__
  4527. }
  4528. ;;
  4529. /* 21 */
  4530. { .mfb
  4531. nop __LINE__
  4532. FMA f69 = f37, f48, f69 // A6 * B1
  4533. nop __LINE__
  4534. }
  4535. { .mfb
  4536. nop __LINE__
  4537. FMA f77 = f37, f49, f77 // A6 * B2
  4538. nop __LINE__
  4539. }
  4540. ;;
  4541. /* 22 */
  4542. { .mfb
  4543. nop __LINE__
  4544. FMA f85 = f37, f50, f85 // A6 * B3
  4545. nop __LINE__
  4546. }
  4547. { .mfb
  4548. nop __LINE__
  4549. FMA f93 = f37, f51, f93 // A6 * B4
  4550. nop __LINE__
  4551. }
  4552. ;;
  4553. /* 23 */
  4554. { .mfb
  4555. nop __LINE__
  4556. FMA f101 = f37, f52, f101 // A6 * B5
  4557. nop __LINE__
  4558. }
  4559. { .mfb
  4560. nop __LINE__
  4561. FMA f109 = f37, f53, f109 // A6 * B6
  4562. nop __LINE__
  4563. }
  4564. ;;
  4565. /* 24 */
  4566. { .mfb
  4567. nop __LINE__
  4568. FMA f117 = f37, f54, f117 // A6 * B7
  4569. nop __LINE__
  4570. }
  4571. { .mfb
  4572. nop __LINE__
  4573. FMA f125 = f37, f55, f125 // A6 * B8
  4574. nop __LINE__
  4575. }
  4576. ;;
  4577. /* 25 */
  4578. { .mfb
  4579. nop __LINE__
  4580. FMA f70 = f38, f48, f70 // A7 * B1
  4581. nop __LINE__
  4582. }
  4583. { .mfb
  4584. nop __LINE__
  4585. FMA f78 = f38, f49, f78 // A7 * B2
  4586. nop __LINE__
  4587. }
  4588. ;;
  4589. /* 26 */
  4590. { .mfb
  4591. nop __LINE__
  4592. FMA f86 = f38, f50, f86 // A7 * B3
  4593. nop __LINE__
  4594. }
  4595. { .mfb
  4596. nop __LINE__
  4597. FMA f94 = f38, f51, f94 // A7 * B4
  4598. nop __LINE__
  4599. }
  4600. ;;
  4601. /* 27 */
  4602. { .mfb
  4603. nop __LINE__
  4604. FMA f102 = f38, f52, f102 // A7 * B5
  4605. nop __LINE__
  4606. }
  4607. { .mfb
  4608. nop __LINE__
  4609. FMA f110 = f38, f53, f110 // A7 * B6
  4610. nop __LINE__
  4611. }
  4612. ;;
  4613. /* 28 */
  4614. { .mfb
  4615. nop __LINE__
  4616. FMA f118 = f38, f54, f118 // A7 * B7
  4617. nop __LINE__
  4618. }
  4619. { .mfb
  4620. nop __LINE__
  4621. FMA f126 = f38, f55, f126 // A7 * B8
  4622. nop __LINE__
  4623. }
  4624. ;;
  4625. /* 29 */
  4626. { .mfb
  4627. nop __LINE__
  4628. FMA f71 = f39, f48, f71 // A8 * B1
  4629. nop __LINE__
  4630. }
  4631. { .mfb
  4632. nop __LINE__
  4633. FMA f79 = f39, f49, f79 // A8 * B2
  4634. nop __LINE__
  4635. }
  4636. ;;
  4637. /* 30 */
  4638. { .mfb
  4639. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // p4: reload first-step A operands (f32-f39) for the next pass
  4640. FMA f87 = f39, f50, f87 // A8 * B3
  4641. nop __LINE__
  4642. }
  4643. { .mfb
  4644. nop __LINE__
  4645. FMA f95 = f39, f51, f95 // A8 * B4
  4646. nop __LINE__
  4647. }
  4648. ;;
  4649. /* 31 */
  4650. { .mfb
  4651. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // p4: reload first-step B operands (f48-f55) for the next pass
  4652. FMA f103 = f39, f52, f103 // A8 * B5
  4653. nop __LINE__
  4654. }
  4655. { .mfb
  4656. nop __LINE__
  4657. FMA f111 = f39, f53, f111 // A8 * B6
  4658. nop __LINE__
  4659. }
  4660. ;;
  4661. /* 32 */
  4662. { .mfb
  4663. nop __LINE__
  4664. FMA f119 = f39, f54, f119 // A8 * B7
  4665. nop __LINE__
  4666. }
  4667. { .mfb
  4668. nop __LINE__
  4669. FMA f127 = f39, f55, f127 // A8 * B8
  4670. nop __LINE__
  4671. }
  4672. ;;
  4673. /* 33 */
  4674. { .mfb
  4675. nop __LINE__ // cycles 33-64: second k-step, operands f40-f47 and f56-f63, every FMA predicated on p3
  4676. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  4677. nop __LINE__
  4678. }
  4679. { .mfb
  4680. nop __LINE__
  4681. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  4682. nop __LINE__
  4683. }
  4684. ;;
  4685. /* 34 */
  4686. { .mfb
  4687. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  4688. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  4689. nop __LINE__
  4690. }
  4691. { .mfb
  4692. nop __LINE__
  4693. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  4694. nop __LINE__
  4695. }
  4696. ;;
  4697. /* 35 */
  4698. { .mfb
  4699. (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  4700. (p3) FMA f96 = f40, f60, f96 // A1 * B5
  4701. nop __LINE__
  4702. }
  4703. { .mfb
  4704. nop __LINE__
  4705. (p3) FMA f104 = f40, f61, f104 // A1 * B6
  4706. nop __LINE__
  4707. }
  4708. ;;
  4709. /* 36 */
  4710. { .mfb
  4711. (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  4712. (p3) FMA f112 = f40, f62, f112 // A1 * B7
  4713. nop __LINE__
  4714. }
  4715. { .mfb
  4716. nop __LINE__
  4717. (p3) FMA f120 = f40, f63, f120 // A1 * B8
  4718. nop __LINE__
  4719. }
  4720. ;;
  4721. /* 37 */
  4722. { .mfb
  4723. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  4724. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  4725. nop __LINE__
  4726. }
  4727. { .mfb
  4728. nop __LINE__
  4729. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  4730. nop __LINE__
  4731. }
  4732. ;;
  4733. /* 38 */
  4734. { .mfb
  4735. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  4736. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  4737. nop __LINE__
  4738. }
  4739. { .mfb
  4740. nop __LINE__
  4741. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  4742. nop __LINE__
  4743. }
  4744. ;;
  4745. /* 39 */
  4746. { .mfb
  4747. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  4748. (p3) FMA f97 = f41, f60, f97 // A2 * B5
  4749. nop __LINE__
  4750. }
  4751. { .mfb
  4752. nop __LINE__
  4753. (p3) FMA f105 = f41, f61, f105 // A2 * B6
  4754. nop __LINE__
  4755. }
  4756. ;;
  4757. /* 40 */
  4758. { .mfb
  4759. nop __LINE__
  4760. (p3) FMA f113 = f41, f62, f113 // A2 * B7
  4761. nop __LINE__
  4762. }
  4763. { .mfb
  4764. nop __LINE__
  4765. (p3) FMA f121 = f41, f63, f121 // A2 * B8
  4766. nop __LINE__
  4767. }
  4768. ;;
  4769. /* 41 */
  4770. { .mfb
  4771. nop __LINE__
  4772. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  4773. nop __LINE__
  4774. }
  4775. { .mfb
  4776. nop __LINE__
  4777. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  4778. nop __LINE__
  4779. }
  4780. ;;
  4781. /* 42 */
  4782. { .mfb
  4783. nop __LINE__
  4784. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  4785. nop __LINE__
  4786. }
  4787. { .mfb
  4788. nop __LINE__
  4789. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  4790. nop __LINE__
  4791. }
  4792. ;;
  4793. /* 43 */
  4794. { .mfb
  4795. nop __LINE__
  4796. (p3) FMA f98 = f42, f60, f98 // A3 * B5
  4797. nop __LINE__
  4798. }
  4799. { .mfb
  4800. nop __LINE__
  4801. (p3) FMA f106 = f42, f61, f106 // A3 * B6
  4802. nop __LINE__
  4803. }
  4804. ;;
  4805. /* 44 */
  4806. { .mfb
  4807. nop __LINE__
  4808. (p3) FMA f114 = f42, f62, f114 // A3 * B7
  4809. nop __LINE__
  4810. }
  4811. { .mfb
  4812. nop __LINE__
  4813. (p3) FMA f122 = f42, f63, f122 // A3 * B8
  4814. nop __LINE__
  4815. }
  4816. ;;
  4817. /* 45 */
  4818. { .mfb
  4819. nop __LINE__
  4820. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  4821. nop __LINE__
  4822. }
  4823. { .mfb
  4824. nop __LINE__
  4825. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  4826. nop __LINE__
  4827. }
  4828. ;;
  4829. /* 46 */
  4830. { .mfb
  4831. nop __LINE__
  4832. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  4833. nop __LINE__
  4834. }
  4835. { .mfb
  4836. nop __LINE__
  4837. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  4838. nop __LINE__
  4839. }
  4840. ;;
  4841. /* 47 */
  4842. { .mfb
  4843. nop __LINE__
  4844. (p3) FMA f99 = f43, f60, f99 // A4 * B5
  4845. nop __LINE__
  4846. }
  4847. { .mfb
  4848. nop __LINE__
  4849. (p3) FMA f107 = f43, f61, f107 // A4 * B6
  4850. nop __LINE__
  4851. }
  4852. ;;
  4853. /* 48 */
  4854. { .mfb
  4855. nop __LINE__
  4856. (p3) FMA f115 = f43, f62, f115 // A4 * B7
  4857. nop __LINE__
  4858. }
  4859. { .mfb
  4860. nop __LINE__
  4861. (p3) FMA f123 = f43, f63, f123 // A4 * B8
  4862. nop __LINE__
  4863. }
  4864. ;;
  4865. /* 49 */
  4866. { .mfb
  4867. nop __LINE__
  4868. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  4869. nop __LINE__
  4870. }
  4871. { .mfb
  4872. nop __LINE__
  4873. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  4874. nop __LINE__
  4875. }
  4876. ;;
  4877. /* 50 */
  4878. { .mfb
  4879. nop __LINE__
  4880. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  4881. nop __LINE__
  4882. }
  4883. { .mfb
  4884. nop __LINE__
  4885. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  4886. nop __LINE__
  4887. }
  4888. ;;
  4889. /* 51 */
  4890. { .mfb
  4891. nop __LINE__
  4892. (p3) FMA f100 = f44, f60, f100 // A5 * B5
  4893. nop __LINE__
  4894. }
  4895. { .mfb
  4896. nop __LINE__
  4897. (p3) FMA f108 = f44, f61, f108 // A5 * B6
  4898. nop __LINE__
  4899. }
  4900. ;;
  4901. /* 52 */
  4902. { .mfb
  4903. nop __LINE__
  4904. (p3) FMA f116 = f44, f62, f116 // A5 * B7
  4905. nop __LINE__
  4906. }
  4907. { .mfb
  4908. nop __LINE__
  4909. (p3) FMA f124 = f44, f63, f124 // A5 * B8
  4910. nop __LINE__
  4911. }
  4912. ;;
  4913. /* 53 */
  4914. { .mfb
  4915. nop __LINE__
  4916. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  4917. nop __LINE__
  4918. }
  4919. { .mfb
  4920. nop __LINE__
  4921. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  4922. nop __LINE__
  4923. }
  4924. ;;
  4925. /* 54 */
  4926. { .mfb
  4927. nop __LINE__
  4928. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  4929. nop __LINE__
  4930. }
  4931. { .mfb
  4932. nop __LINE__
  4933. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  4934. nop __LINE__
  4935. }
  4936. ;;
  4937. /* 55 */
  4938. { .mfb
  4939. nop __LINE__
  4940. (p3) FMA f101 = f45, f60, f101 // A6 * B5
  4941. nop __LINE__
  4942. }
  4943. { .mfb
  4944. nop __LINE__
  4945. (p3) FMA f109 = f45, f61, f109 // A6 * B6
  4946. nop __LINE__
  4947. }
  4948. ;;
  4949. /* 56 */
  4950. { .mfb
  4951. nop __LINE__
  4952. (p3) FMA f117 = f45, f62, f117 // A6 * B7
  4953. nop __LINE__
  4954. }
  4955. { .mfb
  4956. nop __LINE__
  4957. (p3) FMA f125 = f45, f63, f125 // A6 * B8
  4958. nop __LINE__
  4959. }
  4960. ;;
  4961. /* 57 */
  4962. { .mfb
  4963. nop __LINE__
  4964. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  4965. nop __LINE__
  4966. }
  4967. { .mfb
  4968. nop __LINE__
  4969. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  4970. nop __LINE__
  4971. }
  4972. ;;
  4973. /* 58 */
  4974. { .mfb
  4975. nop __LINE__
  4976. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  4977. nop __LINE__
  4978. }
  4979. { .mfb
  4980. nop __LINE__
  4981. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  4982. nop __LINE__
  4983. }
  4984. ;;
  4985. /* 59 */
  4986. { .mfb
  4987. nop __LINE__
  4988. (p3) FMA f102 = f46, f60, f102 // A7 * B5
  4989. nop __LINE__
  4990. }
  4991. { .mfb
  4992. nop __LINE__
  4993. (p3) FMA f110 = f46, f61, f110 // A7 * B6
  4994. nop __LINE__
  4995. }
  4996. ;;
  4997. /* 60 */
  4998. { .mfb
  4999. nop __LINE__
  5000. (p3) FMA f118 = f46, f62, f118 // A7 * B7
  5001. nop __LINE__
  5002. }
  5003. { .mfb
  5004. nop __LINE__
  5005. (p3) FMA f126 = f46, f63, f126 // A7 * B8
  5006. nop __LINE__
  5007. }
  5008. ;;
  5009. /* 61 */
  5010. { .mfb
  5011. nop __LINE__
  5012. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  5013. nop __LINE__
  5014. }
  5015. { .mfb
  5016. nop __LINE__
  5017. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  5018. nop __LINE__
  5019. }
  5020. ;;
  5021. /* 62 */
  5022. { .mfb
  5023. nop __LINE__
  5024. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  5025. nop __LINE__
  5026. }
  5027. { .mfb
  5028. nop __LINE__
  5029. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  5030. nop __LINE__
  5031. }
  5032. ;;
  5033. /* 63 */
  5034. { .mfb
  5035. nop __LINE__
  5036. (p3) FMA f103 = f47, f60, f103 // A8 * B5
  5037. nop __LINE__
  5038. }
  5039. { .mfb
  5040. nop __LINE__
  5041. (p3) FMA f111 = f47, f61, f111 // A8 * B6
  5042. nop __LINE__
  5043. }
  5044. ;;
  5045. /* 64 */
  5046. { .mfi
  5047. nop __LINE__
  5048. (p3) FMA f119 = f47, f62, f119 // A8 * B7
  5049. adds L = -1, L // L -= 1: general-register copy of the remaining-pass count tested by the cmp.ne instructions at the loop top
  5050. }
  5051. { .mfb
  5052. nop __LINE__
  5053. (p3) FMA f127 = f47, f63, f127 // A8 * B8
  5054. br.cloop.sptk.few .L012 // counted-loop branch driven by ar.lc
  5055. }
  5056. ;;
  5057. .L018:
  5058. adds r2 = -8, KK
  5059. ;;
  5060. shladd r2 = r2, BASE_SHIFT, r0
  5061. ;;
  5062. shladd AOFFSET = r2, 3, AORIG
  5063. shladd BOFFSET = r2, 3, B
  5064. ;;
  5065. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  5066. ;;
  5067. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  5068. ;;
  5069. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  5070. ;;
  5071. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  5072. ;;
  5073. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  5074. ;;
  5075. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  5076. ;;
  5077. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  5078. ;;
  5079. LDFPD f46, f47 = [BOFFSET], 2 * SIZE
  5080. ;;
  5081. { .mfi
  5082. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5083. FSUB f64 = f32, f64
  5084. nop __LINE__
  5085. }
  5086. { .mfi
  5087. nop __LINE__
  5088. FSUB f72 = f33, f72
  5089. nop __LINE__
  5090. }
  5091. ;;
  5092. { .mfi
  5093. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5094. FSUB f80 = f34, f80
  5095. nop __LINE__
  5096. }
  5097. { .mfi
  5098. nop __LINE__
  5099. FSUB f88 = f35, f88
  5100. nop __LINE__
  5101. }
  5102. ;;
  5103. { .mfi
  5104. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  5105. FSUB f96 = f36, f96
  5106. nop __LINE__
  5107. }
  5108. { .mfi
  5109. nop __LINE__
  5110. FSUB f104 = f37, f104
  5111. nop __LINE__
  5112. }
  5113. ;;
  5114. { .mfi
  5115. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  5116. FSUB f112 = f38, f112
  5117. nop __LINE__
  5118. }
  5119. { .mfi
  5120. nop __LINE__
  5121. FSUB f120 = f39, f120
  5122. nop __LINE__
  5123. }
  5124. ;;
  5125. { .mfi
  5126. LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5127. FSUB f65 = f40, f65
  5128. nop __LINE__
  5129. }
  5130. { .mfi
  5131. nop __LINE__
  5132. FSUB f73 = f41, f73
  5133. nop __LINE__
  5134. }
  5135. ;;
  5136. { .mfi
  5137. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5138. FSUB f81 = f42, f81
  5139. nop __LINE__
  5140. }
  5141. { .mfi
  5142. nop __LINE__
  5143. FSUB f89 = f43, f89
  5144. nop __LINE__
  5145. }
  5146. ;;
  5147. { .mfi
  5148. LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  5149. FSUB f97 = f44, f97
  5150. nop __LINE__
  5151. }
  5152. { .mfi
  5153. nop __LINE__
  5154. FSUB f105 = f45, f105
  5155. nop __LINE__
  5156. }
  5157. ;;
  5158. { .mfi
  5159. LDFPD f62, f63 = [BOFFSET], 2 * SIZE
  5160. FSUB f113 = f46, f113
  5161. nop __LINE__
  5162. }
  5163. { .mfi
  5164. nop __LINE__
  5165. FSUB f121 = f47, f121
  5166. nop __LINE__
  5167. }
  5168. ;;
  5169. { .mfi
  5170. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  5171. FSUB f66 = f48, f66
  5172. nop __LINE__
  5173. }
  5174. { .mfi
  5175. nop __LINE__
  5176. FSUB f74 = f49, f74
  5177. nop __LINE__
  5178. }
  5179. ;;
  5180. { .mfi
  5181. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  5182. FSUB f82 = f50, f82
  5183. nop __LINE__
  5184. }
  5185. { .mfi
  5186. nop __LINE__
  5187. FSUB f90 = f51, f90
  5188. nop __LINE__
  5189. }
  5190. ;;
  5191. { .mfi
  5192. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  5193. FSUB f98 = f52, f98
  5194. nop __LINE__
  5195. }
  5196. { .mfi
  5197. nop __LINE__
  5198. FSUB f106 = f53, f106
  5199. nop __LINE__
  5200. }
  5201. ;;
  5202. { .mfi
  5203. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  5204. FSUB f114 = f54, f114
  5205. nop __LINE__
  5206. }
  5207. { .mfi
  5208. nop __LINE__
  5209. FSUB f122 = f55, f122
  5210. nop __LINE__
  5211. }
  5212. ;;
  5213. { .mfi
  5214. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  5215. FSUB f67 = f56, f67
  5216. nop __LINE__
  5217. }
  5218. { .mfi
  5219. nop __LINE__
  5220. FSUB f75 = f57, f75
  5221. nop __LINE__
  5222. }
  5223. ;;
  5224. { .mfi
  5225. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  5226. FSUB f83 = f58, f83
  5227. nop __LINE__
  5228. }
  5229. { .mfi
  5230. nop __LINE__
  5231. FSUB f91 = f59, f91
  5232. nop __LINE__
  5233. }
  5234. ;;
  5235. { .mfi
  5236. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  5237. FSUB f99 = f60, f99
  5238. nop __LINE__
  5239. }
  5240. { .mfi
  5241. nop __LINE__
  5242. FSUB f107 = f61, f107
  5243. nop __LINE__
  5244. }
  5245. ;;
  5246. { .mfi
  5247. LDFPD f46, f47 = [BOFFSET], 2 * SIZE
  5248. FSUB f115 = f62, f115
  5249. nop __LINE__
  5250. }
  5251. { .mfi
  5252. nop __LINE__
  5253. FSUB f123 = f63, f123
  5254. nop __LINE__
  5255. }
  5256. ;;
  5257. { .mfi
  5258. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  5259. FSUB f68 = f32, f68
  5260. nop __LINE__
  5261. }
  5262. { .mfi
  5263. nop __LINE__
  5264. FSUB f76 = f33, f76
  5265. nop __LINE__
  5266. }
  5267. ;;
  5268. { .mfi
  5269. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  5270. FSUB f84 = f34, f84
  5271. nop __LINE__
  5272. }
  5273. { .mfi
  5274. nop __LINE__
  5275. FSUB f92 = f35, f92
  5276. nop __LINE__
  5277. }
  5278. ;;
  5279. { .mfi
  5280. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  5281. FSUB f100 = f36, f100
  5282. nop __LINE__
  5283. }
  5284. { .mfi
  5285. nop __LINE__
  5286. FSUB f108 = f37, f108
  5287. nop __LINE__
  5288. }
  5289. ;;
  5290. { .mfi
  5291. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  5292. FSUB f116 = f38, f116
  5293. nop __LINE__
  5294. }
  5295. { .mfi
  5296. nop __LINE__
  5297. FSUB f124 = f39, f124
  5298. nop __LINE__
  5299. }
  5300. ;;
  5301. { .mfi
  5302. LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  5303. FSUB f69 = f40, f69
  5304. nop __LINE__
  5305. }
  5306. { .mfi
  5307. nop __LINE__
  5308. FSUB f77 = f41, f77
  5309. nop __LINE__
  5310. }
  5311. ;;
  5312. { .mfi
  5313. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  5314. FSUB f85 = f42, f85
  5315. nop __LINE__
  5316. }
  5317. { .mfi
  5318. nop __LINE__
  5319. FSUB f93 = f43, f93
  5320. nop __LINE__
  5321. }
  5322. ;;
  5323. { .mfi
  5324. LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  5325. FSUB f101 = f44, f101
  5326. nop __LINE__
  5327. }
  5328. { .mfi
  5329. nop __LINE__
  5330. FSUB f109 = f45, f109
  5331. nop __LINE__
  5332. }
  5333. ;;
  5334. { .mfi
  5335. LDFPD f62, f63 = [BOFFSET]
  5336. FSUB f117 = f46, f117
  5337. adds BOFFSET = -62 * SIZE, BOFFSET
  5338. }
  5339. { .mfi
  5340. nop __LINE__
  5341. FSUB f125 = f47, f125
  5342. nop __LINE__
  5343. }
  5344. ;;
  5345. { .mfi
  5346. nop __LINE__
  5347. FSUB f70 = f48, f70
  5348. #ifdef LN
  5349. adds AOFFSET = 62 * SIZE, AOFFSET
  5350. #else
  5351. nop __LINE__
  5352. #endif
  5353. }
  5354. { .mfi
  5355. nop __LINE__
  5356. FSUB f78 = f49, f78
  5357. nop __LINE__
  5358. }
  5359. { .mfi
  5360. nop __LINE__
  5361. FSUB f86 = f50, f86
  5362. nop __LINE__
  5363. }
  5364. { .mfi
  5365. nop __LINE__
  5366. FSUB f94 = f51, f94
  5367. nop __LINE__
  5368. }
  5369. ;;
  5370. { .mfi
  5371. #ifdef LN
  5372. LDFPD f33, f32 = [AOFFSET]
  5373. #else
  5374. LDFPD f32, f33 = [AOFFSET]
  5375. #endif
  5376. FSUB f102 = f52, f102
  5377. nop __LINE__
  5378. }
  5379. { .mfi
  5380. nop __LINE__
  5381. FSUB f110 = f53, f110
  5382. nop __LINE__
  5383. }
  5384. { .mfi
  5385. nop __LINE__
  5386. FSUB f118 = f54, f118
  5387. nop __LINE__
  5388. }
  5389. { .mfi
  5390. nop __LINE__
  5391. FSUB f126 = f55, f126
  5392. #ifdef LN
  5393. adds AOFFSET = - 2 * SIZE, AOFFSET
  5394. #else
  5395. adds AOFFSET = 2 * SIZE, AOFFSET
  5396. #endif
  5397. }
  5398. ;;
  5399. { .mfi
  5400. nop __LINE__
  5401. FSUB f71 = f56, f71
  5402. nop __LINE__
  5403. }
  5404. { .mfi
  5405. nop __LINE__
  5406. FSUB f79 = f57, f79
  5407. nop __LINE__
  5408. }
  5409. { .mfi
  5410. nop __LINE__
  5411. FSUB f87 = f58, f87
  5412. nop __LINE__
  5413. }
  5414. { .mfi
  5415. nop __LINE__
  5416. FSUB f95 = f59, f95
  5417. nop __LINE__
  5418. }
  5419. { .mfi
  5420. nop __LINE__
  5421. FSUB f103 = f60, f103
  5422. nop __LINE__
  5423. }
  5424. { .mfi
  5425. nop __LINE__
  5426. FSUB f111 = f61, f111
  5427. nop __LINE__
  5428. }
  5429. { .mfi
  5430. nop __LINE__
  5431. FSUB f119 = f62, f119
  5432. nop __LINE__
  5433. }
  5434. { .mfi
  5435. nop __LINE__
  5436. FSUB f127 = f63, f127
  5437. nop __LINE__
  5438. }
  5439. ;;
  5440. { .mfi
  5441. LDFPD f35, f34 = [AOFFSET]
  5442. FMPY f71 = f71, f32
  5443. adds AOFFSET = - 2 * SIZE, AOFFSET
  5444. }
  5445. { .mfi
  5446. nop __LINE__
  5447. FMPY f103 = f103, f32
  5448. adds BOFFSET2 = 4 * SIZE, BOFFSET
  5449. }
  5450. ;;
  5451. { .mfi
  5452. LDFPD f37, f36 = [AOFFSET]
  5453. FMPY f79 = f79, f32
  5454. adds AOFFSET = - 2 * SIZE, AOFFSET
  5455. }
  5456. { .mfi
  5457. nop __LINE__
  5458. FMPY f111 = f111, f32
  5459. nop __LINE__
  5460. }
  5461. ;;
  5462. { .mfi
  5463. LDFPD f39, f38 = [AOFFSET]
  5464. FMPY f87 = f87, f32
  5465. adds AOFFSET = - 2 * SIZE, AOFFSET
  5466. }
  5467. { .mfi
  5468. nop __LINE__
  5469. FMPY f119 = f119, f32
  5470. nop __LINE__
  5471. }
  5472. ;;
  5473. { .mfi
  5474. LDFD f40 = [AOFFSET], -2 * SIZE
  5475. FMPY f95 = f95, f32
  5476. nop __LINE__
  5477. }
  5478. { .mfi
  5479. nop __LINE__
  5480. FMPY f127 = f127, f32
  5481. nop __LINE__
  5482. }
  5483. ;;
  5484. { .mfi
  5485. LDFPD f42, f41 = [AOFFSET]
  5486. FNMA f70 = f71, f33, f70
  5487. adds AOFFSET = - 2 * SIZE, AOFFSET
  5488. }
  5489. { .mfi
  5490. nop __LINE__
  5491. FNMA f102 = f103, f33, f102
  5492. nop __LINE__
  5493. }
  5494. ;;
  5495. { .mfi
  5496. LDFPD f44, f43 = [AOFFSET]
  5497. FNMA f78 = f79, f33, f78
  5498. adds AOFFSET = - 2 * SIZE, AOFFSET
  5499. }
  5500. { .mfi
  5501. nop __LINE__
  5502. FNMA f110 = f111, f33, f110
  5503. nop __LINE__
  5504. }
  5505. ;;
  5506. { .mfi
  5507. LDFPD f46, f45 = [AOFFSET]
  5508. FNMA f86 = f87, f33, f86
  5509. adds AOFFSET = - 4 * SIZE, AOFFSET
  5510. }
  5511. { .mfi
  5512. nop __LINE__
  5513. FNMA f118 = f119, f33, f118
  5514. nop __LINE__
  5515. }
  5516. ;;
  5517. { .mfi
  5518. LDFPD f48, f47 = [AOFFSET]
  5519. FNMA f94 = f95, f33, f94
  5520. adds AOFFSET = - 2 * SIZE, AOFFSET
  5521. }
  5522. { .mfi
  5523. nop __LINE__
  5524. FNMA f126 = f127, f33, f126
  5525. nop __LINE__
  5526. }
  5527. ;;
  5528. { .mfi
  5529. LDFPD f50, f49 = [AOFFSET]
  5530. FNMA f69 = f71, f34, f69
  5531. adds AOFFSET = - 2 * SIZE, AOFFSET
  5532. }
  5533. { .mfi
  5534. nop __LINE__
  5535. FNMA f101 = f103, f34, f101
  5536. nop __LINE__
  5537. }
  5538. ;;
  5539. { .mfi
  5540. LDFPD f52, f51 = [AOFFSET]
  5541. FNMA f77 = f79, f34, f77
  5542. adds AOFFSET = - 4 * SIZE, AOFFSET
  5543. }
  5544. { .mfi
  5545. nop __LINE__
  5546. FNMA f109 = f111, f34, f109
  5547. nop __LINE__
  5548. }
  5549. ;;
  5550. { .mfi
  5551. LDFD f53 = [AOFFSET], -2 * SIZE
  5552. FNMA f85 = f87, f34, f85
  5553. nop __LINE__
  5554. }
  5555. { .mfi
  5556. nop __LINE__
  5557. FNMA f117 = f119, f34, f117
  5558. nop __LINE__
  5559. }
  5560. ;;
  5561. { .mfi
  5562. LDFPD f55, f54 = [AOFFSET]
  5563. FNMA f93 = f95, f34, f93
  5564. adds AOFFSET = - 2 * SIZE, AOFFSET
  5565. }
  5566. { .mfi
  5567. nop __LINE__
  5568. FNMA f125 = f127, f34, f125
  5569. nop __LINE__
  5570. }
  5571. ;;
  5572. { .mfi
  5573. LDFPD f57, f56 = [AOFFSET]
  5574. FNMA f68 = f71, f35, f68
  5575. adds AOFFSET = - 6 * SIZE, AOFFSET
  5576. }
  5577. { .mfi
  5578. nop __LINE__
  5579. FNMA f100 = f103, f35, f100
  5580. nop __LINE__
  5581. }
  5582. ;;
  5583. { .mfi
  5584. LDFPD f59, f58 = [AOFFSET]
  5585. FNMA f76 = f79, f35, f76
  5586. adds AOFFSET = - 2 * SIZE, AOFFSET
  5587. }
  5588. { .mfi
  5589. nop __LINE__
  5590. FNMA f108 = f111, f35, f108
  5591. nop __LINE__
  5592. }
  5593. ;;
  5594. { .mfi
  5595. LDFPD f61, f60 = [AOFFSET]
  5596. FNMA f84 = f87, f35, f84
  5597. adds AOFFSET = - 6 * SIZE, AOFFSET
  5598. }
  5599. { .mfi
  5600. nop __LINE__
  5601. FNMA f116 = f119, f35, f116
  5602. nop __LINE__
  5603. }
  5604. ;;
  5605. { .mfi
  5606. LDFD f16 = [AOFFSET], -2 * SIZE
  5607. FNMA f92 = f95, f35, f92
  5608. nop __LINE__
  5609. }
  5610. { .mfi
  5611. nop __LINE__
  5612. FNMA f124 = f127, f35, f124
  5613. nop __LINE__
  5614. }
  5615. ;;
  5616. { .mfi
  5617. LDFPD f18, f17 = [AOFFSET]
  5618. FNMA f67 = f71, f36, f67
  5619. adds AOFFSET = - 8 * SIZE, AOFFSET
  5620. }
  5621. { .mfi
  5622. nop __LINE__
  5623. FNMA f99 = f103, f36, f99
  5624. nop __LINE__
  5625. }
  5626. ;;
  5627. { .mfi
  5628. LDFPD f20, f19 = [AOFFSET]
  5629. FNMA f75 = f79, f36, f75
  5630. adds AOFFSET = - 8 * SIZE, AOFFSET
  5631. }
  5632. { .mfi
  5633. nop __LINE__
  5634. FNMA f107 = f111, f36, f107
  5635. nop __LINE__
  5636. }
  5637. ;;
  5638. { .mfi
  5639. LDFD f21 = [AOFFSET]
  5640. FNMA f83 = f87, f36, f83
  5641. adds BOFFSET = 56 * SIZE, BOFFSET
  5642. }
  5643. { .mfi
  5644. FNMA f115 = f119, f36, f115
  5645. adds BOFFSET2 = 56 * SIZE, BOFFSET2
  5646. }
  5647. ;;
  5648. FNMA f91 = f95, f36, f91
  5649. FNMA f123 = f127, f36, f123
  5650. ;;
  5651. FNMA f66 = f71, f37, f66
  5652. FNMA f98 = f103, f37, f98
  5653. FNMA f74 = f79, f37, f74
  5654. FNMA f106 = f111, f37, f106
  5655. FNMA f82 = f87, f37, f82
  5656. FNMA f114 = f119, f37, f114
  5657. FNMA f90 = f95, f37, f90
  5658. FNMA f122 = f127, f37, f122
  5659. ;;
  5660. FNMA f65 = f71, f38, f65
  5661. FNMA f97 = f103, f38, f97
  5662. FNMA f73 = f79, f38, f73
  5663. FNMA f105 = f111, f38, f105
  5664. FNMA f81 = f87, f38, f81
  5665. FNMA f113 = f119, f38, f113
  5666. FNMA f89 = f95, f38, f89
  5667. FNMA f121 = f127, f38, f121
  5668. ;;
  5669. FNMA f64 = f71, f39, f64
  5670. FNMA f96 = f103, f39, f96
  5671. FNMA f72 = f79, f39, f72
  5672. FNMA f104 = f111, f39, f104
  5673. FNMA f80 = f87, f39, f80
  5674. FNMA f112 = f119, f39, f112
  5675. FNMA f88 = f95, f39, f88
  5676. FNMA f120 = f127, f39, f120
  5677. ;;
  5678. FMPY f70 = f70, f40
  5679. FMPY f102 = f102, f40
  5680. FMPY f78 = f78, f40
  5681. FMPY f110 = f110, f40
  5682. FMPY f86 = f86, f40
  5683. FMPY f118 = f118, f40
  5684. FMPY f94 = f94, f40
  5685. FMPY f126 = f126, f40
  5686. ;;
  5687. FNMA f69 = f70, f41, f69
  5688. FNMA f101 = f102, f41, f101
  5689. FNMA f77 = f78, f41, f77
  5690. FNMA f109 = f110, f41, f109
  5691. FNMA f85 = f86, f41, f85
  5692. FNMA f117 = f118, f41, f117
  5693. FNMA f93 = f94, f41, f93
  5694. FNMA f125 = f126, f41, f125
  5695. ;;
  5696. FNMA f68 = f70, f42, f68
  5697. FNMA f100 = f102, f42, f100
  5698. FNMA f76 = f78, f42, f76
  5699. FNMA f108 = f110, f42, f108
  5700. FNMA f84 = f86, f42, f84
  5701. FNMA f116 = f118, f42, f116
  5702. FNMA f92 = f94, f42, f92
  5703. FNMA f124 = f126, f42, f124
  5704. ;;
  5705. FNMA f67 = f70, f43, f67
  5706. FNMA f99 = f102, f43, f99
  5707. FNMA f75 = f78, f43, f75
  5708. FNMA f107 = f110, f43, f107
  5709. FNMA f83 = f86, f43, f83
  5710. FNMA f115 = f118, f43, f115
  5711. FNMA f91 = f94, f43, f91
  5712. FNMA f123 = f126, f43, f123
  5713. ;;
  5714. FNMA f66 = f70, f44, f66
  5715. FNMA f98 = f102, f44, f98
  5716. FNMA f74 = f78, f44, f74
  5717. FNMA f106 = f110, f44, f106
  5718. FNMA f82 = f86, f44, f82
  5719. FNMA f114 = f118, f44, f114
  5720. FNMA f90 = f94, f44, f90
  5721. FNMA f122 = f126, f44, f122
  5722. ;;
  5723. FNMA f65 = f70, f45, f65
  5724. FNMA f97 = f102, f45, f97
  5725. FNMA f73 = f78, f45, f73
  5726. FNMA f105 = f110, f45, f105
  5727. FNMA f81 = f86, f45, f81
  5728. FNMA f113 = f118, f45, f113
  5729. FNMA f89 = f94, f45, f89
  5730. FNMA f121 = f126, f45, f121
  5731. ;;
  5732. FNMA f64 = f70, f46, f64
  5733. FNMA f96 = f102, f46, f96
  5734. FNMA f72 = f78, f46, f72
  5735. FNMA f104 = f110, f46, f104
  5736. FNMA f80 = f86, f46, f80
  5737. FNMA f112 = f118, f46, f112
  5738. FNMA f88 = f94, f46, f88
  5739. FNMA f120 = f126, f46, f120
  5740. ;;
  5741. FMPY f69 = f69, f47
  5742. FMPY f101 = f101, f47
  5743. FMPY f77 = f77, f47
  5744. FMPY f109 = f109, f47
  5745. FMPY f85 = f85, f47
  5746. FMPY f117 = f117, f47
  5747. FMPY f93 = f93, f47
  5748. FMPY f125 = f125, f47
  5749. ;;
  5750. FNMA f68 = f69, f48, f68
  5751. FNMA f100 = f101, f48, f100
  5752. FNMA f76 = f77, f48, f76
  5753. FNMA f108 = f109, f48, f108
  5754. FNMA f84 = f85, f48, f84
  5755. FNMA f116 = f117, f48, f116
  5756. FNMA f92 = f93, f48, f92
  5757. FNMA f124 = f125, f48, f124
  5758. ;;
  5759. FNMA f67 = f69, f49, f67
  5760. FNMA f99 = f101, f49, f99
  5761. FNMA f75 = f77, f49, f75
  5762. FNMA f107 = f109, f49, f107
  5763. FNMA f83 = f85, f49, f83
  5764. FNMA f115 = f117, f49, f115
  5765. FNMA f91 = f93, f49, f91
  5766. FNMA f123 = f125, f49, f123
  5767. ;;
  5768. FNMA f66 = f69, f50, f66
  5769. FNMA f98 = f101, f50, f98
  5770. FNMA f74 = f77, f50, f74
  5771. FNMA f106 = f109, f50, f106
  5772. FNMA f82 = f85, f50, f82
  5773. FNMA f114 = f117, f50, f114
  5774. FNMA f90 = f93, f50, f90
  5775. FNMA f122 = f125, f50, f122
  5776. ;;
  5777. FNMA f65 = f69, f51, f65
  5778. FNMA f97 = f101, f51, f97
  5779. FNMA f73 = f77, f51, f73
  5780. FNMA f105 = f109, f51, f105
  5781. FNMA f81 = f85, f51, f81
  5782. FNMA f113 = f117, f51, f113
  5783. FNMA f89 = f93, f51, f89
  5784. FNMA f121 = f125, f51, f121
  5785. ;;
  5786. FNMA f64 = f69, f52, f64
  5787. FNMA f96 = f101, f52, f96
  5788. FNMA f72 = f77, f52, f72
  5789. FNMA f104 = f109, f52, f104
  5790. FNMA f80 = f85, f52, f80
  5791. FNMA f112 = f117, f52, f112
  5792. FNMA f88 = f93, f52, f88
  5793. FNMA f120 = f125, f52, f120
  5794. ;;
  5795. FMPY f68 = f68, f53
  5796. FMPY f100 = f100, f53
  5797. FMPY f76 = f76, f53
  5798. FMPY f108 = f108, f53
  5799. FMPY f84 = f84, f53
  5800. FMPY f116 = f116, f53
  5801. FMPY f92 = f92, f53
  5802. FMPY f124 = f124, f53
  5803. ;;
  5804. FNMA f67 = f68, f54, f67
  5805. FNMA f99 = f100, f54, f99
  5806. FNMA f75 = f76, f54, f75
  5807. FNMA f107 = f108, f54, f107
  5808. FNMA f83 = f84, f54, f83
  5809. FNMA f115 = f116, f54, f115
  5810. FNMA f91 = f92, f54, f91
  5811. FNMA f123 = f124, f54, f123
  5812. ;;
  5813. FNMA f66 = f68, f55, f66
  5814. FNMA f98 = f100, f55, f98
  5815. FNMA f74 = f76, f55, f74
  5816. FNMA f106 = f108, f55, f106
  5817. FNMA f82 = f84, f55, f82
  5818. FNMA f114 = f116, f55, f114
  5819. FNMA f90 = f92, f55, f90
  5820. FNMA f122 = f124, f55, f122
  5821. ;;
  5822. FNMA f65 = f68, f56, f65
  5823. FNMA f97 = f100, f56, f97
  5824. FNMA f73 = f76, f56, f73
  5825. FNMA f105 = f108, f56, f105
  5826. FNMA f81 = f84, f56, f81
  5827. FNMA f113 = f116, f56, f113
  5828. FNMA f89 = f92, f56, f89
  5829. FNMA f121 = f124, f56, f121
  5830. ;;
  5831. FNMA f64 = f68, f57, f64
  5832. FNMA f96 = f100, f57, f96
  5833. FNMA f72 = f76, f57, f72
  5834. FNMA f104 = f108, f57, f104
  5835. FNMA f80 = f84, f57, f80
  5836. FNMA f112 = f116, f57, f112
  5837. FNMA f88 = f92, f57, f88
  5838. FNMA f120 = f124, f57, f120
  5839. ;;
  5840. FMPY f67 = f67, f58
  5841. FMPY f99 = f99, f58
  5842. FMPY f75 = f75, f58
  5843. FMPY f107 = f107, f58
  5844. FMPY f83 = f83, f58
  5845. FMPY f115 = f115, f58
  5846. FMPY f91 = f91, f58
  5847. FMPY f123 = f123, f58
  5848. ;;
  5849. FNMA f66 = f67, f59, f66
  5850. FNMA f98 = f99, f59, f98
  5851. FNMA f74 = f75, f59, f74
  5852. FNMA f106 = f107, f59, f106
  5853. FNMA f82 = f83, f59, f82
  5854. FNMA f114 = f115, f59, f114
  5855. FNMA f90 = f91, f59, f90
  5856. FNMA f122 = f123, f59, f122
  5857. ;;
  5858. FNMA f65 = f67, f60, f65
  5859. FNMA f97 = f99, f60, f97
  5860. FNMA f73 = f75, f60, f73
  5861. FNMA f105 = f107, f60, f105
  5862. FNMA f81 = f83, f60, f81
  5863. FNMA f113 = f115, f60, f113
  5864. FNMA f89 = f91, f60, f89
  5865. FNMA f121 = f123, f60, f121
  5866. ;;
  5867. { .mfi
  5868. STFD [BOFFSET] = f71, SIZE
  5869. FNMA f64 = f67, f61, f64
  5870. }
  5871. { .mfi
  5872. STFD [BOFFSET2] = f103, SIZE
  5873. FNMA f96 = f99, f61, f96
  5874. }
  5875. ;;
  5876. { .mfi
  5877. STFD [BOFFSET] = f79, SIZE
  5878. FNMA f72 = f75, f61, f72
  5879. }
  5880. { .mfi
  5881. STFD [BOFFSET2] = f111, SIZE
  5882. FNMA f104 = f107, f61, f104
  5883. }
  5884. ;;
  5885. { .mfi
  5886. STFD [BOFFSET] = f87, SIZE
  5887. FNMA f80 = f83, f61, f80
  5888. }
  5889. { .mfi
  5890. STFD [BOFFSET2] = f119, SIZE
  5891. FNMA f112 = f115, f61, f112
  5892. }
  5893. ;;
  5894. { .mfi
  5895. STFD [BOFFSET] = f95, - 11 * SIZE
  5896. FNMA f88 = f91, f61, f88
  5897. }
  5898. { .mfi
  5899. STFD [BOFFSET2] = f127, - 11 * SIZE
  5900. FNMA f120 = f123, f61, f120
  5901. }
  5902. ;;
  5903. { .mfi
  5904. STFD [BOFFSET] = f70, SIZE
  5905. FMPY f66 = f66, f16
  5906. }
  5907. { .mfi
  5908. STFD [BOFFSET2] = f102, SIZE
  5909. FMPY f98 = f98, f16
  5910. }
  5911. ;;
  5912. { .mfi
  5913. STFD [BOFFSET] = f78, SIZE
  5914. FMPY f74 = f74, f16
  5915. }
  5916. { .mfi
  5917. STFD [BOFFSET2] = f110, SIZE
  5918. FMPY f106 = f106, f16
  5919. }
  5920. ;;
  5921. { .mfi
  5922. STFD [BOFFSET] = f86, SIZE
  5923. FMPY f82 = f82, f16
  5924. }
  5925. { .mfi
  5926. STFD [BOFFSET2] = f118, SIZE
  5927. FMPY f114 = f114, f16
  5928. }
  5929. ;;
  5930. { .mfi
  5931. STFD [BOFFSET] = f94, - 11 * SIZE
  5932. FMPY f90 = f90, f16
  5933. }
  5934. { .mfi
  5935. STFD [BOFFSET2] = f126, - 11 * SIZE
  5936. FMPY f122 = f122, f16
  5937. }
  5938. ;;
  5939. { .mfi
  5940. STFD [BOFFSET] = f69, SIZE
  5941. FNMA f65 = f66, f17, f65
  5942. }
  5943. { .mfi
  5944. STFD [BOFFSET2] = f101, SIZE
  5945. FNMA f97 = f98, f17, f97
  5946. }
  5947. ;;
  5948. { .mfi
  5949. STFD [BOFFSET] = f77, SIZE
  5950. FNMA f73 = f74, f17, f73
  5951. }
  5952. { .mfi
  5953. STFD [BOFFSET2] = f109, SIZE
  5954. FNMA f105 = f106, f17, f105
  5955. }
  5956. ;;
  5957. { .mfi
  5958. STFD [BOFFSET] = f85, SIZE
  5959. FNMA f81 = f82, f17, f81
  5960. }
  5961. { .mfi
  5962. STFD [BOFFSET2] = f117, SIZE
  5963. FNMA f113 = f114, f17, f113
  5964. }
  5965. ;;
  5966. { .mfi
  5967. STFD [BOFFSET] = f93, - 11 * SIZE
  5968. FNMA f89 = f90, f17, f89
  5969. }
  5970. { .mfi
  5971. STFD [BOFFSET2] = f125, - 11 * SIZE
  5972. FNMA f121 = f122, f17, f121
  5973. }
  5974. ;;
  5975. { .mfi
  5976. STFD [BOFFSET] = f68, SIZE
  5977. FNMA f64 = f66, f18, f64
  5978. }
  5979. { .mfi
  5980. STFD [BOFFSET2] = f100, SIZE
  5981. FNMA f96 = f98, f18, f96
  5982. }
  5983. ;;
  5984. { .mfi
  5985. STFD [BOFFSET] = f76, SIZE
  5986. FNMA f72 = f74, f18, f72
  5987. }
  5988. { .mfi
  5989. STFD [BOFFSET2] = f108, SIZE
  5990. FNMA f104 = f106, f18, f104
  5991. }
  5992. ;;
  5993. { .mfi
  5994. STFD [BOFFSET] = f84, SIZE
  5995. FNMA f80 = f82, f18, f80
  5996. }
  5997. { .mfi
  5998. STFD [BOFFSET2] = f116, SIZE
  5999. FNMA f112 = f114, f18, f112
  6000. }
  6001. ;;
  6002. { .mfi
  6003. STFD [BOFFSET] = f92, - 11 * SIZE
  6004. FNMA f88 = f90, f18, f88
  6005. }
  6006. { .mfi
  6007. STFD [BOFFSET2] = f124, - 11 * SIZE
  6008. FNMA f120 = f122, f18, f120
  6009. }
  6010. ;;
  6011. { .mfi
  6012. STFD [BOFFSET] = f67, SIZE
  6013. FMPY f65 = f65, f19
  6014. }
  6015. { .mfi
  6016. STFD [BOFFSET2] = f99, SIZE
  6017. FMPY f97 = f97, f19
  6018. }
  6019. ;;
  6020. { .mfi
  6021. STFD [BOFFSET] = f75, SIZE
  6022. FMPY f73 = f73, f19
  6023. }
  6024. { .mfi
  6025. STFD [BOFFSET2] = f107, SIZE
  6026. FMPY f105 = f105, f19
  6027. }
  6028. ;;
  6029. { .mfi
  6030. STFD [BOFFSET] = f83, SIZE
  6031. FMPY f81 = f81, f19
  6032. }
  6033. { .mfi
  6034. STFD [BOFFSET2] = f115, SIZE
  6035. FMPY f113 = f113, f19
  6036. }
  6037. ;;
  6038. { .mfi
  6039. STFD [BOFFSET] = f91, - 11 * SIZE
  6040. FMPY f89 = f89, f19
  6041. }
  6042. { .mfi
  6043. STFD [BOFFSET2] = f123, - 11 * SIZE
  6044. FMPY f121 = f121, f19
  6045. }
  6046. ;;
  6047. { .mfi
  6048. STFD [BOFFSET] = f66, SIZE
  6049. FNMA f64 = f65, f20, f64
  6050. }
  6051. { .mfi
  6052. STFD [BOFFSET2] = f98, SIZE
  6053. FNMA f96 = f97, f20, f96
  6054. }
  6055. ;;
  6056. { .mfi
  6057. STFD [BOFFSET] = f74, SIZE
  6058. FNMA f72 = f73, f20, f72
  6059. }
  6060. { .mfi
  6061. STFD [BOFFSET2] = f106, SIZE
  6062. FNMA f104 = f105, f20, f104
  6063. }
  6064. ;;
  6065. { .mfi
  6066. STFD [BOFFSET] = f82, SIZE
  6067. FNMA f80 = f81, f20, f80
  6068. }
  6069. { .mfi
  6070. STFD [BOFFSET2] = f114, SIZE
  6071. FNMA f112 = f113, f20, f112
  6072. }
  6073. ;;
  6074. { .mfi
  6075. STFD [BOFFSET] = f90, -11 * SIZE
  6076. FNMA f88 = f89, f20, f88
  6077. }
  6078. { .mfi
  6079. STFD [BOFFSET2] = f122, -11 * SIZE
  6080. FNMA f120 = f121, f20, f120
  6081. }
  6082. ;;
  6083. { .mfi
  6084. STFD [BOFFSET] = f65, SIZE
  6085. FMPY f64 = f64, f21
  6086. }
  6087. { .mfi
  6088. STFD [BOFFSET2] = f97, SIZE
  6089. FMPY f96 = f96, f21
  6090. }
  6091. ;;
  6092. { .mfi
  6093. STFD [BOFFSET] = f73, SIZE
  6094. FMPY f72 = f72, f21
  6095. }
  6096. { .mfi
  6097. STFD [BOFFSET2] = f105, SIZE
  6098. FMPY f104 = f104, f21
  6099. }
  6100. ;;
  6101. { .mfi
  6102. STFD [BOFFSET] = f81, SIZE
  6103. FMPY f80 = f80, f21
  6104. }
  6105. { .mfi
  6106. STFD [BOFFSET2] = f113, SIZE
  6107. FMPY f112 = f112, f21
  6108. }
  6109. ;;
  6110. { .mfi
  6111. STFD [BOFFSET] = f89, - 11 * SIZE
  6112. FMPY f88 = f88, f21
  6113. }
  6114. { .mfi
  6115. STFD [BOFFSET2] = f121, - 11 * SIZE
  6116. FMPY f120 = f120, f21
  6117. }
  6118. ;;
  6119. { .mmi
  6120. STFD [BOFFSET] = f64, SIZE
  6121. STFD [BOFFSET2] = f96, SIZE
  6122. adds C1 = -8 * SIZE, C1
  6123. }
  6124. ;;
  6125. { .mmi
  6126. STFD [BOFFSET] = f72, SIZE
  6127. STFD [BOFFSET2] = f104, SIZE
  6128. adds C2 = -8 * SIZE, C2
  6129. }
  6130. ;;
  6131. { .mmi
  6132. STFD [BOFFSET] = f80, SIZE
  6133. STFD [BOFFSET2] = f112, SIZE
  6134. nop __LINE__
  6135. }
  6136. ;;
  6137. { .mmi
  6138. STFD [BOFFSET] = f88, - 3 * SIZE
  6139. STFD [BOFFSET2] = f120, - 3 * SIZE
  6140. adds C9 = 4 * SIZE, C1
  6141. }
  6142. ;;
  6143. { .mmf
  6144. STFD [C1 ] = f64, SIZE
  6145. STFD [C9 ] = f68, SIZE
  6146. mov f64 = f0
  6147. }
  6148. ;;
  6149. { .mmi
  6150. STFD [C1 ] = f65, SIZE
  6151. STFD [C9 ] = f69, SIZE
  6152. adds C10 = 4 * SIZE, C2
  6153. }
  6154. ;;
  6155. { .mmi
  6156. STFD [C1 ] = f66, SIZE
  6157. STFD [C9 ] = f70, SIZE
  6158. adds C3 = -8 * SIZE, C3
  6159. }
  6160. ;;
  6161. { .mmi
  6162. STFD [C1 ] = f67, - 3 * SIZE
  6163. STFD [C9 ] = f71
  6164. adds C11 = 4 * SIZE, C3
  6165. }
  6166. ;;
  6167. { .mmf
  6168. STFD [C2 ] = f72, SIZE
  6169. STFD [C10] = f76, SIZE
  6170. mov f72 = f0
  6171. }
  6172. ;;
  6173. { .mmi
  6174. STFD [C2 ] = f73, SIZE
  6175. STFD [C10] = f77, SIZE
  6176. adds C4 = -8 * SIZE, C4
  6177. }
  6178. ;;
  6179. { .mmi
  6180. STFD [C2 ] = f74, SIZE
  6181. STFD [C10] = f78, SIZE
  6182. adds C12 = 4 * SIZE, C4
  6183. }
  6184. ;;
  6185. { .mmi
  6186. STFD [C2 ] = f75, - 3 * SIZE
  6187. STFD [C10] = f79
  6188. adds C5 = -8 * SIZE, C5
  6189. }
  6190. ;;
  6191. { .mmf
  6192. STFD [C3 ] = f80, SIZE
  6193. STFD [C11] = f84, SIZE
  6194. mov f80 = f0
  6195. }
  6196. ;;
  6197. { .mmi
  6198. STFD [C3 ] = f81, SIZE
  6199. STFD [C11] = f85, SIZE
  6200. adds C13 = 4 * SIZE, C5
  6201. }
  6202. ;;
  6203. { .mmi
  6204. STFD [C3 ] = f82, SIZE
  6205. STFD [C11] = f86, SIZE
  6206. adds C6 = -8 * SIZE, C6
  6207. }
  6208. ;;
  6209. { .mmi
  6210. STFD [C3 ] = f83, - 3 * SIZE
  6211. STFD [C11] = f87
  6212. adds C14 = 4 * SIZE, C6
  6213. }
  6214. ;;
  6215. { .mmf
  6216. STFD [C4 ] = f88, SIZE
  6217. STFD [C12] = f92, SIZE
  6218. mov f88 = f0
  6219. }
  6220. ;;
  6221. { .mmi
  6222. STFD [C4 ] = f89, SIZE
  6223. STFD [C12] = f93, SIZE
  6224. adds C8 = -8 * SIZE, C8
  6225. }
  6226. ;;
  6227. { .mmi
  6228. STFD [C4 ] = f90, SIZE
  6229. STFD [C12] = f94, SIZE
  6230. adds C16 = 4 * SIZE, C8
  6231. }
  6232. ;;
  6233. { .mmi
  6234. STFD [C4 ] = f91, - 3 * SIZE
  6235. STFD [C12] = f95
  6236. cmp.ne p6, p0 = 1, I
  6237. }
  6238. ;;
  6239. { .mmf
  6240. STFD [C5 ] = f96, SIZE
  6241. STFD [C13] = f100, SIZE
  6242. mov f96 = f0
  6243. }
  6244. ;;
  6245. { .mmi
  6246. STFD [C5 ] = f97, SIZE
  6247. STFD [C13] = f101, SIZE
  6248. adds I = -1, I
  6249. }
  6250. ;;
  6251. { .mmi
  6252. STFD [C5 ] = f98, SIZE
  6253. STFD [C13] = f102, SIZE
  6254. adds C7 = -8 * SIZE, C7
  6255. }
  6256. ;;
  6257. { .mmi
  6258. STFD [C5 ] = f99, - 3 * SIZE
  6259. STFD [C13] = f103
  6260. adds C15 = 4 * SIZE, C7
  6261. }
  6262. ;;
  6263. { .mmf
  6264. STFD [C6 ] = f104, SIZE
  6265. STFD [C14] = f108, SIZE
  6266. mov f104 = f0
  6267. }
  6268. ;;
  6269. { .mmi
  6270. STFD [C6 ] = f105, SIZE
  6271. STFD [C14] = f109, SIZE
  6272. shladd r2 = K, BASE_SHIFT, r0
  6273. }
  6274. ;;
  6275. { .mmi
  6276. STFD [C6 ] = f106, SIZE
  6277. STFD [C14] = f110, SIZE
  6278. sub L = K, KK
  6279. }
  6280. ;;
  6281. { .mmi
  6282. STFD [C6 ] = f107, - 3 * SIZE
  6283. STFD [C14] = f111
  6284. nop __LINE__
  6285. }
  6286. ;;
  6287. { .mmf
  6288. STFD [C7 ] = f112, SIZE
  6289. STFD [C15] = f116, SIZE
  6290. mov f112 = f0
  6291. }
  6292. ;;
  6293. { .mmi
  6294. STFD [C7 ] = f113, SIZE
  6295. STFD [C15] = f117, SIZE
  6296. nop __LINE__
  6297. }
  6298. ;;
  6299. { .mmi
  6300. STFD [C7 ] = f114, SIZE
  6301. STFD [C15] = f118, SIZE
  6302. nop __LINE__
  6303. }
  6304. ;;
  6305. { .mmi
  6306. STFD [C7 ] = f115, - 3 * SIZE
  6307. STFD [C15] = f119
  6308. nop __LINE__
  6309. }
  6310. ;;
  6311. { .mmf
  6312. STFD [C8 ] = f120, SIZE
  6313. STFD [C16] = f124, SIZE
  6314. mov f120 = f0
  6315. }
  6316. ;;
  6317. { .mmi
  6318. STFD [C8 ] = f121, SIZE
  6319. STFD [C16] = f125, SIZE
  6320. adds KK = -8, KK
  6321. }
  6322. ;;
  6323. { .mmi
  6324. STFD [C8 ] = f122, SIZE
  6325. STFD [C16] = f126, SIZE
  6326. sub L = K, KK
  6327. }
  6328. ;;
  6329. { .mmb
  6330. STFD [C8 ] = f123, - 3 * SIZE
  6331. STFD [C16] = f127
  6332. (p6) br.cond.dptk .L011
  6333. }
  6334. ;;
  // .L049: bottom of the loop that branches back to .L000 while J > 0.
  // Decrements J, rewinds AOFFSET to A and advances B by 8 * KK8 bytes;
  // falls through to .L050 once J is exhausted.
  // NOTE(review): presumably one trip handles an 8-column panel - confirm at .L000.
  6335. .L049:
  6336. { .mmi
  6337. adds J = -1, J // J -= 1
  6338. mov AOFFSET = A // AOFFSET = A
  6339. shladd KK8 = K, BASE_SHIFT, r0 // KK8 = K << BASE_SHIFT
  6340. }
  6341. ;;
  6342. { .mmb
  6343. shladd B = KK8, 3, B // B += KK8 << 3
  6344. cmp.lt p6, p0 = 0, J // p6 = (0 < J)
  6345. (p6) br.cond.dptk .L000 // J still positive: branch back to .L000
  6346. }
  6347. ;;
  6348. .align 8
  // .L050: entry for the 4-column remainder, taken only when bit 2 of N is set.
  // Sets up C1..C4 (four consecutive columns of C), KK and the A/B cursors,
  // then prepares the K loop (.L082) for the single leftover row when M is odd.
  // When M is even control goes straight to .L070.
  6349. .L050:
  6350. { .mib
  6351. setf.d f64 = r0 // f64 = 0.0 (built from the integer zero register)
  6352. tbit.z p6, p0 = N, 2 // p6 = (bit 2 of N is clear)
  6353. (p6) br.cond.dpnt .L090 // no 4-column remainder: skip to .L090
  6354. }
  6355. ;;
  6356. #ifdef RT
  6357. { .mmi
  6358. shladd r3 = LDC, 2, r0 // r3 = 4 * LDC
  6359. nop __LINE__
  6360. shl r2 = K, 2 + BASE_SHIFT // r2 = (4 * K) << BASE_SHIFT
  6361. }
  6362. ;;
  6363. { .mmi
  6364. sub B = B, r2 // RT walks backwards: step B back by r2
  6365. sub C = C, r3 // and C back by four columns
  6366. nop __LINE__
  6367. }
  6368. #endif
  6369. ;;
  6370. { .mfi
  6371. mov C1 = C // coffset1 = c + 0 * ldc
  6372. #ifdef LN
  6373. add KK = M, OFFSET
  6374. #elif defined LT
  6375. mov KK = OFFSET
  6376. #else
  6377. nop __LINE__
  6378. #endif
  6379. }
  6380. ;;
  6381. { .mmf
  6382. #if defined(LN) || defined(RT)
  6383. mov AORIG = A
  6384. #else
  6385. mov AOFFSET = A
  6386. #endif
  6387. }
  6388. { .mmf
  6389. add C2 = LDC, C // coffset2 = c + 1 * ldc
  6390. shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc
  6391. }
  6392. ;;
  6393. { .mfi
  6394. #ifndef RT
  6395. shladd C = LDC, 2, C // coffset += 4 * ldc
  6396. #else
  6397. nop __LINE__
  6398. #endif
  6399. #if defined(LT) || defined(RN)
  6400. mov L = KK
  6401. #else
  6402. sub L = K, KK
  6403. #endif
  6404. }{ .mfb
  6405. shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc (C2 + 2 * ldc)
  6406. }
  6407. ;;
  // Clear the remaining accumulators used by the 1x4 and 2x4 tiles below.
  6408. mov f72 = f0
  6409. mov f80 = f0
  6410. mov f88 = f0
  6411. mov f65 = f0
  6412. mov f73 = f0
  6413. mov f81 = f0
  6414. mov f89 = f0
  6415. tbit.z p6,p7 = M, 0 // p6 = (M is even)
  6416. (p6) br.cond.dptk .L070 // no single leftover row: go to the 2-row tile
  6417. { .mib
  6418. #if defined(LT) || defined(RN)
  6419. mov L = KK
  6420. #else
  6421. sub L = K, KK
  6422. #endif
  6423. }
  6424. ;;
  6425. { .mmi
  6426. cmp.ne p7, p0 = r0, L // p7 = (L != 0): guards the operand preloads
  6427. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  6428. shl r2 = K, 0 + BASE_SHIFT // r2 = K << BASE_SHIFT (one row of A)
  6429. }
  6430. { .mmi
  6431. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  6432. nop __LINE__
  6433. nop __LINE__
  6434. }
  6435. ;;
  6436. #if defined(LT) || defined(RN)
  6437. { .mmf
  6438. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6439. }
  6440. ;;
  6441. #else
  6442. { .mfi
  6443. shladd BOFFSET = r3, 2, B // BOFFSET = B + 4 * r3
  6444. #ifdef LN
  6445. sub AORIG = AORIG, r2
  6446. #else
  6447. nop __LINE__
  6448. #endif
  6449. }
  6450. ;;
  6451. { .mfi
  6452. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6453. add AOFFSET = r3, AORIG // AOFFSET = AORIG + r3
  6454. }
  6455. ;;
  6456. #endif
  6457. { .mmi
  6458. adds L = 1, L // L + 1 so the shift below rounds the trip count up
  6459. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  6460. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0)
  6461. }
  6462. ;;
  6463. { .mii
  6464. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6465. tbit.z p12, p0 = L, 0 // p12 = (L + 1 is even), i.e. the k count is odd
  6466. shr L = L, 1 // two k-steps per loop trip
  6467. }
  6468. ;;
  6469. { .mmi
  6470. adds L = -1, L // ar.lc counts trips minus one
  6471. }
  6472. ;;
  6473. { .mmi
  6474. cmp.eq p6, p0 = -1, L // p6 = (no trips at all)
  6475. }
  6476. ;;
  6477. { .mib
  6478. (p7) LDFD f32 = [AOFFSET], 1 * SIZE
  6479. mov ar.lc = L
  6480. (p6) br.cond.dpnt .L088 // empty K range: skip the loop
  6481. }
  6482. ;;
  // .L082: K loop of the 1x4 tile, two k-steps per trip.
  // First step multiplies f32 by f48..f51, second step multiplies f40 by f56..f59;
  // both accumulate into f64, f72, f80, f88.
  // p3 gates the second step, p4 gates the reload of first-step operands for the next trip.
  6483. .L082:
  6484. { .mfb
  6485. cmp.ne p4, p5 = 0, L // p4 = (L != 0): another trip follows
  6486. FMA f64 = f32, f48, f64 // A1 * B1
  6487. nop __LINE__
  6488. }
  6489. { .mfi
  6490. (p12) cmp.ne p3, p0 = 0, L // when p12 is set, p3 goes false on the trip where L == 0
  6491. FMA f72 = f32, f49, f72 // A1 * B2
  6492. nop __LINE__
  6493. }
  6494. ;;
  6495. { .mfb
  6496. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6497. FMA f80 = f32, f50, f80 // A1 * B3
  6498. nop __LINE__
  6499. }
  6500. { .mfb
  6501. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  6502. FMA f88 = f32, f51, f88 // A1 * B4
  6503. nop __LINE__
  6504. }
  6505. ;;
  6506. { .mfb
  6507. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  6508. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6509. nop __LINE__
  6510. }
  6511. { .mfb
  6512. nop __LINE__
  6513. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6514. nop __LINE__
  6515. }
  6516. ;;
  6517. { .mmf
  6518. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6519. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  6520. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6521. }
  6522. { .mmf
  6523. nop __LINE__
  6524. nop __LINE__
  6525. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  6526. }
  6527. ;;
  6528. { .mib
  6529. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6530. nop __LINE__
  6531. nop __LINE__
  6532. }
  6533. { .mmb
  6534. nop __LINE__
  6535. adds L = -1, L // L is decremented alongside ar.lc so the predicates can test it
  6536. br.cloop.sptk.few .L082
  6537. }
  6538. ;;
  // .L088: solve and store for the 1x4 tile.
  // The accumulators f64, f72, f80, f88 hold the products from the K loop.
  // They are subtracted from the packed right-hand side, run through the
  // triangular solve for the selected variant (LN / LT / RN / RT), written
  // back to the packed buffer and to C1..C4, and then the cursors and KK
  // are advanced for the next tile.
  // NOTE(review): FSUB / FMPY / FNMA are macros defined outside this chunk;
  // comments assume FSUB d = a, b is a - b and FNMA d = a, b, c is c - a * b - confirm.
  6539. .L088:
  6540. #if defined(LN) || defined(RT)
  6541. #ifdef LN
  6542. adds r2 = -1, KK
  6543. #else
  6544. adds r2 = -4, KK
  6545. #endif
  6546. ;;
  6547. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT
  6548. ;;
  6549. add AOFFSET = r2, AORIG // AOFFSET = AORIG + r2
  6550. shladd BOFFSET = r2, 2, B // BOFFSET = B + 4 * r2
  6551. ;;
  6552. #endif
  // AOFFSET2 / BOFFSET2 are set here but not referenced again before .L070.
  6553. adds AOFFSET2 = 4 * SIZE, AOFFSET
  6554. adds BOFFSET2 = 4 * SIZE, BOFFSET
  6555. ;;
  6556. #if defined(LN) || defined(LT)
  // Left-side variants: the four right-hand-side values come from the packed B buffer.
  6557. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  6558. ;;
  6559. LDFPD f34, f35 = [BOFFSET]
  6560. adds BOFFSET = -2 * SIZE, BOFFSET // restore BOFFSET to the start of the four values
  6561. ;;
  6562. FSUB f64 = f32, f64
  6563. FSUB f72 = f33, f72
  6564. FSUB f80 = f34, f80
  6565. FSUB f88 = f35, f88
  6566. ;;
  6567. #else
  // Right-side variants: the four right-hand-side values come from the packed A buffer.
  6568. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6569. ;;
  6570. LDFPD f34, f35 = [AOFFSET]
  6571. adds AOFFSET = -2 * SIZE, AOFFSET // restore AOFFSET to the start of the four values
  6572. ;;
  6573. FSUB f64 = f32, f64
  6574. FSUB f72 = f33, f72
  6575. FSUB f80 = f34, f80
  6576. FSUB f88 = f35, f88
  6577. ;;
  6578. #endif
  6579. #ifdef LN
  // 1x1 triangular factor: scale all four values by the single element at AOFFSET.
  // NOTE(review): a multiply is used, so the packed element is presumably the inverted diagonal - confirm.
  6580. LDFD f32 = [AOFFSET]
  6581. ;;
  6582. FMPY f64 = f64, f32
  6583. FMPY f72 = f72, f32
  6584. FMPY f80 = f80, f32
  6585. FMPY f88 = f88, f32
  6586. ;;
  6587. { .mmi
  6588. STFD [BOFFSET] = f64, SIZE
  6589. adds C1 = -1 * SIZE, C1 // LN moves the C pointers back by one element before storing
  6590. }
  6591. ;;
  6592. { .mmi
  6593. STFD [BOFFSET] = f72, SIZE
  6594. adds C2 = -1 * SIZE, C2
  6595. }
  6596. ;;
  6597. { .mmi
  6598. STFD [BOFFSET] = f80, SIZE
  6599. nop __LINE__
  6600. }
  6601. ;;
  6602. { .mmi
  6603. STFD [BOFFSET] = f88, - 3 * SIZE // last store rewinds BOFFSET to the first value
  6604. }
  6605. ;;
  6606. adds C3 = -1 * SIZE, C3
  6607. adds C4 = -1 * SIZE, C4
  6608. ;;
  6609. #endif
  6610. #ifdef LT
  // Same scaling as LN, without the C pointer adjustment.
  6611. LDFD f32 = [AOFFSET]
  6612. ;;
  6613. FMPY f64 = f64, f32
  6614. FMPY f72 = f72, f32
  6615. FMPY f80 = f80, f32
  6616. FMPY f88 = f88, f32
  6617. ;;
  6618. STFD [BOFFSET] = f64, SIZE
  6619. ;;
  6620. STFD [BOFFSET] = f72, SIZE
  6621. ;;
  6622. STFD [BOFFSET] = f80, SIZE
  6623. ;;
  6624. STFD [BOFFSET] = f88, -3 * SIZE
  6625. ;;
  6626. #endif
  6627. #ifdef RN
  // Forward substitution with ten coefficients read from BOFFSET at element
  // offsets 0,1,2,3 / 5,6,7 / 10,11 / 15; BOFFSET ends back at offset 0.
  6628. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  6629. ;;
  6630. LDFPD f34, f35 = [BOFFSET]
  6631. adds BOFFSET = 3 * SIZE, BOFFSET
  6632. ;;
  6633. LDFD f36 = [BOFFSET], 1 * SIZE
  6634. ;;
  6635. LDFPD f37, f38 = [BOFFSET]
  6636. adds BOFFSET = 4 * SIZE, BOFFSET
  6637. ;;
  6638. LDFPD f39, f40 = [BOFFSET]
  6639. adds BOFFSET = 5 * SIZE, BOFFSET
  6640. ;;
  6641. LDFD f41 = [BOFFSET], -15 * SIZE
  6642. FMPY f64 = f64, f32
  6643. ;;
  6644. FNMA f72 = f64, f33, f72
  6645. ;;
  6646. FNMA f80 = f64, f34, f80
  6647. ;;
  6648. FNMA f88 = f64, f35, f88
  6649. ;;
  6650. FMPY f72 = f72, f36
  6651. ;;
  6652. FNMA f80 = f72, f37, f80
  6653. ;;
  6654. FNMA f88 = f72, f38, f88
  6655. ;;
  6656. FMPY f80 = f80, f39
  6657. ;;
  6658. FNMA f88 = f80, f40, f88
  6659. ;;
  6660. FMPY f88 = f88, f41
  6661. ;;
  6662. STFD [AOFFSET] = f64, SIZE
  6663. ;;
  6664. STFD [AOFFSET] = f72, SIZE
  6665. ;;
  6666. STFD [AOFFSET] = f80, SIZE
  6667. ;;
  6668. STFD [AOFFSET] = f88, -3 * SIZE
  6669. ;;
  6670. #endif
  6671. #ifdef RT
  // Backward substitution: coefficients are read from element offset 15 down
  // to 0 (15,14 / 13,12 / 10 / 9,8 / 5,4 / 0) and the solve runs from f88 to f64.
  6672. adds BOFFSET = 14 * SIZE, BOFFSET
  6673. ;;
  6674. LDFPD f33, f32 = [BOFFSET]
  6675. adds BOFFSET = - 2 * SIZE, BOFFSET
  6676. ;;
  6677. LDFPD f35, f34 = [BOFFSET]
  6678. adds BOFFSET = - 2 * SIZE, BOFFSET
  6679. ;;
  6680. LDFD f36 = [BOFFSET], - 2 * SIZE
  6681. ;;
  6682. LDFPD f38, f37 = [BOFFSET]
  6683. adds BOFFSET = - 4 * SIZE, BOFFSET
  6684. ;;
  6685. LDFPD f40, f39 = [BOFFSET]
  6686. adds BOFFSET = - 4 * SIZE, BOFFSET
  6687. ;;
  6688. LDFD f41 = [BOFFSET]
  6689. ;;
  6690. FMPY f88 = f88, f32
  6691. ;;
  6692. FNMA f80 = f88, f33, f80
  6693. ;;
  6694. FNMA f72 = f88, f34, f72
  6695. ;;
  6696. FNMA f64 = f88, f35, f64
  6697. ;;
  6698. FMPY f80 = f80, f36
  6699. ;;
  6700. FNMA f72 = f80, f37, f72
  6701. ;;
  6702. FNMA f64 = f80, f38, f64
  6703. ;;
  6704. FMPY f72 = f72, f39
  6705. ;;
  6706. FNMA f64 = f72, f40, f64
  6707. ;;
  6708. FMPY f64 = f64, f41
  6709. ;;
  6710. STFD [AOFFSET] = f64, SIZE
  6711. ;;
  6712. STFD [AOFFSET] = f72, SIZE
  6713. ;;
  6714. STFD [AOFFSET] = f80, SIZE
  6715. ;;
  6716. STFD [AOFFSET] = f88, - 3 * SIZE
  6717. ;;
  6718. #endif
  // Write the solved values to the four C columns. Outside LN the pointers
  // post-increment by one element; under LN they were already moved back
  // above and are left in place.
  6719. #ifndef LN
  6720. STFD [C1 ] = f64, SIZE
  6721. #else
  6722. STFD [C1 ] = f64
  6723. #endif
  6724. #ifndef LN
  6725. STFD [C2 ] = f72, SIZE
  6726. #else
  6727. STFD [C2 ] = f72
  6728. #endif
  6729. #ifndef LN
  6730. STFD [C3 ] = f80, SIZE
  6731. #else
  6732. STFD [C3 ] = f80
  6733. #endif
  6734. #ifndef LN
  6735. STFD [C4 ] = f88, SIZE
  6736. #else
  6737. STFD [C4 ] = f88
  6738. #endif
  6739. ;;
  // Reset the accumulators for the next tile.
  6740. mov f64 = f0
  6741. mov f72 = f0
  6742. mov f80 = f0
  6743. mov f88 = f0
  6744. ;;
  6745. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  6746. ;;
  6747. sub L = K, KK // L = K - KK
  6748. ;;
  6749. #ifdef RT
  6750. add AORIG = r2, AORIG
  6751. #else
  6752. nop __LINE__
  6753. #endif
  6754. ;;
  // LT / RN: skip the rest of the K range in both packed buffers
  // (1 element per k in A, 4 per k in B).
  6755. #if defined(LT) || defined(RN)
  6756. shladd L = L, BASE_SHIFT, r0
  6757. #else
  6758. nop __LINE__
  6759. #endif
  6760. ;;
  6761. #if defined(LT) || defined(RN)
  6762. add AOFFSET = L, AOFFSET
  6763. #else
  6764. nop __LINE__
  6765. #endif
  6766. ;;
  6767. #if defined(LT) || defined(RN)
  6768. shladd BOFFSET = L, 2, BOFFSET
  6769. #else
  6770. nop __LINE__
  6771. #endif
  6772. ;;
  // One row has been solved: KK moves by one (up for LT, down for LN).
  6773. #ifdef LT
  6774. adds KK = 1, KK
  6775. #elif defined LN
  6776. adds KK = -1, KK
  6777. #else
  6778. nop __LINE__
  6779. #endif
  6780. ;;
  6781. #if defined(LT) || defined(RN)
  6782. mov L = KK
  6783. #else
  6784. sub L = K, KK
  6785. #endif
  6786. ;;
  6787. .align 8
  // .L070: setup for the 2x4 tile, taken only when bit 1 of M is set.
  // Positions AOFFSET / BOFFSET, clears the second-row accumulators
  // (f65, f73, f81, f89), preloads the first operands and derives the
  // loop count for .L072 (two k-steps per trip).
  6788. .L070:
  6789. tbit.z p6,p7 = M, 1 // p6 = (bit 1 of M is clear)
  6790. (p6) br.cond.dptk .L060 // no 2-row tile: continue at .L060
  6791. ;;
  6792. { .mib
  6793. #if defined(LT) || defined(RN)
  6794. mov L = KK
  6795. #else
  6796. sub L = K, KK
  6797. #endif
  6798. }
  6799. ;;
  6800. { .mmi
  6801. cmp.ne p7, p0 = r0, L // p7 = (L != 0): guards the operand preloads
  6802. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  6803. shl r2 = K, 1 + BASE_SHIFT // r2 = (2 * K) << BASE_SHIFT (two rows of A)
  6804. }
  6805. { .mmi
  6806. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  6807. nop __LINE__
  6808. nop __LINE__
  6809. }
  6810. ;;
  6811. #if defined(LT) || defined(RN)
  6812. { .mmf
  6813. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6814. setf.d f73 = r0
  6815. mov f65 = f0
  6816. }
  6817. ;;
  6818. #else
  6819. { .mfi
  6820. shladd BOFFSET = r3, 2, B // BOFFSET = B + 4 * r3
  6821. mov f65 = f0
  6822. #ifdef LN
  6823. sub AORIG = AORIG, r2
  6824. #else
  6825. nop __LINE__
  6826. #endif
  6827. }
  6828. ;;
  6829. { .mfi
  6830. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6831. mov f73 = f0
  6832. shladd AOFFSET = r3, 1, AORIG // AOFFSET = AORIG + 2 * r3
  6833. }
  6834. ;;
  6835. #endif
  6836. { .mfi
  6837. mov f81 = f0
  6838. adds L = 1, L // L + 1 so the shift below rounds the trip count up
  6839. }
  6840. { .mfi
  6841. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  6842. mov f89 = f0
  6843. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0)
  6844. }
  6845. ;;
  6846. { .mfi
  6847. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6848. tbit.z p12, p0 = L, 0 // p12 = (L + 1 is even), i.e. the k count is odd
  6849. }
  6850. { .mfi
  6851. shr L = L, 1 // two k-steps per loop trip
  6852. }
  6853. ;;
  6854. { .mmf
  6855. adds L = -1, L // ar.lc counts trips minus one
  6856. }
  6857. ;;
  6858. { .mmf
  6859. cmp.eq p6, p0 = -1, L // p6 = (no trips at all)
  6860. }
  6861. ;;
  6862. { .mib
  6863. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6864. mov ar.lc = L
  6865. (p6) br.cond.dpnt .L078 // empty K range: skip the loop
  6866. }
  6867. ;;
  6868. .align 8
  // .L072: K loop of the 2x4 tile, two k-steps per trip.
  // First step uses A pair f32/f33 with B values f48..f51, second step uses
  // f40/f41 with f56..f59. Row 1 accumulates into f64, f72, f80, f88 and
  // row 2 into f65, f73, f81, f89.
  // p3 gates the second step, p4 gates the reload for the next trip.
  6869. .L072:
  6870. { .mfb
  6871. lfetch.nt1 [PREA], 4 * SIZE
  6872. FMA f64 = f32, f48, f64 // A1 * B1
  6873. nop __LINE__
  6874. }
  6875. { .mfi
  6876. nop __LINE__
  6877. FMA f72 = f32, f49, f72 // A1 * B2
  6878. (p12) cmp.ne p3, p0 = 0, L // when p12 is set, p3 goes false on the trip where L == 0
  6879. }
  6880. ;;
  6881. { .mfi
  // NOTE(review): PREB is not initialised in the visible setup at .L070;
  // presumably it carries over from earlier code - confirm.
  6882. lfetch.nt1 [PREB], 8 * SIZE
  6883. FMA f80 = f32, f50, f80 // A1 * B3
  6884. cmp.ne p4, p5 = 0, L // p4 = (L != 0): another trip follows
  6885. }
  6886. { .mfb
  6887. nop __LINE__
  6888. FMA f88 = f32, f51, f88 // A1 * B4
  6889. nop __LINE__
  6890. }
  6891. ;;
  6892. { .mfi
  6893. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  6894. FMA f65 = f33, f48, f65 // A2 * B1
  6895. }
  6896. { .mfi
  6897. nop __LINE__
  6898. FMA f73 = f33, f49, f73 // A2 * B2
  6899. }
  6900. ;;
  6901. { .mfi
  6902. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  6903. FMA f81 = f33, f50, f81 // A2 * B3
  6904. }
  6905. { .mmf
  6906. nop __LINE__
  6907. nop __LINE__
  6908. FMA f89 = f33, f51, f89 // A2 * B4
  6909. }
  6910. ;;
  6911. { .mfb
  6912. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  6913. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  6914. nop __LINE__
  6915. }
  6916. { .mmf
  6917. nop __LINE__
  6918. nop __LINE__
  6919. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  6920. }
  6921. ;;
  6922. { .mfb
  6923. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6924. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  6925. nop __LINE__
  6926. }
  6927. { .mmf
  6928. nop __LINE__
  6929. nop __LINE__
  6930. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  6931. }
  6932. ;;
  6933. { .mfb
  6934. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  6935. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  6936. nop __LINE__
  6937. }
  6938. { .mfb
  6939. nop __LINE__
  6940. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  6941. nop __LINE__
  6942. }
  6943. ;;
  6944. { .mfi
  6945. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  6946. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  6947. adds L = -1, L // L is decremented alongside ar.lc so the predicates can test it
  6948. }
  6949. { .mfb
  6950. nop __LINE__
  6951. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  6952. br.cloop.sptk.few .L072
  6953. }
  6954. ;;
  6955. .L078:
  /* Solve and write-back for the two-row by four-column tile whose partial
     products were accumulated in f64/f65, f72/f73, f80/f81, f88/f89.
     Exactly one of LN, LT, RN, RT is expected to be defined; each selects
     one of the substitution sequences below.
     LDFPD, LDFD, STFD, FSUB, FMPY and FNMA are macros defined outside this
     chunk. */
  /* LN/RT: rebuild AOFFSET/BOFFSET from AORIG/B at row offset KK-2 (LN) or
     KK-4 (RT), scaled by BASE_SHIFT. */
  6956. #if defined(LN) || defined(RT)
  6957. #ifdef LN
  6958. adds r2 = -2, KK
  6959. #else
  6960. adds r2 = -4, KK
  6961. #endif
  6962. ;;
  6963. shladd r2 = r2, BASE_SHIFT, r0
  6964. ;;
  6965. shladd AOFFSET = r2, 1, AORIG
  6966. shladd BOFFSET = r2, 2, B
  6967. ;;
  6968. #endif
  /* Second store cursors, four elements past the first ones. */
  6969. adds AOFFSET2 = 4 * SIZE, AOFFSET
  6970. adds BOFFSET2 = 4 * SIZE, BOFFSET
  6971. ;;
  /* LN/LT: the eight tile values are read from BOFFSET in the order
     f64,f72,f80,f88,f65,f73,f81,f89 and the accumulated products are
     subtracted from them.  BOFFSET is restored afterwards. */
  6972. #if defined(LN) || defined(LT)
  6973. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  6974. ;;
  6975. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  6976. ;;
  6977. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  6978. ;;
  6979. LDFPD f38, f39 = [BOFFSET]
  6980. adds BOFFSET = -6 * SIZE, BOFFSET
  6981. ;;
  6982. FSUB f64 = f32, f64
  6983. FSUB f72 = f33, f72
  6984. FSUB f80 = f34, f80
  6985. FSUB f88 = f35, f88
  6986. FSUB f65 = f36, f65
  6987. FSUB f73 = f37, f73
  6988. FSUB f81 = f38, f81
  6989. FSUB f89 = f39, f89
  6990. ;;
  6991. #else
  /* RN/RT: the eight tile values are read from AOFFSET in the order
     f64,f65,f72,f73,f80,f81,f88,f89.  AOFFSET is restored afterwards. */
  6992. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  6993. ;;
  6994. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  6995. ;;
  6996. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  6997. ;;
  6998. LDFPD f38, f39 = [AOFFSET]
  6999. adds AOFFSET = -6 * SIZE, AOFFSET
  7000. ;;
  7001. FSUB f64 = f32, f64
  7002. FSUB f65 = f33, f65
  7003. FSUB f72 = f34, f72
  7004. FSUB f73 = f35, f73
  7005. FSUB f80 = f36, f80
  7006. FSUB f81 = f37, f81
  7007. FSUB f88 = f38, f88
  7008. FSUB f89 = f39, f89
  7009. ;;
  7010. #endif
  /* LN: substitution starting from the second row.  Coefficients come from
     AOFFSET[3] (f32), AOFFSET[2] (f33) and AOFFSET[0] (f34).
     NOTE(review): rows are scaled with FMPY, so the diagonal entries are
     presumably stored as reciprocals - confirm against the packing code. */
  7011. #ifdef LN
  7012. adds AOFFSET = 2 * SIZE, AOFFSET
  7013. ;;
  7014. LDFPD f33, f32 = [AOFFSET]
  7015. adds AOFFSET = - 2 * SIZE, AOFFSET
  7016. ;;
  7017. LDFD f34 = [AOFFSET]
  7018. ;;
  7019. FMPY f65 = f65, f32
  7020. FMPY f73 = f73, f32
  7021. FMPY f81 = f81, f32
  7022. FMPY f89 = f89, f32
  7023. ;;
  7024. FNMA f64 = f65, f33, f64
  7025. FNMA f72 = f73, f33, f72
  7026. FNMA f80 = f81, f33, f80
  7027. FNMA f88 = f89, f33, f88
  7028. ;;
  7029. FMPY f64 = f64, f34
  7030. FMPY f72 = f72, f34
  7031. FMPY f80 = f80, f34
  7032. FMPY f88 = f88, f34
  7033. ;;
  /* Write the solved tile back through BOFFSET/BOFFSET2 in load order; the
     final -3 * SIZE post-increments restore both cursors. */
  7034. STFD [BOFFSET] = f64, SIZE
  7035. STFD [BOFFSET2] = f65, SIZE
  7036. ;;
  7037. STFD [BOFFSET] = f72, SIZE
  7038. STFD [BOFFSET2] = f73, SIZE
  7039. ;;
  7040. STFD [BOFFSET] = f80, SIZE
  7041. STFD [BOFFSET2] = f81, SIZE
  7042. ;;
  7043. STFD [BOFFSET] = f88, - 3 * SIZE
  7044. STFD [BOFFSET2] = f89, - 3 * SIZE
  7045. ;;
  /* LN moves the C cursors back by two elements before the C stores. */
  7046. adds C1 = -2 * SIZE, C1
  7047. adds C2 = -2 * SIZE, C2
  7048. adds C3 = -2 * SIZE, C3
  7049. adds C4 = -2 * SIZE, C4
  7050. ;;
  7051. #endif
  /* LT: substitution starting from the first row.  Coefficients come from
     AOFFSET[0] (f32), AOFFSET[1] (f33) and AOFFSET[3] (f34). */
  7052. #ifdef LT
  7053. LDFPD f32, f33 = [AOFFSET]
  7054. adds AOFFSET = 3 * SIZE, AOFFSET
  7055. ;;
  7056. LDFD f34 = [AOFFSET], - 3 * SIZE
  7057. ;;
  7058. FMPY f64 = f64, f32
  7059. FMPY f72 = f72, f32
  7060. FMPY f80 = f80, f32
  7061. FMPY f88 = f88, f32
  7062. ;;
  7063. FNMA f65 = f64, f33, f65
  7064. FNMA f73 = f72, f33, f73
  7065. FNMA f81 = f80, f33, f81
  7066. FNMA f89 = f88, f33, f89
  7067. ;;
  7068. FMPY f65 = f65, f34
  7069. FMPY f73 = f73, f34
  7070. FMPY f81 = f81, f34
  7071. FMPY f89 = f89, f34
  7072. ;;
  7073. STFD [BOFFSET] = f64, SIZE
  7074. STFD [BOFFSET2] = f65, SIZE
  7075. ;;
  7076. STFD [BOFFSET] = f72, SIZE
  7077. STFD [BOFFSET2] = f73, SIZE
  7078. ;;
  7079. STFD [BOFFSET] = f80, SIZE
  7080. STFD [BOFFSET2] = f81, SIZE
  7081. ;;
  7082. STFD [BOFFSET] = f88, -3 * SIZE
  7083. STFD [BOFFSET2] = f89, -3 * SIZE
  7084. ;;
  7085. #endif
  /* RN: substitution across the four columns, first to last.  Coefficients
     f32..f41 are gathered from BOFFSET offsets 0-3, 5-7, 10-11 and 15; the
     last load's -15 * SIZE post-increment restores BOFFSET. */
  7086. #ifdef RN
  7087. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  7088. ;;
  7089. LDFPD f34, f35 = [BOFFSET]
  7090. adds BOFFSET = 3 * SIZE, BOFFSET
  7091. ;;
  7092. LDFD f36 = [BOFFSET], 1 * SIZE
  7093. ;;
  7094. LDFPD f37, f38 = [BOFFSET]
  7095. adds BOFFSET = 4 * SIZE, BOFFSET
  7096. ;;
  7097. LDFPD f39, f40 = [BOFFSET]
  7098. adds BOFFSET = 5 * SIZE, BOFFSET
  7099. ;;
  7100. LDFD f41 = [BOFFSET], -15 * SIZE
  7101. ;;
  7102. FMPY f64 = f64, f32
  7103. FMPY f65 = f65, f32
  7104. ;;
  7105. FNMA f72 = f64, f33, f72
  7106. FNMA f73 = f65, f33, f73
  7107. ;;
  7108. FNMA f80 = f64, f34, f80
  7109. FNMA f81 = f65, f34, f81
  7110. ;;
  7111. FNMA f88 = f64, f35, f88
  7112. FNMA f89 = f65, f35, f89
  7113. ;;
  7114. FMPY f72 = f72, f36
  7115. FMPY f73 = f73, f36
  7116. ;;
  7117. FNMA f80 = f72, f37, f80
  7118. FNMA f81 = f73, f37, f81
  7119. ;;
  7120. FNMA f88 = f72, f38, f88
  7121. FNMA f89 = f73, f38, f89
  7122. ;;
  7123. FMPY f80 = f80, f39
  7124. FMPY f81 = f81, f39
  7125. ;;
  7126. FNMA f88 = f80, f40, f88
  7127. FNMA f89 = f81, f40, f89
  7128. ;;
  7129. FMPY f88 = f88, f41
  7130. FMPY f89 = f89, f41
  7131. ;;
  /* Write back through AOFFSET/AOFFSET2 in the same order the tile was
     loaded: f64,f65,f72,f73 | f80,f81,f88,f89. */
  7132. STFD [AOFFSET] = f64, SIZE
  7133. STFD [AOFFSET2] = f80, SIZE
  7134. ;;
  7135. STFD [AOFFSET] = f65, SIZE
  7136. STFD [AOFFSET2] = f81, SIZE
  7137. ;;
  7138. STFD [AOFFSET] = f72, SIZE
  7139. STFD [AOFFSET2] = f88, SIZE
  7140. ;;
  7141. STFD [AOFFSET] = f73, -3 * SIZE
  7142. STFD [AOFFSET2] = f89, -3 * SIZE
  7143. ;;
  7144. #endif
  /* RT: substitution across the four columns, last to first.  Coefficients
     are gathered walking BOFFSET backwards from offset 14. */
  7145. #ifdef RT
  7146. adds BOFFSET = 14 * SIZE, BOFFSET
  7147. ;;
  7148. LDFPD f33, f32 = [BOFFSET]
  7149. adds BOFFSET = - 2 * SIZE, BOFFSET
  7150. ;;
  7151. LDFPD f35, f34 = [BOFFSET]
  7152. adds BOFFSET = - 2 * SIZE, BOFFSET
  7153. ;;
  7154. LDFD f36 = [BOFFSET], - 2 * SIZE
  7155. ;;
  7156. LDFPD f38, f37 = [BOFFSET]
  7157. adds BOFFSET = - 4 * SIZE, BOFFSET
  7158. ;;
  7159. LDFPD f40, f39 = [BOFFSET]
  7160. adds BOFFSET = - 4 * SIZE, BOFFSET
  7161. ;;
  7162. LDFD f41 = [BOFFSET]
  7163. ;;
  7164. FMPY f88 = f88, f32
  7165. FMPY f89 = f89, f32
  7166. ;;
  7167. FNMA f80 = f88, f33, f80
  7168. FNMA f81 = f89, f33, f81
  7169. ;;
  7170. FNMA f72 = f88, f34, f72
  7171. FNMA f73 = f89, f34, f73
  7172. ;;
  7173. FNMA f64 = f88, f35, f64
  7174. FNMA f65 = f89, f35, f65
  7175. ;;
  7176. FMPY f80 = f80, f36
  7177. FMPY f81 = f81, f36
  7178. ;;
  7179. FNMA f72 = f80, f37, f72
  7180. FNMA f73 = f81, f37, f73
  7181. ;;
  7182. FNMA f64 = f80, f38, f64
  7183. FNMA f65 = f81, f38, f65
  7184. ;;
  7185. FMPY f72 = f72, f39
  7186. FMPY f73 = f73, f39
  7187. ;;
  7188. FNMA f64 = f72, f40, f64
  7189. FNMA f65 = f73, f40, f65
  7190. ;;
  7191. FMPY f64 = f64, f41
  7192. FMPY f65 = f65, f41
  7193. ;;
  /* Write back through AOFFSET/AOFFSET2 in the order the RN/RT load above
     read the tile (f64,f65,f72,f73 | f80,f81,f88,f89), exactly as the RN
     store does.  The previous sequence used the LT (BOFFSET) ordering and
     therefore permuted the tile in the A buffer. */
  7194. STFD [AOFFSET] = f64, SIZE
  7195. STFD [AOFFSET2] = f80, SIZE
  7196. ;;
  7197. STFD [AOFFSET] = f65, SIZE
  7198. STFD [AOFFSET2] = f81, SIZE
  7199. ;;
  7200. STFD [AOFFSET] = f72, SIZE
  7201. STFD [AOFFSET2] = f88, SIZE
  7202. ;;
  7203. STFD [AOFFSET] = f73, -3 * SIZE
  7204. STFD [AOFFSET2] = f89, -3 * SIZE
  7205. ;;
  7206. #endif
  /* Store two results through each of C1..C4 and clear the accumulators for
     the next tile.  Under LN the second store steps back by SIZE, leaving
     each C cursor where the LN pre-decrement put it; otherwise the cursors
     advance by two elements. */
  7207. STFD [C1 ] = f64, SIZE
  7208. mov f64 = f0
  7209. ;;
  7210. #ifndef LN
  7211. STFD [C1 ] = f65, SIZE
  7212. #else
  7213. STFD [C1 ] = f65, -SIZE
  7214. #endif
  7215. ;;
  7216. STFD [C2 ] = f72, SIZE
  7217. mov f72 = f0
  7218. ;;
  7219. #ifndef LN
  7220. STFD [C2 ] = f73, SIZE
  7221. #else
  7222. STFD [C2 ] = f73, -SIZE
  7223. #endif
  7224. ;;
  7225. STFD [C3 ] = f80, SIZE
  7226. mov f80 = f0
  7227. ;;
  7228. #ifndef LN
  7229. STFD [C3 ] = f81, SIZE
  7230. #else
  7231. STFD [C3 ] = f81, - SIZE
  7232. #endif
  7233. ;;
  7234. STFD [C4 ] = f88, SIZE
  7235. mov f88 = f0
  7236. ;;
  7237. #ifndef LN
  7238. STFD [C4 ] = f89, SIZE
  7239. #else
  7240. STFD [C4 ] = f89, -SIZE
  7241. #endif
  7242. ;;
  7243. mov f96 = f0
  7244. ;;
  7245. mov f104 = f0
  7246. ;;
  /* Bookkeeping: r2 = K scaled by BASE_SHIFT, L = K - KK. */
  7247. shladd r2 = K, BASE_SHIFT, r0
  7248. ;;
  7249. sub L = K, KK
  7250. ;;
  /* RT: advance AORIG by 2 * r2. */
  7251. #ifdef RT
  7252. shladd AORIG = r2, 1, AORIG
  7253. #else
  7254. nop __LINE__
  7255. #endif
  7256. ;;
  7257. mov f112 = f0
  7258. ;;
  /* LT/RN: scale L by BASE_SHIFT, then move AOFFSET forward by 2 * L and
     BOFFSET forward by 4 * L. */
  7259. { .mmi
  7260. #if defined(LT) || defined(RN)
  7261. shladd L = L, BASE_SHIFT, r0
  7262. #else
  7263. nop __LINE__
  7264. #endif
  7265. }
  7266. ;;
  7267. { .mmi
  7268. #if defined(LT) || defined(RN)
  7269. shladd AOFFSET = L, 1, AOFFSET
  7270. #else
  7271. nop __LINE__
  7272. #endif
  7273. }
  7274. ;;
  7275. { .mmi
  7276. #if defined(LT) || defined(RN)
  7277. shladd BOFFSET = L, 2, BOFFSET
  7278. #else
  7279. nop __LINE__
  7280. #endif
  7281. }
  7282. ;;
  7283. { .mmf
  7284. mov f120 = f0
  7285. }
  7286. ;;
  /* KK moves forward by 2 for LT and back by 2 for LN. */
  7287. { .mmi
  7288. #ifdef LT
  7289. adds KK = 2, KK
  7290. #elif defined LN
  7291. adds KK = -2, KK
  7292. #else
  7293. nop __LINE__
  7294. #endif
  7295. }
  7296. ;;
  /* Trip count for the next tile: KK for LT/RN, K - KK otherwise. */
  7297. { .mmi
  7298. #if defined(LT) || defined(RN)
  7299. mov L = KK
  7300. #else
  7301. sub L = K, KK
  7302. #endif
  7303. }
  7304. ;;
  7305. .align 8
  7306. .L060:
  /* Entry for the four-row tile: p6 is set when bit 2 of M is clear, in
     which case this tile is skipped and control goes to .L051. */
  7307. tbit.z p6, p7 = M, 2
  7308. (p6) br.cond.dptk .L051
  7309. ;;
  /* Trip count: KK for LT/RN, K - KK otherwise. */
  7310. { .mib
  7311. #if defined(LT) || defined(RN)
  7312. mov L = KK
  7313. #else
  7314. sub L = K, KK
  7315. #endif
  7316. }
  7317. ;;
  /* p7 = (L != 0) gates the initial operand loads.
     r2 = K << (2 + BASE_SHIFT), r3 = KK scaled by BASE_SHIFT. */
  7318. { .mmi
  7319. cmp.ne p7, p0 = r0, L
  7320. adds BOFFSET = 0 * SIZE, B
  7321. shl r2 = K, 2 + BASE_SHIFT
  7322. }
  7323. { .mmi
  7324. shladd r3 = KK, BASE_SHIFT, r0
  7325. nop __LINE__
  7326. nop __LINE__
  7327. }
  7328. ;;
  7329. #if defined(LT) || defined(RN)
  7330. { .mmf
  7331. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7332. }
  7333. ;;
  7334. #else
  /* LN/RT: start BOFFSET at B + 4 * r3 and AOFFSET at AORIG + 4 * r3;
     LN first moves AORIG back by r2. */
  7335. { .mfi
  7336. shladd BOFFSET = r3, 2, B
  7337. #ifdef LN
  7338. sub AORIG = AORIG, r2
  7339. #else
  7340. nop __LINE__
  7341. #endif
  7342. }
  7343. ;;
  7344. { .mfi
  7345. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7346. shladd AOFFSET = r3, 2, AORIG
  7347. }
  7348. ;;
  7349. #endif
  /* Convert L into the loop counter: p12 records whether bit 0 of L + 1 is
     clear, then L = ((L + 1) >> 1) - 1.  A result of -1 means there is
     nothing to accumulate (p6, branch to .L068 below). */
  7350. { .mfi
  7351. adds L = 1, L
  7352. }
  7353. { .mfi
  7354. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  7355. cmp.eq p3, p0 = r0, r0
  7356. }
  7357. ;;
  7358. { .mfi
  7359. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  7360. tbit.z p12, p0 = L, 0
  7361. }
  7362. { .mfi
  7363. shr L = L, 1
  7364. }
  7365. ;;
  7366. { .mfi
  7367. adds L = -1, L
  7368. }
  7369. ;;
  7370. { .mfi
  7371. cmp.eq p6, p0 = -1, L
  7372. }
  7373. ;;
  7374. { .mmf
  7375. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7376. }
  7377. { .mfi
  7378. mov ar.lc = L
  7379. }
  7380. ;;
  /* Clear all sixteen accumulators of the tile. */
  7381. mov f64 = f0
  7382. mov f65 = f0
  7383. mov f66 = f0
  7384. mov f67 = f0
  7385. mov f72 = f0
  7386. mov f73 = f0
  7387. mov f74 = f0
  7388. mov f75 = f0
  7389. mov f80 = f0
  7390. mov f81 = f0
  7391. mov f82 = f0
  7392. mov f83 = f0
  7393. mov f88 = f0
  7394. mov f89 = f0
  7395. mov f90 = f0
  7396. mov f91 = f0
  7397. ;;
  7398. { .mmf
  7399. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7400. }
  7401. { .mfb
  7402. (p6) br.cond.dpnt .L068
  7403. }
  7404. ;;
  7405. .align 8
  7406. .L062:
  /* Multiply-accumulate loop for the four-row tile, closed by br.cloop
     (ar.lc is loaded from L before this label).  Each pass performs up to
     two update steps:
       step 1: f32..f35 times f48..f51
       step 2: f40..f43 times f56..f59, every instruction predicated on p3
     Accumulators: f64..f67 (with f48/f56), f72..f75 (f49/f57),
     f80..f83 (f50/f58), f88..f91 (f51/f59).
     FMA and LDFPD are macros defined outside this chunk. */
  /* p4 = (L != 0) gates the reloads for the next pass; p5 is its
     complement.  When p12 is set, p3 is recomputed as (L != 0). */
  7407. { .mfi
  7408. lfetch.nt1 [PREA], 8 * SIZE
  7409. FMA f64 = f32, f48, f64 // A1 * B1
  7410. cmp.ne p4, p5 = 0, L
  7411. }
  7412. { .mfi
  7413. nop __LINE__
  7414. FMA f72 = f32, f49, f72 // A1 * B2
  7415. (p12) cmp.ne p3, p0 = 0, L
  7416. }
  7417. ;;
  /* Under p5 (L == 0) C9..C12 are set two elements past C1..C4. */
  7418. { .mfi
  7419. lfetch.nt1 [PREB], 8 * SIZE
  7420. FMA f80 = f32, f50, f80 // A1 * B3
  7421. (p5) adds C9 = 2 * SIZE, C1
  7422. }
  7423. { .mfi
  7424. nop __LINE__
  7425. FMA f88 = f32, f51, f88 // A1 * B4
  7426. (p5) adds C10 = 2 * SIZE, C2
  7427. }
  7428. ;;
  7429. { .mfi
  7430. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  7431. FMA f65 = f33, f48, f65 // A2 * B1
  7432. (p5) adds C11 = 2 * SIZE, C3
  7433. }
  7434. { .mfi
  7435. nop __LINE__
  7436. FMA f73 = f33, f49, f73 // A2 * B2
  7437. (p5) adds C12 = 2 * SIZE, C4
  7438. }
  7439. ;;
  7440. { .mfb
  7441. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7442. FMA f81 = f33, f50, f81 // A2 * B3
  7443. nop __LINE__
  7444. }
  7445. { .mfb
  7446. nop __LINE__
  7447. FMA f89 = f33, f51, f89 // A2 * B4
  7448. nop __LINE__
  7449. }
  7450. ;;
  7451. { .mfb
  7452. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  7453. FMA f66 = f34, f48, f66 // A3 * B1
  7454. nop __LINE__
  7455. }
  7456. { .mfb
  7457. nop __LINE__
  7458. FMA f74 = f34, f49, f74 // A3 * B2
  7459. nop __LINE__
  7460. }
  7461. ;;
  7462. { .mfb
  7463. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  7464. FMA f82 = f34, f50, f82 // A3 * B3
  7465. nop __LINE__
  7466. }
  7467. { .mfb
  7468. nop __LINE__
  7469. FMA f90 = f34, f51, f90 // A3 * B4
  7470. nop __LINE__
  7471. }
  7472. ;;
  /* Reloads of the step-1 operands for the next pass start here (p4). */
  7473. { .mfb
  7474. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7475. FMA f67 = f35, f48, f67 // A4 * B1
  7476. }
  7477. { .mfb
  7478. nop __LINE__
  7479. FMA f75 = f35, f49, f75 // A4 * B2
  7480. nop __LINE__
  7481. }
  7482. { .mfb
  7483. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  7484. FMA f83 = f35, f50, f83 // A4 * B3
  7485. nop __LINE__
  7486. }
  7487. { .mfb
  7488. nop __LINE__
  7489. FMA f91 = f35, f51, f91 // A4 * B4
  7490. nop __LINE__
  7491. }
  7492. ;;
  /* Step 2 (p3 only). */
  7493. { .mfb
  7494. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  7495. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  7496. nop __LINE__
  7497. }
  7498. { .mfb
  7499. nop __LINE__
  7500. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  7501. nop __LINE__
  7502. }
  7503. ;;
  7504. { .mfb
  7505. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7506. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  7507. nop __LINE__
  7508. }
  7509. { .mfb
  7510. nop __LINE__
  7511. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  7512. nop __LINE__
  7513. }
  7514. ;;
  7515. { .mfb
  7516. nop __LINE__
  7517. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  7518. nop __LINE__
  7519. }
  7520. { .mfb
  7521. nop __LINE__
  7522. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  7523. nop __LINE__
  7524. }
  7525. ;;
  7526. { .mfb
  7527. nop __LINE__
  7528. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  7529. nop __LINE__
  7530. }
  7531. { .mfb
  7532. nop __LINE__
  7533. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  7534. nop __LINE__
  7535. }
  7536. ;;
  7537. { .mfb
  7538. nop __LINE__
  7539. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  7540. nop __LINE__
  7541. }
  7542. { .mfb
  7543. nop __LINE__
  7544. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  7545. nop __LINE__
  7546. }
  7547. ;;
  7548. { .mfb
  7549. nop __LINE__
  7550. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  7551. nop __LINE__
  7552. }
  7553. { .mfb
  7554. nop __LINE__
  7555. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  7556. nop __LINE__
  7557. }
  7558. ;;
  7559. { .mfb
  7560. nop __LINE__
  7561. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  7562. nop __LINE__
  7563. }
  7564. { .mfb
  7565. nop __LINE__
  7566. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  7567. nop __LINE__
  7568. }
  7569. ;;
  /* L is decremented in step with ar.lc so the predicates above can test it. */
  7570. { .mfi
  7571. nop __LINE__
  7572. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  7573. adds L = -1, L
  7574. }
  7575. { .mfb
  7576. nop __LINE__
  7577. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  7578. br.cloop.sptk.few .L062
  7579. }
  7580. ;;
  7581. .align 8
  7582. .L068:
  /* Solve and write-back for the four-row by four-column tile accumulated
     in f64..f67, f72..f75, f80..f83, f88..f91.  Exactly one of LN, LT, RN,
     RT is expected to be defined; each selects one substitution sequence.
     LDFPD, LDFD, STFD, FSUB, FMPY and FNMA are macros defined outside this
     chunk.
     NOTE(review): rows/columns are scaled with FMPY, so the diagonal
     entries are presumably stored as reciprocals - confirm against the
     packing code. */
  /* LN/RT: rebuild AOFFSET/BOFFSET from AORIG/B at offset KK - 4 (both
     preprocessor arms are identical here). */
  7583. #if defined(LN) || defined(RT)
  7584. #ifdef LN
  7585. adds r2 = -4, KK
  7586. #else
  7587. adds r2 = -4, KK
  7588. #endif
  7589. ;;
  7590. shladd r2 = r2, BASE_SHIFT, r0
  7591. ;;
  7592. shladd AOFFSET = r2, 2, AORIG
  7593. shladd BOFFSET = r2, 2, B
  7594. ;;
  7595. #endif
  /* Second store cursors, four elements past the first ones. */
  7596. adds AOFFSET2 = 4 * SIZE, AOFFSET
  7597. adds BOFFSET2 = 4 * SIZE, BOFFSET
  7598. ;;
  /* LN/LT: sixteen tile values are read from BOFFSET, four per row
     (f64,f72,f80,f88 | f65,... | f66,... | f67,...), and the accumulated
     products are subtracted from them.  BOFFSET is restored afterwards. */
  7599. #if defined(LN) || defined(LT)
  7600. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  7601. ;;
  7602. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  7603. ;;
  7604. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  7605. ;;
  7606. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  7607. ;;
  7608. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  7609. ;;
  7610. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  7611. ;;
  7612. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  7613. ;;
  7614. LDFPD f46, f47 = [BOFFSET]
  7615. adds BOFFSET = -14 * SIZE, BOFFSET
  7616. ;;
  7617. FSUB f64 = f32, f64
  7618. FSUB f72 = f33, f72
  7619. FSUB f80 = f34, f80
  7620. FSUB f88 = f35, f88
  7621. ;;
  7622. FSUB f65 = f36, f65
  7623. FSUB f73 = f37, f73
  7624. FSUB f81 = f38, f81
  7625. FSUB f89 = f39, f89
  7626. ;;
  7627. FSUB f66 = f40, f66
  7628. FSUB f74 = f41, f74
  7629. FSUB f82 = f42, f82
  7630. FSUB f90 = f43, f90
  7631. ;;
  7632. FSUB f67 = f44, f67
  7633. FSUB f75 = f45, f75
  7634. FSUB f83 = f46, f83
  7635. FSUB f91 = f47, f91
  7636. ;;
  7637. #else
  /* RN/RT: sixteen tile values are read from AOFFSET, four per column
     (f64..f67 | f72..f75 | f80..f83 | f88..f91).  AOFFSET is restored
     afterwards. */
  7638. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7639. ;;
  7640. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  7641. ;;
  7642. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  7643. ;;
  7644. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  7645. ;;
  7646. LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  7647. ;;
  7648. LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  7649. ;;
  7650. LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  7651. ;;
  7652. LDFPD f46, f47 = [AOFFSET]
  7653. adds AOFFSET = -14 * SIZE, AOFFSET
  7654. ;;
  7655. FSUB f64 = f32, f64
  7656. FSUB f65 = f33, f65
  7657. FSUB f66 = f34, f66
  7658. FSUB f67 = f35, f67
  7659. FSUB f72 = f36, f72
  7660. FSUB f73 = f37, f73
  7661. FSUB f74 = f38, f74
  7662. FSUB f75 = f39, f75
  7663. FSUB f80 = f40, f80
  7664. FSUB f81 = f41, f81
  7665. FSUB f82 = f42, f82
  7666. FSUB f83 = f43, f83
  7667. FSUB f88 = f44, f88
  7668. FSUB f89 = f45, f89
  7669. FSUB f90 = f46, f90
  7670. FSUB f91 = f47, f91
  7671. ;;
  7672. #endif
  /* LN: substitution from the fourth row up to the first.  Coefficients
     f32..f41 are gathered walking AOFFSET backwards from offset 14. */
  7673. #ifdef LN
  7674. adds AOFFSET = 14 * SIZE, AOFFSET
  7675. ;;
  7676. LDFPD f33, f32 = [AOFFSET]
  7677. adds AOFFSET = - 2 * SIZE, AOFFSET
  7678. ;;
  7679. LDFPD f35, f34 = [AOFFSET]
  7680. adds AOFFSET = - 2 * SIZE, AOFFSET
  7681. ;;
  7682. LDFD f36 = [AOFFSET], - 2 * SIZE
  7683. ;;
  7684. LDFPD f38, f37 = [AOFFSET]
  7685. adds AOFFSET = - 4 * SIZE, AOFFSET
  7686. ;;
  7687. LDFPD f40, f39 = [AOFFSET]
  7688. adds AOFFSET = - 4 * SIZE, AOFFSET
  7689. ;;
  7690. LDFD f41 = [AOFFSET]
  7691. ;;
  7692. FMPY f67 = f67, f32
  7693. FMPY f75 = f75, f32
  7694. FMPY f83 = f83, f32
  7695. FMPY f91 = f91, f32
  7696. ;;
  7697. FNMA f66 = f67, f33, f66
  7698. FNMA f74 = f75, f33, f74
  7699. FNMA f82 = f83, f33, f82
  7700. FNMA f90 = f91, f33, f90
  7701. ;;
  7702. FNMA f65 = f67, f34, f65
  7703. FNMA f73 = f75, f34, f73
  7704. FNMA f81 = f83, f34, f81
  7705. FNMA f89 = f91, f34, f89
  7706. ;;
  7707. FNMA f64 = f67, f35, f64
  7708. FNMA f72 = f75, f35, f72
  7709. FNMA f80 = f83, f35, f80
  7710. FNMA f88 = f91, f35, f88
  7711. ;;
  7712. FMPY f66 = f66, f36
  7713. FMPY f74 = f74, f36
  7714. FMPY f82 = f82, f36
  7715. FMPY f90 = f90, f36
  7716. ;;
  7717. FNMA f65 = f66, f37, f65
  7718. FNMA f73 = f74, f37, f73
  7719. FNMA f81 = f82, f37, f81
  7720. FNMA f89 = f90, f37, f89
  7721. ;;
  7722. FNMA f64 = f66, f38, f64
  7723. FNMA f72 = f74, f38, f72
  7724. FNMA f80 = f82, f38, f80
  7725. FNMA f88 = f90, f38, f88
  7726. ;;
  7727. FMPY f65 = f65, f39
  7728. FMPY f73 = f73, f39
  7729. FMPY f81 = f81, f39
  7730. FMPY f89 = f89, f39
  7731. ;;
  7732. FNMA f64 = f65, f40, f64
  7733. FNMA f72 = f73, f40, f72
  7734. FNMA f80 = f81, f40, f80
  7735. FNMA f88 = f89, f40, f88
  7736. ;;
  7737. FMPY f64 = f64, f41
  7738. FMPY f72 = f72, f41
  7739. FMPY f80 = f80, f41
  7740. FMPY f88 = f88, f41
  7741. ;;
  /* Write back rows three and four first (offsets 8-15), then rows one and
     two (offsets 0-7); the post-increments return both cursors to their
     starting positions. */
  7742. adds BOFFSET = 8 * SIZE, BOFFSET
  7743. adds BOFFSET2 = 8 * SIZE, BOFFSET2
  7744. ;;
  7745. STFD [BOFFSET] = f66, SIZE
  7746. STFD [BOFFSET2] = f67, SIZE
  7747. ;;
  7748. STFD [BOFFSET] = f74, SIZE
  7749. STFD [BOFFSET2] = f75, SIZE
  7750. ;;
  7751. STFD [BOFFSET] = f82, SIZE
  7752. STFD [BOFFSET2] = f83, SIZE
  7753. ;;
  7754. STFD [BOFFSET] = f90, - 11 * SIZE
  7755. STFD [BOFFSET2] = f91, - 11 * SIZE
  7756. ;;
  7757. STFD [BOFFSET] = f64, SIZE
  7758. STFD [BOFFSET2] = f65, SIZE
  7759. ;;
  7760. STFD [BOFFSET] = f72, SIZE
  7761. STFD [BOFFSET2] = f73, SIZE
  7762. ;;
  7763. STFD [BOFFSET] = f80, SIZE
  7764. STFD [BOFFSET2] = f81, SIZE
  7765. ;;
  7766. STFD [BOFFSET] = f88, -3 * SIZE
  7767. STFD [BOFFSET2] = f89, -3 * SIZE
  7768. ;;
  /* LN moves the C cursors back by four elements before the C stores. */
  7769. adds C1 = -4 * SIZE, C1
  7770. adds C2 = -4 * SIZE, C2
  7771. adds C3 = -4 * SIZE, C3
  7772. adds C4 = -4 * SIZE, C4
  7773. ;;
  7774. #endif
  /* LT: substitution from the first row down to the fourth.  Coefficients
     f32..f41 are gathered from AOFFSET offsets 0-3, 5-7, 10-11 and 15; the
     last load's -15 * SIZE post-increment restores AOFFSET. */
  7775. #ifdef LT
  7776. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  7777. ;;
  7778. LDFPD f34, f35 = [AOFFSET]
  7779. adds AOFFSET = 3 * SIZE, AOFFSET
  7780. ;;
  7781. LDFD f36 = [AOFFSET], 1 * SIZE
  7782. ;;
  7783. LDFPD f37, f38 = [AOFFSET]
  7784. adds AOFFSET = 4 * SIZE, AOFFSET
  7785. ;;
  7786. LDFPD f39, f40 = [AOFFSET]
  7787. adds AOFFSET = 5 * SIZE, AOFFSET
  7788. ;;
  7789. LDFD f41 = [AOFFSET], -15 * SIZE
  7790. ;;
  7791. FMPY f64 = f64, f32
  7792. FMPY f72 = f72, f32
  7793. FMPY f80 = f80, f32
  7794. FMPY f88 = f88, f32
  7795. ;;
  7796. FNMA f65 = f64, f33, f65
  7797. FNMA f73 = f72, f33, f73
  7798. FNMA f81 = f80, f33, f81
  7799. FNMA f89 = f88, f33, f89
  7800. ;;
  7801. FNMA f66 = f64, f34, f66
  7802. FNMA f74 = f72, f34, f74
  7803. FNMA f82 = f80, f34, f82
  7804. FNMA f90 = f88, f34, f90
  7805. ;;
  7806. FNMA f67 = f64, f35, f67
  7807. FNMA f75 = f72, f35, f75
  7808. FNMA f83 = f80, f35, f83
  7809. FNMA f91 = f88, f35, f91
  7810. ;;
  7811. FMPY f65 = f65, f36
  7812. FMPY f73 = f73, f36
  7813. FMPY f81 = f81, f36
  7814. FMPY f89 = f89, f36
  7815. ;;
  7816. FNMA f66 = f65, f37, f66
  7817. FNMA f74 = f73, f37, f74
  7818. FNMA f82 = f81, f37, f82
  7819. FNMA f90 = f89, f37, f90
  7820. ;;
  7821. FNMA f67 = f65, f38, f67
  7822. FNMA f75 = f73, f38, f75
  7823. FNMA f83 = f81, f38, f83
  7824. FNMA f91 = f89, f38, f91
  7825. ;;
  7826. FMPY f66 = f66, f39
  7827. FMPY f74 = f74, f39
  7828. FMPY f82 = f82, f39
  7829. FMPY f90 = f90, f39
  7830. ;;
  7831. FNMA f67 = f66, f40, f67
  7832. FNMA f75 = f74, f40, f75
  7833. FNMA f83 = f82, f40, f83
  7834. FNMA f91 = f90, f40, f91
  7835. ;;
  7836. FMPY f67 = f67, f41
  7837. FMPY f75 = f75, f41
  7838. FMPY f83 = f83, f41
  7839. FMPY f91 = f91, f41
  7840. ;;
  /* Write back rows one and two (offsets 0-7), then rows three and four
     (offsets 8-15); the final -11 * SIZE restores both cursors. */
  7841. STFD [BOFFSET] = f64, SIZE
  7842. STFD [BOFFSET2] = f65, SIZE
  7843. ;;
  7844. STFD [BOFFSET] = f72, SIZE
  7845. STFD [BOFFSET2] = f73, SIZE
  7846. ;;
  7847. STFD [BOFFSET] = f80, SIZE
  7848. STFD [BOFFSET2] = f81, SIZE
  7849. ;;
  7850. STFD [BOFFSET] = f88, 5 * SIZE
  7851. STFD [BOFFSET2] = f89, 5 * SIZE
  7852. ;;
  7853. STFD [BOFFSET] = f66, SIZE
  7854. STFD [BOFFSET2] = f67, SIZE
  7855. ;;
  7856. STFD [BOFFSET] = f74, SIZE
  7857. STFD [BOFFSET2] = f75, SIZE
  7858. ;;
  7859. STFD [BOFFSET] = f82, SIZE
  7860. STFD [BOFFSET2] = f83, SIZE
  7861. ;;
  7862. STFD [BOFFSET] = f90, -11 * SIZE
  7863. STFD [BOFFSET2] = f91, -11 * SIZE
  7864. ;;
  7865. #endif
  /* RN: substitution across the columns, first to last.  Coefficients
     f32..f41 are gathered from BOFFSET offsets 0-3, 5-7, 10-11 and 15. */
  7866. #ifdef RN
  7867. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  7868. ;;
  7869. LDFPD f34, f35 = [BOFFSET]
  7870. adds BOFFSET = 3 * SIZE, BOFFSET
  7871. ;;
  7872. LDFD f36 = [BOFFSET], 1 * SIZE
  7873. ;;
  7874. LDFPD f37, f38 = [BOFFSET]
  7875. adds BOFFSET = 4 * SIZE, BOFFSET
  7876. ;;
  7877. LDFPD f39, f40 = [BOFFSET]
  7878. adds BOFFSET = 5 * SIZE, BOFFSET
  7879. ;;
  7880. LDFD f41 = [BOFFSET], -15 * SIZE
  7881. ;;
  7882. FMPY f64 = f64, f32
  7883. FMPY f65 = f65, f32
  7884. FMPY f66 = f66, f32
  7885. FMPY f67 = f67, f32
  7886. ;;
  7887. FNMA f72 = f64, f33, f72
  7888. FNMA f73 = f65, f33, f73
  7889. FNMA f74 = f66, f33, f74
  7890. FNMA f75 = f67, f33, f75
  7891. ;;
  7892. FNMA f80 = f64, f34, f80
  7893. FNMA f81 = f65, f34, f81
  7894. FNMA f82 = f66, f34, f82
  7895. FNMA f83 = f67, f34, f83
  7896. ;;
  7897. FNMA f88 = f64, f35, f88
  7898. FNMA f89 = f65, f35, f89
  7899. FNMA f90 = f66, f35, f90
  7900. FNMA f91 = f67, f35, f91
  7901. ;;
  7902. FMPY f72 = f72, f36
  7903. FMPY f73 = f73, f36
  7904. FMPY f74 = f74, f36
  7905. FMPY f75 = f75, f36
  7906. ;;
  7907. FNMA f80 = f72, f37, f80
  7908. FNMA f81 = f73, f37, f81
  7909. FNMA f82 = f74, f37, f82
  7910. FNMA f83 = f75, f37, f83
  7911. ;;
  7912. FNMA f88 = f72, f38, f88
  7913. FNMA f89 = f73, f38, f89
  7914. FNMA f90 = f74, f38, f90
  7915. FNMA f91 = f75, f38, f91
  7916. ;;
  7917. FMPY f80 = f80, f39
  7918. FMPY f81 = f81, f39
  7919. FMPY f82 = f82, f39
  7920. FMPY f83 = f83, f39
  7921. ;;
  7922. FNMA f88 = f80, f40, f88
  7923. FNMA f89 = f81, f40, f89
  7924. FNMA f90 = f82, f40, f90
  7925. FNMA f91 = f83, f40, f91
  7926. ;;
  7927. FMPY f88 = f88, f41
  7928. FMPY f89 = f89, f41
  7929. FMPY f90 = f90, f41
  7930. FMPY f91 = f91, f41
  7931. ;;
  /* Write back in load order: columns one and two (offsets 0-7), then
     columns three and four (offsets 8-15); cursors are restored. */
  7932. STFD [AOFFSET] = f64, SIZE
  7933. STFD [AOFFSET2] = f72, SIZE
  7934. ;;
  7935. STFD [AOFFSET] = f65, SIZE
  7936. STFD [AOFFSET2] = f73, SIZE
  7937. ;;
  7938. STFD [AOFFSET] = f66, SIZE
  7939. STFD [AOFFSET2] = f74, SIZE
  7940. ;;
  7941. STFD [AOFFSET] = f67, 5 * SIZE
  7942. STFD [AOFFSET2] = f75, 5 * SIZE
  7943. ;;
  7944. STFD [AOFFSET] = f80, SIZE
  7945. STFD [AOFFSET2] = f88, SIZE
  7946. ;;
  7947. STFD [AOFFSET] = f81, SIZE
  7948. STFD [AOFFSET2] = f89, SIZE
  7949. ;;
  7950. STFD [AOFFSET] = f82, SIZE
  7951. STFD [AOFFSET2] = f90, SIZE
  7952. ;;
  7953. STFD [AOFFSET] = f83, -11 * SIZE
  7954. STFD [AOFFSET2] = f91, -11 * SIZE
  7955. ;;
  7956. #endif
  /* RT: substitution across the columns, last to first.  Coefficients are
     gathered walking BOFFSET backwards from offset 14. */
  7957. #ifdef RT
  7958. adds BOFFSET = 14 * SIZE, BOFFSET
  7959. ;;
  7960. LDFPD f33, f32 = [BOFFSET]
  7961. adds BOFFSET = - 2 * SIZE, BOFFSET
  7962. ;;
  7963. LDFPD f35, f34 = [BOFFSET]
  7964. adds BOFFSET = - 2 * SIZE, BOFFSET
  7965. ;;
  7966. LDFD f36 = [BOFFSET], - 2 * SIZE
  7967. ;;
  7968. LDFPD f38, f37 = [BOFFSET]
  7969. adds BOFFSET = - 4 * SIZE, BOFFSET
  7970. ;;
  7971. LDFPD f40, f39 = [BOFFSET]
  7972. adds BOFFSET = - 4 * SIZE, BOFFSET
  7973. ;;
  7974. LDFD f41 = [BOFFSET]
  7975. ;;
  7976. FMPY f88 = f88, f32
  7977. FMPY f89 = f89, f32
  7978. FMPY f90 = f90, f32
  7979. FMPY f91 = f91, f32
  7980. ;;
  7981. FNMA f80 = f88, f33, f80
  7982. FNMA f81 = f89, f33, f81
  7983. FNMA f82 = f90, f33, f82
  7984. FNMA f83 = f91, f33, f83
  7985. ;;
  7986. FNMA f72 = f88, f34, f72
  7987. FNMA f73 = f89, f34, f73
  7988. FNMA f74 = f90, f34, f74
  7989. FNMA f75 = f91, f34, f75
  7990. ;;
  7991. FNMA f64 = f88, f35, f64
  7992. FNMA f65 = f89, f35, f65
  7993. FNMA f66 = f90, f35, f66
  7994. FNMA f67 = f91, f35, f67
  7995. ;;
  7996. FMPY f80 = f80, f36
  7997. FMPY f81 = f81, f36
  7998. FMPY f82 = f82, f36
  7999. FMPY f83 = f83, f36
  8000. ;;
  8001. FNMA f72 = f80, f37, f72
  8002. FNMA f73 = f81, f37, f73
  8003. FNMA f74 = f82, f37, f74
  8004. FNMA f75 = f83, f37, f75
  8005. ;;
  8006. FNMA f64 = f80, f38, f64
  8007. FNMA f65 = f81, f38, f65
  8008. FNMA f66 = f82, f38, f66
  8009. FNMA f67 = f83, f38, f67
  8010. ;;
  8011. FMPY f72 = f72, f39
  8012. FMPY f73 = f73, f39
  8013. FMPY f74 = f74, f39
  8014. FMPY f75 = f75, f39
  8015. ;;
  8016. FNMA f64 = f72, f40, f64
  8017. FNMA f65 = f73, f40, f65
  8018. FNMA f66 = f74, f40, f66
  8019. FNMA f67 = f75, f40, f67
  8020. ;;
  8021. FMPY f64 = f64, f41
  8022. FMPY f65 = f65, f41
  8023. FMPY f66 = f66, f41
  8024. FMPY f67 = f67, f41
  8025. ;;
  /* Write back columns three and four first (offsets 8-15), then columns
     one and two (offsets 0-7); cursors end where they started. */
  8026. adds AOFFSET = 8 * SIZE, AOFFSET
  8027. adds AOFFSET2 = 8 * SIZE, AOFFSET2
  8028. ;;
  8029. STFD [AOFFSET] = f80, SIZE
  8030. STFD [AOFFSET2] = f88, SIZE
  8031. ;;
  8032. STFD [AOFFSET] = f81, SIZE
  8033. STFD [AOFFSET2] = f89, SIZE
  8034. ;;
  8035. STFD [AOFFSET] = f82, SIZE
  8036. STFD [AOFFSET2] = f90, SIZE
  8037. ;;
  8038. STFD [AOFFSET] = f83, - 11 * SIZE
  8039. STFD [AOFFSET2] = f91, - 11 * SIZE
  8040. ;;
  8041. STFD [AOFFSET] = f64, SIZE
  8042. STFD [AOFFSET2] = f72, SIZE
  8043. ;;
  8044. STFD [AOFFSET] = f65, SIZE
  8045. STFD [AOFFSET2] = f73, SIZE
  8046. ;;
  8047. STFD [AOFFSET] = f66, SIZE
  8048. STFD [AOFFSET2] = f74, SIZE
  8049. ;;
  8050. STFD [AOFFSET] = f67, - 3 * SIZE
  8051. STFD [AOFFSET2] = f75, - 3 * SIZE
  8052. ;;
  8053. #endif
  /* Store four results through each of C1..C4 and clear accumulators for
     the next tile.  Under LN the last store of each group steps back by
     3 * SIZE, leaving the cursor where the LN pre-decrement put it;
     otherwise each cursor advances by four elements. */
  8054. { .mmf
  8055. STFD [C1 ] = f64, SIZE
  8056. mov f64 = f0
  8057. }
  8058. ;;
  8059. { .mmi
  8060. STFD [C1 ] = f65, SIZE
  8061. }
  8062. ;;
  8063. { .mmi
  8064. STFD [C1 ] = f66, SIZE
  8065. }
  8066. ;;
  8067. { .mmi
  8068. #ifndef LN
  8069. STFD [C1 ] = f67, SIZE
  8070. #else
  8071. STFD [C1 ] = f67, - 3 * SIZE
  8072. #endif
  8073. }
  8074. ;;
  8075. { .mmf
  8076. STFD [C2 ] = f72, SIZE
  8077. mov f72 = f0
  8078. }
  8079. ;;
  8080. { .mmi
  8081. STFD [C2 ] = f73, SIZE
  8082. }
  8083. ;;
  8084. { .mmi
  8085. STFD [C2 ] = f74, SIZE
  8086. }
  8087. ;;
  8088. { .mmi
  8089. #ifndef LN
  8090. STFD [C2 ] = f75, SIZE
  8091. #else
  8092. STFD [C2 ] = f75, - 3 * SIZE
  8093. #endif
  8094. }
  8095. ;;
  8096. { .mmf
  8097. STFD [C3 ] = f80, SIZE
  8098. mov f80 = f0
  8099. }
  8100. ;;
  8101. { .mmi
  8102. STFD [C3 ] = f81, SIZE
  8103. }
  8104. ;;
  8105. { .mmi
  8106. STFD [C3 ] = f82, SIZE
  8107. }
  8108. ;;
  8109. { .mmi
  8110. #ifndef LN
  8111. STFD [C3 ] = f83, SIZE
  8112. #else
  8113. STFD [C3 ] = f83, - 3 * SIZE
  8114. #endif
  8115. }
  8116. ;;
  8117. { .mmf
  8118. STFD [C4 ] = f88, SIZE
  8119. mov f88 = f0
  8120. }
  8121. ;;
  8122. { .mmi
  8123. STFD [C4 ] = f89, SIZE
  8124. }
  8125. ;;
  8126. { .mmi
  8127. STFD [C4 ] = f90, SIZE
  8128. }
  8129. ;;
  8130. { .mmi
  8131. #ifndef LN
  8132. STFD [C4 ] = f91, SIZE
  8133. #else
  8134. STFD [C4 ] = f91, - 3 * SIZE
  8135. #endif
  8136. nop __LINE__
  8137. }
  8138. ;;
  8139. mov f65 = f0
  8140. ;;
  8141. mov f73 = f0
  8142. ;;
  /* Bookkeeping: r2 = K scaled by BASE_SHIFT, L = K - KK. */
  8143. shladd r2 = K, BASE_SHIFT, r0
  8144. ;;
  8145. { .mmi
  8146. sub L = K, KK
  8147. }
  8148. ;;
  /* RT: advance AORIG by 4 * r2. */
  8149. { .mmi
  8150. #ifdef RT
  8151. shladd AORIG = r2, 2, AORIG
  8152. #else
  8153. nop __LINE__
  8154. #endif
  8155. }
  8156. ;;
  8157. { .mmf
  8158. mov f81 = f0
  8159. }
  8160. ;;
  /* LT/RN: scale L by BASE_SHIFT, then move both AOFFSET and BOFFSET
     forward by 4 * L. */
  8161. { .mmi
  8162. #if defined(LT) || defined(RN)
  8163. shladd L = L, BASE_SHIFT, r0
  8164. #else
  8165. nop __LINE__
  8166. #endif
  8167. }
  8168. ;;
  8169. { .mmi
  8170. #if defined(LT) || defined(RN)
  8171. shladd AOFFSET = L, 2, AOFFSET
  8172. #else
  8173. nop __LINE__
  8174. #endif
  8175. }
  8176. ;;
  8177. { .mmi
  8178. #if defined(LT) || defined(RN)
  8179. shladd BOFFSET = L, 2, BOFFSET
  8180. #else
  8181. nop __LINE__
  8182. #endif
  8183. }
  8184. ;;
  8185. { .mmf
  8186. mov f89 = f0
  8187. }
  8188. ;;
  /* KK moves forward by 4 for LT and back by 4 for LN. */
  8189. { .mmi
  8190. #ifdef LT
  8191. adds KK = 4, KK
  8192. #elif defined LN
  8193. adds KK = -4, KK
  8194. #else
  8195. nop __LINE__
  8196. #endif
  8197. }
  8198. ;;
  /* Trip count for the next tile: KK for LT/RN, K - KK otherwise. */
  8199. { .mmi
  8200. #if defined(LT) || defined(RN)
  8201. mov L = KK
  8202. #else
  8203. sub L = K, KK
  8204. #endif
  8205. }
  8206. ;;
  8207. .align 8
  8208. .L051:
  /* Reached after the four-row tile, or directly from .L060 when bit 2 of
     M is clear.  Clears seven accumulators, computes I = M >> 3 and
     branches to .L089 when I is zero; otherwise execution falls through
     to the code that follows. */
  8209. mov f72 = f0
  8210. mov f80 = f0
  8211. mov f88 = f0
  8212. mov f65 = f0
  8213. mov f73 = f0
  8214. mov f81 = f0
  8215. mov f89 = f0
  8216. shr I = M, 3
  8217. ;;
  /* p6 = (I == 0), p7 = (I != 0). */
  8218. cmp.eq p6, p7 = 0, I
  8219. (p6) br.cond.dpnt .L089
  8220. ;;
  // .L052: prologue for the multiply-accumulate loop at .L053.
  //  - points BOFFSET (and, outside LT/RN, AOFFSET) at the operands,
  //  - issues the first A/B operand loads, all guarded by p7 = (L != 0),
  //  - zeroes accumulators f66-f71, f74-f79, f82-f87 and f90-f95
  //    (mov from f0 or setf.d from r0; both produce +0.0),
  //  - sets up the prefetch pointers PREC, PREA and PREB,
  //  - turns L into the ar.lc trip count and branches to .L058 when there
  //    is nothing to accumulate.
  // LDFPD and CPREFETCH are macros defined outside this excerpt.
  8221. .align 16
  8222. .L052:
  8223. { .mmi
  8224. cmp.ne p7, p0 = r0, L // p7 = (L != 0)
  8225. adds BOFFSET = 0 * SIZE, B
  8226. shl r2 = K, 3 + BASE_SHIFT // r2 = K << (3 + BASE_SHIFT)
  8227. }
  8228. { .mmi
  8229. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  8230. nop __LINE__
  8231. nop __LINE__
  8232. }
  8233. ;;
  8234. #if defined(LT) || defined(RN)
  // LT / RN: BOFFSET already equals B; AOFFSET is used as it stands.
  8235. { .mmi
  8236. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8237. nop __LINE__
  8238. nop __LINE__
  8239. }
  8240. ;;
  8241. #else
  // Other variants: offset both operand pointers by a multiple of r3.
  8242. { .mfi
  8243. shladd BOFFSET = r3, 2, B // BOFFSET = B + (r3 << 2)
  8244. #ifdef LN
  8245. sub AORIG = AORIG, r2 // LN only: step AORIG back by r2
  8246. #else
  8247. nop __LINE__
  8248. #endif
  8249. }
  8250. ;;
  8251. { .mfi
  8252. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8253. shladd AOFFSET = r3, 3, AORIG // AOFFSET = AORIG + (r3 << 3)
  8254. }
  8255. ;;
  8256. #endif
  8257. { .mfi
  8258. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8259. mov f66 = f0
  8260. nop __LINE__
  8261. }
  8262. { .mfi
  8263. (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8264. mov f74 = f0
  8265. nop __LINE__
  8266. }
  8267. ;;
  8268. { .mmf
  8269. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8270. setf.d f82 = r0
  8271. mov f90 = f0
  8272. }
  8273. ;;
  8274. { .mmf
  8275. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8276. setf.d f67 = r0
  8277. mov f75 = f0
  8278. }
  8279. { .mfi
  8280. setf.d f83 = r0
  8281. mov f91 = f0
  8282. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0); .L053 may clear it
  8283. }
  8284. ;;
  8285. { .mmf
  8286. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8287. }
  8288. { .mfi
  8289. adds PREC = CPREFETCHSIZE * SIZE, C1
  8290. }
  8291. ;;
  8292. { .mmf
  8293. CPREFETCH [PREC], LDC
  8294. setf.d f68 = r0
  8295. mov f76 = f0
  8296. }
  8297. { .mfi
  8298. setf.d f84 = r0
  8299. mov f92 = f0
  8300. adds L = 1, L // L = L + 1 (rounds up the halving below)
  8301. }
  8302. ;;
  8303. { .mmf
  8304. CPREFETCH [PREC], LDC
  8305. }
  8306. { .mfi
  8307. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  8308. }
  8309. ;;
  8310. { .mmf
  8311. CPREFETCH [PREC], LDC
  8312. setf.d f69 = r0
  8313. mov f77 = f0
  8314. }
  8315. { .mfi
  8316. setf.d f85 = r0
  8317. mov f93 = f0
  8318. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
  8319. }
  8320. ;;
  8321. { .mmf
  8322. CPREFETCH [PREC]
  8323. }
  8324. ;;
  // Trip count. tbit.z and shr sit in the same instruction group, so
  // tbit.z still tests the incremented-but-unshifted L. p12 is therefore
  // set when (original L + 1) is even, i.e. when the original L was odd.
  8325. { .mfi
  8326. setf.d f70 = r0
  8327. mov f78 = f0
  8328. tbit.z p12, p0 = L, 0
  8329. }
  8330. { .mfi
  8331. setf.d f86 = r0
  8332. mov f94 = f0
  8333. shr L = L, 1 // L = (original L + 1) >> 1
  8334. }
  8335. ;;
  8336. { .mfi
  8337. setf.d f71 = r0
  8338. adds L = -1, L // ar.lc wants (passes - 1)
  8339. }
  8340. ;;
  8341. { .mfi
  8342. setf.d f87 = r0
  8343. mov f79 = f0
  8344. mov ar.lc = L
  8345. }
  8346. { .mfb
  8347. cmp.eq p6, p0 = -1, L // L == -1 only when the original L was 0
  8348. mov f95 = f0
  8349. (p6) br.cond.dpnt .L058 // nothing to accumulate: skip the loop
  8350. }
  8351. ;;
  // .L053: counted loop (br.cloop on ar.lc) that accumulates products of
  // A values and B values into f64-f95. Each pass holds two updates:
  //   1st update (always executed): A in f32-f39, B in f48-f51
  //   2nd update (only under p3):   A in f40-f47, B in f56-f59
  // Operands for the 2nd update are loaded under p3 during the 1st update;
  // operands for the next pass's 1st update are reloaded under p4.
  // L counts down once per pass and is 0 on the pass where p3/p4 are
  // cleared; the trip count in ar.lc was set up by .L052.
  // FMA and LDFPD are macros defined outside this excerpt; per the
  // existing per-line comments, FMA d = a, b, c is used as d = a * b + c.
  8352. .align 8
  8353. .L053:
  // ---- 1st update: f32-f39 (A) times f48-f51 (B) ----
  8354. { .mfb
  8355. lfetch.nt1 [PREA], 16 * SIZE
  8356. FMA f64 = f32, f48, f64 // A1 * B1
  8357. nop __LINE__
  8358. }
  8359. { .mfi
  8360. nop __LINE__
  8361. FMA f72 = f32, f49, f72 // A1 * B2
  8362. (p12) cmp.ne p3, p0 = 0, L // if p12: p3 = (L != 0), so the 2nd update is dropped when L == 0
  8363. }
  8364. ;;
  8365. { .mfi
  8366. lfetch.nt1 [PREB], 8 * SIZE
  8367. FMA f80 = f32, f50, f80 // A1 * B3
  8368. cmp.ne p4, p5 = 0, L // p4 = (L != 0): reload operands for the next pass
  8369. }
  // C9..C12 = C1..C4 + 4 * SIZE. Nothing in this loop reads them or writes
  // C1..C4, so the adds produce the same values on every pass.
  8370. { .mfi
  8371. nop __LINE__
  8372. FMA f88 = f32, f51, f88 // A1 * B4
  8373. adds C9 = 4 * SIZE, C1
  8374. }
  8375. ;;
  8376. { .mfi
  8377. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8378. FMA f65 = f33, f48, f65 // A2 * B1
  8379. adds C10 = 4 * SIZE, C2
  8380. }
  8381. { .mfi
  8382. nop __LINE__
  8383. FMA f73 = f33, f49, f73 // A2 * B2
  8384. adds C11 = 4 * SIZE, C3
  8385. }
  8386. ;;
  8387. { .mfi
  8388. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  8389. FMA f81 = f33, f50, f81 // A2 * B3
  8390. adds C12 = 4 * SIZE, C4
  8391. }
  8392. { .mfb
  8393. nop __LINE__
  8394. FMA f89 = f33, f51, f89 // A2 * B4
  8395. nop __LINE__
  8396. }
  8397. ;;
  8398. { .mfb
  8399. (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  8400. FMA f66 = f34, f48, f66 // A3 * B1
  8401. nop __LINE__
  8402. }
  8403. { .mfb
  8404. nop __LINE__
  8405. FMA f74 = f34, f49, f74 // A3 * B2
  8406. nop __LINE__
  8407. }
  8408. ;;
  8409. { .mfb
  8410. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8411. FMA f82 = f34, f50, f82 // A3 * B3
  8412. nop __LINE__
  8413. }
  8414. { .mfb
  8415. nop __LINE__
  8416. FMA f90 = f34, f51, f90 // A3 * B4
  8417. nop __LINE__
  8418. }
  8419. ;;
  8420. { .mfb
  8421. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  8422. FMA f67 = f35, f48, f67 // A4 * B1
  8423. nop __LINE__
  8424. }
  8425. { .mfb
  8426. nop __LINE__
  8427. FMA f75 = f35, f49, f75 // A4 * B2
  8428. nop __LINE__
  8429. }
  8430. ;;
  8431. { .mfb
  8432. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  8433. FMA f83 = f35, f50, f83 // A4 * B3
  8434. nop __LINE__
  8435. }
  8436. { .mfb
  8437. nop __LINE__
  8438. FMA f91 = f35, f51, f91 // A4 * B4
  8439. nop __LINE__
  8440. }
  8441. ;;
  8442. { .mfb
  8443. nop __LINE__
  8444. FMA f68 = f36, f48, f68 // A5 * B1
  8445. nop __LINE__
  8446. }
  8447. { .mfb
  8448. nop __LINE__
  8449. FMA f76 = f36, f49, f76 // A5 * B2
  8450. nop __LINE__
  8451. }
  8452. ;;
  8453. { .mfb
  8454. nop __LINE__
  8455. FMA f84 = f36, f50, f84 // A5 * B3
  8456. nop __LINE__
  8457. }
  8458. { .mfb
  8459. nop __LINE__
  8460. FMA f92 = f36, f51, f92 // A5 * B4
  8461. nop __LINE__
  8462. }
  8463. ;;
  8464. { .mfb
  8465. nop __LINE__
  8466. FMA f69 = f37, f48, f69 // A6 * B1
  8467. nop __LINE__
  8468. }
  8469. { .mfb
  8470. nop __LINE__
  8471. FMA f77 = f37, f49, f77 // A6 * B2
  8472. nop __LINE__
  8473. }
  8474. ;;
  8475. { .mfb
  8476. nop __LINE__
  8477. FMA f85 = f37, f50, f85 // A6 * B3
  8478. nop __LINE__
  8479. }
  8480. { .mfb
  8481. nop __LINE__
  8482. FMA f93 = f37, f51, f93 // A6 * B4
  8483. nop __LINE__
  8484. }
  8485. ;;
  8486. { .mfb
  8487. nop __LINE__
  8488. FMA f70 = f38, f48, f70 // A7 * B1
  8489. nop __LINE__
  8490. }
  8491. { .mfb
  8492. nop __LINE__
  8493. FMA f78 = f38, f49, f78 // A7 * B2
  8494. nop __LINE__
  8495. }
  8496. ;;
  8497. { .mfb
  8498. nop __LINE__
  8499. FMA f86 = f38, f50, f86 // A7 * B3
  8500. nop __LINE__
  8501. }
  8502. { .mfb
  8503. nop __LINE__
  8504. FMA f94 = f38, f51, f94 // A7 * B4
  8505. nop __LINE__
  8506. }
  8507. ;;
  // From here the (p4) loads refill f32-f39 / f48-f51 for the next pass
  // while the last products of the 1st update are still being issued.
  8508. { .mfb
  8509. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8510. FMA f71 = f39, f48, f71 // A8 * B1
  8511. nop __LINE__
  8512. }
  8513. { .mfb
  8514. nop __LINE__
  8515. FMA f79 = f39, f49, f79 // A8 * B2
  8516. nop __LINE__
  8517. }
  8518. ;;
  8519. { .mfb
  8520. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8521. FMA f87 = f39, f50, f87 // A8 * B3
  8522. nop __LINE__
  8523. }
  8524. { .mfb
  8525. nop __LINE__
  8526. FMA f95 = f39, f51, f95 // A8 * B4
  8527. nop __LINE__
  8528. }
  8529. ;;
  // ---- 2nd update (under p3): f40-f47 (A) times f56-f59 (B) ----
  8530. { .mfb
  8531. (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8532. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  8533. nop __LINE__
  8534. }
  8535. { .mfb
  8536. nop __LINE__
  8537. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  8538. nop __LINE__
  8539. }
  8540. ;;
  8541. { .mfb
  8542. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8543. (p3) FMA f80 = f40, f58, f80 // A1 * B3
  8544. nop __LINE__
  8545. }
  8546. { .mfb
  8547. nop __LINE__
  8548. (p3) FMA f88 = f40, f59, f88 // A1 * B4
  8549. nop __LINE__
  8550. }
  8551. ;;
  8552. { .mfb
  8553. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8554. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  8555. nop __LINE__
  8556. }
  8557. { .mfb
  8558. nop __LINE__
  8559. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  8560. nop __LINE__
  8561. }
  8562. ;;
  8563. { .mfb
  8564. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8565. (p3) FMA f81 = f41, f58, f81 // A2 * B3
  8566. nop __LINE__
  8567. }
  8568. { .mfb
  8569. nop __LINE__
  8570. (p3) FMA f89 = f41, f59, f89 // A2 * B4
  8571. nop __LINE__
  8572. }
  8573. ;;
  8574. { .mfb
  8575. nop __LINE__
  8576. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  8577. nop __LINE__
  8578. }
  8579. { .mfb
  8580. nop __LINE__
  8581. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  8582. nop __LINE__
  8583. }
  8584. ;;
  8585. { .mfb
  8586. nop __LINE__
  8587. (p3) FMA f82 = f42, f58, f82 // A3 * B3
  8588. nop __LINE__
  8589. }
  8590. { .mfb
  8591. nop __LINE__
  8592. (p3) FMA f90 = f42, f59, f90 // A3 * B4
  8593. nop __LINE__
  8594. }
  8595. ;;
  8596. { .mfb
  8597. nop __LINE__
  8598. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  8599. nop __LINE__
  8600. }
  8601. { .mfb
  8602. nop __LINE__
  8603. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  8604. nop __LINE__
  8605. }
  8606. ;;
  8607. { .mfb
  8608. nop __LINE__
  8609. (p3) FMA f83 = f43, f58, f83 // A4 * B3
  8610. nop __LINE__
  8611. }
  8612. { .mfb
  8613. nop __LINE__
  8614. (p3) FMA f91 = f43, f59, f91 // A4 * B4
  8615. nop __LINE__
  8616. }
  8617. ;;
  8618. { .mfb
  8619. nop __LINE__
  8620. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  8621. nop __LINE__
  8622. }
  8623. { .mfb
  8624. nop __LINE__
  8625. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  8626. nop __LINE__
  8627. }
  8628. ;;
  8629. { .mfb
  8630. nop __LINE__
  8631. (p3) FMA f84 = f44, f58, f84 // A5 * B3
  8632. nop __LINE__
  8633. }
  8634. { .mfb
  8635. nop __LINE__
  8636. (p3) FMA f92 = f44, f59, f92 // A5 * B4
  8637. nop __LINE__
  8638. }
  8639. ;;
  8640. { .mfb
  8641. nop __LINE__
  8642. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  8643. nop __LINE__
  8644. }
  8645. { .mfb
  8646. nop __LINE__
  8647. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  8648. nop __LINE__
  8649. }
  8650. ;;
  8651. { .mfb
  8652. nop __LINE__
  8653. (p3) FMA f85 = f45, f58, f85 // A6 * B3
  8654. nop __LINE__
  8655. }
  8656. { .mfb
  8657. nop __LINE__
  8658. (p3) FMA f93 = f45, f59, f93 // A6 * B4
  8659. nop __LINE__
  8660. }
  8661. ;;
  8662. { .mfb
  8663. nop __LINE__
  8664. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  8665. nop __LINE__
  8666. }
  8667. { .mfb
  8668. nop __LINE__
  8669. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  8670. nop __LINE__
  8671. }
  8672. ;;
  8673. { .mfb
  8674. nop __LINE__
  8675. (p3) FMA f86 = f46, f58, f86 // A7 * B3
  8676. nop __LINE__
  8677. }
  8678. { .mfb
  8679. nop __LINE__
  8680. (p3) FMA f94 = f46, f59, f94 // A7 * B4
  8681. nop __LINE__
  8682. }
  8683. ;;
  8684. { .mfb
  8685. nop __LINE__
  8686. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  8687. nop __LINE__
  8688. }
  8689. { .mfb
  8690. nop __LINE__
  8691. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  8692. nop __LINE__
  8693. }
  8694. ;;
  8695. { .mfi
  8696. nop __LINE__
  8697. (p3) FMA f87 = f47, f58, f87 // A8 * B3
  8698. adds L = -1, L // one pass done; L feeds the p3/p4 tests at the top
  8699. }
  8700. { .mfb
  8701. nop __LINE__
  8702. (p3) FMA f95 = f47, f59, f95 // A8 * B4
  8703. br.cloop.sptk.few .L053 // loop while ar.lc != 0 (ar.lc is decremented by the branch)
  8704. }
  8705. ;;
  8706. .align 8
  8707. .L058:
  8708. #if defined(LN) || defined(RT)
  8709. #ifdef LN
  8710. adds r2 = -8, KK
  8711. #else
  8712. adds r2 = -4, KK
  8713. #endif
  8714. ;;
  8715. shladd r2 = r2, BASE_SHIFT, r0
  8716. ;;
  8717. shladd AOFFSET = r2, 3, AORIG
  8718. shladd BOFFSET = r2, 2, B
  8719. ;;
  8720. #endif
  8721. adds AOFFSET2 = 4 * SIZE, AOFFSET
  8722. adds BOFFSET2 = 4 * SIZE, BOFFSET
  8723. ;;
  8724. #if defined(LN) || defined(LT)
  8725. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  8726. ;;
  8727. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  8728. ;;
  8729. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  8730. ;;
  8731. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  8732. ;;
  8733. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  8734. ;;
  8735. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  8736. ;;
  8737. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  8738. ;;
  8739. LDFPD f46, f47 = [BOFFSET], 2 * SIZE
  8740. ;;
  8741. LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  8742. ;;
  8743. LDFPD f50, f51 = [BOFFSET], 2 * SIZE
  8744. ;;
  8745. LDFPD f52, f53 = [BOFFSET], 2 * SIZE
  8746. ;;
  8747. LDFPD f54, f55 = [BOFFSET], 2 * SIZE
  8748. ;;
  8749. LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  8750. ;;
  8751. LDFPD f58, f59 = [BOFFSET], 2 * SIZE
  8752. ;;
  8753. LDFPD f60, f61 = [BOFFSET], 2 * SIZE
  8754. ;;
  8755. LDFPD f62, f63 = [BOFFSET]
  8756. adds BOFFSET = -30 * SIZE, BOFFSET
  8757. ;;
  8758. FSUB f64 = f32, f64
  8759. FSUB f72 = f33, f72
  8760. FSUB f80 = f34, f80
  8761. FSUB f88 = f35, f88
  8762. FSUB f65 = f36, f65
  8763. FSUB f73 = f37, f73
  8764. FSUB f81 = f38, f81
  8765. FSUB f89 = f39, f89
  8766. FSUB f66 = f40, f66
  8767. FSUB f74 = f41, f74
  8768. FSUB f82 = f42, f82
  8769. FSUB f90 = f43, f90
  8770. FSUB f67 = f44, f67
  8771. FSUB f75 = f45, f75
  8772. FSUB f83 = f46, f83
  8773. FSUB f91 = f47, f91
  8774. FSUB f68 = f48, f68
  8775. FSUB f76 = f49, f76
  8776. FSUB f84 = f50, f84
  8777. FSUB f92 = f51, f92
  8778. FSUB f69 = f52, f69
  8779. FSUB f77 = f53, f77
  8780. FSUB f85 = f54, f85
  8781. FSUB f93 = f55, f93
  8782. FSUB f70 = f56, f70
  8783. FSUB f78 = f57, f78
  8784. FSUB f86 = f58, f86
  8785. FSUB f94 = f59, f94
  8786. FSUB f71 = f60, f71
  8787. FSUB f79 = f61, f79
  8788. FSUB f87 = f62, f87
  8789. FSUB f95 = f63, f95
  8790. ;;
  8791. #else
  8792. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  8793. ;;
  8794. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  8795. ;;
  8796. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  8797. ;;
  8798. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  8799. ;;
  8800. LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  8801. ;;
  8802. LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  8803. ;;
  8804. LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  8805. ;;
  8806. LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  8807. ;;
  8808. LDFPD f48, f49 = [AOFFSET], 2 * SIZE
  8809. ;;
  8810. LDFPD f50, f51 = [AOFFSET], 2 * SIZE
  8811. ;;
  8812. LDFPD f52, f53 = [AOFFSET], 2 * SIZE
  8813. ;;
  8814. LDFPD f54, f55 = [AOFFSET], 2 * SIZE
  8815. ;;
  8816. LDFPD f56, f57 = [AOFFSET], 2 * SIZE
  8817. ;;
  8818. LDFPD f58, f59 = [AOFFSET], 2 * SIZE
  8819. ;;
  8820. LDFPD f60, f61 = [AOFFSET], 2 * SIZE
  8821. ;;
  8822. LDFPD f62, f63 = [AOFFSET]
  8823. adds AOFFSET = -30 * SIZE, AOFFSET
  8824. ;;
  8825. FSUB f64 = f32, f64
  8826. FSUB f65 = f33, f65
  8827. FSUB f66 = f34, f66
  8828. FSUB f67 = f35, f67
  8829. FSUB f68 = f36, f68
  8830. FSUB f69 = f37, f69
  8831. FSUB f70 = f38, f70
  8832. FSUB f71 = f39, f71
  8833. ;;
  8834. FSUB f72 = f40, f72
  8835. FSUB f73 = f41, f73
  8836. FSUB f74 = f42, f74
  8837. FSUB f75 = f43, f75
  8838. FSUB f76 = f44, f76
  8839. FSUB f77 = f45, f77
  8840. FSUB f78 = f46, f78
  8841. FSUB f79 = f47, f79
  8842. ;;
  8843. FSUB f80 = f48, f80
  8844. FSUB f81 = f49, f81
  8845. FSUB f82 = f50, f82
  8846. FSUB f83 = f51, f83
  8847. FSUB f84 = f52, f84
  8848. FSUB f85 = f53, f85
  8849. FSUB f86 = f54, f86
  8850. FSUB f87 = f55, f87
  8851. FSUB f88 = f56, f88
  8852. FSUB f89 = f57, f89
  8853. FSUB f90 = f58, f90
  8854. FSUB f91 = f59, f91
  8855. FSUB f92 = f60, f92
  8856. FSUB f93 = f61, f93
  8857. FSUB f94 = f62, f94
  8858. FSUB f95 = f63, f95
  8859. ;;
  8860. #endif
  8861. #ifdef LN
  8862. adds AOFFSET = 62 * SIZE, AOFFSET
  8863. ;;
  8864. LDFPD f33, f32 = [AOFFSET]
  8865. adds AOFFSET = - 2 * SIZE, AOFFSET
  8866. ;;
  8867. LDFPD f35, f34 = [AOFFSET]
  8868. adds AOFFSET = - 2 * SIZE, AOFFSET
  8869. ;;
  8870. LDFPD f37, f36 = [AOFFSET]
  8871. adds AOFFSET = - 2 * SIZE, AOFFSET
  8872. ;;
  8873. LDFPD f39, f38 = [AOFFSET]
  8874. adds AOFFSET = - 2 * SIZE, AOFFSET
  8875. ;;
  8876. LDFD f40 = [AOFFSET], -2 * SIZE
  8877. ;;
  8878. LDFPD f42, f41 = [AOFFSET]
  8879. adds AOFFSET = - 2 * SIZE, AOFFSET
  8880. ;;
  8881. LDFPD f44, f43 = [AOFFSET]
  8882. adds AOFFSET = - 2 * SIZE, AOFFSET
  8883. ;;
  8884. LDFPD f46, f45 = [AOFFSET]
  8885. adds AOFFSET = - 4 * SIZE, AOFFSET
  8886. ;;
  8887. LDFPD f48, f47 = [AOFFSET]
  8888. adds AOFFSET = - 2 * SIZE, AOFFSET
  8889. ;;
  8890. LDFPD f50, f49 = [AOFFSET]
  8891. adds AOFFSET = - 2 * SIZE, AOFFSET
  8892. ;;
  8893. LDFPD f52, f51 = [AOFFSET]
  8894. adds AOFFSET = - 4 * SIZE, AOFFSET
  8895. ;;
  8896. LDFD f53 = [AOFFSET], -2 * SIZE
  8897. ;;
  8898. LDFPD f55, f54 = [AOFFSET]
  8899. adds AOFFSET = - 2 * SIZE, AOFFSET
  8900. ;;
  8901. LDFPD f57, f56 = [AOFFSET]
  8902. adds AOFFSET = - 6 * SIZE, AOFFSET
  8903. ;;
  8904. LDFPD f59, f58 = [AOFFSET]
  8905. adds AOFFSET = - 2 * SIZE, AOFFSET
  8906. ;;
  8907. LDFPD f61, f60 = [AOFFSET]
  8908. adds AOFFSET = - 6 * SIZE, AOFFSET
  8909. ;;
  8910. LDFD f16 = [AOFFSET], -2 * SIZE
  8911. ;;
  8912. LDFPD f18, f17 = [AOFFSET]
  8913. adds AOFFSET = - 8 * SIZE, AOFFSET
  8914. ;;
  8915. LDFPD f20, f19 = [AOFFSET]
  8916. adds AOFFSET = - 8 * SIZE, AOFFSET
  8917. ;;
  8918. LDFD f21 = [AOFFSET]
  8919. ;;
  8920. FMPY f71 = f71, f32
  8921. FMPY f79 = f79, f32
  8922. FMPY f87 = f87, f32
  8923. FMPY f95 = f95, f32
  8924. ;;
  8925. FNMA f70 = f71, f33, f70
  8926. FNMA f78 = f79, f33, f78
  8927. FNMA f86 = f87, f33, f86
  8928. FNMA f94 = f95, f33, f94
  8929. ;;
  8930. FNMA f69 = f71, f34, f69
  8931. FNMA f77 = f79, f34, f77
  8932. FNMA f85 = f87, f34, f85
  8933. FNMA f93 = f95, f34, f93
  8934. ;;
  8935. FNMA f68 = f71, f35, f68
  8936. FNMA f76 = f79, f35, f76
  8937. FNMA f84 = f87, f35, f84
  8938. FNMA f92 = f95, f35, f92
  8939. ;;
  8940. FNMA f67 = f71, f36, f67
  8941. FNMA f75 = f79, f36, f75
  8942. FNMA f83 = f87, f36, f83
  8943. FNMA f91 = f95, f36, f91
  8944. ;;
  8945. FNMA f66 = f71, f37, f66
  8946. FNMA f74 = f79, f37, f74
  8947. FNMA f82 = f87, f37, f82
  8948. FNMA f90 = f95, f37, f90
  8949. ;;
  8950. FNMA f65 = f71, f38, f65
  8951. FNMA f73 = f79, f38, f73
  8952. FNMA f81 = f87, f38, f81
  8953. FNMA f89 = f95, f38, f89
  8954. ;;
  8955. FNMA f64 = f71, f39, f64
  8956. FNMA f72 = f79, f39, f72
  8957. FNMA f80 = f87, f39, f80
  8958. FNMA f88 = f95, f39, f88
  8959. ;;
  8960. FMPY f70 = f70, f40
  8961. FMPY f78 = f78, f40
  8962. FMPY f86 = f86, f40
  8963. FMPY f94 = f94, f40
  8964. ;;
  8965. FNMA f69 = f70, f41, f69
  8966. FNMA f77 = f78, f41, f77
  8967. FNMA f85 = f86, f41, f85
  8968. FNMA f93 = f94, f41, f93
  8969. ;;
  8970. FNMA f68 = f70, f42, f68
  8971. FNMA f76 = f78, f42, f76
  8972. FNMA f84 = f86, f42, f84
  8973. FNMA f92 = f94, f42, f92
  8974. ;;
  8975. FNMA f67 = f70, f43, f67
  8976. FNMA f75 = f78, f43, f75
  8977. FNMA f83 = f86, f43, f83
  8978. FNMA f91 = f94, f43, f91
  8979. ;;
  8980. FNMA f66 = f70, f44, f66
  8981. FNMA f74 = f78, f44, f74
  8982. FNMA f82 = f86, f44, f82
  8983. FNMA f90 = f94, f44, f90
  8984. ;;
  8985. FNMA f65 = f70, f45, f65
  8986. FNMA f73 = f78, f45, f73
  8987. FNMA f81 = f86, f45, f81
  8988. FNMA f89 = f94, f45, f89
  8989. ;;
  8990. FNMA f64 = f70, f46, f64
  8991. FNMA f72 = f78, f46, f72
  8992. FNMA f80 = f86, f46, f80
  8993. FNMA f88 = f94, f46, f88
  8994. ;;
  8995. FMPY f69 = f69, f47
  8996. FMPY f77 = f77, f47
  8997. FMPY f85 = f85, f47
  8998. FMPY f93 = f93, f47
  8999. ;;
  9000. FNMA f68 = f69, f48, f68
  9001. FNMA f76 = f77, f48, f76
  9002. FNMA f84 = f85, f48, f84
  9003. FNMA f92 = f93, f48, f92
  9004. ;;
  9005. FNMA f67 = f69, f49, f67
  9006. FNMA f75 = f77, f49, f75
  9007. FNMA f83 = f85, f49, f83
  9008. FNMA f91 = f93, f49, f91
  9009. ;;
  9010. FNMA f66 = f69, f50, f66
  9011. FNMA f74 = f77, f50, f74
  9012. FNMA f82 = f85, f50, f82
  9013. FNMA f90 = f93, f50, f90
  9014. ;;
  9015. FNMA f65 = f69, f51, f65
  9016. FNMA f73 = f77, f51, f73
  9017. FNMA f81 = f85, f51, f81
  9018. FNMA f89 = f93, f51, f89
  9019. ;;
  9020. FNMA f64 = f69, f52, f64
  9021. FNMA f72 = f77, f52, f72
  9022. FNMA f80 = f85, f52, f80
  9023. FNMA f88 = f93, f52, f88
  9024. ;;
  9025. FMPY f68 = f68, f53
  9026. FMPY f76 = f76, f53
  9027. FMPY f84 = f84, f53
  9028. FMPY f92 = f92, f53
  9029. ;;
  9030. FNMA f67 = f68, f54, f67
  9031. FNMA f75 = f76, f54, f75
  9032. FNMA f83 = f84, f54, f83
  9033. FNMA f91 = f92, f54, f91
  9034. ;;
  9035. FNMA f66 = f68, f55, f66
  9036. FNMA f74 = f76, f55, f74
  9037. FNMA f82 = f84, f55, f82
  9038. FNMA f90 = f92, f55, f90
  9039. ;;
  9040. FNMA f65 = f68, f56, f65
  9041. FNMA f73 = f76, f56, f73
  9042. FNMA f81 = f84, f56, f81
  9043. FNMA f89 = f92, f56, f89
  9044. ;;
  9045. FNMA f64 = f68, f57, f64
  9046. FNMA f72 = f76, f57, f72
  9047. FNMA f80 = f84, f57, f80
  9048. FNMA f88 = f92, f57, f88
  9049. ;;
  9050. FMPY f67 = f67, f58
  9051. FMPY f75 = f75, f58
  9052. FMPY f83 = f83, f58
  9053. FMPY f91 = f91, f58
  9054. ;;
  9055. FNMA f66 = f67, f59, f66
  9056. FNMA f74 = f75, f59, f74
  9057. FNMA f82 = f83, f59, f82
  9058. FNMA f90 = f91, f59, f90
  9059. ;;
  9060. FNMA f65 = f67, f60, f65
  9061. FNMA f73 = f75, f60, f73
  9062. FNMA f81 = f83, f60, f81
  9063. FNMA f89 = f91, f60, f89
  9064. ;;
  9065. FNMA f64 = f67, f61, f64
  9066. FNMA f72 = f75, f61, f72
  9067. FNMA f80 = f83, f61, f80
  9068. FNMA f88 = f91, f61, f88
  9069. ;;
  9070. FMPY f66 = f66, f16
  9071. FMPY f74 = f74, f16
  9072. FMPY f82 = f82, f16
  9073. FMPY f90 = f90, f16
  9074. ;;
  9075. FNMA f65 = f66, f17, f65
  9076. FNMA f73 = f74, f17, f73
  9077. FNMA f81 = f82, f17, f81
  9078. FNMA f89 = f90, f17, f89
  9079. ;;
  9080. FNMA f64 = f66, f18, f64
  9081. FNMA f72 = f74, f18, f72
  9082. FNMA f80 = f82, f18, f80
  9083. FNMA f88 = f90, f18, f88
  9084. ;;
  9085. FMPY f65 = f65, f19
  9086. FMPY f73 = f73, f19
  9087. FMPY f81 = f81, f19
  9088. FMPY f89 = f89, f19
  9089. ;;
  9090. FNMA f64 = f65, f20, f64
  9091. FNMA f72 = f73, f20, f72
  9092. FNMA f80 = f81, f20, f80
  9093. FNMA f88 = f89, f20, f88
  9094. ;;
  9095. FMPY f64 = f64, f21
  9096. FMPY f72 = f72, f21
  9097. FMPY f80 = f80, f21
  9098. FMPY f88 = f88, f21
  9099. ;;
  9100. adds BOFFSET = 24 * SIZE, BOFFSET
  9101. adds BOFFSET2 = 24 * SIZE, BOFFSET2
  9102. ;;
  9103. STFD [BOFFSET] = f70, SIZE
  9104. STFD [BOFFSET2] = f71, SIZE
  9105. ;;
  9106. STFD [BOFFSET] = f78, SIZE
  9107. STFD [BOFFSET2] = f79, SIZE
  9108. ;;
  9109. STFD [BOFFSET] = f86, SIZE
  9110. STFD [BOFFSET2] = f87, SIZE
  9111. ;;
  9112. STFD [BOFFSET] = f94, - 11 * SIZE
  9113. STFD [BOFFSET2] = f95, - 11 * SIZE
  9114. ;;
  9115. STFD [BOFFSET] = f68, SIZE
  9116. STFD [BOFFSET2] = f69, SIZE
  9117. ;;
  9118. STFD [BOFFSET] = f76, SIZE
  9119. STFD [BOFFSET2] = f77, SIZE
  9120. ;;
  9121. STFD [BOFFSET] = f84, SIZE
  9122. STFD [BOFFSET2] = f85, SIZE
  9123. ;;
  9124. STFD [BOFFSET] = f92, - 11 * SIZE
  9125. STFD [BOFFSET2] = f93, - 11 * SIZE
  9126. ;;
  9127. STFD [BOFFSET] = f66, SIZE
  9128. STFD [BOFFSET2] = f67, SIZE
  9129. ;;
  9130. STFD [BOFFSET] = f74, SIZE
  9131. STFD [BOFFSET2] = f75, SIZE
  9132. ;;
  9133. STFD [BOFFSET] = f82, SIZE
  9134. STFD [BOFFSET2] = f83, SIZE
  9135. ;;
  9136. STFD [BOFFSET] = f90, - 11 * SIZE
  9137. STFD [BOFFSET2] = f91, - 11 * SIZE
  9138. ;;
  9139. STFD [BOFFSET] = f64, SIZE
  9140. STFD [BOFFSET2] = f65, SIZE
  9141. ;;
  9142. STFD [BOFFSET] = f72, SIZE
  9143. STFD [BOFFSET2] = f73, SIZE
  9144. ;;
  9145. STFD [BOFFSET] = f80, SIZE
  9146. STFD [BOFFSET2] = f81, SIZE
  9147. ;;
  9148. STFD [BOFFSET] = f88, - 3 * SIZE
  9149. STFD [BOFFSET2] = f89, - 3 * SIZE
  9150. ;;
  9151. adds C1 = -8 * SIZE, C1
  9152. adds C2 = -8 * SIZE, C2
  9153. adds C3 = -8 * SIZE, C3
  9154. adds C4 = -8 * SIZE, C4
  9155. ;;
  9156. #endif
  9157. #ifdef LT
  9158. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  9159. ;;
  9160. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  9161. ;;
  9162. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  9163. ;;
  9164. LDFPD f38, f39 = [AOFFSET]
  9165. adds AOFFSET = 3 * SIZE, AOFFSET
  9166. ;;
  9167. LDFD f40 = [AOFFSET], 1 * SIZE
  9168. ;;
  9169. LDFPD f41, f42 = [AOFFSET], 2 * SIZE
  9170. ;;
  9171. LDFPD f43, f44 = [AOFFSET], 2 * SIZE
  9172. ;;
  9173. LDFPD f45, f46 = [AOFFSET]
  9174. adds AOFFSET = 4 * SIZE, AOFFSET
  9175. ;;
  9176. LDFPD f47, f48 = [AOFFSET], 2 * SIZE
  9177. ;;
  9178. LDFPD f49, f50 = [AOFFSET], 2 * SIZE
  9179. ;;
  9180. LDFPD f51, f52 = [AOFFSET]
  9181. adds AOFFSET = 5 * SIZE, AOFFSET
  9182. ;;
  9183. LDFD f53 = [AOFFSET], 1 * SIZE
  9184. ;;
  9185. LDFPD f54, f55 = [AOFFSET], 2 * SIZE
  9186. ;;
  9187. LDFPD f56, f57 = [AOFFSET]
  9188. adds AOFFSET = 6 * SIZE, AOFFSET
  9189. ;;
  9190. LDFPD f58, f59 = [AOFFSET], 2 * SIZE
  9191. ;;
  9192. LDFPD f60, f61 = [AOFFSET]
  9193. adds AOFFSET = 7 * SIZE, AOFFSET
  9194. ;;
  9195. LDFD f16 = [AOFFSET], 1 * SIZE
  9196. ;;
  9197. LDFPD f17, f18 = [AOFFSET]
  9198. adds AOFFSET = 8 * SIZE, AOFFSET
  9199. ;;
  9200. LDFPD f19, f20 = [AOFFSET]
  9201. adds AOFFSET = 9 * SIZE, AOFFSET
  9202. ;;
  9203. LDFD f21 = [AOFFSET]
  9204. adds AOFFSET = -63 * SIZE, AOFFSET
  9205. ;;
  9206. FMPY f64 = f64, f32
  9207. FMPY f72 = f72, f32
  9208. FMPY f80 = f80, f32
  9209. FMPY f88 = f88, f32
  9210. ;;
  9211. FNMA f65 = f64, f33, f65
  9212. FNMA f73 = f72, f33, f73
  9213. FNMA f81 = f80, f33, f81
  9214. FNMA f89 = f88, f33, f89
  9215. ;;
  9216. FNMA f66 = f64, f34, f66
  9217. FNMA f74 = f72, f34, f74
  9218. FNMA f82 = f80, f34, f82
  9219. FNMA f90 = f88, f34, f90
  9220. ;;
  9221. FNMA f67 = f64, f35, f67
  9222. FNMA f75 = f72, f35, f75
  9223. FNMA f83 = f80, f35, f83
  9224. FNMA f91 = f88, f35, f91
  9225. ;;
  9226. FNMA f68 = f64, f36, f68
  9227. FNMA f76 = f72, f36, f76
  9228. FNMA f84 = f80, f36, f84
  9229. FNMA f92 = f88, f36, f92
  9230. ;;
  9231. FNMA f69 = f64, f37, f69
  9232. FNMA f77 = f72, f37, f77
  9233. FNMA f85 = f80, f37, f85
  9234. FNMA f93 = f88, f37, f93
  9235. ;;
  9236. FNMA f70 = f64, f38, f70
  9237. FNMA f78 = f72, f38, f78
  9238. FNMA f86 = f80, f38, f86
  9239. FNMA f94 = f88, f38, f94
  9240. ;;
  9241. FNMA f71 = f64, f39, f71
  9242. FNMA f79 = f72, f39, f79
  9243. FNMA f87 = f80, f39, f87
  9244. FNMA f95 = f88, f39, f95
  9245. ;;
  9246. FMPY f65 = f65, f40
  9247. FMPY f73 = f73, f40
  9248. FMPY f81 = f81, f40
  9249. FMPY f89 = f89, f40
  9250. ;;
  9251. FNMA f66 = f65, f41, f66
  9252. FNMA f74 = f73, f41, f74
  9253. FNMA f82 = f81, f41, f82
  9254. FNMA f90 = f89, f41, f90
  9255. ;;
  9256. FNMA f67 = f65, f42, f67
  9257. FNMA f75 = f73, f42, f75
  9258. FNMA f83 = f81, f42, f83
  9259. FNMA f91 = f89, f42, f91
  9260. ;;
  9261. FNMA f68 = f65, f43, f68
  9262. FNMA f76 = f73, f43, f76
  9263. FNMA f84 = f81, f43, f84
  9264. FNMA f92 = f89, f43, f92
  9265. ;;
  9266. FNMA f69 = f65, f44, f69
  9267. FNMA f77 = f73, f44, f77
  9268. FNMA f85 = f81, f44, f85
  9269. FNMA f93 = f89, f44, f93
  9270. ;;
  9271. FNMA f70 = f65, f45, f70
  9272. FNMA f78 = f73, f45, f78
  9273. FNMA f86 = f81, f45, f86
  9274. FNMA f94 = f89, f45, f94
  9275. ;;
  9276. FNMA f71 = f65, f46, f71
  9277. FNMA f79 = f73, f46, f79
  9278. FNMA f87 = f81, f46, f87
  9279. FNMA f95 = f89, f46, f95
  9280. ;;
  9281. FMPY f66 = f66, f47
  9282. FMPY f74 = f74, f47
  9283. FMPY f82 = f82, f47
  9284. FMPY f90 = f90, f47
  9285. ;;
  9286. FNMA f67 = f66, f48, f67
  9287. FNMA f75 = f74, f48, f75
  9288. FNMA f83 = f82, f48, f83
  9289. FNMA f91 = f90, f48, f91
  9290. ;;
  9291. FNMA f68 = f66, f49, f68
  9292. FNMA f76 = f74, f49, f76
  9293. FNMA f84 = f82, f49, f84
  9294. FNMA f92 = f90, f49, f92
  9295. ;;
  9296. FNMA f69 = f66, f50, f69
  9297. FNMA f77 = f74, f50, f77
  9298. FNMA f85 = f82, f50, f85
  9299. FNMA f93 = f90, f50, f93
  9300. ;;
  9301. FNMA f70 = f66, f51, f70
  9302. FNMA f78 = f74, f51, f78
  9303. FNMA f86 = f82, f51, f86
  9304. FNMA f94 = f90, f51, f94
  9305. ;;
  9306. FNMA f71 = f66, f52, f71
  9307. FNMA f79 = f74, f52, f79
  9308. FNMA f87 = f82, f52, f87
  9309. FNMA f95 = f90, f52, f95
  9310. ;;
  9311. FMPY f67 = f67, f53
  9312. FMPY f75 = f75, f53
  9313. FMPY f83 = f83, f53
  9314. FMPY f91 = f91, f53
  9315. ;;
  9316. FNMA f68 = f67, f54, f68
  9317. FNMA f76 = f75, f54, f76
  9318. FNMA f84 = f83, f54, f84
  9319. FNMA f92 = f91, f54, f92
  9320. ;;
  9321. FNMA f69 = f67, f55, f69
  9322. FNMA f77 = f75, f55, f77
  9323. FNMA f85 = f83, f55, f85
  9324. FNMA f93 = f91, f55, f93
  9325. ;;
  9326. FNMA f70 = f67, f56, f70
  9327. FNMA f78 = f75, f56, f78
  9328. FNMA f86 = f83, f56, f86
  9329. FNMA f94 = f91, f56, f94
  9330. ;;
  9331. FNMA f71 = f67, f57, f71
  9332. FNMA f79 = f75, f57, f79
  9333. FNMA f87 = f83, f57, f87
  9334. FNMA f95 = f91, f57, f95
  9335. ;;
  9336. FMPY f68 = f68, f58
  9337. FMPY f76 = f76, f58
  9338. FMPY f84 = f84, f58
  9339. FMPY f92 = f92, f58
  9340. ;;
  9341. FNMA f69 = f68, f59, f69
  9342. FNMA f77 = f76, f59, f77
  9343. FNMA f85 = f84, f59, f85
  9344. FNMA f93 = f92, f59, f93
  9345. ;;
  9346. FNMA f70 = f68, f60, f70
  9347. FNMA f78 = f76, f60, f78
  9348. FNMA f86 = f84, f60, f86
  9349. FNMA f94 = f92, f60, f94
  9350. ;;
  9351. FNMA f71 = f68, f61, f71
  9352. FNMA f79 = f76, f61, f79
  9353. FNMA f87 = f84, f61, f87
  9354. FNMA f95 = f92, f61, f95
  9355. ;;
  9356. FMPY f69 = f69, f16
  9357. FMPY f77 = f77, f16
  9358. FMPY f85 = f85, f16
  9359. FMPY f93 = f93, f16
  9360. ;;
  9361. FNMA f70 = f69, f17, f70
  9362. FNMA f78 = f77, f17, f78
  9363. FNMA f86 = f85, f17, f86
  9364. FNMA f94 = f93, f17, f94
  9365. ;;
  9366. FNMA f71 = f69, f18, f71
  9367. FNMA f79 = f77, f18, f79
  9368. FNMA f87 = f85, f18, f87
  9369. FNMA f95 = f93, f18, f95
  9370. ;;
  9371. FMPY f70 = f70, f19
  9372. FMPY f78 = f78, f19
  9373. FMPY f86 = f86, f19
  9374. FMPY f94 = f94, f19
  9375. ;;
  9376. FNMA f71 = f70, f20, f71
  9377. FNMA f79 = f78, f20, f79
  9378. FNMA f87 = f86, f20, f87
  9379. FNMA f95 = f94, f20, f95
  9380. ;;
  9381. FMPY f71 = f71, f21
  9382. FMPY f79 = f79, f21
  9383. FMPY f87 = f87, f21
  9384. FMPY f95 = f95, f21
  9385. ;;
  9386. STFD [BOFFSET] = f64, SIZE
  9387. STFD [BOFFSET2] = f65, SIZE
  9388. ;;
  9389. STFD [BOFFSET] = f72, SIZE
  9390. STFD [BOFFSET2] = f73, SIZE
  9391. ;;
  9392. STFD [BOFFSET] = f80, SIZE
  9393. STFD [BOFFSET2] = f81, SIZE
  9394. ;;
  9395. STFD [BOFFSET] = f88, 5 * SIZE
  9396. STFD [BOFFSET2] = f89, 5 * SIZE
  9397. ;;
  9398. STFD [BOFFSET] = f66, SIZE
  9399. STFD [BOFFSET2] = f67, SIZE
  9400. ;;
  9401. STFD [BOFFSET] = f74, SIZE
  9402. STFD [BOFFSET2] = f75, SIZE
  9403. ;;
  9404. STFD [BOFFSET] = f82, SIZE
  9405. STFD [BOFFSET2] = f83, SIZE
  9406. ;;
  9407. STFD [BOFFSET] = f90, 5 * SIZE
  9408. STFD [BOFFSET2] = f91, 5 * SIZE
  9409. ;;
  9410. STFD [BOFFSET] = f68, SIZE
  9411. STFD [BOFFSET2] = f69, SIZE
  9412. ;;
  9413. STFD [BOFFSET] = f76, SIZE
  9414. STFD [BOFFSET2] = f77, SIZE
  9415. ;;
  9416. STFD [BOFFSET] = f84, SIZE
  9417. STFD [BOFFSET2] = f85, SIZE
  9418. ;;
  9419. STFD [BOFFSET] = f92, 5 * SIZE
  9420. STFD [BOFFSET2] = f93, 5 * SIZE
  9421. ;;
  9422. STFD [BOFFSET] = f70, SIZE
  9423. STFD [BOFFSET2] = f71, SIZE
  9424. ;;
  9425. STFD [BOFFSET] = f78, SIZE
  9426. STFD [BOFFSET2] = f79, SIZE
  9427. ;;
  9428. STFD [BOFFSET] = f86, SIZE
  9429. STFD [BOFFSET2] = f87, SIZE
  9430. ;;
  9431. STFD [BOFFSET] = f94
  9432. STFD [BOFFSET2] = f95
  9433. adds C9 = 4 * SIZE, C1
  9434. adds BOFFSET = - 27 * SIZE, BOFFSET
  9435. adds BOFFSET2 = - 27 * SIZE, BOFFSET2
  9436. ;;
  9437. #endif
  9438. #ifdef RN
  9439. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  9440. ;;
  9441. LDFPD f34, f35 = [BOFFSET]
  9442. adds BOFFSET = 3 * SIZE, BOFFSET
  9443. ;;
  9444. LDFD f36 = [BOFFSET], 1 * SIZE
  9445. ;;
  9446. LDFPD f37, f38 = [BOFFSET]
  9447. adds BOFFSET = 4 * SIZE, BOFFSET
  9448. ;;
  9449. LDFPD f39, f40 = [BOFFSET]
  9450. adds BOFFSET = 5 * SIZE, BOFFSET
  9451. ;;
  9452. LDFD f41 = [BOFFSET], -15 * SIZE
  9453. ;;
  9454. FMPY f64 = f64, f32
  9455. FMPY f68 = f68, f32
  9456. FMPY f65 = f65, f32
  9457. FMPY f69 = f69, f32
  9458. FMPY f66 = f66, f32
  9459. FMPY f70 = f70, f32
  9460. FMPY f67 = f67, f32
  9461. FMPY f71 = f71, f32
  9462. ;;
  9463. FNMA f72 = f64, f33, f72
  9464. FNMA f76 = f68, f33, f76
  9465. FNMA f73 = f65, f33, f73
  9466. FNMA f77 = f69, f33, f77
  9467. FNMA f74 = f66, f33, f74
  9468. FNMA f78 = f70, f33, f78
  9469. FNMA f75 = f67, f33, f75
  9470. FNMA f79 = f71, f33, f79
  9471. ;;
  9472. FNMA f80 = f64, f34, f80
  9473. FNMA f84 = f68, f34, f84
  9474. FNMA f81 = f65, f34, f81
  9475. FNMA f85 = f69, f34, f85
  9476. FNMA f82 = f66, f34, f82
  9477. FNMA f86 = f70, f34, f86
  9478. FNMA f83 = f67, f34, f83
  9479. FNMA f87 = f71, f34, f87
  9480. ;;
  9481. FNMA f88 = f64, f35, f88
  9482. FNMA f92 = f68, f35, f92
  9483. FNMA f89 = f65, f35, f89
  9484. FNMA f93 = f69, f35, f93
  9485. FNMA f90 = f66, f35, f90
  9486. FNMA f94 = f70, f35, f94
  9487. FNMA f91 = f67, f35, f91
  9488. FNMA f95 = f71, f35, f95
  9489. ;;
  9490. FMPY f72 = f72, f36
  9491. FMPY f76 = f76, f36
  9492. FMPY f73 = f73, f36
  9493. FMPY f77 = f77, f36
  9494. FMPY f74 = f74, f36
  9495. FMPY f78 = f78, f36
  9496. FMPY f75 = f75, f36
  9497. FMPY f79 = f79, f36
  9498. ;;
  9499. FNMA f80 = f72, f37, f80
  9500. FNMA f84 = f76, f37, f84
  9501. FNMA f81 = f73, f37, f81
  9502. FNMA f85 = f77, f37, f85
  9503. FNMA f82 = f74, f37, f82
  9504. FNMA f86 = f78, f37, f86
  9505. FNMA f83 = f75, f37, f83
  9506. FNMA f87 = f79, f37, f87
  9507. ;;
  9508. FNMA f88 = f72, f38, f88
  9509. FNMA f92 = f76, f38, f92
  9510. FNMA f89 = f73, f38, f89
  9511. FNMA f93 = f77, f38, f93
  9512. FNMA f90 = f74, f38, f90
  9513. FNMA f94 = f78, f38, f94
  9514. FNMA f91 = f75, f38, f91
  9515. FNMA f95 = f79, f38, f95
  9516. ;;
  9517. FMPY f80 = f80, f39
  9518. FMPY f84 = f84, f39
  9519. FMPY f81 = f81, f39
  9520. FMPY f85 = f85, f39
  9521. FMPY f82 = f82, f39
  9522. FMPY f86 = f86, f39
  9523. FMPY f83 = f83, f39
  9524. FMPY f87 = f87, f39
  9525. ;;
  9526. FNMA f88 = f80, f40, f88
  9527. FNMA f92 = f84, f40, f92
  9528. FNMA f89 = f81, f40, f89
  9529. FNMA f93 = f85, f40, f93
  9530. FNMA f90 = f82, f40, f90
  9531. FNMA f94 = f86, f40, f94
  9532. FNMA f91 = f83, f40, f91
  9533. FNMA f95 = f87, f40, f95
  9534. ;;
  9535. FMPY f88 = f88, f41
  9536. FMPY f92 = f92, f41
  9537. FMPY f89 = f89, f41
  9538. FMPY f93 = f93, f41
  9539. FMPY f90 = f90, f41
  9540. FMPY f94 = f94, f41
  9541. FMPY f91 = f91, f41
  9542. FMPY f95 = f95, f41
  9543. ;;
  9544. STFD [AOFFSET] = f64, SIZE
  9545. STFD [AOFFSET2] = f68, SIZE
  9546. ;;
  9547. STFD [AOFFSET] = f65, SIZE
  9548. STFD [AOFFSET2] = f69, SIZE
  9549. ;;
  9550. STFD [AOFFSET] = f66, SIZE
  9551. STFD [AOFFSET2] = f70, SIZE
  9552. ;;
  9553. STFD [AOFFSET] = f67, 5 * SIZE
  9554. STFD [AOFFSET2] = f71, 5 * SIZE
  9555. ;;
  9556. STFD [AOFFSET] = f72, SIZE
  9557. STFD [AOFFSET2] = f76, SIZE
  9558. ;;
  9559. STFD [AOFFSET] = f73, SIZE
  9560. STFD [AOFFSET2] = f77, SIZE
  9561. ;;
  9562. STFD [AOFFSET] = f74, SIZE
  9563. STFD [AOFFSET2] = f78, SIZE
  9564. ;;
  9565. STFD [AOFFSET] = f75, 5 * SIZE
  9566. STFD [AOFFSET2] = f79, 5 * SIZE
  9567. ;;
  9568. STFD [AOFFSET] = f80, SIZE
  9569. STFD [AOFFSET2] = f84, SIZE
  9570. ;;
  9571. STFD [AOFFSET] = f81, SIZE
  9572. STFD [AOFFSET2] = f85, SIZE
  9573. ;;
  9574. STFD [AOFFSET] = f82, SIZE
  9575. STFD [AOFFSET2] = f86, SIZE
  9576. ;;
  9577. STFD [AOFFSET] = f83, 5 * SIZE
  9578. STFD [AOFFSET2] = f87, 5 * SIZE
  9579. ;;
  9580. STFD [AOFFSET] = f88, SIZE
  9581. STFD [AOFFSET2] = f92, SIZE
  9582. ;;
  9583. STFD [AOFFSET] = f89, SIZE
  9584. STFD [AOFFSET2] = f93, SIZE
  9585. ;;
  9586. STFD [AOFFSET] = f90, SIZE
  9587. STFD [AOFFSET2] = f94, SIZE
  9588. ;;
  9589. STFD [AOFFSET] = f91, -27 * SIZE
  9590. STFD [AOFFSET2] = f95, -27 * SIZE
  9591. ;;
  9592. #endif
  9593. #ifdef RT
  9594. adds BOFFSET = 14 * SIZE, BOFFSET
  9595. ;;
  9596. LDFPD f33, f32 = [BOFFSET]
  9597. adds BOFFSET = - 2 * SIZE, BOFFSET
  9598. ;;
  9599. LDFPD f35, f34 = [BOFFSET]
  9600. adds BOFFSET = - 2 * SIZE, BOFFSET
  9601. ;;
  9602. LDFD f36 = [BOFFSET], -2 * SIZE
  9603. ;;
  9604. LDFPD f38, f37 = [BOFFSET]
  9605. adds BOFFSET = - 4 * SIZE, BOFFSET
  9606. ;;
  9607. LDFPD f40, f39 = [BOFFSET]
  9608. adds BOFFSET = - 4 * SIZE, BOFFSET
  9609. ;;
  9610. LDFD f41 = [BOFFSET]
  9611. ;;
  9612. FMPY f88 = f88, f32
  9613. FMPY f92 = f92, f32
  9614. FMPY f89 = f89, f32
  9615. FMPY f93 = f93, f32
  9616. FMPY f90 = f90, f32
  9617. FMPY f94 = f94, f32
  9618. FMPY f91 = f91, f32
  9619. FMPY f95 = f95, f32
  9620. ;;
  9621. FNMA f80 = f88, f33, f80
  9622. FNMA f84 = f92, f33, f84
  9623. FNMA f81 = f89, f33, f81
  9624. FNMA f85 = f93, f33, f85
  9625. FNMA f82 = f90, f33, f82
  9626. FNMA f86 = f94, f33, f86
  9627. FNMA f83 = f91, f33, f83
  9628. FNMA f87 = f95, f33, f87
  9629. ;;
  9630. FNMA f72 = f88, f34, f72
  9631. FNMA f76 = f92, f34, f76
  9632. FNMA f73 = f89, f34, f73
  9633. FNMA f77 = f93, f34, f77
  9634. FNMA f74 = f90, f34, f74
  9635. FNMA f78 = f94, f34, f78
  9636. FNMA f75 = f91, f34, f75
  9637. FNMA f79 = f95, f34, f79
  9638. ;;
  9639. FNMA f64 = f88, f35, f64
  9640. FNMA f68 = f92, f35, f68
  9641. FNMA f65 = f89, f35, f65
  9642. FNMA f69 = f93, f35, f69
  9643. FNMA f66 = f90, f35, f66
  9644. FNMA f70 = f94, f35, f70
  9645. FNMA f67 = f91, f35, f67
  9646. FNMA f71 = f95, f35, f71
  9647. ;;
  9648. FMPY f80 = f80, f36
  9649. FMPY f84 = f84, f36
  9650. FMPY f81 = f81, f36
  9651. FMPY f85 = f85, f36
  9652. FMPY f82 = f82, f36
  9653. FMPY f86 = f86, f36
  9654. FMPY f83 = f83, f36
  9655. FMPY f87 = f87, f36
  9656. ;;
  9657. FNMA f72 = f80, f37, f72
  9658. FNMA f76 = f84, f37, f76
  9659. FNMA f73 = f81, f37, f73
  9660. FNMA f77 = f85, f37, f77
  9661. FNMA f74 = f82, f37, f74
  9662. FNMA f78 = f86, f37, f78
  9663. FNMA f75 = f83, f37, f75
  9664. FNMA f79 = f87, f37, f79
  9665. ;;
  9666. FNMA f64 = f80, f38, f64
  9667. FNMA f68 = f84, f38, f68
  9668. FNMA f65 = f81, f38, f65
  9669. FNMA f69 = f85, f38, f69
  9670. FNMA f66 = f82, f38, f66
  9671. FNMA f70 = f86, f38, f70
  9672. FNMA f67 = f83, f38, f67
  9673. FNMA f71 = f87, f38, f71
  9674. ;;
  9675. FMPY f72 = f72, f39
  9676. FMPY f76 = f76, f39
  9677. FMPY f73 = f73, f39
  9678. FMPY f77 = f77, f39
  9679. FMPY f74 = f74, f39
  9680. FMPY f78 = f78, f39
  9681. FMPY f75 = f75, f39
  9682. FMPY f79 = f79, f39
  9683. ;;
  9684. FNMA f64 = f72, f40, f64
  9685. FNMA f68 = f76, f40, f68
  9686. FNMA f65 = f73, f40, f65
  9687. FNMA f69 = f77, f40, f69
  9688. FNMA f66 = f74, f40, f66
  9689. FNMA f70 = f78, f40, f70
  9690. FNMA f67 = f75, f40, f67
  9691. FNMA f71 = f79, f40, f71
  9692. ;;
  9693. FMPY f64 = f64, f41
  9694. FMPY f68 = f68, f41
  9695. FMPY f65 = f65, f41
  9696. FMPY f69 = f69, f41
  9697. FMPY f66 = f66, f41
  9698. FMPY f70 = f70, f41
  9699. FMPY f67 = f67, f41
  9700. FMPY f71 = f71, f41
  9701. ;;
  9702. adds AOFFSET = 24 * SIZE, AOFFSET
  9703. adds AOFFSET2 = 24 * SIZE, AOFFSET2
  9704. ;;
  9705. STFD [AOFFSET] = f88, SIZE
  9706. STFD [AOFFSET2] = f92, SIZE
  9707. ;;
  9708. STFD [AOFFSET] = f89, SIZE
  9709. STFD [AOFFSET2] = f93, SIZE
  9710. ;;
  9711. STFD [AOFFSET] = f90, SIZE
  9712. STFD [AOFFSET2] = f94, SIZE
  9713. ;;
  9714. STFD [AOFFSET] = f91, - 11 * SIZE
  9715. STFD [AOFFSET2] = f95, - 11 * SIZE
  9716. ;;
  9717. STFD [AOFFSET] = f80, SIZE
  9718. STFD [AOFFSET2] = f84, SIZE
  9719. ;;
  9720. STFD [AOFFSET] = f81, SIZE
  9721. STFD [AOFFSET2] = f85, SIZE
  9722. ;;
  9723. STFD [AOFFSET] = f82, SIZE
  9724. STFD [AOFFSET2] = f86, SIZE
  9725. ;;
  9726. STFD [AOFFSET] = f83, - 11 * SIZE
  9727. STFD [AOFFSET2] = f87, - 11 * SIZE
  9728. ;;
  9729. STFD [AOFFSET] = f72, SIZE
  9730. STFD [AOFFSET2] = f76, SIZE
  9731. ;;
  9732. STFD [AOFFSET] = f73, SIZE
  9733. STFD [AOFFSET2] = f77, SIZE
  9734. ;;
  9735. STFD [AOFFSET] = f74, SIZE
  9736. STFD [AOFFSET2] = f78, SIZE
  9737. ;;
  9738. STFD [AOFFSET] = f75, - 11 * SIZE
  9739. STFD [AOFFSET2] = f79, - 11 * SIZE
  9740. ;;
  9741. STFD [AOFFSET] = f64, SIZE
  9742. STFD [AOFFSET2] = f68, SIZE
  9743. ;;
  9744. STFD [AOFFSET] = f65, SIZE
  9745. STFD [AOFFSET2] = f69, SIZE
  9746. ;;
  9747. STFD [AOFFSET] = f66, SIZE
  9748. STFD [AOFFSET2] = f70, SIZE
  9749. ;;
  9750. STFD [AOFFSET] = f67, - 3 * SIZE
  9751. STFD [AOFFSET2] = f71, - 3 * SIZE
  9752. ;;
  9753. #endif
  9754. adds C9 = 4 * SIZE, C1
  9755. ;;
  9756. { .mmf
  9757. STFD [C1 ] = f64, SIZE
  9758. STFD [C9 ] = f68, SIZE
  9759. mov f64 = f0
  9760. }
  9761. ;;
  9762. { .mmi
  9763. STFD [C1 ] = f65, SIZE
  9764. STFD [C9 ] = f69, SIZE
  9765. adds C10 = 4 * SIZE, C2
  9766. }
  9767. ;;
  9768. { .mmi
  9769. STFD [C1 ] = f66, SIZE
  9770. STFD [C9 ] = f70, SIZE
  9771. }
  9772. ;;
  9773. { .mmi
  9774. #ifndef LN
  9775. STFD [C1 ] = f67, 5 * SIZE
  9776. #else
  9777. STFD [C1 ] = f67, - 3 * SIZE
  9778. #endif
  9779. STFD [C9 ] = f71
  9780. adds C11 = 4 * SIZE, C3
  9781. }
  9782. ;;
  9783. { .mmf
  9784. STFD [C2 ] = f72, SIZE
  9785. STFD [C10] = f76, SIZE
  9786. mov f72 = f0
  9787. }
  9788. ;;
  9789. { .mmi
  9790. STFD [C2 ] = f73, SIZE
  9791. STFD [C10] = f77, SIZE
  9792. }
  9793. ;;
  9794. { .mmi
  9795. STFD [C2 ] = f74, SIZE
  9796. STFD [C10] = f78, SIZE
  9797. adds C12 = 4 * SIZE, C4
  9798. }
  9799. ;;
  9800. { .mmi
  9801. #ifndef LN
  9802. STFD [C2 ] = f75, 5 * SIZE
  9803. #else
  9804. STFD [C2 ] = f75, - 3 * SIZE
  9805. #endif
  9806. STFD [C10] = f79
  9807. }
  9808. ;;
  9809. { .mmf
  9810. STFD [C3 ] = f80, SIZE
  9811. STFD [C11] = f84, SIZE
  9812. }
  9813. ;;
  9814. { .mmi
  9815. STFD [C3 ] = f81, SIZE
  9816. STFD [C11] = f85, SIZE
  9817. }
  9818. ;;
  9819. { .mmi
  9820. STFD [C3 ] = f82, SIZE
  9821. STFD [C11] = f86, SIZE
  9822. }
  9823. ;;
  9824. { .mmi
  9825. #ifndef LN
  9826. STFD [C3 ] = f83, 5 * SIZE
  9827. #else
  9828. STFD [C3 ] = f83, - 3 * SIZE
  9829. #endif
  9830. STFD [C11] = f87
  9831. }
  9832. ;;
  9833. { .mmf
  9834. STFD [C4 ] = f88, SIZE
  9835. STFD [C12] = f92, SIZE
  9836. }
  9837. ;;
  9838. { .mmi
  9839. STFD [C4 ] = f89, SIZE
  9840. STFD [C12] = f93, SIZE
  9841. }
  9842. ;;
  9843. { .mmi
  9844. STFD [C4 ] = f90, SIZE
  9845. STFD [C12] = f94, SIZE
  9846. }
  9847. ;;
  9848. { .mmi
  9849. #ifndef LN
  9850. STFD [C4 ] = f91, 5 * SIZE
  9851. #else
  9852. STFD [C4 ] = f91, - 3 * SIZE
  9853. #endif
  9854. STFD [C12] = f95
  9855. cmp.ne p6, p0 = 1, I
  9856. }
  9857. ;;
  9858. adds I = -1, I
  9859. ;;
  9860. { .mmi
  9861. shladd r2 = K, BASE_SHIFT, r0
  9862. }
  9863. ;;
  9864. { .mmi
  9865. sub L = K, KK
  9866. }
  9867. ;;
  9868. { .mmi
  9869. #ifdef RT
  9870. shladd AORIG = r2, 3, AORIG
  9871. #else
  9872. nop __LINE__
  9873. #endif
  9874. }
  9875. ;;
  9876. { .mmi
  9877. #if defined(LT) || defined(RN)
  9878. shladd L = L, BASE_SHIFT, r0
  9879. #else
  9880. nop __LINE__
  9881. #endif
  9882. }
  9883. ;;
  9884. ;;
  9885. { .mmi
  9886. #if defined(LT) || defined(RN)
  9887. shladd AOFFSET = L, 3, AOFFSET
  9888. #else
  9889. nop __LINE__
  9890. #endif
  9891. }
  9892. ;;
  9893. { .mmi
  9894. #if defined(LT) || defined(RN)
  9895. shladd BOFFSET = L, 2, BOFFSET
  9896. #else
  9897. nop __LINE__
  9898. #endif
  9899. }
  9900. ;;
  9901. { .mmi
  9902. #ifdef LT
  9903. adds KK = 8, KK
  9904. #elif defined LN
  9905. adds KK = -8, KK
  9906. #else
  9907. nop __LINE__
  9908. #endif
  9909. }
  9910. ;;
  9911. { .mmi
  9912. #if defined(LT) || defined(RN)
  9913. mov L = KK
  9914. #else
  9915. sub L = K, KK
  9916. #endif
  9917. }
  9918. ;;
  9919. mov f64 = f0
  9920. mov f72 = f0
  9921. mov f80 = f0
  9922. mov f88 = f0
  9923. mov f65 = f0
  9924. mov f73 = f0
  9925. mov f81 = f0
  9926. mov f89 = f0
  9927. { .mmb
  9928. (p6) br.cond.dptk .L052
  9929. }
  9930. ;;
  /* .L089: bookkeeping reached by fall-through once the branch back to .L052 is not taken. */
  /* Re-bases B and KK according to the LN/LT/RN/RT variant and resets AOFFSET to A before the N & 2 test at .L090. */
  9931. .align 8
  9932. .L089:
  9933. #ifdef LN
  9934. shladd KK8 = K, BASE_SHIFT, r0 // KK8 = K << BASE_SHIFT
  9935. ;;
  9936. shladd B = KK8, 2, B // B += 4 * KK8 (presumably one 4-column panel of B -- TODO confirm)
  9937. #endif
  9938. #if defined(LT) || defined(RN)
  9939. mov B = BOFFSET // B continues from wherever BOFFSET was left
  9940. #endif
  9941. #ifdef RN
  9942. adds KK = 4, KK // RN: KK moves forward by 4
  9943. #endif
  9944. #ifdef RT
  9945. adds KK = -4, KK // RT: KK moves backward by 4
  9946. #endif
  9947. ;;
  9948. mov AOFFSET = A // restart AOFFSET at A
  9949. ;;
  /* .L090: entry for the part of the kernel guarded by bit 1 of N. */
  /* Skips to .L130 when that bit is clear; otherwise sets up C1/C2, KK, AORIG/AOFFSET and L, clears accumulators, */
  /* and then dispatches on bit 0 of M (falls through when set, branches to .L110 when clear). */
  9950. .align 16
  9951. .L090:
  9952. tbit.z p6, p0 = N, 1 // p6 = (bit 1 of N is clear)
  9953. (p6) br.cond.dpnt .L130 // nothing to do here for this N: go to .L130
  9954. ;;
  9955. #ifdef RT
  9956. { .mmi
  9957. shladd r3 = LDC, 1, r0 // r3 = 2 * LDC
  9958. nop __LINE__
  9959. shl r2 = K, 1 + BASE_SHIFT // r2 = K << (1 + BASE_SHIFT)
  9960. }
  9961. ;;
  9962. { .mmi
  9963. sub B = B, r2 // RT: step B back by r2
  9964. sub C = C, r3 // RT: step C back by 2 * LDC
  9965. nop __LINE__
  9966. }
  9967. #endif
  9968. ;;
  9969. mov f64 = f0 // clear accumulators (f0 reads as +0.0 on IA-64)
  9970. mov f65 = f0
  9971. mov f66 = f0
  9972. mov f67 = f0
  9973. mov f72 = f0
  9974. mov f73 = f0
  9975. mov f74 = f0
  9976. mov f75 = f0
  9977. ;;
  9978. { .mfi
  9979. mov C1 = C // coffset1 = c + 0 * ldc
  9980. #ifdef LN
  9981. add KK = M, OFFSET // LN: KK = M + OFFSET
  9982. #elif defined LT
  9983. mov KK = OFFSET // LT: KK = OFFSET
  9984. #else
  9985. nop __LINE__
  9986. #endif
  9987. }
  9988. ;;
  9989. { .mmf
  9990. #if defined(LN) || defined(RT)
  9991. mov AORIG = A // LN/RT: AORIG = A; AOFFSET is derived from AORIG later
  9992. #else
  9993. mov AOFFSET = A // LT/RN: AOFFSET = A
  9994. #endif
  9995. }
  9996. { .mmf
  9997. add C2 = LDC, C // coffset2 = c + 1 * ldc
  9998. }
  9999. ;;
  10000. { .mfi
  10001. #ifndef RT
  10002. shladd C = LDC, 1, C // coffset += 2 * ldc (shladd by 1 adds LDC << 1)
  10003. #else
  10004. nop __LINE__
  10005. #endif
  10006. mov f81 = f0
  10007. #if defined(LT) || defined(RN)
  10008. mov L = KK // LT/RN: L = KK
  10009. #else
  10010. sub L = K, KK // LN/RT: L = K - KK
  10011. #endif
  10012. }
  10013. ;;
  10014. tbit.z p6, p7 = M, 0 // p6 = (bit 0 of M is clear), p7 = its complement
  10015. (p6) br.cond.dptk .L110 // bit 0 of M clear: skip the single-row code, go to .L110
  10016. ;;
  /* Single-row (bit 0 of M set) by two-column accumulation: setup followed by the .L122 loop. */
  /* With n = the initial L, the loop runs ceil(n / 2) times and handles two steps of the running index per trip; */
  /* p3 disables the second step on the last trip when n is odd, p4 disables the preload for a non-existent next trip. */
  /* FMA/LDFD/LDFPD are macros defined outside this chunk; the existing "A * B" comments indicate multiply-accumulate. */
  10017. { .mib
  10018. #if defined(LT) || defined(RN)
  10019. mov L = KK // LT/RN: L = KK
  10020. #else
  10021. sub L = K, KK // LN/RT: L = K - KK
  10022. #endif
  10023. }
  10024. ;;
  10025. { .mmi
  10026. cmp.ne p7, p0 = r0, L // p7 = (L != 0): guards the initial loads
  10027. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  10028. shl r2 = K, 0 + BASE_SHIFT // r2 = K << BASE_SHIFT
  10029. }
  10030. { .mmi
  10031. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  10032. nop __LINE__
  10033. nop __LINE__
  10034. }
  10035. ;;
  10036. #if defined(LT) || defined(RN)
  10037. { .mmf
  10038. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10039. }
  10040. ;;
  10041. #else
  10042. { .mfi
  10043. shladd BOFFSET = r3, 1, B // BOFFSET = B + 2 * r3
  10044. #ifdef LN
  10045. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  10046. #else
  10047. nop __LINE__
  10048. #endif
  10049. }
  10050. ;;
  10051. { .mfi
  10052. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10053. add AOFFSET = r3, AORIG // AOFFSET = AORIG + r3
  10054. }
  10055. ;;
  10056. #endif
  10057. { .mmi
  10058. adds L = 1, L // L = n + 1
  10059. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  10060. cmp.eq p3, p0 = r0, r0 // p3 = true
  10061. }
  10062. ;;
  10063. { .mii
  10064. tbit.z p12, p0 = L, 0 // p12 = (n + 1 is even), i.e. n is odd
  10065. shr L = L, 1 // L = (n + 1) >> 1
  10066. }
  10067. ;;
  10068. { .mmi
  10069. adds L = -1, L // L = trip count - 1 (value loaded into ar.lc)
  10070. }
  10071. ;;
  10072. { .mmi
  10073. cmp.eq p6, p0 = -1, L // p6 = (L == -1), i.e. n was 0
  10074. }
  10075. ;;
  10076. { .mib
  10077. (p7) LDFD f32 = [AOFFSET], 1 * SIZE // preload first element from A
  10078. mov ar.lc = L
  10079. (p6) br.cond.dpnt .L128 // nothing to accumulate: go straight to .L128
  10080. }
  10081. ;;
  10082. .align 8
  10083. .L122:
  10084. { .mfi
  10085. FMA f64 = f32, f48, f64 // A1 * B1
  10086. cmp.ne p4, p5 = 0, L // p4 = (another trip follows)
  10087. }
  10088. { .mfi
  10089. nop __LINE__
  10090. FMA f72 = f32, f49, f72 // A1 * B2
  10091. (p12) cmp.ne p3, p0 = 0, L // odd n: p3 becomes false on the last trip
  10092. }
  10093. ;;
  10094. { .mmi
  10095. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE // operands for the second step of this trip
  10096. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  10097. nop __LINE__
  10098. }
  10099. { .mmi
  10100. nop __LINE__
  10101. nop __LINE__
  10102. nop __LINE__
  10103. }
  10104. ;;
  10105. { .mfi
  10106. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload for the next trip
  10107. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  10108. adds L = -1, L
  10109. }
  10110. { .mfb
  10111. (p4) LDFD f32 = [AOFFSET], 1 * SIZE // preload for the next trip
  10112. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  10113. br.cloop.sptk.few .L122 // loop while ar.lc != 0 (ar.lc is decremented by the branch)
  10114. }
  10115. ;;
  /* .L128: finish the one-row by two-column tile whose products were accumulated in f64 / f72. */
  /* Steps: position AOFFSET/BOFFSET, form (loaded value - accumulator), apply the variant-specific */
  /* triangular-solve arithmetic, write the results back to the packed buffer and to C1/C2, then advance pointers and KK. */
  /* The factors are applied with FMPY (multiply); presumably the diagonal entries are stored pre-inverted -- TODO confirm. */
  /* FSUB/FMPY/FNMA are macros defined outside this chunk; operand order is assumed to follow IA-64 fsub/fnma -- TODO confirm. */
  10116. .L128:
  10117. #if defined(LN) || defined(RT)
  10118. #ifdef LN
  10119. adds r2 = -1, KK // LN: r2 = KK - 1
  10120. #else
  10121. adds r2 = -2, KK // RT: r2 = KK - 2
  10122. #endif
  10123. ;;
  10124. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT
  10125. ;;
  10126. add AOFFSET = r2, AORIG // AOFFSET = AORIG + r2
  10127. shladd BOFFSET = r2, 1, B // BOFFSET = B + 2 * r2
  10128. ;;
  10129. #endif
  10130. adds AOFFSET2 = 4 * SIZE, AOFFSET // AOFFSET2/BOFFSET2 are set here but not referenced again before .L110
  10131. adds BOFFSET2 = 4 * SIZE, BOFFSET
  10132. ;;
  10133. #if defined(LN) || defined(LT)
  10134. LDFPD f32, f33 = [BOFFSET] // LN/LT: right-hand side comes from the buffer at BOFFSET
  10135. ;;
  10136. FSUB f64 = f32, f64 // loaded value minus accumulated product
  10137. FSUB f72 = f33, f72
  10138. ;;
  10139. #else
  10140. LDFPD f32, f33 = [AOFFSET] // RN/RT: right-hand side comes from the buffer at AOFFSET
  10141. ;;
  10142. FSUB f64 = f32, f64 // loaded value minus accumulated product
  10143. FSUB f72 = f33, f72
  10144. ;;
  10145. #endif
  10146. #ifdef LN
  10147. LDFD f32 = [AOFFSET] // single factor for the 1x1 triangular block
  10148. ;;
  10149. FMPY f64 = f64, f32
  10150. FMPY f72 = f72, f32
  10151. ;;
  10152. { .mmi
  10153. STFD [BOFFSET] = f64, SIZE // write solved values back to the buffer
  10154. adds C1 = -1 * SIZE, C1 // LN: step C1 back one element before the store below
  10155. }
  10156. ;;
  10157. { .mmi
  10158. STFD [BOFFSET] = f72, -SIZE // second store restores BOFFSET
  10159. adds C2 = -1 * SIZE, C2 // LN: step C2 back one element before the store below
  10160. }
  10161. ;;
  10162. #endif
  10163. #ifdef LT
  10164. LDFD f32 = [AOFFSET] // single factor for the 1x1 triangular block
  10165. ;;
  10166. FMPY f64 = f64, f32
  10167. FMPY f72 = f72, f32
  10168. ;;
  10169. STFD [BOFFSET] = f64, SIZE // write solved values back to the buffer
  10170. ;;
  10171. STFD [BOFFSET] = f72, -SIZE // second store restores BOFFSET
  10172. ;;
  10173. #endif
  10174. #ifdef RN
  10175. LDFPD f32, f33 = [BOFFSET] // elements 0 and 1 of the 2x2 block at BOFFSET
  10176. adds BOFFSET = 3 * SIZE, BOFFSET
  10177. ;;
  10178. LDFD f34 = [BOFFSET], -3 * SIZE // element 3; post-decrement restores BOFFSET
  10179. ;;
  10180. FMPY f64 = f64, f32 // first column solved first
  10181. ;;
  10182. FNMA f72 = f64, f33, f72 // eliminate its contribution from the second column
  10183. ;;
  10184. FMPY f72 = f72, f34
  10185. ;;
  10186. STFD [AOFFSET] = f64, SIZE // write solved values back to the buffer
  10187. ;;
  10188. STFD [AOFFSET] = f72, -SIZE // second store restores AOFFSET
  10189. ;;
  10190. #endif
  10191. #ifdef RT
  10192. adds BOFFSET = 2 * SIZE, BOFFSET
  10193. ;;
  10194. LDFPD f33, f32 = [BOFFSET] // f33 = element 2, f32 = element 3 of the 2x2 block
  10195. adds BOFFSET = - 2 * SIZE, BOFFSET
  10196. ;;
  10197. LDFD f34 = [BOFFSET] // element 0
  10198. ;;
  10199. FMPY f72 = f72, f32 // second column solved first
  10200. ;;
  10201. FNMA f64 = f72, f33, f64 // eliminate its contribution from the first column
  10202. ;;
  10203. FMPY f64 = f64, f34
  10204. ;;
  10205. STFD [AOFFSET] = f64, SIZE // write solved values back to the buffer
  10206. ;;
  10207. STFD [AOFFSET] = f72, -SIZE // second store restores AOFFSET
  10208. ;;
  10209. #endif
  10210. #ifndef LN
  10211. STFD [C1 ] = f64, SIZE // store to C and advance past the element
  10212. #else
  10213. STFD [C1 ] = f64 // LN: C1 was already stepped back; leave it in place
  10214. #endif
  10215. #ifndef LN
  10216. STFD [C2 ] = f72, SIZE
  10217. #else
  10218. STFD [C2 ] = f72
  10219. #endif
  10220. mov f64 = f0 // clear accumulators for the next tile
  10221. mov f72 = f0
  10222. ;;
  10223. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  10224. ;;
  10225. sub L = K, KK
  10226. ;;
  10227. #ifdef RT
  10228. add AORIG = r2, AORIG // RT: AORIG += r2
  10229. #else
  10230. nop __LINE__
  10231. #endif
  10232. ;;
  10233. #if defined(LT) || defined(RN)
  10234. shladd L = L, BASE_SHIFT, r0 // L = (K - KK) << BASE_SHIFT
  10235. #else
  10236. nop __LINE__
  10237. #endif
  10238. ;;
  10239. #if defined(LT) || defined(RN)
  10240. add AOFFSET = L, AOFFSET // AOFFSET += L
  10241. #else
  10242. nop __LINE__
  10243. #endif
  10244. ;;
  10245. #if defined(LT) || defined(RN)
  10246. shladd BOFFSET = L, 1, BOFFSET // BOFFSET += 2 * L
  10247. #else
  10248. nop __LINE__
  10249. #endif
  10250. ;;
  10251. #ifdef LT
  10252. adds KK = 1, KK // LT: KK moves forward by the one row just handled
  10253. #elif defined LN
  10254. adds KK = -1, KK // LN: KK moves backward by one
  10255. #else
  10256. nop __LINE__
  10257. #endif
  10258. ;;
  10259. #if defined(LT) || defined(RN)
  10260. mov L = KK
  10261. #else
  10262. sub L = K, KK
  10263. #endif
  10264. ;;
  /* .L110 / .L112: two-row (bit 1 of M set) by two-column accumulation into f64, f65, f72, f73. */
  /* Same loop scheme as .L122: ceil(n / 2) trips for n = initial L, two steps per trip, */
  /* p3 suppresses the second step of the last trip when n is odd, p4 suppresses the preload after the last trip. */
  /* NOTE(review): PREB is used as a prefetch address in .L112 but is not assigned anywhere in this block -- confirm it holds a sensible address here. */
  10265. .align 8
  10266. .L110:
  10267. tbit.z p6, p7 = M, 1 // p6 = (bit 1 of M is clear)
  10268. (p6) br.cond.dptk .L100 // no two-row tile: go to .L100
  10269. ;;
  10270. { .mib
  10271. #if defined(LT) || defined(RN)
  10272. mov L = KK // LT/RN: L = KK
  10273. #else
  10274. sub L = K, KK // LN/RT: L = K - KK
  10275. #endif
  10276. }
  10277. ;;
  10278. { .mmi
  10279. cmp.ne p7, p0 = r0, L // p7 = (L != 0): guards the initial loads
  10280. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  10281. shl r2 = K, 1 + BASE_SHIFT // r2 = K << (1 + BASE_SHIFT)
  10282. }
  10283. { .mmi
  10284. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  10285. nop __LINE__
  10286. nop __LINE__
  10287. }
  10288. ;;
  10289. #if defined(LT) || defined(RN)
  10290. { .mmf
  10291. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10292. }
  10293. ;;
  10294. #else
  10295. { .mfi
  10296. shladd BOFFSET = r3, 1, B // BOFFSET = B + 2 * r3
  10297. #ifdef LN
  10298. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  10299. #else
  10300. nop __LINE__
  10301. #endif
  10302. }
  10303. ;;
  10304. { .mfi
  10305. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10306. shladd AOFFSET = r3, 1, AORIG // AOFFSET = AORIG + 2 * r3
  10307. }
  10308. ;;
  10309. #endif
  10310. { .mfi
  10311. adds L = 1, L // L = n + 1
  10312. }
  10313. { .mfi
  10314. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  10315. cmp.eq p3, p0 = r0, r0 // p3 = true
  10316. }
  10317. ;;
  10318. { .mfi
  10319. tbit.z p12, p0 = L, 0 // p12 = (n is odd)
  10320. }
  10321. { .mfi
  10322. shr L = L, 1 // L = (n + 1) >> 1
  10323. }
  10324. ;;
  10325. { .mmf
  10326. adds L = -1, L // L = trip count - 1
  10327. }
  10328. ;;
  10329. { .mmf
  10330. cmp.eq p6, p0 = -1, L // p6 = (n was 0)
  10331. }
  10332. ;;
  10333. { .mib
  10334. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // preload first pair from A
  10335. mov ar.lc = L
  10336. (p6) br.cond.dpnt .L118 // nothing to accumulate: go straight to .L118
  10337. }
  10338. ;;
  10339. .L112:
  10340. { .mfi
  10341. lfetch.nt1 [PREA], 4 * SIZE
  10342. FMA f64 = f32, f48, f64 // A1 * B1
  10343. cmp.ne p4, p5 = 0, L // p4 = (another trip follows)
  10344. }
  10345. { .mfi
  10346. lfetch.nt1 [PREB], 4 * SIZE
  10347. FMA f72 = f32, f49, f72 // A1 * B2
  10348. (p12) cmp.ne p3, p0 = 0, L // odd n: p3 becomes false on the last trip
  10349. }
  10350. ;;
  10351. { .mmf
  10352. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // operands for the second step of this trip
  10353. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  10354. FMA f65 = f33, f48, f65 // A2 * B1
  10355. }
  10356. { .mmf
  10357. nop __LINE__
  10358. nop __LINE__
  10359. FMA f73 = f33, f49, f73 // A2 * B2
  10360. }
  10361. ;;
  10362. { .mfb
  10363. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // preload A for the next trip
  10364. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  10365. nop __LINE__
  10366. }
  10367. { .mfb
  10368. nop __LINE__
  10369. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  10370. nop __LINE__
  10371. }
  10372. ;;
  10373. { .mfi
  10374. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload B for the next trip
  10375. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  10376. adds L = -1, L
  10377. }
  10378. { .mfb
  10379. nop __LINE__
  10380. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  10381. br.cloop.sptk.few .L112 // loop while ar.lc != 0
  10382. }
  10383. ;;
  /* .L118: finish the two-row by two-column tile accumulated in f64/f65 (column 1) and f72/f73 (column 2). */
  /* Steps: position AOFFSET/BOFFSET, form (loaded value - accumulator), run the 2x2 triangular solve for the */
  /* selected variant (LN/LT use factors from AOFFSET, RN/RT use factors from BOFFSET), write the results back */
  /* to the packed buffer and to C1/C2, then advance pointers and KK. */
  /* Diagonal factors are applied with FMPY; presumably they are stored pre-inverted -- TODO confirm. */
  10384. .align 8
  10385. .L118:
  10386. #if defined(LN) || defined(RT)
  10387. #ifdef LN
  10388. adds r2 = -2, KK // r2 = KK - 2 (both preprocessor arms are identical here)
  10389. #else
  10390. adds r2 = -2, KK
  10391. #endif
  10392. ;;
  10393. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT
  10394. ;;
  10395. shladd AOFFSET = r2, 1, AORIG // AOFFSET = AORIG + 2 * r2
  10396. shladd BOFFSET = r2, 1, B // BOFFSET = B + 2 * r2
  10397. ;;
  10398. #endif
  10399. adds AOFFSET2 = 4 * SIZE, AOFFSET // AOFFSET2/BOFFSET2 are set here but not referenced again before .L100
  10400. adds BOFFSET2 = 4 * SIZE, BOFFSET
  10401. ;;
  10402. #if defined(LN) || defined(LT)
  10403. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  10404. ;;
  10405. LDFPD f34, f35 = [BOFFSET]
  10406. adds BOFFSET = -2 * SIZE, BOFFSET // restore BOFFSET
  10407. ;;
  10408. FSUB f64 = f32, f64 // buffer order at BOFFSET: f64, f72, f65, f73
  10409. FSUB f72 = f33, f72
  10410. FSUB f65 = f34, f65
  10411. FSUB f73 = f35, f73
  10412. ;;
  10413. #else
  10414. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10415. ;;
  10416. LDFPD f34, f35 = [AOFFSET]
  10417. adds AOFFSET = -2 * SIZE, AOFFSET // restore AOFFSET
  10418. ;;
  10419. FSUB f64 = f32, f64 // buffer order at AOFFSET: f64, f65, f72, f73
  10420. FSUB f65 = f33, f65
  10421. FSUB f72 = f34, f72
  10422. FSUB f73 = f35, f73
  10423. ;;
  10424. #endif
  10425. #ifdef LN
  10426. adds AOFFSET = 2 * SIZE, AOFFSET
  10427. ;;
  10428. LDFPD f33, f32 = [AOFFSET] // f33 = element 2, f32 = element 3 of the 2x2 block
  10429. adds AOFFSET = - 2 * SIZE, AOFFSET
  10430. ;;
  10431. LDFD f34 = [AOFFSET] // element 0
  10432. ;;
  10433. FMPY f65 = f65, f32 // second row solved first
  10434. FMPY f73 = f73, f32
  10435. ;;
  10436. FNMA f64 = f65, f33, f64 // eliminate it from the first row
  10437. FNMA f72 = f73, f33, f72
  10438. ;;
  10439. FMPY f64 = f64, f34
  10440. FMPY f72 = f72, f34
  10441. ;;
  10442. STFD [BOFFSET] = f64, SIZE // write back in buffer order f64, f72, f65, f73
  10443. ;;
  10444. STFD [BOFFSET] = f72, SIZE
  10445. ;;
  10446. STFD [BOFFSET] = f65, SIZE
  10447. ;;
  10448. STFD [BOFFSET] = f73, - 3 * SIZE // last store restores BOFFSET
  10449. ;;
  10450. adds C1 = -2 * SIZE, C1 // LN: step C1/C2 back two elements before the stores below
  10451. adds C2 = -2 * SIZE, C2
  10452. ;;
  10453. #endif
  10454. #ifdef LT
  10455. LDFPD f32, f33 = [AOFFSET] // elements 0 and 1 of the 2x2 block
  10456. adds AOFFSET = 3 * SIZE, AOFFSET
  10457. ;;
  10458. LDFD f34 = [AOFFSET], - 3 * SIZE // element 3; post-decrement restores AOFFSET
  10459. ;;
  10460. FMPY f64 = f64, f32 // first row solved first
  10461. FMPY f72 = f72, f32
  10462. ;;
  10463. FNMA f65 = f64, f33, f65 // eliminate it from the second row
  10464. FNMA f73 = f72, f33, f73
  10465. ;;
  10466. FMPY f65 = f65, f34
  10467. FMPY f73 = f73, f34
  10468. ;;
  10469. STFD [BOFFSET] = f64, SIZE // write back in buffer order f64, f72, f65, f73
  10470. ;;
  10471. STFD [BOFFSET] = f72, SIZE
  10472. ;;
  10473. STFD [BOFFSET] = f65, SIZE
  10474. ;;
  10475. STFD [BOFFSET] = f73, -3 * SIZE // last store restores BOFFSET
  10476. ;;
  10477. #endif
  10478. #ifdef RN
  10479. LDFPD f32, f33 = [BOFFSET] // elements 0 and 1 of the 2x2 block
  10480. adds BOFFSET = 3 * SIZE, BOFFSET
  10481. ;;
  10482. LDFD f34 = [BOFFSET], -3 * SIZE // element 3; post-decrement restores BOFFSET
  10483. ;;
  10484. FMPY f64 = f64, f32 // first column solved first
  10485. FMPY f65 = f65, f32
  10486. ;;
  10487. FNMA f72 = f64, f33, f72 // eliminate it from the second column
  10488. FNMA f73 = f65, f33, f73
  10489. ;;
  10490. FMPY f72 = f72, f34
  10491. FMPY f73 = f73, f34
  10492. ;;
  10493. STFD [AOFFSET] = f64, SIZE // write back in buffer order f64, f65, f72, f73
  10494. ;;
  10495. STFD [AOFFSET] = f65, SIZE
  10496. ;;
  10497. STFD [AOFFSET] = f72, SIZE
  10498. ;;
  10499. STFD [AOFFSET] = f73, -3 * SIZE // last store restores AOFFSET
  10500. ;;
  10501. #endif
  10502. #ifdef RT
  10503. adds BOFFSET = 2 * SIZE, BOFFSET
  10504. ;;
  10505. LDFPD f33, f32 = [BOFFSET] // f33 = element 2, f32 = element 3 of the 2x2 block
  10506. adds BOFFSET = - 2 * SIZE, BOFFSET
  10507. ;;
  10508. LDFD f34 = [BOFFSET] // element 0
  10509. ;;
  10510. FMPY f72 = f72, f32 // second column solved first
  10511. FMPY f73 = f73, f32
  10512. ;;
  10513. FNMA f64 = f72, f33, f64 // eliminate it from the first column
  10514. FNMA f65 = f73, f33, f65
  10515. ;;
  10516. FMPY f64 = f64, f34
  10517. FMPY f65 = f65, f34
  10518. ;;
  10519. STFD [AOFFSET] = f64, SIZE // write back in buffer order f64, f65, f72, f73
  10520. ;;
  10521. STFD [AOFFSET] = f65, SIZE
  10522. ;;
  10523. STFD [AOFFSET] = f72, SIZE
  10524. ;;
  10525. STFD [AOFFSET] = f73, -3 * SIZE // last store restores AOFFSET
  10526. ;;
  10527. #endif
  10528. STFD [C1 ] = f64, SIZE // store column 1 to C
  10529. mov f64 = f0 // clear each accumulator once its value has been stored
  10530. ;;
  10531. #ifndef LN
  10532. STFD [C1 ] = f65, SIZE // non-LN: C1 ends two elements further on
  10533. #else
  10534. STFD [C1 ] = f65, -SIZE // LN: C1 ends back at the start of this tile
  10535. #endif
  10536. ;;
  10537. STFD [C2 ] = f72, SIZE // store column 2 to C
  10538. mov f72 = f0
  10539. ;;
  10540. #ifndef LN
  10541. STFD [C2 ] = f73, SIZE
  10542. #else
  10543. STFD [C2 ] = f73, -SIZE
  10544. #endif
  10545. ;;
  10546. mov f65 = f0
  10547. mov f73 = f0
  10548. ;;
  10549. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  10550. ;;
  10551. sub L = K, KK
  10552. ;;
  10553. #ifdef RT
  10554. shladd AORIG = r2, 1, AORIG // RT: AORIG += 2 * r2
  10555. #else
  10556. nop __LINE__
  10557. #endif
  10558. ;;
  10559. { .mmi
  10560. #if defined(LT) || defined(RN)
  10561. shladd L = L, BASE_SHIFT, r0 // L = (K - KK) << BASE_SHIFT
  10562. #else
  10563. nop __LINE__
  10564. #endif
  10565. }
  10566. ;;
  10567. { .mmi
  10568. #if defined(LT) || defined(RN)
  10569. shladd AOFFSET = L, 1, AOFFSET // AOFFSET += 2 * L
  10570. #else
  10571. nop __LINE__
  10572. #endif
  10573. }
  10574. ;;
  10575. { .mmi
  10576. #if defined(LT) || defined(RN)
  10577. shladd BOFFSET = L, 1, BOFFSET // BOFFSET += 2 * L
  10578. #else
  10579. nop __LINE__
  10580. #endif
  10581. }
  10582. ;;
  10583. { .mmi
  10584. #ifdef LT
  10585. adds KK = 2, KK // LT: KK moves forward by the two rows just handled
  10586. #elif defined LN
  10587. adds KK = -2, KK // LN: KK moves backward by two
  10588. #else
  10589. nop __LINE__
  10590. #endif
  10591. }
  10592. ;;
  10593. { .mmi
  10594. #if defined(LT) || defined(RN)
  10595. mov L = KK
  10596. #else
  10597. sub L = K, KK
  10598. #endif
  10599. }
  10600. ;;
  /* .L100 / .L102: four-row (bit 2 of M set) by two-column accumulation. */
  /* Column 1 accumulates in f64..f67 and column 2 in f72..f75. Same loop scheme as the narrower tiles: */
  /* ceil(n / 2) trips for n = initial L, two steps per trip; p3 gates the second step, p4 gates the preloads for the next trip. */
  10601. .align 8
  10602. .L100:
  10603. tbit.z p6, p7 = M, 2 // p6 = (bit 2 of M is clear)
  10604. (p6) br.cond.dptk .L091 // no four-row tile: go to .L091
  10605. ;;
  10606. { .mib
  10607. #if defined(LT) || defined(RN)
  10608. mov L = KK // LT/RN: L = KK
  10609. #else
  10610. sub L = K, KK // LN/RT: L = K - KK
  10611. #endif
  10612. }
  10613. ;;
  10614. { .mmi
  10615. cmp.ne p7, p0 = r0, L // p7 = (L != 0): guards the initial loads
  10616. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  10617. shl r2 = K, 2 + BASE_SHIFT // r2 = K << (2 + BASE_SHIFT)
  10618. }
  10619. { .mmi
  10620. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  10621. nop __LINE__
  10622. nop __LINE__
  10623. }
  10624. ;;
  10625. #if defined(LT) || defined(RN)
  10626. { .mmf
  10627. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10628. mov f65 = f0
  10629. }
  10630. ;;
  10631. #else
  10632. { .mfi
  10633. shladd BOFFSET = r3, 1, B // BOFFSET = B + 2 * r3
  10634. #ifdef LN
  10635. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  10636. #else
  10637. nop __LINE__
  10638. #endif
  10639. }
  10640. ;;
  10641. { .mfi
  10642. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  10643. shladd AOFFSET = r3, 2, AORIG // AOFFSET = AORIG + 4 * r3
  10644. }
  10645. ;;
  10646. #endif
  10647. { .mfi
  10648. adds L = 1, L // L = n + 1
  10649. }
  10650. { .mfi
  10651. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  10652. cmp.eq p3, p0 = r0, r0 // p3 = true
  10653. }
  10654. ;;
  10655. { .mfi
  10656. tbit.z p12, p0 = L, 0 // p12 = (n is odd)
  10657. }
  10658. { .mfi
  10659. shr L = L, 1 // L = (n + 1) >> 1
  10660. }
  10661. ;;
  10662. { .mfi
  10663. adds L = -1, L // L = trip count - 1
  10664. }
  10665. ;;
  10666. { .mfi
  10667. cmp.eq p6, p0 = -1, L // p6 = (n was 0)
  10668. }
  10669. ;;
  10670. { .mmf
  10671. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // preload A rows 1-2
  10672. }
  10673. { .mfi
  10674. mov ar.lc = L
  10675. }
  10676. ;;
  10677. { .mmf
  10678. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE // preload A rows 3-4
  10679. }
  10680. { .mfb
  10681. (p6) br.cond.dpnt .L108 // nothing to accumulate: go straight to .L108
  10682. }
  10683. ;;
  10684. .L102:
  10685. { .mfi
  10686. lfetch.nt1 [PREA], 8 * SIZE
  10687. FMA f64 = f32, f48, f64 // A1 * B1
  10688. cmp.ne p4, p5 = 0, L // p4 = (another trip follows)
  10689. }
  10690. { .mfi
  10691. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET // PREB is re-derived from BOFFSET on every trip
  10692. FMA f72 = f32, f49, f72 // A1 * B2
  10693. (p12) cmp.ne p3, p0 = 0, L // odd n: p3 becomes false on the last trip
  10694. }
  10695. ;;
  10696. { .mfi
  10697. lfetch.nt1 [PREB], 4 * SIZE
  10698. FMA f65 = f33, f48, f65 // A2 * B1
  10699. adds C9 = 2 * SIZE, C1 // recomputed every trip; depends only on C1
  10700. }
  10701. { .mfi
  10702. nop __LINE__
  10703. FMA f73 = f33, f49, f73 // A2 * B2
  10704. adds C10 = 2 * SIZE, C2 // recomputed every trip; depends only on C2
  10705. }
  10706. ;;
  10707. { .mfb
  10708. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE // B operands for the second step of this trip
  10709. FMA f66 = f34, f48, f66 // A3 * B1
  10710. nop __LINE__
  10711. }
  10712. { .mfb
  10713. nop __LINE__
  10714. FMA f74 = f34, f49, f74 // A3 * B2
  10715. nop __LINE__
  10716. }
  10717. ;;
  10718. { .mfb
  10719. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE // A rows 1-2 for the second step
  10720. FMA f67 = f35, f48, f67 // A4 * B1
  10721. nop __LINE__
  10722. }
  10723. { .mfb
  10724. nop __LINE__
  10725. FMA f75 = f35, f49, f75 // A4 * B2
  10726. nop __LINE__
  10727. }
  10728. ;;
  10729. { .mfb
  10730. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE // A rows 3-4 for the second step
  10731. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  10732. nop __LINE__
  10733. }
  10734. { .mfb
  10735. nop __LINE__
  10736. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  10737. nop __LINE__
  10738. }
  10739. ;;
  10740. { .mfb
  10741. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE // preload A rows 1-2 for the next trip
  10742. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  10743. nop __LINE__
  10744. }
  10745. { .mfb
  10746. nop __LINE__
  10747. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  10748. nop __LINE__
  10749. }
  10750. ;;
  10751. { .mfb
  10752. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload B for the next trip
  10753. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  10754. nop __LINE__
  10755. }
  10756. { .mfb
  10757. nop __LINE__
  10758. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  10759. nop __LINE__
  10760. }
  10761. ;;
  10762. { .mfi
  10763. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE // preload A rows 3-4 for the next trip
  10764. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  10765. adds L = -1, L
  10766. }
  10767. { .mfb
  10768. nop __LINE__
  10769. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  10770. br.cloop.sptk.few .L102 // loop while ar.lc != 0
  10771. }
  10772. ;;
  /* .L108: solve and write-back stage for a block of 4 values in each of 2 columns. */
  /* On entry f64-f67 (first column) and f72-f75 (second column) hold the sums */
  /* accumulated by the FMA instructions of the preceding .L102 loop. */
  /* Exactly one of LN / LT / RN / RT is expected to be defined at build time; */
  /* each selects its own solve path below. */
  /* NOTE(review): FSUB, FMPY, FNMA, LDFPD, LDFD and STFD are macros defined outside */
  /* this chunk - presumably the precision-specific fsub/fmpy/fnma/ldfpd/ldfd/stfd. */
  10773. .align 8
  10774. .L108:
  10775. #if defined(LN) || defined(RT)
  10776. #ifdef LN
  10777. adds r2 = -4, KK // LN: r2 = KK - 4
  10778. #else
  10779. adds r2 = -2, KK // RT: r2 = KK - 2
  10780. #endif
  10781. ;;
  10782. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT (r0 reads as zero)
  10783. ;;
  10784. shladd AOFFSET = r2, 2, AORIG // AOFFSET = AORIG + 4 * r2
  10785. shladd BOFFSET = r2, 1, B // BOFFSET = B + 2 * r2
  10786. ;;
  10787. #endif
  10788. adds AOFFSET2 = 4 * SIZE, AOFFSET // second store pointer, 4 elements past AOFFSET
  10789. adds BOFFSET2 = 4 * SIZE, BOFFSET // second store pointer, 4 elements past BOFFSET
  10790. ;;
  10791. #if defined(LN) || defined(LT)
  /* LN/LT: read 8 values from BOFFSET and subtract the accumulators from them. */
  /* The values alternate between the two columns: f32->f64, f33->f72, f34->f65, ... */
  10792. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  10793. ;;
  10794. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  10795. ;;
  10796. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  10797. ;;
  10798. LDFPD f38, f39 = [BOFFSET]
  10799. adds BOFFSET = -6 * SIZE, BOFFSET // undo the three post-increments
  10800. ;;
  10801. FSUB f64 = f32, f64
  10802. FSUB f72 = f33, f72
  10803. ;;
  10804. FSUB f65 = f34, f65
  10805. FSUB f73 = f35, f73
  10806. ;;
  10807. FSUB f66 = f36, f66
  10808. FSUB f74 = f37, f74
  10809. ;;
  10810. FSUB f67 = f38, f67
  10811. FSUB f75 = f39, f75
  10812. ;;
  10813. #else
  /* RN/RT: read 8 values from AOFFSET instead; the first four pair with f64-f67, */
  /* the last four with f72-f75. */
  10814. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10815. ;;
  10816. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  10817. ;;
  10818. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  10819. ;;
  10820. LDFPD f38, f39 = [AOFFSET]
  10821. adds AOFFSET = -6 * SIZE, AOFFSET // undo the three post-increments
  10822. ;;
  10823. FSUB f64 = f32, f64
  10824. FSUB f65 = f33, f65
  10825. FSUB f66 = f34, f66
  10826. FSUB f67 = f35, f67
  10827. FSUB f72 = f36, f72
  10828. FSUB f73 = f37, f73
  10829. FSUB f74 = f38, f74
  10830. FSUB f75 = f39, f75
  10831. ;;
  10832. #endif
  10833. #ifdef LN
  /* LN: substitution from the last value (f67/f75) back to the first (f64/f72). */
  /* Ten coefficients are read walking backwards through a 16-element block at */
  /* AOFFSET, at element offsets 14-15, 12-13, 10, 8-9, 4-5 and 0. */
  /* NOTE(review): the pivots are applied with FMPY, so the diagonal entries are */
  /* presumably stored already inverted - confirm against the packing routine. */
  10834. adds AOFFSET = 14 * SIZE, AOFFSET
  10835. ;;
  10836. LDFPD f33, f32 = [AOFFSET]
  10837. adds AOFFSET = - 2 * SIZE, AOFFSET
  10838. ;;
  10839. LDFPD f35, f34 = [AOFFSET]
  10840. adds AOFFSET = - 2 * SIZE, AOFFSET
  10841. ;;
  10842. LDFD f36 = [AOFFSET], - 2 * SIZE
  10843. ;;
  10844. LDFPD f38, f37 = [AOFFSET]
  10845. adds AOFFSET = - 4 * SIZE, AOFFSET
  10846. ;;
  10847. LDFPD f40, f39 = [AOFFSET]
  10848. adds AOFFSET = - 4 * SIZE, AOFFSET
  10849. ;;
  10850. LDFD f41 = [AOFFSET]
  10851. ;;
  10852. FMPY f67 = f67, f32 // finish value 4 in both columns
  10853. FMPY f75 = f75, f32
  10854. ;;
  10855. FNMA f66 = f67, f33, f66 // remove its contribution from values 3, 2, 1
  10856. FNMA f74 = f75, f33, f74
  10857. ;;
  10858. FNMA f65 = f67, f34, f65
  10859. FNMA f73 = f75, f34, f73
  10860. ;;
  10861. FNMA f64 = f67, f35, f64
  10862. FNMA f72 = f75, f35, f72
  10863. ;;
  10864. FMPY f66 = f66, f36 // finish value 3
  10865. FMPY f74 = f74, f36
  10866. ;;
  10867. FNMA f65 = f66, f37, f65
  10868. FNMA f73 = f74, f37, f73
  10869. ;;
  10870. FNMA f64 = f66, f38, f64
  10871. FNMA f72 = f74, f38, f72
  10872. ;;
  10873. FMPY f65 = f65, f39 // finish value 2
  10874. FMPY f73 = f73, f39
  10875. ;;
  10876. FNMA f64 = f65, f40, f64
  10877. FNMA f72 = f73, f40, f72
  10878. ;;
  10879. FMPY f64 = f64, f41 // finish value 1
  10880. FMPY f72 = f72, f41
  10881. ;;
  /* Write the results back through BOFFSET / BOFFSET2, alternating the two columns; */
  /* the final -3 post-increments return both pointers to where they started. */
  10882. STFD [BOFFSET] = f64, SIZE
  10883. STFD [BOFFSET2] = f66, SIZE
  10884. ;;
  10885. STFD [BOFFSET] = f72, SIZE
  10886. STFD [BOFFSET2] = f74, SIZE
  10887. ;;
  10888. STFD [BOFFSET] = f65, SIZE
  10889. STFD [BOFFSET2] = f67, SIZE
  10890. ;;
  10891. STFD [BOFFSET] = f73, -3 * SIZE
  10892. STFD [BOFFSET2] = f75, -3 * SIZE
  10893. ;;
  10894. adds C1 = -4 * SIZE, C1 // LN: step both C pointers back 4 elements before storing
  10895. adds C2 = -4 * SIZE, C2
  10896. ;;
  10897. #endif
  10898. #ifdef LT
  /* LT: substitution from the first value (f64/f72) forward to the last (f67/f75). */
  /* Ten coefficients are read walking forwards through the 16-element block at */
  /* AOFFSET, at element offsets 0-3, 5, 6-7, 10-11 and 15; the last load */
  /* post-decrements AOFFSET by 15 so it ends where it started. */
  10899. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  10900. ;;
  10901. LDFPD f34, f35 = [AOFFSET]
  10902. adds AOFFSET = 3 * SIZE, AOFFSET
  10903. ;;
  10904. LDFD f36 = [AOFFSET], 1 * SIZE
  10905. ;;
  10906. LDFPD f37, f38 = [AOFFSET]
  10907. adds AOFFSET = 4 * SIZE, AOFFSET
  10908. ;;
  10909. LDFPD f39, f40 = [AOFFSET]
  10910. adds AOFFSET = 5 * SIZE, AOFFSET
  10911. ;;
  10912. LDFD f41 = [AOFFSET], -15 * SIZE
  10913. ;;
  10914. FMPY f64 = f64, f32 // finish value 1 in both columns
  10915. FMPY f72 = f72, f32
  10916. ;;
  10917. FNMA f65 = f64, f33, f65 // remove its contribution from values 2, 3, 4
  10918. FNMA f73 = f72, f33, f73
  10919. ;;
  10920. FNMA f66 = f64, f34, f66
  10921. FNMA f74 = f72, f34, f74
  10922. ;;
  10923. FNMA f67 = f64, f35, f67
  10924. FNMA f75 = f72, f35, f75
  10925. ;;
  10926. FMPY f65 = f65, f36 // finish value 2
  10927. FMPY f73 = f73, f36
  10928. ;;
  10929. FNMA f66 = f65, f37, f66
  10930. FNMA f74 = f73, f37, f74
  10931. ;;
  10932. FNMA f67 = f65, f38, f67
  10933. FNMA f75 = f73, f38, f75
  10934. ;;
  10935. FMPY f66 = f66, f39 // finish value 3
  10936. FMPY f74 = f74, f39
  10937. ;;
  10938. FNMA f67 = f66, f40, f67
  10939. FNMA f75 = f74, f40, f75
  10940. ;;
  10941. FMPY f67 = f67, f41 // finish value 4
  10942. FMPY f75 = f75, f41
  10943. ;;
  /* Same write-back layout as the LN path. */
  10944. STFD [BOFFSET] = f64, SIZE
  10945. STFD [BOFFSET2] = f66, SIZE
  10946. ;;
  10947. STFD [BOFFSET] = f72, SIZE
  10948. STFD [BOFFSET2] = f74, SIZE
  10949. ;;
  10950. STFD [BOFFSET] = f65, SIZE
  10951. STFD [BOFFSET2] = f67, SIZE
  10952. ;;
  10953. STFD [BOFFSET] = f73, -3 * SIZE
  10954. STFD [BOFFSET2] = f75, -3 * SIZE
  10955. ;;
  10956. #endif
  10957. #ifdef RN
  /* RN: three coefficients from BOFFSET (element offsets 0, 1 and 3). */
  /* The first column (f64-f67) is finished first, then eliminated from the second. */
  10958. LDFPD f32, f33 = [BOFFSET]
  10959. adds BOFFSET = 3 * SIZE, BOFFSET
  10960. ;;
  10961. LDFD f34 = [BOFFSET], -3 * SIZE // post-decrement restores BOFFSET
  10962. ;;
  10963. FMPY f64 = f64, f32
  10964. FMPY f65 = f65, f32
  10965. FMPY f66 = f66, f32
  10966. FMPY f67 = f67, f32
  10967. ;;
  10968. FNMA f72 = f64, f33, f72
  10969. FNMA f73 = f65, f33, f73
  10970. FNMA f74 = f66, f33, f74
  10971. FNMA f75 = f67, f33, f75
  10972. ;;
  10973. FMPY f72 = f72, f34
  10974. FMPY f73 = f73, f34
  10975. FMPY f74 = f74, f34
  10976. FMPY f75 = f75, f34
  10977. ;;
  /* Write back through AOFFSET (first column) and AOFFSET2 (second column); */
  /* the final -3 post-increments return both pointers to where they started. */
  10978. STFD [AOFFSET] = f64, SIZE
  10979. STFD [AOFFSET2] = f72, SIZE
  10980. ;;
  10981. STFD [AOFFSET] = f65, SIZE
  10982. STFD [AOFFSET2] = f73, SIZE
  10983. ;;
  10984. STFD [AOFFSET] = f66, SIZE
  10985. STFD [AOFFSET2] = f74, SIZE
  10986. ;;
  10987. STFD [AOFFSET] = f67, -3 * SIZE
  10988. STFD [AOFFSET2] = f75, -3 * SIZE
  10989. ;;
  10990. #endif
  10991. #ifdef RT
  /* RT: three coefficients from BOFFSET (element offsets 2-3, then 0). */
  /* The second column (f72-f75) is finished first, then eliminated from the first. */
  10992. adds BOFFSET = 2 * SIZE, BOFFSET
  10993. ;;
  10994. LDFPD f33, f32 = [BOFFSET]
  10995. adds BOFFSET = - 2 * SIZE, BOFFSET
  10996. ;;
  10997. LDFD f34 = [BOFFSET]
  10998. ;;
  10999. FMPY f72 = f72, f32
  11000. FMPY f73 = f73, f32
  11001. FMPY f74 = f74, f32
  11002. FMPY f75 = f75, f32
  11003. ;;
  11004. FNMA f64 = f72, f33, f64
  11005. FNMA f65 = f73, f33, f65
  11006. FNMA f66 = f74, f33, f66
  11007. FNMA f67 = f75, f33, f67
  11008. ;;
  11009. FMPY f64 = f64, f34
  11010. FMPY f65 = f65, f34
  11011. FMPY f66 = f66, f34
  11012. FMPY f67 = f67, f34
  11013. ;;
  /* Same write-back layout as the RN path. */
  11014. STFD [AOFFSET] = f64, SIZE
  11015. STFD [AOFFSET2] = f72, SIZE
  11016. ;;
  11017. STFD [AOFFSET] = f65, SIZE
  11018. STFD [AOFFSET2] = f73, SIZE
  11019. ;;
  11020. STFD [AOFFSET] = f66, SIZE
  11021. STFD [AOFFSET2] = f74, SIZE
  11022. ;;
  11023. STFD [AOFFSET] = f67, - 3 * SIZE
  11024. STFD [AOFFSET2] = f75, - 3 * SIZE
  11025. ;;
  11026. #endif
  /* Store the solved values to the output: f64-f67 through C1, f72-f75 through C2. */
  /* Under LN the last store of each column post-decrements by 3, leaving the pointer */
  /* at the start of the 4 values just written; otherwise it ends 4 elements further on. */
  11027. { .mmf
  11028. STFD [C1 ] = f64, SIZE
  11029. mov f64 = f0 // clear the accumulator right after it is stored
  11030. }
  11031. ;;
  11032. { .mmi
  11033. STFD [C1 ] = f65, SIZE
  11034. }
  11035. ;;
  11036. { .mmi
  11037. STFD [C1 ] = f66, SIZE
  11038. }
  11039. ;;
  11040. { .mmi
  11041. #ifndef LN
  11042. STFD [C1 ] = f67, SIZE
  11043. #else
  11044. STFD [C1 ] = f67, - 3 * SIZE
  11045. #endif
  11046. }
  11047. ;;
  11048. { .mmf
  11049. STFD [C2 ] = f72, SIZE
  11050. mov f72 = f0 // clear the accumulator right after it is stored
  11051. }
  11052. ;;
  11053. { .mmi
  11054. STFD [C2 ] = f73, SIZE
  11055. }
  11056. ;;
  11057. { .mmi
  11058. STFD [C2 ] = f74, SIZE
  11059. }
  11060. ;;
  11061. { .mmi
  11062. #ifndef LN
  11063. STFD [C2 ] = f75, SIZE
  11064. #else
  11065. STFD [C2 ] = f75, - 3 * SIZE
  11066. #endif
  11067. }
  11068. ;;
  /* Clear the remaining accumulators so the next block starts from zero. */
  11069. mov f65 = f0
  11070. mov f73 = f0
  11071. mov f66 = f0
  11072. mov f74 = f0
  11073. mov f67 = f0
  11074. mov f75 = f0
  11075. ;;
  /* Pointer and counter bookkeeping after this 4-value block. */
  11076. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  11077. ;;
  11078. { .mmi
  11079. sub L = K, KK // L = K - KK
  11080. }
  11081. ;;
  11082. { .mmi
  11083. #ifdef RT
  11084. shladd AORIG = r2, 2, AORIG // RT: AORIG += 4 * r2
  11085. #else
  11086. nop __LINE__
  11087. #endif
  11088. }
  11089. ;;
  11090. { .mmi
  11091. #if defined(LT) || defined(RN)
  11092. shladd L = L, BASE_SHIFT, r0 // LT/RN: L <<= BASE_SHIFT
  11093. #else
  11094. nop __LINE__
  11095. #endif
  11096. }
  11097. ;;
  11098. { .mmi
  11099. #if defined(LT) || defined(RN)
  11100. shladd AOFFSET = L, 2, AOFFSET // LT/RN: AOFFSET += 4 * L
  11101. #else
  11102. nop __LINE__
  11103. #endif
  11104. }
  11105. ;;
  11106. { .mmi
  11107. #if defined(LT) || defined(RN)
  11108. shladd BOFFSET = L, 1, BOFFSET // LT/RN: BOFFSET += 2 * L
  11109. #else
  11110. nop __LINE__
  11111. #endif
  11112. }
  11113. ;;
  11114. { .mmi
  11115. #ifdef LT
  11116. adds KK = 4, KK // LT: KK moves forward by the 4 values handled
  11117. #elif defined LN
  11118. adds KK = -4, KK // LN: KK moves backward by the 4 values handled
  11119. #else
  11120. nop __LINE__
  11121. #endif
  11122. }
  11123. ;;
  11124. { .mmi
  11125. #if defined(LT) || defined(RN)
  11126. mov L = KK // LT/RN: next loop length is KK
  11127. #else
  11128. sub L = K, KK // LN/RT: next loop length is K - KK
  11129. #endif
  11130. }
  11131. ;;
  /* .L091: decide whether any 8-wide blocks remain. I = M >> 3 is the block count; */
  /* when it is zero the 8-wide loop (.L092) is skipped by branching to .L129. */
  11132. .align 8
  11133. .L091:
  11134. shr I = M, 3 // I = M / 8
  11135. ;;
  11136. cmp.eq p6, p7 = 0, I // p6 = (I == 0)
  11137. (p6) br.cond.dpnt .L129
  11138. ;;
  /* .L092: head of the per-block loop for 8 values x 2 columns (repeated I times; */
  /* the backward branch is at the end of .L098). Sets up the A/B read pointers, */
  /* preloads the first operands, issues prefetches, derives the trip count for the */
  /* unrolled-by-2 loop .L093 and clears the upper accumulators f68-f71 / f76-f79. */
  /* f64-f67 / f72-f75 are cleared by the code that precedes each entry to this label. */
  11139. .align 16
  11140. .L092:
  11141. { .mmi
  11142. cmp.ne p7, p0 = r0, L // p7 = (L != 0): there is at least one product step
  11143. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  11144. shl r2 = K, 3 + BASE_SHIFT // r2 = K << (3 + BASE_SHIFT)
  11145. }
  11146. { .mmi
  11147. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  11148. nop __LINE__
  11149. nop __LINE__
  11150. }
  11151. ;;
  11152. #if defined(LT) || defined(RN)
  11153. { .mmi
  11154. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  11155. nop __LINE__
  11156. nop __LINE__
  11157. }
  11158. ;;
  11159. #else
  /* LN/RT: the product part starts KK steps into the panels. */
  11160. { .mfi
  11161. shladd BOFFSET = r3, 1, B // BOFFSET = B + 2 * r3
  11162. #ifdef LN
  11163. sub AORIG = AORIG, r2 // LN: move AORIG back by r2
  11164. #else
  11165. nop __LINE__
  11166. #endif
  11167. }
  11168. ;;
  11169. { .mfi
  11170. (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE // preload first pair from B
  11171. shladd AOFFSET = r3, 3, AORIG // AOFFSET = AORIG + 8 * r3
  11172. }
  11173. ;;
  11174. #endif
  /* Preload the first 8 values from A into f32-f39 (only when p7 is set). */
  11175. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  11176. ;;
  11177. { .mmf
  11178. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  11179. }
  11180. ;;
  11181. { .mmf
  11182. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  11183. }
  11184. { .mfi
  11185. cmp.eq p3, p0 = r0, r0 // p3 = true: second half of the unrolled loop enabled by default
  11186. }
  11187. ;;
  11188. { .mmf
  11189. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  11190. }
  11191. { .mfi
  11192. adds PREC = CPREFETCHSIZE * SIZE, C1 // prefetch address derived from C1
  11193. }
  11194. ;;
  11195. { .mmf
  11196. CPREFETCH [PREC], LDC // prefetch, then advance PREC by LDC
  11197. }
  11198. { .mfi
  11199. adds L = 1, L // L + 1, so the shift below rounds up
  11200. }
  11201. ;;
  11202. { .mmf
  11203. CPREFETCH [PREC]
  11204. }
  11205. { .mfi
  11206. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET // prefetch pointer into A
  11207. }
  11208. ;;
  11209. { .mfi
  11210. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET // prefetch pointer into B
  11211. }
  11212. ;;
  11213. { .mfi
  11214. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of L + 1 is clear), i.e. the step count is odd
  11215. }
  11216. { .mfi
  11217. shr L = L, 1 // number of 2-step iterations
  11218. }
  11219. ;;
  11220. { .mfi
  11221. adds L = -1, L // loop counter value is iterations - 1
  11222. }
  11223. ;;
  11224. { .mfi
  11225. mov ar.lc = L // loop-count register used by br.cloop in .L093
  11226. }
  11227. ;;
  /* Clear the accumulators for values 5-8 of both columns. */
  11228. mov f68 = f0
  11229. mov f69 = f0
  11230. mov f70 = f0
  11231. mov f71 = f0
  11232. mov f76 = f0
  11233. mov f77 = f0
  11234. mov f78 = f0
  11235. mov f79 = f0
  11236. ;;
  11237. { .mfb
  11238. cmp.eq p6, p0 = -1, L // L == -1 means zero iterations
  11239. (p6) br.cond.dpnt .L098 // nothing to accumulate: go straight to the solve stage
  11240. }
  11241. ;;
  /* .L093: multiply-accumulate loop for 8 values x 2 columns, unrolled over two steps. */
  /* First half (unpredicated): f32-f39 (from A) times f48/f49 (from B). */
  /* Second half (under p3): f40-f47 (from A) times f56/f57 (from B). */
  /* Sums go to f64-f71 (B1 column) and f72-f79 (B2 column). */
  /* Loads for the second half are issued under p3; loads for the next iteration are */
  /* issued under p4, which is false on the final pass (L == 0). */
  /* When p12 is set (odd step count, see .L092), p3 becomes false on the final pass */
  /* so the second half is skipped there. */
  11242. .align 8
  11243. .L093:
  11244. /* 1 */
  11245. { .mfi
  11246. lfetch.nt1 [PREA], 16 * SIZE
  11247. FMA f64 = f32, f48, f64 // A1 * B1
  11248. cmp.ne p4, p5 = 0, L // p4 = another iteration follows
  11249. }
  11250. { .mfi
  11251. nop __LINE__
  11252. FMA f72 = f32, f49, f72 // A1 * B2
  11253. (p12) cmp.ne p3, p0 = 0, L // odd count: disable second half on the last pass
  11254. }
  11255. ;;
  11256. { .mfi
  11257. lfetch.nt1 [PREB], 4 * SIZE
  11258. FMA f65 = f33, f48, f65 // A2 * B1
  11259. adds C9 = 4 * SIZE, C1 // NOTE(review): C9-C12 are recomputed on every pass; only C9/C10 are used by the visible stores in .L098
  11260. }
  11261. { .mfi
  11262. nop __LINE__
  11263. FMA f73 = f33, f49, f73 // A2 * B2
  11264. adds C10 = 4 * SIZE, C2
  11265. }
  11266. ;;
  11267. { .mfi
  11268. (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE
  11269. FMA f66 = f34, f48, f66 // A3 * B1
  11270. adds C11 = 4 * SIZE, C3
  11271. }
  11272. { .mfi
  11273. nop __LINE__
  11274. FMA f74 = f34, f49, f74 // A3 * B2
  11275. adds C12 = 4 * SIZE, C4
  11276. }
  11277. ;;
  11278. { .mfb
  11279. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  11280. FMA f67 = f35, f48, f67 // A4 * B1
  11281. nop __LINE__
  11282. }
  11283. { .mfb
  11284. nop __LINE__
  11285. FMA f75 = f35, f49, f75 // A4 * B2
  11286. nop __LINE__
  11287. }
  11288. ;;
  11289. { .mfb
  11290. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  11291. FMA f68 = f36, f48, f68 // A5 * B1
  11292. nop __LINE__
  11293. }
  11294. { .mfb
  11295. nop __LINE__
  11296. FMA f76 = f36, f49, f76 // A5 * B2
  11297. nop __LINE__
  11298. }
  11299. ;;
  11300. { .mfb
  11301. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  11302. FMA f69 = f37, f48, f69 // A6 * B1
  11303. nop __LINE__
  11304. }
  11305. { .mfb
  11306. nop __LINE__
  11307. FMA f77 = f37, f49, f77 // A6 * B2
  11308. nop __LINE__
  11309. }
  11310. ;;
  11311. { .mfb
  11312. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  11313. FMA f70 = f38, f48, f70 // A7 * B1
  11314. nop __LINE__
  11315. }
  11316. { .mfb
  11317. nop __LINE__
  11318. FMA f78 = f38, f49, f78 // A7 * B2
  11319. nop __LINE__
  11320. }
  11321. ;;
  11322. { .mfb
  11323. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  11324. FMA f71 = f39, f48, f71 // A8 * B1
  11325. nop __LINE__
  11326. }
  11327. { .mfb
  11328. nop __LINE__
  11329. FMA f79 = f39, f49, f79 // A8 * B2
  11330. nop __LINE__
  11331. }
  11332. ;;
  /* Second half of the unrolled pair, all under p3. */
  11333. { .mfb
  11334. (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE
  11335. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  11336. nop __LINE__
  11337. }
  11338. { .mfb
  11339. nop __LINE__
  11340. (p3) FMA f72 = f40, f57, f72 // A1 * B2
  11341. nop __LINE__
  11342. }
  11343. ;;
  11344. { .mfb
  11345. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  11346. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  11347. nop __LINE__
  11348. }
  11349. { .mfb
  11350. nop __LINE__
  11351. (p3) FMA f73 = f41, f57, f73 // A2 * B2
  11352. nop __LINE__
  11353. }
  11354. ;;
  11355. { .mfb
  11356. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  11357. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  11358. nop __LINE__
  11359. }
  11360. { .mfb
  11361. nop __LINE__
  11362. (p3) FMA f74 = f42, f57, f74 // A3 * B2
  11363. nop __LINE__
  11364. }
  11365. ;;
  11366. { .mfb
  11367. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  11368. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  11369. nop __LINE__
  11370. }
  11371. { .mfb
  11372. nop __LINE__
  11373. (p3) FMA f75 = f43, f57, f75 // A4 * B2
  11374. nop __LINE__
  11375. }
  11376. ;;
  11377. { .mfb
  11378. nop __LINE__
  11379. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  11380. nop __LINE__
  11381. }
  11382. { .mfb
  11383. nop __LINE__
  11384. (p3) FMA f76 = f44, f57, f76 // A5 * B2
  11385. nop __LINE__
  11386. }
  11387. ;;
  11388. { .mfb
  11389. nop __LINE__
  11390. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  11391. nop __LINE__
  11392. }
  11393. { .mfb
  11394. nop __LINE__
  11395. (p3) FMA f77 = f45, f57, f77 // A6 * B2
  11396. nop __LINE__
  11397. }
  11398. ;;
  11399. { .mfb
  11400. nop __LINE__
  11401. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  11402. nop __LINE__
  11403. }
  11404. { .mfb
  11405. nop __LINE__
  11406. (p3) FMA f78 = f46, f57, f78 // A7 * B2
  11407. nop __LINE__
  11408. }
  11409. ;;
  11410. { .mfi
  11411. nop __LINE__
  11412. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  11413. adds L = -1, L // software copy of the remaining count, tested by the cmp.ne above
  11414. }
  11415. { .mfb
  11416. nop __LINE__
  11417. (p3) FMA f79 = f47, f57, f79 // A8 * B2
  11418. br.cloop.sptk.few .L093 // counted loop on ar.lc
  11419. }
  11420. ;;
  /* .L098: solve and write-back stage for a block of 8 values in each of 2 columns. */
  /* On entry f64-f71 (first column) and f72-f79 (second column) hold the sums */
  /* accumulated in .L093 (or zero when the loop was skipped). */
  /* Same structure as the 4-value stage at .L108: reload the right-hand side, */
  /* subtract the sums, run the LN / LT / RN / RT substitution, store the results to */
  /* the packed buffer and to C, then update pointers and loop back to .L092. */
  /* NOTE(review): FSUB, FMPY, FNMA, LDFPD, LDFD and STFD are macros defined outside */
  /* this chunk - presumably the precision-specific fsub/fmpy/fnma/ldfpd/ldfd/stfd. */
  11421. .align 8
  11422. .L098:
  11423. #if defined(LN) || defined(RT)
  11424. #ifdef LN
  11425. adds r2 = -8, KK // LN: r2 = KK - 8
  11426. #else
  11427. adds r2 = -2, KK // RT: r2 = KK - 2
  11428. #endif
  11429. ;;
  11430. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT (r0 reads as zero)
  11431. ;;
  11432. shladd AOFFSET = r2, 3, AORIG // AOFFSET = AORIG + 8 * r2
  11433. shladd BOFFSET = r2, 1, B // BOFFSET = B + 2 * r2
  11434. ;;
  11435. #endif
  11436. adds AOFFSET2 = 4 * SIZE, AOFFSET // second store pointer, 4 elements past AOFFSET
  11437. adds BOFFSET2 = 4 * SIZE, BOFFSET // second store pointer, 4 elements past BOFFSET
  11438. ;;
  11439. #if defined(LN) || defined(LT)
  /* LN/LT: read 16 values from BOFFSET and subtract the sums from them. */
  /* The values alternate between the two columns: f32->f64, f33->f72, f34->f65, ... */
  11440. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  11441. ;;
  11442. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  11443. ;;
  11444. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  11445. ;;
  11446. LDFPD f38, f39 = [BOFFSET], 2 * SIZE
  11447. ;;
  11448. LDFPD f40, f41 = [BOFFSET], 2 * SIZE
  11449. ;;
  11450. LDFPD f42, f43 = [BOFFSET], 2 * SIZE
  11451. ;;
  11452. LDFPD f44, f45 = [BOFFSET], 2 * SIZE
  11453. ;;
  11454. LDFPD f46, f47 = [BOFFSET]
  11455. adds BOFFSET = -14 * SIZE, BOFFSET // undo the seven post-increments
  11456. ;;
  11457. FSUB f64 = f32, f64
  11458. FSUB f72 = f33, f72
  11459. FSUB f65 = f34, f65
  11460. FSUB f73 = f35, f73
  11461. FSUB f66 = f36, f66
  11462. FSUB f74 = f37, f74
  11463. FSUB f67 = f38, f67
  11464. FSUB f75 = f39, f75
  11465. FSUB f68 = f40, f68
  11466. FSUB f76 = f41, f76
  11467. FSUB f69 = f42, f69
  11468. FSUB f77 = f43, f77
  11469. FSUB f70 = f44, f70
  11470. FSUB f78 = f45, f78
  11471. FSUB f71 = f46, f71
  11472. FSUB f79 = f47, f79
  11473. ;;
  11474. #else
  /* RN/RT: read 16 values from AOFFSET instead; the first eight pair with f64-f71, */
  /* the last eight with f72-f79. */
  11475. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  11476. ;;
  11477. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  11478. ;;
  11479. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  11480. ;;
  11481. LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  11482. ;;
  11483. LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  11484. ;;
  11485. LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  11486. ;;
  11487. LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  11488. ;;
  11489. LDFPD f46, f47 = [AOFFSET]
  11490. adds AOFFSET = -14 * SIZE, AOFFSET // undo the seven post-increments
  11491. ;;
  11492. FSUB f64 = f32, f64
  11493. FSUB f65 = f33, f65
  11494. FSUB f66 = f34, f66
  11495. FSUB f67 = f35, f67
  11496. FSUB f68 = f36, f68
  11497. FSUB f69 = f37, f69
  11498. FSUB f70 = f38, f70
  11499. FSUB f71 = f39, f71
  11500. ;;
  11501. FSUB f72 = f40, f72
  11502. FSUB f73 = f41, f73
  11503. FSUB f74 = f42, f74
  11504. FSUB f75 = f43, f75
  11505. FSUB f76 = f44, f76
  11506. FSUB f77 = f45, f77
  11507. FSUB f78 = f46, f78
  11508. FSUB f79 = f47, f79
  11509. ;;
  11510. #endif
  11511. #ifdef LN
  /* LN: substitution from the last value (f71/f79) back to the first (f64/f72). */
  /* 36 coefficients are read walking backwards through a 64-element block at */
  /* AOFFSET: 8 ending at element 63, 7 ending at 54, 6 ending at 45, 5 ending at 36, */
  /* 4 ending at 27, 3 ending at 18, 2 ending at 9 and finally element 0. */
  /* They land in f32-f61 and then f16-f21. AOFFSET ends at the block start. */
  /* NOTE(review): the pivots are applied with FMPY, so the diagonal entries are */
  /* presumably stored already inverted - confirm against the packing routine. */
  11512. adds AOFFSET = 62 * SIZE, AOFFSET
  11513. ;;
  11514. LDFPD f33, f32 = [AOFFSET]
  11515. adds AOFFSET = - 2 * SIZE, AOFFSET
  11516. ;;
  11517. LDFPD f35, f34 = [AOFFSET]
  11518. adds AOFFSET = - 2 * SIZE, AOFFSET
  11519. ;;
  11520. LDFPD f37, f36 = [AOFFSET]
  11521. adds AOFFSET = - 2 * SIZE, AOFFSET
  11522. ;;
  11523. LDFPD f39, f38 = [AOFFSET]
  11524. adds AOFFSET = - 2 * SIZE, AOFFSET
  11525. ;;
  11526. LDFD f40 = [AOFFSET], -2 * SIZE
  11527. ;;
  11528. LDFPD f42, f41 = [AOFFSET]
  11529. adds AOFFSET = - 2 * SIZE, AOFFSET
  11530. ;;
  11531. LDFPD f44, f43 = [AOFFSET]
  11532. adds AOFFSET = - 2 * SIZE, AOFFSET
  11533. ;;
  11534. LDFPD f46, f45 = [AOFFSET]
  11535. adds AOFFSET = - 4 * SIZE, AOFFSET
  11536. ;;
  11537. LDFPD f48, f47 = [AOFFSET]
  11538. adds AOFFSET = - 2 * SIZE, AOFFSET
  11539. ;;
  11540. LDFPD f50, f49 = [AOFFSET]
  11541. adds AOFFSET = - 2 * SIZE, AOFFSET
  11542. ;;
  11543. LDFPD f52, f51 = [AOFFSET]
  11544. adds AOFFSET = - 4 * SIZE, AOFFSET
  11545. ;;
  11546. LDFD f53 = [AOFFSET], -2 * SIZE
  11547. ;;
  11548. LDFPD f55, f54 = [AOFFSET]
  11549. adds AOFFSET = - 2 * SIZE, AOFFSET
  11550. ;;
  11551. LDFPD f57, f56 = [AOFFSET]
  11552. adds AOFFSET = - 6 * SIZE, AOFFSET
  11553. ;;
  11554. LDFPD f59, f58 = [AOFFSET]
  11555. adds AOFFSET = - 2 * SIZE, AOFFSET
  11556. ;;
  11557. LDFPD f61, f60 = [AOFFSET]
  11558. adds AOFFSET = - 6 * SIZE, AOFFSET
  11559. ;;
  11560. LDFD f16 = [AOFFSET], -2 * SIZE
  11561. ;;
  11562. LDFPD f18, f17 = [AOFFSET]
  11563. adds AOFFSET = - 8 * SIZE, AOFFSET
  11564. ;;
  11565. LDFPD f20, f19 = [AOFFSET]
  11566. adds AOFFSET = - 8 * SIZE, AOFFSET
  11567. ;;
  11568. LDFD f21 = [AOFFSET]
  11569. ;;
  11570. FMPY f71 = f71, f32 // finish value 8 in both columns
  11571. FMPY f79 = f79, f32
  11572. ;;
  11573. FNMA f70 = f71, f33, f70 // remove its contribution from values 7..1
  11574. FNMA f78 = f79, f33, f78
  11575. ;;
  11576. FNMA f69 = f71, f34, f69
  11577. FNMA f77 = f79, f34, f77
  11578. ;;
  11579. FNMA f68 = f71, f35, f68
  11580. FNMA f76 = f79, f35, f76
  11581. ;;
  11582. FNMA f67 = f71, f36, f67
  11583. FNMA f75 = f79, f36, f75
  11584. ;;
  11585. FNMA f66 = f71, f37, f66
  11586. FNMA f74 = f79, f37, f74
  11587. ;;
  11588. FNMA f65 = f71, f38, f65
  11589. FNMA f73 = f79, f38, f73
  11590. ;;
  11591. FNMA f64 = f71, f39, f64
  11592. FNMA f72 = f79, f39, f72
  11593. ;;
  11594. FMPY f70 = f70, f40 // finish value 7
  11595. FMPY f78 = f78, f40
  11596. ;;
  11597. FNMA f69 = f70, f41, f69
  11598. FNMA f77 = f78, f41, f77
  11599. ;;
  11600. FNMA f68 = f70, f42, f68
  11601. FNMA f76 = f78, f42, f76
  11602. ;;
  11603. FNMA f67 = f70, f43, f67
  11604. FNMA f75 = f78, f43, f75
  11605. ;;
  11606. FNMA f66 = f70, f44, f66
  11607. FNMA f74 = f78, f44, f74
  11608. ;;
  11609. FNMA f65 = f70, f45, f65
  11610. FNMA f73 = f78, f45, f73
  11611. ;;
  11612. FNMA f64 = f70, f46, f64
  11613. FNMA f72 = f78, f46, f72
  11614. ;;
  11615. FMPY f69 = f69, f47 // finish value 6
  11616. FMPY f77 = f77, f47
  11617. ;;
  11618. FNMA f68 = f69, f48, f68
  11619. FNMA f76 = f77, f48, f76
  11620. ;;
  11621. FNMA f67 = f69, f49, f67
  11622. FNMA f75 = f77, f49, f75
  11623. ;;
  11624. FNMA f66 = f69, f50, f66
  11625. FNMA f74 = f77, f50, f74
  11626. ;;
  11627. FNMA f65 = f69, f51, f65
  11628. FNMA f73 = f77, f51, f73
  11629. ;;
  11630. FNMA f64 = f69, f52, f64
  11631. FNMA f72 = f77, f52, f72
  11632. ;;
  11633. FMPY f68 = f68, f53 // finish value 5
  11634. FMPY f76 = f76, f53
  11635. ;;
  11636. FNMA f67 = f68, f54, f67
  11637. FNMA f75 = f76, f54, f75
  11638. ;;
  11639. FNMA f66 = f68, f55, f66
  11640. FNMA f74 = f76, f55, f74
  11641. ;;
  11642. FNMA f65 = f68, f56, f65
  11643. FNMA f73 = f76, f56, f73
  11644. ;;
  11645. FNMA f64 = f68, f57, f64
  11646. FNMA f72 = f76, f57, f72
  11647. ;;
  11648. FMPY f67 = f67, f58 // finish value 4
  11649. FMPY f75 = f75, f58
  11650. ;;
  11651. FNMA f66 = f67, f59, f66
  11652. FNMA f74 = f75, f59, f74
  11653. ;;
  11654. FNMA f65 = f67, f60, f65
  11655. FNMA f73 = f75, f60, f73
  11656. ;;
  11657. FNMA f64 = f67, f61, f64
  11658. FNMA f72 = f75, f61, f72
  11659. ;;
  11660. FMPY f66 = f66, f16 // finish value 3
  11661. FMPY f74 = f74, f16
  11662. ;;
  11663. FNMA f65 = f66, f17, f65
  11664. FNMA f73 = f74, f17, f73
  11665. ;;
  11666. FNMA f64 = f66, f18, f64
  11667. FNMA f72 = f74, f18, f72
  11668. ;;
  11669. FMPY f65 = f65, f19 // finish value 2
  11670. FMPY f73 = f73, f19
  11671. ;;
  11672. FNMA f64 = f65, f20, f64
  11673. FNMA f72 = f73, f20, f72
  11674. ;;
  11675. FMPY f64 = f64, f21 // finish value 1
  11676. FMPY f72 = f72, f21
  11677. ;;
  /* Write back through BOFFSET / BOFFSET2. The upper half (values 5-8) is stored */
  /* first at element offsets 8-15, then the -11 post-increments step back to store */
  /* the lower half (values 1-4) at offsets 0-7. Both pointers end where they started. */
  11678. adds BOFFSET = 8 * SIZE, BOFFSET
  11679. adds BOFFSET2 = 8 * SIZE, BOFFSET2
  11680. ;;
  11681. STFD [BOFFSET] = f68, SIZE
  11682. STFD [BOFFSET2] = f70, SIZE
  11683. ;;
  11684. STFD [BOFFSET] = f76, SIZE
  11685. STFD [BOFFSET2] = f78, SIZE
  11686. ;;
  11687. STFD [BOFFSET] = f69, SIZE
  11688. STFD [BOFFSET2] = f71, SIZE
  11689. ;;
  11690. STFD [BOFFSET] = f77, - 11 * SIZE
  11691. STFD [BOFFSET2] = f79, - 11 * SIZE
  11692. ;;
  11693. STFD [BOFFSET] = f64, SIZE
  11694. STFD [BOFFSET2] = f66, SIZE
  11695. ;;
  11696. STFD [BOFFSET] = f72, SIZE
  11697. STFD [BOFFSET2] = f74, SIZE
  11698. ;;
  11699. STFD [BOFFSET] = f65, SIZE
  11700. STFD [BOFFSET2] = f67, SIZE
  11701. ;;
  11702. STFD [BOFFSET] = f73, - 3 * SIZE
  11703. STFD [BOFFSET2] = f75, - 3 * SIZE
  11704. ;;
  11705. adds C1 = -8 * SIZE, C1 // LN: step both C pointers back 8 elements before storing
  11706. adds C2 = -8 * SIZE, C2
  11707. ;;
  11708. #endif
  11709. #ifdef LT
  /* LT: substitution from the first value (f64/f72) forward to the last (f71/f79). */
  /* 36 coefficients are read walking forwards through the 64-element block at */
  /* AOFFSET: 8 starting at element 0, 7 starting at 9, 6 starting at 18, 5 starting */
  /* at 27, 4 starting at 36, 3 starting at 45, 2 starting at 54 and finally element 63. */
  /* They land in f32-f61 and then f16-f21; AOFFSET is rewound by 63 at the end. */
  11710. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  11711. ;;
  11712. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  11713. ;;
  11714. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  11715. ;;
  11716. LDFPD f38, f39 = [AOFFSET]
  11717. adds AOFFSET = 3 * SIZE, AOFFSET
  11718. ;;
  11719. LDFD f40 = [AOFFSET], 1 * SIZE
  11720. ;;
  11721. LDFPD f41, f42 = [AOFFSET], 2 * SIZE
  11722. ;;
  11723. LDFPD f43, f44 = [AOFFSET], 2 * SIZE
  11724. ;;
  11725. LDFPD f45, f46 = [AOFFSET]
  11726. adds AOFFSET = 4 * SIZE, AOFFSET
  11727. ;;
  11728. LDFPD f47, f48 = [AOFFSET], 2 * SIZE
  11729. ;;
  11730. LDFPD f49, f50 = [AOFFSET], 2 * SIZE
  11731. ;;
  11732. LDFPD f51, f52 = [AOFFSET]
  11733. adds AOFFSET = 5 * SIZE, AOFFSET
  11734. ;;
  11735. LDFD f53 = [AOFFSET], 1 * SIZE
  11736. ;;
  11737. LDFPD f54, f55 = [AOFFSET], 2 * SIZE
  11738. ;;
  11739. LDFPD f56, f57 = [AOFFSET]
  11740. adds AOFFSET = 6 * SIZE, AOFFSET
  11741. ;;
  11742. LDFPD f58, f59 = [AOFFSET], 2 * SIZE
  11743. ;;
  11744. LDFPD f60, f61 = [AOFFSET]
  11745. adds AOFFSET = 7 * SIZE, AOFFSET
  11746. ;;
  11747. LDFD f16 = [AOFFSET], 1 * SIZE
  11748. ;;
  11749. LDFPD f17, f18 = [AOFFSET]
  11750. adds AOFFSET = 8 * SIZE, AOFFSET
  11751. ;;
  11752. LDFPD f19, f20 = [AOFFSET]
  11753. adds AOFFSET = 9 * SIZE, AOFFSET
  11754. ;;
  11755. LDFD f21 = [AOFFSET]
  11756. adds AOFFSET = -63 * SIZE, AOFFSET // back to the start of the 64-element block
  11757. ;;
  11758. FMPY f64 = f64, f32 // finish value 1 in both columns
  11759. FMPY f72 = f72, f32
  11760. ;;
  11761. FNMA f65 = f64, f33, f65 // remove its contribution from values 2..8
  11762. FNMA f73 = f72, f33, f73
  11763. ;;
  11764. FNMA f66 = f64, f34, f66
  11765. FNMA f74 = f72, f34, f74
  11766. ;;
  11767. FNMA f67 = f64, f35, f67
  11768. FNMA f75 = f72, f35, f75
  11769. ;;
  11770. FNMA f68 = f64, f36, f68
  11771. FNMA f76 = f72, f36, f76
  11772. ;;
  11773. FNMA f69 = f64, f37, f69
  11774. FNMA f77 = f72, f37, f77
  11775. ;;
  11776. FNMA f70 = f64, f38, f70
  11777. FNMA f78 = f72, f38, f78
  11778. ;;
  11779. FNMA f71 = f64, f39, f71
  11780. FNMA f79 = f72, f39, f79
  11781. ;;
  11782. FMPY f65 = f65, f40 // finish value 2
  11783. FMPY f73 = f73, f40
  11784. ;;
  11785. FNMA f66 = f65, f41, f66
  11786. FNMA f74 = f73, f41, f74
  11787. ;;
  11788. FNMA f67 = f65, f42, f67
  11789. FNMA f75 = f73, f42, f75
  11790. ;;
  11791. FNMA f68 = f65, f43, f68
  11792. FNMA f76 = f73, f43, f76
  11793. ;;
  11794. FNMA f69 = f65, f44, f69
  11795. FNMA f77 = f73, f44, f77
  11796. ;;
  11797. FNMA f70 = f65, f45, f70
  11798. FNMA f78 = f73, f45, f78
  11799. ;;
  11800. FNMA f71 = f65, f46, f71
  11801. FNMA f79 = f73, f46, f79
  11802. ;;
  11803. FMPY f66 = f66, f47 // finish value 3
  11804. FMPY f74 = f74, f47
  11805. ;;
  11806. FNMA f67 = f66, f48, f67
  11807. FNMA f75 = f74, f48, f75
  11808. ;;
  11809. FNMA f68 = f66, f49, f68
  11810. FNMA f76 = f74, f49, f76
  11811. ;;
  11812. FNMA f69 = f66, f50, f69
  11813. FNMA f77 = f74, f50, f77
  11814. ;;
  11815. FNMA f70 = f66, f51, f70
  11816. FNMA f78 = f74, f51, f78
  11817. ;;
  11818. FNMA f71 = f66, f52, f71
  11819. FNMA f79 = f74, f52, f79
  11820. ;;
  11821. FMPY f67 = f67, f53 // finish value 4
  11822. FMPY f75 = f75, f53
  11823. ;;
  11824. FNMA f68 = f67, f54, f68
  11825. FNMA f76 = f75, f54, f76
  11826. ;;
  11827. FNMA f69 = f67, f55, f69
  11828. FNMA f77 = f75, f55, f77
  11829. ;;
  11830. FNMA f70 = f67, f56, f70
  11831. FNMA f78 = f75, f56, f78
  11832. ;;
  11833. FNMA f71 = f67, f57, f71
  11834. FNMA f79 = f75, f57, f79
  11835. ;;
  11836. FMPY f68 = f68, f58 // finish value 5
  11837. FMPY f76 = f76, f58
  11838. ;;
  11839. FNMA f69 = f68, f59, f69
  11840. FNMA f77 = f76, f59, f77
  11841. ;;
  11842. FNMA f70 = f68, f60, f70
  11843. FNMA f78 = f76, f60, f78
  11844. ;;
  11845. FNMA f71 = f68, f61, f71
  11846. FNMA f79 = f76, f61, f79
  11847. ;;
  11848. FMPY f69 = f69, f16 // finish value 6
  11849. FMPY f77 = f77, f16
  11850. ;;
  11851. FNMA f70 = f69, f17, f70
  11852. FNMA f78 = f77, f17, f78
  11853. ;;
  11854. FNMA f71 = f69, f18, f71
  11855. FNMA f79 = f77, f18, f79
  11856. ;;
  11857. FMPY f70 = f70, f19 // finish value 7
  11858. FMPY f78 = f78, f19
  11859. ;;
  11860. FNMA f71 = f70, f20, f71
  11861. FNMA f79 = f78, f20, f79
  11862. ;;
  11863. FMPY f71 = f71, f21 // finish value 8
  11864. FMPY f79 = f79, f21
  11865. ;;
  /* Write back through BOFFSET / BOFFSET2: lower half (values 1-4) at element */
  /* offsets 0-7, then the +5 post-increments skip ahead to store the upper half */
  /* at offsets 8-15; the final -11 returns both pointers to where they started. */
  11866. STFD [BOFFSET] = f64, SIZE
  11867. STFD [BOFFSET2] = f66, SIZE
  11868. ;;
  11869. STFD [BOFFSET] = f72, SIZE
  11870. STFD [BOFFSET2] = f74, SIZE
  11871. ;;
  11872. STFD [BOFFSET] = f65, SIZE
  11873. STFD [BOFFSET2] = f67, SIZE
  11874. ;;
  11875. STFD [BOFFSET] = f73, 5 * SIZE
  11876. STFD [BOFFSET2] = f75, 5 * SIZE
  11877. ;;
  11878. STFD [BOFFSET] = f68, SIZE
  11879. STFD [BOFFSET2] = f70, SIZE
  11880. ;;
  11881. STFD [BOFFSET] = f76, SIZE
  11882. STFD [BOFFSET2] = f78, SIZE
  11883. ;;
  11884. STFD [BOFFSET] = f69, SIZE
  11885. STFD [BOFFSET2] = f71, SIZE
  11886. ;;
  11887. STFD [BOFFSET] = f77, -11 * SIZE
  11888. STFD [BOFFSET2] = f79, -11 * SIZE
  11889. ;;
  11890. adds C9 = 4 * SIZE, C1 // NOTE(review): C9 is set again, identically, just before the C stores below
  11891. ;;
  11892. #endif
  11893. #ifdef RN
  /* RN: three coefficients from BOFFSET (element offsets 0, 1 and 3). */
  /* The first column (f64-f71) is finished first, then eliminated from the second. */
  11894. LDFPD f32, f33 = [BOFFSET]
  11895. adds BOFFSET = 3 * SIZE, BOFFSET
  11896. ;;
  11897. LDFD f34 = [BOFFSET], -3 * SIZE // post-decrement restores BOFFSET
  11898. ;;
  11899. FMPY f64 = f64, f32
  11900. FMPY f68 = f68, f32
  11901. FMPY f65 = f65, f32
  11902. FMPY f69 = f69, f32
  11903. FMPY f66 = f66, f32
  11904. FMPY f70 = f70, f32
  11905. FMPY f67 = f67, f32
  11906. FMPY f71 = f71, f32
  11907. ;;
  11908. FNMA f72 = f64, f33, f72
  11909. FNMA f76 = f68, f33, f76
  11910. FNMA f73 = f65, f33, f73
  11911. FNMA f77 = f69, f33, f77
  11912. FNMA f74 = f66, f33, f74
  11913. FNMA f78 = f70, f33, f78
  11914. FNMA f75 = f67, f33, f75
  11915. FNMA f79 = f71, f33, f79
  11916. ;;
  11917. FMPY f72 = f72, f34
  11918. FMPY f76 = f76, f34
  11919. FMPY f73 = f73, f34
  11920. FMPY f77 = f77, f34
  11921. FMPY f74 = f74, f34
  11922. FMPY f78 = f78, f34
  11923. FMPY f75 = f75, f34
  11924. FMPY f79 = f79, f34
  11925. ;;
  /* Write back through AOFFSET / AOFFSET2: first column at element offsets 0-7, */
  /* second column at offsets 8-15; the final -11 returns both pointers to their start. */
  11926. STFD [AOFFSET] = f64, SIZE
  11927. STFD [AOFFSET2] = f68, SIZE
  11928. ;;
  11929. STFD [AOFFSET] = f65, SIZE
  11930. STFD [AOFFSET2] = f69, SIZE
  11931. ;;
  11932. STFD [AOFFSET] = f66, SIZE
  11933. STFD [AOFFSET2] = f70, SIZE
  11934. ;;
  11935. STFD [AOFFSET] = f67, 5 * SIZE
  11936. STFD [AOFFSET2] = f71, 5 * SIZE
  11937. ;;
  11938. STFD [AOFFSET] = f72, SIZE
  11939. STFD [AOFFSET2] = f76, SIZE
  11940. ;;
  11941. STFD [AOFFSET] = f73, SIZE
  11942. STFD [AOFFSET2] = f77, SIZE
  11943. ;;
  11944. STFD [AOFFSET] = f74, SIZE
  11945. STFD [AOFFSET2] = f78, SIZE
  11946. ;;
  11947. STFD [AOFFSET] = f75, -11 * SIZE
  11948. STFD [AOFFSET2] = f79, -11 * SIZE
  11949. ;;
  11950. #endif
  11951. #ifdef RT
  /* RT: three coefficients from BOFFSET (element offsets 2-3, then 0). */
  /* The second column (f72-f79) is finished first, then eliminated from the first. */
  11952. adds BOFFSET = 2 * SIZE, BOFFSET
  11953. ;;
  11954. LDFPD f33, f32 = [BOFFSET]
  11955. adds BOFFSET = - 2 * SIZE, BOFFSET
  11956. ;;
  11957. LDFD f34 = [BOFFSET]
  11958. ;;
  11959. FMPY f72 = f72, f32
  11960. FMPY f76 = f76, f32
  11961. FMPY f73 = f73, f32
  11962. FMPY f77 = f77, f32
  11963. FMPY f74 = f74, f32
  11964. FMPY f78 = f78, f32
  11965. FMPY f75 = f75, f32
  11966. FMPY f79 = f79, f32
  11967. ;;
  11968. FNMA f64 = f72, f33, f64
  11969. FNMA f68 = f76, f33, f68
  11970. FNMA f65 = f73, f33, f65
  11971. FNMA f69 = f77, f33, f69
  11972. FNMA f66 = f74, f33, f66
  11973. FNMA f70 = f78, f33, f70
  11974. FNMA f67 = f75, f33, f67
  11975. FNMA f71 = f79, f33, f71
  11976. ;;
  11977. FMPY f64 = f64, f34
  11978. FMPY f68 = f68, f34
  11979. FMPY f65 = f65, f34
  11980. FMPY f69 = f69, f34
  11981. FMPY f66 = f66, f34
  11982. FMPY f70 = f70, f34
  11983. FMPY f67 = f67, f34
  11984. FMPY f71 = f71, f34
  11985. ;;
  /* Write back through AOFFSET / AOFFSET2: second column first at element offsets */
  /* 8-15, then step back 11 to store the first column at offsets 0-7. */
  11986. adds AOFFSET = 8 * SIZE, AOFFSET
  11987. adds AOFFSET2 = 8 * SIZE, AOFFSET2
  11988. ;;
  11989. STFD [AOFFSET] = f72, SIZE
  11990. STFD [AOFFSET2] = f76, SIZE
  11991. ;;
  11992. STFD [AOFFSET] = f73, SIZE
  11993. STFD [AOFFSET2] = f77, SIZE
  11994. ;;
  11995. STFD [AOFFSET] = f74, SIZE
  11996. STFD [AOFFSET2] = f78, SIZE
  11997. ;;
  11998. STFD [AOFFSET] = f75, - 11 * SIZE
  11999. STFD [AOFFSET2] = f79, - 11 * SIZE
  12000. ;;
  12001. STFD [AOFFSET] = f64, SIZE
  12002. STFD [AOFFSET2] = f68, SIZE
  12003. ;;
  12004. STFD [AOFFSET] = f65, SIZE
  12005. STFD [AOFFSET2] = f69, SIZE
  12006. ;;
  12007. STFD [AOFFSET] = f66, SIZE
  12008. STFD [AOFFSET2] = f70, SIZE
  12009. ;;
  12010. STFD [AOFFSET] = f67, - 3 * SIZE
  12011. STFD [AOFFSET2] = f71, - 3 * SIZE
  12012. ;;
  12013. #endif
  /* Store the solved values to the output. C1/C9 take the first column */
  /* (f64-f67 / f68-f71), C2/C10 the second (f72-f75 / f76-f79); C9 and C10 point */
  /* 4 elements past C1 and C2. Under LN the last C1/C2 store post-decrements by 3, */
  /* leaving the pointer at the start of the 8 values; otherwise it ends 8 further on. */
  12014. adds C9 = 4 * SIZE, C1
  12015. ;;
  12016. { .mmf
  12017. STFD [C1 ] = f64, SIZE
  12018. STFD [C9 ] = f68, SIZE
  12019. mov f64 = f0 // clear the accumulator right after it is stored
  12020. }
  12021. ;;
  12022. { .mmi
  12023. STFD [C1 ] = f65, SIZE
  12024. STFD [C9 ] = f69, SIZE
  12025. adds C10 = 4 * SIZE, C2 // C2 is still unmodified at this point
  12026. }
  12027. ;;
  12028. { .mmi
  12029. STFD [C1 ] = f66, SIZE
  12030. STFD [C9 ] = f70, SIZE
  12031. }
  12032. ;;
  12033. { .mmi
  12034. #ifndef LN
  12035. STFD [C1 ] = f67, 5 * SIZE
  12036. #else
  12037. STFD [C1 ] = f67, - 3 * SIZE
  12038. #endif
  12039. STFD [C9 ] = f71
  12040. adds C11 = 4 * SIZE, C3 // NOTE(review): C11 is not used by any store visible in this section
  12041. }
  12042. ;;
  12043. { .mmf
  12044. STFD [C2 ] = f72, SIZE
  12045. STFD [C10] = f76, SIZE
  12046. mov f72 = f0 // clear the accumulator right after it is stored
  12047. }
  12048. ;;
  12049. { .mmi
  12050. STFD [C2 ] = f73, SIZE
  12051. STFD [C10] = f77, SIZE
  12052. }
  12053. ;;
  12054. { .mmi
  12055. STFD [C2 ] = f74, SIZE
  12056. STFD [C10] = f78, SIZE
  12057. adds C12 = 4 * SIZE, C4 // NOTE(review): C12 is not used by any store visible in this section
  12058. }
  12059. ;;
  12060. { .mmi
  12061. #ifndef LN
  12062. STFD [C2 ] = f75, 5 * SIZE
  12063. #else
  12064. STFD [C2 ] = f75, - 3 * SIZE
  12065. #endif
  12066. STFD [C10] = f79
  12067. }
  12068. ;;
  /* Loop control and pointer bookkeeping for the next 8-value block. */
  12069. { .mmf
  12070. cmp.ne p6, p0 = 1, I // p6 = more blocks remain (tested before I is decremented)
  12071. }
  12072. ;;
  12073. adds I = -1, I
  12074. ;;
  12075. { .mmi
  12076. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  12077. }
  12078. ;;
  12079. { .mmi
  12080. sub L = K, KK // L = K - KK
  12081. }
  12082. ;;
  12083. { .mmi
  12084. #ifdef RT
  12085. shladd AORIG = r2, 3, AORIG // RT: AORIG += 8 * r2
  12086. #else
  12087. nop __LINE__
  12088. #endif
  12089. }
  12090. ;;
  12091. { .mmi
  12092. #if defined(LT) || defined(RN)
  12093. shladd L = L, BASE_SHIFT, r0 // LT/RN: L <<= BASE_SHIFT
  12094. #else
  12095. nop __LINE__
  12096. #endif
  12097. }
  12098. ;;
  12099. ;;
  12100. { .mmi
  12101. #if defined(LT) || defined(RN)
  12102. shladd AOFFSET = L, 3, AOFFSET // LT/RN: AOFFSET += 8 * L
  12103. #else
  12104. nop __LINE__
  12105. #endif
  12106. }
  12107. ;;
  12108. { .mmi
  12109. #if defined(LT) || defined(RN)
  12110. shladd BOFFSET = L, 1, BOFFSET // LT/RN: BOFFSET += 2 * L
  12111. #else
  12112. nop __LINE__
  12113. #endif
  12114. }
  12115. ;;
  12116. { .mmi
  12117. #ifdef LT
  12118. adds KK = 8, KK // LT: KK moves forward by the 8 values handled
  12119. #elif defined LN
  12120. adds KK = -8, KK // LN: KK moves backward by the 8 values handled
  12121. #else
  12122. nop __LINE__
  12123. #endif
  12124. }
  12125. ;;
  12126. { .mmi
  12127. #if defined(LT) || defined(RN)
  12128. mov L = KK // LT/RN: next loop length is KK
  12129. #else
  12130. sub L = K, KK // LN/RT: next loop length is K - KK
  12131. #endif
  12132. }
  12133. ;;
  /* Clear the lower accumulators for the next block (.L092 clears f68-f71 / f76-f79). */
  12134. mov f64 = f0
  12135. mov f65 = f0
  12136. mov f66 = f0
  12137. mov f67 = f0
  12138. mov f72 = f0
  12139. mov f73 = f0
  12140. mov f74 = f0
  12141. mov f75 = f0
  12142. (p6) br.cond.dptk .L092 // next 8-value block
  12143. ;;
  // .L129: fall-through / join point reached once the preceding loop
  // (branching back to .L092) is done.  Updates B and KK according to the
  // LN / LT / RN / RT build variant and resets AOFFSET to A.
  12144. .align 8
  12145. .L129:
  12146. #ifdef LN
  12147. shladd KK8 = K, BASE_SHIFT, r0 // KK8 = K << BASE_SHIFT
  12148. ;;
  12149. shladd B = KK8, 1, B // B += 2 * KK8
  12150. #endif
  12151. #if defined(LT) || defined(RN)
  12152. mov B = BOFFSET
  12153. #endif
  12154. #ifdef RN
  12155. adds KK = 2, KK
  12156. #endif
  12157. #ifdef RT
  12158. adds KK = -2, KK
  12159. #endif
  12160. ;;
  12161. mov AOFFSET = A
  12162. ;;
  // .L130: entry of the section guarded by bit 0 of N.  If that bit is clear
  // the code jumps to .L999; otherwise it clears accumulators f64-f71 and
  // initialises C1, KK, AORIG/AOFFSET, C and L for the variant being built.
  12163. .align 16
  12164. .L130:
  12165. tbit.z p6, p0 = N, 0 // p6 = (bit 0 of N is clear)
  12166. (p6) br.cond.dpnt .L999
  12167. ;;
  12168. #ifdef RT
  12169. { .mmi
  12170. nop __LINE__
  12171. shl r2 = K, BASE_SHIFT // r2 = K << BASE_SHIFT
  12172. }
  12173. ;;
  12174. { .mmi
  12175. sub B = B, r2 // RT: step B back by r2
  12176. sub C = C, LDC // RT: step C back by LDC
  12177. nop __LINE__
  12178. }
  12179. #endif
  12180. ;;
  12181. mov f64 = f0 // f0 reads as +0.0: clear accumulators f64..f71
  12182. mov f65 = f0
  12183. mov f66 = f0
  12184. mov f67 = f0
  12185. mov f68 = f0
  12186. mov f69 = f0
  12187. mov f70 = f0
  12188. mov f71 = f0
  12189. ;;
  12190. { .mfi
  12191. mov C1 = C // coffset1 = c + 0 * ldc
  12192. #ifdef LN
  12193. add KK = M, OFFSET // LN: KK = M + OFFSET
  12194. #elif defined LT
  12195. mov KK = OFFSET // LT: KK = OFFSET
  12196. #else
  12197. nop __LINE__
  12198. #endif
  12199. }
  12200. ;;
  12201. { .mmf
  12202. #if defined(LN) || defined(RT)
  12203. mov AORIG = A
  12204. #else
  12205. mov AOFFSET = A
  12206. #endif
  12207. }
  12208. ;;
  12209. { .mfi
  12210. #ifndef RT
  12211. add C = C, LDC // coffset += ldc (the instruction adds LDC once)
  12212. #else
  12213. nop __LINE__
  12214. #endif
  12215. #if defined(LT) || defined(RN)
  12216. mov L = KK
  12217. #else
  12218. sub L = K, KK // L = K - KK
  12219. #endif
  12220. }
  12221. ;;
  // .L160: set-up for the section taken when bit 0 of M is set (otherwise
  // control goes to .L150).  Derives the loop trip count from L, positions
  // BOFFSET/AOFFSET, preloads the first B value (f48) and A value (f32), and
  // either enters the .L162 loop or skips straight to .L168.
  12222. .L160:
  12223. { .mib
  12224. #if defined(LT) || defined(RN)
  12225. mov L = KK
  12226. #else
  12227. sub L = K, KK
  12228. #endif
  12229. tbit.z p6, p7 = M, 0 // p6 = (bit 0 of M is clear)
  12230. (p6) br.cond.dptk .L150
  12231. }
  12232. ;;
  12233. { .mmi
  12234. cmp.ne p7, p0 = r0, L // p7 = (L != 0); predicates the preloads below
  12235. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  12236. shl r2 = K, 0 + BASE_SHIFT // r2 = K << BASE_SHIFT
  12237. }
  12238. ;;
  12239. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  12240. ;;
  12241. #if defined(LT) || defined(RN)
  12242. { .mmi
  12243. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12244. nop __LINE__
  12245. adds L = 1, L
  12246. }
  12247. ;;
  12248. #else
  12249. { .mmi
  12250. shladd BOFFSET = KK, BASE_SHIFT, B // BOFFSET = B + (KK << BASE_SHIFT)
  12251. nop __LINE__
  12252. #ifdef LN
  12253. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  12254. #else
  12255. nop __LINE__
  12256. #endif
  12257. }
  12258. ;;
  12259. { .mmi
  12260. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12261. adds L = 1, L
  12262. add AOFFSET = r3, AORIG // AOFFSET = AORIG + r3
  12263. }
  12264. ;;
  12265. #endif
  12266. ;;
  12267. { .mii
  12268. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of the incremented L is clear)
  12269. shr L = L, 1 // L = L >> 1: the loop body handles two steps per pass
  12270. }
  12271. ;;
  12272. { .mmi
  12273. cmp.eq p6, p0 = 0, L // p6 = (L == 0): skip the loop entirely
  12274. adds L = -1, L
  12275. cmp.eq p3, p0 = r0, r0 // p3 = true (r0 == r0)
  12276. }
  12277. ;;
  12278. { .mib
  12279. (p7) LDFD f32 = [AOFFSET], 1 * SIZE
  12280. mov ar.lc = L // loop counter for br.cloop in .L162
  12281. (p6) br.cond.dpnt .L168
  12282. }
  12283. ;;
  // .L162: counted loop (ar.lc / br.cloop) accumulating into f64.  Each pass
  // performs up to two FMA steps: one with the preloaded pair f32/f48 and,
  // under p3, one with f40/f56.  L is decremented once per pass, in step with
  // ar.lc, so the L-based predicates change on the final pass:
  //   p4 = (L != 0) gates the preload for the next pass,
  //   p3 is recomputed as (L != 0) only when p12 is set.
  // FMA / LDFD are macros defined outside this chunk.
  12284. .align 8
  12285. .L162:
  12286. { .mmf
  12287. cmp.ne p4, p5 = 0, L
  12288. (p12) cmp.ne p3, p0 = 0, L
  12289. FMA f64 = f32, f48, f64 // A1 * B1
  12290. }
  12291. ;;
  12292. { .mmi
  12293. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  12294. (p3) LDFD f40 = [AOFFSET], 1 * SIZE
  12295. nop __LINE__
  12296. }
  12297. ;;
  12298. { .mmi
  12299. (p4) LDFD f32 = [AOFFSET], 1 * SIZE
  12300. nop __LINE__
  12301. adds L = -1, L
  12302. }
  12303. { .mfb
  12304. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  12305. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  12306. br.cloop.sptk.few .L162
  12307. }
  12308. ;;
  // .L168: post-loop step for the single-element case.  Loads one stored
  // value and one factor, forms FSUB(f32, f64) followed by FMPY with f33,
  // stores the result to [BOFFSET] (LN/LT) or [AOFFSET] (RN/RT) and to [C1],
  // clears f64 and updates KK / L / pointers for the variant being built.
  // FSUB / FMPY / LDFD / STFD are macros defined outside this chunk.
  12309. .align 8
  12310. .L168:
  12311. #if defined(LN) || defined(RT)
  // Both preprocessor arms below emit the same instruction.
  12312. #ifdef LN
  12313. adds r2 = -1, KK
  12314. #else
  12315. adds r2 = -1, KK
  12316. #endif
  12317. ;;
  12318. shladd r2 = r2, BASE_SHIFT, r0 // r2 = (KK - 1) << BASE_SHIFT
  12319. ;;
  12320. add AOFFSET = r2, AORIG // AOFFSET = AORIG + r2
  12321. add BOFFSET = r2, B // BOFFSET = B + r2
  12322. ;;
  12323. #endif
  12324. #if defined(LN) || defined(LT)
  12325. { .mmi
  12326. LDFD f32 = [BOFFSET]
  12327. LDFD f33 = [AOFFSET]
  12328. #ifdef LN
  12329. adds C1 = -1 * SIZE, C1 // LN: step C1 back one element before the store
  12330. #else
  12331. nop __LINE__
  12332. #endif
  12333. }
  12334. ;;
  12335. #else
  12336. { .mmi
  12337. LDFD f32 = [AOFFSET]
  12338. LDFD f33 = [BOFFSET]
  12339. nop __LINE__
  12340. }
  12341. ;;
  12342. #endif
  12343. { .mmf
  12344. sub L = K, KK
  12345. #ifdef RT
  12346. shladd AORIG = K, BASE_SHIFT, AORIG // RT: AORIG += K << BASE_SHIFT
  12347. #else
  12348. nop __LINE__
  12349. #endif
  12350. FSUB f64 = f32, f64
  12351. }
  12352. ;;
  12353. #ifdef LT
  12354. adds KK = 1, KK
  12355. #elif defined LN
  12356. adds KK = -1, KK
  12357. #else
  12358. nop __LINE__
  12359. #endif
  12360. ;;
  12361. #if defined(LT) || defined(RN)
  12362. mov L = KK
  12363. #else
  12364. sub L = K, KK
  12365. #endif
  12366. ;;
  12367. FMPY f64 = f64, f33
  12368. ;;
  12369. #if defined(LN) || defined(LT)
  12370. { .mmf
  12371. STFD [BOFFSET] = f64
  12372. #ifndef LN
  12373. STFD [C1 ] = f64, SIZE
  12374. #else
  12375. STFD [C1 ] = f64
  12376. #endif
  12377. mov f64 = f0 // clear the accumulator for the next section
  12378. }
  12379. ;;
  12380. #else
  12381. { .mmf
  12382. STFD [AOFFSET] = f64
  12383. STFD [C1 ] = f64, SIZE
  12384. mov f64 = f0 // clear the accumulator for the next section
  12385. }
  12386. ;;
  12387. #endif
  // NOTE(review): under LT/RN, L was last written by "mov L = KK" above, so
  // the two advances below scale KK, whereas the corresponding tails of
  // .L158 and .L148 scale K - KK.  Confirm whether this is intended (it only
  // matters if this file is ever built with LT or RN defined).
  12388. #if defined(LT) || defined(RN)
  12389. shladd AOFFSET = L, BASE_SHIFT, AOFFSET
  12390. #else
  12391. nop __LINE__
  12392. #endif
  12393. #if defined(LT) || defined(RN)
  12394. shladd BOFFSET = L, BASE_SHIFT, BOFFSET
  12395. #else
  12396. nop __LINE__
  12397. #endif
  12398. ;;
  // .L150: set-up for the section taken when bit 1 of M is set (otherwise
  // control goes to .L140).  Same shape as the .L160 set-up, but AORIG /
  // AOFFSET strides are doubled and two A values (f32, f33) are preloaded.
  12399. .align 8
  12400. .L150:
  12401. tbit.z p6, p7 = M, 1 // p6 = (bit 1 of M is clear)
  12402. (p6) br.cond.dptk .L140
  12403. ;;
  12404. { .mib
  12405. #if defined(LT) || defined(RN)
  12406. mov L = KK
  12407. #else
  12408. sub L = K, KK
  12409. #endif
  12410. }
  12411. ;;
  12412. { .mmi
  12413. cmp.ne p7, p0 = r0, L // p7 = (L != 0); predicates the preloads below
  12414. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  12415. shl r2 = K, 1 + BASE_SHIFT // r2 = K << (1 + BASE_SHIFT)
  12416. }
  12417. ;;
  12418. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  12419. ;;
  12420. #if defined(LT) || defined(RN)
  12421. { .mmf
  12422. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12423. }
  12424. ;;
  12425. #else
  12426. { .mfi
  12427. shladd BOFFSET = KK, BASE_SHIFT, B // BOFFSET = B + (KK << BASE_SHIFT)
  12428. #ifdef LN
  12429. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  12430. #else
  12431. nop __LINE__
  12432. #endif
  12433. }
  12434. ;;
  12435. { .mfi
  12436. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12437. shladd AOFFSET = r3, 1, AORIG // AOFFSET = AORIG + 2 * r3
  12438. }
  12439. ;;
  12440. #endif
  12441. { .mfi
  12442. adds L = 1, L
  12443. }
  12444. { .mfi
  // PREA is computed here; no use of it is visible in the .L152 loop.
  12445. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET
  12446. cmp.eq p3, p0 = r0, r0 // p3 = true
  12447. }
  12448. ;;
  12449. { .mfi
  12450. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of the incremented L is clear)
  12451. }
  12452. { .mfi
  12453. shr L = L, 1
  12454. }
  12455. ;;
  12456. { .mmf
  12457. adds L = -1, L
  12458. }
  12459. ;;
  12460. { .mmf
  12461. cmp.eq p6, p0 = -1, L // p6 = (L == -1): no loop passes needed
  12462. }
  12463. ;;
  12464. (p7) LDFD f32 = [AOFFSET], SIZE
  12465. ;;
  12466. (p7) LDFD f33 = [AOFFSET], SIZE
  12467. ;;
  12468. ;;
  12469. { .mib
  12470. mov ar.lc = L // loop counter for br.cloop in .L152
  12471. (p6) br.cond.dpnt .L158
  12472. }
  12473. ;;
  // .L152: counted loop (ar.lc / br.cloop) accumulating into f64 and f65.
  // Each pass does one step with f32/f33 * f48 and, under p3, a second step
  // with f40/f41 * f56.  p4 = (L != 0) gates the preloads for the next pass;
  // p3 is recomputed as (L != 0) only when p12 is set.  L is decremented once
  // per pass.  FMA / LDFD / LDFPD are macros defined outside this chunk.
  12474. .L152:
  12475. { .mfi
  12476. cmp.ne p4, p5 = 0, L
  12477. FMA f64 = f32, f48, f64 // A1 * B1
  12478. (p12) cmp.ne p3, p0 = 0, L
  12479. }
  12480. ;;
  12481. { .mmf
  12482. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  12483. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  12484. FMA f65 = f33, f48, f65 // A2 * B1
  12485. }
  12486. ;;
  12487. { .mfi
  12488. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  12489. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  12490. adds L = -1, L
  12491. }
  12492. ;;
  12493. { .mfb
  12494. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  12495. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  12496. br.cloop.sptk.few .L152
  12497. }
  12498. ;;
  // .L158: post-loop step for the two-element case.  Subtracts the
  // accumulated values f64/f65 from two stored values, applies the
  // variant-specific update (LN / LT / RN / RT), stores the results to
  // [BOFFSET] or [AOFFSET] and to [C1], then clears f64/f65 and updates
  // AORIG / AOFFSET / BOFFSET / KK / L.
  // FSUB / FMPY / FNMA / LDFD / LDFPD / STFD are macros defined outside this
  // chunk.
  12499. .L158:
  12500. #if defined(LN) || defined(RT)
  12501. #ifdef LN
  12502. adds r2 = -2, KK
  12503. #else
  12504. adds r2 = -1, KK
  12505. #endif
  12506. ;;
  12507. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT
  12508. ;;
  12509. shladd AOFFSET = r2, 1, AORIG // AOFFSET = AORIG + 2 * r2
  12510. add BOFFSET = r2, B // BOFFSET = B + r2
  12511. ;;
  12512. #endif
  // AOFFSET2 / BOFFSET2 are set here; no use of them is visible in this block.
  12513. adds AOFFSET2 = 4 * SIZE, AOFFSET
  12514. adds BOFFSET2 = 4 * SIZE, BOFFSET
  12515. ;;
  12516. #if defined(LN) || defined(LT)
  12517. LDFPD f32, f33 = [BOFFSET]
  12518. ;;
  12519. FSUB f64 = f32, f64
  12520. FSUB f65 = f33, f65
  12521. ;;
  12522. #else
  12523. LDFPD f32, f33 = [AOFFSET]
  12524. ;;
  12525. FSUB f64 = f32, f64
  12526. FSUB f65 = f33, f65
  12527. ;;
  12528. #endif
  12529. #ifdef LN
  // LN: f65 is finalised first, then f64 is corrected with it.  The +2 / -2
  // adjustments leave AOFFSET where it started.
  12530. adds AOFFSET = 2 * SIZE, AOFFSET
  12531. ;;
  12532. LDFPD f33, f32 = [AOFFSET]
  12533. adds AOFFSET = - 2 * SIZE, AOFFSET
  12534. ;;
  12535. LDFD f34 = [AOFFSET]
  12536. ;;
  12537. FMPY f65 = f65, f32
  12538. ;;
  12539. FNMA f64 = f65, f33, f64
  12540. ;;
  12541. FMPY f64 = f64, f34
  12542. ;;
  12543. STFD [BOFFSET] = f64, SIZE
  12544. ;;
  12545. STFD [BOFFSET] = f65, - SIZE
  12546. ;;
  12547. adds C1 = -2 * SIZE, C1 // LN: step C1 back two elements before the stores
  12548. ;;
  12549. #endif
  12550. #ifdef LT
  // LT: f64 is finalised first, then f65 is corrected with it.  The +3 add
  // and the -3 post-increment leave AOFFSET where it started.
  12551. LDFPD f32, f33 = [AOFFSET]
  12552. adds AOFFSET = 3 * SIZE, AOFFSET
  12553. ;;
  12554. LDFD f34 = [AOFFSET], - 3 * SIZE
  12555. ;;
  12556. FMPY f64 = f64, f32
  12557. ;;
  12558. FNMA f65 = f64, f33, f65
  12559. ;;
  12560. FMPY f65 = f65, f34
  12561. ;;
  12562. STFD [BOFFSET] = f64, SIZE
  12563. ;;
  12564. STFD [BOFFSET] = f65, -SIZE
  12565. ;;
  12566. #endif
  12567. #ifdef RN
  // RN: both values are scaled by the single factor loaded from [BOFFSET].
  12568. LDFD f32 = [BOFFSET]
  12569. ;;
  12570. FMPY f64 = f64, f32
  12571. FMPY f65 = f65, f32
  12572. ;;
  12573. STFD [AOFFSET] = f64, SIZE
  12574. ;;
  12575. STFD [AOFFSET] = f65, - SIZE
  12576. ;;
  12577. #endif
  12578. #ifdef RT
  // RT: same instruction sequence as the RN arm above.
  12579. LDFD f32 = [BOFFSET]
  12580. ;;
  12581. FMPY f64 = f64, f32
  12582. FMPY f65 = f65, f32
  12583. ;;
  12584. STFD [AOFFSET] = f64, SIZE
  12585. ;;
  12586. STFD [AOFFSET] = f65, - SIZE
  12587. ;;
  12588. #endif
  12589. STFD [C1 ] = f64, SIZE
  12590. ;;
  12591. #ifndef LN
  12592. STFD [C1 ] = f65, SIZE
  12593. #else
  12594. STFD [C1 ] = f65, -SIZE
  12595. #endif
  12596. ;;
  12597. mov f64 = f0 // clear accumulators for the next section
  12598. mov f65 = f0
  12599. ;;
  12600. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  12601. ;;
  12602. sub L = K, KK
  12603. ;;
  12604. #ifdef RT
  12605. shladd AORIG = r2, 1, AORIG // RT: AORIG += 2 * r2
  12606. #else
  12607. nop __LINE__
  12608. #endif
  12609. ;;
  12610. { .mmi
  12611. #if defined(LT) || defined(RN)
  12612. shladd L = L, BASE_SHIFT, r0 // L = (K - KK) << BASE_SHIFT
  12613. #else
  12614. nop __LINE__
  12615. #endif
  12616. }
  12617. ;;
  12618. { .mmi
  12619. #if defined(LT) || defined(RN)
  12620. shladd AOFFSET = L, 1, AOFFSET // AOFFSET += 2 * L
  12621. #else
  12622. nop __LINE__
  12623. #endif
  12624. }
  12625. ;;
  12626. { .mmi
  12627. #if defined(LT) || defined(RN)
  12628. add BOFFSET = L, BOFFSET // BOFFSET += L
  12629. #else
  12630. nop __LINE__
  12631. #endif
  12632. }
  12633. ;;
  12634. { .mmi
  12635. #ifdef LT
  12636. adds KK = 2, KK
  12637. #elif defined LN
  12638. adds KK = -2, KK
  12639. #else
  12640. nop __LINE__
  12641. #endif
  12642. }
  12643. ;;
  12644. { .mmi
  12645. #if defined(LT) || defined(RN)
  12646. mov L = KK
  12647. #else
  12648. sub L = K, KK
  12649. #endif
  12650. }
  12651. ;;
  // .L140: set-up for the section taken when bit 2 of M is set (otherwise
  // control goes to .L131).  Same shape as the .L150 set-up with strides
  // scaled by four; preloads f48 from B and f32..f35 from A, then enters the
  // .L142 loop or skips to .L148.
  12652. .align 8
  12653. .L140:
  12654. tbit.z p6, p7 = M, 2 // p6 = (bit 2 of M is clear)
  12655. (p6) br.cond.dptk .L131
  12656. ;;
  12657. { .mib
  12658. #if defined(LT) || defined(RN)
  12659. mov L = KK
  12660. #else
  12661. sub L = K, KK
  12662. #endif
  12663. }
  12664. ;;
  12665. { .mmi
  12666. cmp.ne p7, p0 = r0, L // p7 = (L != 0); predicates the preloads below
  12667. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  12668. shl r2 = K, 2 + BASE_SHIFT // r2 = K << (2 + BASE_SHIFT)
  12669. }
  12670. ;;
  12671. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  12672. ;;
  12673. #if defined(LT) || defined(RN)
  12674. { .mmf
  12675. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12676. mov f65 = f0
  12677. }
  12678. ;;
  12679. #else
  12680. { .mfi
  12681. shladd BOFFSET = KK, BASE_SHIFT, B // BOFFSET = B + (KK << BASE_SHIFT)
  12682. #ifdef LN
  12683. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  12684. #else
  12685. nop __LINE__
  12686. #endif
  12687. }
  12688. ;;
  12689. { .mfi
  12690. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  12691. shladd AOFFSET = r3, 2, AORIG // AOFFSET = AORIG + 4 * r3
  12692. }
  12693. ;;
  12694. #endif
  12695. { .mfi
  12696. adds L = 1, L
  12697. }
  12698. { .mfi
  12699. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET // prefetch pointer used by lfetch in .L142
  12700. cmp.eq p3, p0 = r0, r0 // p3 = true
  12701. }
  12702. ;;
  12703. { .mfi
  12704. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of the incremented L is clear)
  12705. }
  12706. { .mfi
  12707. shr L = L, 1
  12708. }
  12709. ;;
  12710. { .mfi
  12711. adds L = -1, L
  12712. }
  12713. ;;
  12714. { .mfi
  12715. cmp.eq p6, p0 = -1, L // p6 = (L == -1): no loop passes needed
  12716. }
  12717. ;;
  12718. { .mmf
  12719. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  12720. }
  12721. { .mfi
  12722. mov ar.lc = L // loop counter for br.cloop in .L142
  12723. }
  12724. ;;
  12725. { .mmf
  12726. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  12727. }
  12728. { .mfb
  12729. (p6) br.cond.dpnt .L148
  12730. }
  12731. ;;
  // .L142: counted loop (ar.lc / br.cloop) accumulating into f64..f67.
  // Each pass does one step with f32..f35 * f48 and, under p3, a second step
  // with f40..f43 * f56; p4 = (L != 0) gates the preloads for the next pass
  // and p3 is recomputed as (L != 0) only when p12 is set.  PREA is advanced
  // by the lfetch each pass.  C9 / C10 are written under p5 (L == 0); no use
  // of them is visible in the .L148 code that follows.
  // FMA / LDFD / LDFPD are macros defined outside this chunk.
  12732. .L142:
  12733. { .mfi
  12734. lfetch.nt1 [PREA], 8 * SIZE
  12735. FMA f64 = f32, f48, f64 // A1 * B1
  12736. cmp.ne p4, p5 = 0, L
  12737. }
  12738. { .mfi
  12739. nop __LINE__
  12740. FMA f65 = f33, f48, f65 // A2 * B1
  12741. (p12) cmp.ne p3, p0 = 0, L
  12742. }
  12743. ;;
  12744. { .mfi
  12745. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  12746. FMA f66 = f34, f48, f66 // A3 * B1
  12747. (p5) adds C9 = 2 * SIZE, C1
  12748. }
  12749. { .mmf
  12750. nop __LINE__
  12751. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  12752. FMA f67 = f35, f48, f67 // A4 * B1
  12753. }
  12754. ;;
  12755. { .mfi
  12756. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  12757. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  12758. (p5) adds C10 = 2 * SIZE, C2
  12759. }
  12760. { .mfb
  12761. nop __LINE__
  12762. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  12763. nop __LINE__
  12764. }
  12765. ;;
  12766. { .mfb
  12767. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  12768. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  12769. nop __LINE__
  12770. }
  12771. { .mmf
  12772. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  12773. nop __LINE__
  12774. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  12775. }
  12776. ;;
  12777. { .mfi
  12778. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  12779. nop __LINE__
  12780. adds L = -1, L
  12781. }
  12782. { .mfb
  12783. nop __LINE__
  12784. nop.f 0
  12785. br.cloop.sptk.few .L142
  12786. }
  12787. ;;
  // .L148: post-loop step for the four-element case.  Subtracts the
  // accumulated values f64..f67 from four stored values, applies the
  // variant-specific update (LN / LT: a chain of FMPY / FNMA steps over ten
  // loaded factors; RN / RT: scaling by one factor), stores the results to
  // [BOFFSET] or [AOFFSET] and to [C1], then clears accumulators and updates
  // AORIG / AOFFSET / BOFFSET / KK / L.
  // FSUB / FMPY / FNMA / LDFD / LDFPD / STFD are macros defined outside this
  // chunk.
  12788. .L148:
  12789. #if defined(LN) || defined(RT)
  12790. #ifdef LN
  12791. adds r2 = -4, KK
  12792. #else
  12793. adds r2 = -1, KK
  12794. #endif
  12795. ;;
  12796. shladd r2 = r2, BASE_SHIFT, r0 // r2 <<= BASE_SHIFT
  12797. ;;
  12798. shladd AOFFSET = r2, 2, AORIG // AOFFSET = AORIG + 4 * r2
  12799. add BOFFSET = r2, B // BOFFSET = B + r2
  12800. ;;
  12801. #endif
  // AOFFSET2 / BOFFSET2 are set here; no use of them is visible in this block.
  12802. adds AOFFSET2 = 4 * SIZE, AOFFSET
  12803. adds BOFFSET2 = 4 * SIZE, BOFFSET
  12804. ;;
  12805. #if defined(LN) || defined(LT)
  12806. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  12807. ;;
  12808. LDFPD f34, f35 = [BOFFSET]
  12809. adds BOFFSET = -2 * SIZE, BOFFSET // undo the post-increment above
  12810. ;;
  12811. FSUB f64 = f32, f64
  12812. FSUB f65 = f33, f65
  12813. FSUB f66 = f34, f66
  12814. FSUB f67 = f35, f67
  12815. ;;
  12816. #else
  12817. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  12818. ;;
  12819. LDFPD f34, f35 = [AOFFSET]
  12820. adds AOFFSET = -2 * SIZE, AOFFSET // undo the post-increment above
  12821. ;;
  12822. FSUB f64 = f32, f64
  12823. FSUB f65 = f33, f65
  12824. FSUB f66 = f34, f66
  12825. FSUB f67 = f35, f67
  12826. ;;
  12827. #endif
  12828. #ifdef LN
  // LN: factors are fetched walking AOFFSET downwards from +14 elements; the
  // adjustments (+14 -2 -2 -2 -4 -4) leave AOFFSET where it started.  Values
  // are then finalised in the order f67, f66, f65, f64.
  12829. adds AOFFSET = 14 * SIZE, AOFFSET
  12830. ;;
  12831. LDFPD f33, f32 = [AOFFSET]
  12832. adds AOFFSET = - 2 * SIZE, AOFFSET
  12833. ;;
  12834. LDFPD f35, f34 = [AOFFSET]
  12835. adds AOFFSET = - 2 * SIZE, AOFFSET
  12836. ;;
  12837. LDFD f36 = [AOFFSET], - 2 * SIZE
  12838. ;;
  12839. LDFPD f38, f37 = [AOFFSET]
  12840. adds AOFFSET = - 4 * SIZE, AOFFSET
  12841. ;;
  12842. LDFPD f40, f39 = [AOFFSET]
  12843. adds AOFFSET = - 4 * SIZE, AOFFSET
  12844. ;;
  12845. LDFD f41 = [AOFFSET]
  12846. ;;
  12847. FMPY f67 = f67, f32
  12848. ;;
  12849. FNMA f66 = f67, f33, f66
  12850. ;;
  12851. FNMA f65 = f67, f34, f65
  12852. ;;
  12853. FNMA f64 = f67, f35, f64
  12854. ;;
  12855. FMPY f66 = f66, f36
  12856. ;;
  12857. FNMA f65 = f66, f37, f65
  12858. ;;
  12859. FNMA f64 = f66, f38, f64
  12860. ;;
  12861. FMPY f65 = f65, f39
  12862. ;;
  12863. FNMA f64 = f65, f40, f64
  12864. ;;
  12865. FMPY f64 = f64, f41
  12866. ;;
  12867. STFD [BOFFSET] = f64, SIZE
  12868. ;;
  12869. STFD [BOFFSET] = f65, SIZE
  12870. ;;
  12871. STFD [BOFFSET] = f66, SIZE
  12872. ;;
  12873. STFD [BOFFSET] = f67, -3 * SIZE
  12874. ;;
  12875. adds C1 = -4 * SIZE, C1 // LN: step C1 back four elements before the stores
  12876. ;;
  12877. #endif
  12878. #ifdef LT
  // LT: factors are fetched walking AOFFSET upwards; the final post-decrement
  // of 15 elements undoes the 2+3+1+4+5 advance.  Values are then finalised
  // in the order f64, f65, f66, f67.
  12879. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  12880. ;;
  12881. LDFPD f34, f35 = [AOFFSET]
  12882. adds AOFFSET = 3 * SIZE, AOFFSET
  12883. ;;
  12884. LDFD f36 = [AOFFSET], 1 * SIZE
  12885. ;;
  12886. LDFPD f37, f38 = [AOFFSET]
  12887. adds AOFFSET = 4 * SIZE, AOFFSET
  12888. ;;
  12889. LDFPD f39, f40 = [AOFFSET]
  12890. adds AOFFSET = 5 * SIZE, AOFFSET
  12891. ;;
  12892. LDFD f41 = [AOFFSET], -15 * SIZE
  12893. ;;
  12894. FMPY f64 = f64, f32
  12895. ;;
  12896. FNMA f65 = f64, f33, f65
  12897. ;;
  12898. FNMA f66 = f64, f34, f66
  12899. ;;
  12900. FNMA f67 = f64, f35, f67
  12901. ;;
  12902. FMPY f65 = f65, f36
  12903. ;;
  12904. FNMA f66 = f65, f37, f66
  12905. ;;
  12906. FNMA f67 = f65, f38, f67
  12907. ;;
  12908. FMPY f66 = f66, f39
  12909. ;;
  12910. FNMA f67 = f66, f40, f67
  12911. ;;
  12912. FMPY f67 = f67, f41
  12913. ;;
  12914. STFD [BOFFSET] = f64, SIZE
  12915. ;;
  12916. STFD [BOFFSET] = f65, SIZE
  12917. ;;
  12918. STFD [BOFFSET] = f66, SIZE
  12919. ;;
  12920. STFD [BOFFSET] = f67, -3 * SIZE
  12921. ;;
  12922. #endif
  12923. #ifdef RN
  // RN: all four values are scaled by the single factor loaded from [BOFFSET].
  12924. LDFD f32 = [BOFFSET]
  12925. ;;
  12926. FMPY f64 = f64, f32
  12927. FMPY f65 = f65, f32
  12928. FMPY f66 = f66, f32
  12929. FMPY f67 = f67, f32
  12930. ;;
  12931. STFD [AOFFSET] = f64, SIZE
  12932. ;;
  12933. STFD [AOFFSET] = f65, SIZE
  12934. ;;
  12935. STFD [AOFFSET] = f66, SIZE
  12936. ;;
  12937. STFD [AOFFSET] = f67, -3 * SIZE
  12938. ;;
  12939. #endif
  12940. #ifdef RT
  // RT: same instruction sequence as the RN arm above.
  12941. LDFD f32 = [BOFFSET]
  12942. ;;
  12943. FMPY f64 = f64, f32
  12944. FMPY f65 = f65, f32
  12945. FMPY f66 = f66, f32
  12946. FMPY f67 = f67, f32
  12947. ;;
  12948. STFD [AOFFSET] = f64, SIZE
  12949. ;;
  12950. STFD [AOFFSET] = f65, SIZE
  12951. ;;
  12952. STFD [AOFFSET] = f66, SIZE
  12953. ;;
  12954. STFD [AOFFSET] = f67, - 3 * SIZE
  12955. ;;
  12956. #endif
  12957. { .mmf
  12958. STFD [C1 ] = f64, SIZE
  12959. mov f64 = f0
  12960. }
  12961. ;;
  12962. { .mmi
  12963. STFD [C1 ] = f65, SIZE
  12964. }
  12965. ;;
  12966. { .mmi
  12967. STFD [C1 ] = f66, SIZE
  12968. }
  12969. ;;
  12970. { .mmi
  12971. #ifndef LN
  12972. STFD [C1 ] = f67, SIZE
  12973. #else
  12974. STFD [C1 ] = f67, - 3 * SIZE
  12975. #endif
  12976. }
  12977. ;;
  12978. { .mmf
  12979. mov f72 = f0
  12980. }
  12981. ;;
  12982. mov f65 = f0 // clear remaining accumulators for the next section
  12983. mov f73 = f0
  12984. mov f66 = f0
  12985. mov f74 = f0
  12986. mov f67 = f0
  12987. mov f75 = f0
  12988. ;;
  12989. shladd r2 = K, BASE_SHIFT, r0 // r2 = K << BASE_SHIFT
  12990. ;;
  12991. { .mmi
  12992. sub L = K, KK
  12993. }
  12994. ;;
  12995. { .mmi
  12996. #ifdef RT
  12997. shladd AORIG = r2, 2, AORIG // RT: AORIG += 4 * r2
  12998. #else
  12999. nop __LINE__
  13000. #endif
  13001. }
  13002. ;;
  13003. { .mmi
  13004. #if defined(LT) || defined(RN)
  13005. shladd L = L, BASE_SHIFT, r0 // L = (K - KK) << BASE_SHIFT
  13006. #else
  13007. nop __LINE__
  13008. #endif
  13009. }
  13010. ;;
  13011. { .mmi
  13012. #if defined(LT) || defined(RN)
  13013. shladd AOFFSET = L, 2, AOFFSET // AOFFSET += 4 * L
  13014. #else
  13015. nop __LINE__
  13016. #endif
  13017. }
  13018. ;;
  13019. { .mmi
  13020. #if defined(LT) || defined(RN)
  13021. add BOFFSET = L, BOFFSET // BOFFSET += L
  13022. #else
  13023. nop __LINE__
  13024. #endif
  13025. }
  13026. ;;
  13027. { .mmi
  13028. #ifdef LT
  13029. adds KK = 4, KK
  13030. #elif defined LN
  13031. adds KK = -4, KK
  13032. #else
  13033. nop __LINE__
  13034. #endif
  13035. }
  13036. ;;
  13037. { .mmi
  13038. #if defined(LT) || defined(RN)
  13039. mov L = KK
  13040. #else
  13041. sub L = K, KK
  13042. #endif
  13043. }
  13044. ;;
  // .L131: recompute L for the variant being built, set I = M >> 3 and
  // branch to .L169 when I is zero; otherwise fall through to .L132.
  13045. .align 8
  13046. .L131:
  13047. #if defined(LT) || defined(RN)
  13048. mov L = KK
  13049. #else
  13050. sub L = K, KK
  13051. #endif
  13052. ;;
  13053. shr I = M, 3 // I = M >> 3
  13054. ;;
  13055. cmp.eq p6, p7 = 0, I // p6 = (I == 0)
  13056. (p6) br.cond.dpnt .L169
  13057. ;;
  // .L132: set-up for the eight-element section.  Positions BOFFSET/AOFFSET,
  // preloads f48 from B and f32..f39 from A, issues a prefetch on C1, sets up
  // the PREA / PREB prefetch pointers, derives the loop count from L, clears
  // accumulators f64..f71, then enters the .L133 loop or skips to .L138.
  // LDFD / LDFPD / CPREFETCH are macros defined outside this chunk.
  13058. .align 16
  13059. .L132:
  13060. { .mmi
  13061. cmp.ne p7, p0 = r0, L // p7 = (L != 0); predicates the preloads below
  13062. adds BOFFSET = 0 * SIZE, B // BOFFSET = B
  13063. shl r2 = K, 3 + BASE_SHIFT // r2 = K << (3 + BASE_SHIFT)
  13064. }
  13065. ;;
  13066. shladd r3 = KK, BASE_SHIFT, r0 // r3 = KK << BASE_SHIFT
  13067. ;;
  13068. #if defined(LT) || defined(RN)
  13069. { .mmi
  13070. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  13071. nop __LINE__
  13072. nop __LINE__
  13073. }
  13074. ;;
  13075. #else
  13076. { .mfi
  13077. shladd BOFFSET = KK, BASE_SHIFT, B // BOFFSET = B + (KK << BASE_SHIFT)
  13078. #ifdef LN
  13079. sub AORIG = AORIG, r2 // LN: AORIG -= r2
  13080. #else
  13081. nop __LINE__
  13082. #endif
  13083. }
  13084. ;;
  13085. { .mfi
  13086. (p7) LDFD f48 = [BOFFSET], 1 * SIZE
  13087. shladd AOFFSET = r3, 3, AORIG // AOFFSET = AORIG + 8 * r3
  13088. }
  13089. ;;
  13090. #endif
  13091. (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  13092. ;;
  13093. { .mmf
  13094. (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  13095. }
  13096. ;;
  13097. { .mmf
  13098. (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  13099. }
  13100. { .mfi
  13101. cmp.eq p3, p0 = r0, r0 // p3 = true
  13102. }
  13103. ;;
  13104. { .mmf
  13105. (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  13106. }
  13107. { .mfi
  13108. adds PREC = CPREFETCHSIZE * SIZE, C1
  13109. }
  13110. ;;
  13111. { .mmf
  13112. CPREFETCH [PREC]
  13113. }
  13114. { .mfi
  13115. adds L = 1, L
  13116. }
  13117. ;;
  13118. { .mfi
  13119. adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET // prefetch pointer used by lfetch in .L133
  13120. }
  13121. ;;
  13122. { .mfi
  13123. adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET
  13124. }
  13125. ;;
  13126. { .mfi
  13127. tbit.z p12, p0 = L, 0 // p12 = (bit 0 of the incremented L is clear)
  13128. }
  13129. { .mfi
  13130. shr L = L, 1
  13131. }
  13132. ;;
  13133. { .mfi
  13134. adds L = -1, L
  13135. }
  13136. ;;
  13137. { .mfi
  13138. mov ar.lc = L // loop counter for br.cloop in .L133
  13139. }
  13140. ;;
  13141. mov f64 = f0 // clear accumulators f64..f71
  13142. mov f65 = f0
  13143. mov f66 = f0
  13144. mov f67 = f0
  13145. mov f68 = f0
  13146. mov f69 = f0
  13147. mov f70 = f0
  13148. mov f71 = f0
  13149. ;;
  13150. { .mfb
  13151. cmp.eq p6, p0 = -1, L // p6 = (L == -1): no loop passes needed
  13152. (p6) br.cond.dpnt .L138
  13153. }
  13154. ;;
  // .L133: counted loop (ar.lc / br.cloop) accumulating into f64..f71.
  // Each pass does one step with f32..f39 * f48 and, under p3, a second step
  // with f40..f47 * f56; p4 = (L != 0) gates the preloads for the next pass
  // and p3 is recomputed as (L != 0) only when p12 is set.  PREA is advanced
  // by the lfetch each pass; PREB and C9 are recomputed every pass (no use of
  // PREB is visible inside this loop).
  // FMA / LDFD / LDFPD are macros defined outside this chunk.
  13155. .align 16
  13156. .L133:
  13157. { .mfi
  13158. lfetch.nt1 [PREA], 16 * SIZE
  13159. FMA f64 = f32, f48, f64 // A1 * B1
  13160. cmp.ne p4, p5 = 0, L
  13161. }
  13162. { .mfi
  13163. adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET
  13164. FMA f65 = f33, f48, f65 // A2 * B1
  13165. (p12) cmp.ne p3, p0 = 0, L
  13166. }
  13167. ;;
  13168. { .mfi
  13169. (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE
  13170. FMA f66 = f34, f48, f66 // A3 * B1
  13171. adds C9 = 4 * SIZE, C1
  13172. }
  13173. { .mmf
  13174. (p3) LDFD f56 = [BOFFSET], 1 * SIZE
  13175. nop __LINE__
  13176. FMA f67 = f35, f48, f67 // A4 * B1
  13177. }
  13178. ;;
  13179. { .mfb
  13180. (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE
  13181. FMA f68 = f36, f48, f68 // A5 * B1
  13182. nop __LINE__
  13183. }
  13184. { .mfb
  13185. nop __LINE__
  13186. FMA f69 = f37, f48, f69 // A6 * B1
  13187. nop __LINE__
  13188. }
  13189. ;;
  13190. { .mfb
  13191. (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE
  13192. FMA f70 = f38, f48, f70 // A7 * B1
  13193. nop __LINE__
  13194. }
  13195. { .mfb
  13196. nop __LINE__
  13197. FMA f71 = f39, f48, f71 // A8 * B1
  13198. nop __LINE__
  13199. }
  13200. ;;
  13201. { .mfb
  13202. (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE
  13203. (p3) FMA f64 = f40, f56, f64 // A1 * B1
  13204. nop __LINE__
  13205. }
  13206. { .mfb
  13207. nop __LINE__
  13208. (p3) FMA f65 = f41, f56, f65 // A2 * B1
  13209. nop __LINE__
  13210. }
  13211. ;;
  13212. { .mfb
  13213. (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  13214. (p3) FMA f66 = f42, f56, f66 // A3 * B1
  13215. nop __LINE__
  13216. }
  13217. { .mmf
  13218. (p4) LDFD f48 = [BOFFSET], 1 * SIZE
  13219. nop __LINE__
  13220. (p3) FMA f67 = f43, f56, f67 // A4 * B1
  13221. }
  13222. ;;
  13223. { .mfb
  13224. (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  13225. (p3) FMA f68 = f44, f56, f68 // A5 * B1
  13226. nop __LINE__
  13227. }
  13228. { .mfb
  13229. nop __LINE__
  13230. (p3) FMA f69 = f45, f56, f69 // A6 * B1
  13231. nop __LINE__
  13232. }
  13233. ;;
  13234. { .mfi
  13235. (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  13236. (p3) FMA f70 = f46, f56, f70 // A7 * B1
  13237. adds L = -1, L
  13238. }
  13239. { .mfb
  13240. nop __LINE__
  13241. (p3) FMA f71 = f47, f56, f71 // A8 * B1
  13242. nop __LINE__
  13243. }
  13244. ;;
  13245. { .mfb
  13246. (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE
  13247. nop __LINE__
  13248. br.cloop.sptk.few .L133
  13249. }
  13250. ;;
  13251. .L138:
  13252. #if defined(LN) || defined(RT)
  13253. #ifdef LN
  13254. adds r2 = -8, KK
  13255. #else
  13256. adds r2 = -1, KK
  13257. #endif
  13258. ;;
  13259. shladd r2 = r2, BASE_SHIFT, r0
  13260. ;;
  13261. shladd AOFFSET = r2, 3, AORIG
  13262. add BOFFSET = r2, B
  13263. ;;
  13264. #endif
  13265. adds AOFFSET2 = 4 * SIZE, AOFFSET
  13266. adds BOFFSET2 = 4 * SIZE, BOFFSET
  13267. ;;
  13268. #if defined(LN) || defined(LT)
  13269. LDFPD f32, f33 = [BOFFSET], 2 * SIZE
  13270. ;;
  13271. LDFPD f34, f35 = [BOFFSET], 2 * SIZE
  13272. ;;
  13273. LDFPD f36, f37 = [BOFFSET], 2 * SIZE
  13274. ;;
  13275. LDFPD f38, f39 = [BOFFSET]
  13276. adds BOFFSET = -6 * SIZE, BOFFSET
  13277. ;;
  13278. FSUB f64 = f32, f64
  13279. FSUB f65 = f33, f65
  13280. FSUB f66 = f34, f66
  13281. FSUB f67 = f35, f67
  13282. FSUB f68 = f36, f68
  13283. FSUB f69 = f37, f69
  13284. FSUB f70 = f38, f70
  13285. FSUB f71 = f39, f71
  13286. ;;
  13287. #else
  13288. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  13289. ;;
  13290. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  13291. ;;
  13292. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  13293. ;;
  13294. LDFPD f38, f39 = [AOFFSET]
  13295. adds AOFFSET = -6 * SIZE, AOFFSET
  13296. ;;
  13297. FSUB f64 = f32, f64
  13298. FSUB f65 = f33, f65
  13299. FSUB f66 = f34, f66
  13300. FSUB f67 = f35, f67
  13301. FSUB f68 = f36, f68
  13302. FSUB f69 = f37, f69
  13303. FSUB f70 = f38, f70
  13304. FSUB f71 = f39, f71
  13305. ;;
  13306. #endif
  13307. #ifdef LN
  13308. adds AOFFSET = 62 * SIZE, AOFFSET
  13309. ;;
  13310. LDFPD f33, f32 = [AOFFSET]
  13311. adds AOFFSET = - 2 * SIZE, AOFFSET
  13312. ;;
  13313. LDFPD f35, f34 = [AOFFSET]
  13314. adds AOFFSET = - 2 * SIZE, AOFFSET
  13315. ;;
  13316. LDFPD f37, f36 = [AOFFSET]
  13317. adds AOFFSET = - 2 * SIZE, AOFFSET
  13318. ;;
  13319. LDFPD f39, f38 = [AOFFSET]
  13320. adds AOFFSET = - 2 * SIZE, AOFFSET
  13321. ;;
  13322. LDFD f40 = [AOFFSET], -2 * SIZE
  13323. ;;
  13324. LDFPD f42, f41 = [AOFFSET]
  13325. adds AOFFSET = - 2 * SIZE, AOFFSET
  13326. ;;
  13327. LDFPD f44, f43 = [AOFFSET]
  13328. adds AOFFSET = - 2 * SIZE, AOFFSET
  13329. ;;
  13330. LDFPD f46, f45 = [AOFFSET]
  13331. adds AOFFSET = - 4 * SIZE, AOFFSET
  13332. ;;
  13333. LDFPD f48, f47 = [AOFFSET]
  13334. adds AOFFSET = - 2 * SIZE, AOFFSET
  13335. ;;
  13336. LDFPD f50, f49 = [AOFFSET]
  13337. adds AOFFSET = - 2 * SIZE, AOFFSET
  13338. ;;
  13339. LDFPD f52, f51 = [AOFFSET]
  13340. adds AOFFSET = - 4 * SIZE, AOFFSET
  13341. ;;
  13342. LDFD f53 = [AOFFSET], -2 * SIZE
  13343. ;;
  13344. LDFPD f55, f54 = [AOFFSET]
  13345. adds AOFFSET = - 2 * SIZE, AOFFSET
  13346. ;;
  13347. LDFPD f57, f56 = [AOFFSET]
  13348. adds AOFFSET = - 6 * SIZE, AOFFSET
  13349. ;;
  13350. LDFPD f59, f58 = [AOFFSET]
  13351. adds AOFFSET = - 2 * SIZE, AOFFSET
  13352. ;;
  13353. LDFPD f61, f60 = [AOFFSET]
  13354. adds AOFFSET = - 6 * SIZE, AOFFSET
  13355. ;;
  13356. LDFD f16 = [AOFFSET], -2 * SIZE
  13357. ;;
  13358. LDFPD f18, f17 = [AOFFSET]
  13359. adds AOFFSET = - 8 * SIZE, AOFFSET
  13360. ;;
  13361. LDFPD f20, f19 = [AOFFSET]
  13362. adds AOFFSET = - 8 * SIZE, AOFFSET
  13363. ;;
  13364. LDFD f21 = [AOFFSET]
  13365. ;;
  13366. FMPY f71 = f71, f32
  13367. ;;
  13368. FNMA f70 = f71, f33, f70
  13369. ;;
  13370. FNMA f69 = f71, f34, f69
  13371. ;;
  13372. FNMA f68 = f71, f35, f68
  13373. ;;
  13374. FNMA f67 = f71, f36, f67
  13375. ;;
  13376. FNMA f66 = f71, f37, f66
  13377. ;;
  13378. FNMA f65 = f71, f38, f65
  13379. ;;
  13380. FNMA f64 = f71, f39, f64
  13381. ;;
  13382. FMPY f70 = f70, f40
  13383. ;;
  13384. FNMA f69 = f70, f41, f69
  13385. ;;
  13386. FNMA f68 = f70, f42, f68
  13387. ;;
  13388. FNMA f67 = f70, f43, f67
  13389. ;;
  13390. FNMA f66 = f70, f44, f66
  13391. ;;
  13392. FNMA f65 = f70, f45, f65
  13393. ;;
  13394. FNMA f64 = f70, f46, f64
  13395. ;;
  13396. FMPY f69 = f69, f47
  13397. ;;
  13398. FNMA f68 = f69, f48, f68
  13399. ;;
  13400. FNMA f67 = f69, f49, f67
  13401. ;;
  13402. FNMA f66 = f69, f50, f66
  13403. ;;
  13404. FNMA f65 = f69, f51, f65
  13405. ;;
  13406. FNMA f64 = f69, f52, f64
  13407. ;;
  13408. FMPY f68 = f68, f53
  13409. ;;
  13410. FNMA f67 = f68, f54, f67
  13411. ;;
  13412. FNMA f66 = f68, f55, f66
  13413. ;;
  13414. FNMA f65 = f68, f56, f65
  13415. ;;
  13416. FNMA f64 = f68, f57, f64
  13417. ;;
  13418. FMPY f67 = f67, f58
  13419. ;;
  13420. FNMA f66 = f67, f59, f66
  13421. ;;
  13422. FNMA f65 = f67, f60, f65
  13423. ;;
  13424. FNMA f64 = f67, f61, f64
  13425. ;;
  13426. FMPY f66 = f66, f16
  13427. ;;
  13428. FNMA f65 = f66, f17, f65
  13429. ;;
  13430. FNMA f64 = f66, f18, f64
  13431. ;;
  13432. FMPY f65 = f65, f19
  13433. ;;
  13434. FNMA f64 = f65, f20, f64
  13435. ;;
  13436. FMPY f64 = f64, f21
  13437. ;;
  13438. STFD [BOFFSET] = f64, SIZE
  13439. STFD [BOFFSET2] = f68, SIZE
  13440. ;;
  13441. STFD [BOFFSET] = f65, SIZE
  13442. STFD [BOFFSET2] = f69, SIZE
  13443. ;;
  13444. STFD [BOFFSET] = f66, SIZE
  13445. STFD [BOFFSET2] = f70, SIZE
  13446. ;;
  13447. STFD [BOFFSET] = f67, - 3 * SIZE
  13448. STFD [BOFFSET2] = f71, - 3 * SIZE
  13449. ;;
  13450. adds C1 = -8 * SIZE, C1
  13451. ;;
  13452. #endif
  13453. #ifdef LT
  /*
   * LT variant: triangular solve for the eight values held in f64..f71.
   *
   * The loads below read 36 values starting at AOFFSET. Counting in units
   * of SIZE from the entry value of AOFFSET, the rows start at offsets
   * 0, 9, 18, 27, 36, 45, 54 and 63, and row i supplies 8 - i values. This
   * is the upper triangle (diagonal included) of an 8x8 tile with a row
   * stride of 8 elements. The final adds of -63 * SIZE puts AOFFSET back
   * to its entry value.
   *
   * LDFD, LDFPD, STFD, FMPY and FNMA are macros defined outside this chunk.
   * NOTE(review): FNMA d = a, b, c is read here as d = c - a * b, and the
   * diagonal entries are applied with a multiply, so they are presumably
   * stored already inverted. Confirm against the macro definitions and
   * the packing routine.
   */
  /* Row 0: offsets 0..7 -> f32..f39 */
  13454. LDFPD f32, f33 = [AOFFSET], 2 * SIZE
  13455. ;;
  13456. LDFPD f34, f35 = [AOFFSET], 2 * SIZE
  13457. ;;
  13458. LDFPD f36, f37 = [AOFFSET], 2 * SIZE
  13459. ;;
  13460. LDFPD f38, f39 = [AOFFSET]
  13461. adds AOFFSET = 3 * SIZE, AOFFSET
  13462. ;;
  /* Row 1: offsets 9..15 -> f40..f46 */
  13463. LDFD f40 = [AOFFSET], 1 * SIZE
  13464. ;;
  13465. LDFPD f41, f42 = [AOFFSET], 2 * SIZE
  13466. ;;
  13467. LDFPD f43, f44 = [AOFFSET], 2 * SIZE
  13468. ;;
  13469. LDFPD f45, f46 = [AOFFSET]
  13470. adds AOFFSET = 4 * SIZE, AOFFSET
  13471. ;;
  /* Row 2: offsets 18..23 -> f47..f52 */
  13472. LDFPD f47, f48 = [AOFFSET], 2 * SIZE
  13473. ;;
  13474. LDFPD f49, f50 = [AOFFSET], 2 * SIZE
  13475. ;;
  13476. LDFPD f51, f52 = [AOFFSET]
  13477. adds AOFFSET = 5 * SIZE, AOFFSET
  13478. ;;
  /* Row 3: offsets 27..31 -> f53..f57 */
  13479. LDFD f53 = [AOFFSET], 1 * SIZE
  13480. ;;
  13481. LDFPD f54, f55 = [AOFFSET], 2 * SIZE
  13482. ;;
  13483. LDFPD f56, f57 = [AOFFSET]
  13484. adds AOFFSET = 6 * SIZE, AOFFSET
  13485. ;;
  /* Row 4: offsets 36..39 -> f58..f61 */
  13486. LDFPD f58, f59 = [AOFFSET], 2 * SIZE
  13487. ;;
  13488. LDFPD f60, f61 = [AOFFSET]
  13489. adds AOFFSET = 7 * SIZE, AOFFSET
  13490. ;;
  /* Row 5: offsets 45..47 -> f16..f18 */
  13491. LDFD f16 = [AOFFSET], 1 * SIZE
  13492. ;;
  13493. LDFPD f17, f18 = [AOFFSET]
  13494. adds AOFFSET = 8 * SIZE, AOFFSET
  13495. ;;
  /* Row 6: offsets 54..55 -> f19, f20 */
  13496. LDFPD f19, f20 = [AOFFSET]
  13497. adds AOFFSET = 9 * SIZE, AOFFSET
  13498. ;;
  /* Row 7: offset 63 -> f21, then rewind AOFFSET to its entry value */
  13499. LDFD f21 = [AOFFSET]
  13500. adds AOFFSET = -63 * SIZE, AOFFSET
  13501. ;;
  /*
   * Elimination chain, one step per row. Each step scales the pivot
   * register by the row diagonal entry and then updates every later
   * register with that pivot and the matching off-diagonal entry. Each
   * instruction is in its own instruction group because it consumes the
   * result of an earlier one in the chain.
   */
  /* Step 0: pivot f64, row 0 coefficients f32..f39 */
  13502. FMPY f64 = f64, f32
  13503. ;;
  13504. FNMA f65 = f64, f33, f65
  13505. ;;
  13506. FNMA f66 = f64, f34, f66
  13507. ;;
  13508. FNMA f67 = f64, f35, f67
  13509. ;;
  13510. FNMA f68 = f64, f36, f68
  13511. ;;
  13512. FNMA f69 = f64, f37, f69
  13513. ;;
  13514. FNMA f70 = f64, f38, f70
  13515. ;;
  13516. FNMA f71 = f64, f39, f71
  13517. ;;
  /* Step 1: pivot f65, row 1 coefficients f40..f46 */
  13518. FMPY f65 = f65, f40
  13519. ;;
  13520. FNMA f66 = f65, f41, f66
  13521. ;;
  13522. FNMA f67 = f65, f42, f67
  13523. ;;
  13524. FNMA f68 = f65, f43, f68
  13525. ;;
  13526. FNMA f69 = f65, f44, f69
  13527. ;;
  13528. FNMA f70 = f65, f45, f70
  13529. ;;
  13530. FNMA f71 = f65, f46, f71
  13531. ;;
  /* Step 2: pivot f66, row 2 coefficients f47..f52 */
  13532. FMPY f66 = f66, f47
  13533. ;;
  13534. FNMA f67 = f66, f48, f67
  13535. ;;
  13536. FNMA f68 = f66, f49, f68
  13537. ;;
  13538. FNMA f69 = f66, f50, f69
  13539. ;;
  13540. FNMA f70 = f66, f51, f70
  13541. ;;
  13542. FNMA f71 = f66, f52, f71
  13543. ;;
  /* Step 3: pivot f67, row 3 coefficients f53..f57 */
  13544. FMPY f67 = f67, f53
  13545. ;;
  13546. FNMA f68 = f67, f54, f68
  13547. ;;
  13548. FNMA f69 = f67, f55, f69
  13549. ;;
  13550. FNMA f70 = f67, f56, f70
  13551. ;;
  13552. FNMA f71 = f67, f57, f71
  13553. ;;
  /* Step 4: pivot f68, row 4 coefficients f58..f61 */
  13554. FMPY f68 = f68, f58
  13555. ;;
  13556. FNMA f69 = f68, f59, f69
  13557. ;;
  13558. FNMA f70 = f68, f60, f70
  13559. ;;
  13560. FNMA f71 = f68, f61, f71
  13561. ;;
  /* Step 5: pivot f69, row 5 coefficients f16..f18 */
  13562. FMPY f69 = f69, f16
  13563. ;;
  13564. FNMA f70 = f69, f17, f70
  13565. ;;
  13566. FNMA f71 = f69, f18, f71
  13567. ;;
  /* Step 6: pivot f70, row 6 coefficients f19, f20 */
  13568. FMPY f70 = f70, f19
  13569. ;;
  13570. FNMA f71 = f70, f20, f71
  13571. ;;
  /* Step 7: only the diagonal scale remains for f71 */
  13572. FMPY f71 = f71, f21
  13573. ;;
  /*
   * Write the solved values back: f64..f67 through BOFFSET and f68..f71
   * through BOFFSET2. Three post-increments of SIZE followed by one of
   * -3 * SIZE leave both pointers at their entry values.
   * NOTE(review): BOFFSET2 is set up outside this chunk, presumably to
   * BOFFSET + 4 * SIZE. Confirm at its definition.
   */
  13574. STFD [BOFFSET] = f64, SIZE
  13575. STFD [BOFFSET2] = f68, SIZE
  13576. ;;
  13577. STFD [BOFFSET] = f65, SIZE
  13578. STFD [BOFFSET2] = f69, SIZE
  13579. ;;
  13580. STFD [BOFFSET] = f66, SIZE
  13581. STFD [BOFFSET2] = f70, SIZE
  13582. ;;
  13583. STFD [BOFFSET] = f67, -3 * SIZE
  13584. STFD [BOFFSET2] = f71, -3 * SIZE
  13585. ;;
  /* C9 = C1 + 4 * SIZE, the address used for the upper four results. */
  13586. adds C9 = 4 * SIZE, C1
  13587. ;;
  13588. #endif
  13589. #ifdef RN
  /*
   * RN variant: a single coefficient is loaded from BOFFSET and all eight
   * values f64..f71 are multiplied by it. BOFFSET is not modified here.
   * NOTE(review): the coefficient is applied with a multiply, so it is
   * presumably stored already inverted. Confirm with the packing routine.
   */
  13590. LDFD f32 = [BOFFSET]
  13591. ;;
  /* The eight products are independent and share one instruction group. */
  13592. FMPY f64 = f64, f32
  13593. FMPY f68 = f68, f32
  13594. FMPY f65 = f65, f32
  13595. FMPY f69 = f69, f32
  13596. FMPY f66 = f66, f32
  13597. FMPY f70 = f70, f32
  13598. FMPY f67 = f67, f32
  13599. FMPY f71 = f71, f32
  13600. ;;
  /*
   * Write f64..f67 through AOFFSET and f68..f71 through AOFFSET2. Three
   * post-increments of SIZE and a final -3 * SIZE leave both pointers at
   * their entry values.
   * NOTE(review): AOFFSET2 is set up outside this chunk, presumably to
   * AOFFSET + 4 * SIZE. Confirm at its definition.
   */
  13601. STFD [AOFFSET] = f64, SIZE
  13602. STFD [AOFFSET2] = f68, SIZE
  13603. ;;
  13604. STFD [AOFFSET] = f65, SIZE
  13605. STFD [AOFFSET2] = f69, SIZE
  13606. ;;
  13607. STFD [AOFFSET] = f66, SIZE
  13608. STFD [AOFFSET2] = f70, SIZE
  13609. ;;
  13610. STFD [AOFFSET] = f67, -3 * SIZE
  13611. STFD [AOFFSET2] = f71, -3 * SIZE
  13612. ;;
  13613. #endif
  13614. #ifdef RT
  /*
   * RT variant: the instruction sequence is the same as in the RN section.
   * One coefficient is loaded from BOFFSET (which is left unchanged) and
   * f64..f71 are each multiplied by it.
   * NOTE(review): presumably the coefficient is a pre-inverted diagonal
   * entry. Confirm with the packing routine.
   */
  13615. LDFD f32 = [BOFFSET]
  13616. ;;
  /* Eight independent products in one instruction group. */
  13617. FMPY f64 = f64, f32
  13618. FMPY f68 = f68, f32
  13619. FMPY f65 = f65, f32
  13620. FMPY f69 = f69, f32
  13621. FMPY f66 = f66, f32
  13622. FMPY f70 = f70, f32
  13623. FMPY f67 = f67, f32
  13624. FMPY f71 = f71, f32
  13625. ;;
  /*
   * Write f64..f67 through AOFFSET and f68..f71 through AOFFSET2. The
   * closing -3 * SIZE post-increment cancels the three SIZE steps, so
   * both pointers end where they started.
   */
  13626. STFD [AOFFSET] = f64, SIZE
  13627. STFD [AOFFSET2] = f68, SIZE
  13628. ;;
  13629. STFD [AOFFSET] = f65, SIZE
  13630. STFD [AOFFSET2] = f69, SIZE
  13631. ;;
  13632. STFD [AOFFSET] = f66, SIZE
  13633. STFD [AOFFSET2] = f70, SIZE
  13634. ;;
  13635. STFD [AOFFSET] = f67, -3 * SIZE
  13636. STFD [AOFFSET2] = f71, -3 * SIZE
  13637. ;;
  13638. #endif
  /*
   * Common tail of the 8-wide step: write the eight results to C, update
   * the loop counter and the panel pointers, clear the accumulators and
   * branch back to .L132 (defined outside this chunk) while p6 is set.
   */
  13639. adds C9 = 4 * SIZE, C1
  13640. ;;
  /*
   * f64..f67 are stored through C1 and f68..f71 through C9, which is
   * C1 + 4 * SIZE. f64 is cleared in the same bundle as its store.
   */
  13641. { .mmf
  13642. STFD [C1 ] = f64, SIZE
  13643. STFD [C9 ] = f68, SIZE
  13644. mov f64 = f0
  13645. }
  13646. ;;
  13647. { .mmi
  13648. STFD [C1 ] = f65, SIZE
  13649. STFD [C9 ] = f69, SIZE
  13650. }
  13651. ;;
  13652. { .mmi
  13653. STFD [C1 ] = f66, SIZE
  13654. STFD [C9 ] = f70, SIZE
  13655. }
  13656. ;;
  /*
   * Last store. Without LN the final post-increment is 5 * SIZE, so C1
   * ends 8 * SIZE past its value at the first store. With LN it is
   * -3 * SIZE, so C1 ends where it was at the first store.
   */
  13657. { .mmi
  13658. #ifndef LN
  13659. STFD [C1 ] = f67, 5 * SIZE
  13660. #else
  13661. STFD [C1 ] = f67, - 3 * SIZE
  13662. #endif
  13663. STFD [C9 ] = f71
  13664. }
  13665. ;;
  /* p6 = (I != 1), evaluated before I is decremented. */
  13666. { .mmf
  13667. cmp.ne p6, p0 = 1, I
  13668. }
  13669. ;;
  13670. adds I = -1, I
  13671. ;;
  /* r2 = K << BASE_SHIFT (r0 reads as zero on IA-64). */
  13672. { .mmi
  13673. shladd r2 = K, BASE_SHIFT, r0
  13674. }
  13675. ;;
  /* L = K - KK. Only the LT / RN paths below consume this value. */
  13676. { .mmi
  13677. sub L = K, KK
  13678. }
  13679. ;;
  /* RT only: AORIG += r2 << 3, that is 8 * K elements in bytes. */
  13680. { .mmi
  13681. #ifdef RT
  13682. shladd AORIG = r2, 3, AORIG
  13683. #else
  13684. nop __LINE__
  13685. #endif
  13686. }
  13687. ;;
  /* LT / RN only: convert L = K - KK from elements to bytes. */
  13688. { .mmi
  13689. #if defined(LT) || defined(RN)
  13690. shladd L = L, BASE_SHIFT, r0
  13691. #else
  13692. nop __LINE__
  13693. #endif
  13694. }
  13695. ;;
  13696. ;;
  /* LT / RN only: AOFFSET += 8 * L. */
  13697. { .mmi
  13698. #if defined(LT) || defined(RN)
  13699. shladd AOFFSET = L, 3, AOFFSET
  13700. #else
  13701. nop __LINE__
  13702. #endif
  13703. }
  13704. ;;
  /* LT / RN only: BOFFSET += L. */
  13705. { .mmi
  13706. #if defined(LT) || defined(RN)
  13707. add BOFFSET = L, BOFFSET
  13708. #else
  13709. nop __LINE__
  13710. #endif
  13711. }
  13712. ;;
  /* KK moves by the tile height: +8 for LT, -8 for LN, unchanged otherwise. */
  13713. { .mmi
  13714. #ifdef LT
  13715. adds KK = 8, KK
  13716. #elif defined LN
  13717. adds KK = -8, KK
  13718. #else
  13719. nop __LINE__
  13720. #endif
  13721. }
  13722. ;;
  /*
   * L for the next pass: KK for LT / RN, K - KK for the other variants.
   * NOTE(review): presumably the inner-product trip count used at .L132.
   * Confirm at that label.
   */
  13723. { .mmi
  13724. #if defined(LT) || defined(RN)
  13725. mov L = KK
  13726. #else
  13727. sub L = K, KK
  13728. #endif
  13729. }
  13730. ;;
  /* Clear the eight accumulators (f0 reads as +0.0 on IA-64). */
  13731. mov f64 = f0
  13732. mov f65 = f0
  13733. mov f66 = f0
  13734. mov f67 = f0
  13735. mov f68 = f0
  13736. mov f69 = f0
  13737. mov f70 = f0
  13738. mov f71 = f0
  /* Taken when I was not 1 before the decrement above. */
  13739. (p6) br.cond.dptk .L132
  13740. .align 8
  13741. .L169:
  /*
   * Pointer and offset update, selected per variant:
   *   LN      : B += K << BASE_SHIFT
   *   LT / RN : B  = BOFFSET
   *   RN      : KK += 1
   *   RT      : KK -= 1
   * AOFFSET is reset to A in every variant. Control then falls through
   * to .L999.
   */
  13742. { .mii
  13743. #ifdef LN
  13744. shladd B = K, BASE_SHIFT, B
  13745. #elif defined(LT) || defined(RN)
  13746. mov B = BOFFSET
  13747. #else
  13748. nop __LINE__
  13749. #endif
  13750. #ifdef RN
  13751. adds KK = 1, KK
  13752. #elif defined RT
  13753. adds KK = -1, KK
  13754. #else
  13755. nop __LINE__
  13756. #endif
  13757. mov AOFFSET = A
  13758. }
  13759. ;;
  13760. .align 16
  13761. .L999:
  /*
   * Function exit. r8 is set to zero (r0 reads as zero on IA-64).
   * NOTE(review): r8 is presumably the integer return value register
   * under the IA-64 software conventions. Confirm.
   */
  13762. mov r8 = r0
  /* r9 = SP + 16, so SP and r9 address alternating 16-byte spill slots. */
  13763. adds r9 = 1 * 16, SP
  13764. ;;
  /*
   * Reload f16..f21 from six consecutive 16-byte slots. Both pointers
   * post-increment by 32, which leaves SP 96 bytes above its value at
   * .L999.
   * NOTE(review): the matching spills are presumably in the prologue,
   * outside this chunk.
   */
  13765. ldf.fill f16 = [SP], 32
  13766. ldf.fill f17 = [r9], 32
  13767. ;;
  13768. ldf.fill f18 = [SP], 32
  13769. ldf.fill f19 = [r9], 32
  13770. ;;
  13771. ldf.fill f20 = [SP], 32
  13772. ldf.fill f21 = [r9], 32
  13773. ;;
  /* Restore the loop count register from ARLC. */
  13774. mov ar.lc = ARLC
  13775. ;;
  /* Restore the predicate registers from PR. The mask -1 selects all of them. */
  13776. mov pr = PR, -1
  13777. ;;
  /* Restore ar.pfs from ARPFS before returning. */
  13778. mov ar.pfs = ARPFS
  13779. ;;
  13780. br.ret.sptk.many b0
  13781. EPILOGUE