You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot.S 16 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* PREFETCH_SIZE: prefetch look-ahead, counted in elements. It is multiplied */
  /* by SIZE further down to form the byte offset that PREX/PREY start at */
  /* relative to X1/Y1. A separate value is chosen per precision. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 8 + 4)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 8 + 8)
  44. #else
  45. #define PREFETCH_SIZE (32 * 8 + 16)
  46. #endif
  /* Incoming arguments as this file uses them: */
  /* N element count (tested against 0, split into N >> 4 and N & 15) */
  /* X1, Y1 load pointers into the two vectors (post-incremented) */
  /* INCX, INCY strides; shifted left by BASE_SHIFT in place before use */
  47. #define N r32
  48. #define X1 r33
  49. #define INCX r34
  50. #define Y1 r35
  51. #define INCY r36
  /* Working registers: */
  /* PREX, PREY prefetch pointers running ahead of X1/Y1 */
  /* I number of 16-element loop passes minus one (copied to ar.lc) */
  /* J N & 15, the element count left for the remainder code */
  /* X2, Y2 store pointers; they start equal to X1/Y1 */
  /* INCX16/INCY16 stride used to advance the prefetch pointers */
  /* PR, ARLC saved copies of the predicate registers and of ar.lc */
  52. #define PREX r2
  53. #define PREY r3
  54. #define I r14
  55. #define J r15
  56. #define Y2 r16
  57. #define X2 r17
  58. #define INCX16 r18
  59. #define INCY16 r19
  60. #define PR r30
  61. #define ARLC r31
  /* Rotation coefficients. Under XDOUBLE, S is reloaded from memory at */
  /* r12 + 16 during setup instead of being used as it arrives in f9. */
  62. #define C f8
  63. #define S f9
  /* Entry and setup. Returns at once when N <= 0; otherwise prepares the */
  /* strides, pointers, loop counters and rotating predicates, then either */
  /* falls into the pipelined loop at .L12 or, when N < 16, branches directly */
  /* to the remainder code at .L15. */
  64. PROLOGUE
  65. .prologue
  66. PROFCODE
  /* r29 = r12 + 16 (address S is loaded from under XDOUBLE). INCX is shifted */
  /* left by BASE_SHIFT -- presumably elements to bytes; BASE_SHIFT comes from */
  /* common.h, confirm there. ar.lc is saved in ARLC because the loop */
  /* overwrites it. */
  67. { .mmi
  68. adds r29 = 16, r12
  69. shladd INCX = INCX, BASE_SHIFT, r0
  70. .save ar.lc, ARLC
  71. mov ARLC = ar.lc
  72. }
  /* p6 = !(0 < N): nothing to do, return. I = N >> 4 = full 16-element passes. */
  73. { .mib
  74. cmp.lt p0, p6 = r0, N
  75. shr I = N, 4
  76. (p6) br.ret.spnt.many b0
  77. }
  78. .body
  79. ;;
  /* Scale INCY the same way as INCX and save the predicate registers in PR. */
  80. { .mmi
  81. #ifdef XDOUBLE
  82. LDFD S = [r29]
  83. #else
  84. nop __LINE__
  85. #endif
  86. shladd INCY = INCY, BASE_SHIFT, r0
  87. mov PR = pr
  88. }
  /* Store pointers start where the load pointers start; clear the rotating */
  /* predicates so that only p16 (set below) is live on the first pass. */
  89. { .mmi
  90. mov X2 = X1
  91. mov Y2 = Y1
  92. mov pr.rot= 0
  93. }
  94. ;;
  /* Prefetch stride: 16 strides per step, or 8 under XDOUBLE, where the loop */
  /* issues two prefetch pairs per pass. ar.ec = 3 matches the three pipeline */
  /* stages p16/p17/p18 used by the loop. */
  95. { .mmi
  96. #ifndef XDOUBLE
  97. shladd INCX16 = INCX, 4, r0
  98. shladd INCY16 = INCY, 4, r0
  99. #else
  100. shladd INCX16 = INCX, 3, r0
  101. shladd INCY16 = INCY, 3, r0
  102. #endif
  103. mov ar.ec= 3
  104. }
  /* I becomes (passes - 1) for ar.lc; p16 = 1 starts the first stage; */
  /* J = N & 15 is the count left over for the remainder code. */
  105. { .mmi
  106. adds I = -1, I
  107. cmp.eq p16, p0 = r0, r0
  108. and J = 15, N
  109. }
  110. ;;
  /* Prefetch pointers start PREFETCH_SIZE * SIZE bytes past the load pointers. */
  111. { .mmi
  112. adds PREX = PREFETCH_SIZE * SIZE, X1
  113. adds PREY = PREFETCH_SIZE * SIZE, Y1
  114. mov ar.lc = I
  115. }
  /* p6 = (I == -1), i.e. N < 16: skip the loop entirely. */
  /* p12 = bit 3 of N: the remainder contains an 8-element chunk. */
  116. { .mib
  117. cmp.eq p6 ,p0 = -1, I
  118. tbit.z p0, p12 = N, 3
  119. (p6) br.cond.dpnt .L15
  120. }
  121. ;;
  /* Main loop: each pass covers 16 elements of both vectors and is software */
  /* pipelined over three stages selected by the rotating predicates: */
  /* p16 - issue 16 X loads, 13 Y loads and the prefetches; */
  /* p17 - issue the 3 remaining Y loads (f120/f123/f126) and start the */
  /* arithmetic on the first elements; */
  /* p18 - finish the arithmetic and store through X2/Y2. */
  /* Because the registers rotate by one per pass, a value is read one */
  /* register number higher per stage than it was loaded into: the Y value */
  /* loaded into f80 under p16 is read as f81 under p17, and the one loaded */
  /* into f92 is read as f94 under p18. */
  /* Per element, X receives C*x + S*y (FMPY by S, then FMA by C) and Y */
  /* receives C*y - S*x (FMPY by C, then FNMA by S), assuming FMPY/FMA/FNMA */
  /* from common.h are multiply, fused multiply-add and negated fused */
  /* multiply-add. f6/f7, f10/f11, f12/f13 and f14/f15 hold four X/Y result */
  /* pairs in flight. */
  122. .align 32
  123. .L12:
  124. { .mmf
  125. (p18) STFD [X2] = f6
  126. (p16) lfetch.excl.nt1 [PREY], INCY16
  127. (p18) FMA f12 = C, f40, f12
  128. }
  129. { .mmf
  130. (p17) LDFD f120 = [Y1], INCY
  131. (p18) add X2 = X2, INCX
  132. (p18) FMPY f6 = S, f94
  133. }
  134. ;;
  135. { .mmf
  136. (p18) STFD [Y2] = f7
  137. (p16) lfetch.excl.nt1 [PREX], INCX16
  138. (p18) FNMA f13 = S, f40, f13
  139. }
  140. { .mmf
  141. (p16) LDFD f32 = [X1], INCX
  142. (p18) add Y2 = Y2, INCY
  143. (p18) FMPY f7 = C, f94
  144. }
  145. ;;
  146. { .mmf
  147. (p18) STFD [X2] = f10
  148. (p17) LDFD f123 = [Y1], INCY
  149. (p18) FMA f14 = C, f43, f14
  150. }
  151. { .mmf
  152. (p18) add X2 = X2, INCX
  153. nop __LINE__
  154. (p18) FMPY f10 = S, f97
  155. }
  156. ;;
  157. { .mmf
  158. (p18) STFD [Y2] = f11
  159. (p16) LDFD f35 = [X1], INCX
  160. (p18) FNMA f15 = S, f43, f15
  161. }
  162. { .mmf
  163. (p18) add Y2 = Y2, INCY
  164. nop __LINE__
  165. (p18) FMPY f11 = C, f97
  166. }
  167. ;;
  168. { .mmf
  169. (p18) STFD [X2] = f12
  170. (p17) LDFD f126 = [Y1], INCY
  171. (p18) FMPY f12 = S, f100
  172. }
  173. { .mmf
  174. (p18) add X2 = X2, INCX
  175. nop __LINE__
  176. (p18) FMA f6 = C, f46, f6
  177. }
  178. ;;
  179. { .mmf
  180. (p18) STFD [Y2] = f13
  181. (p16) LDFD f38 = [X1], INCX
  182. (p18) FMPY f13 = C, f100
  183. }
  184. { .mmf
  185. (p18) add Y2 = Y2, INCY
  186. nop __LINE__
  187. (p18) FNMA f7 = S, f46, f7
  188. }
  189. ;;
  190. { .mmf
  191. (p18) STFD [X2] = f14
  192. (p16) LDFD f80 = [Y1], INCY
  193. (p18) FMPY f14 = S, f103
  194. }
  195. { .mmf
  196. (p18) add X2 = X2, INCX
  197. nop __LINE__
  198. (p18) FMA f10 = C, f49, f10
  199. }
  200. ;;
  201. { .mmf
  202. (p18) STFD [Y2] = f15
  203. (p16) LDFD f41 = [X1], INCX
  204. (p18) FMPY f15 = C, f103
  205. }
  206. { .mmf
  207. (p18) add Y2 = Y2, INCY
  208. nop __LINE__
  209. (p18) FNMA f11 = S, f49, f11
  210. }
  211. ;;
  212. { .mmf
  213. (p18) STFD [X2] = f6
  214. (p16) LDFD f83 = [Y1], INCY
  215. (p18) FMA f12 = C, f52, f12
  216. }
  217. { .mmf
  218. (p18) add X2 = X2, INCX
  219. nop __LINE__
  220. (p18) FMPY f6 = S, f106
  221. }
  222. ;;
  223. { .mmf
  224. (p18) STFD [Y2] = f7
  225. (p16) LDFD f44 = [X1], INCX
  226. (p18) FNMA f13 = S, f52, f13
  227. }
  228. { .mmf
  229. (p18) add Y2 = Y2, INCY
  230. nop __LINE__
  231. (p18) FMPY f7 = C, f106
  232. }
  233. ;;
  234. { .mmf
  235. (p18) STFD [X2] = f10
  236. (p16) LDFD f86 = [Y1], INCY
  237. (p18) FMA f14 = C, f55, f14
  238. }
  239. { .mmf
  240. (p18) add X2 = X2, INCX
  241. nop __LINE__
  242. (p18) FMPY f10 = S, f109
  243. }
  244. ;;
  245. { .mmf
  246. (p18) STFD [Y2] = f11
  247. (p16) LDFD f47 = [X1], INCX
  248. (p18) FNMA f15 = S, f55, f15
  249. }
  250. { .mmf
  251. (p18) add Y2 = Y2, INCY
  252. nop __LINE__
  253. (p18) FMPY f11 = C, f109
  254. }
  255. ;;
  256. { .mmf
  257. (p18) STFD [X2] = f12
  258. (p16) LDFD f89 = [Y1], INCY
  259. (p18) FMPY f12 = S, f112
  260. }
  261. { .mmf
  262. (p18) add X2 = X2, INCX
  263. nop __LINE__
  264. (p18) FMA f6 = C, f58, f6
  265. }
  266. ;;
  267. { .mmf
  268. (p18) STFD [Y2] = f13
  269. (p16) LDFD f50 = [X1], INCX
  270. (p18) FMPY f13 = C, f112
  271. }
  272. { .mmf
  273. (p18) add Y2 = Y2, INCY
  274. nop __LINE__
  275. (p18) FNMA f7 = S, f58, f7
  276. }
  277. ;;
  278. { .mmf
  279. (p18) STFD [X2] = f14
  280. (p16) LDFD f92 = [Y1], INCY
  281. (p18) FMPY f14 = S, f115
  282. }
  283. { .mmf
  284. (p18) add X2 = X2, INCX
  285. nop __LINE__
  286. (p18) FMA f10 = C, f61, f10
  287. }
  288. ;;
  289. { .mmf
  290. (p18) STFD [Y2] = f15
  291. (p16) LDFD f53 = [X1], INCX
  292. (p18) FMPY f15 = C, f115
  293. }
  294. { .mmf
  295. (p18) add Y2 = Y2, INCY
  296. nop __LINE__
  297. (p18) FNMA f11 = S, f61, f11
  298. }
  299. ;;
  /* Halfway through the pass. The XDOUBLE variant issues a second prefetch */
  /* pair here, since its INCX16/INCY16 only span 8 strides; the other */
  /* precisions run the same loads and arithmetic without the prefetches. */
  300. #ifndef XDOUBLE
  301. { .mmf
  302. (p18) STFD [X2] = f6
  303. (p16) LDFD f95 = [Y1], INCY
  304. (p18) FMA f12 = C, f64, f12
  305. }
  306. { .mmf
  307. (p18) add X2 = X2, INCX
  308. nop __LINE__
  309. (p18) FMPY f6 = S, f118
  310. }
  311. ;;
  312. { .mmf
  313. (p18) STFD [Y2] = f7
  314. (p16) LDFD f56 = [X1], INCX
  315. (p18) FNMA f13 = S, f64, f13
  316. }
  317. { .mmf
  318. (p18) add Y2 = Y2, INCY
  319. nop __LINE__
  320. (p18) FMPY f7 = C, f118
  321. }
  322. ;;
  323. #else
  324. { .mmf
  325. (p18) STFD [X2] = f6
  326. (p16) lfetch.excl.nt1 [PREY], INCY16
  327. (p18) FMA f12 = C, f64, f12
  328. }
  329. { .mmf
  330. (p16) LDFD f95 = [Y1], INCY
  331. (p18) add X2 = X2, INCX
  332. (p18) FMPY f6 = S, f118
  333. }
  334. ;;
  335. { .mmf
  336. (p18) STFD [Y2] = f7
  337. (p16) lfetch.excl.nt1 [PREX], INCX16
  338. (p18) FNMA f13 = S, f64, f13
  339. }
  340. { .mmf
  341. (p16) LDFD f56 = [X1], INCX
  342. (p18) add Y2 = Y2, INCY
  343. (p18) FMPY f7 = C, f118
  344. }
  345. ;;
  346. #endif
  347. { .mmf
  348. (p18) STFD [X2] = f10
  349. (p16) LDFD f98 = [Y1], INCY
  350. (p18) FMA f14 = C, f67, f14
  351. }
  352. { .mmf
  353. (p18) add X2 = X2, INCX
  354. nop __LINE__
  355. (p18) FMPY f10 = S, f121
  356. }
  357. ;;
  358. { .mmf
  359. (p18) STFD [Y2] = f11
  360. (p16) LDFD f59 = [X1], INCX
  361. (p18) FNMA f15 = S, f67, f15
  362. }
  363. { .mmf
  364. (p18) add Y2 = Y2, INCY
  365. nop __LINE__
  366. (p18) FMPY f11 = C, f121
  367. }
  368. ;;
  369. { .mmf
  370. (p18) STFD [X2] = f12
  371. (p16) LDFD f101 = [Y1], INCY
  372. (p18) FMPY f12 = S, f124
  373. }
  374. { .mmf
  375. (p18) add X2 = X2, INCX
  376. nop __LINE__
  377. (p18) FMA f6 = C, f70, f6
  378. }
  379. ;;
  380. { .mmf
  381. (p18) STFD [Y2] = f13
  382. (p16) LDFD f62 = [X1], INCX
  383. (p18) FMPY f13 = C, f124
  384. }
  385. { .mmf
  386. (p18) add Y2 = Y2, INCY
  387. nop __LINE__
  388. (p18) FNMA f7 = S, f70, f7
  389. }
  390. ;;
  391. { .mmf
  392. (p18) STFD [X2] = f14
  393. (p16) LDFD f104 = [Y1], INCY
  394. (p18) FMPY f14 = S, f127
  395. }
  396. { .mmf
  397. (p18) add X2 = X2, INCX
  398. nop __LINE__
  399. (p18) FMA f10 = C, f73, f10
  400. }
  401. ;;
  402. { .mmf
  403. (p18) STFD [Y2] = f15
  404. (p16) LDFD f65 = [X1], INCX
  405. (p18) FMPY f15 = C, f127
  406. }
  407. { .mmf
  408. (p18) add Y2 = Y2, INCY
  409. nop __LINE__
  410. (p18) FNMA f11 = S, f73, f11
  411. }
  412. ;;
  413. { .mmf
  414. (p18) STFD [X2] = f6
  415. (p16) LDFD f107 = [Y1], INCY
  416. (p18) FMA f12 = C, f76, f12
  417. }
  /* From here on the FMPY/FMA/FNMA that write f6/f7, f10/f11 and the FMPYs */
  /* that write f12..f15 run under p17: they start on the elements whose loads */
  /* were issued one pass earlier. The stores sharing these bundles (p18) keep */
  /* draining the results of the pass before that. */
  418. { .mmf
  419. (p18) add X2 = X2, INCX
  420. nop __LINE__
  421. (p17) FMPY f6 = S, f81
  422. }
  423. ;;
  424. { .mmf
  425. (p18) STFD [Y2] = f7
  426. (p16) LDFD f68 = [X1], INCX
  427. (p18) FNMA f13 = S, f76, f13
  428. }
  429. { .mmf
  430. (p18) add Y2 = Y2, INCY
  431. nop __LINE__
  432. (p17) FMPY f7 = C, f81
  433. }
  434. ;;
  435. { .mmf
  436. (p18) STFD [X2] = f10
  437. (p16) LDFD f110 = [Y1], INCY
  438. (p18) FMA f14 = C, f79, f14
  439. }
  440. { .mmf
  441. (p18) add X2 = X2, INCX
  442. nop __LINE__
  443. (p17) FMPY f10 = S, f84
  444. }
  445. ;;
  446. { .mmf
  447. (p18) STFD [Y2] = f11
  448. (p16) LDFD f71 = [X1], INCX
  449. (p18) FNMA f15 = S, f79, f15
  450. }
  451. { .mmf
  452. (p18) add Y2 = Y2, INCY
  453. nop __LINE__
  454. (p17) FMPY f11 = C, f84
  455. }
  456. ;;
  457. { .mmf
  458. (p18) STFD [X2] = f12
  459. (p16) LDFD f113 = [Y1], INCY
  460. (p17) FMPY f12 = S, f87
  461. }
  462. { .mmf
  463. (p18) add X2 = X2, INCX
  464. nop __LINE__
  465. (p17) FMA f6 = C, f33, f6
  466. }
  467. ;;
  468. { .mmf
  469. (p18) STFD [Y2] = f13
  470. (p16) LDFD f74 = [X1], INCX
  471. (p17) FMPY f13 = C, f87
  472. }
  473. { .mmf
  474. (p18) add Y2 = Y2, INCY
  475. nop __LINE__
  476. (p17) FNMA f7 = S, f33, f7
  477. }
  478. ;;
  479. { .mmf
  480. (p18) STFD [X2] = f14
  481. (p16) LDFD f116 = [Y1], INCY
  482. (p17) FMPY f14 = S, f90
  483. }
  484. { .mmf
  485. (p18) add X2 = X2, INCX
  486. nop __LINE__
  487. (p17) FMA f10 = C, f36, f10
  488. }
  489. ;;
  490. { .mmf
  491. (p18) STFD [Y2] = f15
  492. (p16) LDFD f77 = [X1], INCX
  493. (p17) FMPY f15 = C, f90
  494. }
  /* Counted loop-back: br.ctop repeats while ar.lc passes remain, then keeps */
  /* looping for the ar.ec epilogue passes so the p17/p18 stages drain. */
  495. { .mfb
  496. (p18) add Y2 = Y2, INCY
  497. (p17) FNMA f11 = S, f36, f11
  498. br.ctop.sptk.few .L12
  499. }
  500. ;;
  /* Remainder and exit. The J = N & 15 elements left after the loop are */
  /* processed in straight-line code as chunks of 8, 4, 2 and 1, predicated */
  /* on p12, p13, p14 and p15 (bits 3, 2, 1 and 0 of N; p12 is set before the */
  /* loop, the others are set below). ar.lc and the saved predicates are */
  /* restored first, and the routine returns early when J is zero. */
  /* Register use per chunk (Y operands / X operands): */
  /* 8 elements: f40-f47 / f32-f39 4 elements: f52-f55 / f48-f51 */
  /* 2 elements: f58-f59 / f56-f57 1 element: f61 / f60 */
  /* The arithmetic is the same as in the loop: X gets C*x + S*y and Y gets */
  /* C*y - S*x, accumulated in f6/f7, f10/f11, f12/f13 and f14/f15. */
  501. .align 32
  502. .L15:
  503. { .mmi
  504. (p12) LDFD f40 = [Y1], INCY
  505. (p12) LDFD f32 = [X1], INCX
  506. mov ar.lc = ARLC
  507. }
  508. ;;
  /* Mask -65474 (0x...FFFF003E) restores p1-p5 and p16-p63 only, so p6-p15, */
  /* including p12 which is still needed, keep their current values. */
  509. { .mmi
  510. (p12) LDFD f41 = [Y1], INCY
  511. (p12) LDFD f33 = [X1], INCX
  512. mov pr = PR, -65474
  513. }
  514. ;;
  /* p7 = (J == 0): no leftover elements, return now. */
  515. { .mmb
  516. (p12) LDFD f42 = [Y1], INCY
  517. cmp.eq p7, p0 = r0, J
  518. (p7) br.ret.sptk.many b0
  519. }
  520. ;;
  521. { .mmf
  522. (p12) LDFD f43 = [Y1], INCY
  523. nop __LINE__
  524. (p12) FMPY f6 = S, f40
  525. }
  526. ;;
  527. { .mmf
  528. (p12) LDFD f34 = [X1], INCX
  529. nop __LINE__
  530. (p12) FMPY f7 = C, f40
  531. }
  532. ;;
  533. { .mmf
  534. (p12) LDFD f44 = [Y1], INCY
  535. nop __LINE__
  536. (p12) FMPY f10 = S, f41
  537. }
  538. ;;
  539. { .mmf
  540. (p12) LDFD f35 = [X1], INCX
  541. nop __LINE__
  542. (p12) FMPY f11 = C, f41
  543. }
  544. ;;
  545. { .mmf
  546. (p12) LDFD f45 = [Y1], INCY
  547. nop __LINE__
  548. (p12) FMPY f12 = S, f42
  549. }
  550. { .mmf
  551. nop __LINE__
  552. nop __LINE__
  553. (p12) FMA f6 = C, f32, f6
  554. }
  555. ;;
  556. { .mmf
  557. (p12) LDFD f36 = [X1], INCX
  558. nop __LINE__
  559. (p12) FMPY f13 = C, f42
  560. }
  561. { .mmf
  562. nop __LINE__
  563. nop __LINE__
  564. (p12) FNMA f7 = S, f32, f7
  565. }
  566. ;;
  567. { .mmf
  568. (p12) LDFD f46 = [Y1], INCY
  569. nop __LINE__
  570. (p12) FMPY f14 = S, f43
  571. }
  572. { .mmf
  573. nop __LINE__
  574. nop __LINE__
  575. (p12) FMA f10 = C, f33, f10
  576. }
  577. ;;
  578. { .mmf
  579. (p12) LDFD f37 = [X1], INCX
  580. nop __LINE__
  581. (p12) FMPY f15 = C, f43
  582. }
  583. { .mmf
  584. nop __LINE__
  585. nop __LINE__
  586. (p12) FNMA f11 = S, f33, f11
  587. }
  588. ;;
  589. { .mmf
  590. (p12) STFD [X2] = f6
  591. (p12) LDFD f47 = [Y1], INCY
  592. (p12) FMA f12 = C, f34, f12
  593. }
  /* p13 = bit 2 of N: a 4-element chunk is present. */
  594. { .mfi
  595. (p12) add X2 = X2, INCX
  596. (p12) FMPY f6 = S, f44
  597. tbit.z p0, p13 = N, 2
  598. }
  599. ;;
  600. { .mmf
  601. (p12) STFD [Y2] = f7
  602. (p12) LDFD f38 = [X1], INCX
  603. (p12) FNMA f13 = S, f34, f13
  604. }
  605. { .mmf
  606. (p12) add Y2 = Y2, INCY
  607. nop __LINE__
  608. (p12) FMPY f7 = C, f44
  609. }
  610. ;;
  611. { .mmf
  612. (p12) STFD [X2] = f10
  613. (p13) LDFD f52 = [Y1], INCY
  614. (p12) FMA f14 = C, f35, f14
  615. }
  616. { .mmf
  617. (p12) add X2 = X2, INCX
  618. nop __LINE__
  619. (p12) FMPY f10 = S, f45
  620. }
  621. ;;
  622. { .mmf
  623. (p12) STFD [Y2] = f11
  624. (p12) LDFD f39 = [X1], INCX
  625. (p12) FNMA f15 = S, f35, f15
  626. }
  627. { .mmf
  628. (p12) add Y2 = Y2, INCY
  629. nop __LINE__
  630. (p12) FMPY f11 = C, f45
  631. }
  632. ;;
  633. { .mmf
  634. (p12) STFD [X2] = f12
  635. (p13) LDFD f53 = [Y1], INCY
  636. (p12) FMPY f12 = S, f46
  637. }
  638. { .mmf
  639. (p12) add X2 = X2, INCX
  640. nop __LINE__
  641. (p12) FMA f6 = C, f36, f6
  642. }
  643. ;;
  644. { .mmf
  645. (p12) STFD [Y2] = f13
  646. (p13) LDFD f48 = [X1], INCX
  647. (p12) FMPY f13 = C, f46
  648. }
  649. { .mmf
  650. (p12) add Y2 = Y2, INCY
  651. nop __LINE__
  652. (p12) FNMA f7 = S, f36, f7
  653. }
  654. ;;
  655. { .mmf
  656. (p12) STFD [X2] = f14
  657. (p13) LDFD f54 = [Y1], INCY
  658. (p12) FMPY f14 = S, f47
  659. }
  660. { .mmf
  661. (p12) add X2 = X2, INCX
  662. nop __LINE__
  663. (p12) FMA f10 = C, f37, f10
  664. }
  665. ;;
  666. { .mmf
  667. (p12) STFD [Y2] = f15
  668. (p13) LDFD f49 = [X1], INCX
  669. (p12) FMPY f15 = C, f47
  670. }
  /* p14 = bit 1 of N: a 2-element chunk is present. */
  671. { .mfi
  672. (p12) add Y2 = Y2, INCY
  673. (p12) FNMA f11 = S, f37, f11
  674. tbit.z p0, p14 = N, 1
  675. }
  676. ;;
  /* The accumulators are reused from here: the p12 stores read f6/f7/f10/f11 */
  /* in the same instruction group in which the p13 multiplies overwrite them. */
  677. { .mmf
  678. (p12) STFD [X2] = f6
  679. (p13) LDFD f55 = [Y1], INCY
  680. (p12) FMA f12 = C, f38, f12
  681. }
  682. { .mmf
  683. (p12) add X2 = X2, INCX
  684. nop __LINE__
  685. (p13) FMPY f6 = S, f52
  686. }
  687. ;;
  688. { .mmf
  689. (p12) STFD [Y2] = f7
  690. (p13) LDFD f50 = [X1], INCX
  691. (p12) FNMA f13 = S, f38, f13
  692. }
  693. { .mmf
  694. (p12) add Y2 = Y2, INCY
  695. nop __LINE__
  696. (p13) FMPY f7 = C, f52
  697. }
  698. ;;
  699. { .mmf
  700. (p12) STFD [X2] = f10
  701. (p14) LDFD f58 = [Y1], INCY
  702. (p12) FMA f14 = C, f39, f14
  703. }
  704. { .mmf
  705. (p12) add X2 = X2, INCX
  706. nop __LINE__
  707. (p13) FMPY f10 = S, f53
  708. }
  709. ;;
  710. { .mmf
  711. (p12) STFD [Y2] = f11
  712. (p13) LDFD f51 = [X1], INCX
  713. (p12) FNMA f15 = S, f39, f15
  714. }
  /* p15 = bit 0 of N: a single trailing element is present. */
  715. { .mfi
  716. (p12) add Y2 = Y2, INCY
  717. (p13) FMPY f11 = C, f53
  718. tbit.z p0, p15 = N, 0
  719. }
  720. ;;
  721. { .mmf
  722. (p12) STFD [X2] = f12
  723. (p14) LDFD f59 = [Y1], INCY
  724. (p13) FMPY f12 = S, f54
  725. }
  726. { .mmf
  727. (p12) add X2 = X2, INCX
  728. nop __LINE__
  729. (p13) FMA f6 = C, f48, f6
  730. }
  731. ;;
  732. { .mmf
  733. (p12) STFD [Y2] = f13
  734. (p14) LDFD f56 = [X1], INCX
  735. (p13) FMPY f13 = C, f54
  736. }
  737. { .mmf
  738. (p12) add Y2 = Y2, INCY
  739. nop __LINE__
  740. (p13) FNMA f7 = S, f48, f7
  741. }
  742. ;;
  743. { .mmf
  744. (p12) STFD [X2] = f14
  745. (p15) LDFD f61 = [Y1], INCY
  746. (p13) FMPY f14 = S, f55
  747. }
  748. { .mmf
  749. (p12) add X2 = X2, INCX
  750. nop __LINE__
  751. (p13) FMA f10 = C, f49, f10
  752. }
  753. ;;
  754. { .mmf
  755. (p12) STFD [Y2] = f15
  756. (p14) LDFD f57 = [X1], INCX
  757. (p13) FMPY f15 = C, f55
  758. }
  759. { .mmf
  760. (p12) add Y2 = Y2, INCY
  761. nop __LINE__
  762. (p13) FNMA f11 = S, f49, f11
  763. }
  764. ;;
  /* The 8-element chunk is fully stored; what follows stores the 4-element */
  /* chunk (p13) while computing the 2-element (p14) and 1-element (p15) ones. */
  765. { .mmf
  766. (p13) STFD [X2] = f6
  767. nop __LINE__
  768. (p13) FMA f12 = C, f50, f12
  769. }
  770. { .mmf
  771. (p13) add X2 = X2, INCX
  772. nop __LINE__
  773. (p14) FMPY f6 = S, f58
  774. }
  775. ;;
  776. { .mmf
  777. (p13) STFD [Y2] = f7
  778. (p15) LDFD f60 = [X1], INCX
  779. (p13) FNMA f13 = S, f50, f13
  780. }
  781. { .mmf
  782. (p13) add Y2 = Y2, INCY
  783. nop __LINE__
  784. (p14) FMPY f7 = C, f58
  785. }
  786. ;;
  787. { .mmf
  788. (p13) STFD [X2] = f10
  789. nop __LINE__
  790. (p13) FMA f14 = C, f51, f14
  791. }
  792. { .mmf
  793. (p13) add X2 = X2, INCX
  794. nop __LINE__
  795. (p14) FMPY f10 = S, f59
  796. }
  797. ;;
  798. { .mmf
  799. (p13) STFD [Y2] = f11
  800. nop __LINE__
  801. (p13) FNMA f15 = S, f51, f15
  802. }
  803. { .mmf
  804. (p13) add Y2 = Y2, INCY
  805. nop __LINE__
  806. (p14) FMPY f11 = C, f59
  807. }
  808. ;;
  809. { .mmf
  810. (p13) STFD [X2] = f12
  811. nop __LINE__
  812. (p14) FMA f6 = C, f56, f6
  813. }
  814. { .mmf
  815. (p13) add X2 = X2, INCX
  816. nop __LINE__
  817. (p15) FMPY f12 = S, f61
  818. }
  819. ;;
  820. { .mmf
  821. (p13) STFD [Y2] = f13
  822. nop __LINE__
  823. (p14) FNMA f7 = S, f56, f7
  824. }
  825. { .mmf
  826. (p13) add Y2 = Y2, INCY
  827. nop __LINE__
  828. (p15) FMPY f13 = C, f61
  829. }
  830. ;;
  831. { .mmf
  832. (p13) STFD [X2] = f14
  833. (p13) add X2 = X2, INCX
  834. (p14) FMA f10 = C, f57, f10
  835. }
  836. ;;
  837. { .mmf
  838. (p13) STFD [Y2] = f15
  839. (p13) add Y2 = Y2, INCY
  840. (p14) FNMA f11 = S, f57, f11
  841. }
  842. ;;
  843. { .mmf
  844. (p14) STFD [X2] = f6
  845. (p14) add X2 = X2, INCX
  846. (p15) FMA f12 = C, f60, f12
  847. }
  848. ;;
  849. { .mmf
  850. (p14) STFD [Y2] = f7
  851. (p14) add Y2 = Y2, INCY
  852. (p15) FNMA f13 = S, f60, f13
  853. }
  854. ;;
  855. { .mmi
  856. (p14) STFD [X2] = f10
  857. (p14) add X2 = X2, INCX
  858. nop __LINE__
  859. }
  860. ;;
  861. { .mmi
  862. (p14) STFD [Y2] = f11
  863. (p14) add Y2 = Y2, INCY
  864. nop __LINE__
  865. }
  866. ;;
  867. { .mmi
  868. (p15) STFD [X2] = f12
  869. (p15) add X2 = X2, INCX
  870. nop __LINE__
  871. }
  872. ;;
  /* Final store of the single trailing element, then return to the caller. */
  873. { .mmb
  874. (p15) STFD [Y2] = f13
  875. (p15) add Y2 = Y2, INCY
  876. br.ret.sptk.many b0
  877. }
  878. ;;
  879. EPILOGUE