You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zrot.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* PREFETCH_SIZE: lead distance, counted in SIZE-byte units, that is added
     to X1/Y1 to form the initial PREX/PREY prefetch pointers.  One value is
     selected per build: XDOUBLE, DOUBLE, or neither. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 8 + 4)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 8 + 8)
  44. #else
  45. #define PREFETCH_SIZE (32 * 8 + 16)
  46. #endif
  /* Values the routine receives in r32-r36 (presumably the incoming argument
     registers -- confirm against the caller-side prototype).
     N is the count tested against 0 and split into N >> 3 and N & 7;
     X1/Y1 are the load pointers and INCX/INCY their strides. */
  47. #define N r32
  48. #define X1 r33
  49. #define INCX r34
  50. #define Y1 r35
  51. #define INCY r36
  /* Prefetch pointers used by the lfetch instructions in the main loop. */
  52. #define PREX r2
  53. #define PREY r3
  /* I: main-loop trip count minus one (copied into ar.lc).
     J: N & 7, the number of pairs left for the code at .L15. */
  54. #define I r14
  55. #define J r15
  /* Store pointers; initialised as copies of X1/Y1 and advanced separately
     because stores trail the loads. */
  56. #define Y2 r16
  57. #define X2 r17
  /* Post-increment applied to PREX/PREY by each lfetch. */
  58. #define INCX16 r18
  59. #define INCY16 r19
  /* Scratch copies of pr and ar.lc, restored at .L15 before returning. */
  60. #define PR r30
  61. #define ARLC r31
  /* The two multipliers of the rotation (x' = C*x + S*y, y' = C*y - S*x).
     Under XDOUBLE, S is loaded from memory at r12 + 16 instead of being used
     as received in f9. */
  62. #define C f8
  63. #define S f9
  /*
   * Entry and one-time setup.
   *
   * Returns immediately when N <= 0.  Otherwise it saves ar.lc and pr,
   * converts INCX/INCY into byte steps, prepares the rotating-predicate
   * pipeline (pr.rot = 0, p16 = 1, ar.ec = 3, ar.lc = (N >> 3) - 1) and
   * either falls into the main loop at .L12 or, when N < 8, branches
   * straight to the remainder code at .L15.
   *
   * PROLOGUE, PROFCODE, LDFD, SIZE and BASE_SHIFT are not defined in this
   * file (presumably they come from common.h).
   */
  64. PROLOGUE
  65. .prologue
  66. PROFCODE
  /* r29 = r12 + 16 (only read under XDOUBLE, to load S).  INCX is doubled,
     presumably because every element is a pair of scalars.  ar.lc is saved
     in ARLC and recorded for the unwinder with .save. */
  67. { .mmi
  68. adds r29 = 16, r12
  69. add INCX = INCX, INCX
  70. .save ar.lc, ARLC
  71. mov ARLC = ar.lc
  72. }
  /* p6 = !(0 < N): nothing to do for N <= 0, so return.  I = N >> 3. */
  73. { .mib
  74. cmp.lt p0, p6 = r0, N
  75. shr I = N, 3
  76. (p6) br.ret.spnt.many b0
  77. }
  78. ;;
  79. .body
  /* XDOUBLE only: fetch S from [r29].  INCY is doubled like INCX, and the
     predicate file is saved in PR (restored at .L15). */
  80. { .mmi
  81. #ifdef XDOUBLE
  82. LDFD S = [r29]
  83. #else
  84. nop __LINE__
  85. #endif
  86. add INCY = INCY, INCY
  87. mov PR = pr
  88. }
  /* Store pointers start where the load pointers start; clear the rotating
     predicates so no pipeline stage is active yet. */
  89. { .mmi
  90. mov X2 = X1
  91. mov Y2 = Y1
  92. mov pr.rot= 0
  93. }
  94. ;;
  /* shladd with r0 as addend is a plain left shift: both strides are scaled
     by 2^BASE_SHIFT.  ar.ec = 3 sets the epilogue count for br.ctop. */
  95. { .mmi
  96. shladd INCX = INCX, BASE_SHIFT, r0
  97. shladd INCY = INCY, BASE_SHIFT, r0
  98. mov ar.ec= 3
  99. }
  /* I = (N >> 3) - 1 is the value later written to ar.lc.  r0 == r0 is
     always true, so p16 = 1 enables the first pipeline stage.  J = N & 7. */
  100. { .mmi
  101. adds I = -1, I
  102. cmp.eq p16, p0 = r0, r0
  103. and J = 7, N
  104. }
  105. ;;
  /* Prefetch step: 8 times the scaled stride (4 times under XDOUBLE). */
  106. { .mmi
  107. #ifndef XDOUBLE
  108. shladd INCX16 = INCX, 3, r0
  109. shladd INCY16 = INCY, 3, r0
  110. #else
  111. shladd INCX16 = INCX, 2, r0
  112. shladd INCY16 = INCY, 2, r0
  113. #endif
  114. nop __LINE__
  115. }
  /* Each pair is accessed as "[p], SIZE" followed by "[p], INCX" (or INCY),
     so SIZE is subtracted here to keep the net advance per pair equal to the
     scaled stride computed above. */
  116. { .mmi
  117. adds INCX = -SIZE, INCX
  118. adds INCY = -SIZE, INCY
  119. nop __LINE__
  120. }
  121. ;;
  /* Prefetch pointers start PREFETCH_SIZE * SIZE bytes past X1/Y1; load the
     loop counter. */
  122. { .mmi
  123. adds PREX = PREFETCH_SIZE * SIZE, X1
  124. adds PREY = PREFETCH_SIZE * SIZE, Y1
  125. mov ar.lc = I
  126. }
  /* I == -1 means N >> 3 == 0 (N < 8): skip the main loop.  tbit.z puts the
     complement in its second target, so p12 = bit 2 of N, i.e. four
     leftover pairs must be handled at .L15. */
  127. { .mib
  128. cmp.eq p6 ,p0 = -1, I
  129. tbit.z p0, p12 = N, 2
  130. (p6) br.cond.dpnt .L15
  131. }
  132. ;;
  /*
   * .L12: main loop, closed by br.ctop.  It is entered with ar.lc =
   * (N >> 3) - 1, ar.ec = 3 and only p16 set, so the stage predicates
   * switch on one after another as the loop rotates.
   *
   * One trip issues 16 LDFD from X1 and 16 from Y1 (8 pairs of scalars) and
   * 16 STFD each to X2 and Y2.  Most loads run under p16 (five of the Y1
   * loads run under p17); arithmetic runs under p17/p18 and stores under
   * p18, plus one p19 store at the top that belongs to the previous trip.
   * A value loaded into fN is read one or two rotations later as fN+1 or
   * fN+2, which is why load targets and arithmetic sources never share a
   * register number.
   *
   * For each scalar x from X and y from Y the non-rotating temporaries
   * f6, f7 and f10-f15 are used as follows:
   *     FMPY t = S, y ; FMA  t = C, x, t   -> stored through X2
   *     FMPY u = C, y ; FNMA u = S, x, u   -> stored through Y2
   * NOTE(review): this reads FMA as a*b + c and FNMA as c - a*b; the macros
   * are defined outside this file -- confirm in common.h.
   *
   * Pointer updates sit in the same instruction group as the access that
   * uses the old pointer value, alternating a SIZE step with an INCX/INCY
   * step.
   */
  133. .align 32
  134. .L12:
  /* Last Y2 store of the previous trip (p19), and the X prefetch. */
  135. { .mmf
  136. (p19) STFD [Y2] = f15
  137. (p16) lfetch.excl.nt1 [PREX], INCX16
  138. (p18) FMPY f15 = C, f91
  139. }
  140. { .mmf
  141. (p16) LDFD f32 = [X1], SIZE
  142. (p19) add Y2 = Y2, INCY
  143. (p18) FNMA f11 = S, f37, f11
  144. }
  145. ;;
  /* Y prefetch. */
  146. { .mmf
  147. (p18) STFD [X2] = f6
  148. (p16) lfetch.excl.nt1 [PREY], INCY16
  149. (p18) FMA f12 = C, f40, f12
  150. }
  151. { .mmf
  152. (p17) LDFD f114 = [Y1], INCY
  153. (p18) adds X2 = SIZE, X2
  154. (p18) FMPY f6 = S, f94
  155. }
  156. ;;
  157. { .mmf
  158. (p18) STFD [Y2] = f7
  159. (p16) LDFD f35 = [X1], INCX
  160. (p18) FNMA f13 = S, f40, f13
  161. }
  162. { .mmf
  163. nop __LINE__
  164. (p18) adds Y2 = SIZE, Y2
  165. (p18) FMPY f7 = C, f94
  166. }
  167. ;;
  168. { .mmf
  169. (p18) STFD [X2] = f10
  170. (p17) LDFD f117 = [Y1], SIZE
  171. (p18) FMA f14 = C, f43, f14
  172. }
  173. { .mmf
  174. (p18) add X2 = X2, INCX
  175. nop __LINE__
  176. (p18) FMPY f10 = S, f97
  177. }
  178. ;;
  179. { .mmf
  180. (p18) STFD [Y2] = f11
  181. (p16) LDFD f38 = [X1], SIZE
  182. (p18) FNMA f15 = S, f43, f15
  183. }
  184. { .mmf
  185. (p18) add Y2 = Y2, INCY
  186. nop __LINE__
  187. (p18) FMPY f11 = C, f97
  188. }
  189. ;;
  190. { .mmf
  191. (p18) STFD [X2] = f12
  192. (p17) LDFD f120 = [Y1], INCY
  193. (p18) FMPY f12 = S, f100
  194. }
  195. { .mmf
  196. (p18) adds X2 = SIZE, X2
  197. nop __LINE__
  198. (p18) FMA f6 = C, f46, f6
  199. }
  200. ;;
  201. { .mmf
  202. (p18) STFD [Y2] = f13
  203. (p16) LDFD f41 = [X1], INCX
  204. (p18) FMPY f13 = C, f100
  205. }
  206. { .mmf
  207. (p18) adds Y2 = SIZE, Y2
  208. nop __LINE__
  209. (p18) FNMA f7 = S, f46, f7
  210. }
  211. ;;
  212. { .mmf
  213. (p18) STFD [X2] = f14
  214. (p17) LDFD f123 = [Y1], SIZE
  215. (p18) FMPY f14 = S, f103
  216. }
  217. { .mmf
  218. (p18) add X2 = X2, INCX
  219. nop __LINE__
  220. (p18) FMA f10 = C, f49, f10
  221. }
  222. ;;
  223. { .mmf
  224. (p18) STFD [Y2] = f15
  225. (p16) LDFD f44 = [X1], SIZE
  226. (p18) FMPY f15 = C, f103
  227. }
  228. { .mmf
  229. (p18) add Y2 = Y2, INCY
  230. nop __LINE__
  231. (p18) FNMA f11 = S, f49, f11
  232. }
  233. ;;
  234. { .mmf
  235. (p18) STFD [X2] = f6
  236. (p17) LDFD f126 = [Y1], INCY
  237. (p18) FMA f12 = C, f52, f12
  238. }
  239. { .mmf
  240. (p18) adds X2 = SIZE, X2
  241. nop __LINE__
  242. (p18) FMPY f6 = S, f106
  243. }
  244. ;;
  245. { .mmf
  246. (p18) STFD [Y2] = f7
  247. (p16) LDFD f47 = [X1], INCX
  248. (p18) FNMA f13 = S, f52, f13
  249. }
  250. { .mmf
  251. (p18) adds Y2 = SIZE, Y2
  252. nop __LINE__
  253. (p18) FMPY f7 = C, f106
  254. }
  255. ;;
  /* From here on the Y1 loads are issued under p16 instead of p17. */
  256. { .mmf
  257. (p18) STFD [X2] = f10
  258. (p16) LDFD f80 = [Y1], SIZE
  259. (p18) FMA f14 = C, f55, f14
  260. }
  261. { .mmf
  262. (p18) add X2 = X2, INCX
  263. nop __LINE__
  264. (p18) FMPY f10 = S, f109
  265. }
  266. ;;
  267. { .mmf
  268. (p18) STFD [Y2] = f11
  269. (p16) LDFD f50 = [X1], SIZE
  270. (p18) FNMA f15 = S, f55, f15
  271. }
  272. { .mmf
  273. (p18) add Y2 = Y2, INCY
  274. nop __LINE__
  275. (p18) FMPY f11 = C, f109
  276. }
  277. ;;
  278. { .mmf
  279. (p18) STFD [X2] = f12
  280. (p16) LDFD f83 = [Y1], INCY
  281. (p18) FMPY f12 = S, f112
  282. }
  283. { .mmf
  284. (p18) adds X2 = SIZE, X2
  285. nop __LINE__
  286. (p18) FMA f6 = C, f58, f6
  287. }
  288. ;;
  289. { .mmf
  290. (p18) STFD [Y2] = f13
  291. (p16) LDFD f53 = [X1], INCX
  292. (p18) FMPY f13 = C, f112
  293. }
  294. { .mmf
  295. (p18) adds Y2 = SIZE, Y2
  296. nop __LINE__
  297. (p18) FNMA f7 = S, f58, f7
  298. }
  299. ;;
  300. { .mmf
  301. (p18) STFD [X2] = f14
  302. (p16) LDFD f86 = [Y1], SIZE
  303. (p18) FMPY f14 = S, f115
  304. }
  305. { .mmf
  306. (p18) add X2 = X2, INCX
  307. nop __LINE__
  308. (p18) FMA f10 = C, f61, f10
  309. }
  310. ;;
  311. { .mmf
  312. (p18) STFD [Y2] = f15
  313. (p16) LDFD f56 = [X1], SIZE
  314. (p18) FMPY f15 = C, f115
  315. }
  316. { .mmf
  317. (p18) add Y2 = Y2, INCY
  318. nop __LINE__
  319. (p18) FNMA f11 = S, f61, f11
  320. }
  321. ;;
  /* Both variants below do the same loads, stores and arithmetic; the
     XDOUBLE variant additionally issues a second PREY/PREX prefetch per
     trip, matching its smaller INCX16/INCY16 step. */
  322. #ifndef XDOUBLE
  323. { .mmf
  324. (p18) STFD [X2] = f6
  325. (p16) LDFD f89 = [Y1], INCY
  326. (p18) FMA f12 = C, f64, f12
  327. }
  328. { .mmf
  329. (p18) adds X2 = SIZE, X2
  330. nop __LINE__
  331. (p18) FMPY f6 = S, f118
  332. }
  333. ;;
  334. { .mmf
  335. (p18) STFD [Y2] = f7
  336. (p16) LDFD f59 = [X1], INCX
  337. (p18) FNMA f13 = S, f64, f13
  338. }
  339. { .mmf
  340. (p18) adds Y2 = SIZE, Y2
  341. nop __LINE__
  342. (p18) FMPY f7 = C, f118
  343. }
  344. ;;
  345. #else
  346. { .mmf
  347. (p18) STFD [X2] = f6
  348. (p16) lfetch.excl.nt1 [PREY], INCY16
  349. (p18) FMA f12 = C, f64, f12
  350. }
  351. { .mmf
  352. (p16) LDFD f89 = [Y1], INCY
  353. (p18) adds X2 = SIZE, X2
  354. (p18) FMPY f6 = S, f118
  355. }
  356. ;;
  357. { .mmf
  358. (p18) STFD [Y2] = f7
  359. (p16) lfetch.excl.nt1 [PREX], INCX16
  360. (p18) FNMA f13 = S, f64, f13
  361. }
  362. { .mmf
  363. (p16) LDFD f59 = [X1], INCX
  364. (p18) adds Y2 = SIZE, Y2
  365. (p18) FMPY f7 = C, f118
  366. }
  367. ;;
  368. #endif
  369. { .mmf
  370. (p18) STFD [X2] = f10
  371. (p16) LDFD f92 = [Y1], SIZE
  372. (p18) FMA f14 = C, f67, f14
  373. }
  374. { .mmf
  375. (p18) add X2 = X2, INCX
  376. nop __LINE__
  377. (p18) FMPY f10 = S, f121
  378. }
  379. ;;
  380. { .mmf
  381. (p18) STFD [Y2] = f11
  382. (p16) LDFD f62 = [X1], SIZE
  383. (p18) FNMA f15 = S, f67, f15
  384. }
  385. { .mmf
  386. (p18) add Y2 = Y2, INCY
  387. nop __LINE__
  388. (p18) FMPY f11 = C, f121
  389. }
  390. ;;
  391. { .mmf
  392. (p18) STFD [X2] = f12
  393. (p16) LDFD f95 = [Y1], INCY
  394. (p18) FMPY f12 = S, f124
  395. }
  396. { .mmf
  397. (p18) adds X2 = SIZE, X2
  398. nop __LINE__
  399. (p18) FMA f6 = C, f70, f6
  400. }
  401. ;;
  402. { .mmf
  403. (p18) STFD [Y2] = f13
  404. (p16) LDFD f65 = [X1], INCX
  405. (p18) FMPY f13 = C, f124
  406. }
  407. { .mmf
  408. (p18) adds Y2 = SIZE, Y2
  409. nop __LINE__
  410. (p18) FNMA f7 = S, f70, f7
  411. }
  412. ;;
  413. { .mmf
  414. (p18) STFD [X2] = f14
  415. (p16) LDFD f98 = [Y1], SIZE
  416. (p18) FMPY f14 = S, f127
  417. }
  418. { .mmf
  419. (p18) add X2 = X2, INCX
  420. nop __LINE__
  421. (p18) FMA f10 = C, f73, f10
  422. }
  423. ;;
  424. { .mmf
  425. (p18) STFD [Y2] = f15
  426. (p16) LDFD f68 = [X1], SIZE
  427. (p18) FMPY f15 = C, f127
  428. }
  429. { .mmf
  430. (p18) add Y2 = Y2, INCY
  431. nop __LINE__
  432. (p18) FNMA f11 = S, f73, f11
  433. }
  434. ;;
  /* From here the multiplies run under p17: they start the arithmetic for
     the data loaded one rotation earlier, whose results are stored (under
     p18) near the top of the next trip. */
  435. { .mmf
  436. (p18) STFD [X2] = f6
  437. (p16) LDFD f101 = [Y1], INCY
  438. (p18) FMA f12 = C, f76, f12
  439. }
  440. { .mmf
  441. (p18) adds X2 = SIZE, X2
  442. nop __LINE__
  443. (p17) FMPY f6 = S, f81
  444. }
  445. ;;
  446. { .mmf
  447. (p18) STFD [Y2] = f7
  448. (p16) LDFD f71 = [X1], INCX
  449. (p18) FNMA f13 = S, f76, f13
  450. }
  451. { .mmf
  452. (p18) adds Y2 = SIZE, Y2
  453. nop __LINE__
  454. (p17) FMPY f7 = C, f81
  455. }
  456. ;;
  457. { .mmf
  458. (p18) STFD [X2] = f10
  459. (p16) LDFD f104 = [Y1], SIZE
  460. (p18) FMA f14 = C, f79, f14
  461. }
  462. { .mmf
  463. (p18) add X2 = X2, INCX
  464. nop __LINE__
  465. (p17) FMPY f10 = S, f84
  466. }
  467. ;;
  468. { .mmf
  469. (p18) STFD [Y2] = f11
  470. (p16) LDFD f74 = [X1], SIZE
  471. (p18) FNMA f15 = S, f79, f15
  472. }
  473. { .mmf
  474. (p18) add Y2 = Y2, INCY
  475. nop __LINE__
  476. (p17) FMPY f11 = C, f84
  477. }
  478. ;;
  479. { .mmf
  480. (p18) STFD [X2] = f12
  481. (p16) LDFD f107 = [Y1], INCY
  482. (p17) FMPY f12 = S, f87
  483. }
  484. { .mmf
  485. (p18) adds X2 = SIZE, X2
  486. nop __LINE__
  487. (p17) FMA f6 = C, f33, f6
  488. }
  489. ;;
  490. { .mmf
  491. (p18) STFD [Y2] = f13
  492. (p16) LDFD f77 = [X1], INCX
  493. (p17) FMPY f13 = C, f87
  494. }
  495. { .mmf
  496. (p18) adds Y2 = SIZE, Y2
  497. nop __LINE__
  498. (p17) FNMA f7 = S, f33, f7
  499. }
  500. ;;
  /* Last X2 store of the trip, then rotate and loop. */
  501. { .mmf
  502. (p18) STFD [X2] = f14
  503. (p16) LDFD f110 = [Y1], SIZE
  504. (p17) FMPY f14 = S, f90
  505. }
  506. { .mfb
  507. (p18) add X2 = X2, INCX
  508. (p17) FMA f10 = C, f36, f10
  509. br.ctop.sptk.few .L12
  510. }
  511. ;;
  /* After the loop exits: the one Y2 store (f15) that is still pending under
     p19, mirroring the p19 store at the top of the loop body. */
  512. { .mmi
  513. (p19) STFD [Y2] = f15
  514. (p19) add Y2 = Y2, INCY
  515. nop __LINE__
  516. }
  517. { .mmi
  518. nop __LINE__
  519. nop __LINE__
  520. nop __LINE__
  521. }
  522. ;;
  /*
   * .L15: remainder handling and return.
   *
   * Restores ar.lc and the saved predicates, returns if J (= N & 7) is 0,
   * and otherwise processes the leftover pairs in three predicated,
   * interleaved groups selected by the low bits of N:
   *     p12 (bit 2 of N): 4 pairs, X in f32-f39, Y in f40-f47
   *     p13 (bit 1 of N): 2 pairs, X in f48-f51, Y in f52-f55
   *     p14 (bit 0 of N): 1 pair,  X in f56-f57, Y in f58-f59
   * The arithmetic is the same as in the main loop, using temporaries
   * f6, f7, f10-f15:
   *     FMPY t = S, y ; FMA  t = C, x, t   -> stored through X2
   *     FMPY u = C, y ; FNMA u = S, x, u   -> stored through Y2
   * NOTE(review): this reads FMA as a*b + c and FNMA as c - a*b; the macros
   * are defined outside this file -- confirm in common.h.
   *
   * Stores alternate "[p] = f, SIZE" (post-increment) with a plain store
   * followed by an explicit add of INCX/INCY in the same instruction group.
   */
  523. .align 32
  524. .L15:
  /* p12 was set before the main loop.  These loads run before the J == 0
     test below, but p12 can only be set when N & 7 is non-zero. */
  525. { .mmi
  526. (p12) LDFD f40 = [Y1], SIZE
  527. (p12) LDFD f32 = [X1], SIZE
  528. mov ar.lc = ARLC
  529. }
  530. ;;
  /* Restore predicates from PR.  The mask -65474 (= ~0xFFC1) has bits 1-5
     and 16 and above set, so p6-p15 -- including the live p12 -- keep their
     current values. */
  531. { .mmi
  532. (p12) LDFD f41 = [Y1], INCY
  533. (p12) LDFD f33 = [X1], INCX
  534. mov pr = PR, -65474
  535. }
  536. ;;
  /* J == 0: N was a multiple of 8, nothing is left to do. */
  537. { .mmb
  538. (p12) LDFD f42 = [Y1], SIZE
  539. cmp.eq p7, p0 = r0, J
  540. (p7) br.ret.sptk.many b0
  541. }
  542. ;;
  /* Start of the p12 group: multiplies begin as the Y values arrive while
     the remaining X/Y loads continue. */
  543. { .mmf
  544. (p12) LDFD f43 = [Y1], INCY
  545. nop __LINE__
  546. (p12) FMPY f6 = S, f40
  547. }
  548. ;;
  549. { .mmf
  550. (p12) LDFD f34 = [X1], SIZE
  551. nop __LINE__
  552. (p12) FMPY f7 = C, f40
  553. }
  554. ;;
  555. { .mmf
  556. (p12) LDFD f44 = [Y1], SIZE
  557. nop __LINE__
  558. (p12) FMPY f10 = S, f41
  559. }
  560. ;;
  561. { .mmf
  562. (p12) LDFD f35 = [X1], INCX
  563. nop __LINE__
  564. (p12) FMPY f11 = C, f41
  565. }
  566. ;;
  567. { .mmf
  568. (p12) LDFD f45 = [Y1], INCY
  569. nop __LINE__
  570. (p12) FMPY f12 = S, f42
  571. }
  572. { .mmf
  573. nop __LINE__
  574. nop __LINE__
  575. (p12) FMA f6 = C, f32, f6
  576. }
  577. ;;
  578. { .mmf
  579. (p12) LDFD f36 = [X1], SIZE
  580. nop __LINE__
  581. (p12) FMPY f13 = C, f42
  582. }
  583. { .mmf
  584. nop __LINE__
  585. nop __LINE__
  586. (p12) FNMA f7 = S, f32, f7
  587. }
  588. ;;
  589. { .mmf
  590. (p12) LDFD f46 = [Y1], SIZE
  591. nop __LINE__
  592. (p12) FMPY f14 = S, f43
  593. }
  594. { .mmf
  595. nop __LINE__
  596. nop __LINE__
  597. (p12) FMA f10 = C, f33, f10
  598. }
  599. ;;
  600. { .mmf
  601. (p12) LDFD f37 = [X1], INCX
  602. nop __LINE__
  603. (p12) FMPY f15 = C, f43
  604. }
  605. { .mmf
  606. nop __LINE__
  607. nop __LINE__
  608. (p12) FNMA f11 = S, f33, f11
  609. }
  610. ;;
  /* First results of the p12 group are stored.  p13 = bit 1 of N (two more
     pairs to process). */
  611. { .mmf
  612. (p12) STFD [X2] = f6, SIZE
  613. (p12) LDFD f47 = [Y1], INCY
  614. (p12) FMA f12 = C, f34, f12
  615. }
  616. { .mfi
  617. nop __LINE__
  618. (p12) FMPY f6 = S, f44
  619. tbit.z p0, p13 = N, 1
  620. }
  621. ;;
  622. { .mmf
  623. (p12) STFD [Y2] = f7, SIZE
  624. (p12) LDFD f38 = [X1], SIZE
  625. (p12) FNMA f13 = S, f34, f13
  626. }
  627. { .mmf
  628. nop __LINE__
  629. nop __LINE__
  630. (p12) FMPY f7 = C, f44
  631. }
  632. ;;
  /* Loads for the p13 group start here, overlapped with the p12 stores. */
  633. { .mmf
  634. (p12) STFD [X2] = f10
  635. (p13) LDFD f52 = [Y1], SIZE
  636. (p12) FMA f14 = C, f35, f14
  637. }
  638. { .mmf
  639. (p12) add X2 = X2, INCX
  640. nop __LINE__
  641. (p12) FMPY f10 = S, f45
  642. }
  643. ;;
  644. { .mmf
  645. (p12) STFD [Y2] = f11
  646. (p12) LDFD f39 = [X1], INCX
  647. (p12) FNMA f15 = S, f35, f15
  648. }
  649. { .mmf
  650. (p12) add Y2 = Y2, INCY
  651. nop __LINE__
  652. (p12) FMPY f11 = C, f45
  653. }
  654. ;;
  655. { .mmf
  656. (p12) STFD [X2] = f12, SIZE
  657. (p13) LDFD f53 = [Y1], INCY
  658. (p12) FMPY f12 = S, f46
  659. }
  660. { .mmf
  661. nop __LINE__
  662. nop __LINE__
  663. (p12) FMA f6 = C, f36, f6
  664. }
  665. ;;
  666. { .mmf
  667. (p12) STFD [Y2] = f13, SIZE
  668. (p13) LDFD f48 = [X1], SIZE
  669. (p12) FMPY f13 = C, f46
  670. }
  671. { .mmf
  672. nop __LINE__
  673. nop __LINE__
  674. (p12) FNMA f7 = S, f36, f7
  675. }
  676. ;;
  677. { .mmf
  678. (p12) STFD [X2] = f14
  679. (p13) LDFD f54 = [Y1], SIZE
  680. (p12) FMPY f14 = S, f47
  681. }
  682. { .mmf
  683. (p12) add X2 = X2, INCX
  684. nop __LINE__
  685. (p12) FMA f10 = C, f37, f10
  686. }
  687. ;;
  /* p14 = bit 0 of N (one final pair to process). */
  688. { .mmf
  689. (p12) STFD [Y2] = f15
  690. (p13) LDFD f49 = [X1], INCX
  691. (p12) FMPY f15 = C, f47
  692. }
  693. { .mfi
  694. (p12) add Y2 = Y2, INCY
  695. (p12) FNMA f11 = S, f37, f11
  696. tbit.z p0, p14 = N, 0
  697. }
  698. ;;
  /* Arithmetic for the p13 group starts while the p12 group drains. */
  699. { .mmf
  700. (p12) STFD [X2] = f6, SIZE
  701. (p13) LDFD f55 = [Y1], INCY
  702. (p12) FMA f12 = C, f38, f12
  703. }
  704. { .mmf
  705. nop __LINE__
  706. nop __LINE__
  707. (p13) FMPY f6 = S, f52
  708. }
  709. ;;
  710. { .mmf
  711. (p12) STFD [Y2] = f7, SIZE
  712. (p13) LDFD f50 = [X1], SIZE
  713. (p12) FNMA f13 = S, f38, f13
  714. }
  715. { .mmf
  716. nop __LINE__
  717. nop __LINE__
  718. (p13) FMPY f7 = C, f52
  719. }
  720. ;;
  /* Loads for the p14 group start here. */
  721. { .mmf
  722. (p12) STFD [X2] = f10
  723. (p14) LDFD f58 = [Y1], SIZE
  724. (p12) FMA f14 = C, f39, f14
  725. }
  726. { .mmf
  727. (p12) add X2 = X2, INCX
  728. nop __LINE__
  729. (p13) FMPY f10 = S, f53
  730. }
  731. ;;
  732. { .mmf
  733. (p12) STFD [Y2] = f11
  734. (p13) LDFD f51 = [X1], INCX
  735. (p12) FNMA f15 = S, f39, f15
  736. }
  737. { .mmf
  738. (p12) add Y2 = Y2, INCY
  739. nop __LINE__
  740. (p13) FMPY f11 = C, f53
  741. }
  742. ;;
  743. { .mmf
  744. (p12) STFD [X2] = f12, SIZE
  745. (p14) LDFD f59 = [Y1], INCY
  746. (p13) FMPY f12 = S, f54
  747. }
  748. { .mmf
  749. nop __LINE__
  750. nop __LINE__
  751. (p13) FMA f6 = C, f48, f6
  752. }
  753. ;;
  754. { .mmf
  755. (p12) STFD [Y2] = f13, SIZE
  756. (p14) LDFD f56 = [X1], SIZE
  757. (p13) FMPY f13 = C, f54
  758. }
  759. { .mmf
  760. nop __LINE__
  761. nop __LINE__
  762. (p13) FNMA f7 = S, f48, f7
  763. }
  764. ;;
  /* Last two stores of the p12 group. */
  765. { .mmf
  766. (p12) STFD [X2] = f14
  767. (p12) add X2 = X2, INCX
  768. (p13) FMPY f14 = S, f55
  769. }
  770. { .mmf
  771. nop __LINE__
  772. nop __LINE__
  773. (p13) FMA f10 = C, f49, f10
  774. }
  775. ;;
  776. { .mmf
  777. (p12) STFD [Y2] = f15
  778. (p14) LDFD f57 = [X1], INCX
  779. (p13) FMPY f15 = C, f55
  780. }
  781. { .mmf
  782. (p12) add Y2 = Y2, INCY
  783. nop __LINE__
  784. (p13) FNMA f11 = S, f49, f11
  785. }
  786. ;;
  /* p13 results are stored while the p14 arithmetic starts. */
  787. { .mmf
  788. (p13) STFD [X2] = f6, SIZE
  789. nop __LINE__
  790. (p13) FMA f12 = C, f50, f12
  791. }
  792. { .mmf
  793. nop __LINE__
  794. nop __LINE__
  795. (p14) FMPY f6 = S, f58
  796. }
  797. ;;
  798. { .mmf
  799. (p13) STFD [Y2] = f7, SIZE
  800. nop __LINE__
  801. (p13) FNMA f13 = S, f50, f13
  802. }
  803. { .mmf
  804. nop __LINE__
  805. nop __LINE__
  806. (p14) FMPY f7 = C, f58
  807. }
  808. ;;
  809. { .mmf
  810. (p13) STFD [X2] = f10
  811. (p13) add X2 = X2, INCX
  812. (p13) FMA f14 = C, f51, f14
  813. }
  814. { .mmf
  815. nop __LINE__
  816. nop __LINE__
  817. (p14) FMPY f10 = S, f59
  818. }
  819. ;;
  820. { .mmf
  821. (p13) STFD [Y2] = f11
  822. (p13) add Y2 = Y2, INCY
  823. (p13) FNMA f15 = S, f51, f15
  824. }
  825. { .mmf
  826. nop __LINE__
  827. nop __LINE__
  828. (p14) FMPY f11 = C, f59
  829. }
  830. ;;
  831. { .mmf
  832. (p13) STFD [X2] = f12, SIZE
  833. nop __LINE__
  834. (p14) FMA f6 = C, f56, f6
  835. }
  836. ;;
  837. { .mmf
  838. (p13) STFD [Y2] = f13, SIZE
  839. nop __LINE__
  840. (p14) FNMA f7 = S, f56, f7
  841. }
  842. ;;
  843. { .mmf
  844. (p13) STFD [X2] = f14
  845. (p13) add X2 = X2, INCX
  846. (p14) FMA f10 = C, f57, f10
  847. }
  848. ;;
  849. { .mmf
  850. (p13) STFD [Y2] = f15
  851. (p13) add Y2 = Y2, INCY
  852. (p14) FNMA f11 = S, f57, f11
  853. }
  854. ;;
  /* Final pair (p14): store both results for each of its two scalars, then
     return. */
  855. { .mmi
  856. (p14) STFD [X2] = f6, SIZE
  857. (p14) STFD [Y2] = f7, SIZE
  858. nop __LINE__
  859. }
  860. ;;
  861. { .mmb
  862. (p14) STFD [X2] = f10
  863. (p14) STFD [Y2] = f11
  864. br.ret.sptk.many b0
  865. }
  866. ;;
  867. EPILOGUE