You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

saxpy.S 31 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time switches and register aliases used by the routine that follows. */
  /* ASSEMBLER is defined before common.h is included (presumably consumed there - confirm). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance; it is multiplied by SIZE where PRE1/PRE2 are initialised. */
  40. #define PREFETCHSIZE 64 * 8
  /* Values the code reads before writing them: element count, vector pointers, strides. */
  41. #define N r32
  42. #define X r36
  43. #define INCX r37
  44. #define Y r38
  45. #define INCY r39
  /* Addresses handed to the lfetch instructions. */
  46. #define PRE1 r2
  47. #define PRE2 r3
  /* I: loop trip count copied into ar.lc. J: leftover element count (N masked with 15 or 31). */
  48. #define I r14
  49. #define J r15
  /* Y1/Y2: store pointers into y. */
  50. #define Y1 r16
  51. #define Y2 r17
  /* X1/X2 are not referenced in the portion of the file visible here. */
  52. #define X1 r18
  53. #define X2 r19
  /* Strides shifted left by 4; used as prefetch increments by the strided loop. */
  54. #define INCX16 r20
  55. #define INCY16 r21
  /* YYY: copy of Y taken at entry, target of the peeled first-element store. */
  56. #define YYY r25
  /* YY: additional store pointer used by the scalar remainder code. */
  57. #define YY r27
  /* XA/XB: scratch registers for the X/Y relative-offset computation at entry. */
  58. #define XA r28
  59. #define XB r29
  /* Saved copies of the predicate register and of ar.lc. */
  60. #define PR r30
  61. #define ARLC r31
  /* ALPHA: scalar multiplier. ALPHA_P: fpack of ALPHA with itself, used by fpma. */
  62. #define ALPHA f8
  63. #define ALPHA_P f9
  /*
   * Entry and dispatch. The routine combines each x element with ALPHA and the
   * matching y element (FMA ALPHA, x, y) and stores the result back into y.
   *  - Returns at once when 0 < N is false (p6) or when ALPHA compares equal
   *    to f0 (p7).
   *  - If bit BASE_SHIFT of the X address is set (p10), one element is peeled:
   *    it is loaded and combined here, N is decremented, and the result is
   *    stored through YYY by whichever path is taken.
   *  - If the scaled INCX or INCY differs from SIZE, control goes to .L100.
   *  - XA becomes ((Y >> 2) & 0x3f) - ((X >> 2) & 0x3f), plus 64 when negative;
   *    it selects between loop variants that differ in prefetch offsets and
   *    load staging.
   *  - If bit BASE_SHIFT of Y is set after the peel (p11), control goes to .L30
   *    with I = N >> 4 and J = N & 15. Otherwise I = N >> 5, J = N & 31 and
   *    ar.ec = 3; XA < 32 or XA > 58 selects .L20, anything else falls through
   *    to .L12 (or jumps to .L15 when I is zero).
   */
  64. PROLOGUE
  65. .prologue
  66. PROFCODE
  /* Scale INCX by BASE_SHIFT, save ar.lc, test the alignment bit of X. */
  67. { .mii
  68. shladd INCX = INCX, BASE_SHIFT, r0
  69. .save ar.lc, ARLC
  70. mov ARLC = ar.lc
  71. tbit.nz p10, p0 = X, BASE_SHIFT
  72. }
  /* p6 = !(0 < N): nothing to do. p7 = (ALPHA == f0), acted on in the next group. */
  73. { .mfb
  74. cmp.lt p0, p6 = r0, N
  75. fcmp.eq p7, p0 = ALPHA, f0
  76. (p6) br.ret.sptk.many b0
  77. }
  78. ;;
  79. .body
  /* Peel the first x element when p10 is set; save pr so the tails can restore it. */
  80. { .mmi
  81. (p10) LDFD f32 = [X], INCX
  82. shladd INCY = INCY, BASE_SHIFT, r0
  83. mov PR = pr
  84. }
  /* YYY keeps the original Y so the peeled result can be stored later. */
  85. { .mib
  86. (p10) adds N = -1, N
  87. mov YYY = Y
  88. (p7) br.ret.sptk.many b0
  89. }
  90. ;;
  /* p13 = scaled INCX differs from SIZE. */
  91. { .mmi
  92. (p10) LDFD f33 = [Y], INCY
  93. cmp.ne p13, p0 = SIZE, INCX
  94. shr XA = X, 2
  95. }
  96. { .mmi
  97. shladd INCX16 = INCX, 4, r0
  98. shladd INCY16 = INCY, 4, r0
  99. nop.i 0
  100. }
  101. ;;
  /* Y1 starts at Y (already advanced past the peeled element when p10 is set). */
  102. { .mii
  103. mov Y1 = Y
  104. tbit.nz p11, p0 = Y, BASE_SHIFT
  105. shr XB = Y, 2
  106. }
  107. ;;
  108. { .mmf
  109. and XA = 0x3f, XA
  110. and XB = 0x3f, XB
  111. (p10) FMA f32 = ALPHA, f32, f33
  112. }
  113. ;;
  /* Y2 = Y + 4 * INCY; pr.rot = 0x10000 leaves p16 as the only rotating predicate set. */
  114. { .mmi
  115. sub XA = XB, XA
  116. shladd Y2 = INCY, 2, Y
  117. mov pr.rot = 0x10000
  118. }
  /* Either stride differing from SIZE sends control to the strided code at .L100. */
  119. { .mbb
  120. cmp.ne p14, p0 = SIZE, INCY
  121. (p13) br.cond.dpnt .L100
  122. (p14) br.cond.dpnt .L100
  123. }
  124. ;;
  /* Counts for the .L30 path (16 elements per pass); p14 = XA is negative. */
  125. { .mmi
  126. cmp.gt p14, p0 = r0, XA
  127. ;;
  128. and J = 15, N
  129. shr I = N, 4
  130. }
  131. { .mfb
  132. (p14) adds XA = 64, XA
  133. fpack ALPHA_P = f8, f8
  134. (p11) br.cond.dpnt .L30
  135. }
  136. ;;
  /* p14 = XA < 32, p15 = XA > 58: both select the .L20 variant. */
  137. { .mmi
  138. cmp.gt p14, p0 = 32, XA
  139. cmp.lt p15, p0 = 58, XA
  140. mov ar.ec = 3
  141. }
  /* Counts for the fpma paths (32 elements per pass). */
  142. { .mmi
  143. and J = 31, N
  144. cmp.eq p16, p0 = r0, r0
  145. shr I = N, 5
  146. }
  147. ;;
  /* p9 = no leftover elements, p7 = no full pass; I is biased by -1 for ar.lc. */
  148. { .mmi
  149. cmp.eq p9, p0 = r0, J
  150. cmp.eq p7 ,p0 = 0, I
  151. adds I = -1, I
  152. }
  153. { .mbb
  154. nop.m 0
  155. (p14) br.cond.dpnt .L20
  156. (p15) br.cond.dpnt .L20
  157. }
  158. ;;
  /* Store the peeled result, set up the prefetch address for X and the loop counter. */
  159. { .mmi
  160. (p10) STFD [YYY] = f32
  161. adds PRE1 = PREFETCHSIZE * SIZE, X
  162. mov ar.lc = I
  163. }
  /* p11 = bit 4 of N (a 16-element leftover chunk); skip the loop when I was zero. */
  164. { .mib
  165. adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y
  166. tbit.z p0, p11 = N, 4
  167. (p7) br.cond.dpnt .L15
  168. }
  169. ;;
  /*
   * .L12: unit-stride loop closed by br.ctop; ar.lc and ar.ec (3) are set by
   * the code that falls through to this label. One pass issues 16 ldf8 loads
   * from X, 16 from Y, 16 fpma and 16 stf8, each memory access advancing its
   * pointer by 2 * SIZE, plus one lfetch on PRE1 and one on PRE2 (32 * SIZE step).
   * Stage predicates: p16 guards the loads and prefetches, p17 the fpma in
   * slots 12-15, p18 the fpma in slots 0-11 and every store.
   * Operands are read under rotated names: a value loaded as f32 is consumed
   * as f33 under p17, and a value loaded as f44 is consumed as f46 under p18.
   * Results cycle through f6, f7 and f10-f15.
   */
  170. .align 32
  171. .L12:
  172. /* 0 */
  173. { .mmf
  174. (p18) stf8 [Y1] = f6, 2 * SIZE
  175. (p16) lfetch.nt1 [PRE1], 32 * SIZE
  176. (p18) fpma f12 = ALPHA_P, f46, f94
  177. }
  178. { .mmi
  179. (p16) ldf8 f32 = [X], 2 * SIZE
  180. (p16) ldf8 f80 = [Y], 2 * SIZE
  181. }
  182. ;;
  183. /* 1 */
  184. { .mmf
  185. (p18) stf8 [Y1] = f7, 2 * SIZE
  186. (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE
  187. (p18) fpma f13 = ALPHA_P, f49, f97
  188. }
  189. { .mmi
  190. (p16) ldf8 f35 = [X], 2 * SIZE
  191. (p16) ldf8 f83 = [Y], 2 * SIZE
  192. }
  193. ;;
  194. /* 2 */
  195. { .mmf
  196. (p18) stf8 [Y1] = f10, 2 * SIZE
  197. (p18) fpma f14 = ALPHA_P, f52, f100
  198. }
  199. { .mmi
  200. (p16) ldf8 f38 = [X], 2 * SIZE
  201. (p16) ldf8 f86 = [Y], 2 * SIZE
  202. }
  203. ;;
  204. /* 3 */
  205. { .mmf
  206. (p18) stf8 [Y1] = f11, 2 * SIZE
  207. (p18) fpma f15 = ALPHA_P, f55, f103
  208. }
  209. { .mmi
  210. (p16) ldf8 f41 = [X], 2 * SIZE
  211. (p16) ldf8 f89 = [Y], 2 * SIZE
  212. }
  213. ;;
  214. /* 4 */
  215. { .mmf
  216. (p18) stf8 [Y1] = f12, 2 * SIZE
  217. (p18) fpma f6 = ALPHA_P, f58, f106
  218. }
  219. { .mmi
  220. (p16) ldf8 f44 = [X], 2 * SIZE
  221. (p16) ldf8 f92 = [Y], 2 * SIZE
  222. }
  223. ;;
  224. /* 5 */
  225. { .mmf
  226. (p18) stf8 [Y1] = f13, 2 * SIZE
  227. (p18) fpma f7 = ALPHA_P, f61, f109
  228. }
  229. { .mmi
  230. (p16) ldf8 f47 = [X], 2 * SIZE
  231. (p16) ldf8 f95 = [Y], 2 * SIZE
  232. }
  233. ;;
  234. /* 6 */
  235. { .mmf
  236. (p18) stf8 [Y1] = f14, 2 * SIZE
  237. (p18) fpma f10 = ALPHA_P, f64, f112
  238. }
  239. { .mmi
  240. (p16) ldf8 f50 = [X], 2 * SIZE
  241. (p16) ldf8 f98 = [Y], 2 * SIZE
  242. }
  243. ;;
  244. /* 7 */
  245. { .mmf
  246. (p18) stf8 [Y1] = f15, 2 * SIZE
  247. (p18) fpma f11 = ALPHA_P, f67, f115
  248. }
  249. { .mmi
  250. (p16) ldf8 f53 = [X], 2 * SIZE
  251. (p16) ldf8 f101 = [Y], 2 * SIZE
  252. }
  253. ;;
  254. /* 8 */
  255. { .mmf
  256. (p18) stf8 [Y1] = f6, 2 * SIZE
  257. (p18) fpma f12 = ALPHA_P, f70, f118
  258. }
  259. { .mmi
  260. (p16) ldf8 f56 = [X], 2 * SIZE
  261. (p16) ldf8 f104 = [Y], 2 * SIZE
  262. }
  263. ;;
  264. /* 9 */
  265. { .mmf
  266. (p18) stf8 [Y1] = f7, 2 * SIZE
  267. (p18) fpma f13 = ALPHA_P, f73, f121
  268. }
  269. { .mmi
  270. (p16) ldf8 f59 = [X], 2 * SIZE
  271. (p16) ldf8 f107 = [Y], 2 * SIZE
  272. }
  273. ;;
  274. /* 10 */
  275. { .mmf
  276. (p18) stf8 [Y1] = f10, 2 * SIZE
  277. (p18) fpma f14 = ALPHA_P, f76, f124
  278. }
  279. { .mmi
  280. (p16) ldf8 f62 = [X], 2 * SIZE
  281. (p16) ldf8 f110 = [Y], 2 * SIZE
  282. }
  283. ;;
  284. /* 11 */
  285. { .mmf
  286. (p18) stf8 [Y1] = f11, 2 * SIZE
  287. (p18) fpma f15 = ALPHA_P, f79, f127
  288. }
  289. { .mmi
  290. (p16) ldf8 f65 = [X], 2 * SIZE
  291. (p16) ldf8 f113 = [Y], 2 * SIZE
  292. }
  293. ;;
  294. /* 12 */
  295. { .mmf
  296. (p18) stf8 [Y1] = f12, 2 * SIZE
  297. (p17) fpma f6 = ALPHA_P, f33, f81
  298. }
  299. { .mmi
  300. (p16) ldf8 f68 = [X], 2 * SIZE
  301. (p16) ldf8 f116 = [Y], 2 * SIZE
  302. }
  303. ;;
  304. /* 13 */
  305. { .mmf
  306. (p18) stf8 [Y1] = f13, 2 * SIZE
  307. (p17) fpma f7 = ALPHA_P, f36, f84
  308. }
  309. { .mmi
  310. (p16) ldf8 f71 = [X], 2 * SIZE
  311. (p16) ldf8 f119 = [Y], 2 * SIZE
  312. }
  313. ;;
  314. /* 14 */
  315. { .mmf
  316. (p18) stf8 [Y1] = f14, 2 * SIZE
  317. (p17) fpma f10 = ALPHA_P, f39, f87
  318. }
  319. { .mmi
  320. (p16) ldf8 f74 = [X], 2 * SIZE
  321. (p16) ldf8 f122 = [Y], 2 * SIZE
  322. }
  323. ;;
  324. /*15 */
  325. { .mmf
  326. (p18) stf8 [Y1] = f15, 2 * SIZE
  327. (p17) fpma f11 = ALPHA_P, f42, f90
  328. }
  329. { .mmb
  330. (p16) ldf8 f77 = [X], 2 * SIZE
  331. (p16) ldf8 f125 = [Y], 2 * SIZE
  332. br.ctop.sptk.few .L12
  333. }
  334. ;;
  /*
   * .L15: remainder after the .L12 loop (also reached directly when no full
   * pass is needed). Restores the predicates selected by mask -65474 from PR
   * (per the IA-64 "mov pr" mask encoding this is p1-p5 and p16-p63, leaving
   * p6-p15 untouched) and restores ar.lc from ARLC, then returns if p9 is set
   * (p9 was computed as J == 0 before reaching here).
   * The leftover is handled by the bits of N:
   *   p11 (bit 4, set before entry): eight ldf8/fpma/stf8 groups
   *   p12 (bit 3): four groups      p13 (bit 2): two groups
   *   p14 (bit 1): one group        p15 (bit 0): one ldfs / FMA / stfs
   * All stores go through Y1 in order.
   */
  335. .align 32
  336. .L15:
  337. { .mmi
  338. (p11) ldf8 f32 = [X], 2 * SIZE
  339. (p11) ldf8 f33 = [Y], 2 * SIZE
  340. mov pr = PR, -65474
  341. }
  342. ;;
  343. { .mmi
  344. (p11) ldf8 f34 = [X], 2 * SIZE
  345. (p11) ldf8 f35 = [Y], 2 * SIZE
  346. mov ar.lc = ARLC
  347. }
  348. ;;
  349. { .mmb
  350. (p11) ldf8 f36 = [X], 2 * SIZE
  351. (p11) ldf8 f37 = [Y], 2 * SIZE
  352. (p9) br.ret.sptk.many b0
  353. }
  354. ;;
  355. { .mmi
  356. (p11) ldf8 f38 = [X], 2 * SIZE
  357. (p11) ldf8 f39 = [Y], 2 * SIZE
  358. tbit.z p0, p12 = N, 3
  359. }
  360. ;;
  361. { .mmi
  362. (p11) ldf8 f40 = [X], 2 * SIZE
  363. (p11) ldf8 f41 = [Y], 2 * SIZE
  364. tbit.z p0, p13 = N, 2
  365. }
  366. ;;
  367. { .mmi
  368. (p11) ldf8 f42 = [X], 2 * SIZE
  369. (p11) ldf8 f43 = [Y], 2 * SIZE
  370. tbit.z p0, p14 = N, 1
  371. }
  372. ;;
  373. { .mmf
  374. (p11) ldf8 f44 = [X], 2 * SIZE
  375. (p11) ldf8 f45 = [Y], 2 * SIZE
  376. (p11) fpma f6 = ALPHA_P, f32, f33
  377. }
  378. ;;
  379. { .mmf
  380. (p11) ldf8 f46 = [X], 2 * SIZE
  381. (p11) ldf8 f47 = [Y], 2 * SIZE
  382. (p11) fpma f7 = ALPHA_P, f34, f35
  383. }
  384. ;;
  385. { .mmf
  386. (p12) ldf8 f48 = [X], 2 * SIZE
  387. (p12) ldf8 f49 = [Y], 2 * SIZE
  388. (p11) fpma f10 = ALPHA_P, f36, f37
  389. }
  390. ;;
  391. { .mmi
  392. (p11) stf8 [Y1] = f6, 2 * SIZE
  393. nop.m 0
  394. tbit.z p0, p15 = N, 0
  395. }
  396. { .mmf
  397. (p12) ldf8 f50 = [X], 2 * SIZE
  398. (p12) ldf8 f51 = [Y], 2 * SIZE
  399. (p11) fpma f11 = ALPHA_P, f38, f39
  400. }
  401. ;;
  402. { .mmi
  403. (p11) stf8 [Y1] = f7, 2 * SIZE
  404. nop.m 0
  405. nop.i 0
  406. }
  407. { .mmf
  408. (p12) ldf8 f52 = [X], 2 * SIZE
  409. (p12) ldf8 f53 = [Y], 2 * SIZE
  410. }
  411. ;;
  412. { .mmi
  413. (p11) stf8 [Y1] = f10, 2 * SIZE
  414. nop.m 0
  415. nop.i 0
  416. }
  417. { .mmf
  418. (p12) ldf8 f54 = [X], 2 * SIZE
  419. (p12) ldf8 f55 = [Y], 2 * SIZE
  420. (p11) fpma f12 = ALPHA_P, f40, f41
  421. }
  422. ;;
  423. { .mmi
  424. (p11) stf8 [Y1] = f11, 2 * SIZE
  425. nop.m 0
  426. nop.i 0
  427. }
  428. { .mmf
  429. (p13) ldf8 f56 = [X], 2 * SIZE
  430. (p13) ldf8 f57 = [Y], 2 * SIZE
  431. (p11) fpma f13 = ALPHA_P, f42, f43
  432. }
  433. ;;
  434. { .mmi
  435. (p11) stf8 [Y1] = f12, 2 * SIZE
  436. nop.m 0
  437. nop.i 0
  438. }
  439. { .mmf
  440. (p13) ldf8 f58 = [X], 2 * SIZE
  441. (p13) ldf8 f59 = [Y], 2 * SIZE
  442. (p11) fpma f14 = ALPHA_P, f44, f45
  443. }
  444. ;;
  445. { .mmi
  446. (p11) stf8 [Y1] = f13, 2 * SIZE
  447. nop.m 0
  448. nop.i 0
  449. }
  450. { .mmf
  451. (p14) ldf8 f60 = [X], 2 * SIZE
  452. (p14) ldf8 f61 = [Y], 2 * SIZE
  453. (p11) fpma f15 = ALPHA_P, f46, f47
  454. }
  455. ;;
  456. { .mmi
  457. (p11) stf8 [Y1] = f14, 2 * SIZE
  458. nop.m 0
  459. nop.i 0
  460. }
  461. { .mmf
  462. (p15) ldfs f62 = [X]
  463. (p15) ldfs f63 = [Y]
  464. (p12) fpma f6 = ALPHA_P, f48, f49
  465. }
  466. ;;
  467. (p12) fpma f7 = ALPHA_P, f50, f51
  468. (p12) fpma f10 = ALPHA_P, f52, f53
  469. ;;
  470. (p11) stf8 [Y1] = f15, 2 * SIZE
  471. (p12) fpma f11 = ALPHA_P, f54, f55
  472. ;;
  473. (p12) stf8 [Y1] = f6, 2 * SIZE
  474. (p13) fpma f12 = ALPHA_P, f56, f57
  475. ;;
  476. (p12) stf8 [Y1] = f7, 2 * SIZE
  477. (p13) fpma f13 = ALPHA_P, f58, f59
  478. ;;
  479. (p12) stf8 [Y1] = f10, 2 * SIZE
  480. (p14) fpma f14 = ALPHA_P, f60, f61
  481. ;;
  482. (p12) stf8 [Y1] = f11, 2 * SIZE
  483. (p15) FMA f15 = ALPHA, f62, f63
  484. ;;
  485. (p13) stf8 [Y1] = f12, 2 * SIZE
  486. ;;
  487. (p13) stf8 [Y1] = f13, 2 * SIZE
  488. ;;
  489. (p14) stf8 [Y1] = f14, 2 * SIZE
  490. ;;
  491. (p15) stfs [Y1] = f15
  492. br.ret.sptk.many b0
  493. ;;
  /*
   * .L20: setup for the .L22 loop, reached when the entry code finds
   * XA < 32 or XA > 58. Stores the peeled first result (p10), sets the
   * prefetch addresses to X + (PREFETCHSIZE - 28) * SIZE and
   * Y + (PREFETCHSIZE + 4) * SIZE, loads ar.lc from I, sets p11 from bit 4
   * of N, and jumps to .L25 when p7 is set (no full pass to run).
   */
  494. .align 32
  495. /* X is aligned; case 2 */
  496. .L20:
  497. { .mmi
  498. (p10) STFD [YYY] = f32
  499. adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X
  500. mov ar.lc = I
  501. }
  502. { .mib
  503. adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y
  504. tbit.z p0, p11 = N, 4
  505. (p7) br.cond.dpnt .L25
  506. }
  507. ;;
  /*
   * .L22: unit-stride loop closed by br.ctop; ar.lc is set at .L20 and ar.ec
   * (3) by the entry code. One pass issues 16 ldf8 loads from X, 16 from Y,
   * 16 fpma and 16 stf8 (each access advances its pointer by 2 * SIZE) plus
   * one lfetch on PRE1 and one on PRE2.
   * The X loads are split across two stages: slots 0-6 run under p17 and
   * write f60-f78, slots 7-15 run under p16 and write f32-f56. All Y loads
   * run under p16. The fpma in slots 12-15 run under p17; the fpma in slots
   * 0-11 and every store run under p18.
   * Operands are read under rotated names (e.g. a value loaded as f32 under
   * p16 is consumed as f33 under p17).
   */
  508. .align 32
  509. .L22:
  510. /* 0 */
  511. { .mmf
  512. (p18) stf8 [Y1] = f6, 2 * SIZE
  513. (p16) lfetch.nt1 [PRE1], 32 * SIZE
  514. (p18) fpma f12 = ALPHA_P, f46, f94
  515. }
  516. { .mmi
  517. (p17) ldf8 f60 = [X], 2 * SIZE
  518. (p16) ldf8 f80 = [Y], 2 * SIZE
  519. }
  520. ;;
  521. /* 1 */
  522. { .mmf
  523. (p18) stf8 [Y1] = f7, 2 * SIZE
  524. (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE
  525. (p18) fpma f13 = ALPHA_P, f49, f97
  526. }
  527. { .mmi
  528. (p17) ldf8 f63 = [X], 2 * SIZE
  529. (p16) ldf8 f83 = [Y], 2 * SIZE
  530. }
  531. ;;
  532. /* 2 */
  533. { .mmf
  534. (p18) stf8 [Y1] = f10, 2 * SIZE
  535. (p18) fpma f14 = ALPHA_P, f52, f100
  536. }
  537. { .mmi
  538. (p17) ldf8 f66 = [X], 2 * SIZE
  539. (p16) ldf8 f86 = [Y], 2 * SIZE
  540. }
  541. ;;
  542. /* 3 */
  543. { .mmf
  544. (p18) stf8 [Y1] = f11, 2 * SIZE
  545. (p18) fpma f15 = ALPHA_P, f55, f103
  546. }
  547. { .mmi
  548. (p17) ldf8 f69 = [X], 2 * SIZE
  549. (p16) ldf8 f89 = [Y], 2 * SIZE
  550. }
  551. ;;
  552. /* 4 */
  553. { .mmf
  554. (p18) stf8 [Y1] = f12, 2 * SIZE
  555. (p18) fpma f6 = ALPHA_P, f58, f106
  556. }
  557. { .mmi
  558. (p17) ldf8 f72 = [X], 2 * SIZE
  559. (p16) ldf8 f92 = [Y], 2 * SIZE
  560. }
  561. ;;
  562. /* 5 */
  563. { .mmf
  564. (p18) stf8 [Y1] = f13, 2 * SIZE
  565. (p18) fpma f7 = ALPHA_P, f61, f109
  566. }
  567. { .mmi
  568. (p17) ldf8 f75 = [X], 2 * SIZE
  569. (p16) ldf8 f95 = [Y], 2 * SIZE
  570. }
  571. ;;
  572. /* 6 */
  573. { .mmf
  574. (p18) stf8 [Y1] = f14, 2 * SIZE
  575. (p18) fpma f10 = ALPHA_P, f64, f112
  576. }
  577. { .mmi
  578. (p17) ldf8 f78 = [X], 2 * SIZE
  579. (p16) ldf8 f98 = [Y], 2 * SIZE
  580. }
  581. ;;
  582. /* 7 */
  583. { .mmf
  584. (p18) stf8 [Y1] = f15, 2 * SIZE
  585. (p18) fpma f11 = ALPHA_P, f67, f115
  586. }
  587. { .mmi
  588. (p16) ldf8 f32 = [X], 2 * SIZE
  589. (p16) ldf8 f101 = [Y], 2 * SIZE
  590. }
  591. ;;
  592. /* 8 */
  593. { .mmf
  594. (p18) stf8 [Y1] = f6, 2 * SIZE
  595. (p18) fpma f12 = ALPHA_P, f70, f118
  596. }
  597. { .mmi
  598. (p16) ldf8 f35 = [X], 2 * SIZE
  599. (p16) ldf8 f104 = [Y], 2 * SIZE
  600. }
  601. ;;
  602. /* 9 */
  603. { .mmf
  604. (p18) stf8 [Y1] = f7, 2 * SIZE
  605. (p18) fpma f13 = ALPHA_P, f73, f121
  606. }
  607. { .mmi
  608. (p16) ldf8 f38 = [X], 2 * SIZE
  609. (p16) ldf8 f107 = [Y], 2 * SIZE
  610. }
  611. ;;
  612. /* 10 */
  613. { .mmf
  614. (p18) stf8 [Y1] = f10, 2 * SIZE
  615. (p18) fpma f14 = ALPHA_P, f76, f124
  616. }
  617. { .mmi
  618. (p16) ldf8 f41 = [X], 2 * SIZE
  619. (p16) ldf8 f110 = [Y], 2 * SIZE
  620. }
  621. ;;
  622. /* 11 */
  623. { .mmf
  624. (p18) stf8 [Y1] = f11, 2 * SIZE
  625. (p18) fpma f15 = ALPHA_P, f79, f127
  626. }
  627. { .mmi
  628. (p16) ldf8 f44 = [X], 2 * SIZE
  629. (p16) ldf8 f113 = [Y], 2 * SIZE
  630. }
  631. ;;
  632. /* 12 */
  633. { .mmf
  634. (p18) stf8 [Y1] = f12, 2 * SIZE
  635. (p17) fpma f6 = ALPHA_P, f33, f81
  636. }
  637. { .mmi
  638. (p16) ldf8 f47 = [X], 2 * SIZE
  639. (p16) ldf8 f116 = [Y], 2 * SIZE
  640. }
  641. ;;
  642. /* 13 */
  643. { .mmf
  644. (p18) stf8 [Y1] = f13, 2 * SIZE
  645. (p17) fpma f7 = ALPHA_P, f36, f84
  646. }
  647. { .mmi
  648. (p16) ldf8 f50 = [X], 2 * SIZE
  649. (p16) ldf8 f119 = [Y], 2 * SIZE
  650. }
  651. ;;
  652. /* 14 */
  653. { .mmf
  654. (p18) stf8 [Y1] = f14, 2 * SIZE
  655. (p17) fpma f10 = ALPHA_P, f39, f87
  656. }
  657. { .mmi
  658. (p16) ldf8 f53 = [X], 2 * SIZE
  659. (p16) ldf8 f122 = [Y], 2 * SIZE
  660. }
  661. ;;
  662. /*15 */
  663. { .mmf
  664. (p18) stf8 [Y1] = f15, 2 * SIZE
  665. (p17) fpma f11 = ALPHA_P, f42, f90
  666. }
  667. { .mmb
  668. (p16) ldf8 f56 = [X], 2 * SIZE
  669. (p16) ldf8 f125 = [Y], 2 * SIZE
  670. br.ctop.sptk.few .L22
  671. }
  672. ;;
  /*
   * .L25: remainder after the .L22 loop (also reached directly from .L20 when
   * no full pass is needed). Restores the predicates selected by mask -65474
   * from PR (per the IA-64 "mov pr" mask encoding this is p1-p5 and p16-p63,
   * leaving p6-p15 untouched) and restores ar.lc from ARLC, then returns if p9
   * is set (p9 was computed as J == 0 before reaching here).
   * The leftover is handled by the bits of N:
   *   p11 (bit 4, set at .L20): eight ldf8/fpma/stf8 groups
   *   p12 (bit 3): four groups      p13 (bit 2): two groups
   *   p14 (bit 1): one group        p15 (bit 0): one ldfs / FMA / stfs
   * All stores go through Y1 in order.
   */
  673. .align 32
  674. .L25:
  675. { .mmi
  676. (p11) ldf8 f32 = [X], 2 * SIZE
  677. (p11) ldf8 f33 = [Y], 2 * SIZE
  678. mov pr = PR, -65474
  679. }
  680. ;;
  681. { .mmi
  682. (p11) ldf8 f34 = [X], 2 * SIZE
  683. (p11) ldf8 f35 = [Y], 2 * SIZE
  684. mov ar.lc = ARLC
  685. }
  686. ;;
  687. { .mmb
  688. (p11) ldf8 f36 = [X], 2 * SIZE
  689. (p11) ldf8 f37 = [Y], 2 * SIZE
  690. (p9) br.ret.sptk.many b0
  691. }
  692. ;;
  693. { .mmi
  694. (p11) ldf8 f38 = [X], 2 * SIZE
  695. (p11) ldf8 f39 = [Y], 2 * SIZE
  696. tbit.z p0, p12 = N, 3
  697. }
  698. ;;
  699. { .mmi
  700. (p11) ldf8 f40 = [X], 2 * SIZE
  701. (p11) ldf8 f41 = [Y], 2 * SIZE
  702. tbit.z p0, p13 = N, 2
  703. }
  704. ;;
  705. { .mmi
  706. (p11) ldf8 f42 = [X], 2 * SIZE
  707. (p11) ldf8 f43 = [Y], 2 * SIZE
  708. tbit.z p0, p14 = N, 1
  709. }
  710. ;;
  711. { .mmf
  712. (p11) ldf8 f44 = [X], 2 * SIZE
  713. (p11) ldf8 f45 = [Y], 2 * SIZE
  714. (p11) fpma f6 = ALPHA_P, f32, f33
  715. }
  716. ;;
  717. { .mmf
  718. (p11) ldf8 f46 = [X], 2 * SIZE
  719. (p11) ldf8 f47 = [Y], 2 * SIZE
  720. (p11) fpma f7 = ALPHA_P, f34, f35
  721. }
  722. ;;
  723. { .mmf
  724. (p12) ldf8 f48 = [X], 2 * SIZE
  725. (p12) ldf8 f49 = [Y], 2 * SIZE
  726. (p11) fpma f10 = ALPHA_P, f36, f37
  727. }
  728. ;;
  729. { .mmi
  730. (p11) stf8 [Y1] = f6, 2 * SIZE
  731. nop.m 0
  732. tbit.z p0, p15 = N, 0
  733. }
  734. { .mmf
  735. (p12) ldf8 f50 = [X], 2 * SIZE
  736. (p12) ldf8 f51 = [Y], 2 * SIZE
  737. (p11) fpma f11 = ALPHA_P, f38, f39
  738. }
  739. ;;
  740. { .mmi
  741. (p11) stf8 [Y1] = f7, 2 * SIZE
  742. nop.m 0
  743. nop.i 0
  744. }
  745. { .mmf
  746. (p12) ldf8 f52 = [X], 2 * SIZE
  747. (p12) ldf8 f53 = [Y], 2 * SIZE
  748. }
  749. ;;
  750. { .mmi
  751. (p11) stf8 [Y1] = f10, 2 * SIZE
  752. nop.m 0
  753. nop.i 0
  754. }
  755. { .mmf
  756. (p12) ldf8 f54 = [X], 2 * SIZE
  757. (p12) ldf8 f55 = [Y], 2 * SIZE
  758. (p11) fpma f12 = ALPHA_P, f40, f41
  759. }
  760. ;;
  761. { .mmi
  762. (p11) stf8 [Y1] = f11, 2 * SIZE
  763. nop.m 0
  764. nop.i 0
  765. }
  766. { .mmf
  767. (p13) ldf8 f56 = [X], 2 * SIZE
  768. (p13) ldf8 f57 = [Y], 2 * SIZE
  769. (p11) fpma f13 = ALPHA_P, f42, f43
  770. }
  771. ;;
  772. { .mmi
  773. (p11) stf8 [Y1] = f12, 2 * SIZE
  774. nop.m 0
  775. nop.i 0
  776. }
  777. { .mmf
  778. (p13) ldf8 f58 = [X], 2 * SIZE
  779. (p13) ldf8 f59 = [Y], 2 * SIZE
  780. (p11) fpma f14 = ALPHA_P, f44, f45
  781. }
  782. ;;
  783. { .mmi
  784. (p11) stf8 [Y1] = f13, 2 * SIZE
  785. nop.m 0
  786. nop.i 0
  787. }
  788. { .mmf
  789. (p14) ldf8 f60 = [X], 2 * SIZE
  790. (p14) ldf8 f61 = [Y], 2 * SIZE
  791. (p11) fpma f15 = ALPHA_P, f46, f47
  792. }
  793. ;;
  794. { .mmi
  795. (p11) stf8 [Y1] = f14, 2 * SIZE
  796. nop.m 0
  797. nop.i 0
  798. }
  799. { .mmf
  800. (p15) ldfs f62 = [X]
  801. (p15) ldfs f63 = [Y]
  802. (p12) fpma f6 = ALPHA_P, f48, f49
  803. }
  804. ;;
  805. (p12) fpma f7 = ALPHA_P, f50, f51
  806. (p12) fpma f10 = ALPHA_P, f52, f53
  807. ;;
  808. (p11) stf8 [Y1] = f15, 2 * SIZE
  809. (p12) fpma f11 = ALPHA_P, f54, f55
  810. ;;
  811. (p12) stf8 [Y1] = f6, 2 * SIZE
  812. (p13) fpma f12 = ALPHA_P, f56, f57
  813. ;;
  814. (p12) stf8 [Y1] = f7, 2 * SIZE
  815. (p13) fpma f13 = ALPHA_P, f58, f59
  816. ;;
  817. (p12) stf8 [Y1] = f10, 2 * SIZE
  818. (p14) fpma f14 = ALPHA_P, f60, f61
  819. ;;
  820. (p12) stf8 [Y1] = f11, 2 * SIZE
  821. (p15) FMA f15 = ALPHA, f62, f63
  822. ;;
  823. (p13) stf8 [Y1] = f12, 2 * SIZE
  824. ;;
  825. (p13) stf8 [Y1] = f13, 2 * SIZE
  826. ;;
  827. (p14) stf8 [Y1] = f14, 2 * SIZE
  828. ;;
  829. (p15) stfs [Y1] = f15
  830. br.ret.sptk.many b0
  831. ;;
  /*
   * .L30: reached from the entry code when bit BASE_SHIFT of Y is set (p11).
   * I and J still hold N >> 4 and N & 15 from the entry code. Sets
   * p9 = (J == 0), p7 = (I == 0), ar.ec = 4 and biases I by -1 for ar.lc.
   * XA then selects the loop variant: control goes to .L40 when
   * 33 < XA < 53 (p13, computed only under p12), XA < 15 (p14) or
   * XA > 60 (p15). Otherwise the peeled result is stored, the prefetch
   * addresses and ar.lc are set, p12 is set from bit 3 of N, and control
   * falls through to .L32 (or jumps to .L35 when p7 is set).
   */
  832. .align 32
  833. .L30:
  834. { .mmi
  835. cmp.eq p9, p0 = r0, J
  836. cmp.eq p7 ,p0 = 0, I
  837. mov ar.ec = 4
  838. }
  839. { .mmi
  840. cmp.lt p12, p0 = 33, XA
  841. adds I = -1, I
  842. }
  843. ;;
  844. { .mmi
  845. cmp.gt p14, p0 = 15, XA
  846. cmp.lt p15, p0 = 60, XA
  847. (p12) cmp.gt.unc p13, p0 = 53, XA
  848. }
  849. { .bbb
  850. (p13) br.cond.dpnt .L40
  851. (p14) br.cond.dpnt .L40
  852. (p15) br.cond.dpnt .L40
  853. }
  854. ;;
  855. { .mmi
  856. (p10) STFD [YYY] = f32
  857. adds PRE1 = (PREFETCHSIZE + 6) * SIZE, X
  858. mov ar.lc = I
  859. }
  860. { .mib
  861. adds PRE2 = (PREFETCHSIZE + 0) * SIZE, Y
  862. tbit.z p0, p12 = N, 3
  863. (p7) br.cond.dpnt .L35
  864. }
  865. ;;
  /*
   * .L32: loop closed by br.ctop; ar.lc is set at .L30, where ar.ec is 4.
   * One pass handles 16 elements with scalar FMA:
   *   - X is read with eight LDFPD pair loads (p16);
   *   - Y is read with one LDFD, seven LDFPD and a final LDFD (p16);
   *   - sixteen FMA run under p18, results cycling through f6, f7, f10-f15;
   *   - results are written through two pointers, Y1 and Y2, each advancing
   *     by 1, 1, 1 and 5 * SIZE; the four store pairs at the top of the body
   *     run under p19, the four in the second half under p18.
   * The last bundle pair issues the PRE1/PRE2 prefetches (16 * SIZE step).
   * FMA operands are read under rotated names (a value loaded as f32 under
   * p16 is consumed as f34 under p18).
   */
  866. .align 32
  867. .L32:
  868. { .mmf
  869. (p19) STFD [Y1] = f6, 1 * SIZE
  870. (p19) STFD [Y2] = f7, 1 * SIZE
  871. (p18) FMA f6 = ALPHA, f34, f82
  872. }
  873. { .mmf
  874. (p16) LDFPD f32, f35 = [X], 2 * SIZE
  875. (p16) LDFD f80 = [Y], 1 * SIZE
  876. (p18) FMA f7 = ALPHA, f46, f94
  877. }
  878. ;;
  879. { .mmf
  880. (p19) STFD [Y1] = f10, 1 * SIZE
  881. (p19) STFD [Y2] = f11, 1 * SIZE
  882. (p18) FMA f10 = ALPHA, f37, f85
  883. }
  884. { .mmf
  885. (p16) LDFPD f38, f41 = [X], 2 * SIZE
  886. (p16) LDFPD f83, f86 = [Y], 2 * SIZE
  887. (p18) FMA f11 = ALPHA, f49, f97
  888. }
  889. ;;
  890. { .mmf
  891. (p19) STFD [Y1] = f12, 1 * SIZE
  892. (p19) STFD [Y2] = f13, 1 * SIZE
  893. (p18) FMA f12 = ALPHA, f40, f88
  894. }
  895. { .mmf
  896. (p16) LDFPD f44, f47 = [X], 2 * SIZE
  897. (p16) LDFPD f89, f92 = [Y], 2 * SIZE
  898. (p18) FMA f13 = ALPHA, f52, f100
  899. }
  900. ;;
  901. { .mmf
  902. (p19) STFD [Y1] = f14, 5 * SIZE
  903. (p19) STFD [Y2] = f15, 5 * SIZE
  904. (p18) FMA f14 = ALPHA, f43, f91
  905. }
  906. { .mmf
  907. (p16) LDFPD f50, f53 = [X], 2 * SIZE
  908. (p16) LDFPD f95, f98 = [Y], 2 * SIZE
  909. (p18) FMA f15 = ALPHA, f55, f103
  910. }
  911. ;;
  912. { .mmf
  913. (p18) STFD [Y1] = f6, 1 * SIZE
  914. (p18) STFD [Y2] = f7, 1 * SIZE
  915. (p18) FMA f6 = ALPHA, f58, f106
  916. }
  917. { .mmf
  918. (p16) LDFPD f56, f59 = [X], 2 * SIZE
  919. (p16) LDFPD f101, f104 = [Y], 2 * SIZE
  920. (p18) FMA f7 = ALPHA, f70, f118
  921. }
  922. ;;
  923. { .mmf
  924. (p18) STFD [Y1] = f10, 1 * SIZE
  925. (p18) STFD [Y2] = f11, 1 * SIZE
  926. (p18) FMA f10 = ALPHA, f61, f109
  927. }
  928. { .mmf
  929. (p16) LDFPD f62, f65 = [X], 2 * SIZE
  930. (p16) LDFPD f107, f110 = [Y], 2 * SIZE
  931. (p18) FMA f11 = ALPHA, f73, f121
  932. }
  933. ;;
  934. { .mmf
  935. (p18) STFD [Y1] = f12, 1 * SIZE
  936. (p18) STFD [Y2] = f13, 1 * SIZE
  937. (p18) FMA f12 = ALPHA, f64, f112
  938. }
  939. { .mmf
  940. (p16) LDFPD f68, f71 = [X], 2 * SIZE
  941. (p16) LDFPD f113, f116 = [Y], 2 * SIZE
  942. (p18) FMA f13 = ALPHA, f76, f124
  943. }
  944. ;;
  945. { .mmf
  946. (p18) STFD [Y1] = f14, 5 * SIZE
  947. (p18) STFD [Y2] = f15, 5 * SIZE
  948. (p18) FMA f14 = ALPHA, f67, f115
  949. }
  950. { .mmf
  951. (p16) LDFPD f74, f77 = [X], 2 * SIZE
  952. (p16) LDFPD f119, f122 = [Y], 2 * SIZE
  953. (p18) FMA f15 = ALPHA, f79, f127
  954. }
  955. ;;
  956. { .mmi
  957. (p16) lfetch.nt1 [PRE1], 16 * SIZE
  958. (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE
  959. nop.i 0
  960. }
  961. { .mmb
  962. (p16) LDFD f125 = [Y], 1 * SIZE
  963. nop.m 0
  964. br.ctop.sptk.few .L32
  965. }
  966. ;;
  /*
   * .L35: remainder after the .L32 loop (also reached directly from .L30 when
   * no full pass is needed). Restores the predicates selected by mask -65474
   * from PR and ar.lc from ARLC, then returns if p9 is set (p9 was computed
   * as J == 0 at .L30).
   * The leftover is handled by the bits of N with scalar FMA:
   *   p12 (bit 3, set at .L30): 8 elements, stored through Y1 and Y2
   *   p13 (bit 2): 4 elements, stored through Y1
   *   p14 (bit 1): 2 elements and p15 (bit 0): 1 element, stored through YY
   * YY starts as a copy of Y1 and is advanced by INCY << 3 under p12 and by
   * INCY << 2 under p13, so it points past the p12/p13 chunks.
   * Y is read as LDFD / LDFPD / LDFD sequences, matching the load pattern of
   * the loop above.
   */
  967. .align 32
  968. .L35:
  969. { .mmi
  970. (p12) LDFPD f32, f33 = [X], 2 * SIZE
  971. (p12) LDFD f34 = [Y], 1 * SIZE;
  972. mov pr = PR, -65474
  973. }
  974. ;;
  975. { .mmi
  976. (p12) LDFPD f36, f37 = [X], 2 * SIZE
  977. (p12) LDFPD f35, f38 = [Y], 2 * SIZE
  978. mov ar.lc = ARLC
  979. }
  980. ;;
  981. { .mmb
  982. (p12) LDFPD f40, f41 = [X], 2 * SIZE
  983. (p12) LDFPD f39, f42 = [Y], 2 * SIZE
  984. (p9) br.ret.sptk.many b0
  985. }
  986. ;;
  987. { .mmi
  988. (p12) LDFPD f44, f45 = [X], 2 * SIZE
  989. (p12) LDFPD f43, f46 = [Y], 2 * SIZE
  990. tbit.z p0, p13 = N, 2
  991. }
  992. ;;
  993. { .mmi
  994. (p13) LDFPD f48, f49 = [X], 2 * SIZE
  995. (p12) LDFD f47 = [Y], 1 * SIZE
  996. tbit.z p0, p14 = N, 1
  997. }
  998. ;;
  999. { .mmi
  1000. (p13) LDFPD f52, f53 = [X], 2 * SIZE
  1001. (p13) LDFD f50 = [Y], 1 * SIZE
  1002. tbit.z p0, p15 = N, 0
  1003. }
  1004. ;;
  1005. { .mmi
  1006. (p14) LDFPD f56, f57 = [X], 2 * SIZE
  1007. (p13) LDFPD f51, f54 = [Y], 2 * SIZE
  1008. mov YY = Y1;
  1009. }
  1010. ;;
  1011. (p15) LDFD f60 = [X]
  1012. (p13) LDFD f55 = [Y], 1 * SIZE
  1013. ;;
  1014. (p14) LDFD f58 = [Y], 1 * SIZE
  1015. (p12) FMA f6 = ALPHA, f32, f34
  1016. (p12) FMA f7 = ALPHA, f40, f42
  1017. ;;
  1018. (p14) LDFD f59 = [Y], 1 * SIZE
  1019. (p12) shladd YY = INCY, 3, YY
  1020. (p12) FMA f10 = ALPHA, f33, f35
  1021. (p12) FMA f11 = ALPHA, f41, f43
  1022. ;;
  1023. (p15) LDFD f61 = [Y]
  1024. (p13) shladd YY = INCY, 2, YY
  1025. (p12) FMA f12 = ALPHA, f36, f38
  1026. (p12) FMA f13 = ALPHA, f44, f46
  1027. ;;
  1028. (p12) STFD [Y1] = f6, 1 * SIZE
  1029. (p12) FMA f14 = ALPHA, f37, f39
  1030. (p12) STFD [Y2] = f7, 1 * SIZE
  1031. (p12) FMA f15 = ALPHA, f45, f47
  1032. ;;
  1033. (p12) STFD [Y1] = f10, 1 * SIZE
  1034. (p13) FMA f6 = ALPHA, f48, f50
  1035. (p12) STFD [Y2] = f11, 1 * SIZE
  1036. (p14) FMA f7 = ALPHA, f56, f58
  1037. ;;
  1038. (p12) STFD [Y1] = f12, 1 * SIZE
  1039. (p13) FMA f10 = ALPHA, f49, f51
  1040. (p12) STFD [Y2] = f13, 1 * SIZE
  1041. (p14) FMA f11 = ALPHA, f57, f59
  1042. ;;
  1043. (p12) STFD [Y1] = f14, 5 * SIZE
  1044. (p13) FMA f12 = ALPHA, f52, f54
  1045. (p12) STFD [Y2] = f15, 5 * SIZE
  1046. (p15) FMA f13 = ALPHA, f60, f61
  1047. ;;
  1048. (p13) STFD [Y1] = f6, 1 * SIZE
  1049. (p14) STFD [YY] = f7, 1 * SIZE
  1050. (p13) FMA f14 = ALPHA, f53, f55
  1051. ;;
  1052. (p13) STFD [Y1] = f10, 1 * SIZE
  1053. (p14) STFD [YY] = f11, 1 * SIZE
  1054. ;;
  1055. (p13) STFD [Y1] = f12, 1 * SIZE
  1056. (p15) STFD [YY] = f13
  1057. ;;
  1058. (p13) STFD [Y1] = f14
  1059. br.ret.sptk.many b0
  1060. ;;
  /*
   * .L40: setup for the .L42 loop, selected at .L30 by the value of XA.
   * Stores the peeled first result (p10), sets the prefetch addresses to
   * X + (PREFETCHSIZE + 38) * SIZE and Y + (PREFETCHSIZE + 14) * SIZE, loads
   * ar.lc from I, sets p12 from bit 3 of N, and jumps to .L45 when p7 is set
   * (no full pass to run).
   */
  1061. .align 32
  1062. .L40:
  1063. { .mmi
  1064. (p10) STFD [YYY] = f32
  1065. adds PRE1 = (PREFETCHSIZE + 38) * SIZE, X
  1066. mov ar.lc = I
  1067. }
  1068. { .mib
  1069. adds PRE2 = (PREFETCHSIZE + 14) * SIZE, Y
  1070. tbit.z p0, p12 = N, 3
  1071. (p7) br.cond.dpnt .L45
  1072. }
  1073. ;;
  /*
   * .L42: loop closed by br.ctop; ar.lc is set at .L40 and ar.ec (4) at .L30.
   * One pass handles 16 elements with scalar FMA:
   *   - all eight X pair loads run under p17 (writing f33, f36, ... f78);
   *   - the Y loads are split: the LDFD f80 and three LDFPD (f83-f98) run
   *     under p16, four LDFPD (f102-f123) and the LDFD f126 run under p17;
   *   - sixteen FMA run under p18, results cycling through f6, f7, f10-f15;
   *   - results are written through Y1 and Y2, each advancing by 1, 1, 1 and
   *     5 * SIZE; the first four store pairs run under p19, the next four
   *     under p18.
   * Only PRE1 is prefetched; the PRE2 prefetch is compiled out with #if 0.
   * Four more p19-predicated store pairs follow the br.ctop before control
   * reaches .L45.
   */
  1074. .align 32
  1075. .L42:
  1076. { .mmf
  1077. (p19) STFD [Y1] = f6, 1 * SIZE
  1078. (p19) STFD [Y2] = f7, 1 * SIZE
  1079. (p18) FMA f6 = ALPHA, f34, f82
  1080. }
  1081. { .mmf
  1082. (p16) lfetch.nt1 [PRE1], 16 * SIZE
  1083. (p17) LDFPD f102, f105 = [Y], 2 * SIZE
  1084. (p18) FMA f7 = ALPHA, f46, f94
  1085. }
  1086. ;;
  1087. { .mmf
  1088. (p19) STFD [Y1] = f10, 1 * SIZE
  1089. (p19) STFD [Y2] = f11, 1 * SIZE
  1090. (p18) FMA f10 = ALPHA, f37, f85
  1091. }
  1092. { .mmf
  1093. (p17) LDFPD f33, f36 = [X], 2 * SIZE
  1094. (p17) LDFPD f108, f111 = [Y], 2 * SIZE
  1095. (p18) FMA f11 = ALPHA, f49, f97
  1096. }
  1097. ;;
  1098. { .mmf
  1099. (p19) STFD [Y1] = f12, 1 * SIZE
  1100. (p19) STFD [Y2] = f13, 1 * SIZE
  1101. (p18) FMA f12 = ALPHA, f40, f88
  1102. }
  1103. { .mmf
  1104. (p17) LDFPD f39, f42 = [X], 2 * SIZE
  1105. (p17) LDFPD f114, f117 = [Y], 2 * SIZE
  1106. (p18) FMA f13 = ALPHA, f52, f100
  1107. }
  1108. ;;
  1109. { .mmf
  1110. (p19) STFD [Y1] = f14, 5 * SIZE
  1111. (p19) STFD [Y2] = f15, 5 * SIZE
  1112. (p18) FMA f14 = ALPHA, f43, f91
  1113. }
  1114. { .mmf
  1115. (p17) LDFPD f45, f48 = [X], 2 * SIZE
  1116. (p17) LDFPD f120, f123 = [Y], 2 * SIZE
  1117. (p18) FMA f15 = ALPHA, f55, f103
  1118. }
  1119. ;;
  1120. { .mmf
  1121. (p18) STFD [Y1] = f6, 1 * SIZE
  1122. (p18) STFD [Y2] = f7, 1 * SIZE
  1123. (p18) FMA f6 = ALPHA, f58, f106
  1124. }
  1125. { .mmf
  1126. (p17) LDFPD f51, f54 = [X], 2 * SIZE
  1127. (p17) LDFD f126 = [Y], 1 * SIZE
  1128. (p18) FMA f7 = ALPHA, f70, f118
  1129. }
  1130. ;;
  1131. { .mmf
  1132. (p18) STFD [Y1] = f10, 1 * SIZE
  1133. (p18) STFD [Y2] = f11, 1 * SIZE
  1134. (p18) FMA f10 = ALPHA, f61, f109
  1135. }
  1136. { .mmf
  1137. (p17) LDFPD f57, f60 = [X], 2 * SIZE
  1138. (p16) LDFD f80 = [Y], 1 * SIZE
  1139. (p18) FMA f11 = ALPHA, f73, f121
  1140. }
  1141. ;;
  1142. { .mmf
  1143. (p18) STFD [Y1] = f12, 1 * SIZE
  1144. (p18) STFD [Y2] = f13, 1 * SIZE
  1145. (p18) FMA f12 = ALPHA, f64, f112
  1146. }
  1147. { .mmf
  1148. (p17) LDFPD f63, f66 = [X], 2 * SIZE
  1149. (p16) LDFPD f83, f86 = [Y], 2 * SIZE
  1150. (p18) FMA f13 = ALPHA, f76, f124
  1151. }
  1152. ;;
  1153. { .mmf
  1154. (p18) STFD [Y1] = f14, 5 * SIZE
  1155. (p18) STFD [Y2] = f15, 5 * SIZE
  1156. (p18) FMA f14 = ALPHA, f67, f115
  1157. }
  1158. { .mmf
  1159. (p17) LDFPD f69, f72 = [X], 2 * SIZE
  1160. (p16) LDFPD f89, f92 = [Y], 2 * SIZE
  1161. (p18) FMA f15 = ALPHA, f79, f127
  1162. }
  1163. ;;
  1164. #if 0
  1165. (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE
  1166. #endif
  1167. { .mmb
  1168. (p17) LDFPD f75, f78 = [X], 2 * SIZE
  1169. (p16) LDFPD f95, f98 = [Y], 2 * SIZE
  1170. br.ctop.sptk.few .L42
  1171. }
  1172. ;;
  1173. { .mmf
  1174. (p19) STFD [Y1] = f6, 1 * SIZE
  1175. (p19) STFD [Y2] = f7, 1 * SIZE
  1176. }
  1177. ;;
  1178. { .mmf
  1179. (p19) STFD [Y1] = f10, 1 * SIZE
  1180. (p19) STFD [Y2] = f11, 1 * SIZE
  1181. }
  1182. ;;
  1183. { .mmf
  1184. (p19) STFD [Y1] = f12, 1 * SIZE
  1185. (p19) STFD [Y2] = f13, 1 * SIZE
  1186. }
  1187. ;;
  1188. { .mmf
  1189. (p19) STFD [Y1] = f14, 5 * SIZE
  1190. (p19) STFD [Y2] = f15, 5 * SIZE
  1191. }
  1192. ;;
  /*
   * .L45: remainder after the .L42 loop (also reached directly from .L40 when
   * no full pass is needed). Restores the predicates selected by mask -65474
   * from PR and ar.lc from ARLC, then returns if p9 is set (p9 was computed
   * as J == 0 at .L30).
   * The leftover is handled by the bits of N with scalar FMA:
   *   p12 (bit 3, set at .L40): 8 elements, stored through Y1 and Y2
   *   p13 (bit 2): 4 elements, stored through Y1
   *   p14 (bit 1): 2 elements and p15 (bit 0): 1 element, stored through YY
   * YY starts as a copy of Y1 and is advanced by INCY << 3 under p12 and by
   * INCY << 2 under p13, so it points past the p12/p13 chunks.
   */
  1193. .align 32
  1194. .L45:
  1195. { .mmi
  1196. (p12) LDFPD f32, f33 = [X], 2 * SIZE
  1197. (p12) LDFD f34 = [Y], 1 * SIZE;
  1198. mov pr = PR, -65474
  1199. }
  1200. ;;
  1201. { .mmi
  1202. (p12) LDFPD f36, f37 = [X], 2 * SIZE
  1203. (p12) LDFPD f35, f38 = [Y], 2 * SIZE
  1204. mov ar.lc = ARLC
  1205. }
  1206. ;;
  1207. { .mmb
  1208. (p12) LDFPD f40, f41 = [X], 2 * SIZE
  1209. (p12) LDFPD f39, f42 = [Y], 2 * SIZE
  1210. (p9) br.ret.sptk.many b0
  1211. }
  1212. ;;
  1213. { .mmi
  1214. (p12) LDFPD f44, f45 = [X], 2 * SIZE
  1215. (p12) LDFPD f43, f46 = [Y], 2 * SIZE
  1216. tbit.z p0, p13 = N, 2
  1217. }
  1218. ;;
  1219. { .mmi
  1220. (p13) LDFPD f48, f49 = [X], 2 * SIZE
  1221. (p12) LDFD f47 = [Y], 1 * SIZE
  1222. tbit.z p0, p14 = N, 1
  1223. }
  1224. ;;
  1225. { .mmi
  1226. (p13) LDFPD f52, f53 = [X], 2 * SIZE
  1227. (p13) LDFD f50 = [Y], 1 * SIZE
  1228. tbit.z p0, p15 = N, 0
  1229. }
  1230. ;;
  1231. { .mmi
  1232. (p14) LDFPD f56, f57 = [X], 2 * SIZE
  1233. (p13) LDFPD f51, f54 = [Y], 2 * SIZE
  1234. mov YY = Y1;
  1235. }
  1236. ;;
  1237. (p15) LDFD f60 = [X]
  1238. (p13) LDFD f55 = [Y], 1 * SIZE
  1239. ;;
  1240. (p14) LDFD f58 = [Y], 1 * SIZE
  1241. (p12) FMA f6 = ALPHA, f32, f34
  1242. (p12) FMA f7 = ALPHA, f40, f42
  1243. ;;
  1244. (p14) LDFD f59 = [Y], 1 * SIZE
  1245. (p12) shladd YY = INCY, 3, YY
  1246. (p12) FMA f10 = ALPHA, f33, f35
  1247. (p12) FMA f11 = ALPHA, f41, f43
  1248. ;;
  1249. (p15) LDFD f61 = [Y]
  1250. (p13) shladd YY = INCY, 2, YY
  1251. (p12) FMA f12 = ALPHA, f36, f38
  1252. (p12) FMA f13 = ALPHA, f44, f46
  1253. ;;
  1254. (p12) STFD [Y1] = f6, 1 * SIZE
  1255. (p12) FMA f14 = ALPHA, f37, f39
  1256. (p12) STFD [Y2] = f7, 1 * SIZE
  1257. (p12) FMA f15 = ALPHA, f45, f47
  1258. ;;
  1259. (p12) STFD [Y1] = f10, 1 * SIZE
  1260. (p13) FMA f6 = ALPHA, f48, f50
  1261. (p12) STFD [Y2] = f11, 1 * SIZE
  1262. (p14) FMA f7 = ALPHA, f56, f58
  1263. ;;
  1264. (p12) STFD [Y1] = f12, 1 * SIZE
  1265. (p13) FMA f10 = ALPHA, f49, f51
  1266. (p12) STFD [Y2] = f13, 1 * SIZE
  1267. (p14) FMA f11 = ALPHA, f57, f59
  1268. ;;
  1269. (p12) STFD [Y1] = f14, 5 * SIZE
  1270. (p13) FMA f12 = ALPHA, f52, f54
  1271. (p12) STFD [Y2] = f15, 5 * SIZE
  1272. (p15) FMA f13 = ALPHA, f60, f61
  1273. ;;
  1274. (p13) STFD [Y1] = f6, 1 * SIZE
  1275. (p14) STFD [YY] = f7, 1 * SIZE
  1276. (p13) FMA f14 = ALPHA, f53, f55
  1277. ;;
  1278. (p13) STFD [Y1] = f10, 1 * SIZE
  1279. (p14) STFD [YY] = f11, 1 * SIZE
  1280. ;;
  1281. (p13) STFD [Y1] = f12, 1 * SIZE
  1282. (p15) STFD [YY] = f13
  1283. ;;
  1284. (p13) STFD [Y1] = f14
  1285. br.ret.sptk.many b0
  1286. ;;
  1287. .align 32
  /* .L100: set-up for the loop at .L112, which walks X and Y one      */
  /* element at a time using the INCX / INCY post-increments.          */
  /* Splits N into I = N >> 4 (number of 16-element loop passes) and   */
  /* J = N & 15 (left-over elements, consumed at .L115).               */
  1288. .L100:
  1289. { .mii
  1290. and J = 15, N
  1291. shr I = N, 4
  /* Epilog count 3: the loop at .L112 uses three stage predicates, p16-p18. */
  1292. mov ar.ec = 3
  1293. }
  1294. ;;
  /* p9 = (J == 0), tested at .L115 to return early.                   */
  /* p7 = (I == 0), tested below to skip the loop entirely.            */
  /* The compare on I sits in the same instruction group as the        */
  /* decrement, so it reads the value of I before "adds" updates it.   */
  1295. { .mmi
  1296. cmp.eq p9, p0 = r0, J
  1297. cmp.eq p7 ,p0 = 0, I
  1298. adds I = -1, I
  1299. }
  1300. ;;
  /* NOTE(review): p10, YYY and f32 are all produced before this chunk; */
  /* this looks like a pending store left over from an earlier path -   */
  /* confirm against the code above .L100.                              */
  /* PRE1 / PRE2 are prefetch pointers, PREFETCHSIZE elements ahead of  */
  /* X and Y.  ar.lc receives I - 1.                                    */
  1301. { .mmi
  1302. (p10) STFD [YYY] = f32
  1303. adds PRE1 = PREFETCHSIZE * SIZE, X
  1304. mov ar.lc = I
  1305. }
  /* p12 = bit 3 of N is set (a group of 8 left-over elements exists). */
  /* When I == 0 there is no full 16-element pass: go straight to the  */
  /* remainder code.                                                   */
  1306. { .mib
  1307. adds PRE2 = PREFETCHSIZE * SIZE, Y
  1308. tbit.z p0, p12 = N, 3
  1309. (p7) br.cond.dpnt .L115
  1310. }
  1311. ;;
  1312. .align 32
  /* .L112: counted loop closed by br.ctop; each pass handles 16 X/Y   */
  /* element pairs, computing ALPHA * x + y for each and storing the   */
  /* result through Y1, which advances by INCY after every store.      */
  /*                                                                   */
  /* Stage predicates:                                                 */
  /*   p16 - the 16 X loads (f32, f35, ... f77), the 16 Y loads        */
  /*         (f80, f83, ... f125) and the two prefetches.              */
  /*   p17 - FMAs for the first 8 pairs; they read the load registers  */
  /*         shifted by one (f33.. / f81..), i.e. data loaded one      */
  /*         pass earlier.                                             */
  /*   p18 - FMAs for the last 8 pairs; they read the load registers   */
  /*         shifted by two (f58.. / f106..), i.e. data loaded two     */
  /*         passes earlier - and all 16 stores.                       */
  /*                                                                   */
  /* Results live in f6, f7, f10-f15, each used twice per pass.  Every */
  /* STFD is placed ahead of the FMA that overwrites the same register */
  /* within one instruction group, so the store writes out the value   */
  /* produced by the previous FMA into that register.                  */
  1313. .L112:
  /* First half of the pass: store the results of the p17 FMAs issued  */
  /* in the previous pass, and compute pairs 8..15 under p18.          */
  /* PRE1 is advanced by INCX16 after the X prefetch.                  */
  1314. { .mmi
  1315. (p18) STFD [Y1] = f6
  1316. (p16) lfetch.nt1 [PRE1], INCX16
  1317. (p18) add Y1 = INCY, Y1
  1318. }
  1319. {.mmf
  1320. (p16) LDFD f32 = [X], INCX
  1321. (p16) LDFD f80 = [Y], INCY
  1322. (p18) FMA f6 = ALPHA, f58, f106
  1323. }
  1324. ;;
  /* Y is prefetched with the .excl hint; PRE2 is advanced by INCY16.  */
  1325. { .mmi
  1326. (p18) STFD [Y1] = f7
  1327. (p16) lfetch.excl.nt1 [PRE2], INCY16
  1328. (p18) add Y1 = INCY, Y1
  1329. }
  1330. { .mmf
  1331. (p16) LDFD f35 = [X], INCX
  1332. (p16) LDFD f83 = [Y], INCY
  1333. (p18) FMA f7 = ALPHA, f61, f109
  1334. }
  1335. ;;
  1336. { .mmi
  1337. (p18) STFD [Y1] = f10
  1338. (p18) add Y1 = INCY, Y1
  1339. nop.i 0
  1340. }
  1341. { .mmf
  1342. (p16) LDFD f38 = [X], INCX
  1343. (p16) LDFD f86 = [Y], INCY
  1344. (p18) FMA f10 = ALPHA, f64, f112
  1345. }
  1346. ;;
  1347. { .mmi
  1348. (p18) STFD [Y1] = f11
  1349. (p18) add Y1 = INCY, Y1
  1350. nop.i 0
  1351. }
  1352. { .mmf
  1353. (p16) LDFD f41 = [X], INCX
  1354. (p16) LDFD f89 = [Y], INCY
  1355. (p18) FMA f11 = ALPHA, f67, f115
  1356. }
  1357. ;;
  1358. { .mmi
  1359. (p18) STFD [Y1] = f12
  1360. (p18) add Y1 = INCY, Y1
  1361. nop.i 0
  1362. }
  1363. { .mmf
  1364. (p16) LDFD f44 = [X], INCX
  1365. (p16) LDFD f92 = [Y], INCY
  1366. (p18) FMA f12 = ALPHA, f70, f118
  1367. }
  1368. ;;
  1369. { .mmi
  1370. (p18) STFD [Y1] = f13
  1371. (p18) add Y1 = INCY, Y1
  1372. nop.i 0
  1373. }
  1374. { .mmf
  1375. (p16) LDFD f47 = [X], INCX
  1376. (p16) LDFD f95 = [Y], INCY
  1377. (p18) FMA f13 = ALPHA, f73, f121
  1378. }
  1379. ;;
  1380. { .mmi
  1381. (p18) STFD [Y1] = f14
  1382. (p18) add Y1 = INCY, Y1
  1383. nop.i 0
  1384. }
  1385. { .mmf
  1386. (p16) LDFD f50 = [X], INCX
  1387. (p16) LDFD f98 = [Y], INCY
  1388. (p18) FMA f14 = ALPHA, f76, f124
  1389. }
  1390. ;;
  1391. { .mmi
  1392. (p18) STFD [Y1] = f15
  1393. (p18) add Y1 = INCY, Y1
  1394. nop.i 0
  1395. }
  1396. { .mmf
  1397. (p16) LDFD f53 = [X], INCX
  1398. (p16) LDFD f101 = [Y], INCY
  1399. (p18) FMA f15 = ALPHA, f79, f127
  1400. }
  1401. ;;
  /* Second half of the pass: store the p18 results computed just      */
  /* above, and compute pairs 0..7 of the following data set under     */
  /* p17; those results are stored by the first half of the next pass. */
  1402. { .mmi
  1403. (p18) STFD [Y1] = f6
  1404. (p18) add Y1 = INCY, Y1
  1405. nop.i 0
  1406. }
  1407. { .mmf
  1408. (p16) LDFD f56 = [X], INCX
  1409. (p16) LDFD f104 = [Y], INCY
  1410. (p17) FMA f6 = ALPHA, f33, f81
  1411. }
  1412. ;;
  1413. { .mmi
  1414. (p18) STFD [Y1] = f7
  1415. (p18) add Y1 = INCY, Y1
  1416. nop.i 0
  1417. }
  1418. { .mmf
  1419. (p16) LDFD f59 = [X], INCX
  1420. (p16) LDFD f107 = [Y], INCY
  1421. (p17) FMA f7 = ALPHA, f36, f84
  1422. }
  1423. ;;
  1424. { .mmi
  1425. (p18) STFD [Y1] = f10
  1426. (p18) add Y1 = INCY, Y1
  1427. nop.i 0
  1428. }
  1429. { .mmf
  1430. (p16) LDFD f62 = [X], INCX
  1431. (p16) LDFD f110 = [Y], INCY
  1432. (p17) FMA f10 = ALPHA, f39, f87
  1433. }
  1434. ;;
  1435. { .mmi
  1436. (p18) STFD [Y1] = f11
  1437. (p18) add Y1 = INCY, Y1
  1438. nop.i 0
  1439. }
  1440. { .mmf
  1441. (p16) LDFD f65 = [X], INCX
  1442. (p16) LDFD f113 = [Y], INCY
  1443. (p17) FMA f11 = ALPHA, f42, f90
  1444. }
  1445. ;;
  1446. { .mmi
  1447. (p18) STFD [Y1] = f12
  1448. (p18) add Y1 = INCY, Y1
  1449. nop.i 0
  1450. }
  1451. { .mmf
  1452. (p16) LDFD f68 = [X], INCX
  1453. (p16) LDFD f116 = [Y], INCY
  1454. (p17) FMA f12 = ALPHA, f45, f93
  1455. }
  1456. ;;
  1457. { .mmi
  1458. (p18) STFD [Y1] = f13
  1459. (p18) add Y1 = INCY, Y1
  1460. nop.i 0
  1461. }
  1462. { .mmf
  1463. (p16) LDFD f71 = [X], INCX
  1464. (p16) LDFD f119 = [Y], INCY
  1465. (p17) FMA f13 = ALPHA, f48, f96
  1466. }
  1467. ;;
  1468. { .mmi
  1469. (p18) STFD [Y1] = f14
  1470. (p18) add Y1 = INCY, Y1
  1471. nop.i 0
  1472. }
  1473. { .mmf
  1474. (p16) LDFD f74 = [X], INCX
  1475. (p16) LDFD f122 = [Y], INCY
  1476. (p17) FMA f14 = ALPHA, f51, f99
  1477. }
  1478. ;;
  /* Last group: the FMA shares a bundle with the store so that the    */
  /* second bundle has a B slot for the loop-closing br.ctop.          */
  1479. { .mmf
  1480. (p18) STFD [Y1] = f15
  1481. (p18) add Y1 = INCY, Y1
  1482. (p17) FMA f15 = ALPHA, f54, f102
  1483. }
  1484. { .mmb
  1485. (p16) LDFD f77 = [X], INCX
  1486. (p16) LDFD f125 = [Y], INCY
  1487. br.ctop.sptk.few .L112
  1488. }
  1489. ;;
  1490. .align 32
  /* .L115: remainder for the .L100 / .L112 path.  Reached by falling  */
  /* out of the .L112 loop, or directly from .L100 when I == 0.        */
  /* Handles the N & 15 left-over elements in groups selected by the   */
  /* low bits of N:                                                    */
  /*   p12 (bit 3) - 8 elements, p13 (bit 2) - 4 elements,             */
  /*   p14 (bit 1) - 2 elements, p15 (bit 0) - 1 element.              */
  /* Loads, FMAs and stores of the different groups are interleaved;   */
  /* f6, f7, f10-f15 hold results and are reused by a later group only */
  /* after the earlier group's store of that register has been issued. */
  1491. .L115:
  /* Restore predicates from PR.  The mask -65474 (low bits 0x3E plus  */
  /* bit 16 and above) selects p1-p5 and the rotating predicates, so   */
  /* p9 and p12 set at .L100 are not overwritten.                      */
  /* NOTE(review): PR and ARLC are presumably saved in the prologue,   */
  /* which is outside this chunk - confirm.                            */
  1492. (p12) LDFD f32 = [X], INCX
  1493. (p12) LDFD f34 = [Y], INCY
  1494. mov pr = PR, -65474
  1495. ;;
  1496. (p12) LDFD f33 = [X], INCX
  1497. (p12) LDFD f35 = [Y], INCY
  1498. mov ar.lc = ARLC
  1499. ;;
  /* p9 was set at .L100 to (N & 15) == 0: nothing left, return.       */
  1500. (p12) LDFD f36 = [X], INCX
  1501. (p12) LDFD f38 = [Y], INCY
  1502. (p9) br.ret.sptk.many b0
  1503. ;;
  /* p13 = bit 2 of N set, p14 = bit 1 set, p15 = bit 0 set.           */
  1504. (p12) LDFD f37 = [X], INCX
  1505. (p12) LDFD f39 = [Y], INCY
  1506. tbit.z p0, p13 = N, 2
  1507. ;;
  1508. (p12) LDFD f40 = [X], INCX
  1509. (p12) LDFD f42 = [Y], INCY
  1510. tbit.z p0, p14 = N, 1
  1511. ;;
  1512. (p12) LDFD f41 = [X], INCX
  1513. (p12) LDFD f43 = [Y], INCY
  1514. tbit.z p0, p15 = N, 0
  1515. ;;
  /* Group of 8 (p12): FMAs start while its last loads are issued.     */
  1516. { .mmf
  1517. (p12) LDFD f44 = [X], INCX
  1518. (p12) LDFD f46 = [Y], INCY
  1519. (p12) FMA f6 = ALPHA, f32, f34
  1520. }
  1521. ;;
  1522. { .mmf
  1523. (p12) LDFD f45 = [X], INCX
  1524. (p12) LDFD f47 = [Y], INCY
  1525. (p12) FMA f7 = ALPHA, f33, f35
  1526. }
  1527. ;;
  /* Group of 4 (p13): loads into f48-f55.                             */
  1528. { .mmf
  1529. (p13) LDFD f48 = [X], INCX
  1530. (p13) LDFD f50 = [Y], INCY
  1531. (p12) FMA f10 = ALPHA, f36, f38
  1532. }
  1533. ;;
  1534. { .mmf
  1535. (p13) LDFD f49 = [X], INCX
  1536. (p13) LDFD f51 = [Y], INCY
  1537. (p12) FMA f11 = ALPHA, f37, f39
  1538. }
  1539. ;;
  /* From here on each group also stores one result through Y1 and     */
  /* advances Y1 by INCY.                                              */
  1540. { .mmi
  1541. (p12) STFD [Y1] = f6
  1542. (p12) add Y1 = INCY, Y1
  1543. nop.i 0
  1544. }
  1545. { .mmf
  1546. (p13) LDFD f52 = [X], INCX
  1547. (p13) LDFD f54 = [Y], INCY
  1548. (p12) FMA f12 = ALPHA, f40, f42
  1549. }
  1550. ;;
  1551. { .mmi
  1552. (p12) STFD [Y1] = f7
  1553. (p12) add Y1 = INCY, Y1
  1554. nop.i 0
  1555. }
  1556. { .mmf
  1557. (p13) LDFD f53 = [X], INCX
  1558. (p13) LDFD f55 = [Y], INCY
  1559. (p12) FMA f13 = ALPHA, f41, f43
  1560. }
  1561. ;;
  /* Group of 2 (p14): loads into f56-f59.                             */
  1562. { .mmi
  1563. (p12) STFD [Y1] = f10
  1564. (p12) add Y1 = INCY, Y1
  1565. nop.i 0
  1566. }
  1567. { .mmf
  1568. (p14) LDFD f56 = [X], INCX
  1569. (p14) LDFD f58 = [Y], INCY
  1570. (p12) FMA f14 = ALPHA, f44, f46
  1571. }
  1572. ;;
  1573. { .mmi
  1574. (p12) STFD [Y1] = f11
  1575. (p12) add Y1 = INCY, Y1
  1576. nop.i 0
  1577. }
  1578. { .mmf
  1579. (p14) LDFD f57 = [X], INCX
  1580. (p14) LDFD f59 = [Y], INCY
  1581. (p12) FMA f15 = ALPHA, f45, f47
  1582. }
  1583. ;;
  /* Final single element (p15): loaded without post-increment, since  */
  /* nothing is read from X or Y afterwards.  The group-of-4 FMAs now  */
  /* reuse f6, f7, f10, f11, whose group-of-8 values are already       */
  /* stored.                                                           */
  1584. { .mmi
  1585. (p12) STFD [Y1] = f12
  1586. (p12) add Y1 = INCY, Y1
  1587. nop.i 0
  1588. }
  1589. { .mmf
  1590. (p15) LDFD f60 = [X]
  1591. (p15) LDFD f61 = [Y]
  1592. (p13) FMA f6 = ALPHA, f48, f50
  1593. }
  1594. ;;
  1595. { .mmf
  1596. (p12) STFD [Y1] = f13
  1597. (p12) add Y1 = INCY, Y1
  1598. (p13) FMA f7 = ALPHA, f49, f51
  1599. }
  1600. ;;
  1601. { .mmf
  1602. (p12) STFD [Y1] = f14
  1603. (p12) add Y1 = INCY, Y1
  1604. (p13) FMA f10 = ALPHA, f52, f54
  1605. }
  1606. ;;
  1607. { .mmf
  1608. (p12) STFD [Y1] = f15
  1609. (p12) add Y1 = INCY, Y1
  1610. (p13) FMA f11 = ALPHA, f53, f55
  1611. }
  1612. ;;
  /* Group-of-2 results go to f12, f13 and the single element to f14.  */
  1613. { .mmf
  1614. (p13) STFD [Y1] = f6
  1615. (p13) add Y1 = INCY, Y1
  1616. (p14) FMA f12 = ALPHA, f56, f58
  1617. }
  1618. ;;
  1619. { .mmf
  1620. (p13) STFD [Y1] = f7
  1621. (p13) add Y1 = INCY, Y1
  1622. (p14) FMA f13 = ALPHA, f57, f59
  1623. }
  1624. ;;
  1625. { .mmf
  1626. (p13) STFD [Y1] = f10
  1627. (p13) add Y1 = INCY, Y1
  1628. (p15) FMA f14 = ALPHA, f60, f61
  1629. }
  1630. ;;
  /* Drain: remaining stores, in element order, then return.           */
  1631. (p13) STFD [Y1] = f11
  1632. (p13) add Y1 = INCY, Y1
  1633. ;;
  1634. (p14) STFD [Y1] = f12
  1635. (p14) add Y1 = INCY, Y1
  1636. ;;
  1637. (p14) STFD [Y1] = f13
  1638. (p14) add Y1 = INCY, Y1
  1639. ;;
  1640. (p15) STFD [Y1] = f14
  1641. br.ret.sptk.many b0
  1642. ;;
  1643. EPILOGUE