You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

scal.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance, counted in elements: it is multiplied by SIZE    */
  /* at the places where the prefetch pointer PRE1 is computed.          */
  40. #ifdef DOUBLE
  41. #define PREFETCH_SIZE (8 * 16)
  42. #else
  43. #define PREFETCH_SIZE (1 * 64)
  44. #endif
  /* Register roles, as the code in this file uses them:                 */
  /*   ALPHA   scale factor; compared with f0 on entry and used as the   */
  /*           multiplier in every FMPY                                  */
  /*   N       element count; the routine returns at once when N <= 0    */
  /*   X1      read pointer into the vector (also the store pointer on   */
  /*           the alpha == 0 path)                                      */
  /*   INCX    stride; shifted left by BASE_SHIFT on entry               */
  /*   X2      second pointer, set to X1 + 4 * INCX                      */
  /*   Y1, Y2  store pointers on the alpha != 0 paths                    */
  /*   PRE1    address used by lfetch                                    */
  /*   I       (N >> 4) - 1, the value loaded into ar.lc                 */
  /*   NAND15  N & 15, the leftover element count                        */
  /*   INCX5   5 * INCX;  INCX16  16 * INCX                              */
  /*   XX      copy of X1 taken before X1 is first advanced              */
  /*   PR      saved predicate registers;  ARLC  saved ar.lc             */
  45. #define ALPHA f8
  46. #define N r32
  47. #define X1 r36
  48. #define INCX r37
  49. #define X2 r14
  50. #define Y1 r15
  51. #define Y2 r16
  52. #define PRE1 r17
  53. #define I r18
  54. #define NAND15 r19
  55. #define INCX5 r20
  56. #define INCX16 r21
  57. #define XX r22
  58. #define PR r30
  59. #define ARLC r31
  /* Entry and common setup.  PROLOGUE and PROFCODE are macros that are  */
  /* not defined in this file (presumably they come from common.h).      */
  60. PROLOGUE
  61. .prologue
  62. PROFCODE
  /* INCX <<= BASE_SHIFT (presumably converting an element stride to a   */
  /* byte stride -- confirm against common.h).  p6 is the complement of  */
  /* (ALPHA == f0), i.e. it is set when alpha is not zero.  ar.lc is     */
  /* saved in ARLC so it can be restored before returning.               */
  63. { .mfi
  64. shladd INCX = INCX, BASE_SHIFT, r0
  65. fcmp.eq p0, p6 = ALPHA, f0
  66. .save ar.lc, ARLC
  67. mov ARLC = ar.lc
  68. }
  /* p7 = (0 >= N): nothing to do, return.  p10 = bit BASE_SHIFT of the  */
  /* address in X1 is set; when it is, the first element is handled on   */
  /* its own below (presumably so that the rest is aligned for the       */
  /* paired loads of the unit-stride path).                              */
  69. { .mib
  70. cmp.ge p7, p0 = 0, N
  71. tbit.z p0, p10 = X1, BASE_SHIFT
  72. (p7) br.ret.sptk.many b0
  73. }
  74. .body
  75. ;;
  /* XX keeps the original X1.  Under p10 the first element is loaded    */
  /* into f32, X1 moves one stride forward and N is decremented.  The    */
  /* predicate registers are saved in PR.                                */
  76. { .mmi
  77. mov XX = X1
  78. (p10) LDFD f32 = [X1], INCX
  79. mov PR = pr
  80. }
  /* INCX5 = 4 * INCX + INCX, INCX16 = 16 * INCX. */
  81. { .mmi
  82. shladd INCX5 = INCX, 2, INCX
  83. shladd INCX16 = INCX, 4, r0
  84. (p10) adds N = -1, N
  85. }
  86. ;;
  /* X2 = X1 + 4 * INCX.  ar.ec = 5 is the epilogue count used by the    */
  /* br.ctop loop at .L110 (.L300 overrides it with 6).                  */
  87. { .mmi
  88. shladd X2 = INCX, 2, X1
  89. nop __LINE__
  90. mov ar.ec = 5
  91. }
  /* Split N into 16-element blocks (I) and a remainder (NAND15). */
  92. { .mmi
  93. and NAND15 = 15, N
  94. nop __LINE__
  95. shr I = N, 4
  96. }
  97. ;;
  /* I becomes (blocks - 1), the form the loop-count register needs.     */
  /* p12 = bit 3 of N is set (a group of 8 leftover elements exists).    */
  98. { .mmi
  99. adds I = -1, I
  100. nop __LINE__
  101. tbit.z p0, p12 = N, 3
  102. }
  /* p9 = (0 >= NAND15): no leftover elements at all.  PRE1 is the first */
  /* prefetch address, PREFETCH_SIZE * SIZE + 192 past XX.               */
  103. { .mmb
  104. cmp.ge p9, p0 = 0, NAND15
  105. adds PRE1 = PREFETCH_SIZE * SIZE + 192, XX
  106. (p6) br.cond.dptk .L100 // if (alpha != 0) goto .L100
  107. }
  108. ;;
  /* From here on alpha == 0: the vector is filled with f0 (the          */
  /* architectural constant 0.0).  The element peeled off under p10 is   */
  /* zeroed through XX.                                                  */
  109. { .mmi
  110. (p10) STFD [XX] = f0
  111. nop __LINE__
  112. mov ar.lc = I
  113. }
  /* p8 = (0 > I): fewer than 16 elements, skip the main loop. */
  114. { .mmb
  115. cmp.gt p8, p0 = 0, I
  116. (p8) br.cond.dpnt .L30
  117. }
  118. ;;
  /* alpha == 0, main loop.  Each pass stores f0 to 16 elements: X1 and  */
  /* X2 (which start 4 strides apart) each write 4 consecutive elements  */
  /* and then advance by INCX5 (5 strides), and this is done twice per   */
  /* pass.  br.cloop repeats the body while ar.lc is non-zero, so it     */
  /* runs ar.lc + 1 times.                                               */
  119. .align 32
  120. .L20:
  121. {.mmi
  122. STFD [X1] = f0
  123. STFD [X2] = f0
  124. nop __LINE__
  125. }
  /* One prefetch per pass; PRE1 moves 16 strides forward each time. */
  126. {.mmi
  127. lfetch.excl.nt1 [PRE1], INCX16
  128. add X1 = INCX, X1
  129. add X2 = INCX, X2
  130. }
  131. ;;
  132. {.mmi
  133. STFD [X1] = f0
  134. STFD [X2] = f0
  135. nop __LINE__
  136. }
  137. {.mmi
  138. add X1 = INCX, X1
  139. add X2 = INCX, X2
  140. nop __LINE__
  141. }
  142. ;;
  143. {.mmi
  144. STFD [X1] = f0
  145. STFD [X2] = f0
  146. nop __LINE__
  147. }
  148. {.mmi
  149. add X1 = INCX, X1
  150. add X2 = INCX, X2
  151. nop __LINE__
  152. }
  153. ;;
  154. {.mmi
  155. STFD [X1] = f0
  156. STFD [X2] = f0
  157. nop __LINE__
  158. }
  /* Fourth store done: jump 5 strides, over the 4 elements the other    */
  /* pointer has just written.                                           */
  159. {.mmi
  160. add X1 = INCX5, X1
  161. add X2 = INCX5, X2
  162. nop __LINE__
  163. }
  164. ;;
  /* Second half of the pass: elements 8-11 via X1, 12-15 via X2. */
  165. {.mmi
  166. STFD [X1] = f0
  167. STFD [X2] = f0
  168. nop __LINE__
  169. }
  170. {.mmi
  171. add X1 = INCX, X1
  172. add X2 = INCX, X2
  173. nop __LINE__
  174. }
  175. ;;
  176. {.mmi
  177. STFD [X1] = f0
  178. STFD [X2] = f0
  179. nop __LINE__
  180. }
  181. {.mmi
  182. add X1 = INCX, X1
  183. add X2 = INCX, X2
  184. nop __LINE__
  185. }
  186. ;;
  187. {.mmi
  188. STFD [X1] = f0
  189. STFD [X2] = f0
  190. nop __LINE__
  191. }
  192. {.mmi
  193. add X1 = INCX, X1
  194. add X2 = INCX, X2
  195. nop __LINE__
  196. }
  197. ;;
  198. {.mmi
  199. STFD [X1] = f0
  200. STFD [X2] = f0
  201. nop __LINE__
  202. }
  203. {.mmb
  204. add X1 = INCX5, X1
  205. add X2 = INCX5, X2
  206. br.cloop.sptk.few .L20
  207. }
  208. ;;
  /* alpha == 0, tail: zero the N & 15 leftover elements.  The bits of N */
  /* select the groups: p12 (bit 3) = 8 elements, p13 (bit 2) = 4,       */
  /* p14 (bit 1) = 2, p15 (bit 0) = 1.  ar.lc is restored from ARLC.     */
  209. .align 16
  210. .L30:
  /* Group of 8: X1 writes 4 elements, X2 writes the 4 after them. */
  211. { .mmi
  212. (p12) STFD [X1] = f0
  213. (p12) STFD [X2] = f0
  214. mov ar.lc = ARLC
  215. }
  /* p9 (no leftover elements) returns here; p12 is clear in that case   */
  /* because N & 15 == 0, so the stores above did nothing.               */
  216. { .mmb
  217. (p12) add X1 = INCX, X1
  218. (p12) add X2 = INCX, X2
  219. (p9) br.ret.sptk.many b0
  220. }
  221. ;;
  222. { .mmi
  223. (p12) STFD [X1] = f0
  224. (p12) add X1 = INCX, X1
  225. tbit.z p0, p13 = N, 2
  226. }
  227. { .mmi
  228. (p12) STFD [X2] = f0
  229. (p12) add X2 = INCX, X2
  230. tbit.z p0, p14 = N, 1
  231. }
  232. ;;
  233. { .mmi
  234. (p12) STFD [X1] = f0
  235. (p12) add X1 = INCX, X1
  236. tbit.z p0, p15 = N, 0
  237. }
  238. { .mmb
  239. (p12) STFD [X2] = f0
  240. (p12) add X2 = INCX, X2
  241. nop __LINE__
  242. }
  243. ;;
  /* Last store of the group of 8; X1 then skips the 4 elements X2 wrote. */
  244. { .mmb
  245. (p12) STFD [X1] = f0
  246. (p12) add X1 = INCX5, X1
  247. nop __LINE__
  248. }
  249. { .mmb
  250. (p12) STFD [X2] = f0
  251. (p12) add X2 = INCX5, X2
  252. nop __LINE__
  253. }
  254. ;;
  /* Group of 4, through X1 only. */
  255. { .mmb
  256. (p13) STFD [X1] = f0
  257. (p13) add X1 = INCX, X1
  258. nop __LINE__
  259. }
  260. ;;
  261. { .mmb
  262. (p13) STFD [X1] = f0
  263. (p13) add X1 = INCX, X1
  264. nop __LINE__
  265. }
  266. ;;
  267. { .mmb
  268. (p13) STFD [X1] = f0
  269. (p13) add X1 = INCX, X1
  270. nop __LINE__
  271. }
  272. ;;
  273. { .mmb
  274. (p13) STFD [X1] = f0
  275. (p13) add X1 = INCX, X1
  276. nop __LINE__
  277. }
  278. ;;
  /* Group of 2. */
  279. { .mmb
  280. (p14) STFD [X1] = f0
  281. (p14) add X1 = INCX, X1
  282. nop __LINE__
  283. }
  284. ;;
  285. { .mmb
  286. (p14) STFD [X1] = f0
  287. (p14) add X1 = INCX, X1
  288. nop __LINE__
  289. }
  290. ;;
  /* Final single element, then return. */
  291. { .mmb
  292. (p15) STFD [X1] = f0
  293. nop __LINE__
  294. br.ret.sptk.many b0
  295. }
  296. ;;
  /* alpha != 0: set up the store pointers, finish the element that was  */
  /* peeled off under p10, and pick the unit-stride or strided path.     */
  297. .align 32
  298. .L100:
  /* Y1 = X1 and Y2 = X1 + 4 * INCX are the store pointers.  pr.rot = 0  */
  /* clears the rotating predicates before the pipelined loops.          */
  299. { .mmi
  300. mov Y1 = X1
  301. shladd Y2 = INCX, 2, X1
  302. mov pr.rot = 0
  303. }
  /* p8 = (0 > I): no full 16-element block.  f32 holds the element      */
  /* loaded under p10 during setup; it is scaled here.                   */
  304. { .mmf
  305. cmp.gt p8, p0 = 0, I
  306. shladd X2 = INCX, 2, X1
  307. (p10) FMPY f32 = ALPHA, f32
  308. }
  309. ;;
  /* The scaled element goes back to its original address (XX).          */
  /* p7 is the complement of (SIZE == INCX), i.e. set for non-unit       */
  /* stride.                                                             */
  310. { .mmi
  311. (p10) STFD [XX] = f32
  312. cmp.eq p0, p7 = SIZE, INCX
  313. mov ar.lc = I
  314. }
  /* r0 == r0 always holds, so p16 (first pipeline stage) is set.        */
  /* Non-unit stride goes to .L300; with unit stride and no full block   */
  /* the loop is skipped (.L120); otherwise fall through into .L110.     */
  315. { .mbb
  316. cmp.eq p16, p0 = r0, r0
  317. (p7) br.cond.dpnt .L300
  318. (p8) br.cond.dpnt .L120
  319. }
  320. ;;
  /* Unit-stride main loop, software-pipelined with br.ctop; 16 elements */
  /* per pass.  Stage p16 loads element pairs (LDFPD) and issues the     */
  /* prefetch; stage p20 multiplies by ALPHA.  Because the registers     */
  /* rotate by one on every br.ctop, a value loaded into fK under p16 is */
  /* read as f(K+4) under p20 (f32 -> f36, f37 -> f41, ...).             */
  /* The first 8 products (f112-f119) are stored in the same pass under  */
  /* p20.  The last 8 go to the registers f6, f7, f10-f15 and are        */
  /* stored under p21 at the top of the next pass.  Y1 receives elements */
  /* 0-3 and 8-11 of a block, Y2 elements 4-7 and 12-15; after 4 stores  */
  /* each pointer advances by 5 * SIZE to skip the other pointer's four. */
  321. .align 32
  322. .L110:
  /* p21: store the products the previous pass left in f6, f7, ... */
  323. { .mmf
  324. (p21) STFD [Y1] = f6, 1 * SIZE
  325. (p21) STFD [Y2] = f7, 1 * SIZE
  326. (p20) FMPY f112 = ALPHA, f36
  327. }
  328. { .mmf
  329. (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE
  330. (p16) LDFPD f32, f37 = [X1], 2 * SIZE
  331. (p20) FMPY f113 = ALPHA, f56
  332. }
  333. ;;
  334. { .mmf
  335. (p21) STFD [Y1] = f10, 1 * SIZE
  336. (p21) STFD [Y2] = f11, 1 * SIZE
  337. (p20) FMPY f114 = ALPHA, f41
  338. }
  339. { .mfi
  340. (p16) LDFPD f42, f47 = [X1], 2 * SIZE
  341. (p20) FMPY f115 = ALPHA, f61
  342. nop __LINE__
  343. }
  344. ;;
  345. { .mmf
  346. (p21) STFD [Y1] = f12, 1 * SIZE
  347. (p21) STFD [Y2] = f13, 1 * SIZE
  348. (p20) FMPY f116 = ALPHA, f46
  349. }
  350. { .mfi
  351. (p16) LDFPD f52, f57 = [X1], 2 * SIZE
  352. (p20) FMPY f117 = ALPHA, f66
  353. nop __LINE__
  354. }
  355. ;;
  356. { .mmf
  357. (p21) STFD [Y1] = f14, 5 * SIZE
  358. (p21) STFD [Y2] = f15, 5 * SIZE
  359. (p20) FMPY f118 = ALPHA, f51
  360. }
  361. { .mfi
  362. (p16) LDFPD f62, f67 = [X1], 2 * SIZE
  363. (p20) FMPY f119 = ALPHA, f71
  364. nop __LINE__
  365. }
  366. ;;
  /* Second half: store this pass's first 8 products while computing the */
  /* last 8 into f6, f7, f10-f15.                                        */
  367. { .mmf
  368. (p20) STFD [Y1] = f112, 1 * SIZE
  369. (p20) STFD [Y2] = f113, 1 * SIZE
  370. (p20) FMPY f6 = ALPHA, f76
  371. }
  372. { .mfi
  373. (p16) LDFPD f72, f77 = [X1], 2 * SIZE
  374. (p20) FMPY f7 = ALPHA, f96
  375. nop __LINE__
  376. }
  377. ;;
  378. { .mmf
  379. (p20) STFD [Y1] = f114, 1 * SIZE
  380. (p20) STFD [Y2] = f115, 1 * SIZE
  381. (p20) FMPY f10 = ALPHA, f81
  382. }
  383. { .mfi
  384. (p16) LDFPD f82, f87 = [X1], 2 * SIZE
  385. (p20) FMPY f11 = ALPHA, f101
  386. nop __LINE__
  387. }
  388. ;;
  389. { .mmf
  390. (p20) STFD [Y1] = f116, 1 * SIZE
  391. (p20) STFD [Y2] = f117, 1 * SIZE
  392. (p20) FMPY f12 = ALPHA, f86
  393. }
  /* X2 is recomputed as X1 + 4 * INCX on every p20 pass; the tail code  */
  /* at .L120 reads through X2 (presumably this keeps it in step with    */
  /* X1 for that purpose).                                               */
  394. { .mfi
  395. (p16) LDFPD f92, f97 = [X1], 2 * SIZE
  396. (p20) FMPY f13 = ALPHA, f106
  397. (p20) shladd X2 = INCX, 2, X1
  398. }
  399. ;;
  400. { .mmf
  401. (p20) STFD [Y1] = f118, 5 * SIZE
  402. (p20) STFD [Y2] = f119, 5 * SIZE
  403. (p20) FMPY f14 = ALPHA, f91
  404. }
  405. { .mfb
  406. (p16) LDFPD f102, f107 = [X1], 2 * SIZE
  407. (p20) FMPY f15 = ALPHA, f111
  408. br.ctop.sptk.few .L110
  409. }
  410. ;;
  /* Unit-stride tail.  The (p21) stores first write out the 8 products  */
  /* the last loop pass left in f6, f7, f10-f15; p21 is clear when the   */
  /* loop was skipped, because pr.rot was zeroed at .L100.  Interleaved  */
  /* with them are the loads of the N & 15 leftover elements: p12 = 8    */
  /* elements, p13 = 4, p14 = 2, p15 = 1.                                */
  411. .align 32
  412. .L120:
  413. { .mmi
  414. (p21) STFD [Y1] = f6, 1 * SIZE
  415. (p21) STFD [Y2] = f7, 1 * SIZE
  416. tbit.z p0, p13 = N, 2
  417. }
  /* Group of 8: four elements through X1 (f32-f35), four through X2     */
  /* (f36-f39).                                                          */
  418. { .mmi
  419. (p12) LDFPD f32, f33 = [X1], 2 * SIZE
  420. (p12) LDFPD f36, f37 = [X2], 2 * SIZE
  421. nop __LINE__
  422. }
  423. ;;
  424. { .mmi
  425. (p21) STFD [Y1] = f10, 1 * SIZE
  426. (p21) STFD [Y2] = f11, 1 * SIZE
  427. mov ar.lc = ARLC
  428. }
  /* X1 has already moved 2 elements; adding 6 more puts it past all 8. */
  429. { .mmi
  430. (p12) LDFPD f34, f35 = [X1]
  431. (p12) LDFPD f38, f39 = [X2]
  432. (p12) adds X1 = 6 * SIZE,X1
  433. }
  434. ;;
  435. { .mmi
  436. (p21) STFD [Y1] = f12, 1 * SIZE
  437. (p21) STFD [Y2] = f13, 1 * SIZE
  438. tbit.z p0, p14 = N, 1
  439. }
  440. { .mmi
  441. (p13) LDFPD f40, f41 = [X1], 2 * SIZE
  442. nop __LINE__
  443. nop __LINE__
  444. }
  445. ;;
  /* The mask -65474 has bits 1-5 and 16 and above set, so this restores */
  /* p1-p5 and the rotating predicates from PR and leaves p6-p15, which  */
  /* the code below still uses, unchanged.                               */
  446. { .mmi
  447. (p21) STFD [Y1] = f14, 5 * SIZE
  448. (p21) STFD [Y2] = f15, 5 * SIZE
  449. mov pr = PR, -65474
  450. }
  /* p9: N & 15 == 0, nothing left to do. */
  451. { .mib
  452. (p13) LDFPD f42, f43 = [X1], 2 * SIZE
  453. nop __LINE__
  454. (p9) br.ret.sptk.many b0
  455. }
  456. ;;
  457. { .mmi
  458. (p14) LDFPD f44, f45 = [X1], 2 * SIZE
  459. nop __LINE__
  460. tbit.z p0, p15 = N, 0
  461. }
  462. ;;
  463. { .mmi
  464. (p15) LDFD f46 = [X1]
  465. nop __LINE__
  466. nop __LINE__
  467. }
  468. ;;
  /* Scale the group of 8 in place in its registers. */
  469. { .mmf
  470. nop __LINE__
  471. nop __LINE__
  472. (p12) FMPY f32 = ALPHA, f32
  473. }
  474. { .mmf
  475. nop __LINE__
  476. nop __LINE__
  477. (p12) FMPY f36 = ALPHA, f36
  478. }
  479. ;;
  480. { .mmf
  481. nop __LINE__
  482. nop __LINE__
  483. (p12) FMPY f33 = ALPHA, f33
  484. }
  485. { .mmf
  486. nop __LINE__
  487. nop __LINE__
  488. (p12) FMPY f37 = ALPHA, f37
  489. }
  490. ;;
  491. { .mmf
  492. nop __LINE__
  493. nop __LINE__
  494. (p12) FMPY f34 = ALPHA, f34
  495. }
  496. { .mmf
  497. nop __LINE__
  498. nop __LINE__
  499. (p12) FMPY f38 = ALPHA, f38
  500. }
  501. ;;
  502. { .mmf
  503. nop __LINE__
  504. nop __LINE__
  505. (p12) FMPY f35 = ALPHA, f35
  506. }
  507. { .mmf
  508. nop __LINE__
  509. nop __LINE__
  510. (p12) FMPY f39 = ALPHA, f39
  511. }
  512. ;;
  /* Store the group of 8 while scaling the smaller groups. */
  513. { .mmf
  514. (p12) STFD [Y1] = f32, 1 * SIZE
  515. nop __LINE__
  516. (p13) FMPY f40 = ALPHA, f40
  517. }
  518. { .mmf
  519. (p12) STFD [Y2] = f36, 1 * SIZE
  520. nop __LINE__
  521. (p13) FMPY f41 = ALPHA, f41
  522. }
  523. ;;
  524. { .mmf
  525. (p12) STFD [Y1] = f33, 1 * SIZE
  526. nop __LINE__
  527. (p13) FMPY f42 = ALPHA, f42
  528. }
  529. { .mmf
  530. (p12) STFD [Y2] = f37, 1 * SIZE
  531. nop __LINE__
  532. (p13) FMPY f43 = ALPHA, f43
  533. }
  534. ;;
  535. { .mmf
  536. (p12) STFD [Y1] = f34, 1 * SIZE
  537. nop __LINE__
  538. (p14) FMPY f44 = ALPHA, f44
  539. }
  540. { .mmf
  541. (p12) STFD [Y2] = f38, 1 * SIZE
  542. nop __LINE__
  543. (p14) FMPY f45 = ALPHA, f45
  544. }
  545. ;;
  /* After its fourth store Y1 skips the 4 elements written through Y2. */
  546. { .mmf
  547. (p12) STFD [Y1] = f35, 5 * SIZE
  548. (p12) STFD [Y2] = f39, 5 * SIZE
  549. (p15) FMPY f46 = ALPHA, f46
  550. }
  551. ;;
  /* Remaining groups (4, 2, 1) are stored through Y1 only.  Each pair   */
  /* of stores is split by a stop because both post-increment Y1.        */
  552. { .mmi
  553. (p13) STFD [Y1] = f40, 1 * SIZE
  554. ;;
  555. (p13) STFD [Y1] = f41, 1 * SIZE
  556. nop __LINE__
  557. }
  558. ;;
  559. { .mmi
  560. (p13) STFD [Y1] = f42, 1 * SIZE
  561. ;;
  562. (p13) STFD [Y1] = f43, 1 * SIZE
  563. nop __LINE__
  564. }
  565. ;;
  566. { .mmi
  567. (p14) STFD [Y1] = f44, 1 * SIZE
  568. ;;
  569. (p14) STFD [Y1] = f45, 1 * SIZE
  570. nop __LINE__
  571. }
  572. ;;
  573. { .mmb
  574. (p15) STFD [Y1] = f46
  575. nop __LINE__
  576. br.ret.sptk.many b0
  577. }
  578. ;;
  /* alpha != 0, non-unit stride: setup for the strided loop.  PRE1 is   */
  /* recomputed relative to X1, and ar.ec is raised to 6 because the     */
  /* loop at .L310 uses stages p16 through p21.                          */
  579. .align 32
  580. .L300:
  581. { .mmi
  582. adds PRE1 = PREFETCH_SIZE * SIZE + 64, X1
  583. nop __LINE__
  584. mov.i ar.ec = 6
  585. }
  /* p8 = (0 > I): no full 16-element block, go straight to the tail. */
  586. { .mmb
  587. cmp.gt p8, p0 = 0, I
  588. nop __LINE__
  589. (p8) br.cond.dpnt .L320
  590. }
  591. ;;
  /* Strided main loop, software-pipelined with br.ctop; 16 elements per */
  /* pass, one LDFD per element with X1 advancing by INCX.  Stage p16    */
  /* loads; stage p21 multiplies and stores.  With one register rotation */
  /* per br.ctop, a value loaded into fK under p16 is read as f(K+5)     */
  /* under p21 (f32 -> f37, f38 -> f43, ...).                            */
  /* Products cycle through f6, f7, f10-f15: each is stored through Y1   */
  /* four instruction groups after it is computed.  The last four        */
  /* products of a pass (f12-f15) are stored under p22 at the top of the */
  /* following pass.                                                     */
  592. .align 32
  593. .L310:
  594. { .mmf
  595. (p16) lfetch.excl.nt1 [PRE1], INCX16
  596. (p16) LDFD f32 = [X1], INCX
  597. (p21) FMPY f6 = ALPHA, f37
  598. }
  /* p22: elements 12-15 of the previous pass. */
  599. { .mmb
  600. (p22) STFD [Y1] = f12
  601. (p22) add Y1 = INCX, Y1
  602. nop __LINE__
  603. }
  604. ;;
  605. { .mfb
  606. (p16) LDFD f38 = [X1], INCX
  607. (p21) FMPY f7 = ALPHA, f43
  608. nop __LINE__
  609. }
  610. { .mmb
  611. (p22) STFD [Y1] = f13
  612. (p22) add Y1 = INCX, Y1
  613. nop __LINE__
  614. }
  615. ;;
  616. { .mfb
  617. (p16) LDFD f44 = [X1], INCX
  618. (p21) FMPY f10 = ALPHA, f49
  619. nop __LINE__
  620. }
  621. { .mmb
  622. (p22) STFD [Y1] = f14
  623. (p22) add Y1 = INCX, Y1
  624. nop __LINE__
  625. }
  626. ;;
  627. { .mfb
  628. (p16) LDFD f50 = [X1], INCX
  629. (p21) FMPY f11 = ALPHA, f55
  630. nop __LINE__
  631. }
  632. { .mmb
  633. (p22) STFD [Y1] = f15
  634. (p22) add Y1 = INCX, Y1
  635. nop __LINE__
  636. }
  637. ;;
  /* From here the stores are for the current pass (p21). */
  638. { .mfb
  639. (p16) LDFD f56 = [X1], INCX
  640. (p21) FMPY f12 = ALPHA, f61
  641. nop __LINE__
  642. }
  643. { .mmb
  644. (p21) STFD [Y1] = f6
  645. (p21) add Y1 = INCX, Y1
  646. nop __LINE__
  647. }
  648. ;;
  649. { .mfb
  650. (p16) LDFD f62 = [X1], INCX
  651. (p21) FMPY f13 = ALPHA, f67
  652. nop __LINE__
  653. }
  654. { .mmb
  655. (p21) STFD [Y1] = f7
  656. (p21) add Y1 = INCX, Y1
  657. nop __LINE__
  658. }
  659. ;;
  660. { .mfb
  661. (p16) LDFD f68 = [X1], INCX
  662. (p21) FMPY f14 = ALPHA, f73
  663. nop __LINE__
  664. }
  665. { .mmb
  666. (p21) STFD [Y1] = f10
  667. (p21) add Y1 = INCX, Y1
  668. nop __LINE__
  669. }
  670. ;;
  671. { .mfb
  672. (p16) LDFD f74 = [X1], INCX
  673. (p21) FMPY f15 = ALPHA, f79
  674. nop __LINE__
  675. }
  676. { .mmb
  677. (p21) STFD [Y1] = f11
  678. (p21) add Y1 = INCX, Y1
  679. nop __LINE__
  680. }
  681. ;;
  682. { .mfb
  683. (p16) LDFD f80 = [X1], INCX
  684. (p21) FMPY f6 = ALPHA, f85
  685. nop __LINE__
  686. }
  687. { .mmb
  688. (p21) STFD [Y1] = f12
  689. (p21) add Y1 = INCX, Y1
  690. nop __LINE__
  691. }
  692. ;;
  693. { .mfb
  694. (p16) LDFD f86 = [X1], INCX
  695. (p21) FMPY f7 = ALPHA, f91
  696. nop __LINE__
  697. }
  698. { .mmb
  699. (p21) STFD [Y1] = f13
  700. (p21) add Y1 = INCX, Y1
  701. nop __LINE__
  702. }
  703. ;;
  704. { .mfb
  705. (p16) LDFD f92 = [X1], INCX
  706. (p21) FMPY f10 = ALPHA, f97
  707. nop __LINE__
  708. }
  709. { .mmb
  710. (p21) STFD [Y1] = f14
  711. (p21) add Y1 = INCX, Y1
  712. nop __LINE__
  713. }
  714. ;;
  715. { .mfb
  716. (p16) LDFD f98 = [X1], INCX
  717. (p21) FMPY f11 = ALPHA, f103
  718. nop __LINE__
  719. }
  720. { .mmb
  721. (p21) STFD [Y1] = f15
  722. (p21) add Y1 = INCX, Y1
  723. nop __LINE__
  724. }
  725. ;;
  726. { .mfb
  727. (p16) LDFD f104 = [X1], INCX
  728. (p21) FMPY f12 = ALPHA, f109
  729. nop __LINE__
  730. }
  731. { .mmb
  732. (p21) STFD [Y1] = f6
  733. (p21) add Y1 = INCX, Y1
  734. nop __LINE__
  735. }
  736. ;;
  737. { .mfb
  738. (p16) LDFD f110 = [X1], INCX
  739. (p21) FMPY f13 = ALPHA, f115
  740. nop __LINE__
  741. }
  742. { .mmb
  743. (p21) STFD [Y1] = f7
  744. (p21) add Y1 = INCX, Y1
  745. nop __LINE__
  746. }
  747. ;;
  748. { .mfb
  749. (p16) LDFD f116 = [X1], INCX
  750. (p21) FMPY f14 = ALPHA, f121
  751. nop __LINE__
  752. }
  753. { .mmb
  754. (p21) STFD [Y1] = f10
  755. (p21) add Y1 = INCX, Y1
  756. nop __LINE__
  757. }
  758. ;;
  759. { .mfb
  760. (p16) LDFD f122 = [X1], INCX
  761. (p21) FMPY f15 = ALPHA, f127
  762. nop __LINE__
  763. }
  764. { .mmb
  765. (p21) STFD [Y1] = f11
  766. (p21) add Y1 = INCX, Y1
  767. br.ctop.sptk.few .L310
  768. }
  769. ;;
  /* Manual drain.  The last loop pass leaves its final four products in */
  /* f12-f15, and inside the loop those are only stored under p22 at the */
  /* top of a further pass, so they are written out unconditionally      */
  /* here.  Y2 and X2 are reset to X1 + 4 * INCX for the tail code.      */
  770. STFD [Y1] = f12
  771. add Y1 = INCX, Y1
  772. shladd Y2 = INCX, 2, X1
  773. ;;
  774. STFD [Y1] = f13
  775. add Y1 = INCX, Y1
  776. shladd X2 = INCX, 2, X1
  777. ;;
  778. STFD [Y1] = f14
  779. add Y1 = INCX, Y1
  780. ;;
  781. STFD [Y1] = f15
  782. add Y1 = INCX, Y1
  783. ;;
  /* Strided tail: scale the N & 15 leftover elements.  p12 = group of 8 */
  /* (four read through X1 into f48-f51, four through X2 into f52-f55),  */
  /* p13 = 4 (f56-f59), p14 = 2 (f60, f61), p15 = 1 (f62).  Results are  */
  /* stored through Y1, and through Y2 for the second half of the group  */
  /* of 8.                                                               */
  784. .align 16
  785. .L320:
  786. { .mmi
  787. (p12) LDFD f48 = [X1], INCX
  788. (p12) LDFD f52 = [X2], INCX
  789. mov ar.lc = ARLC
  790. }
  791. ;;
  /* The mask -65474 has bits 1-5 and 16 and above set, so this restores */
  /* p1-p5 and the rotating predicates from PR and leaves p6-p15, which  */
  /* the code below still uses, unchanged.                               */
  792. { .mmi
  793. (p12) LDFD f49 = [X1], INCX
  794. (p12) LDFD f53 = [X2], INCX
  795. mov pr = PR, -65474
  796. }
  /* p9: N & 15 == 0, nothing left to do (p12 is clear in that case). */
  797. { .mmb
  798. nop.m 0
  799. nop.m 0
  800. (p9) br.ret.sptk.many b0
  801. }
  802. ;;
  803. { .mmi
  804. (p12) LDFD f50 = [X1], INCX
  805. (p12) LDFD f54 = [X2], INCX
  806. tbit.z p0, p13 = N, 2
  807. }
  808. ;;
  /* Fourth load of each half; X1 then skips the 4 elements X2 covered. */
  809. { .mmi
  810. (p12) LDFD f51 = [X1], INCX5
  811. (p12) LDFD f55 = [X2], INCX5
  812. tbit.z p0, p14 = N, 1
  813. }
  814. ;;
  815. (p13) LDFD f56 = [X1], INCX
  816. tbit.z p0, p15 = N, 0
  817. ;;
  818. (p13) LDFD f57 = [X1], INCX
  819. ;;
  /* Remaining loads overlap with scaling the group of 8. */
  820. { .mfi
  821. (p13) LDFD f58 = [X1], INCX
  822. (p12) FMPY f48 = ALPHA, f48
  823. }
  824. { .mfi
  825. (p12) FMPY f52 = ALPHA, f52
  826. }
  827. ;;
  828. { .mfi
  829. (p13) LDFD f59 = [X1], INCX
  830. (p12) FMPY f49 = ALPHA, f49
  831. }
  832. { .mfi
  833. (p12) FMPY f53 = ALPHA, f53
  834. }
  835. ;;
  836. { .mfi
  837. (p14) LDFD f60 = [X1], INCX
  838. (p12) FMPY f50 = ALPHA, f50
  839. }
  840. { .mfi
  841. (p12) FMPY f54 = ALPHA, f54
  842. }
  843. ;;
  844. { .mfi
  845. (p14) LDFD f61 = [X1], INCX
  846. (p12) FMPY f51 = ALPHA, f51
  847. }
  848. { .mfi
  849. (p12) FMPY f55 = ALPHA, f55
  850. }
  851. ;;
  /* Store the group of 8 while scaling the group of 4. */
  852. { .mmf
  853. (p12) STFD [Y1] = f48
  854. (p12) STFD [Y2] = f52
  855. (p13) FMPY f56 = ALPHA, f56
  856. }
  857. { .mmi
  858. (p15) LDFD f62 = [X1]
  859. (p12) add Y1 = INCX, Y1
  860. (p12) add Y2 = INCX, Y2
  861. }
  862. ;;
  863. { .mmf
  864. (p12) STFD [Y1] = f49
  865. (p12) STFD [Y2] = f53
  866. (p13) FMPY f57 = ALPHA, f57
  867. }
  868. { .mmi
  869. (p12) add Y1 = INCX, Y1
  870. (p12) add Y2 = INCX, Y2
  871. nop __LINE__
  872. }
  873. ;;
  874. { .mmf
  875. (p12) STFD [Y1] = f50
  876. (p12) STFD [Y2] = f54
  877. (p13) FMPY f58 = ALPHA, f58
  878. }
  879. { .mmi
  880. (p12) add Y1 = INCX, Y1
  881. (p12) add Y2 = INCX, Y2
  882. nop __LINE__
  883. }
  884. ;;
  885. { .mmf
  886. (p12) STFD [Y1] = f51
  887. (p12) STFD [Y2] = f55
  888. (p13) FMPY f59 = ALPHA, f59
  889. }
  /* Y1 skips the 4 elements that were written through Y2. */
  890. { .mmi
  891. (p12) add Y1 = INCX5, Y1
  892. (p12) add Y2 = INCX5, Y2
  893. nop __LINE__
  894. }
  895. ;;
  /* Groups of 4, 2 and 1 are stored through Y1 only. */
  896. { .mfi
  897. (p13) STFD [Y1] = f56
  898. (p14) FMPY f60 = ALPHA, f60
  899. (p13) add Y1 = INCX, Y1
  900. }
  901. ;;
  902. { .mfi
  903. (p13) STFD [Y1] = f57
  904. (p14) FMPY f61 = ALPHA, f61
  905. (p13) add Y1 = INCX, Y1
  906. }
  907. ;;
  908. { .mfi
  909. (p13) STFD [Y1] = f58
  910. (p15) FMPY f62 = ALPHA, f62
  911. (p13) add Y1 = INCX, Y1
  912. }
  913. ;;
  914. { .mmi
  915. (p13) STFD [Y1] = f59
  916. (p13) add Y1 = INCX, Y1
  917. }
  918. ;;
  919. { .mmi
  920. (p14) STFD [Y1] = f60
  921. (p14) add Y1 = INCX, Y1
  922. }
  923. ;;
  924. { .mmi
  925. (p14) STFD [Y1] = f61
  926. (p14) add Y1 = INCX, Y1
  927. }
  928. ;;
  /* The predicates are restored from PR a second time here; the same    */
  /* restore was already executed near the top of this tail.             */
  929. { .mib
  930. (p15) STFD [Y1] = f62
  931. mov pr = PR, -65474
  932. br.ret.sptk.many b0
  933. }
  934. EPILOGUE