You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qscal.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance: the lfetch pointer PRE1 is started at            */
  /* X1 + (PREFETCH_SIZE + 4) * SIZE (SIZE comes from common.h).         */
  40. #define PREFETCH_SIZE (16 * 16)
  /* Register aliases. Roles below are the ones the code in this file    */
  /* gives them.                                                         */
  41. #define ALPHA f8 /* scale factor: compared with f0, used as FMPY multiplier */
  42. #define N r32 /* element count; routine returns at once when N <= 0 */
  43. #define X1 r38 /* vector pointer (load pointer; store pointer when ALPHA == 0) */
  44. #define INCX r39 /* element stride; shifted left by BASE_SHIFT on entry */
  45. #define X2 r14 /* second vector pointer, X1 + 4 * INCX */
  46. #define Y1 r15 /* store pointer for the ALPHA != 0 path, starts at X1 */
  47. #define Y2 r16 /* second store pointer, 4 * INCX past X1 */
  48. #define PRE1 r17 /* address operand of the lfetch prefetches */
  49. #define I r18 /* (N >> 4) - 1, loaded into ar.lc */
  50. #define NAND15 r19 /* N & 15, the remainder after the 16-element loops */
  51. #define INCX5 r20 /* 5 * INCX */
  52. #define INCX8 r21 /* 8 * INCX, post-increment of the prefetch pointer */
  53. #define XX r22 /* copy of X1 taken on entry; not read again in this file */
  54. #define PR r30 /* saved copy of the predicate registers */
  55. #define ARLC r31 /* saved copy of ar.lc */
  /* Entry and common setup.                                             */
  /* Scales the INCX-strided vector at X1 in place by ALPHA. When ALPHA  */
  /* compares equal to zero the elements are overwritten with f0 instead */
  /* of being multiplied (code from here through .L30); otherwise control */
  /* goes to .L100.                                                      */
  /* PROLOGUE and PROFCODE are macros from common.h.                     */
  56. PROLOGUE
  57. .prologue
  58. PROFCODE
  59. { .mfi
  60. shladd INCX = INCX, BASE_SHIFT, r0 /* INCX <<= BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT is from common.h) */
  61. fcmp.eq p0, p6 = ALPHA, f0 /* p6 = !(ALPHA == 0.0) */
  62. .save ar.lc, ARLC
  63. mov ARLC = ar.lc /* keep the caller's loop count; restored at .L30 / .L320 */
  64. }
  65. .body
  66. { .mib
  67. cmp.ge p7, p0 = 0, N /* p7 = (N <= 0) */
  68. (p7) br.ret.sptk.many b0 /* nothing to do */
  69. }
  70. ;;
  71. { .mmi
  72. mov XX = X1
  73. mov PR = pr /* snapshot predicates; the ALPHA != 0 path restores from this */
  74. }
  75. { .mmi
  76. shladd INCX5 = INCX, 2, INCX /* INCX5 = 4 * INCX + INCX */
  77. shladd INCX8 = INCX, 3, r0 /* INCX8 = 8 * INCX */
  78. }
  79. ;;
  80. { .mmi
  81. shladd X2 = INCX, 2, X1 /* X2 = X1 + 4 * INCX */
  82. nop.m 0
  83. mov ar.ec = 5
  84. }
  85. { .mmi
  86. and NAND15 = 15, N /* elements left over after whole groups of 16 */
  87. nop.m 0
  88. shr I = N, 4 /* number of whole groups of 16 */
  89. }
  90. ;;
  91. { .mmi
  92. adds I = -1, I /* counted loops run ar.lc + 1 times, so use groups - 1 */
  93. nop.m 0
  94. tbit.z p0, p12 = N, 3 /* p12 = bit 3 of N set: a block of 8 remains */
  95. }
  96. { .mmb
  97. cmp.ge p9, p0 = 0, NAND15 /* p9 = (N & 15) == 0: no remainder at all */
  98. nop.m 0
  99. (p6) br.cond.dptk .L100 // if (alpha != 0) goto .L100
  100. }
  101. ;;
  /* ALPHA == 0 from here on. */
  102. { .mmi
  103. adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 /* prefetch pointer runs ahead of X1 */
  104. mov ar.lc = I
  105. }
  106. { .mmb
  107. cmp.gt p8, p0 = 0, I /* p8 = (I < 0), i.e. fewer than 16 elements */
  108. (p8) br.cond.dpnt .L30 /* skip the 16-wide loop, handle the remainder only */
  109. }
  110. ;;
  /* .L20: main loop of the ALPHA == 0 path.                             */
  /* Each pass stores f0 to 16 elements using two pointers that are      */
  /* 4 * INCX apart: X1 writes elements 0-3 and 8-11 of the group, X2    */
  /* writes elements 4-7 and 12-15. After every fourth store both        */
  /* pointers advance by INCX5 (1 + 4 elements) to hop over the four     */
  /* elements the other pointer has just written. Two lfetch.excl        */
  /* prefetches per pass move PRE1 forward by 8 * INCX each.             */
  /* br.cloop repeats the body ar.lc + 1 times.                          */
  111. .align 32
  112. .L20:
  113. {.mmi
  114. STFD [X1] = f0
  115. STFD [X2] = f0
  116. nop.i 0
  117. }
  118. {.mmi
  119. lfetch.excl.nt1 [PRE1], INCX8
  120. add X1 = INCX, X1
  121. add X2 = INCX, X2
  122. }
  123. ;;
  124. {.mmi
  125. STFD [X1] = f0
  126. STFD [X2] = f0
  127. nop.i 0
  128. }
  129. {.mmi
  130. add X1 = INCX, X1
  131. add X2 = INCX, X2
  132. nop.i 0
  133. }
  134. ;;
  135. {.mmi
  136. STFD [X1] = f0
  137. STFD [X2] = f0
  138. nop.i 0
  139. }
  140. {.mmi
  141. add X1 = INCX, X1
  142. add X2 = INCX, X2
  143. nop.i 0
  144. }
  145. ;;
  146. {.mmi
  147. STFD [X1] = f0
  148. STFD [X2] = f0
  149. nop.i 0
  150. }
  151. {.mmi
  152. add X1 = INCX5, X1 /* skip the 4 elements X2 just wrote */
  153. add X2 = INCX5, X2
  154. nop.i 0
  155. }
  156. ;;
  /* Second half of the group: elements 8-11 (X1) and 12-15 (X2). */
  157. {.mmi
  158. STFD [X1] = f0
  159. STFD [X2] = f0
  160. nop.i 0
  161. }
  162. {.mmi
  163. lfetch.excl.nt1 [PRE1], INCX8
  164. add X1 = INCX, X1
  165. add X2 = INCX, X2
  166. }
  167. ;;
  168. {.mmi
  169. STFD [X1] = f0
  170. STFD [X2] = f0
  171. nop.i 0
  172. }
  173. {.mmi
  174. add X1 = INCX, X1
  175. add X2 = INCX, X2
  176. nop.i 0
  177. }
  178. ;;
  179. {.mmi
  180. STFD [X1] = f0
  181. STFD [X2] = f0
  182. nop.i 0
  183. }
  184. {.mmi
  185. add X1 = INCX, X1
  186. add X2 = INCX, X2
  187. nop.i 0
  188. }
  189. ;;
  190. {.mmi
  191. STFD [X1] = f0
  192. STFD [X2] = f0
  193. nop.i 0
  194. }
  195. {.mmb
  196. add X1 = INCX5, X1 /* both pointers end 16 * INCX past where the pass began */
  197. add X2 = INCX5, X2
  198. br.cloop.sptk.few .L20
  199. }
  200. ;;
  /* .L30: remainder of the ALPHA == 0 path (N & 15 elements).           */
  /* The remainder is decomposed by the bits of N:                       */
  /*   p12 (bit 3) -> 8 elements, written 4 via X1 and 4 via X2          */
  /*   p13 (bit 2) -> 4 elements via X1                                  */
  /*   p14 (bit 1) -> 2 elements via X1                                  */
  /*   p15 (bit 0) -> 1 element  via X1                                  */
  /* p12 and p9 were set before entry; p13-p15 are derived here.         */
  201. .align 16
  202. .L30:
  203. { .mmi
  204. (p12) STFD [X1] = f0
  205. (p12) STFD [X2] = f0
  206. mov ar.lc = ARLC /* restore the caller's loop count */
  207. }
  208. { .mmb
  209. (p12) add X1 = INCX, X1
  210. (p12) add X2 = INCX, X2
  211. (p9) br.ret.sptk.many b0 /* (N & 15) == 0: done; p12 is necessarily clear in that case */
  212. }
  213. ;;
  214. { .mmi
  215. (p12) STFD [X1] = f0
  216. (p12) add X1 = INCX, X1
  217. tbit.z p0, p13 = N, 2 /* p13 = bit 2 of N set */
  218. }
  219. { .mmi
  220. (p12) STFD [X2] = f0
  221. (p12) add X2 = INCX, X2
  222. tbit.z p0, p14 = N, 1 /* p14 = bit 1 of N set */
  223. }
  224. ;;
  225. { .mmi
  226. (p12) STFD [X1] = f0
  227. (p12) add X1 = INCX, X1
  228. tbit.z p0, p15 = N, 0 /* p15 = bit 0 of N set */
  229. }
  230. { .mmb
  231. (p12) STFD [X2] = f0
  232. (p12) add X2 = INCX, X2
  233. nop __LINE__
  234. }
  235. ;;
  236. { .mmb
  237. (p12) STFD [X1] = f0
  238. (p12) add X1 = INCX5, X1 /* X1 now sits just past the 8-element block */
  239. nop __LINE__
  240. }
  241. { .mmb
  242. (p12) STFD [X2] = f0
  243. (p12) add X2 = INCX5, X2
  244. nop __LINE__
  245. }
  246. ;;
  /* From here on only X1 is used; one element per instruction group. */
  247. { .mmb
  248. (p13) STFD [X1] = f0
  249. (p13) add X1 = INCX, X1
  250. nop __LINE__
  251. }
  252. ;;
  253. { .mmb
  254. (p13) STFD [X1] = f0
  255. (p13) add X1 = INCX, X1
  256. nop __LINE__
  257. }
  258. ;;
  259. { .mmb
  260. (p13) STFD [X1] = f0
  261. (p13) add X1 = INCX, X1
  262. nop __LINE__
  263. }
  264. ;;
  265. { .mmb
  266. (p13) STFD [X1] = f0
  267. (p13) add X1 = INCX, X1
  268. nop __LINE__
  269. }
  270. ;;
  271. { .mmb
  272. (p14) STFD [X1] = f0
  273. (p14) add X1 = INCX, X1
  274. nop __LINE__
  275. }
  276. ;;
  277. { .mmb
  278. (p14) STFD [X1] = f0
  279. (p14) add X1 = INCX, X1
  280. nop __LINE__
  281. }
  282. ;;
  283. { .mmb
  284. (p15) STFD [X1] = f0
  285. nop.m 0
  286. br.ret.sptk.many b0
  287. }
  288. ;;
  /* .L100: setup for the ALPHA != 0 path.                               */
  /* Prepares the store pointers and the rotating-predicate state for    */
  /* the software-pipelined loop at .L310, or branches straight to the   */
  /* remainder code at .L320 when there is no whole group of 16.         */
  289. .align 32
  290. .L100:
  291. { .mmi
  292. mov Y1 = X1 /* results are written back over the input */
  293. shladd Y2 = INCX, 2, X1 /* Y2 = X1 + 4 * INCX */
  294. mov pr.rot= 0 /* clear all rotating predicates */
  295. }
  296. ;;
  297. { .mmi
  298. mov ar.lc = I
  299. }
  300. cmp.eq p16, p0 = r0, r0 /* p16 = 1: enable the first pipeline stage */
  301. ;;
  302. { .mmi
  303. adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 /* prefetch pointer runs ahead of X1 */
  304. nop.m 0
  305. mov.i ar.ec = 6 /* epilog count for br.ctop at the end of .L310 */
  306. }
  307. { .mmb
  308. cmp.gt p8, p0 = 0, I /* p8 = (I < 0), i.e. fewer than 16 elements */
  309. nop.m 0
  310. (p8) br.cond.dpnt .L320
  311. }
  312. ;;
  /* .L310: software-pipelined main loop of the ALPHA != 0 path.         */
  /* One pass handles a group of 16 elements; br.ctop rotates the        */
  /* registers and stage predicates after every pass.                    */
  /*   (p16) load stage: 16 LDFDs through X1 into f32, f38, ..., f122    */
  /*         (register numbers 6 apart) plus two lfetch prefetches.      */
  /*   (p21) compute stage, five rotations later: the same values are    */
  /*         now visible as f37, f43, ..., f127. Each is multiplied by   */
  /*         ALPHA into one of the eight temporaries f6, f7, f10-f15,    */
  /*         and products 0-11 of the group are stored through Y1.       */
  /*   (p22) the last four products of a group (f12-f15) are stored at   */
  /*         the top of the following pass.                              */
  /* When the loop falls through, those last four products are stored    */
  /* unconditionally by the drain code after the br.ctop.                */
  /* The temporaries are reused while earlier products are still         */
  /* waiting to be stored, so the order of FMPY and STFD must be kept.   */
  313. .align 32
  314. .L310:
  315. { .mmf
  316. (p16) lfetch.excl.nt1 [PRE1], INCX8
  317. (p22) STFD [Y1] = f12 /* product 12 of the previous group */
  318. (p21) FMPY f6 = ALPHA, f37
  319. }
  320. { .mmi
  321. (p16) LDFD f32 = [X1], INCX
  322. nop __LINE__
  323. (p22) add Y1 = INCX, Y1
  324. }
  325. ;;
  326. { .mmf
  327. (p22) STFD [Y1] = f13
  328. (p16) LDFD f38 = [X1], INCX
  329. (p21) FMPY f7 = ALPHA, f43
  330. }
  331. { .mmi
  332. nop __LINE__
  333. nop __LINE__
  334. (p22) add Y1 = INCX, Y1
  335. }
  336. ;;
  337. { .mmf
  338. (p22) STFD [Y1] = f14
  339. (p16) LDFD f44 = [X1], INCX
  340. (p21) FMPY f10 = ALPHA, f49
  341. }
  342. { .mmi
  343. nop __LINE__
  344. nop __LINE__
  345. (p22) add Y1 = INCX, Y1
  346. }
  347. ;;
  348. { .mmf
  349. (p22) STFD [Y1] = f15 /* last product of the previous group; f12-f15 are free again */
  350. (p16) LDFD f50 = [X1], INCX
  351. (p21) FMPY f11 = ALPHA, f55
  352. }
  353. { .mmi
  354. nop __LINE__
  355. nop __LINE__
  356. (p22) add Y1 = INCX, Y1
  357. }
  358. ;;
  359. { .mmf
  360. (p21) STFD [Y1] = f6 /* product 0 of the current group */
  361. (p16) LDFD f56 = [X1], INCX
  362. (p21) FMPY f12 = ALPHA, f61
  363. }
  364. { .mmi
  365. nop __LINE__
  366. nop __LINE__
  367. (p21) add Y1 = INCX, Y1
  368. }
  369. ;;
  370. { .mmf
  371. (p16) lfetch.excl.nt1 [PRE1], INCX8
  372. (p21) STFD [Y1] = f7
  373. (p21) FMPY f13 = ALPHA, f67
  374. }
  375. { .mmi
  376. (p16) LDFD f62 = [X1], INCX
  377. nop __LINE__
  378. (p21) add Y1 = INCX, Y1
  379. }
  380. ;;
  381. { .mmf
  382. (p21) STFD [Y1] = f10
  383. (p16) LDFD f68 = [X1], INCX
  384. (p21) FMPY f14 = ALPHA, f73
  385. }
  386. { .mmi
  387. nop __LINE__
  388. nop __LINE__
  389. (p21) add Y1 = INCX, Y1
  390. }
  391. ;;
  392. { .mmf
  393. (p21) STFD [Y1] = f11
  394. (p16) LDFD f74 = [X1], INCX
  395. (p21) FMPY f15 = ALPHA, f79
  396. }
  397. { .mmi
  398. nop __LINE__
  399. nop __LINE__
  400. (p21) add Y1 = INCX, Y1
  401. }
  402. ;;
  403. { .mmf
  404. (p21) STFD [Y1] = f12 /* product 4 */
  405. (p16) LDFD f80 = [X1], INCX
  406. (p21) FMPY f6 = ALPHA, f85 /* f6 is reused: product 0 was stored earlier in this pass */
  407. }
  408. { .mmi
  409. nop __LINE__
  410. nop __LINE__
  411. (p21) add Y1 = INCX, Y1
  412. }
  413. ;;
  414. { .mmf
  415. (p21) STFD [Y1] = f13
  416. (p16) LDFD f86 = [X1], INCX
  417. (p21) FMPY f7 = ALPHA, f91
  418. }
  419. { .mmi
  420. nop __LINE__
  421. nop __LINE__
  422. (p21) add Y1 = INCX, Y1
  423. }
  424. ;;
  425. { .mmf
  426. (p21) STFD [Y1] = f14
  427. (p16) LDFD f92 = [X1], INCX
  428. (p21) FMPY f10 = ALPHA, f97
  429. }
  430. { .mmi
  431. nop __LINE__
  432. nop __LINE__
  433. (p21) add Y1 = INCX, Y1
  434. }
  435. ;;
  436. { .mmf
  437. (p21) STFD [Y1] = f15
  438. (p16) LDFD f98 = [X1], INCX
  439. (p21) FMPY f11 = ALPHA, f103
  440. }
  441. { .mmi
  442. nop __LINE__
  443. nop __LINE__
  444. (p21) add Y1 = INCX, Y1
  445. }
  446. ;;
  447. { .mmf
  448. (p21) STFD [Y1] = f6 /* product 8 */
  449. (p16) LDFD f104 = [X1], INCX
  450. (p21) FMPY f12 = ALPHA, f109 /* products 12-15 stay in f12-f15 until the next pass or the drain */
  451. }
  452. { .mmi
  453. nop __LINE__
  454. nop __LINE__
  455. (p21) add Y1 = INCX, Y1
  456. }
  457. ;;
  458. { .mmf
  459. (p21) STFD [Y1] = f7
  460. (p16) LDFD f110 = [X1], INCX
  461. (p21) FMPY f13 = ALPHA, f115
  462. }
  463. { .mmi
  464. nop __LINE__
  465. nop __LINE__
  466. (p21) add Y1 = INCX, Y1
  467. }
  468. ;;
  469. { .mmf
  470. (p21) STFD [Y1] = f10
  471. (p16) LDFD f116 = [X1], INCX
  472. (p21) FMPY f14 = ALPHA, f121
  473. }
  474. { .mmi
  475. nop __LINE__
  476. nop __LINE__
  477. (p21) add Y1 = INCX, Y1
  478. }
  479. ;;
  480. { .mmf
  481. (p21) STFD [Y1] = f11 /* product 11: last one stored inside its own pass */
  482. (p16) LDFD f122 = [X1], INCX
  483. (p21) FMPY f15 = ALPHA, f127
  484. }
  485. { .mmb
  486. nop __LINE__
  487. (p21) add Y1 = INCX, Y1
  488. br.ctop.sptk.few .L310
  489. }
  490. ;;
  /* Drain: store products 12-15 of the final group, and rebuild the    */
  /* secondary pointers Y2 and X2 (both X1 + 4 * INCX) for the          */
  /* remainder code that follows.                                       */
  491. { .mmi
  492. STFD [Y1] = f12
  493. add Y1 = INCX, Y1
  494. shladd Y2 = INCX, 2, X1
  495. }
  496. ;;
  497. { .mmi
  498. STFD [Y1] = f13
  499. add Y1 = INCX, Y1
  500. shladd X2 = INCX, 2, X1
  501. }
  502. ;;
  503. { .mmi
  504. STFD [Y1] = f14
  505. nop __LINE__
  506. add Y1 = INCX, Y1
  507. }
  508. ;;
  509. { .mmi
  510. STFD [Y1] = f15
  511. nop __LINE__
  512. add Y1 = INCX, Y1
  513. }
  514. ;;
  /* .L320: remainder of the ALPHA != 0 path (N & 15 elements), then     */
  /* return. The remainder is decomposed by the bits of N:               */
  /*   p12 (bit 3) -> 8 elements: f48-f51 via X1/Y1, f52-f55 via X2/Y2   */
  /*   p13 (bit 2) -> 4 elements: f56-f59 via X1/Y1                      */
  /*   p14 (bit 1) -> 2 elements: f60-f61 via X1/Y1                      */
  /*   p15 (bit 0) -> 1 element : f62     via X1/Y1                      */
  /* All loads are issued first, then the multiplies by ALPHA, then the  */
  /* stores, interleaved so later loads overlap earlier multiplies.      */
  /* p12 and p9 were set before entry; p13-p15 are derived here.         */
  515. .align 16
  516. .L320:
  517. { .mmi
  518. (p12) LDFD f48 = [X1], INCX
  519. (p12) LDFD f52 = [X2], INCX
  520. mov ar.lc = ARLC /* restore the caller's loop count */
  521. }
  522. ;;
  523. { .mmi
  524. (p12) LDFD f49 = [X1], INCX
  525. (p12) LDFD f53 = [X2], INCX
  526. mov pr = PR, -65474 /* mask = ~0xFFC1: restore p1-p5 and the rotating predicates, keep p6-p15 */
  527. }
  528. { .mmb
  529. nop __LINE__
  530. nop __LINE__
  531. (p9) br.ret.sptk.many b0 /* (N & 15) == 0: done; p12 is necessarily clear in that case */
  532. }
  533. ;;
  534. { .mmi
  535. (p12) LDFD f50 = [X1], INCX
  536. (p12) LDFD f54 = [X2], INCX
  537. tbit.z p0, p13 = N, 2 /* p13 = bit 2 of N set */
  538. }
  539. ;;
  540. { .mmi
  541. (p12) LDFD f51 = [X1], INCX5 /* X1 hops over the 4 elements read through X2 */
  542. (p12) LDFD f55 = [X2], INCX5
  543. tbit.z p0, p14 = N, 1 /* p14 = bit 1 of N set */
  544. }
  545. ;;
  546. (p13) LDFD f56 = [X1], INCX
  547. tbit.z p0, p15 = N, 0 /* p15 = bit 0 of N set */
  548. ;;
  549. (p13) LDFD f57 = [X1], INCX
  550. ;;
  551. { .mmf
  552. (p13) LDFD f58 = [X1], INCX
  553. nop __LINE__
  554. (p12) FMPY f48 = ALPHA, f48
  555. }
  556. { .mmf
  557. nop __LINE__
  558. nop __LINE__
  559. (p12) FMPY f52 = ALPHA, f52
  560. }
  561. ;;
  562. { .mmf
  563. (p13) LDFD f59 = [X1], INCX
  564. nop __LINE__
  565. (p12) FMPY f49 = ALPHA, f49
  566. }
  567. { .mmf
  568. nop __LINE__
  569. nop __LINE__
  570. (p12) FMPY f53 = ALPHA, f53
  571. }
  572. ;;
  573. { .mmf
  574. (p14) LDFD f60 = [X1], INCX
  575. nop __LINE__
  576. (p12) FMPY f50 = ALPHA, f50
  577. }
  578. { .mmf
  579. nop __LINE__
  580. nop __LINE__
  581. (p12) FMPY f54 = ALPHA, f54
  582. }
  583. ;;
  584. { .mmf
  585. (p14) LDFD f61 = [X1], INCX
  586. nop __LINE__
  587. (p12) FMPY f51 = ALPHA, f51
  588. }
  589. { .mmf
  590. nop __LINE__
  591. nop __LINE__
  592. (p12) FMPY f55 = ALPHA, f55
  593. }
  594. ;;
  /* Store phase begins; the remaining multiplies ride in the F slots. */
  595. { .mmf
  596. (p12) STFD [Y1] = f48
  597. (p12) STFD [Y2] = f52
  598. (p13) FMPY f56 = ALPHA, f56
  599. }
  600. { .mmi
  601. (p15) LDFD f62 = [X1] /* final single element; no post-increment needed */
  602. (p12) add Y1 = INCX, Y1
  603. (p12) add Y2 = INCX, Y2
  604. }
  605. ;;
  606. { .mmf
  607. (p12) STFD [Y1] = f49
  608. (p12) STFD [Y2] = f53
  609. (p13) FMPY f57 = ALPHA, f57
  610. }
  611. { .mmi
  612. (p12) add Y1 = INCX, Y1
  613. (p12) add Y2 = INCX, Y2
  614. nop __LINE__
  615. }
  616. ;;
  617. { .mmf
  618. (p12) STFD [Y1] = f50
  619. (p12) STFD [Y2] = f54
  620. (p13) FMPY f58 = ALPHA, f58
  621. }
  622. { .mmi
  623. (p12) add Y1 = INCX, Y1
  624. (p12) add Y2 = INCX, Y2
  625. nop __LINE__
  626. }
  627. ;;
  628. { .mmf
  629. (p12) STFD [Y1] = f51
  630. (p12) STFD [Y2] = f55
  631. (p13) FMPY f59 = ALPHA, f59
  632. }
  633. { .mmi
  634. (p12) add Y1 = INCX5, Y1 /* Y1 now sits just past the 8-element block */
  635. (p12) add Y2 = INCX5, Y2
  636. nop __LINE__
  637. }
  638. ;;
  /* From here on only Y1 is used; one element per instruction group. */
  639. { .mfi
  640. (p13) STFD [Y1] = f56
  641. (p14) FMPY f60 = ALPHA, f60
  642. (p13) add Y1 = INCX, Y1
  643. }
  644. ;;
  645. { .mfi
  646. (p13) STFD [Y1] = f57
  647. (p14) FMPY f61 = ALPHA, f61
  648. (p13) add Y1 = INCX, Y1
  649. }
  650. ;;
  651. { .mfi
  652. (p13) STFD [Y1] = f58
  653. (p15) FMPY f62 = ALPHA, f62
  654. (p13) add Y1 = INCX, Y1
  655. }
  656. ;;
  657. { .mmi
  658. (p13) STFD [Y1] = f59
  659. nop __LINE__
  660. (p13) add Y1 = INCX, Y1
  661. }
  662. ;;
  663. { .mmi
  664. (p14) STFD [Y1] = f60
  665. nop __LINE__
  666. (p14) add Y1 = INCX, Y1
  667. }
  668. ;;
  669. { .mmi
  670. (p14) STFD [Y1] = f61
  671. nop __LINE__
  672. (p14) add Y1 = INCX, Y1
  673. }
  674. ;;
  675. { .mib
  676. (p15) STFD [Y1] = f62
  677. mov pr = PR, -65474 /* same masked restore as above, repeated before the final return */
  678. br.ret.sptk.many b0
  679. }
  680. EPILOGUE