You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Arguments. N is the element count (the code below derives a trip */
  /* count from N >> 4 and a remainder from N & 15). X1 is only read */
  /* through LDFD and Y1 only written through STFD. INCX/INCY are the */
  /* strides; the entry code shifts them left by BASE_SHIFT before use. */
  40. #define N r32
  41. #define X1 r33
  42. #define INCX r34
  43. #define Y1 r35
  44. #define INCY r36
  /* Prefetch cursors advanced by the lfetch instructions in the loops. */
  45. #define PREX r2
  46. #define PREY r3
  /* I: main-loop count, (N >> 4) - 1, copied into ar.lc. */
  /* J: N & 15, tested against zero to detect "no remainder". */
  47. #define I r14
  48. #define J r15
  /* Second source/destination cursors, set to X1 + INCX and Y1 + INCY. */
  49. #define X2 r16
  50. #define Y2 r17
  /* Scaled stride times 2: step applied to X1/X2/Y1/Y2. */
  51. #define INCX2 r18
  52. #define INCY2 r19
  /* Scaled stride times 8: step applied to the prefetch cursors. */
  53. #define INCX8 r20
  54. #define INCY8 r21
  /* Saved copies of the predicate registers and of ar.lc. */
  55. #define PR r30
  56. #define ARLC r31
  /* Prefetch lead; multiplied by SIZE where the cursors are set up. */
  57. #define PREFETCH_SIZE (8 * 16)
  /* Entry and setup of a strided copy: elements are loaded from X and */
  /* stored to Y. PROLOGUE, PROFCODE, BASE_SHIFT and SIZE are not defined */
  /* in this file (presumably they come from common.h). */
  58. PROLOGUE
  59. .prologue
  60. PROFCODE
  /* Scale both strides: INC = INC << BASE_SHIFT (presumably element */
  /* units to bytes - confirm against common.h). Keep the caller's ar.lc */
  /* in ARLC; the .save directive records that for the unwinder. */
  61. { .mmi
  62. shladd INCX = INCX, BASE_SHIFT, r0
  63. shladd INCY = INCY, BASE_SHIFT, r0
  64. .save ar.lc, ARLC
  65. mov ARLC = ar.lc
  66. }
  /* p6 = !(0 < N): return immediately when N <= 0. I = N >> 4. */
  67. { .mib
  68. cmp.lt p0, p6 = r0, N
  69. shr I = N, 4
  70. (p6) br.ret.sptk.many b0
  71. }
  72. ;;
  73. .body
  /* r8 = X1 - Y1 (address difference), r9 = 0xf0, PR = saved predicates. */
  74. { .mmi
  75. sub r8 = X1, Y1
  76. mov r9 = 0xf0
  77. mov PR = pr
  78. }
  /* Doubled strides for the paired cursors; J = N & 15. */
  79. { .mmi
  80. shladd INCX2 = INCX, 1, r0
  81. shladd INCY2 = INCY, 1, r0
  82. and J = 15, N
  83. }
  84. ;;
  /* 8x strides for the prefetch cursors; clear the rotating predicates. */
  85. { .mmi
  86. shladd INCX8 = INCX, 3, r0
  87. shladd INCY8 = INCY, 3, r0
  88. mov pr.rot = 0
  89. }
  /* r8 = (X1 - Y1) & 0xf0; p9 = (J == 0), i.e. N is a multiple of 16; */
  /* I = (N >> 4) - 1, the value later written to ar.lc. */
  90. { .mmi
  91. and r8 = r9, r8
  92. cmp.eq p9, p0 = r0, J
  93. adds I = -1, I
  94. }
  95. ;;
  /* X2/Y2 start one stride after X1/Y1, so X1/Y1 walk the even-numbered */
  /* elements and X2/Y2 the odd-numbered ones. ar.ec = 4 is the epilogue */
  /* count for the pipelined loops, which use stages p16..p19. */
  96. { .mmi
  97. add X2 = X1, INCX
  98. add Y2 = Y1, INCY
  99. mov ar.ec = 4
  100. }
  /* p16 = 1 enables the first pipeline stage. p6 = (127 > r8): when bits */
  /* 4-7 of the address difference are at most 0x70, take the .L20 */
  /* variant. NOTE(review): presumably this picks a store schedule that */
  /* avoids load/store bank conflicts - confirm. */
  101. { .mmb
  102. cmp.gt p6, p0 = 127, r8
  103. cmp.eq p16, p0 = r0, r0
  104. (p6) br.cond.dpnt .L20
  105. }
  106. ;;
  /* .L12 variant: prefetch cursors start PREFETCH_SIZE elements ahead of */
  /* X1 and PREFETCH_SIZE + 2 elements ahead of Y1; ar.lc = I. */
  107. { .mmi
  108. adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1
  109. adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1
  110. mov ar.lc = I
  111. }
  /* p8 = (I == -1), i.e. N < 16: skip the main loop and go to the tail. */
  /* p12 = bit 3 of N, consumed by the tail at .L15. */
  112. { .mib
  113. cmp.eq p8 ,p0 = -1, I
  114. tbit.z p0, p12 = N, 3
  115. (p8) br.cond.dpnt .L15
  116. }
  117. ;;
  /* Main loop, .L12 variant. Software-pipelined with rotating registers */
  /* and br.ctop; every trip loads 16 elements and stores 16 elements. */
  /*   stage p16: loads element pairs 0-5 of a 16-element group into */
  /*              f32/f36 ... f72/f76 */
  /*   stage p17: one trip later, loads pairs 6-7 into f81/f85, f89/f93 */
  /*              (the rotated names of f80/f84, f88/f92) */
  /*   stage p19: stores the whole group from f35/f39 ... f91/f95 (the */
  /*              names of f32 ... f92 after three rotations) */
  /* X1/Y1 handle even-numbered elements, X2/Y2 odd-numbered ones; the X */
  /* cursors advance by INCX2 via post-increment, the Y cursors by the */
  /* explicit adds of INCY2. LDFD/STFD are macros defined outside this */
  /* file. */
  118. .align 16
  119. .L12:
  /* Store pair 0; load pair 6 (stage p17). */
  120. { .mmi
  121. (p19) STFD [Y1] = f35
  122. (p19) STFD [Y2] = f39
  123. (p19) add Y1 = INCY2, Y1
  124. }
  125. { .mmi
  126. (p17) LDFD f81 = [X1], INCX2
  127. (p17) LDFD f85 = [X2], INCX2
  128. (p19) add Y2 = INCY2, Y2
  129. }
  130. ;;
  /* Store pair 1; load pair 7 (stage p17). */
  131. { .mmi
  132. (p19) STFD [Y1] = f43
  133. (p19) STFD [Y2] = f47
  134. (p19) add Y1 = INCY2, Y1
  135. }
  136. { .mmi
  137. (p17) LDFD f89 = [X1], INCX2
  138. (p17) LDFD f93 = [X2], INCX2
  139. (p19) add Y2 = INCY2, Y2
  140. }
  141. ;;
  /* Store pair 2; load pair 0 of the next group (stage p16). */
  142. { .mmi
  143. (p19) STFD [Y1] = f51
  144. (p19) STFD [Y2] = f55
  145. (p19) add Y1 = INCY2, Y1
  146. }
  147. { .mmi
  148. (p16) LDFD f32 = [X1], INCX2
  149. (p16) LDFD f36 = [X2], INCX2
  150. (p19) add Y2 = INCY2, Y2
  151. }
  152. ;;
  /* Store pair 3; prefetch ahead on both streams, each cursor moving by */
  /* 8 strides. The Y prefetch uses the .excl hint (line will be written). */
  153. { .mmi
  154. (p19) STFD [Y1] = f59
  155. (p19) STFD [Y2] = f63
  156. (p19) add Y1 = INCY2, Y1
  157. }
  158. { .mmi
  159. lfetch.fault.nt1 [PREX], INCX8
  160. lfetch.fault.excl.nt1 [PREY], INCY8
  161. (p19) add Y2 = INCY2, Y2
  162. }
  163. ;;
  /* Load pair 1 (no stores in this group). */
  164. { .mmi
  165. (p16) LDFD f40 = [X1], INCX2
  166. (p16) LDFD f44 = [X2], INCX2
  167. nop __LINE__
  168. }
  169. ;;
  /* Store pair 4; load pair 2. */
  170. { .mmi
  171. (p19) STFD [Y1] = f67
  172. (p19) STFD [Y2] = f71
  173. (p19) add Y1 = INCY2, Y1
  174. }
  175. { .mmi
  176. (p16) LDFD f48 = [X1], INCX2
  177. (p16) LDFD f52 = [X2], INCX2
  178. (p19) add Y2 = INCY2, Y2
  179. }
  180. ;;
  /* Store pair 5; load pair 3. */
  181. { .mmi
  182. (p19) STFD [Y1] = f75
  183. (p19) STFD [Y2] = f79
  184. (p19) add Y1 = INCY2, Y1
  185. }
  186. { .mmi
  187. (p16) LDFD f56 = [X1], INCX2
  188. (p16) LDFD f60 = [X2], INCX2
  189. (p19) add Y2 = INCY2, Y2
  190. }
  191. ;;
  /* Store pair 6; second prefetch of the trip on both streams. */
  192. { .mmi
  193. (p19) STFD [Y1] = f83
  194. (p19) STFD [Y2] = f87
  195. (p19) add Y1 = INCY2, Y1
  196. }
  197. { .mmi
  198. lfetch.fault.nt1 [PREX], INCX8
  199. lfetch.fault.excl.nt1 [PREY], INCY8
  200. (p19) add Y2 = INCY2, Y2
  201. }
  202. ;;
  /* Store pair 7; load pair 4. */
  203. { .mmi
  204. (p19) STFD [Y1] = f91
  205. (p19) STFD [Y2] = f95
  206. (p19) add Y1 = INCY2, Y1
  207. }
  208. { .mmi
  209. (p16) LDFD f64 = [X1], INCX2
  210. (p16) LDFD f68 = [X2], INCX2
  211. (p19) add Y2 = INCY2, Y2
  212. }
  213. ;;
  /* Load pair 5, then br.ctop: rotate registers/predicates and loop while */
  /* ar.lc (then ar.ec, for the drain trips) has not run out. */
  214. { .mmb
  215. (p16) LDFD f72 = [X1], INCX2
  216. (p16) LDFD f76 = [X2], INCX2
  217. br.ctop.sptk.few .L12
  218. }
  219. ;;
  /* Tail of the .L12 path: copies the remaining N & 15 elements in */
  /* pieces of 8, 4, 2 and 1, selected by bits 3..0 of N. */
  /*   p12 (bit 3, set before this label): 8 elements via f48..f55 */
  /*   p13 (bit 2): 4 elements via f56..f59 */
  /*   p14 (bit 1): 2 elements via f60, f61 */
  /*   p15 (bit 0): 1 element via f62 */
  /* All loads are issued before the matching stores. */
  220. .align 32
  221. .L15:
  /* First pair of the 8-piece; restore the caller's ar.lc. */
  222. { .mmi
  223. (p12) LDFD f48 = [X1], INCX2
  224. (p12) LDFD f49 = [X2], INCX2
  225. mov ar.lc = ARLC
  226. }
  227. ;;
  /* Restore saved predicates. The mask -65474 (low bits 0x3E, bits 16 and */
  /* up all set) selects p1-p5 and the rotating predicates, so p6-p15, */
  /* which hold the tail flags used here, are left alone. */
  228. { .mmi
  229. (p12) LDFD f50 = [X1], INCX2
  230. (p12) LDFD f51 = [X2], INCX2
  231. mov pr = PR, -65474
  232. }
  233. ;;
  /* p9 means N & 15 == 0: nothing is left, return. In that case bit 3 of */
  /* N is clear, so the p12 loads above did not execute. */
  234. { .mmb
  235. (p12) LDFD f52 = [X1], INCX2
  236. (p12) LDFD f53 = [X2], INCX2
  237. (p9) br.ret.sptk.many b0
  238. }
  239. ;;
  /* Last pair of the 8-piece; p13 = bit 2 of N. */
  240. { .mmi
  241. (p12) LDFD f54 = [X1], INCX2
  242. (p12) LDFD f55 = [X2], INCX2
  243. tbit.z p0, p13 = N, 2
  244. }
  245. ;;
  /* 4-piece loads; p14 = bit 1 of N. */
  246. { .mmi
  247. (p13) LDFD f56 = [X1], INCX2
  248. (p13) LDFD f57 = [X2], INCX2
  249. tbit.z p0, p14 = N, 1
  250. }
  251. ;;
  /* p15 = bit 0 of N. */
  252. { .mmi
  253. (p13) LDFD f58 = [X1], INCX2
  254. (p13) LDFD f59 = [X2], INCX2
  255. tbit.z p0, p15 = N, 0
  256. }
  257. ;;
  /* Start storing the 8-piece while the 2-piece is loaded. */
  258. { .mmi
  259. (p12) STFD [Y1] = f48
  260. (p12) STFD [Y2] = f49
  261. (p12) add Y1 = INCY2, Y1
  262. }
  263. { .mmi
  264. (p14) LDFD f60 = [X1], INCX2
  265. (p14) LDFD f61 = [X2], INCX2
  266. (p12) add Y2 = INCY2, Y2
  267. }
  268. ;;
  /* Final single element is read from X1 with no post-increment. */
  269. { .mmi
  270. (p12) STFD [Y1] = f50
  271. (p12) STFD [Y2] = f51
  272. (p12) add Y1 = INCY2, Y1
  273. }
  274. { .mmi
  275. (p15) LDFD f62 = [X1]
  276. nop __LINE__
  277. (p12) add Y2 = INCY2, Y2
  278. }
  279. ;;
  /* Remaining stores of the 8-piece. */
  280. { .mmi
  281. (p12) STFD [Y1] = f52
  282. (p12) STFD [Y2] = f53
  283. (p12) add Y1 = INCY2, Y1
  284. }
  285. { .mmi
  286. nop __LINE__
  287. nop __LINE__
  288. (p12) add Y2 = INCY2, Y2
  289. }
  290. ;;
  291. { .mmi
  292. (p12) STFD [Y1] = f54
  293. (p12) STFD [Y2] = f55
  294. (p12) add Y1 = INCY2, Y1
  295. }
  296. { .mmi
  297. nop __LINE__
  298. nop __LINE__
  299. (p12) add Y2 = INCY2, Y2
  300. }
  301. ;;
  /* Stores of the 4-piece. */
  302. { .mmi
  303. (p13) STFD [Y1] = f56
  304. (p13) STFD [Y2] = f57
  305. (p13) add Y1 = INCY2, Y1
  306. }
  307. { .mmi
  308. nop __LINE__
  309. nop __LINE__
  310. (p13) add Y2 = INCY2, Y2
  311. }
  312. ;;
  313. { .mmi
  314. (p13) STFD [Y1] = f58
  315. (p13) STFD [Y2] = f59
  316. (p13) add Y1 = INCY2, Y1
  317. }
  318. { .mmi
  319. nop __LINE__
  320. nop __LINE__
  321. (p13) add Y2 = INCY2, Y2
  322. }
  323. ;;
  /* Stores of the 2-piece; only Y1 is advanced because the last element, */
  /* if any, is written through Y1. */
  324. { .mmi
  325. (p14) STFD [Y1] = f60
  326. (p14) STFD [Y2] = f61
  327. (p14) add Y1 = INCY2, Y1
  328. }
  329. ;;
  /* Store the final single element (if bit 0 of N is set) and return. */
  330. { .mmb
  331. (p15) STFD [Y1] = f62
  332. nop __LINE__
  333. br.ret.sptk.many b0
  334. }
  335. ;;
  /* Setup for the alternate variant, reached when */
  /* ((X1 - Y1) & 0xf0) < 127. Differs from the .L12 setup only in the Y */
  /* prefetch lead: PREFETCH_SIZE + 10 elements instead of + 2. */
  336. .align 16
  337. .L20:
  /* Prefetch cursors ahead of X1 and Y1; ar.lc = I = (N >> 4) - 1. */
  338. { .mmi
  339. adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1
  340. adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1
  341. mov ar.lc = I
  342. }
  /* p8 = (I == -1), i.e. N < 16: skip the main loop and go to the tail. */
  /* p12 = bit 3 of N, consumed by the tail at .L25. */
  343. { .mib
  344. cmp.eq p8 ,p0 = -1, I
  345. tbit.z p0, p12 = N, 3
  346. (p8) br.cond.dpnt .L25
  347. }
  348. ;;
  /* Main loop, .L22 variant. Same loads as .L12, but the stores of each */
  /* 16-element group are split over two pipeline stages: */
  /*   stage p16: loads element pairs 0-5 into f32/f36 ... f72/f76 */
  /*   stage p17: one trip later, loads pairs 6-7 into f81/f85, f89/f93 */
  /*   stage p18: second half of the body stores pairs 0-3 from */
  /*              f34/f38 ... f58/f62 (f32 ... f60 after two rotations) */
  /*   stage p19: first half of the next trip stores pairs 4-7 from */
  /*              f67/f71 ... f91/f95 (f64 ... f92 after three rotations) */
  /* Y is therefore still written in element order. LDFD/STFD are macros */
  /* defined outside this file. */
  349. .align 16
  350. .L22:
  /* Store pair 4 (stage p19); load pair 6 (stage p17). */
  351. { .mmi
  352. (p19) STFD [Y1] = f67
  353. (p19) STFD [Y2] = f71
  354. (p19) add Y1 = INCY2, Y1
  355. }
  356. { .mmi
  357. (p17) LDFD f81 = [X1], INCX2
  358. (p17) LDFD f85 = [X2], INCX2
  359. (p19) add Y2 = INCY2, Y2
  360. }
  361. ;;
  /* Store pair 5; load pair 7 (stage p17). */
  362. { .mmi
  363. (p19) STFD [Y1] = f75
  364. (p19) STFD [Y2] = f79
  365. (p19) add Y1 = INCY2, Y1
  366. }
  367. { .mmi
  368. (p17) LDFD f89 = [X1], INCX2
  369. (p17) LDFD f93 = [X2], INCX2
  370. (p19) add Y2 = INCY2, Y2
  371. }
  372. ;;
  /* Store pair 6; load pair 0 of the next group (stage p16). */
  373. { .mmi
  374. (p19) STFD [Y1] = f83
  375. (p19) STFD [Y2] = f87
  376. (p19) add Y1 = INCY2, Y1
  377. }
  378. { .mmi
  379. (p16) LDFD f32 = [X1], INCX2
  380. (p16) LDFD f36 = [X2], INCX2
  381. (p19) add Y2 = INCY2, Y2
  382. }
  383. ;;
  /* Store pair 7; prefetch ahead on both streams, each cursor moving by */
  /* 8 strides. The Y prefetch uses the .excl hint (line will be written). */
  384. { .mmi
  385. (p19) STFD [Y1] = f91
  386. (p19) STFD [Y2] = f95
  387. (p19) add Y1 = INCY2, Y1
  388. }
  389. { .mmi
  390. lfetch.fault.nt1 [PREX], INCX8
  391. lfetch.fault.excl.nt1 [PREY], INCY8
  392. (p19) add Y2 = INCY2, Y2
  393. }
  394. ;;
  /* Load pair 1 (no stores in this group). */
  395. { .mmi
  396. (p16) LDFD f40 = [X1], INCX2
  397. (p16) LDFD f44 = [X2], INCX2
  398. nop __LINE__
  399. }
  400. ;;
  /* Store pair 0 of the following group (stage p18); load pair 2. */
  401. { .mmi
  402. (p18) STFD [Y1] = f34
  403. (p18) STFD [Y2] = f38
  404. (p18) add Y1 = INCY2, Y1
  405. }
  406. { .mmi
  407. (p16) LDFD f48 = [X1], INCX2
  408. (p16) LDFD f52 = [X2], INCX2
  409. (p18) add Y2 = INCY2, Y2
  410. }
  411. ;;
  /* Store pair 1; load pair 3. */
  412. { .mmi
  413. (p18) STFD [Y1] = f42
  414. (p18) STFD [Y2] = f46
  415. (p18) add Y1 = INCY2, Y1
  416. }
  417. { .mmi
  418. (p16) LDFD f56 = [X1], INCX2
  419. (p16) LDFD f60 = [X2], INCX2
  420. (p18) add Y2 = INCY2, Y2
  421. }
  422. ;;
  /* Store pair 2; second prefetch of the trip on both streams. */
  423. { .mmi
  424. (p18) STFD [Y1] = f50
  425. (p18) STFD [Y2] = f54
  426. (p18) add Y1 = INCY2, Y1
  427. }
  428. { .mmi
  429. lfetch.fault.nt1 [PREX], INCX8
  430. lfetch.fault.excl.nt1 [PREY], INCY8
  431. (p18) add Y2 = INCY2, Y2
  432. }
  433. ;;
  /* Store pair 3; load pair 4. */
  434. { .mmi
  435. (p18) STFD [Y1] = f58
  436. (p18) STFD [Y2] = f62
  437. (p18) add Y1 = INCY2, Y1
  438. }
  439. { .mmi
  440. (p16) LDFD f64 = [X1], INCX2
  441. (p16) LDFD f68 = [X2], INCX2
  442. (p18) add Y2 = INCY2, Y2
  443. }
  444. ;;
  /* Load pair 5, then br.ctop: rotate registers/predicates and loop while */
  /* ar.lc (then ar.ec, for the drain trips) has not run out. */
  445. { .mmb
  446. (p16) LDFD f72 = [X1], INCX2
  447. (p16) LDFD f76 = [X2], INCX2
  448. br.ctop.sptk.few .L22
  449. }
  450. ;;
  /* Tail of the .L20/.L22 path; the instruction sequence is the same as */
  /* the tail at .L15. Copies the remaining N & 15 elements in pieces of */
  /* 8, 4, 2 and 1, selected by bits 3..0 of N. */
  /*   p12 (bit 3, set before this label): 8 elements via f48..f55 */
  /*   p13 (bit 2): 4 elements via f56..f59 */
  /*   p14 (bit 1): 2 elements via f60, f61 */
  /*   p15 (bit 0): 1 element via f62 */
  /* EPILOGUE is a macro defined outside this file. */
  451. .align 32
  452. .L25:
  /* First pair of the 8-piece; restore the caller's ar.lc. */
  453. { .mmi
  454. (p12) LDFD f48 = [X1], INCX2
  455. (p12) LDFD f49 = [X2], INCX2
  456. mov ar.lc = ARLC
  457. }
  458. ;;
  /* Restore saved predicates. The mask -65474 (low bits 0x3E, bits 16 and */
  /* up all set) selects p1-p5 and the rotating predicates, so p6-p15, */
  /* which hold the tail flags used here, are left alone. */
  459. { .mmi
  460. (p12) LDFD f50 = [X1], INCX2
  461. (p12) LDFD f51 = [X2], INCX2
  462. mov pr = PR, -65474
  463. }
  464. ;;
  /* p9 means N & 15 == 0: nothing is left, return. In that case bit 3 of */
  /* N is clear, so the p12 loads above did not execute. */
  465. { .mmb
  466. (p12) LDFD f52 = [X1], INCX2
  467. (p12) LDFD f53 = [X2], INCX2
  468. (p9) br.ret.sptk.many b0
  469. }
  470. ;;
  /* Last pair of the 8-piece; p13 = bit 2 of N. */
  471. { .mmi
  472. (p12) LDFD f54 = [X1], INCX2
  473. (p12) LDFD f55 = [X2], INCX2
  474. tbit.z p0, p13 = N, 2
  475. }
  476. ;;
  /* 4-piece loads; p14 = bit 1 of N. */
  477. { .mmi
  478. (p13) LDFD f56 = [X1], INCX2
  479. (p13) LDFD f57 = [X2], INCX2
  480. tbit.z p0, p14 = N, 1
  481. }
  482. ;;
  /* p15 = bit 0 of N. */
  483. { .mmi
  484. (p13) LDFD f58 = [X1], INCX2
  485. (p13) LDFD f59 = [X2], INCX2
  486. tbit.z p0, p15 = N, 0
  487. }
  488. ;;
  /* Start storing the 8-piece while the 2-piece is loaded. */
  489. { .mmi
  490. (p12) STFD [Y1] = f48
  491. (p12) STFD [Y2] = f49
  492. (p12) add Y1 = INCY2, Y1
  493. }
  494. { .mmi
  495. (p14) LDFD f60 = [X1], INCX2
  496. (p14) LDFD f61 = [X2], INCX2
  497. (p12) add Y2 = INCY2, Y2
  498. }
  499. ;;
  /* Final single element is read from X1 with no post-increment. */
  500. { .mmi
  501. (p12) STFD [Y1] = f50
  502. (p12) STFD [Y2] = f51
  503. (p12) add Y1 = INCY2, Y1
  504. }
  505. { .mmi
  506. (p15) LDFD f62 = [X1]
  507. nop __LINE__
  508. (p12) add Y2 = INCY2, Y2
  509. }
  510. ;;
  /* Remaining stores of the 8-piece. */
  511. { .mmi
  512. (p12) STFD [Y1] = f52
  513. (p12) STFD [Y2] = f53
  514. (p12) add Y1 = INCY2, Y1
  515. }
  516. { .mmi
  517. nop __LINE__
  518. nop __LINE__
  519. (p12) add Y2 = INCY2, Y2
  520. }
  521. ;;
  522. { .mmi
  523. (p12) STFD [Y1] = f54
  524. (p12) STFD [Y2] = f55
  525. (p12) add Y1 = INCY2, Y1
  526. }
  527. { .mmi
  528. nop __LINE__
  529. nop __LINE__
  530. (p12) add Y2 = INCY2, Y2
  531. }
  532. ;;
  /* Stores of the 4-piece. */
  533. { .mmi
  534. (p13) STFD [Y1] = f56
  535. (p13) STFD [Y2] = f57
  536. (p13) add Y1 = INCY2, Y1
  537. }
  538. { .mmi
  539. nop __LINE__
  540. nop __LINE__
  541. (p13) add Y2 = INCY2, Y2
  542. }
  543. ;;
  544. { .mmi
  545. (p13) STFD [Y1] = f58
  546. (p13) STFD [Y2] = f59
  547. (p13) add Y1 = INCY2, Y1
  548. }
  549. { .mmi
  550. nop __LINE__
  551. nop __LINE__
  552. (p13) add Y2 = INCY2, Y2
  553. }
  554. ;;
  /* Stores of the 2-piece; only Y1 is advanced because the last element, */
  /* if any, is written through Y1. */
  555. { .mmi
  556. (p14) STFD [Y1] = f60
  557. (p14) STFD [Y2] = f61
  558. (p14) add Y1 = INCY2, Y1
  559. }
  560. ;;
  /* Store the final single element (if bit 0 of N is set) and return. */
  561. { .mmb
  562. (p15) STFD [Y1] = f62
  563. nop __LINE__
  564. br.ret.sptk.many b0
  565. }
  566. ;;
  567. EPILOGUE