You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

xcopy.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Define ASSEMBLER before including common.h (presumably this selects the
     assembler-side macros there; common.h is not visible here -- TODO confirm).
     PROLOGUE, PROFCODE, EPILOGUE, SIZE, ZBASE_SHIFT, LDFD and STFD used below
     are not defined in this file. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Registers that are read on entry before anything writes them. */
  /* N: element count (tested against 0, shifted right by 3 for the trip count). */
  40. #define N r32
  /* X1: source pointer, post-incremented by INCX on every load. */
  41. #define X1 r33
  /* INCX: source stride, shifted left by ZBASE_SHIFT during setup. */
  42. #define INCX r34
  /* Y1: destination pointer, advanced by INCY after every store. */
  43. #define Y1 r35
  /* INCY: destination stride, shifted left by ZBASE_SHIFT during setup. */
  44. #define INCY r36
  /* PREX / PREY: addresses handed to lfetch for the source / destination. */
  45. #define PREX r2
  46. #define PREY r3
  /* I: (N >> 3) - 1, copied into ar.lc.  J: N & 15, only compared with zero. */
  47. #define I r14
  48. #define J r15
  /* X2 / Y2: second pointers, X1 + SIZE and Y1 + SIZE, stepped with the same
     strides as X1 / Y1. */
  49. #define X2 r16
  50. #define Y2 r17
  /* INCX2 / INCY2: stride << 1.  Computed during setup but not referenced
     again in this file. */
  51. #define INCX2 r18
  52. #define INCY2 r19
  /* INCX8 / INCY8: stride << 2 (four strides, despite the "8" in the name).
     Used as the lfetch post-increment, which is issued twice per loop trip. */
  53. #define INCX8 r20
  54. #define INCY8 r21
  /* PR: saved copy of the predicate registers.  ARLC: saved copy of ar.lc. */
  55. #define PR r30
  56. #define ARLC r31
  /* Prefetch distance, multiplied by SIZE when PREX / PREY are initialised. */
  57. #define PREFETCH_SIZE (8 * 16)
  /* Entry and setup for the strided copy routine.
     Inputs: N (count), X1/INCX (source, stride), Y1/INCY (destination, stride).
     Returns immediately when N <= 0.  Otherwise prepares the software-pipelined
     loop state (ar.lc, ar.ec, rotating predicates), derives the helper pointers
     and strides, and picks one of two loop variants (.L12 or .L20).
     PROLOGUE and PROFCODE come from common.h and are not visible here. */
  58. PROLOGUE
  59. .prologue
  60. PROFCODE
  /* Keep the caller's ar.lc so the remainder code can restore it. */
  61. { .mmi
  62. .save ar.lc, ARLC
  63. mov ARLC = ar.lc
  64. }
  /* p6 = !(0 < N): return at once for N <= 0.  I = N >> 3 because each loop
     trip moves eight elements. */
  65. { .mib
  66. cmp.lt p0, p6 = r0, N
  67. shr I = N, 3
  68. (p6) br.ret.sptk.many b0
  69. }
  70. ;;
  /* Scale both strides by 2^ZBASE_SHIFT (presumably element units to bytes;
     ZBASE_SHIFT is defined outside this file -- TODO confirm). */
  71. shl INCX = INCX, ZBASE_SHIFT
  72. shl INCY = INCY, ZBASE_SHIFT
  73. ;;
  74. .body
  /* r8 = X1 - Y1, masked with r9 = 0xf0 further down.  PR keeps the incoming
     predicates. */
  75. { .mmi
  76. sub r8 = X1, Y1
  77. mov r9 = 0xf0
  78. mov PR = pr
  79. }
  /* INCX2 / INCY2 = 2 * stride (not used again in this file).  J = N & 15.
     NOTE(review): a loop trip consumes 8 elements, so a mask of 7 looks like
     the exact remainder.  With 15 the p9 early return in the remainder code is
     skipped when N % 16 == 8; the result is still correct because bits 2..0
     of N are clear in that case and every remainder access is predicated off.
     Confirm whether 15 is intentional. */
  80. { .mmi
  81. shladd INCX2 = INCX, 1, r0
  82. shladd INCY2 = INCY, 1, r0
  83. and J = 15, N
  84. }
  85. ;;
  /* INCX8 / INCY8 = stride << 2.  Clear the rotating predicates before the
     pipelined loop is started. */
  86. { .mmi
  87. shladd INCX8 = INCX, 2, r0
  88. shladd INCY8 = INCY, 2, r0
  89. mov pr.rot = 0
  90. }
  /* r8 = (X1 - Y1) & 0xf0.  p9 = (J == 0).  I becomes (N >> 3) - 1, the value
     later written to ar.lc. */
  91. { .mmi
  92. and r8 = r9, r8
  93. cmp.eq p9, p0 = r0, J
  94. adds I = -1, I
  95. }
  96. ;;
  /* X2 / Y2 address the word one SIZE past X1 / Y1 (presumably the second
     component of each element -- TODO confirm).  ar.ec = 4 matches the four
     stage predicates p16..p19 used by the loops. */
  97. { .mmi
  98. adds X2 = 1 * SIZE, X1
  99. adds Y2 = 1 * SIZE, Y1
  100. mov ar.ec = 4
  101. }
  /* p6 = (127 > r8): take the .L20 variant when the masked pointer difference
     is below 0x80 (presumably a cache/bank-conflict heuristic -- the reason is
     not stated in this file).  p16 = 1 arms the first pipeline stage. */
  102. { .mmb
  103. cmp.gt p6, p0 = 127, r8
  104. cmp.eq p16, p0 = r0, r0
  105. (p6) br.cond.dpnt .L20
  106. }
  107. ;;
  /* Prefetch pointers for the .L12 variant: PREFETCH_SIZE * SIZE ahead of X1
     and (PREFETCH_SIZE + 2) * SIZE ahead of Y1. */
  108. { .mmi
  109. adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1
  110. adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1
  111. mov ar.lc = I
  112. }
  /* p8 = (I == -1), i.e. N < 8: skip the loop and go to the remainder code.
     p12 = bit 2 of N is set (a group of four remainder elements). */
  113. { .mib
  114. cmp.eq p8 ,p0 = -1, I
  115. tbit.z p0, p12 = N, 2
  116. (p8) br.cond.dpnt .L15
  117. }
  118. ;;
  /* Main loop, reached when ((X1 - Y1) & 0xf0) is not below 0x80.
     One trip issues 16 LDFD (8 through X1, 8 through X2) and 16 STFD
     (8 through Y1, 8 through Y2), i.e. eight elements of two words each,
     plus two prefetches per stream.
     Stage predicates: p16 loads elements 1-6 of a group, p17 (one trip later)
     loads elements 7-8 of the same group, p19 stores all eight.
     br.ctop rotates the FP and predicate register names by one per trip, so a
     value loaded into fK under p16 is stored from f(K+3) under p19, and a
     value loaded into fK under p17 is stored from f(K+2) under p19. */
  119. .align 16
  120. .L12:
  /* Store element 1 (loaded as f32/f36 three trips ago). */
  121. { .mmi
  122. (p19) STFD [Y1] = f35
  123. (p19) STFD [Y2] = f39
  124. (p19) add Y1 = INCY, Y1
  125. }
  /* Stage p17: element 7 of the group whose first six were loaded last trip. */
  126. { .mmi
  127. (p17) LDFD f81 = [X1], INCX
  128. (p17) LDFD f85 = [X2], INCX
  129. (p19) add Y2 = INCY, Y2
  130. }
  131. ;;
  132. { .mmi
  133. (p19) STFD [Y1] = f43
  134. (p19) STFD [Y2] = f47
  135. (p19) add Y1 = INCY, Y1
  136. }
  /* Stage p17: element 8 of the previous group. */
  137. { .mmi
  138. (p17) LDFD f89 = [X1], INCX
  139. (p17) LDFD f93 = [X2], INCX
  140. (p19) add Y2 = INCY, Y2
  141. }
  142. ;;
  143. { .mmi
  144. (p19) STFD [Y1] = f51
  145. (p19) STFD [Y2] = f55
  146. (p19) add Y1 = INCY, Y1
  147. }
  /* Stage p16: start loading a new group, element 1. */
  148. { .mmi
  149. (p16) LDFD f32 = [X1], INCX
  150. (p16) LDFD f36 = [X2], INCX
  151. (p19) add Y2 = INCY, Y2
  152. }
  153. ;;
  154. { .mmi
  155. (p19) STFD [Y1] = f59
  156. (p19) STFD [Y2] = f63
  157. (p19) add Y1 = INCY, Y1
  158. }
  /* Unpredicated prefetch of source and destination; each pointer advances by
     four strides.  The destination prefetch carries the .excl hint. */
  159. { .mmi
  160. lfetch.fault.nt1 [PREX], INCX8
  161. lfetch.fault.excl.nt1 [PREY], INCY8
  162. (p19) add Y2 = INCY, Y2
  163. }
  164. ;;
  165. { .mmi
  166. (p16) LDFD f40 = [X1], INCX
  167. (p16) LDFD f44 = [X2], INCX
  168. nop __LINE__
  169. }
  170. ;;
  171. { .mmi
  172. (p19) STFD [Y1] = f67
  173. (p19) STFD [Y2] = f71
  174. (p19) add Y1 = INCY, Y1
  175. }
  176. { .mmi
  177. (p16) LDFD f48 = [X1], INCX
  178. (p16) LDFD f52 = [X2], INCX
  179. (p19) add Y2 = INCY, Y2
  180. }
  181. ;;
  182. { .mmi
  183. (p19) STFD [Y1] = f75
  184. (p19) STFD [Y2] = f79
  185. (p19) add Y1 = INCY, Y1
  186. }
  187. { .mmi
  188. (p16) LDFD f56 = [X1], INCX
  189. (p16) LDFD f60 = [X2], INCX
  190. (p19) add Y2 = INCY, Y2
  191. }
  192. ;;
  /* Elements 7 and 8: loaded under p17 as f81/f85 and f89/f93, so they are
     only two rotations old when stored here. */
  193. { .mmi
  194. (p19) STFD [Y1] = f83
  195. (p19) STFD [Y2] = f87
  196. (p19) add Y1 = INCY, Y1
  197. }
  /* Second prefetch of the trip (eight strides covered per trip in total). */
  198. { .mmi
  199. lfetch.fault.nt1 [PREX], INCX8
  200. lfetch.fault.excl.nt1 [PREY], INCY8
  201. (p19) add Y2 = INCY, Y2
  202. }
  203. ;;
  204. { .mmi
  205. (p19) STFD [Y1] = f91
  206. (p19) STFD [Y2] = f95
  207. (p19) add Y1 = INCY, Y1
  208. }
  209. { .mmi
  210. (p16) LDFD f64 = [X1], INCX
  211. (p16) LDFD f68 = [X2], INCX
  212. (p19) add Y2 = INCY, Y2
  213. }
  214. ;;
  /* Element 6 of the new group, then rotate registers and loop while ar.lc /
     ar.ec allow. */
  215. { .mmb
  216. (p16) LDFD f72 = [X1], INCX
  217. (p16) LDFD f76 = [X2], INCX
  218. br.ctop.sptk.few .L12
  219. }
  220. ;;
  /* Remainder code for the .L12 variant: copies the elements left over after
     the eight-per-trip loop.  Bit 2 of N (p12) selects a group of four
     elements, bit 1 (p13) a group of two, bit 0 (p14) a single element.
     All loads are issued before the first store.  ar.lc and the saved
     predicates are restored on the way. */
  221. .align 32
  222. .L15:
  223. { .mmi
  224. (p12) LDFD f48 = [X1], INCX
  225. (p12) LDFD f49 = [X2], INCX
  226. mov ar.lc = ARLC
  227. }
  228. ;;
  /* Restore predicates from PR under mask -65474 (low bits 0x3e plus the
     sign-extended upper bits): p1-p5 and the rotating predicates are
     rewritten, while p9 and p12 computed during setup stay as they are. */
  229. { .mmi
  230. (p12) LDFD f50 = [X1], INCX
  231. (p12) LDFD f51 = [X2], INCX
  232. mov pr = PR, -65474
  233. }
  234. ;;
  /* p9 = ((N & 15) == 0): nothing is left to copy, return.  Bit 2 of N is
     clear in that case, so none of the p12 loads above were executed. */
  235. { .mmb
  236. (p12) LDFD f52 = [X1], INCX
  237. (p12) LDFD f53 = [X2], INCX
  238. (p9) br.ret.sptk.many b0
  239. }
  240. ;;
  /* p13 = bit 1 of N is set. */
  241. { .mmi
  242. (p12) LDFD f54 = [X1], INCX
  243. (p12) LDFD f55 = [X2], INCX
  244. tbit.z p0, p13 = N, 1
  245. }
  246. ;;
  /* p14 = bit 0 of N is set. */
  247. { .mmi
  248. (p13) LDFD f56 = [X1], INCX
  249. (p13) LDFD f57 = [X2], INCX
  250. tbit.z p0, p14 = N, 0
  251. }
  252. ;;
  253. { .mmi
  254. (p13) LDFD f58 = [X1], INCX
  255. (p13) LDFD f59 = [X2], INCX
  256. }
  257. ;;
  /* Stores for the group of four (p12); the single-element load (p14) is
     issued alongside the first pair of stores. */
  258. { .mmi
  259. (p12) STFD [Y1] = f48
  260. (p12) STFD [Y2] = f49
  261. (p12) add Y1 = INCY, Y1
  262. }
  263. { .mmi
  264. (p14) LDFD f60 = [X1], INCX
  265. (p14) LDFD f61 = [X2], INCX
  266. (p12) add Y2 = INCY, Y2
  267. }
  268. ;;
  269. { .mmi
  270. (p12) STFD [Y1] = f50
  271. (p12) STFD [Y2] = f51
  272. (p12) add Y1 = INCY, Y1
  273. }
  274. { .mmi
  275. nop __LINE__
  276. (p12) add Y2 = INCY, Y2
  277. }
  278. ;;
  279. { .mmi
  280. (p12) STFD [Y1] = f52
  281. (p12) STFD [Y2] = f53
  282. (p12) add Y1 = INCY, Y1
  283. }
  284. { .mmi
  285. nop __LINE__
  286. nop __LINE__
  287. (p12) add Y2 = INCY, Y2
  288. }
  289. ;;
  290. { .mmi
  291. (p12) STFD [Y1] = f54
  292. (p12) STFD [Y2] = f55
  293. (p12) add Y1 = INCY, Y1
  294. }
  295. { .mmi
  296. nop __LINE__
  297. nop __LINE__
  298. (p12) add Y2 = INCY, Y2
  299. }
  300. ;;
  /* Stores for the group of two (p13). */
  301. { .mmi
  302. (p13) STFD [Y1] = f56
  303. (p13) STFD [Y2] = f57
  304. (p13) add Y1 = INCY, Y1
  305. }
  306. { .mmi
  307. nop __LINE__
  308. nop __LINE__
  309. (p13) add Y2 = INCY, Y2
  310. }
  311. ;;
  312. { .mmi
  313. (p13) STFD [Y1] = f58
  314. (p13) STFD [Y2] = f59
  315. (p13) add Y1 = INCY, Y1
  316. }
  317. { .mmi
  318. nop __LINE__
  319. nop __LINE__
  320. (p13) add Y2 = INCY, Y2
  321. }
  322. ;;
  /* Store for the final single element (p14), then return. */
  323. { .mmb
  324. (p14) STFD [Y1] = f60
  325. (p14) STFD [Y2] = f61
  326. br.ret.sptk.many b0
  327. }
  328. ;;
  /* Setup for the second loop variant, reached when ((X1 - Y1) & 0xf0) is
     below 0x80.  Same steps as the setup in front of .L12, except that the
     destination prefetch pointer starts (PREFETCH_SIZE + 10) * SIZE ahead of
     Y1 instead of (PREFETCH_SIZE + 2) * SIZE. */
  329. .align 16
  330. .L20:
  331. { .mmi
  332. adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1
  333. adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1
  334. mov ar.lc = I
  335. }
  /* p8 = (I == -1), i.e. N < 8: skip the loop and go to the remainder code.
     p12 = bit 2 of N is set. */
  336. { .mib
  337. cmp.eq p8 ,p0 = -1, I
  338. tbit.z p0, p12 = N, 2
  339. (p8) br.cond.dpnt .L25
  340. }
  341. ;;
  /* Second loop variant.  The loads are the same as in .L12 (p16 loads
     elements 1-6 of a group, p17 loads elements 7-8 one trip later), but the
     stores are split over two stages:
       - p18, second half of the trip: elements 1-4, read from f(K+2) where fK
         was the p16 load target;
       - p19, first half of the next trip: elements 5-8, read from f(K+3) for
         the p16 loads and f(K+2) for the p17 loads.
     Y1 / Y2 therefore still advance in element order. */
  342. .align 16
  343. .L22:
  /* Store element 5 (loaded as f64/f68 three trips ago). */
  344. { .mmi
  345. (p19) STFD [Y1] = f67
  346. (p19) STFD [Y2] = f71
  347. (p19) add Y1 = INCY, Y1
  348. }
  /* Stage p17: element 7 of the group whose first six were loaded last trip. */
  349. { .mmi
  350. (p17) LDFD f81 = [X1], INCX
  351. (p17) LDFD f85 = [X2], INCX
  352. (p19) add Y2 = INCY, Y2
  353. }
  354. ;;
  355. { .mmi
  356. (p19) STFD [Y1] = f75
  357. (p19) STFD [Y2] = f79
  358. (p19) add Y1 = INCY, Y1
  359. }
  /* Stage p17: element 8 of the previous group. */
  360. { .mmi
  361. (p17) LDFD f89 = [X1], INCX
  362. (p17) LDFD f93 = [X2], INCX
  363. (p19) add Y2 = INCY, Y2
  364. }
  365. ;;
  366. { .mmi
  367. (p19) STFD [Y1] = f83
  368. (p19) STFD [Y2] = f87
  369. (p19) add Y1 = INCY, Y1
  370. }
  /* Stage p16: start loading a new group, element 1. */
  371. { .mmi
  372. (p16) LDFD f32 = [X1], INCX
  373. (p16) LDFD f36 = [X2], INCX
  374. (p19) add Y2 = INCY, Y2
  375. }
  376. ;;
  377. { .mmi
  378. (p19) STFD [Y1] = f91
  379. (p19) STFD [Y2] = f95
  380. (p19) add Y1 = INCY, Y1
  381. }
  /* Unpredicated prefetch of source and destination; each pointer advances by
     four strides.  The destination prefetch carries the .excl hint. */
  382. { .mmi
  383. lfetch.fault.nt1 [PREX], INCX8
  384. lfetch.fault.excl.nt1 [PREY], INCY8
  385. (p19) add Y2 = INCY, Y2
  386. }
  387. ;;
  388. { .mmi
  389. (p16) LDFD f40 = [X1], INCX
  390. (p16) LDFD f44 = [X2], INCX
  391. nop __LINE__
  392. }
  393. ;;
  /* Stage p18: store elements 1-4 of the following group (loaded as f32/f36,
     f40/f44, f48/f52, f56/f60 two trips ago). */
  394. { .mmi
  395. (p18) STFD [Y1] = f34
  396. (p18) STFD [Y2] = f38
  397. (p18) add Y1 = INCY, Y1
  398. }
  399. { .mmi
  400. (p16) LDFD f48 = [X1], INCX
  401. (p16) LDFD f52 = [X2], INCX
  402. (p18) add Y2 = INCY, Y2
  403. }
  404. ;;
  405. { .mmi
  406. (p18) STFD [Y1] = f42
  407. (p18) STFD [Y2] = f46
  408. (p18) add Y1 = INCY, Y1
  409. }
  410. { .mmi
  411. (p16) LDFD f56 = [X1], INCX
  412. (p16) LDFD f60 = [X2], INCX
  413. (p18) add Y2 = INCY, Y2
  414. }
  415. ;;
  416. { .mmi
  417. (p18) STFD [Y1] = f50
  418. (p18) STFD [Y2] = f54
  419. (p18) add Y1 = INCY, Y1
  420. }
  /* Second prefetch of the trip (eight strides covered per trip in total). */
  421. { .mmi
  422. lfetch.fault.nt1 [PREX], INCX8
  423. lfetch.fault.excl.nt1 [PREY], INCY8
  424. (p18) add Y2 = INCY, Y2
  425. }
  426. ;;
  427. { .mmi
  428. (p18) STFD [Y1] = f58
  429. (p18) STFD [Y2] = f62
  430. (p18) add Y1 = INCY, Y1
  431. }
  432. { .mmi
  433. (p16) LDFD f64 = [X1], INCX
  434. (p16) LDFD f68 = [X2], INCX
  435. (p18) add Y2 = INCY, Y2
  436. }
  437. ;;
  /* Element 6 of the new group, then rotate registers and loop while ar.lc /
     ar.ec allow. */
  438. { .mmb
  439. (p16) LDFD f72 = [X1], INCX
  440. (p16) LDFD f76 = [X2], INCX
  441. br.ctop.sptk.few .L22
  442. }
  443. ;;
  /* Remainder code for the .L22 variant; instruction for instruction the same
     sequence as the .L15 remainder.  Bit 2 of N (p12) selects a group of four
     elements, bit 1 (p13) a group of two, bit 0 (p14) a single element.
     All loads are issued before the first store.  ar.lc and the saved
     predicates are restored on the way. */
  444. .align 32
  445. .L25:
  446. { .mmi
  447. (p12) LDFD f48 = [X1], INCX
  448. (p12) LDFD f49 = [X2], INCX
  449. mov ar.lc = ARLC
  450. }
  451. ;;
  /* Restore predicates from PR under mask -65474 (low bits 0x3e plus the
     sign-extended upper bits): p1-p5 and the rotating predicates are
     rewritten, while p9 and p12 computed during setup stay as they are. */
  452. { .mmi
  453. (p12) LDFD f50 = [X1], INCX
  454. (p12) LDFD f51 = [X2], INCX
  455. mov pr = PR, -65474
  456. }
  457. ;;
  /* p9 = ((N & 15) == 0): nothing is left to copy, return.  Bit 2 of N is
     clear in that case, so none of the p12 loads above were executed. */
  458. { .mmb
  459. (p12) LDFD f52 = [X1], INCX
  460. (p12) LDFD f53 = [X2], INCX
  461. (p9) br.ret.sptk.many b0
  462. }
  463. ;;
  /* p13 = bit 1 of N is set. */
  464. { .mmi
  465. (p12) LDFD f54 = [X1], INCX
  466. (p12) LDFD f55 = [X2], INCX
  467. tbit.z p0, p13 = N, 1
  468. }
  469. ;;
  /* p14 = bit 0 of N is set. */
  470. { .mmi
  471. (p13) LDFD f56 = [X1], INCX
  472. (p13) LDFD f57 = [X2], INCX
  473. tbit.z p0, p14 = N, 0
  474. }
  475. ;;
  476. { .mmi
  477. (p13) LDFD f58 = [X1], INCX
  478. (p13) LDFD f59 = [X2], INCX
  479. }
  480. ;;
  /* Stores for the group of four (p12); the single-element load (p14) is
     issued alongside the first pair of stores. */
  481. { .mmi
  482. (p12) STFD [Y1] = f48
  483. (p12) STFD [Y2] = f49
  484. (p12) add Y1 = INCY, Y1
  485. }
  486. { .mmi
  487. (p14) LDFD f60 = [X1], INCX
  488. (p14) LDFD f61 = [X2], INCX
  489. (p12) add Y2 = INCY, Y2
  490. }
  491. ;;
  492. { .mmi
  493. (p12) STFD [Y1] = f50
  494. (p12) STFD [Y2] = f51
  495. (p12) add Y1 = INCY, Y1
  496. }
  497. { .mmi
  498. nop __LINE__
  499. (p12) add Y2 = INCY, Y2
  500. }
  501. ;;
  502. { .mmi
  503. (p12) STFD [Y1] = f52
  504. (p12) STFD [Y2] = f53
  505. (p12) add Y1 = INCY, Y1
  506. }
  507. { .mmi
  508. nop __LINE__
  509. nop __LINE__
  510. (p12) add Y2 = INCY, Y2
  511. }
  512. ;;
  513. { .mmi
  514. (p12) STFD [Y1] = f54
  515. (p12) STFD [Y2] = f55
  516. (p12) add Y1 = INCY, Y1
  517. }
  518. { .mmi
  519. nop __LINE__
  520. nop __LINE__
  521. (p12) add Y2 = INCY, Y2
  522. }
  523. ;;
  /* Stores for the group of two (p13). */
  524. { .mmi
  525. (p13) STFD [Y1] = f56
  526. (p13) STFD [Y2] = f57
  527. (p13) add Y1 = INCY, Y1
  528. }
  529. { .mmi
  530. nop __LINE__
  531. nop __LINE__
  532. (p13) add Y2 = INCY, Y2
  533. }
  534. ;;
  535. { .mmi
  536. (p13) STFD [Y1] = f58
  537. (p13) STFD [Y2] = f59
  538. (p13) add Y1 = INCY, Y1
  539. }
  540. { .mmi
  541. nop __LINE__
  542. nop __LINE__
  543. (p13) add Y2 = INCY, Y2
  544. }
  545. ;;
  /* Store for the final single element (p14), then return. */
  546. { .mmb
  547. (p14) STFD [Y1] = f60
  548. (p14) STFD [Y2] = f61
  549. br.ret.sptk.many b0
  550. }
  551. ;;
  /* EPILOGUE is a macro from common.h (not visible here); presumably it
     closes the procedure opened by PROLOGUE -- TODO confirm. */
  552. EPILOGUE