You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

xdot.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Look-ahead of the prefetch pointers: the setup code below starts
     PREX1/PREY1 at X1/Y1 + (PREFETCH_SIZE + 2) * SIZE. */
  40. #define PREFETCH_SIZE (4 * 24)
  /* Argument registers. With F_INTERFACE every operand sits one register
     higher (r33-r37) and N/INCX/INCY are loaded through their registers
     as addresses; r32 is then used as the address the two result values
     are stored to just before returning. */
  41. #ifdef F_INTERFACE
  42. #define N r33
  43. #define X1 r34
  44. #define INCX r35
  45. #define Y1 r36
  46. #define INCY r37
  47. #else
  48. #define N r32
  49. #define X1 r33
  50. #define INCX r34
  51. #define Y1 r35
  52. #define INCY r36
  53. #endif
  /* Pointers advanced by lfetch, running ahead of X1 and Y1. */
  54. #define PREX1 r2
  55. #define PREY1 r3
  /* I = (N >> 3) - 1, written to ar.lc; J = N & 7, the leftover count. */
  56. #define I r14
  57. #define J r15
  /* Second load streams: Y2 = Y1 + SIZE, X2 = X1 + SIZE. */
  58. #define Y2 r16
  59. #define X2 r17
  /* Four times the scaled INCX / INCY; post-increment used by lfetch. */
  60. #define INCX4 r24
  61. #define INCY4 r25
  /* Copies of the predicate registers and of ar.lc taken on entry and
     restored before returning. */
  62. #define PR r30
  63. #define ARLC r31
  /*
   * Kernel body (PROLOGUE ... EPILOGUE).
   *
   * For each of N elements four values are loaded -- [X1], [X2], [Y1],
   * [Y2], with X2 = X1 + SIZE and Y2 = Y1 + SIZE -- and four running
   * sums are updated (two accumulator sets are used alternately):
   *     f8  / f12 += [Y1] * [X1]        f9  / f13 += [Y2] * [X1]
   *     f10 / f14 += [Y1] * [X2]        f11 / f15 += [Y2] * [X2]
   * Presumably X1/Y1 stream the real parts and X2/Y2 the imaginary
   * parts of complex vectors -- TODO confirm against common.h.
   *
   * At .L999 the two accumulator sets are added and then combined:
   *     without CONJ:  f8 = f8 - f11,  f9 = f9 + f10
   *     with    CONJ:  f8 = f8 + f11,  f9 = f9 - f10
   * The result is left in f8 / f9; under F_INTERFACE it is additionally
   * stored to the address held in r32.
   *
   * Layout:
   *   setup  - argument fetch/adjust (F_INTERFACE only), N <= 0 exit,
   *            stride scaling, loop counters
   *   .L212  - pipelined main loop, 8 elements per iteration
   *   .L215  - leftover N & 7 elements, gated by bits 2, 1, 0 of N
   *   .L999  - reduction of the accumulators, ar.lc restore
   *   .L1000 - optional store, predicate restore, return
   */
  64. PROLOGUE
  65. .prologue
  66. PROFCODE
  /* Clear f8, save ar.lc so it can be restored at .L999. */
  67. { .mfi
  68. nop __LINE__
  69. mov f8 = f0
  70. .save ar.lc, ARLC
  71. mov ARLC = ar.lc
  72. }
  /* r26 = 1 is only consumed by the F_INTERFACE block (to form 1 - N). */
  73. { .mfi
  74. mov r26 = 1
  75. mov f9 = f0
  76. nop __LINE__
  77. }
  78. ;;
  79. .body
  80. #ifdef F_INTERFACE
  /* N, INCX and INCY arrive as addresses here: load the values. */
  81. LDINT N = [N]
  82. LDINT INCX = [INCX]
  83. LDINT INCY = [INCY]
  84. ;;
  85. #ifndef USE64BITINT
  /* Loaded integers are 32-bit in this configuration: sign-extend. */
  86. sxt4 N = N
  87. sxt4 INCX = INCX
  88. sxt4 INCY = INCY
  89. ;;
  90. #endif
  /* p6 = (INCX < 0), p7 = (INCY < 0); r26 = 1 - N. */
  91. cmp.le p0, p6 = r0, INCX
  92. cmp.le p0, p7 = r0, INCY
  93. sub r26 = r26, N
  94. ;;
  /* Integer multiply through the FP unit:
     r26 = (1 - N) * INCX, r27 = (1 - N) * INCY. */
  95. setf.sig f32 = r26
  96. setf.sig f33 = INCX
  97. setf.sig f34 = INCY
  98. ;;
  99. xmpy.l f33 = f32, f33
  100. xmpy.l f34 = f32, f34
  101. ;;
  102. getf.sig r26 = f33
  103. getf.sig r27 = f34
  104. ;;
  /* Scale by ZBASE_SHIFT (defined outside this file; presumably log2 of
     the element size in bytes -- TODO confirm). */
  105. shl r26 = r26, ZBASE_SHIFT
  106. shl r27 = r27, ZBASE_SHIFT
  107. ;;
  /* Only for a negative increment: move the base pointer by
     (1 - N) * INC elements. */
  108. (p6) add X1 = r26, X1
  109. (p7) add Y1 = r27, Y1
  110. ;;
  111. #endif
  /* Start the X prefetch pointer ahead of X1; save the predicates. */
  112. { .mfi
  113. adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1
  114. mov f10 = f0
  115. mov PR = pr
  116. }
  /* p6 = !(0 < N): for N <= 0 leave through .L1000 with f8 = f9 = f0. */
  117. { .mfb
  118. cmp.lt p0, p6 = r0, N
  119. mov f11 = f0
  120. (p6) br.cond.spnt .L1000
  121. }
  122. ;;
  /* Y prefetch pointer; scale both strides by ZBASE_SHIFT. */
  123. { .mii
  124. adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1
  125. shl INCX = INCX, ZBASE_SHIFT
  126. shl INCY = INCY, ZBASE_SHIFT
  127. }
  128. ;;
  /* Second stream of each vector starts SIZE past the first; clear the
     rotating predicates before the pipelined loop. */
  129. { .mfi
  130. add X2 = SIZE, X1
  131. mov f12 = f0
  132. mov pr.rot= 0
  133. }
  134. { .mfi
  135. add Y2 = SIZE, Y1
  136. mov f13 = f0
  137. shr I = N, 3
  138. }
  139. ;;
  /* I = (N >> 3) - 1 goes to ar.lc below; epilog count ar.ec = 3. */
  140. { .mfi
  141. adds I = -1, I
  142. mov f14 = f0
  143. mov ar.ec= 3
  144. }
  /* lfetch stride = 4 * scaled increment (two lfetch per stream per
     loop iteration, i.e. 8 increments per 8 elements). */
  145. { .mmf
  146. shladd INCX4 = INCX, 2, r0
  147. shladd INCY4 = INCY, 2, r0
  148. mov f15 = f0
  149. }
  150. ;;
  /* J = N & 7 leftovers; p16 = 1 enables the first pipeline stage. */
  151. { .mmi
  152. and J = 7, N
  153. cmp.eq p16, p0 = r0, r0
  154. mov ar.lc = I
  155. }
  /* p6 = (I == -1), i.e. N >> 3 == 0: skip the main loop.
     p12 = bit 2 of N, used by the leftover code at .L215. */
  156. { .mib
  157. cmp.eq p6 ,p0 = -1, I
  158. tbit.nz p12, p0 = N, 2
  159. (p6) br.cond.dpnt .L215
  160. }
  161. ;;
  162. .align 32
  /*
   * Main loop: 8 elements per iteration, closed by br.ctop.
   * Loads and prefetches are predicated on p16, FMAs on p18. Each FMA
   * source register number is the matching load destination + 2
   * (f80 -> f82, f32 -> f34, ...), i.e. the FMA stage consumes what the
   * load stage fetched two register rotations earlier.
   * Even elements accumulate into f8-f11, odd elements into f12-f15.
   */
  163. .L212:
  /* element 0 */
  164. { .mmf
  165. (p16) lfetch.nt1 [PREX1], INCX4
  166. (p16) LDFD f80 = [X1], INCX
  167. (p18) FMA f8 = f34, f82, f8
  168. }
  169. { .mmf
  170. (p16) LDFD f83 = [X2], INCX
  171. nop __LINE__
  172. (p18) FMA f9 = f37, f82, f9
  173. }
  174. ;;
  175. { .mmf
  176. (p16) LDFD f32 = [Y1], INCY
  177. (p16) LDFD f35 = [Y2], INCY
  178. (p18) FMA f10 = f34, f85, f10
  179. }
  180. { .mmf
  181. nop __LINE__
  182. nop __LINE__
  183. (p18) FMA f11 = f37, f85, f11
  184. }
  185. ;;
  /* element 1 */
  186. { .mmf
  187. (p16) LDFD f86 = [X1], INCX
  188. (p16) LDFD f89 = [X2], INCX
  189. (p18) FMA f12 = f40, f88, f12
  190. }
  191. { .mmf
  192. nop __LINE__
  193. nop __LINE__
  194. (p18) FMA f13 = f43, f88, f13
  195. }
  196. ;;
  197. { .mmf
  198. (p16) LDFD f38 = [Y1], INCY
  199. (p16) LDFD f41 = [Y2], INCY
  200. (p18) FMA f14 = f40, f91, f14
  201. }
  202. { .mmf
  203. nop __LINE__
  204. nop __LINE__
  205. (p18) FMA f15 = f43, f91, f15
  206. }
  207. ;;
  /* element 2 */
  208. { .mmf
  209. (p16) LDFD f92 = [X1], INCX
  210. (p16) LDFD f95 = [X2], INCX
  211. (p18) FMA f8 = f46, f94, f8
  212. }
  213. { .mmf
  214. nop __LINE__
  215. nop __LINE__
  216. (p18) FMA f9 = f49, f94, f9
  217. }
  218. ;;
  219. { .mmf
  220. (p16) lfetch.nt1 [PREY1], INCY4
  221. (p16) LDFD f44 = [Y1], INCY
  222. (p18) FMA f10 = f46, f97, f10
  223. }
  224. { .mmf
  225. (p16) LDFD f47 = [Y2], INCY
  226. nop __LINE__
  227. (p18) FMA f11 = f49, f97, f11
  228. }
  229. ;;
  /* element 3 */
  230. { .mmf
  231. (p16) LDFD f98 = [X1], INCX
  232. (p16) LDFD f101 = [X2], INCX
  233. (p18) FMA f12 = f52, f100, f12
  234. }
  235. { .mmf
  236. nop __LINE__
  237. nop __LINE__
  238. (p18) FMA f13 = f55, f100, f13
  239. }
  240. ;;
  241. { .mmf
  242. (p16) LDFD f50 = [Y1], INCY
  243. (p16) LDFD f53 = [Y2], INCY
  244. (p18) FMA f14 = f52, f103, f14
  245. }
  246. { .mmf
  247. nop __LINE__
  248. nop __LINE__
  249. (p18) FMA f15 = f55, f103, f15
  250. }
  251. ;;
  /* element 4 */
  252. { .mmf
  253. (p16) lfetch.nt1 [PREX1], INCX4
  254. (p16) LDFD f104 = [X1], INCX
  255. (p18) FMA f8 = f58, f106, f8
  256. }
  257. { .mmf
  258. (p16) LDFD f107 = [X2], INCX
  259. nop __LINE__
  260. (p18) FMA f9 = f61, f106, f9
  261. }
  262. ;;
  263. { .mmf
  264. (p16) LDFD f56 = [Y1], INCY
  265. (p16) LDFD f59 = [Y2], INCY
  266. (p18) FMA f10 = f58, f109, f10
  267. }
  268. { .mmf
  269. nop __LINE__
  270. nop __LINE__
  271. (p18) FMA f11 = f61, f109, f11
  272. }
  273. ;;
  /* element 5 */
  274. { .mmf
  275. (p16) LDFD f110 = [X1], INCX
  276. (p16) LDFD f113 = [X2], INCX
  277. (p18) FMA f12 = f64, f112, f12
  278. }
  279. { .mmf
  280. nop __LINE__
  281. nop __LINE__
  282. (p18) FMA f13 = f67, f112, f13
  283. }
  284. ;;
  285. { .mmf
  286. (p16) LDFD f62 = [Y1], INCY
  287. (p16) LDFD f65 = [Y2], INCY
  288. (p18) FMA f14 = f64, f115, f14
  289. }
  290. { .mmf
  291. nop __LINE__
  292. nop __LINE__
  293. (p18) FMA f15 = f67, f115, f15
  294. }
  295. ;;
  /* element 6 */
  296. { .mmf
  297. (p16) lfetch.nt1 [PREY1], INCY4
  298. (p16) LDFD f116 = [X1], INCX
  299. (p18) FMA f8 = f70, f118, f8
  300. }
  301. { .mmf
  302. (p16) LDFD f119 = [X2], INCX
  303. nop __LINE__
  304. (p18) FMA f9 = f73, f118, f9
  305. }
  306. ;;
  307. { .mmf
  308. (p16) LDFD f68 = [Y1], INCY
  309. (p16) LDFD f71 = [Y2], INCY
  310. (p18) FMA f10 = f70, f121, f10
  311. }
  312. { .mmf
  313. nop __LINE__
  314. nop __LINE__
  315. (p18) FMA f11 = f73, f121, f11
  316. }
  317. ;;
  /* element 7 */
  318. { .mmf
  319. (p16) LDFD f122 = [X1], INCX
  320. (p16) LDFD f125 = [X2], INCX
  321. (p18) FMA f12 = f76, f124, f12
  322. }
  323. { .mmf
  324. nop __LINE__
  325. nop __LINE__
  326. (p18) FMA f13 = f79, f124, f13
  327. }
  328. ;;
  329. { .mmf
  330. (p16) LDFD f74 = [Y1], INCY
  331. (p16) LDFD f77 = [Y2], INCY
  332. (p18) FMA f14 = f76, f127, f14
  333. }
  /* Counted loop-back branch for the pipelined loop. */
  334. { .mfb
  335. nop __LINE__
  336. (p18) FMA f15 = f79, f127, f15
  337. br.ctop.sptk.few .L212
  338. }
  339. ;;
  340. .align 32
  /*
   * Leftover elements (J = N & 7), no register rotation here.
   *   p12 (bit 2 of N): 4 elements, X in f48-f55, Y in f32-f39
   *   p13 (bit 1 of N): 2 elements, X in f56-f59, Y in f40-f43
   *   p14 (bit 0 of N): 1 element,  X in f60-f61, Y in f44-f45
   * Loads of a later chunk are interleaved with the FMAs of an
   * earlier one.
   */
  341. .L215:
  /* p7 = (J == 0): nothing left over, go straight to the reduction. */
  342. { .mmi
  343. (p12) LDFD f48 = [X1], INCX
  344. (p12) LDFD f49 = [X2], INCX
  345. cmp.eq p7, p0 = r0, J
  346. }
  347. ;;
  348. { .mmb
  349. (p12) LDFD f32 = [Y1], INCY
  350. (p12) LDFD f33 = [Y2], INCY
  351. (p7) br.cond.dptk .L999
  352. }
  353. ;;
  354. { .mmi
  355. (p12) LDFD f50 = [X1], INCX
  356. (p12) LDFD f51 = [X2], INCX
  357. tbit.nz p13, p0 = N, 1
  358. }
  359. ;;
  360. { .mmi
  361. (p12) LDFD f34 = [Y1], INCY
  362. (p12) LDFD f35 = [Y2], INCY
  363. nop __LINE__
  364. }
  365. ;;
  366. { .mmi
  367. (p12) LDFD f52 = [X1], INCX
  368. (p12) LDFD f53 = [X2], INCX
  369. tbit.nz p14, p0 = N, 0
  370. }
  371. ;;
  372. { .mmi
  373. (p12) LDFD f36 = [Y1], INCY
  374. (p12) LDFD f37 = [Y2], INCY
  375. nop __LINE__
  376. }
  377. ;;
  /* From here the FMAs of the 4-element chunk start while its last
     loads, and then the loads of the smaller chunks, are issued. */
  378. { .mmf
  379. (p12) LDFD f54 = [X1], INCX
  380. (p12) LDFD f55 = [X2], INCX
  381. (p12) FMA f8 = f32, f48, f8
  382. }
  383. { .mmf
  384. nop __LINE__
  385. nop __LINE__
  386. (p12) FMA f9 = f33, f48, f9
  387. }
  388. ;;
  389. { .mmf
  390. (p12) LDFD f38 = [Y1], INCY
  391. (p12) LDFD f39 = [Y2], INCY
  392. (p12) FMA f10 = f32, f49, f10
  393. }
  394. { .mmf
  395. nop __LINE__
  396. nop __LINE__
  397. (p12) FMA f11 = f33, f49, f11
  398. }
  399. ;;
  /* Loads for the 2-element chunk (p13). */
  400. { .mmf
  401. (p13) LDFD f56 = [X1], INCX
  402. (p13) LDFD f57 = [X2], INCX
  403. (p12) FMA f12 = f34, f50, f12
  404. }
  405. { .mmf
  406. nop __LINE__
  407. nop __LINE__
  408. (p12) FMA f13 = f35, f50, f13
  409. }
  410. ;;
  411. { .mmf
  412. (p13) LDFD f40 = [Y1], INCY
  413. (p13) LDFD f41 = [Y2], INCY
  414. (p12) FMA f14 = f34, f51, f14
  415. }
  416. { .mmf
  417. nop __LINE__
  418. nop __LINE__
  419. (p12) FMA f15 = f35, f51, f15
  420. }
  421. ;;
  422. { .mmf
  423. (p13) LDFD f58 = [X1], INCX
  424. (p13) LDFD f59 = [X2], INCX
  425. (p12) FMA f8 = f36, f52, f8
  426. }
  427. { .mmf
  428. nop __LINE__
  429. nop __LINE__
  430. (p12) FMA f9 = f37, f52, f9
  431. }
  432. ;;
  433. { .mmf
  434. (p13) LDFD f42 = [Y1], INCY
  435. (p13) LDFD f43 = [Y2], INCY
  436. (p12) FMA f10 = f36, f53, f10
  437. }
  438. { .mmf
  439. nop __LINE__
  440. nop __LINE__
  441. (p12) FMA f11 = f37, f53, f11
  442. }
  443. ;;
  /* Loads for the final single element (p14); these are the last
     accesses, so the pointers are not advanced. */
  444. { .mmf
  445. (p14) LDFD f60 = [X1]
  446. (p14) LDFD f61 = [X2]
  447. (p12) FMA f12 = f38, f54, f12
  448. }
  449. { .mmf
  450. nop __LINE__
  451. nop __LINE__
  452. (p12) FMA f13 = f39, f54, f13
  453. }
  454. ;;
  455. { .mmf
  456. (p14) LDFD f44 = [Y1]
  457. (p14) LDFD f45 = [Y2]
  458. (p12) FMA f14 = f38, f55, f14
  459. }
  460. { .mmf
  461. nop __LINE__
  462. nop __LINE__
  463. (p12) FMA f15 = f39, f55, f15
  464. }
  465. ;;
  /* Products of the 2-element chunk. */
  466. (p13) FMA f8 = f40, f56, f8
  467. (p13) FMA f9 = f41, f56, f9
  468. (p13) FMA f10 = f40, f57, f10
  469. (p13) FMA f11 = f41, f57, f11
  470. (p13) FMA f12 = f42, f58, f12
  471. (p13) FMA f13 = f43, f58, f13
  472. (p13) FMA f14 = f42, f59, f14
  473. (p13) FMA f15 = f43, f59, f15
  474. ;;
  /* Products of the final single element. */
  475. (p14) FMA f8 = f44, f60, f8
  476. (p14) FMA f9 = f45, f60, f9
  477. (p14) FMA f10 = f44, f61, f10
  478. (p14) FMA f11 = f45, f61, f11
  479. ;;
  480. .align 32
  /* Reduction: fold the second accumulator set (f12-f15) into the
     first (f8-f11) and restore the loop counter saved on entry. */
  481. .L999:
  482. FADD f8 = f8, f12
  483. FADD f9 = f9, f13
  484. FADD f10 = f10, f14
  485. FADD f11 = f11, f15
  486. mov ar.lc = ARLC
  487. ;;
  /* Final combination; CONJ flips the sign of both cross terms. */
  488. #ifndef CONJ
  489. FSUB f8 = f8, f11
  490. FADD f9 = f9, f10
  491. #else
  492. FADD f8 = f8, f11
  493. FSUB f9 = f9, f10
  494. #endif
  495. ;;
  496. .align 32
  /* Common exit; also reached directly when N <= 0, in which case
     f8 and f9 still hold the f0 copies made on entry. */
  497. .L1000:
  498. #ifdef F_INTERFACE
  /* Store f8 then f9 to consecutive SIZE-spaced slots at [r32]. */
  499. STFD [r32] = f8, SIZE
  500. ;;
  501. STFD [r32] = f9, SIZE
  502. #endif
  /* Restore predicates from PR under the immediate mask; -65474 has
     bits 1-5 and bits 16 and above set (...FFFF003E). */
  503. mov pr = PR, -65474
  504. br.ret.sptk.many b0
  505. EPILOGUE