You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

izamax.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build-time configuration and register aliases for the kernel below.
   * ASSEMBLER is defined before common.h is included; common.h is expected
   * to supply the macros used later (PROLOGUE, LDFD, FADD, SIZE, ...).
   */
  #define ASSEMBLER
  #include "common.h"

  /* Prefetch distance; scaled by SIZE where PRE1 is initialised. */
  #if defined(XDOUBLE)
  #define PREFETCH_SIZE   ( 8 * 16 +  4)
  #elif defined(DOUBLE)
  #define PREFETCH_SIZE   (16 * 16 +  8)
  #else
  #define PREFETCH_SIZE   (32 * 16 + 16)
  #endif

  /* Integer compare used to decide whether a candidate replaces the current best. */
  #ifndef USE_MIN
  #define CMPUNC          cmp.gt.unc
  #define CMP             cmp.gt
  #else
  #define CMPUNC          cmp.lt.unc
  #define CMP             cmp.lt
  #endif

  /* Result register and incoming arguments. */
  #define RET             r8      /* written as MAX1 + 1 on exit              */
  #define N               r32
  #define DX              r33     /* data pointer, post-incremented by loads  */
  #define INCX            r34     /* shifted left by ZBASE_SHIFT on entry     */

  /* Scratch general registers. */
  #define PRE1            r2      /* lfetch address                           */
  #define I               r14     /* index base of the group being compared   */
  #define J               r15     /* loop trip count, later N-1 modulo 8      */
  #define K               r16     /* N - 1                                    */
  #define TMP             r17     /* copy of I taken before I is advanced     */
  #define INCXM1          r18     /* INCX - SIZE                              */
  #define INCX8           r19     /* INCX * 8                                 */
  #define MAX1            r20     /* zero-based index of the current best     */
  #define DMAX1           r21     /* getf.d image of the current best value   */

  /* getf.d images of the candidates under comparison. */
  #define DATA1           r22
  #define DATA2           r23
  #define DATA3           r24
  #define DATA4           r25
  #define DATA5           r26
  #define DATA6           r27
  #define DATA7           r28
  #define DATA8           r29

  /* Saved machine state. */
  #define PR              r30     /* copy of pr taken on entry                */
  #define ARLC            r31     /* copy of ar.lc taken on entry             */
  /*
   * Index search kernel (IA-64).
   *
   * Each vector element is read as two values SIZE bytes apart; its score is
   * fabs(first) + fabs(second).  Scores are moved to general registers with
   * getf.d and compared as integers with CMPUNC (cmp.gt, or cmp.lt when
   * USE_MIN is defined), so a later element replaces the current best only on
   * a strict win.  NOTE(review): the page names this file izamax.S, so the two
   * values are presumably the real and imaginary parts of a complex element.
   *
   * In:   N (r32), DX (r33), INCX (r34).  With F_INTERFACE, N and INCX are
   *       pointers and are loaded through LDINT (sign-extended from 32 bits
   *       unless USE64BITINT is defined).
   * Out:  RET (r8) = MAX1 + 1, i.e. the one-based position of the selected
   *       element; 0 when N <= 0 or INCX <= 0.  f8 receives setf.d of DMAX1.
   *
   * Structure:
   *   - element 0 seeds DMAX1 / MAX1;
   *   - .L10 is a software-pipelined loop (ar.ec = 5, stage predicates
   *     p16..p20) that consumes 8 elements per trip, (N-1)/8 trips;
   *   - .L15 handles the remaining (N-1) mod 8 elements in groups of
   *     4 / 2 / 1 selected by bits 2 / 1 / 0 of K (p13 / p14 / p15).
   *
   * pr is saved in PR on entry and restored at .L15 with mask -65474
   * (p1-p5 and p16-p63).  .L999 itself does not restore pr, so nothing that
   * can reach .L999 directly may have modified those predicates: pr.rot is
   * therefore cleared only after both early-exit checks.
   */
  PROLOGUE
  .prologue
  PROFCODE
        { .mmi
        mov MAX1 = -1                   /* early exits return MAX1 + 1 = 0 */
        mov DMAX1 = 0
        .save ar.lc, ARLC
        mov ARLC = ar.lc
        }
  .body
  #ifdef F_INTERFACE
        { .mmi
        LDINT N = [N]
        LDINT INCX = [INCX]
        nop.i 0
        }
        ;;
  #ifndef USE64BITINT
        { .mii
        nop.m 0
        sxt4 N = N
        sxt4 INCX = INCX
        }
        ;;
  #endif
  #endif
        { .mii
        adds K = -1, N                  /* K = N - 1: elements left after the seed */
        shl INCX = INCX, ZBASE_SHIFT
        mov PR = pr
        }
        { .mmb
        cmp.ge p8, p0 = 0, N
        (p8) br.cond.dptk .L999         /* N <= 0 */
        }
        ;;
        /* The rotating predicates must still be intact on this early exit. */
        { .mib
        cmp.ge p6, p0 = 0, INCX
        nop.i 0
        (p6) br.cond.dptk .L999         /* INCX <= 0 */
        }
        ;;
        /* Seed the search with element 0 and set up the pipelined loop. */
        { .mmi
        LDFD f6 = [DX], SIZE
        adds INCXM1 = - SIZE, INCX      /* step from second value to next element */
        mov pr.rot = 0
        }
        ;;
        { .mmi
        LDFD f7 = [DX], INCXM1
        mov MAX1 = 0
        mov ar.ec = 5
        }
        ;;
        { .mfi
        cmp.eq p16, p0 = r0, r0         /* enable the first pipeline stage */
        fabs f6 = f6
        shr J = K, 3
        }
        { .mmf
        mov I = 1                       /* index of the next element to compare */
        nop.m 0
        fabs f7 = f7
        }
        ;;
        { .mmi
        cmp.ne p8, p0 = r0, r0          /* p8 = 0: no pending update */
        adds J = -1, J                  /* ar.lc value: trips - 1 */
        shladd INCX8 = INCX, 3, r0
        }
        { .mmf
        nop.m 0
        nop.m 0
        FADD f6 = f6, f7
        }
        ;;
        { .mmi
        getf.d DMAX1 = f6
        adds PRE1 = PREFETCH_SIZE * SIZE, DX
        mov ar.lc = J
        }
        { .mib
        cmp.eq p7, p0 = -1, J           /* fewer than 8 elements left: skip the loop */
        tbit.z p0, p13 = K, 2           /* p13: a group of 4 remains for the tail */
        (p7) br.cond.dpnt .L15
        }
        .align 32
        ;;
  /*
   * Main loop, 8 elements per trip.  Loads run under p16, fabs and most adds
   * under p19, and the compares against DMAX1 under p20 (DATA1 is compared
   * under p19 at the bottom of the previous trip).  Each CMPUNC sets p8; the
   * following group applies the update to DMAX1 and MAX1 under p8.
   */
  .L10:
        { .mmf
        (p16) lfetch.nt1 [PRE1], INCX8
        (p16) LDFD f32 = [DX], SIZE
        (p19) fabs f35 = f35
        }
        { .mmf
        (p8 ) mov DMAX1 = DATA1
        nop.m 0
        (p19) fabs f40 = f40
        }
        ;;
        { .mmf
        (p20) getf.d DATA5 = f12
        (p16) LDFD f37 = [DX], INCXM1
        (p20) FADD f14 = f96, f101
        }
        { .mmi
        (p8 ) adds MAX1 = 0, I
        (p20) CMPUNC p8, p0 = DATA2, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f42 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA2
        (p19) fabs f45 = f45
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p19) fabs f50 = f50
        }
        ;;
        { .mmf
        (p20) getf.d DATA6 = f13
        (p16) LDFD f47 = [DX], INCXM1
        (p20) FADD f15 = f106, f111
        }
        { .mmi
        (p8 ) adds MAX1 = 1, I
        (p20) CMPUNC p8, p0 = DATA3, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f52 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA3
        (p19) fabs f55 = f55
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p19) fabs f60 = f60
        }
        ;;
        { .mmf
        (p20) getf.d DATA7 = f14
        (p16) LDFD f57 = [DX], INCXM1
        (p19) FADD f8 = f35, f40
        }
        { .mmi
        (p8 ) adds MAX1 = 2, I
        (p20) CMPUNC p8, p0 = DATA4, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f62 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA4
        (p19) fabs f65 = f65
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p19) fabs f70 = f70
        }
        ;;
        { .mmf
        (p20) getf.d DATA8 = f15
        (p16) LDFD f67 = [DX], INCXM1
        (p19) FADD f9 = f45, f50
        }
        { .mmi
        (p8 ) adds MAX1 = 3, I
        (p20) CMPUNC p8, p0 = DATA5, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f72 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA5
        (p19) fabs f75 = f75
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p19) fabs f80 = f80
        }
        ;;
        { .mmf
        (p19) getf.d DATA1 = f8
        (p16) LDFD f77 = [DX], INCXM1
        (p19) FADD f10 = f55, f60
        }
        { .mmi
        (p8 ) adds MAX1 = 4, I
        (p20) CMPUNC p8, p0 = DATA6, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f82 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA6
        (p19) fabs f85 = f85
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p19) fabs f90 = f90
        }
        ;;
        { .mmf
        (p19) getf.d DATA2 = f9
        (p16) LDFD f87 = [DX], INCXM1
        (p19) FADD f11 = f65, f70
        }
        { .mmi
        (p8 ) adds MAX1 = 5, I
        (p20) CMPUNC p8, p0 = DATA7, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f92 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA7
        (p19) fabs f95 = f95
        }
        { .mmf
        mov TMP = I                     /* keep the old base for the "+7" update below */
        nop.m 0
        (p19) fabs f100 = f100
        }
        ;;
        { .mmf
        (p19) getf.d DATA3 = f10
        (p16) LDFD f97 = [DX], INCXM1
        (p19) FADD f12 = f75, f80
        }
        { .mmi
        (p8 ) adds MAX1 = 6, I
        (p20) CMPUNC p8, p0 = DATA8, DMAX1
        nop.i 0
        }
        ;;
        { .mmf
        (p16) LDFD f102 = [DX], SIZE
        (p8 ) mov DMAX1 = DATA8
        (p19) fabs f105 = f105
        }
        { .mmf
        (p20) adds I = 8, I             /* this group of 8 is fully compared */
        nop.m 0
        (p19) fabs f110 = f110
        }
        ;;
        { .mmi
        (p19) getf.d DATA4 = f11
        (p16) LDFD f107 = [DX], INCXM1
        (p8 ) adds MAX1 = 7, TMP
        }
        { .mfb
        (p19) CMPUNC p8, p0 = DATA1, DMAX1
        (p19) FADD f13 = f85, f90
        br.ctop.sptk.few .L10
        }
        ;;
        .align 32
  /*
   * Tail: (N-1) mod 8 elements.  The predicate restore leaves p6-p15 alone,
   * so p13 computed before the loop is still valid here.
   */
  .L15:
        { .mmi
        (p13) LDFD f32 = [DX], SIZE
        and J = 7, K
        mov pr = PR, -65474             /* restore p1-p5 and p16-p63 */
        }
        ;;
        { .mmb
        (p13) LDFD f33 = [DX], INCXM1
        cmp.eq p8, p0 = r0, J
        (p8) br.cond.dpnt .L999         /* nothing left */
        }
        ;;
        { .mmi
        (p13) LDFD f34 = [DX], SIZE
        ;;
        (p13) LDFD f35 = [DX], INCXM1
        nop.i 0
        }
        ;;
        { .mmi
        (p13) LDFD f36 = [DX], SIZE
        ;;
        (p13) LDFD f37 = [DX], INCXM1
        nop.i 0
        }
        ;;
        { .mfi
        (p13) LDFD f38 = [DX], SIZE
        (p13) fabs f32 = f32
        tbit.z p0, p14 = K, 1           /* p14: a group of 2 remains */
        }
        ;;
        { .mmf
        (p13) LDFD f39 = [DX], INCXM1
        nop.m 0
        (p13) fabs f33 = f33
        }
        ;;
        { .mmf
        (p14) LDFD f40 = [DX], SIZE
        nop.m 0
        (p13) fabs f34 = f34
        }
        ;;
        { .mfi
        (p14) LDFD f41 = [DX], INCXM1
        (p13) fabs f35 = f35
        tbit.z p0, p15 = K, 0           /* p15: a single element remains */
        }
        ;;
        { .mmf
        (p14) LDFD f42 = [DX], SIZE
        nop.m 0
        (p13) fabs f36 = f36
        }
        ;;
        { .mmf
        (p14) LDFD f43 = [DX], INCXM1
        nop.m 0
        (p13) fabs f37 = f37
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p13) FADD f32 = f32, f33
        }
        ;;
        { .mmf
        (p15) LDFD f44 = [DX], SIZE
        nop.m 0
        (p13) fabs f38 = f38
        }
        ;;
        { .mmf
        (p15) LDFD f45 = [DX], INCXM1
        nop.m 0
        (p13) fabs f39 = f39
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p13) FADD f34 = f34, f35
        }
        ;;
        { .mmf
        nop.m 0
        nop.m 0
        (p14) fabs f40 = f40
        }
        ;;
        { .mmf
        (p13) getf.d DATA1 = f32
        nop.m 0
        (p14) fabs f41 = f41
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p13) FADD f36 = f36, f37
        }
        ;;
        { .mmf
        nop.m 0
        nop.m 0
        (p14) fabs f42 = f42
        }
        ;;
        { .mmf
        (p13) getf.d DATA2 = f34
        nop.m 0
        (p14) fabs f43 = f43
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p13) FADD f38 = f38, f39
        }
        ;;
        { .mmf
        nop.m 0
        nop.m 0
        (p15) fabs f44 = f44
        }
        ;;
        { .mmf
        (p13) getf.d DATA3 = f36
        nop.m 0
        (p15) fabs f45 = f45
        }
        { .mmf
        nop.m 0
        nop.m 0
        (p14) FADD f40 = f40, f41
        }
        ;;
        { .mmf
        (p13) getf.d DATA4 = f38
        nop.m 0
        (p14) FADD f42 = f42, f43
        }
        ;;
        { .mmf
        (p14) getf.d DATA5 = f40
        nop.m 0
        (p15) FADD f44 = f44, f45
        }
        ;;
        /* Group of 4 (p13): candidates at I .. I+3. */
        { .mmi
        (p14) getf.d DATA6 = f42
        nop.m 0
        (p13) CMPUNC p8, p0 = DATA1, DMAX1
        }
        ;;
        { .mmi
        (p15) getf.d DATA7 = f44
        (p8 ) adds MAX1 = 0, I
        (p8 ) mov DMAX1 = DATA1
        }
        ;;
        { .mmi
        (p13) CMPUNC p8, p0 = DATA2, DMAX1
        ;;
        (p8 ) adds MAX1 = 1, I
        (p8 ) mov DMAX1 = DATA2
        }
        ;;
        { .mmi
        (p13) CMPUNC p8, p0 = DATA3, DMAX1
        ;;
        (p8 ) adds MAX1 = 2, I
        (p8 ) mov DMAX1 = DATA3
        }
        ;;
        { .mmi
        (p13) CMPUNC p8, p0 = DATA4, DMAX1
        ;;
        (p8 ) adds MAX1 = 3, I
        (p8 ) mov DMAX1 = DATA4
        }
        { .mmi
        (p13) adds I = 4, I
        nop.m 0
        nop.i 0
        }
        ;;
        /* Group of 2 (p14): candidates at I, I+1. */
        { .mmi
        (p14) CMPUNC p8, p0 = DATA5, DMAX1
        ;;
        (p8 ) adds MAX1 = 0, I
        (p8 ) mov DMAX1 = DATA5
        }
        ;;
        { .mmi
        (p14) CMPUNC p8, p0 = DATA6, DMAX1
        ;;
        (p8 ) adds MAX1 = 1, I
        (p8 ) mov DMAX1 = DATA6
        }
        { .mmi
        (p14) adds I = 2, I
        nop.m 0
        nop.i 0
        }
        ;;
        /* Last single element (p15). */
        { .mmi
        (p15) CMPUNC p8, p0 = DATA7, DMAX1
        ;;
        (p8) adds MAX1 = 0, I
        (p8) mov DMAX1 = DATA7
        }
        ;;
        .align 32
  /* Common exit: publish the result and restore ar.lc. */
  .L999:
        { .mmi
        setf.d f8 = DMAX1
        adds RET = 1, MAX1              /* zero-based MAX1 -> one-based result */
        mov ar.lc = ARLC
        }
        { .mmb
        nop.m 0
        nop.m 0
        br.ret.sptk.many b0
        }
  EPILOGUE