You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

iamax.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch look-ahead, in elements, selected by build precision
     (XDOUBLE, DOUBLE, or neither). The setup code multiplies it by SIZE
     when it computes the initial prefetch pointer PRE1. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 16 + 4)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 16 + 8)
  44. #else
  45. #define PREFETCH_SIZE (32 * 16 + 16)
  46. #endif
  /* FMAX is the reduction instruction used throughout the routine.
     USE_MIN picks the minimum forms (fmin / famin) instead of the maximum
     forms (fmax / famax); USE_ABS picks the absolute-value forms
     (famax / famin). */
  47. #if !defined(USE_MIN) && defined(USE_ABS)
  48. #define FMAX famax
  49. #elif !defined(USE_MIN) && !defined(USE_ABS)
  50. #define FMAX fmax
  51. #elif defined(USE_MIN) && defined(USE_ABS)
  52. #define FMAX famin
  53. #else
  54. #define FMAX fmin
  55. #endif
  /* Candidate result index for each of the four accumulator lanes.
     IMAX1 (r8) holds the value left behind at every br.ret of the routine;
     presumably r8 is the integer return register of the calling
     convention - TODO confirm. */
  56. #define IMAX1 r8
  57. #define IMAX2 r26
  58. #define IMAX3 r27
  59. #define IMAX4 r28
  /* PRE1: running prefetch address used by lfetch in the main loop. */
  60. #define PRE1 r2
  /* N: element count, X1: read pointer into the vector, INCX: stride
     (converted to a byte stride by the setup code). */
  61. #define N r14
  62. #define X1 r15
  63. #define INCX r16
  /* I: loop counter / remainder scratch. */
  64. #define I r17
  /* NOTE(review): X2 and INCX5 are written once by the setup code, but no
     later read of either is visible in this file. */
  65. #define X2 r18
  66. #define INCX5 r19
  /* INCX16: byte step applied to PRE1 by each lfetch. */
  67. #define INCX16 r20
  /* CURRENT: index base that the predicated adds instructions offset from
     when they record a new candidate index. */
  68. #define CURRENT r21
  /* DMAX1..DMAX4 hold the running extreme of each lane; DMAX5..DMAX8 are
     their partners, so every FMAX writes the opposite set from the one it
     reads. */
  69. #define DMAX1 f8
  70. #define DMAX2 f9
  71. #define DMAX3 f10
  72. #define DMAX4 f11
  73. #define DMAX5 f12
  74. #define DMAX6 f13
  75. #define DMAX7 f14
  76. #define DMAX8 f15
  /* Save slots for the predicate register file (PR) and the loop-count
     application register ar.lc (ARLC); both are restored at .L999. */
  77. #define PR r30
  78. #define ARLC r31
  /* Routine entry and setup: fetch the arguments, reject empty input, seed
     the four accumulator lanes with the first element and prepare the
     modulo-scheduled loop at .L10.
     PROLOGUE, PROFCODE, LDINT, LDFD, BASE_SHIFT and SIZE are not defined in
     this file (presumably they come from common.h). */
  79. PROLOGUE
  80. .prologue
  81. PROFCODE
  82. { .mmi
  /* Default result is 0; this is what the early returns below leave in
     IMAX1. */
  83. mov IMAX1 = 0
  /* ar.lc is overwritten for the counted loop, so keep the caller's value
     (and describe the save to the unwinder). */
  84. .save ar.lc, ARLC
  85. mov ARLC = ar.lc
  86. }
  87. ;;
  88. .body
  /* With F_INTERFACE, N and INCX are read through the pointers in r32 and
     r34; otherwise they are taken directly from r32 and r34. The vector
     pointer is r33 in both cases. */
  89. #ifdef F_INTERFACE
  90. { .mmi
  91. LDINT N = [r32]
  92. LDINT INCX = [r34]
  93. mov X1 = r33
  94. }
  95. ;;
  96. #ifndef USE64BITINT
  /* Sign-extend the low 32 bits of N and INCX to full register width. */
  97. { .mii
  98. nop.m 0
  99. sxt4 N = N
  100. sxt4 INCX = INCX
  101. }
  102. ;;
  103. #endif
  104. #else
  105. { .mmi
  106. mov N = r32
  107. mov X1 = r33
  108. mov INCX = r34
  109. }
  110. ;;
  111. #endif
  /* Snapshot the predicates, then return immediately (IMAX1 still 0) when
     INCX <= 0 (p6) or N <= 0 (p8). */
  112. { .mii
  113. mov PR = pr
  114. cmp.ge p6, p0 = 0, INCX
  115. }
  116. { .mbb
  117. cmp.ge p8, p0 = 0, N
  118. (p8) br.ret.sptk.many b0
  119. (p6) br.ret.sptk.many b0
  120. }
  121. ;;
  /* Load the first element as the initial extreme, turn INCX into
     INCX << BASE_SHIFT (presumably a byte stride - confirm BASE_SHIFT in
     common.h), and clear the rotating predicates. */
  122. { .mmi
  123. LDFD DMAX1 = [X1]
  124. shladd INCX = INCX, BASE_SHIFT, r0
  125. mov pr.rot= 0
  126. }
  127. ;;
  /* Every lane's candidate index and the index base start at 1; N becomes
     the number of elements still to be examined. */
  128. mov IMAX1 = 1
  129. mov IMAX2 = 1
  130. mov IMAX3 = 1
  131. mov IMAX4 = 1
  132. mov CURRENT = 1
  133. adds N = -1, N
  134. ;;
  /* Step X1 past the element just consumed; lanes 2..4 are seeded with the
     same value as lane 1 (DMAX2 here, DMAX3 and DMAX4 below). */
  135. { .mmf
  136. add X1 = X1, INCX
  137. mov DMAX2 = DMAX1
  138. }
  139. ;;
  140. { .mmf
  /* X2 = X1 + 4 * INCX (no later use visible in this file). */
  141. shladd X2 = INCX, 2, X1
  142. }
  143. { .mfi
  /* p16 = true enables the first stage of the pipelined loop;
     I = N / 16 is the number of full 16-element passes. */
  144. cmp.eq p16, p0 = r0, r0
  145. shr I = N, 4
  146. }
  147. ;;
  148. { .mfi
  /* INCX5 = 5 * INCX (no later use visible in this file). */
  149. shladd INCX5 = INCX, 2, INCX
  150. mov DMAX3 = DMAX1
  /* Epilogue count 4: the loop uses stage predicates p16..p19. */
  151. mov ar.ec= 4
  152. }
  153. { .mmf
  /* Prefetch step: 8 * INCX when XDOUBLE (the loop then issues two lfetch
     per pass), 16 * INCX otherwise (one lfetch per pass). */
  154. #ifdef XDOUBLE
  155. shladd INCX16= INCX, 3, r0
  156. #else
  157. shladd INCX16= INCX, 4, r0
  158. #endif
  /* ar.lc takes the pass count minus one; -1 means there is no full pass. */
  159. adds I = -1, I
  160. }
  161. ;;
  /* p7 = bit 3 of the remaining count, i.e. the remainder code at .L15 has
     a group of eight elements to process. */
  162. tbit.z p0, p7 = N, 3
  163. ;;
  164. { .mfi
  165. adds PRE1 = PREFETCH_SIZE * SIZE, X1
  166. mov DMAX4 = DMAX1
  167. mov ar.lc = I
  168. }
  /* Fewer than 16 elements remain: skip the main loop entirely. */
  169. { .mfb
  170. cmp.eq p6 ,p0 = -1, I
  171. (p6) br.cond.dpnt .L15
  172. }
  173. .align 32
  174. ;;
  /* Main loop: each pass covers 16 elements and is modulo-scheduled with
     br.ctop.
     - Stage p16 issues the 16 loads (f32, f36, ... f92) and the prefetch.
     - Stage p18 folds in the first eight elements of the block loaded two
       passes earlier, which by then are named f34 ... f62.
     - Stage p19 folds in the last eight elements of the block loaded three
       passes earlier, named f67 ... f95.
     Four lanes run in parallel; lane j uses DMAXj / DMAX(j+4) alternately
     as FMAX source and destination. Each fcmp.neq.unc compares the two
     registers of a lane; as far as this listing shows, a difference means
     the element just folded in changed the running extreme, and the
     resulting predicate (p8..p15) makes a later adds store
     CURRENT + <position in block> into that lane's IMAX register.
     Within one instruction group a compare reads register values from
     before the FMAX in the same group writes them, so the order of the
     groups below is significant. */
  175. .L10:
  176. { .mmf
  177. (p16) lfetch.nt1 [PRE1], INCX16
  178. (p16) LDFD f32 = [X1], INCX
  179. (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5
  180. }
  181. { .mmf
  182. (p8) adds IMAX1 = 1, CURRENT
  183. nop __LINE__
  184. (p19) FMAX DMAX5 = f67, DMAX1
  185. }
  186. ;;
  187. { .mmf
  188. (p16) LDFD f36 = [X1], INCX
  189. nop __LINE__
  190. (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6
  191. }
  192. { .mmf
  193. (p9) adds IMAX2 = 2, CURRENT
  194. nop __LINE__
  195. (p19) FMAX DMAX6 = f71, DMAX2
  196. }
  197. ;;
  198. { .mmf
  199. (p16) LDFD f40 = [X1], INCX
  200. nop __LINE__
  201. (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7
  202. }
  203. { .mmf
  204. (p10) adds IMAX3 = 3, CURRENT
  205. nop __LINE__
  206. (p19) FMAX DMAX7 = f75, DMAX3
  207. }
  208. ;;
  209. { .mmf
  210. (p16) LDFD f44 = [X1], INCX
  211. nop __LINE__
  212. (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8
  213. }
  214. { .mmf
  215. (p11) adds IMAX4 = 4, CURRENT
  216. nop __LINE__
  217. (p19) FMAX DMAX8 = f79, DMAX4
  218. }
  219. ;;
  220. { .mmf
  221. (p16) LDFD f48 = [X1], INCX
  222. nop __LINE__
  223. (p19) fcmp.neq.unc p8, p0 = DMAX1, DMAX5
  224. }
  225. { .mmf
  226. (p12) adds IMAX1 = 5, CURRENT
  227. nop __LINE__
  228. (p19) FMAX DMAX1 = f83, DMAX5
  229. }
  230. ;;
  231. { .mmf
  232. (p16) LDFD f52 = [X1], INCX
  233. nop __LINE__
  234. (p19) fcmp.neq.unc p9, p0 = DMAX2, DMAX6
  235. }
  236. { .mmf
  237. (p13) adds IMAX2 = 6, CURRENT
  238. nop __LINE__
  239. (p19) FMAX DMAX2 = f87, DMAX6
  240. }
  241. ;;
  242. { .mmf
  243. (p16) LDFD f56 = [X1], INCX
  244. nop __LINE__
  245. (p19) fcmp.neq.unc p10, p0 = DMAX3, DMAX7
  246. }
  247. { .mmf
  248. (p14) adds IMAX3 = 7, CURRENT
  249. nop __LINE__
  250. (p19) FMAX DMAX3 = f91, DMAX7
  251. }
  252. ;;
  253. { .mmf
  254. (p16) LDFD f60 = [X1], INCX
  255. nop __LINE__
  256. (p19) fcmp.neq.unc p11, p0 = DMAX4, DMAX8
  257. }
  258. { .mmf
  259. (p15) adds IMAX4 = 8, CURRENT
  260. nop __LINE__
  261. (p19) FMAX DMAX4 = f95, DMAX8
  262. }
  263. ;;
  /* From here on the FMAX operations belong to stage p18 (first half of
     the next block), while the compares and index updates still finish the
     p19 block. */
  264. { .mmf
  /* XDOUBLE issues a second prefetch per pass in place of the nop. */
  265. #ifdef XDOUBLE
  266. (p16) lfetch.nt1 [PRE1], INCX16
  267. #endif
  268. (p16) LDFD f64 = [X1], INCX
  269. #ifndef XDOUBLE
  270. nop __LINE__
  271. #endif
  272. (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5
  273. }
  274. { .mmf
  275. (p8) adds IMAX1 = 9, CURRENT
  276. nop __LINE__
  277. (p18) FMAX DMAX5 = f34, DMAX1
  278. }
  279. ;;
  280. { .mmf
  281. (p16) LDFD f68 = [X1], INCX
  282. nop __LINE__
  283. (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6
  284. }
  285. { .mmf
  286. (p9) adds IMAX2 = 10, CURRENT
  287. nop __LINE__
  288. (p18) FMAX DMAX6 = f38, DMAX2
  289. }
  290. ;;
  291. { .mmf
  292. (p16) LDFD f72 = [X1], INCX
  293. nop __LINE__
  294. (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7
  295. }
  296. { .mmf
  297. (p10) adds IMAX3 = 11, CURRENT
  298. nop __LINE__
  299. (p18) FMAX DMAX7 = f42, DMAX3
  300. }
  301. ;;
  302. { .mmf
  303. (p16) LDFD f76 = [X1], INCX
  304. nop __LINE__
  305. (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8
  306. }
  307. { .mmf
  308. (p11) adds IMAX4 = 12, CURRENT
  309. nop __LINE__
  310. (p18) FMAX DMAX8 = f46, DMAX4
  311. }
  312. ;;
  /* The p18 compares below set p8..p11 for the adds instructions at the
     top of the next pass; with .unc they are cleared whenever p18 is
     false. */
  313. { .mmf
  314. (p16) LDFD f80 = [X1], INCX
  315. nop __LINE__
  316. (p18) fcmp.neq.unc p8, p0 = DMAX1, DMAX5
  317. }
  318. { .mmf
  319. (p12) adds IMAX1 = 13, CURRENT
  320. nop __LINE__
  321. (p18) FMAX DMAX1 = f50, DMAX5
  322. }
  323. ;;
  324. { .mmf
  325. (p16) LDFD f84 = [X1], INCX
  326. nop __LINE__
  327. (p18) fcmp.neq.unc p9, p0 = DMAX2, DMAX6
  328. }
  329. { .mmf
  330. (p13) adds IMAX2 = 14, CURRENT
  331. nop __LINE__
  332. (p18) FMAX DMAX2 = f54, DMAX6
  333. }
  334. ;;
  335. { .mmf
  336. (p16) LDFD f88 = [X1], INCX
  337. nop __LINE__
  338. (p18) fcmp.neq.unc p10, p0 = DMAX3, DMAX7
  339. }
  340. { .mmf
  341. (p14) adds IMAX3 = 15, CURRENT
  342. nop __LINE__
  343. (p18) FMAX DMAX3 = f58, DMAX7
  344. }
  345. ;;
  346. { .mmf
  347. (p16) LDFD f92 = [X1], INCX
  348. (p15) adds IMAX4 = 16, CURRENT
  349. (p18) fcmp.neq.unc p11, p0 = DMAX4, DMAX8
  350. }
  /* The block at stage p19 is now fully accounted for: advance the index
     base by 16 and rotate into the next pass. */
  351. { .mfb
  352. (p19) adds CURRENT = 16, CURRENT
  353. (p18) FMAX DMAX4 = f62, DMAX8
  354. br.ctop.sptk.few .L10
  355. }
  356. ;;
  357. .align 32
  /* Remainder handling for the low four bits of N (N was already reduced
     by one in the setup code):
       p7  (bit 3) - eight elements, f32..f39, two per lane
       p13 (bit 2) - four elements,  f40..f43, one per lane
       p14 (bit 1) - two elements,   f44 and f45, lanes 1 and 2
       p15 (bit 0) - one element,    f46, lane 3
     The index bookkeeping mirrors the main loop: compare a lane's two
     registers, then conditionally store CURRENT + offset. The index
     updates for the p14/p15 elements are performed at .L999. */
  358. .L15:
  359. { .mmi
  360. (p7) LDFD f32 = [X1], INCX
  361. and I = 15, N
  /* Clear p14 so that the CURRENT += 2 at .L999 stays off when the early
     branch below is taken. */
  362. cmp.ne p14, p0 = r0, r0
  363. }
  364. ;;
  /* Nothing left over: go straight to the final reduction. */
  365. { .mmb
  366. (p7) LDFD f33 = [X1], INCX
  367. cmp.eq p6, p0 = 0, I
  368. (p6) br.cond.dptk .L999
  369. }
  370. ;;
  371. { .mmi
  372. (p7) LDFD f34 = [X1], INCX
  373. ;;
  374. (p7) LDFD f35 = [X1], INCX
  375. tbit.z p0, p13 = N, 2
  376. }
  377. ;;
  378. { .mmi
  379. (p7) LDFD f36 = [X1], INCX
  380. ;;
  381. (p7) LDFD f37 = [X1], INCX
  382. tbit.z p0, p14 = N, 1
  383. }
  384. ;;
  385. { .mfi
  386. (p7) LDFD f38 = [X1], INCX
  387. (p7) FMAX DMAX5 = f32, DMAX1
  388. tbit.z p0, p15 = N, 0
  389. }
  390. ;;
  391. { .mmf
  392. (p7) LDFD f39 = [X1], INCX
  393. nop __LINE__
  394. (p7) FMAX DMAX6 = f33, DMAX2
  395. }
  396. ;;
  397. { .mmf
  398. (p13) LDFD f40 = [X1], INCX
  399. nop __LINE__
  400. (p7) FMAX DMAX7 = f34, DMAX3
  401. }
  402. ;;
  403. { .mmf
  404. (p13) LDFD f41 = [X1], INCX
  405. nop __LINE__
  406. (p7) FMAX DMAX8 = f35, DMAX4
  407. }
  408. ;;
  /* These .unc compares write p8..p11 on every path that gets here: they
     are set from the comparison when p7 is true and cleared when it is
     false. */
  409. { .mmf
  410. (p13) LDFD f42 = [X1], INCX
  411. nop __LINE__
  412. (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5
  413. }
  414. { .mmf
  415. nop __LINE__
  416. nop __LINE__
  417. (p7) FMAX DMAX1 = f36, DMAX5
  418. }
  419. ;;
  420. { .mmf
  421. (p13) LDFD f43 = [X1], INCX
  422. nop __LINE__
  423. (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6
  424. }
  425. { .mmf
  426. nop __LINE__
  427. nop __LINE__
  428. (p7) FMAX DMAX2 = f37, DMAX6
  429. }
  430. ;;
  431. { .mmf
  432. (p14) LDFD f44 = [X1], INCX
  433. nop __LINE__
  434. (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7
  435. }
  436. { .mmf
  437. nop __LINE__
  438. nop __LINE__
  439. (p7) FMAX DMAX3 = f38, DMAX7
  440. }
  441. ;;
  442. { .mmf
  443. (p14) LDFD f45 = [X1], INCX
  444. nop __LINE__
  445. (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8
  446. }
  447. { .mmf
  448. nop __LINE__
  449. nop __LINE__
  450. (p7) FMAX DMAX4 = f39, DMAX8
  451. }
  452. ;;
  /* Record hits from the first four elements of the 8-group (offsets
     1..4), compare for the second four, and start folding the 4-group
     into DMAX5..DMAX8. */
  453. { .mmf
  454. (p15) LDFD f46 = [X1], INCX
  455. (p8) adds IMAX1 = 1, CURRENT
  456. (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5
  457. }
  458. { .mmf
  459. nop __LINE__
  460. nop __LINE__
  461. (p13) FMAX DMAX5 = f40, DMAX1
  462. }
  463. { .mmf
  464. (p9) adds IMAX2 = 2, CURRENT
  465. nop __LINE__
  466. (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6
  467. }
  468. { .mmf
  469. nop __LINE__
  470. nop __LINE__
  471. (p13) FMAX DMAX6 = f41, DMAX2
  472. }
  473. { .mmf
  474. (p10) adds IMAX3 = 3, CURRENT
  475. nop __LINE__
  476. (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7
  477. }
  478. { .mmf
  479. nop __LINE__
  480. nop __LINE__
  481. (p13) FMAX DMAX7 = f42, DMAX3
  482. }
  483. { .mmf
  484. (p11) adds IMAX4 = 4, CURRENT
  485. nop __LINE__
  486. (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8
  487. }
  488. { .mmf
  489. nop __LINE__
  490. nop __LINE__
  491. (p13) FMAX DMAX8 = f43, DMAX4
  492. }
  493. ;;
  /* Record hits from the second four elements of the 8-group (offsets
     5..8), then compare for the 4-group and copy its results back into
     DMAX1..DMAX4. */
  494. { .mmf
  495. (p8) adds IMAX1 = 5, CURRENT
  496. nop __LINE__
  497. (p13) fcmp.neq.unc p8, p0 = DMAX1, DMAX5
  498. }
  499. { .mmf
  500. nop __LINE__
  501. nop __LINE__
  502. (p13) mov DMAX1 = DMAX5
  503. }
  504. { .mmf
  505. (p9) adds IMAX2 = 6, CURRENT
  506. nop __LINE__
  507. (p13) fcmp.neq.unc p9, p0 = DMAX2, DMAX6
  508. }
  509. { .mmf
  510. nop __LINE__
  511. nop __LINE__
  512. (p13) mov DMAX2 = DMAX6
  513. }
  514. { .mmf
  515. (p10) adds IMAX3 = 7, CURRENT
  516. nop __LINE__
  517. (p13) fcmp.neq.unc p10, p0 = DMAX3, DMAX7
  518. }
  519. { .mmf
  520. nop __LINE__
  521. nop __LINE__
  522. (p13) mov DMAX3 = DMAX7
  523. }
  524. { .mmf
  525. (p11) adds IMAX4 = 8, CURRENT
  526. nop __LINE__
  527. (p13) fcmp.neq.unc p11, p0 = DMAX4, DMAX8
  528. }
  /* The 8-group is accounted for: advance the index base past it. */
  529. { .mmf
  530. (p7) adds CURRENT = 8, CURRENT
  531. nop __LINE__
  532. (p13) mov DMAX4 = DMAX8
  533. }
  534. ;;
  /* Record hits from the 4-group (offsets 1..4 from the updated base),
     then fold in the 2-group (lanes 1 and 2) and the single element
     (lane 3). */
  535. { .mmf
  536. (p8) adds IMAX1 = 1, CURRENT
  537. nop __LINE__
  538. (p14) FMAX DMAX5 = f44, DMAX1
  539. }
  540. { .mmf
  541. (p9) adds IMAX2 = 2, CURRENT
  542. (p10) adds IMAX3 = 3, CURRENT
  543. (p14) FMAX DMAX6 = f45, DMAX2
  544. }
  545. { .mmf
  546. (p11) adds IMAX4 = 4, CURRENT
  547. (p13) adds CURRENT = 4, CURRENT
  548. (p15) FMAX DMAX7 = f46, DMAX3
  549. }
  550. ;;
  /* p8, p9 and p10 set here are consumed by the index updates at .L999. */
  551. { .mmf
  552. nop __LINE__
  553. nop __LINE__
  554. (p14) fcmp.neq.unc p8, p0 = DMAX5, DMAX1
  555. }
  556. { .mmf
  557. nop __LINE__
  558. nop __LINE__
  559. (p14) mov DMAX1 = DMAX5
  560. }
  561. { .mmf
  562. nop __LINE__
  563. nop __LINE__
  564. (p14) fcmp.neq.unc p9, p0 = DMAX6, DMAX2
  565. }
  566. { .mmf
  567. nop __LINE__
  568. nop __LINE__
  569. (p14) mov DMAX2 = DMAX6
  570. }
  571. { .mmf
  572. nop __LINE__
  573. nop __LINE__
  574. (p15) fcmp.neq.unc p10, p0 = DMAX7, DMAX3
  575. }
  576. { .mmf
  577. nop __LINE__
  578. nop __LINE__
  579. (p15) mov DMAX3 = DMAX7
  580. }
  581. ;;
  /* Final reduction and return. Pending index updates (p8, p9, p10) are
     applied first, then the four lanes are merged pairwise: lanes 1/2 into
     DMAX5, lanes 3/4 into DMAX6, and those two into DMAX1. After each
     merge the IMAX register of the second operand replaces the first one
     when the merged value differs from the first operand. The result is
     left in IMAX1.
     NOTE(review): if the main loop was skipped and .L15 took its early
     branch to this label, no instruction visible in this file has written
     p9 or p10 beforehand - confirm that stale values there cannot affect
     the result. */
  582. .L999:
  583. { .mmf
  584. (p8) adds IMAX1 = 1, CURRENT
  585. nop __LINE__
  586. FMAX DMAX5 = DMAX2, DMAX1
  587. }
  588. { .mmf
  589. (p9) adds IMAX2 = 2, CURRENT
  /* Skip the index base past the two-element group; the lane-3 update in
     the next instruction group therefore uses offset 1. */
  590. (p14) adds CURRENT = 2, CURRENT
  591. FMAX DMAX6 = DMAX4, DMAX3
  592. }
  593. ;;
  594. { .mmf
  595. nop __LINE__
  596. nop __LINE__
  597. fcmp.neq p12, p0 = DMAX5, DMAX1
  598. }
  599. { .mmf
  600. (p10) adds IMAX3 = 1, CURRENT
  601. nop __LINE__
  602. fcmp.neq p13, p0 = DMAX6, DMAX3
  603. }
  604. ;;
  /* p12: the lane 1/2 merge differs from lane 1, so take lane 2's index.
     p13: the lane 3/4 merge differs from lane 3, so take lane 4's index. */
  605. { .mmf
  606. (p12) mov IMAX1 = IMAX2
  607. (p13) mov IMAX3 = IMAX4
  608. FMAX DMAX1 = DMAX6, DMAX5
  609. }
  610. ;;
  611. { .mfi
  612. nop __LINE__
  613. fcmp.neq p12, p0 = DMAX1, DMAX5
  /* Restore the caller's loop count. */
  614. mov ar.lc = ARLC
  615. }
  616. ;;
  617. { .mib
  /* The overall result differs from the lane 1/2 merge: take the index
     from the lane 3/4 side. */
  618. (p12) mov IMAX1 = IMAX3
  /* Restore predicates from the snapshot taken at entry. The mask -65474
     has bits 1..5 and 16 and above set, so p1..p5 and the rotating
     predicates are written back while p6..p15 are left as they are. */
  619. mov pr = PR, -65474
  620. br.ret.sptk.many b0
  621. }
  622. ;;
  623. EPILOGUE