You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

amax.S 7.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Build-time configuration and register aliases for the reduction
   * routine that follows.
   */

/* Prefetch distance in elements.  The routine multiplies it by SIZE and
   adds it to DX to form the initial lfetch address (PRE1). */
#if defined(XDOUBLE)
#define PREFETCH_SIZE ( 8 * 16 + 4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 + 8)
#else /* neither XDOUBLE nor DOUBLE */
#define PREFETCH_SIZE (32 * 16 + 16)
#endif

/* Reduction instruction, picked by direction (USE_MIN) and by whether
   magnitudes are compared (USE_ABS):
       USE_MIN  USE_ABS  ->  FMAX
         no       no         fmax
         no       yes        famax
         yes      no         fmin
         yes      yes        famin                                      */
#ifdef USE_MIN
#ifdef USE_ABS
#define FMAX famin
#else  /* !USE_ABS */
#define FMAX fmin
#endif /* USE_ABS */
#else  /* !USE_MIN */
#ifdef USE_ABS
#define FMAX famax
#else  /* !USE_ABS */
#define FMAX fmax
#endif /* USE_ABS */
#endif /* USE_MIN */

/* Incoming arguments (r32-r34).  Under F_INTERFACE the routine loads N and
   INCX through these registers, i.e. they arrive as pointers. */
#define N       r32
#define DX      r33
#define INCX    r34

/* General register zeroed on entry by the routine. */
#define RET     r8

/* Integer scratch: prefetch pointer, loop counters, extra data pointers
   and pre-scaled strides. */
#define PRE1    r2
#define J       r14
#define K       r15
#define X2      r16
#define X3      r17
#define INCX5   r18
#define INCX16  r19

/* Saved copies of the predicate file and of ar.lc. */
#define PR      r30
#define ARLC    r31

/* Eight independent partial results; the final value ends up in DMAX1. */
#define DMAX1   f8
#define DMAX2   f9
#define DMAX3   f10
#define DMAX4   f11
#define DMAX5   f12
#define DMAX6   f13
#define DMAX7   f14
#define DMAX8   f15
  /*
   * Strided reduction over a vector using FMAX (one of fmax / famax /
   * fmin / famin, chosen by USE_MIN and USE_ABS earlier in this file).
   *
   * Inputs:  N    - element count (loaded through a pointer and, unless
   *                 USE64BITINT is defined, sign-extended from 32 bits
   *                 when F_INTERFACE is defined)
   *          DX   - address of the first element
   *          INCX - stride, in elements on entry (same F_INTERFACE
   *                 handling as N)
   * Result:  left in DMAX1.  When N <= 0 or INCX <= 0 the routine returns
   *          at once with DMAX1 = f0.  With USE_ABS a final fabs is
   *          applied to the result.
   *
   * Layout:  the first element seeds all eight accumulators DMAX1..DMAX8;
   *          K = N - 1 elements remain.
   *          .L10  software-pipelined loop, 16 elements per iteration
   *          .L15  remainder (K & 15), driven by bits 3..0 of K
   *          .L99  pairwise merge of the eight accumulators, restore of
   *                ar.lc and predicates, return
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LDINT, LDFD, BASE_SHIFT and SIZE are not
   * defined in this file; presumably they come from common.h.
   */
  77. PROLOGUE
  78. .prologue
  79. PROFCODE
  /* Zero RET, preset the result to f0 for the early-return paths, and
     keep a copy of ar.lc because the main loop overwrites it. */
  80. { .mfi
  81. mov RET = 0
  82. mov DMAX1 = f0
  83. .save ar.lc, ARLC
  84. mov ARLC = ar.lc
  85. }
  86. ;;
  87. .body
  88. #ifdef F_INTERFACE
  /* N and INCX arrive as addresses here: replace each with the value it
     points to. */
  89. { .mmi
  90. LDINT N = [N]
  91. LDINT INCX = [INCX]
  92. nop.i 0
  93. }
  94. ;;
  95. #ifndef USE64BITINT
  /* Widen the loaded 32-bit values to 64 bits, keeping the sign. */
  96. { .mii
  97. nop.m 0
  98. sxt4 N = N
  99. sxt4 INCX = INCX
  100. }
  101. ;;
  102. #endif
  103. #endif
  /* Save the predicate file, then return early (result = f0) when
     INCX <= 0 (p6) or N <= 0 (p8). */
  104. { .mii
  105. mov PR = pr
  106. cmp.ge p6, p0 = 0, INCX
  107. }
  108. { .mbb
  109. cmp.ge p8, p0 = 0, N
  110. (p8) br.ret.sptk.many b0
  111. (p6) br.ret.sptk.many b0
  112. }
  113. ;;
  /* Load the first element as the seed, shift INCX left by BASE_SHIFT
     (presumably elements -> bytes; BASE_SHIFT is defined elsewhere), and
     clear the rotating predicates ahead of the pipelined loop. */
  114. { .mmi
  115. LDFD DMAX1 = [DX]
  116. shladd INCX = INCX, BASE_SHIFT, r0
  117. mov pr.rot= 0
  118. }
  119. ;;
  /* Step DX past the seed element; K = N - 1 elements are left. */
  120. { .mmf
  121. add DX = DX, INCX
  122. adds K = -1, N
  123. mov DMAX2 = DMAX1
  124. }
  125. ;;
  /* X2 runs four elements ahead of DX.  J = K / 16 is the number of full
     16-element iterations.  The remaining accumulators are seeded in the
     free F slots of these setup bundles. */
  126. { .mfi
  127. shladd X2 = INCX, 2, DX
  128. mov DMAX5 = DMAX1
  129. shr J = K, 4
  130. }
  /* p16 = 1: enable the first pipeline stage. */
  131. { .mmf
  132. cmp.eq p16, p0 = r0, r0
  133. nop.m 0
  134. mov DMAX6 = DMAX1
  135. }
  136. ;;
  /* INCX5 = 5 * INCX: the post-increment that, after three single steps,
     moves a pointer 8 elements on from where its group of four began.
     ar.ec = 4 matches the four stage predicates p16..p19 used below. */
  137. { .mfi
  138. shladd INCX5 = INCX, 2, INCX
  139. mov DMAX3 = DMAX1
  140. mov ar.ec= 4
  141. }
  /* Prefetch stride: 16 * INCX with one lfetch per iteration, or 8 * INCX
     with two lfetches per iteration for XDOUBLE.  J becomes the loop
     count minus one, the form ar.lc expects. */
  142. { .mmf
  143. #ifdef XDOUBLE
  144. shladd INCX16= INCX, 3, r0
  145. #else
  146. shladd INCX16= INCX, 4, r0
  147. #endif
  148. adds J = -1, J
  149. mov DMAX7 = DMAX1
  150. }
  151. ;;
  /* Prefetch pointer starts PREFETCH_SIZE * SIZE past DX. */
  152. { .mfi
  153. adds PRE1 = PREFETCH_SIZE * SIZE, DX
  154. mov DMAX4 = DMAX1
  155. mov ar.lc = J
  156. }
  /* J == -1 means fewer than 16 elements remain: skip the main loop. */
  157. { .mfb
  158. cmp.eq p7 ,p0 = -1, J
  159. mov DMAX8 = DMAX1
  160. (p7) br.cond.dpnt .L15
  161. }
  162. .align 32
  163. ;;
  /*
   * Main loop: 16 elements per iteration, software pipelined with
   * br.ctop.  Loads issue under p16; the FMAX that consumes a value runs
   * under p19, three register rotations later, so a value loaded into
   * f32 is read back as f35 (f36 -> f39, f40 -> f43, ...).
   *
   * DX  loads elements 0-3 and 8-11 of each group of 16.
   * X2  loads elements 4-7 and 12-15.
   * The fourth load of every group of four post-increments by INCX5.
   * Element i of each half goes to accumulator DMAX(i+1).
   */
  164. .L10:
  165. { .mmf
  166. (p16) lfetch.nt1 [PRE1], INCX16
  167. (p16) LDFD f32 = [DX], INCX
  168. (p19) FMAX DMAX1 = f35, DMAX1
  169. }
  170. { .mmf
  171. (p16) LDFD f48 = [X2], INCX
  172. nop.m 0
  173. (p19) FMAX DMAX5 = f51, DMAX5
  174. }
  175. ;;
  176. { .mmf
  177. (p16) LDFD f36 = [DX], INCX
  178. nop.m 0
  179. (p19) FMAX DMAX2 = f39, DMAX2
  180. }
  181. { .mmf
  182. (p16) LDFD f52 = [X2], INCX
  183. nop.m 0
  184. (p19) FMAX DMAX6 = f55, DMAX6
  185. }
  186. ;;
  187. { .mmf
  188. (p16) LDFD f40 = [DX], INCX
  189. nop.m 0
  190. (p19) FMAX DMAX3 = f43, DMAX3
  191. }
  192. { .mmf
  193. (p16) LDFD f56 = [X2], INCX
  194. nop.m 0
  195. (p19) FMAX DMAX7 = f59, DMAX7
  196. }
  197. ;;
  /* Fourth load of the first half: jump both pointers to elements 8/12. */
  198. { .mmf
  199. (p16) LDFD f44 = [DX], INCX5
  200. nop.m 0
  201. (p19) FMAX DMAX4 = f47, DMAX4
  202. }
  203. { .mmf
  204. (p16) LDFD f60 = [X2], INCX5
  205. nop.m 0
  206. (p19) FMAX DMAX8 = f63, DMAX8
  207. }
  208. ;;
  /* Second half of the 16-element group.  XDOUBLE issues a second lfetch
     here because its INCX16 covers only 8 elements. */
  209. { .mmf
  210. #ifdef XDOUBLE
  211. (p16) lfetch.nt1 [PRE1], INCX16
  212. #endif
  213. (p16) LDFD f64 = [DX], INCX
  214. #ifndef XDOUBLE
  215. nop.m 0
  216. #endif
  217. (p19) FMAX DMAX1 = f67, DMAX1
  218. }
  219. { .mmf
  220. (p16) LDFD f80 = [X2], INCX
  221. nop.m 0
  222. (p19) FMAX DMAX5 = f83, DMAX5
  223. }
  224. ;;
  225. { .mmf
  226. (p16) LDFD f68 = [DX], INCX
  227. nop.m 0
  228. (p19) FMAX DMAX2 = f71, DMAX2
  229. }
  230. { .mmf
  231. (p16) LDFD f84 = [X2], INCX
  232. nop.m 0
  233. (p19) FMAX DMAX6 = f87, DMAX6
  234. }
  235. ;;
  236. { .mmf
  237. (p16) LDFD f72 = [DX], INCX
  238. nop.m 0
  239. (p19) FMAX DMAX3 = f75, DMAX3
  240. }
  241. { .mmf
  242. (p16) LDFD f88 = [X2], INCX
  243. nop.m 0
  244. (p19) FMAX DMAX7 = f91, DMAX7
  245. }
  246. ;;
  /* Last loads of the group: INCX5 leaves DX at the next group's element
     0 and X2 at its element 4. */
  247. { .mmf
  248. (p16) LDFD f76 = [DX], INCX5
  249. nop.m 0
  250. (p19) FMAX DMAX4 = f79, DMAX4
  251. }
  252. { .mfb
  253. (p16) LDFD f92 = [X2], INCX5
  254. (p19) FMAX DMAX8 = f95, DMAX8
  255. br.ctop.sptk.few .L10
  256. }
  257. .align 32
  258. ;;
  /*
   * Remainder: J = K & 15 elements are left.  Bits 3, 2, 1, 0 of K set
   * p12, p13, p14, p15 and select blocks of 8, 4, 2 and 1 elements:
   *   p12: f32-f35 via DX, f36-f39 via X2     -> DMAX1-4, DMAX5-8
   *   p13: f40-f43 via DX                     -> DMAX1-4
   *   p14: f44-f45 via X3                     -> DMAX5-6
   *   p15: f46     via X3                     -> DMAX7
   * X3 starts at DX and is advanced past the 8- and 4-element blocks when
   * they are present, so it addresses the final 2 + 1 elements.
   */
  259. .L15:
  260. and J = 15, K
  261. tbit.z p0, p12 = K, 3
  262. mov X3 = DX
  263. ;;
  264. { .mmi
  265. (p12) LDFD f32 = [DX], INCX
  266. (p12) LDFD f36 = [X2], INCX
  267. tbit.z p0, p13 = K, 2
  268. }
  /* Nothing left at all: go straight to the merge.  In that case bit 3 of
     K is clear, so the (p12) loads just above did nothing. */
  269. { .mib
  270. cmp.eq p8 ,p0 = r0, J
  271. tbit.z p0, p14 = K, 1
  272. (p8) br.cond.dpnt .L99
  273. }
  274. ;;
  275. { .mmi
  276. (p12) LDFD f33 = [DX], INCX
  277. (p12) LDFD f37 = [X2], INCX
  278. tbit.z p0, p15 = K, 0
  279. }
  280. ;;
  /* X3 += 8 * INCX when the 8-element block is present. */
  281. { .mmi
  282. (p12) LDFD f34 = [DX], INCX
  283. (p12) LDFD f38 = [X2], INCX
  284. (p12) shladd X3 = INCX, 3, X3
  285. }
  286. ;;
  /* X3 += 4 * INCX when the 4-element block is present. */
  287. { .mmi
  288. (p12) LDFD f35 = [DX], INCX5
  289. (p12) LDFD f39 = [X2], INCX5
  290. (p13) shladd X3 = INCX, 2, X3
  291. }
  292. ;;
  293. { .mmi
  294. (p13) LDFD f40 = [DX], INCX
  295. (p14) LDFD f44 = [X3], INCX
  296. nop.i 0
  297. }
  298. ;;
  299. { .mmi
  300. (p13) LDFD f41 = [DX], INCX
  301. (p14) LDFD f45 = [X3], INCX
  302. nop.i 0
  303. }
  304. ;;
  /* The remaining loads overlap with folding the 8-element block. */
  305. { .mmf
  306. (p13) LDFD f42 = [DX], INCX
  307. nop.m 0
  308. (p12) FMAX DMAX1 = f32, DMAX1
  309. }
  310. { .mmf
  311. (p15) LDFD f46 = [X3], INCX
  312. nop.m 0
  313. (p12) FMAX DMAX5 = f36, DMAX5
  314. }
  315. ;;
  316. { .mmf
  317. (p13) LDFD f43 = [DX], INCX
  318. nop.m 0
  319. (p12) FMAX DMAX2 = f33, DMAX2
  320. }
  /* The following instructions are written without explicit bundles. */
  321. (p12) FMAX DMAX6 = f37, DMAX6
  322. (p12) FMAX DMAX3 = f34, DMAX3
  323. (p12) FMAX DMAX7 = f38, DMAX7
  324. (p12) FMAX DMAX4 = f35, DMAX4
  325. (p12) FMAX DMAX8 = f39, DMAX8
  326. ;;
  /* Fold the 4-, 2- and 1-element blocks. */
  327. (p13) FMAX DMAX1 = f40, DMAX1
  328. (p14) FMAX DMAX5 = f44, DMAX5
  329. (p13) FMAX DMAX2 = f41, DMAX2
  330. (p14) FMAX DMAX6 = f45, DMAX6
  331. (p13) FMAX DMAX3 = f42, DMAX3
  332. (p15) FMAX DMAX7 = f46, DMAX7
  333. (p13) FMAX DMAX4 = f43, DMAX4
  334. ;;
  335. .align 32
  /*
   * Merge the eight accumulators pairwise: 8 -> 4 -> 2 -> 1, ending in
   * DMAX1.  ar.lc and the predicates are restored along the way.
   */
  336. .L99:
  337. { .mfi
  338. nop.m 0
  339. FMAX DMAX1 = DMAX5, DMAX1
  340. mov ar.lc = ARLC
  341. }
  342. { .mmf
  343. nop.m 0
  344. nop.m 0
  345. FMAX DMAX2 = DMAX6, DMAX2
  346. }
  347. ;;
  /* The mask -65474 has bits 1-5 and every bit from 16 upward set, so this
     restores p1-p5 and the rotating predicates from the saved copy. */
  348. { .mfi
  349. nop.m 0
  350. FMAX DMAX3 = DMAX7, DMAX3
  351. mov pr = PR, -65474
  352. }
  353. { .mmf
  354. nop.m 0
  355. nop.m 0
  356. FMAX DMAX4 = DMAX8, DMAX4
  357. }
  358. ;;
  359. { .mmf
  360. FMAX DMAX1 = DMAX2, DMAX1
  361. }
  362. { .mmf
  363. FMAX DMAX3 = DMAX4, DMAX3
  364. }
  365. ;;
  366. #ifndef USE_ABS
  /* Last merge step and return; the result is in DMAX1. */
  367. { .mfb
  368. FMAX DMAX1 = DMAX3, DMAX1
  369. br.ret.sptk.many b0
  370. }
  371. #else
  /* USE_ABS: after the last merge step, return the magnitude of the
     selected value. */
  372. { .mmf
  373. FMAX DMAX1 = DMAX3, DMAX1
  374. }
  375. ;;
  376. { .mfb
  377. fabs DMAX1 = DMAX1
  378. br.ret.sptk.many b0
  379. }
  380. #endif
  381. ;;
  382. EPILOGUE