You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum.S 7.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build-time configuration and register aliases for the routine below.  */
  /* SIZE, BASE_SHIFT, LDFD, FADD, LDINT, PROLOGUE, PROFCODE and EPILOGUE  */
  /* are not defined in this file; presumably they come from common.h.     */
  /* ASSEMBLER is defined before the include; presumably common.h tests it */
  /* to select its assembler-facing definitions -- confirm in common.h.    */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* PREFETCH_SIZE is multiplied by SIZE and added to X to form the        */
  /* initial prefetch pointer PRE1. One value per precision build          */
  /* (XDOUBLE, DOUBLE, otherwise).                                         */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 16 + 4)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 16 + 8)
  44. #else
  45. #define PREFETCH_SIZE (32 * 16 + 16)
  46. #endif
  /* COMPADD is subtracted from the shift/bit positions applied to N and   */
  /* added to the shift applied to INCX: 0 for real builds, 1 for COMPLEX. */
  /* STRIDE is the post-increment used by the first load of every load     */
  /* pair: INCX for real builds, SIZE for COMPLEX builds.                  */
  47. #ifndef COMPLEX
  48. #define COMPADD 0
  49. #define STRIDE INCX
  50. #else
  51. #define COMPADD 1
  52. #define STRIDE SIZE
  53. #endif
  /* Register aliases. */
  54. #define PRE1 r2 /* prefetch address used by lfetch */
  55. #define I r17 /* main-loop trip count minus one; copied into ar.lc */
  56. #define J r18 /* low bits of N left over after the unrolled loop */
  57. #define INCX16 r21 /* post-increment applied to PRE1 by each lfetch */
  58. #define PR r30 /* copy of the predicate registers, restored before return */
  59. #define ARLC r31 /* copy of ar.lc, restored before return */
  60. #define N r32 /* count (an address to load it from under F_INTERFACE) */
  61. #define X r33 /* address the data is loaded from; advanced by each load */
  62. #define INCX r34 /* increment (an address to load it from under F_INTERFACE) */
  /* Sum of absolute values of the scalars loaded from X.                  */
  /*                                                                       */
  /* In:   N (count), X (data address), INCX (increment). Under            */
  /*       F_INTERFACE, N and INCX are addresses and are loaded first.     */
  /* Out:  f8 holds the total when the routine returns.                    */
  /*       Returns with f8 = f0 when INCX <= 0 or N <= 0.                  */
  /* Uses: f8..f11 as four partial sums, f32 and up as load/fabs           */
  /*       temporaries, p6..p9 and p12..p19 as predicates. ar.lc and the   */
  /*       predicates are saved in ARLC / PR and restored at .L998.        */
  /*                                                                       */
  /* Layout: .L52 is a software-pipelined loop issuing 16 loads per pass;  */
  /*         .L55 handles what is left, driven by the low bits of N;       */
  /*         .L998 folds the four partial sums into f8 and returns.        */
  /* In COMPLEX builds each element costs two loads (step SIZE, then the   */
  /* adjusted INCX), so a pass covers 8 elements instead of 16.            */
  63. PROLOGUE
  64. .prologue
  65. PROFCODE
  /* PRE1 = X + PREFETCH_SIZE * SIZE; f8 = f0 (first partial sum);         */
  /* ar.lc is copied to ARLC so it can be restored at .L998.               */
  66. { .mfi
  67. adds PRE1 = PREFETCH_SIZE * SIZE, X
  68. mov f8 = f0
  69. .save ar.lc, ARLC
  70. mov ARLC = ar.lc
  71. }
  72. ;;
  73. .body
  /* F_INTERFACE: N and INCX hold addresses; replace them by the values    */
  /* stored there.                                                         */
  74. #ifdef F_INTERFACE
  75. { .mmi
  76. LDINT N = [N]
  77. LDINT INCX = [INCX]
  78. nop.i 0
  79. }
  80. ;;
  /* Without USE64BITINT the loaded values are sign-extended from 32 bits. */
  81. #ifndef USE64BITINT
  82. { .mii
  83. nop.m 0
  84. sxt4 N = N
  85. sxt4 INCX = INCX
  86. }
  87. ;;
  88. #endif
  89. #endif
  /* p6 = !(0 < INCX), p7 = !(0 < N): either one returns immediately,      */
  /* leaving f8 = f0. I = N >> (4 - COMPADD) is the number of full passes  */
  /* of the unrolled loop; J = N & (2^(4 - COMPADD) - 1) is the remainder. */
  90. { .mmi
  91. cmp.lt p0, p6 = r0, INCX
  92. cmp.lt p0, p7 = r0, N
  93. shr I = N, (4 - COMPADD)
  94. }
  95. { .mbb
  96. and J = ((1 << (4 - COMPADD)) - 1), N
  97. (p6) br.ret.sptk.many b0
  98. (p7) br.ret.sptk.many b0
  99. }
  100. ;;
  /* I -= 1 (value later written to ar.lc); clear partial sums f9 and f10; */
  /* save the predicates in PR. p9 = (J == 0): nothing left for the tail.  */
  /* p12 = bit (3 - COMPADD) of N is set.                                  */
  101. { .mfi
  102. adds I = -1, I
  103. mov f10 = f0
  104. mov PR = pr
  105. }
  106. { .mfi
  107. cmp.eq p9, p0 = r0, J
  108. mov f9 = f0
  109. tbit.z p0, p12 = N, 3 - COMPADD
  110. }
  111. ;;
  /* Pipeline set-up: stage predicate p16 true, p17 and p18 false,         */
  /* ar.ec = 3; clear f11. INCX is scaled by 2^(BASE_SHIFT + COMPADD),     */
  /* presumably converting an element increment to bytes -- BASE_SHIFT is  */
  /* defined outside this file.                                            */
  112. { .mmi
  113. cmp.eq p16, p0 = r0, r0
  114. cmp.ne p17, p0 = r0, r0
  115. mov ar.ec= 3
  116. }
  117. { .mfi
  118. cmp.ne p18, p0 = r0, r0
  119. mov f11 = f0
  120. shl INCX = INCX, BASE_SHIFT + COMPADD
  121. }
  122. ;;
  /* INCX16 = INCX << (4 - COMPADD), or << (3 - COMPADD) for XDOUBLE,      */
  /* whose loop body issues two lfetches per pass instead of one.          */
  /* p19 false; ar.lc = I.                                                 */
  123. { .mmi
  124. #ifdef XDOUBLE
  125. shladd INCX16 = INCX, (3 - COMPADD), r0
  126. #else
  127. shladd INCX16 = INCX, (4 - COMPADD), r0
  128. #endif
  129. cmp.ne p19, p0 = r0, r0
  130. mov ar.lc = I
  131. }
  /* p8 = (0 > I): there is no full pass, go straight to the tail.         */
  /* COMPLEX: INCX -= SIZE, because the first load of each pair has        */
  /* already advanced X by STRIDE = SIZE.                                  */
  132. { .mmb
  133. cmp.gt p8 ,p0 = r0, I
  134. #ifdef COMPLEX
  135. adds INCX = - SIZE, INCX
  136. #else
  137. nop.m 0
  138. #endif
  139. (p8) br.cond.dpnt .L55
  140. }
  141. ;;
  142. .align 32
  /* Main loop, closed by br.ctop. Instructions are grouped by stage       */
  /* predicate:                                                            */
  /*   p16: one lfetch (two for XDOUBLE) and 16 loads into f32, f35, ...,  */
  /*        f77, post-incrementing X alternately by STRIDE and INCX.       */
  /*   p18: fabs on the registers two numbers higher (f34, f37, ..., f79), */
  /*        i.e. the values loaded two rotations earlier, and FADD of the  */
  /*        first twelve of them into f8..f11 in round-robin order.        */
  /*   p19: FADD of f71, f74, f77, f80 -- the last four fabs results of    */
  /*        the previous pass (written as f70, f73, f76, f79), which come  */
  /*        too late in the pass to be added under p18.                    */
  143. .L52:
  144. { .mmf
  145. (p16) lfetch.nt1 [PRE1], INCX16
  146. (p16) LDFD f32 = [X], STRIDE
  147. (p18) fabs f34 = f34
  148. }
  149. { .mfb
  150. (p19) FADD f8 = f8, f71
  151. }
  152. ;;
  153. { .mmf
  154. (p16) LDFD f35 = [X], INCX
  155. (p18) fabs f37 = f37
  156. }
  157. { .mfb
  158. (p19) FADD f9 = f9, f74
  159. }
  160. ;;
  161. { .mmf
  162. (p16) LDFD f38 = [X], STRIDE
  163. (p18) fabs f40 = f40
  164. }
  165. { .mfb
  166. (p19) FADD f10 = f10, f77
  167. }
  168. ;;
  169. { .mmf
  170. (p16) LDFD f41 = [X], INCX
  171. (p18) fabs f43 = f43
  172. }
  173. { .mfb
  174. (p19) FADD f11 = f11, f80
  175. }
  176. ;;
  /* From here on the p18 adds consume fabs results produced earlier in    */
  /* this same pass (f34 first, four bundles after its fabs).              */
  177. { .mmf
  178. (p16) LDFD f44 = [X], STRIDE
  179. (p18) fabs f46 = f46
  180. }
  181. { .mfb
  182. (p18) FADD f8 = f8, f34
  183. }
  184. ;;
  185. { .mmf
  186. (p16) LDFD f47 = [X], INCX
  187. (p18) fabs f49 = f49
  188. }
  189. { .mfb
  190. (p18) FADD f9 = f9, f37
  191. }
  192. ;;
  193. { .mmf
  194. (p16) LDFD f50 = [X], STRIDE
  195. (p18) fabs f52 = f52
  196. }
  197. { .mfb
  198. (p18) FADD f10 = f10, f40
  199. }
  200. ;;
  201. { .mmf
  202. (p16) LDFD f53 = [X], INCX
  203. (p18) fabs f55 = f55
  204. }
  205. { .mfb
  206. (p18) FADD f11 = f11, f43
  207. }
  208. ;;
  /* XDOUBLE only: second prefetch of the pass (INCX16 is half as large).  */
  209. { .mmf
  210. #ifdef XDOUBLE
  211. (p16) lfetch.nt1 [PRE1], INCX16
  212. #endif
  213. (p16) LDFD f56 = [X], STRIDE
  214. (p18) fabs f58 = f58
  215. }
  216. { .mfb
  217. (p18) FADD f8 = f8, f46
  218. }
  219. ;;
  220. { .mmf
  221. (p16) LDFD f59 = [X], INCX
  222. (p18) fabs f61 = f61
  223. }
  224. { .mfb
  225. (p18) FADD f9 = f9, f49
  226. }
  227. ;;
  228. { .mmf
  229. (p16) LDFD f62 = [X], STRIDE
  230. (p18) fabs f64 = f64
  231. }
  232. { .mfb
  233. (p18) FADD f10 = f10, f52
  234. }
  235. ;;
  236. { .mmf
  237. (p16) LDFD f65 = [X], INCX
  238. (p18) fabs f67 = f67
  239. }
  240. { .mfb
  241. (p18) FADD f11 = f11, f55
  242. }
  243. ;;
  244. { .mmf
  245. (p16) LDFD f68 = [X], STRIDE
  246. (p18) fabs f70 = f70
  247. }
  248. { .mfb
  249. (p18) FADD f8 = f8, f58
  250. }
  251. ;;
  252. { .mmf
  253. (p16) LDFD f71 = [X], INCX
  254. (p18) fabs f73 = f73
  255. }
  256. { .mfb
  257. (p18) FADD f9 = f9, f61
  258. }
  259. ;;
  260. { .mmf
  261. (p16) LDFD f74 = [X], STRIDE
  262. (p18) fabs f76 = f76
  263. }
  264. { .mfb
  265. (p18) FADD f10 = f10, f64
  266. }
  267. ;;
  268. { .mmf
  269. (p16) LDFD f77 = [X], INCX
  270. (p18) fabs f79 = f79
  271. }
  272. { .mfb
  273. (p18) FADD f11 = f11, f67
  274. br.ctop.sptk.few .L52
  275. }
  276. ;;
  /* Loop exit: add the four fabs results of the final pass that were      */
  /* deferred (same registers the p19 adds use). Only reached by falling   */
  /* out of the loop; the p8 branch above jumps past these to .L55.        */
  277. FADD f8 = f8, f71
  278. FADD f9 = f9, f74
  279. FADD f10 = f10, f77
  280. FADD f11 = f11, f80
  281. .align 32
  282. ;;
  /* Tail. Predicates select how many more scalars are loaded:             */
  /*   p12 (bit 3 - COMPADD of N): 8 loads into f32..f39                   */
  /*   p13 (bit 2 - COMPADD of N): 4 loads into f40..f43                   */
  /*   p14 (bit 1 - COMPADD of N): 2 loads into f44, f45                   */
  /*   p15 (bit 0 of N, real builds only): 1 load into f46                 */
  /* p9 (J == 0) skips the whole tail. Loads, fabs and FADD for the        */
  /* different groups are interleaved below.                               */
  283. .L55:
  284. (p12) LDFD f32 = [X], STRIDE
  285. (p9) br.cond.dptk .L998
  286. ;;
  287. (p12) LDFD f33 = [X], INCX
  288. ;;
  289. (p12) LDFD f34 = [X], STRIDE
  290. ;;
  291. (p12) LDFD f35 = [X], INCX
  292. tbit.z p0, p13 = N, (2 - COMPADD)
  293. ;;
  294. (p12) LDFD f36 = [X], STRIDE
  295. tbit.z p0, p14 = N, (1 - COMPADD)
  296. ;;
  297. (p12) LDFD f37 = [X], INCX
  298. #ifndef COMPLEX
  299. tbit.z p0, p15 = N, 0
  300. #endif
  301. ;;
  302. (p12) LDFD f38 = [X], STRIDE
  303. (p12) fabs f32 = f32
  304. ;;
  305. (p12) LDFD f39 = [X], INCX
  306. (p12) fabs f33 = f33
  307. ;;
  308. (p13) LDFD f40 = [X], STRIDE
  309. (p12) fabs f34 = f34
  310. ;;
  311. (p13) LDFD f41 = [X], INCX
  312. (p12) fabs f35 = f35
  313. ;;
  314. (p13) LDFD f42 = [X], STRIDE
  315. (p12) fabs f36 = f36
  316. (p12) FADD f8 = f8, f32
  317. ;;
  318. (p13) LDFD f43 = [X], INCX
  319. (p12) fabs f37 = f37
  320. (p12) FADD f9 = f9, f33
  321. ;;
  322. (p14) LDFD f44 = [X], STRIDE
  323. (p12) fabs f38 = f38
  324. (p12) FADD f10 = f10, f34
  325. ;;
  326. (p14) LDFD f45 = [X], INCX
  327. (p12) fabs f39 = f39
  328. (p12) FADD f11 = f11, f35
  329. ;;
  /* Last load of the tail (real builds only); it does not advance X.      */
  330. #ifndef COMPLEX
  331. (p15) LDFD f46 = [X]
  332. #endif
  333. (p13) fabs f40 = f40
  334. (p12) FADD f8 = f8, f36
  335. ;;
  336. (p13) fabs f41 = f41
  337. (p12) FADD f9 = f9, f37
  338. (p13) fabs f42 = f42
  339. (p12) FADD f10 = f10, f38
  340. (p13) fabs f43 = f43
  341. (p12) FADD f11 = f11, f39
  342. ;;
  343. (p14) fabs f44 = f44
  344. (p13) FADD f8 = f8, f40
  345. (p14) fabs f45 = f45
  346. (p13) FADD f9 = f9, f41
  347. #ifndef COMPLEX
  348. (p15) fabs f46 = f46
  349. #endif
  350. (p13) FADD f10 = f10, f42
  351. ;;
  352. (p13) FADD f11 = f11, f43
  353. (p14) FADD f8 = f8, f44
  354. (p14) FADD f9 = f9, f45
  355. #ifndef COMPLEX
  356. (p15) FADD f10 = f10, f46
  357. #endif
  358. ;;
  359. .align 32
  /* Common exit: f8 = (f8 + f9) + (f10 + f11); ar.lc is restored from     */
  /* ARLC and the predicates from PR.                                      */
  360. .L998:
  361. { .mfi
  362. FADD f8 = f8, f9
  363. mov ar.lc = ARLC
  364. }
  365. { .mmf
  366. FADD f10 = f10, f11
  367. }
  368. ;;
  /* Mask -65474 is ...FFFF003E: bits 1-5 and bits 16 and up are set, so   */
  /* (per the mov-to-pr mask encoding) p1-p5 and the rotating predicates   */
  /* are written back, while p6-p15 are left as they are.                  */
  369. { .mii
  370. mov pr = PR, -65474
  371. }
  372. ;;
  373. { .mfb
  374. FADD f8 = f8, f10
  375. br.ret.sptk.many b0
  376. }
  377. EPILOGUE