You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* PREFETCH_SIZE: prefetch lead. It is multiplied by SIZE and added to X to form PRE1. */
  /* The value is chosen per precision: XDOUBLE, DOUBLE, or the remaining (default) build. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 16)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 16)
  44. #else
  45. #define PREFETCH_SIZE (32 * 16)
  46. #endif
  /* COMPADD: 0 for real builds, 1 for COMPLEX builds. It is subtracted from the shift and */
  /* bit-test counts applied to N and added to the shift that scales INCX. */
  /* STRIDE: post-increment applied after the first load of each load pair. */
  /* Real builds step by INCX. COMPLEX builds step by SIZE, presumably from the real part */
  /* to the imaginary part of the same element (TODO confirm against common.h). */
  47. #ifndef COMPLEX
  48. #define COMPADD 0
  49. #define STRIDE INCX
  50. #else
  51. #define COMPADD 1
  52. #define STRIDE SIZE
  53. #endif
  /* PRE1: address handed to lfetch inside the main loop. */
  54. #define PRE1 r2
  /* I: count of full unrolled blocks, later decremented and copied to ar.lc. */
  55. #define I r17
  /* J: low bits of N that do not fill a whole block. */
  56. #define J r18
  /* X2: second load pointer, running ahead of X. */
  57. #define X2 r19
  /* INCX5: INCX + (INCX << (2 - COMPADD)), the long hop used after every fourth load of a stream. */
  58. #define INCX5 r20
  /* INCX16: INCX << (4 - COMPADD), the prefetch pointer advance per loop iteration. */
  59. #define INCX16 r21
  /* N, X, INCX: incoming arguments in r32..r34 (presumably the first three stacked input registers). */
  60. #define N r32
  61. #define X r33
  62. #define INCX r34
  /* PR: copy of the predicate registers taken before the loop set-up changes them. */
  63. #define PR r30
  /* ARLC: copy of ar.lc taken on entry. */
  64. #define ARLC r31
  /* Sum-of-squares reduction over a strided vector, finished by a branch to sqrt. */
  /* Inputs (see the register defines): N = element count, X = data address, INCX = stride in elements. */
  /* With F_INTERFACE, N and INCX arrive as addresses and are loaded through below. */
  /* Flow: set-up, software-pipelined main loop (.L51), tail for leftover values (.L52), */
  /* reduction of eight partial sums into f8 and hand-off to sqrt (.L998). */
  /* When N <= 0 or INCX <= 0 the routine returns immediately with f8 = f0. */
  /* All arithmetic uses the .d completer (fma.d / fadd.d) whatever the build precision. */
  /* NOTE(review): squares are accumulated without scaling, so very large or very small */
  /* inputs may overflow or underflow before sqrt is applied - confirm this is acceptable. */
  65. PROLOGUE
  66. .prologue
  67. PROFCODE
  68. { .mfi
  /* PRE1 = X plus the prefetch lead. */
  69. adds PRE1 = PREFETCH_SIZE * SIZE, X
  /* f8..f15 are eight partial sums, each cleared from f0 before first use. */
  70. mov f8 = f0
  71. .save ar.lc, ARLC
  /* Keep the entry value of ar.lc so it can be put back at .L998. */
  72. mov ARLC = ar.lc
  73. }
  74. ;;
  75. .body
  76. #ifdef F_INTERFACE
  /* F_INTERFACE build: N and INCX hold addresses, so fetch the values they point to. */
  77. LDINT N = [N]
  78. LDINT INCX = [INCX]
  79. ;;
  80. #ifndef USE64BITINT
  /* Values were loaded as 4-byte integers: sign-extend them to register width. */
  81. sxt4 N = N
  82. sxt4 INCX = INCX
  83. ;;
  84. #endif
  85. #endif
  86. { .mmi
  /* p6 = (0 >= N), p7 = (0 >= INCX): either one triggers the early return below. */
  87. cmp.ge p6, p0 = r0, N
  88. cmp.ge p7, p0 = r0, INCX
  /* I = number of full blocks (16 values per block for real data, 8 elements for COMPLEX). */
  89. shr I = N, (4 - COMPADD)
  90. }
  91. { .mbb
  /* J = leftover element count, the low (4 - COMPADD) bits of N. */
  92. and J = ((1 << (4 - COMPADD)) - 1), N
  93. (p6) br.ret.sptk.many b0
  94. (p7) br.ret.sptk.many b0
  95. }
  96. ;;
  97. { .mfi
  98. mov f9 = f0
  /* Snapshot the predicates before p12..p18 and pr.rot are modified. */
  99. mov PR = pr
  100. }
  101. { .mfi
  /* ar.lc receives I - 1. A negative result means there is no full block (tested into p8 below). */
  102. adds I = -1, I
  103. mov f10 = f0
  /* Scale INCX by a shift of BASE_SHIFT + COMPADD (presumably elements to bytes, BASE_SHIFT comes from common.h). */
  104. shl INCX = INCX, (BASE_SHIFT + COMPADD)
  105. }
  106. ;;
  107. { .mfi
  /* X2 = X + (INCX << (2 - COMPADD)): second stream starts four loads ahead of X. */
  108. shladd X2 = INCX, (2 - COMPADD), X
  109. mov f11 = f0
  /* Clear the rotating predicates so only p16 (set below) starts the pipeline. */
  110. mov pr.rot = 0
  111. }
  112. { .mfi
  /* INCX5 = (INCX << (2 - COMPADD)) + INCX: hop that carries a stream past the other stream's group. */
  113. shladd INCX5 = INCX, (2 - COMPADD), INCX
  114. mov f12 = f0
  /* p12 = bit (3 - COMPADD) of N is set: the tail contains a group of eight loads. */
  115. tbit.z p0, p12 = N, (3 - COMPADD)
  116. }
  117. ;;
  118. { .mfi
  /* INCX16 = INCX << (4 - COMPADD): distance covered by one main-loop iteration. */
  119. shladd INCX16 = INCX, (4 - COMPADD), r0
  120. mov f13 = f0
  /* Epilog count 3 matches the stage predicates used in .L51 (p16 loads, p18 computes). */
  121. mov ar.ec= 3
  122. }
  123. { .mmf
  /* p8 = (0 > I): no full block, so the main loop is skipped. */
  124. cmp.gt p8 ,p0 = r0, I
  /* r0 == r0 always holds, so this just sets p16 to enable the first pipeline stage. */
  125. cmp.eq p16, p0 = r0, r0
  126. mov f14 = f0
  127. }
  128. ;;
  129. { .mmf
  130. #ifdef COMPLEX
  /* COMPLEX: the first load of each pair already advanced by SIZE (STRIDE), */
  /* so take SIZE off the increments used by the second load of the pair. */
  131. adds INCX = - SIZE, INCX
  132. adds INCX5 = - SIZE, INCX5
  133. #else
  134. nop.m 0
  135. nop.m 0
  136. #endif
  137. mov f15 = f0
  138. }
  139. { .mib
  /* p9 = (J == 0): nothing left over after the full blocks. */
  140. cmp.eq p9, p0 = r0, J
  141. mov ar.lc = I
  142. (p8) br.cond.dpnt .L52
  143. }
  144. ;;
  145. .align 32
  /* Main loop, software pipelined with br.ctop and rotating registers. */
  /* Stage p16 issues 16 loads per iteration, alternating between the X and X2 streams, */
  /* into every third rotating register f32, f35, ... f77. */
  /* Stage p18 runs two rotations later, so each value is read back as load target + 2 */
  /* (f32 is seen as f34, f35 as f37, and so on) and its square is added to one of f8..f15. */
  /* Each of the eight partial sums is updated twice per iteration. */
  146. .L51:
  147. (p16) LDFD f32 = [X], STRIDE
  /* Prefetch ahead of the loads and move the prefetch address on by one block. */
  148. (p16) lfetch.nt1 [PRE1], INCX16
  149. (p18) fma.d.s1 f8 = f34, f34, f8
  150. (p16) LDFD f35 = [X2], STRIDE
  151. (p18) fma.d.s1 f9 = f37, f37, f9
  152. nop.b 0
  153. ;;
  154. (p16) LDFD f38 = [X], INCX
  155. (p18) fma.d.s1 f10 = f40, f40, f10
  156. nop.b 0
  157. (p16) LDFD f41 = [X2], INCX
  158. (p18) fma.d.s1 f11 = f43, f43, f11
  159. nop.b 0
  160. ;;
  161. (p16) LDFD f44 = [X], STRIDE
  162. (p18) fma.d.s1 f12 = f46, f46, f12
  163. nop.b 0
  164. (p16) LDFD f47 = [X2], STRIDE
  165. (p18) fma.d.s1 f13 = f49, f49, f13
  166. nop.b 0
  167. ;;
  /* Fourth load of each stream uses INCX5 to jump over the group owned by the other stream. */
  168. (p16) LDFD f50 = [X], INCX5
  169. (p18) fma.d.s1 f14 = f52, f52, f14
  170. nop.b 0
  171. (p16) LDFD f53 = [X2], INCX5
  172. (p18) fma.d.s1 f15 = f55, f55, f15
  173. nop.b 0
  174. ;;
  /* Second half of the block: same load pattern, partial sums f8..f15 reused. */
  175. (p16) LDFD f56 = [X], STRIDE
  176. (p18) fma.d.s1 f8 = f58, f58, f8
  177. nop.b 0
  178. (p16) LDFD f59 = [X2], STRIDE
  179. (p18) fma.d.s1 f9 = f61, f61, f9
  180. nop.b 0
  181. ;;
  182. (p16) LDFD f62 = [X], INCX
  183. (p18) fma.d.s1 f10 = f64, f64, f10
  184. nop.b 0
  185. (p16) LDFD f65 = [X2], INCX
  186. (p18) fma.d.s1 f11 = f67, f67, f11
  187. nop.b 0
  188. ;;
  189. (p16) LDFD f68 = [X], STRIDE
  190. (p18) fma.d.s1 f12 = f70, f70, f12
  191. nop.b 0
  192. (p16) LDFD f71 = [X2], STRIDE
  193. (p18) fma.d.s1 f13 = f73, f73, f13
  194. nop.b 0
  195. ;;
  196. (p16) LDFD f74 = [X], INCX5
  197. (p18) fma.d.s1 f14 = f76, f76, f14
  198. nop.b 0
  199. (p16) LDFD f77 = [X2], INCX5
  200. (p18) fma.d.s1 f15 = f79, f79, f15
  /* Counted loop branch: rotates registers and predicates, driven by ar.lc and ar.ec. */
  201. br.ctop.sptk.few .L51
  202. ;;
  203. .align 32
  /* Tail: bits of N below the block size select groups of 8, 4, 2 and 1 loads. */
  /* Not pipelined: registers f32..f46 are used directly here. */
  204. .L52:
  205. { .mmb
  /* p12 group: eight loads through X and X2 into f32..f39. */
  /* If p9 is set then J == 0, which implies p12 is clear, so these loads do nothing on that path. */
  206. (p12) LDFD f32 = [X], STRIDE
  207. (p12) LDFD f33 = [X2], STRIDE
  208. (p9) br.cond.dptk .L998
  209. }
  210. ;;
  211. { .mmi
  212. (p12) LDFD f34 = [X], INCX
  213. (p12) LDFD f35 = [X2], INCX
  /* p13 = bit (2 - COMPADD) of N is set: a group of four loads follows. */
  214. tbit.z p0, p13 = N, (2 - COMPADD)
  215. }
  216. ;;
  217. { .mmi
  218. (p12) LDFD f36 = [X], STRIDE
  219. (p12) LDFD f37 = [X2], STRIDE
  /* p14 = bit (1 - COMPADD) of N is set: a group of two loads follows. */
  220. tbit.z p0, p14 = N, (1 - COMPADD)
  221. }
  222. ;;
  223. { .mmi
  /* INCX5 leaves X just past the eight values consumed by this group. */
  224. (p12) LDFD f38 = [X], INCX5
  225. (p12) LDFD f39 = [X2], INCX5
  226. #ifndef COMPLEX
  /* Real data only: p15 = bit 0 of N is set, one final single load. */
  227. tbit.z p0, p15 = N, 0
  228. #endif
  229. }
  230. ;;
  /* From here on only X is used. Loads for the next group overlap the fma of the previous one. */
  231. (p13) LDFD f40 = [X], STRIDE
  232. (p12) fma.d.s1 f8 = f32, f32, f8
  233. (p12) fma.d.s1 f9 = f33, f33, f9
  234. ;;
  235. (p13) LDFD f41 = [X], INCX
  236. (p12) fma.d.s1 f10 = f34, f34, f10
  237. (p12) fma.d.s1 f11 = f35, f35, f11
  238. ;;
  239. (p13) LDFD f42 = [X], STRIDE
  240. (p12) fma.d.s1 f12 = f36, f36, f12
  241. (p12) fma.d.s1 f13 = f37, f37, f13
  242. ;;
  243. (p13) LDFD f43 = [X], INCX
  244. (p12) fma.d.s1 f14 = f38, f38, f14
  245. (p12) fma.d.s1 f15 = f39, f39, f15
  246. ;;
  /* p13 group squares go to f8..f11, p14 group to f12 and f13, p15 value to f14. */
  247. (p14) LDFD f44 = [X], STRIDE
  248. (p13) fma.d.s1 f8 = f40, f40, f8
  249. (p13) fma.d.s1 f9 = f41, f41, f9
  250. ;;
  251. (p14) LDFD f45 = [X], INCX
  252. (p13) fma.d.s1 f10 = f42, f42, f10
  253. (p13) fma.d.s1 f11 = f43, f43, f11
  254. ;;
  255. #ifndef COMPLEX
  /* Last load has no post-increment: X is not read again. */
  256. (p15) LDFD f46 = [X]
  257. #endif
  258. (p14) fma.d.s1 f12 = f44, f44, f12
  259. (p14) fma.d.s1 f13 = f45, f45, f13
  260. ;;
  261. #ifndef COMPLEX
  262. (p15) fma.d.s1 f14 = f46, f46, f14
  263. ;;
  264. #endif
  265. .align 32
  /* Reduction: add the eight partial sums pairwise (8 to 4 to 2 to 1) into f8, */
  /* restoring ar.lc and the saved predicates along the way. */
  266. .L998:
  267. { .mmf
  268. fadd.d.s1 f8 = f8, f9
  269. }
  270. { .mmf
  271. fadd.d.s1 f10 = f10, f11
  272. }
  273. { .mmf
  274. fadd.d.s1 f12 = f12, f13
  275. }
  276. { .mfi
  277. fadd.d.s1 f14 = f14, f15
  /* Put back the ar.lc value captured on entry. */
  278. mov ar.lc = ARLC
  279. }
  280. ;;
  281. { .mmf
  282. fadd.d.s1 f8 = f8, f10
  283. }
  284. { .mfi
  285. fadd.d.s1 f12 = f12, f14
  /* Mask -65474 has bits 1 to 5 and bits 16 upward set: restores p1..p5 and the rotating predicates from PR. */
  286. mov pr = PR, -65474
  287. }
  288. ;;
  289. { .mfb
  /* f8 = total sum of squares. */
  290. fadd.d.s1 f8 = f8, f12
  /* Plain branch, not a call: b0 is untouched, so sqrt returns straight to our caller. */
  /* Presumably the C library sqrt taking its argument and returning its result in f8 (TODO confirm). */
  291. br sqrt
  292. }
  293. ;;
  294. EPILOGUE
  /* Declare sqrt, the target of the final branch of the routine, as a global function symbol. */
  /* No definition is given here, so the symbol is expected to be resolved at link time. */
  /* NOTE(review): the declarations are placed after a switch to the .data section. */
  /* Whether later content relies on that section switch cannot be seen from here. */
  295. .section .data
  296. .type sqrt, @function
  297. .global sqrt