You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zdot_amd.S 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Stack-argument layout, relative to %esp after the three register pushes
   * performed at function entry (STACK = 3 * 4 bytes).  ARGS is an extra
   * displacement, currently zero.
   *
   * RESULT is read only on the DOUBLE / XDOUBLE return paths, where the two
   * result components are stored through a pointer taken from the first
   * argument slot; every other argument then sits one slot higher.
   * Single-precision builds hand the result back in %eax:%edx, never read
   * RESULT, and use the layout without that extra leading slot.
   *
   * The condition below must therefore be the exact complement of the
   * "!defined(DOUBLE) && !defined(XDOUBLE)" test that guards the
   * register-return code; otherwise RESULT is undefined where it is used
   * and the register-return build reads its arguments from the wrong slots.
   */
  40. #define STACK 12
  41. #define ARGS 0
  42. #if defined(DOUBLE) || defined(XDOUBLE)
  43. #define RESULT 4 + STACK + ARGS(%esp)
  44. #define STACK_N 8 + STACK + ARGS(%esp)
  45. #define STACK_X 12 + STACK + ARGS(%esp)
  46. #define STACK_INCX 16 + STACK + ARGS(%esp)
  47. #define STACK_Y 20 + STACK + ARGS(%esp)
  48. #define STACK_INCY 24 + STACK + ARGS(%esp)
  49. #else
  50. #define STACK_N 4 + STACK + ARGS(%esp)
  51. #define STACK_X 8 + STACK + ARGS(%esp)
  52. #define STACK_INCX 12 + STACK + ARGS(%esp)
  53. #define STACK_Y 16 + STACK + ARGS(%esp)
  54. #define STACK_INCY 20 + STACK + ARGS(%esp)
  55. #endif
  /*
   * Dot-product kernel over two strided vectors whose elements are pairs of
   * SIZE-wide values -- presumably (real, imaginary) parts, given the CONJ
   * switch at .L27.
   *
   * Inputs come from the caller's stack through STACK_N, STACK_X, STACK_INCX,
   * STACK_Y and STACK_INCY.  With F_INTERFACE the N, INCX and INCY slots hold
   * pointers that are dereferenced first.
   *
   * Four running sums live on the x87 stack for the whole routine.  Assuming
   * FLD pushes a memory operand and FMUL multiplies st(0) by a memory operand
   * (both macros come from common.h -- confirm), every pair contributes
   *     st(0) += x[0]*y[0]      st(1) += x[0]*y[1]
   *     st(2) += x[1]*y[0]      st(3) += x[1]*y[1]
   * where [0]/[1] are the first/second slot of the pair.  .L27 folds the four
   * sums into the two result components.
   *
   * %edi, %esi and %ebx are saved on entry and restored on both exit paths.
   */
  56. PROLOGUE
  57. pushl %edi
  58. pushl %esi
  59. pushl %ebx
  60. PROFCODE
  61. #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
  62. EMMS
  63. #endif
  /* Register roles for the rest of the routine. */
  64. #define N %ebx
  65. #define X %esi
  66. #define INCX %ecx
  67. #define Y %edi
  68. #define INCY %edx
  69. movl STACK_N, N
  70. movl STACK_X, X
  71. movl STACK_INCX, INCX
  72. movl STACK_Y, Y
  73. movl STACK_INCY, INCY
  /* F_INTERFACE: the scalar arguments were passed by reference. */
  74. #if defined(F_INTERFACE)
  75. movl (N),N
  76. movl (INCX),INCX
  77. movl (INCY),INCY
  78. #endif
  /* N <= 0: nothing to accumulate, take the zero-result exit. */
  79. testl N, N
  80. jle .L88
  /* Push the four zero-initialised accumulators. */
  81. fldz
  82. fldz
  83. fldz
  84. fldz
  /* Turn element increments into byte strides: inc * 2 * SIZE (two slots per element). */
  85. addl INCX, INCX
  86. addl INCY, INCY
  87. leal (, INCX, SIZE), INCX
  88. leal (, INCY, SIZE), INCY
  /* The unrolled path is used only when both vectors are contiguous. */
  89. cmpl $2 * SIZE, INCX
  90. jne .L14
  91. cmpl $2 * SIZE, INCY
  92. jne .L14
  /* Contiguous case: N >> 2 iterations of four elements each. */
  93. movl N, %eax
  94. sarl $2, %eax
  95. jle .L15
  /* Prime the pipeline: x[0] of the first group is loaded before the loop. */
  96. FLD 0 * SIZE(X)
  97. ALIGN_3
  /*
   * Loop invariant at .L16: st(0) already holds X slot 0 of the current group,
   * with the four accumulators beneath it at st(1)..st(4).
   * PADDING is a common.h macro placed in front of the instruction; its
   * expansion is not visible in this file.
   */
  98. .L16:
  /* element 0 */
  99. FLD 0 * SIZE(Y)
  100. PADDING fmul %st(1)
  101. faddp %st, %st(2)
  102. FMUL 1 * SIZE(Y)
  103. faddp %st, %st(2)
  104. FLD 1 * SIZE(X)
  105. FLD 0 * SIZE(Y)
  106. PADDING fmul %st(1)
  107. faddp %st, %st(4)
  108. FMUL 1 * SIZE(Y)
  109. faddp %st, %st(4)
  /* element 1 */
  110. FLD 2 * SIZE(X)
  111. FLD 2 * SIZE(Y)
  112. PADDING fmul %st(1)
  113. faddp %st, %st(2)
  114. FMUL 3 * SIZE(Y)
  115. faddp %st, %st(2)
  116. FLD 3 * SIZE(X)
  117. FLD 2 * SIZE(Y)
  118. PADDING fmul %st(1)
  119. faddp %st, %st(4)
  120. FMUL 3 * SIZE(Y)
  121. faddp %st, %st(4)
  /* element 2 */
  122. FLD 4 * SIZE(X)
  123. FLD 4 * SIZE(Y)
  124. PADDING fmul %st(1)
  125. faddp %st, %st(2)
  126. FMUL 5 * SIZE(Y)
  127. faddp %st, %st(2)
  128. FLD 5 * SIZE(X)
  129. FLD 4 * SIZE(Y)
  130. PADDING fmul %st(1)
  131. faddp %st, %st(4)
  132. FMUL 5 * SIZE(Y)
  133. faddp %st, %st(4)
  /* element 3 */
  134. FLD 6 * SIZE(X)
  135. FLD 6 * SIZE(Y)
  136. PADDING fmul %st(1)
  137. faddp %st, %st(2)
  138. FMUL 7 * SIZE(Y)
  139. faddp %st, %st(2)
  140. FLD 7 * SIZE(X)
  141. FLD 6 * SIZE(Y)
  142. PADDING fmul %st(1)
  143. faddp %st, %st(4)
  144. FMUL 7 * SIZE(Y)
  145. faddp %st, %st(4)
  /*
   * Look-ahead load of slot 0 of the next group, re-establishing the loop
   * invariant.  NOTE(review): on the final iteration this touches the slot
   * just past the four elements processed; when N is a multiple of 4 that is
   * one slot beyond the N-th element.  The value is discarded after the loop.
   */
  146. FLD 8 * SIZE(X)
  147. prefetch 32 * SIZE(X)
  148. addl $8 * SIZE, X
  149. addl $8 * SIZE, Y
  150. decl %eax
  151. jg .L16
  /* Drop the unused look-ahead value so only the four accumulators remain. */
  152. ffreep %st(0)
  153. ALIGN_3
  /* Contiguous remainder: N & 3 elements, one per iteration. */
  154. .L15:
  155. movl N, %eax
  156. andl $3, %eax
  157. jle .L27
  158. ALIGN_3
  159. .L22:
  160. FLD 0 * SIZE(X)
  161. FLD 0 * SIZE(Y)
  162. fmul %st(1)
  163. faddp %st, %st(2)
  164. FMUL 1 * SIZE(Y)
  165. faddp %st, %st(2)
  166. FLD 1 * SIZE(X)
  167. FLD 0 * SIZE(Y)
  168. fmul %st(1)
  169. faddp %st, %st(4)
  170. FMUL 1 * SIZE(Y)
  171. faddp %st, %st(4)
  172. addl $2 * SIZE, X
  173. addl $2 * SIZE, Y
  174. decl %eax
  175. jg .L22
  176. jmp .L27
  177. ALIGN_3
  /*
   * General-stride path.  With F_INTERFACE, a negative stride first moves the
   * base pointer by -(N - 1) * stride bytes, i.e. forward to the element that
   * lies highest in memory, so that stepping by the negative stride walks
   * back down through the vector.
   */
  178. .L14:
  179. #ifdef F_INTERFACE
  180. testl INCX, INCX # if (incx < 0)
  181. jge .L28
  182. movl N, %eax
  183. decl %eax
  184. imull INCX, %eax
  185. subl %eax, X
  186. ALIGN_3
  187. .L28:
  188. testl INCY, INCY # if (incy < 0)
  189. jge .L29
  190. movl N, %eax
  191. decl %eax
  192. imull INCY, %eax
  193. subl %eax, Y
  194. ALIGN_3
  195. .L29:
  196. #endif
  /* Strided main loop: N >> 1 iterations of two elements each. */
  197. movl N, %eax
  198. sarl $1, %eax
  199. jle .L30
  200. ALIGN_3
  201. .L31:
  /* first element of the iteration */
  202. FLD 0 * SIZE(X)
  203. FLD 0 * SIZE(Y)
  204. fmul %st(1)
  205. faddp %st, %st(2)
  206. FMUL 1 * SIZE(Y)
  207. faddp %st, %st(2)
  208. FLD 1 * SIZE(X)
  209. FLD 0 * SIZE(Y)
  210. fmul %st(1)
  211. faddp %st, %st(4)
  212. FMUL 1 * SIZE(Y)
  213. faddp %st, %st(4)
  /* second element: pointer advances are interleaved with its first loads */
  214. addl INCX, X
  215. FLD 0 * SIZE(X)
  216. addl INCY, Y
  217. FLD 0 * SIZE(Y)
  218. fmul %st(1)
  219. faddp %st, %st(2)
  220. FMUL 1 * SIZE(Y)
  221. faddp %st, %st(2)
  222. FLD 1 * SIZE(X)
  223. FLD 0 * SIZE(Y)
  224. fmul %st(1)
  225. faddp %st, %st(4)
  226. FMUL 1 * SIZE(Y)
  227. faddp %st, %st(4)
  228. addl INCX, X
  229. addl INCY, Y
  230. decl %eax
  231. jg .L31
  232. ALIGN_3
  /* Strided remainder: one last element when N is odd. */
  233. .L30:
  234. movl N, %eax
  235. andl $1, %eax
  236. jle .L27
  237. ALIGN_3
  238. .L37:
  239. FLD 0 * SIZE(X)
  240. FLD 0 * SIZE(Y)
  241. fmul %st(1)
  242. faddp %st, %st(2)
  243. FMUL 1 * SIZE(Y)
  244. faddp %st, %st(2)
  245. FLD 1 * SIZE(X)
  246. FLD 0 * SIZE(Y)
  247. fmul %st(1)
  248. faddp %st, %st(4)
  249. FMUL 1 * SIZE(Y)
  250. faddp %st, %st(4)
  251. ALIGN_3
  /*
   * Reduction.  Without CONJ the outer sums (st(0), st(3)) are combined by
   * subtraction and the inner two by addition; with CONJ the roles are
   * swapped.  Afterwards st(0) holds the combination of the two cross
   * products and st(1) the combination of the two like products.
   * NOTE(review): the operand order of AT&T "fsubp %st, %st(i)" follows the
   * assembler's historical fsub/fsubr swap -- confirm the sign against the
   * GNU as documentation before changing these lines.
   */
  252. .L27:
  253. #ifndef CONJ
  254. fsubp %st, %st(3)
  255. faddp %st, %st(1)
  256. #else
  257. faddp %st, %st(3)
  258. fsubp %st, %st(1)
  259. #endif
  /*
   * Result hand-off.  Both branches issue two FST stores, slot 1 first and
   * then slot 0; FST presumably pops (common.h macro -- confirm), which is
   * what makes the second store see the other component.
   */
  260. #if !defined(DOUBLE) && !defined(XDOUBLE)
  /* Spill to a scratch area below %esp, then reload the raw bits into %eax (slot 0) and %edx (slot 1). */
  261. subl $2 * SIZE, %esp
  262. FST 1 * SIZE(%esp)
  263. FST 0 * SIZE(%esp)
  264. movl 0 * SIZE(%esp), %eax
  265. movl 1 * SIZE(%esp), %edx
  266. addl $2 * SIZE, %esp
  267. #else
  /* Store both components through the pointer read from the RESULT slot. */
  268. movl RESULT, %eax
  269. FST 1 * SIZE(%eax)
  270. FST 0 * SIZE(%eax)
  271. #endif
  272. popl %ebx
  273. popl %esi
  274. popl %edi
  /* F_INTERFACE with F_PATHSCALE: the callee also removes 4 bytes of arguments. */
  275. #if defined(F_INTERFACE) && defined(F_PATHSCALE)
  276. ret $0x4
  277. #else
  278. ret
  279. #endif
  280. ALIGN_3
  /*
   * Zero-result exit for N <= 0.  It is reached before any accumulator has
   * been pushed, so the x87 stack holds nothing from this routine here.
   */
  281. .L88:
  282. #if !defined(DOUBLE) && !defined(XDOUBLE)
  /* Return all-zero bits in both result registers. */
  283. xor %eax, %eax
  284. xor %edx, %edx
  285. #else
  /* Push two zeros and store them through the RESULT pointer. */
  286. movl RESULT, %eax
  287. fldz
  288. fldz
  289. FST 1 * SIZE(%eax)
  290. FST 0 * SIZE(%eax)
  291. #endif
  292. popl %ebx
  293. popl %esi
  294. popl %edi
  295. #if defined(F_INTERFACE) && defined(F_PATHSCALE)
  296. ret $0x4
  297. #else
  298. ret
  299. #endif
  300. EPILOGUE