You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum_sse2.S 6.7 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup for the routine below: stack-argument offsets and   */
  /* symbolic names for the general-purpose registers it uses.              */
  /* ASSEMBLER is defined before common.h is included; presumably it        */
  /* selects the assembler-facing parts of that header (confirm there).     */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK is the number of bytes the routine pushes before it reads its    */
  /* arguments: two pushl instructions (%esi and %ebx) of 4 bytes each.     */
  40. #define STACK 8
  /* ARGS is an extra offset term in the argument addresses; it is 0 here.  */
  41. #define ARGS 0
  /* Argument slots relative to %esp after the two pushes. The leading      */
  /* 4/8/12 presumably skips the 4-byte return address and then selects the */
  /* first, second and third 4-byte stack arguments. With ARGS = 0 these    */
  /* expand to 12(%esp), 16(%esp) and 20(%esp).                             */
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  /* Register roles:                                                        */
  /*   I    (%eax) loop counter for the unrolled loops                      */
  /*   M    (%ecx) number of elements still to process                      */
  /*   X    (%esi) current element pointer (saved/restored by the routine)  */
  /*   INCX (%ebx) stride, converted to bytes by the routine                */
  /*               (saved/restored by the routine)                          */
  45. #define I %eax
  46. #define M %ecx
  47. #define X %esi
  48. #define INCX %ebx
  /* l1param.h is not visible here; presumably it supplies the PREFETCH,    */
  /* PREFETCHSIZE and PREOFFSET tuning macros used in the main loop.        */
  49. #include "l1param.h"
  /* ---------------------------------------------------------------------- */
  /* Sum of absolute values of a double-precision vector (32-bit x86, SSE2). */
  /*                                                                         */
  /* Stack arguments (read through STACK_M / STACK_X / STACK_INCX):          */
  /*   M    element count                                                    */
  /*   X    pointer to the first element                                     */
  /*   INCX stride in elements (scaled to bytes with BASE_SHIFT below)       */
  /* Result: the 64-bit sum is stored to memory and loaded with fldl, so it  */
  /*   is on top of the x87 stack at return (presumably the ABI return slot  */
  /*   for double). Returns 0.0 when M <= 0 or INCX <= 0.                    */
  /* Registers: %esi and %ebx are saved and restored; %eax, %ecx, flags and  */
  /*   %xmm0, %xmm1, %xmm3-%xmm7 are overwritten. The stack slot of the M    */
  /*   argument is overwritten with the result.                              */
  /* Assumptions (not checked here): SIZE, BASE_SHIFT, PROLOGUE, PROFCODE,   */
  /*   EPILOGUE and ALIGN_n come from the included headers. The 2 * SIZE     */
  /*   spacing between 16-byte movaps loads implies SIZE is 8. The unit-     */
  /*   stride path uses movaps, so X is presumably at least SIZE-aligned.    */
  /* ---------------------------------------------------------------------- */
  50. PROLOGUE
  51. PROFCODE
  /* Save the two registers used for X and INCX (this is the 8 in STACK). */
  52. pushl %esi
  53. pushl %ebx
  /* Fetch the three arguments from the stack. */
  54. movl STACK_M, M
  55. movl STACK_X, X
  56. movl STACK_INCX, INCX
  /* xmm0 and xmm1 are the two running partial sums; start both at zero. */
  57. xorps %xmm0, %xmm0
  58. xorps %xmm1, %xmm1
  /* Nothing to do for a non-positive count or stride: exit with sum 0. */
  59. testl M, M
  60. jle .L999
  61. testl INCX, INCX
  62. jle .L999
  /* Build the abs mask in xmm3: all ones, then shift each 64-bit lane      */
  /* right by one, giving 0x7FFFFFFFFFFFFFFF. ANDing a double with it       */
  /* clears the sign bit.                                                   */
  63. pcmpeqb %xmm3, %xmm3
  64. psrlq $1, %xmm3
  /* Convert the stride from elements to bytes. */
  65. sall $BASE_SHIFT, INCX
  /* Bias X forward by 16 elements; every load below uses a displacement    */
  /* relative to that bias (-16 * SIZE is the current element). Presumably  */
  /* done to keep displacements and the immediate within signed-byte range. */
  66. subl $-16 * SIZE, X
  /* Non-unit stride goes to the strided path at .L40. */
  67. cmpl $SIZE, INCX
  68. jne .L40
  /* Unit stride: if the SIZE bit of the address is set, X is not on a      */
  /* 16-byte boundary (given SIZE-aligned input), so handle one element in  */
  /* scalar form first so that the movaps loads below are aligned.          */
  69. testl $SIZE, X
  70. je .L05
  /* xmm0 is still zero here, so this just places |x[0]| in its low lane. */
  71. movsd -16 * SIZE(X), %xmm0
  72. addl $SIZE, X
  73. andps %xmm3, %xmm0
  /* One element consumed; finish if that was the only one. */
  74. subl $1, M
  75. jle .L999
  76. ALIGN_3
  77. .L05:
  /* I = number of full 16-element blocks; skip to the remainder if none. */
  78. movl M, I
  79. sarl $4, I
  80. jle .L20
  /* Preload the first 8 elements of the first block into xmm4-xmm7. */
  81. movaps -16 * SIZE(X), %xmm4
  82. movaps -14 * SIZE(X), %xmm5
  83. movaps -12 * SIZE(X), %xmm6
  84. movaps -10 * SIZE(X), %xmm7
  /* With exactly one block there is nothing to run ahead on: go straight   */
  /* to the final-block code at .L11.                                       */
  85. decl I
  86. jle .L11
  87. ALIGN_4
  88. .L10:
  /* Main loop, executed once for every block except the last. On entry     */
  /* xmm4-xmm7 hold the first 8 elements of the current block. Each         */
  /* register is masked to its absolute value, added to xmm0 or xmm1        */
  /* (alternating), and then immediately reloaded with data needed later.   */
  89. #ifdef PREFETCH
  90. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  91. #endif
  /* Accumulate elements 0-7 of this block; reload with elements 8-15. */
  92. andps %xmm3, %xmm4
  93. addpd %xmm4, %xmm0
  94. movaps -8 * SIZE(X), %xmm4
  95. andps %xmm3, %xmm5
  96. addpd %xmm5, %xmm1
  97. movaps -6 * SIZE(X), %xmm5
  98. andps %xmm3, %xmm6
  99. addpd %xmm6, %xmm0
  100. movaps -4 * SIZE(X), %xmm6
  101. andps %xmm3, %xmm7
  102. addpd %xmm7, %xmm1
  103. movaps -2 * SIZE(X), %xmm7
  104. #if defined(PREFETCH) && !defined(FETCH128)
  105. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  106. #endif
  /* Accumulate elements 8-15; reload with the first 8 elements of the      */
  /* NEXT block (displacements 0 to 6 * SIZE lie past the current block).   */
  107. andps %xmm3, %xmm4
  108. addpd %xmm4, %xmm0
  109. movaps 0 * SIZE(X), %xmm4
  110. andps %xmm3, %xmm5
  111. addpd %xmm5, %xmm1
  112. movaps 2 * SIZE(X), %xmm5
  113. andps %xmm3, %xmm6
  114. addpd %xmm6, %xmm0
  115. movaps 4 * SIZE(X), %xmm6
  116. andps %xmm3, %xmm7
  117. addpd %xmm7, %xmm1
  118. movaps 6 * SIZE(X), %xmm7
  /* Advance X by one 16-element block and loop while blocks remain. */
  119. subl $-16 * SIZE, X
  120. decl I
  121. jg .L10
  122. ALIGN_4
  123. .L11:
  /* Final block: same accumulate pattern as the loop body, but only the    */
  /* second half of this block is loaded, so nothing beyond it is read.     */
  124. andps %xmm3, %xmm4
  125. addpd %xmm4, %xmm0
  126. movaps -8 * SIZE(X), %xmm4
  127. andps %xmm3, %xmm5
  128. addpd %xmm5, %xmm1
  129. movaps -6 * SIZE(X), %xmm5
  130. andps %xmm3, %xmm6
  131. addpd %xmm6, %xmm0
  132. movaps -4 * SIZE(X), %xmm6
  133. andps %xmm3, %xmm7
  134. addpd %xmm7, %xmm1
  135. movaps -2 * SIZE(X), %xmm7
  /* Accumulate the last 8 elements of the block; no further loads. */
  136. andps %xmm3, %xmm4
  137. addpd %xmm4, %xmm0
  138. andps %xmm3, %xmm5
  139. addpd %xmm5, %xmm1
  140. andps %xmm3, %xmm6
  141. addpd %xmm6, %xmm0
  142. andps %xmm3, %xmm7
  143. addpd %xmm7, %xmm1
  144. subl $-16 * SIZE, X
  145. ALIGN_3
  146. .L20:
  /* Unit-stride remainder: M = M mod 16, then handle it by testing the     */
  /* 8, 4, 2 and 1 bits of M in turn. Done if the remainder is zero.        */
  147. andl $15, M
  148. jle .L999
  /* 8 leftover elements: four aligned 2-element loads. */
  149. testl $8, M
  150. je .L21
  151. movaps -16 * SIZE(X), %xmm4
  152. movaps -14 * SIZE(X), %xmm5
  153. movaps -12 * SIZE(X), %xmm6
  154. movaps -10 * SIZE(X), %xmm7
  155. andps %xmm3, %xmm4
  156. addpd %xmm4, %xmm0
  157. andps %xmm3, %xmm5
  158. addpd %xmm5, %xmm1
  159. andps %xmm3, %xmm6
  160. addpd %xmm6, %xmm0
  161. andps %xmm3, %xmm7
  162. addpd %xmm7, %xmm1
  163. addl $8 * SIZE, X
  164. ALIGN_3
  165. .L21:
  /* 4 leftover elements: two aligned 2-element loads. */
  166. testl $4, M
  167. je .L22
  168. movaps -16 * SIZE(X), %xmm4
  169. movaps -14 * SIZE(X), %xmm5
  170. andps %xmm3, %xmm4
  171. addpd %xmm4, %xmm0
  172. andps %xmm3, %xmm5
  173. addpd %xmm5, %xmm1
  174. addl $4 * SIZE, X
  175. ALIGN_3
  176. .L22:
  /* 2 leftover elements: one aligned 2-element load. */
  177. testl $2, M
  178. je .L23
  179. movaps -16 * SIZE(X), %xmm4
  180. andps %xmm3, %xmm4
  181. addpd %xmm4, %xmm0
  182. addl $2 * SIZE, X
  183. .L23:
  /* 1 leftover element: scalar load, abs, scalar add into xmm1. */
  184. testl $1, M
  185. je .L999
  /* If movsd is a preprocessor macro (defined outside this file, so        */
  /* presumably remapped to a load that leaves the upper lane untouched),   */
  /* clear xmm4 first so no stale upper-lane data is carried along.         */
  186. #ifdef movsd
  187. xorps %xmm4, %xmm4
  188. #endif
  189. movsd -16 * SIZE(X), %xmm4
  190. andps %xmm3, %xmm4
  191. addsd %xmm4, %xmm1
  192. jmp .L999
  193. ALIGN_3
  194. .L40:
  /* Strided path (INCX in bytes differs from SIZE). I = number of full     */
  /* 8-element groups; skip to the one-at-a-time remainder if none.         */
  195. movl M, I
  196. sarl $3, I
  197. jle .L60
  198. ALIGN_4
  199. .L50:
  /* Each step gathers two strided elements into one register (movsd fills  */
  /* the low lane, movhps the high lane), clears the sign bits and adds the */
  /* pair to xmm0 or xmm1 alternately. Four such pairs per iteration.       */
  200. movsd -16 * SIZE(X), %xmm4
  201. addl INCX, X
  202. movhps -16 * SIZE(X), %xmm4
  203. addl INCX, X
  204. andps %xmm3, %xmm4
  205. addpd %xmm4, %xmm0
  206. movsd -16 * SIZE(X), %xmm5
  207. addl INCX, X
  208. movhps -16 * SIZE(X), %xmm5
  209. addl INCX, X
  210. andps %xmm3, %xmm5
  211. addpd %xmm5, %xmm1
  212. movsd -16 * SIZE(X), %xmm6
  213. addl INCX, X
  214. movhps -16 * SIZE(X), %xmm6
  215. addl INCX, X
  216. andps %xmm3, %xmm6
  217. addpd %xmm6, %xmm0
  218. movsd -16 * SIZE(X), %xmm7
  219. addl INCX, X
  220. movhps -16 * SIZE(X), %xmm7
  221. addl INCX, X
  222. andps %xmm3, %xmm7
  223. addpd %xmm7, %xmm1
  224. decl I
  225. jg .L50
  226. ALIGN_4
  227. .L60:
  /* Same guard as at .L23: clear xmm4 when movsd is a macro, before the    */
  /* scalar loads in the remainder loop.                                    */
  228. #ifdef movsd
  229. xorps %xmm4, %xmm4
  230. #endif
  /* Strided remainder: M = M mod 8; done if zero. */
  231. andl $7, M
  232. jle .L999
  233. ALIGN_4
  234. .L61:
  /* One element per iteration: load, abs, scalar add into xmm0, step X. */
  235. movsd -16 * SIZE(X), %xmm4
  236. andps %xmm3, %xmm4
  237. addsd %xmm4, %xmm0
  238. addl INCX, X
  239. decl M
  240. jg .L61
  241. ALIGN_4
  242. .L999:
  /* Common exit. Combine the two partial-sum registers lane by lane. */
  243. addpd %xmm1, %xmm0
  /* Horizontal add of the two lanes of xmm0 into its low lane. Without     */
  /* SSE3: copy xmm0, broadcast its high lane, add the saved low lane.      */
  /* With SSE3 a single haddpd does the same.                               */
  244. #ifndef HAVE_SSE3
  245. movaps %xmm0, %xmm1
  246. unpckhpd %xmm0, %xmm0
  247. addsd %xmm1, %xmm0
  248. #else
  249. haddpd %xmm0, %xmm0
  250. #endif
  /* Move the result from SSE to the x87 stack through memory, reusing the  */
  /* stack slot of the M argument as scratch space.                         */
  251. movsd %xmm0, STACK_M
  252. fldl STACK_M
  /* Restore the saved registers in reverse push order and return. */
  253. popl %ebx
  254. popl %esi
  255. ret
  256. EPILOGUE