You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

asum_sse.S 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Mark this translation unit as assembly before pulling in the shared
     project header.  NOTE(review): common.h is presumably where SIZE,
     PROLOGUE, PROFCODE, EPILOGUE and the ALIGN_n macros used by the kernel
     come from -- confirm. */
  #define ASSEMBLER
  #include "common.h"

  /* Stack-argument accessors.  STACK is the 8 bytes taken by the two
     registers the kernel pushes on entry; the leading 4 / 8 / 12 step over
     the return address to the first, second and third stack argument. */
  #define STACK           8
  #define ARGS            0

  #define STACK_M         4 + STACK + ARGS(%esp)
  #define STACK_X         8 + STACK + ARGS(%esp)
  #define STACK_INCX      12 + STACK + ARGS(%esp)

  /* Register roles inside the kernel. */
  #define I               %eax
  #define M               %ecx
  #define X               %esi
  #define INCX            %ebx

  /* NOTE(review): presumably supplies the PREFETCH / PREFETCHSIZE /
     PREOFFSET tuning macros referenced in the main loop -- confirm. */
  #include "l1param.h"
  /*
   * Absolute-value sum kernel for 32-bit x86 using SSE.
   *
   * Stack arguments (read through STACK_M / STACK_X / STACK_INCX):
   *   M     element count
   *   X     address of the first element
   *   INCX  stride between consecutive elements, in elements
   *
   * Each loaded 32-bit lane is ANDed with 0x7fffffff (sign bit cleared)
   * and accumulated into %xmm0 / %xmm1.  The two accumulators are summed
   * horizontally and the scalar result is returned on the x87 stack
   * (flds).  The result is 0.0 when M <= 0 or INCX <= 0.
   *
   * Registers: M = %ecx, X = %esi, INCX = %ebx, I = %eax (loop counter);
   * %esi and %ebx are saved and restored around the body.
   *
   * NOTE(review): the code handles elements as 4-byte floats (movss loads,
   * 4- and 8-byte alignment peeling), so it assumes SIZE == 4 -- confirm
   * against common.h.  The movaps loads on the unit-stride path also
   * assume X is at least 4-byte aligned, so that the peeling below can
   * reach a 16-byte boundary.
   */
  PROLOGUE
  PROFCODE

          pushl   %esi
          pushl   %ebx

          movl    STACK_M, M
          movl    STACK_X, X
          movl    STACK_INCX, INCX

          /* The accumulator starts at 0.0; this is also the value returned
             by the two early exits below. */
          xorps   %xmm0, %xmm0

          testl   M, M
          jle     .Lreturn
          testl   INCX, INCX
          jle     .Lreturn

          xorps   %xmm1, %xmm1

          /* %xmm3 = 0x7fffffff in every 32-bit lane (absolute-value mask). */
  #ifdef HAVE_SSE2
          pcmpeqb %xmm3, %xmm3
          psrld   $1, %xmm3
  #else
          /* No SSE2 integer ops: bounce the constant through the M stack
             slot, whose value is already held in a register. */
          movl    $0x7fffffff, STACK_M
          movss   STACK_M, %xmm3
          shufps  $0, %xmm3, %xmm3
  #endif

          /* Convert the stride from elements to bytes. */
          leal    (, INCX, SIZE), INCX

          cmpl    $SIZE, INCX
          jne     .Lstrided

          /* ---- Unit-stride path ------------------------------------- */

          /* Bias X by 32 elements; from here on -32 * SIZE(X) addresses
             the current element. */
          subl    $-32 * SIZE, X

          /* Three or fewer elements: only the 2- and 1-element tails. */
          cmpl    $3, M
          jle     .Ltail2

          /* Peel one element if address bit 2 is set. */
          testl   $4, X
          je      .Lalign8

          movss   -32 * SIZE(X), %xmm0
          andps   %xmm3, %xmm0
          addl    $SIZE, X
          decl    M
          jle     .Lreduce
          ALIGN_3

  .Lalign8:
          /* Peel two elements if address bit 3 is set; afterwards X is
             suitable for the movaps loads below. */
          testl   $8, X
          je      .Lunit_main

          movsd   -32 * SIZE(X), %xmm1
          andps   %xmm3, %xmm1
          addl    $2 * SIZE, X
          subl    $2, M
          jle     .Lreduce
          ALIGN_3

  .Lunit_main:
          /* I = number of full 32-element groups. */
          movl    M, I
          sarl    $5, I
          jle     .Ltail16

          /* Preload the first 16 elements of the first group. */
          movaps  -32 * SIZE(X), %xmm4
          movaps  -28 * SIZE(X), %xmm5
          movaps  -24 * SIZE(X), %xmm6
          movaps  -20 * SIZE(X), %xmm7

          decl    I
          jle     .Lunit_drain
          ALIGN_3

  .Lunit_loop:
          /* Software-pipelined body: accumulate the four vectors loaded
             earlier while reloading each register with data 16 elements
             ahead.  One pass consumes 32 elements and leaves the first
             16 elements of the next group preloaded. */
  #ifdef PREFETCH
          PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  #endif

          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0
          movaps  -16 * SIZE(X), %xmm4

          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1
          movaps  -12 * SIZE(X), %xmm5

          andps   %xmm3, %xmm6
          addps   %xmm6, %xmm0
          movaps  -8 * SIZE(X), %xmm6

          andps   %xmm3, %xmm7
          addps   %xmm7, %xmm1
          movaps  -4 * SIZE(X), %xmm7

  #if defined(PREFETCH) && !defined(FETCH128)
          PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  #endif

          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0
          movaps  0 * SIZE(X), %xmm4

          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1
          movaps  4 * SIZE(X), %xmm5

          andps   %xmm3, %xmm6
          addps   %xmm6, %xmm0
          movaps  8 * SIZE(X), %xmm6

          andps   %xmm3, %xmm7
          addps   %xmm7, %xmm1
          movaps  12 * SIZE(X), %xmm7

          subl    $-32 * SIZE, X
          decl    I
          jg      .Lunit_loop
          ALIGN_3

  .Lunit_drain:
          /* Last full group: same as the loop body, but without
             preloading anything past the group. */
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0
          movaps  -16 * SIZE(X), %xmm4

          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1
          movaps  -12 * SIZE(X), %xmm5

          andps   %xmm3, %xmm6
          addps   %xmm6, %xmm0
          movaps  -8 * SIZE(X), %xmm6

          andps   %xmm3, %xmm7
          addps   %xmm7, %xmm1
          movaps  -4 * SIZE(X), %xmm7

          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0
          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1
          andps   %xmm3, %xmm6
          addps   %xmm6, %xmm0
          andps   %xmm3, %xmm7
          addps   %xmm7, %xmm1

          subl    $-32 * SIZE, X
          ALIGN_3

          /* Remainder: the low five bits of M select 16-, 8-, 4-, 2- and
             1-element tails, handled in that order. */
  .Ltail16:
          testl   $16, M
          je      .Ltail8

          movaps  -32 * SIZE(X), %xmm4
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0

          movaps  -28 * SIZE(X), %xmm5
          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1

          movaps  -24 * SIZE(X), %xmm6
          andps   %xmm3, %xmm6
          addps   %xmm6, %xmm0

          movaps  -20 * SIZE(X), %xmm7
          andps   %xmm3, %xmm7
          addps   %xmm7, %xmm1

          addl    $16 * SIZE, X
          ALIGN_3

  .Ltail8:
          testl   $8, M
          je      .Ltail4

          movaps  -32 * SIZE(X), %xmm4
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0

          movaps  -28 * SIZE(X), %xmm5
          andps   %xmm3, %xmm5
          addps   %xmm5, %xmm1

          addl    $8 * SIZE, X
          ALIGN_3

  .Ltail4:
          testl   $4, M
          je      .Ltail2

          movaps  -32 * SIZE(X), %xmm4
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0

          addl    $4 * SIZE, X
          ALIGN_3

  .Ltail2:
          testl   $2, M
          je      .Ltail1

          /* When movsd has been redefined as a macro, clear %xmm4 first.
             NOTE(review): presumably the replacement instruction leaves
             the upper lanes untouched, and they are added by the addps
             below -- confirm against common.h. */
  #ifdef movsd
          xorps   %xmm4, %xmm4
  #endif
          movsd   -32 * SIZE(X), %xmm4
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm1

          addl    $2 * SIZE, X
          ALIGN_3

  .Ltail1:
          testl   $1, M
          je      .Lreduce

          movss   -32 * SIZE(X), %xmm4
          andps   %xmm3, %xmm4
          addps   %xmm4, %xmm0
          jmp     .Lreduce
          ALIGN_4

          /* ---- Strided path (INCX != 1 element) --------------------- */
  .Lstrided:
          /* I = number of 8-element groups. */
          movl    M, I
          sarl    $3, I
          jle     .Lstrided_rem
          ALIGN_4

  .Lstrided_loop:
          /* Eight scalar loads per pass, alternating between the two
             accumulators; only lane 0 of each accumulator is used. */
          movss   (X), %xmm4
          addl    INCX, X
          andps   %xmm3, %xmm4
          addss   %xmm4, %xmm0

          movss   (X), %xmm5
          addl    INCX, X
          andps   %xmm3, %xmm5
          addss   %xmm5, %xmm1

          movss   (X), %xmm6
          addl    INCX, X
          andps   %xmm3, %xmm6
          addss   %xmm6, %xmm0

          movss   (X), %xmm7
          addl    INCX, X
          andps   %xmm3, %xmm7
          addss   %xmm7, %xmm1

          movss   (X), %xmm4
          addl    INCX, X
          andps   %xmm3, %xmm4
          addss   %xmm4, %xmm0

          movss   (X), %xmm5
          addl    INCX, X
          andps   %xmm3, %xmm5
          addss   %xmm5, %xmm1

          movss   (X), %xmm6
          addl    INCX, X
          andps   %xmm3, %xmm6
          addss   %xmm6, %xmm0

          movss   (X), %xmm7
          addl    INCX, X
          andps   %xmm3, %xmm7
          addss   %xmm7, %xmm1

          decl    I
          jg      .Lstrided_loop
          ALIGN_4

  .Lstrided_rem:
          /* Remaining M mod 8 elements, one at a time. */
          andl    $7, M
          jle     .Lreduce
          ALIGN_4

  .Lstrided_rem_loop:
          movss   (X), %xmm4
          andps   %xmm3, %xmm4
          addss   %xmm4, %xmm0
          addl    INCX, X
          decl    M
          jg      .Lstrided_rem_loop
          ALIGN_4

          /* ---- Horizontal reduction and return ----------------------- */
  .Lreduce:
          /* Merge the two accumulators, then fold the four lanes of
             %xmm0 into lane 0. */
          addps   %xmm1, %xmm0

  #ifndef HAVE_SSE3
          movhlps %xmm0, %xmm1
          addps   %xmm1, %xmm0

          movaps  %xmm0, %xmm1
          shufps  $1, %xmm0, %xmm0
          addss   %xmm1, %xmm0
  #else
          haddps  %xmm0, %xmm0
          haddps  %xmm0, %xmm0
  #endif
          ALIGN_4

  .Lreturn:
          /* Move lane 0 to the x87 stack through the M argument slot,
             which is used here as scratch memory. */
          movss   %xmm0, STACK_M
          flds    STACK_M

          popl    %ebx
          popl    %esi
          ret

  EPILOGUE