You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dot_sse2_opteron.S 8.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Bytes pushed by the prologue below before any argument is read: */
  /* three pushl instructions (edi, esi, ebx), 4 bytes each. */
  40. #define STACK 12
  /* Extra offset folded into every argument address; zero here */
  /* (presumably the size of a local scratch area - confirm). */
  41. #define ARGS 0
  /* Stack-relative operands for the five arguments. Each expands textually */
  /* to <offset> + STACK + ARGS(%esp); the leading 4/8/12/16/20 steps over */
  /* the return address and the preceding 4-byte argument slots. */
  42. #define STACK_N 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  /* Register aliases used throughout the kernel. %eax is not aliased; the */
  /* kernel uses it directly as a loop counter and scratch register. */
  47. #define N %ecx
  48. #define X %esi
  49. #define INCX %ebx
  50. #define Y %edi
  51. #define INCY %edx
  /* Prefetch distance ahead of Y, in units of SIZE (it is multiplied by */
  /* SIZE where the prefetch instructions use it). */
  52. #define PREFETCHSIZE 84
  /*
   * Dot-product kernel: accumulates X[i] * Y[i] for N elements and leaves
   * the sum on the x87 stack (see the fldl at the end).
   *
   * Arguments are read from the stack through STACK_N, STACK_X, STACK_INCX,
   * STACK_Y and STACK_INCY. When F_INTERFACE is defined, N, INCX and INCY
   * are pointers and are dereferenced after loading.
   *
   * SIZE is the element width and comes from outside this file (presumably
   * common.h); the movsd/mulpd/fldl usage implies 8-byte doubles - confirm.
   * PROLOGUE, PROFCODE, EPILOGUE, ALIGN_2 and ALIGN_3 are likewise macros
   * defined outside this view.
   *
   * Two code paths:
   *   .L10 - .L27  both increments equal one element: packed arithmetic,
   *                16 elements per main-loop iteration.
   *   .L50 - .L56  any other increment: scalar arithmetic, 4 per iteration.
   * Both paths accumulate into xmm0-xmm3, which are reduced at .L999.
   */
  53. PROLOGUE
  /* Save the registers that will hold Y, X and INCX; restored before ret. */
  54. pushl %edi
  55. pushl %esi
  56. pushl %ebx
  57. PROFCODE
  /* Load the five arguments into their register aliases. */
  58. movl STACK_N, N
  59. movl STACK_X, X
  60. movl STACK_INCX, INCX
  61. movl STACK_Y, Y
  62. movl STACK_INCY, INCY
  /* With F_INTERFACE the scalar arguments arrive by reference. */
  63. #ifdef F_INTERFACE
  64. movl (N), N # N
  65. movl (INCX),INCX # INCX
  66. movl (INCY),INCY # INCY
  67. #endif
  /* Convert both increments from element counts to byte strides. */
  68. leal (, INCX, SIZE), INCX
  69. leal (, INCY, SIZE), INCY
  /* Clear the four accumulators. */
  70. pxor %xmm0, %xmm0
  71. pxor %xmm1, %xmm1
  72. pxor %xmm2, %xmm2
  73. pxor %xmm3, %xmm3
  /* N <= 0: go straight to the reduction, which then yields zero. */
  74. cmpl $0, N
  75. jle .L999
  /* Take the packed path only when both byte strides equal SIZE. */
  76. cmpl $SIZE, INCX
  77. jne .L50
  78. cmpl $SIZE, INCY
  79. jne .L50
  /* If the SIZE bit of the Y address is set, handle one element with */
  /* scalar instructions so that Y advances by SIZE. Presumably this makes */
  /* Y 16-byte aligned for the mulpd memory operands below (assumes */
  /* SIZE == 8 and Y is SIZE-aligned - confirm). The product lands in the */
  /* low lane of xmm0. */
  80. testl $SIZE, Y
  81. je .L10
  82. movsd 0 * SIZE(X), %xmm0
  83. mulsd 0 * SIZE(Y), %xmm0
  84. addl $1 * SIZE, X
  85. addl $1 * SIZE, Y
  86. decl N
  87. ALIGN_2
  88. .L10:
  /* eax = number of 16-element blocks; none -> remainder handling. */
  89. movl N, %eax
  90. sarl $4, %eax
  91. jle .L24
  /* Prime the loop: compute products of elements 0..7 into xmm4-xmm7. */
  /* X is loaded in two halves per register (movlpd/movhpd); Y is used */
  /* directly as a packed memory operand. */
  92. movlpd 0 * SIZE(X), %xmm4
  93. movhpd 1 * SIZE(X), %xmm4
  94. movlpd 2 * SIZE(X), %xmm5
  95. movhpd 3 * SIZE(X), %xmm5
  96. movlpd 4 * SIZE(X), %xmm6
  97. movhpd 5 * SIZE(X), %xmm6
  98. movlpd 6 * SIZE(X), %xmm7
  99. movhpd 7 * SIZE(X), %xmm7
  100. mulpd 0 * SIZE(Y), %xmm4
  101. mulpd 2 * SIZE(Y), %xmm5
  102. mulpd 4 * SIZE(Y), %xmm6
  103. mulpd 6 * SIZE(Y), %xmm7
  /* One block is already primed; if it was the only one, skip to the */
  /* drain code at .L22. */
  104. decl %eax
  105. jle .L22
  106. ALIGN_3
  107. .L21:
  /* Main loop, 16 elements per iteration. On entry xmm4-xmm7 hold the */
  /* products of elements 0..7. Each half adds the pending products into */
  /* the accumulators and then computes the next eight products (elements */
  /* 8..15, then 16..23), so on exit from an iteration xmm4-xmm7 again */
  /* hold the products of elements 0..7 relative to the advanced pointers. */
  /* Only Y is prefetched explicitly. */
  108. prefetch (PREFETCHSIZE + 0) * SIZE(Y)
  109. addpd %xmm4, %xmm0
  110. movlpd 8 * SIZE(X), %xmm4
  111. movhpd 9 * SIZE(X), %xmm4
  112. addpd %xmm5, %xmm1
  113. movlpd 10 * SIZE(X), %xmm5
  114. movhpd 11 * SIZE(X), %xmm5
  115. addpd %xmm6, %xmm2
  116. movlpd 12 * SIZE(X), %xmm6
  117. movhpd 13 * SIZE(X), %xmm6
  118. addpd %xmm7, %xmm3
  119. movlpd 14 * SIZE(X), %xmm7
  120. movhpd 15 * SIZE(X), %xmm7
  121. mulpd 8 * SIZE(Y), %xmm4
  122. mulpd 10 * SIZE(Y), %xmm5
  123. mulpd 12 * SIZE(Y), %xmm6
  124. mulpd 14 * SIZE(Y), %xmm7
  125. prefetch (PREFETCHSIZE + 8) * SIZE(Y)
  126. addpd %xmm4, %xmm0
  127. movlpd 16 * SIZE(X), %xmm4
  128. movhpd 17 * SIZE(X), %xmm4
  129. addpd %xmm5, %xmm1
  130. movlpd 18 * SIZE(X), %xmm5
  131. movhpd 19 * SIZE(X), %xmm5
  132. addpd %xmm6, %xmm2
  133. movlpd 20 * SIZE(X), %xmm6
  134. movhpd 21 * SIZE(X), %xmm6
  135. addpd %xmm7, %xmm3
  136. movlpd 22 * SIZE(X), %xmm7
  137. movhpd 23 * SIZE(X), %xmm7
  138. mulpd 16 * SIZE(Y), %xmm4
  139. mulpd 18 * SIZE(Y), %xmm5
  140. mulpd 20 * SIZE(Y), %xmm6
  141. mulpd 22 * SIZE(Y), %xmm7
  142. addl $16 * SIZE, X
  143. addl $16 * SIZE, Y
  144. decl %eax
  145. jg .L21
  146. ALIGN_3
  147. .L22:
  /* Drain the last block: add the pending products of elements 0..7, */
  /* then compute and add elements 8..15, and step past the block. */
  148. addpd %xmm4, %xmm0
  149. movlpd 8 * SIZE(X), %xmm4
  150. movhpd 9 * SIZE(X), %xmm4
  151. addpd %xmm5, %xmm1
  152. movlpd 10 * SIZE(X), %xmm5
  153. movhpd 11 * SIZE(X), %xmm5
  154. addpd %xmm6, %xmm2
  155. movlpd 12 * SIZE(X), %xmm6
  156. movhpd 13 * SIZE(X), %xmm6
  157. addpd %xmm7, %xmm3
  158. movlpd 14 * SIZE(X), %xmm7
  159. movhpd 15 * SIZE(X), %xmm7
  160. mulpd 8 * SIZE(Y), %xmm4
  161. mulpd 10 * SIZE(Y), %xmm5
  162. mulpd 12 * SIZE(Y), %xmm6
  163. mulpd 14 * SIZE(Y), %xmm7
  164. addpd %xmm4, %xmm0
  165. addpd %xmm5, %xmm1
  166. addpd %xmm6, %xmm2
  167. addpd %xmm7, %xmm3
  168. addl $16 * SIZE, X
  169. addl $16 * SIZE, Y
  170. ALIGN_3
  171. .L24:
  /* Remainder (N mod 16), handled by testing bits 8, 4, 2, 1 of N. */
  /* testl clears OF and, for these small masks, leaves SF clear, so each */
  /* jle below is taken exactly when the tested bits are all zero. */
  172. testl $15, N
  173. jle .L999
  /* Eight leftover elements. */
  174. testl $8, N
  175. jle .L25
  176. movlpd 0 * SIZE(X), %xmm4
  177. movhpd 1 * SIZE(X), %xmm4
  178. movlpd 2 * SIZE(X), %xmm5
  179. movhpd 3 * SIZE(X), %xmm5
  180. movlpd 4 * SIZE(X), %xmm6
  181. movhpd 5 * SIZE(X), %xmm6
  182. movlpd 6 * SIZE(X), %xmm7
  183. movhpd 7 * SIZE(X), %xmm7
  184. mulpd 0 * SIZE(Y), %xmm4
  185. mulpd 2 * SIZE(Y), %xmm5
  186. mulpd 4 * SIZE(Y), %xmm6
  187. mulpd 6 * SIZE(Y), %xmm7
  188. addpd %xmm4, %xmm0
  189. addpd %xmm5, %xmm1
  190. addpd %xmm6, %xmm2
  191. addpd %xmm7, %xmm3
  192. addl $8 * SIZE, X
  193. addl $8 * SIZE, Y
  194. ALIGN_3
  195. .L25:
  /* Four leftover elements. */
  196. testl $4, N
  197. jle .L26
  198. movlpd 0 * SIZE(X), %xmm4
  199. movhpd 1 * SIZE(X), %xmm4
  200. movlpd 2 * SIZE(X), %xmm5
  201. movhpd 3 * SIZE(X), %xmm5
  202. mulpd 0 * SIZE(Y), %xmm4
  203. mulpd 2 * SIZE(Y), %xmm5
  204. addpd %xmm4, %xmm0
  205. addpd %xmm5, %xmm1
  206. addl $4 * SIZE, X
  207. addl $4 * SIZE, Y
  208. ALIGN_3
  209. .L26:
  /* Two leftover elements. */
  210. testl $2, N
  211. jle .L27
  212. movlpd 0 * SIZE(X), %xmm4
  213. movhpd 1 * SIZE(X), %xmm4
  214. mulpd 0 * SIZE(Y), %xmm4
  215. addpd %xmm4, %xmm0
  216. addl $2 * SIZE, X
  217. addl $2 * SIZE, Y
  218. ALIGN_3
  219. .L27:
  /* One leftover element: scalar multiply, added to the low lane of xmm0. */
  220. testl $1, N
  221. jle .L999
  222. movsd 0 * SIZE(X), %xmm4
  223. mulsd 0 * SIZE(Y), %xmm4
  224. addsd %xmm4, %xmm0
  225. jmp .L999
  226. ALIGN_3
  227. .L50:
  /* Strided path. With F_INTERFACE, a negative stride first moves the */
  /* start pointer by -(N - 1) * stride bytes, i.e. to the far end of the */
  /* vector, so the loops below can step by the (negative) stride. */
  /* Without F_INTERFACE no such adjustment is made here. */
  228. #ifdef F_INTERFACE
  229. testl INCX, INCX
  230. jge .L51
  231. movl N, %eax
  232. decl %eax
  233. imull INCX, %eax
  234. subl %eax, X
  235. ALIGN_3
  236. .L51:
  237. testl INCY, INCY
  238. jge .L52
  239. movl N, %eax
  240. decl %eax
  241. imull INCY, %eax
  242. subl %eax, Y
  243. ALIGN_3
  244. .L52:
  245. #endif
  /* eax = number of 4-element groups; none -> tail loop setup at .L55. */
  246. movl N, %eax
  247. sarl $2, %eax
  248. jle .L55
  249. ALIGN_3
  250. .L53:
  /* Four scalar products per iteration, one per accumulator (low lanes). */
  /* Each pointer is advanced by its byte stride after every load. */
  251. movsd 0 * SIZE(X), %xmm4
  252. addl INCX, X
  253. mulsd 0 * SIZE(Y), %xmm4
  254. addl INCY, Y
  255. movsd 0 * SIZE(X), %xmm5
  256. addl INCX, X
  257. mulsd 0 * SIZE(Y), %xmm5
  258. addl INCY, Y
  259. movsd 0 * SIZE(X), %xmm6
  260. addl INCX, X
  261. mulsd 0 * SIZE(Y), %xmm6
  262. addl INCY, Y
  263. movsd 0 * SIZE(X), %xmm7
  264. addl INCX, X
  265. mulsd 0 * SIZE(Y), %xmm7
  266. addl INCY, Y
  267. addsd %xmm4, %xmm0
  268. addsd %xmm5, %xmm1
  269. addsd %xmm6, %xmm2
  270. addsd %xmm7, %xmm3
  271. decl %eax
  272. jg .L53
  273. ALIGN_3
  274. .L55:
  /* eax = N mod 4 leftover elements; none -> reduction. */
  275. movl N, %eax
  276. andl $3, %eax
  277. jle .L999
  278. ALIGN_3
  279. .L56:
  /* Tail loop: one element per iteration into xmm0. */
  280. movsd 0 * SIZE(X), %xmm4
  281. addl INCX, X
  282. mulsd 0 * SIZE(Y), %xmm4
  283. addl INCY, Y
  284. addsd %xmm4, %xmm0
  285. decl %eax
  286. jg .L56
  287. ALIGN_3
  288. .L999:
  /* Reduction: fold the four accumulators into xmm0 ... */
  289. addpd %xmm1, %xmm0
  290. addpd %xmm3, %xmm2
  291. addpd %xmm2, %xmm0
  /* ... then add the high lane of xmm0 to its low lane. haddpd is used */
  /* only when HAVE_SSE3 is defined and __INTERIX is not; otherwise the */
  /* same sum is formed with movapd/unpckhpd/addsd. */
  292. #if !defined(HAVE_SSE3) || defined(__INTERIX)
  293. movapd %xmm0, %xmm1
  294. unpckhpd %xmm0, %xmm0
  295. addsd %xmm1, %xmm0
  296. #else
  297. haddpd %xmm0, %xmm0
  298. #endif
  /* Move the result to the x87 stack through memory. The 8-byte store */
  /* reuses the incoming argument area starting at the N slot (it also */
  /* overwrites the following 4-byte slot); the arguments are no longer */
  /* needed at this point. */
  299. movsd %xmm0, STACK_N
  300. fldl STACK_N
  /* Restore saved registers in reverse push order and return. */
  301. popl %ebx
  302. popl %esi
  303. popl %edi
  304. ret
  305. EPILOGUE