You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dcopy_bulldozer.S 6.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M ARG1 /* rdi */
  41. #define X ARG2 /* rsi */
  42. #define INCX ARG3 /* rdx */
  43. #define Y ARG4 /* rcx */
  44. #ifndef WINDOWS_ABI
  45. #define INCY ARG5 /* r8 */
  46. #else
  47. #define INCY %r10
  48. #endif
  49. #include "l1param.h"
  50. #define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
  51. #define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
  52. #define A_PRE 640
  53. #define B_PRE 640
  54. PROLOGUE
  55. PROFCODE
  56. #ifdef WINDOWS_ABI
  57. movq 40(%rsp), INCY
  58. #endif
  59. SAVEREGISTERS
  60. leaq (, INCX, SIZE), INCX
  61. leaq (, INCY, SIZE), INCY
  62. cmpq $SIZE, INCX
  63. jne .L40
  64. cmpq $SIZE, INCY
  65. jne .L40
  66. testq $SIZE, X
  67. je .L10
  68. vmovsd (X), %xmm0
  69. vmovsd %xmm0, (Y)
  70. addq $1 * SIZE, X
  71. addq $1 * SIZE, Y
  72. decq M
  73. jle .L19
  74. ALIGN_4
  75. .L10:
  76. subq $-16 * SIZE, X
  77. subq $-16 * SIZE, Y
  78. movq M, %rax
  79. sarq $4, %rax
  80. jle .L13
  81. vmovups -16 * SIZE(X), %xmm0
  82. vmovups -14 * SIZE(X), %xmm1
  83. vmovups -12 * SIZE(X), %xmm2
  84. vmovups -10 * SIZE(X), %xmm3
  85. vmovups -8 * SIZE(X), %xmm4
  86. vmovups -6 * SIZE(X), %xmm5
  87. vmovups -4 * SIZE(X), %xmm6
  88. vmovups -2 * SIZE(X), %xmm7
  89. decq %rax
  90. jle .L12
  91. ALIGN_4
  92. .L11:
  93. prefetchnta A_PRE(X)
  94. nop
  95. vmovups %xmm0, -16 * SIZE(Y)
  96. vmovups %xmm1, -14 * SIZE(Y)
  97. prefetchnta B_PRE(Y)
  98. nop
  99. vmovups %xmm2, -12 * SIZE(Y)
  100. vmovups %xmm3, -10 * SIZE(Y)
  101. VLOAD( 0 * SIZE, X, %xmm0)
  102. VLOAD( 2 * SIZE, X, %xmm1)
  103. VLOAD( 4 * SIZE, X, %xmm2)
  104. VLOAD( 6 * SIZE, X, %xmm3)
  105. prefetchnta A_PRE+64(X)
  106. nop
  107. vmovups %xmm4, -8 * SIZE(Y)
  108. vmovups %xmm5, -6 * SIZE(Y)
  109. prefetchnta B_PRE+64(Y)
  110. nop
  111. vmovups %xmm6, -4 * SIZE(Y)
  112. vmovups %xmm7, -2 * SIZE(Y)
  113. VLOAD( 8 * SIZE, X, %xmm4)
  114. VLOAD(10 * SIZE, X, %xmm5)
  115. subq $-16 * SIZE, Y
  116. VLOAD(12 * SIZE, X, %xmm6)
  117. VLOAD(14 * SIZE, X, %xmm7)
  118. subq $-16 * SIZE, X
  119. decq %rax
  120. jg .L11
  121. ALIGN_3
  122. .L12:
  123. vmovups %xmm0, -16 * SIZE(Y)
  124. vmovups %xmm1, -14 * SIZE(Y)
  125. vmovups %xmm2, -12 * SIZE(Y)
  126. vmovups %xmm3, -10 * SIZE(Y)
  127. vmovups %xmm4, -8 * SIZE(Y)
  128. vmovups %xmm5, -6 * SIZE(Y)
  129. vmovups %xmm6, -4 * SIZE(Y)
  130. vmovups %xmm7, -2 * SIZE(Y)
  131. subq $-16 * SIZE, Y
  132. subq $-16 * SIZE, X
  133. ALIGN_3
  134. .L13:
  135. testq $8, M
  136. jle .L14
  137. ALIGN_3
  138. vmovups -16 * SIZE(X), %xmm0
  139. vmovups -14 * SIZE(X), %xmm1
  140. vmovups -12 * SIZE(X), %xmm2
  141. vmovups -10 * SIZE(X), %xmm3
  142. vmovups %xmm0, -16 * SIZE(Y)
  143. vmovups %xmm1, -14 * SIZE(Y)
  144. vmovups %xmm2, -12 * SIZE(Y)
  145. vmovups %xmm3, -10 * SIZE(Y)
  146. addq $8 * SIZE, X
  147. addq $8 * SIZE, Y
  148. ALIGN_3
  149. .L14:
  150. testq $4, M
  151. jle .L15
  152. ALIGN_3
  153. vmovups -16 * SIZE(X), %xmm0
  154. vmovups -14 * SIZE(X), %xmm1
  155. vmovups %xmm0, -16 * SIZE(Y)
  156. vmovups %xmm1, -14 * SIZE(Y)
  157. addq $4 * SIZE, X
  158. addq $4 * SIZE, Y
  159. ALIGN_3
  160. .L15:
  161. testq $2, M
  162. jle .L16
  163. ALIGN_3
  164. vmovups -16 * SIZE(X), %xmm0
  165. vmovups %xmm0, -16 * SIZE(Y)
  166. addq $2 * SIZE, X
  167. addq $2 * SIZE, Y
  168. ALIGN_3
  169. .L16:
  170. testq $1, M
  171. jle .L19
  172. ALIGN_3
  173. vmovsd -16 * SIZE(X), %xmm0
  174. vmovsd %xmm0, -16 * SIZE(Y)
  175. ALIGN_3
  176. .L19:
  177. xorq %rax,%rax
  178. RESTOREREGISTERS
  179. ret
  180. ALIGN_3
  181. .L40:
  182. movq M, %rax
  183. sarq $3, %rax
  184. jle .L45
  185. ALIGN_3
  186. .L41:
  187. vmovsd (X), %xmm0
  188. addq INCX, X
  189. vmovsd (X), %xmm4
  190. addq INCX, X
  191. vmovsd (X), %xmm1
  192. addq INCX, X
  193. vmovsd (X), %xmm5
  194. addq INCX, X
  195. vmovsd (X), %xmm2
  196. addq INCX, X
  197. vmovsd (X), %xmm6
  198. addq INCX, X
  199. vmovsd (X), %xmm3
  200. addq INCX, X
  201. vmovsd (X), %xmm7
  202. addq INCX, X
  203. vmovsd %xmm0, (Y)
  204. addq INCY, Y
  205. vmovsd %xmm4, (Y)
  206. addq INCY, Y
  207. vmovsd %xmm1, (Y)
  208. addq INCY, Y
  209. vmovsd %xmm5, (Y)
  210. addq INCY, Y
  211. vmovsd %xmm2, (Y)
  212. addq INCY, Y
  213. vmovsd %xmm6, (Y)
  214. addq INCY, Y
  215. vmovsd %xmm3, (Y)
  216. addq INCY, Y
  217. vmovsd %xmm7, (Y)
  218. addq INCY, Y
  219. decq %rax
  220. jg .L41
  221. ALIGN_3
  222. .L45:
  223. movq M, %rax
  224. andq $7, %rax
  225. jle .L47
  226. ALIGN_3
  227. .L46:
  228. vmovsd (X), %xmm0
  229. addq INCX, X
  230. vmovsd %xmm0, (Y)
  231. addq INCY, Y
  232. decq %rax
  233. jg .L46
  234. ALIGN_3
  235. .L47:
  236. xorq %rax, %rax
  237. RESTOREREGISTERS
  238. ret
  239. EPILOGUE