You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

nrm2_sse.S 6.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M ARG1 /* rdi */
  41. #define X ARG2 /* rsi */
  42. #define INCX ARG3 /* rdx */
  43. #define I %rax
  44. #include "l1param.h"
  45. PROLOGUE
  46. PROFCODE
  47. SAVEREGISTERS
  48. pxor %xmm0, %xmm0
  49. testq M, M
  50. jle .L999
  51. pxor %xmm1, %xmm1
  52. testq INCX, INCX
  53. je .L999
  54. pxor %xmm2, %xmm2
  55. leaq (, INCX, SIZE), INCX
  56. pxor %xmm3, %xmm3
  57. cmpq $SIZE, INCX
  58. jne .L40
  59. testq $SIZE, X
  60. je .L05
  61. movss 0 * SIZE(X), %xmm4
  62. cvtss2sd %xmm4, %xmm6
  63. mulsd %xmm6, %xmm6
  64. addsd %xmm6, %xmm3
  65. addq INCX, X
  66. decq M
  67. jle .L998
  68. ALIGN_3
  69. .L05:
  70. movq M, I
  71. sarq $3, I
  72. jle .L14
  73. movsd 0 * SIZE(X), %xmm4
  74. movsd 2 * SIZE(X), %xmm5
  75. movsd 4 * SIZE(X), %xmm6
  76. movsd 6 * SIZE(X), %xmm7
  77. addq $8 * SIZE, X
  78. decq I
  79. jle .L12
  80. ALIGN_3
  81. .L10:
  82. #ifdef PREFETCH
  83. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  84. #endif
  85. cvtps2pd %xmm4, %xmm8
  86. cvtps2pd %xmm5, %xmm9
  87. cvtps2pd %xmm6, %xmm10
  88. cvtps2pd %xmm7, %xmm11
  89. movsd 0 * SIZE(X), %xmm4
  90. movsd 2 * SIZE(X), %xmm5
  91. movsd 4 * SIZE(X), %xmm6
  92. movsd 6 * SIZE(X), %xmm7
  93. mulpd %xmm8, %xmm8
  94. mulpd %xmm9, %xmm9
  95. mulpd %xmm10, %xmm10
  96. mulpd %xmm11, %xmm11
  97. addpd %xmm8, %xmm0
  98. addpd %xmm9, %xmm1
  99. addpd %xmm10, %xmm2
  100. addpd %xmm11, %xmm3
  101. addq $8 * SIZE, X
  102. decq I
  103. jg .L10
  104. ALIGN_3
  105. .L12:
  106. cvtps2pd %xmm4, %xmm8
  107. cvtps2pd %xmm5, %xmm9
  108. cvtps2pd %xmm6, %xmm10
  109. cvtps2pd %xmm7, %xmm11
  110. mulpd %xmm8, %xmm8
  111. mulpd %xmm9, %xmm9
  112. mulpd %xmm10, %xmm10
  113. mulpd %xmm11, %xmm11
  114. addpd %xmm8, %xmm0
  115. addpd %xmm9, %xmm1
  116. addpd %xmm10, %xmm2
  117. addpd %xmm11, %xmm3
  118. ALIGN_3
  119. .L14:
  120. testq $4, M
  121. je .L15
  122. movsd 0 * SIZE(X), %xmm4
  123. movsd 2 * SIZE(X), %xmm5
  124. cvtps2pd %xmm4, %xmm6
  125. cvtps2pd %xmm5, %xmm7
  126. mulpd %xmm6, %xmm6
  127. mulpd %xmm7, %xmm7
  128. addpd %xmm6, %xmm0
  129. addpd %xmm7, %xmm1
  130. addq $4 * SIZE, X
  131. ALIGN_3
  132. .L15:
  133. testq $2, M
  134. je .L16
  135. movsd 0 * SIZE(X), %xmm4
  136. cvtps2pd %xmm4, %xmm6
  137. mulpd %xmm6, %xmm6
  138. addpd %xmm6, %xmm2
  139. addq $2 * SIZE, X
  140. ALIGN_3
  141. .L16:
  142. testq $1, M
  143. je .L998
  144. movss 0 * SIZE(X), %xmm4
  145. cvtss2sd %xmm4, %xmm6
  146. mulsd %xmm6, %xmm6
  147. addsd %xmm6, %xmm3
  148. jmp .L998
  149. ALIGN_4
  150. .L40:
  151. movq M, I
  152. sarq $3, I
  153. jle .L44
  154. ALIGN_4
  155. .L41:
  156. movss (X), %xmm4
  157. addq INCX, X
  158. movss (X), %xmm5
  159. addq INCX, X
  160. movss (X), %xmm6
  161. addq INCX, X
  162. movss (X), %xmm7
  163. addq INCX, X
  164. movss (X), %xmm8
  165. addq INCX, X
  166. movss (X), %xmm9
  167. addq INCX, X
  168. movss (X), %xmm10
  169. addq INCX, X
  170. movss (X), %xmm11
  171. addq INCX, X
  172. cvtss2sd %xmm4, %xmm4
  173. cvtss2sd %xmm5, %xmm5
  174. cvtss2sd %xmm6, %xmm6
  175. cvtss2sd %xmm7, %xmm7
  176. cvtss2sd %xmm8, %xmm8
  177. cvtss2sd %xmm9, %xmm9
  178. cvtss2sd %xmm10, %xmm10
  179. cvtss2sd %xmm11, %xmm11
  180. mulsd %xmm4, %xmm4
  181. mulsd %xmm5, %xmm5
  182. mulsd %xmm6, %xmm6
  183. mulsd %xmm7, %xmm7
  184. addsd %xmm4, %xmm0
  185. addsd %xmm5, %xmm1
  186. addsd %xmm6, %xmm2
  187. addsd %xmm7, %xmm3
  188. mulsd %xmm8, %xmm8
  189. mulsd %xmm9, %xmm9
  190. mulsd %xmm10, %xmm10
  191. mulsd %xmm11, %xmm11
  192. addsd %xmm8, %xmm0
  193. addsd %xmm9, %xmm1
  194. addsd %xmm10, %xmm2
  195. addsd %xmm11, %xmm3
  196. decq I
  197. jg .L41
  198. ALIGN_3
  199. .L44:
  200. testq $4, M
  201. je .L45
  202. movss (X), %xmm4
  203. addq INCX, X
  204. movss (X), %xmm5
  205. addq INCX, X
  206. movss (X), %xmm6
  207. addq INCX, X
  208. movss (X), %xmm7
  209. addq INCX, X
  210. cvtss2sd %xmm4, %xmm8
  211. cvtss2sd %xmm5, %xmm9
  212. cvtss2sd %xmm6, %xmm10
  213. cvtss2sd %xmm7, %xmm11
  214. mulsd %xmm8, %xmm8
  215. mulsd %xmm9, %xmm9
  216. mulsd %xmm10, %xmm10
  217. mulsd %xmm11, %xmm11
  218. addsd %xmm8, %xmm0
  219. addsd %xmm9, %xmm1
  220. addsd %xmm10, %xmm2
  221. addsd %xmm11, %xmm3
  222. ALIGN_3
  223. .L45:
  224. testq $2, M
  225. je .L46
  226. movss (X), %xmm4
  227. addq INCX, X
  228. movss (X), %xmm5
  229. addq INCX, X
  230. cvtss2sd %xmm4, %xmm6
  231. cvtss2sd %xmm5, %xmm7
  232. mulsd %xmm6, %xmm6
  233. mulsd %xmm7, %xmm7
  234. addsd %xmm6, %xmm1
  235. addsd %xmm7, %xmm2
  236. ALIGN_3
  237. .L46:
  238. testq $1, M
  239. je .L998
  240. movss (X), %xmm4
  241. cvtss2sd %xmm4, %xmm6
  242. mulsd %xmm6, %xmm6
  243. addsd %xmm6, %xmm3
  244. ALIGN_4
  245. .L998:
  246. addpd %xmm1, %xmm0
  247. addpd %xmm3, %xmm2
  248. addpd %xmm2, %xmm0
  249. #ifndef HAVE_SSE3
  250. movapd %xmm0, %xmm1
  251. unpckhpd %xmm0, %xmm0
  252. addsd %xmm1, %xmm0
  253. #else
  254. haddpd %xmm0, %xmm0
  255. #endif
  256. ALIGN_4
  257. .L999:
  258. sqrtsd %xmm0, %xmm0
  259. cvtsd2ss %xmm0, %xmm0
  260. RESTOREREGISTERS
  261. ret
  262. EPILOGUE