You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

asum_atom.S 8.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Symbolic names for the routine's three arguments and its loop counter. */
  /*   M    - element count (tested for <= 0, then shifted/masked into block counts) */
  /*   X    - address of the current element (dereferenced and advanced) */
  /*   INCX - stride; scaled by BASE_SHIFT inside the routine before being added to X */
  /*   I    - iteration counter for the unrolled loops */
  /* ARG1..ARG3 are not defined in this file (presumably in common.h). The */
  /* register names in the trailing comments look like the System V AMD64 */
  /* assignment - NOTE(review): confirm what ARGn expand to on other ABIs. */
  40. #define M ARG1 /* rdi */
  41. #define X ARG2 /* rsi */
  42. #define INCX ARG3 /* rdx */
  43. #define I %rax
  44. #include "l1param.h"
  /*
   * Sum of absolute values of a strided vector of 64-bit floating-point
   * elements (the scalar instructions used are movsd/addsd).
   *
   * Inputs (register aliases are defined before this routine):
   *   M    - number of elements; M <= 0 returns 0.0
   *   X    - address of the first element
   *   INCX - stride in elements; INCX <= 0 returns 0.0
   * Output:
   *   low 64 bits of %xmm0 - sum of |x[i]|
   * Modified: M, X, INCX, I, %xmm0-%xmm13, %xmm15, flags.
   *
   * Two code paths:
   *   .L05 - .L15  stride == SIZE bytes: 16-byte movaps loads, 16 elements
   *                per loop iteration
   *   .L20 - .L27  any other stride: scalar movsd loads, 8 elements per
   *                loop iteration
   * Four independent partial sums live in %xmm0-%xmm3 and are combined at
   * .L998. Both main loops are software pipelined: a block is loaded ahead
   * (before the loop), the loop body accumulates it while loading the next
   * one, and a drain section (.L11 / .L23) accumulates the last block.
   *
   * PROLOGUE, PROFCODE, SAVEREGISTERS, RESTOREREGISTERS, EPILOGUE, ALIGN_n,
   * PREFETCH*, SIZE and BASE_SHIFT are not defined in this file (presumably
   * they come from common.h / l1param.h).
   * NOTE(review): the comments below assume SIZE == 8 and BASE_SHIFT == 3,
   * and that X is at least SIZE-aligned - confirm.
   */
  45. PROLOGUE
  46. PROFCODE
  47. SAVEREGISTERS
  48. xorps %xmm0, %xmm0 /* result = 0.0, also the value returned by the early exits */
  49. testq M, M
  50. jle .L999 /* M <= 0: return 0.0 */
  51. testq INCX, INCX
  52. jle .L999 /* INCX <= 0: return 0.0 */
  53. xorps %xmm1, %xmm1 /* clear the remaining three partial sums */
  54. xorps %xmm2, %xmm2
  55. xorps %xmm3, %xmm3
  56. pcmpeqb %xmm15, %xmm15 /* xmm15 = all ones */
  57. psrlq $1, %xmm15 /* each 64-bit lane = 0x7fffffffffffffff: mask that clears the top (sign) bit */
  58. salq $BASE_SHIFT, INCX /* INCX <<= BASE_SHIFT (presumably element stride -> byte stride) */
  59. xorps %xmm13, %xmm13 /* pending high-lane term is 0 for the first add in .L10 / .L11 */
  60. cmpq $SIZE, INCX
  61. jne .L20 /* stride is not one element: use the scalar path */
  /* Unit-stride path. If the SIZE bit of the address is set, one element is */
  /* consumed with a scalar load first - presumably so that the movaps loads */
  /* below (which need 16-byte aligned addresses) are aligned. */
  62. testq $SIZE, X
  63. je .L05
  64. movsd (X), %xmm0 /* xmm0 = x[0] (upper lane zeroed by the load) */
  65. addq $SIZE, X
  66. andps %xmm15, %xmm0 /* xmm0 = |x[0]| */
  67. decq M
  68. jle .L999 /* that was the only element; xmm1-xmm3 are still 0, so skip the reduction */
  69. ALIGN_3
  70. .L05:
  /* Bias X by 16 elements so a block is addressed with offsets -16..-2 (current) */
  /* and 0..14 (next). Presumably written as subq of a negative constant so that, */
  /* with SIZE == 8, the value -128 fits an 8-bit immediate. */
  71. subq $-16 * SIZE, X
  72. movq M, I
  73. sarq $4, I /* I = number of full 16-element blocks */
  74. jle .L12 /* none: go to the remainder handling */
  /* Load the first block ahead of the loop: 8 registers x 2 elements. */
  75. movaps -16 * SIZE(X), %xmm4
  76. movaps -14 * SIZE(X), %xmm5
  77. movaps -12 * SIZE(X), %xmm6
  78. movaps -10 * SIZE(X), %xmm7
  79. movaps -8 * SIZE(X), %xmm8
  80. movaps -6 * SIZE(X), %xmm9
  81. movaps -4 * SIZE(X), %xmm10
  82. movaps -2 * SIZE(X), %xmm11
  83. decq I
  84. jle .L11 /* only one block: drain it without entering the loop */
  85. ALIGN_4
  86. .L10:
  /* Main unit-stride loop. For each loaded register: clear the sign bits, add */
  /* the low element to xmm0 or xmm2, use pshufd $0x4e (swap the two 64-bit */
  /* halves) to bring the high element into the low lane of xmm12 or xmm13, and */
  /* add that to xmm1 or xmm3 one step later. The source register is then */
  /* reloaded with the matching pair of the next block. */
  87. #ifdef PREFETCH
  88. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  89. #endif
  90. andps %xmm15, %xmm4
  91. addsd %xmm13, %xmm3 /* high element left over from xmm11 of the previous block (0 on first entry) */
  92. pshufd $0x4e, %xmm4, %xmm12
  93. addsd %xmm4, %xmm0
  94. movaps 0 * SIZE(X), %xmm4 /* reload with the next block */
  95. andps %xmm15, %xmm5
  96. addsd %xmm12, %xmm1
  97. pshufd $0x4e, %xmm5, %xmm13
  98. addsd %xmm5, %xmm2
  99. movaps 2 * SIZE(X), %xmm5
  100. andps %xmm15, %xmm6
  101. addsd %xmm13, %xmm3
  102. pshufd $0x4e, %xmm6, %xmm12
  103. addsd %xmm6, %xmm0
  104. movaps 4 * SIZE(X), %xmm6
  105. andps %xmm15, %xmm7
  106. addsd %xmm12, %xmm1
  107. pshufd $0x4e, %xmm7, %xmm13
  108. addsd %xmm7, %xmm2
  109. movaps 6 * SIZE(X), %xmm7
  110. #if defined(PREFETCH) && !defined(FETCH128)
  111. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  112. #endif
  113. andps %xmm15, %xmm8
  114. addsd %xmm13, %xmm3
  115. pshufd $0x4e, %xmm8, %xmm12
  116. addsd %xmm8, %xmm0
  117. movaps 8 * SIZE(X), %xmm8
  118. andps %xmm15, %xmm9
  119. addsd %xmm12, %xmm1
  120. pshufd $0x4e, %xmm9, %xmm13
  121. addsd %xmm9, %xmm2
  122. movaps 10 * SIZE(X), %xmm9
  123. andps %xmm15, %xmm10
  124. addsd %xmm13, %xmm3
  125. pshufd $0x4e, %xmm10, %xmm12
  126. addsd %xmm10, %xmm0
  127. movaps 12 * SIZE(X), %xmm10
  128. andps %xmm15, %xmm11
  129. addsd %xmm12, %xmm1
  130. pshufd $0x4e, %xmm11, %xmm13 /* this high element is added at the top of the next pass / in .L11 */
  131. addsd %xmm11, %xmm2
  132. movaps 14 * SIZE(X), %xmm11
  133. subq $-16 * SIZE, X /* X += 16 elements */
  134. decq I
  135. jg .L10
  136. ALIGN_4
  137. .L11:
  /* Drain: accumulate the block already held in xmm4-xmm11, no further loads. */
  138. andps %xmm15, %xmm4
  139. addsd %xmm13, %xmm3 /* pending high element from the loop (0 if the loop was skipped) */
  140. pshufd $0x4e, %xmm4, %xmm12
  141. addsd %xmm4, %xmm0
  142. andps %xmm15, %xmm5
  143. addsd %xmm12, %xmm1
  144. pshufd $0x4e, %xmm5, %xmm13
  145. addsd %xmm5, %xmm2
  146. andps %xmm15, %xmm6
  147. addsd %xmm13, %xmm3
  148. pshufd $0x4e, %xmm6, %xmm12
  149. addsd %xmm6, %xmm0
  150. andps %xmm15, %xmm7
  151. addsd %xmm12, %xmm1
  152. pshufd $0x4e, %xmm7, %xmm13
  153. addsd %xmm7, %xmm2
  154. andps %xmm15, %xmm8
  155. addsd %xmm13, %xmm3
  156. pshufd $0x4e, %xmm8, %xmm12
  157. addsd %xmm8, %xmm0
  158. andps %xmm15, %xmm9
  159. addsd %xmm12, %xmm1
  160. pshufd $0x4e, %xmm9, %xmm13
  161. addsd %xmm9, %xmm2
  162. andps %xmm15, %xmm10
  163. addsd %xmm13, %xmm3
  164. pshufd $0x4e, %xmm10, %xmm12
  165. addsd %xmm10, %xmm0
  166. andps %xmm15, %xmm11
  167. addsd %xmm12, %xmm1
  168. pshufd $0x4e, %xmm11, %xmm13
  169. addsd %xmm11, %xmm2
  170. addsd %xmm13, %xmm3 /* last high element of the block */
  171. subq $-16 * SIZE, X /* X += 16 elements, so offset -16 is the first unprocessed element */
  172. ALIGN_3
  173. .L12:
  /* Unit-stride remainder: M & 15 elements, handled as 8 + 4 + 2 + 1. */
  174. andq $15, M
  175. jle .L998 /* nothing left: final reduction */
  176. testq $8, M
  177. je .L13
  178. movaps -16 * SIZE(X), %xmm4
  179. movaps -14 * SIZE(X), %xmm5
  180. movaps -12 * SIZE(X), %xmm6
  181. movaps -10 * SIZE(X), %xmm7
  182. addq $8 * SIZE, X
  183. andps %xmm15, %xmm4
  184. pshufd $0x4e, %xmm4, %xmm12
  185. addsd %xmm4, %xmm0
  186. andps %xmm15, %xmm5
  187. addsd %xmm12, %xmm1
  188. pshufd $0x4e, %xmm5, %xmm13
  189. addsd %xmm5, %xmm2
  190. addsd %xmm13, %xmm3
  191. andps %xmm15, %xmm6
  192. pshufd $0x4e, %xmm6, %xmm12
  193. addsd %xmm6, %xmm0
  194. andps %xmm15, %xmm7
  195. addsd %xmm12, %xmm1
  196. pshufd $0x4e, %xmm7, %xmm13
  197. addsd %xmm7, %xmm2
  198. addsd %xmm13, %xmm3
  199. ALIGN_3
  200. .L13:
  201. testq $4, M /* 4 more elements? */
  202. je .L14
  203. movaps -16 * SIZE(X), %xmm4
  204. movaps -14 * SIZE(X), %xmm5
  205. addq $4 * SIZE, X
  206. andps %xmm15, %xmm4
  207. pshufd $0x4e, %xmm4, %xmm12
  208. addsd %xmm4, %xmm0
  209. andps %xmm15, %xmm5
  210. addsd %xmm12, %xmm1
  211. pshufd $0x4e, %xmm5, %xmm13
  212. addsd %xmm5, %xmm2
  213. addsd %xmm13, %xmm3
  214. ALIGN_3
  215. .L14:
  216. testq $2, M /* 2 more elements? */
  217. je .L15
  218. movaps -16 * SIZE(X), %xmm4
  219. addq $2 * SIZE, X
  220. andps %xmm15, %xmm4
  221. pshufd $0x4e, %xmm4, %xmm5 /* xmm5 low lane = high element of the pair */
  222. addsd %xmm4, %xmm2 /* this pair goes to partial sums xmm2 / xmm3 */
  223. addsd %xmm5, %xmm3
  224. ALIGN_3
  225. .L15:
  226. testq $1, M /* one last element? */
  227. je .L998
  228. movsd -16 * SIZE(X), %xmm4 /* scalar load: must not read past the end of the vector */
  229. andps %xmm15, %xmm4
  230. addsd %xmm4, %xmm0
  231. jmp .L998
  232. ALIGN_3
  233. .L20:
  /* Strided path: INCX holds the scaled stride and X always points at the */
  /* element most recently loaded (or the next one to load). */
  234. movq M, I
  235. sarq $3, I /* I = number of full 8-element blocks */
  236. jle .L25
  /* Load the first 8 elements ahead of the loop; X is left on the 8th one. */
  237. movsd (X), %xmm4
  238. addq INCX, X
  239. movsd (X), %xmm5
  240. addq INCX, X
  241. movsd (X), %xmm6
  242. addq INCX, X
  243. movsd (X), %xmm7
  244. addq INCX, X
  245. movsd (X), %xmm8
  246. addq INCX, X
  247. movsd (X), %xmm9
  248. addq INCX, X
  249. movsd (X), %xmm10
  250. addq INCX, X
  251. movsd (X), %xmm11
  252. decq I
  253. jle .L23 /* only one block: drain it without entering the loop */
  254. ALIGN_4
  255. .L22:
  /* Main strided loop: clear the sign bit, step X, accumulate into xmm0..xmm3 */
  /* in rotation, then reload the same register with the next block's element. */
  256. andps %xmm15, %xmm4
  257. addq INCX, X
  258. addsd %xmm4, %xmm0
  259. movsd (X), %xmm4
  260. andps %xmm15, %xmm5
  261. addq INCX, X
  262. addsd %xmm5, %xmm1
  263. movsd (X), %xmm5
  264. andps %xmm15, %xmm6
  265. addq INCX, X
  266. addsd %xmm6, %xmm2
  267. movsd (X), %xmm6
  268. andps %xmm15, %xmm7
  269. addq INCX, X
  270. addsd %xmm7, %xmm3
  271. movsd (X), %xmm7
  272. andps %xmm15, %xmm8
  273. addq INCX, X
  274. addsd %xmm8, %xmm0
  275. movsd (X), %xmm8
  276. andps %xmm15, %xmm9
  277. addq INCX, X
  278. addsd %xmm9, %xmm1
  279. movsd (X), %xmm9
  280. andps %xmm15, %xmm10
  281. addq INCX, X
  282. addsd %xmm10, %xmm2
  283. movsd (X), %xmm10
  284. andps %xmm15, %xmm11
  285. addq INCX, X
  286. addsd %xmm11, %xmm3
  287. movsd (X), %xmm11
  288. decq I
  289. jg .L22
  290. ALIGN_4
  291. .L23:
  /* Drain: accumulate the 8 elements already held in xmm4-xmm11. */
  292. andps %xmm15, %xmm4
  293. addq INCX, X /* step X past the last element that was loaded */
  294. addsd %xmm4, %xmm0
  295. andps %xmm15, %xmm5
  296. addsd %xmm5, %xmm1
  297. andps %xmm15, %xmm6
  298. addsd %xmm6, %xmm2
  299. andps %xmm15, %xmm7
  300. addsd %xmm7, %xmm3
  301. andps %xmm15, %xmm8
  302. addsd %xmm8, %xmm0
  303. andps %xmm15, %xmm9
  304. addsd %xmm9, %xmm1
  305. andps %xmm15, %xmm10
  306. addsd %xmm10, %xmm2
  307. andps %xmm15, %xmm11
  308. addsd %xmm11, %xmm3
  309. ALIGN_3
  310. .L25:
  /* Strided remainder: M & 7 elements, handled as 4 + 2 + 1. */
  311. andq $7, M
  312. jle .L998 /* nothing left: final reduction */
  313. testq $4, M
  314. je .L26
  315. movsd (X), %xmm4
  316. addq INCX, X
  317. movsd (X), %xmm5
  318. addq INCX, X
  319. movsd (X), %xmm6
  320. andps %xmm15, %xmm4
  321. addsd %xmm4, %xmm0
  322. addq INCX, X
  323. movsd (X), %xmm7
  324. andps %xmm15, %xmm5
  325. addsd %xmm5, %xmm1
  326. addq INCX, X
  327. andps %xmm15, %xmm6
  328. addsd %xmm6, %xmm2
  329. andps %xmm15, %xmm7
  330. addsd %xmm7, %xmm3
  331. ALIGN_3
  332. .L26:
  333. testq $2, M /* 2 more elements? */
  334. je .L27
  335. movsd (X), %xmm4
  336. addq INCX, X
  337. movsd (X), %xmm5
  338. addq INCX, X
  339. andps %xmm15, %xmm4
  340. andps %xmm15, %xmm5
  341. addsd %xmm4, %xmm0
  342. addsd %xmm5, %xmm1
  343. ALIGN_3
  344. .L27:
  345. testq $1, M /* one last element? */
  346. je .L998
  347. movsd (X), %xmm4
  348. andps %xmm15, %xmm4
  349. addsd %xmm4, %xmm0
  350. ALIGN_3
  351. .L998:
  /* Final reduction: xmm0 = (xmm0 + xmm1) + (xmm2 + xmm3). */
  352. addsd %xmm1, %xmm0
  353. addsd %xmm3, %xmm2
  354. addsd %xmm2, %xmm0
  355. ALIGN_4
  356. .L999:
  /* Common exit; the result is in the low 64 bits of xmm0. */
  357. RESTOREREGISTERS
  358. ret
  359. EPILOGUE