You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

amax_sse.S 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * SSE kernel: reduce a strided vector of single-precision values to one
   * scalar, returned in the low lane of %xmm0.
   *
   *   default : maximum of the elements
   *   USE_MIN : maxps/maxss are remapped to minps/minss, giving the minimum
   *   USE_ABS : the sign bit of every element is cleared before comparing
   *
   * Syntax : AT&T, passed through the C preprocessor.
   * Inputs : M    = ARG1, number of elements (signed; M <= 0 returns 0.0)
   *          X    = ARG2, address of the first element
   *          INCX = ARG3, stride in elements (scaled by SIZE on entry)
   * Output : %xmm0 (low lane)
   * Uses   : %rax (as I), %xmm0-%xmm7, and %xmm15 when USE_ABS is defined.
   *          M, X and INCX are modified in place.
   *
   * ARG1-ARG3, SIZE, PROLOGUE, PROFCODE, SAVEREGISTERS, RESTOREREGISTERS,
   * EPILOGUE, ALIGN_3/ALIGN_4 and the PREFETCH* macros are supplied by
   * common.h / l1param.h, which are not part of this listing.
   *
   * NOTE(review): SIZE is presumably 4 (sizeof(float)); the alignment
   * peeling in the unit-stride path relies on that - confirm in common.h.
   * NOTE(review): the unit-stride path uses movaps after peeling at most
   * three elements, so it assumes X is at least SIZE-aligned.
   * NOTE(review): maxps/maxss return the source operand when either input
   * is NaN, so a NaN in X does not reliably propagate to the result.
   */

#define ASSEMBLER
#include "common.h"

#define M       ARG1    /* rdi */
#define X       ARG2    /* rsi */
#define INCX    ARG3    /* rdx */

#define I       %rax

#ifdef USE_MIN
#define maxps   minps
#define maxss   minss
#endif

#include "l1param.h"

        PROLOGUE
        PROFCODE

        SAVEREGISTERS

        xorps   %xmm0, %xmm0                    /* result for M <= 0 is 0.0 */
        leaq    (, INCX, SIZE), INCX            /* INCX = INCX * SIZE (byte stride) */

        testq   M, M
        jle     .L999

#ifdef USE_ABS
        /* xmm15 = 0x7fffffff in every lane: AND mask that clears the sign bit */
        pcmpeqb %xmm15, %xmm15
        psrld   $1, %xmm15
#endif

        /* Seed all four accumulators with the first element, broadcast to
           every lane, so that unused lanes never affect the result. */
        movss   (X), %xmm0
        shufps  $0, %xmm0, %xmm0
#ifdef USE_ABS
        andps   %xmm15, %xmm0
#endif
        movaps  %xmm0, %xmm1
        movaps  %xmm0, %xmm2
        movaps  %xmm0, %xmm3
        addq    INCX, X
        decq    M
        jle     .L999                           /* single element: xmm0 already holds it */

        cmpq    $SIZE, INCX
        jne     .L40                            /* non-unit stride: scalar path */

        /* ---- unit-stride path ------------------------------------------ */

        /* Bias X by +32 * SIZE; every load below compensates with a
           displacement in the range -32 * SIZE .. +12 * SIZE. */
        subq    $-32 * SIZE, X

        /* With 1..3 elements left only the movsd/movss tails at .L18/.L19
           can run (bit 2 of M is clear), so no alignment is needed. */
        cmpq    $3, M
        jle     .L17

        /* Peel one element if X has the SIZE bit set. The value is
           broadcast so it can simply replace the seed in xmm1. */
        testq   $SIZE, X
        je      .L05

        movss   -32 * SIZE(X), %xmm1
        shufps  $0, %xmm1, %xmm1
#ifdef USE_ABS
        andps   %xmm15, %xmm1
#endif
        decq    M
        addq    $SIZE, X
        ALIGN_3

.L05:
        /* Peel two elements if X has the 2 * SIZE bit set. unpcklps
           duplicates them to (a, a, b, b), replacing the seed in xmm2. */
        testq   $2 * SIZE, X
        je      .L06

        movsd   -32 * SIZE(X), %xmm2
        unpcklps %xmm2, %xmm2
#ifdef USE_ABS
        andps   %xmm15, %xmm2
#endif
        subq    $2, M
        addq    $2 * SIZE, X
        ALIGN_3

.L06:
        movq    M, I
        sarq    $5, I                           /* I = number of 32-element blocks */
        jle     .L15

        /* Software pipelining: preload the first four vectors of the block */
        movaps  -32 * SIZE(X), %xmm4
        movaps  -28 * SIZE(X), %xmm5
        movaps  -24 * SIZE(X), %xmm6
        movaps  -20 * SIZE(X), %xmm7

        decq    I
        jle     .L12                            /* only one block: go straight to the drain */
        ALIGN_4

.L11:
        /* Main loop: 32 elements per iteration. Each accumulate is followed
           by the load that refills the same temporary for a later step. */
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0
        movaps  -16 * SIZE(X), %xmm4

#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1
        movaps  -12 * SIZE(X), %xmm5

#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxps   %xmm6, %xmm2
        movaps   -8 * SIZE(X), %xmm6

#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxps   %xmm7, %xmm3
        movaps   -4 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0
        movaps    0 * SIZE(X), %xmm4            /* first vector of the next block */

#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1
        movaps    4 * SIZE(X), %xmm5

#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxps   %xmm6, %xmm2
        movaps    8 * SIZE(X), %xmm6

#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxps   %xmm7, %xmm3
        movaps   12 * SIZE(X), %xmm7

        subq    $-32 * SIZE, X                  /* X += 32 * SIZE */
        decq    I
        jg      .L11
        ALIGN_4

.L12:
        /* Drain: finish the last 32-element block without loading past it */
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0
        movaps  -16 * SIZE(X), %xmm4

#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1
        movaps  -12 * SIZE(X), %xmm5

#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxps   %xmm6, %xmm2
        movaps   -8 * SIZE(X), %xmm6

#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxps   %xmm7, %xmm3
        movaps   -4 * SIZE(X), %xmm7

#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0

#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1

#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxps   %xmm6, %xmm2

#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxps   %xmm7, %xmm3

        subq    $-32 * SIZE, X                  /* X += 32 * SIZE */
        ALIGN_3

.L15:
        /* Tails: M was not reduced by the block loop, so its low five bits
           still give the number of leftover elements. */
        testq   $16, M                          /* 16 leftover elements? */
        je      .L16

        movaps  -32 * SIZE(X), %xmm4
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0

        movaps  -28 * SIZE(X), %xmm5
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1

        movaps  -24 * SIZE(X), %xmm6
#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxps   %xmm6, %xmm2

        movaps  -20 * SIZE(X), %xmm7
#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxps   %xmm7, %xmm3

        addq    $16 * SIZE, X
        ALIGN_3

.L16:
        testq   $8, M                           /* 8 leftover elements? */
        je      .L17

        movaps  -32 * SIZE(X), %xmm4
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm0

        movaps  -28 * SIZE(X), %xmm5
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxps   %xmm5, %xmm1
        addq    $8 * SIZE, X
        ALIGN_3

.L17:
        testq   $4, M                           /* 4 leftover elements? */
        je      .L18

        movaps  -32 * SIZE(X), %xmm4
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm2
        addq    $4 * SIZE, X
        ALIGN_3

.L18:
        testq   $2, M                           /* 2 leftover elements? */
        je      .L19

        /* movsd zeroes the upper half; unpcklps turns (a, b, 0, 0) into
           (a, a, b, b) so no stray zero lane enters the accumulator. */
        movsd   -32 * SIZE(X), %xmm4
        unpcklps %xmm4, %xmm4
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxps   %xmm4, %xmm3
        addq    $2 * SIZE, X
        ALIGN_3

.L19:
        testq   $1, M                           /* last single element? */
        je      .L998

        movss   -32 * SIZE(X), %xmm4
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm0                    /* scalar op: only lane 0 changes */
        jmp     .L998
        ALIGN_3

        /* ---- strided path ---------------------------------------------- */
        /* Scalar loads, spread round-robin over the low lanes of xmm0-xmm3.
           The upper lanes keep the broadcast first element. */
.L40:
        movq    M, I
        sarq    $3, I                           /* I = number of 8-element groups */
        jle     .L45
        ALIGN_4

.L41:
        movss   (X), %xmm4
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm0

        movss   (X), %xmm5
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxss   %xmm5, %xmm1

        movss   (X), %xmm6
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxss   %xmm6, %xmm2

        movss   (X), %xmm7
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxss   %xmm7, %xmm3

        movss   (X), %xmm4
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm0

        movss   (X), %xmm5
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxss   %xmm5, %xmm1

        movss   (X), %xmm6
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxss   %xmm6, %xmm2

        movss   (X), %xmm7
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxss   %xmm7, %xmm3

        decq    I
        jg      .L41
        ALIGN_4

.L45:
        testq   $4, M                           /* 4 leftover elements? */
        je      .L46

        movss   (X), %xmm4
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm0

        movss   (X), %xmm5
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxss   %xmm5, %xmm1

        movss   (X), %xmm6
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm6
#endif
        maxss   %xmm6, %xmm2

        movss   (X), %xmm7
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm7
#endif
        maxss   %xmm7, %xmm3
        ALIGN_3

.L46:
        testq   $2, M                           /* 2 leftover elements? */
        je      .L47

        movss   (X), %xmm4
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm0

        movss   (X), %xmm5
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm5
#endif
        maxss   %xmm5, %xmm1
        ALIGN_3

.L47:
        testq   $1, M                           /* last single element? */
        je      .L998

        movss   (X), %xmm4
        addq    INCX, X
#ifdef USE_ABS
        andps   %xmm15, %xmm4
#endif
        maxss   %xmm4, %xmm2
        ALIGN_4

        /* ---- final reduction ------------------------------------------- */
.L998:
        /* Fold the four accumulators into xmm0 ... */
        maxps   %xmm1, %xmm0
        maxps   %xmm3, %xmm2
        maxps   %xmm2, %xmm0
        /* ... then fold the upper two lanes onto the lower two ... */
        movaps  %xmm0, %xmm1
        movhlps %xmm0, %xmm0
        maxps   %xmm1, %xmm0
        /* ... and finally lane 1 onto lane 0. */
        movaps  %xmm0, %xmm1
        shufps  $1, %xmm0, %xmm0
        maxss   %xmm1, %xmm0
        ALIGN_4

.L999:
        RESTOREREGISTERS

        ret

        EPILOGUE