You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

amax_sse2.S 9.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * SSE2 reduction kernel for 32-bit x86 (GAS, AT&T syntax, preprocessed
   * by the C preprocessor).
   *
   * Scans M double-precision elements starting at address X, INCX elements
   * apart, and returns the largest one.  Build-time variants:
   *   USE_ABS  reduce over absolute values (sign bit cleared with %xmm3)
   *   USE_MIN  maxpd/maxsd are renamed to minpd/minsd, so the smallest
   *            value is returned instead
   *
   * Arguments are read from the stack.  Offsets are relative to %esp after
   * the four register pushes of the prologue, hence STACK = 16:
   *   STACK_M     element count
   *   STACK_X     address of the first element
   *   STACK_INCX  stride in elements (scaled by SIZE on entry)
   *
   * Result:   on the x87 stack, st(0).  0.0 is returned when M <= 0.
   * Clobbers: %eax, %ecx, %edx, %xmm0, %xmm1, %xmm3-%xmm7, flags.
   *           %ebx, %esi, %edi and %ebp are saved and restored.
   *
   * PROLOGUE, EPILOGUE, PROFCODE, EMMS, ALIGN_n, SIZE and the PREFETCH*
   * names are expected to come from the included headers.
   *
   * NOTE(review): INCX is not validated here.  A negative stride makes the
   * strided loop walk downwards from X - confirm callers never pass one.
   */
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

#define M	%ebx		/* elements still to process          */
#define X	%ecx		/* read pointer                       */
#define INCX	%edx		/* stride, in bytes after the scaling */
#define I	%eax		/* unrolled-loop trip counter         */

#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

/* Clears the sign bit of both lanes of reg when building the absolute
   value variant; expands to nothing otherwise.  %xmm3 holds the mask. */
#ifdef USE_ABS
#define APPLY_ABS(reg)	andps	%xmm3, reg
#else
#define APPLY_ABS(reg)
#endif

#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	EMMS
#endif

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

	xorps	%xmm0, %xmm0		/* result for an empty vector */
	leal	(, INCX, SIZE), INCX	/* INCX = INCX * SIZE */

	testl	M, M
	jle	.L999

#ifdef USE_ABS
	pcmpeqb	%xmm3, %xmm3		/* all ones ...                       */
	psrlq	$1, %xmm3		/* ... minus the top bit of each lane */
#endif

	/* Seed both accumulators (%xmm0, %xmm1) with the first element. */
	movsd	(X), %xmm0
	addl	INCX, X
	APPLY_ABS(%xmm0)
	unpcklpd %xmm0, %xmm0
	movaps	%xmm0, %xmm1
	decl	M
	jle	.L999

	cmpl	$SIZE, INCX
	jne	.L40

	/* The unit-stride loop loads with movaps, which faults unless the
	   address is 16-byte aligned.  Peeling one element below can only
	   repair an offset of SIZE, so data that is not even SIZE-aligned
	   is handed to the strided loop, whose movsd/movhps loads have no
	   alignment requirement. */
	testl	$SIZE - 1, X
	jne	.L40

	/* Bias X so the unrolled loop can use offsets -16*SIZE .. 6*SIZE. */
	subl	$-16 * SIZE, X

	/* Peel one element if X is SIZE-aligned but not 16-byte aligned. */
	testl	$SIZE, X
	je	.L05

	movsd	-16 * SIZE(X), %xmm4
	APPLY_ABS(%xmm4)
	unpcklpd %xmm4, %xmm4
	maxpd	%xmm4, %xmm0
	addl	$SIZE, X
	decl	M
	jle	.L998
	ALIGN_3

/* ---- unit stride: 16 elements per iteration, software pipelined ---- */
.L05:
	movl	M,  I
	sarl	$4, I
	jle	.L15

	/* Preload the first four vectors of the first 16-element group. */
	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	decl	I
	jle	.L12
	ALIGN_4

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0
	movaps	  0 * SIZE(X), %xmm4

	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1
	movaps	  2 * SIZE(X), %xmm5

	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1
	movaps	  6 * SIZE(X), %xmm7

	subl	$-16 * SIZE, X
	decl	I
	jg	.L11
	ALIGN_4

/* Drain the pipeline: last full group, no loads past its end. */
.L12:
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0

	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1

	subl	$-16 * SIZE, X
	ALIGN_4

/* ---- unit stride tail: 8, 4, 2 and 1 leftover elements ---- */
/* The branches after testl only depend on ZF, so je is used throughout,
   matching the strided tail further down. */
.L15:
	testl	$8, M
	je	.L16

	movaps	-16 * SIZE(X), %xmm4
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movaps	-14 * SIZE(X), %xmm5
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	movaps	-12 * SIZE(X), %xmm6
	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0

	movaps	-10 * SIZE(X), %xmm7
	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1

	addl	$8 * SIZE, X
	ALIGN_3

.L16:
	testl	$4, M
	je	.L17

	movaps	-16 * SIZE(X), %xmm4
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movaps	-14 * SIZE(X), %xmm5
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	addl	$4 * SIZE, X
	ALIGN_3

.L17:
	testl	$2, M
	je	.L18

	movaps	-16 * SIZE(X), %xmm4
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	addl	$2 * SIZE, X
	ALIGN_3

.L18:
	testl	$1, M
	je	.L998

	movsd	-16 * SIZE(X), %xmm4
	unpcklpd %xmm4, %xmm4
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm1
	jmp	.L998
	ALIGN_3

/* ---- general stride: two elements gathered per register ---- */
.L40:
	movl	M,  I
	sarl	$4, I
	jle	.L45
	ALIGN_4

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movsd	(X), %xmm4
	addl	INCX, X
	movhps	(X), %xmm4
	addl	INCX, X
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movsd	(X), %xmm5
	addl	INCX, X
	movhps	(X), %xmm5
	addl	INCX, X
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	movsd	(X), %xmm6
	addl	INCX, X
	movhps	(X), %xmm6
	addl	INCX, X
	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0

	movsd	(X), %xmm7
	addl	INCX, X
	movhps	(X), %xmm7
	addl	INCX, X
	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movsd	(X), %xmm4
	addl	INCX, X
	movhps	(X), %xmm4
	addl	INCX, X
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movsd	(X), %xmm5
	addl	INCX, X
	movhps	(X), %xmm5
	addl	INCX, X
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	movsd	(X), %xmm6
	addl	INCX, X
	movhps	(X), %xmm6
	addl	INCX, X
	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0

	movsd	(X), %xmm7
	addl	INCX, X
	movhps	(X), %xmm7
	addl	INCX, X
	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1

	decl	I
	jg	.L41
	ALIGN_4

/* ---- general stride tail: 8, 4, 2 and 1 leftover elements ---- */
.L45:
	andl	$15, M			/* M = leftover count, 0..15 */
	je	.L998

	testl	$8, M
	je	.L46

	movsd	(X), %xmm4
	addl	INCX, X
	movhps	(X), %xmm4
	addl	INCX, X
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movsd	(X), %xmm5
	addl	INCX, X
	movhps	(X), %xmm5
	addl	INCX, X
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1

	movsd	(X), %xmm6
	addl	INCX, X
	movhps	(X), %xmm6
	addl	INCX, X
	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0

	movsd	(X), %xmm7
	addl	INCX, X
	movhps	(X), %xmm7
	addl	INCX, X
	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1
	ALIGN_3

.L46:
	testl	$4, M
	je	.L47

	movsd	(X), %xmm4
	addl	INCX, X
	movhps	(X), %xmm4
	addl	INCX, X
	APPLY_ABS(%xmm4)
	maxpd	%xmm4, %xmm0

	movsd	(X), %xmm5
	addl	INCX, X
	movhps	(X), %xmm5
	addl	INCX, X
	APPLY_ABS(%xmm5)
	maxpd	%xmm5, %xmm1
	ALIGN_3

.L47:
	testl	$2, M
	je	.L48

	movsd	(X), %xmm6
	addl	INCX, X
	movhps	(X), %xmm6
	addl	INCX, X
	APPLY_ABS(%xmm6)
	maxpd	%xmm6, %xmm0
	ALIGN_3

.L48:
	testl	$1, M
	je	.L998

	movsd	(X), %xmm7
	unpcklpd %xmm7, %xmm7
	APPLY_ABS(%xmm7)
	maxpd	%xmm7, %xmm1
	ALIGN_4

/* ---- combine the two accumulators, then the two lanes ---- */
.L998:
	maxpd	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0		/* high lane into the low lane */
	maxsd	%xmm1, %xmm0
	ALIGN_4

/* ---- return the low lane of %xmm0 in st(0) via a stack temporary ---- */
.L999:
	subl	$8, %esp
	movsd	%xmm0, (%esp)
	fldl	(%esp)
	addl	$8, %esp

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE