/* izamax_sse2.S */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  #define ASSEMBLER
#include "common.h"

/*
 * izamax_sse2 -- 32-bit x86 / SSE2 kernel, GAS AT&T syntax, run through
 * the C preprocessor.
 *
 * For a vector of M elements, each made of two consecutive doubles
 * (presumably the real and imaginary parts of a complex number), this
 * computes v = |first| + |second| for every element, finds the largest v
 * and returns the 1-based position of the first element that attains it.
 * When USE_MIN is defined, maxpd/maxsd are remapped to minpd/minsd, so
 * the smallest v is searched for instead.
 *
 * Arguments (on the stack, addressed after four register pushes):
 *   M     element count; with F_INTERFACE, a pointer to the count
 *   X     address of the first element
 *   INCX  stride in elements; with F_INTERFACE, a pointer to the stride
 *
 * Result in %eax: the 1-based index, or 0 when M <= 0 or INCX <= 0.
 *
 * Saved and restored: %ebp, %edi, %esi, %ebx.
 * Also written:       %eax, %ecx, %edx, %xmm0-%xmm4, %xmm7, flags.
 *
 * Structure: two passes over the data, each with a unit-stride variant
 * and a general-stride variant.
 *   pass 1  reduces all v into %xmm0 (both lanes end up holding the result)
 *   pass 2  rescans from the start and stops at the first v equal to it
 */

/* Bytes taken by the four registers pushed in the prologue. */
#define STACK	16
#define ARGS	0

/* Stack arguments: skip the return address and the saved registers. */
#define STACK_M		4 + STACK + ARGS(%esp)
#define STACK_X		8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/* Register roles. */
#define RET	%eax
#define M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
/* TEMP shares %ebx with M; it is only written after M was copied to MM. */
#define TEMP	%ebx

#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* Count and stride arrive by reference. */
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm7, %xmm7
	xor	RET, RET			/* result 0 for the early exits below */

	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	sall	$ZBASE_SHIFT, INCX		/* element stride -> byte stride */
	movl	M, MM
	movl	X, XX

	/* xmm7 = all ones (0 == 0), shifted right by one: clears the sign bit. */
	cmpeqpd	%xmm7, %xmm7
	psrlq	$1, %xmm7

	/* Seed both lanes of xmm0 with v of the first element. */
	movsd	0 * SIZE(XX), %xmm0
	movsd	1 * SIZE(XX), %xmm1
	addl	INCX, XX
	decl	MM				/* MM = elements left for pass 1 */
	andpd	%xmm7, %xmm0
	andpd	%xmm7, %xmm1
	addpd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	cmpl	$2 * SIZE, INCX			/* contiguous elements? */
	jne	.L60

/* ------------------------------------------------------------------ */
/* Pass 1, unit stride: eight elements per iteration.                 */
/* ------------------------------------------------------------------ */
	movl	MM, I
	sarl	$3, I
	jle	.L25
	ALIGN_4

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	/* elements 0-1: first parts in xmm1, second parts in xmm2 */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	/* elements 2-3 */
	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	/* elements 4-5 */
	movsd	8 * SIZE(XX), %xmm1
	movsd	9 * SIZE(XX), %xmm2
	movhpd	10 * SIZE(XX), %xmm1
	movhpd	11 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	/* elements 6-7 */
	movsd	12 * SIZE(XX), %xmm3
	movsd	13 * SIZE(XX), %xmm4
	movhpd	14 * SIZE(XX), %xmm3
	movhpd	15 * SIZE(XX), %xmm4
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	addl	$16 * SIZE, XX
	decl	I
	jg	.L21
	ALIGN_4

/* Pass 1, unit stride: remaining 0-7 elements, handled as 4 + 2 + 1. */
.L25:
	andl	$7, MM
	jle	.L30

	testl	$4, MM
	je	.L26

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	addl	$8 * SIZE, XX
	ALIGN_3

.L26:
	testl	$2, MM
	je	.L27

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	addl	$4 * SIZE, XX
	ALIGN_3

.L27:
	testl	$1, MM
	je	.L30

	/* Last single element: only the low lane is meaningful. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Pass 2, unit stride: locate the first element whose v equals the   */
/* reduced value.  RET counts elements already rejected.              */
/* ------------------------------------------------------------------ */
.L30:
	movl	X, XX				/* rewind to the first element */
	movl	M, MM				/* full count; %ebx is free for TEMP now */

	/* Fold the two lanes of xmm0 and broadcast the result to both. */
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I				/* four elements per iteration */
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	/* Prefetch relative to the running pointer XX; X never advances. */
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	/* Any of the four v equal to the target?  Collect the lane masks. */
	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3
	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP
	testl	$3, TEMP
	jne	.L33

	addl	$8 * SIZE, XX
	addl	$4, RET
	decl	I
	jg	.L31
	jmp	.L35
	ALIGN_4

/* A match lies in the four elements at XX: test them one at a time. */
.L33:
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3
	movsd	7 * SIZE(XX), %xmm4
	addl	$8 * SIZE, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/* Pass 2, unit stride: tail of M mod 4 elements. */
.L35:
	testl	$2, MM
	je	.L36

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
	addl	$4 * SIZE, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/*
 * Nothing matched so far, so the answer is taken to be the next element
 * without comparing it.
 * NOTE(review): if the reduced value in xmm0 is NaN, the cmpeqpd tests in
 * the main loop never match; when M is also a multiple of 4 this path
 * appears to return M + 1.  Confirm the intended NaN behaviour.
 */
.L36:
	incl	RET
	jmp	.L999
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Pass 1, general stride: eight elements per iteration, XX stepped   */
/* by INCX bytes after every element.                                 */
/* ------------------------------------------------------------------ */
.L60:
	movl	MM, I
	sarl	$3, I
	jle	.L65
	ALIGN_4

.L61:
#ifdef PREFETCH
	/* Prefetch relative to the running pointer XX; X never advances. */
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	decl	I
	jg	.L61
	ALIGN_4

/* Pass 1, general stride: remaining 0-7 elements, handled as 4 + 2 + 1. */
.L65:
	andl	$7, MM
	jle	.L70

	testl	$4, MM
	je	.L66

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0
	ALIGN_3

.L66:
	testl	$2, MM
	je	.L67

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0
	ALIGN_3

.L67:
	testl	$1, MM
	je	.L70

	/* Last single element: only the low lane is meaningful. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Pass 2, general stride: locate the first element whose v equals    */
/* the reduced value.  RET counts elements already rejected.          */
/* ------------------------------------------------------------------ */
.L70:
	movl	X, XX				/* rewind to the first element */
	movl	M, MM				/* full count; %ebx is free for TEMP now */

	/* Fold the two lanes of xmm0 and broadcast the result to both. */
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I				/* four elements per iteration */
	jle	.L75
	ALIGN_4

.L71:
#ifdef PREFETCH
	/* Prefetch relative to the running pointer XX; X never advances. */
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	/* Any of the four v equal to the target?  Collect the lane masks. */
	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3
	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP
	testl	$3, TEMP
	jne	.L73

	addl	$4, RET
	decl	I
	jg	.L71
	jmp	.L75
	ALIGN_4

/*
 * A match lies in the four elements just read.  XX was already advanced
 * past them, so step back 4 * INCX bytes and test them one at a time.
 */
.L73:
	leal	(, INCX, 4), TEMP
	subl	TEMP, XX

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/* Pass 2, general stride: tail of M mod 4 elements. */
.L75:
	testl	$2, MM
	je	.L76

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/*
 * Nothing matched so far, so the answer is taken to be the next element
 * without comparing it.
 * NOTE(review): if the reduced value in xmm0 is NaN, the cmpeqpd tests in
 * the main loop never match; when M is also a multiple of 4 this path
 * appears to return M + 1.  Confirm the intended NaN behaviour.
 */
.L76:
	incl	RET
	ALIGN_4

/* Common exit: restore the saved registers in reverse push order. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE