You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

iamax_sse.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build-time configuration for the kernel below: stack-argument offsets,
   * register aliases, and instruction remapping for the USE_MIN and
   * non-SSE2 variants.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK = bytes occupied by the four registers pushed in the prologue. */
  40. #define STACK 16
  41. #define ARGS 0
  /*
   * Incoming stack arguments, addressed relative to %esp after the prologue
   * pushes; the extra 4 bytes skip the return address.
   */
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  /* Register roles. RET is the integer result returned in %eax. */
  45. #define RET %eax
  46. #define M %ebx
  47. #define X %ecx
  48. #define INCX %edx
  /* I = loop counter; MM / XX = working copies of M / X consumed by each pass. */
  49. #define I %esi
  50. #define MM %ebp
  51. #define XX %edi
  /*
   * TEMP shares %ebx with M: the kernel only writes TEMP after M has been
   * copied into MM for the second pass.
   */
  52. #define TEMP %ebx
  /* USE_MIN: turn every max into a min so the same code finds the minimum. */
  53. #ifdef USE_MIN
  54. #define maxps minps
  55. #define maxss minss
  56. #endif
  /* Without SSE2, substitute SSE1 instructions for pxor and movsd. */
  57. #ifndef HAVE_SSE2
  58. #define pxor xorps
  59. #define movsd movlps
  60. #endif
  61. #include "l1param.h"
  /*
   * Index-of-extreme-element kernel for a vector of single-precision floats
   * (32-bit x86, SSE, AT&T syntax). The entry symbol is emitted by the
   * PROLOGUE macro, which is defined outside this file.
   *
   * Stack arguments (see STACK_M / STACK_X / STACK_INCX):
   *   M    - element count   (passed by reference when F_INTERFACE is set)
   *   X    - pointer to the first element
   *   INCX - stride in elements, scaled to bytes by SIZE below
   *          (passed by reference when F_INTERFACE is set)
   *
   * Returns in %eax the 1-based position of the first element that compares
   * equal to the extreme value, or 0 when M <= 0 or INCX <= 0.
   * The extreme value is the maximum (minimum with USE_MIN) of the elements,
   * taken on absolute values when USE_ABS is defined.
   *
   * Method: two passes over the data.
   *   Pass 1 computes the extreme value and broadcasts it to all lanes of xmm0.
   *   Pass 2 rescans from X, counting elements in RET, and stops at the first
   *   element equal to xmm0.
   * Three code paths are used:
   *   .L05-.L27  unit stride, aligned loads (movaps) after peeling to 16 bytes
   *   .L30-.L47  unit stride, unaligned loads (movsd + movhps)
   *   .L80-.L97  non-unit stride, scalar loads
   *
   * Registers: %ebx, %esi, %edi, %ebp are saved and restored; %eax, %ecx,
   * %edx, xmm0-xmm4 (and xmm7 with USE_ABS) are overwritten.
   * PREFETCH / PREFETCHSIZE / PREOFFSET, ALIGN_n, PROFCODE and SIZE come from
   * the included headers and are not visible here.
   */
  62. PROLOGUE
  /* Save the callee-saved registers (4 x 4 bytes = STACK). */
  63. pushl %ebp
  64. pushl %edi
  65. pushl %esi
  66. pushl %ebx
  67. PROFCODE
  68. movl STACK_M, M
  69. movl STACK_X, X
  70. movl STACK_INCX, INCX
  71. #ifdef F_INTERFACE
  /* M and INCX arrive as pointers here: load the values they point to. */
  72. movl (M), M
  73. movl (INCX), INCX
  74. #endif
  75. pxor %xmm0, %xmm0 /* xmm0 = 0; becomes the running extreme value */
  76. #ifdef USE_ABS
  77. pxor %xmm7, %xmm7 /* clear xmm7; the abs mask is built below */
  78. #endif
  79. xor RET, RET /* default integer result = 0 */
  /* Nothing to do for M <= 0: return 0. */
  80. testl M, M
  81. jle .L999
  /* INCX = INCX * SIZE (byte stride); a non-positive stride returns 0. */
  82. leal (, INCX, SIZE), INCX
  83. testl INCX, INCX
  84. jle .L999
  85. movl M, MM
  86. movl X, XX
  /* Build the abs mask: every 32-bit lane of xmm7 = 0x7fffffff. */
  87. #ifdef USE_ABS
  88. #ifndef HAVE_SSE2
  /* SSE1: go through a temporary stack slot, then broadcast lane 0. */
  89. subl $8, %esp
  90. movl $0x7fffffff, (%esp)
  91. movss (%esp), %xmm7
  92. shufps $0, %xmm7, %xmm7
  93. addl $8, %esp
  94. #else
  /* SSE2: all-ones (xmm7 == xmm7), then shift each dword right by one bit. */
  95. cmpeqps %xmm7, %xmm7
  96. psrld $1, %xmm7 /* each dword = 0x7fffffff */
  97. #endif
  98. #endif
  /* Consume the first element and broadcast it: MM = M - 1 from here on. */
  99. movss (XX), %xmm0
  100. addl INCX, XX
  101. decl MM
  102. shufps $0, %xmm0, %xmm0
  103. #ifdef USE_ABS
  104. andps %xmm7, %xmm0
  105. #endif
  /* Seed the other three accumulators with the same value. */
  106. movaps %xmm0, %xmm1
  107. movaps %xmm0, %xmm2
  108. movaps %xmm0, %xmm3 /* xmm0-xmm3 all hold the first element */
  109. cmpl $SIZE, INCX
  110. jne .L80 /* non-unit stride: scalar path at .L80 */
  /* Alignment check */
  112. testl $3, XX /* low two address bits */
  113. jne .L30 /* not 4-byte aligned: unaligned path */
  114. cmpl $8, MM
  115. jle .L30 /* 8 or fewer elements remain: unaligned path */
  /* Peel one element if address bit 2 is set. */
  116. testl $4, XX /* address bit 2 */
  117. je .L05
  118. movss 0 * SIZE(XX), %xmm4
  119. #ifdef USE_ABS
  120. andps %xmm7, %xmm4
  121. #endif
  122. maxss %xmm4, %xmm0
  123. decl MM
  124. addl $SIZE, XX
  125. ALIGN_3
  126. .L05:
  /*
   * Peel two elements if address bit 3 is set; unpcklps duplicates the pair
   * across the four lanes so the packed max can be used.
   */
  127. testl $8, XX
  128. je .L06
  129. movsd 0 * SIZE(XX), %xmm4
  130. unpcklps %xmm4, %xmm4
  131. #ifdef USE_ABS
  132. andps %xmm7, %xmm4
  133. #endif
  134. maxps %xmm4, %xmm1
  135. subl $2, MM
  136. addl $2 * SIZE, XX
  137. ALIGN_3
  138. .L06:
  /* Aligned pass 1, main loop: I = MM / 16 iterations of 16 elements each. */
  139. movl MM, I
  140. sarl $4, I
  141. jle .L15
  142. ALIGN_4
  143. .L11:
  144. #ifdef PREFETCH
  145. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  146. #endif
  /* Four aligned loads feed four independent accumulators (xmm0-xmm3). */
  147. movaps 0 * SIZE(XX), %xmm4
  148. #ifdef USE_ABS
  149. andps %xmm7, %xmm4
  150. #endif
  151. maxps %xmm4, %xmm0
  152. movaps 4 * SIZE(XX), %xmm4
  153. #ifdef USE_ABS
  154. andps %xmm7, %xmm4
  155. #endif
  156. maxps %xmm4, %xmm1
  157. movaps 8 * SIZE(XX), %xmm4
  158. #ifdef USE_ABS
  159. andps %xmm7, %xmm4
  160. #endif
  161. maxps %xmm4, %xmm2
  162. movaps 12 * SIZE(XX), %xmm4
  163. #ifdef USE_ABS
  164. andps %xmm7, %xmm4
  165. #endif
  166. maxps %xmm4, %xmm3
  167. addl $16 * SIZE, XX
  168. decl I
  169. jg .L11
  170. ALIGN_4
  171. .L15:
  /* Aligned pass 1, tail: handle the remaining MM & 15 elements as 8/4/2/1. */
  172. andl $15, MM
  173. jle .L20
  174. testl $8, MM
  175. je .L16
  176. movaps 0 * SIZE(XX), %xmm4
  177. #ifdef USE_ABS
  178. andps %xmm7, %xmm4
  179. #endif
  180. maxps %xmm4, %xmm0
  181. movaps 4 * SIZE(XX), %xmm4
  182. #ifdef USE_ABS
  183. andps %xmm7, %xmm4
  184. #endif
  185. maxps %xmm4, %xmm1
  186. addl $8 * SIZE, XX
  187. ALIGN_3
  188. .L16:
  189. testl $4, MM
  190. je .L17
  191. movaps 0 * SIZE(XX), %xmm4
  192. #ifdef USE_ABS
  193. andps %xmm7, %xmm4
  194. #endif
  195. maxps %xmm4, %xmm2
  196. addl $4 * SIZE, XX
  197. ALIGN_3
  198. .L17:
  199. testl $2, MM
  200. je .L18
  201. movsd 0 * SIZE(XX), %xmm4
  202. unpcklps %xmm4, %xmm4
  203. #ifdef USE_ABS
  204. andps %xmm7, %xmm4
  205. #endif
  206. maxps %xmm4, %xmm3
  207. addl $2 * SIZE, XX
  208. .L18:
  209. testl $1, MM
  210. je .L20
  211. movss 0 * SIZE(XX), %xmm4
  212. #ifdef USE_ABS
  213. andps %xmm7, %xmm4
  214. #endif
  215. maxss %xmm4, %xmm0
  216. ALIGN_3
  217. .L20:
  /*
   * Aligned pass 2: restart from X with the full count M, reduce the four
   * accumulators to a single value and broadcast it to every lane of xmm0.
   */
  218. movl X, XX
  219. movl M, MM
  220. maxps %xmm1, %xmm0
  221. maxps %xmm3, %xmm2
  222. maxps %xmm2, %xmm0
  /* Horizontal reduction: fold high half onto low half, then lane 1 onto lane 0. */
  223. movaps %xmm0, %xmm1
  224. movhlps %xmm0, %xmm0
  225. maxps %xmm1, %xmm0
  226. movaps %xmm0, %xmm1
  227. shufps $1, %xmm0, %xmm0
  228. maxss %xmm1, %xmm0
  229. shufps $0, %xmm0, %xmm0
  /* Peel one element if address bit 2 is set, checking it for a match. */
  230. testl $4, XX
  231. je .L21
  232. movss 0 * SIZE(XX), %xmm1
  233. decl MM
  234. addl $SIZE, XX
  235. #ifdef USE_ABS
  236. andps %xmm7, %xmm1
  237. #endif
  /* RET is incremented before each compare so it is 1-based on exit. */
  238. incl RET
  239. comiss %xmm0, %xmm1
  240. je .L999
  241. ALIGN_3
  242. .L21:
  /* Peel two elements if address bit 3 is set, checking each for a match. */
  243. testl $8, XX
  244. je .L22
  245. movss 0 * SIZE(XX), %xmm1
  246. movss 1 * SIZE(XX), %xmm2
  247. subl $2, MM
  248. addl $2 * SIZE, XX
  249. #ifdef USE_ABS
  250. andps %xmm7, %xmm1
  251. andps %xmm7, %xmm2
  252. #endif
  253. incl RET
  254. comiss %xmm0, %xmm1
  255. je .L999
  256. incl RET
  257. comiss %xmm0, %xmm2
  258. je .L999
  259. ALIGN_3
  260. .L22:
  /* Search loop: I = MM / 8 iterations, 8 elements compared per iteration. */
  261. movl MM, I
  262. sarl $3, I
  263. jle .L25
  264. ALIGN_4
  265. .L23:
  266. #ifdef PREFETCH
  267. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  268. #endif
  269. movaps 0 * SIZE(XX), %xmm1
  270. #ifdef USE_ABS
  271. andps %xmm7, %xmm1
  272. #endif
  273. cmpeqps %xmm0, %xmm1
  274. movaps 4 * SIZE(XX), %xmm2
  275. #ifdef USE_ABS
  276. andps %xmm7, %xmm2
  277. #endif
  278. cmpeqps %xmm0, %xmm2
  /* Merge both compare masks; any set bit means a match within these 8. */
  279. orps %xmm2, %xmm1
  /* TEMP is %ebx, the same register as M; M was already copied into MM. */
  280. movmskps %xmm1, TEMP
  281. testl $15, TEMP
  282. jne .L24
  283. addl $8 * SIZE, XX
  284. addl $8, RET
  285. decl I
  286. jg .L23
  287. jmp .L25
  288. ALIGN_3
  289. .L24:
  /*
   * A match lies in the 8 elements at XX: test the first seven one by one;
   * if none matches, the eighth is taken as the result.
   */
  290. movss 0 * SIZE(XX), %xmm1
  291. movss 1 * SIZE(XX), %xmm2
  292. movss 2 * SIZE(XX), %xmm3
  293. movss 3 * SIZE(XX), %xmm4
  294. #ifdef USE_ABS
  295. andps %xmm7, %xmm1
  296. andps %xmm7, %xmm2
  297. andps %xmm7, %xmm3
  298. andps %xmm7, %xmm4
  299. #endif
  300. incl RET
  301. comiss %xmm0, %xmm1
  302. je .L999
  303. incl RET
  304. comiss %xmm0, %xmm2
  305. je .L999
  306. incl RET
  307. comiss %xmm0, %xmm3
  308. je .L999
  309. incl RET
  310. comiss %xmm0, %xmm4
  311. je .L999
  312. movss 4 * SIZE(XX), %xmm1
  313. movss 5 * SIZE(XX), %xmm2
  314. movss 6 * SIZE(XX), %xmm3
  315. #ifdef USE_ABS
  316. andps %xmm7, %xmm1
  317. andps %xmm7, %xmm2
  318. andps %xmm7, %xmm3
  319. #endif
  320. incl RET
  321. comiss %xmm0, %xmm1
  322. je .L999
  323. incl RET
  324. comiss %xmm0, %xmm2
  325. je .L999
  326. incl RET
  327. comiss %xmm0, %xmm3
  328. je .L999
  /* Eighth element: counted without a compare. */
  329. incl RET
  330. jmp .L999
  331. ALIGN_4
  332. .L25:
  /* Search tail: a group of 4, then a group of 2, selected by bits of MM. */
  333. testl $4, MM
  334. je .L26
  335. movss 0 * SIZE(XX), %xmm1
  336. movss 1 * SIZE(XX), %xmm2
  337. movss 2 * SIZE(XX), %xmm3
  338. movss 3 * SIZE(XX), %xmm4
  339. #ifdef USE_ABS
  340. andps %xmm7, %xmm1
  341. andps %xmm7, %xmm2
  342. andps %xmm7, %xmm3
  343. andps %xmm7, %xmm4
  344. #endif
  345. addl $4 * SIZE, XX
  346. incl RET
  347. comiss %xmm0, %xmm1
  348. je .L999
  349. incl RET
  350. comiss %xmm0, %xmm2
  351. je .L999
  352. incl RET
  353. comiss %xmm0, %xmm3
  354. je .L999
  355. incl RET
  356. comiss %xmm0, %xmm4
  357. je .L999
  358. ALIGN_3
  359. .L26:
  360. testl $2, MM
  361. je .L27
  362. movss 0 * SIZE(XX), %xmm1
  363. movss 1 * SIZE(XX), %xmm2
  364. #ifdef USE_ABS
  365. andps %xmm7, %xmm1
  366. andps %xmm7, %xmm2
  367. #endif
  368. addl $2 * SIZE, XX
  369. incl RET
  370. comiss %xmm0, %xmm1
  371. je .L999
  372. incl RET
  373. comiss %xmm0, %xmm2
  374. je .L999
  375. ALIGN_3
  376. .L27:
  /* No earlier match: the next element is counted without a compare. */
  377. incl RET
  378. jmp .L999
  379. ALIGN_3
  /* Unaligned mode (unit stride): pass 1 with 16 elements per iteration. */
  381. .L30:
  382. movl MM, I
  383. sarl $4, I
  384. jle .L35
  385. ALIGN_4
  386. .L31:
  387. #ifdef PREFETCH
  388. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  389. #endif
  /* Each 4-float vector is assembled from two 8-byte loads (low, then high). */
  390. movsd 0 * SIZE(XX), %xmm4
  391. movhps 2 * SIZE(XX), %xmm4
  392. #ifdef USE_ABS
  393. andps %xmm7, %xmm4
  394. #endif
  395. maxps %xmm4, %xmm0
  396. movsd 4 * SIZE(XX), %xmm4
  397. movhps 6 * SIZE(XX), %xmm4
  398. #ifdef USE_ABS
  399. andps %xmm7, %xmm4
  400. #endif
  401. maxps %xmm4, %xmm1
  402. movsd 8 * SIZE(XX), %xmm4
  403. movhps 10 * SIZE(XX), %xmm4
  404. #ifdef USE_ABS
  405. andps %xmm7, %xmm4
  406. #endif
  407. maxps %xmm4, %xmm2
  408. movsd 12 * SIZE(XX), %xmm4
  409. movhps 14 * SIZE(XX), %xmm4
  410. #ifdef USE_ABS
  411. andps %xmm7, %xmm4
  412. #endif
  413. maxps %xmm4, %xmm3
  414. addl $16 * SIZE, XX
  415. decl I
  416. jg .L31
  417. ALIGN_4
  418. .L35:
  /* Unaligned pass 1, tail: remaining MM & 15 elements as 8/4/2/1. */
  419. andl $15, MM
  420. jle .L40
  421. testl $8, MM
  422. je .L36
  423. movsd 0 * SIZE(XX), %xmm4
  424. movhps 2 * SIZE(XX), %xmm4
  425. #ifdef USE_ABS
  426. andps %xmm7, %xmm4
  427. #endif
  428. maxps %xmm4, %xmm0
  429. movsd 4 * SIZE(XX), %xmm4
  430. movhps 6 * SIZE(XX), %xmm4
  431. #ifdef USE_ABS
  432. andps %xmm7, %xmm4
  433. #endif
  434. maxps %xmm4, %xmm1
  435. addl $8 * SIZE, XX
  436. ALIGN_3
  437. .L36:
  438. testl $4, MM
  439. je .L37
  440. movsd 0 * SIZE(XX), %xmm4
  441. movhps 2 * SIZE(XX), %xmm4
  442. #ifdef USE_ABS
  443. andps %xmm7, %xmm4
  444. #endif
  445. maxps %xmm4, %xmm2
  446. addl $4 * SIZE, XX
  447. ALIGN_3
  448. .L37:
  449. testl $2, MM
  450. je .L38
  451. movsd 0 * SIZE(XX), %xmm4
  452. unpcklps %xmm4, %xmm4
  453. #ifdef USE_ABS
  454. andps %xmm7, %xmm4
  455. #endif
  456. maxps %xmm4, %xmm3
  457. addl $2 * SIZE, XX
  458. .L38:
  459. testl $1, MM
  460. je .L40
  461. movss 0 * SIZE(XX), %xmm4
  462. #ifdef USE_ABS
  463. andps %xmm7, %xmm4
  464. #endif
  465. maxss %xmm4, %xmm0
  /* This jump targets the label that immediately follows. */
  466. jmp .L40
  467. ALIGN_4
  468. .L40:
  /*
   * Unaligned pass 2: restart from X with the full count M, reduce the
   * accumulators and broadcast the extreme value (same steps as at .L20).
   */
  469. movl X, XX
  470. movl M, MM
  471. maxps %xmm1, %xmm0
  472. maxps %xmm3, %xmm2
  473. maxps %xmm2, %xmm0
  474. movaps %xmm0, %xmm1
  475. movhlps %xmm0, %xmm0
  476. maxps %xmm1, %xmm0
  477. movaps %xmm0, %xmm1
  478. shufps $1, %xmm0, %xmm0
  479. maxss %xmm1, %xmm0
  480. shufps $0, %xmm0, %xmm0
  /* Search loop: I = MM / 8 iterations; no alignment peeling on this path. */
  481. movl MM, I
  482. sarl $3, I
  483. jle .L45
  484. ALIGN_4
  485. .L43:
  486. #ifdef PREFETCH
  487. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  488. #endif
  489. movsd 0 * SIZE(XX), %xmm1
  490. movhps 2 * SIZE(XX), %xmm1
  491. #ifdef USE_ABS
  492. andps %xmm7, %xmm1
  493. #endif
  494. cmpeqps %xmm0, %xmm1
  495. movsd 4 * SIZE(XX), %xmm2
  496. movhps 6 * SIZE(XX), %xmm2
  497. #ifdef USE_ABS
  498. andps %xmm7, %xmm2
  499. #endif
  500. cmpeqps %xmm0, %xmm2
  501. orps %xmm2, %xmm1
  /* TEMP overwrites %ebx (M); M is no longer needed at this point. */
  502. movmskps %xmm1, TEMP
  503. testl $15, TEMP
  504. jne .L44
  505. addl $8 * SIZE, XX
  506. addl $8, RET
  507. decl I
  508. jg .L43
  509. jmp .L45
  510. ALIGN_3
  511. .L44:
  /* Match within the 8 elements at XX: test seven, the eighth is implied. */
  512. movss 0 * SIZE(XX), %xmm1
  513. movss 1 * SIZE(XX), %xmm2
  514. movss 2 * SIZE(XX), %xmm3
  515. movss 3 * SIZE(XX), %xmm4
  516. #ifdef USE_ABS
  517. andps %xmm7, %xmm1
  518. andps %xmm7, %xmm2
  519. andps %xmm7, %xmm3
  520. andps %xmm7, %xmm4
  521. #endif
  522. incl RET
  523. comiss %xmm0, %xmm1
  524. je .L999
  525. incl RET
  526. comiss %xmm0, %xmm2
  527. je .L999
  528. incl RET
  529. comiss %xmm0, %xmm3
  530. je .L999
  531. incl RET
  532. comiss %xmm0, %xmm4
  533. je .L999
  534. movss 4 * SIZE(XX), %xmm1
  535. movss 5 * SIZE(XX), %xmm2
  536. movss 6 * SIZE(XX), %xmm3
  537. #ifdef USE_ABS
  538. andps %xmm7, %xmm1
  539. andps %xmm7, %xmm2
  540. andps %xmm7, %xmm3
  541. #endif
  542. incl RET
  543. comiss %xmm0, %xmm1
  544. je .L999
  545. incl RET
  546. comiss %xmm0, %xmm2
  547. je .L999
  548. incl RET
  549. comiss %xmm0, %xmm3
  550. je .L999
  551. incl RET
  552. jmp .L999
  553. ALIGN_4
  554. .L45:
  /* Search tail: a group of 4, then a group of 2, selected by bits of MM. */
  555. testl $4, MM
  556. je .L46
  557. movss 0 * SIZE(XX), %xmm1
  558. movss 1 * SIZE(XX), %xmm2
  559. movss 2 * SIZE(XX), %xmm3
  560. movss 3 * SIZE(XX), %xmm4
  561. #ifdef USE_ABS
  562. andps %xmm7, %xmm1
  563. andps %xmm7, %xmm2
  564. andps %xmm7, %xmm3
  565. andps %xmm7, %xmm4
  566. #endif
  567. addl $4 * SIZE, XX
  568. incl RET
  569. comiss %xmm0, %xmm1
  570. je .L999
  571. incl RET
  572. comiss %xmm0, %xmm2
  573. je .L999
  574. incl RET
  575. comiss %xmm0, %xmm3
  576. je .L999
  577. incl RET
  578. comiss %xmm0, %xmm4
  579. je .L999
  580. ALIGN_3
  581. .L46:
  582. testl $2, MM
  583. je .L47
  584. movss 0 * SIZE(XX), %xmm1
  585. movss 1 * SIZE(XX), %xmm2
  586. #ifdef USE_ABS
  587. andps %xmm7, %xmm1
  588. andps %xmm7, %xmm2
  589. #endif
  590. addl $2 * SIZE, XX
  591. incl RET
  592. comiss %xmm0, %xmm1
  593. je .L999
  594. incl RET
  595. comiss %xmm0, %xmm2
  596. je .L999
  597. ALIGN_3
  598. .L47:
  /* No earlier match: the next element is counted without a compare. */
  599. incl RET
  600. jmp .L999
  601. ALIGN_3
  602. .L80:
  /*
   * Strided path (INCX != SIZE), pass 1: scalar loads, 8 elements per
   * iteration, rotating through the four accumulators (low lanes only).
   */
  603. movl MM, I
  604. sarl $3, I
  605. jle .L85
  606. ALIGN_4
  607. .L81:
  608. #ifdef PREFETCH
  609. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  610. #endif
  611. movss 0 * SIZE(XX), %xmm4
  612. addl INCX, XX
  613. #ifdef USE_ABS
  614. andps %xmm7, %xmm4
  615. #endif
  616. maxss %xmm4, %xmm0
  617. movss 0 * SIZE(XX), %xmm4
  618. addl INCX, XX
  619. #ifdef USE_ABS
  620. andps %xmm7, %xmm4
  621. #endif
  622. maxss %xmm4, %xmm1
  623. movss 0 * SIZE(XX), %xmm4
  624. addl INCX, XX
  625. #ifdef USE_ABS
  626. andps %xmm7, %xmm4
  627. #endif
  628. maxss %xmm4, %xmm2
  629. movss 0 * SIZE(XX), %xmm4
  630. addl INCX, XX
  631. #ifdef USE_ABS
  632. andps %xmm7, %xmm4
  633. #endif
  634. maxss %xmm4, %xmm3
  635. movss 0 * SIZE(XX), %xmm4
  636. addl INCX, XX
  637. #ifdef USE_ABS
  638. andps %xmm7, %xmm4
  639. #endif
  640. maxss %xmm4, %xmm0
  641. movss 0 * SIZE(XX), %xmm4
  642. addl INCX, XX
  643. #ifdef USE_ABS
  644. andps %xmm7, %xmm4
  645. #endif
  646. maxss %xmm4, %xmm1
  647. movss 0 * SIZE(XX), %xmm4
  648. addl INCX, XX
  649. #ifdef USE_ABS
  650. andps %xmm7, %xmm4
  651. #endif
  652. maxss %xmm4, %xmm2
  653. movss 0 * SIZE(XX), %xmm4
  654. addl INCX, XX
  655. #ifdef USE_ABS
  656. andps %xmm7, %xmm4
  657. #endif
  658. maxss %xmm4, %xmm3
  659. decl I
  660. jg .L81
  661. ALIGN_4
  662. .L85:
  /* Strided pass 1, tail: remaining MM & 7 elements as 4/2/1. */
  663. andl $7, MM
  664. jle .L90
  665. testl $4, MM
  666. je .L86
  667. movss 0 * SIZE(XX), %xmm4
  668. addl INCX, XX
  669. #ifdef USE_ABS
  670. andps %xmm7, %xmm4
  671. #endif
  672. maxss %xmm4, %xmm0
  673. movss 0 * SIZE(XX), %xmm4
  674. addl INCX, XX
  675. #ifdef USE_ABS
  676. andps %xmm7, %xmm4
  677. #endif
  678. maxss %xmm4, %xmm1
  679. movss 0 * SIZE(XX), %xmm4
  680. addl INCX, XX
  681. #ifdef USE_ABS
  682. andps %xmm7, %xmm4
  683. #endif
  684. maxss %xmm4, %xmm2
  685. movss 0 * SIZE(XX), %xmm4
  686. addl INCX, XX
  687. #ifdef USE_ABS
  688. andps %xmm7, %xmm4
  689. #endif
  690. maxss %xmm4, %xmm3
  691. ALIGN_3
  692. .L86:
  693. testl $2, MM
  694. je .L87
  695. movss 0 * SIZE(XX), %xmm4
  696. addl INCX, XX
  697. #ifdef USE_ABS
  698. andps %xmm7, %xmm4
  699. #endif
  700. maxss %xmm4, %xmm0
  701. movss 0 * SIZE(XX), %xmm4
  702. addl INCX, XX
  703. #ifdef USE_ABS
  704. andps %xmm7, %xmm4
  705. #endif
  706. maxss %xmm4, %xmm1
  707. ALIGN_3
  708. .L87:
  709. testl $1, MM
  710. je .L90
  711. movss 0 * SIZE(XX), %xmm4
  712. addl INCX, XX
  713. #ifdef USE_ABS
  714. andps %xmm7, %xmm4
  715. #endif
  716. maxss %xmm4, %xmm2
  717. ALIGN_4
  718. .L90:
  /*
   * Strided pass 2: restart from X with the full count M, combine the four
   * scalar accumulators and broadcast the result to every lane of xmm0.
   */
  719. movl X, XX
  720. movl M, MM
  721. maxss %xmm1, %xmm0
  722. maxss %xmm3, %xmm2
  723. maxss %xmm2, %xmm0
  724. shufps $0, %xmm0, %xmm0
  /* Search loop: I = MM / 4 iterations, 4 strided elements per iteration. */
  725. movl MM, I
  726. sarl $2, I
  727. jle .L96
  728. ALIGN_4
  729. .L92:
  730. #ifdef PREFETCH
  731. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  732. #endif
  733. movss 0 * SIZE(XX), %xmm1
  734. addl INCX, XX
  735. #ifdef USE_ABS
  736. andps %xmm7, %xmm1
  737. #endif
  738. cmpeqss %xmm0, %xmm1
  739. movss 0 * SIZE(XX), %xmm2
  740. addl INCX, XX
  741. #ifdef USE_ABS
  742. andps %xmm7, %xmm2
  743. #endif
  744. cmpeqss %xmm0, %xmm2
  745. movss 0 * SIZE(XX), %xmm3
  746. addl INCX, XX
  747. #ifdef USE_ABS
  748. andps %xmm7, %xmm3
  749. #endif
  750. cmpeqss %xmm0, %xmm3
  751. movss 0 * SIZE(XX), %xmm4
  752. addl INCX, XX
  753. #ifdef USE_ABS
  754. andps %xmm7, %xmm4
  755. #endif
  756. cmpeqss %xmm0, %xmm4
  /* Merge the four scalar compare results and test for any match. */
  757. orps %xmm2, %xmm1
  758. orps %xmm4, %xmm3
  759. orps %xmm3, %xmm1
  760. movmskps %xmm1, TEMP
  761. testl $15, TEMP
  762. jne .L93
  763. addl $4, RET
  764. decl I
  765. jg .L92
  766. jmp .L96
  767. ALIGN_3
  768. .L93:
  /* Match in the last group: rewind XX by four strides and test each element. */
  769. leal (, INCX, 4), TEMP
  770. subl TEMP, XX
  771. movss 0 * SIZE(XX), %xmm1
  772. addl INCX, XX
  773. movss 0 * SIZE(XX), %xmm2
  774. addl INCX, XX
  775. movss 0 * SIZE(XX), %xmm3
  776. addl INCX, XX
  777. movss 0 * SIZE(XX), %xmm4
  778. addl INCX, XX
  779. #ifdef USE_ABS
  780. andps %xmm7, %xmm1
  781. andps %xmm7, %xmm2
  782. andps %xmm7, %xmm3
  783. andps %xmm7, %xmm4
  784. #endif
  785. incl RET
  786. comiss %xmm0, %xmm1
  787. je .L999
  788. incl RET
  789. comiss %xmm0, %xmm2
  790. je .L999
  791. incl RET
  792. comiss %xmm0, %xmm3
  793. je .L999
  794. incl RET
  795. comiss %xmm0, %xmm4
  796. je .L999
  797. ALIGN_3
  798. .L96:
  /* Search tail: a group of 2 if bit 1 of MM is set, then the last element. */
  799. testl $2, MM
  800. je .L97
  801. movss 0 * SIZE(XX), %xmm1
  802. addl INCX, XX
  803. movss 0 * SIZE(XX), %xmm2
  804. addl INCX, XX
  805. #ifdef USE_ABS
  806. andps %xmm7, %xmm1
  807. andps %xmm7, %xmm2
  808. #endif
  809. incl RET
  810. comiss %xmm0, %xmm1
  811. je .L999
  812. incl RET
  813. comiss %xmm0, %xmm2
  814. je .L999
  815. ALIGN_3
  816. .L97:
  /* No earlier match: the next element is counted without a compare. */
  817. incl RET
  818. ALIGN_3
  819. .L999:
  /* Common exit: restore saved registers in reverse push order; result in %eax. */
  820. popl %ebx
  821. popl %esi
  822. popl %edi
  823. popl %ebp
  824. ret
  825. EPILOGUE