You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

iamax_sse2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build-time configuration, stack-argument offsets and register roles for
   * the routine below.  The code is 32-bit x86 in AT&T syntax and is passed
   * through the C preprocessor.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK matches the four 4-byte pushl register saves done in the prologue;
     ARGS is an additional displacement (none here). */
  40. #define STACK 16
  41. #define ARGS 0
  /* Incoming arguments, addressed relative to %esp after the prologue pushes.
     The leading 4/8/12 presumably skips the return address and then steps
     through consecutive 4-byte argument slots -- confirm against the calling
     convention selected by common.h. */
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  /* Register roles:
       RET  - result; zeroed on entry and incremented once per element examined
              in the index search, so a match yields a 1-based position
       M    - element count as passed in
       X    - base address of the vector
       INCX - stride; converted to a byte stride during setup
       I    - loop trip counter
       MM   - working copy of the count
       XX   - working pointer into the vector
       TEMP - scratch; it is the same register as M (%ebx), so writing TEMP
              destroys M */
  45. #define RET %eax
  46. #define M %ebx
  47. #define X %ecx
  48. #define INCX %edx
  49. #define I %esi
  50. #define MM %ebp
  51. #define XX %edi
  52. #define TEMP %ebx
  /* With USE_MIN defined, every maxpd/maxsd written below is textually
     replaced by minpd/minsd, so the same body searches for the minimum. */
  53. #ifdef USE_MIN
  54. #define maxpd minpd
  55. #define maxsd minsd
  56. #endif
  /* NOTE(review): PREFETCH, PREFETCHSIZE and PREOFFSET used below are not
     defined in this file; presumably they come from this header or common.h. */
  57. #include "l1param.h"
  58. PROLOGUE
  /* Save the four integer registers this routine overwrites; they are
     restored in reverse order at .L999. */
  59. pushl %ebp
  60. pushl %edi
  61. pushl %esi
  62. pushl %ebx
  63. PROFCODE
  /* Fetch the three stack arguments into M, X and INCX. */
  64. movl STACK_M, M
  65. movl STACK_X, X
  66. movl STACK_INCX, INCX
  67. #ifdef F_INTERFACE
  /* Under F_INTERFACE, M and INCX hold addresses: load the values behind them. */
  68. movl (M), M
  69. movl (INCX), INCX
  70. #endif
  71. pxor %xmm0, %xmm0
  72. #ifdef USE_ABS
  73. pxor %xmm7, %xmm7
  74. #endif
  /* RET = 0.  This is what is returned when M <= 0 or the stride is <= 0. */
  75. xor RET, RET
  76. testl M, M
  77. jle .L999
  /* Convert the stride from elements to bytes: INCX = INCX * SIZE. */
  78. leal (, INCX, SIZE), INCX
  79. testl INCX, INCX
  80. jle .L999
  /* Work on copies so the original count and base pointer stay available for
     the index search that follows the scan. */
  81. movl M, MM
  82. movl X, XX
  83. #ifdef USE_ABS
  /* Build the absolute-value mask in %xmm7: comparing the zeroed register with
     itself sets every bit, and the logical right shift by one then clears the
     top (sign) bit of each 64-bit lane. */
  84. cmpeqpd %xmm7, %xmm7
  85. psrlq $1, %xmm7
  86. #endif
  /* Load element 0, step past it, and use it (after the optional mask) to seed
     both lanes of all four running accumulators %xmm0-%xmm3. */
  87. movsd (XX), %xmm0
  88. addl INCX, XX
  89. decl MM
  90. #ifdef USE_ABS
  91. andpd %xmm7, %xmm0
  92. #endif
  93. unpcklpd %xmm0, %xmm0
  94. movapd %xmm0, %xmm1
  95. movapd %xmm0, %xmm2
  96. movapd %xmm0, %xmm3
  /* A byte stride different from SIZE means non-contiguous data: strided path. */
  97. cmpl $SIZE, INCX
  98. jne .L80
  99. /* Alignment check (unit stride): choose the movapd scan at .L05 or the movsd/movhpd scan at .L50 */
  /* Seven or fewer elements left: use the unaligned path. */
  100. cmpl $7, MM
  101. jle .L50
  /* Pointer not a multiple of 8: aligned loads can never be reached. */
  102. testl $7, XX
  103. jne .L50 # Purely Unaligned Mode
  104. testl $15, XX # Checking for 128bit align
  105. je .L05
  /* Multiple of 8 but not of 16: fold one element into %xmm3 and advance by
     SIZE before entering the movapd loop.  Assumes SIZE is 8 so that this
     lands on a 16-byte boundary -- TODO confirm SIZE in common.h. */
  106. movsd 0 * SIZE(XX), %xmm4
  107. #ifdef USE_ABS
  108. andpd %xmm7, %xmm4
  109. #endif
  110. unpcklpd %xmm4, %xmm4
  111. maxpd %xmm4, %xmm3
  112. decl MM
  113. addl $SIZE, XX
  114. ALIGN_3
  115. .L05:
  /*
   * Unit-stride scan for the extreme value using 16-byte movapd loads.
   * I = MM / 16 passes of the main loop; each pass issues eight loads, folds
   * them round-robin into the accumulators %xmm0-%xmm3 (after the optional
   * absolute-value mask) and advances XX by 16 * SIZE.
   */
  116. movl MM, I
  117. sarl $4, I
  118. jle .L15
  119. ALIGN_4
  120. .L11:
  121. #ifdef PREFETCH
  122. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  123. #endif
  124. movapd 0 * SIZE(XX), %xmm4
  125. #ifdef USE_ABS
  126. andpd %xmm7, %xmm4
  127. #endif
  128. maxpd %xmm4, %xmm0
  129. movapd 2 * SIZE(XX), %xmm4
  130. #ifdef USE_ABS
  131. andpd %xmm7, %xmm4
  132. #endif
  133. maxpd %xmm4, %xmm1
  134. movapd 4 * SIZE(XX), %xmm4
  135. #ifdef USE_ABS
  136. andpd %xmm7, %xmm4
  137. #endif
  138. maxpd %xmm4, %xmm2
  139. movapd 6 * SIZE(XX), %xmm4
  140. #ifdef USE_ABS
  141. andpd %xmm7, %xmm4
  142. #endif
  143. maxpd %xmm4, %xmm3
  144. #ifdef PREFETCH
  145. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
  146. #endif
  147. movapd 8 * SIZE(XX), %xmm4
  148. #ifdef USE_ABS
  149. andpd %xmm7, %xmm4
  150. #endif
  151. maxpd %xmm4, %xmm0
  152. movapd 10 * SIZE(XX), %xmm4
  153. #ifdef USE_ABS
  154. andpd %xmm7, %xmm4
  155. #endif
  156. maxpd %xmm4, %xmm1
  157. movapd 12 * SIZE(XX), %xmm4
  158. #ifdef USE_ABS
  159. andpd %xmm7, %xmm4
  160. #endif
  161. maxpd %xmm4, %xmm2
  162. movapd 14 * SIZE(XX), %xmm4
  163. #ifdef USE_ABS
  164. andpd %xmm7, %xmm4
  165. #endif
  166. maxpd %xmm4, %xmm3
  167. addl $16 * SIZE, XX
  168. decl I
  169. jg .L11
  170. ALIGN_4
  171. .L15:
  /* Tail: MM &= 15 leaves the leftover count; its bits 8, 4, 2 and 1 are
     tested in turn and each set bit consumes that many elements. */
  172. andl $15, MM
  173. jle .L20
  174. testl $8, MM
  175. je .L16
  176. movapd 0 * SIZE(XX), %xmm4
  177. #ifdef USE_ABS
  178. andpd %xmm7, %xmm4
  179. #endif
  180. maxpd %xmm4, %xmm0
  181. movapd 2 * SIZE(XX), %xmm4
  182. #ifdef USE_ABS
  183. andpd %xmm7, %xmm4
  184. #endif
  185. maxpd %xmm4, %xmm1
  186. movapd 4 * SIZE(XX), %xmm4
  187. #ifdef USE_ABS
  188. andpd %xmm7, %xmm4
  189. #endif
  190. maxpd %xmm4, %xmm2
  191. movapd 6 * SIZE(XX), %xmm4
  192. #ifdef USE_ABS
  193. andpd %xmm7, %xmm4
  194. #endif
  195. maxpd %xmm4, %xmm3
  196. addl $8 * SIZE, XX
  197. ALIGN_3
  198. .L16:
  /* Four leftover elements. */
  199. testl $4, MM
  200. je .L17
  201. movapd 0 * SIZE(XX), %xmm4
  202. #ifdef USE_ABS
  203. andpd %xmm7, %xmm4
  204. #endif
  205. maxpd %xmm4, %xmm0
  206. movapd 2 * SIZE(XX), %xmm4
  207. #ifdef USE_ABS
  208. andpd %xmm7, %xmm4
  209. #endif
  210. maxpd %xmm4, %xmm1
  211. addl $4 * SIZE, XX
  212. ALIGN_3
  213. .L17:
  /* Two leftover elements. */
  214. testl $2, MM
  215. je .L18
  216. movapd 0 * SIZE(XX), %xmm4
  217. #ifdef USE_ABS
  218. andpd %xmm7, %xmm4
  219. #endif
  220. maxpd %xmm4, %xmm2
  221. addl $2 * SIZE, XX
  222. .L18:
  /* One leftover element: scalar load, duplicated into both lanes so the
     packed max sees the same value twice. */
  223. testl $1, MM
  224. je .L20
  225. movsd 0 * SIZE(XX), %xmm4
  226. #ifdef USE_ABS
  227. andpd %xmm7, %xmm4
  228. #endif
  229. unpcklpd %xmm4, %xmm4
  230. maxpd %xmm4, %xmm3
  231. ALIGN_3
  232. /* Finding Index (aligned unit-stride path) */
  233. .L20:
  /*
   * Second pass: rescan from the start of the vector looking for the first
   * element equal to the extreme found above.  RET is incremented once per
   * element examined, so it holds a 1-based position when a match exits.
   * NOTE(review): every fall-through below assumes a match exists; behaviour
   * when the extreme is a NaN has not been verified here.
   */
  234. movl X, XX
  235. movl M, MM
  /* Reduce the four accumulators to one, fold the high lane into the low
     lane, then broadcast the scalar result to both lanes of %xmm0. */
  236. maxpd %xmm1, %xmm0
  237. maxpd %xmm3, %xmm2
  238. maxpd %xmm2, %xmm0
  239. movapd %xmm0, %xmm1
  240. unpckhpd %xmm0, %xmm0
  241. maxsd %xmm1, %xmm0
  242. unpcklpd %xmm0, %xmm0
  /* If X is not 16-byte aligned, test element 0 on its own and step past it
     before the movapd loop starts. */
  243. testl $15, XX # Checking for 128bit align
  244. je .L21
  245. movsd 0 * SIZE(XX), %xmm1
  246. #ifdef USE_ABS
  247. andpd %xmm7, %xmm1
  248. #endif
  249. incl RET
  250. comisd %xmm0, %xmm1
  251. je .L999
  252. addl $SIZE, XX
  253. decl MM
  254. ALIGN_3
  255. .L21:
  /* I = MM / 8 groups of eight elements. */
  256. movl MM, I
  257. sarl $3, I
  258. jle .L25
  259. ALIGN_4
  260. .L22:
  /* Compare eight elements against the extreme with four packed compares, OR
     the four result masks together and pull the two lane sign bits into TEMP.
     A non-zero result means this group contains a match.  Writing TEMP
     overwrites M (same register); the count already lives in MM. */
  261. #ifdef PREFETCH
  262. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  263. #endif
  264. movapd 0 * SIZE(XX), %xmm1
  265. #ifdef USE_ABS
  266. andpd %xmm7, %xmm1
  267. #endif
  268. cmpeqpd %xmm0, %xmm1
  269. movapd 2 * SIZE(XX), %xmm2
  270. #ifdef USE_ABS
  271. andpd %xmm7, %xmm2
  272. #endif
  273. cmpeqpd %xmm0, %xmm2
  274. movapd 4 * SIZE(XX), %xmm3
  275. #ifdef USE_ABS
  276. andpd %xmm7, %xmm3
  277. #endif
  278. cmpeqpd %xmm0, %xmm3
  279. movapd 6 * SIZE(XX), %xmm4
  280. #ifdef USE_ABS
  281. andpd %xmm7, %xmm4
  282. #endif
  283. cmpeqpd %xmm0, %xmm4
  284. orpd %xmm2, %xmm1
  285. orpd %xmm4, %xmm3
  286. orpd %xmm3, %xmm1
  287. movmskpd %xmm1, TEMP
  288. testl $3, TEMP
  289. jne .L23
  /* No match in this group: skip it and account for its eight elements. */
  290. addl $8 * SIZE, XX
  291. addl $8, RET
  292. decl I
  293. jg .L22
  294. jmp .L25
  295. ALIGN_4
  296. .L23:
  /* The current group of eight holds a match: walk it one element at a time,
     bumping RET before each scalar compare.  XX still points at the start of
     the group. */
  297. movsd 0 * SIZE(XX), %xmm1
  298. movsd 1 * SIZE(XX), %xmm2
  299. movsd 2 * SIZE(XX), %xmm3
  300. movsd 3 * SIZE(XX), %xmm4
  301. #ifdef USE_ABS
  302. andpd %xmm7, %xmm1
  303. andpd %xmm7, %xmm2
  304. andpd %xmm7, %xmm3
  305. andpd %xmm7, %xmm4
  306. #endif
  307. incl RET
  308. comisd %xmm0, %xmm1
  309. je .L999
  310. incl RET
  311. comisd %xmm0, %xmm2
  312. je .L999
  313. incl RET
  314. comisd %xmm0, %xmm3
  315. je .L999
  316. incl RET
  317. comisd %xmm0, %xmm4
  318. je .L999
  319. movsd 4 * SIZE(XX), %xmm1
  320. movsd 5 * SIZE(XX), %xmm2
  321. movsd 6 * SIZE(XX), %xmm3
  322. #ifdef USE_ABS
  323. andpd %xmm7, %xmm1
  324. andpd %xmm7, %xmm2
  325. andpd %xmm7, %xmm3
  326. #endif
  327. incl RET
  328. comisd %xmm0, %xmm1
  329. je .L999
  330. incl RET
  331. comisd %xmm0, %xmm2
  332. je .L999
  333. incl RET
  334. comisd %xmm0, %xmm3
  335. je .L999
  /* Elements 0-6 did not match, so the eighth one is taken without a compare. */
  336. incl RET
  337. jmp .L999
  338. ALIGN_3
  339. .L25:
  /* Fewer than eight elements left: test a block of four if bit 4 of MM is set. */
  340. testl $4, MM
  341. je .L27
  342. movsd 0 * SIZE(XX), %xmm1
  343. movsd 1 * SIZE(XX), %xmm2
  344. movsd 2 * SIZE(XX), %xmm3
  345. movsd 3 * SIZE(XX), %xmm4
  346. #ifdef USE_ABS
  347. andpd %xmm7, %xmm1
  348. andpd %xmm7, %xmm2
  349. andpd %xmm7, %xmm3
  350. andpd %xmm7, %xmm4
  351. #endif
  352. addl $4 * SIZE, XX
  353. incl RET
  354. comisd %xmm0, %xmm1
  355. je .L999
  356. incl RET
  357. comisd %xmm0, %xmm2
  358. je .L999
  359. incl RET
  360. comisd %xmm0, %xmm3
  361. je .L999
  362. incl RET
  363. comisd %xmm0, %xmm4
  364. je .L999
  365. ALIGN_3
  366. .L27:
  /* Then a block of two if bit 2 of MM is set. */
  367. testl $2, MM
  368. je .L28
  369. movsd 0 * SIZE(XX), %xmm1
  370. movsd 1 * SIZE(XX), %xmm2
  371. #ifdef USE_ABS
  372. andpd %xmm7, %xmm1
  373. andpd %xmm7, %xmm2
  374. #endif
  375. addl $2 * SIZE, XX
  376. incl RET
  377. comisd %xmm0, %xmm1
  378. je .L999
  379. incl RET
  380. comisd %xmm0, %xmm2
  381. je .L999
  382. ALIGN_3
  383. .L28:
  /* Nothing matched so far: the next position is returned without a compare. */
  384. incl RET
  385. jmp .L999
  386. ALIGN_3
  387. .L50:
  388. /* Unaligned Mode */
  /*
   * Unit-stride scan for the extreme value without any alignment requirement:
   * each 128-bit operand is assembled from two 8-byte loads (movsd into the
   * low lane, movhpd into the high lane).  I = MM / 16 passes of the main
   * loop, each folding sixteen elements round-robin into %xmm0-%xmm3.
   */
  389. movl MM, I
  390. sarl $4, I
  391. jle .L55
  392. ALIGN_4
  393. .L51:
  394. #ifdef PREFETCH
  395. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  396. #endif
  397. movsd 0 * SIZE(XX), %xmm4
  398. movhpd 1 * SIZE(XX), %xmm4
  399. #ifdef USE_ABS
  400. andpd %xmm7, %xmm4
  401. #endif
  402. maxpd %xmm4, %xmm0
  403. movsd 2 * SIZE(XX), %xmm4
  404. movhpd 3 * SIZE(XX), %xmm4
  405. #ifdef USE_ABS
  406. andpd %xmm7, %xmm4
  407. #endif
  408. maxpd %xmm4, %xmm1
  409. movsd 4 * SIZE(XX), %xmm4
  410. movhpd 5 * SIZE(XX), %xmm4
  411. #ifdef USE_ABS
  412. andpd %xmm7, %xmm4
  413. #endif
  414. maxpd %xmm4, %xmm2
  415. movsd 6 * SIZE(XX), %xmm4
  416. movhpd 7 * SIZE(XX), %xmm4
  417. #ifdef USE_ABS
  418. andpd %xmm7, %xmm4
  419. #endif
  420. maxpd %xmm4, %xmm3
  421. #ifdef PREFETCH
  422. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
  423. #endif
  424. movsd 8 * SIZE(XX), %xmm4
  425. movhpd 9 * SIZE(XX), %xmm4
  426. #ifdef USE_ABS
  427. andpd %xmm7, %xmm4
  428. #endif
  429. maxpd %xmm4, %xmm0
  430. movsd 10 * SIZE(XX), %xmm4
  431. movhpd 11 * SIZE(XX), %xmm4
  432. #ifdef USE_ABS
  433. andpd %xmm7, %xmm4
  434. #endif
  435. maxpd %xmm4, %xmm1
  436. movsd 12 * SIZE(XX), %xmm4
  437. movhpd 13 * SIZE(XX), %xmm4
  438. #ifdef USE_ABS
  439. andpd %xmm7, %xmm4
  440. #endif
  441. maxpd %xmm4, %xmm2
  442. movsd 14 * SIZE(XX), %xmm4
  443. movhpd 15 * SIZE(XX), %xmm4
  444. #ifdef USE_ABS
  445. andpd %xmm7, %xmm4
  446. #endif
  447. maxpd %xmm4, %xmm3
  448. addl $16 * SIZE, XX
  449. decl I
  450. jg .L51
  451. ALIGN_4
  452. .L55:
  /* Tail: MM &= 15 leaves the leftover count; bits 8, 4, 2 and 1 are tested in
     turn and each set bit consumes that many elements. */
  453. andl $15, MM
  454. jle .L60
  455. testl $8, MM
  456. je .L56
  457. movsd 0 * SIZE(XX), %xmm4
  458. movhpd 1 * SIZE(XX), %xmm4
  459. #ifdef USE_ABS
  460. andpd %xmm7, %xmm4
  461. #endif
  462. maxpd %xmm4, %xmm0
  463. movsd 2 * SIZE(XX), %xmm4
  464. movhpd 3 * SIZE(XX), %xmm4
  465. #ifdef USE_ABS
  466. andpd %xmm7, %xmm4
  467. #endif
  468. maxpd %xmm4, %xmm1
  469. movsd 4 * SIZE(XX), %xmm4
  470. movhpd 5 * SIZE(XX), %xmm4
  471. #ifdef USE_ABS
  472. andpd %xmm7, %xmm4
  473. #endif
  474. maxpd %xmm4, %xmm2
  475. movsd 6 * SIZE(XX), %xmm4
  476. movhpd 7 * SIZE(XX), %xmm4
  477. #ifdef USE_ABS
  478. andpd %xmm7, %xmm4
  479. #endif
  480. maxpd %xmm4, %xmm3
  481. addl $8 * SIZE, XX
  482. ALIGN_3
  483. .L56:
  /* Four leftover elements. */
  484. testl $4, MM
  485. je .L57
  486. movsd 0 * SIZE(XX), %xmm4
  487. movhpd 1 * SIZE(XX), %xmm4
  488. #ifdef USE_ABS
  489. andpd %xmm7, %xmm4
  490. #endif
  491. maxpd %xmm4, %xmm0
  492. movsd 2 * SIZE(XX), %xmm4
  493. movhpd 3 * SIZE(XX), %xmm4
  494. #ifdef USE_ABS
  495. andpd %xmm7, %xmm4
  496. #endif
  497. maxpd %xmm4, %xmm1
  498. addl $4 * SIZE, XX
  499. ALIGN_3
  500. .L57:
  /* Two leftover elements. */
  501. testl $2, MM
  502. je .L58
  503. movsd 0 * SIZE(XX), %xmm4
  504. movhpd 1 * SIZE(XX), %xmm4
  505. #ifdef USE_ABS
  506. andpd %xmm7, %xmm4
  507. #endif
  508. maxpd %xmm4, %xmm2
  509. addl $2 * SIZE, XX
  510. .L58:
  /* One leftover element, duplicated into both lanes before the packed max. */
  511. testl $1, MM
  512. je .L60
  513. movsd 0 * SIZE(XX), %xmm4
  514. unpcklpd %xmm4, %xmm4
  515. #ifdef USE_ABS
  516. andpd %xmm7, %xmm4
  517. #endif
  518. maxpd %xmm4, %xmm3
  519. ALIGN_3
  520. .L60:
  /*
   * Index search for the unaligned unit-stride path: rescan from the start of
   * the vector for the first element equal to the extreme.  RET is incremented
   * once per element examined, giving a 1-based position on a match.
   * NOTE(review): the fall-throughs assume a match exists; NaN behaviour has
   * not been verified here.
   */
  521. movl X, XX
  522. movl M, MM
  /* Reduce the four accumulators to one scalar and broadcast it in %xmm0. */
  523. maxpd %xmm1, %xmm0
  524. maxpd %xmm3, %xmm2
  525. maxpd %xmm2, %xmm0
  526. movapd %xmm0, %xmm1
  527. unpckhpd %xmm0, %xmm0
  528. maxsd %xmm1, %xmm0
  529. unpcklpd %xmm0, %xmm0
  /* I = MM / 8 groups of eight elements. */
  530. movl MM, I
  531. sarl $3, I
  532. jle .L65
  533. ALIGN_4
  534. .L62:
  /* Compare eight elements against the extreme, OR the four compare masks and
     extract the lane sign bits into TEMP; non-zero means a match is in this
     group.  TEMP shares its register with M, which is no longer needed. */
  535. #ifdef PREFETCH
  536. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  537. #endif
  538. movsd 0 * SIZE(XX), %xmm1
  539. movhpd 1 * SIZE(XX), %xmm1
  540. #ifdef USE_ABS
  541. andpd %xmm7, %xmm1
  542. #endif
  543. cmpeqpd %xmm0, %xmm1
  544. movsd 2 * SIZE(XX), %xmm2
  545. movhpd 3 * SIZE(XX), %xmm2
  546. #ifdef USE_ABS
  547. andpd %xmm7, %xmm2
  548. #endif
  549. cmpeqpd %xmm0, %xmm2
  550. movsd 4 * SIZE(XX), %xmm3
  551. movhpd 5 * SIZE(XX), %xmm3
  552. #ifdef USE_ABS
  553. andpd %xmm7, %xmm3
  554. #endif
  555. cmpeqpd %xmm0, %xmm3
  556. movsd 6 * SIZE(XX), %xmm4
  557. movhpd 7 * SIZE(XX), %xmm4
  558. #ifdef USE_ABS
  559. andpd %xmm7, %xmm4
  560. #endif
  561. cmpeqpd %xmm0, %xmm4
  562. orpd %xmm2, %xmm1
  563. orpd %xmm4, %xmm3
  564. orpd %xmm3, %xmm1
  565. movmskpd %xmm1, TEMP
  566. testl $3, TEMP
  567. jne .L63
  /* No match in this group: skip it and account for its eight elements. */
  568. addl $8 * SIZE, XX
  569. addl $8, RET
  570. decl I
  571. jg .L62
  572. jmp .L65
  573. ALIGN_4
  574. .L63:
  /* The current group holds a match: test its elements one at a time with RET
     bumped ahead of each compare.  XX still points at the start of the group. */
  575. movsd 0 * SIZE(XX), %xmm1
  576. movsd 1 * SIZE(XX), %xmm2
  577. movsd 2 * SIZE(XX), %xmm3
  578. movsd 3 * SIZE(XX), %xmm4
  579. #ifdef USE_ABS
  580. andpd %xmm7, %xmm1
  581. andpd %xmm7, %xmm2
  582. andpd %xmm7, %xmm3
  583. andpd %xmm7, %xmm4
  584. #endif
  585. incl RET
  586. comisd %xmm0, %xmm1
  587. je .L999
  588. incl RET
  589. comisd %xmm0, %xmm2
  590. je .L999
  591. incl RET
  592. comisd %xmm0, %xmm3
  593. je .L999
  594. incl RET
  595. comisd %xmm0, %xmm4
  596. je .L999
  /* The increment for the fifth element is issued before its load here; the
     compare for it follows the loads below. */
  597. incl RET
  598. movsd 4 * SIZE(XX), %xmm1
  599. movsd 5 * SIZE(XX), %xmm2
  600. movsd 6 * SIZE(XX), %xmm3
  601. #ifdef USE_ABS
  602. andpd %xmm7, %xmm1
  603. andpd %xmm7, %xmm2
  604. andpd %xmm7, %xmm3
  605. #endif
  606. comisd %xmm0, %xmm1
  607. je .L999
  608. incl RET
  609. comisd %xmm0, %xmm2
  610. je .L999
  611. incl RET
  612. comisd %xmm0, %xmm3
  613. je .L999
  /* Elements 0-6 did not match, so the eighth one is taken without a compare. */
  614. incl RET
  615. jmp .L999
  616. ALIGN_3
  617. .L65:
  /* Fewer than eight elements left: test a block of four if bit 4 of MM is set. */
  618. testl $4, MM
  619. je .L67
  620. movsd 0 * SIZE(XX), %xmm1
  621. movsd 1 * SIZE(XX), %xmm2
  622. movsd 2 * SIZE(XX), %xmm3
  623. movsd 3 * SIZE(XX), %xmm4
  624. #ifdef USE_ABS
  625. andpd %xmm7, %xmm1
  626. andpd %xmm7, %xmm2
  627. andpd %xmm7, %xmm3
  628. andpd %xmm7, %xmm4
  629. #endif
  630. addl $4 * SIZE, XX
  631. incl RET
  632. comisd %xmm0, %xmm1
  633. je .L999
  634. incl RET
  635. comisd %xmm0, %xmm2
  636. je .L999
  637. incl RET
  638. comisd %xmm0, %xmm3
  639. je .L999
  640. incl RET
  641. comisd %xmm0, %xmm4
  642. je .L999
  643. ALIGN_3
  644. .L67:
  /* Then a block of two if bit 2 of MM is set. */
  645. testl $2, MM
  646. je .L68
  647. movsd 0 * SIZE(XX), %xmm1
  648. movsd 1 * SIZE(XX), %xmm2
  649. #ifdef USE_ABS
  650. andpd %xmm7, %xmm1
  651. andpd %xmm7, %xmm2
  652. #endif
  653. addl $2 * SIZE, XX
  654. incl RET
  655. comisd %xmm0, %xmm1
  656. je .L999
  657. incl RET
  658. comisd %xmm0, %xmm2
  659. je .L999
  660. ALIGN_3
  661. .L68:
  /* Nothing matched so far: the next position is returned without a compare. */
  662. incl RET
  663. jmp .L999
  664. ALIGN_4
  665. .L80:
  /*
   * Strided scan for the extreme value (byte stride INCX differs from SIZE).
   * Every element is loaded from 0(XX) and XX is then advanced by INCX; pairs
   * of elements are packed with movsd (low lane) + movhpd (high lane) and
   * folded round-robin into %xmm0-%xmm3.  I = MM / 16 passes of sixteen
   * elements each.
   */
  666. movl MM, I
  667. sarl $4, I
  668. jle .L85
  669. ALIGN_4
  670. .L81:
  671. #ifdef PREFETCH
  672. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  673. #endif
  674. movsd 0 * SIZE(XX), %xmm4
  675. addl INCX, XX
  676. movhpd 0 * SIZE(XX), %xmm4
  677. addl INCX, XX
  678. #ifdef USE_ABS
  679. andpd %xmm7, %xmm4
  680. #endif
  681. maxpd %xmm4, %xmm0
  682. movsd 0 * SIZE(XX), %xmm4
  683. addl INCX, XX
  684. movhpd 0 * SIZE(XX), %xmm4
  685. addl INCX, XX
  686. #ifdef USE_ABS
  687. andpd %xmm7, %xmm4
  688. #endif
  689. maxpd %xmm4, %xmm1
  690. movsd 0 * SIZE(XX), %xmm4
  691. addl INCX, XX
  692. movhpd 0 * SIZE(XX), %xmm4
  693. addl INCX, XX
  694. #ifdef USE_ABS
  695. andpd %xmm7, %xmm4
  696. #endif
  697. maxpd %xmm4, %xmm2
  698. movsd 0 * SIZE(XX), %xmm4
  699. addl INCX, XX
  700. movhpd 0 * SIZE(XX), %xmm4
  701. addl INCX, XX
  702. #ifdef USE_ABS
  703. andpd %xmm7, %xmm4
  704. #endif
  705. maxpd %xmm4, %xmm3
  /* Second prefetch of the pass; XX has already moved eight strides forward. */
  706. #ifdef PREFETCH
  707. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  708. #endif
  709. movsd 0 * SIZE(XX), %xmm4
  710. addl INCX, XX
  711. movhpd 0 * SIZE(XX), %xmm4
  712. addl INCX, XX
  713. #ifdef USE_ABS
  714. andpd %xmm7, %xmm4
  715. #endif
  716. maxpd %xmm4, %xmm0
  717. movsd 0 * SIZE(XX), %xmm4
  718. addl INCX, XX
  719. movhpd 0 * SIZE(XX), %xmm4
  720. addl INCX, XX
  721. #ifdef USE_ABS
  722. andpd %xmm7, %xmm4
  723. #endif
  724. maxpd %xmm4, %xmm1
  725. movsd 0 * SIZE(XX), %xmm4
  726. addl INCX, XX
  727. movhpd 0 * SIZE(XX), %xmm4
  728. addl INCX, XX
  729. #ifdef USE_ABS
  730. andpd %xmm7, %xmm4
  731. #endif
  732. maxpd %xmm4, %xmm2
  733. movsd 0 * SIZE(XX), %xmm4
  734. addl INCX, XX
  735. movhpd 0 * SIZE(XX), %xmm4
  736. addl INCX, XX
  737. #ifdef USE_ABS
  738. andpd %xmm7, %xmm4
  739. #endif
  740. maxpd %xmm4, %xmm3
  741. decl I
  742. jg .L81
  743. ALIGN_4
  744. .L85:
  /* Tail: MM &= 15 leaves the leftover count; bits 8, 4, 2 and 1 are tested in
     turn and each set bit consumes that many elements. */
  745. andl $15, MM
  746. jle .L90
  747. testl $8, MM
  748. je .L86
  749. movsd 0 * SIZE(XX), %xmm4
  750. addl INCX, XX
  751. movhpd 0 * SIZE(XX), %xmm4
  752. addl INCX, XX
  753. #ifdef USE_ABS
  754. andpd %xmm7, %xmm4
  755. #endif
  756. maxpd %xmm4, %xmm0
  757. movsd 0 * SIZE(XX), %xmm4
  758. addl INCX, XX
  759. movhpd 0 * SIZE(XX), %xmm4
  760. addl INCX, XX
  761. #ifdef USE_ABS
  762. andpd %xmm7, %xmm4
  763. #endif
  764. maxpd %xmm4, %xmm1
  765. movsd 0 * SIZE(XX), %xmm4
  766. addl INCX, XX
  767. movhpd 0 * SIZE(XX), %xmm4
  768. addl INCX, XX
  769. #ifdef USE_ABS
  770. andpd %xmm7, %xmm4
  771. #endif
  772. maxpd %xmm4, %xmm2
  773. movsd 0 * SIZE(XX), %xmm4
  774. addl INCX, XX
  775. movhpd 0 * SIZE(XX), %xmm4
  776. addl INCX, XX
  777. #ifdef USE_ABS
  778. andpd %xmm7, %xmm4
  779. #endif
  780. maxpd %xmm4, %xmm3
  781. ALIGN_3
  782. .L86:
  /* Four leftover elements. */
  783. testl $4, MM
  784. je .L87
  785. movsd 0 * SIZE(XX), %xmm4
  786. addl INCX, XX
  787. movhpd 0 * SIZE(XX), %xmm4
  788. addl INCX, XX
  789. #ifdef USE_ABS
  790. andpd %xmm7, %xmm4
  791. #endif
  792. maxpd %xmm4, %xmm0
  793. movsd 0 * SIZE(XX), %xmm4
  794. addl INCX, XX
  795. movhpd 0 * SIZE(XX), %xmm4
  796. addl INCX, XX
  797. #ifdef USE_ABS
  798. andpd %xmm7, %xmm4
  799. #endif
  800. maxpd %xmm4, %xmm1
  801. ALIGN_3
  802. .L87:
  /* Two leftover elements. */
  803. testl $2, MM
  804. je .L88
  805. movsd 0 * SIZE(XX), %xmm4
  806. addl INCX, XX
  807. movhpd 0 * SIZE(XX), %xmm4
  808. addl INCX, XX
  809. #ifdef USE_ABS
  810. andpd %xmm7, %xmm4
  811. #endif
  812. maxpd %xmm4, %xmm2
  813. ALIGN_3
  814. .L88:
  /* One leftover element, duplicated into both lanes before the packed max. */
  815. testl $1, MM
  816. je .L90
  817. movsd 0 * SIZE(XX), %xmm4
  818. #ifdef USE_ABS
  819. andpd %xmm7, %xmm4
  820. #endif
  821. unpcklpd %xmm4, %xmm4
  822. maxpd %xmm4, %xmm3
  823. ALIGN_4
  824. .L90:
  /*
   * Index search for the strided path: rescan from the start of the vector,
   * stepping XX by INCX per element, for the first element equal to the
   * extreme.  RET is incremented once per element examined, giving a 1-based
   * position on a match.
   * NOTE(review): the fall-throughs assume a match exists; NaN behaviour has
   * not been verified here.
   */
  825. movl X, XX
  826. movl M, MM
  /* Reduce the four accumulators to one scalar and broadcast it in %xmm0. */
  827. maxpd %xmm1, %xmm0
  828. maxpd %xmm3, %xmm2
  829. maxpd %xmm2, %xmm0
  830. movapd %xmm0, %xmm1
  831. unpckhpd %xmm0, %xmm0
  832. maxsd %xmm1, %xmm0
  833. unpcklpd %xmm0, %xmm0
  /* I = MM / 8 groups of eight elements. */
  834. movl MM, I
  835. sarl $3, I
  836. jle .L95
  837. ALIGN_4
  838. .L92:
  /* Gather eight strided elements into four registers, compare each against
     the extreme, OR the masks and extract the lane sign bits into TEMP.
     XX advances by eight strides while the group is being loaded. */
  839. #ifdef PREFETCH
  840. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
  841. #endif
  842. movsd 0 * SIZE(XX), %xmm1
  843. addl INCX, XX
  844. movhpd 0 * SIZE(XX), %xmm1
  845. addl INCX, XX
  846. #ifdef USE_ABS
  847. andpd %xmm7, %xmm1
  848. #endif
  849. cmpeqpd %xmm0, %xmm1
  850. movsd 0 * SIZE(XX), %xmm2
  851. addl INCX, XX
  852. movhpd 0 * SIZE(XX), %xmm2
  853. addl INCX, XX
  854. #ifdef USE_ABS
  855. andpd %xmm7, %xmm2
  856. #endif
  857. cmpeqpd %xmm0, %xmm2
  858. movsd 0 * SIZE(XX), %xmm3
  859. addl INCX, XX
  860. movhpd 0 * SIZE(XX), %xmm3
  861. addl INCX, XX
  862. #ifdef USE_ABS
  863. andpd %xmm7, %xmm3
  864. #endif
  865. cmpeqpd %xmm0, %xmm3
  866. movsd 0 * SIZE(XX), %xmm4
  867. addl INCX, XX
  868. movhpd 0 * SIZE(XX), %xmm4
  869. addl INCX, XX
  870. #ifdef USE_ABS
  871. andpd %xmm7, %xmm4
  872. #endif
  873. cmpeqpd %xmm0, %xmm4
  874. orpd %xmm2, %xmm1
  875. orpd %xmm4, %xmm3
  876. orpd %xmm3, %xmm1
  877. movmskpd %xmm1, TEMP
  878. testl $3, TEMP
  879. jne .L93
  /* No match: XX is already past the group, so only RET needs the +8. */
  880. addl $8, RET
  881. decl I
  882. jg .L92
  883. jmp .L95
  884. ALIGN_4
  885. .L93:
  /* A match lies in the group just loaded.  Rewind XX by eight strides
     (TEMP = INCX * 8; the 8 is the group size, INCX is already in bytes) and
     walk the group one element at a time. */
  886. leal (, INCX, 8), TEMP
  887. subl TEMP, XX
  888. movsd 0 * SIZE(XX), %xmm1
  889. addl INCX, XX
  890. movsd 0 * SIZE(XX), %xmm2
  891. addl INCX, XX
  892. movsd 0 * SIZE(XX), %xmm3
  893. addl INCX, XX
  894. movsd 0 * SIZE(XX), %xmm4
  895. addl INCX, XX
  896. #ifdef USE_ABS
  897. andpd %xmm7, %xmm1
  898. andpd %xmm7, %xmm2
  899. andpd %xmm7, %xmm3
  900. andpd %xmm7, %xmm4
  901. #endif
  902. incl RET
  903. comisd %xmm0, %xmm1
  904. je .L999
  905. incl RET
  906. comisd %xmm0, %xmm2
  907. je .L999
  908. incl RET
  909. comisd %xmm0, %xmm3
  910. je .L999
  911. incl RET
  912. comisd %xmm0, %xmm4
  913. je .L999
  914. movsd 0 * SIZE(XX), %xmm1
  915. addl INCX, XX
  916. movsd 0 * SIZE(XX), %xmm2
  917. addl INCX, XX
  918. movsd 0 * SIZE(XX), %xmm3
  919. #ifdef USE_ABS
  920. andpd %xmm7, %xmm1
  921. andpd %xmm7, %xmm2
  922. andpd %xmm7, %xmm3
  923. #endif
  924. incl RET
  925. comisd %xmm0, %xmm1
  926. je .L999
  927. incl RET
  928. comisd %xmm0, %xmm2
  929. je .L999
  930. incl RET
  931. comisd %xmm0, %xmm3
  932. je .L999
  /* Elements 0-6 did not match, so the eighth one is taken without a compare. */
  933. incl RET
  934. jmp .L999
  935. ALIGN_3
  936. .L95:
  /* Fewer than eight elements left: test a block of four if bit 4 of MM is set. */
  937. testl $4, MM
  938. je .L97
  939. movsd 0 * SIZE(XX), %xmm1
  940. addl INCX, XX
  941. movsd 0 * SIZE(XX), %xmm2
  942. addl INCX, XX
  943. movsd 0 * SIZE(XX), %xmm3
  944. addl INCX, XX
  945. movsd 0 * SIZE(XX), %xmm4
  946. addl INCX, XX
  947. #ifdef USE_ABS
  948. andpd %xmm7, %xmm1
  949. andpd %xmm7, %xmm2
  950. andpd %xmm7, %xmm3
  951. andpd %xmm7, %xmm4
  952. #endif
  953. incl RET
  954. comisd %xmm0, %xmm1
  955. je .L999
  956. incl RET
  957. comisd %xmm0, %xmm2
  958. je .L999
  959. incl RET
  960. comisd %xmm0, %xmm3
  961. je .L999
  962. incl RET
  963. comisd %xmm0, %xmm4
  964. je .L999
  965. ALIGN_3
  966. .L97:
  /* Then a block of two if bit 2 of MM is set. */
  967. testl $2, MM
  968. je .L98
  969. movsd 0 * SIZE(XX), %xmm1
  970. addl INCX, XX
  971. movsd 0 * SIZE(XX), %xmm2
  972. addl INCX, XX
  973. #ifdef USE_ABS
  974. andpd %xmm7, %xmm1
  975. andpd %xmm7, %xmm2
  976. #endif
  977. incl RET
  978. comisd %xmm0, %xmm1
  979. je .L999
  980. incl RET
  981. comisd %xmm0, %xmm2
  982. je .L999
  983. ALIGN_3
  984. .L98:
  /* Nothing matched so far: count the next position and fall through into
     the common exit that follows. */
  985. incl RET
  986. ALIGN_3
  987. .L999:
  /* Common exit: restore the saved registers in the reverse of the order they
     were pushed in the prologue and return.  The result is whatever RET
     (%eax) holds at this point. */
  988. popl %ebx
  989. popl %esi
  990. popl %edi
  991. popl %ebp
  992. ret
  993. EPILOGUE