You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

swap_sse.S 21 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it selects
     the assembler-facing parts of that header - confirm against common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK: bytes taken by the four registers pushed on entry (4 x pushl).
     ARGS:  extra displacement added to every argument offset; zero here. */
  40. #define STACK 16
  41. #define ARGS 0
  /* Stack locations of the arguments, relative to %esp after the pushes.
     The leading 4 presumably skips the return address (confirm against the
     caller's calling convention).  Offsets 8..16 are not read by this file. */
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 20 + STACK + ARGS(%esp)
  44. #define STACK_INCX 24 + STACK + ARGS(%esp)
  45. #define STACK_Y 28 + STACK + ARGS(%esp)
  46. #define STACK_INCY 32 + STACK + ARGS(%esp)
  /* Register aliases: M = element count, X / Y = current element pointers,
     INCX / INCY = strides.  The code shifts %ebx and %ecx by BASE_SHIFT
     directly, i.e. INCX and INCY, turning them into byte strides. */
  47. #define M %edx
  48. #define X %esi
  49. #define Y %edi
  50. #define INCX %ebx
  51. #define INCY %ecx
  52. #include "l1param.h"
  /*
   * Swap kernel entry: exchanges M elements between the vectors at X and Y.
   *
   * In:    M, X, INCX, Y, INCY are read from the stack through the STACK_*
   *        offsets; INCX / INCY arrive as element strides and are scaled to
   *        byte strides with BASE_SHIFT.
   * Saves: %ebp, %edi, %esi, %ebx (restored by every exit path).
   * Uses:  %eax, %ecx, %edx, %xmm0-%xmm3, flags.
   *
   * Flow of this part:
   *   - M <= 0: nothing to do, leave through .L19.
   *   - either stride != SIZE: strided scalar path at .L50.
   *   - otherwise X and Y are biased by +32 * SIZE so the unrolled code can
   *     address elements with displacements starting at -32 * SIZE
   *     (presumably to keep the displacements short - confirm).
   *   - M <= 3: scalar tail at .L16.
   *   - swap one and/or two leading elements until the SIZE and 2 * SIZE bits
   *     of Y's address are clear, then pick a loop according to the same two
   *     bits of X's address: .L30 (2 * SIZE bit set; it tests the SIZE bit
   *     itself), .L20 (only the SIZE bit set), or the fall-through loop when
   *     both are clear.
   */
  53. PROLOGUE
  54. PROFCODE
  55. pushl %ebp
  56. pushl %edi
  57. pushl %esi
  58. pushl %ebx
  59. movl STACK_M, M
      /* Reject M <= 0 up front.  Without this guard a negative M reaches the
         tail code, which only looks at the low bits of M and would swap up
         to 3 (contiguous) or 7 (strided) elements. */
      testl M, M
      jle .L19
  60. movl STACK_X, X
  61. movl STACK_Y, Y
  62. movl STACK_INCX, INCX
  63. movl STACK_INCY, INCY
      /* Element strides -> byte strides (%ebx is INCX, %ecx is INCY). */
  64. sall $BASE_SHIFT, %ebx
  65. sall $BASE_SHIFT, %ecx
      /* Any non-unit stride goes to the strided scalar path. */
  66. cmpl $SIZE, INCX
  67. jne .L50
  68. cmpl $SIZE, INCY
  69. jne .L50
      /* Bias both pointers by +32 * SIZE (subtracting a negative constant). */
  70. subl $-32 * SIZE, X
  71. subl $-32 * SIZE, Y
      /* Too short for the vector paths: handle with the scalar tail. */
  72. cmpl $3, M
  73. jle .L16
      /* If the SIZE bit of Y's address is set, swap one element to clear it. */
  74. testl $SIZE, Y
  75. je .L05
  76. movss -32 * SIZE(X), %xmm0
  77. movss -32 * SIZE(Y), %xmm1
  78. movss %xmm1, -32 * SIZE(X)
  79. movss %xmm0, -32 * SIZE(Y)
  80. addl $1 * SIZE, X
  81. addl $1 * SIZE, Y
  82. decl M
  83. ALIGN_3
  84. .L05:
      /* If the 2 * SIZE bit of Y's address is set, swap two elements
         (movsd / movlps move 8 bytes) to clear it. */
  85. testl $2 * SIZE, Y
  86. je .L10
  87. movsd -32 * SIZE(X), %xmm0
  88. movsd -32 * SIZE(Y), %xmm1
  89. movlps %xmm1, -32 * SIZE(X)
  90. movlps %xmm0, -32 * SIZE(Y)
  91. addl $2 * SIZE, X
  92. addl $2 * SIZE, Y
  93. subl $2, M
  94. jle .L19
  95. ALIGN_3
  96. .L10:
      /* Re-check the length after the alignment swaps. */
  97. cmpl $3, M
  98. jle .L16
      /* Dispatch on the low address bits of X. */
  99. testl $2 * SIZE, X
  100. jne .L30
  101. testl $1 * SIZE, X
  102. jne .L20
      /* Both bits clear: %eax = M >> 5 = number of 32-element iterations. */
  103. movl M, %eax
  104. sarl $5, %eax
  105. jle .L13
  106. ALIGN_3
  /*
   * Contiguous path where both X and Y are accessed with movaps.  It is
   * entered by fall-through from .L10 when neither the SIZE nor the
   * 2 * SIZE bit of X's address is set; the code before .L10 has already
   * stepped Y past those bits.
   * Each iteration swaps 32 elements as eight groups of four: load a vector
   * from X and from Y, then store them crosswise.  %eax counts iterations.
   * PREFETCHW, when defined, is issued for X and Y alternately.
   */
  107. .L11:
  108. #ifdef PREFETCHW
  109. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  110. #endif
  111. movaps -32 * SIZE(X), %xmm0
  112. movaps -32 * SIZE(Y), %xmm1
  113. movaps %xmm0, -32 * SIZE(Y)
  114. movaps %xmm1, -32 * SIZE(X)
  115. movaps -28 * SIZE(X), %xmm0
  116. movaps -28 * SIZE(Y), %xmm1
  117. movaps %xmm0, -28 * SIZE(Y)
  118. movaps %xmm1, -28 * SIZE(X)
  119. #ifdef PREFETCHW
  120. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  121. #endif
  122. movaps -24 * SIZE(X), %xmm0
  123. movaps -24 * SIZE(Y), %xmm1
  124. movaps %xmm0, -24 * SIZE(Y)
  125. movaps %xmm1, -24 * SIZE(X)
  126. movaps -20 * SIZE(X), %xmm0
  127. movaps -20 * SIZE(Y), %xmm1
  128. movaps %xmm0, -20 * SIZE(Y)
  129. movaps %xmm1, -20 * SIZE(X)
  130. #ifdef PREFETCHW
  131. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  132. #endif
  133. movaps -16 * SIZE(X), %xmm0
  134. movaps -16 * SIZE(Y), %xmm1
  135. movaps %xmm0, -16 * SIZE(Y)
  136. movaps %xmm1, -16 * SIZE(X)
  137. movaps -12 * SIZE(X), %xmm0
  138. movaps -12 * SIZE(Y), %xmm1
  139. movaps %xmm0, -12 * SIZE(Y)
  140. movaps %xmm1, -12 * SIZE(X)
  141. #ifdef PREFETCHW
  142. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  143. #endif
  144. movaps -8 * SIZE(X), %xmm0
  145. movaps -8 * SIZE(Y), %xmm1
  146. movaps %xmm0, -8 * SIZE(Y)
  147. movaps %xmm1, -8 * SIZE(X)
  148. movaps -4 * SIZE(X), %xmm0
  149. movaps -4 * SIZE(Y), %xmm1
  150. movaps %xmm0, -4 * SIZE(Y)
  151. movaps %xmm1, -4 * SIZE(X)
      /* Advance both pointers by 32 elements (subtracting a negative). */
  152. subl $-32 * SIZE, Y
  153. subl $-32 * SIZE, X
  154. decl %eax
  155. jg .L11
  156. ALIGN_3
  /*
   * Remainder, driven by the individual bits of M (16, 8, 4, 2, 1).
   * testl clears OF, and these masks never set the sign bit of the result,
   * so the jle after each testl is taken exactly when the tested bit is 0.
   */
  157. .L13:
      /* 16 more elements. */
  158. testl $16, M
  159. jle .L14
  160. movaps -32 * SIZE(X), %xmm0
  161. movaps -32 * SIZE(Y), %xmm1
  162. movaps %xmm0, -32 * SIZE(Y)
  163. movaps %xmm1, -32 * SIZE(X)
  164. movaps -28 * SIZE(X), %xmm0
  165. movaps -28 * SIZE(Y), %xmm1
  166. movaps %xmm0, -28 * SIZE(Y)
  167. movaps %xmm1, -28 * SIZE(X)
  168. movaps -24 * SIZE(X), %xmm0
  169. movaps -24 * SIZE(Y), %xmm1
  170. movaps %xmm0, -24 * SIZE(Y)
  171. movaps %xmm1, -24 * SIZE(X)
  172. movaps -20 * SIZE(X), %xmm0
  173. movaps -20 * SIZE(Y), %xmm1
  174. movaps %xmm0, -20 * SIZE(Y)
  175. movaps %xmm1, -20 * SIZE(X)
  176. addl $16 * SIZE, X
  177. addl $16 * SIZE, Y
  178. ALIGN_3
  179. .L14:
      /* 8 more elements. */
  180. testl $8, M
  181. jle .L15
  182. movaps -32 * SIZE(X), %xmm0
  183. movaps -32 * SIZE(Y), %xmm1
  184. movaps %xmm0, -32 * SIZE(Y)
  185. movaps %xmm1, -32 * SIZE(X)
  186. movaps -28 * SIZE(X), %xmm0
  187. movaps -28 * SIZE(Y), %xmm1
  188. movaps %xmm0, -28 * SIZE(Y)
  189. movaps %xmm1, -28 * SIZE(X)
  190. addl $8 * SIZE, X
  191. addl $8 * SIZE, Y
  192. ALIGN_3
  193. .L15:
      /* 4 more elements. */
  194. testl $4, M
  195. jle .L16
  196. movaps -32 * SIZE(X), %xmm0
  197. movaps -32 * SIZE(Y), %xmm1
  198. movaps %xmm0, -32 * SIZE(Y)
  199. movaps %xmm1, -32 * SIZE(X)
  200. addl $4 * SIZE, X
  201. addl $4 * SIZE, Y
  202. ALIGN_3
  /*
   * .L16 is also jumped to directly when M <= 3, before any alignment has
   * been established, so it only uses 8-byte (movsd / movlps) and 4-byte
   * (movss) accesses.
   */
  203. .L16:
      /* 2 more elements. */
  204. testl $2, M
  205. jle .L17
  206. movsd -32 * SIZE(X), %xmm0
  207. movsd -32 * SIZE(Y), %xmm1
  208. movlps %xmm1, -32 * SIZE(X)
  209. addl $2 * SIZE, X
  210. movlps %xmm0, -32 * SIZE(Y)
  211. addl $2 * SIZE, Y
  212. ALIGN_3
  213. .L17:
      /* Last single element. */
  214. testl $1, M
  215. jle .L19
  216. movss -32 * SIZE(X), %xmm0
  217. movss -32 * SIZE(Y), %xmm1
  218. movss %xmm1, -32 * SIZE(X)
  219. movss %xmm0, -32 * SIZE(Y)
  220. ALIGN_3
  /* Exit: restore the registers pushed on entry, in reverse order. */
  221. .L19:
  222. popl %ebx
  223. popl %esi
  224. popl %edi
  225. popl %ebp
  226. ret
  227. ALIGN_3
  /*
   * Contiguous path for X whose address has the SIZE bit set and the
   * 2 * SIZE bit clear; Y is accessed with movaps at its natural offsets.
   * X is accessed with movaps one element below the logical position
   * (-33, -29, -25, ...), and every vector is recombined with the previous
   * one, which is carried in a register.
   *
   * Set-up: xmm0 = X[-1..2], xmm1 = Y[0..3].  Y[0..2] are written to X[0..2]
   * immediately; the matching X[0..2] -> Y stores happen later (in the loop,
   * or at .L26).  M is reduced by 3 so the remainder bits tested below
   * exclude the three elements that .L26 finishes.
   *
   * NOTE(review): lane descriptions assume PSHUFD2(imm, src, dst) and
   * PSHUFD1(imm, reg) expand to pshufd; the macros are defined outside this
   * file - confirm.
   */
  228. .L20:
  229. movaps -33 * SIZE(X), %xmm0
  230. movaps -32 * SIZE(Y), %xmm1
  231. movss %xmm1, -32 * SIZE(X)
  232. PSHUFD2($0x39, %xmm1, %xmm3)
  233. movlps %xmm3, -31 * SIZE(X)
  234. subl $3, M
  235. movl M, %eax
  236. sarl $5, %eax
  237. jle .L23
  238. ALIGN_4
  /*
   * Main loop, 32 elements per iteration in eight steps.  Each step:
   *   - loads the next X vector and the next Y vector;
   *   - movss + shufps $0x39 builds {carried X lanes 1..3, new X lane 0},
   *     which is stored to Y;
   *   - movss + shufps $0x93 builds {carried Y lane 3, new Y lanes 0..2},
   *     which is stored to X.
   * The pairs xmm0/xmm1 and xmm2/xmm3 swap the "carried" and "new" roles on
   * every step.
   */
  239. .L21:
  240. #ifdef PREFETCHW
  241. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  242. #endif
  243. movaps -29 * SIZE(X), %xmm2
  244. movaps -28 * SIZE(Y), %xmm3
  245. movss %xmm2, %xmm0
  246. shufps $0x39, %xmm0, %xmm0
  247. movaps %xmm0, -32 * SIZE(Y)
  248. movss %xmm3, %xmm1
  249. shufps $0x93, %xmm3, %xmm1
  250. movaps %xmm1, -29 * SIZE(X)
  251. movaps -25 * SIZE(X), %xmm0
  252. movaps -24 * SIZE(Y), %xmm1
  253. movss %xmm0, %xmm2
  254. shufps $0x39, %xmm2, %xmm2
  255. movaps %xmm2, -28 * SIZE(Y)
  256. movss %xmm1, %xmm3
  257. shufps $0x93, %xmm1, %xmm3
  258. movaps %xmm3, -25 * SIZE(X)
  259. #ifdef PREFETCHW
  260. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  261. #endif
  262. movaps -21 * SIZE(X), %xmm2
  263. movaps -20 * SIZE(Y), %xmm3
  264. movss %xmm2, %xmm0
  265. shufps $0x39, %xmm0, %xmm0
  266. movaps %xmm0, -24 * SIZE(Y)
  267. movss %xmm3, %xmm1
  268. shufps $0x93, %xmm3, %xmm1
  269. movaps %xmm1, -21 * SIZE(X)
  270. movaps -17 * SIZE(X), %xmm0
  271. movaps -16 * SIZE(Y), %xmm1
  272. movss %xmm0, %xmm2
  273. shufps $0x39, %xmm2, %xmm2
  274. movaps %xmm2, -20 * SIZE(Y)
  275. movss %xmm1, %xmm3
  276. shufps $0x93, %xmm1, %xmm3
  277. movaps %xmm3, -17 * SIZE(X)
  278. #ifdef PREFETCHW
  279. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  280. #endif
  281. movaps -13 * SIZE(X), %xmm2
  282. movaps -12 * SIZE(Y), %xmm3
  283. movss %xmm2, %xmm0
  284. shufps $0x39, %xmm0, %xmm0
  285. movaps %xmm0, -16 * SIZE(Y)
  286. movss %xmm3, %xmm1
  287. shufps $0x93, %xmm3, %xmm1
  288. movaps %xmm1, -13 * SIZE(X)
  289. movaps -9 * SIZE(X), %xmm0
  290. movaps -8 * SIZE(Y), %xmm1
  291. movss %xmm0, %xmm2
  292. shufps $0x39, %xmm2, %xmm2
  293. movaps %xmm2, -12 * SIZE(Y)
  294. movss %xmm1, %xmm3
  295. shufps $0x93, %xmm1, %xmm3
  296. movaps %xmm3, -9 * SIZE(X)
  297. #ifdef PREFETCHW
  298. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  299. #endif
  300. movaps -5 * SIZE(X), %xmm2
  301. movaps -4 * SIZE(Y), %xmm3
  302. movss %xmm2, %xmm0
  303. shufps $0x39, %xmm0, %xmm0
  304. movaps %xmm0, -8 * SIZE(Y)
  305. movss %xmm3, %xmm1
  306. shufps $0x93, %xmm3, %xmm1
  307. movaps %xmm1, -5 * SIZE(X)
  308. movaps -1 * SIZE(X), %xmm0
  309. movaps 0 * SIZE(Y), %xmm1
  310. movss %xmm0, %xmm2
  311. shufps $0x39, %xmm2, %xmm2
  312. movaps %xmm2, -4 * SIZE(Y)
  313. movss %xmm1, %xmm3
  314. shufps $0x93, %xmm1, %xmm3
  315. movaps %xmm3, -1 * SIZE(X)
  316. subl $-32 * SIZE, X
  317. subl $-32 * SIZE, Y
  318. decl %eax
  319. jg .L21
  320. ALIGN_3
  /*
   * Remainder by bits of (M - 3).  After testl with these masks jle is taken
   * exactly when the tested bit is 0.  Every block leaves the carried vectors
   * in xmm0 (X) and xmm1 (Y), as .L26 expects.
   */
  321. .L23:
      /* 16 more elements (four steps). */
  322. testl $16, M
  323. jle .L24
  324. movaps -29 * SIZE(X), %xmm2
  325. movaps -28 * SIZE(Y), %xmm3
  326. movss %xmm2, %xmm0
  327. shufps $0x39, %xmm0, %xmm0
  328. movaps %xmm0, -32 * SIZE(Y)
  329. movss %xmm3, %xmm1
  330. shufps $0x93, %xmm3, %xmm1
  331. movaps %xmm1, -29 * SIZE(X)
  332. movaps -25 * SIZE(X), %xmm0
  333. movaps -24 * SIZE(Y), %xmm1
  334. movss %xmm0, %xmm2
  335. shufps $0x39, %xmm2, %xmm2
  336. movaps %xmm2, -28 * SIZE(Y)
  337. movss %xmm1, %xmm3
  338. shufps $0x93, %xmm1, %xmm3
  339. movaps %xmm3, -25 * SIZE(X)
  340. movaps -21 * SIZE(X), %xmm2
  341. movaps -20 * SIZE(Y), %xmm3
  342. movss %xmm2, %xmm0
  343. shufps $0x39, %xmm0, %xmm0
  344. movaps %xmm0, -24 * SIZE(Y)
  345. movss %xmm3, %xmm1
  346. shufps $0x93, %xmm3, %xmm1
  347. movaps %xmm1, -21 * SIZE(X)
  348. movaps -17 * SIZE(X), %xmm0
  349. movaps -16 * SIZE(Y), %xmm1
  350. movss %xmm0, %xmm2
  351. shufps $0x39, %xmm2, %xmm2
  352. movaps %xmm2, -20 * SIZE(Y)
  353. movss %xmm1, %xmm3
  354. shufps $0x93, %xmm1, %xmm3
  355. movaps %xmm3, -17 * SIZE(X)
  356. addl $16 * SIZE, X
  357. addl $16 * SIZE, Y
  358. ALIGN_3
  359. .L24:
      /* 8 more elements (two steps). */
  360. testl $8, M
  361. jle .L25
  362. movaps -29 * SIZE(X), %xmm2
  363. movaps -28 * SIZE(Y), %xmm3
  364. movss %xmm2, %xmm0
  365. shufps $0x39, %xmm0, %xmm0
  366. movaps %xmm0, -32 * SIZE(Y)
  367. movss %xmm3, %xmm1
  368. shufps $0x93, %xmm3, %xmm1
  369. movaps %xmm1, -29 * SIZE(X)
  370. movaps -25 * SIZE(X), %xmm0
  371. movaps -24 * SIZE(Y), %xmm1
  372. movss %xmm0, %xmm2
  373. shufps $0x39, %xmm2, %xmm2
  374. movaps %xmm2, -28 * SIZE(Y)
  375. movss %xmm1, %xmm3
  376. shufps $0x93, %xmm1, %xmm3
  377. movaps %xmm3, -25 * SIZE(X)
  378. addl $8 * SIZE, X
  379. addl $8 * SIZE, Y
  380. ALIGN_3
  381. .L25:
      /* 4 more elements (one step); the freshly loaded vectors are then
         copied into xmm0 / xmm1 so they become the carried ones. */
  382. testl $4, M
  383. jle .L26
  384. movaps -29 * SIZE(X), %xmm2
  385. movaps -28 * SIZE(Y), %xmm3
  386. movss %xmm2, %xmm0
  387. shufps $0x39, %xmm0, %xmm0
  388. movaps %xmm0, -32 * SIZE(Y)
  389. movss %xmm3, %xmm1
  390. shufps $0x93, %xmm3, %xmm1
  391. movaps %xmm1, -29 * SIZE(X)
  392. movaps %xmm2, %xmm0
  393. movaps %xmm3, %xmm1
  394. addl $4 * SIZE, X
  395. addl $4 * SIZE, Y
  396. ALIGN_3
  /*
   * Flush the three X elements still held in xmm0 (lanes 1..3) to
   * Y at -32, -31 and -30; their X slots already contain the Y values.
   * X and Y are not advanced for these three, so the 2- and 1-element
   * tails below address -29 * SIZE.
   */
  397. .L26:
  398. PSHUFD2($0x39, %xmm0, %xmm2)
  399. PSHUFD1($0xff, %xmm0)
  400. movlps %xmm2, -32 * SIZE(Y)
  401. movss %xmm0, -30 * SIZE(Y)
      /* 2 more elements. */
  402. testl $2, M
  403. jle .L27
  404. movsd -29 * SIZE(X), %xmm0
  405. movsd -29 * SIZE(Y), %xmm1
  406. movlps %xmm0, -29 * SIZE(Y)
  407. movlps %xmm1, -29 * SIZE(X)
  408. addl $2 * SIZE, X
  409. addl $2 * SIZE, Y
  410. ALIGN_3
  411. .L27:
      /* Last single element. */
  412. testl $1, M
  413. jle .L29
  414. movss -29 * SIZE(X), %xmm0
  415. movss -29 * SIZE(Y), %xmm1
  416. movss %xmm0, -29 * SIZE(Y)
  417. movss %xmm1, -29 * SIZE(X)
  418. ALIGN_3
  /* Exit: restore the registers pushed on entry, in reverse order. */
  419. .L29:
  420. popl %ebx
  421. popl %esi
  422. popl %edi
  423. popl %ebp
  424. ret
  425. ALIGN_3
  /*
   * Reached when the 2 * SIZE bit of X's address is set.  If the SIZE bit is
   * set as well, control goes on to .L40; otherwise this path handles X two
   * elements off, with movaps on X at offsets -30, -26, -22, ...
   *
   * Set-up: the high half of xmm0 = X[0..1], xmm1 = Y[0..3]; Y[0..1] are
   * written to X[0..1] immediately.  M is reduced by 2 for the two elements
   * whose X -> Y store is finished at .L36.
   *
   * NOTE(review): the step descriptions assume "SHUFPD_1 src, dst" expands
   * to shufpd $1 (dst = {dst high half, src low half}); the macro is defined
   * outside this file - confirm.
   */
  426. .L30:
  427. testl $1 * SIZE, X
  428. jne .L40
  429. movhps -32 * SIZE(X), %xmm0
  430. movaps -32 * SIZE(Y), %xmm1
  431. movlps %xmm1, -32 * SIZE(X)
  432. subl $2, M
  433. movl M, %eax
  434. sarl $5, %eax
  435. jle .L33
  436. ALIGN_4
  /*
   * Main loop, 32 elements per iteration in eight steps.  Each step loads
   * the next X and Y vectors, joins the carried high half with the new low
   * half, and stores the result to Y (from X data) and to X (from Y data).
   * xmm0/xmm1 and xmm2/xmm3 swap the "carried" and "new" roles every step.
   */
  437. .L31:
  438. #ifdef PREFETCHW
  439. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  440. #endif
  441. movaps -30 * SIZE(X), %xmm2
  442. movaps -28 * SIZE(Y), %xmm3
  443. SHUFPD_1 %xmm2, %xmm0
  444. movaps %xmm0, -32 * SIZE(Y)
  445. SHUFPD_1 %xmm3, %xmm1
  446. movaps %xmm1, -30 * SIZE(X)
  447. movaps -26 * SIZE(X), %xmm0
  448. movaps -24 * SIZE(Y), %xmm1
  449. SHUFPD_1 %xmm0, %xmm2
  450. movaps %xmm2, -28 * SIZE(Y)
  451. SHUFPD_1 %xmm1, %xmm3
  452. movaps %xmm3, -26 * SIZE(X)
  453. #ifdef PREFETCHW
  454. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  455. #endif
  456. movaps -22 * SIZE(X), %xmm2
  457. movaps -20 * SIZE(Y), %xmm3
  458. SHUFPD_1 %xmm2, %xmm0
  459. movaps %xmm0, -24 * SIZE(Y)
  460. SHUFPD_1 %xmm3, %xmm1
  461. movaps %xmm1, -22 * SIZE(X)
  462. movaps -18 * SIZE(X), %xmm0
  463. movaps -16 * SIZE(Y), %xmm1
  464. SHUFPD_1 %xmm0, %xmm2
  465. movaps %xmm2, -20 * SIZE(Y)
  466. SHUFPD_1 %xmm1, %xmm3
  467. movaps %xmm3, -18 * SIZE(X)
  468. #ifdef PREFETCHW
  469. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  470. #endif
  471. movaps -14 * SIZE(X), %xmm2
  472. movaps -12 * SIZE(Y), %xmm3
  473. SHUFPD_1 %xmm2, %xmm0
  474. movaps %xmm0, -16 * SIZE(Y)
  475. SHUFPD_1 %xmm3, %xmm1
  476. movaps %xmm1, -14 * SIZE(X)
  477. movaps -10 * SIZE(X), %xmm0
  478. movaps -8 * SIZE(Y), %xmm1
  479. SHUFPD_1 %xmm0, %xmm2
  480. movaps %xmm2, -12 * SIZE(Y)
  481. SHUFPD_1 %xmm1, %xmm3
  482. movaps %xmm3, -10 * SIZE(X)
  483. #ifdef PREFETCHW
  484. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  485. #endif
  486. movaps -6 * SIZE(X), %xmm2
  487. movaps -4 * SIZE(Y), %xmm3
  488. SHUFPD_1 %xmm2, %xmm0
  489. movaps %xmm0, -8 * SIZE(Y)
  490. SHUFPD_1 %xmm3, %xmm1
  491. movaps %xmm1, -6 * SIZE(X)
  492. movaps -2 * SIZE(X), %xmm0
  493. movaps 0 * SIZE(Y), %xmm1
  494. SHUFPD_1 %xmm0, %xmm2
  495. movaps %xmm2, -4 * SIZE(Y)
  496. SHUFPD_1 %xmm1, %xmm3
  497. movaps %xmm3, -2 * SIZE(X)
  498. subl $-32 * SIZE, X
  499. subl $-32 * SIZE, Y
  500. decl %eax
  501. jg .L31
  502. ALIGN_3
  /*
   * Remainder by bits of (M - 2).  After testl with these masks jle is taken
   * exactly when the tested bit is 0.  Every block leaves the carried vectors
   * in xmm0 (X) and xmm1 (Y), as .L36 expects.
   */
  503. .L33:
      /* 16 more elements (four steps). */
  504. testl $16, M
  505. jle .L34
  506. movaps -30 * SIZE(X), %xmm2
  507. movaps -28 * SIZE(Y), %xmm3
  508. SHUFPD_1 %xmm2, %xmm0
  509. movaps %xmm0, -32 * SIZE(Y)
  510. SHUFPD_1 %xmm3, %xmm1
  511. movaps %xmm1, -30 * SIZE(X)
  512. movaps -26 * SIZE(X), %xmm0
  513. movaps -24 * SIZE(Y), %xmm1
  514. SHUFPD_1 %xmm0, %xmm2
  515. movaps %xmm2, -28 * SIZE(Y)
  516. SHUFPD_1 %xmm1, %xmm3
  517. movaps %xmm3, -26 * SIZE(X)
  518. movaps -22 * SIZE(X), %xmm2
  519. movaps -20 * SIZE(Y), %xmm3
  520. SHUFPD_1 %xmm2, %xmm0
  521. movaps %xmm0, -24 * SIZE(Y)
  522. SHUFPD_1 %xmm3, %xmm1
  523. movaps %xmm1, -22 * SIZE(X)
  524. movaps -18 * SIZE(X), %xmm0
  525. movaps -16 * SIZE(Y), %xmm1
  526. SHUFPD_1 %xmm0, %xmm2
  527. movaps %xmm2, -20 * SIZE(Y)
  528. SHUFPD_1 %xmm1, %xmm3
  529. movaps %xmm3, -18 * SIZE(X)
  530. addl $16 * SIZE, X
  531. addl $16 * SIZE, Y
  532. ALIGN_3
  533. .L34:
      /* 8 more elements (two steps). */
  534. testl $8, M
  535. jle .L35
  536. movaps -30 * SIZE(X), %xmm2
  537. movaps -28 * SIZE(Y), %xmm3
  538. SHUFPD_1 %xmm2, %xmm0
  539. movaps %xmm0, -32 * SIZE(Y)
  540. SHUFPD_1 %xmm3, %xmm1
  541. movaps %xmm1, -30 * SIZE(X)
  542. movaps -26 * SIZE(X), %xmm0
  543. movaps -24 * SIZE(Y), %xmm1
  544. SHUFPD_1 %xmm0, %xmm2
  545. movaps %xmm2, -28 * SIZE(Y)
  546. SHUFPD_1 %xmm1, %xmm3
  547. movaps %xmm3, -26 * SIZE(X)
  548. addl $8 * SIZE, X
  549. addl $8 * SIZE, Y
  550. ALIGN_3
  551. .L35:
      /* 4 more elements (one step); the freshly loaded vectors are then
         copied into xmm0 / xmm1 so they become the carried ones. */
  552. testl $4, M
  553. jle .L36
  554. movaps -30 * SIZE(X), %xmm2
  555. movaps -28 * SIZE(Y), %xmm3
  556. SHUFPD_1 %xmm2, %xmm0
  557. movaps %xmm0, -32 * SIZE(Y)
  558. SHUFPD_1 %xmm3, %xmm1
  559. movaps %xmm1, -30 * SIZE(X)
  560. movaps %xmm2, %xmm0
  561. movaps %xmm3, %xmm1
  562. addl $4 * SIZE, X
  563. addl $4 * SIZE, Y
  564. ALIGN_3
  /*
   * Flush the two X elements still held in the high half of xmm0 to Y;
   * their X slots already contain the Y values.  X and Y are not advanced
   * for these two, so the tails below address -30 * SIZE.
   */
  565. .L36:
  566. movhps %xmm0, -32 * SIZE(Y)
      /* 2 more elements. */
  567. testl $2, M
  568. jle .L37
  569. movsd -30 * SIZE(X), %xmm0
  570. movsd -30 * SIZE(Y), %xmm1
  571. movlps %xmm0, -30 * SIZE(Y)
  572. movlps %xmm1, -30 * SIZE(X)
  573. addl $2 * SIZE, X
  574. addl $2 * SIZE, Y
  575. ALIGN_3
  576. .L37:
      /* Last single element. */
  577. testl $1, M
  578. jle .L39
  579. movss -30 * SIZE(X), %xmm0
  580. movss -30 * SIZE(Y), %xmm1
  581. movss %xmm0, -30 * SIZE(Y)
  582. movss %xmm1, -30 * SIZE(X)
  583. ALIGN_3
  /* Exit: restore the registers pushed on entry, in reverse order. */
  584. .L39:
  585. popl %ebx
  586. popl %esi
  587. popl %edi
  588. popl %ebp
  589. ret
  590. ALIGN_3
  /*
   * Contiguous path for X whose address has both the SIZE and the 2 * SIZE
   * bits set; Y is accessed with movaps at its natural offsets.  X is
   * accessed with movaps three elements below the logical position
   * (-35, -31, -27, ...), and every vector is recombined with the previous
   * one, which is carried in a register.
   *
   * Set-up: xmm0 = X[-3..0], xmm1 = Y[0..3]; Y[0] is written to X[0]
   * immediately.  M is reduced by 3 for the three elements completed at
   * .L46.
   *
   * NOTE(review): the .L46 description assumes PSHUFD2(imm, src, dst) and
   * PSHUFD1(imm, reg) expand to pshufd; the macros are defined outside this
   * file - confirm.
   */
  591. .L40:
  592. movaps -35 * SIZE(X), %xmm0
  593. movaps -32 * SIZE(Y), %xmm1
  594. movss %xmm1, -32 * SIZE(X)
  595. subl $3, M
  596. movl M, %eax
  597. sarl $5, %eax
  598. jle .L43
  599. ALIGN_4
  /*
   * Main loop, 32 elements per iteration in eight steps.  Each step:
   *   - loads the next X vector and the next Y vector;
   *   - movss + shufps $0x93 builds {carried X lane 3, new X lanes 0..2},
   *     which is stored to Y;
   *   - movss + shufps $0x39 builds {carried Y lanes 1..3, new Y lane 0},
   *     which is stored to X.
   * The pairs xmm0/xmm1 and xmm2/xmm3 swap the "carried" and "new" roles on
   * every step.
   */
  600. .L41:
  601. #ifdef PREFETCHW
  602. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  603. #endif
  604. movaps -31 * SIZE(X), %xmm2
  605. movaps -28 * SIZE(Y), %xmm3
  606. movss %xmm2, %xmm0
  607. shufps $0x93, %xmm2, %xmm0
  608. movaps %xmm0, -32 * SIZE(Y)
  609. movss %xmm3, %xmm1
  610. shufps $0x39, %xmm1, %xmm1
  611. movaps %xmm1, -31 * SIZE(X)
  612. movaps -27 * SIZE(X), %xmm0
  613. movaps -24 * SIZE(Y), %xmm1
  614. movss %xmm0, %xmm2
  615. shufps $0x93, %xmm0, %xmm2
  616. movaps %xmm2, -28 * SIZE(Y)
  617. movss %xmm1, %xmm3
  618. shufps $0x39, %xmm3, %xmm3
  619. movaps %xmm3, -27 * SIZE(X)
  620. #ifdef PREFETCHW
  621. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  622. #endif
  623. movaps -23 * SIZE(X), %xmm2
  624. movaps -20 * SIZE(Y), %xmm3
  625. movss %xmm2, %xmm0
  626. shufps $0x93, %xmm2, %xmm0
  627. movaps %xmm0, -24 * SIZE(Y)
  628. movss %xmm3, %xmm1
  629. shufps $0x39, %xmm1, %xmm1
  630. movaps %xmm1, -23 * SIZE(X)
  631. movaps -19 * SIZE(X), %xmm0
  632. movaps -16 * SIZE(Y), %xmm1
  633. movss %xmm0, %xmm2
  634. shufps $0x93, %xmm0, %xmm2
  635. movaps %xmm2, -20 * SIZE(Y)
  636. movss %xmm1, %xmm3
  637. shufps $0x39, %xmm3, %xmm3
  638. movaps %xmm3, -19 * SIZE(X)
  639. #ifdef PREFETCHW
  640. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  641. #endif
  642. movaps -15 * SIZE(X), %xmm2
  643. movaps -12 * SIZE(Y), %xmm3
  644. movss %xmm2, %xmm0
  645. shufps $0x93, %xmm2, %xmm0
  646. movaps %xmm0, -16 * SIZE(Y)
  647. movss %xmm3, %xmm1
  648. shufps $0x39, %xmm1, %xmm1
  649. movaps %xmm1, -15 * SIZE(X)
  650. movaps -11 * SIZE(X), %xmm0
  651. movaps -8 * SIZE(Y), %xmm1
  652. movss %xmm0, %xmm2
  653. shufps $0x93, %xmm0, %xmm2
  654. movaps %xmm2, -12 * SIZE(Y)
  655. movss %xmm1, %xmm3
  656. shufps $0x39, %xmm3, %xmm3
  657. movaps %xmm3, -11 * SIZE(X)
  658. #ifdef PREFETCHW
  659. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  660. #endif
  661. movaps -7 * SIZE(X), %xmm2
  662. movaps -4 * SIZE(Y), %xmm3
  663. movss %xmm2, %xmm0
  664. shufps $0x93, %xmm2, %xmm0
  665. movaps %xmm0, -8 * SIZE(Y)
  666. movss %xmm3, %xmm1
  667. shufps $0x39, %xmm1, %xmm1
  668. movaps %xmm1, -7 * SIZE(X)
  669. movaps -3 * SIZE(X), %xmm0
  670. movaps 0 * SIZE(Y), %xmm1
  671. movss %xmm0, %xmm2
  672. shufps $0x93, %xmm0, %xmm2
  673. movaps %xmm2, -4 * SIZE(Y)
  674. movss %xmm1, %xmm3
  675. shufps $0x39, %xmm3, %xmm3
  676. movaps %xmm3, -3 * SIZE(X)
  677. subl $-32 * SIZE, X
  678. subl $-32 * SIZE, Y
  679. decl %eax
  680. jg .L41
  681. ALIGN_3
  /*
   * Remainder by bits of (M - 3).  After testl with these masks jle is taken
   * exactly when the tested bit is 0.  Every block leaves the carried vectors
   * in xmm0 (X) and xmm1 (Y), as .L46 expects.
   */
  682. .L43:
      /* 16 more elements (four steps). */
  683. testl $16, M
  684. jle .L44
  685. movaps -31 * SIZE(X), %xmm2
  686. movaps -28 * SIZE(Y), %xmm3
  687. movss %xmm2, %xmm0
  688. shufps $0x93, %xmm2, %xmm0
  689. movaps %xmm0, -32 * SIZE(Y)
  690. movss %xmm3, %xmm1
  691. shufps $0x39, %xmm1, %xmm1
  692. movaps %xmm1, -31 * SIZE(X)
  693. movaps -27 * SIZE(X), %xmm0
  694. movaps -24 * SIZE(Y), %xmm1
  695. movss %xmm0, %xmm2
  696. shufps $0x93, %xmm0, %xmm2
  697. movaps %xmm2, -28 * SIZE(Y)
  698. movss %xmm1, %xmm3
  699. shufps $0x39, %xmm3, %xmm3
  700. movaps %xmm3, -27 * SIZE(X)
  701. movaps -23 * SIZE(X), %xmm2
  702. movaps -20 * SIZE(Y), %xmm3
  703. movss %xmm2, %xmm0
  704. shufps $0x93, %xmm2, %xmm0
  705. movaps %xmm0, -24 * SIZE(Y)
  706. movss %xmm3, %xmm1
  707. shufps $0x39, %xmm1, %xmm1
  708. movaps %xmm1, -23 * SIZE(X)
  709. movaps -19 * SIZE(X), %xmm0
  710. movaps -16 * SIZE(Y), %xmm1
  711. movss %xmm0, %xmm2
  712. shufps $0x93, %xmm0, %xmm2
  713. movaps %xmm2, -20 * SIZE(Y)
  714. movss %xmm1, %xmm3
  715. shufps $0x39, %xmm3, %xmm3
  716. movaps %xmm3, -19 * SIZE(X)
  717. addl $16 * SIZE, X
  718. addl $16 * SIZE, Y
  719. ALIGN_3
  720. .L44:
      /* 8 more elements (two steps). */
  721. testl $8, M
  722. jle .L45
  723. movaps -31 * SIZE(X), %xmm2
  724. movaps -28 * SIZE(Y), %xmm3
  725. movss %xmm2, %xmm0
  726. shufps $0x93, %xmm2, %xmm0
  727. movaps %xmm0, -32 * SIZE(Y)
  728. movss %xmm3, %xmm1
  729. shufps $0x39, %xmm1, %xmm1
  730. movaps %xmm1, -31 * SIZE(X)
  731. movaps -27 * SIZE(X), %xmm0
  732. movaps -24 * SIZE(Y), %xmm1
  733. movss %xmm0, %xmm2
  734. shufps $0x93, %xmm0, %xmm2
  735. movaps %xmm2, -28 * SIZE(Y)
  736. movss %xmm1, %xmm3
  737. shufps $0x39, %xmm3, %xmm3
  738. movaps %xmm3, -27 * SIZE(X)
  739. addl $8 * SIZE, X
  740. addl $8 * SIZE, Y
  741. ALIGN_3
  742. .L45:
      /* 4 more elements (one step); the freshly loaded vectors are then
         copied into xmm0 / xmm1 so they become the carried ones. */
  743. testl $4, M
  744. jle .L46
  745. movaps -31 * SIZE(X), %xmm2
  746. movaps -28 * SIZE(Y), %xmm3
  747. movss %xmm2, %xmm0
  748. shufps $0x93, %xmm2, %xmm0
  749. movaps %xmm0, -32 * SIZE(Y)
  750. movss %xmm3, %xmm1
  751. shufps $0x39, %xmm1, %xmm1
  752. movaps %xmm1, -31 * SIZE(X)
  753. movaps %xmm2, %xmm0
  754. movaps %xmm3, %xmm1
  755. addl $4 * SIZE, X
  756. addl $4 * SIZE, Y
  757. ALIGN_3
  /*
   * Complete the three elements set aside by "subl $3, M":
   *   - the X value carried in lane 3 of xmm0 goes to Y at -32 (its X slot
   *     already holds the Y value);
   *   - the two elements at -31 and -30 are swapped, using X read via movsd
   *     and the Y values carried in lanes 1..2 of xmm1.
   * Both pointers then move on by 3, so the tails below address -32 * SIZE.
   */
  758. .L46:
  759. movsd -31 * SIZE(X), %xmm2
  760. PSHUFD2($0x39, %xmm1, %xmm1)
  761. movlps %xmm1, -31 * SIZE(X)
  762. PSHUFD1($0xff, %xmm0)
  763. movss %xmm0, -32 * SIZE(Y)
  764. movlps %xmm2, -31 * SIZE(Y)
  765. addl $3 * SIZE, X
  766. addl $3 * SIZE, Y
      /* 2 more elements. */
  767. testl $2, M
  768. jle .L47
  769. movsd -32 * SIZE(X), %xmm0
  770. movsd -32 * SIZE(Y), %xmm1
  771. movlps %xmm0, -32 * SIZE(Y)
  772. movlps %xmm1, -32 * SIZE(X)
  773. addl $2 * SIZE, X
  774. addl $2 * SIZE, Y
  775. ALIGN_3
  776. .L47:
      /* Last single element. */
  777. testl $1, M
  778. jle .L49
  779. movss -32 * SIZE(X), %xmm0
  780. movss -32 * SIZE(Y), %xmm1
  781. movss %xmm0, -32 * SIZE(Y)
  782. movss %xmm1, -32 * SIZE(X)
  783. ALIGN_3
  /* Exit: restore the registers pushed on entry, in reverse order. */
  784. .L49:
  785. popl %ebx
  786. popl %esi
  787. popl %edi
  788. popl %ebp
  789. ret
  790. ALIGN_3
  /*
   * Strided path, taken when INCX or INCY differs from SIZE.  X and Y are
   * used unbiased here (the +32 * SIZE adjustment is applied only on the
   * contiguous path), and INCX / INCY are byte strides, scaled by BASE_SHIFT
   * at entry.  Elements are swapped one at a time with movss.
   * %eax = M >> 3 = number of 8-element iterations.
   */
  791. .L50:
  792. movl M, %eax
  793. sarl $3, %eax
  794. jle .L55
  795. ALIGN_3
  /* Eight single-element swaps per iteration; each pointer is advanced by
     its own stride right after the store through it. */
  796. .L51:
  797. movss (X), %xmm0
  798. movss (Y), %xmm1
  799. movss %xmm1, (X)
  800. addl INCX, X
  801. movss %xmm0, (Y)
  802. addl INCY, Y
  803. movss (X), %xmm0
  804. movss (Y), %xmm1
  805. movss %xmm1, (X)
  806. addl INCX, X
  807. movss %xmm0, (Y)
  808. addl INCY, Y
  809. movss (X), %xmm0
  810. movss (Y), %xmm1
  811. movss %xmm1, (X)
  812. addl INCX, X
  813. movss %xmm0, (Y)
  814. addl INCY, Y
  815. movss (X), %xmm0
  816. movss (Y), %xmm1
  817. movss %xmm1, (X)
  818. addl INCX, X
  819. movss %xmm0, (Y)
  820. addl INCY, Y
  821. movss (X), %xmm0
  822. movss (Y), %xmm1
  823. movss %xmm1, (X)
  824. addl INCX, X
  825. movss %xmm0, (Y)
  826. addl INCY, Y
  827. movss (X), %xmm0
  828. movss (Y), %xmm1
  829. movss %xmm1, (X)
  830. addl INCX, X
  831. movss %xmm0, (Y)
  832. addl INCY, Y
  833. movss (X), %xmm0
  834. movss (Y), %xmm1
  835. movss %xmm1, (X)
  836. addl INCX, X
  837. movss %xmm0, (Y)
  838. addl INCY, Y
  839. movss (X), %xmm0
  840. movss (Y), %xmm1
  841. movss %xmm1, (X)
  842. addl INCX, X
  843. movss %xmm0, (Y)
  844. addl INCY, Y
  845. decl %eax
  846. jg .L51
  847. ALIGN_3
  /* Remaining M & 7 elements.  andl clears OF and the result is in 0..7, so
     the jle is taken exactly when the remainder is zero. */
  848. .L55:
  849. movl M, %eax
  850. andl $7, %eax
  851. jle .L57
  852. ALIGN_3
  /* One swap per iteration. */
  853. .L56:
  854. movss (X), %xmm0
  855. movss (Y), %xmm1
  856. movss %xmm1, (X)
  857. movss %xmm0, (Y)
  858. addl INCX, X
  859. addl INCY, Y
  860. decl %eax
  861. jg .L56
  862. ALIGN_3
  /* Exit: restore the registers pushed on entry, in reverse order. */
  863. .L57:
  864. popl %ebx
  865. popl %esi
  866. popl %edi
  867. popl %ebp
  868. ret
  869. EPILOGUE