You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zswap_sse.S 21 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h; presumably it selects */
  /* the assembler-side view of that shared header (confirm in common.h). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK is the number of bytes pushed by the prologue before the */
  /* arguments are read (four pushl instructions, 4 bytes each). ARGS is 0 */
  /* because no additional stack space is reserved. */
  40. #define STACK 16
  41. #define ARGS 0
  /* Incoming stack arguments, addressed relative to %esp after the pushes. */
  /* The slots between offset 4 and offset 24 are not read by this file. */
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 24 + STACK + ARGS(%esp)
  44. #define STACK_INCX 28 + STACK + ARGS(%esp)
  45. #define STACK_Y 32 + STACK + ARGS(%esp)
  46. #define STACK_INCY 36 + STACK + ARGS(%esp)
  /* Register roles used throughout the routine: */
  /* M = remaining count, X / Y = current vector pointers, */
  /* INCX / INCY = strides. %eax is used as a scratch loop counter. */
  47. #define M %edx
  48. #define X %esi
  49. #define Y %edi
  50. #define INCX %ebx
  51. #define INCY %ecx
  /* l1param.h is not visible here; presumably it supplies PREFETCHSIZE, */
  /* PREOFFSET and the PREFETCHW macro used in the loops (confirm). */
  52. #include "l1param.h"
  /* Entry point: swaps the contents of the X and Y vectors in place. */
  /* PROLOGUE / PROFCODE are macros defined outside this file. */
  /* This section saves registers, loads the arguments, selects between */
  /* the contiguous and strided paths, and peels leading units until Y */
  /* satisfies the alignment needed by the movaps-based loops. */
  /* NOTE(review): SIZE and ZBASE_SHIFT come from common.h; the comments */
  /* below assume SIZE is the byte size of one scalar unit (4, since */
  /* movss is used to move one SIZE unit) - confirm. */
  53. PROLOGUE
  54. PROFCODE
  /* Save the four registers that are restored before every ret. %ebp is */
  /* saved and restored but not otherwise referenced in this routine. */
  55. pushl %ebp
  56. pushl %edi
  57. pushl %esi
  58. pushl %ebx
  59. movl STACK_M, M
  60. movl STACK_X, X
  61. movl STACK_Y, Y
  62. movl STACK_INCX, INCX
  63. movl STACK_INCY, INCY
  /* Scale both strides by 2^ZBASE_SHIFT (presumably elements -> bytes). */
  64. sall $ZBASE_SHIFT, INCX
  65. sall $ZBASE_SHIFT, INCY
  /* Nothing to do for M <= 0: go straight to the register restore at .L19. */
  66. testl M, M
  67. jle .L19
  /* The vectorised paths require both strides to equal 2 * SIZE (one */
  /* element immediately after the other); anything else uses .L50. */
  68. cmpl $2 * SIZE, INCX
  69. jne .L50
  70. cmpl $2 * SIZE, INCY
  71. jne .L50
  /* From here on M counts SIZE-sized units: two per element. */
  72. addl M, M
  /* Bias both pointers by +32 * SIZE so the loops can address the next 32 */
  /* units with displacements -32 * SIZE .. -1 * SIZE. */
  73. subl $-32 * SIZE, X
  74. subl $-32 * SIZE, Y
  /* With 3 or fewer units left, skip alignment and use the scalar tail. */
  75. cmpl $3, M
  76. jle .L16
  /* If Y has the SIZE address bit set, swap a single unit so that the bit */
  /* is cleared. X is advanced by the same amount. */
  77. testl $SIZE, Y
  78. je .L05
  79. movss -32 * SIZE(X), %xmm0
  80. movss -32 * SIZE(Y), %xmm1
  81. movss %xmm1, -32 * SIZE(X)
  82. movss %xmm0, -32 * SIZE(Y)
  83. addl $1 * SIZE, X
  84. addl $1 * SIZE, Y
  85. decl M
  86. ALIGN_3
  87. .L05:
  /* If Y has the 2 * SIZE address bit set, swap two units (8 bytes moved */
  /* by movsd / movlps). Afterwards both tested bits of Y are clear. */
  88. testl $2 * SIZE, Y
  89. je .L10
  90. movsd -32 * SIZE(X), %xmm0
  91. movsd -32 * SIZE(Y), %xmm1
  92. movlps %xmm1, -32 * SIZE(X)
  93. movlps %xmm0, -32 * SIZE(Y)
  94. addl $2 * SIZE, X
  95. addl $2 * SIZE, Y
  /* Return immediately if the peel consumed the last units. */
  96. subl $2, M
  97. jle .L19
  98. ALIGN_3
  99. .L10:
  /* Re-check the remaining count, then dispatch on the same two address */
  /* bits of X: */
  /* 2*SIZE set -> .L30 (which further splits to .L40) */
  /* only SIZE set -> .L20 */
  /* both clear -> fall through to the fully aligned loop (.L11) */
  100. cmpl $3, M
  101. jle .L16
  102. testl $2 * SIZE, X
  103. jne .L30
  104. testl $1 * SIZE, X
  105. jne .L20
  /* Fully aligned path: both X and Y are accessed with movaps. */
  /* %eax = M >> 5 is the number of 32-unit iterations; skip the loop when */
  /* it is zero. */
  /* NOTE(review): jle after a multi-bit sarl reads OF, which the Intel SDM */
  /* documents as undefined for shift counts other than 1. M is positive */
  /* here, so je would express the same "count == 0" test - confirm. */
  106. movl M, %eax
  107. sarl $5, %eax
  108. jle .L13
  109. ALIGN_3
  110. .L11:
  /* Each iteration swaps 32 units as eight 4-unit vectors: load one vector */
  /* from X and one from Y, then store each to the other array. Prefetches */
  /* are emitted only when the PREFETCHW macro is defined. */
  111. #ifdef PREFETCHW
  112. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  113. #endif
  114. movaps -32 * SIZE(X), %xmm0
  115. movaps -32 * SIZE(Y), %xmm1
  116. movaps %xmm0, -32 * SIZE(Y)
  117. movaps %xmm1, -32 * SIZE(X)
  118. movaps -28 * SIZE(X), %xmm0
  119. movaps -28 * SIZE(Y), %xmm1
  120. movaps %xmm0, -28 * SIZE(Y)
  121. movaps %xmm1, -28 * SIZE(X)
  122. #ifdef PREFETCHW
  123. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  124. #endif
  125. movaps -24 * SIZE(X), %xmm0
  126. movaps -24 * SIZE(Y), %xmm1
  127. movaps %xmm0, -24 * SIZE(Y)
  128. movaps %xmm1, -24 * SIZE(X)
  129. movaps -20 * SIZE(X), %xmm0
  130. movaps -20 * SIZE(Y), %xmm1
  131. movaps %xmm0, -20 * SIZE(Y)
  132. movaps %xmm1, -20 * SIZE(X)
  133. #ifdef PREFETCHW
  134. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  135. #endif
  136. movaps -16 * SIZE(X), %xmm0
  137. movaps -16 * SIZE(Y), %xmm1
  138. movaps %xmm0, -16 * SIZE(Y)
  139. movaps %xmm1, -16 * SIZE(X)
  140. movaps -12 * SIZE(X), %xmm0
  141. movaps -12 * SIZE(Y), %xmm1
  142. movaps %xmm0, -12 * SIZE(Y)
  143. movaps %xmm1, -12 * SIZE(X)
  144. #ifdef PREFETCHW
  145. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  146. #endif
  147. movaps -8 * SIZE(X), %xmm0
  148. movaps -8 * SIZE(Y), %xmm1
  149. movaps %xmm0, -8 * SIZE(Y)
  150. movaps %xmm1, -8 * SIZE(X)
  151. movaps -4 * SIZE(X), %xmm0
  152. movaps -4 * SIZE(Y), %xmm1
  153. movaps %xmm0, -4 * SIZE(Y)
  154. movaps %xmm1, -4 * SIZE(X)
  /* Advance both pointers by 32 units and repeat while %eax > 0. */
  155. subl $-32 * SIZE, Y
  156. subl $-32 * SIZE, X
  157. decl %eax
  158. jg .L11
  159. ALIGN_3
  /* Remainder handling for the fully aligned path. Bits 4, 3, 2, 1 and 0 */
  /* of M select 16-, 8-, 4-, 2- and 1-unit swaps in turn. */
  /* testl with a single positive bit mask leaves SF = 0 and OF = 0, so */
  /* each jle below is taken exactly when the tested bit of M is clear. */
  160. .L13:
  /* 16 units: four aligned vector swaps. */
  161. testl $16, M
  162. jle .L14
  163. movaps -32 * SIZE(X), %xmm0
  164. movaps -32 * SIZE(Y), %xmm1
  165. movaps %xmm0, -32 * SIZE(Y)
  166. movaps %xmm1, -32 * SIZE(X)
  167. movaps -28 * SIZE(X), %xmm0
  168. movaps -28 * SIZE(Y), %xmm1
  169. movaps %xmm0, -28 * SIZE(Y)
  170. movaps %xmm1, -28 * SIZE(X)
  171. movaps -24 * SIZE(X), %xmm0
  172. movaps -24 * SIZE(Y), %xmm1
  173. movaps %xmm0, -24 * SIZE(Y)
  174. movaps %xmm1, -24 * SIZE(X)
  175. movaps -20 * SIZE(X), %xmm0
  176. movaps -20 * SIZE(Y), %xmm1
  177. movaps %xmm0, -20 * SIZE(Y)
  178. movaps %xmm1, -20 * SIZE(X)
  179. addl $16 * SIZE, X
  180. addl $16 * SIZE, Y
  181. ALIGN_3
  182. .L14:
  /* 8 units: two aligned vector swaps. */
  183. testl $8, M
  184. jle .L15
  185. movaps -32 * SIZE(X), %xmm0
  186. movaps -32 * SIZE(Y), %xmm1
  187. movaps %xmm0, -32 * SIZE(Y)
  188. movaps %xmm1, -32 * SIZE(X)
  189. movaps -28 * SIZE(X), %xmm0
  190. movaps -28 * SIZE(Y), %xmm1
  191. movaps %xmm0, -28 * SIZE(Y)
  192. movaps %xmm1, -28 * SIZE(X)
  193. addl $8 * SIZE, X
  194. addl $8 * SIZE, Y
  195. ALIGN_3
  196. .L15:
  /* 4 units: one aligned vector swap. */
  197. testl $4, M
  198. jle .L16
  199. movaps -32 * SIZE(X), %xmm0
  200. movaps -32 * SIZE(Y), %xmm1
  201. movaps %xmm0, -32 * SIZE(Y)
  202. movaps %xmm1, -32 * SIZE(X)
  203. addl $4 * SIZE, X
  204. addl $4 * SIZE, Y
  205. ALIGN_3
  206. .L16:
  /* .L16 is also the target of the early "M <= 3" branches taken before */
  /* or after alignment, so from here on only movsd / movlps / movss are */
  /* used (none of them requires 16-byte alignment). */
  /* 2 units: one 8-byte swap. */
  207. testl $2, M
  208. jle .L17
  209. movsd -32 * SIZE(X), %xmm0
  210. movsd -32 * SIZE(Y), %xmm1
  211. movlps %xmm1, -32 * SIZE(X)
  212. addl $2 * SIZE, X
  213. movlps %xmm0, -32 * SIZE(Y)
  214. addl $2 * SIZE, Y
  215. ALIGN_3
  216. .L17:
  /* 1 unit: final scalar swap. */
  217. testl $1, M
  218. jle .L19
  219. movss -32 * SIZE(X), %xmm0
  220. movss -32 * SIZE(Y), %xmm1
  221. movss %xmm1, -32 * SIZE(X)
  222. movss %xmm0, -32 * SIZE(Y)
  223. ALIGN_3
  224. .L19:
  /* Common exit: restore the saved registers in reverse push order. */
  225. popl %ebx
  226. popl %esi
  227. popl %edi
  228. popl %ebp
  229. ret
  230. ALIGN_3
  /* .L20: Y is aligned, but X has the SIZE address bit set and the */
  /* 2 * SIZE bit clear (see the tests at .L10), i.e. X sits one unit */
  /* above an aligned boundary (assuming SIZE == 4). X is therefore read */
  /* and written with movaps at displacements shifted by one unit */
  /* (-33, -29, -25, ...), and the lanes are rotated in registers. */
  /* Writing u0, u1, ... for the units starting at -32 * SIZE: */
  231. .L20:
  /* xmm0 = [unit before X, X u0, X u1, X u2]; xmm1 = Y u0..u3. */
  232. movaps -33 * SIZE(X), %xmm0
  233. movaps -32 * SIZE(Y), %xmm1
  /* Store Y u0..u2 into X u0..u2 up front: u0 with movss, then u1 and u2 */
  /* from a rotated copy (PSHUFD2 is a macro defined outside this file, */
  /* presumably a pshufd wrapper with operands imm, src, dst - confirm). */
  234. movss %xmm1, -32 * SIZE(X)
  235. PSHUFD2($0x39, %xmm1, %xmm3)
  236. movlps %xmm3, -31 * SIZE(X)
  /* Those three units are accounted for now; the matching X -> Y stores */
  /* happen at .L26 from the vector still carried in xmm0. */
  237. subl $3, M
  /* %eax = number of 32-unit iterations. */
  /* NOTE(review): jle after a multi-bit sarl reads OF, which the Intel SDM */
  /* documents as undefined for shift counts other than 1 - confirm. */
  238. movl M, %eax
  239. sarl $5, %eax
  240. jle .L23
  241. ALIGN_4
  242. .L21:
  /* Loop body, repeated eight times with xmm0/xmm2 and xmm1/xmm3 */
  /* alternating as "carried" and "new" vectors: */
  /* X -> Y: movss inserts the low lane of the new X vector into the */
  /* carried one, shufps $0x39 rotates the lanes down by one, giving */
  /* four consecutive X units that are stored aligned into Y. */
  /* Y -> X: movss inserts the low lane of the new Y vector into the */
  /* carried one, shufps $0x93 then forms [carried lane 3, new lanes */
  /* 0..2], which is stored aligned at the shifted X address. */
  243. #ifdef PREFETCHW
  244. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  245. #endif
  246. movaps -29 * SIZE(X), %xmm2
  247. movaps -28 * SIZE(Y), %xmm3
  248. movss %xmm2, %xmm0
  249. shufps $0x39, %xmm0, %xmm0
  250. movaps %xmm0, -32 * SIZE(Y)
  251. movss %xmm3, %xmm1
  252. shufps $0x93, %xmm3, %xmm1
  253. movaps %xmm1, -29 * SIZE(X)
  254. movaps -25 * SIZE(X), %xmm0
  255. movaps -24 * SIZE(Y), %xmm1
  256. movss %xmm0, %xmm2
  257. shufps $0x39, %xmm2, %xmm2
  258. movaps %xmm2, -28 * SIZE(Y)
  259. movss %xmm1, %xmm3
  260. shufps $0x93, %xmm1, %xmm3
  261. movaps %xmm3, -25 * SIZE(X)
  262. #ifdef PREFETCHW
  263. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  264. #endif
  265. movaps -21 * SIZE(X), %xmm2
  266. movaps -20 * SIZE(Y), %xmm3
  267. movss %xmm2, %xmm0
  268. shufps $0x39, %xmm0, %xmm0
  269. movaps %xmm0, -24 * SIZE(Y)
  270. movss %xmm3, %xmm1
  271. shufps $0x93, %xmm3, %xmm1
  272. movaps %xmm1, -21 * SIZE(X)
  273. movaps -17 * SIZE(X), %xmm0
  274. movaps -16 * SIZE(Y), %xmm1
  275. movss %xmm0, %xmm2
  276. shufps $0x39, %xmm2, %xmm2
  277. movaps %xmm2, -20 * SIZE(Y)
  278. movss %xmm1, %xmm3
  279. shufps $0x93, %xmm1, %xmm3
  280. movaps %xmm3, -17 * SIZE(X)
  281. #ifdef PREFETCHW
  282. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  283. #endif
  284. movaps -13 * SIZE(X), %xmm2
  285. movaps -12 * SIZE(Y), %xmm3
  286. movss %xmm2, %xmm0
  287. shufps $0x39, %xmm0, %xmm0
  288. movaps %xmm0, -16 * SIZE(Y)
  289. movss %xmm3, %xmm1
  290. shufps $0x93, %xmm3, %xmm1
  291. movaps %xmm1, -13 * SIZE(X)
  292. movaps -9 * SIZE(X), %xmm0
  293. movaps -8 * SIZE(Y), %xmm1
  294. movss %xmm0, %xmm2
  295. shufps $0x39, %xmm2, %xmm2
  296. movaps %xmm2, -12 * SIZE(Y)
  297. movss %xmm1, %xmm3
  298. shufps $0x93, %xmm1, %xmm3
  299. movaps %xmm3, -9 * SIZE(X)
  300. #ifdef PREFETCHW
  301. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  302. #endif
  303. movaps -5 * SIZE(X), %xmm2
  304. movaps -4 * SIZE(Y), %xmm3
  305. movss %xmm2, %xmm0
  306. shufps $0x39, %xmm0, %xmm0
  307. movaps %xmm0, -8 * SIZE(Y)
  308. movss %xmm3, %xmm1
  309. shufps $0x93, %xmm3, %xmm1
  310. movaps %xmm1, -5 * SIZE(X)
  /* The last pair of loads leaves the carried vectors back in xmm0 / xmm1 */
  /* for the next iteration or for the tail code. */
  311. movaps -1 * SIZE(X), %xmm0
  312. movaps 0 * SIZE(Y), %xmm1
  313. movss %xmm0, %xmm2
  314. shufps $0x39, %xmm2, %xmm2
  315. movaps %xmm2, -4 * SIZE(Y)
  316. movss %xmm1, %xmm3
  317. shufps $0x93, %xmm1, %xmm3
  318. movaps %xmm3, -1 * SIZE(X)
  /* Advance both pointers by 32 units and repeat while %eax > 0. */
  319. subl $-32 * SIZE, X
  320. subl $-32 * SIZE, Y
  321. decl %eax
  322. jg .L21
  323. ALIGN_3
  /* Remainder handling for the .L20 path (X one unit above an aligned */
  /* boundary). On entry xmm0 / xmm1 hold the carried X / Y vectors. */
  /* testl with a single positive bit mask leaves SF = 0 and OF = 0, so */
  /* each jle below is taken exactly when the tested bit of M is clear. */
  324. .L23:
  /* 16 units: four rotate-and-store steps, same pattern as the main loop. */
  325. testl $16, M
  326. jle .L24
  327. movaps -29 * SIZE(X), %xmm2
  328. movaps -28 * SIZE(Y), %xmm3
  329. movss %xmm2, %xmm0
  330. shufps $0x39, %xmm0, %xmm0
  331. movaps %xmm0, -32 * SIZE(Y)
  332. movss %xmm3, %xmm1
  333. shufps $0x93, %xmm3, %xmm1
  334. movaps %xmm1, -29 * SIZE(X)
  335. movaps -25 * SIZE(X), %xmm0
  336. movaps -24 * SIZE(Y), %xmm1
  337. movss %xmm0, %xmm2
  338. shufps $0x39, %xmm2, %xmm2
  339. movaps %xmm2, -28 * SIZE(Y)
  340. movss %xmm1, %xmm3
  341. shufps $0x93, %xmm1, %xmm3
  342. movaps %xmm3, -25 * SIZE(X)
  343. movaps -21 * SIZE(X), %xmm2
  344. movaps -20 * SIZE(Y), %xmm3
  345. movss %xmm2, %xmm0
  346. shufps $0x39, %xmm0, %xmm0
  347. movaps %xmm0, -24 * SIZE(Y)
  348. movss %xmm3, %xmm1
  349. shufps $0x93, %xmm3, %xmm1
  350. movaps %xmm1, -21 * SIZE(X)
  351. movaps -17 * SIZE(X), %xmm0
  352. movaps -16 * SIZE(Y), %xmm1
  353. movss %xmm0, %xmm2
  354. shufps $0x39, %xmm2, %xmm2
  355. movaps %xmm2, -20 * SIZE(Y)
  356. movss %xmm1, %xmm3
  357. shufps $0x93, %xmm1, %xmm3
  358. movaps %xmm3, -17 * SIZE(X)
  359. addl $16 * SIZE, X
  360. addl $16 * SIZE, Y
  361. ALIGN_3
  362. .L24:
  /* 8 units: two rotate-and-store steps. */
  363. testl $8, M
  364. jle .L25
  365. movaps -29 * SIZE(X), %xmm2
  366. movaps -28 * SIZE(Y), %xmm3
  367. movss %xmm2, %xmm0
  368. shufps $0x39, %xmm0, %xmm0
  369. movaps %xmm0, -32 * SIZE(Y)
  370. movss %xmm3, %xmm1
  371. shufps $0x93, %xmm3, %xmm1
  372. movaps %xmm1, -29 * SIZE(X)
  373. movaps -25 * SIZE(X), %xmm0
  374. movaps -24 * SIZE(Y), %xmm1
  375. movss %xmm0, %xmm2
  376. shufps $0x39, %xmm2, %xmm2
  377. movaps %xmm2, -28 * SIZE(Y)
  378. movss %xmm1, %xmm3
  379. shufps $0x93, %xmm1, %xmm3
  380. movaps %xmm3, -25 * SIZE(X)
  381. addl $8 * SIZE, X
  382. addl $8 * SIZE, Y
  383. ALIGN_3
  384. .L25:
  /* 4 units: a single step. Because only one step runs, the newest */
  /* vectors end up in xmm2 / xmm3 and are copied back to xmm0 / xmm1 so */
  /* that .L26 always finds the carried vectors in the same registers. */
  385. testl $4, M
  386. jle .L26
  387. movaps -29 * SIZE(X), %xmm2
  388. movaps -28 * SIZE(Y), %xmm3
  389. movss %xmm2, %xmm0
  390. shufps $0x39, %xmm0, %xmm0
  391. movaps %xmm0, -32 * SIZE(Y)
  392. movss %xmm3, %xmm1
  393. shufps $0x93, %xmm3, %xmm1
  394. movaps %xmm1, -29 * SIZE(X)
  395. movaps %xmm2, %xmm0
  396. movaps %xmm3, %xmm1
  397. addl $4 * SIZE, X
  398. addl $4 * SIZE, Y
  399. ALIGN_3
  400. .L26:
  /* Flush the three X units still held in lanes 1..3 of xmm0 into Y: */
  /* lanes 1 and 2 via a rotated copy and movlps, lane 3 via a broadcast */
  /* and movss. This is the X -> Y half of the three units whose Y -> X */
  /* half was stored at .L20 (hence the "subl $3, M" there). */
  /* PSHUFD2 / PSHUFD1 are macros defined outside this file, presumably */
  /* pshufd wrappers - confirm. */
  401. PSHUFD2($0x39, %xmm0, %xmm2)
  402. PSHUFD1($0xff, %xmm0)
  403. movlps %xmm2, -32 * SIZE(Y)
  404. movss %xmm0, -30 * SIZE(Y)
  /* The remaining scalar swaps start three units further on, which is */
  /* why the displacements are -29 * SIZE rather than -32 * SIZE. */
  405. testl $2, M
  406. jle .L27
  407. movsd -29 * SIZE(X), %xmm0
  408. movsd -29 * SIZE(Y), %xmm1
  409. movlps %xmm0, -29 * SIZE(Y)
  410. movlps %xmm1, -29 * SIZE(X)
  411. addl $2 * SIZE, X
  412. addl $2 * SIZE, Y
  413. ALIGN_3
  414. .L27:
  /* 1 unit: final scalar swap. */
  415. testl $1, M
  416. jle .L29
  417. movss -29 * SIZE(X), %xmm0
  418. movss -29 * SIZE(Y), %xmm1
  419. movss %xmm0, -29 * SIZE(Y)
  420. movss %xmm1, -29 * SIZE(X)
  421. ALIGN_3
  422. .L29:
  /* Exit for this path: restore the saved registers and return. */
  423. popl %ebx
  424. popl %esi
  425. popl %edi
  426. popl %ebp
  427. ret
  428. ALIGN_3
  /* .L30: reached when X has the 2 * SIZE address bit set. If the SIZE */
  /* bit is set as well, the three-unit-offset path at .L40 is used. */
  /* Otherwise X sits two units above an aligned boundary (assuming */
  /* SIZE == 4): X is accessed with movaps at displacements shifted by two */
  /* units (-30, -26, ...), and 8-byte halves are recombined in registers. */
  429. .L30:
  430. testl $1 * SIZE, X
  431. jne .L40
  /* High half of xmm0 = X u0..u1 (the low half is not used before it is */
  /* shifted out); xmm1 = Y u0..u3. */
  432. movhps -32 * SIZE(X), %xmm0
  433. movaps -32 * SIZE(Y), %xmm1
  /* Store Y u0..u1 into X u0..u1 up front; the matching X -> Y store of */
  /* the last two units happens at .L36. */
  434. movlps %xmm1, -32 * SIZE(X)
  435. subl $2, M
  /* %eax = number of 32-unit iterations. */
  /* NOTE(review): jle after a multi-bit sarl reads OF, which the Intel SDM */
  /* documents as undefined for shift counts other than 1 - confirm. */
  436. movl M, %eax
  437. sarl $5, %eax
  438. jle .L33
  439. ALIGN_4
  440. .L31:
  /* Loop body, repeated eight times with xmm0/xmm2 and xmm1/xmm3 */
  /* alternating as "carried" and "new" vectors. SHUFPD_1 is a macro */
  /* defined outside this file; presumably "shufpd $1", which forms */
  /* [high half of the carried vector, low half of the new vector] */
  /* (confirm). The X result is stored aligned into Y, and the Y result is */
  /* stored aligned at the shifted X address. */
  441. #ifdef PREFETCHW
  442. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  443. #endif
  444. movaps -30 * SIZE(X), %xmm2
  445. movaps -28 * SIZE(Y), %xmm3
  446. SHUFPD_1 %xmm2, %xmm0
  447. movaps %xmm0, -32 * SIZE(Y)
  448. SHUFPD_1 %xmm3, %xmm1
  449. movaps %xmm1, -30 * SIZE(X)
  450. movaps -26 * SIZE(X), %xmm0
  451. movaps -24 * SIZE(Y), %xmm1
  452. SHUFPD_1 %xmm0, %xmm2
  453. movaps %xmm2, -28 * SIZE(Y)
  454. SHUFPD_1 %xmm1, %xmm3
  455. movaps %xmm3, -26 * SIZE(X)
  456. #ifdef PREFETCHW
  457. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  458. #endif
  459. movaps -22 * SIZE(X), %xmm2
  460. movaps -20 * SIZE(Y), %xmm3
  461. SHUFPD_1 %xmm2, %xmm0
  462. movaps %xmm0, -24 * SIZE(Y)
  463. SHUFPD_1 %xmm3, %xmm1
  464. movaps %xmm1, -22 * SIZE(X)
  465. movaps -18 * SIZE(X), %xmm0
  466. movaps -16 * SIZE(Y), %xmm1
  467. SHUFPD_1 %xmm0, %xmm2
  468. movaps %xmm2, -20 * SIZE(Y)
  469. SHUFPD_1 %xmm1, %xmm3
  470. movaps %xmm3, -18 * SIZE(X)
  471. #ifdef PREFETCHW
  472. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  473. #endif
  474. movaps -14 * SIZE(X), %xmm2
  475. movaps -12 * SIZE(Y), %xmm3
  476. SHUFPD_1 %xmm2, %xmm0
  477. movaps %xmm0, -16 * SIZE(Y)
  478. SHUFPD_1 %xmm3, %xmm1
  479. movaps %xmm1, -14 * SIZE(X)
  480. movaps -10 * SIZE(X), %xmm0
  481. movaps -8 * SIZE(Y), %xmm1
  482. SHUFPD_1 %xmm0, %xmm2
  483. movaps %xmm2, -12 * SIZE(Y)
  484. SHUFPD_1 %xmm1, %xmm3
  485. movaps %xmm3, -10 * SIZE(X)
  486. #ifdef PREFETCHW
  487. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  488. #endif
  489. movaps -6 * SIZE(X), %xmm2
  490. movaps -4 * SIZE(Y), %xmm3
  491. SHUFPD_1 %xmm2, %xmm0
  492. movaps %xmm0, -8 * SIZE(Y)
  493. SHUFPD_1 %xmm3, %xmm1
  494. movaps %xmm1, -6 * SIZE(X)
  /* The last pair of loads leaves the carried vectors back in xmm0 / xmm1 */
  /* for the next iteration or for the tail code. */
  495. movaps -2 * SIZE(X), %xmm0
  496. movaps 0 * SIZE(Y), %xmm1
  497. SHUFPD_1 %xmm0, %xmm2
  498. movaps %xmm2, -4 * SIZE(Y)
  499. SHUFPD_1 %xmm1, %xmm3
  500. movaps %xmm3, -2 * SIZE(X)
  /* Advance both pointers by 32 units and repeat while %eax > 0. */
  501. subl $-32 * SIZE, X
  502. subl $-32 * SIZE, Y
  503. decl %eax
  504. jg .L31
  505. ALIGN_3
  /* Remainder handling for the .L30 path (X two units above an aligned */
  /* boundary). On entry xmm0 / xmm1 hold the carried X / Y vectors. */
  /* testl with a single positive bit mask leaves SF = 0 and OF = 0, so */
  /* each jle below is taken exactly when the tested bit of M is clear. */
  506. .L33:
  /* 16 units: four recombine-and-store steps, as in the main loop. */
  507. testl $16, M
  508. jle .L34
  509. movaps -30 * SIZE(X), %xmm2
  510. movaps -28 * SIZE(Y), %xmm3
  511. SHUFPD_1 %xmm2, %xmm0
  512. movaps %xmm0, -32 * SIZE(Y)
  513. SHUFPD_1 %xmm3, %xmm1
  514. movaps %xmm1, -30 * SIZE(X)
  515. movaps -26 * SIZE(X), %xmm0
  516. movaps -24 * SIZE(Y), %xmm1
  517. SHUFPD_1 %xmm0, %xmm2
  518. movaps %xmm2, -28 * SIZE(Y)
  519. SHUFPD_1 %xmm1, %xmm3
  520. movaps %xmm3, -26 * SIZE(X)
  521. movaps -22 * SIZE(X), %xmm2
  522. movaps -20 * SIZE(Y), %xmm3
  523. SHUFPD_1 %xmm2, %xmm0
  524. movaps %xmm0, -24 * SIZE(Y)
  525. SHUFPD_1 %xmm3, %xmm1
  526. movaps %xmm1, -22 * SIZE(X)
  527. movaps -18 * SIZE(X), %xmm0
  528. movaps -16 * SIZE(Y), %xmm1
  529. SHUFPD_1 %xmm0, %xmm2
  530. movaps %xmm2, -20 * SIZE(Y)
  531. SHUFPD_1 %xmm1, %xmm3
  532. movaps %xmm3, -18 * SIZE(X)
  533. addl $16 * SIZE, X
  534. addl $16 * SIZE, Y
  535. ALIGN_3
  536. .L34:
  /* 8 units: two steps. */
  537. testl $8, M
  538. jle .L35
  539. movaps -30 * SIZE(X), %xmm2
  540. movaps -28 * SIZE(Y), %xmm3
  541. SHUFPD_1 %xmm2, %xmm0
  542. movaps %xmm0, -32 * SIZE(Y)
  543. SHUFPD_1 %xmm3, %xmm1
  544. movaps %xmm1, -30 * SIZE(X)
  545. movaps -26 * SIZE(X), %xmm0
  546. movaps -24 * SIZE(Y), %xmm1
  547. SHUFPD_1 %xmm0, %xmm2
  548. movaps %xmm2, -28 * SIZE(Y)
  549. SHUFPD_1 %xmm1, %xmm3
  550. movaps %xmm3, -26 * SIZE(X)
  551. addl $8 * SIZE, X
  552. addl $8 * SIZE, Y
  553. ALIGN_3
  554. .L35:
  /* 4 units: a single step. The newest vectors end up in xmm2 / xmm3 and */
  /* are copied back to xmm0 / xmm1 so that .L36 always finds the carried */
  /* vectors in the same registers. */
  555. testl $4, M
  556. jle .L36
  557. movaps -30 * SIZE(X), %xmm2
  558. movaps -28 * SIZE(Y), %xmm3
  559. SHUFPD_1 %xmm2, %xmm0
  560. movaps %xmm0, -32 * SIZE(Y)
  561. SHUFPD_1 %xmm3, %xmm1
  562. movaps %xmm1, -30 * SIZE(X)
  563. movaps %xmm2, %xmm0
  564. movaps %xmm3, %xmm1
  565. addl $4 * SIZE, X
  566. addl $4 * SIZE, Y
  567. ALIGN_3
  568. .L36:
  /* Flush the two X units still held in the high half of xmm0 into Y. */
  /* This is the X -> Y half of the two units whose Y -> X half was stored */
  /* at .L30 (hence the "subl $2, M" there). */
  569. movhps %xmm0, -32 * SIZE(Y)
  /* The remaining scalar swaps start two units further on, which is why */
  /* the displacements are -30 * SIZE rather than -32 * SIZE. */
  570. testl $2, M
  571. jle .L37
  572. movsd -30 * SIZE(X), %xmm0
  573. movsd -30 * SIZE(Y), %xmm1
  574. movlps %xmm0, -30 * SIZE(Y)
  575. movlps %xmm1, -30 * SIZE(X)
  576. addl $2 * SIZE, X
  577. addl $2 * SIZE, Y
  578. ALIGN_3
  579. .L37:
  /* 1 unit: final scalar swap. */
  580. testl $1, M
  581. jle .L39
  582. movss -30 * SIZE(X), %xmm0
  583. movss -30 * SIZE(Y), %xmm1
  584. movss %xmm0, -30 * SIZE(Y)
  585. movss %xmm1, -30 * SIZE(X)
  586. ALIGN_3
  587. .L39:
  /* Exit for this path: restore the saved registers and return. */
  588. popl %ebx
  589. popl %esi
  590. popl %edi
  591. popl %ebp
  592. ret
  593. ALIGN_3
  /* .L40: X has both the SIZE and the 2 * SIZE address bits set, i.e. it */
  /* sits three units above an aligned boundary (assuming SIZE == 4). X is */
  /* accessed with movaps at displacements shifted by three units */
  /* (-35, -31, -27, ...), and the lanes are rotated in registers. */
  /* Writing u0, u1, ... for the units starting at -32 * SIZE: */
  594. .L40:
  /* xmm0 = [three units before X, X u0]; xmm1 = Y u0..u3. */
  595. movaps -35 * SIZE(X), %xmm0
  596. movaps -32 * SIZE(Y), %xmm1
  /* Store Y u0 into X u0 up front. M is reduced by 3 because the tail at */
  /* .L46 completes this unit (X -> Y) and fully swaps two more units. */
  597. movss %xmm1, -32 * SIZE(X)
  598. subl $3, M
  /* %eax = number of 32-unit iterations. */
  /* NOTE(review): jle after a multi-bit sarl reads OF, which the Intel SDM */
  /* documents as undefined for shift counts other than 1 - confirm. */
  599. movl M, %eax
  600. sarl $5, %eax
  601. jle .L43
  602. ALIGN_4
  603. .L41:
  /* Loop body, repeated eight times with xmm0/xmm2 and xmm1/xmm3 */
  /* alternating as "carried" and "new" vectors: */
  /* X -> Y: movss inserts the low lane of the new X vector into the */
  /* carried one, shufps $0x93 then forms [carried lane 3, new lanes */
  /* 0..2] = four consecutive X units, stored aligned into Y. */
  /* Y -> X: movss inserts the low lane of the new Y vector into the */
  /* carried one, shufps $0x39 rotates the lanes down by one, and the */
  /* result is stored aligned at the shifted X address. */
  604. #ifdef PREFETCHW
  605. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  606. #endif
  607. movaps -31 * SIZE(X), %xmm2
  608. movaps -28 * SIZE(Y), %xmm3
  609. movss %xmm2, %xmm0
  610. shufps $0x93, %xmm2, %xmm0
  611. movaps %xmm0, -32 * SIZE(Y)
  612. movss %xmm3, %xmm1
  613. shufps $0x39, %xmm1, %xmm1
  614. movaps %xmm1, -31 * SIZE(X)
  615. movaps -27 * SIZE(X), %xmm0
  616. movaps -24 * SIZE(Y), %xmm1
  617. movss %xmm0, %xmm2
  618. shufps $0x93, %xmm0, %xmm2
  619. movaps %xmm2, -28 * SIZE(Y)
  620. movss %xmm1, %xmm3
  621. shufps $0x39, %xmm3, %xmm3
  622. movaps %xmm3, -27 * SIZE(X)
  623. #ifdef PREFETCHW
  624. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  625. #endif
  626. movaps -23 * SIZE(X), %xmm2
  627. movaps -20 * SIZE(Y), %xmm3
  628. movss %xmm2, %xmm0
  629. shufps $0x93, %xmm2, %xmm0
  630. movaps %xmm0, -24 * SIZE(Y)
  631. movss %xmm3, %xmm1
  632. shufps $0x39, %xmm1, %xmm1
  633. movaps %xmm1, -23 * SIZE(X)
  634. movaps -19 * SIZE(X), %xmm0
  635. movaps -16 * SIZE(Y), %xmm1
  636. movss %xmm0, %xmm2
  637. shufps $0x93, %xmm0, %xmm2
  638. movaps %xmm2, -20 * SIZE(Y)
  639. movss %xmm1, %xmm3
  640. shufps $0x39, %xmm3, %xmm3
  641. movaps %xmm3, -19 * SIZE(X)
  642. #ifdef PREFETCHW
  643. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  644. #endif
  645. movaps -15 * SIZE(X), %xmm2
  646. movaps -12 * SIZE(Y), %xmm3
  647. movss %xmm2, %xmm0
  648. shufps $0x93, %xmm2, %xmm0
  649. movaps %xmm0, -16 * SIZE(Y)
  650. movss %xmm3, %xmm1
  651. shufps $0x39, %xmm1, %xmm1
  652. movaps %xmm1, -15 * SIZE(X)
  653. movaps -11 * SIZE(X), %xmm0
  654. movaps -8 * SIZE(Y), %xmm1
  655. movss %xmm0, %xmm2
  656. shufps $0x93, %xmm0, %xmm2
  657. movaps %xmm2, -12 * SIZE(Y)
  658. movss %xmm1, %xmm3
  659. shufps $0x39, %xmm3, %xmm3
  660. movaps %xmm3, -11 * SIZE(X)
  661. #ifdef PREFETCHW
  662. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  663. #endif
  664. movaps -7 * SIZE(X), %xmm2
  665. movaps -4 * SIZE(Y), %xmm3
  666. movss %xmm2, %xmm0
  667. shufps $0x93, %xmm2, %xmm0
  668. movaps %xmm0, -8 * SIZE(Y)
  669. movss %xmm3, %xmm1
  670. shufps $0x39, %xmm1, %xmm1
  671. movaps %xmm1, -7 * SIZE(X)
  /* The last pair of loads leaves the carried vectors back in xmm0 / xmm1 */
  /* for the next iteration or for the tail code. */
  672. movaps -3 * SIZE(X), %xmm0
  673. movaps 0 * SIZE(Y), %xmm1
  674. movss %xmm0, %xmm2
  675. shufps $0x93, %xmm0, %xmm2
  676. movaps %xmm2, -4 * SIZE(Y)
  677. movss %xmm1, %xmm3
  678. shufps $0x39, %xmm3, %xmm3
  679. movaps %xmm3, -3 * SIZE(X)
  /* Advance both pointers by 32 units and repeat while %eax > 0. */
  680. subl $-32 * SIZE, X
  681. subl $-32 * SIZE, Y
  682. decl %eax
  683. jg .L41
  684. ALIGN_3
  /* Remainder handling for the .L40 path (X three units above an aligned */
  /* boundary). On entry xmm0 / xmm1 hold the carried X / Y vectors. */
  /* testl with a single positive bit mask leaves SF = 0 and OF = 0, so */
  /* each jle below is taken exactly when the tested bit of M is clear. */
  685. .L43:
  /* 16 units: four rotate-and-store steps, same pattern as the main loop. */
  686. testl $16, M
  687. jle .L44
  688. movaps -31 * SIZE(X), %xmm2
  689. movaps -28 * SIZE(Y), %xmm3
  690. movss %xmm2, %xmm0
  691. shufps $0x93, %xmm2, %xmm0
  692. movaps %xmm0, -32 * SIZE(Y)
  693. movss %xmm3, %xmm1
  694. shufps $0x39, %xmm1, %xmm1
  695. movaps %xmm1, -31 * SIZE(X)
  696. movaps -27 * SIZE(X), %xmm0
  697. movaps -24 * SIZE(Y), %xmm1
  698. movss %xmm0, %xmm2
  699. shufps $0x93, %xmm0, %xmm2
  700. movaps %xmm2, -28 * SIZE(Y)
  701. movss %xmm1, %xmm3
  702. shufps $0x39, %xmm3, %xmm3
  703. movaps %xmm3, -27 * SIZE(X)
  704. movaps -23 * SIZE(X), %xmm2
  705. movaps -20 * SIZE(Y), %xmm3
  706. movss %xmm2, %xmm0
  707. shufps $0x93, %xmm2, %xmm0
  708. movaps %xmm0, -24 * SIZE(Y)
  709. movss %xmm3, %xmm1
  710. shufps $0x39, %xmm1, %xmm1
  711. movaps %xmm1, -23 * SIZE(X)
  712. movaps -19 * SIZE(X), %xmm0
  713. movaps -16 * SIZE(Y), %xmm1
  714. movss %xmm0, %xmm2
  715. shufps $0x93, %xmm0, %xmm2
  716. movaps %xmm2, -20 * SIZE(Y)
  717. movss %xmm1, %xmm3
  718. shufps $0x39, %xmm3, %xmm3
  719. movaps %xmm3, -19 * SIZE(X)
  720. addl $16 * SIZE, X
  721. addl $16 * SIZE, Y
  722. ALIGN_3
  723. .L44:
  /* 8 units: two steps. */
  724. testl $8, M
  725. jle .L45
  726. movaps -31 * SIZE(X), %xmm2
  727. movaps -28 * SIZE(Y), %xmm3
  728. movss %xmm2, %xmm0
  729. shufps $0x93, %xmm2, %xmm0
  730. movaps %xmm0, -32 * SIZE(Y)
  731. movss %xmm3, %xmm1
  732. shufps $0x39, %xmm1, %xmm1
  733. movaps %xmm1, -31 * SIZE(X)
  734. movaps -27 * SIZE(X), %xmm0
  735. movaps -24 * SIZE(Y), %xmm1
  736. movss %xmm0, %xmm2
  737. shufps $0x93, %xmm0, %xmm2
  738. movaps %xmm2, -28 * SIZE(Y)
  739. movss %xmm1, %xmm3
  740. shufps $0x39, %xmm3, %xmm3
  741. movaps %xmm3, -27 * SIZE(X)
  742. addl $8 * SIZE, X
  743. addl $8 * SIZE, Y
  744. ALIGN_3
  745. .L45:
  /* 4 units: a single step. The newest vectors end up in xmm2 / xmm3 and */
  /* are copied back to xmm0 / xmm1 so that .L46 always finds the carried */
  /* vectors in the same registers. */
  746. testl $4, M
  747. jle .L46
  748. movaps -31 * SIZE(X), %xmm2
  749. movaps -28 * SIZE(Y), %xmm3
  750. movss %xmm2, %xmm0
  751. shufps $0x93, %xmm2, %xmm0
  752. movaps %xmm0, -32 * SIZE(Y)
  753. movss %xmm3, %xmm1
  754. shufps $0x39, %xmm1, %xmm1
  755. movaps %xmm1, -31 * SIZE(X)
  756. movaps %xmm2, %xmm0
  757. movaps %xmm3, %xmm1
  758. addl $4 * SIZE, X
  759. addl $4 * SIZE, Y
  760. ALIGN_3
  761. .L46:
  /* Finish the three units accounted for by "subl $3, M" at .L40: */
  /* - read X u1..u2 into xmm2 before they are overwritten; */
  /* - rotate the carried Y vector and store Y u1..u2 into X u1..u2; */
  /* - broadcast lane 3 of the carried X vector (X u0) and store it to */
  /* Y u0; */
  /* - store the saved X u1..u2 into Y u1..u2. */
  /* PSHUFD2 / PSHUFD1 are macros defined outside this file, presumably */
  /* pshufd wrappers - confirm. */
  762. movsd -31 * SIZE(X), %xmm2
  763. PSHUFD2($0x39, %xmm1, %xmm1)
  764. movlps %xmm1, -31 * SIZE(X)
  765. PSHUFD1($0xff, %xmm0)
  766. movss %xmm0, -32 * SIZE(Y)
  767. movlps %xmm2, -31 * SIZE(Y)
  /* Unlike the other offset paths, the pointers are advanced past those */
  /* three units here, so the scalar tail uses -32 * SIZE displacements. */
  768. addl $3 * SIZE, X
  769. addl $3 * SIZE, Y
  /* 2 units: one 8-byte swap. */
  770. testl $2, M
  771. jle .L47
  772. movsd -32 * SIZE(X), %xmm0
  773. movsd -32 * SIZE(Y), %xmm1
  774. movlps %xmm0, -32 * SIZE(Y)
  775. movlps %xmm1, -32 * SIZE(X)
  776. addl $2 * SIZE, X
  777. addl $2 * SIZE, Y
  778. ALIGN_3
  779. .L47:
  /* 1 unit: final scalar swap. */
  780. testl $1, M
  781. jle .L49
  782. movss -32 * SIZE(X), %xmm0
  783. movss -32 * SIZE(Y), %xmm1
  784. movss %xmm0, -32 * SIZE(Y)
  785. movss %xmm1, -32 * SIZE(X)
  786. ALIGN_3
  787. .L49:
  /* Exit for this path: restore the saved registers and return. */
  788. popl %ebx
  789. popl %esi
  790. popl %edi
  791. popl %ebp
  792. ret
  793. ALIGN_3
  /* .L50: general-stride path, taken when INCX or INCY differs from */
  /* 2 * SIZE. M is still the original count here (the doubling at entry */
  /* happens only on the contiguous path), and X / Y are unbiased. */
  /* Each step swaps 8 bytes at (X) and (Y) with movsd / movlps, then adds */
  /* the scaled strides INCX / INCY to the pointers. */
  794. .L50:
  /* %eax = M >> 2: number of 4-step unrolled iterations. */
  /* NOTE(review): jle after a multi-bit sarl reads OF, which the Intel SDM */
  /* documents as undefined for shift counts other than 1. M is positive */
  /* here, so je would express the same "count == 0" test - confirm. */
  795. movl M, %eax
  796. sarl $2, %eax
  797. jle .L55
  798. ALIGN_3
  799. .L51:
  /* Four swap steps per iteration. */
  800. movsd (X), %xmm0
  801. movsd (Y), %xmm1
  802. movlps %xmm1, (X)
  803. addl INCX, X
  804. movlps %xmm0, (Y)
  805. addl INCY, Y
  806. movsd (X), %xmm0
  807. movsd (Y), %xmm1
  808. movlps %xmm1, (X)
  809. addl INCX, X
  810. movlps %xmm0, (Y)
  811. addl INCY, Y
  812. movsd (X), %xmm0
  813. movsd (Y), %xmm1
  814. movlps %xmm1, (X)
  815. addl INCX, X
  816. movlps %xmm0, (Y)
  817. addl INCY, Y
  818. movsd (X), %xmm0
  819. movsd (Y), %xmm1
  820. movlps %xmm1, (X)
  821. addl INCX, X
  822. movlps %xmm0, (Y)
  823. addl INCY, Y
  824. decl %eax
  825. jg .L51
  826. ALIGN_3
  827. .L55:
  /* %eax = M & 3: leftover steps. andl clears OF and the result is */
  /* non-negative, so jle is taken exactly when the remainder is zero. */
  828. movl M, %eax
  829. andl $3, %eax
  830. jle .L57
  831. ALIGN_3
  832. .L56:
  /* One swap step per iteration until the remainder is exhausted. */
  833. movsd (X), %xmm0
  834. movsd (Y), %xmm1
  835. movlps %xmm1, (X)
  836. addl INCX, X
  837. movlps %xmm0, (Y)
  838. addl INCY, Y
  839. decl %eax
  840. jg .L56
  841. ALIGN_3
  842. .L57:
  /* Exit for this path: restore the saved registers and return. */
  843. popl %ebx
  844. popl %esi
  845. popl %edi
  846. popl %ebp
  847. ret
  /* EPILOGUE is a macro defined outside this file; it closes the routine */
  /* opened by PROLOGUE. */
  848. EPILOGUE