/* swap_sse.S */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef WINDOWS_ABI
  41. #define M ARG1
  42. #define X ARG4
  43. #define INCX ARG5
  44. #define Y ARG6
  45. #define INCY ARG2
  46. #else
  47. #define M ARG1
  48. #define X ARG2
  49. #define INCX ARG3
  50. #define Y ARG4
  51. #define INCY %rbx
  52. #endif
  53. #include "l1param.h"
  54. PROLOGUE
  55. PROFCODE
  56. #ifndef WINDOWS_ABI
  57. movq 8(%rsp), INCY
  58. #else
  59. pushq %rbx
  60. movq 48(%rsp), X
  61. movq 56(%rsp), INCX
  62. movq 64(%rsp), Y
  63. movq 72(%rsp), INCY
  64. #endif
  65. SAVEREGISTERS
  66. leaq (, INCX, SIZE), INCX
  67. leaq (, INCY, SIZE), INCY
  68. cmpq $SIZE, INCX
  69. jne .L50
  70. cmpq $SIZE, INCY
  71. jne .L50
  72. subq $-32 * SIZE, X
  73. subq $-32 * SIZE, Y
  74. cmpq $3, M
  75. jle .L16
  76. testq $SIZE, Y
  77. je .L05
  78. movss -32 * SIZE(X), %xmm0
  79. movss -32 * SIZE(Y), %xmm1
  80. movss %xmm1, -32 * SIZE(X)
  81. movss %xmm0, -32 * SIZE(Y)
  82. addq $1 * SIZE, X
  83. addq $1 * SIZE, Y
  84. decq M
  85. ALIGN_3
  86. .L05:
  87. testq $2 * SIZE, Y
  88. je .L10
  89. movsd -32 * SIZE(X), %xmm0
  90. movsd -32 * SIZE(Y), %xmm1
  91. movlps %xmm1, -32 * SIZE(X)
  92. movlps %xmm0, -32 * SIZE(Y)
  93. addq $2 * SIZE, X
  94. addq $2 * SIZE, Y
  95. subq $2, M
  96. jle .L19
  97. ALIGN_3
  98. .L10:
  99. cmpq $3, M
  100. jle .L16
  101. testq $2 * SIZE, X
  102. jne .L30
  103. testq $1 * SIZE, X
  104. jne .L20
  105. movq M, %rax
  106. sarq $5, %rax
  107. jle .L13
  108. ALIGN_3
  109. .L11:
  110. #ifdef PREFETCHW
  111. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  112. #endif
  113. movaps -32 * SIZE(X), %xmm0
  114. movaps -32 * SIZE(Y), %xmm1
  115. movaps %xmm0, -32 * SIZE(Y)
  116. movaps %xmm1, -32 * SIZE(X)
  117. movaps -28 * SIZE(X), %xmm0
  118. movaps -28 * SIZE(Y), %xmm1
  119. movaps %xmm0, -28 * SIZE(Y)
  120. movaps %xmm1, -28 * SIZE(X)
  121. #ifdef PREFETCHW
  122. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  123. #endif
  124. movaps -24 * SIZE(X), %xmm0
  125. movaps -24 * SIZE(Y), %xmm1
  126. movaps %xmm0, -24 * SIZE(Y)
  127. movaps %xmm1, -24 * SIZE(X)
  128. movaps -20 * SIZE(X), %xmm0
  129. movaps -20 * SIZE(Y), %xmm1
  130. movaps %xmm0, -20 * SIZE(Y)
  131. movaps %xmm1, -20 * SIZE(X)
  132. #ifdef PREFETCHW
  133. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  134. #endif
  135. movaps -16 * SIZE(X), %xmm0
  136. movaps -16 * SIZE(Y), %xmm1
  137. movaps %xmm0, -16 * SIZE(Y)
  138. movaps %xmm1, -16 * SIZE(X)
  139. movaps -12 * SIZE(X), %xmm0
  140. movaps -12 * SIZE(Y), %xmm1
  141. movaps %xmm0, -12 * SIZE(Y)
  142. movaps %xmm1, -12 * SIZE(X)
  143. #ifdef PREFETCHW
  144. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  145. #endif
  146. movaps -8 * SIZE(X), %xmm0
  147. movaps -8 * SIZE(Y), %xmm1
  148. movaps %xmm0, -8 * SIZE(Y)
  149. movaps %xmm1, -8 * SIZE(X)
  150. movaps -4 * SIZE(X), %xmm0
  151. movaps -4 * SIZE(Y), %xmm1
  152. movaps %xmm0, -4 * SIZE(Y)
  153. movaps %xmm1, -4 * SIZE(X)
  154. subq $-32 * SIZE, Y
  155. subq $-32 * SIZE, X
  156. decq %rax
  157. jg .L11
  158. ALIGN_3
  159. .L13:
  160. testq $16, M
  161. jle .L14
  162. movaps -32 * SIZE(X), %xmm0
  163. movaps -32 * SIZE(Y), %xmm1
  164. movaps %xmm0, -32 * SIZE(Y)
  165. movaps %xmm1, -32 * SIZE(X)
  166. movaps -28 * SIZE(X), %xmm0
  167. movaps -28 * SIZE(Y), %xmm1
  168. movaps %xmm0, -28 * SIZE(Y)
  169. movaps %xmm1, -28 * SIZE(X)
  170. movaps -24 * SIZE(X), %xmm0
  171. movaps -24 * SIZE(Y), %xmm1
  172. movaps %xmm0, -24 * SIZE(Y)
  173. movaps %xmm1, -24 * SIZE(X)
  174. movaps -20 * SIZE(X), %xmm0
  175. movaps -20 * SIZE(Y), %xmm1
  176. movaps %xmm0, -20 * SIZE(Y)
  177. movaps %xmm1, -20 * SIZE(X)
  178. addq $16 * SIZE, X
  179. addq $16 * SIZE, Y
  180. ALIGN_3
  181. .L14:
  182. testq $8, M
  183. jle .L15
  184. movaps -32 * SIZE(X), %xmm0
  185. movaps -32 * SIZE(Y), %xmm1
  186. movaps %xmm0, -32 * SIZE(Y)
  187. movaps %xmm1, -32 * SIZE(X)
  188. movaps -28 * SIZE(X), %xmm0
  189. movaps -28 * SIZE(Y), %xmm1
  190. movaps %xmm0, -28 * SIZE(Y)
  191. movaps %xmm1, -28 * SIZE(X)
  192. addq $8 * SIZE, X
  193. addq $8 * SIZE, Y
  194. ALIGN_3
  195. .L15:
  196. testq $4, M
  197. jle .L16
  198. movaps -32 * SIZE(X), %xmm0
  199. movaps -32 * SIZE(Y), %xmm1
  200. movaps %xmm0, -32 * SIZE(Y)
  201. movaps %xmm1, -32 * SIZE(X)
  202. addq $4 * SIZE, X
  203. addq $4 * SIZE, Y
  204. ALIGN_3
  205. .L16:
  206. testq $2, M
  207. jle .L17
  208. movsd -32 * SIZE(X), %xmm0
  209. movsd -32 * SIZE(Y), %xmm1
  210. movlps %xmm1, -32 * SIZE(X)
  211. addq $2 * SIZE, X
  212. movlps %xmm0, -32 * SIZE(Y)
  213. addq $2 * SIZE, Y
  214. ALIGN_3
  215. .L17:
  216. testq $1, M
  217. jle .L19
  218. movss -32 * SIZE(X), %xmm0
  219. movss -32 * SIZE(Y), %xmm1
  220. movss %xmm1, -32 * SIZE(X)
  221. movss %xmm0, -32 * SIZE(Y)
  222. ALIGN_3
  223. .L19:
  224. xorq %rax,%rax
  225. RESTOREREGISTERS
  226. #ifdef WINDOWS_ABI
  227. popq %rbx
  228. #endif
  229. ret
  230. ALIGN_3
  231. .L20:
  232. movaps -33 * SIZE(X), %xmm0
  233. movaps -32 * SIZE(Y), %xmm1
  234. movss %xmm1, -32 * SIZE(X)
  235. pshufd $0x39, %xmm1, %xmm3
  236. movlps %xmm3, -31 * SIZE(X)
  237. subq $3, M
  238. movq M, %rax
  239. sarq $5, %rax
  240. jle .L23
  241. ALIGN_4
  242. .L21:
  243. #ifdef PREFETCHW
  244. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  245. #endif
  246. movaps -29 * SIZE(X), %xmm2
  247. movaps -28 * SIZE(Y), %xmm3
  248. movss %xmm2, %xmm0
  249. shufps $0x39, %xmm0, %xmm0
  250. movaps %xmm0, -32 * SIZE(Y)
  251. movss %xmm3, %xmm1
  252. shufps $0x93, %xmm3, %xmm1
  253. movaps %xmm1, -29 * SIZE(X)
  254. movaps -25 * SIZE(X), %xmm0
  255. movaps -24 * SIZE(Y), %xmm1
  256. movss %xmm0, %xmm2
  257. shufps $0x39, %xmm2, %xmm2
  258. movaps %xmm2, -28 * SIZE(Y)
  259. movss %xmm1, %xmm3
  260. shufps $0x93, %xmm1, %xmm3
  261. movaps %xmm3, -25 * SIZE(X)
  262. #ifdef PREFETCHW
  263. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  264. #endif
  265. movaps -21 * SIZE(X), %xmm2
  266. movaps -20 * SIZE(Y), %xmm3
  267. movss %xmm2, %xmm0
  268. shufps $0x39, %xmm0, %xmm0
  269. movaps %xmm0, -24 * SIZE(Y)
  270. movss %xmm3, %xmm1
  271. shufps $0x93, %xmm3, %xmm1
  272. movaps %xmm1, -21 * SIZE(X)
  273. movaps -17 * SIZE(X), %xmm0
  274. movaps -16 * SIZE(Y), %xmm1
  275. movss %xmm0, %xmm2
  276. shufps $0x39, %xmm2, %xmm2
  277. movaps %xmm2, -20 * SIZE(Y)
  278. movss %xmm1, %xmm3
  279. shufps $0x93, %xmm1, %xmm3
  280. movaps %xmm3, -17 * SIZE(X)
  281. #ifdef PREFETCHW
  282. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  283. #endif
  284. movaps -13 * SIZE(X), %xmm2
  285. movaps -12 * SIZE(Y), %xmm3
  286. movss %xmm2, %xmm0
  287. shufps $0x39, %xmm0, %xmm0
  288. movaps %xmm0, -16 * SIZE(Y)
  289. movss %xmm3, %xmm1
  290. shufps $0x93, %xmm3, %xmm1
  291. movaps %xmm1, -13 * SIZE(X)
  292. movaps -9 * SIZE(X), %xmm0
  293. movaps -8 * SIZE(Y), %xmm1
  294. movss %xmm0, %xmm2
  295. shufps $0x39, %xmm2, %xmm2
  296. movaps %xmm2, -12 * SIZE(Y)
  297. movss %xmm1, %xmm3
  298. shufps $0x93, %xmm1, %xmm3
  299. movaps %xmm3, -9 * SIZE(X)
  300. #ifdef PREFETCHW
  301. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  302. #endif
  303. movaps -5 * SIZE(X), %xmm2
  304. movaps -4 * SIZE(Y), %xmm3
  305. movss %xmm2, %xmm0
  306. shufps $0x39, %xmm0, %xmm0
  307. movaps %xmm0, -8 * SIZE(Y)
  308. movss %xmm3, %xmm1
  309. shufps $0x93, %xmm3, %xmm1
  310. movaps %xmm1, -5 * SIZE(X)
  311. movaps -1 * SIZE(X), %xmm0
  312. movaps 0 * SIZE(Y), %xmm1
  313. movss %xmm0, %xmm2
  314. shufps $0x39, %xmm2, %xmm2
  315. movaps %xmm2, -4 * SIZE(Y)
  316. movss %xmm1, %xmm3
  317. shufps $0x93, %xmm1, %xmm3
  318. movaps %xmm3, -1 * SIZE(X)
  319. subq $-32 * SIZE, X
  320. subq $-32 * SIZE, Y
  321. decq %rax
  322. jg .L21
  323. ALIGN_3
  324. .L23:
  325. testq $16, M
  326. jle .L24
  327. movaps -29 * SIZE(X), %xmm2
  328. movaps -28 * SIZE(Y), %xmm3
  329. movss %xmm2, %xmm0
  330. shufps $0x39, %xmm0, %xmm0
  331. movaps %xmm0, -32 * SIZE(Y)
  332. movss %xmm3, %xmm1
  333. shufps $0x93, %xmm3, %xmm1
  334. movaps %xmm1, -29 * SIZE(X)
  335. movaps -25 * SIZE(X), %xmm0
  336. movaps -24 * SIZE(Y), %xmm1
  337. movss %xmm0, %xmm2
  338. shufps $0x39, %xmm2, %xmm2
  339. movaps %xmm2, -28 * SIZE(Y)
  340. movss %xmm1, %xmm3
  341. shufps $0x93, %xmm1, %xmm3
  342. movaps %xmm3, -25 * SIZE(X)
  343. movaps -21 * SIZE(X), %xmm2
  344. movaps -20 * SIZE(Y), %xmm3
  345. movss %xmm2, %xmm0
  346. shufps $0x39, %xmm0, %xmm0
  347. movaps %xmm0, -24 * SIZE(Y)
  348. movss %xmm3, %xmm1
  349. shufps $0x93, %xmm3, %xmm1
  350. movaps %xmm1, -21 * SIZE(X)
  351. movaps -17 * SIZE(X), %xmm0
  352. movaps -16 * SIZE(Y), %xmm1
  353. movss %xmm0, %xmm2
  354. shufps $0x39, %xmm2, %xmm2
  355. movaps %xmm2, -20 * SIZE(Y)
  356. movss %xmm1, %xmm3
  357. shufps $0x93, %xmm1, %xmm3
  358. movaps %xmm3, -17 * SIZE(X)
  359. addq $16 * SIZE, X
  360. addq $16 * SIZE, Y
  361. ALIGN_3
  362. .L24:
  363. testq $8, M
  364. jle .L25
  365. movaps -29 * SIZE(X), %xmm2
  366. movaps -28 * SIZE(Y), %xmm3
  367. movss %xmm2, %xmm0
  368. shufps $0x39, %xmm0, %xmm0
  369. movaps %xmm0, -32 * SIZE(Y)
  370. movss %xmm3, %xmm1
  371. shufps $0x93, %xmm3, %xmm1
  372. movaps %xmm1, -29 * SIZE(X)
  373. movaps -25 * SIZE(X), %xmm0
  374. movaps -24 * SIZE(Y), %xmm1
  375. movss %xmm0, %xmm2
  376. shufps $0x39, %xmm2, %xmm2
  377. movaps %xmm2, -28 * SIZE(Y)
  378. movss %xmm1, %xmm3
  379. shufps $0x93, %xmm1, %xmm3
  380. movaps %xmm3, -25 * SIZE(X)
  381. addq $8 * SIZE, X
  382. addq $8 * SIZE, Y
  383. ALIGN_3
  384. .L25:
  385. testq $4, M
  386. jle .L26
  387. movaps -29 * SIZE(X), %xmm2
  388. movaps -28 * SIZE(Y), %xmm3
  389. movss %xmm2, %xmm0
  390. shufps $0x39, %xmm0, %xmm0
  391. movaps %xmm0, -32 * SIZE(Y)
  392. movss %xmm3, %xmm1
  393. shufps $0x93, %xmm3, %xmm1
  394. movaps %xmm1, -29 * SIZE(X)
  395. movaps %xmm2, %xmm0
  396. movaps %xmm3, %xmm1
  397. addq $4 * SIZE, X
  398. addq $4 * SIZE, Y
  399. ALIGN_3
  400. .L26:
  401. pshufd $0x39, %xmm0, %xmm2
  402. pshufd $0xff, %xmm0, %xmm0
  403. movlps %xmm2, -32 * SIZE(Y)
  404. movss %xmm0, -30 * SIZE(Y)
  405. testq $2, M
  406. jle .L27
  407. movsd -29 * SIZE(X), %xmm0
  408. movsd -29 * SIZE(Y), %xmm1
  409. movlps %xmm0, -29 * SIZE(Y)
  410. movlps %xmm1, -29 * SIZE(X)
  411. addq $2 * SIZE, X
  412. addq $2 * SIZE, Y
  413. ALIGN_3
  414. .L27:
  415. testq $1, M
  416. jle .L29
  417. movss -29 * SIZE(X), %xmm0
  418. movss -29 * SIZE(Y), %xmm1
  419. movss %xmm0, -29 * SIZE(Y)
  420. movss %xmm1, -29 * SIZE(X)
  421. ALIGN_3
  422. .L29:
  423. xorq %rax,%rax
  424. RESTOREREGISTERS
  425. #ifdef WINDOWS_ABI
  426. popq %rbx
  427. #endif
  428. ret
  429. ALIGN_3
  430. .L30:
  431. testq $1 * SIZE, X
  432. jne .L40
  433. movhps -32 * SIZE(X), %xmm0
  434. movaps -32 * SIZE(Y), %xmm1
  435. movlps %xmm1, -32 * SIZE(X)
  436. subq $2, M
  437. movq M, %rax
  438. sarq $5, %rax
  439. jle .L33
  440. ALIGN_4
  441. .L31:
  442. #ifdef PREFETCHW
  443. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  444. #endif
  445. movaps -30 * SIZE(X), %xmm2
  446. movaps -28 * SIZE(Y), %xmm3
  447. SHUFPD_1 %xmm2, %xmm0
  448. movaps %xmm0, -32 * SIZE(Y)
  449. SHUFPD_1 %xmm3, %xmm1
  450. movaps %xmm1, -30 * SIZE(X)
  451. movaps -26 * SIZE(X), %xmm0
  452. movaps -24 * SIZE(Y), %xmm1
  453. SHUFPD_1 %xmm0, %xmm2
  454. movaps %xmm2, -28 * SIZE(Y)
  455. SHUFPD_1 %xmm1, %xmm3
  456. movaps %xmm3, -26 * SIZE(X)
  457. #ifdef PREFETCHW
  458. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  459. #endif
  460. movaps -22 * SIZE(X), %xmm2
  461. movaps -20 * SIZE(Y), %xmm3
  462. SHUFPD_1 %xmm2, %xmm0
  463. movaps %xmm0, -24 * SIZE(Y)
  464. SHUFPD_1 %xmm3, %xmm1
  465. movaps %xmm1, -22 * SIZE(X)
  466. movaps -18 * SIZE(X), %xmm0
  467. movaps -16 * SIZE(Y), %xmm1
  468. SHUFPD_1 %xmm0, %xmm2
  469. movaps %xmm2, -20 * SIZE(Y)
  470. SHUFPD_1 %xmm1, %xmm3
  471. movaps %xmm3, -18 * SIZE(X)
  472. #ifdef PREFETCHW
  473. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  474. #endif
  475. movaps -14 * SIZE(X), %xmm2
  476. movaps -12 * SIZE(Y), %xmm3
  477. SHUFPD_1 %xmm2, %xmm0
  478. movaps %xmm0, -16 * SIZE(Y)
  479. SHUFPD_1 %xmm3, %xmm1
  480. movaps %xmm1, -14 * SIZE(X)
  481. movaps -10 * SIZE(X), %xmm0
  482. movaps -8 * SIZE(Y), %xmm1
  483. SHUFPD_1 %xmm0, %xmm2
  484. movaps %xmm2, -12 * SIZE(Y)
  485. SHUFPD_1 %xmm1, %xmm3
  486. movaps %xmm3, -10 * SIZE(X)
  487. #ifdef PREFETCHW
  488. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  489. #endif
  490. movaps -6 * SIZE(X), %xmm2
  491. movaps -4 * SIZE(Y), %xmm3
  492. SHUFPD_1 %xmm2, %xmm0
  493. movaps %xmm0, -8 * SIZE(Y)
  494. SHUFPD_1 %xmm3, %xmm1
  495. movaps %xmm1, -6 * SIZE(X)
  496. movaps -2 * SIZE(X), %xmm0
  497. movaps 0 * SIZE(Y), %xmm1
  498. SHUFPD_1 %xmm0, %xmm2
  499. movaps %xmm2, -4 * SIZE(Y)
  500. SHUFPD_1 %xmm1, %xmm3
  501. movaps %xmm3, -2 * SIZE(X)
  502. subq $-32 * SIZE, X
  503. subq $-32 * SIZE, Y
  504. decq %rax
  505. jg .L31
  506. ALIGN_3
  507. .L33:
  508. testq $16, M
  509. jle .L34
  510. movaps -30 * SIZE(X), %xmm2
  511. movaps -28 * SIZE(Y), %xmm3
  512. SHUFPD_1 %xmm2, %xmm0
  513. movaps %xmm0, -32 * SIZE(Y)
  514. SHUFPD_1 %xmm3, %xmm1
  515. movaps %xmm1, -30 * SIZE(X)
  516. movaps -26 * SIZE(X), %xmm0
  517. movaps -24 * SIZE(Y), %xmm1
  518. SHUFPD_1 %xmm0, %xmm2
  519. movaps %xmm2, -28 * SIZE(Y)
  520. SHUFPD_1 %xmm1, %xmm3
  521. movaps %xmm3, -26 * SIZE(X)
  522. movaps -22 * SIZE(X), %xmm2
  523. movaps -20 * SIZE(Y), %xmm3
  524. SHUFPD_1 %xmm2, %xmm0
  525. movaps %xmm0, -24 * SIZE(Y)
  526. SHUFPD_1 %xmm3, %xmm1
  527. movaps %xmm1, -22 * SIZE(X)
  528. movaps -18 * SIZE(X), %xmm0
  529. movaps -16 * SIZE(Y), %xmm1
  530. SHUFPD_1 %xmm0, %xmm2
  531. movaps %xmm2, -20 * SIZE(Y)
  532. SHUFPD_1 %xmm1, %xmm3
  533. movaps %xmm3, -18 * SIZE(X)
  534. addq $16 * SIZE, X
  535. addq $16 * SIZE, Y
  536. ALIGN_3
  537. .L34:
  538. testq $8, M
  539. jle .L35
  540. movaps -30 * SIZE(X), %xmm2
  541. movaps -28 * SIZE(Y), %xmm3
  542. SHUFPD_1 %xmm2, %xmm0
  543. movaps %xmm0, -32 * SIZE(Y)
  544. SHUFPD_1 %xmm3, %xmm1
  545. movaps %xmm1, -30 * SIZE(X)
  546. movaps -26 * SIZE(X), %xmm0
  547. movaps -24 * SIZE(Y), %xmm1
  548. SHUFPD_1 %xmm0, %xmm2
  549. movaps %xmm2, -28 * SIZE(Y)
  550. SHUFPD_1 %xmm1, %xmm3
  551. movaps %xmm3, -26 * SIZE(X)
  552. addq $8 * SIZE, X
  553. addq $8 * SIZE, Y
  554. ALIGN_3
  555. .L35:
  556. testq $4, M
  557. jle .L36
  558. movaps -30 * SIZE(X), %xmm2
  559. movaps -28 * SIZE(Y), %xmm3
  560. SHUFPD_1 %xmm2, %xmm0
  561. movaps %xmm0, -32 * SIZE(Y)
  562. SHUFPD_1 %xmm3, %xmm1
  563. movaps %xmm1, -30 * SIZE(X)
  564. movaps %xmm2, %xmm0
  565. movaps %xmm3, %xmm1
  566. addq $4 * SIZE, X
  567. addq $4 * SIZE, Y
  568. ALIGN_3
  569. .L36:
  570. movhps %xmm0, -32 * SIZE(Y)
  571. testq $2, M
  572. jle .L37
  573. movsd -30 * SIZE(X), %xmm0
  574. movsd -30 * SIZE(Y), %xmm1
  575. movlps %xmm0, -30 * SIZE(Y)
  576. movlps %xmm1, -30 * SIZE(X)
  577. addq $2 * SIZE, X
  578. addq $2 * SIZE, Y
  579. ALIGN_3
  580. .L37:
  581. testq $1, M
  582. jle .L39
  583. movss -30 * SIZE(X), %xmm0
  584. movss -30 * SIZE(Y), %xmm1
  585. movss %xmm0, -30 * SIZE(Y)
  586. movss %xmm1, -30 * SIZE(X)
  587. ALIGN_3
  588. .L39:
  589. xorq %rax,%rax
  590. RESTOREREGISTERS
  591. #ifdef WINDOWS_ABI
  592. popq %rbx
  593. #endif
  594. ret
  595. ALIGN_3
  596. .L40:
  597. movaps -35 * SIZE(X), %xmm0
  598. movaps -32 * SIZE(Y), %xmm1
  599. movss %xmm1, -32 * SIZE(X)
  600. subq $3, M
  601. movq M, %rax
  602. sarq $5, %rax
  603. jle .L43
  604. ALIGN_4
  605. .L41:
  606. #ifdef PREFETCHW
  607. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  608. #endif
  609. movaps -31 * SIZE(X), %xmm2
  610. movaps -28 * SIZE(Y), %xmm3
  611. movss %xmm2, %xmm0
  612. shufps $0x93, %xmm2, %xmm0
  613. movaps %xmm0, -32 * SIZE(Y)
  614. movss %xmm3, %xmm1
  615. shufps $0x39, %xmm1, %xmm1
  616. movaps %xmm1, -31 * SIZE(X)
  617. movaps -27 * SIZE(X), %xmm0
  618. movaps -24 * SIZE(Y), %xmm1
  619. movss %xmm0, %xmm2
  620. shufps $0x93, %xmm0, %xmm2
  621. movaps %xmm2, -28 * SIZE(Y)
  622. movss %xmm1, %xmm3
  623. shufps $0x39, %xmm3, %xmm3
  624. movaps %xmm3, -27 * SIZE(X)
  625. #ifdef PREFETCHW
  626. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  627. #endif
  628. movaps -23 * SIZE(X), %xmm2
  629. movaps -20 * SIZE(Y), %xmm3
  630. movss %xmm2, %xmm0
  631. shufps $0x93, %xmm2, %xmm0
  632. movaps %xmm0, -24 * SIZE(Y)
  633. movss %xmm3, %xmm1
  634. shufps $0x39, %xmm1, %xmm1
  635. movaps %xmm1, -23 * SIZE(X)
  636. movaps -19 * SIZE(X), %xmm0
  637. movaps -16 * SIZE(Y), %xmm1
  638. movss %xmm0, %xmm2
  639. shufps $0x93, %xmm0, %xmm2
  640. movaps %xmm2, -20 * SIZE(Y)
  641. movss %xmm1, %xmm3
  642. shufps $0x39, %xmm3, %xmm3
  643. movaps %xmm3, -19 * SIZE(X)
  644. #ifdef PREFETCHW
  645. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  646. #endif
  647. movaps -15 * SIZE(X), %xmm2
  648. movaps -12 * SIZE(Y), %xmm3
  649. movss %xmm2, %xmm0
  650. shufps $0x93, %xmm2, %xmm0
  651. movaps %xmm0, -16 * SIZE(Y)
  652. movss %xmm3, %xmm1
  653. shufps $0x39, %xmm1, %xmm1
  654. movaps %xmm1, -15 * SIZE(X)
  655. movaps -11 * SIZE(X), %xmm0
  656. movaps -8 * SIZE(Y), %xmm1
  657. movss %xmm0, %xmm2
  658. shufps $0x93, %xmm0, %xmm2
  659. movaps %xmm2, -12 * SIZE(Y)
  660. movss %xmm1, %xmm3
  661. shufps $0x39, %xmm3, %xmm3
  662. movaps %xmm3, -11 * SIZE(X)
  663. #ifdef PREFETCHW
  664. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  665. #endif
  666. movaps -7 * SIZE(X), %xmm2
  667. movaps -4 * SIZE(Y), %xmm3
  668. movss %xmm2, %xmm0
  669. shufps $0x93, %xmm2, %xmm0
  670. movaps %xmm0, -8 * SIZE(Y)
  671. movss %xmm3, %xmm1
  672. shufps $0x39, %xmm1, %xmm1
  673. movaps %xmm1, -7 * SIZE(X)
  674. movaps -3 * SIZE(X), %xmm0
  675. movaps 0 * SIZE(Y), %xmm1
  676. movss %xmm0, %xmm2
  677. shufps $0x93, %xmm0, %xmm2
  678. movaps %xmm2, -4 * SIZE(Y)
  679. movss %xmm1, %xmm3
  680. shufps $0x39, %xmm3, %xmm3
  681. movaps %xmm3, -3 * SIZE(X)
  682. subq $-32 * SIZE, X
  683. subq $-32 * SIZE, Y
  684. decq %rax
  685. jg .L41
  686. ALIGN_3
  687. .L43:
  688. testq $16, M
  689. jle .L44
  690. movaps -31 * SIZE(X), %xmm2
  691. movaps -28 * SIZE(Y), %xmm3
  692. movss %xmm2, %xmm0
  693. shufps $0x93, %xmm2, %xmm0
  694. movaps %xmm0, -32 * SIZE(Y)
  695. movss %xmm3, %xmm1
  696. shufps $0x39, %xmm1, %xmm1
  697. movaps %xmm1, -31 * SIZE(X)
  698. movaps -27 * SIZE(X), %xmm0
  699. movaps -24 * SIZE(Y), %xmm1
  700. movss %xmm0, %xmm2
  701. shufps $0x93, %xmm0, %xmm2
  702. movaps %xmm2, -28 * SIZE(Y)
  703. movss %xmm1, %xmm3
  704. shufps $0x39, %xmm3, %xmm3
  705. movaps %xmm3, -27 * SIZE(X)
  706. movaps -23 * SIZE(X), %xmm2
  707. movaps -20 * SIZE(Y), %xmm3
  708. movss %xmm2, %xmm0
  709. shufps $0x93, %xmm2, %xmm0
  710. movaps %xmm0, -24 * SIZE(Y)
  711. movss %xmm3, %xmm1
  712. shufps $0x39, %xmm1, %xmm1
  713. movaps %xmm1, -23 * SIZE(X)
  714. movaps -19 * SIZE(X), %xmm0
  715. movaps -16 * SIZE(Y), %xmm1
  716. movss %xmm0, %xmm2
  717. shufps $0x93, %xmm0, %xmm2
  718. movaps %xmm2, -20 * SIZE(Y)
  719. movss %xmm1, %xmm3
  720. shufps $0x39, %xmm3, %xmm3
  721. movaps %xmm3, -19 * SIZE(X)
  722. addq $16 * SIZE, X
  723. addq $16 * SIZE, Y
  724. ALIGN_3
  725. .L44:
  726. testq $8, M
  727. jle .L45
  728. movaps -31 * SIZE(X), %xmm2
  729. movaps -28 * SIZE(Y), %xmm3
  730. movss %xmm2, %xmm0
  731. shufps $0x93, %xmm2, %xmm0
  732. movaps %xmm0, -32 * SIZE(Y)
  733. movss %xmm3, %xmm1
  734. shufps $0x39, %xmm1, %xmm1
  735. movaps %xmm1, -31 * SIZE(X)
  736. movaps -27 * SIZE(X), %xmm0
  737. movaps -24 * SIZE(Y), %xmm1
  738. movss %xmm0, %xmm2
  739. shufps $0x93, %xmm0, %xmm2
  740. movaps %xmm2, -28 * SIZE(Y)
  741. movss %xmm1, %xmm3
  742. shufps $0x39, %xmm3, %xmm3
  743. movaps %xmm3, -27 * SIZE(X)
  744. addq $8 * SIZE, X
  745. addq $8 * SIZE, Y
  746. ALIGN_3
  747. .L45:
  748. testq $4, M
  749. jle .L46
  750. movaps -31 * SIZE(X), %xmm2
  751. movaps -28 * SIZE(Y), %xmm3
  752. movss %xmm2, %xmm0
  753. shufps $0x93, %xmm2, %xmm0
  754. movaps %xmm0, -32 * SIZE(Y)
  755. movss %xmm3, %xmm1
  756. shufps $0x39, %xmm1, %xmm1
  757. movaps %xmm1, -31 * SIZE(X)
  758. movaps %xmm2, %xmm0
  759. movaps %xmm3, %xmm1
  760. addq $4 * SIZE, X
  761. addq $4 * SIZE, Y
  762. ALIGN_3
  763. .L46:
  764. movsd -31 * SIZE(X), %xmm2
  765. pshufd $0x39, %xmm1, %xmm1
  766. movlps %xmm1, -31 * SIZE(X)
  767. pshufd $0xff, %xmm0, %xmm0
  768. movss %xmm0, -32 * SIZE(Y)
  769. movlps %xmm2, -31 * SIZE(Y)
  770. addq $3 * SIZE, X
  771. addq $3 * SIZE, Y
  772. testq $2, M
  773. jle .L47
  774. movsd -32 * SIZE(X), %xmm0
  775. movsd -32 * SIZE(Y), %xmm1
  776. movlps %xmm0, -32 * SIZE(Y)
  777. movlps %xmm1, -32 * SIZE(X)
  778. addq $2 * SIZE, X
  779. addq $2 * SIZE, Y
  780. ALIGN_3
  781. .L47:
  782. testq $1, M
  783. jle .L49
  784. movss -32 * SIZE(X), %xmm0
  785. movss -32 * SIZE(Y), %xmm1
  786. movss %xmm0, -32 * SIZE(Y)
  787. movss %xmm1, -32 * SIZE(X)
  788. ALIGN_3
  789. .L49:
  790. xorq %rax,%rax
  791. RESTOREREGISTERS
  792. #ifdef WINDOWS_ABI
  793. popq %rbx
  794. #endif
  795. ret
  796. ALIGN_3
  797. .L50:
  798. movq M, %rax
  799. sarq $3, %rax
  800. jle .L55
  801. ALIGN_3
  802. .L51:
  803. movss (X), %xmm0
  804. movss (Y), %xmm1
  805. movss %xmm1, (X)
  806. addq INCX, X
  807. movss %xmm0, (Y)
  808. addq INCY, Y
  809. movss (X), %xmm0
  810. movss (Y), %xmm1
  811. movss %xmm1, (X)
  812. addq INCX, X
  813. movss %xmm0, (Y)
  814. addq INCY, Y
  815. movss (X), %xmm0
  816. movss (Y), %xmm1
  817. movss %xmm1, (X)
  818. addq INCX, X
  819. movss %xmm0, (Y)
  820. addq INCY, Y
  821. movss (X), %xmm0
  822. movss (Y), %xmm1
  823. movss %xmm1, (X)
  824. addq INCX, X
  825. movss %xmm0, (Y)
  826. addq INCY, Y
  827. movss (X), %xmm0
  828. movss (Y), %xmm1
  829. movss %xmm1, (X)
  830. addq INCX, X
  831. movss %xmm0, (Y)
  832. addq INCY, Y
  833. movss (X), %xmm0
  834. movss (Y), %xmm1
  835. movss %xmm1, (X)
  836. addq INCX, X
  837. movss %xmm0, (Y)
  838. addq INCY, Y
  839. movss (X), %xmm0
  840. movss (Y), %xmm1
  841. movss %xmm1, (X)
  842. addq INCX, X
  843. movss %xmm0, (Y)
  844. addq INCY, Y
  845. movss (X), %xmm0
  846. movss (Y), %xmm1
  847. movss %xmm1, (X)
  848. addq INCX, X
  849. movss %xmm0, (Y)
  850. addq INCY, Y
  851. decq %rax
  852. jg .L51
  853. ALIGN_3
  854. .L55:
  855. movq M, %rax
  856. andq $7, %rax
  857. jle .L57
  858. ALIGN_3
  859. .L56:
  860. movss (X), %xmm0
  861. movss (Y), %xmm1
  862. movss %xmm1, (X)
  863. movss %xmm0, (Y)
  864. addq INCX, X
  865. addq INCY, Y
  866. decq %rax
  867. jg .L56
  868. ALIGN_3
  869. .L57:
  870. xorq %rax, %rax
  871. RESTOREREGISTERS
  872. #ifdef WINDOWS_ABI
  873. popq %rbx
  874. #endif
  875. ret
  876. EPILOGUE