
axpy_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
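
/* Single-precision AXPY kernel using SSE: y[i] += alpha * x[i] for i = 0 .. m-1. */
/* M, X, INCX, Y and INCY below map the BLAS arguments to registers for the       */
/* active calling convention.                                                     */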
#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"
PROLOGUE
PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq 8(%rsp), INCY
#else
	movq 24(%rsp), INCY
#endif
	movaps %xmm0, ALPHA
#else
	movq 40(%rsp), X
	movq 48(%rsp), INCX
	movq 56(%rsp), Y
	movq 64(%rsp), INCY
#endif

	SAVEREGISTERS

#ifdef WINDOWS_ABI
	movaps %xmm3, ALPHA
#endif

	shufps $0, ALPHA, ALPHA

	leaq (, INCX, SIZE), INCX
	leaq (, INCY, SIZE), INCY

	testq M, M
	jle .L19

	cmpq $SIZE, INCX
	jne .L50
	cmpq $SIZE, INCY
	jne .L50
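
/* Unit-stride path: X and Y are biased by 32 elements so the loops can use */
/* negative offsets; up to three leading elements are peeled off below until */
/* Y is 16-byte aligned. */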
	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y

	cmpq $3, M
	jle .L16

	testq $SIZE, Y
	je .L00

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)

	addq $1 * SIZE, X
	addq $1 * SIZE, Y
	decq M
	jle .L19
	ALIGN_3

.L00:
	testq $SIZE * 2, Y
	je .L10

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)

	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	subq $2, M
	jle .L19
	ALIGN_3

.L10:
	testq $SIZE * 3, X
	jne .L20

	movq M, %rax
	sarq $5, %rax
	jle .L13
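
/* Both X and Y are 16-byte aligned here: the main loop processes 32 floats */
/* per iteration with aligned loads and stores plus software prefetch. */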
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	decq %rax
	jle .L12
	ALIGN_4

.L11:
	movaps -16 * SIZE(X), %xmm4
	movaps -12 * SIZE(X), %xmm5
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm6
	movaps -4 * SIZE(X), %xmm7
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0
	movaps 4 * SIZE(X), %xmm1
#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movaps 8 * SIZE(X), %xmm2
	movaps 12 * SIZE(X), %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L11
	ALIGN_3

.L12:
	movaps -16 * SIZE(X), %xmm4
	movaps -12 * SIZE(X), %xmm5
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm6
	movaps -4 * SIZE(X), %xmm7
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3
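
/* Remainder handling: up to 31 leftover elements in blocks of 16, 8, 4, 2 and 1. */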
.L13:
	movq M, %rax
	andq $16, %rax
	jle .L14
	ALIGN_3

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, -24 * SIZE(Y)
	movaps %xmm3, -20 * SIZE(Y)
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L14:
	movq M, %rax
	andq $8, %rax
	jle .L15
	ALIGN_3

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L15:
	movq M, %rax
	andq $4, %rax
	jle .L16
	ALIGN_3

	movaps -32 * SIZE(X), %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L16:
	movq M, %rax
	andq $2, %rax
	jle .L17
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L17:
	movq M, %rax
	andq $1, %rax
	jle .L19
	ALIGN_3

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3
.L20:
#ifdef ALIGNED_ACCESS
	testq $SIZE, X
	jne .L30
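
/* X is 8-byte but not 16-byte aligned: load the two leading floats with movhps, */
/* then merge adjacent aligned loads with SHUFPD_1 to form each group of four. */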
	movhps -32 * SIZE(X), %xmm0

	movq M, %rax
	sarq $5, %rax
	jle .L23

	movaps -30 * SIZE(X), %xmm1
	movaps -26 * SIZE(X), %xmm2
	movaps -22 * SIZE(X), %xmm3
	movaps -18 * SIZE(X), %xmm4

	decq %rax
	jle .L22
	ALIGN_4

.L21:
	movaps -14 * SIZE(X), %xmm5
	movaps -10 * SIZE(X), %xmm6
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	SHUFPD_1 %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -6 * SIZE(X), %xmm7
	movaps -2 * SIZE(X), %xmm0
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	SHUFPD_1 %xmm3, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	SHUFPD_1 %xmm4, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps 2 * SIZE(X), %xmm1
	movaps 6 * SIZE(X), %xmm2
#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	SHUFPD_1 %xmm5, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	SHUFPD_1 %xmm6, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movaps 10 * SIZE(X), %xmm3
	movaps 14 * SIZE(X), %xmm4
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	SHUFPD_1 %xmm7, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	SHUFPD_1 %xmm0, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	movaps -14 * SIZE(X), %xmm5
	movaps -10 * SIZE(X), %xmm6
	SHUFPD_1 %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -6 * SIZE(X), %xmm7
	movaps -2 * SIZE(X), %xmm0
	SHUFPD_1 %xmm3, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	SHUFPD_1 %xmm4, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	SHUFPD_1 %xmm5, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	SHUFPD_1 %xmm6, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	SHUFPD_1 %xmm7, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	SHUFPD_1 %xmm0, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L23:
	movq M, %rax
	andq $16, %rax
	jle .L24
	ALIGN_3

	movaps -30 * SIZE(X), %xmm1
	movaps -26 * SIZE(X), %xmm2
	movaps -22 * SIZE(X), %xmm3
	movaps -18 * SIZE(X), %xmm4
	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1
	SHUFPD_1 %xmm3, %xmm2
	SHUFPD_1 %xmm4, %xmm3
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, -24 * SIZE(Y)
	movaps %xmm3, -20 * SIZE(Y)
	movaps %xmm4, %xmm0
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L24:
	movq M, %rax
	andq $8, %rax
	jle .L25
	ALIGN_3

	movaps -30 * SIZE(X), %xmm1
	movaps -26 * SIZE(X), %xmm2
	SHUFPD_1 %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	SHUFPD_1 %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, %xmm0
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L25:
	movq M, %rax
	andq $4, %rax
	jle .L26
	ALIGN_3

	movaps -30 * SIZE(X), %xmm1
	SHUFPD_1 %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L26:
	movq M, %rax
	andq $2, %rax
	jle .L27
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L27:
	movq M, %rax
	andq $1, %rax
	jle .L29
	ALIGN_3

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	addq $SIZE, Y
	ALIGN_3

.L29:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L30:
	testq $2 * SIZE, X
	jne .L40
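
/* X is one float past a 16-byte boundary: adjacent aligned loads are */
/* recombined with movss and the SHUFPS_39 rotate. */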
	movaps -33 * SIZE(X), %xmm0

	movq M, %rax
	sarq $5, %rax
	jle .L33

	movaps -29 * SIZE(X), %xmm1
	movaps -25 * SIZE(X), %xmm2
	movaps -21 * SIZE(X), %xmm3
	movaps -17 * SIZE(X), %xmm4

	decq %rax
	jle .L32
	ALIGN_4

.L31:
	movaps -13 * SIZE(X), %xmm5
	movaps -9 * SIZE(X), %xmm6
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -5 * SIZE(X), %xmm7
	movaps -1 * SIZE(X), %xmm0
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movss %xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps 3 * SIZE(X), %xmm1
	movaps 7 * SIZE(X), %xmm2
#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	movss %xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movaps 11 * SIZE(X), %xmm3
	movaps 15 * SIZE(X), %xmm4
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	movss %xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L31
	ALIGN_3

.L32:
	movaps -13 * SIZE(X), %xmm5
	movaps -9 * SIZE(X), %xmm6
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -5 * SIZE(X), %xmm7
	movaps -1 * SIZE(X), %xmm0
	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movss %xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movss %xmm5, %xmm4
	SHUFPS_39 %xmm4, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	movss %xmm6, %xmm5
	SHUFPS_39 %xmm5, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movss %xmm7, %xmm6
	SHUFPS_39 %xmm6, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	movss %xmm0, %xmm7
	SHUFPS_39 %xmm7, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L33:
	movq M, %rax
	andq $16, %rax
	jle .L34
	ALIGN_3

	movaps -29 * SIZE(X), %xmm1
	movaps -25 * SIZE(X), %xmm2
	movaps -21 * SIZE(X), %xmm3
	movaps -17 * SIZE(X), %xmm4
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movss %xmm4, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, -24 * SIZE(Y)
	movaps %xmm3, -20 * SIZE(Y)
	movaps %xmm4, %xmm0
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L34:
	movq M, %rax
	andq $8, %rax
	jle .L35
	ALIGN_3

	movaps -29 * SIZE(X), %xmm1
	movaps -25 * SIZE(X), %xmm2
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, %xmm0
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L35:
	movq M, %rax
	andq $4, %rax
	jle .L36
	ALIGN_3

	movaps -29 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L36:
	movq M, %rax
	andq $2, %rax
	jle .L37
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L37:
	movq M, %rax
	andq $1, %rax
	jle .L39
	ALIGN_3

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	addq $SIZE, Y
	ALIGN_3

.L39:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L40:
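/* X is three floats past a 16-byte boundary: adjacent aligned loads are */
/* recombined with movss and shufps $0x93. */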
	movaps -35 * SIZE(X), %xmm0

	movq M, %rax
	sarq $5, %rax
	jle .L43

	movaps -31 * SIZE(X), %xmm1
	movaps -27 * SIZE(X), %xmm2
	movaps -23 * SIZE(X), %xmm3
	movaps -19 * SIZE(X), %xmm4

	decq %rax
	jle .L42
	ALIGN_4

.L41:
	movaps -15 * SIZE(X), %xmm5
	movaps -11 * SIZE(X), %xmm6
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -7 * SIZE(X), %xmm7
	movaps -3 * SIZE(X), %xmm0
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movss %xmm4, %xmm3
	shufps $0x93, %xmm4, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps 1 * SIZE(X), %xmm1
	movaps 5 * SIZE(X), %xmm2
#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movaps 9 * SIZE(X), %xmm3
	movaps 13 * SIZE(X), %xmm4
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	shufps $0x93, %xmm7, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	movss %xmm0, %xmm7
	shufps $0x93, %xmm0, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L41
	ALIGN_3

.L42:
	movaps -15 * SIZE(X), %xmm5
	movaps -11 * SIZE(X), %xmm6
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -7 * SIZE(X), %xmm7
	movaps -3 * SIZE(X), %xmm0
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movss %xmm4, %xmm3
	shufps $0x93, %xmm4, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movss %xmm5, %xmm4
	shufps $0x93, %xmm5, %xmm4
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	movss %xmm6, %xmm5
	shufps $0x93, %xmm6, %xmm5
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movss %xmm7, %xmm6
	shufps $0x93, %xmm7, %xmm6
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	movss %xmm0, %xmm7
	shufps $0x93, %xmm0, %xmm7
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L43:
	movq M, %rax
	andq $16, %rax
	jle .L44
	ALIGN_3

	movaps -31 * SIZE(X), %xmm1
	movaps -27 * SIZE(X), %xmm2
	movaps -23 * SIZE(X), %xmm3
	movaps -19 * SIZE(X), %xmm4
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movss %xmm4, %xmm3
	shufps $0x93, %xmm4, %xmm3
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, -24 * SIZE(Y)
	movaps %xmm3, -20 * SIZE(Y)
	movaps %xmm4, %xmm0
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L44:
	movq M, %rax
	andq $8, %rax
	jle .L45
	ALIGN_3

	movaps -31 * SIZE(X), %xmm1
	movaps -27 * SIZE(X), %xmm2
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	movaps %xmm2, %xmm0
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L45:
	movq M, %rax
	andq $4, %rax
	jle .L46
	ALIGN_3

	movaps -31 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L46:
	movq M, %rax
	andq $2, %rax
	jle .L47
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L47:
	movq M, %rax
	andq $1, %rax
	jle .L49
	ALIGN_3

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	addq $SIZE, Y
	ALIGN_3

.L49:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
#else
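
/* Without ALIGNED_ACCESS, misaligned X is simply loaded with movsd/movhps */
/* pairs while Y keeps its aligned loads and stores. */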
	movq M, %rax
	sarq $5, %rax
	jle .L23

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3

	decq %rax
	jle .L22
	ALIGN_4

.L21:
	movsd -16 * SIZE(X), %xmm4
	movhps -14 * SIZE(X), %xmm4
	movsd -12 * SIZE(X), %xmm5
	movhps -10 * SIZE(X), %xmm5
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm6
	movhps -6 * SIZE(X), %xmm6
	movsd -4 * SIZE(X), %xmm7
	movhps -2 * SIZE(X), %xmm7
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movsd 0 * SIZE(X), %xmm0
	movhps 2 * SIZE(X), %xmm0
	movsd 4 * SIZE(X), %xmm1
	movhps 6 * SIZE(X), %xmm1
#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	movsd 8 * SIZE(X), %xmm2
	movhps 10 * SIZE(X), %xmm2
	movsd 12 * SIZE(X), %xmm3
	movhps 14 * SIZE(X), %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	movsd -16 * SIZE(X), %xmm4
	movhps -14 * SIZE(X), %xmm4
	movsd -12 * SIZE(X), %xmm5
	movhps -10 * SIZE(X), %xmm5
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm6
	movhps -6 * SIZE(X), %xmm6
	movsd -4 * SIZE(X), %xmm7
	movhps -2 * SIZE(X), %xmm7
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	mulps ALPHA, %xmm4
	addps -16 * SIZE(Y), %xmm4
	movaps %xmm4, -16 * SIZE(Y)
	mulps ALPHA, %xmm5
	addps -12 * SIZE(Y), %xmm5
	movaps %xmm5, -12 * SIZE(Y)
	mulps ALPHA, %xmm6
	addps -8 * SIZE(Y), %xmm6
	movaps %xmm6, -8 * SIZE(Y)
	mulps ALPHA, %xmm7
	addps -4 * SIZE(Y), %xmm7
	movaps %xmm7, -4 * SIZE(Y)

	subq $-32 * SIZE, X
	subq $-32 * SIZE, Y
	ALIGN_3

.L23:
	movq M, %rax
	andq $16, %rax
	jle .L24
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3
	mulps ALPHA, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	mulps ALPHA, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	addq $16 * SIZE, X
	addq $16 * SIZE, Y
	ALIGN_3

.L24:
	movq M, %rax
	andq $8, %rax
	jle .L25
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	mulps ALPHA, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm0, -32 * SIZE(Y)
	movaps %xmm1, -28 * SIZE(Y)
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L25:
	movq M, %rax
	andq $4, %rax
	jle .L26
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	mulps ALPHA, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L26:
	movq M, %rax
	andq $2, %rax
	jle .L27
	ALIGN_3

	movsd -32 * SIZE(X), %xmm0
	movsd -32 * SIZE(Y), %xmm4
	mulps ALPHA, %xmm0
	addps %xmm4, %xmm0
	movsd %xmm0, -32 * SIZE(Y)
	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L27:
	movq M, %rax
	andq $1, %rax
	jle .L29
	ALIGN_3

	movss -32 * SIZE(X), %xmm0
	mulss ALPHA, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	addq $SIZE, Y
	ALIGN_3

.L29:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
#endif
	ALIGN_3

.L50:
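/* Non-unit-stride (or zero-stride) path: scalar operations, unrolled by */
/* eight only when both increments are nonzero. */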
	movq M, %rax
	movq Y, YY

	// If incx == 0 or incy == 0, skip the unrolled loop and use the
	// scalar loop at .L56.
	cmpq $0, INCX
	je .L56
	cmpq $0, INCY
	je .L56

	sarq $3, %rax
	jle .L55
	ALIGN_3

.L51:
	movss (X), %xmm0
	addq INCX, X
	mulss ALPHA, %xmm0
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm0
	movss (X), %xmm1
	addq INCX, X
	mulss ALPHA, %xmm1
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm1
	movss (X), %xmm2
	addq INCX, X
	mulss ALPHA, %xmm2
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm2
	movss (X), %xmm3
	addq INCX, X
	mulss ALPHA, %xmm3
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm3

	movss %xmm0, (Y)
	addq INCY, Y
	movss %xmm1, (Y)
	addq INCY, Y
	movss %xmm2, (Y)
	addq INCY, Y
	movss %xmm3, (Y)
	addq INCY, Y

	movss (X), %xmm0
	addq INCX, X
	mulss ALPHA, %xmm0
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm0
	movss (X), %xmm1
	addq INCX, X
	mulss ALPHA, %xmm1
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm1
	movss (X), %xmm2
	addq INCX, X
	mulss ALPHA, %xmm2
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm2
	movss (X), %xmm3
	addq INCX, X
	mulss ALPHA, %xmm3
	movss (YY), %xmm6
	addq INCY, YY
	addss %xmm6, %xmm3

	movss %xmm0, (Y)
	addq INCY, Y
	movss %xmm1, (Y)
	addq INCY, Y
	movss %xmm2, (Y)
	addq INCY, Y
	movss %xmm3, (Y)
	addq INCY, Y

	decq %rax
	jg .L51
	ALIGN_3

.L55:
	movq M, %rax
	andq $7, %rax
	jle .L59
	ALIGN_3

.L56:
	movss (X), %xmm0
	addq INCX, X
	mulss ALPHA, %xmm0
	movss (Y), %xmm6
	addss %xmm6, %xmm0
	movss %xmm0, (Y)
	addq INCY, Y

	decq %rax
	jg .L56
	ALIGN_3

.L59:
	xorq %rax,%rax
	RESTOREREGISTERS
	ret
	ALIGN_3

	EPILOGUE