
zgemv_t_dup.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)

#define M %rdi
#define N %rsi
#define A %rcx
#define LDA %r8
#define X %r9
#define INCX %rdx
#define Y %rbp
#define INCY %r10

#else

#define STACKSIZE 256

#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
#define OLD_X 64 + STACKSIZE(%rsp)
#define OLD_INCX 72 + STACKSIZE(%rsp)
#define OLD_Y 80 + STACKSIZE(%rsp)
#define OLD_INCY 88 + STACKSIZE(%rsp)
#define OLD_BUFFER 96 + STACKSIZE(%rsp)

#define M %rcx
#define N %rdx
#define A %r8
#define LDA %r9
#define X %rdi
#define INCX %rsi
#define Y %rbp
#define INCY %r10

#endif

#define I %rax
#define J %rbx
#define A1 %r11
#define A2 %r12
#define X1 %r13
#define Y1 %r14
#define BUFFER %r15

#define ALPHA_R %xmm14
#define ALPHA_I %xmm15

#undef SUBPD

#ifndef CONJ
#define SUBPD addpd
#else
#define SUBPD subpd
#endif
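
/*
 * Transposed GEMV kernel for double-precision complex data:
 * y += alpha * A^T * x, with conjugation variants selected by the
 * CONJ / XCONJ macros. The "dup" variant broadcasts each matrix
 * element with movddup.
 */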
	PROLOGUE
	PROFCODE

	subq $STACKSIZE, %rsp
	movq %rbx, 0(%rsp)
	movq %rbp, 8(%rsp)
	movq %r12, 16(%rsp)
	movq %r13, 24(%rsp)
	movq %r14, 32(%rsp)
	movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq %rdi, 48(%rsp)
	movq %rsi, 56(%rsp)
	movups %xmm6, 64(%rsp)
	movups %xmm7, 80(%rsp)
	movups %xmm8, 96(%rsp)
	movups %xmm9, 112(%rsp)
	movups %xmm10, 128(%rsp)
	movups %xmm11, 144(%rsp)
	movups %xmm12, 160(%rsp)
	movups %xmm13, 176(%rsp)
	movups %xmm14, 192(%rsp)
	movups %xmm15, 208(%rsp)

	movq OLD_A, A
	movq OLD_LDA, LDA
	movq OLD_X, X

	movaps %xmm3, %xmm0
	movsd OLD_ALPHA_I, %xmm1
#endif

	movq OLD_INCX, INCX
	movq OLD_Y, Y
	movq OLD_INCY, INCY
	movq OLD_BUFFER, BUFFER

	salq $ZBASE_SHIFT, LDA
	salq $ZBASE_SHIFT, INCX
	salq $ZBASE_SHIFT, INCY

	pcmpeqb %xmm5, %xmm5
	psllq $63, %xmm5
	shufps $0x04, %xmm5, %xmm5

	unpcklpd %xmm1, %xmm0
	movaps %xmm0, ALPHA_R
	pshufd $0x4e, %xmm0, ALPHA_I
	xorps %xmm5, ALPHA_I

	subq $-16 * SIZE, A

	testq M, M
	jle .L999
	testq N, N
	jle .L999
	ALIGN_3
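
/* Pack the strided vector X into the contiguous BUFFER, four complex */
/* elements per iteration; .L06 copies the remaining M mod 4 elements. */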
	movq BUFFER, X1
	movq Y, Y1

	movq M, I
	sarq $2, I
	jle .L05
	ALIGN_4

.L02:
	movsd 0 * SIZE(X), %xmm0
	movhpd 1 * SIZE(X), %xmm0
	addq INCX, X

	movsd 0 * SIZE(X), %xmm1
	movhpd 1 * SIZE(X), %xmm1
	addq INCX, X

	movsd 0 * SIZE(X), %xmm2
	movhpd 1 * SIZE(X), %xmm2
	addq INCX, X

	movsd 0 * SIZE(X), %xmm3
	movhpd 1 * SIZE(X), %xmm3
	addq INCX, X

	movapd %xmm0, 0 * SIZE(X1)
	movapd %xmm1, 2 * SIZE(X1)
	movapd %xmm2, 4 * SIZE(X1)
	movapd %xmm3, 6 * SIZE(X1)

	addq $8 * SIZE, X1
	decq I
	jg .L02
	ALIGN_4

.L05:
	movq M, I
	andq $3, I
	jle .L10
	ALIGN_2

.L06:
	movsd 0 * SIZE(X), %xmm0
	movhpd 1 * SIZE(X), %xmm0
	addq INCX, X
	movapd %xmm0, 0 * SIZE(X1)
	addq $2 * SIZE, X1
	decq I
	jg .L06
	ALIGN_4

.L10:
#if GEMV_UNROLL >= 4

	cmpq $4, N
	jl .L20
	ALIGN_3
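
/* Process four columns of A per outer iteration. xmm0-xmm7 hold, per  */
/* column, the partial products of X with the real and imaginary parts */
/* of the A elements (two accumulators per column).                    */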
.L11:
	subq $4, N

	leaq 16 * SIZE(BUFFER), X1

	movq A, A1
	leaq (A1, LDA, 2), A2
	leaq (A1, LDA, 4), A

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorps %xmm0, %xmm0
	xorps %xmm1, %xmm1
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

#ifdef PREFETCHW
	PREFETCHW 3 * SIZE(Y1)
#endif

	movq M, I
	sarq $2, I
	jle .L15

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	decq I
	jle .L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -16 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -15 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -16 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -15 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm7
	movddup -13 * SIZE(A1, LDA), %xmm11

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A2), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A2), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A2, LDA), %xmm10
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A2, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm4
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm5
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm6
	movddup -12 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm7
	movddup -11 * SIZE(A1, LDA), %xmm11

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -12 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -11 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	movddup -10 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	movddup -9 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	movddup -10 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm7
	movddup -9 * SIZE(A1, LDA), %xmm11

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -10 * SIZE(A2), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -9 * SIZE(A2), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A2, LDA), %xmm10
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A2, LDA), %xmm11

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
#endif

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm4
	movddup -8 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm5
	movddup -7 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm6
	movddup -8 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm7
	movddup -7 * SIZE(A1, LDA), %xmm11

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, A2
	subq $-8 * SIZE, X1

	subq $1, I
	BRANCH
	jg .L13
	ALIGN_3

.L14:
	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -16 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -15 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -16 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -15 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm7
	movddup -13 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A2), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A2), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A2, LDA), %xmm10
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A2, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm4
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm5
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm6
	movddup -12 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm7
	movddup -11 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -12 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -11 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	movddup -10 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	movddup -9 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	movddup -10 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm7
	movddup -9 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -10 * SIZE(A2), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -9 * SIZE(A2), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A2, LDA), %xmm10
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A2, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm4
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm5
	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm6
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm7

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, A2
	subq $-8 * SIZE, X1
	ALIGN_3
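
/* Handle the remaining M mod 4 rows for this group of four columns. */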
.L15:
	testq $2, M
	je .L17

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -16 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -15 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -16 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -15 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm7
	movddup -13 * SIZE(A1, LDA), %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A2), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A2), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A2, LDA), %xmm10
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A2, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm4
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm5
	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm6
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm7

	addq $4 * SIZE, A1
	addq $4 * SIZE, A2
	ALIGN_3

.L17:
	testq $1, M
	je .L19

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -16 * SIZE(A2), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -15 * SIZE(A2), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -16 * SIZE(A2, LDA), %xmm10
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	movddup -15 * SIZE(A2, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm4
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm5
	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm6
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm7
	ALIGN_3
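
/* Reduce the per-column accumulators into complex dot products, apply */
/* the conjugation mask and alpha, then accumulate into Y.             */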
.L19:
	pcmpeqb %xmm13, %xmm13
	psllq $63, %xmm13
	shufps $0x40, %xmm13, %xmm13

#ifndef XCONJ
	xorps %xmm13, %xmm1
	xorps %xmm13, %xmm3
	xorps %xmm13, %xmm5
	xorps %xmm13, %xmm7
#else
	xorps %xmm13, %xmm0
	xorps %xmm13, %xmm2
	xorps %xmm13, %xmm4
	xorps %xmm13, %xmm6
#endif

	pshufd $0x4e, %xmm1, %xmm1
	pshufd $0x4e, %xmm3, %xmm3
	pshufd $0x4e, %xmm5, %xmm5
	pshufd $0x4e, %xmm7, %xmm7

#ifndef CONJ
	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
	addpd %xmm5, %xmm4
	addpd %xmm7, %xmm6
#else
	subpd %xmm1, %xmm0
	subpd %xmm3, %xmm2
	subpd %xmm5, %xmm4
	subpd %xmm7, %xmm6
#endif

	pshufd $0xee, %xmm0, %xmm1
	movddup %xmm0, %xmm0
	pshufd $0xee, %xmm2, %xmm3
	movddup %xmm2, %xmm2
	pshufd $0xee, %xmm4, %xmm5
	movddup %xmm4, %xmm4
	pshufd $0xee, %xmm6, %xmm7
	movddup %xmm6, %xmm6

	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm1
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm3
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm5
	mulpd ALPHA_R, %xmm6
	mulpd ALPHA_I, %xmm7

	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
	addpd %xmm5, %xmm4
	addpd %xmm7, %xmm6

	movsd 0 * SIZE(Y), %xmm1
	movhpd 1 * SIZE(Y), %xmm1
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm3
	movhpd 1 * SIZE(Y), %xmm3
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm5
	movhpd 1 * SIZE(Y), %xmm5
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm7
	movhpd 1 * SIZE(Y), %xmm7
	addq INCY, Y

	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
	addpd %xmm5, %xmm4
	addpd %xmm7, %xmm6

	movlpd %xmm0, 0 * SIZE(Y1)
	movhpd %xmm0, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm2, 0 * SIZE(Y1)
	movhpd %xmm2, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm4, 0 * SIZE(Y1)
	movhpd %xmm4, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm6, 0 * SIZE(Y1)
	movhpd %xmm6, 1 * SIZE(Y1)
	addq INCY, Y1

	cmpq $4, N
	jge .L11
	ALIGN_3

.L20:
#endif
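
/* Same computation for a block of two columns when fewer than four remain. */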
#if GEMV_UNROLL >= 2

	cmpq $2, N
	jl .L30

#if GEMV_UNROLL == 2
	ALIGN_3

.L21:
#endif
	subq $2, N

	leaq 16 * SIZE(BUFFER), X1

	movq A, A1
	leaq (A1, LDA), A2
	leaq (A1, LDA, 2), A

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorps %xmm0, %xmm0
	xorps %xmm1, %xmm1
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3

#ifdef PREFETCHW
	PREFETCHW 3 * SIZE(Y1)
#endif

	movq M, I
	sarq $2, I
	jle .L25

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	decq I
	jle .L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -12 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -11 * SIZE(A1, LDA), %xmm11

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -10 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -9 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A1, LDA), %xmm11

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -8 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -7 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -8 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -7 * SIZE(A1, LDA), %xmm11

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, A2
	subq $-8 * SIZE, X1

	subq $1, I
	BRANCH
	jg .L23
	ALIGN_3

.L24:
	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -12 * SIZE(A1, LDA), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -11 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -10 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -9 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, A2
	subq $-8 * SIZE, X1
	ALIGN_3

.L25:
	testq $2, M
	je .L27

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -14 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	movddup -13 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	movddup -14 * SIZE(A1, LDA), %xmm10
	mulpd %xmm12, %xmm11
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm11, %xmm3
	movddup -13 * SIZE(A1, LDA), %xmm11

	mulpd %xmm13, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm13, %xmm9
	addpd %xmm9, %xmm1
	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3

	addq $4 * SIZE, A1
	addq $4 * SIZE, A2
	ALIGN_3

.L27:
	testq $1, M
	je .L29

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -16 * SIZE(A1, LDA), %xmm10
	movddup -15 * SIZE(A1, LDA), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	mulpd %xmm12, %xmm10
	addpd %xmm10, %xmm2
	mulpd %xmm12, %xmm11
	addpd %xmm11, %xmm3
	ALIGN_3

.L29:
	pcmpeqb %xmm13, %xmm13
	psllq $63, %xmm13
	shufps $0x40, %xmm13, %xmm13

#ifndef XCONJ
	xorps %xmm13, %xmm1
	xorps %xmm13, %xmm3
#else
	xorps %xmm13, %xmm0
	xorps %xmm13, %xmm2
#endif

	pshufd $0x4e, %xmm1, %xmm1
	pshufd $0x4e, %xmm3, %xmm3

#ifndef CONJ
	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2
#else
	subpd %xmm1, %xmm0
	subpd %xmm3, %xmm2
#endif

	pshufd $0xee, %xmm0, %xmm1
	movddup %xmm0, %xmm0
	pshufd $0xee, %xmm2, %xmm3
	movddup %xmm2, %xmm2

	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm1
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm3

	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2

	movsd 0 * SIZE(Y), %xmm1
	movhpd 1 * SIZE(Y), %xmm1
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm3
	movhpd 1 * SIZE(Y), %xmm3
	addq INCY, Y

	addpd %xmm1, %xmm0
	addpd %xmm3, %xmm2

	movlpd %xmm0, 0 * SIZE(Y1)
	movhpd %xmm0, 1 * SIZE(Y1)
	addq INCY, Y1
	movlpd %xmm2, 0 * SIZE(Y1)
	movhpd %xmm2, 1 * SIZE(Y1)
	addq INCY, Y1

#if GEMV_UNROLL == 2
	cmpq $2, N
	jge .L21
#endif
	ALIGN_3

.L30:
#endif
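
/* Final single-column pass: one complex dot product of the last column with X. */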
	cmpq $1, N
	jl .L999

#if GEMV_UNROLL == 1
.L31:
	decq N
#endif

	leaq 16 * SIZE(BUFFER), X1

	movq A, A1
#if GEMV_UNROLL == 1
	addq LDA, A
#endif

	MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
	xorps %xmm0, %xmm0
	xorps %xmm1, %xmm1
	MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3

	movq M, I
	sarq $2, I
	jle .L35

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -14 * SIZE(A1), %xmm10
	movddup -13 * SIZE(A1), %xmm11

	decq I
	jle .L34
	ALIGN_3

.L33:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A1), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A1), %xmm11

#ifdef PREFETCHW
	PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -8 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm9, %xmm1
	movddup -7 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -6 * SIZE(A1), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -5 * SIZE(A1), %xmm11

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, X1

	subq $1, I
	BRANCH
	jg .L33
	ALIGN_3

.L34:
	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	movddup -12 * SIZE(A1), %xmm8
	mulpd %xmm12, %xmm9
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
	addpd %xmm9, %xmm1
	movddup -11 * SIZE(A1), %xmm9

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	movddup -10 * SIZE(A1), %xmm10
	mulpd %xmm13, %xmm11
	MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3
	movddup -9 * SIZE(A1), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm12, %xmm9
	MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
	addpd %xmm9, %xmm1

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	mulpd %xmm13, %xmm11
	MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
	addpd %xmm11, %xmm3

	subq $-8 * SIZE, A1
	subq $-8 * SIZE, X1
	ALIGN_3

.L35:
	testq $2, M
	je .L37

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9
	movddup -14 * SIZE(A1), %xmm10
	movddup -13 * SIZE(A1), %xmm11

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	MOVUPS_XL1(-12 * SIZE, X1, %xmm12)

	mulpd %xmm13, %xmm10
	addpd %xmm10, %xmm2
	mulpd %xmm13, %xmm11
	addpd %xmm11, %xmm3

	addq $4 * SIZE, A1
	ALIGN_3

.L37:
	testq $1, M
	je .L39

	movddup -16 * SIZE(A1), %xmm8
	movddup -15 * SIZE(A1), %xmm9

	mulpd %xmm12, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm12, %xmm9
	addpd %xmm9, %xmm1
	ALIGN_3

.L39:
	addpd %xmm2, %xmm0
	addpd %xmm3, %xmm1

	pcmpeqb %xmm13, %xmm13
	psllq $63, %xmm13
	shufps $0x40, %xmm13, %xmm13

#ifndef XCONJ
	xorps %xmm13, %xmm1
#else
	xorps %xmm13, %xmm0
#endif

	pshufd $0x4e, %xmm1, %xmm1

#ifndef CONJ
	addpd %xmm1, %xmm0
#else
	subpd %xmm1, %xmm0
#endif

	pshufd $0xee, %xmm0, %xmm1
	movddup %xmm0, %xmm0

	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm1
	addpd %xmm1, %xmm0

	movsd 0 * SIZE(Y), %xmm1
	movhpd 1 * SIZE(Y), %xmm1

	addpd %xmm1, %xmm0

	movlpd %xmm0, 0 * SIZE(Y1)
	movhpd %xmm0, 1 * SIZE(Y1)

#if GEMV_UNROLL == 1
	addq INCY, Y
	addq INCY, Y1

	cmpq $1, N
	jge .L31
#endif
	ALIGN_4
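
/* Restore callee-saved registers and return. */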
.L999:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif

	addq $STACKSIZE, %rsp
	ret
	EPILOGUE
  1034. EPILOGUE