
zgemv_t.S

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
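/*********************************************************************/
/* zgemv_t.S: x86-64 SSE2 kernel that appears to implement the       */
/* transposed double-complex GEMV, y := alpha * A^T * x + y, with    */
/* the CONJ/XCONJ macros selecting the conjugated variants. The      */
/* vector x is first gathered into a contiguous BUFFER, then dot     */
/* products of the columns of A with x are accumulated in SSE2       */
/* registers.                                                        */
/*********************************************************************/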
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "l2param.h"
  41. #ifndef WINDOWS_ABI
  42. #define STACKSIZE 128
  43. #define OLD_INCX 8 + STACKSIZE(%rsp)
  44. #define OLD_Y 16 + STACKSIZE(%rsp)
  45. #define OLD_INCY 24 + STACKSIZE(%rsp)
  46. #define OLD_BUFFER 32 + STACKSIZE(%rsp)
  47. #define MMM 64(%rsp)
  48. #define NN 72(%rsp)
  49. #define AA 80(%rsp)
  50. #define LDAX 88(%rsp)
  51. #define ALPHAR 96(%rsp)
  52. #define ALPHAI 104(%rsp)
  53. #define M %rdi
  54. #define N %rsi
  55. #define A %rcx
  56. #define LDA %r8
  57. #define X %r9
  58. #define INCX %rdx
  59. #define Y %rbp
  60. #define INCY %r10
  61. #else
  62. #define STACKSIZE 288
  63. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  64. #define OLD_A 48 + STACKSIZE(%rsp)
  65. #define OLD_LDA 56 + STACKSIZE(%rsp)
  66. #define OLD_X 64 + STACKSIZE(%rsp)
  67. #define OLD_INCX 72 + STACKSIZE(%rsp)
  68. #define OLD_Y 80 + STACKSIZE(%rsp)
  69. #define OLD_INCY 88 + STACKSIZE(%rsp)
  70. #define OLD_BUFFER 96 + STACKSIZE(%rsp)
  71. #define MMM 232(%rsp)
  72. #define NN 240(%rsp)
  73. #define AA 248(%rsp)
  74. #define LDAX 256(%rsp)
  75. #define ALPHAR 264(%rsp)
  76. #define ALPHAI 272(%rsp)
  77. #define M %rcx
  78. #define N %rdx
  79. #define A %r8
  80. #define LDA %r9
  81. #define X %rdi
  82. #define INCX %rsi
  83. #define Y %rbp
  84. #define INCY %r10
  85. #endif
  86. #define I %rax
  87. #define J %rbx
  88. #define A1 %r11
  89. #define A2 %r12
  90. #define X1 %r13
  91. #define Y1 %r14
  92. #define BUFFER %r15
  93. #define ALPHA_R %xmm14
  94. #define ALPHA_I %xmm15
  95. #undef SUBPD
  96. #ifndef CONJ
  97. #define SUBPD addpd
  98. #else
  99. #define SUBPD subpd
  100. #endif
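/* SUBPD is addpd when CONJ is undefined and subpd when it is, so    */
/* the same multiply/accumulate sequence serves both variants.       */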
  101. PROLOGUE
  102. PROFCODE
  103. subq $STACKSIZE, %rsp
  104. movq %rbx, 0(%rsp)
  105. movq %rbp, 8(%rsp)
  106. movq %r12, 16(%rsp)
  107. movq %r13, 24(%rsp)
  108. movq %r14, 32(%rsp)
  109. movq %r15, 40(%rsp)
  110. #ifdef WINDOWS_ABI
  111. movq %rdi, 48(%rsp)
  112. movq %rsi, 56(%rsp)
  113. movups %xmm6, 64(%rsp)
  114. movups %xmm7, 80(%rsp)
  115. movups %xmm8, 96(%rsp)
  116. movups %xmm9, 112(%rsp)
  117. movups %xmm10, 128(%rsp)
  118. movups %xmm11, 144(%rsp)
  119. movups %xmm12, 160(%rsp)
  120. movups %xmm13, 176(%rsp)
  121. movups %xmm14, 192(%rsp)
  122. movups %xmm15, 208(%rsp)
  123. movq OLD_A, A
  124. movq OLD_LDA, LDA
  125. movq OLD_X, X
  126. movapd %xmm3, %xmm0
  127. movsd OLD_ALPHA_I, %xmm1
  128. #endif
  129. movq A, AA
  130. movq N, NN
  131. movq M, MMM
  132. movq LDA, LDAX
  133. movsd %xmm0,ALPHAR
  134. movsd %xmm1,ALPHAI
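/* The loop starting at .L0t appears to block the row dimension:     */
/* each pass handles at most 2^19 rows (I = 1 << 19), reloading the  */
/* arguments saved above before running the kernel on that block.    */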
  135. .L0t:
  136. xorq I,I
  137. addq $1,I
  138. salq $19,I
  139. subq I,MMM
  140. movq I,M
  141. movsd ALPHAR,%xmm0
  142. movsd ALPHAI,%xmm1
  143. jge .L00t
  144. movq MMM,M
  145. addq I,M
  146. jle .L999x
  147. .L00t:
  148. movq AA, A
  149. movq NN, N
  150. movq LDAX, LDA
  151. movq OLD_INCX, INCX
  152. movq OLD_Y, Y
  153. movq OLD_INCY, INCY
  154. movq OLD_BUFFER, BUFFER
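/* Convert LDA, INCX and INCY from complex-element counts to byte    */
/* strides (ZBASE_SHIFT is presumably 4, i.e. 16 bytes per double-   */
/* complex element).                                                 */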
  155. salq $ZBASE_SHIFT, LDA
  156. salq $ZBASE_SHIFT, INCX
  157. salq $ZBASE_SHIFT, INCY
  158. #ifdef HAVE_SSE3
  159. movddup %xmm0, ALPHA_R
  160. movddup %xmm1, ALPHA_I
  161. #else
  162. pshufd $0x44, %xmm0, ALPHA_R
  163. pshufd $0x44, %xmm1, ALPHA_I
  164. #endif
  165. subq $-16 * SIZE, A
  166. testq M, M
  167. jle .L999
  168. testq N, N
  169. jle .L999
  170. ALIGN_3
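/* Gather x (stride INCX) into the contiguous BUFFER: .L02 copies    */
/* four complex entries per iteration, .L06 handles the remainder.   */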
  171. movq BUFFER, X1
  172. movq Y, Y1
  173. movq M, I
  174. sarq $2, I
  175. jle .L05
  176. ALIGN_4
  177. .L02:
  178. movsd 0 * SIZE(X), %xmm0
  179. movhpd 1 * SIZE(X), %xmm0
  180. addq INCX, X
  181. movsd 0 * SIZE(X), %xmm1
  182. movhpd 1 * SIZE(X), %xmm1
  183. addq INCX, X
  184. movsd 0 * SIZE(X), %xmm2
  185. movhpd 1 * SIZE(X), %xmm2
  186. addq INCX, X
  187. movsd 0 * SIZE(X), %xmm3
  188. movhpd 1 * SIZE(X), %xmm3
  189. addq INCX, X
  190. movapd %xmm0, 0 * SIZE(X1)
  191. movapd %xmm1, 2 * SIZE(X1)
  192. movapd %xmm2, 4 * SIZE(X1)
  193. movapd %xmm3, 6 * SIZE(X1)
  194. addq $8 * SIZE, X1
  195. decq I
  196. jg .L02
  197. ALIGN_4
  198. .L05:
  199. movq M, I
  200. andq $3, I
  201. jle .L10
  202. ALIGN_2
  203. .L06:
  204. movsd 0 * SIZE(X), %xmm0
  205. movhpd 1 * SIZE(X), %xmm0
  206. addq INCX, X
  207. movapd %xmm0, 0 * SIZE(X1)
  208. addq $2 * SIZE, X1
  209. decq I
  210. jg .L06
  211. ALIGN_4
  212. .L10:
  213. #ifdef ALIGNED_ACCESS
  214. testq $SIZE, A
  215. jne .L100
  216. #endif
  217. #if GEMV_UNROLL >= 4
  218. cmpq $4, N
  219. jl .L20
  220. ALIGN_3
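/* Four-column block (.L11): xmm0-xmm7 accumulate, for each of the   */
/* four columns, the straight and swapped (pshufd $0x4e) products    */
/* of A with x, i.e. the real/imaginary partial sums of four dot     */
/* products at once.                                                 */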
  221. .L11:
  222. subq $4, N
  223. leaq 16 * SIZE(BUFFER), X1
  224. movq A, A1
  225. leaq (A1, LDA, 2), A2
  226. leaq (A1, LDA, 4), A
  227. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  228. xorpd %xmm0, %xmm0
  229. xorpd %xmm1, %xmm1
  230. xorpd %xmm2, %xmm2
  231. xorpd %xmm3, %xmm3
  232. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  233. xorpd %xmm4, %xmm4
  234. xorpd %xmm5, %xmm5
  235. xorpd %xmm6, %xmm6
  236. xorpd %xmm7, %xmm7
  237. #ifdef PREFETCHW
  238. PREFETCHW 3 * SIZE(Y1)
  239. #endif
  240. movq M, I
  241. sarq $2, I
  242. jle .L15
  243. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  244. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)
  245. decq I
  246. jle .L14
  247. ALIGN_3
  248. .L13:
  249. #ifdef PREFETCH
  250. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  251. #endif
  252. pshufd $0x4e, %xmm8, %xmm9
  253. mulpd %xmm12, %xmm8
  254. addpd %xmm8, %xmm0
  255. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  256. mulpd %xmm12, %xmm9
  257. SUBPD %xmm9, %xmm1
  258. pshufd $0x4e, %xmm10, %xmm11
  259. mulpd %xmm12, %xmm10
  260. addpd %xmm10, %xmm2
  261. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
  262. mulpd %xmm12, %xmm11
  263. SUBPD %xmm11, %xmm3
  264. pshufd $0x4e, %xmm8, %xmm9
  265. mulpd %xmm12, %xmm8
  266. addpd %xmm8, %xmm4
  267. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  268. mulpd %xmm12, %xmm9
  269. SUBPD %xmm9, %xmm5
  270. pshufd $0x4e, %xmm10, %xmm11
  271. mulpd %xmm12, %xmm10
  272. addpd %xmm10, %xmm6
  273. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
  274. mulpd %xmm12, %xmm11
  275. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  276. SUBPD %xmm11, %xmm7
  277. #ifdef PREFETCH
  278. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  279. #endif
  280. pshufd $0x4e, %xmm8, %xmm9
  281. mulpd %xmm13, %xmm8
  282. addpd %xmm8, %xmm0
  283. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  284. mulpd %xmm13, %xmm9
  285. SUBPD %xmm9, %xmm1
  286. pshufd $0x4e, %xmm10, %xmm11
  287. mulpd %xmm13, %xmm10
  288. addpd %xmm10, %xmm2
  289. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
  290. mulpd %xmm13, %xmm11
  291. SUBPD %xmm11, %xmm3
  292. pshufd $0x4e, %xmm8, %xmm9
  293. mulpd %xmm13, %xmm8
  294. addpd %xmm8, %xmm4
  295. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  296. mulpd %xmm13, %xmm9
  297. SUBPD %xmm9, %xmm5
  298. pshufd $0x4e, %xmm10, %xmm11
  299. mulpd %xmm13, %xmm10
  300. addpd %xmm10, %xmm6
  301. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
  302. mulpd %xmm13, %xmm11
  303. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  304. SUBPD %xmm11, %xmm7
  305. #ifdef PREFETCH
  306. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  307. #endif
  308. pshufd $0x4e, %xmm8, %xmm9
  309. mulpd %xmm12, %xmm8
  310. addpd %xmm8, %xmm0
  311. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  312. mulpd %xmm12, %xmm9
  313. SUBPD %xmm9, %xmm1
  314. pshufd $0x4e, %xmm10, %xmm11
  315. mulpd %xmm12, %xmm10
  316. addpd %xmm10, %xmm2
  317. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
  318. mulpd %xmm12, %xmm11
  319. SUBPD %xmm11, %xmm3
  320. pshufd $0x4e, %xmm8, %xmm9
  321. mulpd %xmm12, %xmm8
  322. addpd %xmm8, %xmm4
  323. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  324. mulpd %xmm12, %xmm9
  325. SUBPD %xmm9, %xmm5
  326. pshufd $0x4e, %xmm10, %xmm11
  327. mulpd %xmm12, %xmm10
  328. addpd %xmm10, %xmm6
  329. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
  330. mulpd %xmm12, %xmm11
  331. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  332. SUBPD %xmm11, %xmm7
  333. #ifdef PREFETCH
  334. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  335. #endif
  336. pshufd $0x4e, %xmm8, %xmm9
  337. mulpd %xmm13, %xmm8
  338. addpd %xmm8, %xmm0
  339. MOVUPS_A1(-10 * SIZE, A2, %xmm8)
  340. mulpd %xmm13, %xmm9
  341. SUBPD %xmm9, %xmm1
  342. pshufd $0x4e, %xmm10, %xmm11
  343. mulpd %xmm13, %xmm10
  344. addpd %xmm10, %xmm2
  345. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
  346. mulpd %xmm13, %xmm11
  347. SUBPD %xmm11, %xmm3
  348. #ifdef PREFETCHW
  349. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
  350. #endif
  351. pshufd $0x4e, %xmm8, %xmm9
  352. mulpd %xmm13, %xmm8
  353. addpd %xmm8, %xmm4
  354. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  355. mulpd %xmm13, %xmm9
  356. SUBPD %xmm9, %xmm5
  357. pshufd $0x4e, %xmm10, %xmm11
  358. mulpd %xmm13, %xmm10
  359. addpd %xmm10, %xmm6
  360. MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10)
  361. mulpd %xmm13, %xmm11
  362. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  363. SUBPD %xmm11, %xmm7
  364. subq $-8 * SIZE, A1
  365. subq $-8 * SIZE, A2
  366. subq $-8 * SIZE, X1
  367. subq $1, I
  368. BRANCH
  369. jg .L13
  370. ALIGN_3
  371. .L14:
  372. pshufd $0x4e, %xmm8, %xmm9
  373. mulpd %xmm12, %xmm8
  374. addpd %xmm8, %xmm0
  375. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  376. mulpd %xmm12, %xmm9
  377. SUBPD %xmm9, %xmm1
  378. pshufd $0x4e, %xmm10, %xmm11
  379. mulpd %xmm12, %xmm10
  380. addpd %xmm10, %xmm2
  381. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
  382. mulpd %xmm12, %xmm11
  383. SUBPD %xmm11, %xmm3
  384. pshufd $0x4e, %xmm8, %xmm9
  385. mulpd %xmm12, %xmm8
  386. addpd %xmm8, %xmm4
  387. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  388. mulpd %xmm12, %xmm9
  389. SUBPD %xmm9, %xmm5
  390. pshufd $0x4e, %xmm10, %xmm11
  391. mulpd %xmm12, %xmm10
  392. addpd %xmm10, %xmm6
  393. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
  394. mulpd %xmm12, %xmm11
  395. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  396. SUBPD %xmm11, %xmm7
  397. pshufd $0x4e, %xmm8, %xmm9
  398. mulpd %xmm13, %xmm8
  399. addpd %xmm8, %xmm0
  400. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  401. mulpd %xmm13, %xmm9
  402. SUBPD %xmm9, %xmm1
  403. pshufd $0x4e, %xmm10, %xmm11
  404. mulpd %xmm13, %xmm10
  405. addpd %xmm10, %xmm2
  406. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
  407. mulpd %xmm13, %xmm11
  408. SUBPD %xmm11, %xmm3
  409. pshufd $0x4e, %xmm8, %xmm9
  410. mulpd %xmm13, %xmm8
  411. addpd %xmm8, %xmm4
  412. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  413. mulpd %xmm13, %xmm9
  414. SUBPD %xmm9, %xmm5
  415. pshufd $0x4e, %xmm10, %xmm11
  416. mulpd %xmm13, %xmm10
  417. addpd %xmm10, %xmm6
  418. MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10)
  419. mulpd %xmm13, %xmm11
  420. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  421. SUBPD %xmm11, %xmm7
  422. pshufd $0x4e, %xmm8, %xmm9
  423. mulpd %xmm12, %xmm8
  424. addpd %xmm8, %xmm0
  425. MOVUPS_A1(-12 * SIZE, A2, %xmm8)
  426. mulpd %xmm12, %xmm9
  427. SUBPD %xmm9, %xmm1
  428. pshufd $0x4e, %xmm10, %xmm11
  429. mulpd %xmm12, %xmm10
  430. addpd %xmm10, %xmm2
  431. MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10)
  432. mulpd %xmm12, %xmm11
  433. SUBPD %xmm11, %xmm3
  434. pshufd $0x4e, %xmm8, %xmm9
  435. mulpd %xmm12, %xmm8
  436. addpd %xmm8, %xmm4
  437. MOVUPS_A1(-10 * SIZE, A1, %xmm8)
  438. mulpd %xmm12, %xmm9
  439. SUBPD %xmm9, %xmm5
  440. pshufd $0x4e, %xmm10, %xmm11
  441. mulpd %xmm12, %xmm10
  442. addpd %xmm10, %xmm6
  443. MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10)
  444. mulpd %xmm12, %xmm11
  445. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  446. SUBPD %xmm11, %xmm7
  447. pshufd $0x4e, %xmm8, %xmm9
  448. mulpd %xmm13, %xmm8
  449. addpd %xmm8, %xmm0
  450. MOVUPS_A1(-10 * SIZE, A2, %xmm8)
  451. mulpd %xmm13, %xmm9
  452. SUBPD %xmm9, %xmm1
  453. pshufd $0x4e, %xmm10, %xmm11
  454. mulpd %xmm13, %xmm10
  455. addpd %xmm10, %xmm2
  456. MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10)
  457. mulpd %xmm13, %xmm11
  458. SUBPD %xmm11, %xmm3
  459. pshufd $0x4e, %xmm8, %xmm9
  460. mulpd %xmm13, %xmm8
  461. addpd %xmm8, %xmm4
  462. mulpd %xmm13, %xmm9
  463. SUBPD %xmm9, %xmm5
  464. pshufd $0x4e, %xmm10, %xmm11
  465. mulpd %xmm13, %xmm10
  466. addpd %xmm10, %xmm6
  467. mulpd %xmm13, %xmm11
  468. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  469. SUBPD %xmm11, %xmm7
  470. subq $-8 * SIZE, A1
  471. subq $-8 * SIZE, A2
  472. subq $-8 * SIZE, X1
  473. ALIGN_3
  474. .L15:
  475. testq $2, M
  476. je .L17
  477. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  478. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)
  479. pshufd $0x4e, %xmm8, %xmm9
  480. mulpd %xmm12, %xmm8
  481. addpd %xmm8, %xmm0
  482. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  483. mulpd %xmm12, %xmm9
  484. SUBPD %xmm9, %xmm1
  485. pshufd $0x4e, %xmm10, %xmm11
  486. mulpd %xmm12, %xmm10
  487. addpd %xmm10, %xmm2
  488. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
  489. mulpd %xmm12, %xmm11
  490. SUBPD %xmm11, %xmm3
  491. pshufd $0x4e, %xmm8, %xmm9
  492. mulpd %xmm12, %xmm8
  493. addpd %xmm8, %xmm4
  494. MOVUPS_A1(-14 * SIZE, A1, %xmm8)
  495. mulpd %xmm12, %xmm9
  496. SUBPD %xmm9, %xmm5
  497. pshufd $0x4e, %xmm10, %xmm11
  498. mulpd %xmm12, %xmm10
  499. addpd %xmm10, %xmm6
  500. MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10)
  501. mulpd %xmm12, %xmm11
  502. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  503. SUBPD %xmm11, %xmm7
  504. pshufd $0x4e, %xmm8, %xmm9
  505. mulpd %xmm13, %xmm8
  506. addpd %xmm8, %xmm0
  507. MOVUPS_A1(-14 * SIZE, A2, %xmm8)
  508. mulpd %xmm13, %xmm9
  509. SUBPD %xmm9, %xmm1
  510. pshufd $0x4e, %xmm10, %xmm11
  511. mulpd %xmm13, %xmm10
  512. addpd %xmm10, %xmm2
  513. MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10)
  514. mulpd %xmm13, %xmm11
  515. SUBPD %xmm11, %xmm3
  516. pshufd $0x4e, %xmm8, %xmm9
  517. mulpd %xmm13, %xmm8
  518. addpd %xmm8, %xmm4
  519. mulpd %xmm13, %xmm9
  520. SUBPD %xmm9, %xmm5
  521. pshufd $0x4e, %xmm10, %xmm11
  522. mulpd %xmm13, %xmm10
  523. addpd %xmm10, %xmm6
  524. mulpd %xmm13, %xmm11
  525. SUBPD %xmm11, %xmm7
  526. addq $4 * SIZE, A1
  527. addq $4 * SIZE, A2
  528. ALIGN_3
  529. .L17:
  530. testq $1, M
  531. je .L19
  532. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  533. MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10)
  534. pshufd $0x4e, %xmm8, %xmm9
  535. mulpd %xmm12, %xmm8
  536. addpd %xmm8, %xmm0
  537. MOVUPS_A1(-16 * SIZE, A2, %xmm8)
  538. mulpd %xmm12, %xmm9
  539. SUBPD %xmm9, %xmm1
  540. pshufd $0x4e, %xmm10, %xmm11
  541. mulpd %xmm12, %xmm10
  542. addpd %xmm10, %xmm2
  543. MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10)
  544. mulpd %xmm12, %xmm11
  545. SUBPD %xmm11, %xmm3
  546. pshufd $0x4e, %xmm8, %xmm9
  547. mulpd %xmm12, %xmm8
  548. addpd %xmm8, %xmm4
  549. mulpd %xmm12, %xmm9
  550. SUBPD %xmm9, %xmm5
  551. pshufd $0x4e, %xmm10, %xmm11
  552. mulpd %xmm12, %xmm10
  553. addpd %xmm10, %xmm6
  554. mulpd %xmm12, %xmm11
  555. SUBPD %xmm11, %xmm7
  556. ALIGN_3
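/* Reduction (.L19): build a mask in xmm13 that flips the sign of    */
/* the upper double, apply it to one set of accumulators depending   */
/* on CONJ/XCONJ, combine the partial sums into four complex         */
/* results, scale by alpha, and add them into y (stride INCY).       */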
  557. .L19:
  558. pcmpeqb %xmm13, %xmm13
  559. psllq $63, %xmm13
  560. shufps $0xc0, %xmm13, %xmm13
  561. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  562. xorpd %xmm13, %xmm0
  563. xorpd %xmm13, %xmm2
  564. xorpd %xmm13, %xmm4
  565. xorpd %xmm13, %xmm6
  566. #else
  567. xorpd %xmm13, %xmm1
  568. xorpd %xmm13, %xmm3
  569. xorpd %xmm13, %xmm5
  570. xorpd %xmm13, %xmm7
  571. #endif
  572. #ifdef HAVE_SSE3
  573. haddpd %xmm1, %xmm0
  574. haddpd %xmm3, %xmm2
  575. haddpd %xmm5, %xmm4
  576. haddpd %xmm7, %xmm6
  577. #else
  578. movapd %xmm0, %xmm8
  579. unpcklpd %xmm1, %xmm0
  580. unpckhpd %xmm1, %xmm8
  581. movapd %xmm2, %xmm9
  582. unpcklpd %xmm3, %xmm2
  583. unpckhpd %xmm3, %xmm9
  584. movapd %xmm4, %xmm10
  585. unpcklpd %xmm5, %xmm4
  586. unpckhpd %xmm5, %xmm10
  587. movapd %xmm6, %xmm11
  588. unpcklpd %xmm7, %xmm6
  589. unpckhpd %xmm7, %xmm11
  590. addpd %xmm8, %xmm0
  591. addpd %xmm9, %xmm2
  592. addpd %xmm10, %xmm4
  593. addpd %xmm11, %xmm6
  594. #endif
  595. pshufd $0x4e, %xmm0, %xmm1
  596. pshufd $0x4e, %xmm2, %xmm3
  597. pshufd $0x4e, %xmm4, %xmm5
  598. pshufd $0x4e, %xmm6, %xmm7
  599. mulpd ALPHA_R, %xmm0
  600. mulpd ALPHA_I, %xmm1
  601. mulpd ALPHA_R, %xmm2
  602. mulpd ALPHA_I, %xmm3
  603. mulpd ALPHA_R, %xmm4
  604. mulpd ALPHA_I, %xmm5
  605. mulpd ALPHA_R, %xmm6
  606. mulpd ALPHA_I, %xmm7
  607. xorpd %xmm13, %xmm1
  608. xorpd %xmm13, %xmm3
  609. xorpd %xmm13, %xmm5
  610. xorpd %xmm13, %xmm7
  611. subpd %xmm1, %xmm0
  612. subpd %xmm3, %xmm2
  613. subpd %xmm5, %xmm4
  614. subpd %xmm7, %xmm6
  615. movsd 0 * SIZE(Y), %xmm1
  616. movhpd 1 * SIZE(Y), %xmm1
  617. addq INCY, Y
  618. movsd 0 * SIZE(Y), %xmm3
  619. movhpd 1 * SIZE(Y), %xmm3
  620. addq INCY, Y
  621. movsd 0 * SIZE(Y), %xmm5
  622. movhpd 1 * SIZE(Y), %xmm5
  623. addq INCY, Y
  624. movsd 0 * SIZE(Y), %xmm7
  625. movhpd 1 * SIZE(Y), %xmm7
  626. addq INCY, Y
  627. addpd %xmm1, %xmm0
  628. addpd %xmm3, %xmm2
  629. addpd %xmm5, %xmm4
  630. addpd %xmm7, %xmm6
  631. movlpd %xmm0, 0 * SIZE(Y1)
  632. movhpd %xmm0, 1 * SIZE(Y1)
  633. addq INCY, Y1
  634. movlpd %xmm2, 0 * SIZE(Y1)
  635. movhpd %xmm2, 1 * SIZE(Y1)
  636. addq INCY, Y1
  637. movlpd %xmm4, 0 * SIZE(Y1)
  638. movhpd %xmm4, 1 * SIZE(Y1)
  639. addq INCY, Y1
  640. movlpd %xmm6, 0 * SIZE(Y1)
  641. movhpd %xmm6, 1 * SIZE(Y1)
  642. addq INCY, Y1
  643. cmpq $4, N
  644. jge .L11
  645. ALIGN_3
  646. .L20:
  647. #endif
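/* Remaining columns: the block below handles two columns at a time; */
/* a single-column tail follows at .L30.                             */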
  648. #if GEMV_UNROLL >= 2
  649. cmpq $2, N
  650. jl .L30
  651. #if GEMV_UNROLL == 2
  652. ALIGN_3
  653. .L21:
  654. #endif
  655. subq $2, N
  656. leaq 16 * SIZE(BUFFER), X1
  657. movq A, A1
  658. leaq (A1, LDA), A2
  659. leaq (A1, LDA, 2), A
  660. xorpd %xmm0, %xmm0
  661. xorpd %xmm1, %xmm1
  662. xorpd %xmm2, %xmm2
  663. xorpd %xmm3, %xmm3
  664. MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
  665. MOVUPS_XL1(-14 * SIZE, X1, %xmm5)
  666. #ifdef PREFETCHW
  667. PREFETCHW 3 * SIZE(Y1)
  668. #endif
  669. movq M, I
  670. sarq $2, I
  671. jle .L25
  672. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  673. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  674. MOVUPS_A1(-14 * SIZE, A1, %xmm12)
  675. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  676. decq I
  677. jle .L24
  678. ALIGN_3
  679. .L23:
  680. #ifdef PREFETCH
  681. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  682. #endif
  683. pshufd $0x4e, %xmm8, %xmm9
  684. mulpd %xmm4, %xmm8
  685. addpd %xmm8, %xmm0
  686. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  687. mulpd %xmm4, %xmm9
  688. SUBPD %xmm9, %xmm1
  689. pshufd $0x4e, %xmm10, %xmm11
  690. mulpd %xmm4, %xmm10
  691. addpd %xmm10, %xmm2
  692. MOVUPS_A1(-12 * SIZE, A2, %xmm10)
  693. mulpd %xmm4, %xmm11
  694. SUBPD %xmm11, %xmm3
  695. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  696. pshufd $0x4e, %xmm12, %xmm13
  697. mulpd %xmm5, %xmm12
  698. addpd %xmm12, %xmm0
  699. MOVUPS_A1(-10 * SIZE, A1, %xmm12)
  700. mulpd %xmm5, %xmm13
  701. SUBPD %xmm13, %xmm1
  702. pshufd $0x4e, %xmm6, %xmm7
  703. mulpd %xmm5, %xmm6
  704. addpd %xmm6, %xmm2
  705. MOVUPS_A1(-10 * SIZE, A2, %xmm6)
  706. mulpd %xmm5, %xmm7
  707. SUBPD %xmm7, %xmm3
  708. MOVUPS_XL1(-10 * SIZE, X1, %xmm5)
  709. #ifdef PREFETCH
  710. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  711. #endif
  712. pshufd $0x4e, %xmm8, %xmm9
  713. mulpd %xmm4, %xmm8
  714. addpd %xmm8, %xmm0
  715. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  716. mulpd %xmm4, %xmm9
  717. SUBPD %xmm9, %xmm1
  718. pshufd $0x4e, %xmm10, %xmm11
  719. mulpd %xmm4, %xmm10
  720. addpd %xmm10, %xmm2
  721. MOVUPS_A1( -8 * SIZE, A2, %xmm10)
  722. mulpd %xmm4, %xmm11
  723. SUBPD %xmm11, %xmm3
  724. MOVUPS_XL1( -8 * SIZE, X1, %xmm4)
  725. #ifdef PREFETCHW
  726. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
  727. #endif
  728. pshufd $0x4e, %xmm12, %xmm13
  729. mulpd %xmm5, %xmm12
  730. addpd %xmm12, %xmm0
  731. MOVUPS_A1( -6 * SIZE, A1, %xmm12)
  732. mulpd %xmm5, %xmm13
  733. SUBPD %xmm13, %xmm1
  734. pshufd $0x4e, %xmm6, %xmm7
  735. mulpd %xmm5, %xmm6
  736. addpd %xmm6, %xmm2
  737. MOVUPS_A1( -6 * SIZE, A2, %xmm6)
  738. mulpd %xmm5, %xmm7
  739. SUBPD %xmm7, %xmm3
  740. MOVUPS_XL1( -6 * SIZE, X1, %xmm5)
  741. subq $-8 * SIZE, A1
  742. subq $-8 * SIZE, A2
  743. subq $-8 * SIZE, X1
  744. subq $1, I
  745. BRANCH
  746. jg .L23
  747. ALIGN_3
  748. .L24:
  749. pshufd $0x4e, %xmm8, %xmm9
  750. mulpd %xmm4, %xmm8
  751. addpd %xmm8, %xmm0
  752. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  753. mulpd %xmm4, %xmm9
  754. SUBPD %xmm9, %xmm1
  755. pshufd $0x4e, %xmm10, %xmm11
  756. mulpd %xmm4, %xmm10
  757. addpd %xmm10, %xmm2
  758. MOVUPS_A1(-12 * SIZE, A2, %xmm10)
  759. mulpd %xmm4, %xmm11
  760. SUBPD %xmm11, %xmm3
  761. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  762. pshufd $0x4e, %xmm12, %xmm13
  763. mulpd %xmm5, %xmm12
  764. addpd %xmm12, %xmm0
  765. MOVUPS_A1(-10 * SIZE, A1, %xmm12)
  766. mulpd %xmm5, %xmm13
  767. SUBPD %xmm13, %xmm1
  768. pshufd $0x4e, %xmm6, %xmm7
  769. mulpd %xmm5, %xmm6
  770. addpd %xmm6, %xmm2
  771. MOVUPS_A1(-10 * SIZE, A2, %xmm6)
  772. mulpd %xmm5, %xmm7
  773. SUBPD %xmm7, %xmm3
  774. MOVUPS_XL1(-10 * SIZE, X1, %xmm5)
  775. pshufd $0x4e, %xmm8, %xmm9
  776. mulpd %xmm4, %xmm8
  777. addpd %xmm8, %xmm0
  778. mulpd %xmm4, %xmm9
  779. SUBPD %xmm9, %xmm1
  780. pshufd $0x4e, %xmm10, %xmm11
  781. mulpd %xmm4, %xmm10
  782. addpd %xmm10, %xmm2
  783. mulpd %xmm4, %xmm11
  784. SUBPD %xmm11, %xmm3
  785. MOVUPS_XL1( -8 * SIZE, X1, %xmm4)
  786. pshufd $0x4e, %xmm12, %xmm13
  787. mulpd %xmm5, %xmm12
  788. addpd %xmm12, %xmm0
  789. mulpd %xmm5, %xmm13
  790. SUBPD %xmm13, %xmm1
  791. pshufd $0x4e, %xmm6, %xmm7
  792. mulpd %xmm5, %xmm6
  793. addpd %xmm6, %xmm2
  794. mulpd %xmm5, %xmm7
  795. SUBPD %xmm7, %xmm3
  796. MOVUPS_XL1( -6 * SIZE, X1, %xmm5)
  797. subq $-8 * SIZE, A1
  798. subq $-8 * SIZE, A2
  799. subq $-8 * SIZE, X1
  800. ALIGN_3
  801. .L25:
  802. testq $2, M
  803. je .L27
  804. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  805. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  806. MOVUPS_A1(-14 * SIZE, A1, %xmm12)
  807. MOVUPS_A1(-14 * SIZE, A2, %xmm6)
  808. pshufd $0x4e, %xmm8, %xmm9
  809. mulpd %xmm4, %xmm8
  810. addpd %xmm8, %xmm0
  811. mulpd %xmm4, %xmm9
  812. SUBPD %xmm9, %xmm1
  813. pshufd $0x4e, %xmm10, %xmm11
  814. mulpd %xmm4, %xmm10
  815. addpd %xmm10, %xmm2
  816. mulpd %xmm4, %xmm11
  817. SUBPD %xmm11, %xmm3
  818. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  819. pshufd $0x4e, %xmm12, %xmm13
  820. mulpd %xmm5, %xmm12
  821. addpd %xmm12, %xmm0
  822. mulpd %xmm5, %xmm13
  823. SUBPD %xmm13, %xmm1
  824. pshufd $0x4e, %xmm6, %xmm7
  825. mulpd %xmm5, %xmm6
  826. addpd %xmm6, %xmm2
  827. mulpd %xmm5, %xmm7
  828. SUBPD %xmm7, %xmm3
  829. addq $4 * SIZE, A1
  830. addq $4 * SIZE, A2
  831. ALIGN_3
  832. .L27:
  833. testq $1, M
  834. je .L29
  835. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  836. MOVUPS_A1(-16 * SIZE, A2, %xmm10)
  837. pshufd $0x4e, %xmm8, %xmm9
  838. mulpd %xmm4, %xmm8
  839. addpd %xmm8, %xmm0
  840. mulpd %xmm4, %xmm9
  841. SUBPD %xmm9, %xmm1
  842. pshufd $0x4e, %xmm10, %xmm11
  843. mulpd %xmm4, %xmm10
  844. addpd %xmm10, %xmm2
  845. mulpd %xmm4, %xmm11
  846. SUBPD %xmm11, %xmm3
  847. ALIGN_3
  848. .L29:
  849. pcmpeqb %xmm11, %xmm11
  850. psllq $63, %xmm11
  851. shufps $0xc0, %xmm11, %xmm11
  852. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  853. xorpd %xmm11, %xmm0
  854. xorpd %xmm11, %xmm2
  855. #else
  856. xorpd %xmm11, %xmm1
  857. xorpd %xmm11, %xmm3
  858. #endif
  859. #ifdef HAVE_SSE3
  860. haddpd %xmm1, %xmm0
  861. haddpd %xmm3, %xmm2
  862. #else
  863. movapd %xmm0, %xmm8
  864. unpcklpd %xmm1, %xmm0
  865. unpckhpd %xmm1, %xmm8
  866. movapd %xmm2, %xmm9
  867. unpcklpd %xmm3, %xmm2
  868. unpckhpd %xmm3, %xmm9
  869. addpd %xmm8, %xmm0
  870. addpd %xmm9, %xmm2
  871. #endif
  872. pshufd $0x4e, %xmm0, %xmm1
  873. pshufd $0x4e, %xmm2, %xmm3
  874. mulpd ALPHA_R, %xmm0
  875. mulpd ALPHA_I, %xmm1
  876. mulpd ALPHA_R, %xmm2
  877. mulpd ALPHA_I, %xmm3
  878. xorpd %xmm11, %xmm1
  879. xorpd %xmm11, %xmm3
  880. subpd %xmm1, %xmm0
  881. subpd %xmm3, %xmm2
  882. movsd 0 * SIZE(Y), %xmm4
  883. movhpd 1 * SIZE(Y), %xmm4
  884. addq INCY, Y
  885. movsd 0 * SIZE(Y), %xmm5
  886. movhpd 1 * SIZE(Y), %xmm5
  887. addq INCY, Y
  888. addpd %xmm4, %xmm0
  889. addpd %xmm5, %xmm2
  890. movlpd %xmm0, 0 * SIZE(Y1)
  891. movhpd %xmm0, 1 * SIZE(Y1)
  892. addq INCY, Y1
  893. movlpd %xmm2, 0 * SIZE(Y1)
  894. movhpd %xmm2, 1 * SIZE(Y1)
  895. addq INCY, Y1
  896. #if GEMV_UNROLL == 2
  897. cmpq $2, N
  898. jge .L21
  899. #endif
  900. ALIGN_3
  901. .L30:
  902. #endif
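/* Single-column tail: same pattern with one accumulator pair        */
/* (xmm0/xmm1) per remaining column.                                 */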
  903. cmpq $1, N
  904. jl .L999
  905. #if GEMV_UNROLL == 1
  906. .L31:
  907. decq N
  908. #endif
  909. leaq 16 * SIZE(BUFFER), X1
  910. movq A, A1
  911. #if GEMV_UNROLL == 1
  912. addq LDA, A
  913. #endif
  914. xorpd %xmm0, %xmm0
  915. xorpd %xmm1, %xmm1
  916. MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
  917. MOVUPS_XL1(-14 * SIZE, X1, %xmm5)
  918. movq M, I
  919. sarq $2, I
  920. jle .L35
  921. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  922. MOVUPS_A1(-14 * SIZE, A1, %xmm12)
  923. decq I
  924. jle .L34
  925. ALIGN_3
  926. .L33:
  927. #ifdef PREFETCH
  928. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  929. #endif
  930. pshufd $0x4e, %xmm8, %xmm9
  931. mulpd %xmm4, %xmm8
  932. addpd %xmm8, %xmm0
  933. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  934. mulpd %xmm4, %xmm9
  935. SUBPD %xmm9, %xmm1
  936. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  937. pshufd $0x4e, %xmm12, %xmm13
  938. mulpd %xmm5, %xmm12
  939. addpd %xmm12, %xmm0
  940. MOVUPS_A1(-10 * SIZE, A1, %xmm12)
  941. mulpd %xmm5, %xmm13
  942. SUBPD %xmm13, %xmm1
  943. MOVUPS_XL1(-10 * SIZE, X1, %xmm5)
  944. #ifdef PREFETCHW
  945. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
  946. #endif
  947. pshufd $0x4e, %xmm8, %xmm9
  948. mulpd %xmm4, %xmm8
  949. addpd %xmm8, %xmm0
  950. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  951. mulpd %xmm4, %xmm9
  952. SUBPD %xmm9, %xmm1
  953. MOVUPS_XL1( -8 * SIZE, X1, %xmm4)
  954. pshufd $0x4e, %xmm12, %xmm13
  955. mulpd %xmm5, %xmm12
  956. addpd %xmm12, %xmm0
  957. MOVUPS_A1( -6 * SIZE, A1, %xmm12)
  958. mulpd %xmm5, %xmm13
  959. SUBPD %xmm13, %xmm1
  960. MOVUPS_XL1(-6 * SIZE, X1, %xmm5)
  961. subq $-8 * SIZE, A1
  962. subq $-8 * SIZE, X1
  963. subq $1, I
  964. BRANCH
  965. jg .L33
  966. ALIGN_3
  967. .L34:
  968. pshufd $0x4e, %xmm8, %xmm9
  969. mulpd %xmm4, %xmm8
  970. addpd %xmm8, %xmm0
  971. MOVUPS_A1(-12 * SIZE, A1, %xmm8)
  972. mulpd %xmm4, %xmm9
  973. SUBPD %xmm9, %xmm1
  974. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  975. pshufd $0x4e, %xmm12, %xmm13
  976. mulpd %xmm5, %xmm12
  977. addpd %xmm12, %xmm0
  978. MOVUPS_A1(-10 * SIZE, A1, %xmm12)
  979. mulpd %xmm5, %xmm13
  980. SUBPD %xmm13, %xmm1
  981. MOVUPS_XL1(-10 * SIZE, X1, %xmm5)
  982. pshufd $0x4e, %xmm8, %xmm9
  983. mulpd %xmm4, %xmm8
  984. addpd %xmm8, %xmm0
  985. mulpd %xmm4, %xmm9
  986. SUBPD %xmm9, %xmm1
  987. MOVUPS_XL1( -8 * SIZE, X1, %xmm4)
  988. pshufd $0x4e, %xmm12, %xmm13
  989. mulpd %xmm5, %xmm12
  990. addpd %xmm12, %xmm0
  991. mulpd %xmm5, %xmm13
  992. SUBPD %xmm13, %xmm1
  993. MOVUPS_XL1(-6 * SIZE, X1, %xmm5)
  994. subq $-8 * SIZE, A1
  995. subq $-8 * SIZE, X1
  996. ALIGN_3
  997. .L35:
  998. testq $2, M
  999. je .L37
  1000. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1001. MOVUPS_A1(-14 * SIZE, A1, %xmm12)
  1002. pshufd $0x4e, %xmm8, %xmm9
  1003. mulpd %xmm4, %xmm8
  1004. addpd %xmm8, %xmm0
  1005. mulpd %xmm4, %xmm9
  1006. SUBPD %xmm9, %xmm1
  1007. MOVUPS_XL1(-12 * SIZE, X1, %xmm4)
  1008. pshufd $0x4e, %xmm12, %xmm13
  1009. mulpd %xmm5, %xmm12
  1010. addpd %xmm12, %xmm0
  1011. mulpd %xmm5, %xmm13
  1012. SUBPD %xmm13, %xmm1
  1013. addq $4 * SIZE, A1
  1014. ALIGN_3
  1015. .L37:
  1016. testq $1, M
  1017. je .L39
  1018. MOVUPS_A1(-16 * SIZE, A1, %xmm8)
  1019. pshufd $0x4e, %xmm8, %xmm9
  1020. mulpd %xmm4, %xmm8
  1021. addpd %xmm8, %xmm0
  1022. mulpd %xmm4, %xmm9
  1023. SUBPD %xmm9, %xmm1
  1024. ALIGN_3
  1025. .L39:
  1026. pcmpeqb %xmm11, %xmm11
  1027. psllq $63, %xmm11
  1028. shufps $0xc0, %xmm11, %xmm11
  1029. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  1030. xorpd %xmm11, %xmm0
  1031. #else
  1032. xorpd %xmm11, %xmm1
  1033. #endif
  1034. #ifdef HAVE_SSE3
  1035. haddpd %xmm1, %xmm0
  1036. #else
  1037. movapd %xmm0, %xmm8
  1038. unpcklpd %xmm1, %xmm0
  1039. unpckhpd %xmm1, %xmm8
  1040. addpd %xmm8, %xmm0
  1041. #endif
  1042. pshufd $0x4e, %xmm0, %xmm1
  1043. mulpd ALPHA_R, %xmm0
  1044. mulpd ALPHA_I, %xmm1
  1045. xorpd %xmm11, %xmm1
  1046. subpd %xmm1, %xmm0
  1047. movsd 0 * SIZE(Y), %xmm4
  1048. movhpd 1 * SIZE(Y), %xmm4
  1049. addpd %xmm4, %xmm0
  1050. movlpd %xmm0, 0 * SIZE(Y1)
  1051. movhpd %xmm0, 1 * SIZE(Y1)
  1052. #if GEMV_UNROLL == 1
  1053. addq INCY, Y
  1054. addq INCY, Y1
  1055. cmpq $1, N
  1056. jge .L31
  1057. #endif
  1058. #ifdef ALIGNED_ACCESS
  1059. jmp .L999
  1060. ALIGN_3
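/* .L100: path taken under ALIGNED_ACCESS when A is not 16-byte      */
/* aligned; the computation is the same, but column data is loaded   */
/* with movsd/movhpd pairs instead of 16-byte loads.                 */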
  1061. .L100:
  1062. #if GEMV_UNROLL >= 4
  1063. cmpq $4, N
  1064. jl .L110
  1065. ALIGN_3
  1066. .L101:
  1067. subq $4, N
  1068. leaq 16 * SIZE(BUFFER), X1
  1069. movq A, A1
  1070. leaq (A1, LDA, 2), A2
  1071. leaq (A1, LDA, 4), A
  1072. MOVUPS_XL1(-16 * SIZE, X1, %xmm12)
  1073. xorpd %xmm0, %xmm0
  1074. xorpd %xmm1, %xmm1
  1075. xorpd %xmm2, %xmm2
  1076. xorpd %xmm3, %xmm3
  1077. MOVUPS_XL1(-14 * SIZE, X1, %xmm13)
  1078. xorpd %xmm4, %xmm4
  1079. xorpd %xmm5, %xmm5
  1080. xorpd %xmm6, %xmm6
  1081. xorpd %xmm7, %xmm7
  1082. #ifdef PREFETCHW
  1083. PREFETCHW 3 * SIZE(Y1)
  1084. #endif
  1085. movq M, I
  1086. sarq $2, I
  1087. jle .L105
  1088. movsd -16 * SIZE(A1), %xmm8
  1089. movhpd -15 * SIZE(A1), %xmm8
  1090. movsd -16 * SIZE(A1, LDA), %xmm10
  1091. movhpd -15 * SIZE(A1, LDA), %xmm10
  1092. decq I
  1093. jle .L104
  1094. ALIGN_3
  1095. .L103:
  1096. #ifdef PREFETCH
  1097. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  1098. #endif
  1099. pshufd $0x4e, %xmm8, %xmm9
  1100. mulpd %xmm12, %xmm8
  1101. addpd %xmm8, %xmm0
  1102. movsd -16 * SIZE(A2), %xmm8
  1103. movhpd -15 * SIZE(A2), %xmm8
  1104. mulpd %xmm12, %xmm9
  1105. SUBPD %xmm9, %xmm1
  1106. pshufd $0x4e, %xmm10, %xmm11
  1107. mulpd %xmm12, %xmm10
  1108. addpd %xmm10, %xmm2
  1109. movsd -16 * SIZE(A2, LDA), %xmm10
  1110. movhpd -15 * SIZE(A2, LDA), %xmm10
  1111. mulpd %xmm12, %xmm11
  1112. SUBPD %xmm11, %xmm3
  1113. pshufd $0x4e, %xmm8, %xmm9
  1114. mulpd %xmm12, %xmm8
  1115. addpd %xmm8, %xmm4
  1116. movsd -14 * SIZE(A1), %xmm8
  1117. movhpd -13 * SIZE(A1), %xmm8
  1118. mulpd %xmm12, %xmm9
  1119. SUBPD %xmm9, %xmm5
  1120. pshufd $0x4e, %xmm10, %xmm11
  1121. mulpd %xmm12, %xmm10
  1122. addpd %xmm10, %xmm6
  1123. movsd -14 * SIZE(A1, LDA), %xmm10
  1124. movhpd -13 * SIZE(A1, LDA), %xmm10
  1125. mulpd %xmm12, %xmm11
  1126. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1127. SUBPD %xmm11, %xmm7
  1128. #ifdef PREFETCH
  1129. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  1130. #endif
  1131. pshufd $0x4e, %xmm8, %xmm9
  1132. mulpd %xmm13, %xmm8
  1133. addpd %xmm8, %xmm0
  1134. movsd -14 * SIZE(A2), %xmm8
  1135. movhpd -13 * SIZE(A2), %xmm8
  1136. mulpd %xmm13, %xmm9
  1137. SUBPD %xmm9, %xmm1
  1138. pshufd $0x4e, %xmm10, %xmm11
  1139. mulpd %xmm13, %xmm10
  1140. addpd %xmm10, %xmm2
  1141. movsd -14 * SIZE(A2, LDA), %xmm10
  1142. movhpd -13 * SIZE(A2, LDA), %xmm10
  1143. mulpd %xmm13, %xmm11
  1144. SUBPD %xmm11, %xmm3
  1145. pshufd $0x4e, %xmm8, %xmm9
  1146. mulpd %xmm13, %xmm8
  1147. addpd %xmm8, %xmm4
  1148. movsd -12 * SIZE(A1), %xmm8
  1149. movhpd -11 * SIZE(A1), %xmm8
  1150. mulpd %xmm13, %xmm9
  1151. SUBPD %xmm9, %xmm5
  1152. pshufd $0x4e, %xmm10, %xmm11
  1153. mulpd %xmm13, %xmm10
  1154. addpd %xmm10, %xmm6
  1155. movsd -12 * SIZE(A1, LDA), %xmm10
  1156. movhpd -11 * SIZE(A1, LDA), %xmm10
  1157. mulpd %xmm13, %xmm11
  1158. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1159. SUBPD %xmm11, %xmm7
  1160. #ifdef PREFETCH
  1161. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  1162. #endif
  1163. pshufd $0x4e, %xmm8, %xmm9
  1164. mulpd %xmm12, %xmm8
  1165. addpd %xmm8, %xmm0
  1166. movsd -12 * SIZE(A2), %xmm8
  1167. movhpd -11 * SIZE(A2), %xmm8
  1168. mulpd %xmm12, %xmm9
  1169. SUBPD %xmm9, %xmm1
  1170. pshufd $0x4e, %xmm10, %xmm11
  1171. mulpd %xmm12, %xmm10
  1172. addpd %xmm10, %xmm2
  1173. movsd -12 * SIZE(A2, LDA), %xmm10
  1174. movhpd -11 * SIZE(A2, LDA), %xmm10
  1175. mulpd %xmm12, %xmm11
  1176. SUBPD %xmm11, %xmm3
  1177. pshufd $0x4e, %xmm8, %xmm9
  1178. mulpd %xmm12, %xmm8
  1179. addpd %xmm8, %xmm4
  1180. movsd -10 * SIZE(A1), %xmm8
  1181. movhpd -9 * SIZE(A1), %xmm8
  1182. mulpd %xmm12, %xmm9
  1183. SUBPD %xmm9, %xmm5
  1184. pshufd $0x4e, %xmm10, %xmm11
  1185. mulpd %xmm12, %xmm10
  1186. addpd %xmm10, %xmm6
  1187. movsd -10 * SIZE(A1, LDA), %xmm10
  1188. movhpd -9 * SIZE(A1, LDA), %xmm10
  1189. mulpd %xmm12, %xmm11
  1190. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  1191. SUBPD %xmm11, %xmm7
  1192. #ifdef PREFETCH
  1193. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  1194. #endif
  1195. pshufd $0x4e, %xmm8, %xmm9
  1196. mulpd %xmm13, %xmm8
  1197. addpd %xmm8, %xmm0
  1198. movsd -10 * SIZE(A2), %xmm8
  1199. movhpd -9 * SIZE(A2), %xmm8
  1200. mulpd %xmm13, %xmm9
  1201. SUBPD %xmm9, %xmm1
  1202. pshufd $0x4e, %xmm10, %xmm11
  1203. mulpd %xmm13, %xmm10
  1204. addpd %xmm10, %xmm2
  1205. movsd -10 * SIZE(A2, LDA), %xmm10
  1206. movhpd -9 * SIZE(A2, LDA), %xmm10
  1207. mulpd %xmm13, %xmm11
  1208. SUBPD %xmm11, %xmm3
  1209. #ifdef PREFETCHW
  1210. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1)
  1211. #endif
  1212. pshufd $0x4e, %xmm8, %xmm9
  1213. mulpd %xmm13, %xmm8
  1214. addpd %xmm8, %xmm4
  1215. movsd -8 * SIZE(A1), %xmm8
  1216. movhpd -7 * SIZE(A1), %xmm8
  1217. mulpd %xmm13, %xmm9
  1218. SUBPD %xmm9, %xmm5
  1219. pshufd $0x4e, %xmm10, %xmm11
  1220. mulpd %xmm13, %xmm10
  1221. addpd %xmm10, %xmm6
  1222. movsd -8 * SIZE(A1, LDA), %xmm10
  1223. movhpd -7 * SIZE(A1, LDA), %xmm10
  1224. mulpd %xmm13, %xmm11
  1225. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  1226. SUBPD %xmm11, %xmm7
  1227. subq $-8 * SIZE, A1
  1228. subq $-8 * SIZE, A2
  1229. subq $-8 * SIZE, X1
  1230. subq $1, I
  1231. BRANCH
  1232. jg .L103
  1233. ALIGN_3
  1234. .L104:
  1235. pshufd $0x4e, %xmm8, %xmm9
  1236. mulpd %xmm12, %xmm8
  1237. addpd %xmm8, %xmm0
  1238. movsd -16 * SIZE(A2), %xmm8
  1239. movhpd -15 * SIZE(A2), %xmm8
  1240. mulpd %xmm12, %xmm9
  1241. SUBPD %xmm9, %xmm1
  1242. pshufd $0x4e, %xmm10, %xmm11
  1243. mulpd %xmm12, %xmm10
  1244. addpd %xmm10, %xmm2
  1245. movsd -16 * SIZE(A2, LDA), %xmm10
  1246. movhpd -15 * SIZE(A2, LDA), %xmm10
  1247. mulpd %xmm12, %xmm11
  1248. SUBPD %xmm11, %xmm3
  1249. pshufd $0x4e, %xmm8, %xmm9
  1250. mulpd %xmm12, %xmm8
  1251. addpd %xmm8, %xmm4
  1252. movsd -14 * SIZE(A1), %xmm8
  1253. movhpd -13 * SIZE(A1), %xmm8
  1254. mulpd %xmm12, %xmm9
  1255. SUBPD %xmm9, %xmm5
  1256. pshufd $0x4e, %xmm10, %xmm11
  1257. mulpd %xmm12, %xmm10
  1258. addpd %xmm10, %xmm6
  1259. movsd -14 * SIZE(A1, LDA), %xmm10
  1260. movhpd -13 * SIZE(A1, LDA), %xmm10
  1261. mulpd %xmm12, %xmm11
  1262. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1263. SUBPD %xmm11, %xmm7
  1264. pshufd $0x4e, %xmm8, %xmm9
  1265. mulpd %xmm13, %xmm8
  1266. addpd %xmm8, %xmm0
  1267. movsd -14 * SIZE(A2), %xmm8
  1268. movhpd -13 * SIZE(A2), %xmm8
  1269. mulpd %xmm13, %xmm9
  1270. SUBPD %xmm9, %xmm1
  1271. pshufd $0x4e, %xmm10, %xmm11
  1272. mulpd %xmm13, %xmm10
  1273. addpd %xmm10, %xmm2
  1274. movsd -14 * SIZE(A2, LDA), %xmm10
  1275. movhpd -13 * SIZE(A2, LDA), %xmm10
  1276. mulpd %xmm13, %xmm11
  1277. SUBPD %xmm11, %xmm3
  1278. pshufd $0x4e, %xmm8, %xmm9
  1279. mulpd %xmm13, %xmm8
  1280. addpd %xmm8, %xmm4
  1281. movsd -12 * SIZE(A1), %xmm8
  1282. movhpd -11 * SIZE(A1), %xmm8
  1283. mulpd %xmm13, %xmm9
  1284. SUBPD %xmm9, %xmm5
  1285. pshufd $0x4e, %xmm10, %xmm11
  1286. mulpd %xmm13, %xmm10
  1287. addpd %xmm10, %xmm6
  1288. movsd -12 * SIZE(A1, LDA), %xmm10
  1289. movhpd -11 * SIZE(A1, LDA), %xmm10
  1290. mulpd %xmm13, %xmm11
  1291. MOVUPS_XL1(-10 * SIZE, X1, %xmm13)
  1292. SUBPD %xmm11, %xmm7
  1293. pshufd $0x4e, %xmm8, %xmm9
  1294. mulpd %xmm12, %xmm8
  1295. addpd %xmm8, %xmm0
  1296. movsd -12 * SIZE(A2), %xmm8
  1297. movhpd -11 * SIZE(A2), %xmm8
  1298. mulpd %xmm12, %xmm9
  1299. SUBPD %xmm9, %xmm1
  1300. pshufd $0x4e, %xmm10, %xmm11
  1301. mulpd %xmm12, %xmm10
  1302. addpd %xmm10, %xmm2
  1303. movsd -12 * SIZE(A2, LDA), %xmm10
  1304. movhpd -11 * SIZE(A2, LDA), %xmm10
  1305. mulpd %xmm12, %xmm11
  1306. SUBPD %xmm11, %xmm3
  1307. pshufd $0x4e, %xmm8, %xmm9
  1308. mulpd %xmm12, %xmm8
  1309. addpd %xmm8, %xmm4
  1310. movsd -10 * SIZE(A1), %xmm8
  1311. movhpd -9 * SIZE(A1), %xmm8
  1312. mulpd %xmm12, %xmm9
  1313. SUBPD %xmm9, %xmm5
  1314. pshufd $0x4e, %xmm10, %xmm11
  1315. mulpd %xmm12, %xmm10
  1316. addpd %xmm10, %xmm6
  1317. movsd -10 * SIZE(A1, LDA), %xmm10
  1318. movhpd -9 * SIZE(A1, LDA), %xmm10
  1319. mulpd %xmm12, %xmm11
  1320. MOVUPS_XL1( -8 * SIZE, X1, %xmm12)
  1321. SUBPD %xmm11, %xmm7
  1322. pshufd $0x4e, %xmm8, %xmm9
  1323. mulpd %xmm13, %xmm8
  1324. addpd %xmm8, %xmm0
  1325. movsd -10 * SIZE(A2), %xmm8
  1326. movhpd -9 * SIZE(A2), %xmm8
  1327. mulpd %xmm13, %xmm9
  1328. SUBPD %xmm9, %xmm1
  1329. pshufd $0x4e, %xmm10, %xmm11
  1330. mulpd %xmm13, %xmm10
  1331. addpd %xmm10, %xmm2
  1332. movsd -10 * SIZE(A2, LDA), %xmm10
  1333. movhpd -9 * SIZE(A2, LDA), %xmm10
  1334. mulpd %xmm13, %xmm11
  1335. SUBPD %xmm11, %xmm3
  1336. pshufd $0x4e, %xmm8, %xmm9
  1337. mulpd %xmm13, %xmm8
  1338. addpd %xmm8, %xmm4
  1339. mulpd %xmm13, %xmm9
  1340. SUBPD %xmm9, %xmm5
  1341. pshufd $0x4e, %xmm10, %xmm11
  1342. mulpd %xmm13, %xmm10
  1343. addpd %xmm10, %xmm6
  1344. mulpd %xmm13, %xmm11
  1345. MOVUPS_XL1( -6 * SIZE, X1, %xmm13)
  1346. SUBPD %xmm11, %xmm7
  1347. subq $-8 * SIZE, A1
  1348. subq $-8 * SIZE, A2
  1349. subq $-8 * SIZE, X1
  1350. ALIGN_3
  1351. .L105:
  1352. testq $2, M
  1353. je .L107
  1354. movsd -16 * SIZE(A1), %xmm8
  1355. movhpd -15 * SIZE(A1), %xmm8
  1356. movsd -16 * SIZE(A1, LDA), %xmm10
  1357. movhpd -15 * SIZE(A1, LDA), %xmm10
  1358. pshufd $0x4e, %xmm8, %xmm9
  1359. mulpd %xmm12, %xmm8
  1360. addpd %xmm8, %xmm0
  1361. movsd -16 * SIZE(A2), %xmm8
  1362. movhpd -15 * SIZE(A2), %xmm8
  1363. mulpd %xmm12, %xmm9
  1364. SUBPD %xmm9, %xmm1
  1365. pshufd $0x4e, %xmm10, %xmm11
  1366. mulpd %xmm12, %xmm10
  1367. addpd %xmm10, %xmm2
  1368. movsd -16 * SIZE(A2, LDA), %xmm10
  1369. movhpd -15 * SIZE(A2, LDA), %xmm10
  1370. mulpd %xmm12, %xmm11
  1371. SUBPD %xmm11, %xmm3
  1372. pshufd $0x4e, %xmm8, %xmm9
  1373. mulpd %xmm12, %xmm8
  1374. addpd %xmm8, %xmm4
  1375. movsd -14 * SIZE(A1), %xmm8
  1376. movhpd -13 * SIZE(A1), %xmm8
  1377. mulpd %xmm12, %xmm9
  1378. SUBPD %xmm9, %xmm5
  1379. pshufd $0x4e, %xmm10, %xmm11
  1380. mulpd %xmm12, %xmm10
  1381. addpd %xmm10, %xmm6
  1382. movsd -14 * SIZE(A1, LDA), %xmm10
  1383. movhpd -13 * SIZE(A1, LDA), %xmm10
  1384. mulpd %xmm12, %xmm11
  1385. MOVUPS_XL1(-12 * SIZE, X1, %xmm12)
  1386. SUBPD %xmm11, %xmm7
  1387. pshufd $0x4e, %xmm8, %xmm9
  1388. mulpd %xmm13, %xmm8
  1389. addpd %xmm8, %xmm0
  1390. movsd -14 * SIZE(A2), %xmm8
  1391. movhpd -13 * SIZE(A2), %xmm8
  1392. mulpd %xmm13, %xmm9
  1393. SUBPD %xmm9, %xmm1
  1394. pshufd $0x4e, %xmm10, %xmm11
  1395. mulpd %xmm13, %xmm10
  1396. addpd %xmm10, %xmm2
  1397. movsd -14 * SIZE(A2, LDA), %xmm10
  1398. movhpd -13 * SIZE(A2, LDA), %xmm10
  1399. mulpd %xmm13, %xmm11
  1400. SUBPD %xmm11, %xmm3
  1401. pshufd $0x4e, %xmm8, %xmm9
  1402. mulpd %xmm13, %xmm8
  1403. addpd %xmm8, %xmm4
  1404. mulpd %xmm13, %xmm9
  1405. SUBPD %xmm9, %xmm5
  1406. pshufd $0x4e, %xmm10, %xmm11
  1407. mulpd %xmm13, %xmm10
  1408. addpd %xmm10, %xmm6
  1409. mulpd %xmm13, %xmm11
  1410. SUBPD %xmm11, %xmm7
  1411. addq $4 * SIZE, A1
  1412. addq $4 * SIZE, A2
  1413. ALIGN_3
  1414. .L107:
  1415. testq $1, M
  1416. je .L109
  1417. movsd -16 * SIZE(A1), %xmm8
  1418. movhpd -15 * SIZE(A1), %xmm8
  1419. movsd -16 * SIZE(A1, LDA), %xmm10
  1420. movhpd -15 * SIZE(A1, LDA), %xmm10
  1421. pshufd $0x4e, %xmm8, %xmm9
  1422. mulpd %xmm12, %xmm8
  1423. addpd %xmm8, %xmm0
  1424. movsd -16 * SIZE(A2), %xmm8
  1425. movhpd -15 * SIZE(A2), %xmm8
  1426. mulpd %xmm12, %xmm9
  1427. SUBPD %xmm9, %xmm1
  1428. pshufd $0x4e, %xmm10, %xmm11
  1429. mulpd %xmm12, %xmm10
  1430. addpd %xmm10, %xmm2
  1431. movsd -16 * SIZE(A2, LDA), %xmm10
  1432. movhpd -15 * SIZE(A2, LDA), %xmm10
  1433. mulpd %xmm12, %xmm11
  1434. SUBPD %xmm11, %xmm3
  1435. pshufd $0x4e, %xmm8, %xmm9
  1436. mulpd %xmm12, %xmm8
  1437. addpd %xmm8, %xmm4
  1438. mulpd %xmm12, %xmm9
  1439. SUBPD %xmm9, %xmm5
  1440. pshufd $0x4e, %xmm10, %xmm11
  1441. mulpd %xmm12, %xmm10
  1442. addpd %xmm10, %xmm6
  1443. mulpd %xmm12, %xmm11
  1444. SUBPD %xmm11, %xmm7
  1445. ALIGN_3
  1446. .L109:
  1447. pcmpeqb %xmm13, %xmm13
  1448. psllq $63, %xmm13
  1449. shufps $0xc0, %xmm13, %xmm13
  1450. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  1451. xorpd %xmm13, %xmm0
  1452. xorpd %xmm13, %xmm2
  1453. xorpd %xmm13, %xmm4
  1454. xorpd %xmm13, %xmm6
  1455. #else
  1456. xorpd %xmm13, %xmm1
  1457. xorpd %xmm13, %xmm3
  1458. xorpd %xmm13, %xmm5
  1459. xorpd %xmm13, %xmm7
  1460. #endif
  1461. #ifdef HAVE_SSE3
  1462. haddpd %xmm1, %xmm0
  1463. haddpd %xmm3, %xmm2
  1464. haddpd %xmm5, %xmm4
  1465. haddpd %xmm7, %xmm6
  1466. #else
  1467. movapd %xmm0, %xmm8
  1468. unpcklpd %xmm1, %xmm0
  1469. unpckhpd %xmm1, %xmm8
  1470. movapd %xmm2, %xmm9
  1471. unpcklpd %xmm3, %xmm2
  1472. unpckhpd %xmm3, %xmm9
  1473. movapd %xmm4, %xmm10
  1474. unpcklpd %xmm5, %xmm4
  1475. unpckhpd %xmm5, %xmm10
  1476. movapd %xmm6, %xmm11
  1477. unpcklpd %xmm7, %xmm6
  1478. unpckhpd %xmm7, %xmm11
  1479. addpd %xmm8, %xmm0
  1480. addpd %xmm9, %xmm2
  1481. addpd %xmm10, %xmm4
  1482. addpd %xmm11, %xmm6
  1483. #endif
  1484. pshufd $0x4e, %xmm0, %xmm1
  1485. pshufd $0x4e, %xmm2, %xmm3
  1486. pshufd $0x4e, %xmm4, %xmm5
  1487. pshufd $0x4e, %xmm6, %xmm7
  1488. mulpd ALPHA_R, %xmm0
  1489. mulpd ALPHA_I, %xmm1
  1490. mulpd ALPHA_R, %xmm2
  1491. mulpd ALPHA_I, %xmm3
  1492. mulpd ALPHA_R, %xmm4
  1493. mulpd ALPHA_I, %xmm5
  1494. mulpd ALPHA_R, %xmm6
  1495. mulpd ALPHA_I, %xmm7
  1496. xorpd %xmm13, %xmm1
  1497. xorpd %xmm13, %xmm3
  1498. xorpd %xmm13, %xmm5
  1499. xorpd %xmm13, %xmm7
  1500. subpd %xmm1, %xmm0
  1501. subpd %xmm3, %xmm2
  1502. subpd %xmm5, %xmm4
  1503. subpd %xmm7, %xmm6
  1504. movsd 0 * SIZE(Y), %xmm1
  1505. movhpd 1 * SIZE(Y), %xmm1
  1506. addq INCY, Y
  1507. movsd 0 * SIZE(Y), %xmm3
  1508. movhpd 1 * SIZE(Y), %xmm3
  1509. addq INCY, Y
  1510. movsd 0 * SIZE(Y), %xmm5
  1511. movhpd 1 * SIZE(Y), %xmm5
  1512. addq INCY, Y
  1513. movsd 0 * SIZE(Y), %xmm7
  1514. movhpd 1 * SIZE(Y), %xmm7
  1515. addq INCY, Y
  1516. addpd %xmm1, %xmm0
  1517. addpd %xmm3, %xmm2
  1518. addpd %xmm5, %xmm4
  1519. addpd %xmm7, %xmm6
  1520. movlpd %xmm0, 0 * SIZE(Y1)
  1521. movhpd %xmm0, 1 * SIZE(Y1)
  1522. addq INCY, Y1
  1523. movlpd %xmm2, 0 * SIZE(Y1)
  1524. movhpd %xmm2, 1 * SIZE(Y1)
  1525. addq INCY, Y1
  1526. movlpd %xmm4, 0 * SIZE(Y1)
  1527. movhpd %xmm4, 1 * SIZE(Y1)
  1528. addq INCY, Y1
  1529. movlpd %xmm6, 0 * SIZE(Y1)
  1530. movhpd %xmm6, 1 * SIZE(Y1)
  1531. addq INCY, Y1
  1532. cmpq $4, N
  1533. jge .L101
  1534. ALIGN_3
  1535. .L110:
  1536. #endif
  1537. #if GEMV_UNROLL >= 2
  1538. cmpq $2, N
  1539. jl .L120
  1540. #if GEMV_UNROLL == 2
  1541. ALIGN_3
  1542. .L111:
  1543. #endif
  1544. subq $2, N
  1545. leaq 16 * SIZE(BUFFER), X1
  1546. movq A, A1
  1547. leaq (A1, LDA), A2
  1548. leaq (A1, LDA, 2), A
  1549. xorpd %xmm0, %xmm0
  1550. xorpd %xmm1, %xmm1
  1551. xorpd %xmm2, %xmm2
  1552. xorpd %xmm3, %xmm3
  1553. MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
  1554. MOVUPS_XL1(-14 * SIZE, X1, %xmm5)
  1555. #ifdef PREFETCHW
  1556. PREFETCHW 3 * SIZE(Y1)
  1557. #endif
  1558. movq M, I
  1559. sarq $2, I
  1560. jle .L115
  1561. movsd -16 * SIZE(A1), %xmm8
  1562. movhpd -15 * SIZE(A1), %xmm8
  1563. movsd -16 * SIZE(A2), %xmm10
  1564. movhpd -15 * SIZE(A2), %xmm10
  1565. movsd -14 * SIZE(A1), %xmm12
  1566. movhpd -13 * SIZE(A1), %xmm12
  1567. movsd -14 * SIZE(A2), %xmm6
  1568. movhpd -13 * SIZE(A2), %xmm6
  1569. decq I
  1570. jle .L114
        ALIGN_3

.L113:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -12 * SIZE(A1), %xmm8
        movhpd -11 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        movsd -12 * SIZE(A2), %xmm10
        movhpd -11 * SIZE(A2), %xmm10
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -10 * SIZE(A1), %xmm12
        movhpd -9 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        pshufd $0x4e, %xmm6, %xmm7
        mulpd %xmm5, %xmm6
        addpd %xmm6, %xmm2
        movsd -10 * SIZE(A2), %xmm6
        movhpd -9 * SIZE(A2), %xmm6
        mulpd %xmm5, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -8 * SIZE(A1), %xmm8
        movhpd -7 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        movsd -8 * SIZE(A2), %xmm10
        movhpd -7 * SIZE(A2), %xmm10
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3

        MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

#ifdef PREFETCHW
        PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1)
#endif
        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -6 * SIZE(A1), %xmm12
        movhpd -5 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        pshufd $0x4e, %xmm6, %xmm7
        mulpd %xmm5, %xmm6
        addpd %xmm6, %xmm2
        movsd -6 * SIZE(A2), %xmm6
        movhpd -5 * SIZE(A2), %xmm6
        mulpd %xmm5, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, X1

        subq $1, I
        BRANCH
        jg .L113
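/* Loop tail: last unrolled iteration, identical to .L113 except that it skips
   the prefetches and the look-ahead loads of A for the next iteration. */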
        ALIGN_3

.L114:
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -12 * SIZE(A1), %xmm8
        movhpd -11 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        movsd -12 * SIZE(A2), %xmm10
        movhpd -11 * SIZE(A2), %xmm10
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -10 * SIZE(A1), %xmm12
        movhpd -9 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        pshufd $0x4e, %xmm6, %xmm7
        mulpd %xmm5, %xmm6
        addpd %xmm6, %xmm2
        movsd -10 * SIZE(A2), %xmm6
        movhpd -9 * SIZE(A2), %xmm6
        mulpd %xmm5, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3

        MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        pshufd $0x4e, %xmm6, %xmm7
        mulpd %xmm5, %xmm6
        addpd %xmm6, %xmm2
        mulpd %xmm5, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_XL1( -6 * SIZE, X1, %xmm5)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, X1
        ALIGN_3
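/* M & 2: two remaining complex elements of x against both columns. */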
.L115:
        testq $2, M
        je .L117

        movsd -16 * SIZE(A1), %xmm8
        movhpd -15 * SIZE(A1), %xmm8
        movsd -16 * SIZE(A2), %xmm10
        movhpd -15 * SIZE(A2), %xmm10
        movsd -14 * SIZE(A1), %xmm12
        movhpd -13 * SIZE(A1), %xmm12
        movsd -14 * SIZE(A2), %xmm6
        movhpd -13 * SIZE(A2), %xmm6

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        pshufd $0x4e, %xmm6, %xmm7
        mulpd %xmm5, %xmm6
        addpd %xmm6, %xmm2
        mulpd %xmm5, %xmm7
        SUBPD %xmm7, %xmm3

        addq $4 * SIZE, A1
        addq $4 * SIZE, A2
        ALIGN_3
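/* M & 1: one last complex element of x against both columns. */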
.L117:
        testq $1, M
        je .L119

        movsd -16 * SIZE(A1), %xmm8
        movhpd -15 * SIZE(A1), %xmm8
        movsd -16 * SIZE(A2), %xmm10
        movhpd -15 * SIZE(A2), %xmm10

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        pshufd $0x4e, %xmm10, %xmm11
        mulpd %xmm4, %xmm10
        addpd %xmm10, %xmm2
        mulpd %xmm4, %xmm11
        SUBPD %xmm11, %xmm3
        ALIGN_3
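/* Column reduction: pcmpeqb/psllq/shufps build a constant whose only set bit
   is the sign bit of the upper double.  xorpd with it flips one component of
   the accumulators, selected by CONJ/XCONJ; the partial sums are then combined
   horizontally (haddpd with SSE3, otherwise unpck{l,h}pd + addpd) and scaled
   by ALPHA_R/ALPHA_I before updating two elements of y. */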
.L119:
        pcmpeqb %xmm11, %xmm11
        psllq $63, %xmm11
        shufps $0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
        xorpd %xmm11, %xmm0
        xorpd %xmm11, %xmm2
#else
        xorpd %xmm11, %xmm1
        xorpd %xmm11, %xmm3
#endif

#ifdef HAVE_SSE3
        haddpd %xmm1, %xmm0
        haddpd %xmm3, %xmm2
#else
        movapd %xmm0, %xmm8
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm8
        movapd %xmm2, %xmm9
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm3, %xmm9
        addpd %xmm8, %xmm0
        addpd %xmm9, %xmm2
#endif

        pshufd $0x4e, %xmm0, %xmm1
        pshufd $0x4e, %xmm2, %xmm3

        mulpd ALPHA_R, %xmm0
        mulpd ALPHA_I, %xmm1
        mulpd ALPHA_R, %xmm2
        mulpd ALPHA_I, %xmm3

        xorpd %xmm11, %xmm1
        xorpd %xmm11, %xmm3

        subpd %xmm1, %xmm0
        subpd %xmm3, %xmm2

        movsd 0 * SIZE(Y), %xmm4
        movhpd 1 * SIZE(Y), %xmm4
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm5
        movhpd 1 * SIZE(Y), %xmm5
        addq INCY, Y

        addpd %xmm4, %xmm0
        addpd %xmm5, %xmm2

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm2, 0 * SIZE(Y1)
        movhpd %xmm2, 1 * SIZE(Y1)
        addq INCY, Y1

#if GEMV_UNROLL == 2
        cmpq $2, N
        jge .L111
#endif
        ALIGN_3

.L120:
#endif
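/* Single-column path: handles the last remaining column (or, when
   GEMV_UNROLL == 1, loops over every column via .L121).  Accumulates into
   %xmm0/%xmm1 only. */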
        cmpq $1, N
        jl .L999

#if GEMV_UNROLL == 1
.L121:
        decq N
#endif
        leaq 16 * SIZE(BUFFER), X1

        movq A, A1
#if GEMV_UNROLL == 1
        addq LDA, A
#endif

        xorpd %xmm0, %xmm0
        xorpd %xmm1, %xmm1

        MOVUPS_XL1(-16 * SIZE, X1, %xmm4)
        MOVUPS_XL1(-14 * SIZE, X1, %xmm5)

        movq M, I
        sarq $2, I
        jle .L125

        movsd -16 * SIZE(A1), %xmm8
        movhpd -15 * SIZE(A1), %xmm8
        movsd -14 * SIZE(A1), %xmm12
        movhpd -13 * SIZE(A1), %xmm12

        decq I
        jle .L124
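/* Single-column main loop: same multiply/accumulate pattern as .L113,
   restricted to the A1 column. */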
        ALIGN_3

.L123:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -12 * SIZE(A1), %xmm8
        movhpd -11 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -10 * SIZE(A1), %xmm12
        movhpd -9 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

#ifdef PREFETCHW
        PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1)
#endif
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -8 * SIZE(A1), %xmm8
        movhpd -7 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -6 * SIZE(A1), %xmm12
        movhpd -5 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, X1

        subq $1, I
        BRANCH
        jg .L123
        ALIGN_3

.L124:
        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        movsd -12 * SIZE(A1), %xmm8
        movhpd -11 * SIZE(A1), %xmm8
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        movsd -10 * SIZE(A1), %xmm12
        movhpd -9 * SIZE(A1), %xmm12
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        MOVUPS_XL1(-10 * SIZE, X1, %xmm5)

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        MOVUPS_XL1( -8 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        MOVUPS_XL1(-6 * SIZE, X1, %xmm5)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, X1
        ALIGN_3

.L125:
        testq $2, M
        je .L127

        movsd -16 * SIZE(A1), %xmm8
        movhpd -15 * SIZE(A1), %xmm8
        movsd -14 * SIZE(A1), %xmm12
        movhpd -13 * SIZE(A1), %xmm12

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1

        MOVUPS_XL1(-12 * SIZE, X1, %xmm4)

        pshufd $0x4e, %xmm12, %xmm13
        mulpd %xmm5, %xmm12
        addpd %xmm12, %xmm0
        mulpd %xmm5, %xmm13
        SUBPD %xmm13, %xmm1

        addq $4 * SIZE, A1
        ALIGN_3

.L127:
        testq $1, M
        je .L129

        movsd -16 * SIZE(A1), %xmm8
        movhpd -15 * SIZE(A1), %xmm8

        pshufd $0x4e, %xmm8, %xmm9
        mulpd %xmm4, %xmm8
        addpd %xmm8, %xmm0
        mulpd %xmm4, %xmm9
        SUBPD %xmm9, %xmm1
        ALIGN_3
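/* Final reduction for the single-column case: same sign-mask and
   horizontal-add sequence as .L119, then alpha scaling and the update of one
   element of y. */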
.L129:
        pcmpeqb %xmm11, %xmm11
        psllq $63, %xmm11
        shufps $0xc0, %xmm11, %xmm11

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
        xorpd %xmm11, %xmm0
#else
        xorpd %xmm11, %xmm1
#endif

#ifdef HAVE_SSE3
        haddpd %xmm1, %xmm0
#else
        movapd %xmm0, %xmm8
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm8
        addpd %xmm8, %xmm0
#endif

        pshufd $0x4e, %xmm0, %xmm1

        mulpd ALPHA_R, %xmm0
        mulpd ALPHA_I, %xmm1

        xorpd %xmm11, %xmm1
        subpd %xmm1, %xmm0

        movsd 0 * SIZE(Y), %xmm4
        movhpd 1 * SIZE(Y), %xmm4

        addpd %xmm4, %xmm0

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)

#if GEMV_UNROLL == 1
        addq INCY, Y
        addq INCY, Y1

        cmpq $1, N
        jge .L121
#endif
#endif
        ALIGN_3
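/* Advance AA by M complex elements (M << ZBASE_SHIFT bytes) and continue at
   .L0t, defined earlier in this file. */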
.L999:
        movq M, I
        salq $ZBASE_SHIFT, I
        addq I, AA
        jmp .L0t
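/* Epilogue: restore the callee-saved registers (plus %rdi, %rsi and
   %xmm6-%xmm15 under WINDOWS_ABI), release the stack frame and return. */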
.L999x:
        movq 0(%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif

        addq $STACKSIZE, %rsp
        ret

        EPILOGUE