
zgemv_n_dup.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
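
/*
   zgemv_n_dup: double-complex GEMV kernel (no transpose), built around
   movddup broadcasts of the real/imaginary parts of each matrix entry.
   A rough sketch of what the code below computes, inferred from the
   kernel itself (beta scaling of y is handled by the caller, not here;
   names such as t and buffer are illustrative only):

       for (j = 0; j < n; j++) {
           t = alpha * x[j * incx];                // complex product
           for (i = 0; i < m; i++)
               buffer[i] += t * a[i + j * lda];    // complex multiply-add
       }
       for (i = 0; i < m; i++)
           y[i * incy] += buffer[i];

   The CONJ / XCONJ build options select which operand is conjugated.
*/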
#define ASSEMBLER
#include "common.h"
#include "l2param.h"

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA_R 48 (%rsp)
#define ALPHA_I 56 (%rsp)

#define M %rdi
#define N %rsi
#define A %rcx
#define LDA %r8
#define X %r9
#define INCX %rdx
#define Y %rbp
#define INCY %r10

#else

#define STACKSIZE 256

#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_LDA 56 + STACKSIZE(%rsp)
#define OLD_X 64 + STACKSIZE(%rsp)
#define OLD_INCX 72 + STACKSIZE(%rsp)
#define OLD_Y 80 + STACKSIZE(%rsp)
#define OLD_INCY 88 + STACKSIZE(%rsp)
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA_R 224 (%rsp)
#define ALPHA_I 232 (%rsp)

#define M %rcx
#define N %rdx
#define A %r8
#define LDA %r9
#define X %rdi
#define INCX %rsi
#define Y %rbp
#define INCY %r10

#endif

#define I %rax
#define A1 %r12
#define A2 %r13
#define Y1 %r14
#define BUFFER %r15
#define J %r11

#undef SUBPD

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD subpd
#else
#define SUBPD addpd
#endif
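
/* SUBPD folds the conjugation mode in at build time: in the plain and
   doubly-conjugated cases the imaginary cross terms are subtracted,
   otherwise added, so one loop body serves every CONJ/XCONJ combination. */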

        PROLOGUE
        PROFCODE

        subq $STACKSIZE, %rsp
        movq %rbx, 0(%rsp)
        movq %rbp, 8(%rsp)
        movq %r12, 16(%rsp)
        movq %r13, 24(%rsp)
        movq %r14, 32(%rsp)
        movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
        movq %rdi, 48(%rsp)
        movq %rsi, 56(%rsp)
        movups %xmm6, 64(%rsp)
        movups %xmm7, 80(%rsp)
        movups %xmm8, 96(%rsp)
        movups %xmm9, 112(%rsp)
        movups %xmm10, 128(%rsp)
        movups %xmm11, 144(%rsp)
        movups %xmm12, 160(%rsp)
        movups %xmm13, 176(%rsp)
        movups %xmm14, 192(%rsp)
        movups %xmm15, 208(%rsp)

        movq OLD_A, A
        movq OLD_LDA, LDA
        movq OLD_X, X

        movapd %xmm3, %xmm0
        movsd OLD_ALPHA_I, %xmm1
#endif

        movq OLD_INCX, INCX
        movq OLD_Y, Y
        movq OLD_INCY, INCY
        movq OLD_BUFFER, BUFFER

        salq $ZBASE_SHIFT, LDA
        salq $ZBASE_SHIFT, INCX
        salq $ZBASE_SHIFT, INCY

        movlps %xmm0, ALPHA_R
        movlps %xmm1, ALPHA_I

        subq $-16 * SIZE, A

        testq M, M
        jle .L999
        testq N, N
        jle .L999
        ALIGN_3
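
/* Clear the accumulation buffer, eight complex (16 double) entries per pass. */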
        movq BUFFER, Y1
        xorps %xmm4, %xmm4
        movq M, %rax
        addq $8, %rax
        sarq $3, %rax
        ALIGN_3

.L01:
        movaps %xmm4, 0 * SIZE(Y1)
        movaps %xmm4, 2 * SIZE(Y1)
        movaps %xmm4, 4 * SIZE(Y1)
        movaps %xmm4, 6 * SIZE(Y1)
        movaps %xmm4, 8 * SIZE(Y1)
        movaps %xmm4, 10 * SIZE(Y1)
        movaps %xmm4, 12 * SIZE(Y1)
        movaps %xmm4, 14 * SIZE(Y1)
        subq $-16 * SIZE, Y1
        decq %rax
        jg .L01
        ALIGN_3

.L10:
#if GEMV_UNROLL >= 4
        cmpq $4, N
        jl .L20
        ALIGN_3
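
/* Four-column path (GEMV_UNROLL >= 4): load and alpha-scale four x values,
   then sweep the buffer once while accumulating the four columns of A. */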
.L11:
        subq $4, N

        leaq 16 * SIZE(BUFFER), Y1
        movq A, A1
        leaq (A, LDA, 2), A2
        leaq (A, LDA, 4), A

        movddup 0 * SIZE(X), %xmm8
        movddup 1 * SIZE(X), %xmm9
        addq INCX, X
        movddup 0 * SIZE(X), %xmm10
        movddup 1 * SIZE(X), %xmm11
        addq INCX, X
        movddup 0 * SIZE(X), %xmm12
        movddup 1 * SIZE(X), %xmm13
        addq INCX, X
        movddup 0 * SIZE(X), %xmm14
        movddup 1 * SIZE(X), %xmm15
        addq INCX, X

        pcmpeqb %xmm5, %xmm5
        psllq $63, %xmm5
        shufps $0x40, %xmm5, %xmm5

        movsd ALPHA_R, %xmm6
        movhps ALPHA_I, %xmm6
        pshufd $0x4e, %xmm6, %xmm7

#ifndef XCONJ
        xorps %xmm5, %xmm7
#else
        xorps %xmm5, %xmm6
#endif

        mulpd %xmm6, %xmm8
        mulpd %xmm7, %xmm9
        mulpd %xmm6, %xmm10
        mulpd %xmm7, %xmm11
        mulpd %xmm6, %xmm12
        mulpd %xmm7, %xmm13
        mulpd %xmm6, %xmm14
        mulpd %xmm7, %xmm15

#ifndef XCONJ
        subpd %xmm9, %xmm8
        subpd %xmm11, %xmm10
        subpd %xmm13, %xmm12
        subpd %xmm15, %xmm14
#else
        addpd %xmm9, %xmm8
        addpd %xmm11, %xmm10
        addpd %xmm13, %xmm12
        addpd %xmm15, %xmm14
#endif

        pshufd $0x4e, %xmm8, %xmm9
        pshufd $0x4e, %xmm10, %xmm11
        pshufd $0x4e, %xmm12, %xmm13
        pshufd $0x4e, %xmm14, %xmm15

#ifndef XCONJ
        xorps %xmm5, %xmm9
        xorps %xmm5, %xmm11
        xorps %xmm5, %xmm13
        xorps %xmm5, %xmm15
#else
        xorps %xmm5, %xmm8
        xorps %xmm5, %xmm10
        xorps %xmm5, %xmm12
        xorps %xmm5, %xmm14
#endif

        MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
        ALIGN_3

        movq M, I
        sarq $2, I
        jle .L15

        movddup -16 * SIZE(A1), %xmm4
        movddup -14 * SIZE(A1), %xmm5
        movddup -12 * SIZE(A1), %xmm6
        movddup -10 * SIZE(A1), %xmm7

        decq I
        jle .L14
        ALIGN_3
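
/* Main inner loop: four complex y entries per iteration; the real and
   imaginary parts of each matrix entry are broadcast with movddup and
   multiply-accumulated into %xmm0-%xmm3. */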
.L13:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
#endif

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A1, LDA), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A1, LDA), %xmm5
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A1, LDA), %xmm6
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
#endif

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1, LDA), %xmm4
        mulpd %xmm10, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1, LDA), %xmm5
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1, LDA), %xmm6
        mulpd %xmm10, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1, LDA), %xmm7

        mulpd %xmm11, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2), %xmm5
        mulpd %xmm11, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2), %xmm6
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2), %xmm7

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
#endif

        mulpd %xmm12, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2), %xmm4
        mulpd %xmm12, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2), %xmm5
        mulpd %xmm12, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2), %xmm6
        mulpd %xmm12, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2), %xmm7

        mulpd %xmm13, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2, LDA), %xmm4
        mulpd %xmm13, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2, LDA), %xmm5
        mulpd %xmm13, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2, LDA), %xmm6
        mulpd %xmm13, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2, LDA), %xmm7

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
#endif

        mulpd %xmm14, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2, LDA), %xmm4
        mulpd %xmm14, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2, LDA), %xmm5
        mulpd %xmm14, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2, LDA), %xmm6
        mulpd %xmm14, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2, LDA), %xmm7

        mulpd %xmm15, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -8 * SIZE(A1), %xmm4
        mulpd %xmm15, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -6 * SIZE(A1), %xmm5
        mulpd %xmm15, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -4 * SIZE(A1), %xmm6
        mulpd %xmm15, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
#endif

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, Y1

        subq $1, I
        BRANCH
        jg .L13
        ALIGN_3

.L14:
        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A1, LDA), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A1, LDA), %xmm5
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A1, LDA), %xmm6
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A1, LDA), %xmm7

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1, LDA), %xmm4
        mulpd %xmm10, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1, LDA), %xmm5
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1, LDA), %xmm6
        mulpd %xmm10, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1, LDA), %xmm7

        mulpd %xmm11, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2), %xmm5
        mulpd %xmm11, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2), %xmm6
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2), %xmm7

        mulpd %xmm12, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2), %xmm4
        mulpd %xmm12, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2), %xmm5
        mulpd %xmm12, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2), %xmm6
        mulpd %xmm12, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2), %xmm7

        mulpd %xmm13, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2, LDA), %xmm4
        mulpd %xmm13, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2, LDA), %xmm5
        mulpd %xmm13, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2, LDA), %xmm6
        mulpd %xmm13, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2, LDA), %xmm7

        mulpd %xmm14, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2, LDA), %xmm4
        mulpd %xmm14, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2, LDA), %xmm5
        mulpd %xmm14, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2, LDA), %xmm6
        mulpd %xmm14, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2, LDA), %xmm7

        mulpd %xmm15, %xmm4
        SUBPD %xmm4, %xmm0
        mulpd %xmm15, %xmm5
        SUBPD %xmm5, %xmm1
        mulpd %xmm15, %xmm6
        SUBPD %xmm6, %xmm2
        mulpd %xmm15, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, Y1
        ALIGN_3

.L15:
        testq $2, M
        je .L17

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5
        movddup -14 * SIZE(A1), %xmm6
        movddup -13 * SIZE(A1), %xmm7

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -16 * SIZE(A1, LDA, 1), %xmm4
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm1
        movddup -14 * SIZE(A1, LDA, 1), %xmm6
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0
        movddup -15 * SIZE(A1, LDA, 1), %xmm5
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm1
        movddup -13 * SIZE(A1, LDA, 1), %xmm7

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm1
        movddup -14 * SIZE(A2), %xmm6
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm0
        movddup -15 * SIZE(A2), %xmm5
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm1
        movddup -13 * SIZE(A2), %xmm7

        mulpd %xmm12, %xmm4
        addpd %xmm4, %xmm0
        movddup -16 * SIZE(A2, LDA, 1), %xmm4
        mulpd %xmm12, %xmm6
        addpd %xmm6, %xmm1
        movddup -14 * SIZE(A2, LDA, 1), %xmm6
        mulpd %xmm13, %xmm5
        SUBPD %xmm5, %xmm0
        movddup -15 * SIZE(A2, LDA, 1), %xmm5
        mulpd %xmm13, %xmm7
        SUBPD %xmm7, %xmm1
        movddup -13 * SIZE(A2, LDA, 1), %xmm7

        mulpd %xmm14, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm14, %xmm6
        addpd %xmm6, %xmm1
        mulpd %xmm15, %xmm5
        SUBPD %xmm5, %xmm0
        mulpd %xmm15, %xmm7
        SUBPD %xmm7, %xmm1

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        movaps %xmm2, %xmm0

        addq $4 * SIZE, A1
        addq $4 * SIZE, A2
        addq $4 * SIZE, Y1
        ALIGN_3

.L17:
        testq $1, M
        je .L19

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5
        movddup -16 * SIZE(A1, LDA, 1), %xmm6
        movddup -15 * SIZE(A1, LDA, 1), %xmm7

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0
        movddup -15 * SIZE(A2), %xmm5
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm0
        movddup -16 * SIZE(A2, LDA, 1), %xmm6
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm0
        movddup -15 * SIZE(A2, LDA, 1), %xmm7

        mulpd %xmm12, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm13, %xmm5
        SUBPD %xmm5, %xmm0
        mulpd %xmm14, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm15, %xmm7
        SUBPD %xmm7, %xmm0

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        ALIGN_3

.L19:
        cmpq $4, N
        jge .L11
        ALIGN_3

.L20:
#endif
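
/* Two-column path (GEMV_UNROLL >= 2): same scheme with two scaled x values. */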
#if GEMV_UNROLL >= 2

        cmpq $2, N
        jl .L30

#if GEMV_UNROLL == 2
        ALIGN_3

.L21:
#endif
        subq $2, N

        leaq 16 * SIZE(BUFFER), Y1
        movq A, A1
        leaq (A, LDA, 1), A2
        leaq (A, LDA, 2), A

        movddup 0 * SIZE(X), %xmm8
        movddup 1 * SIZE(X), %xmm9
        addq INCX, X
        movddup 0 * SIZE(X), %xmm10
        movddup 1 * SIZE(X), %xmm11
        addq INCX, X

        pcmpeqb %xmm5, %xmm5
        psllq $63, %xmm5
        shufps $0x40, %xmm5, %xmm5

        movsd ALPHA_R, %xmm6
        movhps ALPHA_I, %xmm6
        pshufd $0x4e, %xmm6, %xmm7

#ifndef XCONJ
        xorps %xmm5, %xmm7
#else
        xorps %xmm5, %xmm6
#endif

        mulpd %xmm6, %xmm8
        mulpd %xmm7, %xmm9
        mulpd %xmm6, %xmm10
        mulpd %xmm7, %xmm11

#ifndef XCONJ
        subpd %xmm9, %xmm8
        subpd %xmm11, %xmm10
#else
        addpd %xmm9, %xmm8
        addpd %xmm11, %xmm10
#endif

        pshufd $0x4e, %xmm8, %xmm9
        pshufd $0x4e, %xmm10, %xmm11

#ifndef XCONJ
        xorps %xmm5, %xmm9
        xorps %xmm5, %xmm11
#else
        xorps %xmm5, %xmm8
        xorps %xmm5, %xmm10
#endif

        MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

        movq M, I
        sarq $2, I
        jle .L25

        movddup -16 * SIZE(A1), %xmm4
        movddup -14 * SIZE(A1), %xmm5
        movddup -12 * SIZE(A1), %xmm6
        movddup -10 * SIZE(A1), %xmm7

        decq I
        jle .L24
        ALIGN_3

.L23:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
#endif

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2), %xmm5
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2), %xmm6
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2), %xmm7

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
#endif

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2), %xmm4
        mulpd %xmm10, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2), %xmm5
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2), %xmm6
        mulpd %xmm10, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2), %xmm7

        mulpd %xmm11, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -8 * SIZE(A1), %xmm4
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -6 * SIZE(A1), %xmm5
        mulpd %xmm11, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -4 * SIZE(A1), %xmm6
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
#endif

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, Y1

        subq $1, I
        BRANCH
        jg .L23
        ALIGN_3

.L24:
        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -14 * SIZE(A2), %xmm5
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -12 * SIZE(A2), %xmm6
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -10 * SIZE(A2), %xmm7

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A2), %xmm4
        mulpd %xmm10, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A2), %xmm5
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A2), %xmm6
        mulpd %xmm10, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A2), %xmm7

        mulpd %xmm11, %xmm4
        SUBPD %xmm4, %xmm0
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm1
        mulpd %xmm11, %xmm6
        SUBPD %xmm6, %xmm2
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, A2
        subq $-8 * SIZE, Y1
        ALIGN_3

.L25:
        testq $2, M
        je .L27

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5
        movddup -14 * SIZE(A1), %xmm6
        movddup -13 * SIZE(A1), %xmm7

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -16 * SIZE(A2), %xmm4
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm1
        movddup -14 * SIZE(A2), %xmm6
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0
        movddup -15 * SIZE(A2), %xmm5
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm1
        movddup -13 * SIZE(A2), %xmm7

        mulpd %xmm10, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm1
        mulpd %xmm11, %xmm5
        SUBPD %xmm5, %xmm0
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm1

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        movaps %xmm2, %xmm0

        addq $4 * SIZE, A1
        addq $4 * SIZE, A2
        addq $4 * SIZE, Y1
        ALIGN_3

.L27:
        testq $1, M
#if GEMV_UNROLL == 2
        je .L29
#else
        je .L30
#endif

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5
        movddup -16 * SIZE(A2), %xmm6
        movddup -15 * SIZE(A2), %xmm7

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0
        mulpd %xmm10, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm11, %xmm7
        SUBPD %xmm7, %xmm0

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 2
        ALIGN_3

.L29:
        cmpq $2, N
        jge .L21
#endif
        ALIGN_3

.L30:
#endif
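
/* Single-column path: handles the last remaining column (or every column
   when GEMV_UNROLL == 1). */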
        cmpq $1, N
        jl .L980

#if GEMV_UNROLL == 1
.L31:
        decq N
#endif

        leaq 16 * SIZE(BUFFER), Y1
        movq A, A1
#if GEMV_UNROLL == 1
        addq LDA, A
#endif

        movddup 0 * SIZE(X), %xmm8
        movddup 1 * SIZE(X), %xmm9
        addq INCX, X

        pcmpeqb %xmm5, %xmm5
        psllq $63, %xmm5
        shufps $0x40, %xmm5, %xmm5

        movsd ALPHA_R, %xmm6
        movhps ALPHA_I, %xmm6
        pshufd $0x4e, %xmm6, %xmm7

#ifndef XCONJ
        xorps %xmm5, %xmm7
#else
        xorps %xmm5, %xmm6
#endif

        mulpd %xmm6, %xmm8
        mulpd %xmm7, %xmm9

#ifndef XCONJ
        subpd %xmm9, %xmm8
#else
        addpd %xmm9, %xmm8
#endif

        pshufd $0x4e, %xmm8, %xmm9

#ifndef XCONJ
        xorps %xmm5, %xmm9
#else
        xorps %xmm5, %xmm8
#endif

        MOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YL1(-10 * SIZE, Y1, %xmm3)

        movq M, I
        sarq $2, I
        jle .L35

        movddup -16 * SIZE(A1), %xmm4
        movddup -14 * SIZE(A1), %xmm5
        movddup -12 * SIZE(A1), %xmm6
        movddup -10 * SIZE(A1), %xmm7

        decq I
        jle .L34
        ALIGN_3

.L33:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
#endif

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        movddup -8 * SIZE(A1), %xmm4
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        movddup -6 * SIZE(A1), %xmm5
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        movddup -4 * SIZE(A1), %xmm6
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3
        movddup -2 * SIZE(A1), %xmm7

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
#endif

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, Y1

        subq $1, I
        BRANCH
        jg .L33
        ALIGN_3

.L34:
        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        movddup -15 * SIZE(A1), %xmm4
        mulpd %xmm8, %xmm5
        addpd %xmm5, %xmm1
        movddup -13 * SIZE(A1), %xmm5
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm2
        movddup -11 * SIZE(A1), %xmm6
        mulpd %xmm8, %xmm7
        addpd %xmm7, %xmm3
        movddup -9 * SIZE(A1), %xmm7

        mulpd %xmm9, %xmm4
        SUBPD %xmm4, %xmm0
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm1
        mulpd %xmm9, %xmm6
        SUBPD %xmm6, %xmm2
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm3

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        MOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
        MOVUPS_YS1(-10 * SIZE, Y1, %xmm3)

        MOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
        MOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
        MOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
        MOVUPS_YL1( -2 * SIZE, Y1, %xmm3)

        subq $-8 * SIZE, A1
        subq $-8 * SIZE, Y1
        ALIGN_3

.L35:
        testq $2, M
        je .L37

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5
        movddup -14 * SIZE(A1), %xmm6
        movddup -13 * SIZE(A1), %xmm7

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm8, %xmm6
        addpd %xmm6, %xmm1
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0
        mulpd %xmm9, %xmm7
        SUBPD %xmm7, %xmm1

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
        MOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
        movaps %xmm2, %xmm0

        addq $4 * SIZE, A1
        addq $4 * SIZE, Y1
        ALIGN_3

.L37:
        testq $1, M
#if GEMV_UNROLL == 1
        je .L39
#else
        je .L980
#endif

        movddup -16 * SIZE(A1), %xmm4
        movddup -15 * SIZE(A1), %xmm5

        mulpd %xmm8, %xmm4
        addpd %xmm4, %xmm0
        mulpd %xmm9, %xmm5
        SUBPD %xmm5, %xmm0

        MOVUPS_YS1(-16 * SIZE, Y1, %xmm0)

#if GEMV_UNROLL == 1
        ALIGN_3

.L39:
        cmpq $1, N
        jge .L31
#endif
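
/* Copy-back: add the accumulated buffer into y.  The movaps path below
   requires 16-byte alignment of Y; otherwise control branches to .L990. */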
.L980:
        testq $SIZE, Y
        jne .L990

        movq Y, Y1
        movq M, %rax
        sarq $3, %rax
        jle .L184
        ALIGN_3

.L182:
        movaps (Y), %xmm0
        addq INCY, Y
        movaps (Y), %xmm1
        addq INCY, Y
        movaps (Y), %xmm2
        addq INCY, Y
        movaps (Y), %xmm3
        addq INCY, Y
        movaps (Y), %xmm4
        addq INCY, Y
        movaps (Y), %xmm5
        addq INCY, Y
        movaps (Y), %xmm6
        addq INCY, Y
        movaps (Y), %xmm7
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1
        addpd 4 * SIZE(BUFFER), %xmm2
        addpd 6 * SIZE(BUFFER), %xmm3
        addpd 8 * SIZE(BUFFER), %xmm4
        addpd 10 * SIZE(BUFFER), %xmm5
        addpd 12 * SIZE(BUFFER), %xmm6
        addpd 14 * SIZE(BUFFER), %xmm7

        movaps %xmm0, (Y1)
        addq INCY, Y1
        movaps %xmm1, (Y1)
        addq INCY, Y1
        movaps %xmm2, (Y1)
        addq INCY, Y1
        movaps %xmm3, (Y1)
        addq INCY, Y1
        movaps %xmm4, (Y1)
        addq INCY, Y1
        movaps %xmm5, (Y1)
        addq INCY, Y1
        movaps %xmm6, (Y1)
        addq INCY, Y1
        movaps %xmm7, (Y1)
        addq INCY, Y1

        subq $-16 * SIZE, BUFFER
        decq %rax
        jg .L182
        ALIGN_3

.L184:
        testq $7, M
        jle .L999
        testq $4, M
        jle .L185

        movaps (Y), %xmm0
        addq INCY, Y
        movaps (Y), %xmm1
        addq INCY, Y
        movaps (Y), %xmm2
        addq INCY, Y
        movaps (Y), %xmm3
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1
        addpd 4 * SIZE(BUFFER), %xmm2
        addpd 6 * SIZE(BUFFER), %xmm3

        movaps %xmm0, (Y1)
        addq INCY, Y1
        movaps %xmm1, (Y1)
        addq INCY, Y1
        movaps %xmm2, (Y1)
        addq INCY, Y1
        movaps %xmm3, (Y1)
        addq INCY, Y1

        addq $8 * SIZE, BUFFER
        ALIGN_3

.L185:
        testq $2, M
        jle .L186

        movaps (Y), %xmm0
        addq INCY, Y
        movaps (Y), %xmm1
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1

        movaps %xmm0, (Y1)
        addq INCY, Y1
        movaps %xmm1, (Y1)
        addq INCY, Y1

        addq $4 * SIZE, BUFFER
        ALIGN_3

.L186:
        testq $1, M
        jle .L999

        movaps (Y), %xmm0
        addpd (BUFFER), %xmm0
        movaps %xmm0, (Y1)
        jmp .L999
        ALIGN_3
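
/* Unaligned Y: same copy-back using movsd/movhpd halves. */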
.L990:
        movq Y, Y1
        movq M, %rax
        sarq $3, %rax
        jle .L994
        ALIGN_3

.L992:
        movsd 0 * SIZE(Y), %xmm0
        movhpd 1 * SIZE(Y), %xmm0
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm1
        movhpd 1 * SIZE(Y), %xmm1
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm2
        movhpd 1 * SIZE(Y), %xmm2
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm3
        movhpd 1 * SIZE(Y), %xmm3
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm4
        movhpd 1 * SIZE(Y), %xmm4
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm5
        movhpd 1 * SIZE(Y), %xmm5
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm6
        movhpd 1 * SIZE(Y), %xmm6
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm7
        movhpd 1 * SIZE(Y), %xmm7
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1
        addpd 4 * SIZE(BUFFER), %xmm2
        addpd 6 * SIZE(BUFFER), %xmm3
        addpd 8 * SIZE(BUFFER), %xmm4
        addpd 10 * SIZE(BUFFER), %xmm5
        addpd 12 * SIZE(BUFFER), %xmm6
        addpd 14 * SIZE(BUFFER), %xmm7

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm1, 0 * SIZE(Y1)
        movhpd %xmm1, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm2, 0 * SIZE(Y1)
        movhpd %xmm2, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm3, 0 * SIZE(Y1)
        movhpd %xmm3, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm4, 0 * SIZE(Y1)
        movhpd %xmm4, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm5, 0 * SIZE(Y1)
        movhpd %xmm5, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm6, 0 * SIZE(Y1)
        movhpd %xmm6, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm7, 0 * SIZE(Y1)
        movhpd %xmm7, 1 * SIZE(Y1)
        addq INCY, Y1

        subq $-16 * SIZE, BUFFER
        decq %rax
        jg .L992
        ALIGN_3

.L994:
        testq $7, M
        jle .L999
        testq $4, M
        jle .L995

        movsd 0 * SIZE(Y), %xmm0
        movhpd 1 * SIZE(Y), %xmm0
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm1
        movhpd 1 * SIZE(Y), %xmm1
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm2
        movhpd 1 * SIZE(Y), %xmm2
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm3
        movhpd 1 * SIZE(Y), %xmm3
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1
        addpd 4 * SIZE(BUFFER), %xmm2
        addpd 6 * SIZE(BUFFER), %xmm3

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm1, 0 * SIZE(Y1)
        movhpd %xmm1, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm2, 0 * SIZE(Y1)
        movhpd %xmm2, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm3, 0 * SIZE(Y1)
        movhpd %xmm3, 1 * SIZE(Y1)
        addq INCY, Y1

        addq $8 * SIZE, BUFFER
        ALIGN_3

.L995:
        testq $2, M
        jle .L996

        movsd 0 * SIZE(Y), %xmm0
        movhpd 1 * SIZE(Y), %xmm0
        addq INCY, Y
        movsd 0 * SIZE(Y), %xmm1
        movhpd 1 * SIZE(Y), %xmm1
        addq INCY, Y

        addpd 0 * SIZE(BUFFER), %xmm0
        addpd 2 * SIZE(BUFFER), %xmm1

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)
        addq INCY, Y1
        movlpd %xmm1, 0 * SIZE(Y1)
        movhpd %xmm1, 1 * SIZE(Y1)
        addq INCY, Y1

        addq $4 * SIZE, BUFFER
        ALIGN_3

.L996:
        testq $1, M
        jle .L999

        movsd 0 * SIZE(Y), %xmm0
        movhpd 1 * SIZE(Y), %xmm0

        addpd 0 * SIZE(BUFFER), %xmm0

        movlpd %xmm0, 0 * SIZE(Y1)
        movhpd %xmm0, 1 * SIZE(Y1)
        ALIGN_3
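
/* Restore callee-saved registers (plus rdi/rsi and xmm6-xmm15 on Windows)
   and return. */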
.L999:
        movq 0(%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif

        addq $STACKSIZE, %rsp
        ret
        EPILOGUE