
symv_L_sse2.S 20 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
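
/* symv_L_sse2.S: double-precision symmetric matrix-vector multiply   */
/* (SYMV) for a matrix stored in its lower triangle, using SSE2.      */
/* Computes y := alpha*A*x + y; x is pre-scaled by alpha into BUFFER  */
/* so the main loops can work on contiguous, aligned data.            */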

#define ASSEMBLER
#include "common.h"
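
/* Per-microarchitecture prefetch tuning: each CPU family selects its */
/* prefetch instruction and lookahead distance (in elements).         */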
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlpd
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif
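
/* Argument locations differ between the System V and Windows x64     */
/* ABIs, so the stack offsets and argument registers are mapped here. */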
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif
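
/* Register aliases.  Note that ALPHA and xtemp1 share %xmm0: alpha is */
/* consumed while packing x, after which %xmm0 is reused as a          */
/* temporary.  NEW_X aliases BUFFER, and NEW_Y aliases X once the      */
/* original x is no longer needed.                                     */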
#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define yy1	%xmm2
#define yy2	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9
#define xsum3	%xmm10
#define xsum4	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define xt1	%xmm15
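
/* Prologue: save callee-saved registers; the Windows ABI additionally */
/* requires xmm6-xmm15 to be preserved.                                */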
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif
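
/* Load the remaining stack arguments and convert INCX, INCY, and LDA */
/* from element counts to byte strides.                               */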
	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	unpcklpd ALPHA, ALPHA

	movq	BUFFER, XX

	movq	M, %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3
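
/* Pack x into BUFFER scaled by alpha: eight elements per iteration in */
/* .L01, then a scalar tail in .L03 for M mod 8.                       */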
.L01:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm4
	addq	INCX, X

	mulpd	ALPHA, %xmm1
	mulpd	ALPHA, %xmm2
	mulpd	ALPHA, %xmm3
	mulpd	ALPHA, %xmm4

	movapd	%xmm1, 0 * SIZE(XX)
	movapd	%xmm2, 2 * SIZE(XX)
	movapd	%xmm3, 4 * SIZE(XX)
	movapd	%xmm4, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulsd	ALPHA, %xmm1
	movlpd	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3
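
/* Round the buffer pointer up to a 512-byte boundary; if y is strided */
/* (INCY != SIZE), pack it there too so the kernel sees unit stride.   */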
.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512, XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3
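
/* Main loop over 4-column blocks.  IS is the current diagonal index;  */
/* each stored element a(i,j) of the lower triangle is used twice:     */
/* once to update y(i) (the column sweep) and once, via the xsum       */
/* accumulators, to form the dot products that stand in for the        */
/* implicit upper triangle.  The 4x4 diagonal tile is handled first.   */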
.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	movq	A,  A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movapd	0 * SIZE(XX), atemp2
	movapd	2 * SIZE(XX), atemp4

	movsd	0 * SIZE(A1), xsum1
	movhpd	1 * SIZE(A1), xsum1
	mulpd	atemp2, xsum1

	movsd	1 * SIZE(A1), xsum2
	movhpd	1 * SIZE(A1, LDA, 1), xsum2
	mulpd	atemp2, xsum2

	movsd	2 * SIZE(A1), xsum3
	movhpd	2 * SIZE(A1, LDA, 1), xsum3
	mulpd	atemp2, xsum3

	movsd	3 * SIZE(A1), xsum4
	movhpd	3 * SIZE(A1, LDA, 1), xsum4
	mulpd	atemp2, xsum4

	movsd	2 * SIZE(A1), a1
	movhpd	3 * SIZE(A1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum1

	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum2

	movsd	2 * SIZE(A2), a1
	movhpd	3 * SIZE(A2), a1
	mulpd	atemp4, a1
	addpd	a1, xsum3

	movsd	3 * SIZE(A2), a1
	movhpd	3 * SIZE(A2, LDA, 1), a1
	mulpd	atemp4, a1
	addpd	a1, xsum4

	movapd	4 * SIZE(XX), xtemp1
	movapd	6 * SIZE(XX), xtemp2

	movsd	4 * SIZE(A1), a1
	movhpd	5 * SIZE(A1), a1
	movsd	6 * SIZE(A1), a2
	movhpd	7 * SIZE(A1), a2
	movsd	4 * SIZE(A1, LDA, 1), a3
	movhpd	5 * SIZE(A1, LDA, 1), a3

	movsd	0 * SIZE(YY), yy1
	movhpd	1 * SIZE(YY), yy1
	movsd	2 * SIZE(YY), yy2
	movhpd	3 * SIZE(YY), yy2

#ifndef HAVE_SSE3
	movapd	atemp2, atemp1
	unpcklpd atemp1, atemp1
	unpckhpd atemp2, atemp2

	movapd	atemp4, atemp3
	unpcklpd atemp3, atemp3
	unpckhpd atemp4, atemp4
#else
	movddup	atemp2, atemp1
	unpckhpd atemp2, atemp2
	movddup	atemp4, atemp3
	unpckhpd atemp4, atemp4
#endif

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M, I
	subq	IS, I
	subq	$4, I
	sarq	$3, I
	jle	.L15
	ALIGN_3
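
/* Inner loop, unrolled to 8 rows per iteration, with software        */
/* prefetch on all four column streams plus the packed x and y.       */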
.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1

	PREFETCH PREFETCHSIZE(A1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	movsd	0 * SIZE(A2), a2
	movhpd	1 * SIZE(A2), a2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	movsd	2 * SIZE(A2), a3
	movhpd	3 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH PREFETCHSIZE(XX)
#endif

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy1
	movsd	2 * SIZE(A2, LDA, 1), a2
	movhpd	3 * SIZE(A2, LDA, 1), a2

	PREFETCH PREFETCHSIZE(A1, LDA, 1)

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum3
	addpd	a3, yy2
	movsd	4 * SIZE(A1), a3
	movhpd	5 * SIZE(A1), a3

	movapd	xtemp1, xt1
	movapd	4 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	6 * SIZE(A1), a1
	movhpd	7 * SIZE(A1), a1

	movapd	xtemp2, xt1
	movapd	6 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum4
	addpd	a2, yy2
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhpd	5 * SIZE(A1, LDA, 1), a2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp1, a3
	addpd	xt1, xsum1
	addpd	a3, yy1
	movsd	6 * SIZE(A1, LDA, 1), a3
	movhpd	7 * SIZE(A1, LDA, 1), a3

	PREFETCH PREFETCHSIZE(A2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	movsd	4 * SIZE(A2), a1
	movhpd	5 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	movsd	6 * SIZE(A2), a2
	movhpd	7 * SIZE(A2), a2

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW PREFETCHSIZE(YY)
#endif

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy2
	movsd	4 * SIZE(A2, LDA, 1), a3
	movhpd	5 * SIZE(A2, LDA, 1), a3

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum3
	addpd	a1, yy1
	movsd	6 * SIZE(A2, LDA, 1), a1
	movhpd	7 * SIZE(A2, LDA, 1), a1

	PREFETCH PREFETCHSIZE(A2, LDA, 1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy2
	movsd	10 * SIZE(A1), a2
	movhpd	11 * SIZE(A1), a2

	movapd	xtemp1, xt1
	movapd	8 * SIZE(XX), xtemp1
	mulpd	a3, xt1
	mulpd	atemp4, a3
	addpd	xt1, xsum4
	addpd	a3, yy1
	movsd	8 * SIZE(A1, LDA, 1), a3
	movhpd	9 * SIZE(A1, LDA, 1), a3

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy2
	movsd	8 * SIZE(A1), a1
	movhpd	9 * SIZE(A1), a1

	movsd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	movsd	yy2, 6 * SIZE(YY)
	movhpd	yy2, 7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3
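
/* Row remainder handling: 4 rows here, 2 rows at .L17, and a final   */
/* odd row at .L18.                                                   */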
.L15:
	movq	M, I
	subq	IS, I
	subq	$4, I
	test	$4, I
	jle	.L17

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	2 * SIZE(A1, LDA, 1), a1
	movhpd	3 * SIZE(A1, LDA, 1), a1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp1, a2
	addpd	xt1, xsum1
	addpd	a2, yy2
	movsd	0 * SIZE(A2), a2
	movhpd	1 * SIZE(A2), a2

	movapd	xtemp1, xt1
	mulpd	a3, xt1
	mulpd	atemp2, a3
	addpd	xt1, xsum2
	addpd	a3, yy1
	movsd	2 * SIZE(A2), a3
	movhpd	3 * SIZE(A2), a3

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy2
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum3
	addpd	a2, yy1
	movsd	2 * SIZE(A2, LDA, 1), a2
	movhpd	3 * SIZE(A2, LDA, 1), a2

	movapd	xtemp2, xt1
	mulpd	a3, xt1
	mulpd	atemp3, a3
	addpd	xt1, xsum3
	addpd	a3, yy2
	movsd	4 * SIZE(A1, LDA, 1), a3
	movhpd	5 * SIZE(A1, LDA, 1), a3

	movapd	xtemp1, xt1
	movapd	4 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhpd	5 * SIZE(A1), a1

	movapd	xtemp2, xt1
	movapd	6 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum4
	addpd	a2, yy2
	movsd	6 * SIZE(A1), a2
	movhpd	7 * SIZE(A1), a2

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movsd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3
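
/* Two remaining rows. */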
.L17:
	testq	$2, M
	jle	.L18

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	movsd	0 * SIZE(A1, LDA, 1), a1
	movhpd	1 * SIZE(A1, LDA, 1), a1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum2
	addpd	a1, yy1
	movsd	0 * SIZE(A2), a1
	movhpd	1 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum3
	addpd	a1, yy1
	movsd	0 * SIZE(A2, LDA, 1), a1
	movhpd	1 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	movapd	2 * SIZE(XX), xtemp1
	mulpd	a1, xt1
	mulpd	atemp4, a1
	addpd	xt1, xsum4
	addpd	a1, yy1
	movsd	2 * SIZE(A1), a1

	movsd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3
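
/* Final odd row, processed with scalar (sd) arithmetic. */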
.L18:
	testq	$1, M
	jle	.L19

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp1, a1
	addsd	xt1, xsum1
	addpd	a1, yy1
	movsd	0 * SIZE(A1, LDA, 1), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp2, a1
	addsd	xt1, xsum2
	addsd	a1, yy1
	movsd	0 * SIZE(A2), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp3, a1
	addsd	xt1, xsum3
	addsd	a1, yy1
	movsd	0 * SIZE(A2, LDA, 1), a1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp4, a1
	addsd	xt1, xsum4
	addsd	a1, yy1

	movsd	yy1, 0 * SIZE(YY)
	ALIGN_3
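
/* Combine the four partial dot products horizontally and add them to */
/* the 4 elements of y belonging to the current diagonal block.       */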
.L19:
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	movapd	xsum3, atemp3

	unpcklpd xsum2, xsum1
	unpcklpd xsum4, xsum3

	unpckhpd xsum2, atemp1
	unpckhpd xsum4, atemp3

	addpd	atemp1, xsum1
	addpd	atemp3, xsum3
#else
	haddpd	xsum2, xsum1
	haddpd	xsum4, xsum3
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	1 * SIZE(NEW_Y, IS, SIZE), yy1
	movsd	2 * SIZE(NEW_Y, IS, SIZE), yy2
	movhpd	3 * SIZE(NEW_Y, IS, SIZE), yy2

	addpd	xsum1, yy1
	addpd	xsum3, yy2

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy1, 1 * SIZE(NEW_Y, IS, SIZE)
	movsd	yy2, 2 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy2, 3 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	N, I
	jle	.L11
	ALIGN_3
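
/* Column remainder: handle 2 columns when N mod 4 >= 2. */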
.L20:
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movapd	0 * SIZE(NEW_X, IS, SIZE), atemp2

	movsd	0 * SIZE(A1), xsum1
	movhpd	1 * SIZE(A1), xsum1
	mulpd	atemp2, xsum1

	movsd	1 * SIZE(A1), xsum2
	movhpd	1 * SIZE(A1, LDA, 1), xsum2
	mulpd	atemp2, xsum2

#ifndef HAVE_SSE3
	movapd	atemp2, atemp1
	unpcklpd atemp1, atemp1
#else
	movddup	atemp2, atemp1
#endif
	unpckhpd atemp2, atemp2

	testq	$1, M
	jle	.L29

	movsd	2 * SIZE(A1), a1
	movsd	2 * SIZE(A1, LDA, 1), a2

	movsd	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movsd	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movapd	xtemp1, xt1
	mulsd	a1, xt1
	mulsd	atemp1, a1
	addsd	xt1, xsum1
	addpd	a1, yy1

	movapd	xtemp1, xt1
	mulsd	a2, xt1
	mulsd	atemp2, a2
	addsd	xt1, xsum2
	addsd	a2, yy1

	movsd	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	movapd	xsum1, atemp1
	unpcklpd xsum2, xsum1
	unpckhpd xsum2, atemp1
	addpd	atemp1, xsum1
#else
	haddpd	xsum2, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhpd	1 * SIZE(NEW_Y, IS, SIZE), yy1

	addpd	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhpd	yy1, 1 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3
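
/* Last single column: only the diagonal element contributes. */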
.L30:
	testq	$1, N
	jle	.L990

	movsd	0 * SIZE(A), xsum1
	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1
	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	mulsd	atemp1, xsum1
	addsd	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3
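
/* If y was strided, copy the packed result back out with stride INCY. */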
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movsd	0 * SIZE(NEW_Y), %xmm0
	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	addq	$1 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3
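
/* Epilogue: restore callee-saved state and return. */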
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE