You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

symv_U_sse2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifdef ATOM
  41. #define PREFETCH prefetcht0
  42. #define PREFETCHW prefetcht0
  43. #define PREFETCHSIZE (16 * 12)
  44. #endif
  45. #ifdef CORE2
  46. #define PREFETCH prefetcht0
  47. #define PREFETCHW prefetcht0
  48. #define PREFETCHSIZE (16 * 12)
  49. #endif
  50. #if defined(PENRYN) || defined(DUNNINGTON)
  51. #define PREFETCH prefetcht0
  52. #define PREFETCHW prefetcht0
  53. #define PREFETCHSIZE (16 * 12)
  54. #endif
  55. #if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
  56. #define PREFETCH prefetcht0
  57. #define PREFETCHW prefetcht0
  58. #define PREFETCHSIZE (16 * 24)
  59. #endif
  60. #ifdef PENTIUM4
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #define PREFETCHSIZE (16 * 20)
  64. #endif
  65. #ifdef OPTERON
  66. #define PREFETCH prefetch
  67. #define PREFETCHW prefetchw
  68. #define PREFETCHSIZE (16 * 8)
  69. #define movsd movlpd
  70. #endif
  71. #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  72. #define PREFETCH prefetch
  73. #define PREFETCHW prefetchw
  74. #define PREFETCHSIZE (16 * 16)
  75. #endif
  76. #ifdef NANO
  77. #define PREFETCH prefetcht0
  78. #define PREFETCHW prefetcht0
  79. #define PREFETCHSIZE (8 * 24)
  80. #endif
  81. #ifdef GENERIC
  82. #define PREFETCH prefetcht0
  83. #define PREFETCHW prefetcht0
  84. #define PREFETCHSIZE (16 * 20)
  85. #endif
  86. #ifndef WINDOWS_ABI
  87. #define STACKSIZE 80
  88. #define OLD_Y 8 + STACKSIZE(%rsp)
  89. #define OLD_INCY 16 + STACKSIZE(%rsp)
  90. #define OLD_BUFFER 24 + STACKSIZE(%rsp)
  91. #define M ARG1
  92. #define IS ARG2
  93. #define A ARG3
  94. #define LDA ARG4
  95. #define X ARG5
  96. #define INCX ARG6
  97. #else
  98. #define STACKSIZE 256
  99. #define OLD_LDA 40 + STACKSIZE(%rsp)
  100. #define OLD_X 48 + STACKSIZE(%rsp)
  101. #define OLD_INCX 56 + STACKSIZE(%rsp)
  102. #define OLD_Y 64 + STACKSIZE(%rsp)
  103. #define OLD_INCY 72 + STACKSIZE(%rsp)
  104. #define OLD_BUFFER 80 + STACKSIZE(%rsp)
  105. #define M ARG1
  106. #define IS ARG2
  107. #define A ARG4
  108. #define LDA ARG3
  109. #define X %rdi
  110. #define INCX %rsi
  111. #endif
  112. #define Y %r10
  113. #define INCY %r11
  114. #define BUFFER %r12
  115. #define TEMP %rax
  116. #define I %rax
  117. #define A1 %rbx
  118. #define A2 %rbp
  119. #define XX %r13
  120. #define YY %r14
  121. #define NEW_X BUFFER
  122. #define NEW_Y X
  123. #define ALPHA %xmm0
  124. #define xtemp1 %xmm0
  125. #define xtemp2 %xmm1
  126. #define yy1 %xmm2
  127. #define yy2 %xmm3
  128. #define atemp1 %xmm4
  129. #define atemp2 %xmm5
  130. #define atemp3 %xmm6
  131. #define atemp4 %xmm7
  132. #define xsum1 %xmm8
  133. #define xsum2 %xmm9
  134. #define xsum3 %xmm10
  135. #define xsum4 %xmm11
  136. #define a1 %xmm12
  137. #define a2 %xmm13
  138. #define a3 %xmm14
  139. #define xt1 %xmm15
  140. PROLOGUE
  141. PROFCODE
  142. subq $STACKSIZE, %rsp
  143. movq %rbx, 0(%rsp)
  144. movq %rbp, 8(%rsp)
  145. movq %r12, 16(%rsp)
  146. movq %r13, 24(%rsp)
  147. movq %r14, 32(%rsp)
  148. movq %r15, 40(%rsp)
  149. #ifdef WINDOWS_ABI
  150. movq %rdi, 48(%rsp)
  151. movq %rsi, 56(%rsp)
  152. movups %xmm6, 64(%rsp)
  153. movups %xmm7, 80(%rsp)
  154. movups %xmm8, 96(%rsp)
  155. movups %xmm9, 112(%rsp)
  156. movups %xmm10, 128(%rsp)
  157. movups %xmm11, 144(%rsp)
  158. movups %xmm12, 160(%rsp)
  159. movups %xmm13, 176(%rsp)
  160. movups %xmm14, 192(%rsp)
  161. movups %xmm15, 208(%rsp)
  162. movq OLD_LDA, LDA
  163. movq OLD_X, X
  164. movq OLD_INCX, INCX
  165. movaps %xmm2, %xmm0
  166. #endif
  167. movq OLD_Y, Y
  168. movq OLD_INCY, INCY
  169. movq OLD_BUFFER, BUFFER
  170. leaq (,INCX, SIZE), INCX
  171. leaq (,INCY, SIZE), INCY
  172. leaq (,LDA, SIZE), LDA
  173. testq M, M
  174. jle .L999
  175. negq IS
  176. addq M, IS
  177. movq IS, TEMP
  178. imulq LDA, TEMP
  179. addq TEMP, A
  180. unpcklpd ALPHA, ALPHA
  181. movq BUFFER, XX
  182. movq M, %rax
  183. sarq $3, %rax
  184. jle .L02
  185. ALIGN_3
  186. .L01:
  187. movsd 0 * SIZE(X), %xmm1
  188. addq INCX, X
  189. movhpd 0 * SIZE(X), %xmm1
  190. addq INCX, X
  191. movsd 0 * SIZE(X), %xmm2
  192. addq INCX, X
  193. movhpd 0 * SIZE(X), %xmm2
  194. addq INCX, X
  195. movsd 0 * SIZE(X), %xmm3
  196. addq INCX, X
  197. movhpd 0 * SIZE(X), %xmm3
  198. addq INCX, X
  199. movsd 0 * SIZE(X), %xmm4
  200. addq INCX, X
  201. movhpd 0 * SIZE(X), %xmm4
  202. addq INCX, X
  203. mulpd ALPHA, %xmm1
  204. mulpd ALPHA, %xmm2
  205. mulpd ALPHA, %xmm3
  206. mulpd ALPHA, %xmm4
  207. movapd %xmm1, 0 * SIZE(XX)
  208. movapd %xmm2, 2 * SIZE(XX)
  209. movapd %xmm3, 4 * SIZE(XX)
  210. movapd %xmm4, 6 * SIZE(XX)
  211. addq $8 * SIZE, XX
  212. decq %rax
  213. jg .L01
  214. ALIGN_3
  215. .L02:
  216. movq M, %rax
  217. andq $7, %rax
  218. jle .L05
  219. ALIGN_3
  220. .L03:
  221. movsd 0 * SIZE(X), %xmm1
  222. addq INCX, X
  223. mulsd ALPHA, %xmm1
  224. movlpd %xmm1, 0 * SIZE(XX)
  225. addq $1 * SIZE, XX
  226. decq %rax
  227. jg .L03
  228. ALIGN_3
  229. .L05:
  230. /* now we don't need original X */
  231. movq Y, NEW_Y
  232. addq $512, XX
  233. andq $-512, XX
  234. cmpq $SIZE, INCY
  235. je .L10
  236. movq Y, YY
  237. movq XX, NEW_Y
  238. movq M, %rax
  239. sarq $3, %rax
  240. jle .L07
  241. ALIGN_3
  242. .L06:
  243. movsd 0 * SIZE(YY), %xmm0
  244. addq INCY, YY
  245. movhpd 0 * SIZE(YY), %xmm0
  246. addq INCY, YY
  247. movsd 0 * SIZE(YY), %xmm1
  248. addq INCY, YY
  249. movhpd 0 * SIZE(YY), %xmm1
  250. addq INCY, YY
  251. movsd 0 * SIZE(YY), %xmm2
  252. addq INCY, YY
  253. movhpd 0 * SIZE(YY), %xmm2
  254. addq INCY, YY
  255. movsd 0 * SIZE(YY), %xmm3
  256. addq INCY, YY
  257. movhpd 0 * SIZE(YY), %xmm3
  258. addq INCY, YY
  259. movapd %xmm0, 0 * SIZE(XX)
  260. movapd %xmm1, 2 * SIZE(XX)
  261. movapd %xmm2, 4 * SIZE(XX)
  262. movapd %xmm3, 6 * SIZE(XX)
  263. addq $8 * SIZE, XX
  264. decq %rax
  265. jg .L06
  266. ALIGN_3
  267. .L07:
  268. movq M, %rax
  269. andq $7, %rax
  270. jle .L10
  271. ALIGN_3
  272. .L08:
  273. movsd 0 * SIZE(YY), %xmm0
  274. addq INCY, YY
  275. movsd %xmm0, 0 * SIZE(XX)
  276. addq $1 * SIZE, XX
  277. decq %rax
  278. jg .L08
  279. ALIGN_3
  280. .L10:
  281. movq IS, I
  282. addq $4, I
  283. cmpq M, I
  284. jg .L20
  285. ALIGN_3
  286. .L11:
  287. movq A, A1
  288. leaq (A, LDA, 2), A2
  289. leaq (A, LDA, 4), A
  290. #ifdef HAVE_SSE3
  291. movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
  292. movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2
  293. movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3
  294. movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4
  295. #else
  296. movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  297. movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  298. movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2
  299. movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2
  300. movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3
  301. movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3
  302. movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4
  303. movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4
  304. #endif
  305. pxor xsum1, xsum1
  306. pxor xsum2, xsum2
  307. pxor xsum3, xsum3
  308. pxor xsum4, xsum4
  309. movapd 0 * SIZE(NEW_X), xtemp1
  310. movapd 2 * SIZE(NEW_X), xtemp2
  311. movsd 0 * SIZE(A1), a1
  312. movhpd 1 * SIZE(A1), a1
  313. movsd 2 * SIZE(A1), a2
  314. movhpd 3 * SIZE(A1), a2
  315. movsd 0 * SIZE(A1, LDA, 1), a3
  316. movhpd 1 * SIZE(A1, LDA, 1), a3
  317. movsd 0 * SIZE(NEW_Y), yy1
  318. movhpd 1 * SIZE(NEW_Y), yy1
  319. movsd 2 * SIZE(NEW_Y), yy2
  320. movhpd 3 * SIZE(NEW_Y), yy2
  321. movq NEW_X, XX
  322. movq NEW_Y, YY
  323. movq IS, I
  324. sarq $3, I
  325. jle .L15
  326. ALIGN_3
  327. .L12:
  328. movapd xtemp1, xt1
  329. mulpd a1, xt1
  330. mulpd atemp1, a1
  331. addpd xt1, xsum1
  332. addpd a1, yy1
  333. movsd 2 * SIZE(A1, LDA, 1), a1
  334. movhpd 3 * SIZE(A1, LDA, 1), a1
  335. PREFETCH PREFETCHSIZE(A1)
  336. movapd xtemp2, xt1
  337. mulpd a2, xt1
  338. mulpd atemp1, a2
  339. addpd xt1, xsum1
  340. addpd a2, yy2
  341. movsd 0 * SIZE(A2), a2
  342. movhpd 1 * SIZE(A2), a2
  343. movapd xtemp1, xt1
  344. mulpd a3, xt1
  345. mulpd atemp2, a3
  346. addpd xt1, xsum2
  347. addpd a3, yy1
  348. movsd 2 * SIZE(A2), a3
  349. movhpd 3 * SIZE(A2), a3
  350. #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
  351. PREFETCH PREFETCHSIZE(XX)
  352. #endif
  353. movapd xtemp2, xt1
  354. mulpd a1, xt1
  355. mulpd atemp2, a1
  356. addpd xt1, xsum2
  357. addpd a1, yy2
  358. movsd 0 * SIZE(A2, LDA, 1), a1
  359. movhpd 1 * SIZE(A2, LDA, 1), a1
  360. movapd xtemp1, xt1
  361. mulpd a2, xt1
  362. mulpd atemp3, a2
  363. addpd xt1, xsum3
  364. addpd a2, yy1
  365. movsd 2 * SIZE(A2, LDA, 1), a2
  366. movhpd 3 * SIZE(A2, LDA, 1), a2
  367. PREFETCH PREFETCHSIZE(A1, LDA, 1)
  368. movapd xtemp2, xt1
  369. mulpd a3, xt1
  370. mulpd atemp3, a3
  371. addpd xt1, xsum3
  372. addpd a3, yy2
  373. movsd 4 * SIZE(A1), a3
  374. movhpd 5 * SIZE(A1), a3
  375. movapd xtemp1, xt1
  376. movapd 4 * SIZE(XX), xtemp1
  377. mulpd a1, xt1
  378. mulpd atemp4, a1
  379. addpd xt1, xsum4
  380. addpd a1, yy1
  381. movsd 6 * SIZE(A1), a1
  382. movhpd 7 * SIZE(A1), a1
  383. movapd xtemp2, xt1
  384. movapd 6 * SIZE(XX), xtemp2
  385. mulpd a2, xt1
  386. mulpd atemp4, a2
  387. addpd xt1, xsum4
  388. addpd a2, yy2
  389. movsd 4 * SIZE(A1, LDA, 1), a2
  390. movhpd 5 * SIZE(A1, LDA, 1), a2
  391. movsd yy1, 0 * SIZE(YY)
  392. movhpd yy1, 1 * SIZE(YY)
  393. movsd 4 * SIZE(YY), yy1
  394. movhpd 5 * SIZE(YY), yy1
  395. movsd yy2, 2 * SIZE(YY)
  396. movhpd yy2, 3 * SIZE(YY)
  397. movsd 6 * SIZE(YY), yy2
  398. movhpd 7 * SIZE(YY), yy2
  399. movapd xtemp1, xt1
  400. mulpd a3, xt1
  401. mulpd atemp1, a3
  402. addpd xt1, xsum1
  403. addpd a3, yy1
  404. movsd 6 * SIZE(A1, LDA, 1), a3
  405. movhpd 7 * SIZE(A1, LDA, 1), a3
  406. PREFETCH PREFETCHSIZE(A2)
  407. movapd xtemp2, xt1
  408. mulpd a1, xt1
  409. mulpd atemp1, a1
  410. addpd xt1, xsum1
  411. addpd a1, yy2
  412. movsd 4 * SIZE(A2), a1
  413. movhpd 5 * SIZE(A2), a1
  414. movapd xtemp1, xt1
  415. mulpd a2, xt1
  416. mulpd atemp2, a2
  417. addpd xt1, xsum2
  418. addpd a2, yy1
  419. movsd 6 * SIZE(A2), a2
  420. movhpd 7 * SIZE(A2), a2
  421. #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
  422. PREFETCHW PREFETCHSIZE(YY)
  423. #endif
  424. movapd xtemp2, xt1
  425. mulpd a3, xt1
  426. mulpd atemp2, a3
  427. addpd xt1, xsum2
  428. addpd a3, yy2
  429. movsd 4 * SIZE(A2, LDA, 1), a3
  430. movhpd 5 * SIZE(A2, LDA, 1), a3
  431. movapd xtemp1, xt1
  432. mulpd a1, xt1
  433. mulpd atemp3, a1
  434. addpd xt1, xsum3
  435. addpd a1, yy1
  436. movsd 6 * SIZE(A2, LDA, 1), a1
  437. movhpd 7 * SIZE(A2, LDA, 1), a1
  438. PREFETCH PREFETCHSIZE(A2, LDA, 1)
  439. movapd xtemp2, xt1
  440. mulpd a2, xt1
  441. mulpd atemp3, a2
  442. addpd xt1, xsum3
  443. addpd a2, yy2
  444. movsd 10 * SIZE(A1), a2
  445. movhpd 11 * SIZE(A1), a2
  446. movapd xtemp1, xt1
  447. movapd 8 * SIZE(XX), xtemp1
  448. mulpd a3, xt1
  449. mulpd atemp4, a3
  450. addpd xt1, xsum4
  451. addpd a3, yy1
  452. movsd 8 * SIZE(A1, LDA, 1), a3
  453. movhpd 9 * SIZE(A1, LDA, 1), a3
  454. movapd xtemp2, xt1
  455. movapd 10 * SIZE(XX), xtemp2
  456. mulpd a1, xt1
  457. mulpd atemp4, a1
  458. addpd xt1, xsum4
  459. addpd a1, yy2
  460. movsd 8 * SIZE(A1), a1
  461. movhpd 9 * SIZE(A1), a1
  462. movsd yy1, 4 * SIZE(YY)
  463. movhpd yy1, 5 * SIZE(YY)
  464. movsd 8 * SIZE(YY), yy1
  465. movhpd 9 * SIZE(YY), yy1
  466. movsd yy2, 6 * SIZE(YY)
  467. movhpd yy2, 7 * SIZE(YY)
  468. movsd 10 * SIZE(YY), yy2
  469. movhpd 11 * SIZE(YY), yy2
  470. addq $8 * SIZE, XX
  471. addq $8 * SIZE, YY
  472. addq $8 * SIZE, A1
  473. addq $8 * SIZE, A2
  474. decq I
  475. jg .L12
  476. ALIGN_3
  477. .L15:
  478. testq $4, IS
  479. jle .L18
  480. movapd xtemp1, xt1
  481. mulpd a1, xt1
  482. mulpd atemp1, a1
  483. addpd xt1, xsum1
  484. addpd a1, yy1
  485. movsd 2 * SIZE(A1, LDA, 1), a1
  486. movhpd 3 * SIZE(A1, LDA, 1), a1
  487. movapd xtemp2, xt1
  488. mulpd a2, xt1
  489. mulpd atemp1, a2
  490. addpd xt1, xsum1
  491. addpd a2, yy2
  492. movsd 0 * SIZE(A2), a2
  493. movhpd 1 * SIZE(A2), a2
  494. movapd xtemp1, xt1
  495. mulpd a3, xt1
  496. mulpd atemp2, a3
  497. addpd xt1, xsum2
  498. addpd a3, yy1
  499. movsd 2 * SIZE(A2), a3
  500. movhpd 3 * SIZE(A2), a3
  501. movapd xtemp2, xt1
  502. mulpd a1, xt1
  503. mulpd atemp2, a1
  504. addpd xt1, xsum2
  505. addpd a1, yy2
  506. movsd 0 * SIZE(A2, LDA, 1), a1
  507. movhpd 1 * SIZE(A2, LDA, 1), a1
  508. movapd xtemp1, xt1
  509. mulpd a2, xt1
  510. mulpd atemp3, a2
  511. addpd xt1, xsum3
  512. addpd a2, yy1
  513. movsd 2 * SIZE(A2, LDA, 1), a2
  514. movhpd 3 * SIZE(A2, LDA, 1), a2
  515. movapd xtemp2, xt1
  516. mulpd a3, xt1
  517. mulpd atemp3, a3
  518. addpd xt1, xsum3
  519. addpd a3, yy2
  520. movapd xtemp1, xt1
  521. movapd 4 * SIZE(XX), xtemp1
  522. mulpd a1, xt1
  523. mulpd atemp4, a1
  524. addpd xt1, xsum4
  525. addpd a1, yy1
  526. movapd xtemp2, xt1
  527. movapd 6 * SIZE(XX), xtemp2
  528. mulpd a2, xt1
  529. mulpd atemp4, a2
  530. addpd xt1, xsum4
  531. addpd a2, yy2
  532. movsd yy1, 0 * SIZE(YY)
  533. movhpd yy1, 1 * SIZE(YY)
  534. movsd 4 * SIZE(YY), yy1
  535. movhpd 5 * SIZE(YY), yy1
  536. movsd yy2, 2 * SIZE(YY)
  537. movhpd yy2, 3 * SIZE(YY)
  538. movsd 6 * SIZE(YY), yy2
  539. movhpd 7 * SIZE(YY), yy2
  540. addq $4 * SIZE, XX
  541. addq $4 * SIZE, YY
  542. addq $4 * SIZE, A1
  543. addq $4 * SIZE, A2
  544. ALIGN_3
  545. .L18:
  546. unpckhpd atemp2, atemp1
  547. unpckhpd atemp4, atemp3
  548. movsd 0 * SIZE(A1), a1
  549. movhpd 0 * SIZE(A1, LDA, 1), a1
  550. mulpd atemp1, a1
  551. addpd a1, xsum1
  552. movsd 0 * SIZE(A1, LDA, 1), a1
  553. movhpd 1 * SIZE(A1, LDA, 1), a1
  554. mulpd atemp1, a1
  555. addpd a1, xsum2
  556. movsd 0 * SIZE(A2), a1
  557. movhpd 1 * SIZE(A2), a1
  558. mulpd atemp1, a1
  559. addpd a1, xsum3
  560. movsd 0 * SIZE(A2, LDA, 1), a1
  561. movhpd 1 * SIZE(A2, LDA, 1), a1
  562. mulpd atemp1, a1
  563. addpd a1, xsum4
  564. movsd 0 * SIZE(A2), a1
  565. movhpd 0 * SIZE(A2, LDA, 1), a1
  566. mulpd atemp3, a1
  567. addpd a1, xsum1
  568. movsd 1 * SIZE(A2), a1
  569. movhpd 1 * SIZE(A2, LDA, 1), a1
  570. mulpd atemp3, a1
  571. addpd a1, xsum2
  572. movsd 2 * SIZE(A2), a1
  573. movhpd 2 * SIZE(A2, LDA, 1), a1
  574. mulpd atemp3, a1
  575. addpd a1, xsum3
  576. movsd 2 * SIZE(A2, LDA, 1), a1
  577. movhpd 3 * SIZE(A2, LDA, 1), a1
  578. mulpd atemp3, a1
  579. addpd a1, xsum4
  580. #ifndef HAVE_SSE3
  581. movapd xsum1, atemp1
  582. movapd xsum3, atemp3
  583. unpcklpd xsum2, xsum1
  584. unpcklpd xsum4, xsum3
  585. unpckhpd xsum2, atemp1
  586. unpckhpd xsum4, atemp3
  587. addpd atemp1, xsum1
  588. addpd atemp3, xsum3
  589. #else
  590. haddpd xsum2, xsum1
  591. haddpd xsum4, xsum3
  592. #endif
  593. addpd xsum1, yy1
  594. addpd xsum3, yy2
  595. movsd yy1, 0 * SIZE(YY)
  596. movhpd yy1, 1 * SIZE(YY)
  597. movsd yy2, 2 * SIZE(YY)
  598. movhpd yy2, 3 * SIZE(YY)
  599. addq $4, IS
  600. movq IS, I
  601. addq $4, I
  602. cmpq M, I
  603. jle .L11
  604. ALIGN_3
  605. .L20:
  606. testq $2, M
  607. je .L30
  608. ALIGN_3
  609. .L21:
  610. movq A, A1
  611. leaq (A, LDA, 2), A
  612. #ifdef HAVE_SSE3
  613. movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
  614. movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2
  615. #else
  616. movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  617. movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  618. movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2
  619. movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2
  620. #endif
  621. pxor xsum1, xsum1
  622. pxor xsum2, xsum2
  623. movapd 0 * SIZE(NEW_X), xtemp1
  624. movsd 0 * SIZE(NEW_Y), yy1
  625. movhpd 1 * SIZE(NEW_Y), yy1
  626. movsd 0 * SIZE(A1), a1
  627. movhpd 1 * SIZE(A1), a1
  628. movsd 0 * SIZE(A1, LDA, 1), a2
  629. movhpd 1 * SIZE(A1, LDA, 1), a2
  630. movq NEW_X, XX
  631. movq NEW_Y, YY
  632. movq IS, I
  633. sarq $1, I
  634. jle .L28
  635. ALIGN_3
  636. .L22:
  637. movapd xtemp1, xt1
  638. mulpd a1, xt1
  639. mulpd atemp1, a1
  640. addpd xt1, xsum1
  641. addpd a1, yy1
  642. movsd 2 * SIZE(A1), a1
  643. movhpd 3 * SIZE(A1), a1
  644. movapd xtemp1, xt1
  645. movapd 2 * SIZE(XX), xtemp1
  646. mulpd a2, xt1
  647. mulpd atemp2, a2
  648. addpd xt1, xsum2
  649. addpd a2, yy1
  650. movsd 2 * SIZE(A1, LDA, 1), a2
  651. movhpd 3 * SIZE(A1, LDA, 1), a2
  652. movsd yy1, 0 * SIZE(YY)
  653. movhpd yy1, 1 * SIZE(YY)
  654. movsd 2 * SIZE(YY), yy1
  655. movhpd 3 * SIZE(YY), yy1
  656. addq $2 * SIZE, XX
  657. addq $2 * SIZE, YY
  658. addq $2 * SIZE, A1
  659. decq I
  660. jg .L22
  661. ALIGN_3
  662. .L28:
  663. unpckhpd atemp2, atemp1
  664. movsd 0 * SIZE(A1), a1
  665. movhpd 0 * SIZE(A1, LDA, 1), a1
  666. mulpd atemp1, a1
  667. addpd a1, xsum1
  668. movsd 0 * SIZE(A1, LDA, 1), a1
  669. movhpd 1 * SIZE(A1, LDA, 1), a1
  670. mulpd atemp1, a1
  671. addpd a1, xsum2
  672. #ifndef HAVE_SSE3
  673. movapd xsum1, atemp1
  674. unpcklpd xsum2, xsum1
  675. unpckhpd xsum2, atemp1
  676. addpd atemp1, xsum1
  677. #else
  678. haddpd xsum2, xsum1
  679. #endif
  680. addpd xsum1, yy1
  681. movsd yy1, 0 * SIZE(YY)
  682. movhpd yy1, 1 * SIZE(YY)
  683. addq $2, IS
  684. ALIGN_3
  685. .L30:
  686. testq $1, M
  687. je .L990
  688. ALIGN_3
  689. .L31:
  690. movq A, A1
  691. #ifdef HAVE_SSE3
  692. movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
  693. #else
  694. movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  695. movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
  696. #endif
  697. pxor xsum1, xsum1
  698. movsd 0 * SIZE(NEW_X), xtemp1
  699. movsd 0 * SIZE(NEW_Y), yy1
  700. movsd 0 * SIZE(A1), a1
  701. movq NEW_X, XX
  702. movq NEW_Y, YY
  703. movq IS, I
  704. testq I, I
  705. jle .L38
  706. ALIGN_3
  707. .L32:
  708. movapd xtemp1, xt1
  709. mulpd a1, xt1
  710. mulpd atemp1, a1
  711. addpd xt1, xsum1
  712. addpd a1, yy1
  713. movsd 1 * SIZE(A1), a1
  714. movsd 1 * SIZE(XX), xtemp1
  715. movsd yy1, 0 * SIZE(YY)
  716. movsd 1 * SIZE(YY), yy1
  717. addq $1 * SIZE, XX
  718. addq $1 * SIZE, YY
  719. addq $1 * SIZE, A1
  720. decq I
  721. jg .L32
  722. ALIGN_3
  723. .L38:
  724. movsd 0 * SIZE(A1), a1
  725. mulsd atemp1, a1
  726. addsd a1, xsum1
  727. addsd xsum1, yy1
  728. movsd yy1, 0 * SIZE(YY)
  729. ALIGN_3
  730. .L990:
  731. cmpq $SIZE, INCY
  732. je .L999
  733. movq M, %rax
  734. sarq $3, %rax
  735. jle .L997
  736. ALIGN_3
  737. .L996:
  738. movapd 0 * SIZE(NEW_Y), %xmm0
  739. movapd 2 * SIZE(NEW_Y), %xmm1
  740. movapd 4 * SIZE(NEW_Y), %xmm2
  741. movapd 6 * SIZE(NEW_Y), %xmm3
  742. movsd %xmm0, 0 * SIZE(Y)
  743. addq INCY, Y
  744. movhpd %xmm0, 0 * SIZE(Y)
  745. addq INCY, Y
  746. movsd %xmm1, 0 * SIZE(Y)
  747. addq INCY, Y
  748. movhpd %xmm1, 0 * SIZE(Y)
  749. addq INCY, Y
  750. movsd %xmm2, 0 * SIZE(Y)
  751. addq INCY, Y
  752. movhpd %xmm2, 0 * SIZE(Y)
  753. addq INCY, Y
  754. movsd %xmm3, 0 * SIZE(Y)
  755. addq INCY, Y
  756. movhpd %xmm3, 0 * SIZE(Y)
  757. addq INCY, Y
  758. addq $8 * SIZE, NEW_Y
  759. decq %rax
  760. jg .L996
  761. ALIGN_3
  762. .L997:
  763. movq M, %rax
  764. andq $7, %rax
  765. jle .L999
  766. ALIGN_3
  767. .L998:
  768. movsd 0 * SIZE(NEW_Y), %xmm0
  769. movsd %xmm0, 0 * SIZE(Y)
  770. addq INCY, Y
  771. addq $1 * SIZE, NEW_Y
  772. decq %rax
  773. jg .L998
  774. ALIGN_3
  775. .L999:
  776. movq 0(%rsp), %rbx
  777. movq 8(%rsp), %rbp
  778. movq 16(%rsp), %r12
  779. movq 24(%rsp), %r13
  780. movq 32(%rsp), %r14
  781. movq 40(%rsp), %r15
  782. #ifdef WINDOWS_ABI
  783. movq 48(%rsp), %rdi
  784. movq 56(%rsp), %rsi
  785. movups 64(%rsp), %xmm6
  786. movups 80(%rsp), %xmm7
  787. movups 96(%rsp), %xmm8
  788. movups 112(%rsp), %xmm9
  789. movups 128(%rsp), %xmm10
  790. movups 144(%rsp), %xmm11
  791. movups 160(%rsp), %xmm12
  792. movups 176(%rsp), %xmm13
  793. movups 192(%rsp), %xmm14
  794. movups 208(%rsp), %xmm15
  795. #endif
  796. addq $STACKSIZE, %rsp
  797. ret
  798. EPILOGUE