You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zsymv_U_sse.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Per-CPU prefetch configuration.
   *
   * Each target selects the instruction used for read prefetch (PREFETCH),
   * for write prefetch (PREFETCHW) and a prefetch distance (PREFETCHSIZE).
   * The groups are kept in their original relative order so that the last
   * matching group still wins if more than one target macro is defined.
   */

  /* Targets that all use prefetcht0 with a distance of 16 * 24. */
  #if defined(ATOM) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
  #define PREFETCH      prefetcht0
  #define PREFETCHW     prefetcht0
  #define PREFETCHSIZE  (16 * 24)
  #endif

  #if defined(PENTIUM4)
  #define PREFETCH      prefetcht0
  #define PREFETCHW     prefetcht0
  #define PREFETCHSIZE  (16 * 28)
  #endif

  /* OPTERON additionally rewrites every `movsd` in this file to `movlpd`. */
  #if defined(OPTERON)
  #define PREFETCH      prefetch
  #define PREFETCHW     prefetchw
  #define PREFETCHSIZE  (16 * 12)
  #define movsd         movlpd
  #endif

  #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
  #define PREFETCH      prefetch
  #define PREFETCHW     prefetchw
  #define PREFETCHSIZE  (16 * 16)
  #endif

  #if defined(NANO)
  #define PREFETCH      prefetcht0
  #define PREFETCHW     prefetcht0
  #define PREFETCHSIZE  (16 * 24)
  #endif

  #if defined(GENERIC)
  #define PREFETCH      prefetcht0
  #define PREFETCHW     prefetcht0
  #define PREFETCHSIZE  (16 * 14)
  #endif
  /*
   * Argument mapping for the two supported calling conventions.
   *
   * STACKSIZE is the number of bytes the prologue subtracts from %rsp, so
   * every OLD_* operand is written as "<offset above the return address> +
   * STACKSIZE(%rsp)".  ARG1..ARG6 are register names supplied by common.h.
   */
  #define M             ARG1
  #define N             ARG2

  #ifdef WINDOWS_ABI

  /* Windows x64: only the first four arguments travel in registers; the */
  /* rest are read from the caller's frame (offsets start at 40, i.e.    */
  /* past the return address and the 32-byte shadow area).               */
  #define STACKSIZE     256

  #define OLD_A         40 + STACKSIZE(%rsp)
  #define OLD_LDA       48 + STACKSIZE(%rsp)
  #define OLD_X         56 + STACKSIZE(%rsp)
  #define OLD_INCX      64 + STACKSIZE(%rsp)
  #define OLD_Y         72 + STACKSIZE(%rsp)
  #define OLD_INCY      80 + STACKSIZE(%rsp)
  #define OLD_BUFFER    88 + STACKSIZE(%rsp)

  #define A             ARG4
  #define LDA           ARG3
  #define X             %rdi
  #define INCX          %rsi

  #else

  /* Non-Windows: six integer arguments arrive in registers; Y, INCY and */
  /* BUFFER are read from the stack.                                     */
  #define STACKSIZE     80

  #define OLD_Y         8 + STACKSIZE(%rsp)
  #define OLD_INCY      16 + STACKSIZE(%rsp)
  #define OLD_BUFFER    24 + STACKSIZE(%rsp)

  #define A             ARG3
  #define LDA           ARG4
  #define X             ARG5
  #define INCX          ARG6

  #endif
  /*
   * Register roles used by the kernel body.
   * Several names deliberately share a register; the shared pairs are
   * grouped together below.
   */

  /* Arguments loaded from the stack. */
  #define Y             %r10
  #define INCY          %r11
  #define BUFFER        %r12

  /* Scratch value and loop counter share %rax. */
  #define TEMP          %rax
  #define I             %rax

  /* Column pointers and running pointers (rbx, rbp, r13-r15 are saved by the prologue). */
  #define A1            %rbx
  #define A2            %rbp
  #define XX            %r13
  #define YY            %r14
  #define IS            %r15

  /* The packed x lives at the start of BUFFER; once the original X is no */
  /* longer needed its register is reused as the working y pointer.       */
  #define NEW_X         BUFFER
  #define NEW_Y         X

  /* alpha arrives in xmm0/xmm1; the same registers later hold xsum1/xsum2. */
  #define ALPHA_R       %xmm0
  #define ALPHA_I       %xmm1

  #define xsum1         %xmm0
  #define xsum2         %xmm1
  #define xsum3         %xmm2
  #define xsum4         %xmm3

  #define atemp1        %xmm4
  #define atemp2        %xmm5
  #define atemp3        %xmm6
  #define atemp4        %xmm7

  #define xtemp1        %xmm8
  #define xtemp2        %xmm9

  #define a1            %xmm10
  #define a2            %xmm11
  #define a3            %xmm12

  #define yy1           %xmm13

  #define xt1           %xmm14
  #define xt2           %xmm15
  /*
   * MOVDDUP(a, b, c)  : load the 64-bit value at a(b) into both halves of c.
   * MOVDDUP2(a, b, c) : same, with the address formed by pasting a and b.
   *
   * Targets without a usable movddup get a movlpd + movhpd pair instead.
   */
  #if !((defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION))
  #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c
  #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c
  #else
  #define MOVDDUP(a, b, c) movddup a(b), c
  #define MOVDDUP2(a, b, c) movddup a##b, c
  #endif
  /*
   * Complex packed-single kernel: y += (alpha * x) applied through the
   * upper part of A, two columns at a time.  Presumably the SYMV "U"
   * kernel, judging by the file name -- confirm against the driver.
   * Syntax: GAS / AT&T, run through the C preprocessor (common.h).
   *
   * Inputs (see the argument-mapping macros above this function):
   *   M, N          integer arguments 1 and 2
   *   alpha         real part in xmm0, imaginary part in xmm1
   *                 (xmm2 / xmm3 under WINDOWS_ABI, copied on entry)
   *   A, LDA        matrix pointer and column stride
   *   X, INCX       input vector and stride
   *   Y, INCY       in/out vector and stride
   *   BUFFER        scratch area; written with movaps, so it is assumed
   *                 to be 16-byte aligned -- TODO confirm with caller
   * Strides are shifted left by ZBASE_SHIFT on entry (presumably turning
   * element counts into byte offsets; ZBASE_SHIFT comes from common.h).
   * The float offsets in the comments below assume SIZE == 4.
   *
   * Scratch layout built in BUFFER (NEW_X), per pair of x elements,
   * with t = alpha * x:
   *     [ Re t0, -Im t0, Re t1, -Im t1 | Im t0, Re t0, Im t1, Re t1 ]
   * A trailing single element is stored as [ Re t, -Im t | Im t, Re t ].
   * If INCY is not 2 * SIZE, y is packed contiguously after NEW_X (at
   * the next 512-byte boundary) and copied back at .L990.
   *
   * Saves/restores rbx, rbp, r12-r15, and under WINDOWS_ABI also rdi,
   * rsi and xmm6-xmm15.
   */
  PROLOGUE
  PROFCODE

        subq    $STACKSIZE, %rsp
        movq    %rbx, 0(%rsp)
        movq    %rbp, 8(%rsp)
        movq    %r12, 16(%rsp)
        movq    %r13, 24(%rsp)
        movq    %r14, 32(%rsp)
        movq    %r15, 40(%rsp)

  #ifdef WINDOWS_ABI
        /* rdi, rsi and xmm6-xmm15 are callee-saved on Windows x64. */
        movq    %rdi, 48(%rsp)
        movq    %rsi, 56(%rsp)
        movups  %xmm6, 64(%rsp)
        movups  %xmm7, 80(%rsp)
        movups  %xmm8, 96(%rsp)
        movups  %xmm9, 112(%rsp)
        movups  %xmm10, 128(%rsp)
        movups  %xmm11, 144(%rsp)
        movups  %xmm12, 160(%rsp)
        movups  %xmm13, 176(%rsp)
        movups  %xmm14, 192(%rsp)
        movups  %xmm15, 208(%rsp)

        movq    OLD_A, A
        movq    OLD_LDA, LDA
        movq    OLD_X, X
        movq    OLD_INCX, INCX

        /* alpha occupies argument slots 3 and 4 -> xmm2 / xmm3 */
        movaps  %xmm2, %xmm0
        movaps  %xmm3, %xmm1
  #endif

        movq    OLD_Y, Y
        movq    OLD_INCY, INCY
        movq    OLD_BUFFER, BUFFER

        salq    $ZBASE_SHIFT, INCX
        salq    $ZBASE_SHIFT, INCY
        salq    $ZBASE_SHIFT, LDA

        testq   M, M
        jle     .L999

        /*
         * IS = M - N is the first column to process; A is advanced to it.
         * IS must be seeded from N here: %r15 holds the caller's value on
         * entry and nothing else writes it before the negq.
         * NOTE(review): relies on ARG2 still being intact at this point
         * (nothing above writes it, given the mappings in this file).
         */
        movq    N, IS
        negq    IS
        addq    M, IS

        movq    IS, TEMP
        imulq   LDA, TEMP
        addq    TEMP, A

        /* xmm2 = { 0, 0x80000000, 0, 0x80000000 }: sign mask for odd lanes */
        pcmpeqb %xmm3, %xmm3
        xorpd   %xmm2, %xmm2
        pslld   $31, %xmm3
        unpckhps %xmm3, %xmm2

        /* ALPHA_I = { ai, ar, ai, ar },  ALPHA_R = { ar, -ai, ar, -ai } */
        shufps  $0, ALPHA_R, ALPHA_R
        shufps  $0, ALPHA_I, ALPHA_I
        movaps  ALPHA_I, %xmm3

        unpcklps ALPHA_R, ALPHA_I
        unpcklps %xmm3, ALPHA_R
        pxor    %xmm2, ALPHA_R

        /* ---- pack alpha * x into NEW_X, four elements per iteration ---- */
        movq    BUFFER, XX

        movq    M, %rax
        sarq    $2, %rax
        jle     .L02
        ALIGN_3

  .L01:
        movsd   0 * SIZE(X), %xmm4
        addq    INCX, X
        movhps  0 * SIZE(X), %xmm4
        addq    INCX, X
        movsd   0 * SIZE(X), %xmm6
        addq    INCX, X
        movhps  0 * SIZE(X), %xmm6
        addq    INCX, X

        /* xmm3/xmm5 = real parts duplicated, xmm4/xmm6 = imag parts duplicated */
        movsldup %xmm4, %xmm3
        movshdup %xmm4, %xmm4
        movsldup %xmm6, %xmm5
        movshdup %xmm6, %xmm6

        mulps   ALPHA_I, %xmm3
        mulps   ALPHA_R, %xmm4
        mulps   ALPHA_I, %xmm5
        mulps   ALPHA_R, %xmm6

        /* xmm3/xmm5 = { Im t, Re t, ... } */
        addps   %xmm4, %xmm3
        addps   %xmm6, %xmm5

        movaps  %xmm3, 4 * SIZE(XX)
        movaps  %xmm5, 12 * SIZE(XX)

        /* swap within pairs and negate odd lanes -> { Re t, -Im t, ... } */
        shufps  $0xb1, %xmm3, %xmm3
        shufps  $0xb1, %xmm5, %xmm5

        pxor    %xmm2, %xmm3
        pxor    %xmm2, %xmm5

        movaps  %xmm3, 0 * SIZE(XX)
        movaps  %xmm5, 8 * SIZE(XX)

        subq    $-16 * SIZE, XX
        decq    %rax
        jg      .L01
        ALIGN_3

  .L02:
        /* two leftover elements */
        testq   $2, M
        jle     .L03

        movsd   0 * SIZE(X), %xmm4
        addq    INCX, X
        movhps  0 * SIZE(X), %xmm4
        addq    INCX, X

        movsldup %xmm4, %xmm3
        movshdup %xmm4, %xmm4

        mulps   ALPHA_I, %xmm3
        mulps   ALPHA_R, %xmm4

        addps   %xmm4, %xmm3

        movaps  %xmm3, 4 * SIZE(XX)

        shufps  $0xb1, %xmm3, %xmm3
        pxor    %xmm2, %xmm3

        movaps  %xmm3, 0 * SIZE(XX)

        subq    $-8 * SIZE, XX
        ALIGN_3

  .L03:
        /* one leftover element */
        testq   $1, M
        jle     .L05

        movsd   0 * SIZE(X), %xmm4
        addq    INCX, X

        movsldup %xmm4, %xmm3
        movshdup %xmm4, %xmm4

        mulps   ALPHA_I, %xmm3
        mulps   ALPHA_R, %xmm4

        addps   %xmm4, %xmm3

        movlps  %xmm3, 2 * SIZE(XX)

        shufps  $0xb1, %xmm3, %xmm3
        pxor    %xmm2, %xmm3

        movlps  %xmm3, 0 * SIZE(XX)

        subq    $-4 * SIZE, XX
        ALIGN_3

  .L05:
        /* now we don't need original X */
        movq    Y, NEW_Y

        /* move XX to the next 512-byte boundary past the packed x */
        addq    $512, XX
        andq    $-512, XX

        /* contiguous y is updated in place; otherwise pack it at XX */
        cmpq    $2 * SIZE, INCY
        je      .L10

        movq    Y, YY
        movq    XX, NEW_Y

        movq    M, %rax
        sarq    $2, %rax
        jle     .L07
        ALIGN_3

  .L06:
        movsd   0 * SIZE(YY), %xmm0
        addq    INCY, YY
        movhps  0 * SIZE(YY), %xmm0
        addq    INCY, YY
        movsd   0 * SIZE(YY), %xmm1
        addq    INCY, YY
        movhps  0 * SIZE(YY), %xmm1
        addq    INCY, YY

        /*
         * Four elements = 8 floats, stored back to back.  The second
         * register goes to 4 * SIZE so that it matches the 8 * SIZE
         * advance below and the 0 / 4 * SIZE loads in .L996.
         */
        movaps  %xmm0, 0 * SIZE(XX)
        movaps  %xmm1, 4 * SIZE(XX)

        addq    $8 * SIZE, XX
        decq    %rax
        jg      .L06
        ALIGN_3

  .L07:
        movq    M, %rax
        andq    $3, %rax
        jle     .L10
        ALIGN_3

  .L08:
        movsd   0 * SIZE(YY), %xmm0
        addq    INCY, YY

        movlps  %xmm0, 0 * SIZE(XX)
        addq    $2 * SIZE, XX

        decq    %rax
        jg      .L08
        ALIGN_3

  .L10:
        /* ---- main loop: columns IS and IS + 1 while IS + 2 <= M ---- */
        movq    IS, I
        addq    $2, I
        cmpq    M, I
        jg      .L20
        ALIGN_3

  .L11:
        movq    A, A1
        leaq    (A, LDA, 1), A2
        leaq    (A, LDA, 2), A

        /* packed x uses 4 floats per element, so column IS starts at IS * 4 */
        leaq    (, IS, 4), I

        /* atemp2 = { Re t0, -Im t0, Im t0, Re t0 }, atemp4 likewise for t1 */
        movsd   0 * SIZE(NEW_X, I, SIZE), atemp2
        movhps  4 * SIZE(NEW_X, I, SIZE), atemp2
        movsd   2 * SIZE(NEW_X, I, SIZE), atemp4
        movhps  6 * SIZE(NEW_X, I, SIZE), atemp4

        /* atemp1 = { Re t0 x4 },           atemp3 = { Re t1 x4 }            */
        /* atemp2 = { -Im t0, Im t0, ... }, atemp4 = { -Im t1, Im t1, ... }  */
        pshufd  $0xcc, atemp2, atemp1
        pshufd  $0x99, atemp2, atemp2
        pshufd  $0xcc, atemp4, atemp3
        pshufd  $0x99, atemp4, atemp4

        /* xsum1/xsum2 reuse the alpha registers, which are dead by now */
        pxor    xsum1, xsum1
        pxor    xsum2, xsum2
        pxor    xsum3, xsum3
        pxor    xsum4, xsum4

        movq    NEW_X, XX
        movq    NEW_Y, YY

        movq    IS, I
        sarq    $2, I
        jle     .L15
        ALIGN_3

  .L12:
        /*
         * NOTE(review): four-row step.  The only statement besides the
         * pointer bumps is HALT (a macro not defined in this file); no
         * A / x / y arithmetic is done here.  Looks unfinished -- confirm.
         */
        HALT

        subq    $-16 * SIZE, XX
        addq    $ 8 * SIZE, YY
        addq    $ 8 * SIZE, A1
        addq    $ 8 * SIZE, A2

        decq    I
        jg      .L12
        ALIGN_3

  .L15:
        /* two-row step above the diagonal block */
        testq   $2, IS
        jle     .L18

        movsd   0 * SIZE(YY), yy1
        movhps  2 * SIZE(YY), yy1

        movaps  0 * SIZE(XX), xtemp1
        movaps  4 * SIZE(XX), xtemp2

        /* column IS: accumulate a * t(rows) and add a * t0 to y(rows) */
        movsd   0 * SIZE(A1), a1
        movhps  2 * SIZE(A1), a1

        movaps  xtemp1, xt1
        movaps  xtemp2, xt2
        mulps   a1, xt1
        mulps   a1, xt2
        addps   xt1, xsum1
        addps   xt2, xsum2

        pshufd  $0xb1, a1, xt2
        mulps   atemp1, a1
        mulps   atemp2, xt2
        addps   a1, yy1
        addps   xt2, yy1

        /* column IS + 1: same, with xsum3/xsum4 and t1 */
        movsd   0 * SIZE(A2), a1
        movhps  2 * SIZE(A2), a1

        movaps  xtemp1, xt1
        movaps  xtemp2, xt2
        mulps   a1, xt1
        mulps   a1, xt2
        addps   xt1, xsum3
        addps   xt2, xsum4

        /*
         * The A2 column is scaled by t1, which lives in atemp3 / atemp4
         * (atemp1 / atemp2 hold t0 and belong to the A1 column only).
         */
        pshufd  $0xb1, a1, xt2
        mulps   atemp3, a1
        mulps   atemp4, xt2
        addps   a1, yy1
        addps   xt2, yy1

        movlps  yy1, 0 * SIZE(YY)
        movhps  yy1, 2 * SIZE(YY)

        addq    $8 * SIZE, XX
        addq    $4 * SIZE, YY
        addq    $4 * SIZE, A1
        addq    $4 * SIZE, A2
        ALIGN_3

  .L18:
        /* 2 x 2 diagonal block, then fold the column sums into y */
        leaq    (, IS, 4), I

        /* atemp1 = { Re t0, -Im t0, Re t1, -Im t1 } */
        /* atemp2 = { Im t0,  Re t0, Im t1,  Re t1 } */
        movaps  0 * SIZE(NEW_X, I, SIZE), atemp1
        movaps  4 * SIZE(NEW_X, I, SIZE), atemp2

        movlps  0 * SIZE(YY), yy1
        movhps  2 * SIZE(YY), yy1

        /* a1 = { A1[0], A2[0] }: the upper element of each column */
        movsd   0 * SIZE(A1), a1
        movhps  0 * SIZE(A2), a1

        movaps  a1, a2
        mulps   atemp1, a1
        mulps   atemp2, a2
        addps   a1, xsum1
        addps   a2, xsum2

        /* a1 = { A2[0], A2[1] } */
        movsd   0 * SIZE(A2), a1
        movhps  2 * SIZE(A2), a1

        movaps  a1, a2
        mulps   atemp1, a1
        mulps   atemp2, a2
        addps   a1, xsum3
        addps   a2, xsum4

        /* horizontal adds collapse the four accumulators into xsum1 */
        haddps  xsum2, xsum1
        haddps  xsum4, xsum3

        haddps  xsum3, xsum1
        addps   xsum1, yy1

        movlps  yy1, 0 * SIZE(YY)
        movhps  yy1, 2 * SIZE(YY)

        addq    $2, IS

        movq    IS, I
        addq    $2, I
        cmpq    M, I
        jle     .L11
        ALIGN_3

  .L20:
        /*
         * NOTE(review): the branch target is the very next label, so this
         * test has no effect, and no code here handles a final single
         * column when one remains.
         */
        testq   $1, M
        jle     .L990

  .L990:
        /* contiguous y was updated in place; otherwise scatter it back */
        cmpq    $2 * SIZE, INCY
        je      .L999

        movq    M, %rax
        sarq    $2, %rax
        jle     .L997
        ALIGN_3

  .L996:
        movaps  0 * SIZE(NEW_Y), %xmm0
        movaps  4 * SIZE(NEW_Y), %xmm1

        movlps  %xmm0, 0 * SIZE(Y)
        addq    INCY, Y
        movhps  %xmm0, 0 * SIZE(Y)
        addq    INCY, Y
        movlps  %xmm1, 0 * SIZE(Y)
        addq    INCY, Y
        movhps  %xmm1, 0 * SIZE(Y)
        addq    INCY, Y

        addq    $8 * SIZE, NEW_Y
        decq    %rax
        jg      .L996
        ALIGN_3

  .L997:
        movq    M, %rax
        andq    $3, %rax
        jle     .L999
        ALIGN_3

  .L998:
        movlps  0 * SIZE(NEW_Y), %xmm0
        addq    $2 * SIZE, NEW_Y

        movlps  %xmm0, 0 * SIZE(Y)
        addq    INCY, Y

        decq    %rax
        jg      .L998
        ALIGN_3

  .L999:
        movq    0(%rsp), %rbx
        movq    8(%rsp), %rbp
        movq    16(%rsp), %r12
        movq    24(%rsp), %r13
        movq    32(%rsp), %r14
        movq    40(%rsp), %r15

  #ifdef WINDOWS_ABI
        /* mirror of the prologue saves: all of these are callee-saved */
        movq    48(%rsp), %rdi
        movq    56(%rsp), %rsi
        movups  64(%rsp), %xmm6
        movups  80(%rsp), %xmm7
        movups  96(%rsp), %xmm8
        movups  112(%rsp), %xmm9
        movups  128(%rsp), %xmm10
        movups  144(%rsp), %xmm11
        movups  160(%rsp), %xmm12
        movups  176(%rsp), %xmm13
        movups  192(%rsp), %xmm14
        movups  208(%rsp), %xmm15
  #endif

        addq    $STACKSIZE, %rsp
        ret
  EPILOGUE