You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zgemv_n_sse.S — 12 kB
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
#define ASSEMBLER
#include "common.h"

/* "movsd" is redefined to "movlps" for some cores below; drop any
   earlier definition first so the redefinition is clean. */
#ifdef movsd
#undef movsd
#endif

/* Per-core prefetch instruction flavour and distance.
   PREFETCHSIZE is measured in units of SIZE (one FLOAT). */
#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd		movlps	/* use movlps for 8-byte loads on P3 */
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd		movlps	/* same movlps substitution on Opteron */
#endif

#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)	/* NOTE: no PREFETCHW for NANO */
#endif

/* Stack layout: ARGS bytes of local scratch are allocated first, then
   four callee-saved registers (STACKSIZE bytes) are pushed, so the
   incoming cdecl arguments sit at offset STACKSIZE+ARGS from %esp. */
#define STACKSIZE	16
#define ARGS		20

/* Incoming arguments (IA-32 cdecl). */
#define M		 4 + STACKSIZE+ARGS(%esp)
#define N		 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R		16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I		20 + STACKSIZE+ARGS(%esp)
#define A		24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
#define STACK_X		32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
#define Y		40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
#define BUFFER		48 + STACKSIZE+ARGS(%esp)

/* Locals in the ARGS scratch area (bookkeeping for the M-blocking loop). */
#define MMM	0+ARGS(%esp)	/* rows remaining after the current block */
#define YY	4+ARGS(%esp)	/* y pointer for the current block */
#define AA	8+ARGS(%esp)	/* A pointer for the current block */

/* Register aliases. */
#define I	%eax
#define J	%ebx
#define INCX	%ecx
#define INCY	J	/* INCY aliases J: only live after the column loop */
#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

/* SUBPS merges the two partial products of the complex multiply; its
   direction (sub vs. add) encodes the CONJ/XCONJ conjugation mode. */
#undef SUBPS

#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPS	subps
#else
#define SUBPS	addps
#endif
/* Complex GEMV, no-transpose, packed single-precision SSE kernel:
   y += alpha * A * x (conjugation variants selected by CONJ/XCONJ).
   Each column's contribution is accumulated into a contiguous,
   zero-initialized BUFFER; the buffer is added back into the strided y
   at the end.  M is processed in blocks of at most 2^20 rows so BUFFER
   stays bounded (the code loops back to .L0t once per block). */
	PROLOGUE

	subl	$ARGS, %esp		/* scratch space for MMM/YY/AA */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	/* Save the original y, A and M; they are advanced per block. */
	movl	Y, J
	movl	J, YY
	movl	A, J
	movl	J, AA
	movl	M, J
	movl	J, MMM

.L0t:
	/* Carve off the next block: J = 2^20 rows, MMM -= J. */
	xorl	J, J
	addl	$1, J
	sall	$20, J
	subl	J, MMM
	movl	J, M
	jge	.L00t			/* a full 2^20-row block remains */
	ALIGN_3

	/* Final (partial) block: M = MMM + 2^20; nothing left when <= 0. */
	movl	MMM, %eax
	addl	J, %eax
	jle	.L999x
	movl	%eax, M

.L00t:
	movl	AA, %eax
	movl	%eax, A
	movl	YY, J
	movl	J, Y

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	sall	$ZBASE_SHIFT, INCX	/* element strides -> byte strides */
	sall	$ZBASE_SHIFT, LDA

	subl	$-32 * SIZE, A		/* bias A so the inner loops can use
					   negative displacements */

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	/* Zero-fill BUFFER: (M + 8) >> 3 iterations of 8 complex
	   (16 float) stores — slightly over-clears past M, which the
	   buffer is sized to allow. */
	movl	BUFFER, Y1

	movl	N, J			/* J = column counter */

	xorps	%xmm7, %xmm7

	movl	M, %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

.L01:
	movaps	%xmm7,  0 * SIZE(Y1)
	movaps	%xmm7,  4 * SIZE(Y1)
	movaps	%xmm7,  8 * SIZE(Y1)
	movaps	%xmm7, 12 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3
/* Column loop head: for column j, compute t = alpha * x[j] (with the
   XCONJ variant conjugating x), broadcast Re(t) into xmm6 and Im(t)
   into xmm7, then accumulate t * A(:,j) into BUFFER below. */
.L10:
	movl	BUFFER, Y1
	addl	$32 * SIZE, Y1		/* same -32*SIZE bias as A */

	movl	A, A1
	addl	LDA, A			/* advance A to the next column */

	movsd	(X), %xmm7		/* xmm7 = (x_re, x_im) */
	addl	INCX, X

	/* Build xmm5 = {0, 0x80000000} per 64-bit pair: xorps with it
	   flips the sign of the imaginary lane of each complex pair. */
#ifdef HAVE_SSE2
	pcmpeqb	%xmm5, %xmm5
	psllq	$63, %xmm5
#else
	subl	$8, %esp
	movl	$0x00000000, 0(%esp)
	movl	$0x80000000, 4(%esp)
	movlps	(%esp), %xmm5
	addl	$8, %esp
	movlhps	%xmm5, %xmm5
#endif

	/* Broadcast: xmm6 = x_re in all four lanes, xmm7 = x_im. */
#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm7, %xmm6
	pshufd	$0x55, %xmm7, %xmm7
#else
	movaps	%xmm7, %xmm6
	shufps	$0x00, %xmm6, %xmm6
	shufps	$0x55, %xmm7, %xmm7
#endif

	/* xmm3 = (alpha_re, alpha_im) duplicated across both halves;
	   xmm4 = the same with re/im swapped within each pair (0xb1). */
#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm3
#else
	movsd	ALPHA_R, %xmm3
	movlhps	%xmm3, %xmm3
#endif

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm3, %xmm4
#else
	movaps	%xmm3, %xmm4
	shufps	$0xb1, %xmm4, %xmm4
#endif

	/* Complex multiply t = alpha * x[j]: apply the alternating-sign
	   mask to the x_im (or x_re, under XCONJ) lanes, multiply, and
	   combine the partial products. */
#ifndef XCONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	mulps	%xmm3, %xmm6
	mulps	%xmm4, %xmm7

#ifndef XCONJ
	subps	%xmm7, %xmm6
#else
	addps	%xmm7, %xmm6
#endif

	/* Re-broadcast the product: xmm7 = Im(t) lanes, xmm6 = Re(t)
	   lanes; the CONJ mode decides which one carries the sign mask. */
#ifdef HAVE_SSE2
	pshufd	$0x55, %xmm6, %xmm7
	pshufd	$0x00, %xmm6, %xmm6
#else
	movaps	%xmm6, %xmm7
	shufps	$0x55, %xmm7, %xmm7
	shufps	$0x00, %xmm6, %xmm6
#endif

#ifndef CONJ
	xorps	%xmm5, %xmm7
#else
	xorps	%xmm5, %xmm6
#endif

	/* Prime the accumulators with the current BUFFER contents. */
	movaps	-32 * SIZE(Y1), %xmm0
	movaps	-28 * SIZE(Y1), %xmm1
	ALIGN_3
	/* Main loop: 8 complex elements (16 floats) per iteration.
	   For each 2-element vector a of A: xmm3/xmm5 hold a with re/im
	   swapped (shuffle 0xb1); then acc += Re(t)*a SUBPS Im(t)*swap(a),
	   SUBPS resolving the conjugation sign.  The loop is software-
	   pipelined: the next A vectors are loaded while the current ones
	   are being combined, so one pre-iteration runs before .L13 and the
	   drain happens at .L14. */
	movl	M, I
	sarl	$3, I
	jle	.L15			/* fewer than 8 elements: remainders */

	movsd	-32 * SIZE(A1), %xmm2	/* preload first two A vectors */
	movhps	-30 * SIZE(A1), %xmm2
	movsd	-28 * SIZE(A1), %xmm4
	movhps	-26 * SIZE(A1), %xmm4

	decl	I
	jle	.L14			/* exactly one unrolled iteration */
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	/* First 4 complex elements. */
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-24 * SIZE(A1), %xmm2	/* preload next A vector */
	movhps	-22 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-20 * SIZE(A1), %xmm4
	movhps	-18 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)	/* store, reload next accumulator */
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

	/* Second 4 complex elements — identical pattern. */
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-16 * SIZE(A1), %xmm2
	movhps	-14 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-12 * SIZE(A1), %xmm4
	movhps	-10 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -24 * SIZE(Y1)
	movaps	-16 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -20 * SIZE(Y1)
	movaps	-12 * SIZE(Y1), %xmm1

	subl	$-16 * SIZE, A1		/* advance 8 complex elements */
	subl	$-16 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3
.L14:
	/* Drain the software pipeline: the last unrolled iteration, same
	   arithmetic as .L13 but without preloading A past the end. */
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	movsd	-24 * SIZE(A1), %xmm2
	movhps	-22 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1
	movsd	-20 * SIZE(A1), %xmm4
	movhps	-18 * SIZE(A1), %xmm4

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

	/* Second half — no further A loads needed. */
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -24 * SIZE(Y1)
	movaps	-16 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -20 * SIZE(Y1)
	movaps	-12 * SIZE(Y1), %xmm1

	subl	$-16 * SIZE, A1
	subl	$-16 * SIZE, Y1
	ALIGN_3
.L15:
	/* Remainder: 4 complex elements, if bit 2 of M is set. */
	testl	$4, M
	je	.L17

	movsd	-32 * SIZE(A1), %xmm2
	movhps	-30 * SIZE(A1), %xmm2
	movsd	-28 * SIZE(A1), %xmm4
	movhps	-26 * SIZE(A1), %xmm4

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm4, %xmm5
#else
	movaps	%xmm4, %xmm5
	shufps	$0xb1, %xmm5, %xmm5
#endif
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm1

	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0
	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	-24 * SIZE(Y1), %xmm0
	mulps	%xmm7, %xmm5
	SUBPS	%xmm5, %xmm1
	movaps	%xmm1, -28 * SIZE(Y1)
	movaps	-20 * SIZE(Y1), %xmm1

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, Y1
	ALIGN_3

.L17:
	/* Remainder: 2 complex elements. */
	testl	$2, M
	je	.L18

	movsd	-32 * SIZE(A1), %xmm2
	movhps	-30 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0

	movaps	%xmm0, -32 * SIZE(Y1)
	movaps	%xmm1, %xmm0		/* shift second accumulator down */

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L18:
	/* Remainder: a single complex element (8-byte load/store). */
	testl	$1, M
	je	.L19

#ifdef movsd
	xorps	%xmm2, %xmm2		/* movlps leaves the high lanes:
					   clear them explicitly */
#endif
	movsd	-32 * SIZE(A1), %xmm2

#ifdef HAVE_SSE2
	pshufd	$0xb1, %xmm2, %xmm3
#else
	movaps	%xmm2, %xmm3
	shufps	$0xb1, %xmm3, %xmm3
#endif
	mulps	%xmm6, %xmm2
	addps	%xmm2, %xmm0
	mulps	%xmm7, %xmm3
	SUBPS	%xmm3, %xmm0

	movlps	%xmm0, -32 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J			/* next column */
	jg	.L10
	ALIGN_4
.L990:
	/* Add the contiguous BUFFER back into the strided y.
	   INCY aliases J, which is free now that the column loop is done. */
	movl	Y, Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$ZBASE_SHIFT, INCY	/* element stride -> byte stride */

	movl	M, %eax
	sarl	$3, %eax
	jle	.L994
	ALIGN_3

.L992:
	/* 8 complex elements per iteration, two at a time: the low half
	   of xmm0 holds y[i], the high half y[i+incy]. */
	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	8 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	12 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$16 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:
	/* Remainder: 4 complex elements. */
	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	4 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	ALIGN_3

.L995:
	/* Remainder: 2 complex elements. */
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhps	(Y1, INCY), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	movhps	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:
	/* Remainder: a single complex element. */
	testl	$1, M
	jle	.L999

#ifdef movsd
	xorps	%xmm0, %xmm0		/* movlps leaves high lanes; clear */
#endif
	movsd	(Y1), %xmm0

	addps	0 * SIZE(X), %xmm0

	movlps	%xmm0, (Y1)
	ALIGN_3

.L999:
	/* Advance AA and YY past this M-block, then loop for the next:
	   AA += M * 2^ZBASE_SHIFT bytes; YY += M * INCY elements
	   (= M * 2^ZBASE_SHIFT * INCY bytes). */
	movl	M, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	movl	STACK_INCY, INCY
	imull	INCY, %eax
	addl	%eax, YY
	jmp	.L0t
	ALIGN_3
.L999x:
	/* Restore callee-saved registers, release the scratch area,
	   and return (cdecl: caller cleans up its arguments). */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE