
symv_U_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
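
/* Per-architecture prefetch tuning: PREFETCH/PREFETCHW select the
   instruction flavor and PREFETCHSIZE the look-ahead distance (a byte
   displacement) used by the main loops below. */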
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif
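
/* Argument layout differs between the System V and Windows x64 ABIs:
   on Windows only the first four arguments arrive in registers, so
   LDA, X, INCX, Y, INCY and BUFFER are fetched from the caller's
   stack above our save area. */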
#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define IS	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define IS	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif
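
/* Symbolic names for the working set.  NEW_X aliases BUFFER (it holds
   the packed alpha*x) and NEW_Y reuses the X register once the
   original x has been consumed. */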
#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15
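
/* SSYMV kernel, upper-triangular storage: computes y += alpha * A * x
   for symmetric single-precision A, walking the matrix in panels of
   four columns.  A bounce buffer holds alpha*x packed contiguously
   and, when y is strided, a unit-stride copy of y, so the hot loops
   only ever touch aligned sequential data. */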
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	negq	IS
	addq	M, IS

	movq	IS, TEMP
	imulq	LDA, TEMP
	addq	TEMP, A

	shufps	$0, ALPHA, ALPHA
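
	/* Pass 1: gather x (stride INCX), scale it by alpha, and store
	   it contiguously into the buffer, eight elements per iteration
	   with a scalar tail for M mod 8. */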
	movq	BUFFER, XX

	movq	M, %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	movq	M, %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X

	mulss	ALPHA, %xmm1

	movss	%xmm1, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
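
	/* Round the buffer pointer up to the next 512-byte boundary; if
	   y is strided, pack y there as well so the kernel can read and
	   write it with unit stride. */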
	movq	Y, NEW_Y

	addq	$512, XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M, %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)

	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	movq	IS, I
	addq	$4, I
	cmpq	M, I
	jg	.L20
	ALIGN_3
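
	/* Main loop: each trip processes a panel of four columns.
	   atemp1..4 receive alpha*x broadcast for the panel's columns,
	   and xsum1..4 accumulate the transposed (symmetric)
	   contributions that end up in y at the panel rows. */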
.L11:
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	(A, LDA, 4), A

	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2
	pxor	xsum3, xsum3
	pxor	xsum4, xsum4

	movaps	0 * SIZE(NEW_X), xtemp1
	movaps	4 * SIZE(NEW_X), xtemp2

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1
	movsd	0 * SIZE(A1, LDA, 1), a2
	movhps	2 * SIZE(A1, LDA, 1), a2
	movsd	0 * SIZE(A2), a3
	movhps	2 * SIZE(A2), a3
	movsd	0 * SIZE(A2, LDA, 1), a4
	movhps	2 * SIZE(A2, LDA, 1), a4

	movsd	0 * SIZE(NEW_Y), yy1
	movhps	2 * SIZE(NEW_Y), yy1

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$4, I
	jle	.L14
	ALIGN_3
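
	/* Inner loop over the rows above the diagonal block, unrolled
	   to 16 rows: for each stored element A(i,j), column j adds
	   alpha*x(j)*A(i,j) into y(i) and accumulates A(i,j)*alpha*x(i)
	   into xsum(j), so one pass over the stored triangle covers
	   both halves of the symmetric matrix. */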
.L12:
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 12 * SIZE(YY)
	movhps	yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	$16 * SIZE, XX
	addq	$16 * SIZE, YY
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3
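
	/* Row-loop tail: handle eight leftover rows when IS is not a
	   multiple of sixteen. */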
.L14:
	testq	$8, IS
	jle	.L15

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	ALIGN_3
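
	/* Row-loop tail: four leftover rows. */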
.L15:
	testq	$4, IS
	jle	.L18

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1

	movaps	xtemp1, xt1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3
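
	/* 4x4 diagonal block: gather the triangular entries out of the
	   four panel columns and apply them to the column sums, then
	   fold xsum1..4 down to one vector and add it into y at the
	   panel rows. */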
.L18:
	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	0 * SIZE(A1), a1
	movss	0 * SIZE(A1, LDA, 1), a2
	movss	0 * SIZE(A2), a3
	movss	0 * SIZE(A2, LDA, 1), a4

	unpcklps	a3, a1
	unpcklps	a4, a2
	unpcklps	a2, a1

	mulps	atemp1, a1
	addps	a1, xsum1

	movsd	0 * SIZE(A1, LDA, 1), a1
	movss	1 * SIZE(A2), a2
	movhps	1 * SIZE(A2, LDA, 1), a2
	shufps	$0x84, a2, a1

	mulps	atemp1, a1
	addps	a1, xsum2

	movsd	0 * SIZE(A2), a1
	movss	2 * SIZE(A2), a2
	movhps	2 * SIZE(A2, LDA, 1), a2
	shufps	$0x84, a2, a1

	mulps	atemp1, a1
	addps	a1, xsum3

	movsd	0 * SIZE(A2, LDA, 1), a1
	movhps	2 * SIZE(A2, LDA, 1), a1

	mulps	atemp1, a1
	addps	a1, xsum4
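
	/* Horizontal reduction of the four column sums: a
	   transpose-and-add sequence without SSE3, haddps with it. */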
#ifndef HAVE_SSE3
	movaps	xsum1, xtemp1
	unpcklps	xsum3, xsum1
	unpckhps	xsum3, xtemp1

	movaps	xsum2, xtemp2
	unpcklps	xsum4, xsum2
	unpckhps	xsum4, xtemp2

	movaps	xsum1, xsum3
	unpcklps	xsum2, xsum1
	unpckhps	xsum2, xsum3

	movaps	xtemp1, xsum4
	unpcklps	xtemp2, xtemp1
	unpckhps	xtemp2, xsum4

	addps	xsum3, xsum1
	addps	xtemp1, xsum4
	addps	xsum4, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3
	haddps	xsum3, xsum1
#endif

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	M, I
	jle	.L11
	ALIGN_3
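
	/* Columns left over when M is not a multiple of four: first a
	   two-column panel, then a single column. */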
.L20:
	testq	$2, M
	jle	.L30

	movq	A, A1
	leaq	(A, LDA, 2), A

	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	movaps	0 * SIZE(NEW_X), xtemp1

	movsd	0 * SIZE(A1), a1
	movhps	2 * SIZE(A1), a1
	movsd	0 * SIZE(A1, LDA, 1), a2
	movhps	2 * SIZE(A1, LDA, 1), a2

	movsd	0 * SIZE(NEW_Y), yy1
	movhps	2 * SIZE(NEW_Y), yy1

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$2, I
	jle	.L28
	ALIGN_3

.L22:
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	movaps	4 * SIZE(XX), xtemp1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1

	decq	I
	jg	.L22
	ALIGN_3

.L28:
	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	0 * SIZE(A1), a1
	movss	0 * SIZE(A1, LDA, 1), a2
	unpcklps	a2, a1

	mulps	atemp1, a1
	addps	a1, xsum1

	movsd	0 * SIZE(A1, LDA, 1), a1

	mulps	atemp1, a1
	addps	a1, xsum2

#ifndef HAVE_SSE3
	movhlps	xsum1, xsum3
	movhlps	xsum2, xsum4

	addps	xsum3, xsum1
	addps	xsum4, xsum2

	unpcklps	xsum2, xsum1
	movhlps	xsum1, xsum2

	addps	xsum2, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum1, xsum1
#endif

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(YY)

	addq	$2, IS
	ALIGN_3
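
	/* Final single column when M is odd. */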
.L30:
	testq	$1, M
	jle	.L990

	movq	A, A1

	movss	0 * SIZE(NEW_X, IS, SIZE), atemp1
	pshufd	$0x00, atemp1, atemp1

	pxor	xsum1, xsum1
	pxor	xsum2, xsum2

	movss	0 * SIZE(NEW_Y), yy1

	movss	0 * SIZE(NEW_X), xtemp1
	movss	1 * SIZE(NEW_X), xtemp2

	movss	0 * SIZE(A1), a1
	movss	1 * SIZE(A1), a2

	movq	NEW_X, XX
	movq	NEW_Y, YY

	movq	IS, I
	sarq	$1, I
	jle	.L38
	ALIGN_3

.L32:
	movaps	xtemp1, xt1
	movss	2 * SIZE(XX), xtemp1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movss	2 * SIZE(A1), a1

	movss	yy1, 0 * SIZE(YY)
	movss	1 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	movss	3 * SIZE(XX), xtemp2
	mulps	a2, xt1
	mulps	atemp1, a2
	addps	xt1, xsum1
	addps	a2, yy1
	movss	3 * SIZE(A1), a2

	movss	yy1, 1 * SIZE(YY)
	movss	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1

	decq	I
	jg	.L32
	ALIGN_3

.L38:
	movsd	0 * SIZE(NEW_X, IS, SIZE), atemp1

	movss	0 * SIZE(A1), a1

	mulss	atemp1, a1
	addss	a1, xsum1

#ifndef HAVE_SSE3
	movhlps	xsum1, xsum3
	movhlps	xsum2, xsum4

	addps	xsum3, xsum1
	addps	xsum4, xsum2

	unpcklps	xsum2, xsum1
	movhlps	xsum1, xsum2

	addps	xsum2, xsum1
#else
	addss	xsum2, xsum1
#endif

	addss	xsum1, yy1

	movss	yy1, 0 * SIZE(YY)

	addq	$2, IS
	ALIGN_3
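
	/* If y is strided, copy the packed result from the buffer back
	   out to y, eight elements per iteration plus a scalar tail. */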
.L990:
	cmpq	$SIZE, INCY
	je	.L999

	movq	M, %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	0 * SIZE(NEW_Y), %xmm0
	movss	1 * SIZE(NEW_Y), %xmm1
	movss	2 * SIZE(NEW_Y), %xmm2
	movss	3 * SIZE(NEW_Y), %xmm3
	movss	4 * SIZE(NEW_Y), %xmm4
	movss	5 * SIZE(NEW_Y), %xmm5
	movss	6 * SIZE(NEW_Y), %xmm6
	movss	7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE