
gemv_n_atom.S 16 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
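
/* Double-precision GEMV kernel for the non-transposed case           */
/* (y += alpha * A * x), targeted at Intel Atom: 32-bit x86 with      */
/* scalar SSE2 arithmetic. The product alpha * A * x is accumulated   */
/* into the contiguous scratch area BUFFER and only added back into   */
/* the strided y vector at the end.                                   */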

#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#define STACKSIZE	16

#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA		16 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

#define I	%eax
#define J	%ebx
#define INCX	%ecx
#define INCY	J
#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

PROLOGUE

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

PROFCODE

        movl STACK_LDA, LDA
        movl STACK_X, X
        movl STACK_INCX, INCX

        leal (,INCX, SIZE), INCX
        leal (,LDA, SIZE), LDA

        subl $-16 * SIZE, A

        cmpl $0, N
        jle .L999
        cmpl $0, M
        jle .L999
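
/* Zero-fill the scratch buffer: at least M doubles are cleared,      */
/* 16 elements (eight 16-byte movapd stores) per iteration.           */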
        movl BUFFER, Y1
        pxor %xmm7, %xmm7

        movl M, %eax
        addl $16, %eax
        sarl $4, %eax
        ALIGN_3

.L01:
        movapd %xmm7,  0 * SIZE(Y1)
        movapd %xmm7,  2 * SIZE(Y1)
        movapd %xmm7,  4 * SIZE(Y1)
        movapd %xmm7,  6 * SIZE(Y1)
        movapd %xmm7,  8 * SIZE(Y1)
        movapd %xmm7, 10 * SIZE(Y1)
        movapd %xmm7, 12 * SIZE(Y1)
        movapd %xmm7, 14 * SIZE(Y1)

        subl $-16 * SIZE, Y1
        decl %eax
        jg .L01
        ALIGN_3
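
/* Main pass: process the columns of A two at a time (J = N / 2).     */
/* For each pair, xmm6 = alpha * x[j] and xmm7 = alpha * x[j + 1].    */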
.L10:
        movl N, J
        sarl $1, J
        jle .L20
        ALIGN_3

.L11:
        movl BUFFER, Y1
        addl $16 * SIZE, Y1

        movl A, A1
        leal (A1, LDA, 2), %eax
        movl %eax, A

        movsd (X), %xmm6
        addl INCX, X
        movsd (X), %xmm7
        addl INCX, X

        movsd ALPHA, %xmm0
        mulsd %xmm0, %xmm6
        mulsd %xmm0, %xmm7

        movsd -16 * SIZE(Y1), %xmm0
        movsd -15 * SIZE(Y1), %xmm1

        movl M, I
        sarl $3, I
        jle .L15

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3
        movsd -16 * SIZE(A1, LDA), %xmm4
        movsd -15 * SIZE(A1, LDA), %xmm5

        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3

        decl I
        jle .L14
        ALIGN_3
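
/* Unrolled inner loop over the rows (8 per iteration), software      */
/* pipelined with scalar SSE2: each buffer element picks up           */
/* a(i, j) * xmm6 + a(i, j + 1) * xmm7. The .L14 block below drains   */
/* the final, already-started iteration.                              */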
.L13:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -14 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -13 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -14 * SIZE(Y1), %xmm0
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -13 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -12 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -11 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -12 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -11 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -14 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        movlpd %xmm1, -13 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -10 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -9 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -10 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -9 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -12 * SIZE(Y1)
        movsd -10 * SIZE(Y1), %xmm0
        movlpd %xmm1, -11 * SIZE(Y1)
        movsd -9 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -8 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -7 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -8 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -7 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -10 * SIZE(Y1)
        movsd -8 * SIZE(Y1), %xmm0
        movlpd %xmm1, -9 * SIZE(Y1)
        movsd -7 * SIZE(Y1), %xmm1

        subl $-8 * SIZE, A1
        subl $-8 * SIZE, Y1

        subl $1, I
        BRANCH
        jg .L13
        ALIGN_3

.L14:
        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -14 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -13 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -14 * SIZE(Y1), %xmm0
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -13 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -12 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -11 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -12 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -11 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -14 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        movlpd %xmm1, -13 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -10 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -9 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -10 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -9 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -12 * SIZE(Y1)
        movsd -10 * SIZE(Y1), %xmm0
        movlpd %xmm1, -11 * SIZE(Y1)
        movsd -9 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        addsd %xmm4, %xmm0
        addsd %xmm5, %xmm1
        movlpd %xmm0, -10 * SIZE(Y1)
        movsd -8 * SIZE(Y1), %xmm0
        movlpd %xmm1, -9 * SIZE(Y1)
        movsd -7 * SIZE(Y1), %xmm1

        subl $-8 * SIZE, A1
        subl $-8 * SIZE, Y1
        ALIGN_3
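
/* Remainder of the two-column pass: the leftover M mod 8 rows are    */
/* handled in blocks of 4 (.L15), 2 (.L16) and 1 (.L17).              */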
.L15:
        testl $4, M
        je .L16

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3
        movsd -16 * SIZE(A1, LDA), %xmm4
        movsd -15 * SIZE(A1, LDA), %xmm5

        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        addsd %xmm4, %xmm0
        movsd -14 * SIZE(A1, LDA), %xmm4
        mulsd %xmm6, %xmm2
        addsd %xmm5, %xmm1
        movsd -13 * SIZE(A1, LDA), %xmm5
        mulsd %xmm6, %xmm3
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -14 * SIZE(Y1), %xmm0
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -13 * SIZE(Y1), %xmm1

        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        addsd %xmm4, %xmm0
        addsd %xmm5, %xmm1
        movlpd %xmm0, -14 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        movlpd %xmm1, -13 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1

        addl $4 * SIZE, A1
        addl $4 * SIZE, Y1
        ALIGN_3

.L16:
        testl $2, M
        je .L17

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3
        movsd -16 * SIZE(A1, LDA), %xmm4
        movsd -15 * SIZE(A1, LDA), %xmm5

        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3
        mulsd %xmm7, %xmm4
        addsd %xmm2, %xmm0
        mulsd %xmm7, %xmm5
        addsd %xmm3, %xmm1
        addsd %xmm4, %xmm0
        addsd %xmm5, %xmm1
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -14 * SIZE(Y1), %xmm0
        movlpd %xmm1, -15 * SIZE(Y1)

        addl $2 * SIZE, A1
        addl $2 * SIZE, Y1
        ALIGN_3

.L17:
        testl $1, M
        je .L19

        movsd -16 * SIZE(A1), %xmm2
        movsd -16 * SIZE(A1, LDA), %xmm3
        movsd -16 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm2
        addsd %xmm2, %xmm0
        mulsd %xmm7, %xmm3
        addsd %xmm3, %xmm0
        movsd %xmm0, -16 * SIZE(Y1)
        ALIGN_3

.L19:
        decl J
        jg .L11
        ALIGN_4
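
/* If N is odd, one column remains; it is processed on its own with   */
/* xmm6 = alpha * x[j], using the same buffer-accumulation scheme.    */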
.L20:
        testl $1, N
        jle .L990

        movl BUFFER, Y1
        addl $16 * SIZE, Y1

        movl A, A1
        leal (A1, LDA, 2), %eax
        movl %eax, A

        movsd (X), %xmm6
        addl INCX, X
        movsd (X), %xmm7
        addl INCX, X

        movsd ALPHA, %xmm0
        mulsd %xmm0, %xmm6
        mulsd %xmm0, %xmm7

        movsd -16 * SIZE(Y1), %xmm0
        movsd -15 * SIZE(Y1), %xmm1
        movsd -14 * SIZE(Y1), %xmm4
        movsd -13 * SIZE(Y1), %xmm5

        movl M, I
        sarl $3, I
        jle .L25

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3

        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3

        decl I
        jle .L24
        ALIGN_3

.L23:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm3
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1

        addsd %xmm2, %xmm4
        movsd -12 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm5
        movsd -11 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm4, -14 * SIZE(Y1)
        movsd -10 * SIZE(Y1), %xmm4
        mulsd %xmm6, %xmm3
        movlpd %xmm5, -13 * SIZE(Y1)
        movsd -9 * SIZE(Y1), %xmm5

        addsd %xmm2, %xmm0
        movsd -10 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm1
        movsd -9 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm0, -12 * SIZE(Y1)
        movsd -8 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm3
        movlpd %xmm1, -11 * SIZE(Y1)
        movsd -7 * SIZE(Y1), %xmm1

        addsd %xmm2, %xmm4
        movsd -8 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm5
        movsd -7 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm4, -10 * SIZE(Y1)
        movsd -6 * SIZE(Y1), %xmm4
        mulsd %xmm6, %xmm3
        movlpd %xmm5, -9 * SIZE(Y1)
        movsd -5 * SIZE(Y1), %xmm5

        subl $-8 * SIZE, A1
        subl $-8 * SIZE, Y1

        subl $1, I
        BRANCH
        jg .L23
        ALIGN_3

.L24:
        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm3
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1

        addsd %xmm2, %xmm4
        movsd -12 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm5
        movsd -11 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm4, -14 * SIZE(Y1)
        movsd -10 * SIZE(Y1), %xmm4
        mulsd %xmm6, %xmm3
        movlpd %xmm5, -13 * SIZE(Y1)
        movsd -9 * SIZE(Y1), %xmm5

        addsd %xmm2, %xmm0
        movsd -10 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm1
        movsd -9 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm0, -12 * SIZE(Y1)
        mulsd %xmm6, %xmm3
        movlpd %xmm1, -11 * SIZE(Y1)

        addsd %xmm2, %xmm4
        movsd -8 * SIZE(Y1), %xmm0
        addsd %xmm3, %xmm5
        movsd -7 * SIZE(Y1), %xmm1
        movlpd %xmm4, -10 * SIZE(Y1)
        movsd -6 * SIZE(Y1), %xmm4
        movlpd %xmm5, -9 * SIZE(Y1)
        movsd -5 * SIZE(Y1), %xmm5

        subl $-8 * SIZE, A1
        subl $-8 * SIZE, Y1
        ALIGN_3

.L25:
        testl $4, M
        je .L26

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3
        addsd %xmm2, %xmm0
        movsd -14 * SIZE(A1), %xmm2
        addsd %xmm3, %xmm1
        movsd -13 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -12 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm3
        movlpd %xmm1, -15 * SIZE(Y1)
        movsd -11 * SIZE(Y1), %xmm1
        addsd %xmm2, %xmm4
        addsd %xmm3, %xmm5
        movlpd %xmm4, -14 * SIZE(Y1)
        movlpd %xmm5, -13 * SIZE(Y1)

        addl $4 * SIZE, A1
        addl $4 * SIZE, Y1
        ALIGN_3

.L26:
        testl $2, M
        je .L27

        movsd -16 * SIZE(A1), %xmm2
        movsd -15 * SIZE(A1), %xmm3
        mulsd %xmm6, %xmm2
        mulsd %xmm6, %xmm3
        addsd %xmm2, %xmm0
        addsd %xmm3, %xmm1
        movlpd %xmm0, -16 * SIZE(Y1)
        movsd -14 * SIZE(Y1), %xmm0
        movlpd %xmm1, -15 * SIZE(Y1)

        addl $2 * SIZE, A1
        addl $2 * SIZE, Y1
        ALIGN_3

.L27:
        testl $1, M
        je .L990

        movsd -16 * SIZE(A1), %xmm2
        movsd -16 * SIZE(Y1), %xmm0
        mulsd %xmm6, %xmm2
        addsd %xmm2, %xmm0
        movsd %xmm0, -16 * SIZE(Y1)
        ALIGN_3
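
/* Write-back: add the contiguous buffer into the caller's y vector,  */
/* which may have a non-unit stride INCY; 8 elements per iteration,   */
/* followed by 4 / 2 / 1 remainder blocks.                            */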
.L990:
        movl Y, Y1
        movl BUFFER, X

        movl Y1, A1

        movl STACK_INCY, INCY
        sall $BASE_SHIFT, INCY

        movl M, %eax
        sarl $3, %eax
        jle .L994
        ALIGN_3

.L992:
        movsd (Y1), %xmm0
        addl INCY, Y1
        movsd (Y1), %xmm1
        addl INCY, Y1
        movsd (Y1), %xmm2
        addl INCY, Y1
        movsd (Y1), %xmm3
        addl INCY, Y1
        movsd (Y1), %xmm4
        addl INCY, Y1
        movsd (Y1), %xmm5
        addl INCY, Y1
        movsd (Y1), %xmm6
        addl INCY, Y1
        movsd (Y1), %xmm7
        addl INCY, Y1

        addsd 0 * SIZE(X), %xmm0
        addsd 1 * SIZE(X), %xmm1
        addsd 2 * SIZE(X), %xmm2
        addsd 3 * SIZE(X), %xmm3
        addsd 4 * SIZE(X), %xmm4
        addsd 5 * SIZE(X), %xmm5
        addsd 6 * SIZE(X), %xmm6
        addsd 7 * SIZE(X), %xmm7

        movlpd %xmm0, (A1)
        addl INCY, A1
        movlpd %xmm1, (A1)
        addl INCY, A1
        movlpd %xmm2, (A1)
        addl INCY, A1
        movlpd %xmm3, (A1)
        addl INCY, A1
        movlpd %xmm4, (A1)
        addl INCY, A1
        movlpd %xmm5, (A1)
        addl INCY, A1
        movlpd %xmm6, (A1)
        addl INCY, A1
        movlpd %xmm7, (A1)
        addl INCY, A1

        addl $8 * SIZE, X
        decl %eax
        jg .L992
        ALIGN_3

.L994:
        testl $7, M
        jle .L999

        testl $4, M
        jle .L995

        movsd (Y1), %xmm0
        addl INCY, Y1
        movsd (Y1), %xmm1
        addl INCY, Y1
        movsd (Y1), %xmm2
        addl INCY, Y1
        movsd (Y1), %xmm3
        addl INCY, Y1

        addsd 0 * SIZE(X), %xmm0
        addsd 1 * SIZE(X), %xmm1
        addsd 2 * SIZE(X), %xmm2
        addsd 3 * SIZE(X), %xmm3

        movlpd %xmm0, (A1)
        addl INCY, A1
        movlpd %xmm1, (A1)
        addl INCY, A1
        movlpd %xmm2, (A1)
        addl INCY, A1
        movlpd %xmm3, (A1)
        addl INCY, A1

        addl $4 * SIZE, X
        ALIGN_3

.L995:
        testl $2, M
        jle .L996

        movsd (Y1), %xmm0
        addl INCY, Y1
        movsd (Y1), %xmm1
        addl INCY, Y1

        addsd 0 * SIZE(X), %xmm0
        addsd 1 * SIZE(X), %xmm1

        movlpd %xmm0, (A1)
        addl INCY, A1
        movlpd %xmm1, (A1)
        addl INCY, A1

        addl $2 * SIZE, X
        ALIGN_3

.L996:
        testl $1, M
        jle .L999

        movsd (Y1), %xmm0
        addsd 0 * SIZE(X), %xmm0
        movlpd %xmm0, (A1)
        ALIGN_3

.L999:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

EPILOGUE