You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

scal_sse2.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is pulled in (presumably so the
   * header exposes its assembler-side macros -- confirm in common.h). */
#define ASSEMBLER
#include "common.h"

/*
 * Stack layout.  STACK is the number of bytes occupied by the three
 * registers the kernel pushes on entry (%edi, %esi, %ebx); ARGS is an
 * additional displacement, zero here.  Each STACK_* macro expands to a
 * displacement off %esp that addresses one incoming argument once those
 * pushes have been done.  Offsets 8 and 12 are not read by this kernel.
 */
#define STACK           12
#define ARGS             0

#define STACK_M          4 + STACK + ARGS(%esp)
#define STACK_ALPHA     16 + STACK + ARGS(%esp)
#define STACK_X         24 + STACK + ARGS(%esp)
#define STACK_INCX      28 + STACK + ARGS(%esp)

/*
 * Register aliases used throughout the kernel.
 *   M     element count
 *   X     current read pointer into the vector
 *   INCX  stride (converted to bytes by the kernel before use)
 *   I     loop-trip counter
 *   XX    write pointer for the strided multiply loop
 */
#define M               %ebx
#define X               %ecx
#define INCX            %edx
#define I               %esi
#define XX              %edi

#include "l1param.h"
  /*
   * SSE2 scaling kernel for 32-bit x86:  x[i] = alpha * x[i]
   *
   * Inputs (read from the stack through the STACK_* macros):
   *   STACK_M      element count; nothing is done when it is <= 0
   *   STACK_ALPHA  scalar multiplier, loaded with movsd (64-bit)
   *   STACK_X      pointer to the first element
   *   STACK_INCX   stride in elements; scaled to bytes by SIZE below
   *
   * Output: %eax = 0.
   * Saved/restored: %edi, %esi, %ebx.  Scratch: %ecx, %edx, %xmm0-%xmm4.
   *
   * Behaviour worth knowing:
   *   - alpha == 0 stores zeros WITHOUT reading x, so NaN/Inf already in
   *     x are overwritten with 0.  alpha == NaN takes the multiply path.
   *   - The unit-stride paths use movaps, which requires 16-byte aligned
   *     addresses.  One leading element is peeled when X is 8 bytes off a
   *     16-byte boundary.  A vector that is not even SIZE-aligned (i386
   *     allows 4-byte aligned doubles) can never be brought to a 16-byte
   *     boundary that way, so it is routed to the scalar strided loops,
   *     which only use movsd and are equally valid for a unit stride.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, ALIGN_n, SIZE, PREFETCHW, PREFETCHSIZE
   * and PREOFFSET come from the included headers.  The `SIZE - 1` masks
   * below assume SIZE is a power of two (presumably 8 -- confirm in
   * common.h).
   */
        PROLOGUE
        PROFCODE

        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    STACK_M,     M
        movl    STACK_X,     X
        movl    STACK_INCX,  INCX
        movsd   STACK_ALPHA, %xmm0

        testl   M, M
        jle     .L999                   /* nothing to do for M <= 0 */

        leal    (, INCX, SIZE), INCX    /* stride: elements -> bytes */

        xorps   %xmm1, %xmm1            /* xmm1 = 0.0 */
        comisd  %xmm0, %xmm1
        jne     .L100                   /* alpha != 0 */
        jp      .L100                   /* unordered compare: alpha is NaN */

/* ------------------------------------------------------------------ */
/* alpha == 0 : fill x with zeros (xmm1 stays 0.0 on this whole path)  */
/* ------------------------------------------------------------------ */
        cmpl    $SIZE, INCX
        jne     .L50                    /* non-unit stride */

        /* Unit stride.  movaps below needs 16-byte alignment, and the
         * one-element peel can only fix an 8-byte offset, so anything
         * that is not SIZE-aligned goes to the scalar loop instead. */
        testl   $SIZE - 1, X
        jne     .L50

        testl   $15, X                  /* already on a 16-byte boundary? */
        je      .L05

        movsd   %xmm1, 0 * SIZE(X)      /* peel one element to align X */
        addl    $SIZE, X
        decl    M
        jle     .L999
        ALIGN_3

.L05:
        /* Aligned mode: 16 elements per iteration. */
        movl    M, I
        sarl    $4, I                   /* I = M / 16 */
        jle     .L12
        ALIGN_4

.L11:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

        movaps  %xmm1,  0 * SIZE(X)
        movaps  %xmm1,  2 * SIZE(X)
        movaps  %xmm1,  4 * SIZE(X)
        movaps  %xmm1,  6 * SIZE(X)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movaps  %xmm1,  8 * SIZE(X)
        movaps  %xmm1, 10 * SIZE(X)
        movaps  %xmm1, 12 * SIZE(X)
        movaps  %xmm1, 14 * SIZE(X)

        addl    $16 * SIZE, X
        decl    I
        jg      .L11
        ALIGN_4

.L12:
        /* Remainder (M mod 16), handled as 8 + 4 + 2 + 1 by bit tests. */
        testl   $15, M
        je      .L999

        testl   $8, M
        je      .L13

        movaps  %xmm1,  0 * SIZE(X)
        movaps  %xmm1,  2 * SIZE(X)
        movaps  %xmm1,  4 * SIZE(X)
        movaps  %xmm1,  6 * SIZE(X)
        addl    $8 * SIZE, X
        ALIGN_3

.L13:
        testl   $4, M
        je      .L14

        movaps  %xmm1,  0 * SIZE(X)
        movaps  %xmm1,  2 * SIZE(X)
        addl    $4 * SIZE, X
        ALIGN_3

.L14:
        testl   $2, M
        je      .L15

        movaps  %xmm1,  0 * SIZE(X)
        addl    $2 * SIZE, X
        ALIGN_3

.L15:
        testl   $1, M
        je      .L999

        movsd   %xmm1,  0 * SIZE(X)
        jmp     .L999
        ALIGN_4

.L50:
        /* alpha == 0, scalar stores: any stride, any alignment.
         * 8 elements per iteration. */
        movl    M, I
        sarl    $3, I                   /* I = M / 8 */
        jle     .L52
        ALIGN_4

.L51:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X

        decl    I
        jg      .L51
        ALIGN_4

.L52:
        /* Remainder (M mod 8), handled as 4 + 2 + 1. */
        testl   $7, M
        je      .L999

        testl   $4, M
        je      .L53

        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        ALIGN_3

.L53:
        testl   $2, M
        je      .L54

        movsd   %xmm1, (X)
        addl    INCX, X
        movsd   %xmm1, (X)
        addl    INCX, X
        ALIGN_3

.L54:
        testl   $1, M
        je      .L999

        movsd   %xmm1, (X)
        jmp     .L999
        ALIGN_4

/* ------------------------------------------------------------------ */
/* alpha != 0 (or NaN) : multiply every element by alpha               */
/* ------------------------------------------------------------------ */
.L100:
        unpcklpd %xmm0, %xmm0           /* xmm0 = { alpha, alpha } */

        cmpl    $SIZE, INCX
        jne     .L150                   /* non-unit stride */

        /* Same alignment guard as the zero-fill path: data that is not
         * SIZE-aligned cannot be made 16-byte aligned by the peel, so it
         * must not reach the movaps/mulpd-from-memory code below. */
        testl   $SIZE - 1, X
        jne     .L150

        testl   $SIZE, X                /* 8 bytes off a 16-byte boundary? */
        je      .L105

        movsd   0 * SIZE(X), %xmm1      /* peel one element to align X */
        mulsd   %xmm0, %xmm1
        movsd   %xmm1, 0 * SIZE(X)
        addl    $SIZE, X
        decl    M
        jle     .L999
        ALIGN_3

.L105:
        /* Bias X forward by 16 elements; everything up to .L116 addresses
         * the data with displacements relative to that bias.  (Adding via
         * subl of a negative immediate presumably keeps the constant
         * within a signed byte.) */
        subl    $-16 * SIZE, X

        movl    M, I
        sarl    $4, I                   /* I = M / 16 */
        jle     .L113

#if defined(BARCELONA) || defined(BULLDOZER)

        /* Variant that multiplies straight from memory: copy alpha into
         * the work register, then mulpd with a memory operand. */
        movaps  %xmm0, %xmm1
        mulpd   -16 * SIZE(X), %xmm1
        movaps  %xmm0, %xmm2
        mulpd   -14 * SIZE(X), %xmm2
        movaps  %xmm0, %xmm3
        mulpd   -12 * SIZE(X), %xmm3
        movaps  %xmm0, %xmm4
        mulpd   -10 * SIZE(X), %xmm4

        decl    I
        jle     .L112
        ALIGN_4

.L111:
        /* Software-pipelined body: store the products computed earlier
         * while computing the products for the next 16 elements. */
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

        movaps  %xmm1, -16 * SIZE(X)
        movaps  %xmm0, %xmm1
        mulpd    -8 * SIZE(X), %xmm1

        movaps  %xmm2, -14 * SIZE(X)
        movaps  %xmm0, %xmm2
        mulpd    -6 * SIZE(X), %xmm2

        movaps  %xmm3, -12 * SIZE(X)
        movaps  %xmm0, %xmm3
        mulpd    -4 * SIZE(X), %xmm3

        movaps  %xmm4, -10 * SIZE(X)
        movaps  %xmm0, %xmm4
        mulpd    -2 * SIZE(X), %xmm4

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movaps  %xmm1,  -8 * SIZE(X)
        movaps  %xmm0, %xmm1
        mulpd     0 * SIZE(X), %xmm1

        movaps  %xmm2,  -6 * SIZE(X)
        movaps  %xmm0, %xmm2
        mulpd     2 * SIZE(X), %xmm2

        movaps  %xmm3,  -4 * SIZE(X)
        movaps  %xmm0, %xmm3
        mulpd     4 * SIZE(X), %xmm3

        movaps  %xmm4,  -2 * SIZE(X)
        movaps  %xmm0, %xmm4
        mulpd     6 * SIZE(X), %xmm4

        subl    $-16 * SIZE, X
        decl    I
        jg      .L111
        ALIGN_4

.L112:
        /* Pipeline drain: finish the last group of 16 without reading
         * past it. */
        movaps  %xmm1, -16 * SIZE(X)
        movaps  %xmm0, %xmm1
        mulpd    -8 * SIZE(X), %xmm1

        movaps  %xmm2, -14 * SIZE(X)
        movaps  %xmm0, %xmm2
        mulpd    -6 * SIZE(X), %xmm2

        movaps  %xmm3, -12 * SIZE(X)
        movaps  %xmm0, %xmm3
        mulpd    -4 * SIZE(X), %xmm3

        movaps  %xmm4, -10 * SIZE(X)
        movaps  %xmm0, %xmm4
        mulpd    -2 * SIZE(X), %xmm4

        movaps  %xmm1,  -8 * SIZE(X)
        movaps  %xmm2,  -6 * SIZE(X)
        movaps  %xmm3,  -4 * SIZE(X)
        movaps  %xmm4,  -2 * SIZE(X)

#else

        /* Default variant: explicit load, multiply in register, store. */
        movaps  -16 * SIZE(X), %xmm1
        movaps  -14 * SIZE(X), %xmm2
        movaps  -12 * SIZE(X), %xmm3
        movaps  -10 * SIZE(X), %xmm4

        decl    I
        jle     .L112
        ALIGN_4

.L111:
        /* Software-pipelined body: scale and store the values loaded
         * earlier while loading the next ones. */
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

        mulpd   %xmm0, %xmm1
        movaps  %xmm1, -16 * SIZE(X)
        movaps   -8 * SIZE(X), %xmm1

        mulpd   %xmm0, %xmm2
        movaps  %xmm2, -14 * SIZE(X)
        movaps   -6 * SIZE(X), %xmm2

        mulpd   %xmm0, %xmm3
        movaps  %xmm3, -12 * SIZE(X)
        movaps   -4 * SIZE(X), %xmm3

        mulpd   %xmm0, %xmm4
        movaps  %xmm4, -10 * SIZE(X)
        movaps   -2 * SIZE(X), %xmm4

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        mulpd   %xmm0, %xmm1
        movaps  %xmm1,  -8 * SIZE(X)
        movaps    0 * SIZE(X), %xmm1

        mulpd   %xmm0, %xmm2
        movaps  %xmm2,  -6 * SIZE(X)
        movaps    2 * SIZE(X), %xmm2

        mulpd   %xmm0, %xmm3
        movaps  %xmm3,  -4 * SIZE(X)
        movaps    4 * SIZE(X), %xmm3

        mulpd   %xmm0, %xmm4
        movaps  %xmm4,  -2 * SIZE(X)
        movaps    6 * SIZE(X), %xmm4

        subl    $-16 * SIZE, X
        decl    I
        jg      .L111
        ALIGN_4

.L112:
        /* Pipeline drain: finish the last group of 16 without reading
         * past it. */
        mulpd   %xmm0, %xmm1
        movaps  %xmm1, -16 * SIZE(X)
        movaps   -8 * SIZE(X), %xmm1

        mulpd   %xmm0, %xmm2
        movaps  %xmm2, -14 * SIZE(X)
        movaps   -6 * SIZE(X), %xmm2

        mulpd   %xmm0, %xmm3
        movaps  %xmm3, -12 * SIZE(X)
        movaps   -4 * SIZE(X), %xmm3

        mulpd   %xmm0, %xmm4
        movaps  %xmm4, -10 * SIZE(X)
        movaps   -2 * SIZE(X), %xmm4

        mulpd   %xmm0, %xmm1
        movaps  %xmm1,  -8 * SIZE(X)
        mulpd   %xmm0, %xmm2
        movaps  %xmm2,  -6 * SIZE(X)
        mulpd   %xmm0, %xmm3
        movaps  %xmm3,  -4 * SIZE(X)
        mulpd   %xmm0, %xmm4
        movaps  %xmm4,  -2 * SIZE(X)

#endif

        subl    $-16 * SIZE, X          /* step past the drained group */
        ALIGN_3

.L113:
        /* Remainder (M mod 16), handled as 8 + 4 + 2 + 1.  X still carries
         * the 16-element bias, hence the -16 * SIZE based displacements. */
        testl   $15, M
        je      .L999

        testl   $8, M
        je      .L114

        movaps  -16 * SIZE(X), %xmm1
        movaps  -14 * SIZE(X), %xmm2
        movaps  -12 * SIZE(X), %xmm3
        movaps  -10 * SIZE(X), %xmm4

        mulpd   %xmm0, %xmm1
        movaps  %xmm1, -16 * SIZE(X)
        mulpd   %xmm0, %xmm2
        movaps  %xmm2, -14 * SIZE(X)
        mulpd   %xmm0, %xmm3
        movaps  %xmm3, -12 * SIZE(X)
        mulpd   %xmm0, %xmm4
        movaps  %xmm4, -10 * SIZE(X)
        addl    $8 * SIZE, X
        ALIGN_3

.L114:
        testl   $4, M
        je      .L115

        movaps  -16 * SIZE(X), %xmm1
        movaps  -14 * SIZE(X), %xmm2

        mulpd   %xmm0, %xmm1
        movaps  %xmm1, -16 * SIZE(X)
        mulpd   %xmm0, %xmm2
        movaps  %xmm2, -14 * SIZE(X)
        addl    $4 * SIZE, X
        ALIGN_3

.L115:
        testl   $2, M
        je      .L116

        movaps  -16 * SIZE(X), %xmm1
        mulpd   %xmm0, %xmm1
        movaps  %xmm1, -16 * SIZE(X)
        addl    $2 * SIZE, X
        ALIGN_3

.L116:
        testl   $1, M
        je      .L999

        movsd   -16 * SIZE(X), %xmm1
        mulsd   %xmm0, %xmm1
        movsd   %xmm1, -16 * SIZE(X)
        jmp     .L999
        ALIGN_3

.L150:
        /* alpha != 0, scalar multiply: any stride, any alignment.
         * X walks ahead as the read pointer, XX follows as the write
         * pointer; 4 elements per iteration. */
        movl    X, XX
        movl    M, I
        sarl    $2, I                   /* I = M / 4 */
        jle     .L152
        ALIGN_4

.L151:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movsd   (X), %xmm1
        addl    INCX, X
        movsd   (X), %xmm2
        addl    INCX, X
        movsd   (X), %xmm3
        addl    INCX, X
        movsd   (X), %xmm4
        addl    INCX, X

        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2
        mulsd   %xmm0, %xmm3
        mulsd   %xmm0, %xmm4

        movsd   %xmm1, (XX)
        addl    INCX, XX
        movsd   %xmm2, (XX)
        addl    INCX, XX
        movsd   %xmm3, (XX)
        addl    INCX, XX
        movsd   %xmm4, (XX)
        addl    INCX, XX

        decl    I
        jg      .L151
        ALIGN_4

.L152:
        /* Remainder (M mod 4), handled as 2 + 1. */
        testl   $2, M
        je      .L154

        movsd   (X), %xmm1
        addl    INCX, X
        movsd   (X), %xmm2
        addl    INCX, X

        mulsd   %xmm0, %xmm1
        mulsd   %xmm0, %xmm2

        movsd   %xmm1, (XX)
        addl    INCX, XX
        movsd   %xmm2, (XX)
        addl    INCX, XX
        ALIGN_3

.L154:
        testl   $1, M
        je      .L999

        /* X and XX point at the same element here, so the last value is
         * updated in place through X. */
        movsd   (X), %xmm1
        mulsd   %xmm0, %xmm1
        movsd   %xmm1, (X)
        ALIGN_4

.L999:
        /* Common exit: return 0 and restore the saved registers in the
         * reverse order of the pushes. */
        xorl    %eax, %eax

        popl    %ebx
        popl    %esi
        popl    %edi
        ret

        EPILOGUE