You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

scal_sse.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases used throughout this kernel.
   *
   *   M     element count; ARG1 under both ABIs
   *   X     address of the current element
   *   INCX  stride between elements
   *   XX    second cursor over X (store side of the strided multiply path)
   *   I     loop trip counter
   *
   * Under WINDOWS_ABI, X and INCX are loaded from the caller's stack in the
   * prologue, so ARG2/ARG3 are used for them there.
   */
#define M       ARG1

#ifdef WINDOWS_ABI
#define X       ARG2
#define INCX    ARG3
#else
#define X       ARG4
#define INCX    ARG5
#endif

#define XX      %r10
#define I       %rax
  51. #include "l1param.h"
  /*
   * Scale the M-element single-precision vector X in place by alpha.
   *
   * Inputs as used below:
   *   M      element count; the routine exits immediately when M <= 0
   *   X      address of the first element
   *   INCX   stride in elements on entry (converted to bytes here)
   *   %xmm0  alpha (copied from %xmm3 under WINDOWS_ABI)
   *   %r9    flag read from the caller's stack; the value 1 forces the
   *          multiply path even when alpha == 0
   * Result: 0 in %rax (set at .L999).
   *
   * Dispatch performed by this section:
   *   alpha != 0, alpha is NaN, or flag == 1  -> .L100  (X[i] *= alpha)
   *   otherwise                               -> fall through (X[i] = 0)
   */
          PROLOGUE
          PROFCODE

#ifdef WINDOWS_ABI
          /* X and INCX come from the caller's stack; alpha is in %xmm3. */
          movq    40(%rsp), X
          movq    48(%rsp), INCX
          /*
           * NOTE(review): confirm that 64(%rsp) here and 24(%rsp) in the
           * other branch address the same caller argument under their
           * respective calling conventions.
           */
          movq    64(%rsp), %r9
          movaps  %xmm3, %xmm0
#else
          movq    24(%rsp), %r9
#endif

          SAVEREGISTERS

          testq   M, M
          jle     .L999                   /* M <= 0: nothing to scale */

          lea     (, INCX, SIZE), INCX    /* stride: elements -> bytes */

          pxor    %xmm1, %xmm1            /* xmm1 = 0.0 in every lane */
          comiss  %xmm0, %xmm1            /* compare 0.0 with alpha */
          shufps  $0, %xmm0, %xmm0        /* broadcast alpha; flags are kept */

          jne     .L100                   /* alpha != 0 */
          /*
           * comiss reports "unordered" as ZF = PF = CF = 1, so a NaN alpha
           * is not caught by jne.  Without this branch a NaN alpha would be
           * treated as zero and X would be filled with 0.0 instead of NaN.
           */
          jp      .L100                   /* alpha is NaN */

          cmpq    $1, %r9
          je      .L100                   /* flag == 1: multiply anyway */
  /*
   * alpha == 0 and the flag in %r9 is not 1: overwrite X with zeros.
   * %xmm1 holds 0.0 in every lane on entry to this section.
   */
          cmpq    $SIZE, INCX
          jne     .L50                    /* non-unit stride */

  /* Unit stride.  For M <= 3 only the 2- and 1-element tails apply. */
          cmpq    $3, M
          jle     .L14

  /*
   * Peel elements until X is 16-byte aligned for the movaps stores.
   * Only address bits 2 and 3 are examined, so this relies on X being
   * 4-byte aligned on entry.
   */
          testq   $4, X                   /* bit 2 set: not 8-byte aligned */
          je      .L05

          movss   %xmm1, 0 * SIZE(X)      /* zero one element */
          addq    $SIZE, X
          decq    M
          jle     .L999
          ALIGN_3

  .L05:
          testq   $8, X                   /* bit 3 set: not 16-byte aligned */
          je      .L06

          movsd   %xmm1, 0 * SIZE(X)      /* zero two elements (8 bytes) */
          addq    $2 * SIZE, X
          subq    $2, M
          jle     .L999
          ALIGN_3

  /* Main loop: 16 elements (four 16-byte stores) per iteration. */
  .L06:
          movq    M, I
          sarq    $4, I                   /* I = M >> 4 */
          jle     .L12
          ALIGN_4

  .L11:
#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

          movaps  %xmm1, 0 * SIZE(X)
          movaps  %xmm1, 4 * SIZE(X)
          movaps  %xmm1, 8 * SIZE(X)
          movaps  %xmm1, 12 * SIZE(X)
          addq    $16 * SIZE, X
          decq    I
          jg      .L11
          ALIGN_4

  /* Remainder: the low four bits of M select 8, 4, 2 and 1 element stores. */
  .L12:
          testq   $15, M
          je      .L999

          testq   $8, M
          je      .L13

          movaps  %xmm1, 0 * SIZE(X)
          movaps  %xmm1, 4 * SIZE(X)
          addq    $8 * SIZE, X
          ALIGN_3

  .L13:
          testq   $4, M
          je      .L14

          movaps  %xmm1, 0 * SIZE(X)
          addq    $4 * SIZE, X
          ALIGN_3

  .L14:
          testq   $2, M
          je      .L15

          movsd   %xmm1, 0 * SIZE(X)
          addq    $2 * SIZE, X
          ALIGN_3

  .L15:
          testq   $1, M
          je      .L999

          movss   %xmm1, 0 * SIZE(X)
          jmp     .L999
          ALIGN_4
  /*
   * alpha == 0, non-unit stride: store 0.0 (from %xmm1) into one element
   * per step, advancing X by INCX bytes after each store.
   */
  .L50:
          movq    M, I
          sarq    $3, I                   /* I = M >> 3: passes of 8 elements */
          jle     .L52
          ALIGN_4

  /* Eight elements per iteration. */
  .L51:
#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X

          decq    I
          jg      .L51
          ALIGN_4

  /* Remainder: the low three bits of M select 4, 2 and 1 element tails. */
  .L52:
          testq   $7, M
          je      .L999

          testq   $4, M
          je      .L53

          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          ALIGN_3

  .L53:
          testq   $2, M
          je      .L54

          movss   %xmm1, (X)
          addq    INCX, X
          movss   %xmm1, (X)
          addq    INCX, X
          ALIGN_3

  .L54:
          testq   $1, M
          je      .L999

          movss   %xmm1, (X)
          jmp     .L999
          ALIGN_4
  /*
   * Multiply path: X[i] *= alpha.  %xmm0 holds alpha in all four lanes.
   * Reached when alpha != 0 or when the flag in %r9 equals 1.
   */
  .L100:
          cmpq    $SIZE, INCX
          jne     .L150                   /* non-unit stride */

  /*
   * Unit stride.  X is advanced by 32 elements up front and every access
   * in this section uses an offset relative to -32 * SIZE(X).
   * (Presumably this keeps the displacements small; not verifiable here.)
   */
          subq    $-32 * SIZE, X

          cmpq    $3, M
          jle     .L116                   /* M <= 3: 2- and 1-element tails */

  /* Peel one element, then two, to align X for the movaps accesses. */
          testq   $SIZE, X
          je      .L105

          movss   -32 * SIZE(X), %xmm1
          mulss   %xmm0, %xmm1
          movss   %xmm1, -32 * SIZE(X)
          addq    $SIZE, X
          decq    M
          jle     .L999
          ALIGN_3

  .L105:
          testq   $2 * SIZE, X
          je      .L110

          movsd   -32 * SIZE(X), %xmm1    /* two elements in the low 8 bytes */
          mulps   %xmm0, %xmm1
          movsd   %xmm1, -32 * SIZE(X)    /* only the low two lanes go back */
          addq    $2 * SIZE, X
          subq    $2, M
          jle     .L999
          ALIGN_3

  /*
   * Main loop: 32 elements per iteration, held in %xmm1-%xmm8.
   * The loop is software-pipelined: the first block is fetched before the
   * loop, each iteration writes back the current block while fetching the
   * next one, and .L112 drains the last block.
   */
  .L110:
          movq    M, I
          sarq    $5, I                   /* I = M >> 5 */
          jle     .L113

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)

  /* Variant 1: copy alpha, then multiply directly from memory. */
          movaps  %xmm0, %xmm1
          mulps   -32 * SIZE(X), %xmm1
          movaps  %xmm0, %xmm2
          mulps   -28 * SIZE(X), %xmm2
          movaps  %xmm0, %xmm3
          mulps   -24 * SIZE(X), %xmm3
          movaps  %xmm0, %xmm4
          mulps   -20 * SIZE(X), %xmm4
          movaps  %xmm0, %xmm5
          mulps   -16 * SIZE(X), %xmm5
          movaps  %xmm0, %xmm6
          mulps   -12 * SIZE(X), %xmm6
          movaps  %xmm0, %xmm7
          mulps   -8 * SIZE(X), %xmm7
          movaps  %xmm0, %xmm8
          mulps   -4 * SIZE(X), %xmm8

          decq    I
          jle     .L112
          ALIGN_4

  .L111:
#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

          /* Write back the first half, then compute the next first half. */
          movaps  %xmm1, -32 * SIZE(X)
          movaps  %xmm2, -28 * SIZE(X)
          movaps  %xmm3, -24 * SIZE(X)
          movaps  %xmm4, -20 * SIZE(X)

          movaps  %xmm0, %xmm1
          mulps   0 * SIZE(X), %xmm1
          movaps  %xmm0, %xmm2
          mulps   4 * SIZE(X), %xmm2
          movaps  %xmm0, %xmm3
          mulps   8 * SIZE(X), %xmm3
          movaps  %xmm0, %xmm4
          mulps   12 * SIZE(X), %xmm4

#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

          /* Same for the second half. */
          movaps  %xmm5, -16 * SIZE(X)
          movaps  %xmm6, -12 * SIZE(X)
          movaps  %xmm7, -8 * SIZE(X)
          movaps  %xmm8, -4 * SIZE(X)

          movaps  %xmm0, %xmm5
          mulps   16 * SIZE(X), %xmm5
          movaps  %xmm0, %xmm6
          mulps   20 * SIZE(X), %xmm6
          movaps  %xmm0, %xmm7
          mulps   24 * SIZE(X), %xmm7
          movaps  %xmm0, %xmm8
          mulps   28 * SIZE(X), %xmm8

          subq    $-32 * SIZE, X
          decq    I
          jg      .L111
          ALIGN_4

  /* Drain: write back the last computed block. */
  .L112:
          movaps  %xmm1, -32 * SIZE(X)
          movaps  %xmm2, -28 * SIZE(X)
          movaps  %xmm3, -24 * SIZE(X)
          movaps  %xmm4, -20 * SIZE(X)
          movaps  %xmm5, -16 * SIZE(X)
          movaps  %xmm6, -12 * SIZE(X)
          movaps  %xmm7, -8 * SIZE(X)
          movaps  %xmm8, -4 * SIZE(X)

#else

  /* Variant 2: load into registers first, multiply just before storing. */
          movaps  -32 * SIZE(X), %xmm1
          movaps  -28 * SIZE(X), %xmm2
          movaps  -24 * SIZE(X), %xmm3
          movaps  -20 * SIZE(X), %xmm4
          movaps  -16 * SIZE(X), %xmm5
          movaps  -12 * SIZE(X), %xmm6
          movaps  -8 * SIZE(X), %xmm7
          movaps  -4 * SIZE(X), %xmm8

          decq    I
          jle     .L112
          ALIGN_4

  .L111:
#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

          /* Per register: scale, store, then reload from the next block. */
          mulps   %xmm0, %xmm1
          movaps  %xmm1, -32 * SIZE(X)
          movaps  0 * SIZE(X), %xmm1

          mulps   %xmm0, %xmm2
          movaps  %xmm2, -28 * SIZE(X)
          movaps  4 * SIZE(X), %xmm2

          mulps   %xmm0, %xmm3
          movaps  %xmm3, -24 * SIZE(X)
          movaps  8 * SIZE(X), %xmm3

          mulps   %xmm0, %xmm4
          movaps  %xmm4, -20 * SIZE(X)
          movaps  12 * SIZE(X), %xmm4

#ifdef PREFETCHW
          PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

          mulps   %xmm0, %xmm5
          movaps  %xmm5, -16 * SIZE(X)
          movaps  16 * SIZE(X), %xmm5

          mulps   %xmm0, %xmm6
          movaps  %xmm6, -12 * SIZE(X)
          movaps  20 * SIZE(X), %xmm6

          mulps   %xmm0, %xmm7
          movaps  %xmm7, -8 * SIZE(X)
          movaps  24 * SIZE(X), %xmm7

          mulps   %xmm0, %xmm8
          movaps  %xmm8, -4 * SIZE(X)
          movaps  28 * SIZE(X), %xmm8

          subq    $-32 * SIZE, X
          decq    I
          jg      .L111
          ALIGN_4

  /* Drain: scale and write back the last loaded block. */
  .L112:
          mulps   %xmm0, %xmm1
          movaps  %xmm1, -32 * SIZE(X)
          mulps   %xmm0, %xmm2
          movaps  %xmm2, -28 * SIZE(X)
          mulps   %xmm0, %xmm3
          movaps  %xmm3, -24 * SIZE(X)
          mulps   %xmm0, %xmm4
          movaps  %xmm4, -20 * SIZE(X)

          mulps   %xmm0, %xmm5
          movaps  %xmm5, -16 * SIZE(X)
          mulps   %xmm0, %xmm6
          movaps  %xmm6, -12 * SIZE(X)
          mulps   %xmm0, %xmm7
          movaps  %xmm7, -8 * SIZE(X)
          mulps   %xmm0, %xmm8
          movaps  %xmm8, -4 * SIZE(X)

#endif

          subq    $-32 * SIZE, X          /* step past the drained block */
          ALIGN_3

  /* Remainder: the low five bits of M select 16, 8, 4, 2 and 1 element tails. */
  .L113:
          testq   $31, M
          je      .L999

          testq   $16, M
          je      .L114

          movaps  -32 * SIZE(X), %xmm1
          movaps  -28 * SIZE(X), %xmm3
          movaps  -24 * SIZE(X), %xmm5
          movaps  -20 * SIZE(X), %xmm7

          mulps   %xmm0, %xmm1
          movaps  %xmm1, -32 * SIZE(X)
          mulps   %xmm0, %xmm3
          movaps  %xmm3, -28 * SIZE(X)
          mulps   %xmm0, %xmm5
          movaps  %xmm5, -24 * SIZE(X)
          mulps   %xmm0, %xmm7
          movaps  %xmm7, -20 * SIZE(X)

          addq    $16 * SIZE, X
          ALIGN_3

  .L114:
          testq   $8, M
          je      .L115

          movaps  -32 * SIZE(X), %xmm1
          movaps  -28 * SIZE(X), %xmm3

          mulps   %xmm0, %xmm1
          movaps  %xmm1, -32 * SIZE(X)
          mulps   %xmm0, %xmm3
          movaps  %xmm3, -28 * SIZE(X)

          addq    $8 * SIZE, X
          ALIGN_3

  .L115:
          testq   $4, M
          je      .L116

          movaps  -32 * SIZE(X), %xmm1
          mulps   %xmm0, %xmm1
          movaps  %xmm1, -32 * SIZE(X)

          addq    $4 * SIZE, X
          ALIGN_3

  .L116:
          testq   $2, M
          je      .L117

          movsd   -32 * SIZE(X), %xmm1
          mulps   %xmm0, %xmm1
          movsd   %xmm1, -32 * SIZE(X)

          addq    $2 * SIZE, X
          ALIGN_3

  .L117:
          testq   $1, M
          je      .L999

          movss   -32 * SIZE(X), %xmm1
          mulss   %xmm0, %xmm1
          movss   %xmm1, -32 * SIZE(X)
          jmp     .L999
          ALIGN_3
  /*
   * Multiply path, non-unit stride.  X is the load cursor and XX the store
   * cursor; both step by INCX bytes.  Each group is loaded in full, scaled
   * by alpha (%xmm0), then stored.
   */
  .L150:
          movq    X, XX
          movq    M, I
          sarq    $3, I                   /* I = M >> 3: passes of 8 elements */
          jle     .L152
          ALIGN_4

  /* Eight elements per iteration, held in %xmm1-%xmm8. */
  .L151:
          movss   (X), %xmm1
          addq    INCX, X
          movss   (X), %xmm2
          addq    INCX, X
          movss   (X), %xmm3
          addq    INCX, X
          movss   (X), %xmm4
          addq    INCX, X
          movss   (X), %xmm5
          addq    INCX, X
          movss   (X), %xmm6
          addq    INCX, X
          movss   (X), %xmm7
          addq    INCX, X
          movss   (X), %xmm8
          addq    INCX, X

          mulss   %xmm0, %xmm1
          mulss   %xmm0, %xmm2
          mulss   %xmm0, %xmm3
          mulss   %xmm0, %xmm4
          mulss   %xmm0, %xmm5
          mulss   %xmm0, %xmm6
          mulss   %xmm0, %xmm7
          mulss   %xmm0, %xmm8

          movss   %xmm1, (XX)
          addq    INCX, XX
          movss   %xmm2, (XX)
          addq    INCX, XX
          movss   %xmm3, (XX)
          addq    INCX, XX
          movss   %xmm4, (XX)
          addq    INCX, XX
          movss   %xmm5, (XX)
          addq    INCX, XX
          movss   %xmm6, (XX)
          addq    INCX, XX
          movss   %xmm7, (XX)
          addq    INCX, XX
          movss   %xmm8, (XX)
          addq    INCX, XX

          decq    I
          jg      .L151
          ALIGN_4

  /* Remainder: the low three bits of M select 4, 2 and 1 element tails. */
  .L152:
          testq   $7, M
          je      .L999

          testq   $4, M
          je      .L153

          movss   (X), %xmm1
          addq    INCX, X
          movss   (X), %xmm2
          addq    INCX, X
          movss   (X), %xmm3
          addq    INCX, X
          movss   (X), %xmm4
          addq    INCX, X

          mulss   %xmm0, %xmm1
          mulss   %xmm0, %xmm2
          mulss   %xmm0, %xmm3
          mulss   %xmm0, %xmm4

          movss   %xmm1, (XX)
          addq    INCX, XX
          movss   %xmm2, (XX)
          addq    INCX, XX
          movss   %xmm3, (XX)
          addq    INCX, XX
          movss   %xmm4, (XX)
          addq    INCX, XX
          ALIGN_3

  .L153:
          testq   $2, M
          je      .L154

          movss   (X), %xmm1
          addq    INCX, X
          movss   (X), %xmm2
          addq    INCX, X

          mulss   %xmm0, %xmm1
          mulss   %xmm0, %xmm2

          movss   %xmm1, (XX)
          addq    INCX, XX
          movss   %xmm2, (XX)
          addq    INCX, XX
          ALIGN_3

  /*
   * Last element.  X and XX have been advanced by the same number of
   * steps, so they are equal here and the result is stored through X.
   * Execution then falls through to .L999.
   */
  .L154:
          testq   $1, M
          je      .L999

          movss   (X), %xmm1
          mulss   %xmm0, %xmm1
          movss   %xmm1, (X)
          ALIGN_4
  /* Common exit for every path: return 0 and undo SAVEREGISTERS. */
  .L999:
          xorl    %eax, %eax              /* rax = 0; a 32-bit write clears the upper half */

          RESTOREREGISTERS

          ret

          EPILOGUE