You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zscal.S 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Register and constant aliases used by the kernel below.
   * ASSEMBLER is defined before including common.h; presumably it selects
   * the assembler-only parts of that header -- confirm in common.h.
   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* N: element count, tested with "sra N, 2" / "and N, 3" in the kernel. */
  40. #define N %i0
  /*
   * X is the vector pointer and INCX its stride.  Their registers depend on
   * the build: for DOUBLE without __64BIT__ the kernel reloads both from the
   * stack in its prologue; otherwise X is taken as %i5 and only INCX is
   * loaded from the stack.
   */
  41. #if defined(DOUBLE) && !defined(__64BIT__)
  42. #define X %i3
  43. #define INCX %i4
  44. #else
  45. #define X %i5
  46. #define INCX %i3
  47. #endif
  /* I: loop counter.  XX: store pointer used by the strided scaling loop. */
  48. #define I %i1
  49. #define XX %i2
  /*
   * Floating-point working registers.
   *   c1..c8   values loaded from X (real/imaginary parts)
   *   t1..t8   products in the contiguous pipelined loop
   *   c9..c16  products in the strided loop
   *   s1..s8   results in the contiguous pipelined loop
   * c9..c16 and s1..s8 name the SAME registers in both precisions, so the
   * two sets must never be live at the same time.
   * The DOUBLE variant uses even-numbered registers only.
   */
  50. #ifdef DOUBLE
  51. #define c1 %f0
  52. #define c2 %f2
  53. #define c3 %f4
  54. #define c4 %f6
  55. #define c5 %f8
  56. #define c6 %f10
  57. #define c7 %f12
  58. #define c8 %f14
  59. #define t1 %f16
  60. #define t2 %f18
  61. #define t3 %f20
  62. #define t4 %f22
  63. #define t5 %f24
  64. #define t6 %f26
  65. #define t7 %f28
  66. #define t8 %f30
  67. #define c9 %f32
  68. #define c10 %f34
  69. #define c11 %f36
  70. #define c12 %f38
  71. #define c13 %f40
  72. #define c14 %f42
  73. #define c15 %f44
  74. #define c16 %f46
  /* s1..s8 alias c9..c16 (same registers). */
  75. #define s1 %f32
  76. #define s2 %f34
  77. #define s3 %f36
  78. #define s4 %f38
  79. #define s5 %f40
  80. #define s6 %f42
  81. #define s7 %f44
  82. #define s8 %f46
  /* FZERO: compared against alpha and stored on the alpha == 0 path.
     ALPHA_R / ALPHA_I: real and imaginary parts of the scale factor. */
  83. #define FZERO %f48
  84. #define ALPHA_R %f50
  85. #define ALPHA_I %f52
  86. #else
  /* Single-precision variant: consecutive registers. */
  87. #define c1 %f0
  88. #define c2 %f1
  89. #define c3 %f2
  90. #define c4 %f3
  91. #define c5 %f4
  92. #define c6 %f5
  93. #define c7 %f6
  94. #define c8 %f7
  95. #define c9 %f8
  96. #define c10 %f9
  97. #define c11 %f10
  98. #define c12 %f11
  99. #define c13 %f12
  100. #define c14 %f13
  101. #define c15 %f14
  102. #define c16 %f15
  /* s1..s8 alias c9..c16 (same registers). */
  103. #define s1 %f8
  104. #define s2 %f9
  105. #define s3 %f10
  106. #define s4 %f11
  107. #define s5 %f12
  108. #define s6 %f13
  109. #define s7 %f14
  110. #define s8 %f15
  111. #define t1 %f16
  112. #define t2 %f17
  113. #define t3 %f18
  114. #define t4 %f19
  115. #define t5 %f20
  116. #define t6 %f21
  117. #define t7 %f22
  118. #define t8 %f23
  119. #define FZERO %f24
  120. #define ALPHA_R %f25
  121. #define ALPHA_I %f26
  122. #endif
  /* Prefetch distance; the kernel uses it as PREFETCHSIZE * SIZE past X. */
  123. #define PREFETCHSIZE 128
  /*
   * Complex vector scale kernel (file zscal.S).
   *
   * For each of N complex elements (re, im) found at X, X + INCX, ... the
   * general path computes in place
   *     re' = ALPHA_R * re - ALPHA_I * im
   *     im' = ALPHA_R * im + ALPHA_I * re
   * If ALPHA_R and ALPHA_I both compare equal to FZERO, the elements are
   * instead overwritten with FZERO without being read.
   * Every exit executes "clr %o0" in the delay slot of "return", so the
   * routine returns 0.
   *
   * Code layout:
   *   .LL11  / .LL16    alpha == 0, contiguous   (4 elements per pass / 1)
   *   .LL51  / .LL56    alpha == 0, strided      (4 elements per pass / 1)
   *   .LL111 / .LL116   general,    contiguous   (pipelined by 4 / 1)
   *   .LL151 / .LL156   general,    strided      (4 elements per pass / 1)
   *
   * Branches have a delay slot: the instruction written directly after a
   * branch is part of the branch and does useful work in many places below.
   * Do not reorder instructions around branches.
   *
   * PROLOGUE, SAVESP, STACK_START, SIZE, ZBASE_SHIFT, LDF, STF, FMOV, FCLR,
   * FCMP, FMUL, FADD, FSUB and EPILOGUE are not defined in this file;
   * presumably they come from common.h -- confirm there.
   */
  124. PROLOGUE
  125. SAVESP
  /*
   * Argument pickup.
   * Without __64BIT__: the integer registers are stored to the stack and
   * ALPHA_R / ALPHA_I are then loaded from those same slots (+16 and +24)
   * with LDF, i.e. alpha arrives in integer registers and is moved to FP
   * registers through memory.
   */
  126. #ifndef __64BIT__
  127. #ifdef DOUBLE
  /* DOUBLE: %i3..%i5 go to +16/+20/+24; X and INCX are read from +32/+36.
     NOTE(review): the LDF at +24 presumably reads 8 bytes, of which only the
     first word is written here -- confirm the second word is supplied by
     the caller's argument area. */
  128. st %i3, [%sp + STACK_START + 16]
  129. st %i4, [%sp + STACK_START + 20]
  130. st %i5, [%sp + STACK_START + 24]
  131. ld [%sp+ STACK_START + 32], X
  132. ld [%sp+ STACK_START + 36], INCX
  133. #else
  /* Single precision: %i3 -> +16, %i4 -> +24; INCX is read from +28. */
  134. st %i3, [%sp + STACK_START + 16]
  135. st %i4, [%sp + STACK_START + 24]
  136. ld [%sp+ STACK_START + 28], INCX
  137. #endif
  138. LDF [%sp + STACK_START + 16], ALPHA_R
  139. LDF [%sp + STACK_START + 24], ALPHA_I
  140. #else
  /* __64BIT__: INCX comes from the stack; alpha is copied from the FP
     registers %f6/%f8 (DOUBLE) or %f7/%f9 (single). */
  141. ldx [%sp + STACK_START + 56], INCX
  142. #ifdef DOUBLE
  143. FMOV %f6, ALPHA_R
  144. FMOV %f8, ALPHA_I
  145. #else
  146. FMOV %f7, ALPHA_R
  147. FMOV %f9, ALPHA_I
  148. #endif
  149. #endif
  /* Presumably zeroes FZERO (%f48 for DOUBLE, %f24 for single); FCLR takes a
     raw register number whose encoding is defined elsewhere -- confirm. */
  150. #ifdef DOUBLE
  151. FCLR(17)
  152. #else
  153. FCLR(24)
  154. #endif
  /*
   * Dispatch on alpha: if either part differs from FZERO go to the general
   * scaling code at .LL100.  The sll in the first delay slot runs on both
   * outcomes and shifts INCX left by ZBASE_SHIFT (presumably converting the
   * stride from complex elements to bytes -- confirm in common.h).
   */
  155. FCMP ALPHA_R, FZERO
  156. fbne .LL100
  157. sll INCX, ZBASE_SHIFT, INCX
  158. FCMP ALPHA_I, FZERO
  159. fbne .LL100
  160. nop
  /* ---- alpha == 0 ---- */
  /* A stride equal to 2 * SIZE is treated as contiguous storage; anything
     else takes the strided zero-fill at .LL50. */
  161. cmp INCX, 2 * SIZE
  162. bne .LL50
  163. nop
  /* I = N >> 2: number of passes that clear four complex elements each. */
  164. sra N, 2, I
  165. cmp I, 0
  166. ble,pn %icc, .LL15
  167. nop
  /* Contiguous zero-fill, 8 stores (4 complex elements) per pass.  X is
     advanced before the last two stores, hence the negative offsets; the
     final store sits in the branch delay slot. */
  168. .LL11:
  169. prefetch [X + PREFETCHSIZE * SIZE], 0
  170. STF FZERO, [X + 0 * SIZE]
  171. add I, -1, I
  172. STF FZERO, [X + 1 * SIZE]
  173. cmp I, 0
  174. STF FZERO, [X + 2 * SIZE]
  175. STF FZERO, [X + 3 * SIZE]
  176. STF FZERO, [X + 4 * SIZE]
  177. STF FZERO, [X + 5 * SIZE]
  178. add X, 8 * SIZE, X
  179. STF FZERO, [X - 2 * SIZE]
  180. bg,pt %icc, .LL11
  181. STF FZERO, [X - 1 * SIZE]
  /* Remainder: I = N & 3 elements, one complex element per pass. */
  182. .LL15:
  183. and N, 3, I
  184. cmp I, 0
  185. ble,a,pn %icc, .LL19
  186. nop
  187. .LL16:
  188. STF FZERO, [X + 0 * SIZE]
  189. STF FZERO, [X + 1 * SIZE]
  190. add I, -1, I
  191. cmp I, 0
  192. bg,pt %icc, .LL16
  /* delay slot: step to the next element */
  193. add X, 2 * SIZE, X
  194. .LL19:
  /* Return 0 (clr %o0 executes in the delay slot of return). */
  195. return %i7 + 8
  196. clr %o0
  /* ---- alpha == 0, strided: same structure, X advances by INCX ---- */
  197. .LL50:
  198. sra N, 2, I
  199. cmp I, 0
  200. ble,pn %icc, .LL55
  201. nop
  /* Four complex elements per pass; the last pointer bump is in the delay
     slot of the loop branch. */
  202. .LL51:
  203. STF FZERO, [X + 0 * SIZE]
  204. add I, -1, I
  205. STF FZERO, [X + 1 * SIZE]
  206. add X, INCX, X
  207. STF FZERO, [X + 0 * SIZE]
  208. cmp I, 0
  209. STF FZERO, [X + 1 * SIZE]
  210. add X, INCX, X
  211. STF FZERO, [X + 0 * SIZE]
  212. STF FZERO, [X + 1 * SIZE]
  213. add X, INCX, X
  214. STF FZERO, [X + 0 * SIZE]
  215. STF FZERO, [X + 1 * SIZE]
  216. bg,pt %icc, .LL51
  217. add X, INCX, X
  /* Remainder: N & 3 elements. */
  218. .LL55:
  219. and N, 3, I
  220. cmp I, 0
  221. ble,a,pn %icc, .LL59
  222. nop
  223. .LL56:
  224. STF FZERO, [X + 0 * SIZE]
  225. add I, -1, I
  226. STF FZERO, [X + 1 * SIZE]
  227. cmp I, 0
  228. bg,pt %icc, .LL56
  229. add X, INCX, X
  230. .LL59:
  231. return %i7 + 8
  232. clr %o0
  /* ---- general case: alpha != 0 (at least one part differs from FZERO) ---- */
  233. .LL100:
  234. cmp INCX, 2 * SIZE
  235. bne .LL150
  /* delay slot: I = N >> 2, computed whether or not the branch is taken */
  236. sra N, 2, I
  237. cmp I, 0
  238. ble,pn %icc, .LL115
  239. nop
  /*
   * Pipeline preamble for the contiguous loop: load the first group of four
   * complex elements (c1..c8), start the products for its first two
   * elements, and already reload c1..c4 from the following group
   * (X + 8..11).  deccc decrements I and sets the condition codes used by
   * the ble below.
   */
  240. LDF [X + 0 * SIZE], c1
  241. LDF [X + 1 * SIZE], c2
  242. LDF [X + 2 * SIZE], c3
  243. LDF [X + 3 * SIZE], c4
  244. LDF [X + 4 * SIZE], c5
  245. LDF [X + 5 * SIZE], c6
  246. LDF [X + 6 * SIZE], c7
  247. LDF [X + 7 * SIZE], c8
  248. FMUL ALPHA_R, c1, t1
  249. FMUL ALPHA_I, c2, t3
  250. FMUL ALPHA_I, c1, t2
  251. LDF [X + 8 * SIZE], c1
  252. FMUL ALPHA_R, c2, t4
  253. LDF [X + 9 * SIZE], c2
  254. FMUL ALPHA_R, c3, t5
  255. deccc I
  256. FMUL ALPHA_I, c4, t7
  257. FSUB t1, t3, s1
  258. FMUL ALPHA_I, c3, t6
  259. LDF [X + 10 * SIZE], c3
  260. FMUL ALPHA_R, c4, t8
  261. LDF [X + 11 * SIZE], c4
  262. FADD t4, t2, s2
  263. ble,pn %icc, .LL112
  264. nop
  /*
   * Steady state: each pass stores the four results for the group at X
   * (s1..s8 -> X + 0..7) while loading and multiplying data for the next
   * group (loads at X + 12..19).  X advances in the branch delay slot.
   *
   * NOTE(review): the loads run ahead of the stores.  On leaving the loop
   * (and in the preamble when only one group exists) c1..c4 hold values
   * fetched from the two complex elements that follow the last unrolled
   * group; .LL112 never uses them.  When (N & 3) < 2 some of those reads
   * address memory beyond element N - 1.  Nothing is written there, but the
   * read itself is outside the vector.
   */
  265. .LL111:
  266. prefetch [X + PREFETCHSIZE * SIZE], 0
  267. FMUL ALPHA_R, c5, t1
  268. FMUL ALPHA_I, c6, t3
  269. FSUB t5, t7, s3
  270. STF s1, [X + 0 * SIZE]
  271. FMUL ALPHA_I, c5, t2
  272. LDF [X + 12 * SIZE], c5
  273. FMUL ALPHA_R, c6, t4
  274. LDF [X + 13 * SIZE], c6
  275. FADD t8, t6, s4
  276. STF s2, [X + 1 * SIZE]
  277. FMUL ALPHA_R, c7, t5
  278. FMUL ALPHA_I, c8, t7
  279. FSUB t1, t3, s5
  280. STF s3, [X + 2 * SIZE]
  281. FMUL ALPHA_I, c7, t6
  282. LDF [X + 14 * SIZE], c7
  283. FMUL ALPHA_R, c8, t8
  284. LDF [X + 15 * SIZE], c8
  285. FADD t4, t2, s6
  286. STF s4, [X + 3 * SIZE]
  287. FMUL ALPHA_R, c1, t1
  288. FMUL ALPHA_I, c2, t3
  289. FSUB t5, t7, s7
  290. STF s5, [X + 4 * SIZE]
  291. FMUL ALPHA_I, c1, t2
  292. LDF [X + 16 * SIZE], c1
  293. FMUL ALPHA_R, c2, t4
  294. LDF [X + 17 * SIZE], c2
  295. FADD t8, t6, s8
  296. STF s6, [X + 5 * SIZE]
  297. FMUL ALPHA_R, c3, t5
  298. deccc I
  299. FMUL ALPHA_I, c4, t7
  300. FSUB t1, t3, s1
  301. STF s7, [X + 6 * SIZE]
  302. FMUL ALPHA_I, c3, t6
  303. LDF [X + 18 * SIZE], c3
  304. FMUL ALPHA_R, c4, t8
  305. LDF [X + 19 * SIZE], c4
  306. FADD t4, t2, s2
  307. STF s8, [X + 7 * SIZE]
  308. bg,pt %icc, .LL111
  309. add X, 8 * SIZE, X
  /* Pipeline drain: finish and store the last unrolled group using the
     values already in c5..c8, t5..t8, s1 and s2; no further loads. */
  310. .LL112:
  311. FMUL ALPHA_R, c5, t1
  312. FMUL ALPHA_I, c6, t3
  313. FSUB t5, t7, s3
  314. STF s1, [X + 0 * SIZE]
  315. FMUL ALPHA_I, c5, t2
  316. FMUL ALPHA_R, c6, t4
  317. FADD t8, t6, s4
  318. STF s2, [X + 1 * SIZE]
  319. FMUL ALPHA_R, c7, t5
  320. FMUL ALPHA_I, c8, t7
  321. FSUB t1, t3, s5
  322. STF s3, [X + 2 * SIZE]
  323. FMUL ALPHA_I, c7, t6
  324. FMUL ALPHA_R, c8, t8
  325. FADD t4, t2, s6
  326. STF s4, [X + 3 * SIZE]
  327. FSUB t5, t7, s7
  328. FADD t8, t6, s8
  329. STF s5, [X + 4 * SIZE]
  330. STF s6, [X + 5 * SIZE]
  331. STF s7, [X + 6 * SIZE]
  332. STF s8, [X + 7 * SIZE]
  333. add X, 8 * SIZE, X
  /* Remainder: N & 3 elements, one complex multiply per pass. */
  334. .LL115:
  335. and N, 3, I
  336. cmp I, 0
  337. ble,a,pn %icc, .LL119
  338. nop
  /* c3 = ar*re, c4 = ai*re, c1 = ai*im, c2 = ar*im;
     then re' = c3 - c1 and im' = c2 + c4. */
  339. .LL116:
  340. LDF [X + 0 * SIZE], c1
  341. LDF [X + 1 * SIZE], c2
  342. FMUL ALPHA_R, c1, c3
  343. FMUL ALPHA_I, c1, c4
  344. FMUL ALPHA_I, c2, c1
  345. FMUL ALPHA_R, c2, c2
  346. FSUB c3, c1, c1
  347. FADD c2, c4, c2
  348. STF c1, [X + 0 * SIZE]
  349. STF c2, [X + 1 * SIZE]
  350. add I, -1, I
  351. cmp I, 0
  352. bg,pt %icc, .LL116
  353. add X, 2 * SIZE, X
  354. .LL119:
  355. return %i7 + 8
  356. clr %o0
  /* ---- general case, strided ---- */
  357. .LL150:
  358. sra N, 2, I
  359. cmp I, 0
  360. ble,pn %icc, .LL155
  /* delay slot: XX = X.  X is the load pointer, XX the store pointer that
     trails it through the unrolled loop. */
  361. mov X, XX
  /*
   * Four complex elements per pass: all eight values are loaded through X
   * (stepping by INCX), multiplied into c9..c16 and back into c1..c8, and
   * the results are stored through XX.  After the loop X points at the
   * first remainder element.
   */
  362. .LL151:
  363. LDF [X + 0 * SIZE], c1
  364. LDF [X + 1 * SIZE], c2
  365. add X, INCX, X
  366. LDF [X + 0 * SIZE], c3
  367. FMUL ALPHA_R, c1, c9
  368. LDF [X + 1 * SIZE], c4
  369. FMUL ALPHA_I, c1, c10
  370. add X, INCX, X
  371. LDF [X + 0 * SIZE], c5
  372. FMUL ALPHA_I, c2, c1
  373. LDF [X + 1 * SIZE], c6
  374. FMUL ALPHA_R, c2, c2
  375. add X, INCX, X
  376. LDF [X + 0 * SIZE], c7
  377. FMUL ALPHA_R, c3, c11
  378. LDF [X + 1 * SIZE], c8
  379. FMUL ALPHA_I, c3, c12
  380. add X, INCX, X
  381. FMUL ALPHA_I, c4, c3
  382. FMUL ALPHA_R, c4, c4
  383. FMUL ALPHA_R, c5, c13
  384. FMUL ALPHA_I, c5, c14
  385. FMUL ALPHA_I, c6, c5
  386. FMUL ALPHA_R, c6, c6
  387. FMUL ALPHA_R, c7, c15
  388. FSUB c9, c1, c1
  389. FMUL ALPHA_I, c7, c16
  390. FADD c2, c10, c2
  391. FMUL ALPHA_I, c8, c7
  392. FSUB c11, c3, c3
  393. FMUL ALPHA_R, c8, c8
  394. FADD c4, c12, c4
  395. STF c1, [XX + 0 * SIZE]
  396. FSUB c13, c5, c5
  397. add I, -1, I
  398. STF c2, [XX + 1 * SIZE]
  399. FADD c6, c14, c6
  400. add XX, INCX, XX
  401. STF c3, [XX + 0 * SIZE]
  402. FSUB c15, c7, c7
  403. cmp I, 0
  404. STF c4, [XX + 1 * SIZE]
  405. FADD c8, c16, c8
  406. add XX, INCX, XX
  407. STF c5, [XX + 0 * SIZE]
  408. STF c6, [XX + 1 * SIZE]
  409. add XX, INCX, XX
  410. STF c7, [XX + 0 * SIZE]
  411. STF c8, [XX + 1 * SIZE]
  412. bg,pt %icc, .LL151
  413. add XX, INCX, XX
  /* Remainder: N & 3 elements, loaded and stored through X. */
  414. .LL155:
  415. and N, 3, I
  416. cmp I, 0
  417. ble,a,pn %icc, .LL159
  418. nop
  419. .LL156:
  420. LDF [X + 0 * SIZE], c1
  421. LDF [X + 1 * SIZE], c2
  422. FMUL ALPHA_R, c1, c3
  423. FMUL ALPHA_I, c1, c4
  424. FMUL ALPHA_I, c2, c1
  425. FMUL ALPHA_R, c2, c2
  426. FSUB c3, c1, c1
  427. FADD c2, c4, c2
  428. STF c1, [X + 0 * SIZE]
  429. STF c2, [X + 1 * SIZE]
  430. add I, -1, I
  431. cmp I, 0
  432. bg,pt %icc, .LL156
  433. add X, INCX, X
  434. .LL159:
  435. return %i7 + 8
  436. clr %o0
  437. EPILOGUE