You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zscal.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h; presumably it selects the */
  /* assembler-side definitions of that header (PROLOGUE, SIZE, LDFD, ...) -- */
  /* confirm against common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance. The code below starts the prefetch pointer PRE at */
  /* X1 + PREFETCH_SIZE * SIZE; the value depends on the precision being built. */
  40. #ifdef XDOUBLE
  41. #define PREFETCH_SIZE ( 8 * 16)
  42. #elif defined(DOUBLE)
  43. #define PREFETCH_SIZE (16 * 16)
  44. #else
  45. #define PREFETCH_SIZE (32 * 16)
  46. #endif
  /* r12 is used as the base for reading X1 and INCX from memory (offsets 16 */
  /* and 24) in the XDOUBLE build. */
  47. #define SP r12
  /* N: element count. X1: pointer to the vector (also the store pointer). */
  /* INCX: stride. In the XDOUBLE build X1/INCX live in r14/r15 and are loaded */
  /* from memory at entry; otherwise they are read directly from r37/r38. */
  48. #ifdef XDOUBLE
  49. #define N r32
  50. #define X1 r14
  51. #define INCX r15
  52. #else
  53. #define N r32
  54. #define X1 r37
  55. #define INCX r38
  56. #endif
  /* X2: second store pointer, initialised to X1 + 2 * INCX (zero-fill path). */
  57. #define X2 r16
  /* Y1: load pointer, initialised as a copy of X1 (scaling path). */
  58. #define Y1 r17
  /* INCX3: 3 * INCX, later reduced by SIZE together with INCX. */
  59. #define INCX3 r18
  /* PRE: prefetch pointer, advanced by INCX8 per main-loop iteration. */
  60. #define PRE r19
  /* INCX8: 8 * INCX, the distance covered by one main-loop iteration. */
  61. #define INCX8 r20
  /* I: (N >> 3) - 1, copied into ar.lc as the main-loop count. */
  62. #define I r29
  /* J: N & 7, the number of leftover elements after the main loop. */
  63. #define J r28
  /* PR: copy of the predicate registers taken at entry, restored before return. */
  64. #define PR r30
  /* ARLC: copy of ar.lc taken at entry, restored before return. */
  65. #define ARLC r31
  /* Real and imaginary parts of the scale factor. */
  66. #define ALPHA_R f8
  67. #define ALPHA_I f9
  /*
   * Complex vector scale, in place.
   *
   * Inputs:  N (element count), ALPHA_R / ALPHA_I (scale factor), X1 (vector),
   *          INCX (stride, shifted left by ZBASE_SHIFT below -- presumably
   *          converting an element stride to a byte stride; confirm in common.h).
   *
   * Behaviour visible in this file:
   *   - N <= 0: returns immediately.
   *   - ALPHA_R and ALPHA_I both compare equal to 0.0: every element is
   *     overwritten with f0 without being read (.L10 / .L20).
   *     NOTE(review): this means NaN/Inf already in x are replaced by zero
   *     rather than propagated -- confirm this is the intended contract.
   *   - otherwise: each element (re, im) becomes
   *       (ALPHA_R * re - ALPHA_I * im, ALPHA_R * im + ALPHA_I * re)
   *     (.L160 / .L170), assuming FMPY / FMA / FMS from common.h expand to
   *     multiply, multiply-add and multiply-subtract.
   *
   * Both paths handle 8 elements per main-loop iteration and then use bits
   * 2, 1 and 0 of N (p13, p14, p15) for the remaining 4 / 2 / 1 elements.
   * The predicate registers and ar.lc are saved at entry and restored before
   * each return.
   */
  68. PROLOGUE
  69. .prologue
  70. PROFCODE
  /* r22 / r23 = addresses SP + 16 and SP + 24 (read only in the XDOUBLE build); */
  /* PR = snapshot of the predicate registers. */
  71. {.mmi
  72. adds r22 = 16, SP
  73. adds r23 = 24, SP
  74. mov PR = pr
  75. }
  /* p7 = (0 >= N): nothing to do, return. I = N >> 3 (number of 8-element groups). */
  76. { .mib
  77. cmp.ge p7, p0 = 0, N
  78. shr I = N, 3
  79. (p7) br.ret.sptk.many b0
  80. }
  81. ;;
  82. #ifdef XDOUBLE
  /* XDOUBLE build: fetch the vector pointer and stride from memory. */
  83. { .mmi
  84. ld8 X1 = [r22]
  85. ld8 INCX = [r23]
  86. nop __LINE__
  87. }
  88. ;;
  89. #endif
  /* J = N & 7 (leftover count). p11 is the complement result of the compare, */
  /* i.e. p11 = !(ALPHA_I == 0.0). ar.lc is saved in ARLC. */
  90. { .mfi
  91. and J = 7, N
  92. fcmp.eq p0, p11 = ALPHA_I, f0
  93. .save ar.lc, ARLC
  94. mov ARLC = ar.lc
  95. }
  /* I = (N >> 3) - 1 (loop-count form); p10 = !(ALPHA_R == 0.0); scale INCX. */
  96. { .mfi
  97. adds I = -1, I
  98. fcmp.eq p0, p10 = ALPHA_R, f0
  99. shl INCX = INCX, ZBASE_SHIFT
  100. }
  101. ;;
  102. .body
  /* INCX8 = 8 * INCX, X2 = X1 + 2 * INCX, and clear the rotating predicates. */
  103. { .mmi
  104. shladd INCX8 = INCX, 3, r0
  105. shladd X2 = INCX, 1, X1
  106. mov pr.rot= 0
  107. }
  /* INCX3 = 3 * INCX, PRE = prefetch pointer ahead of X1, Y1 = load pointer. */
  108. { .mmi
  109. shladd INCX3 = INCX, 1, INCX
  110. adds PRE = PREFETCH_SIZE * SIZE, X1
  111. mov Y1 = X1
  112. }
  113. ;;
  /* p8 = (0 > I): fewer than 8 elements, skip the main loop. */
  /* p9 = (0 >= J): no leftover elements. ar.lc = I for the counted loops. */
  114. { .mmi
  115. cmp.gt p8, p0 = 0, I
  116. cmp.ge p9, p0 = 0, J
  117. mov ar.lc = I
  118. }
  /* The first access to each element post-increments its pointer by 1 * SIZE, */
  /* so the strides used to reach the next element are shortened by SIZE. */
  /* p13 = bit 2 of N (a group of 4 leftover elements). */
  119. { .mmi
  120. adds INCX = -1 * SIZE, INCX
  121. adds INCX3 = -1 * SIZE, INCX3
  122. tbit.z p0, p13 = N, 2
  123. }
  124. ;;
  /* Dispatch: any alpha component not equal to zero -> scaling path (.L100). */
  /* Otherwise fall into the zero-fill loop, or go straight to its tail if N < 8. */
  125. { .bbb
  126. (p10) br.cond.dptk .L100
  127. (p11) br.cond.dptk .L100
  128. (p8) br.cond.dpnt .L20
  129. }
  130. ;;
  131. .align 32
  /*
   * Zero-fill main loop: 8 elements per iteration, counted by ar.lc.
   * X1 writes elements 0, 1, 4, 5 of the group and X2 writes 2, 3, 6, 7;
   * after each pair the pointer steps by INCX3 (+SIZE already consumed) to
   * skip the two elements handled by the other pointer.
   */
  132. .L10:
  133. { .mmb
  134. STFD [X1] = f0, 1 * SIZE
  135. STFD [X2] = f0, 1 * SIZE
  136. nop.b 0
  137. }
  138. { .mmb
  139. lfetch.excl.nt1 [PRE], INCX8
  140. nop.m 0
  141. }
  142. ;;
  143. { .mmb
  144. STFD [X1] = f0
  145. add X1 = INCX, X1
  146. }
  147. { .mmb
  148. STFD [X2] = f0
  149. add X2 = INCX, X2
  150. }
  151. ;;
  152. { .mmb
  153. STFD [X1] = f0, 1 * SIZE
  154. STFD [X2] = f0, 1 * SIZE
  155. nop.b 0
  156. }
  157. ;;
  158. { .mmb
  159. STFD [X1] = f0
  160. add X1 = INCX3, X1
  161. }
  162. { .mmb
  163. STFD [X2] = f0
  164. add X2 = INCX3, X2
  165. }
  166. ;;
  167. { .mmb
  168. STFD [X1] = f0, 1 * SIZE
  169. STFD [X2] = f0, 1 * SIZE
  170. nop.b 0
  171. }
  172. ;;
  173. { .mmb
  174. STFD [X1] = f0
  175. add X1 = INCX, X1
  176. }
  177. { .mmb
  178. STFD [X2] = f0
  179. add X2 = INCX, X2
  180. }
  181. ;;
  182. { .mmb
  183. STFD [X1] = f0, 1 * SIZE
  184. STFD [X2] = f0, 1 * SIZE
  185. nop.b 0
  186. }
  187. ;;
  188. { .mmb
  189. STFD [X1] = f0
  190. add X1 = INCX3, X1
  191. }
  192. { .mmb
  193. STFD [X2] = f0
  194. add X2 = INCX3, X2
  195. br.cloop.sptk.few .L10
  196. }
  197. ;;
  198. .align 32
  /*
   * Zero-fill tail: p13 -> 4 elements (X1: 0, 1; X2: 2, 3), then p14 -> 2
   * elements and p15 -> 1 element through X1 alone. ar.lc is restored first.
   */
  199. .L20:
  200. { .mmi
  201. (p13) STFD [X1] = f0, 1 * SIZE
  202. (p13) STFD [X2] = f0, 1 * SIZE
  203. mov ar.lc = ARLC
  204. }
  205. ;;
  /* p14 = bit 1 of N, p15 = bit 0 of N. */
  206. { .mmi
  207. (p13) STFD [X1] = f0
  208. (p13) add X1 = INCX, X1
  209. tbit.z p0, p14 = N, 1
  210. }
  211. { .mmi
  212. (p13) STFD [X2] = f0
  213. (p13) add X2 = INCX, X2
  214. tbit.z p0, p15 = N, 0
  215. }
  216. ;;
  217. { .mmb
  218. (p13) STFD [X1] = f0, 1 * SIZE
  219. (p13) STFD [X2] = f0, 1 * SIZE
  220. nop.b 0
  221. }
  /* Restore saved predicates under mask -65474 (bits 1-5 and 16 and above set, */
  /* so p6-p15, including p9 and p13-p15 used here, are not overwritten). */
  /* Return now if there are no leftover elements (p9). */
  222. { .mib
  223. nop.m 0
  224. mov pr = PR, -65474
  225. (p9) br.ret.sptk.many b0
  226. }
  227. ;;
  228. { .mmb
  229. (p13) STFD [X1] = f0
  230. (p13) add X1 = INCX3, X1
  231. }
  232. { .mmb
  233. (p13) STFD [X2] = f0
  234. (p13) add X2 = INCX3, X2
  235. }
  236. ;;
  /* Two leftover elements. */
  237. (p14) STFD [X1] = f0, 1 * SIZE
  238. ;;
  239. { .mmb
  240. (p14) STFD [X1] = f0
  241. (p14) add X1 = INCX, X1
  242. }
  243. ;;
  244. (p14) STFD [X1] = f0, 1 * SIZE
  245. ;;
  246. { .mmb
  247. (p14) STFD [X1] = f0
  248. (p14) add X1 = INCX, X1
  249. }
  250. ;;
  /* One leftover element, then restore predicates and return. */
  251. (p15) STFD [X1] = f0, 1 * SIZE
  252. ;;
  253. { .mib
  254. (p15) STFD [X1] = f0
  255. mov pr = PR, -65474
  256. br.ret.sptk.many b0
  257. }
  258. ;;
  259. .align 32
  /*
   * Scaling path. Sets p16 (first pipeline stage) and an epilog count of 6,
   * then skips the pipelined loop when N < 8 (p8).
   */
  260. .L100:
  261. cmp.eq p16, p0 = r0, r0
  262. mov.i ar.ec = 6
  263. (p8) br.cond.dpnt .L170
  264. ;;
  265. .align 32
  /*
   * Software-pipelined main loop (br.ctop, rotating registers), 8 elements per
   * iteration. Stages are selected by rotating predicates:
   *   p16: load re/im of the 8 elements through Y1 (f32, f38, ..., f122) and
   *        issue the prefetch;
   *   p20: first half of the arithmetic on values loaded four rotations
   *        earlier (e.g. f36 / f42 are the re / im loaded as f32 / f38):
   *        ALPHA_I * im into a scratch register (f6, f7, f10-f15) and
   *        ALPHA_R * im back into the im register, followed for elements 0-3
   *        by the FMS / FMA that complete re' and im';
   *   p21: complete elements 4-7 (FMS / FMA on f85, f97, f109, f121) and store
   *        all 8 results through X1.
   * Loads and stores use separate pointers (Y1 / X1) because stores trail the
   * loads by several iterations.
   */
  266. .L160:
  267. { .mmf
  268. (p21) STFD [X1] = f6, 1 * SIZE
  269. (p16) lfetch.excl.nt1 [PRE], INCX8
  270. (p21) FMS f12 = ALPHA_R, f85, f12
  271. }
  272. { .mfb
  273. (p16) LDFD f32 = [Y1], 1 * SIZE
  274. (p20) FMPY f6 = ALPHA_I, f42
  275. }
  276. ;;
  277. { .mmf
  278. (p21) STFD [X1] = f43
  279. (p21) add X1 = INCX, X1
  280. (p21) FMA f91 = ALPHA_I, f85, f91
  281. }
  282. { .mfb
  283. (p16) LDFD f38 = [Y1], INCX
  284. (p20) FMPY f42 = ALPHA_R, f42
  285. }
  286. ;;
  287. { .mmf
  288. (p21) STFD [X1] = f7, 1 * SIZE
  289. (p21) FMS f13 = ALPHA_R, f97, f13
  290. }
  291. { .mfb
  292. (p16) LDFD f44 = [Y1], 1 * SIZE
  293. (p20) FMPY f7 = ALPHA_I, f54
  294. }
  295. ;;
  296. { .mmf
  297. (p21) STFD [X1] = f55
  298. (p21) add X1 = INCX, X1
  299. (p21) FMA f103 = ALPHA_I, f97, f103
  300. }
  301. { .mfb
  302. (p16) LDFD f50 = [Y1], INCX
  303. (p20) FMPY f54 = ALPHA_R, f54
  304. }
  305. ;;
  306. { .mmf
  307. (p21) STFD [X1] = f10, 1 * SIZE
  308. (p21) FMS f14 = ALPHA_R, f109, f14
  309. }
  310. { .mfb
  311. (p16) LDFD f56 = [Y1], 1 * SIZE
  312. (p20) FMPY f10 = ALPHA_I, f66
  313. }
  314. ;;
  315. { .mmf
  316. (p21) STFD [X1] = f67
  317. (p21) add X1 = INCX, X1
  318. (p21) FMA f115 = ALPHA_I, f109, f115
  319. }
  320. { .mfb
  321. (p16) LDFD f62 = [Y1], INCX
  322. (p20) FMPY f66 = ALPHA_R, f66
  323. }
  324. ;;
  325. { .mmf
  326. (p21) STFD [X1] = f11, 1 * SIZE
  327. (p21) FMS f15 = ALPHA_R, f121, f15
  328. }
  329. { .mfb
  330. (p16) LDFD f68 = [Y1], 1 * SIZE
  331. (p20) FMPY f11 = ALPHA_I, f78
  332. }
  333. ;;
  334. { .mmf
  335. (p21) STFD [X1] = f79
  336. (p21) add X1 = INCX, X1
  337. (p21) FMA f127 = ALPHA_I, f121, f127
  338. }
  339. { .mfb
  340. (p16) LDFD f74 = [Y1], INCX
  341. (p20) FMPY f78 = ALPHA_R, f78
  342. }
  343. ;;
  /* Second half of the iteration: store elements 4-7 (p21), load elements 4-7 */
  /* (p16), and finish re' / im' for elements 0-3 of the p20 stage. */
  344. { .mmf
  345. (p21) STFD [X1] = f12, 1 * SIZE
  346. (p20) FMS f6 = ALPHA_R, f36, f6
  347. }
  348. { .mfb
  349. (p16) LDFD f80 = [Y1], 1 * SIZE
  350. (p20) FMPY f12 = ALPHA_I, f90
  351. }
  352. ;;
  353. { .mmf
  354. (p21) STFD [X1] = f91
  355. (p21) add X1 = INCX, X1
  356. (p20) FMA f42 = ALPHA_I, f36, f42
  357. }
  358. { .mfb
  359. (p16) LDFD f86 = [Y1], INCX
  360. (p20) FMPY f90 = ALPHA_R, f90
  361. }
  362. ;;
  363. { .mmf
  364. (p21) STFD [X1] = f13, 1 * SIZE
  365. (p20) FMS f7 = ALPHA_R, f48, f7
  366. }
  367. { .mfb
  368. (p16) LDFD f92 = [Y1], 1 * SIZE
  369. (p20) FMPY f13 = ALPHA_I, f102
  370. }
  371. ;;
  372. { .mmf
  373. (p21) STFD [X1] = f103
  374. (p21) add X1 = INCX, X1
  375. (p20) FMA f54 = ALPHA_I, f48, f54
  376. }
  377. { .mfb
  378. (p16) LDFD f98 = [Y1], INCX
  379. (p20) FMPY f102 = ALPHA_R, f102
  380. }
  381. ;;
  382. { .mmf
  383. (p21) STFD [X1] = f14, 1 * SIZE
  384. (p20) FMS f10 = ALPHA_R, f60, f10
  385. }
  386. { .mfb
  387. (p16) LDFD f104 = [Y1], 1 * SIZE
  388. (p20) FMPY f14 = ALPHA_I, f114
  389. }
  390. ;;
  391. { .mmf
  392. (p21) STFD [X1] = f115
  393. (p21) add X1 = INCX, X1
  394. (p20) FMA f66 = ALPHA_I, f60, f66
  395. }
  396. { .mfb
  397. (p16) LDFD f110 = [Y1], INCX
  398. (p20) FMPY f114 = ALPHA_R, f114
  399. }
  400. ;;
  401. { .mmf
  402. (p21) STFD [X1] = f15, 1 * SIZE
  403. (p20) FMS f11 = ALPHA_R, f72, f11
  404. }
  405. { .mfb
  406. (p16) LDFD f116 = [Y1], 1 * SIZE
  407. (p20) FMPY f15 = ALPHA_I, f126
  408. }
  409. ;;
  410. { .mmf
  411. (p21) STFD [X1] = f127
  412. (p21) add X1 = INCX, X1
  413. (p20) FMA f78 = ALPHA_I, f72, f78
  414. }
  415. { .mfb
  416. (p16) LDFD f122 = [Y1], INCX
  417. (p20) FMPY f126 = ALPHA_R, f126
  418. br.ctop.sptk.few .L160
  419. }
  420. ;;
  421. .align 16
  /*
   * Scaling tail (not pipelined). Loads the leftover elements through Y1:
   * p13 -> 4 elements in f48-f55, p14 -> 2 elements in f56-f59, p15 -> 1
   * element in f60-f61 (even register = re, odd = im). ar.lc is restored first.
   */
  422. .L170:
  423. { .mmi
  424. (p13) LDFD f48 = [Y1], 1 * SIZE
  425. mov ar.lc = ARLC
  426. }
  427. ;;
  /* Restore saved predicates (same mask as above, p6-p15 untouched) and */
  /* return if there are no leftover elements (p9). */
  428. { .mib
  429. (p13) LDFD f49 = [Y1], INCX
  430. mov pr = PR, -65474
  431. (p9) br.ret.sptk.many b0
  432. }
  433. ;;
  /* p14 = bit 1 of N, p15 = bit 0 of N. */
  434. (p13) LDFD f50 = [Y1], 1 * SIZE
  435. tbit.z p0, p14 = N, 1
  436. ;;
  437. (p13) LDFD f51 = [Y1], INCX
  438. tbit.z p0, p15 = N, 0
  439. ;;
  440. (p13) LDFD f52 = [Y1], 1 * SIZE
  441. ;;
  442. (p13) LDFD f53 = [Y1], INCX
  443. ;;
  /* Cross products ALPHA_I * re -> even temp (f112, f114, ...) and */
  /* ALPHA_I * im -> odd temp (f111, f113, ...), overlapped with the loads. */
  444. (p13) LDFD f54 = [Y1], 1 * SIZE
  445. (p13) FMPY f112 = ALPHA_I, f48
  446. ;;
  447. (p13) LDFD f55 = [Y1], INCX
  448. (p13) FMPY f111 = ALPHA_I, f49
  449. ;;
  450. (p14) LDFD f56 = [Y1], 1 * SIZE
  451. (p13) FMPY f114 = ALPHA_I, f50
  452. ;;
  453. (p14) LDFD f57 = [Y1], INCX
  454. (p13) FMPY f113 = ALPHA_I, f51
  455. ;;
  456. (p14) LDFD f58 = [Y1], 1 * SIZE
  457. (p13) FMPY f116 = ALPHA_I, f52
  458. ;;
  459. (p14) LDFD f59 = [Y1], INCX
  460. (p13) FMPY f115 = ALPHA_I, f53
  461. ;;
  462. (p15) LDFD f60 = [Y1], 1 * SIZE
  463. (p13) FMPY f118 = ALPHA_I, f54
  464. ;;
  465. (p15) LDFD f61 = [Y1], INCX
  466. (p13) FMPY f117 = ALPHA_I, f55
  467. ;;
  468. (p14) FMPY f120 = ALPHA_I, f56
  469. (p14) FMPY f119 = ALPHA_I, f57
  470. (p14) FMPY f122 = ALPHA_I, f58
  471. (p14) FMPY f121 = ALPHA_I, f59
  472. (p15) FMPY f124 = ALPHA_I, f60
  473. (p15) FMPY f123 = ALPHA_I, f61
  474. ;;
  /* re' = ALPHA_R * re - ALPHA_I * im (FMS), im' = ALPHA_R * im + ALPHA_I * re (FMA). */
  475. (p13) FMS f48 = ALPHA_R, f48, f111
  476. (p13) FMA f49 = ALPHA_R, f49, f112
  477. (p13) FMS f50 = ALPHA_R, f50, f113
  478. (p13) FMA f51 = ALPHA_R, f51, f114
  479. ;;
  /* Store results through X1 while the remaining FMS / FMA complete. */
  480. (p13) STFD [X1] = f48, 1 * SIZE
  481. (p13) FMS f52 = ALPHA_R, f52, f115
  482. ;;
  483. (p13) STFD [X1] = f49
  484. (p13) add X1 = INCX, X1
  485. (p13) FMA f53 = ALPHA_R, f53, f116
  486. ;;
  487. (p13) STFD [X1] = f50, 1 * SIZE
  488. (p13) FMS f54 = ALPHA_R, f54, f117
  489. ;;
  490. (p13) STFD [X1] = f51
  491. (p13) add X1 = INCX, X1
  492. (p13) FMA f55 = ALPHA_R, f55, f118
  493. ;;
  494. (p13) STFD [X1] = f52, 1 * SIZE
  495. (p14) FMS f56 = ALPHA_R, f56, f119
  496. ;;
  497. (p13) STFD [X1] = f53
  498. (p13) add X1 = INCX, X1
  499. (p14) FMA f57 = ALPHA_R, f57, f120
  500. ;;
  501. (p13) STFD [X1] = f54, 1 * SIZE
  502. (p14) FMS f58 = ALPHA_R, f58, f121
  503. ;;
  504. (p13) STFD [X1] = f55
  505. (p13) add X1 = INCX, X1
  506. (p14) FMA f59 = ALPHA_R, f59, f122
  507. ;;
  508. (p14) STFD [X1] = f56, 1 * SIZE
  509. (p15) FMS f60 = ALPHA_R, f60, f123
  510. ;;
  511. (p14) STFD [X1] = f57
  512. (p14) add X1 = INCX, X1
  513. (p15) FMA f61 = ALPHA_R, f61, f124
  514. ;;
  515. (p14) STFD [X1] = f58, 1 * SIZE
  516. ;;
  517. (p14) STFD [X1] = f59
  518. (p14) add X1 = INCX, X1
  519. ;;
  520. (p15) STFD [X1] = f60, 1 * SIZE
  521. ;;
  /* Last store, restore predicates, return. */
  522. (p15) STFD [X1] = f61
  523. mov pr = PR, -65474
  524. br.ret.sptk.many b0
  525. EPILOGUE