You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it selects */
  /* the assembler-side view of that header (TODO confirm against common.h). */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance, in elements: the prefetch pointers are initialised */
  /* (PREFETCHSIZE + 2) * SIZE bytes beyond the start of x and y.          */
  40. #define PREFETCHSIZE (8 * 16)
  /* Incoming register arguments: element count, x pointer, x stride. */
  41. #define N r32
  42. #define X1 r38
  43. #define INCX r39
  /* y pointer and y stride: not taken from registers on entry; the routine */
  /* loads them from memory at SP + 16 and SP + 24 into these registers.    */
  44. #define Y1 r33
  45. #define INCY r34
  /* Prefetch pointers: PRE1 is advanced by INCX8, PRE2 by INCY8. */
  46. #define PRE1 r2
  47. #define PRE2 r3
  /* I: count of 16-element loop iterations (N >> 4, then decremented for */
  /* ar.lc).  J: leftover element count (N & 15).                         */
  48. #define I r14
  49. #define J r15
  /* Additional load pointers: X2/X3/X4 start 1/2/3 strides after X1, and */
  /* Y2/Y3/Y4 start 1/2/3 strides after Y1.                               */
  50. #define X2 r16
  51. #define Y2 r17
  52. #define X3 r18
  53. #define Y3 r19
  54. #define X4 r20
  55. #define Y4 r21
  /* Store pointers into y; initialised to the same addresses as Y1..Y4 and */
  /* advanced separately from the load pointers.                            */
  56. #define YY1 r22
  57. #define YY2 r23
  58. #define YY3 r24
  59. #define YY4 r25
  /* Byte strides multiplied by 4, 2 and 8.  Note r8/r9 are first used as */
  /* scratch to address the stack arguments before they become INCX4/INCY4. */
  60. #define INCX4 r8
  61. #define INCY4 r9
  62. #define INCX2 r10
  63. #define INCY2 r11
  64. #define INCX8 r26
  65. #define INCY8 r27
  /* Saved copies of the predicate registers (PR) and of ar.lc (ARLC). */
  66. #define PR r30
  67. #define ARLC r31
  /* Scalar multiplier (floating-point register) and the stack pointer. */
  68. #define ALPHA f8
  69. #define SP r12
  /*
   * AXPY-style kernel: for every element, y = ALPHA * x + y.
   *
   * Inputs as used by this code:
   *   N     (r32)       element count; the routine returns at once if N <= 0
   *   ALPHA (f8)        scalar multiplier
   *   X1    (r38)       pointer to x
   *   INCX  (r39)       x stride in elements (scaled by BASE_SHIFT below)
   *   [SP + 16]         pointer to y, loaded into Y1
   *   [SP + 24]         y stride in elements, loaded into INCY
   *
   * Structure:
   *   - setup: byte strides, four interleaved pointers per vector,
   *     prefetch pointers, loop counters;
   *   - .L112: modulo-scheduled loop (br.ctop, 3 stages p16..p18) that
   *     processes 16 elements per iteration;
   *   - .L115: tail that handles the remaining N & 15 elements in groups
   *     of 8, 4, 2 and 1 selected by bits 3..0 of N.
   *
   * ar.lc and the predicate registers are saved in ARLC / PR and restored
   * at the top of .L115, before either return in the tail.
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LDFD, STFD, FMA, SIZE and BASE_SHIFT come
   * from common.h, which is not visible here; LDFD/STFD/FMA are presumably
   * the precision-specific load / store / fused multiply-add forms, and
   * BASE_SHIFT presumably log2(SIZE) -- TODO confirm against common.h.
   */
  70. PROLOGUE
  71. .prologue
  72. PROFCODE
  /* r8/r9 = addresses of the two stack-passed arguments; save ar.lc. */
  73. { .mmi
  74. adds r8 = 16, SP
  75. adds r9 = 24, SP
  76. .save ar.lc, ARLC
  77. mov ARLC = ar.lc
  78. }
  /* Start the x prefetch pointer; p6 = !(0 < N), i.e. return when N <= 0. */
  79. { .mmb
  80. adds PRE1 = (PREFETCHSIZE + 2) * SIZE, X1
  81. cmp.lt p0, p6 = r0, N
  82. (p6) br.ret.sptk.many b0
  83. }
  84. ;;
  /* Fetch the y pointer and y stride from the stack; save the predicates. */
  85. { .mmi
  86. ld8 Y1 = [r8]
  87. ld8 INCY = [r9]
  88. mov PR = pr
  89. }
  90. ;;
  91. .body
  /* INCX/INCY <<= BASE_SHIFT (element strides become byte strides, */
  /* assuming BASE_SHIFT = log2(SIZE)); clear the rotating predicates. */
  92. { .mmi
  93. shladd INCX = INCX, BASE_SHIFT, r0
  94. shladd INCY = INCY, BASE_SHIFT, r0
  95. mov pr.rot = 0
  96. }
  97. ;;
  /* 4x strides for the interleaved pointers; epilog count 3 = pipeline depth. */
  98. { .mmi
  99. shladd INCX4 = INCX, 2, r0
  100. shladd INCY4 = INCY, 2, r0
  101. mov ar.ec = 3
  102. }
  /* 8x strides for the prefetch pointers; I = N / 16 full loop iterations. */
  103. { .mmi
  104. shladd INCX8 = INCX, 3, r0
  105. shladd INCY8 = INCY, 3, r0
  106. shr I = N, 4
  107. }
  108. ;;
  /* Pointers to element 1: X2, Y2 (load) and YY2 (store). */
  109. { .mmi
  110. add X2 = INCX, X1
  111. add Y2 = INCY, Y1
  112. add YY2 = INCY, Y1
  113. }
  114. ;;
  /* Pointers to element 2 (base + 2 * stride). */
  115. { .mmi
  116. shladd X3 = INCX, 1, X1
  117. shladd Y3 = INCY, 1, Y1
  118. shladd YY3 = INCY, 1, Y1
  119. }
  /* Pointers to element 3 (element-1 pointer + 2 * stride). */
  120. { .mmi
  121. shladd X4 = INCX, 1, X2
  122. shladd Y4 = INCY, 1, Y2
  123. shladd YY4 = INCY, 1, Y2
  124. }
  125. ;;
  /* p7 = (I == 0): no full 16-element iteration.  I - 1 is the value for */
  /* ar.lc.  YY1 = store pointer for element 0.                           */
  126. { .mmi
  127. cmp.eq p7 ,p0 = 0, I
  128. adds I = -1, I
  129. mov YY1 = Y1
  130. }
  /* r28 = low 7 bits of Y1; PRE1 rounded down to a 128-byte boundary; */
  /* p16 = 1 enables the first pipeline stage.                          */
  131. { .mmi
  132. and r28 = 127, Y1
  133. and PRE1 = -128, PRE1
  134. cmp.eq p16, p0 = r0, r0
  135. }
  136. ;;
  /* PRE2 = y prefetch pointer; PRE1 takes the low 7 address bits of Y1; */
  /* load the loop counter.                                              */
  137. { .mmi
  138. adds PRE2 = (PREFETCHSIZE + 2) * SIZE, Y1
  139. or PRE1 = PRE1, r28
  140. mov ar.lc = I
  141. }
  /* J = N mod 16 leftover elements; p12 = bit 3 of N (a group of 8 in the */
  /* tail); skip the main loop entirely when there is no full iteration.   */
  142. { .mib
  143. and J = 15, N
  144. tbit.z p0, p12 = N, 3
  145. (p7) br.cond.dpnt .L115
  146. }
  147. ;;
  148. .align 32
  /*
   * Main loop: 16 elements per iteration, three stages.
   *   p16: load x[0..15] (f32, f35, ..., f77) and y[0..11] (f80, ..., f113).
   *   p17: load y[12..15]; FMA for elements 0..7 into f6, f7, f10..f15
   *        (second half of the loop body).
   *   p18: store elements 0..7, FMA for elements 8..15 into the same
   *        static registers (first half), then store elements 8..15
   *        (second half).
   * Rotating registers are renamed by one per br.ctop, so a value loaded
   * as fN under p16 is read as fN+1 under p17 and fN+2 under p18.
   * A store and the FMA that overwrites the stored register share an
   * instruction group throughout the loop; the ordering of these bundles
   * must not be changed.
   */
  149. .L112:
  /* Store results 0,1 (computed last iteration); compute elements 8,9. */
  150. { .mmf
  151. (p18) STFD [YY1] = f6
  152. (p18) STFD [YY2] = f7
  153. (p18) FMA f6 = ALPHA, f58, f106
  154. }
  155. { .mmf
  156. (p16) lfetch.excl.nt1 [PRE2], INCY8
  157. nop __LINE__
  158. (p18) FMA f7 = ALPHA, f61, f109
  159. }
  160. ;;
  /* Store results 2,3; compute elements 10,11. */
  161. { .mmf
  162. (p18) STFD [YY3] = f10
  163. (p18) STFD [YY4] = f11
  164. (p18) FMA f10 = ALPHA, f64, f112
  165. }
  166. { .mmf
  167. (p16) lfetch.nt1 [PRE1], INCX8
  168. nop __LINE__
  169. (p18) FMA f11 = ALPHA, f67, f115
  170. }
  171. ;;
  /* Load x[0..3]; advance the store pointers past elements 0..3. */
  172. { .mmi
  173. (p16) LDFD f32 = [X1], INCX4
  174. (p16) LDFD f35 = [X2], INCX4
  175. (p18) add YY1 = INCY4, YY1
  176. }
  177. { .mmi
  178. (p16) LDFD f38 = [X3], INCX4
  179. (p16) LDFD f41 = [X4], INCX4
  180. (p18) add YY2 = INCY4, YY2
  181. }
  182. ;;
  /* Load y[12..15] of the set that entered the pipeline one iteration ago. */
  183. { .mmi
  184. (p17) LDFD f117 = [Y1], INCY4
  185. (p17) LDFD f120 = [Y2], INCY4
  186. (p18) add YY3 = INCY4, YY3
  187. }
  188. { .mmi
  189. (p17) LDFD f123 = [Y3], INCY4
  190. (p17) LDFD f126 = [Y4], INCY4
  191. (p18) add YY4 = INCY4, YY4
  192. }
  193. ;;
  /* Store results 4,5; compute elements 12,13. */
  194. { .mmf
  195. (p18) STFD [YY1] = f12
  196. (p18) STFD [YY2] = f13
  197. (p18) FMA f12 = ALPHA, f70, f118
  198. }
  199. { .mmf
  200. (p18) add YY1 = INCY4, YY1
  201. (p18) add YY2 = INCY4, YY2
  202. (p18) FMA f13 = ALPHA, f73, f121
  203. }
  204. ;;
  /* Store results 6,7; compute elements 14,15. */
  205. { .mmf
  206. (p18) STFD [YY3] = f14
  207. (p18) STFD [YY4] = f15
  208. (p18) FMA f14 = ALPHA, f76, f124
  209. }
  210. { .mmf
  211. (p18) add YY3 = INCY4, YY3
  212. (p18) add YY4 = INCY4, YY4
  213. (p18) FMA f15 = ALPHA, f79, f127
  214. }
  215. ;;
  /* Load x[4..7]. */
  216. { .mmi
  217. (p16) LDFD f44 = [X1], INCX4
  218. (p16) LDFD f47 = [X2], INCX4
  219. nop __LINE__
  220. }
  221. { .mmi
  222. (p16) LDFD f50 = [X3], INCX4
  223. (p16) LDFD f53 = [X4], INCX4
  224. nop __LINE__
  225. }
  226. ;;
  /* Load y[0..3]. */
  227. { .mmi
  228. (p16) LDFD f80 = [Y1], INCY4
  229. (p16) LDFD f83 = [Y2], INCY4
  230. nop __LINE__
  231. }
  232. { .mmi
  233. (p16) LDFD f86 = [Y3], INCY4
  234. (p16) LDFD f89 = [Y4], INCY4
  235. nop __LINE__
  236. }
  237. ;;
  /* Store results 8,9; compute elements 0,1 of the following set (p17). */
  238. { .mmf
  239. (p18) STFD [YY1] = f6
  240. (p18) STFD [YY2] = f7
  241. (p17) FMA f6 = ALPHA, f33, f81
  242. }
  243. { .mmf
  244. (p16) lfetch.excl.nt1 [PRE2], INCY8
  245. nop __LINE__
  246. (p17) FMA f7 = ALPHA, f36, f84
  247. }
  248. ;;
  /* Store results 10,11; compute elements 2,3 of the following set. */
  249. { .mmf
  250. (p18) STFD [YY3] = f10
  251. (p18) STFD [YY4] = f11
  252. (p17) FMA f10 = ALPHA, f39, f87
  253. }
  254. { .mmf
  255. (p16) lfetch.nt1 [PRE1], INCX8
  256. nop __LINE__
  257. (p17) FMA f11 = ALPHA, f42, f90
  258. }
  259. ;;
  /* Load x[8..11]; advance the store pointers past elements 8..11. */
  260. { .mmi
  261. (p16) LDFD f56 = [X1], INCX4
  262. (p16) LDFD f59 = [X2], INCX4
  263. (p18) add YY1 = INCY4, YY1
  264. }
  265. { .mmi
  266. (p16) LDFD f62 = [X3], INCX4
  267. (p16) LDFD f65 = [X4], INCX4
  268. (p18) add YY2 = INCY4, YY2
  269. }
  270. ;;
  /* Load y[4..7]. */
  271. { .mmi
  272. (p16) LDFD f92 = [Y1], INCY4
  273. (p16) LDFD f95 = [Y2], INCY4
  274. (p18) add YY3 = INCY4, YY3
  275. }
  276. { .mmi
  277. (p16) LDFD f98 = [Y3], INCY4
  278. (p16) LDFD f101 = [Y4], INCY4
  279. (p18) add YY4 = INCY4, YY4
  280. }
  281. ;;
  /* Store results 12,13; compute elements 4,5 of the following set. */
  282. { .mmf
  283. (p18) STFD [YY1] = f12
  284. (p18) STFD [YY2] = f13
  285. (p17) FMA f12 = ALPHA, f45, f93
  286. }
  287. { .mmf
  288. (p18) add YY1 = INCY4, YY1
  289. (p18) add YY2 = INCY4, YY2
  290. (p17) FMA f13 = ALPHA, f48, f96
  291. }
  292. ;;
  /* Store results 14,15; compute elements 6,7 of the following set. */
  293. { .mmf
  294. (p18) STFD [YY3] = f14
  295. (p18) STFD [YY4] = f15
  296. (p17) FMA f14 = ALPHA, f51, f99
  297. }
  298. { .mmf
  299. (p18) add YY3 = INCY4, YY3
  300. (p18) add YY4 = INCY4, YY4
  301. (p17) FMA f15 = ALPHA, f54, f102
  302. }
  303. ;;
  /* Load x[12..15]. */
  304. { .mmi
  305. (p16) LDFD f68 = [X1], INCX4
  306. (p16) LDFD f71 = [X2], INCX4
  307. nop __LINE__
  308. }
  309. { .mmi
  310. (p16) LDFD f74 = [X3], INCX4
  311. (p16) LDFD f77 = [X4], INCX4
  312. nop __LINE__
  313. }
  314. ;;
  /* Load y[8..11]; br.ctop rotates the registers and counts ar.lc / ar.ec. */
  315. { .mmi
  316. (p16) LDFD f104 = [Y1], INCY4
  317. (p16) LDFD f107 = [Y2], INCY4
  318. nop __LINE__
  319. }
  320. { .mmb
  321. (p16) LDFD f110 = [Y3], INCY4
  322. (p16) LDFD f113 = [Y4], INCY4
  323. br.ctop.sptk.few .L112
  324. }
  325. ;;
  326. .align 32
  /*
   * Tail: the remaining J = N & 15 elements.  p12/p13/p14/p15 are set when
   * bit 3/2/1/0 of N is set and guard groups of 8/4/2/1 elements.  All
   * loads are issued first, then the FMAs, then the stores.
   */
  327. .L115:
  /* Loads for the 8-group: x[0..3].  Restore the saved predicates; the   */
  /* mask -65474 (...FFFF003E) selects p1-p5 and p16-p63, so p6-p15       */
  /* (including p12, set before the loop) keep their current values.      */
  328. { .mmi
  329. (p12) LDFD f32 = [X1], INCX4
  330. (p12) LDFD f33 = [X2], INCX4
  331. mov pr = PR, -65474
  332. }
  /* p9 = (J == 0): nothing is left to do after the main loop. */
  333. { .mmi
  334. (p12) LDFD f34 = [X3], INCX4
  335. (p12) LDFD f35 = [X4], INCX4
  336. cmp.eq p9, p0 = r0, J
  337. }
  338. ;;
  /* y[0..3] of the 8-group; restore ar.lc before either return below. */
  339. { .mmi
  340. (p12) LDFD f64 = [Y1], INCY4
  341. (p12) LDFD f65 = [Y2], INCY4
  342. mov ar.lc = ARLC
  343. }
  344. { .mmb
  345. (p12) LDFD f66 = [Y3], INCY4
  346. (p12) LDFD f67 = [Y4], INCY4
  347. (p9) br.ret.sptk.many b0
  348. }
  349. ;;
  /* x[4..7] of the 8-group; p13 = bit 2 of N, p14 = bit 1 of N. */
  350. { .mmi
  351. (p12) LDFD f36 = [X1], INCX4
  352. (p12) LDFD f37 = [X2], INCX4
  353. tbit.z p0, p13 = N, 2
  354. }
  355. { .mmi
  356. (p12) LDFD f38 = [X3], INCX4
  357. (p12) LDFD f39 = [X4], INCX4
  358. tbit.z p0, p14 = N, 1
  359. }
  360. ;;
  /* y[4..7] of the 8-group; p15 = bit 0 of N. */
  361. { .mmi
  362. (p12) LDFD f68 = [Y1], INCY4
  363. (p12) LDFD f69 = [Y2], INCY4
  364. tbit.z p0, p15 = N, 0
  365. }
  366. { .mmi
  367. (p12) LDFD f70 = [Y3], INCY4
  368. (p12) LDFD f71 = [Y4], INCY4
  369. nop __LINE__
  370. }
  371. ;;
  /* 4-group: x loads; also form the 2x strides used by the 2-group. */
  372. { .mmi
  373. (p13) LDFD f40 = [X1], INCX4
  374. (p13) LDFD f41 = [X2], INCX4
  375. shladd INCX2 = INCX, 1, r0
  376. }
  377. { .mmi
  378. (p13) LDFD f42 = [X3], INCX4
  379. (p13) LDFD f43 = [X4], INCX4
  380. shladd INCY2 = INCY, 1, r0
  381. }
  382. ;;
  /* 4-group: y loads. */
  383. { .mmi
  384. (p13) LDFD f72 = [Y1], INCY4
  385. (p13) LDFD f73 = [Y2], INCY4
  386. nop __LINE__
  387. }
  388. { .mmi
  389. (p13) LDFD f74 = [Y3], INCY4
  390. (p13) LDFD f75 = [Y4], INCY4
  391. nop __LINE__
  392. }
  393. ;;
  /* 2-group: x loads through X1/X2, each advanced by two strides. */
  394. { .mmi
  395. (p14) LDFD f44 = [X1], INCX2
  396. (p14) LDFD f45 = [X2], INCX2
  397. nop __LINE__
  398. }
  399. ;;
  /* 2-group: y loads. */
  400. { .mmi
  401. (p14) LDFD f76 = [Y1], INCY2
  402. (p14) LDFD f77 = [Y2], INCY2
  403. nop __LINE__
  404. }
  405. ;;
  /* Final single element. */
  406. { .mmi
  407. (p15) LDFD f46 = [X1]
  408. (p15) LDFD f78 = [Y1]
  409. nop __LINE__
  410. }
  411. ;;
  /* 8-group arithmetic: f32..f39 = ALPHA * x + y (written without explicit */
  /* bundle braces).                                                        */
  412. (p12) FMA f32 = ALPHA, f32, f64
  413. (p12) FMA f33 = ALPHA, f33, f65
  414. (p12) FMA f34 = ALPHA, f34, f66
  415. (p12) FMA f35 = ALPHA, f35, f67
  416. (p12) FMA f36 = ALPHA, f36, f68
  417. (p12) FMA f37 = ALPHA, f37, f69
  418. (p12) FMA f38 = ALPHA, f38, f70
  419. (p12) FMA f39 = ALPHA, f39, f71
  420. ;;
  /* Store the 8-group through YY1..YY4 while the FMAs for the 4-, 2- and */
  /* 1-element groups are issued in the floating-point slots.             */
  421. { .mmf
  422. (p12) STFD [YY1] = f32
  423. (p12) STFD [YY2] = f33
  424. (p13) FMA f40 = ALPHA, f40, f72
  425. }
  426. { .mmf
  427. (p12) add YY1 = INCY4, YY1
  428. (p12) add YY2 = INCY4, YY2
  429. (p13) FMA f41 = ALPHA, f41, f73
  430. }
  431. ;;
  432. { .mmf
  433. (p12) STFD [YY3] = f34
  434. (p12) STFD [YY4] = f35
  435. (p13) FMA f42 = ALPHA, f42, f74
  436. }
  437. { .mmf
  438. (p12) add YY3 = INCY4, YY3
  439. (p12) add YY4 = INCY4, YY4
  440. (p13) FMA f43 = ALPHA, f43, f75
  441. }
  442. ;;
  443. { .mmf
  444. (p12) STFD [YY1] = f36
  445. (p12) STFD [YY2] = f37
  446. (p14) FMA f44 = ALPHA, f44, f76
  447. }
  448. { .mmf
  449. (p12) add YY1 = INCY4, YY1
  450. (p12) add YY2 = INCY4, YY2
  451. (p14) FMA f45 = ALPHA, f45, f77
  452. }
  453. ;;
  454. { .mmf
  455. (p12) STFD [YY3] = f38
  456. (p12) STFD [YY4] = f39
  457. (p15) FMA f46 = ALPHA, f46, f78
  458. }
  459. { .mmi
  460. (p12) add YY3 = INCY4, YY3
  461. (p12) add YY4 = INCY4, YY4
  462. nop __LINE__
  463. }
  464. ;;
  /* Store the 4-group; only YY1/YY2 are advanced afterwards because the */
  /* remaining groups use just those two pointers.                       */
  465. { .mmi
  466. (p13) STFD [YY1] = f40
  467. (p13) STFD [YY2] = f41
  468. nop __LINE__
  469. }
  470. { .mmi
  471. (p13) add YY1 = INCY4, YY1
  472. (p13) add YY2 = INCY4, YY2
  473. nop __LINE__
  474. }
  475. ;;
  476. { .mmi
  477. (p13) STFD [YY3] = f42
  478. (p13) STFD [YY4] = f43
  479. nop __LINE__
  480. }
  481. ;;
  /* Store the 2-group; YY1 then moves two strides on for the last element. */
  482. { .mmi
  483. (p14) STFD [YY1] = f44
  484. (p14) STFD [YY2] = f45
  485. (p14) add YY1 = INCY2, YY1
  486. }
  487. ;;
  /* Store the final single element and return. */
  488. { .mmb
  489. (p15) STFD [YY1] = f46
  490. nop __LINE__
  491. br.ret.sptk.many b0
  492. }
  493. ;;
  494. EPILOGUE