You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

caxpy.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included; presumably it    */
  /* selects the assembler-side parts of that header -- TODO confirm.   */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance: the kernel adds PREFETCH_SIZE * SIZE to X1 and  */
  /* Y1 to form the lfetch pointers PREX1 / PREY1.                      */
  40. #define PREFETCH_SIZE (32 * 16)
  /* FMA1 is applied to ALPHA_I * imag(x) when updating the real part   */
  /* of y; FMA2 is applied to ALPHA_R * imag(x) when updating the       */
  /* imaginary part. Defining CONJ swaps the two, which flips the sign  */
  /* of every term that contains imag(x).                               */
  /* NOTE(review): FMA / FNMA are defined outside this file; assumed    */
  /* to be multiply-add and negated multiply-add -- confirm.            */
  41. #ifndef CONJ
  42. #define FMA1 FNMA
  43. #define FMA2 FMA
  44. #else
  45. #define FMA1 FMA
  46. #define FMA2 FNMA
  47. #endif
  /* Stack pointer; the kernel reads INCY from [SP + 16]. */
  48. #define SP r12
  /* Incoming values: element count, x pointer, x stride, y pointer.    */
  /* INCY (r36) is filled by a load from the stack, not passed in r36.  */
  49. #define N r32
  50. #define X1 r37
  51. #define INCX r38
  52. #define Y1 r39
  53. #define INCY r36
  /* Prefetch pointers running ahead of X1 / Y1. */
  54. #define PREX1 r2
  55. #define PREY1 r3
  /* I = N >> 3 (then decremented for ar.lc), J = N & 7. */
  56. #define I r33
  57. #define J r34
  /* Y2 / X2: second load pointers, two elements past Y1 / X1.          */
  /* Note X2 shares r14 with the temporary used to address INCY.        */
  58. #define Y2 r35
  59. #define X2 r14
  /* YY1..YY4: store pointers into y, initialised as copies of Y1..Y4. */
  60. #define YY1 r15
  61. #define YY2 r16
  62. #define YY3 r17
  63. #define YY4 r18
  /* Pre-computed pointer steps, all in bytes:                          */
  /*   INC?M1  = INC? - SIZE                                            */
  /*   INC?3M1 = 3 * INC? - SIZE                                        */
  /*   INC?7M1 = 7 * INC? - SIZE                                        */
  /* The "- SIZE" undoes the 1 * SIZE post-increment applied when the   */
  /* first half of an element is accessed.                              */
  64. #define INCXM1 r19
  65. #define INCYM1 r20
  66. #define INCX3M1 r21
  67. #define INCY3M1 r22
  68. #define INCX7M1 r23
  69. #define INCY7M1 r24
  /* Third and fourth load pointers (4 and 6 elements past X1 / Y1). */
  70. #define X3 r8
  71. #define Y3 r9
  72. #define X4 r10
  73. #define Y4 r11
  /* 8 * INCX and 8 * INCY: the per-iteration step of the prefetch pointers. */
  74. #define INCX8 r25
  75. #define INCY8 r26
  /* Saved copies of ar.lc and of the predicate registers. */
  76. #define ARLC r29
  77. #define PR r30
  /* Real and imaginary parts of the scalar alpha. */
  78. #define ALPHA_R f8
  79. #define ALPHA_I f9
  /*********************************************************************/
  /* Complex AXPY kernel: for each of N complex elements,              */
  /*     y := y + alpha * x                                            */
  /* With CONJ defined, the terms containing imag(x) change sign       */
  /* (FMA1 / FMA2 are swapped), i.e. x is used conjugated.             */
  /* NOTE(review): this reading assumes FMA(a,b,c) = a*b + c and       */
  /* FNMA(a,b,c) = c - a*b; both macros come from outside this file.   */
  /*                                                                   */
  /* Inputs as used by the code:                                       */
  /*   N (r32)            element count                                */
  /*   ALPHA_R / ALPHA_I  f8 / f9                                      */
  /*   X1 (r37), INCX (r38), Y1 (r39)                                  */
  /*   INCY               loaded from [SP + 16]                        */
  /* Strides are shifted left by ZBASE_SHIFT before use (presumably    */
  /* converting element counts to byte offsets -- TODO confirm).       */
  /*                                                                   */
  /* Structure:                                                        */
  /*   setup  - return if N < 0, derive pointers and step constants    */
  /*   .L22   - br.ctop loop, 8 elements per iteration (N >> 3 times)  */
  /*   .L25   - remainder: 4, 2 and 1 elements selected by bits 2, 1   */
  /*            and 0 of N                                             */
  /* ar.lc and the predicates are saved on entry and restored at .L25. */
  /*********************************************************************/
  80. PROLOGUE
  81. .prologue
  82. PROFCODE
  /* r14 = address of the stacked INCY argument; J = N & 7; save ar.lc. */
  83. { .mmi
  84. adds r14 = 16, SP
  85. and J = 7, N
  86. .save ar.lc, ARLC
  87. mov ARLC = ar.lc
  88. }
  /* p15 = (0 > N): return immediately for a negative count. I = N >> 3. */
  89. { .mib
  90. cmp.gt p15, p0 = r0, N
  91. shr I = N, 3
  92. (p15) br.ret.sptk.many b0
  93. }
  94. ;;
  /* Fetch INCY from the stack and save the predicate registers. */
  95. { .mmi
  96. ld8 INCY = [r14]
  97. nop __LINE__
  98. mov PR = pr
  99. }
  /* Prefetch pointers start PREFETCH_SIZE * SIZE bytes ahead of x and y. */
  100. { .mmi
  101. adds PREX1 = (PREFETCH_SIZE + 0) * SIZE, X1
  102. adds PREY1 = (PREFETCH_SIZE + 0) * SIZE, Y1
  103. shl INCX = INCX, ZBASE_SHIFT
  104. }
  105. ;;
  /* I - 1 becomes the loop count; clear the rotating predicates. */
  106. { .mii
  107. adds I = -1, I
  108. mov pr.rot= 0
  109. shl INCY = INCY, ZBASE_SHIFT
  110. }
  111. ;;
  /* INC?M1 = INC? - SIZE; epilogue count of 3 for the br.ctop loop. */
  112. { .mmi
  113. adds INCXM1 = -SIZE, INCX
  114. adds INCYM1 = -SIZE, INCY
  115. mov ar.ec = 3
  116. }
  /* X2 / Y2 = X1 / Y1 + 2 * stride; p16 = 1 enables the first loop stage. */
  117. { .mmi
  118. shladd X2 = INCX, 1, X1
  119. shladd Y2 = INCY, 1, Y1
  120. cmp.eq p16, p0 = r0, r0
  121. }
  122. ;;
  /* INC?3M1 = 2 * INC? + INC?M1 = 3 * INC? - SIZE; INCX8 = 8 * INCX. */
  123. { .mmi
  124. shladd INCX3M1 = INCX, 1, INCXM1
  125. shladd INCY3M1 = INCY, 1, INCYM1
  126. shladd INCX8 = INCX, 3, r0
  127. }
  /* X3 / Y3 = X2 / Y2 + 2 * stride; INCY8 = 8 * INCY. */
  128. { .mmi
  129. shladd X3 = INCX, 1, X2
  130. shladd Y3 = INCY, 1, Y2
  131. shladd INCY8 = INCY, 3, r0
  132. }
  133. ;;
  /* X4 / Y4 = X3 / Y3 + 2 * stride; INCX7M1 = 4 * INCX + INCX3M1. */
  134. { .mmi
  135. shladd X4 = INCX, 1, X3
  136. shladd Y4 = INCY, 1, Y3
  137. shladd INCX7M1 = INCX, 2, INCX3M1
  138. }
  /* Store pointers start as copies of the y load pointers. */
  139. { .mmi
  140. mov YY1 = Y1
  141. mov YY2 = Y2
  142. shladd INCY7M1 = INCY, 2, INCY3M1
  143. }
  144. ;;
  145. { .mmi
  146. mov YY3 = Y3
  147. mov YY4 = Y4
  148. mov ar.lc = I
  149. }
  /* p11 = (I == -1), i.e. fewer than 8 elements: skip the main loop.   */
  /* p13 = bit 2 of N, consumed by the 4-element remainder at .L25.     */
  150. { .mib
  151. cmp.eq p11 ,p0 = -1, I
  152. tbit.z p0, p13 = N, 2
  153. (p11) br.cond.dpnt .L25
  154. }
  155. ;;
  156. .align 32
  /* Main loop: 8 complex elements per iteration, closed by br.ctop.    */
  /* Each of the four pointer sets (X1..X4 / Y1..Y4 / YY1..YY4)         */
  /* handles two consecutive elements.                                  */
  /*   p16       loads from x and y, plus lfetch on PREX1 / PREY1       */
  /*   p17, p18  multiply-add arithmetic                                */
  /*   p18, p19  stores through YY1..YY4                                */
  /* The floating-point operands rotate with each br.ctop: a value      */
  /* loaded into fN under p16 is read as fN+1 under p17 and as fN+2     */
  /* under p18 (e.g. f32 loaded below is used as f33 and f34).          */
  157. .L22:
  158. { .mmf
  159. (p19) STFD [YY3] = f14
  160. (p19) STFD [YY4] = f15
  161. (p18) FMA2 f14 = ALPHA_R, f64, f112
  162. }
  /* p16: real parts of the first y element of each pair. */
  163. { .mmf
  164. (p16) LDFD f80 = [Y1], 1 * SIZE
  165. (p16) LDFD f92 = [Y2], 1 * SIZE
  166. (p18) FMA2 f15 = ALPHA_R, f76, f124
  167. }
  168. ;;
  169. { .mmf
  170. (p16) lfetch.excl.nt1 [PREY1], INCY8
  171. (p16) LDFD f104 = [Y3], 1 * SIZE
  172. (p18) FMA1 f6 = ALPHA_I, f40, f6
  173. }
  174. { .mmf
  175. (p16) LDFD f116 = [Y4], 1 * SIZE
  176. nop __LINE__
  177. (p18) FMA1 f7 = ALPHA_I, f52, f7
  178. }
  179. ;;
  /* p16: imaginary parts; INCYM1 then moves each pointer to the next element. */
  180. { .mmf
  181. (p16) LDFD f86 = [Y1], INCYM1
  182. (p16) LDFD f98 = [Y2], INCYM1
  183. (p18) FMA1 f10 = ALPHA_I, f64, f10
  184. }
  185. { .mmf
  186. nop __LINE__
  187. nop __LINE__
  188. (p18) FMA1 f11 = ALPHA_I, f76, f11
  189. }
  190. ;;
  191. { .mmf
  192. (p16) LDFD f110 = [Y3], INCYM1
  193. (p16) LDFD f122 = [Y4], INCYM1
  194. (p18) FMA f12 = ALPHA_I, f34, f12
  195. }
  /* p19: step the store pointers by 7 * INCY - SIZE to the next group of 8. */
  196. { .mmf
  197. (p19) add YY1 = YY1, INCY7M1
  198. (p19) add YY2 = YY2, INCY7M1
  199. (p18) FMA f13 = ALPHA_I, f46, f13
  200. }
  201. ;;
  /* p16: real parts of the first x element of each pair. */
  202. { .mmf
  203. (p16) LDFD f32 = [X1], 1 * SIZE
  204. (p16) LDFD f44 = [X2], 1 * SIZE
  205. (p18) FMA f14 = ALPHA_I, f58, f14
  206. }
  207. { .mmf
  208. (p19) add YY3 = YY3, INCY7M1
  209. (p19) add YY4 = YY4, INCY7M1
  210. (p18) FMA f15 = ALPHA_I, f70, f15
  211. }
  212. ;;
  /* p18: store finished real parts while the next products are started. */
  213. { .mmf
  214. (p18) STFD [YY1] = f6, 1 * SIZE
  215. (p18) STFD [YY2] = f7, 1 * SIZE
  216. (p18) FMA f6 = ALPHA_R, f37, f85
  217. }
  218. { .mmf
  219. (p16) LDFD f56 = [X3], 1 * SIZE
  220. (p16) LDFD f68 = [X4], 1 * SIZE
  221. (p18) FMA f7 = ALPHA_R, f49, f97
  222. }
  223. ;;
  224. { .mmf
  225. (p18) STFD [YY3] = f10, 1 * SIZE
  226. (p18) STFD [YY4] = f11, 1 * SIZE
  227. (p18) FMA f10 = ALPHA_R, f61, f109
  228. }
  229. { .mmf
  230. (p16) LDFD f38 = [X1], INCXM1
  231. (p16) LDFD f50 = [X2], INCXM1
  232. (p18) FMA f11 = ALPHA_R, f73, f121
  233. }
  234. ;;
  /* p18: store finished imaginary parts (no post-increment here; the   */
  /* pointers are advanced by explicit adds further down).              */
  235. { .mmf
  236. (p18) STFD [YY1] = f12
  237. (p18) STFD [YY2] = f13
  238. (p18) FMA2 f12 = ALPHA_R, f43, f91
  239. }
  240. { .mmf
  241. (p16) LDFD f62 = [X3], INCXM1
  242. (p16) LDFD f74 = [X4], INCXM1
  243. (p18) FMA2 f13 = ALPHA_R, f55, f103
  244. }
  245. ;;
  246. { .mmf
  247. (p18) STFD [YY3] = f14
  248. (p18) STFD [YY4] = f15
  249. (p18) FMA2 f14 = ALPHA_R, f67, f115
  250. }
  /* p16: second element of each pair, y real parts. */
  251. { .mmf
  252. (p16) LDFD f83 = [Y1], 1 * SIZE
  253. (p16) LDFD f95 = [Y2], 1 * SIZE
  254. (p18) FMA2 f15 = ALPHA_R, f79, f127
  255. }
  256. ;;
  257. { .mmf
  258. (p16) LDFD f107 = [Y3], 1 * SIZE
  259. (p16) LDFD f119 = [Y4], 1 * SIZE
  260. (p18) FMA1 f6 = ALPHA_I, f43, f6
  261. }
  262. { .mmf
  263. nop __LINE__
  264. nop __LINE__
  265. (p18) FMA1 f7 = ALPHA_I, f55, f7
  266. }
  267. ;;
  /* p16: y imaginary parts; INCY7M1 moves the load pointers to the next group of 8. */
  268. { .mmf
  269. (p16) LDFD f89 = [Y1], INCY7M1
  270. (p16) LDFD f101 = [Y2], INCY7M1
  271. (p18) FMA1 f10 = ALPHA_I, f67, f10
  272. }
  /* p18: move the store pointers on to the second element of each pair. */
  273. { .mmf
  274. (p18) add YY1 = YY1, INCYM1
  275. (p18) add YY2 = YY2, INCYM1
  276. (p18) FMA1 f11 = ALPHA_I, f79, f11
  277. }
  278. ;;
  279. { .mmf
  280. (p16) LDFD f113 = [Y3], INCY7M1
  281. (p16) LDFD f125 = [Y4], INCY7M1
  282. (p18) FMA f12 = ALPHA_I, f37, f12
  283. }
  284. { .mmf
  285. (p18) add YY3 = YY3, INCYM1
  286. (p18) add YY4 = YY4, INCYM1
  287. (p18) FMA f13 = ALPHA_I, f49, f13
  288. }
  289. ;;
  /* p16: second element of each pair, x real parts. */
  290. { .mmf
  291. (p16) LDFD f35 = [X1], 1 * SIZE
  292. (p16) LDFD f47 = [X2], 1 * SIZE
  293. (p18) FMA f14 = ALPHA_I, f61, f14
  294. }
  295. { .mmf
  296. (p16) LDFD f59 = [X3], 1 * SIZE
  297. (p16) LDFD f71 = [X4], 1 * SIZE
  298. (p18) FMA f15 = ALPHA_I, f73, f15
  299. }
  300. ;;
  /* p17 arithmetic below begins the products for the group loaded one  */
  /* iteration earlier (operands read one register up, e.g. f33, f81).  */
  301. { .mmf
  302. (p18) STFD [YY1] = f6, 1 * SIZE
  303. (p18) STFD [YY2] = f7, 1 * SIZE
  304. (p17) FMA f6 = ALPHA_R, f33, f81
  305. }
  /* p16: x imaginary parts; INCX7M1 moves the load pointers to the next group of 8. */
  306. { .mmf
  307. (p16) LDFD f41 = [X1], INCX7M1
  308. (p16) LDFD f53 = [X2], INCX7M1
  309. (p17) FMA f7 = ALPHA_R, f45, f93
  310. }
  311. ;;
  312. { .mmf
  313. (p18) STFD [YY3] = f10, 1 * SIZE
  314. (p18) STFD [YY4] = f11, 1 * SIZE
  315. (p17) FMA f10 = ALPHA_R, f57, f105
  316. }
  317. { .mmf
  318. (p16) LDFD f65 = [X3], INCX7M1
  319. (p16) LDFD f77 = [X4], INCX7M1
  320. (p17) FMA f11 = ALPHA_R, f69, f117
  321. }
  322. ;;
  323. { .mmf
  324. (p18) STFD [YY1] = f12
  325. (p18) STFD [YY2] = f13
  326. (p17) FMA2 f12 = ALPHA_R, f39, f87
  327. }
  /* Prefetch x ahead and close the loop. */
  328. { .mfb
  329. (p16) lfetch.nt1 [PREX1], INCX8
  330. (p17) FMA2 f13 = ALPHA_R, f51, f99
  331. br.ctop.sptk.few .L22
  332. }
  333. ;;
  /* After the loop falls through: the same p19 work that sits at the   */
  /* top of the loop body (final f14 / f15 stores and the store-pointer */
  /* steps), so YY1..YY4 are positioned for the remainder code.         */
  334. (p19) add YY1 = YY1, INCY7M1
  335. (p19) add YY2 = YY2, INCY7M1
  336. ;;
  337. { .mmf
  338. (p19) STFD [YY3] = f14
  339. (p19) STFD [YY4] = f15
  340. }
  341. { .mmf
  342. (p19) add YY3 = YY3, INCY7M1
  343. (p19) add YY4 = YY4, INCY7M1
  344. }
  345. ;;
  346. .align 32
  /* Remainder handling for J = N & 7 elements.                         */
  /*   p13 (bit 2 of N): 4 elements, via X1 / Y1 / YY1 and X2 / Y2 / YY2 */
  /*   p14 (bit 1 of N): 2 elements, via X1 / Y1 / YY1                  */
  /*   p15 (bit 0 of N): 1 element,  via X1 / Y1 / YY1                  */
  /* Loads, arithmetic and stores of the three cases are interleaved.   */
  347. .L25:
  /* Restore the caller's ar.lc while the first loads issue. */
  348. { .mmi
  349. (p13) LDFD f32 = [X1], 1 * SIZE
  350. (p13) LDFD f36 = [X2], 1 * SIZE
  351. mov ar.lc = ARLC
  352. }
  353. ;;
  /* Restore saved predicates. The mask -65474 has bits 6..15 clear, so */
  /* p13 set during setup is not overwritten here.                      */
  354. { .mmi
  355. (p13) LDFD f80 = [Y1], 1 * SIZE
  356. (p13) LDFD f84 = [Y2], 1 * SIZE
  357. mov pr = PR, -65474
  358. }
  359. ;;
  /* p12 = (J == 0): nothing left once the main loop has run. */
  360. { .mmi
  361. (p13) LDFD f33 = [X1], INCXM1
  362. (p13) LDFD f37 = [X2], INCXM1
  363. cmp.eq p12, p0 = r0, J
  364. }
  365. ;;
  366. { .mmb
  367. (p13) LDFD f81 = [Y1], INCYM1
  368. (p13) LDFD f85 = [Y2], INCYM1
  369. (p12) br.ret.sptk.many b0
  370. }
  371. ;;
  /* p14 = bit 1 of N. */
  372. { .mmi
  373. (p13) LDFD f34 = [X1], 1 * SIZE
  374. (p13) LDFD f38 = [X2], 1 * SIZE
  375. tbit.z p0, p14 = N, 1
  376. }
  377. ;;
  /* p15 = bit 0 of N. */
  378. { .mmi
  379. (p13) LDFD f82 = [Y1], 1 * SIZE
  380. (p13) LDFD f86 = [Y2], 1 * SIZE
  381. tbit.z p0, p15 = N, 0
  382. }
  383. ;;
  /* INCX3M1 / INCY3M1 carry X1 / Y1 past the two elements covered by X2 / Y2. */
  384. { .mmf
  385. (p13) LDFD f35 = [X1], INCX3M1
  386. (p13) LDFD f39 = [X2], INCX3M1
  387. (p13) FMA f80 = ALPHA_R, f32, f80
  388. }
  389. ;;
  390. { .mmf
  391. (p13) LDFD f83 = [Y1], INCY3M1
  392. (p13) LDFD f87 = [Y2], INCY3M1
  393. (p13) FMA f84 = ALPHA_R, f36, f84
  394. }
  395. ;;
  /* Loads for the 2-element case overlap the 4-element arithmetic. */
  396. { .mmf
  397. (p14) LDFD f40 = [X1], 1 * SIZE
  398. (p14) LDFD f88 = [Y1], 1 * SIZE
  399. (p13) FMA2 f81 = ALPHA_R, f33, f81
  400. }
  401. ;;
  402. { .mmf
  403. (p14) LDFD f41 = [X1], INCXM1
  404. (p14) LDFD f89 = [Y1], INCYM1
  405. (p13) FMA2 f85 = ALPHA_R, f37, f85
  406. }
  407. ;;
  408. { .mmf
  409. (p14) LDFD f42 = [X1], 1 * SIZE
  410. (p14) LDFD f90 = [Y1], 1 * SIZE
  411. (p13) FMA f82 = ALPHA_R, f34, f82
  412. }
  413. ;;
  414. { .mmf
  415. (p14) LDFD f43 = [X1], INCXM1
  416. (p14) LDFD f91 = [Y1], INCYM1
  417. (p13) FMA f86 = ALPHA_R, f38, f86
  418. }
  419. ;;
  /* Loads for the final single element. */
  420. { .mmf
  421. (p15) LDFD f44 = [X1], 1 * SIZE
  422. (p15) LDFD f92 = [Y1], 1 * SIZE
  423. (p13) FMA2 f83 = ALPHA_R, f35, f83
  424. }
  425. ;;
  426. { .mmf
  427. (p15) LDFD f45 = [X1]
  428. (p15) LDFD f93 = [Y1]
  429. (p13) FMA2 f87 = ALPHA_R, f39, f87
  430. }
  431. ;;
  /* ALPHA_I terms for the 4-element case: imag(x) feeds the real part  */
  /* through FMA1, real(x) feeds the imaginary part through FMA.        */
  432. (p13) FMA1 f80 = ALPHA_I, f33, f80
  433. (p13) FMA1 f84 = ALPHA_I, f37, f84
  434. (p13) FMA f81 = ALPHA_I, f32, f81
  435. (p13) FMA f85 = ALPHA_I, f36, f85
  436. (p13) FMA1 f82 = ALPHA_I, f35, f82
  437. (p13) FMA1 f86 = ALPHA_I, f39, f86
  438. (p13) FMA f83 = ALPHA_I, f34, f83
  439. (p13) FMA f87 = ALPHA_I, f38, f87
  440. ;;
  /* Store the 4-element results; ALPHA_R terms of the smaller cases overlap. */
  441. { .mmf
  442. (p13) STFD [YY1] = f80, 1 * SIZE
  443. (p13) STFD [YY2] = f84, 1 * SIZE
  444. (p14) FMA f88 = ALPHA_R, f40, f88
  445. }
  446. ;;
  /* The stores and the pointer adds below share one instruction group: */
  /* the stores use the old YY1 / YY2, the adds produce the next address. */
  447. { .mmf
  448. (p13) STFD [YY1] = f81
  449. (p13) STFD [YY2] = f85
  450. (p14) FMA2 f89 = ALPHA_R, f41, f89
  451. }
  452. { .mmf
  453. (p13) add YY1 = YY1, INCYM1
  454. (p13) add YY2 = YY2, INCYM1
  455. (p14) FMA f90 = ALPHA_R, f42, f90
  456. }
  457. ;;
  458. { .mmf
  459. (p13) STFD [YY1] = f82, 1 * SIZE
  460. (p13) STFD [YY2] = f86, 1 * SIZE
  461. (p14) FMA2 f91 = ALPHA_R, f43, f91
  462. }
  463. ;;
  464. { .mmf
  465. (p13) STFD [YY1] = f83
  466. (p13) STFD [YY2] = f87
  467. (p15) FMA f92 = ALPHA_R, f44, f92
  468. }
  /* Only YY1 is advanced past the 4-element block; the remaining stores use YY1 alone. */
  469. { .mmf
  470. (p13) add YY1 = YY1, INCY3M1
  471. nop __LINE__
  472. (p15) FMA2 f93 = ALPHA_R, f45, f93
  473. }
  474. ;;
  /* ALPHA_I terms for the 2-element case. */
  475. (p14) FMA1 f88 = ALPHA_I, f41, f88
  476. (p14) FMA f89 = ALPHA_I, f40, f89
  477. (p14) FMA1 f90 = ALPHA_I, f43, f90
  478. (p14) FMA f91 = ALPHA_I, f42, f91
  479. ;;
  480. { .mmf
  481. (p14) STFD [YY1] = f88, 1 * SIZE
  482. (p15) FMA1 f92 = ALPHA_I, f45, f92
  483. }
  484. ;;
  485. { .mmf
  486. (p14) STFD [YY1] = f89
  487. (p14) add YY1 = YY1, INCYM1
  488. (p15) FMA f93 = ALPHA_I, f44, f93
  489. }
  490. ;;
  491. (p14) STFD [YY1] = f90, 1 * SIZE
  492. ;;
  493. (p14) STFD [YY1] = f91
  494. (p14) add YY1 = YY1, INCYM1
  495. ;;
  /* Final single element, then return. */
  496. (p15) STFD [YY1] = f92, 1 * SIZE
  497. ;;
  498. { .mmb
  499. (p15) STFD [YY1] = f93
  500. nop __LINE__
  501. br.ret.sptk.many b0
  502. }
  503. ;;
  504. EPILOGUE