You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qdot.S 9.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Prefetch distance in elements. It is multiplied by SIZE when the */
  /* prefetch pointers PREX1 and PREY1 are derived from X1 and Y1. */
  40. #define PREFETCH_SIZE (8 * 24)
  /* Incoming arguments in r32-r36. When F_INTERFACE is defined, N, INCX */
  /* and INCY arrive as addresses and are loaded through before use. */
  /* N: element count. Also the source of the loop and remainder counts. */
  41. #define N r32
  /* X1: base address of the first vector, and the first of four X load pointers. */
  42. #define X1 r33
  /* INCX: stride of the first vector. Shifted left by BASE_SHIFT before it is used as an address step. */
  43. #define INCX r34
  /* Y1: base address of the second vector, and the first of four Y load pointers. */
  44. #define Y1 r35
  /* INCY: stride of the second vector. Shifted left by BASE_SHIFT before it is used as an address step. */
  45. #define INCY r36
  /* PREX1 and PREY1: first prefetch pointers for the X and Y streams. */
  46. #define PREX1 r2
  47. #define PREY1 r3
  /* I: (N >> 4) - 1, copied into ar.lc for the main loop. */
  48. #define I r14
  /* J: N & 15, the number of elements left over after the main loop. */
  49. #define J r15
  /* X2-X4 and Y2-Y4: load pointers starting 1, 2 and 3 strides past X1 and Y1. */
  50. #define Y2 r16
  51. #define X2 r17
  52. #define Y3 r18
  53. #define X3 r19
  54. #define Y4 r20
  55. #define X4 r21
  /* Scaled strides multiplied by 2, 4 and 16. */
  /* Note: r26 and r27 (INCX16 and INCY16) are also used as temporaries */
  /* by the entry code before these multiples are computed. */
  56. #define INCX2 r22
  57. #define INCY2 r23
  58. #define INCX4 r24
  59. #define INCY4 r25
  60. #define INCX16 r26
  61. #define INCY16 r27
  /* PREX2 and PREY2: second prefetch pointers, 8 scaled strides past PREX1 and PREY1. */
  62. #define PREX2 r28
  63. #define PREY2 r29
  /* PR: copy of the predicate register taken on entry, written back before the final return. */
  64. #define PR r30
  /* ARLC: copy of ar.lc taken on entry, written back before the final return. */
  65. #define ARLC r31
  /* Entry and setup. */
  /* The routine accumulates the products Y[i] * X[i] over N strided */
  /* elements in the eight accumulators f8-f15 and leaves the combined */
  /* result in f8 when it returns. */
  /* PROLOGUE, PROFCODE, LDINT, LDFD, FMA, FADD, SIZE and BASE_SHIFT are */
  /* not defined in this file. They presumably come from common.h, with */
  /* FMA assumed to be a multiply-add and BASE_SHIFT assumed to be log2 */
  /* of the element size in bytes. TODO confirm against common.h. */
  66. PROLOGUE
  67. .prologue
  68. PROFCODE
  /* Clear accumulator f8 (f0 always reads as 0.0) and keep the caller's ar.lc in ARLC. */
  69. { .mfi
  70. nop __LINE__
  71. mov f8 = f0
  72. .save ar.lc, ARLC
  73. mov ARLC = ar.lc
  74. }
  /* Clear accumulator f9. The constant 1 in r26 is consumed only by the */
  /* F_INTERFACE path below. Otherwise r26 is overwritten later as INCX16. */
  75. { .mfi
  76. mov r26 = 1
  77. mov f9 = f0
  78. nop __LINE__
  79. }
  80. ;;
  81. .body
  /* F_INTERFACE path: N, INCX and INCY hold addresses, so load the values through them. */
  82. #ifdef F_INTERFACE
  83. LDINT N = [N]
  84. LDINT INCX = [INCX]
  85. LDINT INCY = [INCY]
  86. ;;
  /* Without USE64BITINT the loaded values are sign-extended from 32 bits. */
  87. #ifndef USE64BITINT
  88. sxt4 N = N
  89. sxt4 INCX = INCX
  90. sxt4 INCY = INCY
  91. ;;
  92. #endif
  /* p6 = (INCX < 0) and p7 = (INCY < 0): each compare writes the */
  /* complement of (0 <= INC) into its second target. r26 = 1 - N. */
  93. cmp.le p0, p6 = r0, INCX
  94. cmp.le p0, p7 = r0, INCY
  95. sub r26 = r26, N
  96. ;;
  /* Integer multiply through the floating-point unit: */
  /* r26 = (1 - N) * INCX and r27 = (1 - N) * INCY. */
  97. setf.sig f32 = r26
  98. setf.sig f33 = INCX
  99. setf.sig f34 = INCY
  100. ;;
  101. xmpy.l f33 = f32, f33
  102. xmpy.l f34 = f32, f34
  103. ;;
  104. getf.sig r26 = f33
  105. getf.sig r27 = f34
  106. ;;
  /* For a negative stride, move the base pointer by (1 - N) * INC */
  /* elements, so that N strided accesses starting there finish at the */
  /* address that was originally passed in. */
  107. (p6) shladd X1 = r26, BASE_SHIFT, X1
  108. (p7) shladd Y1 = r27, BASE_SHIFT, Y1
  109. ;;
  110. #endif
  /* Prefetch pointers start (PREFETCH_SIZE + 2) * SIZE past each base. */
  /* The predicate register is copied to PR before any predicate is modified */
  /* on the non-F_INTERFACE path. */
  111. { .mmi
  112. adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1
  113. adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1
  114. mov PR = pr
  115. }
  /* p6 = (N <= 0): return at once, with f8 still 0.0 from the entry bundle. */
  /* INCX is scaled by BASE_SHIFT in the same bundle. */
  116. { .mib
  117. cmp.lt p0, p6 = r0, N
  118. shl INCX = INCX, BASE_SHIFT
  119. (p6) br.ret.sptk.many b0
  120. }
  121. ;;
  /* Build the four X pointers (X1 plus 0, 1, 2, 3 scaled strides), scale */
  /* INCY, and clear accumulators f10 and f11. */
  122. { .mfi
  123. add X2 = INCX, X1
  124. mov f10 = f0
  125. shl INCY = INCY, BASE_SHIFT
  126. }
  /* r8 = offset of X1 inside its 128-byte block. */
  127. { .mmf
  128. and r8 = 127, X1
  129. shladd X3 = INCX, 1, X1
  130. mov f11 = f0
  131. }
  132. ;;
  /* Round PREY1 down to a 128-byte boundary. The offset in r8 is merged in further below. */
  133. { .mmi
  134. and PREY1 = -128, PREY1
  135. shladd X4 = INCX, 1, X2
  136. add INCX2 = INCX, INCX
  137. }
  /* INCX4 = 4 * INCX. Start the Y pointers: Y2 and Y3 are Y1 plus 1 and 2 scaled strides. */
  138. { .mmi
  139. shladd INCX4 = INCX, 2, r0
  140. add Y2 = INCY, Y1
  141. shladd Y3 = INCY, 1, Y1
  142. }
  143. ;;
  /* Y4 = Y1 plus 3 scaled strides. INCY2 = 2 * INCY. */
  144. { .mmi
  145. shladd Y4 = INCY, 1, Y2
  146. add INCY2 = INCY, INCY
  147. nop __LINE__
  148. }
  /* INCY4 = 4 * INCY. INCX16 and INCY16 = 16 scaled strides, the per-iteration prefetch step. */
  149. { .mmi
  150. shladd INCY4 = INCY, 2, r0
  151. shladd INCX16 = INCX, 4, r0
  152. shladd INCY16 = INCY, 4, r0
  153. }
  154. ;;
  /* Clear the rotating predicates before the pipelined loop. Clear accumulator f12. */
  155. { .mfi
  156. nop __LINE__
  157. mov f12 = f0
  158. mov pr.rot= 0
  159. }
  /* PREY1 now has the same offset inside a 128-byte block as X1. */
  /* I = N >> 4, the number of 16-element groups. */
  160. { .mfi
  161. or PREY1 = PREY1, r8
  162. mov f13 = f0
  163. shr I = N, 4
  164. }
  165. ;;
  /* I becomes the group count minus one, the form ar.lc expects. */
  /* ar.ec = 3 gives the loop three pipeline stages. */
  166. { .mfi
  167. adds I = -1, I
  168. mov f14 = f0
  169. mov ar.ec= 3
  170. }
  /* Second prefetch pointers sit 8 scaled strides past the first ones. */
  171. { .mmf
  172. shladd PREX2 = INCX, 3, PREX1
  173. shladd PREY2 = INCY, 3, PREY1
  174. mov f15 = f0
  175. }
  176. ;;
  /* J = N & 15 is the remainder count. p16 = 1 enables the first loop stage. */
  177. { .mmi
  178. and J = 15, N
  179. cmp.eq p16, p0 = r0, r0
  180. mov ar.lc = I
  181. }
  /* If there is no full 16-element group (I == -1), skip the main loop. */
  /* p12 = bit 3 of N, consumed by the remainder code at .L215. */
  182. { .mib
  183. cmp.eq p6 ,p0 = -1, I
  184. tbit.nz p12, p0 = N, 3
  185. (p6) br.cond.dpnt .L215
  186. }
  187. ;;
  /* Main loop: 16 elements of X and 16 elements of Y per iteration, */
  /* driven by br.ctop with ar.lc = (N >> 4) - 1 and ar.ec = 3. */
  /* Stage p16 issues the prefetches and loads. Stage p18 issues the */
  /* multiply-adds. The floating-point registers from f32 upward rotate */
  /* by one name on every br.ctop, so a value loaded into fK under p16 */
  /* is read as fK+2 under p18 (f80 is consumed as f82, f32 as f34, and */
  /* so on). Y elements land in f32-f77 and X elements in f80-f125. */
  /* Each of X1-X4 and Y1-Y4 is loaded four times per iteration with a */
  /* post-increment of 4 scaled strides, and each accumulator f8-f15 */
  /* receives two multiply-adds per iteration. */
  /* The statement order is part of the schedule. Do not reorder. */
  188. .align 32
  189. .L212:
  /* Prefetch the X stream, then advance both X prefetch pointers by 16 scaled strides. */
  190. { .mmf
  191. (p16) lfetch.nt1 [PREX1], INCX16
  192. (p16) lfetch.nt1 [PREX2], INCX16
  193. (p18) FMA f8 = f34, f82, f8
  194. }
  /* X elements 0-3 of this group (via X1, X2, X3, X4). */
  195. { .mmf
  196. (p16) LDFD f80 = [X1], INCX4
  197. (p16) LDFD f83 = [X2], INCX4
  198. (p18) FMA f9 = f37, f85, f9
  199. }
  200. ;;
  201. { .mmf
  202. (p16) LDFD f86 = [X3], INCX4
  203. (p16) LDFD f89 = [X4], INCX4
  204. (p18) FMA f10 = f40, f88, f10
  205. }
  /* X elements 4 and 5. */
  206. { .mmf
  207. (p16) LDFD f92 = [X1], INCX4
  208. (p16) LDFD f95 = [X2], INCX4
  209. (p18) FMA f11 = f43, f91, f11
  210. }
  211. ;;
  /* Y elements 0-3. */
  212. { .mmf
  213. (p16) LDFD f32 = [Y1], INCY4
  214. (p16) LDFD f35 = [Y2], INCY4
  215. (p18) FMA f12 = f46, f94, f12
  216. }
  217. { .mmf
  218. (p16) LDFD f38 = [Y3], INCY4
  219. (p16) LDFD f41 = [Y4], INCY4
  220. (p18) FMA f13 = f49, f97, f13
  221. }
  222. ;;
  /* X elements 6-9. */
  223. { .mmf
  224. (p16) LDFD f98 = [X3], INCX4
  225. (p16) LDFD f101 = [X4], INCX4
  226. (p18) FMA f14 = f52, f100, f14
  227. }
  228. { .mmf
  229. (p16) LDFD f104 = [X1], INCX4
  230. (p16) LDFD f107 = [X2], INCX4
  231. (p18) FMA f15 = f55, f103, f15
  232. }
  233. ;;
  /* Y elements 4-7. The accumulators are reused from f8 for the second half of the group. */
  234. { .mmf
  235. (p16) LDFD f44 = [Y1], INCY4
  236. (p16) LDFD f47 = [Y2], INCY4
  237. (p18) FMA f8 = f58, f106, f8
  238. }
  239. { .mmf
  240. (p16) LDFD f50 = [Y3], INCY4
  241. (p16) LDFD f53 = [Y4], INCY4
  242. (p18) FMA f9 = f61, f109, f9
  243. }
  244. ;;
  /* Prefetch the Y stream, then advance both Y prefetch pointers by 16 scaled strides. */
  245. { .mmf
  246. (p16) lfetch.nt1 [PREY1], INCY16
  247. (p16) lfetch.nt1 [PREY2], INCY16
  248. (p18) FMA f10 = f64, f112, f10
  249. }
  /* X elements 10 and 11. */
  250. { .mmf
  251. (p16) LDFD f110 = [X3], INCX4
  252. (p16) LDFD f113 = [X4], INCX4
  253. (p18) FMA f11 = f67, f115, f11
  254. }
  255. ;;
  /* Y elements 8-11. */
  256. { .mmf
  257. (p16) LDFD f56 = [Y1], INCY4
  258. (p16) LDFD f59 = [Y2], INCY4
  259. (p18) FMA f12 = f70, f118, f12
  260. }
  261. { .mmf
  262. (p16) LDFD f62 = [Y3], INCY4
  263. (p16) LDFD f65 = [Y4], INCY4
  264. (p18) FMA f13 = f73, f121, f13
  265. }
  266. ;;
  /* X elements 12-15. */
  267. { .mmf
  268. (p16) LDFD f116 = [X1], INCX4
  269. (p16) LDFD f119 = [X2], INCX4
  270. (p18) FMA f14 = f76, f124, f14
  271. }
  272. { .mmf
  273. (p16) LDFD f122 = [X3], INCX4
  274. (p16) LDFD f125 = [X4], INCX4
  275. (p18) FMA f15 = f79, f127, f15
  276. }
  277. ;;
  /* Y elements 12-15, then the counted loop branch, which also rotates the registers. */
  278. { .mmi
  279. (p16) LDFD f68 = [Y1], INCY4
  280. (p16) LDFD f71 = [Y2], INCY4
  281. nop __LINE__
  282. }
  283. { .mmb
  284. (p16) LDFD f74 = [Y3], INCY4
  285. (p16) LDFD f77 = [Y4], INCY4
  286. br.ctop.sptk.few .L212
  287. }
  288. ;;
  /* Remainder: handles the J = N & 15 elements left after the main loop. */
  /* Each set bit of N selects one predicated chunk: */
  /*   p12 (bit 3) 8 elements, p13 (bit 2) 4 elements, */
  /*   p14 (bit 1) 2 elements, p15 (bit 0) 1 element. */
  /* p12 is set before this label is reached. p13-p15 are set below. */
  /* All loads are issued first and the multiply-adds follow at the end. */
  289. .align 32
  290. .L215:
  /* Start the 8-element chunk. p7 = (J == 0) means nothing is left, so */
  /* go straight to the final reduction. In that case p12 is clear as */
  /* well, so the loads in these two bundles do not execute. */
  291. { .mmi
  292. (p12) LDFD f48 = [X1], INCX4
  293. (p12) LDFD f49 = [X2], INCX4
  294. cmp.eq p7, p0 = r0, J
  295. }
  296. { .mmb
  297. (p12) LDFD f50 = [X3], INCX4
  298. (p12) LDFD f51 = [X4], INCX4
  299. (p7) br.cond.dptk .L999
  300. }
  301. ;;
  /* 8-element chunk, Y elements 0-3. p13 = bit 2 of N. */
  302. { .mmi
  303. (p12) LDFD f32 = [Y1], INCY4
  304. (p12) LDFD f33 = [Y2], INCY4
  305. tbit.nz p13, p0 = N, 2
  306. }
  307. { .mmi
  308. (p12) LDFD f34 = [Y3], INCY4
  309. (p12) LDFD f35 = [Y4], INCY4
  310. nop __LINE__
  311. }
  312. ;;
  /* 8-element chunk, X elements 4-7. p14 = bit 1 of N. */
  313. { .mmi
  314. (p12) LDFD f52 = [X1], INCX4
  315. (p12) LDFD f53 = [X2], INCX4
  316. tbit.nz p14, p0 = N, 1
  317. }
  318. { .mmi
  319. (p12) LDFD f54 = [X3], INCX4
  320. (p12) LDFD f55 = [X4], INCX4
  321. nop __LINE__
  322. }
  323. ;;
  /* 8-element chunk, Y elements 4-7. p15 = bit 0 of N. */
  324. { .mmi
  325. (p12) LDFD f36 = [Y1], INCY4
  326. (p12) LDFD f37 = [Y2], INCY4
  327. tbit.nz p15, p0 = N, 0
  328. }
  329. { .mmi
  330. (p12) LDFD f38 = [Y3], INCY4
  331. (p12) LDFD f39 = [Y4], INCY4
  332. nop __LINE__
  333. }
  334. ;;
  /* 4-element chunk, X elements. */
  335. { .mmi
  336. (p13) LDFD f56 = [X1], INCX4
  337. (p13) LDFD f57 = [X2], INCX4
  338. nop __LINE__
  339. }
  340. { .mmi
  341. (p13) LDFD f58 = [X3], INCX4
  342. (p13) LDFD f59 = [X4], INCX4
  343. nop __LINE__
  344. }
  345. ;;
  /* 4-element chunk, Y elements. */
  346. { .mmi
  347. (p13) LDFD f40 = [Y1], INCY4
  348. (p13) LDFD f41 = [Y2], INCY4
  349. nop __LINE__
  350. }
  351. { .mmi
  352. (p13) LDFD f42 = [Y3], INCY4
  353. (p13) LDFD f43 = [Y4], INCY4
  354. nop __LINE__
  355. }
  356. ;;
  /* 2-element chunk. It uses only the first two pointers of each stream */
  /* and steps them by 2 scaled strides, which leaves X1 and Y1 on the */
  /* final single element. */
  357. { .mmi
  358. (p14) LDFD f60 = [X1], INCX2
  359. (p14) LDFD f61 = [X2], INCX2
  360. nop __LINE__
  361. }
  362. { .mmi
  363. (p14) LDFD f44 = [Y1], INCY2
  364. (p14) LDFD f45 = [Y2], INCY2
  365. nop __LINE__
  366. }
  367. ;;
  /* Final single element. */
  368. { .mmi
  369. (p15) LDFD f62 = [X1]
  370. (p15) LDFD f46 = [Y1]
  371. nop __LINE__
  372. }
  373. ;;
  /* Multiply-adds for the 8-element chunk: Y in f32-f39 times X in f48-f55, one pair per accumulator. */
  374. (p12) FMA f8 = f32, f48, f8
  375. (p12) FMA f9 = f33, f49, f9
  376. (p12) FMA f10 = f34, f50, f10
  377. (p12) FMA f11 = f35, f51, f11
  378. ;;
  379. (p12) FMA f12 = f36, f52, f12
  380. (p12) FMA f13 = f37, f53, f13
  381. (p12) FMA f14 = f38, f54, f14
  382. (p12) FMA f15 = f39, f55, f15
  383. ;;
  /* Multiply-adds for the 4-element chunk: Y in f40-f43 times X in f56-f59. */
  384. (p13) FMA f8 = f40, f56, f8
  385. (p13) FMA f9 = f41, f57, f9
  386. (p13) FMA f10 = f42, f58, f10
  387. (p13) FMA f11 = f43, f59, f11
  388. ;;
  /* Multiply-adds for the 2-element chunk (into f8 and f9) and the single element (into f10). */
  389. (p14) FMA f8 = f44, f60, f8
  390. (p14) FMA f9 = f45, f61, f9
  391. (p15) FMA f10 = f46, f62, f10
  392. ;;
  /* Final reduction and return. */
  /* The eight partial accumulators f8-f15 are combined pairwise in */
  /* three levels, and the total is left in f8. FADD is assumed to be a */
  /* plain floating-point add. It is not defined in this file. */
  393. .align 32
  394. .L999:
  /* Level 1: eight partial sums become four. */
  395. FADD f8 = f8, f9
  396. FADD f10 = f10, f11
  397. FADD f12 = f12, f13
  398. FADD f14 = f14, f15
  399. ;;
  /* Level 2: four become two. The ar.lc value copied on entry is written back. */
  400. FADD f8 = f8, f10
  401. FADD f12 = f12, f14
  402. mov ar.lc = ARLC
  403. ;;
  /* Level 3: the result lands in f8. */
  /* The mask -65474 has bits 1-5 and bit 16 upward set, so the predicate */
  /* write-back covers p1-p5 and the rotating predicates p16-p63, and */
  /* leaves p6-p15 as they are. */
  404. FADD f8 = f8, f12
  405. mov pr = PR, -65474
  406. br.ret.sptk.many b0
  407. EPILOGUE