You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sdot.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Distance, in elements, used when forming the prefetch pointers PREX/PREY. */
  40. #define PREFETCH_SIZE (8 * 16 + 4)
  /* Register aliases used throughout this routine. */
  /* N: element count; X1/Y1: current x/y pointers; INCX/INCY: strides. */
  41. #define N r32
  42. #define X1 r33
  43. #define INCX r34
  44. #define Y1 r35
  45. #define INCY r36
  /* Prefetch pointers advanced by the lfetch instructions in the main loops. */
  46. #define PREX r2
  47. #define PREY r3
  /* I: (N >> 4) - 1, copied into ar.lc; J: N & 15, the leftover element count. */
  48. #define I r14
  49. #define J r15
  /* Second load streams, set 4 strides ahead of Y1 / X1. */
  50. #define Y2 r16
  51. #define X2 r17
  /* 16x and 5x multiples of the scaled strides. */
  52. #define INCX16 r18
  53. #define INCY16 r19
  54. #define INCX5 r20
  55. #define INCY5 r21
  /* YY/XX: extra pointers used by the tail code for the 2- and 1-element groups.
     XA/YA: 5-bit fields taken from the X1/Y1 addresses, used to pick a loop variant. */
  56. #define YY r22
  57. #define XA r23
  58. #define YA r24
  59. #define XX r25
  /* Saved copies of the predicate registers and of ar.lc, restored at .L999. */
  60. #define PR r30
  61. #define ARLC r31
  /*
   * Dot-product kernel entry.
   * Products of elements read through X1 and Y1 are accumulated in f8-f15 and
   * the eight partial sums are added together into f8 at .L999.
   * This part clears the accumulators, applies the F_INTERFACE argument
   * fix-ups, returns early unless 0 < N, and then either branches to .L100
   * (scaled INCX != SIZE) or prepares the loop for contiguous X.
   */
  62. PROLOGUE
  63. .prologue
  64. PROFCODE
  /* Clear accumulator f8 from f0 and save ar.lc, which the loops overwrite. */
  65. { .mfi
  66. nop.m 0
  67. mov f8 = f0
  68. .save ar.lc, ARLC
  69. mov ARLC = ar.lc
  70. }
  /* r26 = 1 (consumed by the F_INTERFACE code); XA = X1 >> 3 for the later variant choice. */
  71. { .mfi
  72. mov r26 = 1
  73. mov f9 = f0
  74. shr XA = X1, 3
  75. }
  76. ;;
  77. .body
  /* F_INTERFACE builds: N, INCX and INCY arrive as addresses and are loaded here. */
  78. #ifdef F_INTERFACE
  79. LDINT N = [N]
  80. LDINT INCX = [INCX]
  81. LDINT INCY = [INCY]
  82. ;;
  /* Without USE64BITINT the loaded values are sign-extended from 32 bits. */
  83. #ifndef USE64BITINT
  84. sxt4 N = N
  85. sxt4 INCX = INCX
  86. sxt4 INCY = INCY
  87. ;;
  88. #endif
  /* p6 = (INCX < 0), p7 = (INCY < 0); r26 = 1 - N. */
  89. cmp.le p0, p6 = r0, INCX
  90. cmp.le p0, p7 = r0, INCY
  91. sub r26 = r26, N
  92. ;;
  /* Integer multiply through the FP unit: r26 = (1 - N) * INCX, r27 = (1 - N) * INCY. */
  93. setf.sig f32 = r26
  94. setf.sig f33 = INCX
  95. setf.sig f34 = INCY
  96. ;;
  97. xmpy.l f33 = f32, f33
  98. xmpy.l f34 = f32, f34
  99. ;;
  100. getf.sig r26 = f33
  101. getf.sig r27 = f34
  102. ;;
  /* For a negative stride, add ((1 - N) * inc) << BASE_SHIFT to the base pointer. */
  103. (p6) shladd X1 = r26, BASE_SHIFT, X1
  104. (p7) shladd Y1 = r27, BASE_SHIFT, Y1
  105. ;;
  106. #endif
  /* INCX <<= BASE_SHIFT (it is compared against SIZE below); save the predicates in PR. */
  107. { .mfi
  108. shladd INCX = INCX, BASE_SHIFT, r0
  109. mov f32 = f0
  110. mov PR = pr
  111. }
  /* p6 = !(0 < N): return at once; f8 still holds the value copied from f0. */
  112. { .mfb
  113. cmp.lt p0, p6 = r0, N
  114. mov f80 = f0
  115. (p6) br.ret.sptk.many b0
  116. }
  117. ;;
  /* INCY <<= BASE_SHIFT; p15 = address bit BASE_SHIFT of X1.
     NOTE(review): presumably this detects X1 not being on a 2-element boundary
     for the paired LDFPD loads - confirm against common.h. */
  118. { .mfi
  119. shladd INCY = INCY, BASE_SHIFT, r0
  120. mov f10 = f0
  121. tbit.nz p15, p0 = X1, BASE_SHIFT
  122. }
  /* Scaled INCX != SIZE: X is not read with paired loads, continue at .L100. */
  123. { .mfb
  124. cmp.ne p6, p0 = SIZE, INCX
  125. mov f11 = f0
  126. (p6) br.cond.dptk .L100
  127. }
  128. ;;
  /* When p15 is set, peel one element pair into f32 / f80 and decrement N.
     pr.rot is cleared so only p16 (set below) starts the pipelined loop. */
  129. { .mfi
  130. (p15) LDFD f32 = [X1], INCX
  131. mov f12 = f0
  132. mov pr.rot= 0
  133. }
  134. { .mfi
  135. (p15) adds N = -1, N
  136. mov f13 = f0
  137. shr YA = Y1, 3
  138. }
  139. ;;
  /* I = N >> 4 (16-element blocks), J = N & 15 (leftover); XA and YA are masked to 5 bits. */
  140. { .mfi
  141. (p15) LDFD f80 = [Y1], INCY
  142. mov f14 = f0
  143. shr I = N, 4
  144. }
  145. { .mmi
  146. and J = 15, N
  147. and XA = 0x1f, XA
  148. and YA = 0x1f, YA
  149. }
  150. ;;
  /* INC*5 = 5 * stride, INC*16 = 16 * stride; XA = YA - XA; p12 = bit 3 of N. */
  151. { .mmi
  152. shladd INCX5 = INCX, 2, INCX
  153. shladd INCY5 = INCY, 2, INCY
  154. sub XA = YA, XA
  155. }
  156. { .mmi
  157. shladd INCX16 = INCX, 4, r0
  158. shladd INCY16 = INCY, 4, r0
  159. tbit.z p0, p12 = N, 3
  160. }
  161. ;;
  /* Y2 = Y1 + 4 * INCY; p7 = (J == 0); epilogue count of 3 for the pipelined loop. */
  162. { .mmi
  163. shladd Y2 = INCY, 2, Y1
  164. cmp.eq p7, p0 = r0, J
  165. mov ar.ec= 3
  166. }
  /* I = I - 1 (value for ar.lc); p8 = (XA <= 4); p16 = 1 enables the first loop stage. */
  167. { .mmi
  168. adds I = -1, I
  169. cmp.ge p8, p0 = 4, XA
  170. cmp.eq p16, p0 = r0, r0
  171. }
  172. ;;
  /* p9 = (XA >= 24); the .L20 variant is used when XA <= 4 or XA >= 24. */
  173. { .mbb
  174. cmp.le p9, p0 = 24, XA
  175. (p8) br.cond.dpnt .L20
  176. (p9) br.cond.dpnt .L20
  177. }
  178. ;;
  /* Prefetch pointers and loop count for .L12. */
  179. { .mmi
  180. adds PREX = PREFETCH_SIZE * SIZE, X1
  181. adds PREY = (PREFETCH_SIZE + 6) * SIZE, Y1
  182. mov ar.lc = I
  183. }
  /* f15 = f32 * f80: product of the peeled pair (both still hold the f0 copy if
     nothing was peeled). I == -1 means no full 16-element block: go to the tail. */
  184. { .mfb
  185. cmp.eq p6 ,p0 = -1, I
  186. FMA f15 = f32, f80, f0
  187. (p6) br.cond.dpnt .L15
  188. }
  189. ;;
  190. .align 32
  191. /* INCX == 1 && X is aligned */
  /*
   * Main loop, closed by br.ctop. Each pass issues, under stage predicate p16,
   * eight LDFPD pair loads from X1 (16 elements) and eight LDFD loads each
   * from Y1 and Y2 (stride INCY; the INCY5 step jumps each stream over the
   * four elements handled by the other one). The sixteen FMAs run under p18,
   * two rotations later, which is why their source registers are the load
   * targets plus 2 (f32 -> f34, f80 -> f82, ...). Accumulators are f8-f15.
   */
  192. .L12:
  193. { .mmf
  194. (p16) LDFPD f32, f35 = [X1], 2 * SIZE
  195. (p16) lfetch.nt1 [PREX], INCX16
  196. (p18) FMA f8 = f34, f82, f8
  197. }
  198. { .mmf
  199. (p16) LDFD f80 = [Y1], INCY
  200. (p16) LDFD f92 = [Y2], INCY
  201. (p18) FMA f9 = f37, f85, f9
  202. }
  203. ;;
  204. { .mmf
  205. (p16) LDFPD f38, f41 = [X1], 2 * SIZE
  206. (p16) lfetch.nt1 [PREY], INCY16
  207. (p18) FMA f10 = f40, f88, f10
  208. }
  209. { .mmf
  210. (p16) LDFD f83 = [Y1], INCY
  211. (p16) LDFD f95 = [Y2], INCY
  212. (p18) FMA f11 = f43, f91, f11
  213. }
  214. ;;
  215. { .mmf
  216. (p16) LDFPD f44, f47 = [X1], 2 * SIZE
  217. (p18) FMA f12 = f46, f94, f12
  218. }
  219. { .mmf
  220. (p16) LDFD f86 = [Y1], INCY
  221. (p16) LDFD f98 = [Y2], INCY
  222. (p18) FMA f13 = f49, f97, f13
  223. }
  224. ;;
  225. { .mmf
  226. (p16) LDFPD f50, f53 = [X1], 2 * SIZE
  227. (p18) FMA f14 = f52, f100, f14
  228. }
  /* INCY5 step: Y1 and Y2 each skip the four elements read by the other stream. */
  229. { .mmf
  230. (p16) LDFD f89 = [Y1], INCY5
  231. (p16) LDFD f101 = [Y2], INCY5
  232. (p18) FMA f15 = f55, f103, f15
  233. }
  234. ;;
  /* Second half of the 16-element block. */
  235. { .mmf
  236. (p16) LDFPD f56, f59 = [X1], 2 * SIZE
  237. (p18) FMA f8 = f58, f106, f8
  238. }
  239. { .mmf
  240. (p16) LDFD f104 = [Y1], INCY
  241. (p16) LDFD f116 = [Y2], INCY
  242. (p18) FMA f9 = f61, f109, f9
  243. }
  244. ;;
  245. { .mmf
  246. (p16) LDFPD f62, f65 = [X1], 2 * SIZE
  247. (p18) FMA f10 = f64, f112, f10
  248. }
  249. { .mmf
  250. (p16) LDFD f107 = [Y1], INCY
  251. (p16) LDFD f119 = [Y2], INCY
  252. (p18) FMA f11 = f67, f115, f11
  253. }
  254. ;;
  255. { .mmf
  256. (p16) LDFPD f68, f71 = [X1], 2 * SIZE
  257. (p18) FMA f12 = f70, f118, f12
  258. }
  259. { .mmf
  260. (p16) LDFD f110 = [Y1], INCY
  261. (p16) LDFD f122 = [Y2], INCY
  262. (p18) FMA f13 = f73, f121, f13
  263. }
  264. ;;
  265. { .mmf
  266. (p16) LDFPD f74, f77 = [X1], 2 * SIZE
  267. (p16) LDFD f113 = [Y1], INCY5
  268. (p18) FMA f14 = f76, f124, f14
  269. }
  270. { .mfb
  271. (p16) LDFD f125 = [Y2], INCY5
  272. (p18) FMA f15 = f79, f127, f15
  273. br.ctop.sptk.few .L12
  274. }
  275. ;;
  276. .align 32
  /*
   * Tail after .L12: handles the leftover elements selected by the low four
   * bits of N. p12, p13, p14 and p15 are set from bits 3, 2, 1 and 0 of N and
   * guard groups of 8, 4, 2 and 1 elements. X is read with LDFPD pairs from
   * X1; Y with LDFD at stride INCY from Y1, Y2 and YY.
   */
  277. .L15:
  /* YY starts as a copy of Y1; p13 = bit 2 of N. */
  278. { .mmi
  279. (p12) LDFPD f32, f33 = [X1], 2 * SIZE
  280. mov YY = Y1
  281. tbit.z p0, p13 = N, 2
  282. }
  /* p7 (N & 15 == 0): nothing is left, go straight to the reduction. */
  283. { .mmb
  284. (p12) LDFD f34 = [Y1], INCY
  285. (p12) LDFD f42 = [Y2], INCY
  286. (p7) br.cond.dptk .L999
  287. }
  288. ;;
  /* If the 8-element group is present, move YY past it; p14 = bit 1 of N. */
  289. { .mmi
  290. (p12) LDFPD f36, f37 = [X1], 2 * SIZE
  291. (p12) shladd YY = INCY, 3, YY
  292. tbit.z p0, p14 = N, 1
  293. }
  294. { .mmi
  295. (p12) LDFD f35 = [Y1], INCY
  296. (p12) LDFD f43 = [Y2], INCY
  297. tbit.z p0, p15 = N, 0
  298. }
  299. ;;
  /* If the 4-element group is present, move YY past it as well. */
  300. { .mmi
  301. (p12) LDFPD f40, f41 = [X1], 2 * SIZE
  302. (p13) shladd YY = INCY, 2, YY
  303. }
  304. { .mmi
  305. (p12) LDFD f38 = [Y1], INCY
  306. (p12) LDFD f46 = [Y2], INCY
  307. }
  308. ;;
  309. (p12) LDFPD f44, f45 = [X1], 2 * SIZE
  310. (p12) LDFD f39 = [Y1], INCY5
  311. (p12) LDFD f47 = [Y2], INCY5
  312. ;;
  /* 4-element group from X1 / Y1, interleaved with the 2- and 1-element Y loads via YY. */
  313. (p13) LDFPD f48, f49 = [X1], 2 * SIZE
  314. (p13) LDFD f50 = [Y1], INCY
  315. (p14) LDFD f58 = [YY], INCY
  316. ;;
  317. (p13) LDFPD f52, f53 = [X1], 2 * SIZE
  318. (p13) LDFD f51 = [Y1], INCY
  319. (p14) LDFD f59 = [YY], INCY
  320. ;;
  321. (p14) LDFPD f56, f57 = [X1], 2 * SIZE
  322. (p13) LDFD f54 = [Y1], INCY
  323. (p15) LDFD f61 = [YY]
  324. ;;
  325. (p13) LDFD f55 = [Y1], INCY
  326. (p15) LDFD f60 = [X1]
  327. ;;
  /* Accumulate the 8-element group. */
  328. (p12) FMA f8 = f32, f34, f8
  329. (p12) FMA f9 = f33, f35, f9
  330. (p12) FMA f10 = f36, f38, f10
  331. (p12) FMA f11 = f37, f39, f11
  332. (p12) FMA f12 = f40, f42, f12
  333. (p12) FMA f13 = f41, f43, f13
  334. (p12) FMA f14 = f44, f46, f14
  335. (p12) FMA f15 = f45, f47, f15
  336. ;;
  /* Accumulate the 4-, 2- and 1-element groups, then reduce at .L999. */
  337. (p13) FMA f8 = f48, f50, f8
  338. (p13) FMA f9 = f49, f51, f9
  339. (p13) FMA f10 = f52, f54, f10
  340. (p13) FMA f11 = f53, f55, f11
  341. (p14) FMA f12 = f56, f58, f12
  342. (p14) FMA f13 = f57, f59, f13
  343. (p15) FMA f14 = f60, f61, f14
  344. br .L999
  345. ;;
  346. .align 32
  /*
   * Alternate setup and main loop for contiguous X, entered when the XA value
   * computed before the branch is <= 4 or >= 24. Compared with .L12, PREY
   * uses an offset of PREFETCH_SIZE + 38 instead of + 6, and the Y loads are
   * issued one stage later (p17) into registers numbered one higher (f81
   * instead of f80), so the p18 FMAs read the same register numbers.
   * NOTE(review): the reason for the different load timing is not stated in
   * this file - presumably it avoids a conflict between the X and Y streams;
   * confirm.
   */
  347. .L20:
  348. { .mmi
  349. adds PREX = PREFETCH_SIZE * SIZE, X1
  350. adds PREY = (PREFETCH_SIZE + 38) * SIZE, Y1
  351. mov ar.lc = I
  352. }
  /* f15 = product of the peeled pair; I == -1 skips the loop and goes to the tail. */
  353. { .mfb
  354. cmp.eq p6 ,p0 = -1, I
  355. FMA f15 = f32, f80, f0
  356. (p6) br.cond.dpnt .L25
  357. }
  358. ;;
  359. .align 32
  /* Loop body: X pair loads under p16, Y loads under p17, FMAs under p18. */
  360. .L22:
  361. { .mmf
  362. (p16) LDFPD f32, f35 = [X1], 2 * SIZE
  363. (p16) lfetch.nt1 [PREX], INCX16
  364. (p18) FMA f8 = f34, f82, f8
  365. }
  366. { .mmf
  367. (p17) LDFD f81 = [Y1], INCY
  368. (p17) LDFD f93 = [Y2], INCY
  369. (p18) FMA f9 = f37, f85, f9
  370. }
  371. ;;
  372. { .mmf
  373. (p16) LDFPD f38, f41 = [X1], 2 * SIZE
  374. (p16) lfetch.nt1 [PREY], INCY16
  375. (p18) FMA f10 = f40, f88, f10
  376. }
  377. { .mmf
  378. (p17) LDFD f84 = [Y1], INCY
  379. (p17) LDFD f96 = [Y2], INCY
  380. (p18) FMA f11 = f43, f91, f11
  381. }
  382. ;;
  383. { .mmf
  384. (p16) LDFPD f44, f47 = [X1], 2 * SIZE
  385. (p18) FMA f12 = f46, f94, f12
  386. }
  387. { .mmf
  388. (p17) LDFD f87 = [Y1], INCY
  389. (p17) LDFD f99 = [Y2], INCY
  390. (p18) FMA f13 = f49, f97, f13
  391. }
  392. ;;
  393. { .mmf
  394. (p16) LDFPD f50, f53 = [X1], 2 * SIZE
  395. (p18) FMA f14 = f52, f100, f14
  396. }
  /* INCY5 step: Y1 and Y2 each skip the four elements read by the other stream. */
  397. { .mmf
  398. (p17) LDFD f90 = [Y1], INCY5
  399. (p17) LDFD f102 = [Y2], INCY5
  400. (p18) FMA f15 = f55, f103, f15
  401. }
  402. ;;
  /* Second half of the 16-element block. */
  403. { .mmf
  404. (p16) LDFPD f56, f59 = [X1], 2 * SIZE
  405. (p18) FMA f8 = f58, f106, f8
  406. }
  407. { .mmf
  408. (p17) LDFD f105 = [Y1], INCY
  409. (p17) LDFD f117 = [Y2], INCY
  410. (p18) FMA f9 = f61, f109, f9
  411. }
  412. ;;
  413. { .mmf
  414. (p16) LDFPD f62, f65 = [X1], 2 * SIZE
  415. (p18) FMA f10 = f64, f112, f10
  416. }
  417. { .mmf
  418. (p17) LDFD f108 = [Y1], INCY
  419. (p17) LDFD f120 = [Y2], INCY
  420. (p18) FMA f11 = f67, f115, f11
  421. }
  422. ;;
  423. { .mmf
  424. (p16) LDFPD f68, f71 = [X1], 2 * SIZE
  425. (p18) FMA f12 = f70, f118, f12
  426. }
  427. { .mmf
  428. (p17) LDFD f111 = [Y1], INCY
  429. (p17) LDFD f123 = [Y2], INCY
  430. (p18) FMA f13 = f73, f121, f13
  431. }
  432. ;;
  433. { .mmf
  434. (p16) LDFPD f74, f77 = [X1], 2 * SIZE
  435. (p17) LDFD f114 = [Y1], INCY5
  436. (p18) FMA f14 = f76, f124, f14
  437. }
  438. { .mfb
  439. (p17) LDFD f126 = [Y2], INCY5
  440. (p18) FMA f15 = f79, f127, f15
  441. br.ctop.sptk.few .L22
  442. }
  443. ;;
  444. .align 32
  /*
   * Tail after .L22: same instruction sequence as the tail at .L15. p12, p13,
   * p14 and p15 are set from bits 3, 2, 1 and 0 of N and guard groups of 8,
   * 4, 2 and 1 leftover elements. X is read with LDFPD pairs from X1; Y with
   * LDFD at stride INCY from Y1, Y2 and YY.
   */
  445. .L25:
  /* YY starts as a copy of Y1; p13 = bit 2 of N. */
  446. { .mmi
  447. (p12) LDFPD f32, f33 = [X1], 2 * SIZE
  448. mov YY = Y1
  449. tbit.z p0, p13 = N, 2
  450. }
  /* p7 (N & 15 == 0): nothing is left, go straight to the reduction. */
  451. { .mmb
  452. (p12) LDFD f34 = [Y1], INCY
  453. (p12) LDFD f42 = [Y2], INCY
  454. (p7) br.cond.dptk .L999
  455. }
  456. ;;
  /* If the 8-element group is present, move YY past it; p14 = bit 1 of N. */
  457. { .mmi
  458. (p12) LDFPD f36, f37 = [X1], 2 * SIZE
  459. (p12) shladd YY = INCY, 3, YY
  460. tbit.z p0, p14 = N, 1
  461. }
  462. { .mmi
  463. (p12) LDFD f35 = [Y1], INCY
  464. (p12) LDFD f43 = [Y2], INCY
  465. tbit.z p0, p15 = N, 0
  466. }
  467. ;;
  /* If the 4-element group is present, move YY past it as well. */
  468. { .mmi
  469. (p12) LDFPD f40, f41 = [X1], 2 * SIZE
  470. (p13) shladd YY = INCY, 2, YY
  471. }
  472. { .mmi
  473. (p12) LDFD f38 = [Y1], INCY
  474. (p12) LDFD f46 = [Y2], INCY
  475. }
  476. ;;
  477. (p12) LDFPD f44, f45 = [X1], 2 * SIZE
  478. (p12) LDFD f39 = [Y1], INCY5
  479. (p12) LDFD f47 = [Y2], INCY5
  480. ;;
  /* 4-element group from X1 / Y1, interleaved with the 2- and 1-element Y loads via YY. */
  481. (p13) LDFPD f48, f49 = [X1], 2 * SIZE
  482. (p13) LDFD f50 = [Y1], INCY
  483. (p14) LDFD f58 = [YY], INCY
  484. ;;
  485. (p13) LDFPD f52, f53 = [X1], 2 * SIZE
  486. (p13) LDFD f51 = [Y1], INCY
  487. (p14) LDFD f59 = [YY], INCY
  488. ;;
  489. (p14) LDFPD f56, f57 = [X1], 2 * SIZE
  490. (p13) LDFD f54 = [Y1], INCY
  491. (p15) LDFD f61 = [YY]
  492. ;;
  493. (p13) LDFD f55 = [Y1], INCY
  494. (p15) LDFD f60 = [X1]
  495. ;;
  /* Accumulate the 8-element group. */
  496. (p12) FMA f8 = f32, f34, f8
  497. (p12) FMA f9 = f33, f35, f9
  498. (p12) FMA f10 = f36, f38, f10
  499. (p12) FMA f11 = f37, f39, f11
  500. (p12) FMA f12 = f40, f42, f12
  501. (p12) FMA f13 = f41, f43, f13
  502. (p12) FMA f14 = f44, f46, f14
  503. (p12) FMA f15 = f45, f47, f15
  504. ;;
  /* Accumulate the 4-, 2- and 1-element groups, then reduce at .L999. */
  505. (p13) FMA f8 = f48, f50, f8
  506. (p13) FMA f9 = f49, f51, f9
  507. (p13) FMA f10 = f52, f54, f10
  508. (p13) FMA f11 = f53, f55, f11
  509. (p14) FMA f12 = f56, f58, f12
  510. (p14) FMA f13 = f57, f59, f13
  511. (p15) FMA f14 = f60, f61, f14
  512. br .L999
  513. ;;
  514. .align 32
  /*
   * Reached when the scaled INCX differs from SIZE. If the scaled INCY also
   * differs from SIZE, control goes on to .L200; otherwise Y is the
   * contiguous vector and this code mirrors the contiguous-X setup with the
   * roles of X and Y swapped (Y is read with LDFPD pairs, X at stride INCX
   * through X1 and X2).
   */
  515. .L100:
  /* X2 = X1 + 4 * INCX (also used by the .L200 path). */
  516. { .mmi
  517. shladd X2 = INCX, 2, X1
  518. }
  /* p15 = address bit BASE_SHIFT of Y1; branch to .L200 if scaled INCY != SIZE. */
  519. { .mib
  520. cmp.ne p6, p0 = SIZE, INCY
  521. tbit.nz p15, p0 = Y1, BASE_SHIFT
  522. (p6) br.cond.dptk .L200
  523. }
  524. ;;
  /* When p15 is set, peel one element pair into f32 / f80 and decrement N. */
  525. { .mfi
  526. (p15) LDFD f32 = [X1], INCX
  527. mov f12 = f0
  528. mov pr.rot= 0
  529. }
  530. { .mfi
  531. (p15) adds N = -1, N
  532. mov f13 = f0
  533. shr YA = Y1, 3
  534. }
  535. ;;
  /* I = N >> 4, J = N & 15; XA (still the value taken from X1 at entry) and YA masked to 5 bits. */
  536. { .mfi
  537. (p15) LDFD f80 = [Y1], INCY
  538. mov f14 = f0
  539. shr I = N, 4
  540. }
  541. { .mmi
  542. and J = 15, N
  543. and XA = 0x1f, XA
  544. and YA = 0x1f, YA
  545. }
  546. ;;
  /* Stride multiples; XA = YA - XA; p12 = bit 3 of N. */
  547. { .mmi
  548. shladd INCX5 = INCX, 2, INCX
  549. shladd INCY5 = INCY, 2, INCY
  550. sub XA = YA, XA
  551. }
  552. { .mmi
  553. shladd INCX16 = INCX, 4, r0
  554. shladd INCY16 = INCY, 4, r0
  555. tbit.z p0, p12 = N, 3
  556. }
  557. ;;
  /* X2 is recomputed because the peel above may have advanced X1. */
  558. { .mmi
  559. shladd X2 = INCX, 2, X1
  560. cmp.eq p7, p0 = r0, J
  561. mov ar.ec= 3
  562. }
  /* I = I - 1 (value for ar.lc); p8 = (XA <= 8); p16 = 1 enables the first loop stage. */
  563. { .mmi
  564. adds I = -1, I
  565. cmp.ge p8, p0 = 8, XA
  566. cmp.eq p16, p0 = r0, r0
  567. }
  568. ;;
  /* p9 = (XA >= 28); the .L120 variant is used when XA <= 8 or XA >= 28. */
  569. { .mbb
  570. cmp.le p9, p0 = 28, XA
  571. (p8) br.cond.dpnt .L120
  572. (p9) br.cond.dpnt .L120
  573. }
  574. ;;
  /* Prefetch pointers and loop count for .L112. */
  575. { .mmi
  576. adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1
  577. adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1
  578. mov ar.lc = I
  579. }
  /* f15 = product of the peeled pair; I == -1 skips the loop and goes to the tail. */
  580. { .mfb
  581. cmp.eq p6 ,p0 = -1, I
  582. FMA f15 = f32, f80, f0
  583. (p6) br.cond.dpnt .L115
  584. }
  585. ;;
  586. .align 32
  587. /* INCY == 1 */
  /*
   * Main loop for contiguous Y and strided X, closed by br.ctop. Under p16
   * each pass issues eight LDFPD pair loads from Y1 (16 elements) and eight
   * LDFD loads each from X1 and X2 (stride INCX; the INCX5 step jumps each
   * stream over the four elements handled by the other one). The sixteen
   * FMAs run under p18 and read the load targets plus 2 (f32 -> f34, ...).
   */
  588. .L112:
  589. { .mmf
  590. (p16) LDFPD f32, f35 = [Y1], 2 * SIZE
  591. (p16) lfetch.nt1 [PREX], INCX16
  592. (p18) FMA f8 = f34, f82, f8
  593. }
  594. { .mmf
  595. (p16) LDFD f80 = [X1], INCX
  596. (p16) LDFD f92 = [X2], INCX
  597. (p18) FMA f9 = f37, f85, f9
  598. }
  599. ;;
  600. { .mmf
  601. (p16) LDFPD f38, f41 = [Y1], 2 * SIZE
  602. (p16) lfetch.nt1 [PREY], INCY16
  603. (p18) FMA f10 = f40, f88, f10
  604. }
  605. { .mmf
  606. (p16) LDFD f83 = [X1], INCX
  607. (p16) LDFD f95 = [X2], INCX
  608. (p18) FMA f11 = f43, f91, f11
  609. }
  610. ;;
  611. { .mmf
  612. (p16) LDFPD f44, f47 = [Y1], 2 * SIZE
  613. (p18) FMA f12 = f46, f94, f12
  614. }
  615. { .mmf
  616. (p16) LDFD f86 = [X1], INCX
  617. (p16) LDFD f98 = [X2], INCX
  618. (p18) FMA f13 = f49, f97, f13
  619. }
  620. ;;
  621. { .mmf
  622. (p16) LDFPD f50, f53 = [Y1], 2 * SIZE
  623. (p18) FMA f14 = f52, f100, f14
  624. }
  /* INCX5 step: X1 and X2 each skip the four elements read by the other stream. */
  625. { .mmf
  626. (p16) LDFD f89 = [X1], INCX5
  627. (p16) LDFD f101 = [X2], INCX5
  628. (p18) FMA f15 = f55, f103, f15
  629. }
  630. ;;
  /* Second half of the 16-element block. */
  631. { .mmf
  632. (p16) LDFPD f56, f59 = [Y1], 2 * SIZE
  633. (p18) FMA f8 = f58, f106, f8
  634. }
  635. { .mmf
  636. (p16) LDFD f104 = [X1], INCX
  637. (p16) LDFD f116 = [X2], INCX
  638. (p18) FMA f9 = f61, f109, f9
  639. }
  640. ;;
  641. { .mmf
  642. (p16) LDFPD f62, f65 = [Y1], 2 * SIZE
  643. (p18) FMA f10 = f64, f112, f10
  644. }
  645. { .mmf
  646. (p16) LDFD f107 = [X1], INCX
  647. (p16) LDFD f119 = [X2], INCX
  648. (p18) FMA f11 = f67, f115, f11
  649. }
  650. ;;
  651. { .mmf
  652. (p16) LDFPD f68, f71 = [Y1], 2 * SIZE
  653. (p18) FMA f12 = f70, f118, f12
  654. }
  655. { .mmf
  656. (p16) LDFD f110 = [X1], INCX
  657. (p16) LDFD f122 = [X2], INCX
  658. (p18) FMA f13 = f73, f121, f13
  659. }
  660. ;;
  661. { .mmf
  662. (p16) LDFPD f74, f77 = [Y1], 2 * SIZE
  663. (p16) LDFD f113 = [X1], INCX5
  664. (p18) FMA f14 = f76, f124, f14
  665. }
  666. { .mfb
  667. (p16) LDFD f125 = [X2], INCX5
  668. (p18) FMA f15 = f79, f127, f15
  669. br.ctop.sptk.few .L112
  670. }
  671. ;;
  672. .align 32
  /*
   * Tail after .L112: handles the leftover elements selected by the low four
   * bits of N. p12, p13, p14 and p15 are set from bits 3, 2, 1 and 0 of N and
   * guard groups of 8, 4, 2 and 1 elements. Y is read with LDFPD pairs from
   * Y1; X with LDFD at stride INCX from X1, X2 and XX.
   */
  673. .L115:
  /* XX starts as a copy of X1; p13 = bit 2 of N. */
  674. { .mmi
  675. (p12) LDFPD f32, f33 = [Y1], 2 * SIZE
  676. mov XX = X1
  677. tbit.z p0, p13 = N, 2
  678. }
  /* p7 (N & 15 == 0): nothing is left, go straight to the reduction. */
  679. { .mmb
  680. (p12) LDFD f34 = [X1], INCX
  681. (p12) LDFD f42 = [X2], INCX
  682. (p7) br.cond.dptk .L999
  683. }
  684. ;;
  /* If the 8-element group is present, move XX past it; p14 = bit 1 of N. */
  685. { .mmi
  686. (p12) LDFPD f36, f37 = [Y1], 2 * SIZE
  687. (p12) shladd XX = INCX, 3, XX
  688. tbit.z p0, p14 = N, 1
  689. }
  690. { .mmi
  691. (p12) LDFD f35 = [X1], INCX
  692. (p12) LDFD f43 = [X2], INCX
  693. tbit.z p0, p15 = N, 0
  694. }
  695. ;;
  /* If the 4-element group is present, move XX past it as well. */
  696. { .mmi
  697. (p12) LDFPD f40, f41 = [Y1], 2 * SIZE
  698. (p13) shladd XX = INCX, 2, XX
  699. }
  700. { .mmi
  701. (p12) LDFD f38 = [X1], INCX
  702. (p12) LDFD f46 = [X2], INCX
  703. }
  704. ;;
  705. (p12) LDFPD f44, f45 = [Y1], 2 * SIZE
  706. (p12) LDFD f39 = [X1], INCX5
  707. (p12) LDFD f47 = [X2], INCX5
  708. ;;
  /* 4-element group from Y1 / X1, interleaved with the 2- and 1-element X loads via XX. */
  709. (p13) LDFPD f48, f49 = [Y1], 2 * SIZE
  710. (p13) LDFD f50 = [X1], INCX
  711. (p14) LDFD f58 = [XX], INCX
  712. ;;
  713. (p13) LDFPD f52, f53 = [Y1], 2 * SIZE
  714. (p13) LDFD f51 = [X1], INCX
  715. (p14) LDFD f59 = [XX], INCX
  716. ;;
  717. (p14) LDFPD f56, f57 = [Y1], 2 * SIZE
  718. (p13) LDFD f54 = [X1], INCX
  719. (p15) LDFD f61 = [XX]
  720. ;;
  721. (p13) LDFD f55 = [X1], INCX
  722. (p15) LDFD f60 = [Y1]
  723. ;;
  /* Accumulate the 8-element group. */
  724. (p12) FMA f8 = f32, f34, f8
  725. (p12) FMA f9 = f33, f35, f9
  726. (p12) FMA f10 = f36, f38, f10
  727. (p12) FMA f11 = f37, f39, f11
  728. (p12) FMA f12 = f40, f42, f12
  729. (p12) FMA f13 = f41, f43, f13
  730. (p12) FMA f14 = f44, f46, f14
  731. (p12) FMA f15 = f45, f47, f15
  732. ;;
  /* Accumulate the 4-, 2- and 1-element groups, then reduce at .L999. */
  733. (p13) FMA f8 = f48, f50, f8
  734. (p13) FMA f9 = f49, f51, f9
  735. (p13) FMA f10 = f52, f54, f10
  736. (p13) FMA f11 = f53, f55, f11
  737. (p14) FMA f12 = f56, f58, f12
  738. (p14) FMA f13 = f57, f59, f13
  739. (p15) FMA f14 = f60, f61, f14
  740. br .L999
  741. ;;
  742. .align 32
  /*
   * Alternate setup and main loop for contiguous Y and strided X, entered
   * when the XA value computed before the branch is <= 8 or >= 28. Compared
   * with .L112, the X loads are issued one stage later (p17) into registers
   * numbered one higher (f81 instead of f80), so the p18 FMAs read the same
   * register numbers.
   * NOTE(review): both PREX and PREY are derived from X1 here and both are
   * advanced by INCX16 in the loop, so this variant issues no prefetch for Y,
   * whereas .L112 derives PREY from Y1 and steps it by INCY16. lfetch does
   * not affect the computed result; confirm whether this is intentional.
   */
  743. .L120:
  744. { .mmi
  745. adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1
  746. adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1
  747. mov ar.lc = I
  748. }
  /* f15 = product of the peeled pair; I == -1 skips the loop and goes to the tail. */
  749. { .mfb
  750. cmp.eq p6 ,p0 = -1, I
  751. FMA f15 = f32, f80, f0
  752. (p6) br.cond.dpnt .L125
  753. }
  754. ;;
  755. .align 32
  /* Loop body: Y pair loads under p16, X loads under p17, FMAs under p18. */
  756. .L122:
  757. { .mmf
  758. (p16) LDFPD f32, f35 = [Y1], 2 * SIZE
  759. (p16) lfetch.nt1 [PREX], INCX16
  760. (p18) FMA f8 = f34, f82, f8
  761. }
  762. { .mmf
  763. (p17) LDFD f81 = [X1], INCX
  764. (p17) LDFD f93 = [X2], INCX
  765. (p18) FMA f9 = f37, f85, f9
  766. }
  767. ;;
  768. { .mmf
  769. (p16) LDFPD f38, f41 = [Y1], 2 * SIZE
  770. (p16) lfetch.nt1 [PREY], INCX16
  771. (p18) FMA f10 = f40, f88, f10
  772. }
  773. { .mmf
  774. (p17) LDFD f84 = [X1], INCX
  775. (p17) LDFD f96 = [X2], INCX
  776. (p18) FMA f11 = f43, f91, f11
  777. }
  778. ;;
  779. { .mmf
  780. (p16) LDFPD f44, f47 = [Y1], 2 * SIZE
  781. (p18) FMA f12 = f46, f94, f12
  782. }
  783. { .mmf
  784. (p17) LDFD f87 = [X1], INCX
  785. (p17) LDFD f99 = [X2], INCX
  786. (p18) FMA f13 = f49, f97, f13
  787. }
  788. ;;
  789. { .mmf
  790. (p16) LDFPD f50, f53 = [Y1], 2 * SIZE
  791. (p18) FMA f14 = f52, f100, f14
  792. }
  /* INCX5 step: X1 and X2 each skip the four elements read by the other stream. */
  793. { .mmf
  794. (p17) LDFD f90 = [X1], INCX5
  795. (p17) LDFD f102 = [X2], INCX5
  796. (p18) FMA f15 = f55, f103, f15
  797. }
  798. ;;
  /* Second half of the 16-element block. */
  799. { .mmf
  800. (p16) LDFPD f56, f59 = [Y1], 2 * SIZE
  801. (p18) FMA f8 = f58, f106, f8
  802. }
  803. { .mmf
  804. (p17) LDFD f105 = [X1], INCX
  805. (p17) LDFD f117 = [X2], INCX
  806. (p18) FMA f9 = f61, f109, f9
  807. }
  808. ;;
  809. { .mmf
  810. (p16) LDFPD f62, f65 = [Y1], 2 * SIZE
  811. (p18) FMA f10 = f64, f112, f10
  812. }
  813. { .mmf
  814. (p17) LDFD f108 = [X1], INCX
  815. (p17) LDFD f120 = [X2], INCX
  816. (p18) FMA f11 = f67, f115, f11
  817. }
  818. ;;
  819. { .mmf
  820. (p16) LDFPD f68, f71 = [Y1], 2 * SIZE
  821. (p18) FMA f12 = f70, f118, f12
  822. }
  823. { .mmf
  824. (p17) LDFD f111 = [X1], INCX
  825. (p17) LDFD f123 = [X2], INCX
  826. (p18) FMA f13 = f73, f121, f13
  827. }
  828. ;;
  829. { .mmf
  830. (p16) LDFPD f74, f77 = [Y1], 2 * SIZE
  831. (p17) LDFD f114 = [X1], INCX5
  832. (p18) FMA f14 = f76, f124, f14
  833. }
  834. { .mfb
  835. (p17) LDFD f126 = [X2], INCX5
  836. (p18) FMA f15 = f79, f127, f15
  837. br.ctop.sptk.few .L122
  838. }
  839. ;;
  840. .align 32
  /*
   * Tail after .L122: same instruction sequence as the tail at .L115. p12,
   * p13, p14 and p15 are set from bits 3, 2, 1 and 0 of N and guard groups of
   * 8, 4, 2 and 1 leftover elements. Y is read with LDFPD pairs from Y1; X
   * with LDFD at stride INCX from X1, X2 and XX.
   */
  841. .L125:
  /* XX starts as a copy of X1; p13 = bit 2 of N. */
  842. { .mmi
  843. (p12) LDFPD f32, f33 = [Y1], 2 * SIZE
  844. mov XX = X1
  845. tbit.z p0, p13 = N, 2
  846. }
  /* p7 (N & 15 == 0): nothing is left, go straight to the reduction. */
  847. { .mmb
  848. (p12) LDFD f34 = [X1], INCX
  849. (p12) LDFD f42 = [X2], INCX
  850. (p7) br.cond.dptk .L999
  851. }
  852. ;;
  /* If the 8-element group is present, move XX past it; p14 = bit 1 of N. */
  853. { .mmi
  854. (p12) LDFPD f36, f37 = [Y1], 2 * SIZE
  855. (p12) shladd XX = INCX, 3, XX
  856. tbit.z p0, p14 = N, 1
  857. }
  858. { .mmi
  859. (p12) LDFD f35 = [X1], INCX
  860. (p12) LDFD f43 = [X2], INCX
  861. tbit.z p0, p15 = N, 0
  862. }
  863. ;;
  /* If the 4-element group is present, move XX past it as well. */
  864. { .mmi
  865. (p12) LDFPD f40, f41 = [Y1], 2 * SIZE
  866. (p13) shladd XX = INCX, 2, XX
  867. }
  868. { .mmi
  869. (p12) LDFD f38 = [X1], INCX
  870. (p12) LDFD f46 = [X2], INCX
  871. }
  872. ;;
  873. (p12) LDFPD f44, f45 = [Y1], 2 * SIZE
  874. (p12) LDFD f39 = [X1], INCX5
  875. (p12) LDFD f47 = [X2], INCX5
  876. ;;
  /* 4-element group from Y1 / X1, interleaved with the 2- and 1-element X loads via XX. */
  877. (p13) LDFPD f48, f49 = [Y1], 2 * SIZE
  878. (p13) LDFD f50 = [X1], INCX
  879. (p14) LDFD f58 = [XX], INCX
  880. ;;
  881. (p13) LDFPD f52, f53 = [Y1], 2 * SIZE
  882. (p13) LDFD f51 = [X1], INCX
  883. (p14) LDFD f59 = [XX], INCX
  884. ;;
  885. (p14) LDFPD f56, f57 = [Y1], 2 * SIZE
  886. (p13) LDFD f54 = [X1], INCX
  887. (p15) LDFD f61 = [XX]
  888. ;;
  889. (p13) LDFD f55 = [X1], INCX
  890. (p15) LDFD f60 = [Y1]
  891. ;;
  /* Accumulate the 8-element group. */
  892. (p12) FMA f8 = f32, f34, f8
  893. (p12) FMA f9 = f33, f35, f9
  894. (p12) FMA f10 = f36, f38, f10
  895. (p12) FMA f11 = f37, f39, f11
  896. (p12) FMA f12 = f40, f42, f12
  897. (p12) FMA f13 = f41, f43, f13
  898. (p12) FMA f14 = f44, f46, f14
  899. (p12) FMA f15 = f45, f47, f15
  900. ;;
  /* Accumulate the 4-, 2- and 1-element groups, then reduce at .L999. */
  901. (p13) FMA f8 = f48, f50, f8
  902. (p13) FMA f9 = f49, f51, f9
  903. (p13) FMA f10 = f52, f54, f10
  904. (p13) FMA f11 = f53, f55, f11
  905. (p14) FMA f12 = f56, f58, f12
  906. (p14) FMA f13 = f57, f59, f13
  907. (p15) FMA f14 = f60, f61, f14
  908. br .L999
  909. ;;
  910. .align 32
  /*
   * General-stride path: reached from .L100 when the scaled INCY also differs
   * from SIZE, i.e. neither vector is read with paired loads. No element is
   * peeled here. X2 was set to X1 + 4 * INCX at .L100 before the branch.
   */
  911. .L200:
  /* Clear the remaining accumulators; J = N & 15, I = N >> 4. */
  912. { .mfi
  913. shladd INCX5 = INCX, 2, INCX
  914. mov f12 = f0
  915. mov pr.rot= 0
  916. }
  917. { .mfi
  918. and J = 15, N
  919. mov f13 = f0
  920. shr I = N, 4
  921. }
  922. ;;
  /* p16 = 1 enables the first loop stage; stride multiples; p12 = bit 3 of N. */
  923. { .mmf
  924. cmp.eq p16, p0 = r0, r0
  925. shladd INCY5 = INCY, 2, INCY
  926. mov f14 = f0
  927. }
  928. { .mmi
  929. shladd INCX16 = INCX, 4, r0
  930. shladd INCY16 = INCY, 4, r0
  931. tbit.z p0, p12 = N, 3
  932. }
  933. ;;
  /* p7 = (J == 0); I = I - 1 (value for ar.lc); epilogue count of 3. */
  934. { .mmi
  935. cmp.eq p7, p0 = r0, J
  936. adds I = -1, I
  937. mov ar.ec= 3
  938. }
  /* Y2 = Y1 + 4 * INCY; XX / YY start as copies of X1 / Y1 for the tail. */
  939. { .mmi
  940. shladd Y2 = INCY, 2, Y1
  941. mov XX = X1
  942. mov YY = Y1
  943. }
  944. ;;
  945. { .mmi
  946. adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1
  947. adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1
  948. mov ar.lc = I
  949. }
  /* f15 is cleared (no peeled product on this path); I == -1 skips the loop. */
  950. { .mfb
  951. cmp.eq p6 ,p0 = -1, I
  952. mov f15 = f0
  953. (p6) br.cond.dpnt .L215
  954. }
  955. ;;
  956. .align 32
  957. /* INCX != 1 && INCY != 1 */
  /*
   * Main loop, closed by br.ctop. Under p16 each pass loads 16 elements of Y
   * through Y1 / Y2 (stride INCY) and 16 elements of X through X1 / X2
   * (stride INCX) with single-element LDFD loads; the INC*5 steps jump each
   * stream over the four elements handled by its partner. The sixteen FMAs
   * run under p18 and read the load targets plus 2. XX and YY are advanced by
   * 16 strides per pass under the same predicate as the loads.
   */
  958. .L212:
  959. { .mmf
  960. (p16) lfetch.nt1 [PREX], INCX16
  961. (p16) lfetch.nt1 [PREY], INCY16
  962. (p18) FMA f8 = f34, f82, f8
  963. }
  964. { .mmf
  965. (p16) LDFD f32 = [Y1], INCY
  966. (p16) LDFD f44 = [Y2], INCY
  967. (p18) FMA f9 = f37, f85, f9
  968. }
  969. ;;
  970. { .mmf
  971. (p16) LDFD f80 = [X1], INCX
  972. (p16) LDFD f92 = [X2], INCX
  973. (p18) FMA f10 = f40, f88, f10
  974. }
  975. { .mmf
  976. (p16) LDFD f35 = [Y1], INCY
  977. (p16) LDFD f47 = [Y2], INCY
  978. (p18) FMA f11 = f43, f91, f11
  979. }
  980. ;;
  981. { .mmf
  982. (p16) LDFD f83 = [X1], INCX
  983. (p16) LDFD f95 = [X2], INCX
  984. (p18) FMA f12 = f46, f94, f12
  985. }
  986. { .mmf
  987. (p16) LDFD f38 = [Y1], INCY
  988. (p16) LDFD f50 = [Y2], INCY
  989. (p18) FMA f13 = f49, f97, f13
  990. }
  991. ;;
  992. { .mmf
  993. (p16) LDFD f86 = [X1], INCX
  994. (p16) LDFD f98 = [X2], INCX
  995. (p18) FMA f14 = f52, f100, f14
  996. }
  997. { .mmf
  998. (p16) LDFD f41 = [Y1], INCY5
  999. (p16) LDFD f53 = [Y2], INCY5
  1000. (p18) FMA f15 = f55, f103, f15
  1001. }
  1002. ;;
  1003. { .mmf
  1004. (p16) LDFD f89 = [X1], INCX5
  1005. (p16) LDFD f101 = [X2], INCX5
  1006. (p18) FMA f8 = f58, f106, f8
  1007. }
  /* Second half of the 16-element block. */
  1008. { .mmf
  1009. (p16) LDFD f56 = [Y1], INCY
  1010. (p16) LDFD f68 = [Y2], INCY
  1011. (p18) FMA f9 = f61, f109, f9
  1012. }
  1013. ;;
  1014. { .mmf
  1015. (p16) LDFD f104 = [X1], INCX
  1016. (p16) LDFD f116 = [X2], INCX
  1017. (p18) FMA f10 = f64, f112, f10
  1018. }
  1019. { .mmf
  1020. (p16) LDFD f59 = [Y1], INCY
  1021. (p16) LDFD f71 = [Y2], INCY
  1022. (p18) FMA f11 = f67, f115, f11
  1023. }
  1024. ;;
  1025. { .mmf
  1026. (p16) LDFD f107 = [X1], INCX
  1027. (p16) LDFD f119 = [X2], INCX
  1028. (p18) FMA f12 = f70, f118, f12
  1029. }
  1030. { .mmf
  1031. (p16) LDFD f62 = [Y1], INCY
  1032. (p16) LDFD f74 = [Y2], INCY
  1033. (p18) FMA f13 = f73, f121, f13
  1034. }
  1035. ;;
  1036. { .mmf
  1037. (p16) LDFD f110 = [X1], INCX
  1038. (p16) LDFD f122 = [X2], INCX
  1039. (p18) FMA f14 = f76, f124, f14
  1040. }
  1041. { .mmf
  1042. (p16) LDFD f65 = [Y1], INCY5
  1043. (p16) LDFD f77 = [Y2], INCY5
  1044. (p18) FMA f15 = f79, f127, f15
  1045. }
  1046. ;;
  1047. { .mmi
  1048. (p16) LDFD f113 = [X1], INCX5
  1049. (p16) LDFD f125 = [X2], INCX5
  1050. }
  /* Keep the tail pointers in step with the loads. */
  1051. { .mmb
  1052. (p16) add XX = INCX16, XX
  1053. (p16) add YY = INCY16, YY
  1054. br.ctop.sptk.few .L212
  1055. }
  1056. ;;
  1057. .align 32
  /*
   * Tail for the general-stride path: handles the leftover elements selected
   * by the low four bits of N. p12, p13, p14 and p15 are set from bits 3, 2,
   * 1 and 0 of N and guard groups of 8, 4, 2 and 1 elements. All loads are
   * single-element LDFD: the 8- and 4-element groups go through X1 / X2 and
   * Y1 / Y2, the 2- and 1-element groups through XX and YY.
   */
  1058. .L215:
  1059. { .mmi
  1060. (p12) LDFD f34 = [X1], INCX
  1061. (p12) LDFD f42 = [X2], INCX
  1062. tbit.z p0, p13 = N, 2
  1063. }
  /* p7 (N & 15 == 0): nothing is left, go straight to the reduction. */
  1064. { .mmb
  1065. (p12) LDFD f32 = [Y1], INCY
  1066. (p12) LDFD f40 = [Y2], INCY
  1067. (p7) br.cond.dptk .L999
  1068. }
  1069. ;;
  1070. { .mmi
  1071. (p12) LDFD f35 = [X1], INCX
  1072. (p12) LDFD f43 = [X2], INCX
  1073. tbit.z p0, p14 = N, 1
  1074. }
  1075. { .mmi
  1076. (p12) LDFD f33 = [Y1], INCY
  1077. (p12) LDFD f41 = [Y2], INCY
  1078. tbit.z p0, p15 = N, 0
  1079. }
  1080. ;;
  /* If the 8-element group is present, move XX / YY past it. */
  1081. { .mmi
  1082. (p12) LDFD f38 = [X1], INCX
  1083. (p12) LDFD f46 = [X2], INCX
  1084. (p12) shladd XX = INCX, 3, XX
  1085. }
  1086. { .mmi
  1087. (p12) LDFD f36 = [Y1], INCY
  1088. (p12) LDFD f44 = [Y2], INCY
  1089. (p12) shladd YY = INCY, 3, YY
  1090. }
  1091. ;;
  /* If the 4-element group is present, move XX / YY past it as well. */
  1092. { .mmi
  1093. (p12) LDFD f39 = [X1], INCX5
  1094. (p12) LDFD f47 = [X2], INCX5
  1095. (p13) shladd XX = INCX, 2, XX
  1096. }
  1097. { .mmi
  1098. (p12) LDFD f37 = [Y1], INCY5
  1099. (p12) LDFD f45 = [Y2], INCY5
  1100. (p13) shladd YY = INCY, 2, YY
  1101. }
  1102. ;;
  /* 4-element group via X1 / Y1, interleaved with the 2- and 1-element groups via XX / YY. */
  1103. (p13) LDFD f50 = [X1], INCX
  1104. (p13) LDFD f48 = [Y1], INCY
  1105. (p14) LDFD f58 = [XX], INCX
  1106. (p14) LDFD f56 = [YY], INCY
  1107. ;;
  1108. (p13) LDFD f51 = [X1], INCX
  1109. (p13) LDFD f49 = [Y1], INCY
  1110. (p14) LDFD f59 = [XX], INCX
  1111. (p14) LDFD f57 = [YY], INCY
  1112. ;;
  1113. (p13) LDFD f54 = [X1], INCX
  1114. (p13) LDFD f52 = [Y1], INCY
  1115. (p15) LDFD f61 = [XX]
  1116. (p15) LDFD f60 = [YY]
  1117. ;;
  /* Last loads of the 4-element group; no post-increment is applied here. */
  1118. (p13) LDFD f55 = [X1]
  1119. (p13) LDFD f53 = [Y1]
  1120. ;;
  /* Accumulate the 8-element group. */
  1121. (p12) FMA f8 = f32, f34, f8
  1122. (p12) FMA f9 = f33, f35, f9
  1123. (p12) FMA f10 = f36, f38, f10
  1124. (p12) FMA f11 = f37, f39, f11
  1125. (p12) FMA f12 = f40, f42, f12
  1126. (p12) FMA f13 = f41, f43, f13
  1127. (p12) FMA f14 = f44, f46, f14
  1128. (p12) FMA f15 = f45, f47, f15
  1129. ;;
  /* Accumulate the 4-, 2- and 1-element groups, then reduce at .L999. */
  1130. (p13) FMA f8 = f48, f50, f8
  1131. (p13) FMA f9 = f49, f51, f9
  1132. (p13) FMA f10 = f52, f54, f10
  1133. (p13) FMA f11 = f53, f55, f11
  1134. (p14) FMA f12 = f56, f58, f12
  1135. (p14) FMA f13 = f57, f59, f13
  1136. (p15) FMA f14 = f60, f61, f14
  1137. br .L999
  1138. ;;
  1139. .align 32
  /*
   * Common exit: add the eight partial sums f8-f15 pairwise in three steps so
   * the total ends up in f8, restore ar.lc and the predicate registers saved
   * at entry, and return.
   */
  1140. .L999:
  /* Step 1: eight accumulators -> four. */
  1141. FADD f8 = f8, f9
  1142. FADD f10 = f10, f11
  1143. FADD f12 = f12, f13
  1144. FADD f14 = f14, f15
  1145. ;;
  /* Step 2: four -> two; ar.lc is restored from ARLC. */
  1146. FADD f8 = f8, f10
  1147. FADD f12 = f12, f14
  1148. mov ar.lc = ARLC
  1149. ;;
  /* Step 3: final sum in f8; restore the predicates selected by mask -65474 from PR. */
  1150. FADD f8 = f8, f12
  1151. mov pr = PR, -65474
  1152. br.ret.sptk.many b0
  1153. EPILOGUE