You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

ddot.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Tuning constant and symbolic register names used by the whole routine. */
  /* r32-r36 are the first five stacked (input) registers on IA-64.         */
  40. #define PREFETCH_SIZE (16 * 16 + 2) /* prefetch lead in elements; multiplied by SIZE where PREX/PREY are initialised */
  41. #define N r32 /* element count (a pointer to it under F_INTERFACE until loaded) */
  42. #define X1 r33 /* running pointer into the first vector */
  43. #define INCX r34 /* stride of the first vector; scaled to bytes with BASE_SHIFT at entry */
  44. #define Y1 r35 /* running pointer into the second vector */
  45. #define INCY r36 /* stride of the second vector; scaled to bytes with BASE_SHIFT at entry */
  46. #define PREX r2 /* lfetch address advanced by INCX16 */
  47. #define PREY r3 /* lfetch address advanced by INCY16 */
  48. #define I r14 /* (N >> 4) - 1: value placed in ar.lc for the 16-element loops */
  49. #define J r15 /* N & 15: number of leftover elements */
  50. #define Y2 r16 /* second Y pointer, Y1 + 2*INCY */
  51. #define X2 r17 /* second X pointer, X1 + 2*INCX */
  52. #define INCX16 r18 /* 16 * INCX (bytes) */
  53. #define INCY16 r19 /* 16 * INCY (bytes) */
  54. #define INCX3 r20 /* 3 * INCX (bytes) */
  55. #define INCY3 r21 /* 3 * INCY (bytes) */
  56. #define YY r22 /* address of the final single element of the strided Y in the tails */
  57. #define XA r23 /* bits 4..7 of X1's address; later overwritten with YA - XA */
  58. #define YA r24 /* bits 4..7 of Y1's address */
  59. #define XX r25 /* address of the final single element of the strided X in the tails */
  60. #define PR r30 /* copy of the predicate register taken at entry */
  61. #define ARLC r31 /* copy of ar.lc taken at entry */
  /* Entry: zero the accumulators, save ar.lc and pr, normalise arguments,  */
  /* return 0 for N <= 0, then dispatch on whether INCX is one element.     */
  /* PROLOGUE, PROFCODE, LDINT, LDFD, FMA, SIZE and BASE_SHIFT come from    */
  /* common.h (not visible here); BASE_SHIFT is presumably log2(SIZE).      */
  62. PROLOGUE
  63. .prologue
  64. PROFCODE
  65. { .mfi
  66. nop.m 0
  67. mov f8 = f0 /* f8..f15 are the eight partial sums; f8 also carries the result */
  68. .save ar.lc, ARLC
  69. mov ARLC = ar.lc /* ar.lc is overwritten by the counted loops, restored at .L999 */
  70. }
  71. { .mfi
  72. mov r26 = 1 /* only consumed by the F_INTERFACE block (forms 1 - N) */
  73. mov f9 = f0
  74. shr XA = X1, 4 /* taken from X1 as passed in, before any adjustment below */
  75. }
  76. ;;
  77. .body
  78. #ifdef F_INTERFACE /* by-reference interface: N, INCX, INCY arrive as addresses */
  79. LDINT N = [N]
  80. LDINT INCX = [INCX]
  81. LDINT INCY = [INCY]
  82. ;;
  83. #ifndef USE64BITINT /* 32-bit integers: sign-extend to 64 bits */
  84. sxt4 N = N
  85. sxt4 INCX = INCX
  86. sxt4 INCY = INCY
  87. ;;
  88. #endif
  89. cmp.le p0, p6 = r0, INCX /* p6 = (INCX < 0) */
  90. cmp.le p0, p7 = r0, INCY /* p7 = (INCY < 0) */
  91. sub r26 = r26, N /* r26 = 1 - N */
  92. ;;
  93. setf.sig f32 = r26 /* integer multiply is done in the FP unit via xmpy */
  94. setf.sig f33 = INCX
  95. setf.sig f34 = INCY
  96. ;;
  97. xmpy.l f33 = f32, f33 /* (1 - N) * INCX */
  98. xmpy.l f34 = f32, f34 /* (1 - N) * INCY */
  99. ;;
  100. getf.sig r26 = f33
  101. getf.sig r27 = f34
  102. ;;
  103. (p6) shladd X1 = r26, BASE_SHIFT, X1 /* negative stride: X1 += (1 - N) * INCX elements */
  104. (p7) shladd Y1 = r27, BASE_SHIFT, Y1 /* negative stride: Y1 += (1 - N) * INCY elements */
  105. ;;
  106. #endif /* F_INTERFACE */
  107. { .mfi
  108. shladd INCX = INCX, BASE_SHIFT, r0 /* INCX from elements to bytes */
  109. mov f32 = f0 /* f32/f80 hold the peeled element pair; zero when nothing is peeled */
  110. mov PR = pr
  111. }
  112. { .mfb
  113. cmp.lt p0, p6 = r0, N /* p6 = (N <= 0) */
  114. mov f80 = f0
  115. (p6) br.ret.sptk.many b0 /* nothing to do: return with f8 == 0 */
  116. }
  117. ;;
  118. { .mfi
  119. shladd INCY = INCY, BASE_SHIFT, r0 /* INCY from elements to bytes */
  120. mov f10 = f0
  121. tbit.nz p15, p0 = X1, BASE_SHIFT /* p15 = address bit BASE_SHIFT of X1 is set (used by the unit-INCX path to peel one element) */
  122. }
  123. { .mfb
  124. cmp.ne p6, p0 = SIZE, INCX /* p6 = X is not unit stride */
  125. mov f11 = f0
  126. (p6) br.cond.dptk .L100
  127. }
  128. ;;
  /* INCX == SIZE path. If p15 is set, one element pair is loaded into      */
  /* f32/f80 and N is decremented so that X1 is suitable for the paired     */
  /* LDFPD loads. Then I = N/16 - 1, J = N%16 and the stride multiples are  */
  /* derived, and one of two loop variants (.L12 or .L20) is chosen from    */
  /* the difference between address bits 4..7 of Y and X.                   */
  /* NOTE(review): the variant choice looks like a cache-bank-conflict      */
  /* heuristic - confirm against the original author's intent.              */
  129. { .mfi
  130. (p15) LDFD f32 = [X1], INCX /* peel x[0] */
  131. mov f12 = f0
  132. mov pr.rot= 0 /* clear the rotating predicates p16-p63 */
  133. }
  134. { .mfi
  135. (p15) adds N = -1, N
  136. mov f13 = f0
  137. shr YA = Y1, 4 /* read before Y1 is post-incremented by the peel in the next group */
  138. }
  139. ;;
  140. { .mfi
  141. (p15) LDFD f80 = [Y1], INCY /* peel y[0] */
  142. mov f14 = f0
  143. shr I = N, 4 /* number of full 16-element blocks */
  144. }
  145. { .mmi
  146. and J = 15, N /* leftover element count */
  147. and XA = 0xf, XA
  148. and YA = 0xf, YA
  149. }
  150. ;;
  151. { .mmi
  152. shladd INCX3 = INCX, 1, INCX /* 2*INCX + INCX */
  153. shladd INCY3 = INCY, 1, INCY /* 2*INCY + INCY */
  154. sub XA = YA, XA /* XA now holds the Y-minus-X offset difference */
  155. }
  156. { .mmi
  157. shladd INCX16 = INCX, 4, r0
  158. shladd INCY16 = INCY, 4, r0
  159. tbit.z p0, p12 = N, 3 /* p12 = remainder contains a group of 8 */
  160. }
  161. ;;
  162. { .mmi
  163. shladd Y2 = INCY, 1, Y1 /* Y2 = Y1 + 2*INCY */
  164. cmp.eq p7, p0 = r0, J /* p7 = no leftover elements */
  165. mov ar.ec= 3 /* three pipeline stages: p16, p17, p18 */
  166. }
  167. { .mmi
  168. adds I = -1, I /* loop count for ar.lc; -1 means no full block */
  169. cmp.ge p8, p0 = 2, XA /* p8 = (XA <= 2) */
  170. cmp.eq p16, p0 = r0, r0 /* p16 = 1: enable the first pipeline stage */
  171. }
  172. ;;
  173. { .mbb
  174. cmp.le p9, p0 = 12, XA /* p9 = (XA >= 12) */
  175. (p8) br.cond.dpnt .L20
  176. (p9) br.cond.dpnt .L20
  177. }
  178. ;;
  179. { .mmi
  180. adds PREX = PREFETCH_SIZE * SIZE, X1
  181. adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1
  182. mov ar.lc = I
  183. }
  184. { .mfb
  185. cmp.eq p6 ,p0 = -1, I /* p6 = no full 16-element block */
  186. FMA f15 = f32, f80, f0 /* product of the peeled pair (0 when nothing was peeled) seeds f15 */
  187. (p6) br.cond.dpnt .L15
  188. }
  189. ;;
  /* Software-pipelined main loop, 16 element pairs per iteration.          */
  /* Stage p16 issues the loads; stage p18 (two register rotations later)   */
  /* issues the FMAs, so a value loaded into f32 is consumed as f34, f80 as */
  /* f82, and so on. X is read as consecutive pairs with LDFPD; Y is read   */
  /* through Y1/Y2, which leapfrog using strides INCY and INCY3.            */
  190. .align 32
  191. /* INCX == 1 && X is aligned */
  192. .L12:
  193. { .mmf
  194. (p16) LDFPD f32, f35 = [X1], 2 * SIZE /* x0, x1 */
  195. (p16) lfetch.nt1 [PREX], INCX16
  196. (p18) FMA f8 = f34, f82, f8 /* x0 * y0 */
  197. }
  198. { .mmf
  199. (p16) LDFD f80 = [Y1], INCY /* y0 */
  200. (p16) LDFD f86 = [Y2], INCY /* y2 */
  201. (p18) FMA f9 = f37, f85, f9 /* x1 * y1 */
  202. }
  203. ;;
  204. { .mmf
  205. (p16) LDFPD f38, f41 = [X1], 2 * SIZE /* x2, x3 */
  206. (p16) lfetch.nt1 [PREY], INCY16
  207. (p18) FMA f10 = f40, f88, f10
  208. }
  209. { .mmf
  210. (p16) LDFD f83 = [Y1], INCY3 /* y1, then Y1 skips to y4 */
  211. (p16) LDFD f89 = [Y2], INCY3 /* y3, then Y2 skips to y6 */
  212. (p18) FMA f11 = f43, f91, f11
  213. }
  214. ;;
  215. { .mmf
  216. (p16) LDFPD f44, f47 = [X1], 2 * SIZE
  217. (p18) FMA f12 = f46, f94, f12
  218. }
  219. { .mmf
  220. (p16) LDFD f92 = [Y1], INCY
  221. (p16) LDFD f98 = [Y2], INCY
  222. (p18) FMA f13 = f49, f97, f13
  223. }
  224. ;;
  225. { .mmf
  226. (p16) LDFPD f50, f53 = [X1], 2 * SIZE
  227. (p18) FMA f14 = f52, f100, f14
  228. }
  229. { .mmf
  230. (p16) LDFD f95 = [Y1], INCY3
  231. (p16) LDFD f101 = [Y2], INCY3
  232. (p18) FMA f15 = f55, f103, f15
  233. }
  234. ;;
  235. { .mmf
  236. (p16) LDFPD f56, f59 = [X1], 2 * SIZE
  237. (p18) FMA f8 = f58, f106, f8 /* second pass over the eight accumulators */
  238. }
  239. { .mmf
  240. (p16) LDFD f104 = [Y1], INCY
  241. (p16) LDFD f110 = [Y2], INCY
  242. (p18) FMA f9 = f61, f109, f9
  243. }
  244. ;;
  245. { .mmf
  246. (p16) LDFPD f62, f65 = [X1], 2 * SIZE
  247. (p18) FMA f10 = f64, f112, f10
  248. }
  249. { .mmf
  250. (p16) LDFD f107 = [Y1], INCY3
  251. (p16) LDFD f113 = [Y2], INCY3
  252. (p18) FMA f11 = f67, f115, f11
  253. }
  254. ;;
  255. { .mmf
  256. (p16) LDFPD f68, f71 = [X1], 2 * SIZE
  257. (p18) FMA f12 = f70, f118, f12
  258. }
  259. { .mmf
  260. (p16) LDFD f116 = [Y1], INCY
  261. (p16) LDFD f122 = [Y2], INCY
  262. (p18) FMA f13 = f73, f121, f13
  263. }
  264. ;;
  265. { .mmf
  266. (p16) LDFPD f74, f77 = [X1], 2 * SIZE
  267. (p16) LDFD f119 = [Y1], INCY3
  268. (p18) FMA f14 = f76, f124, f14
  269. }
  270. { .mfb
  271. (p16) LDFD f125 = [Y2], INCY3
  272. (p18) FMA f15 = f79, f127, f15
  273. br.ctop.sptk.few .L12 /* counted loop branch: rotates registers, uses ar.lc then ar.ec */
  274. }
  275. ;;
  /* Remainder after .L12: handles the J = N & 15 leftover pairs as groups  */
  /* of 8 (p12), 4 (p13), 2 (p14) and 1 (p15). All loads are issued first,  */
  /* then the FMAs. YY tracks where the final single Y element lives,       */
  /* because Y1 is not advanced past the last pair.                         */
  276. .align 32
  277. .L15:
  278. { .mmi
  279. (p12) LDFPD f32, f33 = [X1], 2 * SIZE
  280. mov YY = Y1 /* reads Y1 before the post-increment in this same group */
  281. tbit.z p0, p13 = N, 2 /* p13 = group of 4 present */
  282. }
  283. { .mmb
  284. (p12) LDFD f34 = [Y1], INCY
  285. (p12) LDFD f38 = [Y2], INCY
  286. (p7) br.cond.dptk .L999 /* J == 0: nothing left (p12 is necessarily clear) */
  287. }
  288. ;;
  289. { .mmi
  290. (p12) LDFPD f36, f37 = [X1], 2 * SIZE
  291. (p12) shladd YY = INCY, 3, YY /* YY += 8 * INCY */
  292. tbit.z p0, p14 = N, 1 /* p14 = group of 2 present */
  293. }
  294. { .mmi
  295. (p12) LDFD f35 = [Y1], INCY3
  296. (p12) LDFD f39 = [Y2], INCY3
  297. tbit.z p0, p15 = N, 0 /* p15 = single trailing element present */
  298. }
  299. ;;
  300. { .mmi
  301. (p12) LDFPD f40, f41 = [X1], 2 * SIZE
  302. (p13) shladd YY = INCY, 2, YY /* YY += 4 * INCY */
  303. }
  304. { .mmi
  305. (p12) LDFD f42 = [Y1], INCY
  306. (p12) LDFD f46 = [Y2], INCY
  307. }
  308. ;;
  309. (p12) LDFPD f44, f45 = [X1], 2 * SIZE
  310. (p12) LDFD f43 = [Y1], INCY3
  311. (p12) LDFD f47 = [Y2], INCY3
  312. (p14) shladd YY = INCY, 1, YY /* YY += 2 * INCY */
  313. ;;
  314. (p13) LDFPD f48, f49 = [X1], 2 * SIZE
  315. (p13) LDFD f50 = [Y1], INCY
  316. (p13) LDFD f54 = [Y2], INCY
  317. ;;
  318. (p13) LDFPD f52, f53 = [X1], 2 * SIZE
  319. (p13) LDFD f51 = [Y1], INCY3
  320. (p13) LDFD f55 = [Y2], INCY3
  321. ;;
  322. (p14) LDFPD f56, f57 = [X1], 2 * SIZE
  323. (p14) LDFD f58 = [Y1], INCY
  324. (p15) LDFD f61 = [YY] /* last single y */
  325. ;;
  326. (p14) LDFD f59 = [Y1]
  327. (p15) LDFD f60 = [X1] /* last single x; X1 already points at it */
  328. ;;
  329. (p12) FMA f8 = f32, f34, f8
  330. (p12) FMA f9 = f33, f35, f9
  331. (p12) FMA f10 = f36, f38, f10
  332. (p12) FMA f11 = f37, f39, f11
  333. (p12) FMA f12 = f40, f42, f12
  334. (p12) FMA f13 = f41, f43, f13
  335. (p12) FMA f14 = f44, f46, f14
  336. (p12) FMA f15 = f45, f47, f15
  337. ;;
  338. (p13) FMA f8 = f48, f50, f8
  339. (p13) FMA f9 = f49, f51, f9
  340. (p13) FMA f10 = f52, f54, f10
  341. (p13) FMA f11 = f53, f55, f11
  342. (p14) FMA f12 = f56, f58, f12
  343. (p14) FMA f13 = f57, f59, f13
  344. (p15) FMA f14 = f60, f61, f14
  345. br .L999 /* go to the final reduction */
  346. ;;
  /* Set-up for the alternative unit-INCX loop (.L22), taken when the       */
  /* Y-minus-X offset difference is <= 2 or >= 12. Same as the .L12 set-up  */
  /* except that the Y prefetch lead is PREFETCH_SIZE + 18 elements.        */
  347. .align 32
  348. .L20:
  349. { .mmi
  350. adds PREX = PREFETCH_SIZE * SIZE, X1
  351. adds PREY = (PREFETCH_SIZE + 18) * SIZE, Y1
  352. mov ar.lc = I
  353. }
  354. { .mfb
  355. cmp.eq p6 ,p0 = -1, I /* p6 = no full 16-element block */
  356. FMA f15 = f32, f80, f0 /* product of the peeled pair seeds f15 */
  357. (p6) br.cond.dpnt .L25
  358. }
  359. ;;
  /* Variant of .L12 with the Y loads staggered by half a block: the first  */
  /* eight Y elements of a block are loaded in stage p16 (bottom half of    */
  /* the body) and the last eight in stage p17 (top half of the following   */
  /* iteration). A p17 load therefore names the register one rotation       */
  /* later: f105 is what a p16 load would call f104 and what the p18 FMA    */
  /* reads as f106. X loads and all FMAs are unchanged from .L12.           */
  360. .align 32
  361. .L22:
  362. { .mmf
  363. (p16) LDFPD f32, f35 = [X1], 2 * SIZE
  364. (p16) lfetch.nt1 [PREX], INCX16
  365. (p18) FMA f8 = f34, f82, f8
  366. }
  367. { .mmf
  368. (p17) LDFD f105 = [Y1], INCY /* y8 of the previous block */
  369. (p17) LDFD f111 = [Y2], INCY /* y10 of the previous block */
  370. (p18) FMA f9 = f37, f85, f9
  371. }
  372. ;;
  373. { .mmf
  374. (p16) LDFPD f38, f41 = [X1], 2 * SIZE
  375. (p16) lfetch.nt1 [PREY], INCY16
  376. (p18) FMA f10 = f40, f88, f10
  377. }
  378. { .mmf
  379. (p17) LDFD f108 = [Y1], INCY3
  380. (p17) LDFD f114 = [Y2], INCY3
  381. (p18) FMA f11 = f43, f91, f11
  382. }
  383. ;;
  384. { .mmf
  385. (p16) LDFPD f44, f47 = [X1], 2 * SIZE
  386. (p18) FMA f12 = f46, f94, f12
  387. }
  388. { .mmf
  389. (p17) LDFD f117 = [Y1], INCY
  390. (p17) LDFD f123 = [Y2], INCY
  391. (p18) FMA f13 = f49, f97, f13
  392. }
  393. ;;
  394. { .mmf
  395. (p16) LDFPD f50, f53 = [X1], 2 * SIZE
  396. (p18) FMA f14 = f52, f100, f14
  397. }
  398. { .mmf
  399. (p17) LDFD f120 = [Y1], INCY3
  400. (p17) LDFD f126 = [Y2], INCY3
  401. (p18) FMA f15 = f55, f103, f15
  402. }
  403. ;;
  404. { .mmf
  405. (p16) LDFPD f56, f59 = [X1], 2 * SIZE
  406. (p18) FMA f8 = f58, f106, f8
  407. }
  408. { .mmf
  409. (p16) LDFD f80 = [Y1], INCY /* y0 of the current block */
  410. (p16) LDFD f86 = [Y2], INCY /* y2 of the current block */
  411. (p18) FMA f9 = f61, f109, f9
  412. }
  413. ;;
  414. { .mmf
  415. (p16) LDFPD f62, f65 = [X1], 2 * SIZE
  416. (p18) FMA f10 = f64, f112, f10
  417. }
  418. { .mmf
  419. (p16) LDFD f83 = [Y1], INCY3
  420. (p16) LDFD f89 = [Y2], INCY3
  421. (p18) FMA f11 = f67, f115, f11
  422. }
  423. ;;
  424. { .mmf
  425. (p16) LDFPD f68, f71 = [X1], 2 * SIZE
  426. (p18) FMA f12 = f70, f118, f12
  427. }
  428. { .mmf
  429. (p16) LDFD f92 = [Y1], INCY
  430. (p16) LDFD f98 = [Y2], INCY
  431. (p18) FMA f13 = f73, f121, f13
  432. }
  433. ;;
  434. { .mmf
  435. (p16) LDFPD f74, f77 = [X1], 2 * SIZE
  436. (p16) LDFD f95 = [Y1], INCY3
  437. (p18) FMA f14 = f76, f124, f14
  438. }
  439. { .mfb
  440. (p16) LDFD f101 = [Y2], INCY3
  441. (p18) FMA f15 = f79, f127, f15
  442. br.ctop.sptk.few .L22 /* counted loop branch: rotates registers, uses ar.lc then ar.ec */
  443. }
  444. ;;
  /* Remainder after .L22; instruction-for-instruction the same as .L15.    */
  /* Leftover pairs are handled as groups of 8 (p12), 4 (p13), 2 (p14) and  */
  /* 1 (p15); YY tracks the address of the final single Y element.          */
  445. .align 32
  446. .L25:
  447. { .mmi
  448. (p12) LDFPD f32, f33 = [X1], 2 * SIZE
  449. mov YY = Y1 /* reads Y1 before the post-increment in this same group */
  450. tbit.z p0, p13 = N, 2 /* p13 = group of 4 present */
  451. }
  452. { .mmb
  453. (p12) LDFD f34 = [Y1], INCY
  454. (p12) LDFD f38 = [Y2], INCY
  455. (p7) br.cond.dptk .L999 /* J == 0: nothing left */
  456. }
  457. ;;
  458. { .mmi
  459. (p12) LDFPD f36, f37 = [X1], 2 * SIZE
  460. (p12) shladd YY = INCY, 3, YY /* YY += 8 * INCY */
  461. tbit.z p0, p14 = N, 1 /* p14 = group of 2 present */
  462. }
  463. { .mmi
  464. (p12) LDFD f35 = [Y1], INCY3
  465. (p12) LDFD f39 = [Y2], INCY3
  466. tbit.z p0, p15 = N, 0 /* p15 = single trailing element present */
  467. }
  468. ;;
  469. { .mmi
  470. (p12) LDFPD f40, f41 = [X1], 2 * SIZE
  471. (p13) shladd YY = INCY, 2, YY /* YY += 4 * INCY */
  472. }
  473. { .mmi
  474. (p12) LDFD f42 = [Y1], INCY
  475. (p12) LDFD f46 = [Y2], INCY
  476. }
  477. ;;
  478. (p12) LDFPD f44, f45 = [X1], 2 * SIZE
  479. (p12) LDFD f43 = [Y1], INCY3
  480. (p12) LDFD f47 = [Y2], INCY3
  481. (p14) shladd YY = INCY, 1, YY /* YY += 2 * INCY */
  482. ;;
  483. (p13) LDFPD f48, f49 = [X1], 2 * SIZE
  484. (p13) LDFD f50 = [Y1], INCY
  485. (p13) LDFD f54 = [Y2], INCY
  486. ;;
  487. (p13) LDFPD f52, f53 = [X1], 2 * SIZE
  488. (p13) LDFD f51 = [Y1], INCY3
  489. (p13) LDFD f55 = [Y2], INCY3
  490. ;;
  491. (p14) LDFPD f56, f57 = [X1], 2 * SIZE
  492. (p14) LDFD f58 = [Y1], INCY
  493. (p15) LDFD f61 = [YY] /* last single y */
  494. ;;
  495. (p14) LDFD f59 = [Y1]
  496. (p15) LDFD f60 = [X1] /* last single x */
  497. ;;
  498. (p12) FMA f8 = f32, f34, f8
  499. (p12) FMA f9 = f33, f35, f9
  500. (p12) FMA f10 = f36, f38, f10
  501. (p12) FMA f11 = f37, f39, f11
  502. (p12) FMA f12 = f40, f42, f12
  503. (p12) FMA f13 = f41, f43, f13
  504. (p12) FMA f14 = f44, f46, f14
  505. (p12) FMA f15 = f45, f47, f15
  506. ;;
  507. (p13) FMA f8 = f48, f50, f8
  508. (p13) FMA f9 = f49, f51, f9
  509. (p13) FMA f10 = f52, f54, f10
  510. (p13) FMA f11 = f53, f55, f11
  511. (p14) FMA f12 = f56, f58, f12
  512. (p14) FMA f13 = f57, f59, f13
  513. (p15) FMA f14 = f60, f61, f14
  514. br .L999 /* go to the final reduction */
  515. ;;
  /* Reached when INCX != SIZE. If INCY is also not SIZE, go to the fully   */
  /* strided path (.L200). Otherwise Y is the unit-stride vector: the roles */
  /* of X and Y are swapped relative to the INCX == SIZE path, so the       */
  /* alignment peel tests Y1 and Y is later read with LDFPD.                */
  516. .align 32
  517. .L100:
  518. { .mmi
  519. shladd X2 = INCX, 1, X1 /* X2 = X1 + 2*INCX; this is the value .L200 uses */
  520. }
  521. { .mib
  522. cmp.ne p6, p0 = SIZE, INCY /* p6 = Y is not unit stride either */
  523. tbit.nz p15, p0 = Y1, BASE_SHIFT /* p15 = address bit BASE_SHIFT of Y1 is set: peel one element */
  524. (p6) br.cond.dptk .L200
  525. }
  526. ;;
  527. { .mfi
  528. (p15) LDFD f32 = [X1], INCX /* peel x[0] */
  529. mov f12 = f0
  530. mov pr.rot= 0 /* clear the rotating predicates p16-p63 */
  531. }
  532. { .mfi
  533. (p15) adds N = -1, N
  534. mov f13 = f0
  535. shr YA = Y1, 4 /* read before Y1 is post-incremented by the peel */
  536. }
  537. ;;
  538. { .mfi
  539. (p15) LDFD f80 = [Y1], INCY /* peel y[0] */
  540. mov f14 = f0
  541. shr I = N, 4 /* number of full 16-element blocks */
  542. }
  543. { .mmi
  544. and J = 15, N /* leftover element count */
  545. and XA = 0xf, XA
  546. and YA = 0xf, YA
  547. }
  548. ;;
  549. { .mmi
  550. shladd INCX3 = INCX, 1, INCX
  551. shladd INCY3 = INCY, 1, INCY
  552. sub XA = YA, XA /* XA now holds the Y-minus-X offset difference */
  553. }
  554. { .mmi
  555. shladd INCX16 = INCX, 4, r0
  556. shladd INCY16 = INCY, 4, r0
  557. tbit.z p0, p12 = N, 3 /* p12 = remainder contains a group of 8 */
  558. }
  559. ;;
  560. { .mmi
  561. shladd X2 = INCX, 1, X1 /* recomputed because the peel may have advanced X1 */
  562. cmp.eq p7, p0 = r0, J /* p7 = no leftover elements */
  563. mov ar.ec= 3 /* three pipeline stages */
  564. }
  565. { .mmi
  566. adds I = -1, I /* loop count for ar.lc; -1 means no full block */
  567. cmp.ge p8, p0 = 4, XA /* p8 = (XA <= 4) */
  568. cmp.eq p16, p0 = r0, r0 /* p16 = 1: enable the first pipeline stage */
  569. }
  570. ;;
  571. { .mbb
  572. cmp.le p9, p0 = 14, XA /* p9 = (XA >= 14) */
  573. (p8) br.cond.dpnt .L120
  574. (p9) br.cond.dpnt .L120
  575. }
  576. ;;
  577. { .mmi
  578. adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1
  579. adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1
  580. mov ar.lc = I
  581. }
  582. { .mfb
  583. cmp.eq p6 ,p0 = -1, I /* p6 = no full 16-element block */
  584. FMA f15 = f32, f80, f0 /* product of the peeled pair seeds f15 */
  585. (p6) br.cond.dpnt .L115
  586. }
  587. ;;
  /* Software-pipelined main loop for INCX != SIZE, INCY == SIZE. Mirror    */
  /* image of .L12: Y is read as consecutive pairs with LDFPD, X through    */
  /* X1/X2 with strides INCX and INCX3. Loads in stage p16, FMAs in p18.    */
  588. .align 32
  589. /* INCX != 1 && INCY == 1 */
  590. .L112:
  591. { .mmf
  592. (p16) LDFPD f32, f35 = [Y1], 2 * SIZE /* y0, y1 */
  593. (p16) lfetch.nt1 [PREX], INCX16
  594. (p18) FMA f8 = f34, f82, f8 /* y0 * x0 */
  595. }
  596. { .mmf
  597. (p16) LDFD f80 = [X1], INCX /* x0 */
  598. (p16) LDFD f86 = [X2], INCX /* x2 */
  599. (p18) FMA f9 = f37, f85, f9
  600. }
  601. ;;
  602. { .mmf
  603. (p16) LDFPD f38, f41 = [Y1], 2 * SIZE
  604. (p16) lfetch.nt1 [PREY], INCY16
  605. (p18) FMA f10 = f40, f88, f10
  606. }
  607. { .mmf
  608. (p16) LDFD f83 = [X1], INCX3 /* x1, then X1 skips to x4 */
  609. (p16) LDFD f89 = [X2], INCX3 /* x3, then X2 skips to x6 */
  610. (p18) FMA f11 = f43, f91, f11
  611. }
  612. ;;
  613. { .mmf
  614. (p16) LDFPD f44, f47 = [Y1], 2 * SIZE
  615. (p18) FMA f12 = f46, f94, f12
  616. }
  617. { .mmf
  618. (p16) LDFD f92 = [X1], INCX
  619. (p16) LDFD f98 = [X2], INCX
  620. (p18) FMA f13 = f49, f97, f13
  621. }
  622. ;;
  623. { .mmf
  624. (p16) LDFPD f50, f53 = [Y1], 2 * SIZE
  625. (p18) FMA f14 = f52, f100, f14
  626. }
  627. { .mmf
  628. (p16) LDFD f95 = [X1], INCX3
  629. (p16) LDFD f101 = [X2], INCX3
  630. (p18) FMA f15 = f55, f103, f15
  631. }
  632. ;;
  633. { .mmf
  634. (p16) LDFPD f56, f59 = [Y1], 2 * SIZE
  635. (p18) FMA f8 = f58, f106, f8
  636. }
  637. { .mmf
  638. (p16) LDFD f104 = [X1], INCX
  639. (p16) LDFD f110 = [X2], INCX
  640. (p18) FMA f9 = f61, f109, f9
  641. }
  642. ;;
  643. { .mmf
  644. (p16) LDFPD f62, f65 = [Y1], 2 * SIZE
  645. (p18) FMA f10 = f64, f112, f10
  646. }
  647. { .mmf
  648. (p16) LDFD f107 = [X1], INCX3
  649. (p16) LDFD f113 = [X2], INCX3
  650. (p18) FMA f11 = f67, f115, f11
  651. }
  652. ;;
  653. { .mmf
  654. (p16) LDFPD f68, f71 = [Y1], 2 * SIZE
  655. (p18) FMA f12 = f70, f118, f12
  656. }
  657. { .mmf
  658. (p16) LDFD f116 = [X1], INCX
  659. (p16) LDFD f122 = [X2], INCX
  660. (p18) FMA f13 = f73, f121, f13
  661. }
  662. ;;
  663. { .mmf
  664. (p16) LDFPD f74, f77 = [Y1], 2 * SIZE
  665. (p16) LDFD f119 = [X1], INCX3
  666. (p18) FMA f14 = f76, f124, f14
  667. }
  668. { .mfb
  669. (p16) LDFD f125 = [X2], INCX3
  670. (p18) FMA f15 = f79, f127, f15
  671. br.ctop.sptk.few .L112 /* counted loop branch: rotates registers, uses ar.lc then ar.ec */
  672. }
  673. ;;
  /* Remainder after .L112: mirror image of .L15 with X and Y swapped.      */
  /* Leftover pairs are handled as groups of 8 (p12), 4 (p13), 2 (p14) and  */
  /* 1 (p15); XX tracks the address of the final single X element.          */
  674. .align 32
  675. .L115:
  676. { .mmi
  677. (p12) LDFPD f32, f33 = [Y1], 2 * SIZE
  678. mov XX = X1 /* reads X1 before the post-increment in this same group */
  679. tbit.z p0, p13 = N, 2 /* p13 = group of 4 present */
  680. }
  681. { .mmb
  682. (p12) LDFD f34 = [X1], INCX
  683. (p12) LDFD f38 = [X2], INCX
  684. (p7) br.cond.dptk .L999 /* J == 0: nothing left */
  685. }
  686. ;;
  687. { .mmi
  688. (p12) LDFPD f36, f37 = [Y1], 2 * SIZE
  689. (p12) shladd XX = INCX, 3, XX /* XX += 8 * INCX */
  690. tbit.z p0, p14 = N, 1 /* p14 = group of 2 present */
  691. }
  692. { .mmi
  693. (p12) LDFD f35 = [X1], INCX3
  694. (p12) LDFD f39 = [X2], INCX3
  695. tbit.z p0, p15 = N, 0 /* p15 = single trailing element present */
  696. }
  697. ;;
  698. { .mmi
  699. (p12) LDFPD f40, f41 = [Y1], 2 * SIZE
  700. (p13) shladd XX = INCX, 2, XX /* XX += 4 * INCX */
  701. }
  702. { .mmi
  703. (p12) LDFD f42 = [X1], INCX
  704. (p12) LDFD f46 = [X2], INCX
  705. }
  706. ;;
  707. (p12) LDFPD f44, f45 = [Y1], 2 * SIZE
  708. (p12) LDFD f43 = [X1], INCX3
  709. (p12) LDFD f47 = [X2], INCX3
  710. (p14) shladd XX = INCX, 1, XX /* XX += 2 * INCX */
  711. ;;
  712. (p13) LDFPD f48, f49 = [Y1], 2 * SIZE
  713. (p13) LDFD f50 = [X1], INCX
  714. (p13) LDFD f54 = [X2], INCX
  715. ;;
  716. (p13) LDFPD f52, f53 = [Y1], 2 * SIZE
  717. (p13) LDFD f51 = [X1], INCX3
  718. (p13) LDFD f55 = [X2], INCX3
  719. ;;
  720. (p14) LDFPD f56, f57 = [Y1], 2 * SIZE
  721. (p14) LDFD f58 = [X1], INCX
  722. (p15) LDFD f61 = [XX] /* last single x */
  723. ;;
  724. (p14) LDFD f59 = [X1]
  725. (p15) LDFD f60 = [Y1] /* last single y; Y1 already points at it */
  726. ;;
  727. (p12) FMA f8 = f32, f34, f8
  728. (p12) FMA f9 = f33, f35, f9
  729. (p12) FMA f10 = f36, f38, f10
  730. (p12) FMA f11 = f37, f39, f11
  731. (p12) FMA f12 = f40, f42, f12
  732. (p12) FMA f13 = f41, f43, f13
  733. (p12) FMA f14 = f44, f46, f14
  734. (p12) FMA f15 = f45, f47, f15
  735. ;;
  736. (p13) FMA f8 = f48, f50, f8
  737. (p13) FMA f9 = f49, f51, f9
  738. (p13) FMA f10 = f52, f54, f10
  739. (p13) FMA f11 = f53, f55, f11
  740. (p14) FMA f12 = f56, f58, f12
  741. (p14) FMA f13 = f57, f59, f13
  742. (p15) FMA f14 = f60, f61, f14
  743. br .L999 /* go to the final reduction */
  744. ;;
  /* Set-up for the alternative INCY == SIZE loop (.L122), taken when the   */
  /* Y-minus-X offset difference is <= 4 or >= 14. Initialises the two      */
  /* prefetch pointers and the loop counter, seeds f15 with the product of  */
  /* the peeled pair, and skips the loop when there is no full block.       */
  /* PREY is advanced by INCY16 inside .L122 and therefore must start from  */
  /* Y1, as in every other set-up block; it was previously derived from X1, */
  /* so the Y vector was never prefetched on this path. lfetch is only a    */
  /* hint, so the computed result is unaffected by this change.             */
  745. .align 32
  746. .L120:
  747. { .mmi
  748. adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1
  749. adds PREY = (PREFETCH_SIZE + 19) * SIZE, Y1
  750. mov ar.lc = I
  751. }
  752. { .mfb
  753. cmp.eq p6 ,p0 = -1, I /* p6 = no full 16-element block */
  754. FMA f15 = f32, f80, f0 /* product of the peeled pair seeds f15 */
  755. (p6) br.cond.dpnt .L125
  756. }
  757. ;;
  /* Variant of .L112 with the strided X loads staggered by half a block:   */
  /* the first eight X elements of a block are loaded in stage p16 (bottom  */
  /* half of the body) and the last eight in stage p17 (top half of the     */
  /* following iteration), using register names one rotation later (f105    */
  /* is read by the p18 FMA as f106). Y loads and FMAs match .L112.         */
  758. .align 32
  759. .L122:
  760. { .mmf
  761. (p16) LDFPD f32, f35 = [Y1], 2 * SIZE
  762. (p16) lfetch.nt1 [PREX], INCX16
  763. (p18) FMA f8 = f34, f82, f8
  764. }
  765. { .mmf
  766. (p17) LDFD f105 = [X1], INCX /* x8 of the previous block */
  767. (p17) LDFD f111 = [X2], INCX /* x10 of the previous block */
  768. (p18) FMA f9 = f37, f85, f9
  769. }
  770. ;;
  771. { .mmf
  772. (p16) LDFPD f38, f41 = [Y1], 2 * SIZE
  773. (p16) lfetch.nt1 [PREY], INCY16
  774. (p18) FMA f10 = f40, f88, f10
  775. }
  776. { .mmf
  777. (p17) LDFD f108 = [X1], INCX3
  778. (p17) LDFD f114 = [X2], INCX3
  779. (p18) FMA f11 = f43, f91, f11
  780. }
  781. ;;
  782. { .mmf
  783. (p16) LDFPD f44, f47 = [Y1], 2 * SIZE
  784. (p18) FMA f12 = f46, f94, f12
  785. }
  786. { .mmf
  787. (p17) LDFD f117 = [X1], INCX
  788. (p17) LDFD f123 = [X2], INCX
  789. (p18) FMA f13 = f49, f97, f13
  790. }
  791. ;;
  792. { .mmf
  793. (p16) LDFPD f50, f53 = [Y1], 2 * SIZE
  794. (p18) FMA f14 = f52, f100, f14
  795. }
  796. { .mmf
  797. (p17) LDFD f120 = [X1], INCX3
  798. (p17) LDFD f126 = [X2], INCX3
  799. (p18) FMA f15 = f55, f103, f15
  800. }
  801. ;;
  802. { .mmf
  803. (p16) LDFPD f56, f59 = [Y1], 2 * SIZE
  804. (p18) FMA f8 = f58, f106, f8
  805. }
  806. { .mmf
  807. (p16) LDFD f80 = [X1], INCX /* x0 of the current block */
  808. (p16) LDFD f86 = [X2], INCX /* x2 of the current block */
  809. (p18) FMA f9 = f61, f109, f9
  810. }
  811. ;;
  812. { .mmf
  813. (p16) LDFPD f62, f65 = [Y1], 2 * SIZE
  814. (p18) FMA f10 = f64, f112, f10
  815. }
  816. { .mmf
  817. (p16) LDFD f83 = [X1], INCX3
  818. (p16) LDFD f89 = [X2], INCX3
  819. (p18) FMA f11 = f67, f115, f11
  820. }
  821. ;;
  822. { .mmf
  823. (p16) LDFPD f68, f71 = [Y1], 2 * SIZE
  824. (p18) FMA f12 = f70, f118, f12
  825. }
  826. { .mmf
  827. (p16) LDFD f92 = [X1], INCX
  828. (p16) LDFD f98 = [X2], INCX
  829. (p18) FMA f13 = f73, f121, f13
  830. }
  831. ;;
  832. { .mmf
  833. (p16) LDFPD f74, f77 = [Y1], 2 * SIZE
  834. (p16) LDFD f95 = [X1], INCX3
  835. (p18) FMA f14 = f76, f124, f14
  836. }
  837. { .mfb
  838. (p16) LDFD f101 = [X2], INCX3
  839. (p18) FMA f15 = f79, f127, f15
  840. br.ctop.sptk.few .L122 /* counted loop branch: rotates registers, uses ar.lc then ar.ec */
  841. }
  842. ;;
  /* Remainder after .L122; instruction-for-instruction the same as .L115.  */
  /* Leftover pairs are handled as groups of 8 (p12), 4 (p13), 2 (p14) and  */
  /* 1 (p15); XX tracks the address of the final single X element.          */
  843. .align 32
  844. .L125:
  845. { .mmi
  846. (p12) LDFPD f32, f33 = [Y1], 2 * SIZE
  847. mov XX = X1 /* reads X1 before the post-increment in this same group */
  848. tbit.z p0, p13 = N, 2 /* p13 = group of 4 present */
  849. }
  850. { .mmb
  851. (p12) LDFD f34 = [X1], INCX
  852. (p12) LDFD f38 = [X2], INCX
  853. (p7) br.cond.dptk .L999 /* J == 0: nothing left */
  854. }
  855. ;;
  856. { .mmi
  857. (p12) LDFPD f36, f37 = [Y1], 2 * SIZE
  858. (p12) shladd XX = INCX, 3, XX /* XX += 8 * INCX */
  859. tbit.z p0, p14 = N, 1 /* p14 = group of 2 present */
  860. }
  861. { .mmi
  862. (p12) LDFD f35 = [X1], INCX3
  863. (p12) LDFD f39 = [X2], INCX3
  864. tbit.z p0, p15 = N, 0 /* p15 = single trailing element present */
  865. }
  866. ;;
  867. { .mmi
  868. (p12) LDFPD f40, f41 = [Y1], 2 * SIZE
  869. (p13) shladd XX = INCX, 2, XX /* XX += 4 * INCX */
  870. }
  871. { .mmi
  872. (p12) LDFD f42 = [X1], INCX
  873. (p12) LDFD f46 = [X2], INCX
  874. }
  875. ;;
  876. (p12) LDFPD f44, f45 = [Y1], 2 * SIZE
  877. (p12) LDFD f43 = [X1], INCX3
  878. (p12) LDFD f47 = [X2], INCX3
  879. (p14) shladd XX = INCX, 1, XX /* XX += 2 * INCX */
  880. ;;
  881. (p13) LDFPD f48, f49 = [Y1], 2 * SIZE
  882. (p13) LDFD f50 = [X1], INCX
  883. (p13) LDFD f54 = [X2], INCX
  884. ;;
  885. (p13) LDFPD f52, f53 = [Y1], 2 * SIZE
  886. (p13) LDFD f51 = [X1], INCX3
  887. (p13) LDFD f55 = [X2], INCX3
  888. ;;
  889. (p14) LDFPD f56, f57 = [Y1], 2 * SIZE
  890. (p14) LDFD f58 = [X1], INCX
  891. (p15) LDFD f61 = [XX] /* last single x */
  892. ;;
  893. (p14) LDFD f59 = [X1]
  894. (p15) LDFD f60 = [Y1] /* last single y */
  895. ;;
  896. (p12) FMA f8 = f32, f34, f8
  897. (p12) FMA f9 = f33, f35, f9
  898. (p12) FMA f10 = f36, f38, f10
  899. (p12) FMA f11 = f37, f39, f11
  900. (p12) FMA f12 = f40, f42, f12
  901. (p12) FMA f13 = f41, f43, f13
  902. (p12) FMA f14 = f44, f46, f14
  903. (p12) FMA f15 = f45, f47, f15
  904. ;;
  905. (p13) FMA f8 = f48, f50, f8
  906. (p13) FMA f9 = f49, f51, f9
  907. (p13) FMA f10 = f52, f54, f10
  908. (p13) FMA f11 = f53, f55, f11
  909. (p14) FMA f12 = f56, f58, f12
  910. (p14) FMA f13 = f57, f59, f13
  911. (p15) FMA f14 = f60, f61, f14
  912. br .L999 /* go to the final reduction */
  913. ;;
  /* Set-up for the fully strided path (INCX != SIZE and INCY != SIZE).     */
  /* No alignment peel is done here because every load is a single LDFD.    */
  /* X2 was already set to X1 + 2*INCX at .L100; XX and YY shadow X1 and Y1 */
  /* so the tail can find the final single element.                         */
  914. .align 32
  915. .L200:
  916. { .mfi
  917. shladd INCX3 = INCX, 1, INCX /* 3 * INCX */
  918. mov f12 = f0
  919. mov pr.rot= 0 /* clear the rotating predicates p16-p63 */
  920. }
  921. { .mfi
  922. and J = 15, N /* leftover element count */
  923. mov f13 = f0
  924. shr I = N, 4 /* number of full 16-element blocks */
  925. }
  926. ;;
  927. { .mmf
  928. cmp.eq p16, p0 = r0, r0 /* p16 = 1: enable the first pipeline stage */
  929. shladd INCY3 = INCY, 1, INCY /* 3 * INCY */
  930. mov f14 = f0
  931. }
  932. { .mmi
  933. shladd INCX16 = INCX, 4, r0
  934. shladd INCY16 = INCY, 4, r0
  935. tbit.z p0, p12 = N, 3 /* p12 = remainder contains a group of 8 */
  936. }
  937. ;;
  938. { .mmi
  939. cmp.eq p7, p0 = r0, J /* p7 = no leftover elements */
  940. adds I = -1, I /* loop count for ar.lc; -1 means no full block */
  941. mov ar.ec= 3 /* three pipeline stages */
  942. }
  943. { .mmi
  944. shladd Y2 = INCY, 1, Y1 /* Y2 = Y1 + 2*INCY */
  945. mov XX = X1
  946. mov YY = Y1
  947. }
  948. ;;
  949. { .mmi
  950. adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1
  951. adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1
  952. mov ar.lc = I
  953. }
  954. { .mfb
  955. cmp.eq p6 ,p0 = -1, I /* p6 = no full 16-element block */
  956. mov f15 = f0 /* nothing was peeled on this path, so f15 starts at zero */
  957. (p6) br.cond.dpnt .L215
  958. }
  959. ;;
  /* Software-pipelined main loop for the fully strided case, 16 element    */
  /* pairs per iteration. Both vectors are read one element at a time via   */
  /* two leapfrogging pointers each (X1/X2 and Y1/Y2, strides inc and       */
  /* 3*inc). Loads run in stage p16, FMAs in stage p18 on the registers     */
  /* rotated by two. XX/YY are advanced by one block per iteration.         */
  960. .align 32
  961. /* INCX != 1 && INCY != 1 */
  962. .L212:
  963. { .mmf
  964. (p16) lfetch.nt1 [PREX], INCX16
  965. (p16) lfetch.nt1 [PREY], INCY16
  966. (p18) FMA f8 = f34, f82, f8 /* y0 * x0 */
  967. }
  968. { .mmf
  969. (p16) LDFD f32 = [Y1], INCY /* y0 */
  970. (p16) LDFD f38 = [Y2], INCY /* y2 */
  971. (p18) FMA f9 = f37, f85, f9
  972. }
  973. ;;
  974. { .mmf
  975. (p16) LDFD f80 = [X1], INCX /* x0 */
  976. (p16) LDFD f86 = [X2], INCX /* x2 */
  977. (p18) FMA f10 = f40, f88, f10
  978. }
  979. { .mmf
  980. (p16) LDFD f35 = [Y1], INCY3 /* y1, then Y1 skips to y4 */
  981. (p16) LDFD f41 = [Y2], INCY3 /* y3, then Y2 skips to y6 */
  982. (p18) FMA f11 = f43, f91, f11
  983. }
  984. ;;
  985. { .mmf
  986. (p16) LDFD f83 = [X1], INCX3
  987. (p16) LDFD f89 = [X2], INCX3
  988. (p18) FMA f12 = f46, f94, f12
  989. }
  990. { .mmf
  991. (p16) LDFD f44 = [Y1], INCY
  992. (p16) LDFD f50 = [Y2], INCY
  993. (p18) FMA f13 = f49, f97, f13
  994. }
  995. ;;
  996. { .mmf
  997. (p16) LDFD f92 = [X1], INCX
  998. (p16) LDFD f98 = [X2], INCX
  999. (p18) FMA f14 = f52, f100, f14
  1000. }
  1001. { .mmf
  1002. (p16) LDFD f47 = [Y1], INCY3
  1003. (p16) LDFD f53 = [Y2], INCY3
  1004. (p18) FMA f15 = f55, f103, f15
  1005. }
  1006. ;;
  1007. { .mmf
  1008. (p16) LDFD f95 = [X1], INCX3
  1009. (p16) LDFD f101 = [X2], INCX3
  1010. (p18) FMA f8 = f58, f106, f8
  1011. }
  1012. { .mmf
  1013. (p16) LDFD f56 = [Y1], INCY
  1014. (p16) LDFD f62 = [Y2], INCY
  1015. (p18) FMA f9 = f61, f109, f9
  1016. }
  1017. ;;
  1018. { .mmf
  1019. (p16) LDFD f104 = [X1], INCX
  1020. (p16) LDFD f110 = [X2], INCX
  1021. (p18) FMA f10 = f64, f112, f10
  1022. }
  1023. { .mmf
  1024. (p16) LDFD f59 = [Y1], INCY3
  1025. (p16) LDFD f65 = [Y2], INCY3
  1026. (p18) FMA f11 = f67, f115, f11
  1027. }
  1028. ;;
  1029. { .mmf
  1030. (p16) LDFD f107 = [X1], INCX3
  1031. (p16) LDFD f113 = [X2], INCX3
  1032. (p18) FMA f12 = f70, f118, f12
  1033. }
  1034. { .mmf
  1035. (p16) LDFD f68 = [Y1], INCY
  1036. (p16) LDFD f74 = [Y2], INCY
  1037. (p18) FMA f13 = f73, f121, f13
  1038. }
  1039. ;;
  1040. { .mmf
  1041. (p16) LDFD f116 = [X1], INCX
  1042. (p16) LDFD f122 = [X2], INCX
  1043. (p18) FMA f14 = f76, f124, f14
  1044. }
  1045. { .mmf
  1046. (p16) LDFD f71 = [Y1], INCY3
  1047. (p16) LDFD f77 = [Y2], INCY3
  1048. (p18) FMA f15 = f79, f127, f15
  1049. }
  1050. ;;
  1051. { .mmi
  1052. (p16) LDFD f119 = [X1], INCX3
  1053. (p16) LDFD f125 = [X2], INCX3
  1054. }
  1055. { .mmb
  1056. (p16) add XX = INCX16, XX /* keep XX at the start of the next X block */
  1057. (p16) add YY = INCY16, YY /* keep YY at the start of the next Y block */
  1058. br.ctop.sptk.few .L212 /* counted loop branch: rotates registers, uses ar.lc then ar.ec */
  1059. }
  1060. ;;
  /* Remainder for the fully strided path: leftover pairs are handled as    */
  /* groups of 8 (p12), 4 (p13), 2 (p14) and 1 (p15). XX/YY are stepped     */
  /* past each group so they address the final single element. This block   */
  /* has no trailing branch: it falls through into .L999.                   */
  1061. .align 32
  1062. .L215:
  1063. { .mmi
  1064. (p12) LDFD f34 = [X1], INCX
  1065. (p12) LDFD f38 = [X2], INCX
  1066. tbit.z p0, p13 = N, 2 /* p13 = group of 4 present */
  1067. }
  1068. { .mmb
  1069. (p12) LDFD f32 = [Y1], INCY
  1070. (p12) LDFD f36 = [Y2], INCY
  1071. (p7) br.cond.dptk .L999 /* J == 0: nothing left */
  1072. }
  1073. ;;
  1074. { .mmi
  1075. (p12) LDFD f35 = [X1], INCX3
  1076. (p12) LDFD f39 = [X2], INCX3
  1077. tbit.z p0, p14 = N, 1 /* p14 = group of 2 present */
  1078. }
  1079. { .mmi
  1080. (p12) LDFD f33 = [Y1], INCY3
  1081. (p12) LDFD f37 = [Y2], INCY3
  1082. tbit.z p0, p15 = N, 0 /* p15 = single trailing element present */
  1083. }
  1084. ;;
  1085. { .mmi
  1086. (p12) LDFD f42 = [X1], INCX
  1087. (p12) LDFD f46 = [X2], INCX
  1088. (p12) shladd XX = INCX, 3, XX /* XX += 8 * INCX */
  1089. }
  1090. { .mmi
  1091. (p12) LDFD f40 = [Y1], INCY
  1092. (p12) LDFD f44 = [Y2], INCY
  1093. (p12) shladd YY = INCY, 3, YY /* YY += 8 * INCY */
  1094. }
  1095. ;;
  1096. { .mmi
  1097. (p12) LDFD f43 = [X1], INCX3
  1098. (p12) LDFD f47 = [X2], INCX3
  1099. (p13) shladd XX = INCX, 2, XX /* XX += 4 * INCX */
  1100. }
  1101. { .mmi
  1102. (p12) LDFD f41 = [Y1], INCY3
  1103. (p12) LDFD f45 = [Y2], INCY3
  1104. (p13) shladd YY = INCY, 2, YY /* YY += 4 * INCY */
  1105. }
  1106. ;;
  1107. (p13) LDFD f50 = [X1], INCX
  1108. (p13) LDFD f54 = [X2], INCX
  1109. (p14) shladd XX = INCX, 1, XX /* XX += 2 * INCX */
  1110. (p13) LDFD f48 = [Y1], INCY
  1111. (p13) LDFD f52 = [Y2], INCY
  1112. (p14) shladd YY = INCY, 1, YY /* YY += 2 * INCY */
  1113. ;;
  1114. (p13) LDFD f51 = [X1], INCX3
  1115. (p13) LDFD f55 = [X2] /* no post-increment: X2 is not used again */
  1116. (p13) LDFD f49 = [Y1], INCY3
  1117. (p13) LDFD f53 = [Y2] /* no post-increment: Y2 is not used again */
  1118. ;;
  1119. (p14) LDFD f58 = [X1], INCX
  1120. (p15) LDFD f61 = [XX] /* last single x */
  1121. (p14) LDFD f56 = [Y1], INCY
  1122. (p15) LDFD f60 = [YY] /* last single y */
  1123. ;;
  1124. (p14) LDFD f59 = [X1]
  1125. (p14) LDFD f57 = [Y1]
  1126. ;;
  1127. ;;
  1128. ;;
  1129. (p12) FMA f8 = f32, f34, f8
  1130. (p12) FMA f9 = f33, f35, f9
  1131. (p12) FMA f10 = f36, f38, f10
  1132. (p12) FMA f11 = f37, f39, f11
  1133. (p12) FMA f12 = f40, f42, f12
  1134. (p12) FMA f13 = f41, f43, f13
  1135. (p12) FMA f14 = f44, f46, f14
  1136. (p12) FMA f15 = f45, f47, f15
  1137. ;;
  1138. (p13) FMA f8 = f48, f50, f8
  1139. (p13) FMA f9 = f49, f51, f9
  1140. (p13) FMA f10 = f52, f54, f10
  1141. (p13) FMA f11 = f53, f55, f11
  1142. (p14) FMA f12 = f56, f58, f12
  1143. (p14) FMA f13 = f57, f59, f13
  1144. (p15) FMA f14 = f60, f61, f14
  1145. ;;
  /* Common exit: fold the eight partial sums f8..f15 into f8 with a        */
  /* three-level pairwise tree, restore ar.lc and the saved predicates, and */
  /* return. The result is left in f8.                                      */
  1146. .align 32
  1147. .L999:
  1148. FADD f8 = f8, f9
  1149. FADD f10 = f10, f11
  1150. FADD f12 = f12, f13
  1151. FADD f14 = f14, f15
  1152. ;;
  1153. FADD f8 = f8, f10
  1154. FADD f12 = f12, f14
  1155. mov ar.lc = ARLC /* restore the loop counter saved at entry */
  1156. ;;
  1157. FADD f8 = f8, f12 /* final sum */
  1158. mov pr = PR, -65474 /* mask 0x...FFFF003E: restores p1-p5 and the rotating predicates p16-p63 */
  1159. br.ret.sptk.many b0
  1160. EPILOGUE